this is for holding javascript data
Ben Hirsch edited CUDA_Implementation.tex
about 11 years ago
Commit id: ff7000ec51222a18fe8bd20e437780ce622fb8fc
deletions | additions
diff --git a/CUDA_Implementation.tex b/CUDA_Implementation.tex
index 45a4d9a..8b323f4 100644
--- a/CUDA_Implementation.tex
+++ b/CUDA_Implementation.tex
...
We chose two different tactics for our CUDA implementation. On one hand, the CUDA extensions to the C language allows the programmer to define their own kernel and parallelization strategy. Usually this strategy will consist of identifying sections of code (often loops) where multiple sets of data are used with the same code. To effectively utilize the multi-thread model each thread should execute without dependencies on each other. We believed the area with greatest potential for speedup on the GPU was the loop sampling the effects for each marker locus which might run with 50 thousand to 700 thousand markers. The code for this loop follows
\begin{verbatim}
int idx;
Vector\ currenttDataLocus (X(0, Eigen::Map currentDataLocus(genotypeDataPointer, numObs);
Eigen::Map currentTestLocus(genotypeTestPointer, testObs);
for (unsigned i = 0; i < numberMarkers; i++)
{
new (¤tDataLocus) Eigen::Map (&dataPtr->X(0, i), numObs);
Vector\ currentTestLocus (test_X(0, new (¤tTestLocus) Eigen::Map (&testPtr->X(0, i), testObs);
float oldSamplei =
solSample\[i\]; solSample[i];
float rhsModeli = (currentDataLocus.dot(RInverseY)) + diagLhs(i) * oldSamplei;
rhsModeli *= invVarResidual;
float Lhs = diagLhs[i] * invVarResidual + invVarEffects;
float probDeltaOne = 1.0f / (1.0f + exp(logDeltaZero + 0.5f * (log(Lhs) + logVarEffects - rhsModeli*rhsModeli/Lhs) - logPiComp));
if (matvec::ranf() < probDeltaOne)
\{ {
sigma[i] = varEffects; // sigma[i] is used in sampleCommonSigma
solSample[i] = matvec::snorm() * sqrt(1.0f/Lhs) + rhsModeli/Lhs;
meanEffects[i] += (solSample[i] - meanEffects[i]) * invThisIter;
sdDelta1[i] += (solSample[i] * solSample[i] - sdDelta1[i]) * invThisIter;
mlFreq[i] += (1.0f - mlFreq[i]) * invThisIter;
estLocusVars[i] += (varEffects - estLocusVars[i]) * invThisIter;
RInverseY += ((currentDataLocus.array() * dataPtr->vectorRInverse.array()) * (oldSamplei - solSample[i])).matrix();
genotValueVec += currentDataLocus * solSample[i];
if (iteration % outputFreq == 0)
\{ {
testGenotValueVec += currentTestLocus * solSample[i];
if (wantWindowBV)
\{ {
windowBV.col(dataPtr->WindowcM[i]) += currentDataLocus * solSample[i];
cumSample[numberEffects].icolumn = i; // store samples
cumSample[numberEffects].fSample = solSample[i];
\}
\} }
}
int istart = max(0, int(i - windowWidth));
unsigned iend = min(i + windowWidth + 1, numberMarkers);
for (unsigned j = istart; j < iend; j++)
windowStatus[j] = 1.f; // record this window had a SNP fitted in the model
numberEffects++;
\} }
else
\{ {
if (oldSamplei)
RInverseY += ((currentDataLocus.array() * dataPtr->vectorRInverse.array()) * oldSamplei).matrix();
sigma[i] = 0.0;
solSample[i] = 0.0f;
meanEffects[i] -= meanEffects[i] * invThisIter;
sdDelta1[i] -= sdDelta1[i] * invThisIter;
mlFreq[i] -= mlFreq[i] * invThisIter;
estLocusVars[i] -= estLocusVars[i] * invThisIter;
\}
\} }
} // end of loop over markers (i is the loop variable)
\end{verbatim}
\begin{enumerate}