Ben Hirsch edited CUDA_Implementation.tex  about 11 years ago

Commit id: ff7000ec51222a18fe8bd20e437780ce622fb8fc

deletions | additions      

       

We chose two different tactics for our CUDA implementation. On one hand, the CUDA extensions to the C language allows the programmer to define their own kernel and parallelization strategy. Usually this strategy will consist of identifying sections of code (often loops) where multiple sets of data are used with the same code. To effectively utilize the multi-thread model each thread should execute without dependencies on each other. We believed the area with greatest potential for speedup on the GPU was the loop sampling the effects for each marker locus which might run with 50 thousand to 700 thousand markers. The code for this loop follows  \begin{verbatim}  int idx;  Vector\ currenttDataLocus (X(0, Eigen::Map currentDataLocus(genotypeDataPointer, numObs);  Eigen::Map currentTestLocus(genotypeTestPointer, testObs);  for (unsigned i = 0; i < numberMarkers; i++)  {  new (¤tDataLocus) Eigen::Map (&dataPtr->X(0,  i), numObs); Vector\ currentTestLocus (test_X(0, new (¤tTestLocus) Eigen::Map (&testPtr->X(0,  i), testObs); float oldSamplei = solSample\[i\]; solSample[i];  float rhsModeli = (currentDataLocus.dot(RInverseY)) + diagLhs(i) * oldSamplei; rhsModeli *= invVarResidual; float Lhs = diagLhs[i] * invVarResidual + invVarEffects; float probDeltaOne = 1.0f / (1.0f + exp(logDeltaZero + 0.5f * (log(Lhs) + logVarEffects - rhsModeli*rhsModeli/Lhs) - logPiComp)); if (matvec::ranf() < probDeltaOne) \{ {  sigma[i] = varEffects; // sigma[i] is used in sampleCommonSigma solSample[i] = matvec::snorm() * sqrt(1.0f/Lhs) + rhsModeli/Lhs; meanEffects[i] += (solSample[i] - meanEffects[i]) * invThisIter; sdDelta1[i] += (solSample[i] * solSample[i] - sdDelta1[i]) * invThisIter; mlFreq[i] += (1.0f - mlFreq[i]) * invThisIter; estLocusVars[i] += (varEffects - estLocusVars[i]) * invThisIter; RInverseY += ((currentDataLocus.array() * dataPtr->vectorRInverse.array()) * (oldSamplei - solSample[i])).matrix(); genotValueVec += currentDataLocus * solSample[i]; if (iteration % outputFreq == 0) \{ {  testGenotValueVec += currentTestLocus * solSample[i]; if (wantWindowBV) \{ {  windowBV.col(dataPtr->WindowcM[i]) += currentDataLocus * solSample[i]; cumSample[numberEffects].icolumn = i; // store samples cumSample[numberEffects].fSample = solSample[i]; \}  \} }  }  int istart = max(0, int(i - windowWidth)); unsigned iend = min(i + windowWidth + 1, numberMarkers); for (unsigned j = istart; j < iend; j++) windowStatus[j] = 1.f; // record this window had a SNP fitted in the model numberEffects++; \} }  else \{ {  if (oldSamplei) RInverseY += ((currentDataLocus.array() * dataPtr->vectorRInverse.array()) * oldSamplei).matrix(); sigma[i] = 0.0; solSample[i] = 0.0f; meanEffects[i] -= meanEffects[i] * invThisIter; sdDelta1[i] -= sdDelta1[i] * invThisIter; mlFreq[i] -= mlFreq[i] * invThisIter; estLocusVars[i] -= estLocusVars[i] * invThisIter; \}  \} }  }  // end of loop over markers (i is the loop variable) \end{verbatim}  \begin{enumerate}