Merge remote-tracking branch 'upstream/master' into nowr

Incorporate gpu_multicopy changes
N-BodyShop · Mar 2, 2024 · 611ed4b · 611ed4b
2 parents 0ca8444 + 2ff770b
commit 611ed4b
Show file tree

Hide file tree

Showing 11 changed files with 225 additions and 89 deletions.
diff --git a/CudaFunctions.h b/CudaFunctions.h
@@ -34,6 +34,8 @@ __global__ void nodeGravityComputation(
 		cudatype fperiod
                 );
 
+__global__ void ZeroVars(VariablePartData *particleVars, int nVars);
+
 #ifdef CUDA_2D_TB_KERNEL
 __global__ void particleGravityComputation(
                 CompactPartData *targetCores,

diff --git a/DataManager.cpp b/DataManager.cpp
@@ -476,15 +476,16 @@ void DataManager::serializeLocalTree(){
 #endif
   if(treePiecesDone == registeredTreePieces.length()){
     treePiecesDone = 0;
+    CmiUnlock(__nodelock);
 
     if(verbosity > 1)
         CkPrintf("[%d] Registered tree pieces length: %lu\n", CkMyPe(), registeredTreePieces.length());
     serializeLocal(root);
     if(verbosity > 1)
         CkPrintf("[%d] Registered tree pieces length after serialize local: %lu\n", CkMyPe(), registeredTreePieces.length());
   }
-  CmiUnlock(__nodelock);
-
+  else
+      CmiUnlock(__nodelock);
 }
 
 /// @brief Callback from local data transfer to GPU
@@ -649,8 +650,6 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
   int numTreePieces = registeredTreePieces.length();
   int numNodes = 0;
   int numParticles = 0;
-  int numCachedNodes = 0;
-  int numCachedParticles = 0;
   int totalNumBuckets = 0;
 
   cacheType *wholeNodeCache = cacheNode.ckLocalBranch()->getCache();
@@ -767,16 +766,14 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
 }// end serializeNodes
 
 /// @brief gather local nodes and particles and send to GPU
-/// @param node Root of tree to walk.
-void DataManager::serializeLocal(GenericTreeNode *node){
+/// @param nodeRoot Root of tree to walk.
+void DataManager::serializeLocal(GenericTreeNode *nodeRoot){
   /// queue for breadth first treewalk.
   CkQ<GenericTreeNode *> queue;
 
   int numTreePieces = registeredTreePieces.length();
   int numNodes = 0;
   int numParticles = 0;
-  int numCachedNodes = 0;
-  int numCachedParticles = 0;
 
   for(int i = 0; i < numTreePieces; i++){
     TreePiece *tp = registeredTreePieces[i].treePiece;
@@ -785,11 +782,7 @@ void DataManager::serializeLocal(GenericTreeNode *node){
   }
   numNodes -= cumNumReplicatedNodes;
 
-  CkVec<CudaMultipoleMoments> localMoments;
-  CkVec<CompactPartData> localParticles;
-
   localMoments.reserve(numNodes);
-  localParticles.resize(numParticles);
 
   localMoments.length() = 0;
 
@@ -804,7 +797,7 @@ void DataManager::serializeLocal(GenericTreeNode *node){
 #endif
   double  starttime = CmiWallTimer();
   // Walk local tree
-  queue.enq(node);
+  queue.enq(nodeRoot);
   while(!queue.isEmpty()){
     GenericTreeNode *node = queue.deq();
     NodeType type = node->getType();
@@ -837,93 +830,85 @@ void DataManager::serializeLocal(GenericTreeNode *node){
     }
   }// end while queue not empty
 
-  // here
+#ifdef HAPI_TRACE
+  traceUserBracketEvent(SER_LOCAL_WALK, starttime, CmiWallTimer());
+#endif
 
   // used later, when copying particle vars back to the host
   savedNumTotalParticles = numParticles;
   savedNumTotalNodes = localMoments.length();
 
-  for(int i = 0; i < registeredTreePieces.length(); i++){
-    TreePiece *tp = registeredTreePieces[i].treePiece;
-    tp->getDMParticles(localParticles.getVec(), partIndex);
-  }
+#ifdef CUDA_DM_PRINT_TREES
+  CkPrintf("*************\n");
+#endif
+  CkAssert(partIndex == numParticles);
+#if COSMO_PRINT_BK > 1
+  CkPrintf("(%d): DM->GPU local tree\n", CkMyPe());
+#endif
+  size_t sLocalParts = numParticles*sizeof(CompactPartData);
+  size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
+  allocatePinnedHostMemory((void **)&bufLocalParts, sLocalParts);
+  allocatePinnedHostMemory((void **)&bufLocalMoments, sLocalMoments);
 
-#ifdef GPU_LOCAL_TREE_WALK
-  // set proper active bucketStart and bucketSize for each bucketNode
+  int pTPindex = 0;
+  treePiecesBufferFilled = 0;
   for(int i = 0; i < numTreePieces; i++){
-    TreePiece *tp = registeredTreePieces[i].treePiece;
-    // set the bucketStart and bucketSize for each bucket Node
-    if (tp->largePhase()) {
-      for (int j = 0; j < tp->numBuckets; ++j) {
-          GenericTreeNode *bucketNode = tp->bucketList[j];
-          int id = bucketNode->nodeArrayIndex;
-          localMoments[id].bucketStart = bucketNode->bucketArrayIndex;
-          localMoments[id].bucketSize = bucketNode->lastParticle - bucketNode->firstParticle + 1;
+      treePieces[registeredTreePieces[i].treePiece->getIndex()].fillGPUBuffer((intptr_t) bufLocalParts,
+		      (intptr_t) bufLocalMoments, (intptr_t) localMoments.getVec(), pTPindex,
+		      numParticles, (intptr_t) nodeRoot);
+      pTPindex += registeredTreePieces[i].treePiece->getDMNumParticles();
       }
-    } else {
-      for (int j = 0; j < tp->numBuckets; ++j) {
-          GenericTreeNode *bucketNode = tp->bucketList[j];
-          int id = bucketNode->nodeArrayIndex;
-          localMoments[id].bucketStart = tp->bucketActiveInfo[id].start;
-          localMoments[id].bucketSize =  tp->bucketActiveInfo[id].size;
-      }
-    }
+}
 
-    // tell each particle which node it belongs to
-    CompactPartData *localParicalsVec = localParticles.getVec();
-    for (int j = 0; j < tp->numBuckets; ++j) {
-      GenericTreeNode *bucketNode = tp->bucketList[j];
-      int id = bucketNode->nodeArrayIndex;
-      int start = localMoments[id].bucketStart;
-      int end = start + localMoments[id].bucketSize;
-      for (int k = start; k < end; k ++) {
-        localParicalsVec[k].nodeId = id;
-      }
+///
+/// @brief After all pieces have filled the buffer, initiate the transfer.
+/// @param numParticles total number of particles on this node
+/// @param node root of tree
+///
+void DataManager::transferLocalToGPU(int numParticles, GenericTreeNode *node)
+{
+    CmiLock(__nodelock);
+    treePiecesBufferFilled++;
+    if(treePiecesBufferFilled == registeredTreePieces.length()){
+        treePiecesBufferFilled = 0;
+        CmiUnlock(__nodelock);
     }
-  }
-  // here
+    else {
+        CmiUnlock(__nodelock);
+        return;
+    }
+
+  double starttime = CmiWallTimer();
+#ifdef GPU_LOCAL_TREE_WALK
   transformLocalTreeRecursive(node, localMoments);
 #endif //GPU_LOCAL_TREE_WALK
-  starttime = CmiWallTimer();
-
-#ifdef CUDA_DM_PRINT_TREES
-  CkPrintf("*************\n");
-#endif
-  CkAssert(partIndex == numParticles);
-#if COSMO_PRINT_BK > 1
-  CkPrintf("(%d): DM->GPU local tree\n", CkMyPe());
+#ifdef HAPI_TRACE
+  traceUserBracketEvent(SER_LOCAL_TRANSFORM, starttime, CmiWallTimer());
 #endif
 
   localTransferCallback
       = new CkCallback(CkIndex_DataManager::startLocalWalk(), CkMyNode(), dMProxy);
 
   // XXX copies can be saved here.
-  sMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
-  allocatePinnedHostMemory((void **)&bufLocalMoments, sMoments);
-  memcpy(bufLocalMoments, localMoments.getVec(), sMoments);
-
-  sCompactParts = localParticles.length()*sizeof(CompactPartData);
-  allocatePinnedHostMemory((void **)&bufLocalParts, sCompactParts);
-  memcpy(bufLocalParts, localParticles.getVec(), sCompactParts);
-
-  sVarParts = localParticles.length()*sizeof(VariablePartData);
-  allocatePinnedHostMemory((void **)&bufLocalVars, sVarParts);
-  VariablePartData *zeroArray =  (VariablePartData *) bufLocalVars;
-// XXX This could be done on the GPU.
-  for(int i = 0; i < numParticles; i++){
-      zeroArray[i].a.x = 0.0;
-      zeroArray[i].a.y = 0.0;
-      zeroArray[i].a.z = 0.0;
-      zeroArray[i].potential = 0.0;
-      zeroArray[i].dtGrav = 0.0;
-      }
+  starttime = CmiWallTimer();
+  size_t sLocalVars = numParticles*sizeof(VariablePartData);
+  size_t sLocalParts = numParticles*sizeof(CompactPartData);
+  size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
+
+  memcpy(bufLocalMoments, localMoments.getVec(), sLocalMoments);
+#ifdef HAPI_TRACE
+  traceUserBracketEvent(SER_LOCAL_MEMCPY, starttime, CmiWallTimer());
+#endif
+
+  allocatePinnedHostMemory((void **)&bufLocalVars, sLocalVars);
+
   // Transfer moments and particle cores to gpu
-  DataManagerTransferLocalTree(bufLocalMoments, sMoments, bufLocalParts,
-                               sCompactParts, bufLocalVars, sVarParts,
+  DataManagerTransferLocalTree(bufLocalMoments, sLocalMoments, bufLocalParts,
+                               sLocalParts, bufLocalVars, sLocalVars,
 			       (void **)&d_localMoments, (void **)&d_localParts, (void **)&d_localVars,
-			       streams[0],
+			       streams[0], numParticles,
                                localTransferCallback);
-}// end serializeLocal
+}
 
 #ifdef GPU_LOCAL_TREE_WALK
 // Add more information to each Moment, basically transform moment to a computable tree node

diff --git a/DataManager.h b/DataManager.h
@@ -103,11 +103,13 @@ class DataManager : public CBase_DataManager {
         // * either do not concern yourself with cached particles
         // * or for each entry, get key, find bucket node in CM, DM or TPs and get number
         // for now, the former
-
         std::map<NodeKey, int> cachedPartsOnGpu;
         // local particles that have been copied to the gpu
         //std::map<NodeKey, int> localPartsOnGpu;
 
+        // TreePiece counter for multi-threaded GPU host buffer copy
+	int treePiecesBufferFilled;
+
         // can the gpu accept a chunk of remote particles/nodes?
         bool gpuFree;
 
@@ -126,6 +128,8 @@ class DataManager : public CBase_DataManager {
         /// host buffer to transfer remote particles to GPU
         CompactPartData *bufRemoteParts;
 
+        /// Vector to accumulate localMoments for transfering to GPU
+        CkVec<CudaMultipoleMoments> localMoments;
         /// host buffer to transfer local moments to GPU
         CudaMultipoleMoments *bufLocalMoments;
         /// host buffer to transfer local particles to GPU
@@ -191,6 +195,7 @@ class DataManager : public CBase_DataManager {
         // actual serialization methods
         PendingBuffers *serializeRemoteChunk(GenericTreeNode *);
 	void serializeLocal(GenericTreeNode *);
+	void transferLocalToGPU(int nParts, GenericTreeNode *node);
         void freeLocalTreeMemory();
         void freeRemoteChunkMemory(int chunk);
         void transferParticleVarsBack();

diff --git a/HostCUDA.cu b/HostCUDA.cu
@@ -115,11 +115,12 @@ void freeDeviceMemory(void *ptr){
 /// @param d_compactParts Uninitalized pointer to particles on GPU
 /// @param d_varParts Uninitalized pointer to accelerations on GPU
 /// @param stream CUDA stream to handle the memory transfer
+/// @param numParticles Total number of particle accelerations to initalize
 void DataManagerTransferLocalTree(void *moments, size_t sMoments,
                                   void *compactParts, size_t sCompactParts,
                                   void *varParts, size_t sVarParts,
 				  void **d_localMoments, void **d_compactParts, void **d_varParts,
-				  cudaStream_t stream,
+				  cudaStream_t stream, int numParticles,
                                   void *callback) {
 
 #ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
@@ -141,6 +142,11 @@ void DataManagerTransferLocalTree(void *moments, size_t sMoments,
   cudaChk(cudaMemcpyAsync(*d_compactParts, compactParts, sCompactParts, cudaMemcpyHostToDevice, stream));
   cudaChk(cudaMemcpyAsync(*d_varParts, varParts, sVarParts, cudaMemcpyHostToDevice, stream));
 
+  ZeroVars<<<numParticles / THREADS_PER_BLOCK + 1, dim3(THREADS_PER_BLOCK), 0, stream>>>(
+      (VariablePartData *) *d_varParts,
+      numParticles);
+  cudaChk(cudaPeekAtLastError());
+
   cudaStreamSynchronize(stream);
   HAPI_TRACE_END(CUDA_XFER_LOCAL);
 
@@ -2202,3 +2208,16 @@ __global__ void EwaldKernel(CompactPartData *particleCores,
 
   return;
 }
+
+// initialize accelerations and potentials to zero
+__global__ void ZeroVars(VariablePartData *particleVars, int nVars) {
+    int id;
+    id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+    if(id >= nVars) return;
+
+    particleVars[id].a.x = 0.0;
+    particleVars[id].a.y = 0.0;
+    particleVars[id].a.z = 0.0;
+    particleVars[id].potential = 0.0;
+    particleVars[id].dtGrav = 0.0;
+}
diff --git a/HostCUDA.h b/HostCUDA.h
@@ -104,7 +104,7 @@ void DataManagerTransferLocalTree(void *moments, size_t sMoments,
                                   void *compactParts, size_t sCompactParts,
                                   void *varParts, size_t sVarParts,
 				  void **d_localMoments, void **d_compactParts, void **d_varParts,
-				  cudaStream_t stream,
+				  cudaStream_t stream, int numParticles,
                                   void *callback);
 void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
                                   void *compactParts, size_t sCompactParts,

diff --git a/ParallelGravity.ci b/ParallelGravity.ci
@@ -54,6 +54,10 @@ mainmodule ParallelGravity {
   readonly int remoteResumePartsPerReq;
   readonly double largePhaseThreshold;
 
+  readonly int START_REG;
+  readonly int START_IB;
+  readonly int START_PW;
+
   readonly int boundaryEvaluationUE;
   readonly int weightBalanceUE;
   readonly int networkProgressUE;
@@ -536,6 +540,9 @@ mainmodule ParallelGravity {
     // jetley
 #ifdef CUDA
     entry void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
+    entry void fillGPUBuffer(intptr_t bufLocalParts,
+                             intptr_t bufLocalMoments,
+                             intptr_t pLocalMoments, int partIndex, int nParts, intptr_t node);
     entry void updateParticles(intptr_t data, int partIndex);
 #else
     entry void continueStartRemoteChunk(int chunk);

diff --git a/ParallelGravity.cpp b/ParallelGravity.cpp
@@ -149,6 +149,9 @@ cosmoType thetaMono;               ///< Criterion of excepting monopole
 
 /// @brief Boundary evaluation user event (for Projections tracing).
 int boundaryEvaluationUE;
+int START_REG;
+int START_IB;
+int START_PW;
 /// @brief Weight balancing during Oct decomposition user event (for Projections tracing).
 int weightBalanceUE;
 int networkProgressUE;
@@ -232,6 +235,9 @@ Main::Main(CkArgMsg* m) {
 	// Floating point exceptions.
 	// feenableexcept(FE_OVERFLOW | FE_DIVBYZERO | FE_INVALID);
 
+        START_REG = traceRegisterUserEvent("Register");
+        START_IB = traceRegisterUserEvent("Init Buckets");
+        START_PW = traceRegisterUserEvent("Prefetch Walk");
         boundaryEvaluationUE = traceRegisterUserEvent("Evaluating Boudaries");
         weightBalanceUE = traceRegisterUserEvent("Weight Balancer");
         networkProgressUE = traceRegisterUserEvent("CmiNetworkProgress");

diff --git a/ParallelGravity.h b/ParallelGravity.h
@@ -175,6 +175,10 @@ extern CProxy_CkCacheManager<KeyType> cacheNode;
 /// The group ID of your DataManager.  You must set this!
 extern CkGroupID dataManagerID;
 
+extern int START_REG;
+extern int START_IB;
+extern int START_PW;
+
 extern int boundaryEvaluationUE;
 extern int weightBalanceUE;
 extern int networkProgressUE;
@@ -1009,7 +1013,10 @@ class TreePiece : public CBase_TreePiece {
 #endif
 
 #ifdef CUDA
-	void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
+       void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
+       void fillGPUBuffer(intptr_t bufLocalParts,
+                          intptr_t bufLocalMoments,
+                          intptr_t pLocalMoments, int partIndex, int nParts, intptr_t node);
         void updateParticles(intptr_t data, int partIndex);
 #else
         void continueStartRemoteChunk(int chunk);