Trq/gpu multicopy #73

Merged: 10 commits, Mar 1, 2024
2 changes: 2 additions & 0 deletions CudaFunctions.h
@@ -34,6 +34,8 @@ __global__ void nodeGravityComputation(
cudatype fperiod
);

__global__ void ZeroVars(VariablePartData *particleVars, int nVars);

#ifdef CUDA_2D_TB_KERNEL
__global__ void particleGravityComputation(
CompactPartData *targetCores,
133 changes: 62 additions & 71 deletions DataManager.cpp
@@ -491,15 +491,16 @@ void DataManager::serializeLocalTree(){
#endif
if(treePiecesDone == registeredTreePieces.length()){
treePiecesDone = 0;
CmiUnlock(__nodelock);

if(verbosity > 1)
CkPrintf("[%d] Registered tree pieces length: %lu\n", CkMyPe(), registeredTreePieces.length());
serializeLocal(root);
if(verbosity > 1)
CkPrintf("[%d] Registered tree pieces length after serialize local: %lu\n", CkMyPe(), registeredTreePieces.length());
}
CmiUnlock(__nodelock);

else
CmiUnlock(__nodelock);
}
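Note on the hunk above: the standalone CmiUnlock(__nodelock) after the if block is the old code path; the PR replaces it with an unlock inside the if (before the call) plus an else branch, so the node lock is never held across the expensive serializeLocal() call. A minimal sketch of the resulting "last arriver does the work" pattern, assuming treePiecesDone is incremented under the lock just above the shown context:

CmiLock(__nodelock);
treePiecesDone++;
if(treePiecesDone == registeredTreePieces.length()){
    treePiecesDone = 0;
    CmiUnlock(__nodelock);   // release before the long serialization
    serializeLocal(root);    // heavy work now runs outside the lock
}
else
    CmiUnlock(__nodelock);   // not the last piece: just release and return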

/// @brief Callback from local data transfer to GPU
@@ -662,8 +663,6 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
int numTreePieces = registeredTreePieces.length();
int numNodes = 0;
int numParticles = 0;
int numCachedNodes = 0;
int numCachedParticles = 0;
int totalNumBuckets = 0;

cacheType *wholeNodeCache = cacheNode.ckLocalBranch()->getCache();
Expand Down Expand Up @@ -780,16 +779,14 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
}// end serializeNodes

/// @brief gather local nodes and particles and send to GPU
/// @param node Root of tree to walk.
void DataManager::serializeLocal(GenericTreeNode *node){
/// @param nodeRoot Root of tree to walk.
void DataManager::serializeLocal(GenericTreeNode *nodeRoot){
/// queue for breadth first treewalk.
CkQ<GenericTreeNode *> queue;

int numTreePieces = registeredTreePieces.length();
int numNodes = 0;
int numParticles = 0;
int numCachedNodes = 0;
int numCachedParticles = 0;

for(int i = 0; i < numTreePieces; i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
@@ -798,11 +795,7 @@ void DataManager::serializeLocal(GenericTreeNode *node){
}
numNodes -= cumNumReplicatedNodes;

CkVec<CudaMultipoleMoments> localMoments;
CkVec<CompactPartData> localParticles;

localMoments.reserve(numNodes);
localParticles.resize(numParticles);

localMoments.length() = 0;

@@ -815,8 +808,9 @@ void DataManager::serializeLocal(GenericTreeNode *node){
CkPrintf("[%d] DM local tree\n", CkMyPe());
CkPrintf("*************\n");
#endif
double starttime = CmiWallTimer();
// Walk local tree
queue.enq(node);
queue.enq(nodeRoot);
while(!queue.isEmpty()){
GenericTreeNode *node = queue.deq();
NodeType type = node->getType();
@@ -849,50 +843,14 @@ void DataManager::serializeLocal(GenericTreeNode *node){
}
}// end while queue not empty

#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_WALK, starttime, CmiWallTimer());
#endif

// used later, when copying particle vars back to the host
savedNumTotalParticles = numParticles;
savedNumTotalNodes = localMoments.length();

for(int i = 0; i < registeredTreePieces.length(); i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
tp->getDMParticles(localParticles.getVec(), partIndex);
}

#ifdef GPU_LOCAL_TREE_WALK
// set proper active bucketStart and bucketSize for each bucketNode
for(int i = 0; i < numTreePieces; i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
// set the bucketStart and bucketSize for each bucket Node
if (tp->largePhase()) {
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
localMoments[id].bucketStart = bucketNode->bucketArrayIndex;
localMoments[id].bucketSize = bucketNode->lastParticle - bucketNode->firstParticle + 1;
}
} else {
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
localMoments[id].bucketStart = tp->bucketActiveInfo[id].start;
localMoments[id].bucketSize = tp->bucketActiveInfo[id].size;
}
}

// tell each particle which node it belongs to
CompactPartData *localParicalsVec = localParticles.getVec();
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
int start = localMoments[id].bucketStart;
int end = start + localMoments[id].bucketSize;
for (int k = start; k < end; k ++) {
localParicalsVec[k].nodeId = id;
}
}
}
transformLocalTreeRecursive(node, localMoments);
#endif //GPU_LOCAL_TREE_WALK

#ifdef CUDA_DM_PRINT_TREES
CkPrintf("*************\n");
@@ -901,41 +859,74 @@ void DataManager::serializeLocal(GenericTreeNode *node){
#if COSMO_PRINT_BK > 1
CkPrintf("(%d): DM->GPU local tree\n", CkMyPe());
#endif
size_t sLocalParts = numParticles*sizeof(CompactPartData);
size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
allocatePinnedHostMemory((void **)&bufLocalParts, sLocalParts);
allocatePinnedHostMemory((void **)&bufLocalMoments, sLocalMoments);

int pTPindex = 0;
treePiecesBufferFilled = 0;
for(int i = 0; i < numTreePieces; i++){
treePieces[registeredTreePieces[i].treePiece->getIndex()].fillGPUBuffer((intptr_t) bufLocalParts,
(intptr_t) bufLocalMoments, (intptr_t) localMoments.getVec(), pTPindex,
numParticles, (intptr_t) nodeRoot);
pTPindex += registeredTreePieces[i].treePiece->getDMNumParticles();
}
}
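With this change, each registered TreePiece copies its own particles into the shared pinned host buffer at a running offset (pTPindex), instead of the DataManager gathering them via getDMParticles() as the removed loop above did. The TreePiece side is not part of this diff; a hypothetical sketch of what fillGPUBuffer() is expected to do with these arguments (names and proxy syntax are assumptions):

void TreePiece::fillGPUBuffer(intptr_t bufLocalParts, intptr_t bufLocalMoments,
                              intptr_t pLocalMoments, int partIndex,
                              int nParts, intptr_t node) {
    CompactPartData *parts = (CompactPartData *) bufLocalParts;
    // copy this piece's particles into its slice of the shared buffer,
    // starting at the offset the DataManager computed for it
    getDMParticles(parts, partIndex);
    // tell the node-local DataManager one more piece has finished;
    // the last arrival triggers the actual host-to-device transfer
    dMProxy[CkMyNode()].transferLocalToGPU(nParts, (GenericTreeNode *) node);
}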

///
/// @brief After all pieces have filled the buffer, initiate the transfer.
/// @param numParticles total number of particles on this node
/// @param node root of tree
///
void DataManager::transferLocalToGPU(int numParticles, GenericTreeNode *node)
{
CmiLock(__nodelock);
treePiecesBufferFilled++;
if(treePiecesBufferFilled == registeredTreePieces.length()){
treePiecesBufferFilled = 0;
CmiUnlock(__nodelock);
}
else {
CmiUnlock(__nodelock);
return;
}

double starttime = CmiWallTimer();
#ifdef GPU_LOCAL_TREE_WALK
transformLocalTreeRecursive(node, localMoments);
#endif //GPU_LOCAL_TREE_WALK
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_TRANS, starttime, CmiWallTimer());
#endif

localTransferCallback
= new CkCallback(CkIndex_DataManager::startLocalWalk(), CkMyNode(), dMProxy);

// XXX copies can be saved here.
starttime = CmiWallTimer();
size_t sLocalVars = numParticles*sizeof(VariablePartData);
size_t sLocalParts = numParticles*sizeof(CompactPartData);

size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
allocatePinnedHostMemory((void **)&bufLocalMoments, sLocalMoments);
memcpy(bufLocalMoments, localMoments.getVec(), sLocalMoments);
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_MEMCPY, starttime, CmiWallTimer());
#endif

size_t sLocalParts = localParticles.length()*sizeof(CompactPartData);
allocatePinnedHostMemory((void **)&bufLocalParts, sLocalParts);
memcpy(bufLocalParts, localParticles.getVec(), sLocalParts);

size_t sLocalVars = localParticles.length()*sizeof(VariablePartData);
allocatePinnedHostMemory((void **)&bufLocalVars, sLocalVars);
VariablePartData *zeroArray = (VariablePartData *) bufLocalVars;
// XXX This could be done on the GPU.
for(int i = 0; i < numParticles; i++){
zeroArray[i].a.x = 0.0;
zeroArray[i].a.y = 0.0;
zeroArray[i].a.z = 0.0;
zeroArray[i].potential = 0.0;
zeroArray[i].dtGrav = 0.0;
}
// Transfer moments and particle cores to gpu

#ifdef HAPI_INSTRUMENT_WRS
DataManagerTransferLocalTree(bufLocalMoments, sLocalMoments, bufLocalParts,
sLocalParts, bufLocalVars, sLocalVars, 0,
sLocalParts, bufLocalVars, sLocalVars, 0, numParticles,
activeRung, localTransferCallback);
#else
DataManagerTransferLocalTree(bufLocalMoments, sLocalMoments, bufLocalParts,
sLocalParts, bufLocalVars, sLocalVars,
sLocalParts, bufLocalVars, sLocalVars, numParticles,
CkMyPe(), localTransferCallback);
#endif
}// end serializeLocal
}
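transferLocalToGPU() reuses the same gating pattern as serializeLocalTree(): every piece bumps treePiecesBufferFilled under the node lock, and only the last one proceeds to the transfer. The staging buffers come from allocatePinnedHostMemory() because page-locked memory is what lets the host-to-device copies run asynchronously. A standalone sketch of the underlying CUDA idiom, with illustrative names and assuming a valid cudaStream_t stream and element count nParts:

size_t bytes = nParts * sizeof(CompactPartData);
void *hostBuf, *devBuf;
cudaChk(cudaHostAlloc(&hostBuf, bytes, cudaHostAllocDefault)); // page-locked
cudaChk(cudaMalloc(&devBuf, bytes));
// ... CPU fills hostBuf ...
cudaChk(cudaMemcpyAsync(devBuf, hostBuf, bytes,
                        cudaMemcpyHostToDevice, stream));      // can overlap other work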

#ifdef GPU_LOCAL_TREE_WALK
// Add more information to each Moment, basically transform moment to a computable tree node
7 changes: 6 additions & 1 deletion DataManager.h
@@ -104,11 +104,13 @@ class DataManager : public CBase_DataManager {
// * either do not concern yourself with cached particles
// * or for each entry, get key, find bucket node in CM, DM or TPs and get number
// for now, the former

std::map<NodeKey, int> cachedPartsOnGpu;
// local particles that have been copied to the gpu
//std::map<NodeKey, int> localPartsOnGpu;

// TreePiece counter for multi-threaded GPU host buffer copy
int treePiecesBufferFilled;

// can the gpu accept a chunk of remote particles/nodes?
bool gpuFree;

@@ -129,6 +131,8 @@ class DataManager : public CBase_DataManager {
/// host buffer to transfer remote particles to GPU
CompactPartData *bufRemoteParts;

/// Vector to accumulate localMoments for transferring to GPU
CkVec<CudaMultipoleMoments> localMoments;
/// host buffer to transfer local moments to GPU
CudaMultipoleMoments *bufLocalMoments;
/// host buffer to transfer local particles to GPU
@@ -184,6 +188,7 @@ class DataManager : public CBase_DataManager {
// actual serialization methods
PendingBuffers *serializeRemoteChunk(GenericTreeNode *);
void serializeLocal(GenericTreeNode *);
void transferLocalToGPU(int nParts, GenericTreeNode *node);
void freeLocalTreeMemory();
void freeRemoteChunkMemory(int chunk);
void transferParticleVarsBack();
24 changes: 22 additions & 2 deletions HostCUDA.cu
@@ -113,6 +113,10 @@ void run_DM_TRANSFER_LOCAL(hapiWorkRequest *wr, cudaStream_t kernel_stream,void*
printf("cores: 0x%x\n", devBuffers[LOCAL_PARTICLE_CORES]);
printf("vars: 0x%x\n", devBuffers[LOCAL_PARTICLE_VARS]);
#endif
ZeroVars<<<wr->grid_dim, wr->block_dim, wr->shared_mem, kernel_stream>>>(
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
wr->buffers[LOCAL_PARTICLE_VARS_IDX].size/sizeof(VariablePartData));
cudaChk(cudaPeekAtLastError());
}
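This launch takes over the zeroing of accelerations and potentials that the removed CPU loop in DataManager.cpp used to do on the pinned host buffer. Note that cudaPeekAtLastError() after an asynchronous launch only catches configuration errors; faults inside the kernel surface at the next synchronization. The general idiom, with a placeholder kernel name:

someKernel<<<grid, block, shmem, stream>>>(/* args */);
cudaChk(cudaPeekAtLastError());            // bad launch configs caught here
// kernel-side faults appear on the next sync, e.g.:
// cudaChk(cudaStreamSynchronize(stream));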

void run_DM_TRANSFER_REMOTE_CHUNK(hapiWorkRequest *wr, cudaStream_t kernel_stream,void** devBuffers) {
@@ -147,12 +151,12 @@ void run_DM_TRANSFER_FREE_LOCAL(hapiWorkRequest *wr, cudaStream_t kernel_stream,
#ifdef HAPI_INSTRUMENT_WRS
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, char phase, void *wrCallback) {
#else
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, void *wrCallback) {
#endif

@@ -168,6 +172,9 @@ void DataManagerTransferLocalTree(void *moments, size_t sMoments,
transferKernel->addBuffer(varParts, sVarParts, (sVarParts > 0), false,
false, LOCAL_PARTICLE_VARS);

// +1 to avoid dimGrid = 0 when we run tests with extremely small inputs.
transferKernel->setExecParams(numParticles / THREADS_PER_BLOCK + 1, dim3(THREADS_PER_BLOCK));

transferKernel->setDeviceToHostCallback(*(CkCallback *)wrCallback);
#ifdef HAPI_TRACE
transferKernel->setTraceName("xferLocal");
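On the grid sizing above: numParticles / THREADS_PER_BLOCK + 1 guarantees at least one block even for tiny (or empty) inputs, at the cost of up to one fully idle extra block. The usual ceil-division form, shown for comparison only, would launch zero blocks when numParticles == 0, which is an invalid configuration:

int dimGrid = (numParticles + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;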
Expand Down Expand Up @@ -2661,3 +2668,16 @@ __global__ void EwaldKernel(CompactPartData *particleCores,

return;
}

// initialize accelerations and potentials to zero
__global__ void ZeroVars(VariablePartData *particleVars, int nVars) {
int id;
id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
if(id >= nVars) return;

particleVars[id].a.x = 0.0;
particleVars[id].a.y = 0.0;
particleVars[id].a.z = 0.0;
particleVars[id].potential = 0.0;
particleVars[id].dtGrav = 0.0;
}
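One coupling worth noting: ZeroVars indexes with the compile-time BLOCK_SIZE, while the launch in run_DM_TRANSFER_LOCAL uses the block dimension set from THREADS_PER_BLOCK, so the indexing is correct only while those two constants agree. Computing the index from the runtime block size would remove that assumption; a one-line alternative (not part of this PR):

int id = blockIdx.x * blockDim.x + threadIdx.x;  // follows the actual launch block size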
4 changes: 2 additions & 2 deletions HostCUDA.h
@@ -178,7 +178,7 @@ void freePinnedHostMemory(void *);
#ifdef HAPI_INSTRUMENT_WRS
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, char phase, void *wrCallback);
void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
@@ -190,7 +190,7 @@ void TransferParticleVarsBack(VariablePartData *hostBuffer, size_t size, void *c
#else
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, void *wrCallback);
void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
7 changes: 7 additions & 0 deletions ParallelGravity.ci
@@ -54,6 +54,10 @@ mainmodule ParallelGravity {
readonly int remoteResumePartsPerReq;
readonly double largePhaseThreshold;

readonly int START_REG;
readonly int START_IB;
readonly int START_PW;

readonly int boundaryEvaluationUE;
readonly int weightBalanceUE;
readonly int networkProgressUE;
@@ -524,6 +528,9 @@ mainmodule ParallelGravity {
// jetley
entry void continueStartRemoteChunk(int chunk);
#ifdef CUDA
entry void fillGPUBuffer(intptr_t bufLocalParts,
intptr_t bufLocalMoments,
intptr_t pLocalMoments, int partIndex, int nParts, intptr_t node);
entry void updateParticles(intptr_t data, int partIndex);
#endif
entry void continueWrapUp();
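The new fillGPUBuffer entry passes raw host pointers as intptr_t. That is safe here only because the call stays on the node that allocated the buffers, so the addresses remain valid in the receiver's address space. The cast-through pattern, with illustrative names:

// sender (same node): pointer -> integer for entry-method marshalling
intptr_t h = (intptr_t) bufLocalParts;
// receiver: integer -> pointer; valid only within the same process
CompactPartData *parts = reinterpret_cast<CompactPartData *>(h);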
10 changes: 10 additions & 0 deletions ParallelGravity.cpp
@@ -145,6 +145,9 @@ cosmoType thetaMono; ///< Criterion of excepting monopole

/// @brief Boundary evaluation user event (for Projections tracing).
int boundaryEvaluationUE;
int START_REG;
int START_IB;
int START_PW;
/// @brief Weight balancing during Oct decomposition user event (for Projections tracing).
int weightBalanceUE;
int networkProgressUE;
@@ -228,6 +231,9 @@ Main::Main(CkArgMsg* m) {
// Floating point exceptions.
// feenableexcept(FE_OVERFLOW | FE_DIVBYZERO | FE_INVALID);

START_REG = traceRegisterUserEvent("Register");
START_IB = traceRegisterUserEvent("Init Buckets");
START_PW = traceRegisterUserEvent("Prefetch Walk");
boundaryEvaluationUE = traceRegisterUserEvent("Evaluating Boundaries");
weightBalanceUE = traceRegisterUserEvent("Weight Balancer");
networkProgressUE = traceRegisterUserEvent("CmiNetworkProgress");
Expand All @@ -236,6 +242,10 @@ Main::Main(CkArgMsg* m) {
#ifdef HAPI_TRACE
traceRegisterUserEvent("Tree Serialization", CUDA_SER_TREE);
traceRegisterUserEvent("List Serialization", CUDA_SER_LIST);
traceRegisterUserEvent("Ser Local Walk", SER_LOCAL_WALK);
traceRegisterUserEvent("Ser Local Memcpy", SER_LOCAL_MEMCPY);
traceRegisterUserEvent("Ser Local Zero", SER_LOCAL_ZERO);
traceRegisterUserEvent("Ser Local Trans", SER_LOCAL_TRANS);

traceRegisterUserEvent("Local Node", CUDA_LOCAL_NODE_KERNEL);
traceRegisterUserEvent("Remote Node", CUDA_REMOTE_NODE_KERNEL);