Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into nowr
Browse files Browse the repository at this point in the history
Incorporate gpu_multicopy changes
  • Loading branch information
spencerw committed Mar 2, 2024
2 parents 0ca8444 + 2ff770b commit 611ed4b
Show file tree
Hide file tree
Showing 11 changed files with 225 additions and 89 deletions.
2 changes: 2 additions & 0 deletions CudaFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ __global__ void nodeGravityComputation(
cudatype fperiod
);

__global__ void ZeroVars(VariablePartData *particleVars, int nVars);

#ifdef CUDA_2D_TB_KERNEL
__global__ void particleGravityComputation(
CompactPartData *targetCores,
Expand Down
143 changes: 64 additions & 79 deletions DataManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,15 +476,16 @@ void DataManager::serializeLocalTree(){
#endif
if(treePiecesDone == registeredTreePieces.length()){
treePiecesDone = 0;
CmiUnlock(__nodelock);

if(verbosity > 1)
CkPrintf("[%d] Registered tree pieces length: %lu\n", CkMyPe(), registeredTreePieces.length());
serializeLocal(root);
if(verbosity > 1)
CkPrintf("[%d] Registered tree pieces length after serialize local: %lu\n", CkMyPe(), registeredTreePieces.length());
}
CmiUnlock(__nodelock);

else
CmiUnlock(__nodelock);
}

/// @brief Callback from local data transfer to GPU
Expand Down Expand Up @@ -649,8 +650,6 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
int numTreePieces = registeredTreePieces.length();
int numNodes = 0;
int numParticles = 0;
int numCachedNodes = 0;
int numCachedParticles = 0;
int totalNumBuckets = 0;

cacheType *wholeNodeCache = cacheNode.ckLocalBranch()->getCache();
Expand Down Expand Up @@ -767,16 +766,14 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
}// end serializeNodes

/// @brief gather local nodes and particles and send to GPU
/// @param node Root of tree to walk.
void DataManager::serializeLocal(GenericTreeNode *node){
/// @param nodeRoot Root of tree to walk.
void DataManager::serializeLocal(GenericTreeNode *nodeRoot){
/// queue for breadth first treewalk.
CkQ<GenericTreeNode *> queue;

int numTreePieces = registeredTreePieces.length();
int numNodes = 0;
int numParticles = 0;
int numCachedNodes = 0;
int numCachedParticles = 0;

for(int i = 0; i < numTreePieces; i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
Expand All @@ -785,11 +782,7 @@ void DataManager::serializeLocal(GenericTreeNode *node){
}
numNodes -= cumNumReplicatedNodes;

CkVec<CudaMultipoleMoments> localMoments;
CkVec<CompactPartData> localParticles;

localMoments.reserve(numNodes);
localParticles.resize(numParticles);

localMoments.length() = 0;

Expand All @@ -804,7 +797,7 @@ void DataManager::serializeLocal(GenericTreeNode *node){
#endif
double starttime = CmiWallTimer();
// Walk local tree
queue.enq(node);
queue.enq(nodeRoot);
while(!queue.isEmpty()){
GenericTreeNode *node = queue.deq();
NodeType type = node->getType();
Expand Down Expand Up @@ -837,93 +830,85 @@ void DataManager::serializeLocal(GenericTreeNode *node){
}
}// end while queue not empty

// here
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_WALK, starttime, CmiWallTimer());
#endif

// used later, when copying particle vars back to the host
savedNumTotalParticles = numParticles;
savedNumTotalNodes = localMoments.length();

for(int i = 0; i < registeredTreePieces.length(); i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
tp->getDMParticles(localParticles.getVec(), partIndex);
}
#ifdef CUDA_DM_PRINT_TREES
CkPrintf("*************\n");
#endif
CkAssert(partIndex == numParticles);
#if COSMO_PRINT_BK > 1
CkPrintf("(%d): DM->GPU local tree\n", CkMyPe());
#endif
size_t sLocalParts = numParticles*sizeof(CompactPartData);
size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
allocatePinnedHostMemory((void **)&bufLocalParts, sLocalParts);
allocatePinnedHostMemory((void **)&bufLocalMoments, sLocalMoments);

#ifdef GPU_LOCAL_TREE_WALK
// set proper active bucketStart and bucketSize for each bucketNode
int pTPindex = 0;
treePiecesBufferFilled = 0;
for(int i = 0; i < numTreePieces; i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
// set the bucketStart and bucketSize for each bucket Node
if (tp->largePhase()) {
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
localMoments[id].bucketStart = bucketNode->bucketArrayIndex;
localMoments[id].bucketSize = bucketNode->lastParticle - bucketNode->firstParticle + 1;
treePieces[registeredTreePieces[i].treePiece->getIndex()].fillGPUBuffer((intptr_t) bufLocalParts,
(intptr_t) bufLocalMoments, (intptr_t) localMoments.getVec(), pTPindex,
numParticles, (intptr_t) nodeRoot);
pTPindex += registeredTreePieces[i].treePiece->getDMNumParticles();
}
} else {
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
localMoments[id].bucketStart = tp->bucketActiveInfo[id].start;
localMoments[id].bucketSize = tp->bucketActiveInfo[id].size;
}
}
}

// tell each particle which node it belongs to
CompactPartData *localParicalsVec = localParticles.getVec();
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
int start = localMoments[id].bucketStart;
int end = start + localMoments[id].bucketSize;
for (int k = start; k < end; k ++) {
localParicalsVec[k].nodeId = id;
}
///
/// @brief After all pieces have filled the buffer, initiate the transfer.
/// @param numParticles total number of particles on this node
/// @param node root of tree
///
void DataManager::transferLocalToGPU(int numParticles, GenericTreeNode *node)
{
CmiLock(__nodelock);
treePiecesBufferFilled++;
if(treePiecesBufferFilled == registeredTreePieces.length()){
treePiecesBufferFilled = 0;
CmiUnlock(__nodelock);
}
}
// here
else {
CmiUnlock(__nodelock);
return;
}

double starttime = CmiWallTimer();
#ifdef GPU_LOCAL_TREE_WALK
transformLocalTreeRecursive(node, localMoments);
#endif //GPU_LOCAL_TREE_WALK
starttime = CmiWallTimer();

#ifdef CUDA_DM_PRINT_TREES
CkPrintf("*************\n");
#endif
CkAssert(partIndex == numParticles);
#if COSMO_PRINT_BK > 1
CkPrintf("(%d): DM->GPU local tree\n", CkMyPe());
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_TRANSFORM, starttime, CmiWallTimer());
#endif

localTransferCallback
= new CkCallback(CkIndex_DataManager::startLocalWalk(), CkMyNode(), dMProxy);

// XXX copies can be saved here.
sMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
allocatePinnedHostMemory((void **)&bufLocalMoments, sMoments);
memcpy(bufLocalMoments, localMoments.getVec(), sMoments);

sCompactParts = localParticles.length()*sizeof(CompactPartData);
allocatePinnedHostMemory((void **)&bufLocalParts, sCompactParts);
memcpy(bufLocalParts, localParticles.getVec(), sCompactParts);

sVarParts = localParticles.length()*sizeof(VariablePartData);
allocatePinnedHostMemory((void **)&bufLocalVars, sVarParts);
VariablePartData *zeroArray = (VariablePartData *) bufLocalVars;
// XXX This could be done on the GPU.
for(int i = 0; i < numParticles; i++){
zeroArray[i].a.x = 0.0;
zeroArray[i].a.y = 0.0;
zeroArray[i].a.z = 0.0;
zeroArray[i].potential = 0.0;
zeroArray[i].dtGrav = 0.0;
}
starttime = CmiWallTimer();
size_t sLocalVars = numParticles*sizeof(VariablePartData);
size_t sLocalParts = numParticles*sizeof(CompactPartData);
size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);

memcpy(bufLocalMoments, localMoments.getVec(), sLocalMoments);
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_MEMCPY, starttime, CmiWallTimer());
#endif

allocatePinnedHostMemory((void **)&bufLocalVars, sLocalVars);

// Transfer moments and particle cores to gpu
DataManagerTransferLocalTree(bufLocalMoments, sMoments, bufLocalParts,
sCompactParts, bufLocalVars, sVarParts,
DataManagerTransferLocalTree(bufLocalMoments, sLocalMoments, bufLocalParts,
sLocalParts, bufLocalVars, sLocalVars,
(void **)&d_localMoments, (void **)&d_localParts, (void **)&d_localVars,
streams[0],
streams[0], numParticles,
localTransferCallback);
}// end serializeLocal
}

#ifdef GPU_LOCAL_TREE_WALK
// Add more information to each Moment, basically transform moment to a computable tree node
Expand Down
7 changes: 6 additions & 1 deletion DataManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,13 @@ class DataManager : public CBase_DataManager {
// * either do not concern yourself with cached particles
// * or for each entry, get key, find bucket node in CM, DM or TPs and get number
// for now, the former

std::map<NodeKey, int> cachedPartsOnGpu;
// local particles that have been copied to the gpu
//std::map<NodeKey, int> localPartsOnGpu;

// TreePiece counter for multi-threaded GPU host buffer copy
int treePiecesBufferFilled;

// can the gpu accept a chunk of remote particles/nodes?
bool gpuFree;

Expand All @@ -126,6 +128,8 @@ class DataManager : public CBase_DataManager {
/// host buffer to transfer remote particles to GPU
CompactPartData *bufRemoteParts;

/// Vector to accumulate localMoments for transfering to GPU
CkVec<CudaMultipoleMoments> localMoments;
/// host buffer to transfer local moments to GPU
CudaMultipoleMoments *bufLocalMoments;
/// host buffer to transfer local particles to GPU
Expand Down Expand Up @@ -191,6 +195,7 @@ class DataManager : public CBase_DataManager {
// actual serialization methods
PendingBuffers *serializeRemoteChunk(GenericTreeNode *);
void serializeLocal(GenericTreeNode *);
void transferLocalToGPU(int nParts, GenericTreeNode *node);
void freeLocalTreeMemory();
void freeRemoteChunkMemory(int chunk);
void transferParticleVarsBack();
Expand Down
21 changes: 20 additions & 1 deletion HostCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,12 @@ void freeDeviceMemory(void *ptr){
/// @param d_compactParts Uninitalized pointer to particles on GPU
/// @param d_varParts Uninitalized pointer to accelerations on GPU
/// @param stream CUDA stream to handle the memory transfer
/// @param numParticles Total number of particle accelerations to initalize
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void **d_localMoments, void **d_compactParts, void **d_varParts,
cudaStream_t stream,
cudaStream_t stream, int numParticles,
void *callback) {

#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
Expand All @@ -141,6 +142,11 @@ void DataManagerTransferLocalTree(void *moments, size_t sMoments,
cudaChk(cudaMemcpyAsync(*d_compactParts, compactParts, sCompactParts, cudaMemcpyHostToDevice, stream));
cudaChk(cudaMemcpyAsync(*d_varParts, varParts, sVarParts, cudaMemcpyHostToDevice, stream));

ZeroVars<<<numParticles / THREADS_PER_BLOCK + 1, dim3(THREADS_PER_BLOCK), 0, stream>>>(
(VariablePartData *) *d_varParts,
numParticles);
cudaChk(cudaPeekAtLastError());

cudaStreamSynchronize(stream);
HAPI_TRACE_END(CUDA_XFER_LOCAL);

Expand Down Expand Up @@ -2202,3 +2208,16 @@ __global__ void EwaldKernel(CompactPartData *particleCores,

return;
}

// initialize accelerations and potentials to zero
__global__ void ZeroVars(VariablePartData *particleVars, int nVars) {
int id;
id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
if(id >= nVars) return;

particleVars[id].a.x = 0.0;
particleVars[id].a.y = 0.0;
particleVars[id].a.z = 0.0;
particleVars[id].potential = 0.0;
particleVars[id].dtGrav = 0.0;
}
2 changes: 1 addition & 1 deletion HostCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void **d_localMoments, void **d_compactParts, void **d_varParts,
cudaStream_t stream,
cudaStream_t stream, int numParticles,
void *callback);
void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
Expand Down
7 changes: 7 additions & 0 deletions ParallelGravity.ci
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ mainmodule ParallelGravity {
readonly int remoteResumePartsPerReq;
readonly double largePhaseThreshold;

readonly int START_REG;
readonly int START_IB;
readonly int START_PW;

readonly int boundaryEvaluationUE;
readonly int weightBalanceUE;
readonly int networkProgressUE;
Expand Down Expand Up @@ -536,6 +540,9 @@ mainmodule ParallelGravity {
// jetley
#ifdef CUDA
entry void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
entry void fillGPUBuffer(intptr_t bufLocalParts,
intptr_t bufLocalMoments,
intptr_t pLocalMoments, int partIndex, int nParts, intptr_t node);
entry void updateParticles(intptr_t data, int partIndex);
#else
entry void continueStartRemoteChunk(int chunk);
Expand Down
6 changes: 6 additions & 0 deletions ParallelGravity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ cosmoType thetaMono; ///< Criterion of excepting monopole

/// @brief Boundary evaluation user event (for Projections tracing).
int boundaryEvaluationUE;
int START_REG;
int START_IB;
int START_PW;
/// @brief Weight balancing during Oct decomposition user event (for Projections tracing).
int weightBalanceUE;
int networkProgressUE;
Expand Down Expand Up @@ -232,6 +235,9 @@ Main::Main(CkArgMsg* m) {
// Floating point exceptions.
// feenableexcept(FE_OVERFLOW | FE_DIVBYZERO | FE_INVALID);

START_REG = traceRegisterUserEvent("Register");
START_IB = traceRegisterUserEvent("Init Buckets");
START_PW = traceRegisterUserEvent("Prefetch Walk");
boundaryEvaluationUE = traceRegisterUserEvent("Evaluating Boudaries");
weightBalanceUE = traceRegisterUserEvent("Weight Balancer");
networkProgressUE = traceRegisterUserEvent("CmiNetworkProgress");
Expand Down
9 changes: 8 additions & 1 deletion ParallelGravity.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ extern CProxy_CkCacheManager<KeyType> cacheNode;
/// The group ID of your DataManager. You must set this!
extern CkGroupID dataManagerID;

extern int START_REG;
extern int START_IB;
extern int START_PW;

extern int boundaryEvaluationUE;
extern int weightBalanceUE;
extern int networkProgressUE;
Expand Down Expand Up @@ -1009,7 +1013,10 @@ class TreePiece : public CBase_TreePiece {
#endif

#ifdef CUDA
void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
void continueStartRemoteChunk(int chunk, intptr_t d_remoteMoments, intptr_t d_remoteParts);
void fillGPUBuffer(intptr_t bufLocalParts,
intptr_t bufLocalMoments,
intptr_t pLocalMoments, int partIndex, int nParts, intptr_t node);
void updateParticles(intptr_t data, int partIndex);
#else
void continueStartRemoteChunk(int chunk);
Expand Down
Loading

0 comments on commit 611ed4b

Please sign in to comment.