Trq/gpu multicopy #73

Merged: 10 commits, Mar 1, 2024
2 changes: 2 additions & 0 deletions CudaFunctions.h
@@ -34,6 +34,8 @@ __global__ void nodeGravityComputation(
cudatype fperiod
);

__global__ void ZeroVars(VariablePartData *particleVars, int nVars);

#ifdef CUDA_2D_TB_KERNEL
__global__ void particleGravityComputation(
CompactPartData *targetCores,
133 changes: 62 additions & 71 deletions DataManager.cpp
@@ -491,15 +491,16 @@ void DataManager::serializeLocalTree(){
#endif
if(treePiecesDone == registeredTreePieces.length()){
treePiecesDone = 0;
CmiUnlock(__nodelock);

if(verbosity > 1)
CkPrintf("[%d] Registered tree pieces length: %lu\n", CkMyPe(), registeredTreePieces.length());
serializeLocal(root);
if(verbosity > 1)
CkPrintf("[%d] Registered tree pieces length after serialize local: %lu\n", CkMyPe(), registeredTreePieces.length());
}
CmiUnlock(__nodelock);

else
CmiUnlock(__nodelock);
}
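Note on the hunk above: the standalone CmiUnlock(__nodelock) after the if block is the old code path; the PR replaces it with an unlock inside the if (before the call) plus an else branch, so the node lock is never held across the expensive serializeLocal() call. A minimal sketch of the resulting "last arriver does the work" pattern, assuming treePiecesDone is incremented under the lock just above the shown context:

CmiLock(__nodelock);
treePiecesDone++;
if(treePiecesDone == registeredTreePieces.length()){
    treePiecesDone = 0;
    CmiUnlock(__nodelock);   // release before the long serialization
    serializeLocal(root);    // heavy work now runs outside the lock
}
else
    CmiUnlock(__nodelock);   // not the last piece: just release and return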

/// @brief Callback from local data transfer to GPU
@@ -662,8 +663,6 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
int numTreePieces = registeredTreePieces.length();
int numNodes = 0;
int numParticles = 0;
int numCachedNodes = 0;
int numCachedParticles = 0;
int totalNumBuckets = 0;

cacheType *wholeNodeCache = cacheNode.ckLocalBranch()->getCache();
Expand Down Expand Up @@ -780,16 +779,14 @@ PendingBuffers *DataManager::serializeRemoteChunk(GenericTreeNode *node){
}// end serializeNodes

/// @brief gather local nodes and particles and send to GPU
/// @param node Root of tree to walk.
void DataManager::serializeLocal(GenericTreeNode *node){
/// @param nodeRoot Root of tree to walk.
void DataManager::serializeLocal(GenericTreeNode *nodeRoot){
/// queue for breadth first treewalk.
CkQ<GenericTreeNode *> queue;

int numTreePieces = registeredTreePieces.length();
int numNodes = 0;
int numParticles = 0;
int numCachedNodes = 0;
int numCachedParticles = 0;

for(int i = 0; i < numTreePieces; i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
@@ -798,11 +795,7 @@ void DataManager::serializeLocal(GenericTreeNode *node){
}
numNodes -= cumNumReplicatedNodes;

CkVec<CudaMultipoleMoments> localMoments;
CkVec<CompactPartData> localParticles;

localMoments.reserve(numNodes);
localParticles.resize(numParticles);

localMoments.length() = 0;

@@ -815,8 +808,9 @@ void DataManager::serializeLocal(GenericTreeNode *node){
CkPrintf("[%d] DM local tree\n", CkMyPe());
CkPrintf("*************\n");
#endif
double starttime = CmiWallTimer();
// Walk local tree
queue.enq(node);
queue.enq(nodeRoot);
while(!queue.isEmpty()){
GenericTreeNode *node = queue.deq();
NodeType type = node->getType();
@@ -849,50 +843,14 @@ void DataManager::serializeLocal(GenericTreeNode *node){
}
}// end while queue not empty

#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_WALK, starttime, CmiWallTimer());
#endif

// used later, when copying particle vars back to the host
savedNumTotalParticles = numParticles;
savedNumTotalNodes = localMoments.length();

for(int i = 0; i < registeredTreePieces.length(); i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
tp->getDMParticles(localParticles.getVec(), partIndex);
}

#ifdef GPU_LOCAL_TREE_WALK
// set proper active bucketStart and bucketSize for each bucketNode
for(int i = 0; i < numTreePieces; i++){
TreePiece *tp = registeredTreePieces[i].treePiece;
// set the bucketStart and bucketSize for each bucket Node
if (tp->largePhase()) {
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
localMoments[id].bucketStart = bucketNode->bucketArrayIndex;
localMoments[id].bucketSize = bucketNode->lastParticle - bucketNode->firstParticle + 1;
}
} else {
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
localMoments[id].bucketStart = tp->bucketActiveInfo[id].start;
localMoments[id].bucketSize = tp->bucketActiveInfo[id].size;
}
}

// tell each particle which node it belongs to
CompactPartData *localParicalsVec = localParticles.getVec();
for (int j = 0; j < tp->numBuckets; ++j) {
GenericTreeNode *bucketNode = tp->bucketList[j];
int id = bucketNode->nodeArrayIndex;
int start = localMoments[id].bucketStart;
int end = start + localMoments[id].bucketSize;
for (int k = start; k < end; k ++) {
localParicalsVec[k].nodeId = id;
}
}
}
transformLocalTreeRecursive(node, localMoments);
#endif //GPU_LOCAL_TREE_WALK

#ifdef CUDA_DM_PRINT_TREES
CkPrintf("*************\n");
@@ -901,41 +859,74 @@ void DataManager::serializeLocal(GenericTreeNode *node){
#if COSMO_PRINT_BK > 1
CkPrintf("(%d): DM->GPU local tree\n", CkMyPe());
#endif
size_t sLocalParts = numParticles*sizeof(CompactPartData);
size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
allocatePinnedHostMemory((void **)&bufLocalParts, sLocalParts);
allocatePinnedHostMemory((void **)&bufLocalMoments, sLocalMoments);

int pTPindex = 0;
treePiecesBufferFilled = 0;
for(int i = 0; i < numTreePieces; i++){
treePieces[registeredTreePieces[i].treePiece->getIndex()].fillGPUBuffer((intptr_t) bufLocalParts,
(intptr_t) bufLocalMoments, (intptr_t) localMoments.getVec(), pTPindex,
numParticles, (intptr_t) nodeRoot);
pTPindex += registeredTreePieces[i].treePiece->getDMNumParticles();
}
}
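With this change, each registered TreePiece copies its own particles into the shared pinned host buffer at a running offset (pTPindex), instead of the DataManager gathering them via getDMParticles() as the removed loop above did. The TreePiece side is not part of this diff; a hypothetical sketch of what fillGPUBuffer() is expected to do with these arguments (names and proxy syntax are assumptions):

void TreePiece::fillGPUBuffer(intptr_t bufLocalParts, intptr_t bufLocalMoments,
                              intptr_t pLocalMoments, int partIndex,
                              int nParts, intptr_t node) {
    CompactPartData *parts = (CompactPartData *) bufLocalParts;
    // copy this piece's particles into its slice of the shared buffer,
    // starting at the offset the DataManager computed for it
    getDMParticles(parts, partIndex);
    // tell the node-local DataManager one more piece has finished;
    // the last arrival triggers the actual host-to-device transfer
    dMProxy[CkMyNode()].transferLocalToGPU(nParts, (GenericTreeNode *) node);
}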

///
/// @brief After all pieces have filled the buffer, initiate the transfer.
/// @param numParticles total number of particles on this node
/// @param node root of tree
///
void DataManager::transferLocalToGPU(int numParticles, GenericTreeNode *node)
{
CmiLock(__nodelock);
treePiecesBufferFilled++;
if(treePiecesBufferFilled == registeredTreePieces.length()){
treePiecesBufferFilled = 0;
CmiUnlock(__nodelock);
}
else {
CmiUnlock(__nodelock);
return;
}

double starttime = CmiWallTimer();
#ifdef GPU_LOCAL_TREE_WALK
transformLocalTreeRecursive(node, localMoments);
#endif //GPU_LOCAL_TREE_WALK
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_TRANS, starttime, CmiWallTimer());
#endif

localTransferCallback
= new CkCallback(CkIndex_DataManager::startLocalWalk(), CkMyNode(), dMProxy);

// XXX copies can be saved here.
starttime = CmiWallTimer();
size_t sLocalVars = numParticles*sizeof(VariablePartData);
size_t sLocalParts = numParticles*sizeof(CompactPartData);

size_t sLocalMoments = localMoments.length()*sizeof(CudaMultipoleMoments);
allocatePinnedHostMemory((void **)&bufLocalMoments, sLocalMoments);
memcpy(bufLocalMoments, localMoments.getVec(), sLocalMoments);
#ifdef HAPI_TRACE
traceUserBracketEvent(SER_LOCAL_MEMCPY, starttime, CmiWallTimer());
#endif

size_t sLocalParts = localParticles.length()*sizeof(CompactPartData);
allocatePinnedHostMemory((void **)&bufLocalParts, sLocalParts);
memcpy(bufLocalParts, localParticles.getVec(), sLocalParts);

size_t sLocalVars = localParticles.length()*sizeof(VariablePartData);
allocatePinnedHostMemory((void **)&bufLocalVars, sLocalVars);
VariablePartData *zeroArray = (VariablePartData *) bufLocalVars;
// XXX This could be done on the GPU.
for(int i = 0; i < numParticles; i++){
zeroArray[i].a.x = 0.0;
zeroArray[i].a.y = 0.0;
zeroArray[i].a.z = 0.0;
zeroArray[i].potential = 0.0;
zeroArray[i].dtGrav = 0.0;
}
// Transfer moments and particle cores to gpu

#ifdef HAPI_INSTRUMENT_WRS
DataManagerTransferLocalTree(bufLocalMoments, sLocalMoments, bufLocalParts,
sLocalParts, bufLocalVars, sLocalVars, 0,
sLocalParts, bufLocalVars, sLocalVars, 0, numParticles,
activeRung, localTransferCallback);
#else
DataManagerTransferLocalTree(bufLocalMoments, sLocalMoments, bufLocalParts,
sLocalParts, bufLocalVars, sLocalVars,
sLocalParts, bufLocalVars, sLocalVars, numParticles,
CkMyPe(), localTransferCallback);
#endif
}// end serializeLocal
}
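transferLocalToGPU() reuses the same gating pattern as serializeLocalTree(): every piece bumps treePiecesBufferFilled under the node lock, and only the last one proceeds to the transfer. The staging buffers come from allocatePinnedHostMemory() because page-locked memory is what lets the host-to-device copies run asynchronously. A standalone sketch of the underlying CUDA idiom, with illustrative names and assuming a valid cudaStream_t stream and element count nParts:

size_t bytes = nParts * sizeof(CompactPartData);
void *hostBuf, *devBuf;
cudaChk(cudaHostAlloc(&hostBuf, bytes, cudaHostAllocDefault)); // page-locked
cudaChk(cudaMalloc(&devBuf, bytes));
// ... CPU fills hostBuf ...
cudaChk(cudaMemcpyAsync(devBuf, hostBuf, bytes,
                        cudaMemcpyHostToDevice, stream));      // can overlap other work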

#ifdef GPU_LOCAL_TREE_WALK
// Add more information to each Moment, basically transform moment to a computable tree node
7 changes: 6 additions & 1 deletion DataManager.h
@@ -104,11 +104,13 @@ class DataManager : public CBase_DataManager {
// * either do not concern yourself with cached particles
// * or for each entry, get key, find bucket node in CM, DM or TPs and get number
// for now, the former

std::map<NodeKey, int> cachedPartsOnGpu;
// local particles that have been copied to the gpu
//std::map<NodeKey, int> localPartsOnGpu;

// TreePiece counter for multi-threaded GPU host buffer copy
int treePiecesBufferFilled;

// can the gpu accept a chunk of remote particles/nodes?
bool gpuFree;

@@ -129,6 +131,8 @@ class DataManager : public CBase_DataManager {
/// host buffer to transfer remote particles to GPU
CompactPartData *bufRemoteParts;

/// Vector to accumulate localMoments for transferring to GPU
CkVec<CudaMultipoleMoments> localMoments;
/// host buffer to transfer local moments to GPU
CudaMultipoleMoments *bufLocalMoments;
/// host buffer to transfer local particles to GPU
@@ -184,6 +188,7 @@ class DataManager : public CBase_DataManager {
// actual serialization methods
PendingBuffers *serializeRemoteChunk(GenericTreeNode *);
void serializeLocal(GenericTreeNode *);
void transferLocalToGPU(int nParts, GenericTreeNode *node);
void freeLocalTreeMemory();
void freeRemoteChunkMemory(int chunk);
void transferParticleVarsBack();
24 changes: 22 additions & 2 deletions HostCUDA.cu
@@ -113,6 +113,10 @@ void run_DM_TRANSFER_LOCAL(hapiWorkRequest *wr, cudaStream_t kernel_stream,void*
printf("cores: 0x%x\n", devBuffers[LOCAL_PARTICLE_CORES]);
printf("vars: 0x%x\n", devBuffers[LOCAL_PARTICLE_VARS]);
#endif
ZeroVars<<<wr->grid_dim, wr->block_dim, wr->shared_mem, kernel_stream>>>(
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
wr->buffers[LOCAL_PARTICLE_VARS_IDX].size/sizeof(VariablePartData));
cudaChk(cudaPeekAtLastError());
}
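This launch takes over the zeroing of accelerations and potentials that the removed CPU loop in DataManager.cpp used to do on the pinned host buffer. Note that cudaPeekAtLastError() after an asynchronous launch only catches configuration errors; faults inside the kernel surface at the next synchronization. The general idiom, with a placeholder kernel name:

someKernel<<<grid, block, shmem, stream>>>(/* args */);
cudaChk(cudaPeekAtLastError());            // bad launch configs caught here
// kernel-side faults appear on the next sync, e.g.:
// cudaChk(cudaStreamSynchronize(stream));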

void run_DM_TRANSFER_REMOTE_CHUNK(hapiWorkRequest *wr, cudaStream_t kernel_stream,void** devBuffers) {
@@ -147,12 +151,12 @@ void run_DM_TRANSFER_FREE_LOCAL(hapiWorkRequest *wr, cudaStream_t kernel_stream,
#ifdef HAPI_INSTRUMENT_WRS
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, char phase, void *wrCallback) {
#else
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, void *wrCallback) {
#endif

@@ -168,6 +172,9 @@ void DataManagerTransferLocalTree(void *moments, size_t sMoments,
transferKernel->addBuffer(varParts, sVarParts, (sVarParts > 0), false,
false, LOCAL_PARTICLE_VARS);

// +1 to avoid dimGrid = 0 when we run tests with extremely small inputs.
transferKernel->setExecParams(numParticles / THREADS_PER_BLOCK + 1, dim3(THREADS_PER_BLOCK));

transferKernel->setDeviceToHostCallback(*(CkCallback *)wrCallback);
#ifdef HAPI_TRACE
transferKernel->setTraceName("xferLocal");
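On the grid sizing above: numParticles / THREADS_PER_BLOCK + 1 guarantees at least one block even for tiny (or empty) inputs, at the cost of up to one fully idle extra block. The usual ceil-division form, shown for comparison only, would launch zero blocks when numParticles == 0, which is an invalid configuration:

int dimGrid = (numParticles + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;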
Expand Down Expand Up @@ -2661,3 +2668,16 @@ __global__ void EwaldKernel(CompactPartData *particleCores,

return;
}

// initialize accelerations and potentials to zero
__global__ void ZeroVars(VariablePartData *particleVars, int nVars) {
int id;
id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
if(id >= nVars) return;

particleVars[id].a.x = 0.0;
particleVars[id].a.y = 0.0;
particleVars[id].a.z = 0.0;
particleVars[id].potential = 0.0;
particleVars[id].dtGrav = 0.0;
}
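One coupling worth noting: ZeroVars indexes with the compile-time BLOCK_SIZE, while the launch in run_DM_TRANSFER_LOCAL uses the block dimension set from THREADS_PER_BLOCK, so the indexing is correct only while those two constants agree. Computing the index from the runtime block size would remove that assumption; a one-line alternative (not part of this PR):

int id = blockIdx.x * blockDim.x + threadIdx.x;  // follows the actual launch block size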
4 changes: 2 additions & 2 deletions HostCUDA.h
@@ -178,7 +178,7 @@ void freePinnedHostMemory(void *);
#ifdef HAPI_INSTRUMENT_WRS
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, char phase, void *wrCallback);
void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
@@ -190,7 +190,7 @@ void TransferParticleVarsBack(VariablePartData *hostBuffer, size_t size, void *c
#else
void DataManagerTransferLocalTree(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
void *varParts, size_t sVarParts,
void *varParts, size_t sVarParts, int numParticles,
int mype, void *wrCallback);
void DataManagerTransferRemoteChunk(void *moments, size_t sMoments,
void *compactParts, size_t sCompactParts,
7 changes: 7 additions & 0 deletions ParallelGravity.ci
@@ -54,6 +54,10 @@ mainmodule ParallelGravity {
readonly int remoteResumePartsPerReq;
readonly double largePhaseThreshold;

readonly int START_REG;
readonly int START_IB;
readonly int START_PW;

readonly int boundaryEvaluationUE;
readonly int weightBalanceUE;
readonly int networkProgressUE;
@@ -524,6 +528,9 @@ mainmodule ParallelGravity {
// jetley
entry void continueStartRemoteChunk(int chunk);
#ifdef CUDA
entry void fillGPUBuffer(intptr_t bufLocalParts,
intptr_t bufLocalMoments,
intptr_t pLocalMoments, int partIndex, int nParts, intptr_t node);
entry void updateParticles(intptr_t data, int partIndex);
#endif
entry void continueWrapUp();
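The new fillGPUBuffer entry passes raw host pointers as intptr_t. That is safe here only because the call stays on the node that allocated the buffers, so the addresses remain valid in the receiver's address space. The cast-through pattern, with illustrative names:

// sender (same node): pointer -> integer for entry-method marshalling
intptr_t h = (intptr_t) bufLocalParts;
// receiver: integer -> pointer; valid only within the same process
CompactPartData *parts = reinterpret_cast<CompactPartData *>(h);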
10 changes: 10 additions & 0 deletions ParallelGravity.cpp
@@ -145,6 +145,9 @@ cosmoType thetaMono; ///< Criterion of excepting monopole

/// @brief Boundary evaluation user event (for Projections tracing).
int boundaryEvaluationUE;
int START_REG;
int START_IB;
int START_PW;
/// @brief Weight balancing during Oct decomposition user event (for Projections tracing).
int weightBalanceUE;
int networkProgressUE;
@@ -228,6 +231,9 @@ Main::Main(CkArgMsg* m) {
// Floating point exceptions.
// feenableexcept(FE_OVERFLOW | FE_DIVBYZERO | FE_INVALID);

START_REG = traceRegisterUserEvent("Register");
START_IB = traceRegisterUserEvent("Init Buckets");
START_PW = traceRegisterUserEvent("Prefetch Walk");
boundaryEvaluationUE = traceRegisterUserEvent("Evaluating Boundaries");
weightBalanceUE = traceRegisterUserEvent("Weight Balancer");
networkProgressUE = traceRegisterUserEvent("CmiNetworkProgress");
Expand All @@ -236,6 +242,10 @@ Main::Main(CkArgMsg* m) {
#ifdef HAPI_TRACE
traceRegisterUserEvent("Tree Serialization", CUDA_SER_TREE);
traceRegisterUserEvent("List Serialization", CUDA_SER_LIST);
traceRegisterUserEvent("Ser Local Walk", SER_LOCAL_WALK);
traceRegisterUserEvent("Ser Local Memcpy", SER_LOCAL_MEMCPY);
traceRegisterUserEvent("Ser Local Zero", SER_LOCAL_ZERO);
traceRegisterUserEvent("Ser Local Trans", SER_LOCAL_TRANS);

traceRegisterUserEvent("Local Node", CUDA_LOCAL_NODE_KERNEL);
traceRegisterUserEvent("Remote Node", CUDA_REMOTE_NODE_KERNEL);