Skip to content

Commit be4d354

Browse files
committed
btl/smcuda: convert to use accelerator framework functions
- convert smcuda to use accelerator framework API
- convert the sync_memops feature to a global mca parameter that can be used by all interested components

Signed-off-by: Edgar Gabriel <Edgar.Gabriel@amd.com>
1 parent 12b728e commit be4d354

File tree

10 files changed

+104
-114
lines changed

10 files changed

+104
-114
lines changed

opal/mca/btl/smcuda/btl_smcuda.c

Lines changed: 57 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
* Copyright (c) 2022 IBM Corporation. All rights reserved
2424
* Copyright (c) 2023 Triad National Security, LLC. All rights
2525
* reserved.
26+
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
2627
* $COPYRIGHT$
2728
*
2829
* Additional copyrights may follow
@@ -71,8 +72,7 @@
7172
#include "btl_smcuda_frag.h"
7273
#include "btl_smcuda_accelerator.h"
7374

74-
75-
#include "opal/include/opal/opal_cuda.h"
75+
#include "opal/include/opal/opal_gpu.h"
7676

7777
static struct mca_btl_base_registration_handle_t *
7878
mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
@@ -354,15 +354,15 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
354354
* local process to know which parts of the memory are being utilized by a
355355
* remote process. */
356356
opal_output_verbose(10, opal_btl_base_framework.framework_output,
357-
"btl:smcuda: CUDA cuMemHostRegister address=%p, size=%d",
357+
"btl:smcuda: host_register address=%p, size=%d",
358358
mca_btl_smcuda_component.sm_mpool_base, (int) res->size);
359-
if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
359+
if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
360360
rc = opal_accelerator.host_register(MCA_ACCELERATOR_NO_DEVICE_ID, mca_btl_smcuda_component.sm_mpool_base, res->size);
361361
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
362362
/* If registering the memory fails, print a message and continue.
363363
* This is not a fatal error. */
364364
opal_output_verbose(10, opal_btl_base_framework.framework_output,
365-
"btl:smcuda: CUDA cuMemHostRegister failed");
365+
"btl:smcuda: host_register failed");
366366
}
367367
}
368368

@@ -877,7 +877,7 @@ int mca_btl_smcuda_sendi(struct mca_btl_base_module_t *btl,
877877
}
878878
/* Initiate setting up CUDA IPC support. */
879879

880-
if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda") && (IPC_INIT == endpoint->ipcstate)
880+
if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null") && (IPC_INIT == endpoint->ipcstate)
881881
&& mca_btl_smcuda_component.use_cuda_ipc) {
882882
mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
883883
}
@@ -967,7 +967,7 @@ int mca_btl_smcuda_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_e
967967
mca_btl_smcuda_component_progress();
968968
}
969969
/* Initiate setting up CUDA IPC support */
970-
if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda") && (IPC_INIT == endpoint->ipcstate)
970+
if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null") && (IPC_INIT == endpoint->ipcstate)
971971
&& mca_btl_smcuda_component.use_cuda_ipc) {
972972
mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
973973
}
@@ -1004,7 +1004,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
10041004
uint32_t flags)
10051005
{
10061006
mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
1007-
mca_opal_cuda_reg_t *reg;
1007+
mca_opal_gpu_reg_t *reg;
10081008
int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
10091009
int rcache_flags = 0;
10101010

@@ -1013,7 +1013,6 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
10131013
rcache_flags |= MCA_RCACHE_FLAGS_ACCELERATOR_MEM;
10141014
}
10151015
#endif
1016-
10171016
smcuda_module->rcache->rcache_register(smcuda_module->rcache, base, size, rcache_flags,
10181017
access_flags, (mca_rcache_base_registration_t **) &reg);
10191018
if (OPAL_UNLIKELY(NULL == reg)) {
@@ -1027,9 +1026,8 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
10271026
struct mca_btl_base_registration_handle_t *handle)
10281027
{
10291028
mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
1030-
mca_opal_cuda_reg_t *reg = (mca_opal_cuda_reg_t
1031-
*) ((intptr_t) handle
1032-
- offsetof(mca_opal_cuda_reg_t, data));
1029+
mca_opal_gpu_reg_t *reg = (mca_opal_gpu_reg_t *) ((intptr_t) handle
1030+
- offsetof(mca_opal_gpu_reg_t, data));
10331031

10341032
smcuda_module->rcache->rcache_deregister(smcuda_module->rcache, &reg->base);
10351033

@@ -1040,49 +1038,57 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
10401038
* Put remote event on stream to ensure that the start of the
10411039
* copy does not start until the completion of the event.
10421040
*/
1043-
static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_cuda_reg_t *rget_reg)
1041+
static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_gpu_reg_t *rget_reg)
10441042
{
1045-
#if OPAL_CUDA_SYNC_MEMOPS
1046-
/* No need for any of this with SYNC_MEMOPS feature */
1047-
return;
1048-
#else /* OPAL_CUDA_SYNC_MEMOPS */
1049-
CUipcEventHandle evtHandle;
1050-
CUevent event;
1051-
CUresult result;
1043+
opal_accelerator_ipc_event_handle_t evtHandle;
1044+
opal_accelerator_event_t event;
1045+
int result;
1046+
1047+
if (opal_accelerator_use_sync_memops) {
1048+
/* No need for any of this with SYNC_MEMOPS feature */
1049+
return;
1050+
}
10521051

1053-
memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
1052+
result = opal_accelerator.import_ipc_event_handle(rget_reg->data.ipcEventHandle.handle, &evtHandle);
1053+
if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
1054+
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1055+
"import_ipc_event_handle failed");
1056+
return;
1057+
}
10541058

1055-
result = cuIpcOpenEventHandle(&event, evtHandle);
1056-
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1059+
result = opal_accelerator.open_ipc_event_handle(&evtHandle, &event);
1060+
if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
10571061
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1058-
"cuIpcOpenEventHandle failed");
1062+
"open_ipc_event_handle failed");
1063+
return;
10591064
}
10601065

1061-
/* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
1062-
* versions. Need to record an event on the stream, even though
1066+
#if 0
1067+
/* BEGIN of Workaround to deal with a bug in early CUDA releases (4.1 and older).
1068+
* Need to record an event on the stream, even though
10631069
* it is not used, to make sure we do not short circuit our way
10641070
* out of the cuStreamWaitEvent test.
10651071
*/
1066-
result = cuEventRecord(event, 0);
1067-
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1072+
result = opal_accelerator.record_event(MCA_ACCELERATOR_NO_DEVICE_ID, &event, MCA_ACCELERATOR_STREAM_DEFAULT);
1073+
if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
10681074
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1069-
"cuEventRecord failed");
1075+
"record_event failed");
1076+
return;
10701077
}
10711078
/* END of Workaround */
1079+
#endif
10721080

1073-
result = cuStreamWaitEvent(0, event, 0);
1074-
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1081+
result = opal_accelerator.wait_event(MCA_ACCELERATOR_NO_DEVICE_ID, &event, MCA_ACCELERATOR_STREAM_DEFAULT);
1082+
if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
10751083
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1076-
"cuStreamWaitEvent failed");
1084+
"wait_event failed");
1085+
return;
10771086
}
10781087

1079-
/* All done with this event. */
1080-
result = cuEventDestroy(event);
1081-
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
1082-
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1083-
"cuStreamWaitEvent failed");
1084-
}
1085-
#endif /* OPAL_CUDA_SYNC_MEMOPS */
1088+
// IPC events are assumed to be static, hence no OBJ_RELEASE
1089+
// but OBJ_DESTRUCT here.
1090+
OBJ_DESTRUCT(&event);
1091+
return;
10861092
}
10871093

10881094
int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
@@ -1092,9 +1098,9 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
10921098
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
10931099
void *cbcontext, void *cbdata)
10941100
{
1095-
mca_opal_cuda_reg_t rget_reg;
1096-
mca_opal_cuda_reg_t *reg_ptr = &rget_reg;
1097-
int rc, done;
1101+
mca_opal_gpu_reg_t rget_reg;
1102+
mca_opal_gpu_reg_t *reg_ptr = &rget_reg;
1103+
int rc;
10981104
void *remote_memory_address;
10991105
size_t offset;
11001106
mca_btl_smcuda_frag_t *frag;
@@ -1121,13 +1127,14 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
11211127
* garbage in the debugger. */
11221128

11231129
memset(&rget_reg, 0, sizeof(rget_reg));
1124-
memcpy(&rget_reg.data.memHandle, remote_handle->reg_data.memHandle,
1125-
sizeof(remote_handle->reg_data.memHandle));
1126-
# if !OPAL_CUDA_SYNC_MEMOPS
1127-
/* Only need the remote event handle when syncing with remote events */
1128-
memcpy(&rget_reg.data.evtHandle, remote_handle->reg_data.evtHandle,
1129-
sizeof(remote_handle->reg_data.evtHandle));
1130-
# endif
1130+
memcpy(&rget_reg.data.ipcHandle.handle, remote_handle->reg_data.ipcHandle.handle,
1131+
sizeof(remote_handle->reg_data.ipcHandle.handle));
1132+
1133+
if (!opal_accelerator_use_sync_memops) {
1134+
/* Only need the remote event handle when syncing with remote events */
1135+
memcpy(&rget_reg.data.ipcEventHandle.handle, remote_handle->reg_data.ipcEventHandle.handle,
1136+
sizeof(remote_handle->reg_data.ipcEventHandle.handle));
1137+
}
11311138

11321139
/* Open the memory handle to the remote memory. If it is cached, then
11331140
* we just retrieve it from cache and avoid a call to open the handle. That
@@ -1248,7 +1255,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t *b
12481255
*/
12491256
OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, +1);
12501257
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
1251-
"Sending CUDA IPC REQ (try=%d): myrank=%d, mydev=%d, peerrank=%d",
1258+
"Sending IPC REQ (try=%d): myrank=%d, mydev=%d, peerrank=%d",
12521259
endpoint->ipctries, mca_btl_smcuda_component.my_smp_rank, mydevnum,
12531260
endpoint->peer_smp_rank);
12541261

opal/mca/btl/smcuda/btl_smcuda_accelerator.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ int mca_btl_smcuda_accelerator_init(void)
6565
}
6666
/* Create the events since they can be reused. */
6767
for (i = 0; i < accelerator_event_max; i++) {
68-
rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_ipc_array[i]);
68+
rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_ipc_array[i], true);
6969
if (OPAL_SUCCESS != rc) {
7070
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
7171
rc = OPAL_ERROR;
@@ -223,7 +223,7 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
223223
return OPAL_ERROR;
224224
} else {
225225
opal_output_verbose(20, mca_btl_smcuda_component.cuda_ipc_output,
226-
"smcuda: cuMemcpyAsync passed: dst=%p, src=%p, size=%d", dst, src,
226+
"smcuda: mem_copy_async passed: dst=%p, src=%p, size=%d", dst, src,
227227
(int) amount);
228228
}
229229

opal/mca/btl/smcuda/btl_smcuda_component.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ static int smcuda_register(void)
160160
&mca_btl_smcuda_component.sm_max_procs);
161161
/* there is no practical use for the mpool name parameter since mpool resources differ
162162
between components */
163-
mca_btl_smcuda_component.sm_mpool_name = "sm";
163+
mca_btl_smcuda_component.sm_mpool_name = "smgpu";
164164
mca_btl_smcuda_param_register_uint("fifo_size", 4096, OPAL_INFO_LVL_4,
165165
&mca_btl_smcuda_component.fifo_size);
166166
mca_btl_smcuda_param_register_int("num_fifos", 1, OPAL_INFO_LVL_4,
@@ -180,7 +180,7 @@ static int smcuda_register(void)
180180
NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_smcuda_component.allocator);
181181

182182
/* Lower priority when CUDA support is not requested */
183-
if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
183+
if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
184184

185185
mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH + 1;
186186
} else {
@@ -701,7 +701,7 @@ static void btl_smcuda_control(mca_btl_base_module_t *btl,
701701
} else {
702702
opal_output_verbose(
703703
10, mca_btl_smcuda_component.cuda_ipc_output,
704-
"Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
704+
"Analyzed GPU IPC request: myrank=%d, mydev=%d, peerrank=%d, "
705705
"peerdev=%d --> Access is disabled by btl_smcuda_use_cuda_ipc_same_gpu",
706706
endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank, ctrlhdr.cudev);
707707
endpoint->ipcstate = IPC_BAD;
@@ -712,7 +712,7 @@ static void btl_smcuda_control(mca_btl_base_module_t *btl,
712712
if (0 != res) {
713713
opal_output_verbose(
714714
10, mca_btl_smcuda_component.cuda_ipc_output,
715-
"Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
715+
"Analyzed GPU IPC request: myrank=%d, mydev=%d, peerrank=%d, "
716716
"peerdev=%d --> Access is disabled because peer check failed with err=%d",
717717
endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank, ctrlhdr.cudev,
718718
res);
@@ -722,36 +722,36 @@ static void btl_smcuda_control(mca_btl_base_module_t *btl,
722722
}
723723

724724
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
725-
"Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
725+
"Analyzed GPU IPC request: myrank=%d, mydev=%d, peerrank=%d, "
726726
"peerdev=%d --> ACCESS=%d",
727727
endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank,
728728
ctrlhdr.cudev, ipcaccess);
729729

730730
if (0 == ipcaccess) {
731731
/* No CUDA IPC support */
732732
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
733-
"Not sending CUDA IPC ACK, no P2P support");
733+
"Not sending GPU IPC ACK, no P2P support");
734734
endpoint->ipcstate = IPC_BAD;
735735
} else {
736736
/* CUDA IPC works */
737737
smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_ACCELERATOR_IPC, ep_proc,
738738
(char *) &mca_btl_smcuda_component.cuda_ipc_output);
739739
opal_output_verbose(
740740
10, mca_btl_smcuda_component.cuda_ipc_output,
741-
"Sending CUDA IPC ACK: myrank=%d, mydev=%d, peerrank=%d, peerdev=%d",
741+
"Sending GPU IPC ACK: myrank=%d, mydev=%d, peerrank=%d, peerdev=%d",
742742
endpoint->my_smp_rank, mydevnum, endpoint->peer_smp_rank, ctrlhdr.cudev);
743743
mca_btl_smcuda_send_cuda_ipc_ack(btl, endpoint, 1);
744744
}
745745
} else {
746746
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
747747
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
748-
"Not sending CUDA IPC ACK because request already initiated");
748+
"Not sending GPU IPC ACK because request already initiated");
749749
}
750750
break;
751751

752752
case IPC_ACK:
753753
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
754-
"Received CUDA IPC ACK, notifying PML: myrank=%d, peerrank=%d",
754+
"Received GPU IPC ACK, notifying PML: myrank=%d, peerrank=%d",
755755
endpoint->my_smp_rank, endpoint->peer_smp_rank);
756756

757757
smcuda_btl->error_cb(&smcuda_btl->super, MCA_BTL_ERROR_FLAGS_ADD_ACCELERATOR_IPC, ep_proc,
@@ -764,7 +764,7 @@ static void btl_smcuda_control(mca_btl_base_module_t *btl,
764764
/* The remote side is not ready. Reset state to initialized so next
765765
* send call will try again to set up connection. */
766766
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
767-
"Received CUDA IPC NOTREADY, reset state to allow another attempt: "
767+
"Received GPU IPC NOTREADY, reset state to allow another attempt: "
768768
"myrank=%d, peerrank=%d",
769769
endpoint->my_smp_rank, endpoint->peer_smp_rank);
770770
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
@@ -775,7 +775,7 @@ static void btl_smcuda_control(mca_btl_base_module_t *btl,
775775
break;
776776

777777
default:
778-
opal_output(0, "Received UNKNOWN CUDA IPC control message. This should not happen.");
778+
opal_output(0, "Received UNKNOWN GPU IPC control message. This should not happen.");
779779
}
780780
}
781781

opal/mca/btl/smcuda/btl_smcuda_frag.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#include "opal_config.h"
3232
#include "btl_smcuda.h"
3333

34-
#include "opal/include/opal/opal_cuda.h"
34+
#include "opal/include/opal/opal_gpu.h"
3535

3636
#define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t) 0x3)
3737
#define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t) 0x0)
@@ -52,7 +52,7 @@ struct mca_btl_smcuda_hdr_t {
5252
typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
5353

5454
struct mca_btl_base_registration_handle_t {
55-
mca_opal_cuda_reg_data_t reg_data;
55+
mca_opal_gpu_reg_data_t reg_data;
5656
};
5757

5858
struct mca_btl_smcuda_segment_t {

opal/mca/btl/smcuda/configure.m4

Lines changed: 0 additions & 33 deletions
This file was deleted.

0 commit comments

Comments
 (0)