  * Copyright (c) 2022 IBM Corporation. All rights reserved
  * Copyright (c) 2023 Triad National Security, LLC. All rights
  *                    reserved.
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow

 #include "btl_smcuda_frag.h"
 #include "btl_smcuda_accelerator.h"

-
-#include "opal/include/opal/opal_cuda.h"
+#include "opal/include/opal/opal_gpu.h"

 static struct mca_btl_base_registration_handle_t *
 mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
@@ -354,15 +354,15 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
      * local process to know which parts of the memory are being utilized by a
      * remote process. */
     opal_output_verbose(10, opal_btl_base_framework.framework_output,
-                        "btl:smcuda: CUDA cuMemHostRegister address=%p, size=%d",
+                        "btl:smcuda: host_register address=%p, size=%d",
                         mca_btl_smcuda_component.sm_mpool_base, (int) res->size);
-    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
+    if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
         rc = opal_accelerator.host_register(MCA_ACCELERATOR_NO_DEVICE_ID, mca_btl_smcuda_component.sm_mpool_base, res->size);
         if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
             /* If registering the memory fails, print a message and continue.
              * This is not a fatal error. */
             opal_output_verbose(10, opal_btl_base_framework.framework_output,
-                                "btl:smcuda: CUDA cuMemHostRegister failed");
+                                "btl:smcuda: host_register failed");
         }
     }
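Note that the gate above no longer matches the CUDA component by name; any selected accelerator other than the "null" fallback takes the accelerated path. A minimal sketch of that test as a standalone helper, using only symbols that appear in this hunk (the wrapper name itself is hypothetical):

#include <string.h>
#include <stdbool.h>

/* Hypothetical helper (name invented here): opal_accelerator_base_selected_component
 * comes from the accelerator framework headers this file already includes.
 * The framework always selects a component, and "null" is the no-GPU fallback,
 * so anything else means a real accelerator (cuda, rocm, ze, ...) is active. */
static inline bool smcuda_accelerator_active(void)
{
    return 0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name,
                       "null");
}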
@@ -877,7 +877,7 @@ int mca_btl_smcuda_sendi(struct mca_btl_base_module_t *btl,
     }
     /* Initiate setting up CUDA IPC support. */

-    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda") && (IPC_INIT == endpoint->ipcstate)
+    if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null") && (IPC_INIT == endpoint->ipcstate)
         && mca_btl_smcuda_component.use_cuda_ipc) {
         mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
     }
@@ -967,7 +967,7 @@ int mca_btl_smcuda_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_e
         mca_btl_smcuda_component_progress();
     }
     /* Initiate setting up CUDA IPC support */
-    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda") && (IPC_INIT == endpoint->ipcstate)
+    if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null") && (IPC_INIT == endpoint->ipcstate)
         && mca_btl_smcuda_component.use_cuda_ipc) {
         mca_btl_smcuda_send_cuda_ipc_request(btl, endpoint);
     }
@@ -1004,7 +1004,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
                                      uint32_t flags)
 {
     mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
-    mca_opal_cuda_reg_t *reg;
+    mca_opal_gpu_reg_t *reg;
     int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
     int rcache_flags = 0;
@@ -1013,7 +1013,6 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
         rcache_flags |= MCA_RCACHE_FLAGS_ACCELERATOR_MEM;
     }
 #endif
-
     smcuda_module->rcache->rcache_register(smcuda_module->rcache, base, size, rcache_flags,
                                            access_flags, (mca_rcache_base_registration_t **) &reg);
     if (OPAL_UNLIKELY(NULL == reg)) {
@@ -1027,9 +1026,8 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
                                           struct mca_btl_base_registration_handle_t *handle)
 {
     mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl;
-    mca_opal_cuda_reg_t *reg = (mca_opal_cuda_reg_t
-                                    *) ((intptr_t) handle
-                                        - offsetof(mca_opal_cuda_reg_t, data));
+    mca_opal_gpu_reg_t *reg = (mca_opal_gpu_reg_t *) ((intptr_t) handle
+                                                      - offsetof(mca_opal_gpu_reg_t, data));

     smcuda_module->rcache->rcache_deregister(smcuda_module->rcache, &reg->base);
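The replacement lines use the standard container-of idiom: the registration handle handed to callers is the data member embedded inside the mca_opal_gpu_reg_t, so subtracting its offset recovers the enclosing registration object. A self-contained sketch of the same idiom with stand-in (hypothetical) types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in types mirroring the layout relied on above: a registration object
 * that embeds the handle it exposes to callers as its "data" member. */
typedef struct { char bytes[16]; } example_handle_t;
typedef struct {
    int              base; /* plays the role of the rcache base registration */
    example_handle_t data; /* the part handed out as the registration handle */
} example_reg_t;

/* Recover the enclosing registration from a pointer to its embedded handle. */
static example_reg_t *reg_from_handle(example_handle_t *handle)
{
    return (example_reg_t *) ((intptr_t) handle - offsetof(example_reg_t, data));
}

int main(void)
{
    example_reg_t reg;
    assert(reg_from_handle(&reg.data) == &reg); /* round-trips back to the owner */
    return 0;
}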
@@ -1040,49 +1038,57 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
  * Put remote event on stream to ensure that the the start of the
  * copy does not start until the completion of the event.
  */
-static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_cuda_reg_t *rget_reg)
+static void mca_btl_smcuda_wait_stream_synchronize(mca_opal_gpu_reg_t *rget_reg)
 {
-#if OPAL_CUDA_SYNC_MEMOPS
-    /* No need for any of this with SYNC_MEMOPS feature */
-    return;
-#else /* OPAL_CUDA_SYNC_MEMOPS */
-    CUipcEventHandle evtHandle;
-    CUevent event;
-    CUresult result;
+    opal_accelerator_ipc_event_handle_t evtHandle;
+    opal_accelerator_event_t event;
+    int result;
+
+    if (opal_accelerator_use_sync_memops) {
+        /* No need for any of this with SYNC_MEMOPS feature */
+        return;
+    }

-    memcpy(&evtHandle, rget_reg->data.evtHandle, sizeof(evtHandle));
+    result = opal_accelerator.import_ipc_event_handle(rget_reg->data.ipcEventHandle.handle, &evtHandle);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
+        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
+                            "import_ipc_event_handle failed");
+        return;
+    }

-    result = cuIpcOpenEventHandle(&event, evtHandle);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    result = opal_accelerator.open_ipc_event_handle(&evtHandle, &event);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
-                            "cuIpcOpenEventHandle failed");
+                            "open_ipc_event_handle failed");
+        return;
     }

-    /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
-     * versions. Need to record an event on the stream, even though
+#if 0
+    /* BEGIN of Workaround to deal with a bug in early CUDA releases (4.1 and older).
+     * Need to record an event on the stream, even though
      * it is not used, to make sure we do not short circuit our way
      * out of the cuStreamWaitEvent test.
      */
-    result = cuEventRecord(event, 0);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    result = opal_accelerator.record_event(MCA_ACCELERATOR_NO_DEVICE_ID, &event, MCA_ACCELERATOR_STREAM_DEFAULT);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
-                            "cuEventRecord failed");
+                            "record_event failed");
+        return;
     }
     /* END of Workaround */
+#endif

-    result = cuStreamWaitEvent(0, event, 0);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    result = opal_accelerator.wait_event(MCA_ACCELERATOR_NO_DEVICE_ID, &event, MCA_ACCELERATOR_STREAM_DEFAULT);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
-                            "cuStreamWaitEvent failed");
+                            "wait_event failed");
+        return;
     }

-    /* All done with this event. */
-    result = cuEventDestroy(event);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
-        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
-                            "cuStreamWaitEvent failed");
-    }
-#endif /* OPAL_CUDA_SYNC_MEMOPS */
+    // ipc events are assumed to be static, hence no OBJ_RELEASE
+    // but OBJ_DESTRUCT here.
+    OBJ_DESTRUCT(&event);
+    return;
 }

 int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
@@ -1092,9 +1098,9 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
                             int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                             void *cbcontext, void *cbdata)
 {
-    mca_opal_cuda_reg_t rget_reg;
-    mca_opal_cuda_reg_t *reg_ptr = &rget_reg;
-    int rc, done;
+    mca_opal_gpu_reg_t rget_reg;
+    mca_opal_gpu_reg_t *reg_ptr = &rget_reg;
+    int rc;
     void *remote_memory_address;
     size_t offset;
     mca_btl_smcuda_frag_t *frag;
@@ -1121,13 +1127,14 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
      * garbage in the debugger. */

     memset(&rget_reg, 0, sizeof(rget_reg));
-    memcpy(&rget_reg.data.memHandle, remote_handle->reg_data.memHandle,
-           sizeof(remote_handle->reg_data.memHandle));
-#if !OPAL_CUDA_SYNC_MEMOPS
-    /* Only need the remote event handle when syncing with remote events */
-    memcpy(&rget_reg.data.evtHandle, remote_handle->reg_data.evtHandle,
-           sizeof(remote_handle->reg_data.evtHandle));
-#endif
+    memcpy(&rget_reg.data.ipcHandle.handle, remote_handle->reg_data.ipcHandle.handle,
+           sizeof(remote_handle->reg_data.ipcHandle.handle));
+
+    if (!opal_accelerator_use_sync_memops) {
+        /* Only need the remote event handle when syncing with remote events */
+        memcpy(&rget_reg.data.ipcEventHandle.handle, remote_handle->reg_data.ipcEventHandle.handle,
+               sizeof(remote_handle->reg_data.ipcEventHandle.handle));
+    }

     /* Open the memory handle to the remote memory. If it is cached, then
      * we just retrieve it from cache and avoid a call to open the handle. That
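For orientation, the memcpy calls above copy opaque byte arrays: the generic registration data now carries an accelerator IPC memory handle and, when sync-memops is not available, an IPC event handle. A rough sketch of the shape those field accesses imply; member names are taken from this hunk, but the concrete types and sizes are assumptions (the real definitions live in opal_gpu.h and the accelerator framework):

#include <stdint.h>

/* Assumed shape only, inferred from the .ipcHandle.handle / .ipcEventHandle.handle
 * accesses above; array sizes and the wrapper struct name are hypothetical. */
typedef struct {
    struct { uint8_t handle[64]; } ipcHandle;      /* opaque remote memory handle */
    struct { uint8_t handle[64]; } ipcEventHandle; /* opaque remote event handle, only
                                                      needed when opal_accelerator_use_sync_memops
                                                      is false */
} assumed_gpu_reg_data_t;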
@@ -1248,7 +1255,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t *b
      */
     OPAL_THREAD_ADD_FETCH32(&mca_btl_smcuda_component.num_outstanding_frags, +1);
     opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
-                        "Sending CUDA IPC REQ (try=%d): myrank=%d, mydev=%d, peerrank=%d",
+                        "Sending IPC REQ (try=%d): myrank=%d, mydev=%d, peerrank=%d",
                         endpoint->ipctries, mca_btl_smcuda_component.my_smp_rank, mydevnum,
                         endpoint->peer_smp_rank);