-
Notifications
You must be signed in to change notification settings - Fork 204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add a CUDA device code sanity check #4692
base: 5.0.x
Are you sure you want to change the base?
Changes from 1 commit
e329d46
c8cece2
ee63b8e
de6d49d
0e97868
6b6d2c8
6568909
3d07ef6
f13fca2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,7 @@ | |
* Maxime Boissonneault (Compute Canada) | ||
* Davide Vanzo (Vanderbilt University) | ||
* Caspar van Leeuwen (SURF) | ||
* Jasper Grimm (UoY) | ||
""" | ||
import concurrent | ||
import copy | ||
|
@@ -101,8 +102,9 @@ | |
from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar | ||
from easybuild.tools.package.utilities import package | ||
from easybuild.tools.repository.repository import init_repository | ||
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw | ||
from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group | ||
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures | ||
from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group | ||
from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA | ||
from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str | ||
from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg | ||
from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION | ||
|
@@ -3193,6 +3195,59 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs): | |
self.cfg['builddependencies'] = builddeps | ||
self.cfg.iterating = False | ||
|
||
def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): | ||
"""Sanity check that binaries/libraries contain device code for the correct architecture targets.""" | ||
|
||
self.log.info("Checking binaries/libraries for CUDA device code...") | ||
|
||
fails = [] | ||
cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) | ||
|
||
if cuda_dirs is None: | ||
cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() | ||
|
||
if not cuda_dirs: | ||
cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS | ||
self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", | ||
cuda_dirs) | ||
else: | ||
self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This info message seems wrong: these are not the default subdirectories, this is a custom defined |
||
cuda_dirs) | ||
|
||
for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: | ||
if os.path.exists(dirpath): | ||
self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}") | ||
|
||
for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: | ||
self.log.debug("Sanity checking for CUDA device code in %s", path) | ||
|
||
derived_ccs = get_cuda_device_code_architectures(path) | ||
|
||
if derived_ccs is None: | ||
msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check" | ||
self.log.debug(msg) | ||
else: | ||
# check whether device code architectures match cuda_compute_capabilities | ||
additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) | ||
missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) | ||
|
||
if additional_ccs or missing_ccs: | ||
fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " | ||
if additional_ccs: | ||
fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) | ||
if missing_ccs: | ||
fail_msg += "Missing compute capabilities: %s." % ', '.join(sorted(missing_ccs)) | ||
self.log.warning(fail_msg) | ||
fails.append(fail_msg) | ||
else: | ||
msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " | ||
"those in cuda_compute_capabilities") | ||
self.log.debug(msg) | ||
else: | ||
self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") | ||
|
||
return fails | ||
|
||
def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): | ||
"""Sanity check binaries/libraries w.r.t. RPATH linking.""" | ||
|
||
|
@@ -3782,6 +3837,14 @@ def xs2str(xs): | |
else: | ||
self.log.debug("Skipping RPATH sanity check") | ||
|
||
if get_software_root('CUDA'): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @boegel We have an EESSI-specific complication here. We drop CUDA to a build time dep so that we don't depend on the CUDA module at runtime. This means that we won't execute this code path so we need to trigger the module load here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you're right, but just to double check: are the build dependencies unloaded at sanity check time? Could we fix this through an EasyBuild hook in EESSI, that loads the CUDA that was a build dependency also in the sanity_check_step (and unloads after)? Should also work for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wait... actually, I don't think you're right. Because I just did this with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder what happens in the |
||
cuda_fails = self.sanity_check_cuda() | ||
if cuda_fails: | ||
self.log.warning("CUDA device code sanity check failed!") | ||
self.sanity_check_fail_msgs.extend(cuda_fails) | ||
else: | ||
self.log.debug("Skipping CUDA device code sanity check") | ||
|
||
# pass or fail | ||
if self.sanity_check_fail_msgs: | ||
raise EasyBuildError( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
|
||
* Jens Timmerman (Ghent University) | ||
* Ward Poelmans (Ghent University) | ||
* Jasper Grimm (UoY) | ||
""" | ||
import ctypes | ||
import errno | ||
|
@@ -963,6 +964,64 @@ def get_glibc_version(): | |
return glibc_ver | ||
|
||
|
||
def get_cuda_object_dump_raw(path):
    """
    Get raw output from command which extracts information from CUDA binary files in a human-readable format,
    or None for files containing no CUDA device code.
    See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump

    :param path: path to the file to inspect
    :return: output of 'cuobjdump <path>' if that command succeeds; None if the 'file' command fails,
             if 'file' does not report an executable/object/library, or if 'cuobjdump' exits with an error
             (presumably the case for files without CUDA device code - to be confirmed against cuobjdump docs)
    """
    # local import, so this function does not rely on 'shlex' being imported at module level
    import shlex

    # quote the path so whitespace or shell metacharacters in it cannot break (or alter) the commands below;
    # paths consisting only of safe characters are left untouched by shlex.quote
    quoted_path = shlex.quote(path)

    res = run_shell_cmd("file %s" % quoted_path, fail_on_error=False, hidden=True, output_file=False,
                        stream_output=False)
    if res.exit_code != EasyBuildExit.SUCCESS:
        fail_msg = "Failed to run 'file %s': %s" % (path, res.output)
        _log.warning(fail_msg)
        # the output of a failed 'file' command is an error message (which may echo the path),
        # so it must not be used to decide whether this is a binary worth passing to cuobjdump
        return None

    # check that the file is an executable or library/object
    if not any(x in res.output for x in ['executable', 'object', 'library']):
        return None

    cuda_cmd = f"cuobjdump {quoted_path}"
    res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code == EasyBuildExit.SUCCESS:
        return res.output

    # a failing cuobjdump is not treated as an error here: only log at debug level and report "nothing found"
    msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'"
    _log.debug(msg % (path, cuda_cmd, res.output))
    return None
|
||
|
||
def get_cuda_device_code_architectures(path): | ||
""" | ||
Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the | ||
same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90). | ||
Returns None if no CUDA device code is present in the file. | ||
""" | ||
|
||
# cuobjdump uses the sm_XY format | ||
device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be good to also capture whether the code can be jit compiled (so it can at least run on a future arch). In a script I had I did this with:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In fact: To have a concrete example of something that has both, one can check e.g.
|
||
|
||
# resolve symlinks | ||
if os.path.islink(path) and os.path.exists(path): | ||
path = os.path.realpath(path) | ||
|
||
cuda_raw = get_cuda_object_dump_raw(path) | ||
if cuda_raw is None: | ||
return None | ||
|
||
# extract unique architectures from raw dump | ||
matches = re.findall(device_code_regex, cuda_raw) | ||
if matches is not None: | ||
# convert match tuples into unique list of cuda compute capabilities | ||
# e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] | ||
matches = sorted(['.'.join(m) for m in set(matches)]) | ||
else: | ||
fail_msg = f"Failed to determine supported CUDA architectures from {path}" | ||
_log.warning(fail_msg) | ||
|
||
return matches | ||
|
||
|
||
def get_linked_libs_raw(path): | ||
""" | ||
Get raw output from command that reports linked libraries for dynamically linked executables/libraries, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'easybuild.tools.toolchain.toolchain.TOOLCHAIN_CAPABILITY_CUDA' imported but unused