-
Notifications
You must be signed in to change notification settings - Fork 204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add a CUDA device code sanity check #4692
base: 5.0.x
Are you sure you want to change the base?
Changes from all commits
e329d46
c8cece2
ee63b8e
de6d49d
0e97868
6b6d2c8
6568909
3d07ef6
f13fca2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,7 @@ | |
* Davide Vanzo (Vanderbilt University) | ||
* Caspar van Leeuwen (SURF) | ||
* Jan Andre Reuter (Juelich Supercomputing Centre) | ||
* Jasper Grimm (UoY) | ||
""" | ||
import concurrent | ||
import copy | ||
|
@@ -107,8 +108,9 @@ | |
from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar | ||
from easybuild.tools.package.utilities import package | ||
from easybuild.tools.repository.repository import init_repository | ||
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw | ||
from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group | ||
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures | ||
from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group | ||
from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA | ||
from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str | ||
from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg | ||
from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION | ||
|
@@ -3312,6 +3314,101 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs): | |
self.cfg['builddependencies'] = builddeps | ||
self.cfg.iterating = False | ||
|
||
def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True):
    """
    Sanity check that binaries/libraries contain device code for the correct architecture targets.

    :param cuda_dirs: subdirectories (relative to the installation directory) to scan for files;
                      if None, the 'bin_lib_subdirs' easyconfig parameter is used, then
                      self.bin_lib_subdirs(), and finally DEFAULT_BIN_LIB_SUBDIRS
    :param check_cuobjdump: not used by this method; kept so existing callers keep working
    :return: list of failure messages (empty if the check passed or was skipped)
    """

    def cc_sort_key(cc):
        """Return a key that orders compute capabilities numerically ('9.0' < '9.0a' < '10.0')."""
        key = []
        for part in str(cc).split('.'):
            digits = ''
            for char in part:
                if not char.isdigit():
                    break
                digits += char
            # numeric prefix decides the order, full component text (e.g. '0a') breaks ties
            key.append((int(digits) if digits else -1, part))
        return key

    self.log.info("Checking binaries/libraries for CUDA device code...")

    fails = []
    cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None)
    strict_cc_check = build_option('strict_cuda_sanity_check')

    # Construct the collection of files to ignore as full paths (cuda_sanity_ignore_files contains the paths
    # to ignore, relative to the installation prefix)
    ignore_file_list = {os.path.join(self.installdir, d) for d in self.cfg['cuda_sanity_ignore_files']}

    # If there are no CUDA compute capabilities defined, there is nothing to compare against
    if not cfg_ccs:
        self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities were configured")
        return fails

    if cuda_dirs is None:
        cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs()

    if not cuda_dirs:
        cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS
        self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s",
                      cuda_dirs)
    else:
        # these were specified by the caller or the easyconfig/easyblock, so they are not the defaults
        self.log.info("Using specified subdirectories for binaries/libraries to verify CUDA device code: %s",
                      cuda_dirs)

    # PTX code is checked for the highest configured compute capability; this is loop-invariant,
    # and must be determined numerically (a plain string sort would rank '9.0' above '10.0')
    highest_cc = max(cfg_ccs, key=cc_sort_key)

    for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]:
        if not os.path.exists(dirpath):
            self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}")
            continue

        self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}")

        for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]:
            self.log.debug("Sanity checking for CUDA device code in %s", path)

            res = get_cuda_device_code_architectures(path)
            if res is None:
                # None is treated as "no CUDA device code in this file"
                msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), "
                msg += "so skipping CUDA sanity check."
                self.log.debug(msg)
                continue

            # unpack results
            derived_ccs = res.device_code_archs
            derived_ptx_ccs = res.ptx_archs

            # check whether device code architectures match cuda_compute_capabilities
            additional_ccs = sorted(set(derived_ccs) - set(cfg_ccs))
            missing_ccs = sorted(set(cfg_ccs) - set(derived_ccs))

            if additional_ccs or missing_ccs:
                # Surplus architectures only fail under the strict check, missing ones always fail
                is_failure = False
                fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. "
                if additional_ccs:
                    fail_msg += "Surplus compute capabilities: %s. " % ', '.join(additional_ccs)
                    if strict_cc_check:
                        is_failure = True
                if missing_ccs:
                    fail_msg += "Missing compute capabilities: %s. " % ', '.join(missing_ccs)
                    is_failure = True
                # We still log the result, but don't fail:
                if path in ignore_file_list:
                    fail_msg += f"This failure will be ignored as {path} is listed in "
                    fail_msg += "'cuda_sanity_ignore_files'."
                    is_failure = False

                # Log warning or sanity error
                if is_failure:
                    fails.append(fail_msg)
                else:
                    self.log.warning(fail_msg)
            else:
                msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match "
                       "those in cuda_compute_capabilities")
                self.log.debug(msg)

            # Check whether there is ptx code for the highest CC in cfg_ccs (only ever a warning)
            if highest_cc not in derived_ptx_ccs:
                fail_msg = "Configured highest compute capability was '%s', "
                fail_msg += "but no PTX code for this compute capability was found in '%s'. "
                fail_msg += "PTX architectures supported in that file: %s"
                self.log.warning(fail_msg, highest_cc, path, derived_ptx_ccs)
            else:
                msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the"
                       " highest CUDA compute capability in cuda_compute_capabilities")
                self.log.debug(msg)

    return fails
|
||
def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): | ||
"""Sanity check binaries/libraries w.r.t. RPATH linking.""" | ||
|
||
|
@@ -3900,6 +3997,14 @@ def xs2str(xs): | |
else: | ||
self.log.debug("Skipping RPATH sanity check") | ||
|
||
if get_software_root('CUDA'): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @boegel We have an EESSI-specific complication here. We drop CUDA to a build time dep so that we don't depend on the CUDA module at runtime. This means that we won't execute this code path so we need to trigger the module load here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you're right, but just to double check: are the build dependencies unloaded at sanity check time? Could we fix this through an EasyBuild hook in EESSI, that loads the CUDA that was a build dependency also in the sanity_check_step (and unloads after)? Should also work for […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wait... actually, I don't think you're right. Because I just did this with […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wonder what happens in the […] |
||
cuda_fails = self.sanity_check_cuda() | ||
if cuda_fails: | ||
self.log.warning("CUDA device code sanity check failed!") | ||
self.sanity_check_fail_msgs.extend(cuda_fails) | ||
else: | ||
self.log.debug("Skipping CUDA device code sanity check") | ||
|
||
# pass or fail | ||
if self.sanity_check_fail_msgs: | ||
raise EasyBuildError( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'easybuild.tools.toolchain.toolchain.TOOLCHAIN_CAPABILITY_CUDA' imported but unused