Skip to content

Commit

Permalink
Add strict-cuda-sanity-check option and make sure we only fail the sa…
Browse files Browse the repository at this point in the history
…nity check on surplus CUDA archs if this option is set. Otherwise, print warning
  • Loading branch information
Caspar van Leeuwen committed Feb 21, 2025
1 parent 6b6d2c8 commit 6568909
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 4 deletions.
15 changes: 12 additions & 3 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -3321,6 +3321,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True):

fails = []
cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None)
strict_cc_check = build_option('strict_cuda_sanity_check')

# Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths
# to ignore, relative to the installation prefix)
Expand Down Expand Up @@ -3364,21 +3365,29 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True):
missing_ccs = list(set(cfg_ccs) - set(derived_ccs))

if additional_ccs or missing_ccs:
# Do we log this as warning or produce a sanity failure?
is_failure = False
fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. "
if additional_ccs:
fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs))
if strict_cc_check:
is_failure = True
if missing_ccs:
fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs))
is_failure = True
# We still log the result, but don't fail:
if path in ignore_file_list:
fail_msg += f"This failure will be ignored as {path} is listed in "
fail_msg += "'ignore_cuda_sanity_failures'."
self.log.warning(fail_msg)
is_failure = False

# Log warning or sanity error
if is_failure:
fails.append(fail_msg)
else:
self.log.warning(fail_msg)
fails.append(fail_msg)
else:
msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match "
msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match "
"those in cuda_compute_capabilities")
self.log.debug(msg)

Expand Down
1 change: 1 addition & 0 deletions easybuild/tools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
'pre_create_installdir',
'show_progress_bar',
'strict_rpath_sanity_check',
'strict_cuda_sanity_check',
'trace',
],
EMPTY_LIST: [
Expand Down
9 changes: 8 additions & 1 deletion easybuild/tools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,9 +540,16 @@ def override_options(self):
"Git commit to use for the target software build (robot capabilities are automatically disabled)",
None, 'store', None),
'sticky-bit': ("Set sticky bit on newly created directories", None, 'store_true', False),
'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involces unsetting "
'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting "
"$LD_LIBRARY_PATH before checking whether all required libraries are found",
None, 'store_true', False),
'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity "
"check will fail if the CUDA binaries don't contain code for (at least) "
"all compute capabilities defined in --cuda-compute-capabilities, but will "
"accept if code for additional compute capabilities is present. "
"With this setting, the sanity check will also fail if code is present for "
"more compute capabilities than defined in --cuda-compute-capabilities.",
None, 'store_true', False),
'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include",
None, 'store', None),
'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'),
Expand Down

0 comments on commit 6568909

Please sign in to comment.