From e329d461957f0d6c05bbfd894634cb9ac19920c1 Mon Sep 17 00:00:00 2001 From: jfgrimm Date: Thu, 24 Oct 2024 16:09:36 +0100 Subject: [PATCH 1/8] sanity check binaries/libraries for device code matching cuda_compute_capabilities when CUDA is used --- easybuild/framework/easyblock.py | 67 +++++++++++++++++++++++++++++++- easybuild/tools/systemtools.py | 59 ++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index cfe0220202..9f0e0a98d7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -40,6 +40,7 @@ * Maxime Boissonneault (Compute Canada) * Davide Vanzo (Vanderbilt University) * Caspar van Leeuwen (SURF) +* Jasper Grimm (UoY) """ import concurrent import copy @@ -101,8 +102,9 @@ from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar from easybuild.tools.package.utilities import package from easybuild.tools.repository.repository import init_repository -from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw -from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group +from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures +from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group +from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION @@ -3193,6 +3195,59 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs): self.cfg['builddependencies'] = builddeps self.cfg.iterating = False + def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): + """Sanity check that binaries/libraries contain device code for the correct architecture targets.""" + + self.log.info("Checking binaries/libraries for CUDA device code...") + + fails = [] + cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + + if cuda_dirs is None: + cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() + + if not cuda_dirs: + cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS + self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) + else: + self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) + + for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: + if os.path.exists(dirpath): + self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}") + + for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: + self.log.debug("Sanity checking for CUDA device code in %s", path) + + derived_ccs = get_cuda_device_code_architectures(path) + + if derived_ccs is None: + msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check" + self.log.debug(msg) + else: + # check whether device code architectures match cuda_compute_capabilities + additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) + missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) + + if additional_ccs or missing_ccs: + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + if additional_ccs: + fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) + if missing_ccs: + fail_msg += "Missing compute capabilities: %s." % ', '.join(sorted(missing_ccs)) + self.log.warning(fail_msg) + fails.append(fail_msg) + else: + msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " + "those in cuda_compute_capabilities") + self.log.debug(msg) + else: + self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + + return fails + def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): """Sanity check binaries/libraries w.r.t. RPATH linking.""" @@ -3782,6 +3837,14 @@ def xs2str(xs): else: self.log.debug("Skipping RPATH sanity check") + if get_software_root('CUDA'): + cuda_fails = self.sanity_check_cuda() + if cuda_fails: + self.log.warning("CUDA device code sanity check failed!") + self.sanity_check_fail_msgs.extend(cuda_fails) + else: + self.log.debug("Skipping CUDA device code sanity check") + # pass or fail if self.sanity_check_fail_msgs: raise EasyBuildError( diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index def6fbf3f1..9a1a337082 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -29,6 +29,7 @@ * Jens Timmerman (Ghent University) * Ward Poelmans (Ghent University) +* Jasper Grimm (UoY) """ import ctypes import errno @@ -963,6 +964,64 @@ def get_glibc_version(): return glibc_ver +def get_cuda_object_dump_raw(path): + """ + Get raw ouput from command which extracts information from CUDA binary files in a human-readable format, + or None for files containing no CUDA device code. + See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump + """ + + res = run_shell_cmd("file %s" % path, fail_on_error=False, hidden=True, output_file=False, stream_output=False) + if res.exit_code != EasyBuildExit.SUCCESS: + fail_msg = "Failed to run 'file %s': %s" % (path, res.output) + _log.warning(fail_msg) + + # check that the file is an executable or library/object + if any(x in res.output for x in ['executable', 'object', 'library']): + cuda_cmd = f"cuobjdump {path}" + else: + return None + + res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False) + if res.exit_code == EasyBuildExit.SUCCESS: + return res.output + else: + msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" + _log.debug(msg % (path, cuda_cmd, res.output)) + return None + + +def get_cuda_device_code_architectures(path): + """ + Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the + same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90). + Returns None if no CUDA device code is present in the file. + """ + + # cudaobjdump uses the sm_XY format + device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})') + + # resolve symlinks + if os.path.islink(path) and os.path.exists(path): + path = os.path.realpath(path) + + cuda_raw = get_cuda_object_dump_raw(path) + if cuda_raw is None: + return None + + # extract unique architectures from raw dump + matches = re.findall(device_code_regex, cuda_raw) + if matches is not None: + # convert match tuples into unique list of cuda compute capabilities + # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] + matches = sorted(['.'.join(m) for m in set(matches)]) + else: + fail_msg = f"Failed to determine supported CUDA architectures from {path}" + _log.warning(fail_msg) + + return matches + + def get_linked_libs_raw(path): """ Get raw output from command that reports linked libraries for dynamically linked executables/libraries, From ee63b8e64f834d20c1d695e17346e5315217b64e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 02:40:05 +0100 Subject: [PATCH 2/8] Add check for PTX, more explicit debug logging --- easybuild/framework/easyblock.py | 31 +++++++++-- easybuild/tools/systemtools.py | 88 ++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 15 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index b9a3f341a4..80ecd25bbe 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3322,6 +3322,11 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + # If there are no CUDA compute capabilities defined, return + if cfg_ccs is None or len(cfg_ccs) == 0: + self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") + return fails + if cuda_dirs is None: cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() @@ -3340,12 +3345,17 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: self.log.debug("Sanity checking for CUDA device code in %s", path) - derived_ccs = get_cuda_device_code_architectures(path) - - if derived_ccs is None: - msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check" + res = get_cuda_device_code_architectures(path) + if res is None: + msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " + msg += "so skipping CUDA sanity check." self.log.debug(msg) + return fails else: + # unpack results + derived_ccs = res.device_code_archs + derived_ptx_ccs = res.ptx_archs + # check whether device code architectures match cuda_compute_capabilities additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) @@ -3362,6 +3372,19 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " "those in cuda_compute_capabilities") self.log.debug(msg) + + # Check whether there is ptx code for the highest CC in cfg_ccs + highest_cc = sorted(cfg_ccs)[-1] + missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) + + if missing_ptx_ccs: + fail_msg = "Configured highest compute capability was '%s', " + fail_msg += "but no PTX code for this compute capability was found in '%s'" + self.log.warning(fail_msg, highest_cc, missing_ptx_ccs) + else: + msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the" + " highest CUDA compute capability in cuda_compute_capabilities") + self.log.debug(msg) else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index cfcdb4bfe4..5afe1717e8 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -43,7 +43,7 @@ import sys import termios import warnings -from collections import OrderedDict +from collections import OrderedDict, namedtuple from ctypes.util import find_library from socket import gethostname @@ -215,6 +215,14 @@ } +# A named tuple, to be returned by e.g. `get_cuda_device_code_architectures` +cuda_dev_ptx_archs = namedtuple('cuda_dev_ptx_archs', ('device_code_archs', 'ptx_archs')) +cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to get_cuda_device_code_architectures, +with the following fields: +- device_code_archs: a list of CUDA device compute capabilities for which device code was found +- ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found +""" + class SystemToolsException(Exception): """raised when systemtools fails""" @@ -986,8 +994,15 @@ def get_cuda_object_dump_raw(path): if res.exit_code == EasyBuildExit.SUCCESS: return res.output else: - msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" - _log.debug(msg % (path, cuda_cmd, res.output)) + # Check and report for the common case that this is simply not a CUDA binary, i.e. does not + # contain CUDA device code + no_device_code_match = re.search(r'does not contain device code', res.output) + if no_device_code_match is not None: + msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file" + _log.debug(msg, path) + else: + msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" + _log.debug(msg, path, cuda_cmd, res.output) return None @@ -998,8 +1013,27 @@ def get_cuda_device_code_architectures(path): Returns None if no CUDA device code is present in the file. """ - # cudaobjdump uses the sm_XY format - device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})') + # Note that typical output for a cuobjdump call will look like this for device code: + # + # Fatbin elf code: + # ================ + # arch = sm_90 + # code version = [1,7] + # host = linux + # compile_size = 64bit + # + # And for ptx code, it will look like this: + # + # Fatbin ptx code: + # ================ + # arch = sm_90 + # code version = [8,1] + # host = linux + # compile_size = 64bit + + # Pattern to extract elf code architectures and ptx code architectures respectively + device_code_regex = re.compile('Fatbin elf code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})') + ptx_code_regex = re.compile('Fatbin ptx code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})') # resolve symlinks if os.path.islink(path) and os.path.exists(path): @@ -1009,17 +1043,49 @@ def get_cuda_device_code_architectures(path): if cuda_raw is None: return None - # extract unique architectures from raw dump - matches = re.findall(device_code_regex, cuda_raw) - if matches is not None: + # extract unique device code architectures from raw dump + device_code_matches = re.findall(device_code_regex, cuda_raw) + if device_code_matches is not None: + # convert match tuples into unique list of cuda compute capabilities + # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] + device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)]) + else: + # Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing? + device_section_regex = re.compile('Fatbin elf code') + device_section_matches = re.findall(device_section_regex, cuda_raw) + if device_section_matches is not None: + fail_msg = f"Found Fatbin elf code section(s) in cuobjdump output for {path}, " + fail_msg += "but failed to extract CUDA architecture" + else: + # In this case, the cuobjdump command _likely_ already returned a non-zero exit + # This error message would only be displayed if cuobjdump somehow completely successfully + # but still no Fatbin elf code section was found + fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, " + fail_msg += "are you sure this is a CUDA binary?" + _log.warning(fail_msg) + + # extract unique ptx code architectures from raw dump + ptx_code_matches = re.findall(ptx_code_regex, cuda_raw) + if ptx_code_matches is not None: # convert match tuples into unique list of cuda compute capabilities # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - matches = sorted(['.'.join(m) for m in set(matches)]) + ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)]) else: - fail_msg = f"Failed to determine supported CUDA architectures from {path}" + # Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing? + ptx_section_regex = re.compile('Fatbin ptx code') + ptx_section_matches = re.findall(ptx_section_regex, cuda_raw) + if ptx_section_matches is not None: + fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, " + fail_msg += "but failed to extract CUDA architecture" + else: + # In this case, the cuobjdump command _likely_ already returned a non-zero exit + # This error message would only be displayed if cuobjdump somehow completely successfully + # but still no Fatbin ptx code section was found + fail_msg = f"Failed to find Fatbin ptx code section(s) in cuobjdump output for {path}, " + fail_msg += "are you sure this is a CUDA binary?" _log.warning(fail_msg) - return matches + return cuda_dev_ptx_archs(ptx_archs=ptx_code_matches, device_code_archs=device_code_matches) def get_linked_libs_raw(path): From de6d49d8186708b63b3dbae8e0f847e736797fb1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 02:42:01 +0100 Subject: [PATCH 3/8] That return should not be there, as it will stop the sanity check after the first non-cuda file. that's wrong --- easybuild/framework/easyblock.py | 1 - 1 file changed, 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 80ecd25bbe..0a718fd31f 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3350,7 +3350,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " msg += "so skipping CUDA sanity check." self.log.debug(msg) - return fails else: # unpack results derived_ccs = res.device_code_archs From 0e97868b72a3868b824c96ff0f28f26029aaac8a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 02:56:12 +0100 Subject: [PATCH 4/8] Fix some logic in the PTX warning printed --- easybuild/framework/easyblock.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 0a718fd31f..747838f959 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3373,13 +3373,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.debug(msg) # Check whether there is ptx code for the highest CC in cfg_ccs - highest_cc = sorted(cfg_ccs)[-1] + highest_cc = [sorted(cfg_ccs)[-1]] missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) if missing_ptx_ccs: fail_msg = "Configured highest compute capability was '%s', " - fail_msg += "but no PTX code for this compute capability was found in '%s'" - self.log.warning(fail_msg, highest_cc, missing_ptx_ccs) + fail_msg += "but no PTX code for this compute capability was found in '%s' " + fail_msg += "PTX architectures supported in that file: %s" + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the" " highest CUDA compute capability in cuda_compute_capabilities") From 6b6d2c8b77fd7f4a854f56b1d8f69fb4493167c1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 23:42:04 +0100 Subject: [PATCH 5/8] Add option for ignoring individual files in the CUDA sanity check --- easybuild/framework/easyblock.py | 16 +++++++++++++--- easybuild/framework/easyconfig/default.py | 5 +++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 747838f959..9631da2617 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3322,6 +3322,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + # Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths + # to ignore, relative to the installation prefix) + ignore_file_list = [os.path.join(self.installdir, d) for d in self.cfg['cuda_sanity_ignore_files']] + # If there are no CUDA compute capabilities defined, return if cfg_ccs is None or len(cfg_ccs) == 0: self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") @@ -3364,9 +3368,15 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if additional_ccs: fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) if missing_ccs: - fail_msg += "Missing compute capabilities: %s." % ', '.join(sorted(missing_ccs)) - self.log.warning(fail_msg) - fails.append(fail_msg) + fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) + # We still log the result, but don't fail: + if path in ignore_file_list: + fail_msg += f"This failure will be ignored as {path} is listed in " + fail_msg += "'ignore_cuda_sanity_failures'." + self.log.warning(fail_msg) + else: + self.log.warning(fail_msg) + fails.append(fail_msg) else: msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " "those in cuda_compute_capabilities") diff --git a/easybuild/framework/easyconfig/default.py b/easybuild/framework/easyconfig/default.py index bca46c3856..80319c6ec9 100644 --- a/easybuild/framework/easyconfig/default.py +++ b/easybuild/framework/easyconfig/default.py @@ -126,6 +126,11 @@ 'after make (for e.g.,"test" for make test)'), BUILD], 'bin_lib_subdirs': [[], "List of subdirectories for binaries and libraries, which is used during sanity check " "to check RPATH linking and banned/required libraries", BUILD], + 'cuda_sanity_ignore_files': [[], "List of files (relative to the installation prefix) for which failurs in " + "the CUDA sanity check step are ignored. Typically used for files where you " + "know the CUDA architectures in those files don't match the " + "--cuda-compute-capabitilities configured for EasyBuild AND where you know " + "that this is ok / reasonable (e.g. binary installations)", BUILD], 'sanity_check_commands': [[], ("format: [(name, options)] e.g. [('gzip','-h')]. " "Using a non-tuple is equivalent to (name, '-h')"), BUILD], 'sanity_check_paths': [{}, ("List of files and directories to check " From 6568909bb43d0108eea310a0e7427be2c7e12295 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 21 Feb 2025 21:44:52 +0100 Subject: [PATCH 6/8] Add strict-cuda-sanity-check option and make sure we only fail the sanity check on surpluss CUDA archs if this option is set. Otherwise, print warning --- easybuild/framework/easyblock.py | 15 ++++++++++++--- easybuild/tools/config.py | 1 + easybuild/tools/options.py | 9 ++++++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 9631da2617..6291c0fba7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3321,6 +3321,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + strict_cc_check = build_option('strict_cuda_sanity_check') # Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths # to ignore, relative to the installation prefix) @@ -3364,21 +3365,29 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) if additional_ccs or missing_ccs: + # Do we log this as warning or produce a sanity failure? + is_failure = False fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " if additional_ccs: fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) + if strict_cc_check: + is_failure = True if missing_ccs: fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) + is_failure = True # We still log the result, but don't fail: if path in ignore_file_list: fail_msg += f"This failure will be ignored as {path} is listed in " fail_msg += "'ignore_cuda_sanity_failures'." - self.log.warning(fail_msg) + is_failure = False + + # Log warning or sanity error + if is_failure: + fails.append(fail_msg) else: self.log.warning(fail_msg) - fails.append(fail_msg) else: - msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " + msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match " "those in cuda_compute_capabilities") self.log.debug(msg) diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index 3503d5c2f5..04e86e8562 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -358,6 +358,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'pre_create_installdir', 'show_progress_bar', 'strict_rpath_sanity_check', + 'strict_cuda_sanity_check', 'trace', ], EMPTY_LIST: [ diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 70671ce20f..0fb439baf4 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -540,9 +540,16 @@ def override_options(self): "Git commit to use for the target software build (robot capabilities are automatically disabled)", None, 'store', None), 'sticky-bit': ("Set sticky bit on newly created directories", None, 'store_true', False), - 'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involces unsetting " + 'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting " "$LD_LIBRARY_PATH before checking whether all required libraries are found", None, 'store_true', False), + 'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " + "check will fail if the CUDA binaries don't contain code for (at least) " + "all compute capabilities defined in --cude-compute-capabilities, but will " + "accept if code for additional compute capabilities is present. " + "With this setting, the sanity check will also fail if code is present for " + "more compute capabilities than defined in --cuda-compute-capabilities.", + None, 'store_true', False), 'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include", None, 'store', None), 'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'), From 3d07ef6ad3eba20bb6e0c87565ee90eaa9dc552f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 21 Feb 2025 23:55:34 +0100 Subject: [PATCH 7/8] This is a work in progress for creating a set of tests... --- test/framework/toy_build.py | 93 ++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index cd9aefb26f..13e121bf71 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -53,7 +53,7 @@ from easybuild.main import main_with_hooks from easybuild.tools.build_log import EasyBuildError from easybuild.tools.config import get_module_syntax, get_repositorypath -from easybuild.tools.environment import modify_env +from easybuild.tools.environment import modify_env, setvar from easybuild.tools.filetools import adjust_permissions, change_dir, copy_file, mkdir, move_file from easybuild.tools.filetools import read_file, remove_dir, remove_file, which, write_file from easybuild.tools.module_generator import ModuleGeneratorTcl @@ -3008,6 +3008,97 @@ def test_toy_filter_rpath_sanity_libs(self): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, name='toy-app', raise_error=True, verbose=False) + def test_toy_cuda_sanity_check(self): + """Test the CUDA sanity check""" + # We need to mock a cuobjdump executable and prepend in on the PATH + # First, make sure we can restore environment at the end of this test + start_env = copy.deepcopy(os.environ) + + # Create mock cuobjdump + # First, lets define sections of echo's for cuobjdump for various scenarios + + # Shebang for cuobjdump + cuobjdump_txt_shebang = "#!/bin/bash\n" + + # Section for cuobjdump printing output for sm_80 architecture + cuobjdump_txt_sm80 = '\n'.join([ + "echo 'Fatbin elf code:'" + "echo '================'" + "echo 'arch = sm_80'" + "echo 'code version = [1,7]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo ''" + ]) + + # Section for cuobjdump printing output for sm_90 architecture + cuobjdump_txt_sm90 = '\n'.join([ + "echo 'Fatbin elf code:'" + "echo '================'" + "echo 'arch = sm_90'" + "echo 'code version = [1,7]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo ''" + ]) + + # Section for cuobjdump printing output for sm_80 PTX code + cuobjdump_txt_sm80_ptx = '\n'.join([ + "echo 'Fatbin ptx code:'" + "echo '================'" + "echo 'arch = sm_80'" + "echo 'code version = [8,1]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo 'compressed'" + ]) + + # Section for cuobjdump printing output for sm_90 PTX code + cuobjdump_txt_sm90_ptx = '\n'.join([ + "echo 'Fatbin ptx code:'" + "echo '================'" + "echo 'arch = sm_90'" + "echo 'code version = [8,1]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo 'compressed'" + ]) + + # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH + cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') + mkdir(cuobjdump_dir, parents=True) + + # Add cuobjdump_dir to the path + setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + + # Filepath to cuobjdump + cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') + + # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 EFL code + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + adjust_permission(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0', '--debug'] # Need debug so we can check output + test_report_fp = os.path.join(self.test_buildpath, 'full_test_report.md') + # We expect this to pass, so no need to check errors + regex = r"DEBUG Output of 'cuobjdump' checked for .*toy; " + regex += "device code architectures match those in cuda_compute_capabilities" + self.test_toy_build(extra_args=args, test_report=test_report_fp, raise_error=True + test_report_regexs=[regex]) + + + + + + # Test single CUDA compute capability with --cuda-compute-capabilities=8.0 + + # Test multiple CUDA compute capabilities with --cuda-compute-capabilities=8.0,9.0 + + # Test stric CUDA check with --cuda-compute-capabilities=8.0 and a binary that also contains also 9.0 code + + # Restore original environment + modify_env(os.environ, start_env, verbose=False) + def test_toy_modaltsoftname(self): """Build two dependent toys as in test_toy_toy but using modaltsoftname""" topdir = os.path.dirname(os.path.abspath(__file__)) From f13fca23e26a1c1851fe1fc6524785d331523f6f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Sat, 22 Feb 2025 02:50:24 +0100 Subject: [PATCH 8/8] First test working.. --- test/framework/toy_build.py | 74 ++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 13e121bf71..0eb1c28e78 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3014,6 +3014,10 @@ def test_toy_cuda_sanity_check(self): # First, make sure we can restore environment at the end of this test start_env = copy.deepcopy(os.environ) + # Define the toy_ec file we want to use + topdir = os.path.dirname(os.path.abspath(__file__)) + toy_ec = os.path.join(topdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb') + # Create mock cuobjdump # First, lets define sections of echo's for cuobjdump for various scenarios @@ -3022,48 +3026,54 @@ def test_toy_cuda_sanity_check(self): # Section for cuobjdump printing output for sm_80 architecture cuobjdump_txt_sm80 = '\n'.join([ - "echo 'Fatbin elf code:'" - "echo '================'" - "echo 'arch = sm_80'" - "echo 'code version = [1,7]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_80'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo ''" ]) # Section for cuobjdump printing output for sm_90 architecture cuobjdump_txt_sm90 = '\n'.join([ - "echo 'Fatbin elf code:'" - "echo '================'" - "echo 'arch = sm_90'" - "echo 'code version = [1,7]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_90'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo ''" ]) # Section for cuobjdump printing output for sm_80 PTX code cuobjdump_txt_sm80_ptx = '\n'.join([ - "echo 'Fatbin ptx code:'" - "echo '================'" - "echo 'arch = sm_80'" - "echo 'code version = [8,1]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin ptx code:'", + "echo '================'", + "echo 'arch = sm_80'", + "echo 'code version = [8,1]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo 'compressed'" ]) # Section for cuobjdump printing output for sm_90 PTX code cuobjdump_txt_sm90_ptx = '\n'.join([ - "echo 'Fatbin ptx code:'" - "echo '================'" - "echo 'arch = sm_90'" - "echo 'code version = [8,1]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin ptx code:'", + "echo '================'", + "echo 'arch = sm_90'", + "echo 'code version = [8,1]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo 'compressed'" ]) + # Created regex for success and failures + device_code_regex_pattern = r"DEBUG Output of 'cuobjdump' checked for .*/bin/toy; device code " + device_code_regex_pattern += "architectures match those in cuda_compute_capabilities" + device_code_regex = re.compile(device_code_regex_pattern, re.M) + # TODO: create regex for failures + # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') mkdir(cuobjdump_dir, parents=True) @@ -3071,20 +3081,24 @@ def test_toy_cuda_sanity_check(self): # Add cuobjdump_dir to the path setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + # Pretend we have CUDA loaded, or the sanity check won't run + setvar('EBROOTCUDA', '/foo/bar') + # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 EFL code write_file(cuobjdump_file, cuobjdump_txt_shebang), write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - adjust_permission(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0', '--debug'] # Need debug so we can check output + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0'] test_report_fp = os.path.join(self.test_buildpath, 'full_test_report.md') # We expect this to pass, so no need to check errors - regex = r"DEBUG Output of 'cuobjdump' checked for .*toy; " - regex += "device code architectures match those in cuda_compute_capabilities" - self.test_toy_build(extra_args=args, test_report=test_report_fp, raise_error=True - test_report_regexs=[regex]) + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False) + msg = "Patter %s found in full build log:\n%s" % (device_code_regex.pattern, outtxt) + self.assertTrue(device_code_regex.search(outtxt), msg) +