Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add a CUDA device code sanity check #4692

Draft
wants to merge 9 commits into
base: 5.0.x
Choose a base branch
from
109 changes: 107 additions & 2 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
* Davide Vanzo (Vanderbilt University)
* Caspar van Leeuwen (SURF)
* Jan Andre Reuter (Juelich Supercomputing Centre)
* Jasper Grimm (UoY)
"""
import concurrent
import copy
Expand Down Expand Up @@ -107,8 +108,9 @@
from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar
from easybuild.tools.package.utilities import package
from easybuild.tools.repository.repository import init_repository
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw
from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures
from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group
from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'easybuild.tools.toolchain.toolchain.TOOLCHAIN_CAPABILITY_CUDA' imported but unused

from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str
from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg
from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION
Expand Down Expand Up @@ -3312,6 +3314,101 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs):
self.cfg['builddependencies'] = builddeps
self.cfg.iterating = False

def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True):
"""Sanity check that binaries/libraries contain device code for the correct architecture targets."""

self.log.info("Checking binaries/libraries for CUDA device code...")

fails = []
cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None)
strict_cc_check = build_option('strict_cuda_sanity_check')

# Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths
# to ignore, relative to the installation prefix)
ignore_file_list = [os.path.join(self.installdir, d) for d in self.cfg['cuda_sanity_ignore_files']]

# If there are no CUDA compute capabilities defined, return
if cfg_ccs is None or len(cfg_ccs) == 0:
self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured")
return fails

if cuda_dirs is None:
cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs()

if not cuda_dirs:
cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS
self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s",
cuda_dirs)
else:
self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s",
Copy link
Contributor

@casparvl casparvl Feb 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This info message seems wrong: this is not the default subdirectories, this is a custom defined bin_lib_subdirs

cuda_dirs)

for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]:
if os.path.exists(dirpath):
self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}")

for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]:
self.log.debug("Sanity checking for CUDA device code in %s", path)

res = get_cuda_device_code_architectures(path)
if res is None:
msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), "
msg += "so skipping CUDA sanity check."
self.log.debug(msg)
else:
# unpack results
derived_ccs = res.device_code_archs
derived_ptx_ccs = res.ptx_archs

# check whether device code architectures match cuda_compute_capabilities
additional_ccs = list(set(derived_ccs) - set(cfg_ccs))
missing_ccs = list(set(cfg_ccs) - set(derived_ccs))

if additional_ccs or missing_ccs:
# Do we log this as warning or produce a sanity failure?
is_failure = False
fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. "
if additional_ccs:
fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs))
if strict_cc_check:
is_failure = True
if missing_ccs:
fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs))
is_failure = True
# We still log the result, but don't fail:
if path in ignore_file_list:
fail_msg += f"This failure will be ignored as {path} is listed in "
fail_msg += "'ignore_cuda_sanity_failures'."
is_failure = False

# Log warning or sanity error
if is_failure:
fails.append(fail_msg)
else:
self.log.warning(fail_msg)
else:
msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match "
"those in cuda_compute_capabilities")
self.log.debug(msg)

# Check whether there is ptx code for the highest CC in cfg_ccs
highest_cc = [sorted(cfg_ccs)[-1]]
missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs))

if missing_ptx_ccs:
fail_msg = "Configured highest compute capability was '%s', "
fail_msg += "but no PTX code for this compute capability was found in '%s' "
fail_msg += "PTX architectures supported in that file: %s"
self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs)
else:
msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the"
" highest CUDA compute capability in cuda_compute_capabilities")
self.log.debug(msg)
else:
self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}")

return fails

def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True):
"""Sanity check binaries/libraries w.r.t. RPATH linking."""

Expand Down Expand Up @@ -3900,6 +3997,14 @@ def xs2str(xs):
else:
self.log.debug("Skipping RPATH sanity check")

if get_software_root('CUDA'):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@boegel We have an EESSI-specific complication here. We drop CUDA to a build time dep so that we don't depend on the CUDA module at runtime. This means that we won't execute this code path so we need to trigger the module load here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right, but just to double check: are the build dependencies unloaded at sanity check time?

Could we fix this through an EasyBuild hook in EESSI, that loads the CUDA that was a build dependency also in the sanity_check_step (and unloads after)? Should also work for EESSI-extend, and no changes on the framework side needed...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait... actually, I don't think you're right. Because I just did this with EESSI-extend, and it did run the CUDA sanity check...? I'm not sure why, I would have expected the problem you mentioned. So... why didn't it appear?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder what happens in the --module-only case, when the sanity check step is being run without building first? Perhaps this really is expected behaviour?

cuda_fails = self.sanity_check_cuda()
if cuda_fails:
self.log.warning("CUDA device code sanity check failed!")
self.sanity_check_fail_msgs.extend(cuda_fails)
else:
self.log.debug("Skipping CUDA device code sanity check")

# pass or fail
if self.sanity_check_fail_msgs:
raise EasyBuildError(
Expand Down
5 changes: 5 additions & 0 deletions easybuild/framework/easyconfig/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@
'after make (for e.g.,"test" for make test)'), BUILD],
'bin_lib_subdirs': [[], "List of subdirectories for binaries and libraries, which is used during sanity check "
"to check RPATH linking and banned/required libraries", BUILD],
'cuda_sanity_ignore_files': [[], "List of files (relative to the installation prefix) for which failurs in "
"the CUDA sanity check step are ignored. Typically used for files where you "
"know the CUDA architectures in those files don't match the "
"--cuda-compute-capabitilities configured for EasyBuild AND where you know "
"that this is ok / reasonable (e.g. binary installations)", BUILD],
'sanity_check_commands': [[], ("format: [(name, options)] e.g. [('gzip','-h')]. "
"Using a non-tuple is equivalent to (name, '-h')"), BUILD],
'sanity_check_paths': [{}, ("List of files and directories to check "
Expand Down
1 change: 1 addition & 0 deletions easybuild/tools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
'pre_create_installdir',
'show_progress_bar',
'strict_rpath_sanity_check',
'strict_cuda_sanity_check',
'trace',
],
EMPTY_LIST: [
Expand Down
9 changes: 8 additions & 1 deletion easybuild/tools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,9 +540,16 @@ def override_options(self):
"Git commit to use for the target software build (robot capabilities are automatically disabled)",
None, 'store', None),
'sticky-bit': ("Set sticky bit on newly created directories", None, 'store_true', False),
'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involces unsetting "
'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting "
"$LD_LIBRARY_PATH before checking whether all required libraries are found",
None, 'store_true', False),
'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity "
"check will fail if the CUDA binaries don't contain code for (at least) "
"all compute capabilities defined in --cude-compute-capabilities, but will "
"accept if code for additional compute capabilities is present. "
"With this setting, the sanity check will also fail if code is present for "
"more compute capabilities than defined in --cuda-compute-capabilities.",
None, 'store_true', False),
'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include",
None, 'store', None),
'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'),
Expand Down
127 changes: 126 additions & 1 deletion easybuild/tools/systemtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

* Jens Timmerman (Ghent University)
* Ward Poelmans (Ghent University)
* Jasper Grimm (UoY)
"""
import ctypes
import errno
Expand All @@ -42,7 +43,7 @@
import sys
import termios
import warnings
from collections import OrderedDict
from collections import OrderedDict, namedtuple
from ctypes.util import find_library
from socket import gethostname

Expand Down Expand Up @@ -214,6 +215,14 @@
}


# A named tuple, to be returned by e.g. `get_cuda_device_code_architectures`
cuda_dev_ptx_archs = namedtuple('cuda_dev_ptx_archs', ('device_code_archs', 'ptx_archs'))
cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to get_cuda_device_code_architectures,
with the following fields:
- device_code_archs: a list of CUDA device compute capabilities for which device code was found
- ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found
"""

class SystemToolsException(Exception):
"""raised when systemtools fails"""

Expand Down Expand Up @@ -963,6 +972,122 @@ def get_glibc_version():
return glibc_ver


def get_cuda_object_dump_raw(path):
"""
Get raw ouput from command which extracts information from CUDA binary files in a human-readable format,
or None for files containing no CUDA device code.
See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump
"""

res = run_shell_cmd("file %s" % path, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
if res.exit_code != EasyBuildExit.SUCCESS:
fail_msg = "Failed to run 'file %s': %s" % (path, res.output)
_log.warning(fail_msg)

# check that the file is an executable or library/object
if any(x in res.output for x in ['executable', 'object', 'library']):
cuda_cmd = f"cuobjdump {path}"
else:
return None

res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
if res.exit_code == EasyBuildExit.SUCCESS:
return res.output
else:
# Check and report for the common case that this is simply not a CUDA binary, i.e. does not
# contain CUDA device code
no_device_code_match = re.search(r'does not contain device code', res.output)
if no_device_code_match is not None:
msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file"
_log.debug(msg, path)
else:
msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'"
_log.debug(msg, path, cuda_cmd, res.output)
return None


def get_cuda_device_code_architectures(path):
"""
Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the
same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90).
Returns None if no CUDA device code is present in the file.
"""

# Note that typical output for a cuobjdump call will look like this for device code:
#
# Fatbin elf code:
# ================
# arch = sm_90
# code version = [1,7]
# host = linux
# compile_size = 64bit
#
# And for ptx code, it will look like this:
#
# Fatbin ptx code:
# ================
# arch = sm_90
# code version = [8,1]
# host = linux
# compile_size = 64bit

# Pattern to extract elf code architectures and ptx code architectures respectively
device_code_regex = re.compile('Fatbin elf code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})')
ptx_code_regex = re.compile('Fatbin ptx code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})')

# resolve symlinks
if os.path.islink(path) and os.path.exists(path):
path = os.path.realpath(path)

cuda_raw = get_cuda_object_dump_raw(path)
if cuda_raw is None:
return None

# extract unique device code architectures from raw dump
device_code_matches = re.findall(device_code_regex, cuda_raw)
if device_code_matches is not None:
# convert match tuples into unique list of cuda compute capabilities
# e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0']
device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)])
else:
# Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing?
device_section_regex = re.compile('Fatbin elf code')
device_section_matches = re.findall(device_section_regex, cuda_raw)
if device_section_matches is not None:
fail_msg = f"Found Fatbin elf code section(s) in cuobjdump output for {path}, "
fail_msg += "but failed to extract CUDA architecture"
else:
# In this case, the cuobjdump command _likely_ already returned a non-zero exit
# This error message would only be displayed if cuobjdump somehow completely successfully
# but still no Fatbin elf code section was found
fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, "
fail_msg += "are you sure this is a CUDA binary?"
_log.warning(fail_msg)

# extract unique ptx code architectures from raw dump
ptx_code_matches = re.findall(ptx_code_regex, cuda_raw)
if ptx_code_matches is not None:
# convert match tuples into unique list of cuda compute capabilities
# e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0']
ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)])
else:
# Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing?
ptx_section_regex = re.compile('Fatbin ptx code')
ptx_section_matches = re.findall(ptx_section_regex, cuda_raw)
if ptx_section_matches is not None:
fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, "
fail_msg += "but failed to extract CUDA architecture"
else:
# In this case, the cuobjdump command _likely_ already returned a non-zero exit
# This error message would only be displayed if cuobjdump somehow completely successfully
# but still no Fatbin ptx code section was found
fail_msg = f"Failed to find Fatbin ptx code section(s) in cuobjdump output for {path}, "
fail_msg += "are you sure this is a CUDA binary?"
_log.warning(fail_msg)

return cuda_dev_ptx_archs(ptx_archs=ptx_code_matches, device_code_archs=device_code_matches)


def get_linked_libs_raw(path):
"""
Get raw output from command that reports linked libraries for dynamically linked executables/libraries,
Expand Down
Loading
Loading