add a CUDA device code sanity check #4692

Draft · wants to merge 9 commits into base branch 5.0.x
67 changes: 65 additions & 2 deletions easybuild/framework/easyblock.py
@@ -40,6 +40,7 @@
* Maxime Boissonneault (Compute Canada)
* Davide Vanzo (Vanderbilt University)
* Caspar van Leeuwen (SURF)
* Jasper Grimm (UoY)
"""
import concurrent
import copy
@@ -101,8 +102,9 @@
from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar
from easybuild.tools.package.utilities import package
from easybuild.tools.repository.repository import init_repository
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw
from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group
from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures
from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group
from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA
Review comment: 'easybuild.tools.toolchain.toolchain.TOOLCHAIN_CAPABILITY_CUDA' imported but unused

from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str
from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg
from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION
@@ -3193,6 +3195,59 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs):
        self.cfg['builddependencies'] = builddeps
        self.cfg.iterating = False

    def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True):
        """Sanity check that binaries/libraries contain device code for the correct architecture targets."""

        self.log.info("Checking binaries/libraries for CUDA device code...")

        fails = []
        cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None)

        if cuda_dirs is None:
            cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs()

        if not cuda_dirs:
            cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS
            self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s",
                          cuda_dirs)
        else:
            self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s",
                          cuda_dirs)

Review comment (@casparvl, Contributor, Feb 19, 2025): This info message seems wrong: this is not the default subdirectories, this is a custom defined bin_lib_subdirs.

        for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]:
            if os.path.exists(dirpath):
                self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}")

                for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]:
                    self.log.debug("Sanity checking for CUDA device code in %s", path)

                    derived_ccs = get_cuda_device_code_architectures(path)

                    if derived_ccs is None:
                        msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check"
                        self.log.debug(msg)
                    else:
                        # check whether device code architectures match cuda_compute_capabilities
                        additional_ccs = list(set(derived_ccs) - set(cfg_ccs))
                        missing_ccs = list(set(cfg_ccs) - set(derived_ccs))

                        if additional_ccs or missing_ccs:
                            fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. "
                            if additional_ccs:
                                fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs))
                            if missing_ccs:
                                fail_msg += "Missing compute capabilities: %s." % ', '.join(sorted(missing_ccs))
                            self.log.warning(fail_msg)
                            fails.append(fail_msg)
                        else:
                            msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match "
                                   "those in cuda_compute_capabilities")
                            self.log.debug(msg)
            else:
                self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}")

        return fails
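For context, a minimal sketch of the easyconfig parameters this check consumes; the values shown are illustrative assumptions, not taken from this PR:

    # illustrative easyconfig fragment (values are hypothetical)
    cuda_compute_capabilities = ['8.0', '9.0']  # compared against the 'arch = sm_80' / 'arch = sm_90' entries reported by cuobjdump
    bin_lib_subdirs = ['bin', 'lib', 'lib64']   # installation subdirectories scanned for CUDA device code

Note that, per the check above, the cuda_compute_capabilities build option (--cuda-compute-capabilities) takes precedence over the easyconfig value when both are set.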

    def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True):
        """Sanity check binaries/libraries w.r.t. RPATH linking."""

@@ -3782,6 +3837,14 @@ def xs2str(xs):
        else:
            self.log.debug("Skipping RPATH sanity check")

        if get_software_root('CUDA'):
            cuda_fails = self.sanity_check_cuda()
            if cuda_fails:
                self.log.warning("CUDA device code sanity check failed!")
                self.sanity_check_fail_msgs.extend(cuda_fails)
        else:
            self.log.debug("Skipping CUDA device code sanity check")

Review thread attached to the if get_software_root('CUDA') check:

Review comment (Member): @boegel We have an EESSI-specific complication here. We drop CUDA to a build time dependency so that we don't depend on the CUDA module at runtime. This means that we won't execute this code path, so we need to trigger the module load here.

Reply (Contributor): I think you're right, but just to double check: are the build dependencies unloaded at sanity check time? Could we fix this through an EasyBuild hook in EESSI that loads the CUDA that was a build dependency also in the sanity_check_step (and unloads it afterwards)? That should also work for EESSI-extend, and no changes on the framework side would be needed.

Reply (Contributor): Wait... actually, I don't think you're right. I just did this with EESSI-extend, and it did run the CUDA sanity check. I'm not sure why; I would have expected the problem you mentioned. So why didn't it appear?

Reply (Member): I wonder what happens in the --module-only case, when the sanity check step is being run without building first? Perhaps this really is expected behaviour?

        # pass or fail
        if self.sanity_check_fail_msgs:
            raise EasyBuildError(
59 changes: 59 additions & 0 deletions easybuild/tools/systemtools.py
@@ -29,6 +29,7 @@

* Jens Timmerman (Ghent University)
* Ward Poelmans (Ghent University)
* Jasper Grimm (UoY)
"""
import ctypes
import errno
@@ -963,6 +964,64 @@ def get_glibc_version():
    return glibc_ver


def get_cuda_object_dump_raw(path):
    """
    Get raw output from the command which extracts information from CUDA binary files in a human-readable format,
    or return None for files containing no CUDA device code.
    See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump
    """

    res = run_shell_cmd("file %s" % path, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code != EasyBuildExit.SUCCESS:
        fail_msg = "Failed to run 'file %s': %s" % (path, res.output)
        _log.warning(fail_msg)

    # check that the file is an executable or library/object
    if any(x in res.output for x in ['executable', 'object', 'library']):
        cuda_cmd = f"cuobjdump {path}"
    else:
        return None

    res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code == EasyBuildExit.SUCCESS:
        return res.output
    else:
        msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'"
        _log.debug(msg % (path, cuda_cmd, res.output))
        return None


def get_cuda_device_code_architectures(path):
    """
    Get list of supported CUDA architectures by inspecting the device code of an executable/library. The format is
    the same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 and sm_90).
    Returns None if no CUDA device code is present in the file.
    """

    # cuobjdump reports architectures in the sm_XY format
    device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})')
Review comment (@ocaisa, Member, Feb 19, 2025): It would be good to also capture whether the code can be JIT compiled (so it can at least run on a future arch). In a script I had, I did this with:

    # Regex to find multiple PTX and ELF sections
    ptx_matches = re.findall(r'Fatbin ptx code:\n=+\narch = sm_(\d+)', result.stdout)
    elf_matches = re.findall(r'Fatbin elf code:\n=+\narch = sm_(\d+)', result.stdout)

    # Debug: show whether matches were found for PTX and ELF sections
    if debug:
        print(f"PTX Matches: {ptx_matches}")
        print(f"ELF Matches: {elf_matches}")

    # Return all PTX and ELF matches, remove duplicates using set and convert to lists
    return {
        "ptx": sorted(set(ptx_matches)),  # list of unique PTX capabilities
        "elf": sorted(set(elf_matches)),  # list of unique ELF capabilities
    }

Reply (Contributor): In fact, re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})') is not specific enough, because it will treat the "Fatbin ptx code" and "Fatbin elf code" sections the same: it will just extract any "arch =" string it can find.

To have a concrete example of something that has both, one can check e.g. libcusparse:

    [casparl@tcn1 ~]$ cuobjdump /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/software/CUDA/12.1.1/lib64/libcusparse.so | grep -A 5 ptx | tail -n 12
    ================
    arch = sm_80
    code version = [8,1]
    host = linux
    compile_size = 64bit
    --
    Fatbin ptx code:
    ================
    arch = sm_90
    code version = [8,1]
    host = linux
    compile_size = 64bit
    [casparl@tcn1 ~]$ cuobjdump /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/software/CUDA/12.1.1/lib64/libcusparse.so | grep -A 5 elf | tail -n 12
    ================
    arch = sm_80
    code version = [1,7]
    host = linux
    compile_size = 64bit
    --
    Fatbin elf code:
    ================
    arch = sm_90
    code version = [1,7]
    host = linux
    compile_size = 64bit

    # resolve symlinks
    if os.path.islink(path) and os.path.exists(path):
        path = os.path.realpath(path)

    cuda_raw = get_cuda_object_dump_raw(path)
    if cuda_raw is None:
        return None

    # extract unique architectures from raw dump
    matches = re.findall(device_code_regex, cuda_raw)
    if matches:
        # convert match tuples into unique list of cuda compute capabilities
        # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0']
        matches = sorted(['.'.join(m) for m in set(matches)])
    else:
        # note: re.findall() returns an empty list (never None) when nothing matches,
        # so check for an empty result rather than for None
        fail_msg = f"Failed to determine supported CUDA architectures from {path}"
        _log.warning(fail_msg)

    return matches
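For illustration, a small self-contained demo of what the regex above yields on a trimmed cuobjdump-style fatbin dump; the sample text and values are hypothetical, and the output shows the PTX/ELF conflation raised in the review comments above:

    import re

    device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})')
    sample = (
        "Fatbin elf code:\n================\narch = sm_80\n"
        "Fatbin ptx code:\n================\narch = sm_90\n"
    )
    matches = re.findall(device_code_regex, sample)   # [('8', '0'), ('9', '0')]
    print(sorted('.'.join(m) for m in set(matches)))  # ['8.0', '9.0']

So a file whose only sm_90 support is PTX (JIT-compilable) code is indistinguishable, with this regex, from one that ships sm_90 ELF device code, which is why the reviewers suggest matching the "Fatbin elf code" and "Fatbin ptx code" sections separately.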


def get_linked_libs_raw(path):
    """
    Get raw output from command that reports linked libraries for dynamically linked executables/libraries,