Skip to content

Commit

Permalink
Copy build log and artifacts to a permanent location after failures
Browse files Browse the repository at this point in the history
The files can be build in some selected build path (--buildpath), and
the logs of successful compilation are then concentrated to some other
location for permanent storage (--logfile-format). Logs of failed builds
remain in the build path location so that they can be inspected.

However, this setup is problematic when building software in HPC jobs. Quite
often in HPC systems the build path is set to some fast storage local to
the node, like NVME raid mounted on `/tmp` or `/dev/shm` (as suggested
in the documentation: https://docs.easybuild.io/configuration/#buildpath).
The node storage is often wiped out after the end of a job, so the log
files and the artifacts are no longer available after the termination of
the job.

This commit adds options (--log-error-path and --artifact-error-path) to
accumulate error logs and artifacts in some more permanent locations, so
that the can be easily inspected after a failed build.
  • Loading branch information
gkaf89 committed Jan 15, 2025
1 parent a8b4403 commit bd24ef7
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 20 deletions.
122 changes: 110 additions & 12 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,17 @@
from easybuild.tools.config import EASYBUILD_SOURCES_URL # noqa
from easybuild.tools.config import build_option, build_path, get_log_filename, get_repository, get_repositorypath
from easybuild.tools.config import install_path, log_path, package_path, source_paths
from easybuild.tools.config import get_log_error_path, get_artifact_error_path
from easybuild.tools.environment import restore_env, sanitize_env
from easybuild.tools.filetools import CHECKSUM_TYPE_MD5, CHECKSUM_TYPE_SHA256
from easybuild.tools.filetools import adjust_permissions, apply_patch, back_up_file, change_dir, check_lock
from easybuild.tools.filetools import compute_checksum, convert_name, copy_file, create_lock, create_patch_info
from easybuild.tools.filetools import derive_alt_pypi_url, diff_files, dir_contains_files, download_file
from easybuild.tools.filetools import encode_class_name, extract_file
from easybuild.tools.filetools import convert_name, copy_file, copy_dir, create_lock, create_patch_info
from easybuild.tools.filetools import create_unused_dir, derive_alt_pypi_url, diff_files, dir_contains_files
from easybuild.tools.filetools import download_file, encode_class_name, extract_file, compute_checksum
from easybuild.tools.filetools import find_backup_name_candidate, get_source_tarball_from_git, is_alt_pypi_url
from easybuild.tools.filetools import is_binary, is_sha256_checksum, mkdir, move_file, move_logs, read_file, remove_dir
from easybuild.tools.filetools import remove_file, remove_lock, verify_checksum, weld_paths, write_file, symlink
from easybuild.tools.filetools import is_binary, is_readable, is_sha256_checksum, mkdir, move_file, move_logs
from easybuild.tools.filetools import read_file, remove_dir, remove_file, remove_lock, verify_checksum, weld_paths
from easybuild.tools.filetools import write_file, symlink
from easybuild.tools.hooks import BUILD_STEP, CLEANUP_STEP, CONFIGURE_STEP, EXTENSIONS_STEP, FETCH_STEP, INSTALL_STEP
from easybuild.tools.hooks import MODULE_STEP, MODULE_WRITE, PACKAGE_STEP, PATCH_STEP, PERMISSIONS_STEP, POSTITER_STEP
from easybuild.tools.hooks import POSTPROC_STEP, PREPARE_STEP, READY_STEP, SANITYCHECK_STEP, SOURCE_STEP
Expand Down Expand Up @@ -1045,16 +1047,23 @@ def moduleGenerator(self):
#
# DIRECTORY UTILITY FUNCTIONS
#
def gen_builddir(self):
"""Generate the (unique) name for the builddir"""
def get_relative_builddir_base_path(self):
"""Generate builddir base name relative to build_path"""
clean_name = remove_unwanted_chars(self.name)

# if a toolchain version starts with a -, remove the - so prevent a -- in the path name
tc = self.cfg['toolchain']
tcversion = tc['version'].lstrip('-')
lastdir = "%s%s-%s%s" % (self.cfg['versionprefix'], tc['name'], tcversion, self.cfg['versionsuffix'])

builddir = os.path.join(os.path.abspath(build_path()), clean_name, self.version, lastdir)
relative_builddir = os.path.join(clean_name, self.version, lastdir)

return relative_builddir

def gen_builddir(self):
"""Generate the (unique) name for the builddir"""
relative_builddir = self.get_relative_builddir_base_path()
builddir = os.path.join(os.path.abspath(build_path()), relative_builddir)

# make sure build dir is unique if cleanupoldbuild is False or not set
if not self.cfg.get('cleanupoldbuild', False):
Expand Down Expand Up @@ -4198,6 +4207,98 @@ def print_dry_run_note(loc, silent=True):
dry_run_msg(msg, silent=silent)


def persist_failed_compilation_log_and_artifacts(build_successful, application_log, silent, app, easyconfig):
persistent_dirs = {}

def get_dir(path, dirname):
nonlocal persistent_dirs
dirpath = persistent_dirs.get(dirname)
if dirpath is None:
dirpath = create_unused_dir(path)
persistent_dirs[dirname] = dirpath
return persistent_dirs[dirname]

def get_artifact_dir(path):
return get_dir(path, 'artifact')

def get_log_dir(path):
return get_dir(path, 'log')

def initialize_persistent_dir_and_replicate_if_paths_distinct(replication_operation,
get_directory,
source_path,
destination_base_path,
destination_relative_path):
abs_source_path = os.path.abspath(source_path)
abs_destination_base_path = os.path.abspath(destination_base_path)
abs_destination_path = os.path.join(abs_destination_base_path, destination_relative_path)

if not os.path.exists(abs_source_path):
return
if os.path.realpath(abs_source_path) == os.path.realpath(abs_destination_path):
return

final_abs_destination_base_path = get_directory(destination_base_path)
final_abs_destination_path = os.path.join(final_abs_destination_base_path, destination_relative_path)

replication_operation(abs_source_path, final_abs_destination_path)

if not application_log:
return

# there may be multiple log files, or the file name may be different due to zipping
logs = glob.glob('%s*' % application_log)
print_msg(
"Results of the build can be found in the temporary log file(s) %s" % ', '.join(logs),
log=_log,
silent=silent
)

if build_successful:
return

datetime_stamp = time.strftime("%Y%m%d") + '-' + time.strftime("%H%M%S")

errorlog_path = get_log_error_path(ec=easyconfig)
if errorlog_path is not None:
errorlog_path = os.path.join(errorlog_path, datetime_stamp)

for log_file in logs:
initialize_persistent_dir_and_replicate_if_paths_distinct(
copy_file,
get_log_dir,
log_file,
errorlog_path,
os.path.basename(log_file)
)

print_msg(
"Build logs of failed build copied to permanent storage: %s" % errorlog_path,
log=_log,
silent=silent
)

errorartifact_path = get_artifact_error_path(ec=easyconfig)
if errorartifact_path is not None:
errorartifact_path = os.path.join(errorartifact_path, datetime_stamp)

builddir = app.builddir
if is_readable(builddir):
initialize_persistent_dir_and_replicate_if_paths_distinct(
copy_dir,
get_artifact_dir,
builddir,
errorartifact_path,
app.get_relative_builddir_base_path()
)

print_msg(
"Build artifacts of failed build copied to permanent storage: %s" % errorartifact_path,
log=_log,
silent=silent
)


def build_and_install_one(ecdict, init_env):
"""
Build the software
Expand Down Expand Up @@ -4456,10 +4557,7 @@ def ensure_writable_log_dir(log_dir):
else:
dry_run_msg("(no ignored errors during dry run)\n", silent=silent)

if application_log:
# there may be multiple log files, or the file name may be different due to zipping
logs = glob.glob('%s*' % application_log)
print_msg("Results of the build can be found in the log file(s) %s" % ', '.join(logs), log=_log, silent=silent)
persist_failed_compilation_log_and_artifacts(success, application_log, silent, app, ecdict['ec'])

del app

Expand Down
56 changes: 56 additions & 0 deletions easybuild/tools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,11 @@
DEFAULT_MODULE_SYNTAX = 'Lua'
DEFAULT_MODULES_TOOL = 'Lmod'
DEFAULT_PATH_SUBDIRS = {
'artifact_error_path': 'error_artifacts',
'buildpath': 'build',
'containerpath': 'containers',
'installpath': '',
'log_error_path': 'error_log',
'packagepath': 'packages',
'repositorypath': 'ebfiles_repo',
'sourcepath': 'sources',
Expand Down Expand Up @@ -475,13 +477,15 @@ class ConfigurationVariables(BaseConfigurationVariables):

# list of known/required keys
REQUIRED = [
'artifact_error_path',
'buildpath',
'config',
'containerpath',
'installpath',
'installpath_modules',
'installpath_software',
'job_backend',
'log_error_path',
'logfile_format',
'moduleclasses',
'module_naming_scheme',
Expand Down Expand Up @@ -843,6 +847,58 @@ def log_path(ec=None):
return log_file_format(return_directory=True, ec=ec, date=date, timestamp=timestamp)


def get_log_error_path(ec=None):
"""
Return the 'log_error_path', the location where logs are copied in case of failure
:param ec: dict-like value with at least the keys 'name' and 'version' defined
"""
log_error_path = ConfigurationVariables()['log_error_path']

if not log_error_path:
return None

if ec is None:
raise EasyBuildError("Easyconfig cannot be empty.")

name, version = ec.get('name'), ec.get('version')

if name is None:
raise EasyBuildError("The 'name' key is not defined.")
if version is None:
raise EasyBuildError("The 'version' key is not defined.")

path = os.path.join(log_error_path, name + '-' + version)

return path


def get_artifact_error_path(ec=None):
"""
Return the 'artifact_error_path', the location where build directories are copied in case of failure
:param ec: dict-like value with at least the keys 'name' and 'version' defined
"""
artifact_error_path = ConfigurationVariables()['artifact_error_path']

if not artifact_error_path:
return None

if ec is None:
raise EasyBuildError("Easyconfig cannot be empty.")

name, version = ec.get('name'), ec.get('version')

if name is None:
raise EasyBuildError("The 'name' key is not defined.")
if version is None:
raise EasyBuildError("The 'version' key is not defined.")

path = os.path.join(artifact_error_path, name + '-' + version)

return path


def get_build_log_path():
"""
Return (temporary) directory for build log
Expand Down
12 changes: 8 additions & 4 deletions easybuild/tools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,8 @@ def config_options(self):
descr = ("Configuration options", "Configure EasyBuild behavior.")

opts = OrderedDict({
'artifact-error-path': ("Location where artifacts are copied in case of an error",
None, 'store', mk_full_default_path('artifact_error_path')),
'avail-module-naming-schemes': ("Show all supported module naming schemes",
None, 'store_true', False,),
'avail-modules-tools': ("Show all supported module tools",
Expand Down Expand Up @@ -589,6 +591,8 @@ def config_options(self):
None, 'store', None),
'job-backend': ("Backend to use for submitting jobs", 'choice', 'store',
DEFAULT_JOB_BACKEND, sorted(avail_job_backends().keys())),
'log-error-path': ("Location where logs and artifacts are copied in case of an error",
None, 'store', mk_full_default_path('log_error_path')),
# purposely take a copy for the default logfile format
'logfile-format': ("Directory name and format of the log file",
'strtuple', 'store', DEFAULT_LOGFILE_FORMAT[:], {'metavar': 'DIR,FORMAT'}),
Expand Down Expand Up @@ -1153,8 +1157,8 @@ def _postprocess_config(self):
# - the <path> could also specify the location of a *remote* (Git( repository,
# which can be done in variety of formats (git@<url>:<org>/<repo>), https://<url>, etc.)
# (see also https://github.com/easybuilders/easybuild-framework/issues/3892);
path_opt_names = ['buildpath', 'containerpath', 'git_working_dirs_path', 'installpath',
'installpath_modules', 'installpath_software', 'prefix', 'packagepath',
path_opt_names = ['artifact_error_path', 'buildpath', 'containerpath', 'git_working_dirs_path', 'installpath',
'installpath_modules', 'installpath_software', 'log_error_path', 'prefix', 'packagepath',
'robot_paths', 'sourcepath']

for opt_name in path_opt_names:
Expand All @@ -1163,8 +1167,8 @@ def _postprocess_config(self):
if self.options.prefix is not None:
# prefix applies to all paths, and repository has to be reinitialised to take new repositorypath in account
# in the legacy-style configuration, repository is initialised in configuration file itself
path_opts = ['buildpath', 'containerpath', 'installpath', 'packagepath', 'repository', 'repositorypath',
'sourcepath']
path_opts = ['artifact_error_path', 'buildpath', 'containerpath', 'installpath', 'log_error_path',
'packagepath', 'repository', 'repositorypath', 'sourcepath']
for dest in path_opts:
if not self.options._action_taken.get(dest, False):
if dest == 'repository':
Expand Down
11 changes: 10 additions & 1 deletion test/framework/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -5283,7 +5283,16 @@ def test_prefix_option(self):

regex = re.compile(r"(?P<cfg_opt>\S*).*%s.*" % self.test_prefix, re.M)

expected = ['buildpath', 'containerpath', 'installpath', 'packagepath', 'prefix', 'repositorypath']
expected = [
'artifact-error-path',
'buildpath',
'containerpath',
'installpath',
'log-error-path',
'packagepath',
'prefix',
'repositorypath',
]
self.assertEqual(sorted(regex.findall(txt)), expected)

def test_dump_env_script(self):
Expand Down
10 changes: 10 additions & 0 deletions test/framework/sandbox/sources/toy/toy-0.0_add-bug.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
--- a/toy-0.0.orig/toy.source 2014-03-06 18:48:16.000000000 +0100
+++ b/toy-0.0/toy.source 2020-08-18 12:19:35.000000000 +0200
@@ -2,6 +2,6 @@

int main(int argc, char* argv[]){

- printf("I'm a toy, and proud of it.\n");
+ printf("I'm a toy, and proud of it.\n")
return 0;
}
Loading

0 comments on commit bd24ef7

Please sign in to comment.