diff --git a/bin/submit_build.py b/bin/submit_build.py index 3feaac9..06b8a56 100755 --- a/bin/submit_build.py +++ b/bin/submit_build.py @@ -31,7 +31,7 @@ from build_tools.bwraptools import bwrap_prefix, rsync_copy from build_tools.clusters import ARCHS, PARTITIONS from build_tools.filetools import APPS_BRUSSEL, get_module -from build_tools.lmodtools import LMOD_CACHE_CLUSTERS, submit_lmod_cache_job +from build_tools.lmodtools import submit_lmod_cache_job from build_tools.softinstall import mk_job_name, set_toolchain_generation, submit_build_job # repositories with easyconfigs @@ -60,6 +60,7 @@ def main(): # Default job options job = { + 'lmod_cache': '1', 'langcode': 'en_US.utf8', 'cluster': 'hydra', 'target_arch': None, @@ -72,6 +73,7 @@ def main(): # Easybuild default paths # start using environment from local machine, job scripts get custom paths ebconf = { + 'accept-eula-for': 'Intel-oneAPI,CUDA', 'robot-paths': ":".join([os.path.join(VSCSOFTSTACK_ROOT, repo) for repo in EASYCONFIG_REPOS]), 'include-easyblocks': os.path.join(VSCSOFTSTACK_ROOT, EASYBLOCK_REPO), 'sourcepath': '/apps/brussel/sources:/apps/gent/source', @@ -229,8 +231,8 @@ def main(): logger.error("Failed to get module name/version for %s", easyconfig) sys.exit(1) - lmod_cache = not opts.options.skip_lmod_cache - if not lmod_cache: + if opts.options.skip_lmod_cache: + job['lmod_cache'] = '' logger.info("Not running Lmod cache after installation") # ---> main build + lmod cache loop <--- # @@ -326,7 +328,7 @@ def main(): ec, buildjob_out = submit_build_job( job_options, - opts.options.keep, + keep_job=opts.options.keep, sub_options=opts.options.extra_sub_flags, cluster=job_options['cluster'], local_exec=local_exec, @@ -337,13 +339,6 @@ def main(): logger.error("Failed to submit or run build job for '%s': %s", easyconfig, buildjob_out) sys.exit(1) - # submit lmod cache job(s) - if lmod_cache and job_options['cluster'] in LMOD_CACHE_CLUSTERS: - jobids_depend = None - if buildjob_out and not dry_run 
and not local_exec: - jobids_depend = [buildjob_out.rstrip().split(';')[0]] - submit_lmod_cache_job(host_partition, jobids_depend, dry_run=dry_run) - if __name__ == '__main__': main() diff --git a/src/build_tools/hooks_hydra.py b/src/build_tools/hooks_hydra.py index 36d7278..ba7d5cf 100644 --- a/src/build_tools/hooks_hydra.py +++ b/src/build_tools/hooks_hydra.py @@ -19,12 +19,15 @@ import os +from vsc.utils import fancylogger + from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS from easybuild.tools import LooseVersion from easybuild.tools.hooks import SANITYCHECK_STEP from build_tools.clusters import ARCHS from build_tools.ib_modules import IB_MODULE_SOFTWARE, IB_MODULE_SUFFIX, IB_OPT_MARK +from build_tools.lmodtools import submit_lmod_cache_job # permission groups for licensed software SOFTWARE_GROUPS = { @@ -368,3 +371,21 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument ############################ self.cfg.enable_templating = en_templ + + +def end_hook(): + """Hook to run shortly before EasyBuild completes""" + + logger = fancylogger.getLogger() + fancylogger.logToScreen(True, stdout=True) + fancylogger.setLogLevelInfo() + + # submit Lmod cache job + if os.getenv('BUILD_TOOLS_RUN_LMOD_CACHE', '1') == '1': + partition = os.getenv('SLURM_JOB_PARTITION') + if partition: + logger.info('[end hook] Submitting Lmod cache job for partition %s', partition) + # set cluster=False to avoid loading cluster module in job + submit_lmod_cache_job(partition, cluster=False) + else: + logger.info('[end hook] Skipping Lmod cache job: not in a Slurm job') diff --git a/src/build_tools/jobtemplate.py b/src/build_tools/jobtemplate.py index f3635ca..c8be2ee 100644 --- a/src/build_tools/jobtemplate.py +++ b/src/build_tools/jobtemplate.py @@ -36,6 +36,8 @@ fi # set environment +export BUILD_TOOLS_LOAD_DUMMY_MODULES=1 +export BUILD_TOOLS_RUN_LMOD_CACHE=${lmod_cache} export LANG=${langcode} export 
PATH=$$PREFIX_EB/easybuild-framework:$$PATH export PYTHONPATH=$$PREFIX_EB/easybuild-easyconfigs:$$PREFIX_EB/easybuild-easyblocks:$$PREFIX_EB/easybuild-framework:$$PREFIX_EB/vsc-base/lib diff --git a/src/build_tools/lmodtools.py b/src/build_tools/lmodtools.py index 3165e49..c09f057 100644 --- a/src/build_tools/lmodtools.py +++ b/src/build_tools/lmodtools.py @@ -38,16 +38,20 @@ """ -def submit_lmod_cache_job(partition, jobids_depend=None, *args, **kwargs): +def submit_lmod_cache_job(partition, jobids_depend=None, cluster=None, **kwargs): """ Run Lmod cache in a Slurm job - :param jobids_depend: list of strings: jobids on with to set job dependency - :param mod_basedir: the module basedir :param partition: the partition to submit the job to + :param jobids_depend: list of strings: jobids on which to set job dependency + :param cluster: the Slurm cluster to submit the job to. + + if cluster is None, load the cluster module corresponding to the current partition + if cluster is False, don’t purge/load a cluster module (use the currently active cluster) """ archdir = PARTITIONS[partition]['arch'] - cluster = PARTITIONS[partition]['cluster'] + if cluster is None: + cluster = PARTITIONS[partition]['cluster'] cache_cmd = [ '/usr/libexec/lmod/run_lmod_cache.py', @@ -67,7 +71,7 @@ def submit_lmod_cache_job(partition, jobids_depend=None, *args, **kwargs): logger.info( "Refreshing Lmod cache on partition %s for architecture %s", partition or 'default', archdir or 'default') - ec, out = submit_job_script(job_file, cluster=cluster, *args, **kwargs) + ec, out = submit_job_script(job_file, cluster=cluster, **kwargs) if ec != 0: logger.error("Failed to submit Lmod cache job: %s", out) diff --git a/src/build_tools/package.py b/src/build_tools/package.py index 404d9f3..b5bcb9e 100644 --- a/src/build_tools/package.py +++ b/src/build_tools/package.py @@ -16,7 +16,7 @@ @author: Alex Domingo (Vrije Universiteit Brussel) """ -VERSION = '3.0.0' +VERSION = '3.1.0' AUTHOR = { 'wp': 'Ward 
Poelmans', diff --git a/src/build_tools/softinstall.py b/src/build_tools/softinstall.py index 7b102e3..5a7d684 100644 --- a/src/build_tools/softinstall.py +++ b/src/build_tools/softinstall.py @@ -106,28 +106,30 @@ def submit_job_script(job_file, sub_options='', cluster='hydra', local_exec=Fals :param dry_run: print submit command """ + submit_cmd = [] # switch to corresponding cluster and submit - submit_cmd = ["module --force purge"] - submit_cmd.append("module load cluster/%s" % cluster) - submit_cmd.append("sbatch --parsable %s %s" % (sub_options, job_file)) + if cluster: + submit_cmd.append("module --force purge") + submit_cmd.append(f"module load cluster/{cluster}") + submit_cmd.append(f"sbatch --parsable {sub_options} {job_file}") submit_cmd = " && ".join(submit_cmd) if dry_run: - log_msg = "(DRY RUN) Job submission command: %s" % submit_cmd + log_msg = f"(DRY RUN) Job submission command: {submit_cmd}" logger.info(log_msg) ec, out = 0, log_msg elif local_exec: logger.debug("Local execution of job script: %s", job_file) - ec, out = RunLoopStdout.run("bash %s" % job_file) + ec, out = RunLoopStdout.run(f"bash {job_file}") else: logger.debug("Job submission command: %s", submit_cmd) - ec, out = RunNoShell.run('bash -c "%s"' % submit_cmd) + ec, out = RunNoShell.run(f'bash -c "{submit_cmd}"') return ec, out -def submit_build_job(job_options, keep_job=False, *args, **kargs): +def submit_build_job(job_options, keep_job=False, **kwargs): """ Generate job script from BUILD_JOB template and submit it with Slurm to target cluster :param job_options: dict with options to pass to job template @@ -138,7 +140,7 @@ def submit_build_job(job_options, keep_job=False, *args, **kargs): job_file = write_tempfile(job_script) logger.debug("Job script written to %s", job_file) - ec, out = submit_job_script(job_file, *args, **kargs) + ec, out = submit_job_script(job_file, **kwargs) if not keep_job: try: diff --git a/tests/input/build_job_01.sh b/tests/input/build_job_01.sh index 
b986224..ccac2f5 100644 --- a/tests/input/build_job_01.sh +++ b/tests/input/build_job_01.sh @@ -14,6 +14,8 @@ if [ -z $PREFIX_EB ]; then fi # set environment +export BUILD_TOOLS_LOAD_DUMMY_MODULES=1 +export BUILD_TOOLS_RUN_LMOD_CACHE=1 export LANG=C export PATH=$PREFIX_EB/easybuild-framework:$PATH export PYTHONPATH=$PREFIX_EB/easybuild-easyconfigs:$PREFIX_EB/easybuild-easyblocks:$PREFIX_EB/easybuild-framework:$PREFIX_EB/vsc-base/lib diff --git a/tests/input/build_job_02.sh b/tests/input/build_job_02.sh index f225fde..6424a67 100644 --- a/tests/input/build_job_02.sh +++ b/tests/input/build_job_02.sh @@ -14,6 +14,8 @@ if [ -z $PREFIX_EB ]; then fi # set environment +export BUILD_TOOLS_LOAD_DUMMY_MODULES=1 +export BUILD_TOOLS_RUN_LMOD_CACHE= export LANG=C export PATH=$PREFIX_EB/easybuild-framework:$PATH export PYTHONPATH=$PREFIX_EB/easybuild-easyconfigs:$PREFIX_EB/easybuild-easyblocks:$PREFIX_EB/easybuild-framework:$PREFIX_EB/vsc-base/lib diff --git a/tests/test_lmodtools.py b/tests/test_lmodtools.py index b661f76..a46a82c 100644 --- a/tests/test_lmodtools.py +++ b/tests/test_lmodtools.py @@ -24,7 +24,7 @@ def test_submit_lmod_cache_job(inputdir): job_script = 'lmod_cache_job_01.sh' - _, out = submit_lmod_cache_job(jobids_depend=['123', '456'], partition='skylake_mpi', dry_run=True) + _, out = submit_lmod_cache_job('skylake_mpi', jobids_depend=['123', '456'], dry_run=True) new_job = out.split(' ')[-1] with open(new_job) as nj: diff --git a/tests/test_softinstall.py b/tests/test_softinstall.py index 7fe8d30..58ec5af 100644 --- a/tests/test_softinstall.py +++ b/tests/test_softinstall.py @@ -96,6 +96,7 @@ def test_mk_job_name(test_name): 'eb_installpath': '/apps/brussel/${VSC_OS_LOCAL}/skylake', 'tmp': '/tmp/eb-test-build', 'postinstall': '', + 'lmod_cache': '1', }), ('build_job_02.sh', { 'job_name': 'test-job-gpu', @@ -113,6 +114,7 @@ def test_mk_job_name(test_name): 'eb_installpath': '/apps/brussel/${VSC_OS_LOCAL}/zen2-ib', 'tmp': '/tmp/eb-test-build', 'postinstall': 
'rsync src dest', + 'lmod_cache': '', }), ] ) @@ -122,7 +124,7 @@ def test_submit_build_job(inputdir, test_job): cluster = 'hydra' ec, out = softinstall.submit_build_job( - job_options, True, sub_options, cluster=cluster, local_exec=False, dry_run=True + job_options, keep_job=True, sub_options=sub_options, cluster=cluster, local_exec=False, dry_run=True ) new_job = out.split(' ')[-1]