Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

submit lmod cache job from EB end_hook #23

Merged
merged 3 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 6 additions & 11 deletions bin/submit_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from build_tools.bwraptools import bwrap_prefix, rsync_copy
from build_tools.clusters import ARCHS, PARTITIONS
from build_tools.filetools import APPS_BRUSSEL, get_module
from build_tools.lmodtools import LMOD_CACHE_CLUSTERS, submit_lmod_cache_job
from build_tools.lmodtools import submit_lmod_cache_job
from build_tools.softinstall import mk_job_name, set_toolchain_generation, submit_build_job

# repositories with easyconfigs
Expand Down Expand Up @@ -60,6 +60,7 @@ def main():

# Default job options
job = {
'lmod_cache': '1',
'langcode': 'en_US.utf8',
'cluster': 'hydra',
'target_arch': None,
Expand All @@ -72,6 +73,7 @@ def main():
# Easybuild default paths
# start using environment from local machine, job scripts get custom paths
ebconf = {
'accept-eula-for': 'Intel-oneAPI,CUDA',
'robot-paths': ":".join([os.path.join(VSCSOFTSTACK_ROOT, repo) for repo in EASYCONFIG_REPOS]),
'include-easyblocks': os.path.join(VSCSOFTSTACK_ROOT, EASYBLOCK_REPO),
'sourcepath': '/apps/brussel/sources:/apps/gent/source',
Expand Down Expand Up @@ -229,8 +231,8 @@ def main():
logger.error("Failed to get module name/version for %s", easyconfig)
sys.exit(1)

lmod_cache = not opts.options.skip_lmod_cache
if not lmod_cache:
if opts.options.skip_lmod_cache:
job['lmod_cache'] = ''
logger.info("Not running Lmod cache after installation")

# ---> main build + lmod cache loop <--- #
Expand Down Expand Up @@ -326,7 +328,7 @@ def main():

ec, buildjob_out = submit_build_job(
job_options,
opts.options.keep,
keep_job=opts.options.keep,
sub_options=opts.options.extra_sub_flags,
cluster=job_options['cluster'],
local_exec=local_exec,
Expand All @@ -337,13 +339,6 @@ def main():
logger.error("Failed to submit or run build job for '%s': %s", easyconfig, buildjob_out)
sys.exit(1)

# submit lmod cache job(s)
if lmod_cache and job_options['cluster'] in LMOD_CACHE_CLUSTERS:
jobids_depend = None
if buildjob_out and not dry_run and not local_exec:
jobids_depend = [buildjob_out.rstrip().split(';')[0]]
submit_lmod_cache_job(host_partition, jobids_depend, dry_run=dry_run)


if __name__ == '__main__':
main()
21 changes: 21 additions & 0 deletions src/build_tools/hooks_hydra.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@

import os

from vsc.utils import fancylogger

from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS
from easybuild.tools import LooseVersion
from easybuild.tools.hooks import SANITYCHECK_STEP

from build_tools.clusters import ARCHS
from build_tools.ib_modules import IB_MODULE_SOFTWARE, IB_MODULE_SUFFIX, IB_OPT_MARK
from build_tools.lmodtools import submit_lmod_cache_job

# permission groups for licensed software
SOFTWARE_GROUPS = {
Expand Down Expand Up @@ -368,3 +371,21 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
############################

self.cfg.enable_templating = en_templ


def end_hook():
    """
    Hook to run shortly before EasyBuild completes

    Submits an Lmod cache job to the partition named in SLURM_JOB_PARTITION.
    Nothing is submitted if BUILD_TOOLS_RUN_LMOD_CACHE is set to anything other than '1'
    (an unset variable counts as '1') or if SLURM_JOB_PARTITION is unset or empty.
    """

    logger = fancylogger.getLogger()
    fancylogger.logToScreen(True, stdout=True)
    fancylogger.setLogLevelInfo()

    # Lmod cache refresh is opt-out: only the exact value '1' (also the default) enables it
    run_lmod_cache = os.getenv('BUILD_TOOLS_RUN_LMOD_CACHE', '1') == '1'
    if not run_lmod_cache:
        return

    slurm_partition = os.getenv('SLURM_JOB_PARTITION')
    if not slurm_partition:
        logger.info('[end hook] Skipping Lmod cache job: not in a Slurm job')
        return

    logger.info('[end hook] Submitting Lmod cache job for partition %s', slurm_partition)
    # set cluster=False to avoid loading cluster module in job
    submit_lmod_cache_job(slurm_partition, cluster=False)
2 changes: 2 additions & 0 deletions src/build_tools/jobtemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
fi

# set environment
export BUILD_TOOLS_LOAD_DUMMY_MODULES=1
export BUILD_TOOLS_RUN_LMOD_CACHE=${lmod_cache}
export LANG=${langcode}
export PATH=$$PREFIX_EB/easybuild-framework:$$PATH
export PYTHONPATH=$$PREFIX_EB/easybuild-easyconfigs:$$PREFIX_EB/easybuild-easyblocks:$$PREFIX_EB/easybuild-framework:$$PREFIX_EB/vsc-base/lib
Expand Down
14 changes: 9 additions & 5 deletions src/build_tools/lmodtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,20 @@
"""


def submit_lmod_cache_job(partition, jobids_depend=None, *args, **kwargs):
def submit_lmod_cache_job(partition, jobids_depend=None, cluster=None, **kwargs):
"""
Run Lmod cache in a Slurm job
:param jobids_depend: list of strings: jobids on with to set job dependency
:param mod_basedir: the module basedir
:param partition: the partition to submit the job to
:param jobids_depend: list of strings: jobids on with to set job dependency
:param cluster: the Slurm cluster to submit the job to.

if cluster is None, load the cluster module corresponding to the current partition
if cluster is False, don’t purge/load a cluster module (use the currently active cluster)
"""

archdir = PARTITIONS[partition]['arch']
cluster = PARTITIONS[partition]['cluster']
if cluster is None:
cluster = PARTITIONS[partition]['cluster']

cache_cmd = [
'/usr/libexec/lmod/run_lmod_cache.py',
Expand All @@ -67,7 +71,7 @@ def submit_lmod_cache_job(partition, jobids_depend=None, *args, **kwargs):

logger.info(
"Refreshing Lmod cache on partition %s for architecture %s", partition or 'default', archdir or 'default')
ec, out = submit_job_script(job_file, cluster=cluster, *args, **kwargs)
ec, out = submit_job_script(job_file, cluster=cluster, **kwargs)

if ec != 0:
logger.error("Failed to submit Lmod cache job: %s", out)
Expand Down
2 changes: 1 addition & 1 deletion src/build_tools/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@author: Alex Domingo (Vrije Universiteit Brussel)
"""

VERSION = '3.0.0'
VERSION = '3.1.0'

AUTHOR = {
'wp': 'Ward Poelmans',
Expand Down
18 changes: 10 additions & 8 deletions src/build_tools/softinstall.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,28 +106,30 @@ def submit_job_script(job_file, sub_options='', cluster='hydra', local_exec=Fals
:param dry_run: print submit command
"""

submit_cmd = []
# switch to corresponding cluster and submit
submit_cmd = ["module --force purge"]
submit_cmd.append("module load cluster/%s" % cluster)
submit_cmd.append("sbatch --parsable %s %s" % (sub_options, job_file))
if cluster:
submit_cmd.append("module --force purge")
submit_cmd.append(f"module load cluster/{cluster}")

submit_cmd.append(f"sbatch --parsable {sub_options} {job_file}")
submit_cmd = " && ".join(submit_cmd)

if dry_run:
log_msg = "(DRY RUN) Job submission command: %s" % submit_cmd
log_msg = f"(DRY RUN) Job submission command: {submit_cmd}"
logger.info(log_msg)
ec, out = 0, log_msg
elif local_exec:
logger.debug("Local execution of job script: %s", job_file)
ec, out = RunLoopStdout.run("bash %s" % job_file)
ec, out = RunLoopStdout.run(f"bash {job_file}")
else:
logger.debug("Job submission command: %s", submit_cmd)
ec, out = RunNoShell.run('bash -c "%s"' % submit_cmd)
ec, out = RunNoShell.run(f'bash -c "{submit_cmd}"')

return ec, out


def submit_build_job(job_options, keep_job=False, *args, **kargs):
def submit_build_job(job_options, keep_job=False, **kwargs):
"""
Generate job script from BUILD_JOB template and submit it with Slurm to target cluster
:param job_options: dict with options to pass to job template
Expand All @@ -138,7 +140,7 @@ def submit_build_job(job_options, keep_job=False, *args, **kargs):
job_file = write_tempfile(job_script)
logger.debug("Job script written to %s", job_file)

ec, out = submit_job_script(job_file, *args, **kargs)
ec, out = submit_job_script(job_file, **kwargs)

if not keep_job:
try:
Expand Down
2 changes: 2 additions & 0 deletions tests/input/build_job_01.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ if [ -z $PREFIX_EB ]; then
fi

# set environment
export BUILD_TOOLS_LOAD_DUMMY_MODULES=1
export BUILD_TOOLS_RUN_LMOD_CACHE=1
export LANG=C
export PATH=$PREFIX_EB/easybuild-framework:$PATH
export PYTHONPATH=$PREFIX_EB/easybuild-easyconfigs:$PREFIX_EB/easybuild-easyblocks:$PREFIX_EB/easybuild-framework:$PREFIX_EB/vsc-base/lib
Expand Down
2 changes: 2 additions & 0 deletions tests/input/build_job_02.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ if [ -z $PREFIX_EB ]; then
fi

# set environment
export BUILD_TOOLS_LOAD_DUMMY_MODULES=1
export BUILD_TOOLS_RUN_LMOD_CACHE=
export LANG=C
export PATH=$PREFIX_EB/easybuild-framework:$PATH
export PYTHONPATH=$PREFIX_EB/easybuild-easyconfigs:$PREFIX_EB/easybuild-easyblocks:$PREFIX_EB/easybuild-framework:$PREFIX_EB/vsc-base/lib
Expand Down
2 changes: 1 addition & 1 deletion tests/test_lmodtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
def test_submit_lmod_cache_job(inputdir):
job_script = 'lmod_cache_job_01.sh'

_, out = submit_lmod_cache_job(jobids_depend=['123', '456'], partition='skylake_mpi', dry_run=True)
_, out = submit_lmod_cache_job('skylake_mpi', jobids_depend=['123', '456'], dry_run=True)

new_job = out.split(' ')[-1]
with open(new_job) as nj:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_softinstall.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def test_mk_job_name(test_name):
'eb_installpath': '/apps/brussel/${VSC_OS_LOCAL}/skylake',
'tmp': '/tmp/eb-test-build',
'postinstall': '',
'lmod_cache': '1',
}),
('build_job_02.sh', {
'job_name': 'test-job-gpu',
Expand All @@ -113,6 +114,7 @@ def test_mk_job_name(test_name):
'eb_installpath': '/apps/brussel/${VSC_OS_LOCAL}/zen2-ib',
'tmp': '/tmp/eb-test-build',
'postinstall': 'rsync src dest',
'lmod_cache': '',
}),
]
)
Expand All @@ -122,7 +124,7 @@ def test_submit_build_job(inputdir, test_job):
cluster = 'hydra'

ec, out = softinstall.submit_build_job(
job_options, True, sub_options, cluster=cluster, local_exec=False, dry_run=True
job_options, keep_job=True, sub_options=sub_options, cluster=cluster, local_exec=False, dry_run=True
)

new_job = out.split(' ')[-1]
Expand Down