Skip to content

Commit adcf3af

Browse files
authored
Merge pull request #4 from hppritcha/upstream_pr_1993
Provide a warning of potentially unknown Slurm params
2 parents 85de1b2 + 8d833ac commit adcf3af

File tree

3 files changed

+101
-8
lines changed

3 files changed

+101
-8
lines changed

src/mca/plm/slurm/help-plm-slurm.txt

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# All rights reserved.
1313
# Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
1414
# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
15-
# Copyright (c) 2022 Nanook Consulting. All rights reserved.
15+
# Copyright (c) 2022-2024 Nanook Consulting All rights reserved.
1616
# $COPYRIGHT$
1717
#
1818
# Additional copyrights may follow
@@ -51,6 +51,7 @@ are running.
5151

5252
Please consult with your system administrator about obtaining
5353
such support.
54+
#
5455
[no-srun]
5556
The SLURM process starter for OpenMPI was unable to locate a
5657
usable "srun" command in its path. Please check your path
@@ -80,3 +81,58 @@ process starter via the following MCA parameter:
8081
This will result in use of the ssh process starter. This will have
8182
no impact on your application, but will result in any accounting
8283
being done solely at the allocation level instead of per-job.
84+
#
85+
[custom-args-in-env]
86+
The Slurm process starter for PRTE detected the presence of an MCA
87+
parameter in the environment that assigns custom command line arguments
88+
to the `srun` command used to start PRTE's daemons on remote nodes:
89+
90+
Parameter value: %s
91+
92+
This warning is provided to alert you (the user) to a perhaps
93+
unintentional setting of command line arguments, or the unseen
94+
overriding of your intended arguments by Slurm.
95+
96+
Background: Starting with Slurm version 23.11, a command line argument
97+
(`--external-launcher`) was added to `srun` to indicate that the
98+
command was being initiated from within a third-party launcher (e.g.,
99+
`prte` or `prterun`). This allows Slurm to essentially freely modify
100+
the `srun` command line while retaining a backward compatibility
101+
capability when explicitly told to use it. Notably, the Slurm
102+
environment does this by automatically setting the
103+
PRTE_MCA_plm_slurm_args environment variable to pass in its own
104+
command line arguments. This has the side effect of overriding most
105+
user- or system-level settings. Note that arguments passed on the
106+
PRTE command line will override any Slurm setting of the
107+
PRTE_MCA_plm_slurm_args environment variable, but with potentially
108+
undesirable side effects if newer versions of `srun` misinterpret or
109+
fail to understand the user-specified arguments.
110+
111+
If the setting of the MCA parameter was intentional, or if the
112+
parameter looks acceptable to you, then please set the following
113+
MCA parameter to disable this warning:
114+
115+
Environment: PRTE_MCA_plm_slurm_disable_warning=true
116+
Cmd line: --prtemca plm_slurm_disable_warning 1
117+
Default MCA param file: plm_slurm_disable_warning = true
118+
119+
If you did not intentionally set the identified command line
120+
arguments and do not wish them to be used, then set the
121+
following MCA param to have them ignored:
122+
123+
Environment: PRTE_MCA_plm_slurm_ignore_args=true
124+
Cmd line: --prtemca plm_slurm_ignore_args 1
125+
Default MCA param file: plm_slurm_ignore_args = true
126+
127+
Note that if you wish to provide custom `srun` command line
128+
arguments and are finding them being overridden by Slurm, you
129+
can ensure that your values are used by setting them with the
130+
following param:
131+
132+
Environment: PRTE_MCA_plm_slurm_force_args=foo
133+
Cmd line: --prtemca plm_slurm_force_args foo
134+
Default MCA param file: plm_slurm_force_args = foo
135+
136+
Note that you may need to add the `--external-launcher` option
137+
to your provided args to ensure that `srun` properly functions
138+
if you are using a relatively recent release of Slurm.

src/mca/plm/slurm/plm_slurm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2019 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
16-
* Copyright (c) 2022 Nanook Consulting. All rights reserved.
16+
* Copyright (c) 2022-2024 Nanook Consulting All rights reserved.
1717
* $COPYRIGHT$
1818
*
1919
* Additional copyrights may follow
@@ -33,6 +33,7 @@ BEGIN_C_DECLS
3333

3434
/* Slurm PLM component: embeds prte_plm_base_component_t as its first member, followed by Slurm-specific settings. */
struct prte_mca_plm_slurm_component_t {
3535
prte_plm_base_component_t super;
36+
int custom_args_index; /* value returned by registering the "args" MCA variable; passed to pmix_mca_base_var_get() at query time */
3637
char *custom_args; /* srun arguments chosen at query time: the "force_args" value if set, otherwise the "args" value, else NULL */
3738
bool slurm_warning_msg; /* bound to the "disable_warning" MCA variable: false (default) means the custom-args warning may be shown, true suppresses it */
3839
};

src/mca/plm/slurm/plm_slurm_component.c

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* Copyright (c) 2019 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
19-
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
19+
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
2020
* $COPYRIGHT$
2121
*
2222
* Additional copyrights may follow
@@ -84,17 +84,26 @@ prte_mca_plm_slurm_component_t prte_mca_plm_slurm_component = {
8484
here; will be initialized in plm_slurm_open() */
8585
};
8686

87+
static char *custom_args = NULL;
88+
static char *force_args = NULL;
89+
8790
static int plm_slurm_register(void)
8891
{
8992
pmix_mca_base_component_t *comp = &prte_mca_plm_slurm_component.super;
9093

91-
prte_mca_plm_slurm_component.custom_args = NULL;
92-
(void) pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun",
94+
95+
prte_mca_plm_slurm_component.custom_args_index =
96+
pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun",
97+
PMIX_MCA_BASE_VAR_TYPE_STRING,
98+
&custom_args);
99+
100+
force_args = NULL;
101+
(void) pmix_mca_base_component_var_register(comp, "force_args", "Mandatory custom arguments to srun",
93102
PMIX_MCA_BASE_VAR_TYPE_STRING,
94-
&prte_mca_plm_slurm_component.custom_args);
103+
&force_args);
95104

96-
prte_mca_plm_slurm_component.slurm_warning_msg = true;
97-
(void) pmix_mca_base_component_var_register(comp, "warning", "Turn off warning message",
105+
prte_mca_plm_slurm_component.slurm_warning_msg = false;
106+
(void) pmix_mca_base_component_var_register(comp, "disable_warning", "Turn off warning message about custom args set in environment",
98107
PMIX_MCA_BASE_VAR_TYPE_BOOL,
99108
&prte_mca_plm_slurm_component.slurm_warning_msg);
100109

@@ -108,6 +117,9 @@ static int plm_slurm_open(void)
108117

109118
static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, int *priority)
110119
{
120+
const pmix_mca_base_var_t *var;
121+
pmix_status_t rc;
122+
111123
/* Are we running under a SLURM job? */
112124

113125
if (NULL != getenv("SLURM_JOBID")) {
@@ -117,6 +129,30 @@ static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, i
117129
"%s plm:slurm: available for selection",
118130
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
119131

132+
prte_mca_plm_slurm_component.custom_args = NULL;
133+
134+
// if we are warning about externally set custom args, then
135+
// check to see if that was done
136+
if (!prte_mca_plm_slurm_component.slurm_warning_msg &&
137+
NULL == force_args) {
138+
// check for custom args
139+
rc = pmix_mca_base_var_get(prte_mca_plm_slurm_component.custom_args_index, &var);
140+
if (PMIX_SUCCESS == rc) {
141+
// the variable was set - see who set it
142+
if (PMIX_MCA_BASE_VAR_SOURCE_ENV == var->mbv_source) {
143+
// set in the environment - warn
144+
pmix_show_help("help-plm-slurm.txt", "custom-args-in-env", true,
145+
custom_args);
146+
}
147+
}
148+
}
149+
150+
if (NULL != force_args) {
151+
prte_mca_plm_slurm_component.custom_args = force_args;
152+
} else if (NULL != custom_args) {
153+
prte_mca_plm_slurm_component.custom_args = custom_args;
154+
}
155+
120156
*module = (pmix_mca_base_module_t *) &prte_plm_slurm_module;
121157
return PRTE_SUCCESS;
122158
}

0 commit comments

Comments
 (0)