Skip to content

Commit bf14425

Browse files
rhc54 authored and
hppritcha committed
Repair the binding algorithm
If we use one cpu from an object, then asking for the next object of that type within the remaining cpuset returns NULL, because not all of the cpus in that object are still available. This problem resulted from the recent change to only use available cpus in PRRTE topologies. So instead, scan across the cpus and check whether each one is inside the object of interest: if so, we can bind to that cpu; if not, we keep searching. Signed-off-by: Ralph Castain <rhc@pmix.org>
1 parent cf0bf17 commit bf14425

File tree

3 files changed

+15
-25
lines changed

3 files changed

+15
-25
lines changed

src/hwloc/hwloc_base_util.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1328,7 +1328,7 @@ void prte_hwloc_get_binding_info(hwloc_const_cpuset_t cpuset,
13281328

13291329
/* if the cpuset is all zero, then something is wrong */
13301330
if (hwloc_bitmap_iszero(cpuset)) {
1331-
snprintf(cores, sz, "\n%*c<NOT MAPPED/>\n", 20, ' ');
1331+
snprintf(cores, sz, "\n%*c<EMPTY CPUSET/>\n", 20, ' ');
13321332
}
13331333

13341334
/* if the cpuset includes all available cpus, and
@@ -1401,7 +1401,7 @@ char *prte_hwloc_base_cset2str(hwloc_const_cpuset_t cpuset,
14011401

14021402
/* if the cpuset is all zero, then something is wrong */
14031403
if (hwloc_bitmap_iszero(cpuset)) {
1404-
return strdup("NOT MAPPED");
1404+
return strdup("EMPTY CPUSET");
14051405
}
14061406

14071407
/* if the cpuset includes all available cpus, and

src/mca/rmaps/base/rmaps_base_binding.c

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* Copyright (c) 2015-2017 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2018 Inria. All rights reserved.
19-
* Copyright (c) 2021-2023 Nanook Consulting All rights reserved.
19+
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
2020
* $COPYRIGHT$
2121
*
2222
* Additional copyrights may follow
@@ -61,6 +61,7 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc,
6161
hwloc_obj_type_t type;
6262
hwloc_obj_t target;
6363
hwloc_cpuset_t tgtcpus, tmpcpus;
64+
int nobjs, n;
6465

6566
pmix_output_verbose(5, prte_rmaps_base_framework.framework_output,
6667
"mca:rmaps: bind %s with policy %s",
@@ -82,18 +83,18 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc,
8283
#endif
8384
hwloc_bitmap_and(prte_rmaps_base.baseset, options->target, tgtcpus);
8485

85-
trg_obj = NULL;
86-
/* find the first object of that type in the target that has at least one available CPU */
87-
tmp_obj = hwloc_get_next_obj_inside_cpuset_by_type(node->topology->topo,
88-
prte_rmaps_base.baseset,
89-
options->hwb, NULL);
90-
while (NULL != tmp_obj) {
86+
nobjs = hwloc_get_nbobjs_by_type(node->topology->topo, options->hwb);
87+
88+
for (n=0; n < nobjs; n++) {
89+
tmp_obj = hwloc_get_obj_by_type(node->topology->topo, options->hwb, n);
9190
#if HWLOC_API_VERSION < 0x20000
9291
tmpcpus = tmp_obj->allowed_cpuset;
9392
#else
9493
tmpcpus = tmp_obj->cpuset;
9594
#endif
9695
hwloc_bitmap_and(prte_rmaps_base.available, node->available, tmpcpus);
96+
hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, prte_rmaps_base.baseset);
97+
9798
if (options->use_hwthreads) {
9899
ncpus = hwloc_bitmap_weight(prte_rmaps_base.available);
99100
} else {
@@ -112,9 +113,6 @@ static int bind_generic(prte_job_t *jdata, prte_proc_t *proc,
112113
trg_obj = tmp_obj;
113114
break;
114115
}
115-
tmp_obj = hwloc_get_next_obj_inside_cpuset_by_type(node->topology->topo,
116-
prte_rmaps_base.baseset,
117-
options->hwb, tmp_obj);
118116
}
119117
if (NULL == trg_obj) {
120118
/* there aren't any appropriate targets under this object */

src/mca/rmaps/base/rmaps_base_support_fns.c

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -631,27 +631,18 @@ int prte_rmaps_base_get_ncpus(prte_node_t *node,
631631
{
632632
int ncpus;
633633

634-
#if HWLOC_API_VERSION < 0x20000
635-
hwloc_obj_t root;
636-
root = hwloc_get_root_obj(node->topology->topo);
637634
if (NULL == options->job_cpuset) {
638-
hwloc_bitmap_copy(prte_rmaps_base.available, root->allowed_cpuset);
635+
hwloc_bitmap_copy(prte_rmaps_base.available, node->available);
639636
} else {
640-
hwloc_bitmap_and(prte_rmaps_base.available, root->allowed_cpuset, options->job_cpuset);
637+
hwloc_bitmap_and(prte_rmaps_base.available, node->available, options->job_cpuset);
641638
}
642639
if (NULL != obj) {
640+
#if HWLOC_API_VERSION < 0x20000
643641
hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, obj->allowed_cpuset);
644-
}
645642
#else
646-
if (NULL == options->job_cpuset) {
647-
hwloc_bitmap_copy(prte_rmaps_base.available, hwloc_topology_get_allowed_cpuset(node->topology->topo));
648-
} else {
649-
hwloc_bitmap_and(prte_rmaps_base.available, hwloc_topology_get_allowed_cpuset(node->topology->topo), options->job_cpuset);
650-
}
651-
if (NULL != obj) {
652643
hwloc_bitmap_and(prte_rmaps_base.available, prte_rmaps_base.available, obj->cpuset);
653-
}
654644
#endif
645+
}
655646
if (options->use_hwthreads) {
656647
ncpus = hwloc_bitmap_weight(prte_rmaps_base.available);
657648
} else {
@@ -664,6 +655,7 @@ int prte_rmaps_base_get_ncpus(prte_node_t *node,
664655
*/
665656
ncpus = hwloc_get_nbobjs_inside_cpuset_by_type(node->topology->topo, prte_rmaps_base.available, HWLOC_OBJ_CORE);
666657
}
658+
667659
return ncpus;
668660
}
669661

0 commit comments

Comments
 (0)