From 339a6685899b55e4c92f5d9775ed493fa2e6aacb Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 13 May 2025 01:24:32 -0400 Subject: [PATCH 1/6] =?UTF-8?q?Fix=20a=20bug=20that=20was=20causing=20comp?= =?UTF-8?q?ute=20node=20self=E2=80=91termination=20to=20hang=20on=20Ubuntu?= =?UTF-8?q?=2024.04=20nodes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++-- src/slurm_plugin/computemgtd.py | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b581e203e..fe3e8b7ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,8 @@ This file is used to list changes made in each version of the aws-parallelcluste 3.13.1 ------ -**CHANGES** -- There were no changes for this version. +**BUG FIXES** +- Fix a bug that was causing compute node self‑termination to hang on Ubuntu 24.04 nodes. 3.13.0 ------ diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py index 3ae4f3261..9dfa63a70 100644 --- a/src/slurm_plugin/computemgtd.py +++ b/src/slurm_plugin/computemgtd.py @@ -125,14 +125,38 @@ def _read_nodename_from_file(nodename_file_path): raise +def _is_ubuntu2404(): + """Return True if the OS is Ubuntu 24.04.""" + try: + with open("/etc/os-release", "r") as f: + info = dict( + line.strip().split("=", 1) + for line in f + if "=" in line + ) + os_id = info.get("ID", "").strip('"').lower() + version = info.get("VERSION_ID", "").strip('"') + return os_id == "ubuntu" and version.startswith("24.04") + except Exception as e: + log.warning("Unable to detect OS version from /etc/os-release: %s", e) + return False + + @log_exception(log, "self terminating compute instance", catch_exception=CalledProcessError, raise_on_error=False) def _self_terminate(): """Self terminate the instance.""" # Sleep for 10 seconds so termination log entries are uploaded to CW logs log.info("Preparing to self terminate the instance in 10 seconds!") time.sleep(10) + if _is_ubuntu2404(): + shutdown_cmd = "sudo systemctl poweroff --force" + log.info("Detected Ubuntu 24.04 – using `%s`", shutdown_cmd) + else: + shutdown_cmd = "sudo shutdown -h now" + log.info("Using default shutdown command `%s`", shutdown_cmd) + log.info("Self terminating instance now!") - run_command("sudo shutdown -h now") + run_command(shutdown_cmd) @retry(stop_max_attempt_number=3, wait_fixed=1500) From 4ed70cb3eaf09f778dc16cc6202bda3293c0c64d Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 13 May 2025 09:47:29 -0400 Subject: [PATCH 2/6] Auto Format --- src/slurm_plugin/computemgtd.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py index 9dfa63a70..5c88a8916 100644 --- a/src/slurm_plugin/computemgtd.py +++ b/src/slurm_plugin/computemgtd.py @@ -129,11 +129,7 @@ def _is_ubuntu2404(): """Return True if the OS is Ubuntu 24.04.""" try: with open("/etc/os-release", "r") as f: - info = dict( - line.strip().split("=", 1) - for line in f - if "=" in line - ) + info = dict(line.strip().split("=", 1) for line in f if "=" in line) os_id = info.get("ID", "").strip('"').lower() version = info.get("VERSION_ID", "").strip('"') return os_id == "ubuntu" and version.startswith("24.04") From 61db63043b2b48284af271ce66450f63677ebe36 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 14 May 2025 02:25:29 -0400 Subject: [PATCH 3/6] Add a 600s sleep before self terminate to test. --- src/slurm_plugin/computemgtd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py index 5c88a8916..7c0ce3939 100644 --- a/src/slurm_plugin/computemgtd.py +++ b/src/slurm_plugin/computemgtd.py @@ -152,6 +152,7 @@ def _self_terminate(): log.info("Using default shutdown command `%s`", shutdown_cmd) log.info("Self terminating instance now!") + time.sleep(600) run_command(shutdown_cmd) From 989d05a9a925a8ee299236c7e5c8a0c894f99fca Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 15 May 2025 14:09:14 -0400 Subject: [PATCH 4/6] Revert "Add a 600s sleep before self terminate to test." This reverts commit 61db63043b2b48284af271ce66450f63677ebe36. --- src/slurm_plugin/computemgtd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py index 7c0ce3939..5c88a8916 100644 --- a/src/slurm_plugin/computemgtd.py +++ b/src/slurm_plugin/computemgtd.py @@ -152,7 +152,6 @@ def _self_terminate(): log.info("Using default shutdown command `%s`", shutdown_cmd) log.info("Self terminating instance now!") - time.sleep(600) run_command(shutdown_cmd) From 0d1220086dca2f35fc36c6701f611cc2e14bc401 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 15 May 2025 14:37:26 -0400 Subject: [PATCH 5/6] Add new test cases to cover _is_ubuntu2404, and fix unit tests being affected. --- tests/slurm_plugin/test_computemgtd.py | 32 +++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/tests/slurm_plugin/test_computemgtd.py b/tests/slurm_plugin/test_computemgtd.py index ba9745609..b60863959 100644 --- a/tests/slurm_plugin/test_computemgtd.py +++ b/tests/slurm_plugin/test_computemgtd.py @@ -16,8 +16,9 @@ import pytest import slurm_plugin from assertpy import assert_that -from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate +from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate, _is_ubuntu2404 from slurm_plugin.slurm_resources import DynamicNode +from unittest.mock import mock_open @pytest.mark.parametrize( @@ -103,13 +104,38 @@ def test_is_self_node_down(mock_node_info, expected_result, mocker): assert_that(_is_self_node_down("queue1-st-c5xlarge-1")).is_equal_to(expected_result) -def test_self_terminate(mocker, caplog): +@pytest.mark.parametrize( + ("is_ubuntu2404", "expected_cmd"), + [ + (True, "sudo systemctl poweroff --force"), + (False, "sudo shutdown -h now"), + ], +) +def test_self_terminate(mocker, caplog, is_ubuntu2404, expected_cmd): """Verify self-termination is implemented via a shutdown command rather than calling TerminateInstances.""" + mocker.patch("slurm_plugin.computemgtd._is_ubuntu2404", return_value=is_ubuntu2404) run_command_patch = mocker.patch("slurm_plugin.computemgtd.run_command") sleep_patch = mocker.patch("slurm_plugin.computemgtd.time.sleep") with caplog.at_level(logging.INFO): _self_terminate() assert_that(caplog.text).contains("Preparing to self terminate the instance in 10 seconds!") assert_that(caplog.text).contains("Self terminating instance now!") - run_command_patch.assert_called_with("sudo shutdown -h now") + run_command_patch.assert_called_with(expected_cmd) sleep_patch.assert_called_with(10) + + +@pytest.mark.parametrize( + ("file_content", "expected"), + [ + ('ID=ubuntu\nVERSION_ID="24.04"\n', True), + ('ID=ubuntu\nVERSION_ID="24.04.2"\n', True), + ('ID=ubuntu\nVERSION_ID="22.04"\n', False), + ('ID=rocky\nVERSION_ID="9.3"\n', False), + ('ID=ubuntu\n', False), + ], +) +def test_is_ubuntu2404(file_content, expected, mocker): + m = mock_open(read_data=file_content) + mocker.patch("builtins.open", m) + + assert _is_ubuntu2404() is expected From 6db93e2a01f9357c9023b69d3dbf5366e7ed5b94 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 15 May 2025 15:01:13 -0400 Subject: [PATCH 6/6] Reformat --- tests/slurm_plugin/test_computemgtd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/slurm_plugin/test_computemgtd.py b/tests/slurm_plugin/test_computemgtd.py index b60863959..d28354211 100644 --- a/tests/slurm_plugin/test_computemgtd.py +++ b/tests/slurm_plugin/test_computemgtd.py @@ -12,13 +12,13 @@ import logging import os +from unittest.mock import mock_open import pytest import slurm_plugin from assertpy import assert_that -from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate, _is_ubuntu2404 +from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _is_ubuntu2404, _self_terminate from slurm_plugin.slurm_resources import DynamicNode -from unittest.mock import mock_open @pytest.mark.parametrize( @@ -131,7 +131,7 @@ def test_self_terminate(mocker, caplog, is_ubuntu2404, expected_cmd): ('ID=ubuntu\nVERSION_ID="24.04.2"\n', True), ('ID=ubuntu\nVERSION_ID="22.04"\n', False), ('ID=rocky\nVERSION_ID="9.3"\n', False), - ('ID=ubuntu\n', False), + ("ID=ubuntu\n", False), ], ) def test_is_ubuntu2404(file_content, expected, mocker):