diff --git a/CHANGELOG.md b/CHANGELOG.md index b581e203..fe3e8b7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,8 @@ This file is used to list changes made in each version of the aws-parallelcluste 3.13.1 ------ -**CHANGES** -- There were no changes for this version. +**BUG FIXES** +- Fix a bug that was causing compute node self‑termination to hang on Ubuntu 24.04 nodes. 3.13.0 ------ diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py index 3ae4f326..5c88a891 100644 --- a/src/slurm_plugin/computemgtd.py +++ b/src/slurm_plugin/computemgtd.py @@ -125,14 +125,34 @@ def _read_nodename_from_file(nodename_file_path): raise +def _is_ubuntu2404(): + """Return True if the OS is Ubuntu 24.04.""" + try: + with open("/etc/os-release", "r") as f: + info = dict(line.strip().split("=", 1) for line in f if "=" in line) + os_id = info.get("ID", "").strip('"').lower() + version = info.get("VERSION_ID", "").strip('"') + return os_id == "ubuntu" and version.startswith("24.04") + except Exception as e: + log.warning("Unable to detect OS version from /etc/os-release: %s", e) + return False + + @log_exception(log, "self terminating compute instance", catch_exception=CalledProcessError, raise_on_error=False) def _self_terminate(): """Self terminate the instance.""" # Sleep for 10 seconds so termination log entries are uploaded to CW logs log.info("Preparing to self terminate the instance in 10 seconds!") time.sleep(10) + if _is_ubuntu2404(): + shutdown_cmd = "sudo systemctl poweroff --force" + log.info("Detected Ubuntu 24.04 – using `%s`", shutdown_cmd) + else: + shutdown_cmd = "sudo shutdown -h now" + log.info("Using default shutdown command `%s`", shutdown_cmd) + log.info("Self terminating instance now!") - run_command("sudo shutdown -h now") + run_command(shutdown_cmd) @retry(stop_max_attempt_number=3, wait_fixed=1500) diff --git a/tests/slurm_plugin/test_computemgtd.py b/tests/slurm_plugin/test_computemgtd.py index ba974560..d2835421 100644 --- a/tests/slurm_plugin/test_computemgtd.py +++ b/tests/slurm_plugin/test_computemgtd.py @@ -12,11 +12,12 @@ import logging import os +from unittest.mock import mock_open import pytest import slurm_plugin from assertpy import assert_that -from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate +from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _is_ubuntu2404, _self_terminate from slurm_plugin.slurm_resources import DynamicNode @@ -103,13 +104,38 @@ def test_is_self_node_down(mock_node_info, expected_result, mocker): assert_that(_is_self_node_down("queue1-st-c5xlarge-1")).is_equal_to(expected_result) -def test_self_terminate(mocker, caplog): +@pytest.mark.parametrize( + ("is_ubuntu2404", "expected_cmd"), + [ + (True, "sudo systemctl poweroff --force"), + (False, "sudo shutdown -h now"), + ], +) +def test_self_terminate(mocker, caplog, is_ubuntu2404, expected_cmd): """Verify self-termination is implemented via a shutdown command rather than calling TerminateInstances.""" + mocker.patch("slurm_plugin.computemgtd._is_ubuntu2404", return_value=is_ubuntu2404) run_command_patch = mocker.patch("slurm_plugin.computemgtd.run_command") sleep_patch = mocker.patch("slurm_plugin.computemgtd.time.sleep") with caplog.at_level(logging.INFO): _self_terminate() assert_that(caplog.text).contains("Preparing to self terminate the instance in 10 seconds!") assert_that(caplog.text).contains("Self terminating instance now!") - run_command_patch.assert_called_with("sudo shutdown -h now") + run_command_patch.assert_called_with(expected_cmd) sleep_patch.assert_called_with(10) + + +@pytest.mark.parametrize( + ("file_content", "expected"), + [ + ('ID=ubuntu\nVERSION_ID="24.04"\n', True), + ('ID=ubuntu\nVERSION_ID="24.04.2"\n', True), + ('ID=ubuntu\nVERSION_ID="22.04"\n', False), + ('ID=rocky\nVERSION_ID="9.3"\n', False), + ("ID=ubuntu\n", False), + ], +) +def test_is_ubuntu2404(file_content, expected, mocker): + m = mock_open(read_data=file_content) + mocker.patch("builtins.open", m) + + assert _is_ubuntu2404() is expected