Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox][Nvidia-BF] Enable Watchdog support for Smartswitch DPU #22046

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 67 additions & 1 deletion platform/nvidia-bluefield/platform-api/sonic_platform/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -14,6 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import ctypes
import functools
import subprocess
from sonic_py_common.logger import Logger
Expand Down Expand Up @@ -71,3 +73,67 @@ def _impl(*args, **kwargs):
return return_value
return _impl
return wrapper


def read_from_file(file_path, target_type, default='', raise_exception=False, log_func=logger.log_error):
    """
    Read a file, strip surrounding whitespace and convert the text with target_type
    :param file_path: File path
    :param target_type: Callable applied to the stripped file content (e.g. int, float, str)
    :param default: Value returned when reading or converting fails and raise_exception is False
    :param raise_exception: Re-raise the ValueError/IOError to the caller if True else return default
    :param log_func: Function used to log the failure; pass a falsy value to suppress that log
    :return: The converted file content, or default on a suppressed failure
    """
    try:
        with open(file_path, 'r') as handle:
            raw = handle.read()
            if raw is None:
                # None return value is not allowed in any case, so we log error here for further debug.
                logger.log_error('Failed to read from {}, value is None, errno is {}'.format(file_path, ctypes.get_errno()))
                # Raise ValueError so the handler below treats this like any other read failure
                raise ValueError('File content of {} is None'.format(file_path))
            # target_type may itself raise ValueError on malformed content
            return target_type(raw.strip())
    except (ValueError, IOError) as err:
        if log_func:
            log_func('Failed to read from file {} - {}'.format(file_path, repr(err)))
        if raise_exception:
            raise err
        return default


def read_float_from_file(file_path, default=0.0, raise_exception=False, log_func=logger.log_error):
    """
    Read content from file and cast it to float
    :param file_path: File path
    :param default: Value returned when reading or converting fails and raise_exception is False
    :param raise_exception: Raise exception to caller if True else just return default value
    :param log_func: function to log the error
    :return: Float value of the file content
    """
    # Thin wrapper: all reading, conversion and error handling lives in read_from_file.
    return read_from_file(file_path, float, default, raise_exception, log_func)


def write_file(file_path, content, raise_exception=False, log_func=logger.log_error):
    """
    Write str(content) to a file, replacing any previous content
    :param file_path: File path
    :param content: Value to write to the file; converted with str() first
    :param raise_exception: Re-raise the ValueError/IOError to the caller if True
    :param log_func: Function used to log the failure; pass a falsy value to suppress that log
    :return: True if write success else False
    """
    try:
        with open(file_path, 'w') as handle:
            handle.write(str(content))
    except (ValueError, IOError) as err:
        if log_func:
            log_func('Failed to write {} to file {} - {}'.format(content, file_path, repr(err)))
        if raise_exception:
            raise err
        return False
    else:
        return True
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -15,17 +16,53 @@
# limitations under the License.
#


try:
from sonic_platform_base.watchdog_base import WatchdogBase
from sonic_py_common.logger import Logger
from sonic_py_common.syslogger import SysLogger
import time
from . import utils
import os
import subprocess
except ImportError as e:
raise ImportError(str(e) + "- required module not found")

# Module-wide logger; messages are tagged with the 'Watchdog' identifier.
logger = SysLogger(log_identifier='Watchdog')

# Value the Watchdog methods below return on failure: arm() when arming
# does not succeed, get_remaining_time() when the watchdog is not armed.
WD_COMMON_ERROR = -1


class Watchdog(WatchdogBase):
    """Boot watchdog control implemented on top of the mlxbf-bootctl tool.

    The watchdog mode and interval are read and written by invoking
    mlxbf-bootctl. The time.monotonic() value at the moment of a successful
    arm() is stored in TIMESTAMP_FILE so that get_remaining_time() can
    estimate how much of the configured interval is left.
    """

    def __init__(self):
        # Command line tool used to query and configure the boot watchdog.
        self.MLXBF_DRIVER = "mlxbf-bootctl"
        # File holding the time.monotonic() value recorded by arm().
        self.TIMESTAMP_FILE = '/tmp/nvidia/watchdog_timestamp'

    def exec_cmd(self, cmd, raise_exception=True):
        """
        Execute a command related to the watchdog api.

        Args:
            cmd: Command and arguments as a sequence of strings.
            raise_exception: If True, propagate any failure to the caller;
                otherwise log it at info level and return None.

        Returns:
            The decoded, stripped stdout of the command, or None when the
            command failed and raise_exception is False.
        """
        try:
            return subprocess.check_output(cmd).decode().strip()
        except Exception:
            if raise_exception:
                # Bare raise keeps the original traceback intact.
                raise
            logger.log_info(f"Failed to run cmd {' '.join(cmd)}")
            return None

    def get_conf_time_and_mode(self):
        """
        Obtain the mode in which the watchdog is configured and the time.

        Runs mlxbf-bootctl without arguments and parses its "key: value"
        output lines.

        Returns:
            A tuple (mode, interval). mode is the 'boot watchdog mode' value
            and interval the integer 'boot watchdog interval' value. Falls
            back to "disabled" / 0 for anything that could not be obtained.
        """
        # The command must be an argv list, not a set literal.
        status_cmd = [self.MLXBF_DRIVER]
        ret_status = "disabled"
        ret_time = 0
        try:
            bootctl_stat_dict = {}
            for item in self.exec_cmd(status_cmd).split('\n'):
                fields = item.split(': ')
                # Skip blank or malformed lines instead of letting a single
                # one discard the whole status output.
                if len(fields) < 2:
                    continue
                bootctl_stat_dict[fields[0]] = fields[1]
            ret_status = bootctl_stat_dict.get('boot watchdog mode', 'disabled')
            ret_time = int(bootctl_stat_dict.get('boot watchdog interval', 0))
        except Exception as err:
            logger.log_error(f"Could not obtain status usind mlxbf-bootctl :{err}")
        return ret_status, ret_time

    def arm(self, seconds):
        """
        Arm the hardware watchdog with a timeout of <seconds> seconds.

        Only values in the range 45-4095 are accepted. If the watchdog is
        already in 'standard' mode with exactly this interval, the tool is
        not invoked again and the stored arm timestamp is left untouched.
        Otherwise the watchdog is configured through mlxbf-bootctl and the
        current time.monotonic() value is written to TIMESTAMP_FILE.

        Returns:
            An integer specifying the *actual* number of seconds the watchdog
            was armed with. On failure returns -1.
        """
        arm_time = WD_COMMON_ERROR
        if seconds < 45 or seconds > 4095:
            logger.log_error(f"Arm time provided {seconds} is outside the valid range 45-4095")
            return arm_time
        arm_command = [self.MLXBF_DRIVER, "--watchdog-boot-mode", "standard",
                       "--watchdog-boot-interval", str(seconds)]
        try:
            if self.is_armed_for_time(seconds):
                # Already armed for the time specified
                return seconds
            self.exec_cmd(arm_command)
            arm_time = seconds
            # The timestamp directory may not exist yet.
            os.makedirs('/tmp/nvidia', exist_ok=True)
            utils.write_file(self.TIMESTAMP_FILE, str(time.monotonic()))
        except Exception as err:
            # check_output raises on a non-zero exit code; report the failure
            logger.log_error(f"Could not arm watchdog :{err}")
        return arm_time

    def disarm(self):
        """
        Disarm the hardware watchdog by switching its mode to "disabled".

        Returns:
            A boolean, True if watchdog is disarmed successfully, False if not
        """
        disarm_command = [self.MLXBF_DRIVER, "--watchdog-boot-mode", "disabled"]
        try:
            self.exec_cmd(disarm_command)
        except Exception as err:
            # check_output raises on a non-zero exit code; report the failure
            logger.log_error(f"Could not disarm watchdog :{err}")
            return False
        return True

    def is_armed_for_time(self, time_check=None):
        """
        Retrieve the armed state of the hardware watchdog and optionally
        verify the configured interval.

        Args:
            time_check: Interval in seconds to compare against the configured
                one. When None, only the armed state is checked.

        Returns:
            A boolean, True if the watchdog is in 'standard' mode (and, when
            time_check is given, configured with exactly that interval),
            False if not
        """
        conf_mode, conf_time = self.get_conf_time_and_mode()
        armed = conf_mode == 'standard'
        # Compare against None explicitly so an explicit 0 is still treated
        # as a value to check rather than as "not provided".
        if time_check is None:
            return armed
        return armed and (time_check == conf_time)

    def is_armed(self):
        """
        Retrieves the armed state of the hardware watchdog.

        Returns:
            A boolean, True if watchdog is armed, False if not
        """
        return self.is_armed_for_time()

    def get_remaining_time(self):
        """
        Estimate the number of seconds remaining on the watchdog timer.

        The estimate is the configured interval minus the time elapsed since
        the timestamp stored in TIMESTAMP_FILE, never below 0.

        Returns:
            An integer specifying the number of seconds remaining on the
            watchdog timer. If the watchdog is not armed, or the stored
            timestamp cannot be read, returns -1.
        """
        # Query the tool once and reuse both values instead of running it a
        # second time after the armed check.
        conf_mode, conf_time = self.get_conf_time_and_mode()
        if conf_mode != 'standard':
            return WD_COMMON_ERROR
        try:
            arm_timestamp = utils.read_float_from_file(self.TIMESTAMP_FILE, raise_exception=True)
        except (ValueError, IOError):
            # Timestamp file is missing or unreadable, so no estimate is
            # possible. read_float_from_file has already logged the failure.
            return WD_COMMON_ERROR
        # Clamp at 0 so an elapsed interval cannot yield a negative value
        # that would be confused with the -1 sentinel.
        return max(0, int(conf_time - (time.monotonic() - arm_timestamp)))
48 changes: 48 additions & 0 deletions platform/nvidia-bluefield/platform-api/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import pytest
import sys
from sonic_platform import utils

# Directory containing this test file.
test_path = os.path.dirname(os.path.abspath(__file__))
# Parent of the tests directory.
modules_path = os.path.dirname(test_path)
# Put the parent directory first on the module search path.
# NOTE(review): the `from sonic_platform import utils` import above has already
# run by this point, so this insertion cannot influence it - confirm whether
# the import was meant to come after this line.
sys.path.insert(0, modules_path)


class TestUtils:
    """Tests for the file read/write helpers in sonic_platform.utils."""

    def test_read_file(self):
        """Check read_float_from_file on a missing file and on written values."""
        # Missing file, default behaviour: the supplied default comes back.
        ret = utils.read_float_from_file('not exist', 3.14)
        assert ret == 3.14

        # Missing file with raise_exception=True: the IOError must propagate.
        # No statement may follow the call inside this block, because it
        # would never execute once the exception is raised.
        with pytest.raises(IOError):
            utils.read_float_from_file('not exist', 2.25, raise_exception=True)

        file_path = '/tmp/test.txt'
        utils.write_file(file_path, '3.09')
        assert utils.read_float_from_file(file_path) == 3.09
        utils.write_file(file_path, '3.00')
        assert utils.read_float_from_file(file_path) == 3

    def test_write_file(self):
        """Check write_file round-trips a value and raises on a bad path."""
        file_path = '/tmp/test.txt'
        utils.write_file(file_path, '3.14')
        assert utils.read_float_from_file(file_path) == 3.14

        # Parent directory does not exist, so opening for write fails.
        with pytest.raises(IOError):
            utils.write_file('/not/exist/file', '123', raise_exception=True)
Loading
Loading