Skip to content

Commit 651f28e

Browse files
committed
Dev: ui_sbd: Configure crashdump watchdog timeout (jsc#PED-11931)
1 parent 341103f commit 651f28e

File tree

4 files changed

+91
-35
lines changed

4 files changed

+91
-35
lines changed

crmsh/cibquery.py

+21
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,27 @@ def has_primitive_filesystem_with_fstype(cib: lxml.etree.Element, fstype: str) -
3333
f'/instance_attributes/nvpair[@name="fstype" and @value="{fstype}"]'
3434
))
3535

36+
37+
def has_primitive(cib: lxml.etree.Element, ra: ResourceAgent) -> list[str]:
    """
    Return the ids of all primitives in the CIB that use the given resource agent.

    A primitive matches when its ``class`` and ``type`` attributes equal
    ``ra.m_class`` and ``ra.m_type``.  The ``provider`` attribute is only
    compared when ``ra.m_provider`` is non-empty, so an agent without a
    provider matches primitives of any provider.

    The result is an empty list when nothing matches.
    """
    # Only constrain the provider when the caller actually supplied one.
    if ra.m_provider:
        provider_condition = f' and @provider="{ra.m_provider}"'
    else:
        provider_condition = ""
    query = f'/cib/configuration/resources//primitive[@class="{ra.m_class}"{provider_condition} and @type="{ra.m_type}"]'
    return [primitive.get('id') for primitive in cib.xpath(query)]
46+
47+
48+
def get_parameter_value(cib: lxml.etree.Element, res_id: str, param_name: str) -> typing.Optional[str]:
    """
    Look up an instance attribute of a primitive resource in the CIB.

    Searches the ``instance_attributes/nvpair`` children of the primitive
    whose id is ``res_id`` for an nvpair named ``param_name`` and returns the
    ``value`` attribute of the first hit.  Returns None when no such nvpair
    exists (or when the first hit carries no ``value`` attribute).
    """
    query = (
        f'/cib/configuration/resources//primitive[@id="{res_id}"]'
        f'/instance_attributes/nvpair[@name="{param_name}"]'
    )
    # Only the first matching nvpair is consulted.
    for nvpair in cib.xpath(query):
        return nvpair.get('value')
    return None
55+
56+
3657
def get_cluster_nodes(cib: lxml.etree.Element) -> list[ClusterNode]:
3758
"""Return a list of cluster nodes, excluding pacemaker-remote nodes"""
3859
result = list()

crmsh/sbd.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,10 @@ def configure_sbd(self):
559559
Configure fence_sbd resource and related properties
560560
'''
561561
if self.diskless_sbd:
562-
utils.set_property("stonith-watchdog-timeout", SBDTimeout.STONITH_WATCHDOG_TIMEOUT_DEFAULT)
562+
if "stonith-watchdog" in self.timeout_dict:
563+
utils.set_property("stonith-watchdog-timeout", self.timeout_dict.get("stonith-watchdog"), conditional=True)
564+
else:
565+
utils.set_property("stonith-watchdog-timeout", SBDTimeout.STONITH_WATCHDOG_TIMEOUT_DEFAULT)
563566
else:
564567
if utils.get_property("stonith-watchdog-timeout", get_default=False):
565568
utils.delete_property("stonith-watchdog-timeout")

crmsh/ui_sbd.py

+66-27
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from crmsh import sh
1313
from crmsh import xmlutil
1414
from crmsh import constants
15+
from crmsh import cibquery
1516
from crmsh.service_manager import ServiceManager
1617

1718

@@ -88,8 +89,8 @@ class SBD(command.UI):
8889
- sbd purge
8990
'''
9091
name = "sbd"
91-
TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait")
92-
DISKLESS_TIMEOUT_TYPES = ("watchdog",)
92+
TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait", "crashdump-watchdog")
93+
DISKLESS_TIMEOUT_TYPES = ("watchdog", "crashdump-watchdog")
9394
SHOW_TYPES = ("disk_metadata", "sysconfig", "property")
9495
DISKLESS_SHOW_TYPES = ("sysconfig", "property")
9596
PCMK_ATTRS = (
@@ -102,12 +103,15 @@ class SBD(command.UI):
102103
PCMK_ATTRS_DISKLESS = ('stonith-watchdog-timeout',)
103104
PARSE_RE = re.compile(
104105
# Match keys with non-empty values, capturing possible suffix
105-
r'(\w+)(?:-(\w+))?=("[^"]+"|[\w/\d;]+)'
106+
r'([\w-]+)-([\w-]+)=([\w/\d]+)'
106107
)
107108

108109
class SyntaxError(Exception):
    """
    Error type for a malformed ``sbd`` sub-command invocation.

    NOTE(review): the raise sites are outside this view; presumably raised
    while validating/parsing command arguments -- confirm against callers.
    This name intentionally mirrors (and, where it is in scope, shadows) the
    builtin ``SyntaxError``; it derives from ``Exception`` only.
    """
110111

112+
class MissingRequiredException(Exception):
    """
    Raised when a prerequisite of the requested SBD configuration is missing.

    Used when no fence_sbd resource exists or when kdump.service is not
    active on every cluster node; ``do_configure`` catches it and returns
    False.  The error itself is logged at the raise site, so the exception
    carries no message.
    """
114+
111115
def __init__(self):
112116
self.device_list_from_config: list[str] = None
113117
self.device_meta_dict_runtime: dict[str, int] = None
@@ -271,43 +275,63 @@ def _parse_args(self, args: tuple[str, ...]) -> dict[str, int|str]:
271275
logger.debug("Parsed arguments: %s", parameter_dict)
272276
return parameter_dict
273277

274-
@staticmethod
275-
def _adjust_timeout_dict(timeout_dict: dict) -> dict:
276-
watchdog_timeout = timeout_dict.get("watchdog")
277-
msgwait_timeout = timeout_dict.get("msgwait")
278-
if watchdog_timeout and msgwait_timeout and msgwait_timeout < 2*watchdog_timeout:
279-
logger.warning("It's recommended to set msgwait timeout >= 2*watchdog timeout")
280-
return timeout_dict
281-
if watchdog_timeout and not msgwait_timeout:
282-
timeout_dict["msgwait"] = 2*watchdog_timeout
283-
logger.info("No msgwait timeout specified, use 2*watchdog timeout: %s", 2*watchdog_timeout)
284-
return timeout_dict
285-
if msgwait_timeout and not watchdog_timeout:
286-
watchdog_timeout = msgwait_timeout//2
287-
timeout_dict["watchdog"] = watchdog_timeout
288-
logger.info("No watchdog timeout specified, use msgwait timeout/2: %s", watchdog_timeout)
289-
return timeout_dict
278+
def set_crashdump_option(self):
    '''
    Enable the crashdump parameter on the fence_sbd resource

    Reads the CIB via `crm configure show xml`, looks for stonith primitives
    of type fence_sbd (any provider) and inspects the "crashdump" parameter
    of the first one found. When utils.is_boolean_false() reports that value
    as false, the parameter is set to 1 with `crm resource param`.

    Raises MissingRequiredException (after logging an error) when no
    fence_sbd resource is configured.
    '''
    local_shell = sh.LocalShell()
    cib_xml = local_shell.get_stdout_or_raise_error(None, 'crm configure show xml')
    cib = xmlutil.text2elem(cib_xml)

    fence_sbd_agent = cibquery.ResourceAgent("stonith", "", "fence_sbd")
    fence_sbd_ids = cibquery.has_primitive(cib, fence_sbd_agent)
    if not fence_sbd_ids:
        logger.error("No fence_sbd resource found")
        raise self.MissingRequiredException

    # Only the first fence_sbd primitive is examined and updated.
    target_id = fence_sbd_ids[0]
    current_value = cibquery.get_parameter_value(cib, target_id, "crashdump")
    if not utils.is_boolean_false(current_value):
        return

    local_shell.get_stdout_or_raise_error(None, f"crm resource param {target_id} set crashdump 1")
    logger.info("Set crashdump option for fence_sbd resource")
294+
295+
def is_kdump_service_active(self) -> bool:
    '''
    Check that kdump.service is active on every node in self.cluster_nodes

    All nodes are checked (no early exit) so that an error is logged for
    each node where the service is not active. Returns True only when no
    node failed the check, which includes the case of an empty node list.
    '''
    inactive_nodes = []
    for node in self.cluster_nodes:
        if self.service_manager.service_is_active("kdump.service", node):
            continue
        logger.error("Kdump service is not active on %s", node)
        inactive_nodes.append(node)
    return not inactive_nodes
290302

291303
def _configure_diskbase(self, parameter_dict: dict):
292304
'''
293305
Configure disk-based SBD based on input parameters and runtime config
294306
'''
295307
update_dict = {}
308+
timeout_dict = {
309+
item: parameter_dict.get(item) or self.device_meta_dict_runtime.get(item)
310+
for item in self.TIMEOUT_TYPES if item != "crashdump-watchdog"
311+
}
312+
313+
crashdump_watchdog_timeout = parameter_dict.get("crashdump-watchdog")
314+
if crashdump_watchdog_timeout:
315+
if not self.is_kdump_service_active():
316+
raise self.MissingRequiredException
317+
self.set_crashdump_option()
318+
timeout_dict["msgwait"] = 2*timeout_dict["watchdog"] + crashdump_watchdog_timeout
319+
logger.info("Set msgwait timeout to 2*watchdog + crashdump-watchdog: %s", timeout_dict["msgwait"])
320+
update_dict["SBD_TIMEOUT_ACTION"] = "flush,crashdump"
321+
update_dict["SBD_OPTS"] = f"-C {crashdump_watchdog_timeout}"
322+
323+
if timeout_dict["msgwait"] < 2*timeout_dict["watchdog"]:
324+
logger.warning("It's recommended to set msgwait timeout >= 2*watchdog timeout")
325+
return
326+
296327
watchdog_device = parameter_dict.get("watchdog-device")
297328
if watchdog_device != self.watchdog_device_from_config:
298329
update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
299-
timeout_dict = {k: v for k, v in parameter_dict.items() if k in self.TIMEOUT_TYPES}
300-
is_subdict_timeout = utils.is_subdict(timeout_dict, self.device_meta_dict_runtime)
301330

302-
if is_subdict_timeout and not update_dict:
331+
if timeout_dict == self.device_meta_dict_runtime and not update_dict:
303332
logger.info("No change in SBD configuration")
304333
return
305334

306-
if not is_subdict_timeout:
307-
timeout_dict = self._adjust_timeout_dict(timeout_dict)
308-
# merge runtime timeout dict into parameter timeout dict without overwriting
309-
timeout_dict = {**self.device_meta_dict_runtime, **timeout_dict}
310-
311335
sbd_manager = sbd.SBDManager(
312336
device_list_to_init=self.device_list_from_config,
313337
timeout_dict=timeout_dict,
@@ -320,17 +344,29 @@ def _configure_diskless(self, parameter_dict: dict):
320344
Configure diskless SBD based on input parameters and runtime config
321345
'''
322346
update_dict = {}
347+
timeout_dict = {}
323348
watchdog_timeout = parameter_dict.get("watchdog")
324349
if watchdog_timeout and watchdog_timeout != self.watchdog_timeout_from_config:
325350
update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout)
326351
watchdog_device = parameter_dict.get("watchdog-device")
327352
if watchdog_device != self.watchdog_device_from_config:
328353
update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
354+
crashdump_watchdog_timeout = parameter_dict.get("crashdump-watchdog")
355+
if crashdump_watchdog_timeout:
356+
if not self.is_kdump_service_active():
357+
raise self.MissingRequiredException
358+
update_dict["SBD_TIMEOUT_ACTION"] = "flush,crashdump"
359+
update_dict["SBD_OPTS"] = f"-C {crashdump_watchdog_timeout} -Z"
360+
sbd_watchdog_timeout = watchdog_timeout or self.watchdog_timeout_from_config
361+
stonith_watchdog_timeout = sbd_watchdog_timeout + crashdump_watchdog_timeout
362+
logger.info("Set stonith-watchdog-timeout to SBD_WATCHDOG_TIMEOUT + crashdump-watchdog: %s", stonith_watchdog_timeout)
363+
timeout_dict["stonith-watchdog"] = stonith_watchdog_timeout
329364
if not update_dict:
330365
logger.info("No change in SBD configuration")
331366
return
332367

333368
sbd_manager = sbd.SBDManager(
369+
timeout_dict=timeout_dict,
334370
update_dict=update_dict,
335371
diskless_sbd=True
336372
)
@@ -426,6 +462,7 @@ def do_configure(self, context, *args) -> bool:
426462
if args[0] == "show":
427463
self._configure_show(args)
428464
return True
465+
429466
parameter_dict = self._parse_args(args)
430467
if sbd.SBDUtils.is_using_disk_based_sbd():
431468
self._configure_diskbase(parameter_dict)
@@ -439,6 +476,8 @@ def do_configure(self, context, *args) -> bool:
439476
if usage:
440477
print(usage)
441478
return False
479+
except self.MissingRequiredException:
480+
return False
442481

443482
def do_purge(self, context) -> bool:
444483
'''

crmsh/utils.py

-7
Original file line numberDiff line numberDiff line change
@@ -3202,11 +3202,4 @@ def strip_ansi_escape_sequences(text):
32023202
"""
32033203
ansi_escape_pattern = re.compile(r'\x1B\[[0-?]*[ -/]*[@-~]')
32043204
return ansi_escape_pattern.sub('', text)
3205-
3206-
3207-
def is_subdict(sub_dict, main_dict):
    """
    Tell whether every key/value pair of sub_dict is also present in main_dict.

    Values are looked up with main_dict.get(), so a key that maps to None in
    sub_dict is treated as matching when main_dict lacks that key.  An empty
    sub_dict is a sub-dictionary of anything.
    """
    for key, expected in sub_dict.items():
        if not main_dict.get(key) == expected:
            return False
    return True
32123205
# vim:ts=4:sw=4:et:

0 commit comments

Comments
 (0)