
Commit f195a89

Merge pull request #1401 from liangxin1300/20240408_stage_dependency_crmsh46

[crmsh-4.6] Setup bootstrap stages dependency

2 parents 73eaf02 + b750e09

File tree

7 files changed: +151 -63 lines changed

crmsh/bootstrap.py

+108 -43
@@ -47,6 +47,7 @@
 from .sh import ShellUtils
 from .ui_node import NodeMgmt
 from .user_of_host import UserOfHost, UserNotFoundError
+import crmsh.healthcheck

 logger = log.setup_logger(__name__)
 logger_utils = log.LoggerUtils(logger)
@@ -74,7 +75,11 @@
 "/etc/drbd.conf", "/etc/drbd.d", "/etc/ha.d/ldirectord.cf", "/etc/lvm/lvm.conf", "/etc/multipath.conf",
 "/etc/samba/smb.conf", SYSCONFIG_NFS, SYSCONFIG_PCMK, SYSCONFIG_SBD, PCMK_REMOTE_AUTH, WATCHDOG_CFG,
 PROFILES_FILE, CRM_CFG, SBD_SYSTEMD_DELAY_START_DIR)
-INIT_STAGES = ("ssh", "csync2", "csync2_remote", "qnetd_remote", "corosync", "remote_auth", "sbd", "cluster", "ocfs2", "admin", "qdevice")
+
+INIT_STAGES_EXTERNAL = ("ssh", "csync2", "corosync", "sbd", "cluster", "ocfs2", "admin", "qdevice")
+INIT_STAGES_INTERNAL = ("csync2_remote", "qnetd_remote", "remote_auth")
+INIT_STAGES_ALL = INIT_STAGES_EXTERNAL + INIT_STAGES_INTERNAL
+JOIN_STAGES_EXTERNAL = ("ssh", "csync2", "ssh_merge", "cluster")


 class Context(object):
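The split between external and internal stages matters for error reporting as well as validation: judging by how _validate_stage() in the next hunk uses these tuples, internal stages (invoked node-to-node during bootstrap) are still accepted when requested, but only the external stages are advertised to the user. A trivial, runnable illustration of that asymmetry:

INIT_STAGES_EXTERNAL = ("ssh", "csync2", "corosync", "sbd", "cluster", "ocfs2", "admin", "qdevice")
INIT_STAGES_INTERNAL = ("csync2_remote", "qnetd_remote", "remote_auth")
INIT_STAGES_ALL = INIT_STAGES_EXTERNAL + INIT_STAGES_INTERNAL

assert "csync2_remote" in INIT_STAGES_ALL           # accepted if explicitly requested
assert "csync2_remote" not in INIT_STAGES_EXTERNAL  # but never listed in error messages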
@@ -232,15 +237,40 @@ def _validate_cluster_node(self):
         """
         Validate cluster_node on join side
         """
-        if self.cluster_node and self.type == 'join':
+        if self.type == "join" and self.cluster_node:
             user, node = _parse_user_at_host(self.cluster_node, None)
             try:
                 # self.cluster_node might be hostname or IP address
                 ip_addr = socket.gethostbyname(node)
                 if utils.InterfacesInfo.ip_in_local(ip_addr):
                     utils.fatal("Please specify peer node's hostname or IP address")
             except socket.gaierror as err:
-                utils.fatal("\"{}\": {}".format(node, err))
+                utils.fatal(f"\"{node}\": {err}")
+
+    def _validate_stage(self):
+        """
+        Validate stage argument
+        """
+        if not self.stage:
+            if self.cluster_is_running:
+                utils.fatal("Cluster is already running!")
+            return
+
+        if self.type == "init":
+            if self.stage not in INIT_STAGES_ALL:
+                utils.fatal(f"Invalid stage: {self.stage}(available stages: {', '.join(INIT_STAGES_EXTERNAL)})")
+            if self.stage in ("admin", "qdevice", "ocfs2") and not self.cluster_is_running:
+                utils.fatal(f"Cluster is inactive, can't run '{self.stage}' stage")
+            if self.stage in ("corosync", "cluster") and self.cluster_is_running:
+                utils.fatal(f"Cluster is active, can't run '{self.stage}' stage")
+
+        elif self.type == "join":
+            if self.stage not in JOIN_STAGES_EXTERNAL:
+                utils.fatal(f"Invalid stage: {self.stage}(available stages: {', '.join(JOIN_STAGES_EXTERNAL)})")
+            if self.stage and self.cluster_node is None:
+                utils.fatal(f"Can't use stage({self.stage}) without specifying cluster node")
+            if self.stage in ("cluster", ) and self.cluster_is_running:
+                utils.fatal(f"Cluster is active, can't run '{self.stage}' stage")

     def validate_option(self):
         """
@@ -263,6 +293,7 @@ def validate_option(self):
         self.skip_csync2 = utils.get_boolean(os.getenv("SKIP_CSYNC2_SYNC"))
         if self.skip_csync2 and self.stage:
             utils.fatal("-x option or SKIP_CSYNC2_SYNC can't be used with any stage")
+        self._validate_stage()
         self._validate_cluster_node()
         self._validate_nodes_option()
         self._validate_sbd_option()
@@ -553,7 +584,7 @@ def my_hostname_resolves():
     return False


-def check_prereqs(stage):
+def check_prereqs():
     warned = False

     if not my_hostname_resolves():
@@ -1710,6 +1741,9 @@ def join_ssh_impl(local_user, seed_host, seed_user, ssh_public_keys: typing.List
     change_user_shell('hacluster')
     swap_public_ssh_key_for_secondary_user(sh.cluster_shell(), seed_host, 'hacluster')

+    if _context.stage:
+        setup_passwordless_with_other_nodes(seed_host, seed_user)
+

 def join_ssh_with_ssh_agent(
         local_shell: sh.LocalShell,
@@ -2367,46 +2401,87 @@ def decrease_expected_votes():
     corosync.set_value("quorum.expected_votes", str(new_quorum))


+def ssh_stage_finished():
+    """
+    Detect if the ssh stage is finished
+    """
+    feature_check = crmsh.healthcheck.PasswordlessHaclusterAuthenticationFeature()
+    return feature_check.check_quick() and feature_check.check_local([utils.this_node()])
+
+
+def csync2_stage_finished():
+    """
+    Detect if the csync2 stage is finished
+    """
+    return ServiceManager().service_is_active(CSYNC2_SERVICE)
+
+
+def corosync_stage_finished():
+    """
+    Detect if the corosync stage is finished
+    """
+    return os.path.exists(corosync.conf())
+
+
+INIT_STAGE_CHECKER = {
+    # stage: (function, is_internal)
+    "ssh": (ssh_stage_finished, False),
+    "csync2": (csync2_stage_finished, False),
+    "corosync": (corosync_stage_finished, False),
+    "remote_auth": (init_remote_auth, True),
+    "sbd": (lambda: True, False),
+    "upgradeutil": (init_upgradeutil, True),
+    "cluster": (is_online, False)
+}
+
+
+JOIN_STAGE_CHECKER = {
+    # stage: (function, is_internal)
+    "ssh": (ssh_stage_finished, False),
+    "csync2": (csync2_stage_finished, False),
+    "ssh_merge": (lambda: True, False),
+    "cluster": (is_online, False)
+}
+
+
+def check_stage_dependency(stage):
+    stage_checker = INIT_STAGE_CHECKER if _context.type == "init" else JOIN_STAGE_CHECKER
+    if stage not in stage_checker:
+        return
+    stage_order = list(stage_checker.keys())
+    for stage_name in stage_order:
+        if stage == stage_name:
+            break
+        func, is_internal = stage_checker[stage_name]
+        if is_internal:
+            func()
+        elif not func():
+            utils.fatal(f"Please run '{stage_name}' stage first")
+
+
 def bootstrap_init(context):
     """
     Init cluster process
     """
     global _context
     _context = context
+    stage = _context.stage

     init()

-    stage = _context.stage
-    if stage is None:
-        stage = ""
-
-    # vgfs stage requires running cluster, everything else requires inactive cluster,
-    # except ssh and csync2 (which don't care) and csync2_remote (which mustn't care,
-    # just in case this breaks ha-cluster-join on another node).
-    if stage in ("vgfs", "admin", "qdevice", "ocfs2"):
-        if not _context.cluster_is_running:
-            utils.fatal("Cluster is inactive - can't run %s stage" % (stage))
-    elif stage == "":
-        if _context.cluster_is_running:
-            utils.fatal("Cluster is currently active - can't run")
-    elif stage not in ("ssh", "csync2", "csync2_remote", "qnetd_remote", "sbd", "ocfs2"):
-        if _context.cluster_is_running:
-            utils.fatal("Cluster is currently active - can't run %s stage" % (stage))
-
     _context.load_profiles()
     _context.init_sbd_manager()

-    # Need hostname resolution to work, want NTP (but don't block csync2_remote)
-    if stage not in ('csync2_remote', 'qnetd_remote'):
-        check_tty()
-        if not check_prereqs(stage):
-            return
-    else:
+    if stage in ('csync2_remote', 'qnetd_remote'):
         args = _context.args
-        logger_utils.log_only_to_file("args: {}".format(args))
+        logger_utils.log_only_to_file(f"args: {args}")
         if len(args) != 2:
-            utils.fatal(f"Expected NODE argument to {stage} stage")
+            utils.fatal(f"Expected NODE argument for '{stage}' stage")
         _context.cluster_node = args[1]
+    else:
+        check_tty()
+        if not check_prereqs():
+            return

     if stage and _context.cluster_is_running and \
             not ServiceManager(shell=sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active(CSYNC2_SERVICE):
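check_stage_dependency() leans on dict insertion order (guaranteed for plain dicts since Python 3.7): walking INIT_STAGE_CHECKER or JOIN_STAGE_CHECKER from the top, every earlier external stage must report finished before the requested stage may run, while internal stages are simply executed along the way. A toy model with stubbed checkers, showing the error a skipped stage produces:

def check_stage_dependency(stage, checker):
    """Walk earlier stages in dict order; stop at the requested stage."""
    if stage not in checker:
        return
    for name, (func, is_internal) in checker.items():
        if name == stage:
            break
        if is_internal:
            func()                 # internal stages are (re-)executed, not checked
        elif not func():
            raise SystemExit(f"Please run '{name}' stage first")

stub_checker = {
    # stage: (finished-check or setup function, is_internal)
    "ssh":      (lambda: True,  False),   # pretend the ssh stage is done
    "csync2":   (lambda: False, False),   # pretend csync2 has not run yet
    "corosync": (lambda: False, False),
    "cluster":  (lambda: False, False),
}
check_stage_dependency("cluster", stub_checker)
# -> SystemExit: Please run 'csync2' stage first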
@@ -2416,6 +2491,7 @@ def bootstrap_init(context):
     _context.node_list_in_cluster = [utils.this_node()]

     if stage != "":
+        check_stage_dependency(stage)
         globals()["init_" + stage]()
     else:
         init_ssh()
@@ -2492,15 +2568,13 @@ def bootstrap_join(context):

     check_tty()

-    corosync_active = ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active("corosync.service")
-    if corosync_active and _context.stage != "ssh":
-        utils.fatal("Abort: Cluster is currently active. Run this command on a node joining the cluster.")
-
-    if not check_prereqs("join"):
+    if not check_prereqs():
         return

     if _context.stage != "":
         remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
+        init_upgradeutil()
+        check_stage_dependency(_context.stage)
         globals()["join_" + _context.stage](cluster_node, remote_user)
     else:
         if not _context.yes_to_all and _context.cluster_node is None:
@@ -2527,7 +2601,6 @@ def bootstrap_join(context):
         service_manager = ServiceManager()
         _context.node_list_in_cluster = utils.fetch_cluster_node_list_from_node(cluster_node)
         setup_passwordless_with_other_nodes(cluster_node, remote_user)
-        join_remote_auth(cluster_node, remote_user)
         _context.skip_csync2 = not service_manager.service_is_active(CSYNC2_SERVICE, cluster_node)
         if _context.skip_csync2:
             service_manager.stop_service(CSYNC2_SERVICE, disable=True)
@@ -2557,14 +2630,6 @@ def join_ocfs2(peer_host, peer_user):
     ocfs2_inst.join_ocfs2(peer_host)


-def join_remote_auth(node, user):
-    if os.path.exists(PCMK_REMOTE_AUTH):
-        utils.rmfile(PCMK_REMOTE_AUTH)
-    pcmk_remote_dir = os.path.dirname(PCMK_REMOTE_AUTH)
-    utils.mkdirs_owned(pcmk_remote_dir, mode=0o750, gid="haclient")
-    utils.touch(PCMK_REMOTE_AUTH)
-
-
 def remove_qdevice():
     """
     Remove qdevice service and configuration from cluster

crmsh/ui_cluster.py

+1 -10
@@ -331,12 +331,6 @@ def do_init(self, context, *args):
         '''
         Initialize a cluster.
         '''
-        def looks_like_hostnames(lst):
-            sectionlist = bootstrap.INIT_STAGES
-            return all(not (l.startswith('-') or l in sectionlist) for l in lst)
-        if len(args) > 0:
-            if '--dry-run' in args or looks_like_hostnames(args):
-                args = ['--yes', '--nodes'] + [arg for arg in args if arg != '--dry-run']
         parser = ArgumentParser(description="""
 Initialize a cluster from scratch. This command configures
 a complete cluster, and can also add additional cluster
@@ -471,8 +465,6 @@ def looks_like_hostnames(lst):
         if stage == "vgfs":
             stage = "ocfs2"
             logger.warning("vgfs stage was deprecated and is an alias of ocfs2 stage now")
-        if stage not in bootstrap.INIT_STAGES and stage != "":
-            parser.error("Invalid stage (%s)" % (stage))

         if options.qnetd_addr_input:
             if not ServiceManager().service_is_available("corosync-qdevice.service"):
@@ -547,12 +539,11 @@ def do_join(self, context, *args):
         stage = ""
         if len(args) == 1:
             stage = args[0]
-            if stage not in ("ssh", "csync2", "ssh_merge", "cluster", ""):
-                parser.error("Invalid stage (%s)" % (stage))

         join_context = bootstrap.Context.set_context(options)
         join_context.ui_context = context
         join_context.stage = stage
+        join_context.cluster_is_running = ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active("pacemaker.service")
         join_context.type = "join"
         join_context.validate_option()
crmsh/utils.py

-8
@@ -468,14 +468,6 @@ def chmod(path, mod):
         fatal("Failed to chmod {}: {}".format(path, err))


-def touch(file_name):
-    rc, out, err = ShellUtils().get_stdout_stderr("touch " + file_name, no_reg=True)
-    if rc != 0:
-        rc, out, err = ShellUtils().get_stdout_stderr("sudo touch " + file_name, no_reg=True)
-        if rc != 0:
-            fatal("Failed create file {}: {}".format(file_name, err))
-
-
 def copy_local_file(src, dest):
     try:
         shutil.copyfile(src, dest)

test/features/bootstrap_bugs.feature

+25
@@ -4,6 +4,31 @@ Feature: Regression test for bootstrap bugs
   Tag @clean means need to stop cluster service if the service is available
   Need nodes: hanode1 hanode2 hanode3

+  @clean
+  Scenario: Stages dependency (bsc#1175865)
+    Given Cluster service is "stopped" on "hanode1"
+    And Cluster service is "stopped" on "hanode2"
+    When Try "crm cluster init cluster -y" on "hanode1"
+    Then Except "ERROR: cluster.init: Please run 'ssh' stage first"
+    When Run "crm cluster init ssh -y" on "hanode1"
+    When Try "crm cluster init cluster -y" on "hanode1"
+    Then Except "ERROR: cluster.init: Please run 'csync2' stage first"
+    When Run "crm cluster init csync2 -y" on "hanode1"
+    When Try "crm cluster init cluster -y" on "hanode1"
+    Then Except "ERROR: cluster.init: Please run 'corosync' stage first"
+    When Run "crm cluster init corosync -y" on "hanode1"
+    When Run "crm cluster init cluster -y" on "hanode1"
+    Then Cluster service is "started" on "hanode1"
+
+    When Try "crm cluster join cluster -c hanode1 -y" on "hanode2"
+    Then Except "ERROR: cluster.join: Please run 'ssh' stage first"
+    When Try "crm cluster join ssh -c hanode1 -y" on "hanode2"
+    When Try "crm cluster join cluster -c hanode1 -y" on "hanode2"
+    Then Except "ERROR: cluster.join: Please run 'csync2' stage first"
+    When Try "crm cluster join csync2 -c hanode1 -y" on "hanode2"
+    When Try "crm cluster join cluster -c hanode1 -y" on "hanode2"
+    Then Cluster service is "started" on "hanode2"
+
   @clean
   Scenario: Set placement-strategy value as "default"(bsc#1129462)
     Given Cluster service is "stopped" on "hanode1"

test/features/bootstrap_options.feature

+15
@@ -41,6 +41,15 @@ Feature: crmsh bootstrap process - options
     When Try "crm cluster init sbd -N hanode1 -N hanode2 -y" on "hanode1"
     Then Expected "Can't use -N/--nodes option and stage(sbd) together" in stderr

+  @clean
+  Scenario: Stage validation
+    When Try "crm cluster init fdsf -y" on "hanode1"
+    Then Expected "Invalid stage: fdsf(available stages: ssh, csync2, corosync, sbd, cluster, ocfs2, admin, qdevice)" in stderr
+    When Try "crm cluster join fdsf -y" on "hanode1"
+    Then Expected "Invalid stage: fdsf(available stages: ssh, csync2, ssh_merge, cluster)" in stderr
+    When Try "crm cluster join ssh -y" on "hanode1"
+    Then Expected "Can't use stage(ssh) without specifying cluster node" in stderr
+
   @clean
   Scenario: Init whole cluster service on node "hanode1" using "--node" option
     Given Cluster service is "stopped" on "hanode1"
@@ -51,6 +60,9 @@ Feature: crmsh bootstrap process - options
     And Online nodes are "hanode1 hanode2"
     And Show cluster status on "hanode1"

+    When Try "crm cluster init cluster -y" on "hanode1"
+    Then Expected "Cluster is active, can't run 'cluster' stage" in stderr
+
   @clean
   Scenario: Bind specific network interface using "-i" option
     Given Cluster service is "stopped" on "hanode1"
@@ -96,6 +108,9 @@ Feature: crmsh bootstrap process - options
     And Cluster virtual IP is "@vip.0"
     And Show cluster status on "hanode1"

+    When Try "crm cluster init cluster -y" on "hanode1"
+    Then Expected "Cluster is active, can't run 'cluster' stage" in stderr
+
   @clean
   Scenario: Init cluster service with udpu using "-u" option
     Given Cluster service is "stopped" on "hanode1"

test/features/qdevice_validate.feature

+1 -1
@@ -110,7 +110,7 @@ Feature: corosync qdevice/qnetd options validate
   Scenario: Run qdevice stage on inactive cluster node
     Given Cluster service is "stopped" on "hanode1"
     When Try "crm cluster init qdevice --qnetd-hostname=qnetd-node"
-    Then Except "ERROR: cluster.init: Cluster is inactive - can't run qdevice stage"
+    Then Except "ERROR: cluster.init: Cluster is inactive, can't run 'qdevice' stage"

   @clean
   Scenario: Run qdevice stage but miss "--qnetd-hostname" option

test/unittests/test_bootstrap.py

+1 -1
@@ -564,7 +564,7 @@ def test_join_ssh(
             mock_get_node_cononical_hostname,
             mock_detect_cluster_service_on_node
     ):
-        bootstrap._context = mock.Mock(current_user="bob", default_nic="eth1", use_ssh_agent=False)
+        bootstrap._context = mock.Mock(current_user="bob", default_nic="eth1", use_ssh_agent=False, stage=None)
         mock_swap.return_value = None
         mock_ssh_copy_id.return_value = 0
         mock_get_node_cononical_hostname.return_value='node1'
