Skip to content

Commit ade18b0

Browse files
authored
Merge pull request #688 from ckulal/cephqe_CEPH-83575113
CEPH-83575113: Bring down a site while the sync is in progress and te…
2 parents acc1f5a + e88f83b commit ade18b0

File tree

4 files changed

+200
-3
lines changed

4 files changed

+200
-3
lines changed

rgw/v2/lib/sync_status.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
log = logging.getLogger(__name__)
1313

1414

15-
def sync_status(retry=25, delay=60, ssh_con=None):
15+
def sync_status(retry=25, delay=60, ssh_con=None, return_while_sync_inprogress=False):
1616
"""
1717
verify multisite sync status
1818
"""
@@ -65,6 +65,8 @@ def sync_status(retry=25, delay=60, ssh_con=None):
6565
)
6666
if "behind" in check_sync_status or "recovering" in check_sync_status:
6767
log.info("sync is in progress")
68+
if return_while_sync_inprogress:
69+
return "sync_progress"
6870
log.info(f"sleep of {delay} secs for sync to complete")
6971
for retry_count in range(retry):
7072
time.sleep(delay)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# script: test_sync_post_destruptive_services.py
2+
# Polarion ID: CEPH-83575113
3+
config:
4+
test_ops:
5+
rgw_service_name: "rgw.shared.sec.sync"
6+
sync_retry: 24
7+
sync_delay: 900

rgw/v2/tests/s3_swift/reusable.py

+84-2
Original file line numberDiff line numberDiff line change
@@ -1629,13 +1629,20 @@ def time_to_list_via_boto(bucket_name, rgw):
16291629
return time_taken
16301630

16311631

1632-
def check_sync_status(retry=25, delay=60, return_while_sync_inprogress=False):
    """
    Run the multisite sync-status check when the cluster is multisite.

    retry, delay: forwarded positionally to sync_status
    return_while_sync_inprogress: forwarded to sync_status; when true the
        value returned by sync_status is handed back to the caller

    Returns the sync_status result only when return_while_sync_inprogress is
    set; otherwise (including on a non-multisite cluster) returns None.
    """
    if not utils.is_cluster_multisite():
        return None
    result = sync_status(
        retry, delay, return_while_sync_inprogress=return_while_sync_inprogress
    )
    if return_while_sync_inprogress:
        return result
    return None
16391646

16401647

16411648
def check_bucket_sync_status(bkt=None):
@@ -3082,3 +3089,78 @@ def node_reboot(node, service_name=None, retry=15, delay=60):
30823089
break
30833090
if retry_count + 1 == retry:
30843091
raise AssertionError("Node is not in expected state post 15min!!")
3092+
3093+
3094+
def bring_down_all_rgws_in_the_site(rgw_service_name, retry=10, delay=10):
    """
    Stop an RGW service and wait until its daemons stop reporting "running".

    Runs ``ceph orch stop <rgw_service_name>``, lists the daemons with
    ``ceph orch ps`` and, for every rgw daemon whose name contains
    ``rgw_service_name`` and whose status is still "running", re-polls the
    listing until the status changes.

    rgw_service_name: RGW service name
    retry: maximum number of status polls per daemon
    delay: seconds to sleep after each poll that still shows "running"

    Raises AssertionError when a daemon is still "running" after ``retry``
    polls (this includes ``retry=0``, where no poll is made at all).
    """
    utils.exec_shell_cmd(f"ceph orch stop {rgw_service_name}")
    ps_cmd = "ceph orch ps --format json-pretty"
    daemons = json.loads(utils.exec_shell_cmd(ps_cmd))
    for daemon_entry in daemons:
        daemon_name = daemon_entry["daemon_name"]
        daemon_type = daemon_name.split(".")[0]
        log.info(f"daemon type is {daemon_type}")
        if daemon_type != "rgw":
            continue
        log.info(f"daemon is {daemon_name}")
        if rgw_service_name not in daemon_name:
            continue
        status = daemon_entry["status_desc"]
        if str(status) != "running":
            # already stopped in the first listing, nothing to wait for
            continue
        log.info("enter loop of retry")
        for retry_count in range(retry):
            log.info(f"try {retry_count}")
            # The fresh listing gets its own names so the outer iteration
            # variables are not rebound while we are still looping over them.
            for polled in json.loads(utils.exec_shell_cmd(ps_cmd)):
                if polled["daemon_name"] == daemon_name:
                    status = polled["status_desc"]
                    log.info(f"status is {status}")
            if str(status) != "running":
                log.info(f"Node {daemon_name} is in expected state")
                break
            log.info(
                f"Node is not in expected state, waiting for {delay} seconds"
            )
            time.sleep(delay)
        else:
            # Reached only when no poll broke out of the loop, so a daemon
            # that stops on the very last poll is not reported as a failure.
            raise AssertionError("Node is not in expected state!!")
3130+
3131+
3132+
def bring_up_all_rgws_in_the_site(rgw_service_name, retry=10, delay=10):
    """
    Start an RGW service and wait until its daemons report "running".

    Runs ``ceph orch start <rgw_service_name>``, lists the daemons with
    ``ceph orch ps`` and, for every rgw daemon whose name contains
    ``rgw_service_name`` and whose status is not yet "running", re-polls the
    listing until it is.

    rgw_service_name: RGW service name
    retry: maximum number of status polls per daemon
    delay: seconds to sleep after each poll that does not show "running"

    Raises AssertionError when a daemon is still not "running" after
    ``retry`` polls (this includes ``retry=0``, where no poll is made at all).
    """
    utils.exec_shell_cmd(f"ceph orch start {rgw_service_name}")
    ps_cmd = "ceph orch ps --format json-pretty"
    daemons = json.loads(utils.exec_shell_cmd(ps_cmd))
    for daemon_entry in daemons:
        daemon_name = daemon_entry["daemon_name"]
        daemon_type = daemon_name.split(".")[0]
        log.info(f"daemon type is {daemon_type}")
        if daemon_type != "rgw":
            continue
        log.info(f"daemon is {daemon_name}")
        if rgw_service_name not in daemon_name:
            continue
        status = daemon_entry["status_desc"]
        if str(status) == "running":
            # already up in the first listing, nothing to wait for
            continue
        log.info("enter loop of retry")
        for retry_count in range(retry):
            log.info(f"try {retry_count}")
            # The fresh listing gets its own names so the outer iteration
            # variables are not rebound while we are still looping over them.
            for polled in json.loads(utils.exec_shell_cmd(ps_cmd)):
                if polled["daemon_name"] == daemon_name:
                    status = polled["status_desc"]
                    log.info(f"status is {status}")
            if str(status) == "running":
                log.info(f"Node {daemon_name} is in expected state")
                break
            log.info(
                f"Node is not in expected state, waiting for {delay} seconds"
            )
            time.sleep(delay)
        else:
            # Reached only when no poll broke out of the loop, so a daemon
            # that comes up on the very last poll is not reported as a failure.
            raise AssertionError("Node is not in expected state!!")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""
2+
test_sync_post_destruptive_services.py
3+
- Test that sync stays consistent in a multisite setup after the rgw services are disrupted (brought down and back up)
4+
5+
Usage : test_sync_post_destruptive_services.py -c <input_yaml>
6+
<input_yaml>
7+
test_sync_consisitent_post_service_down_up.yaml
8+
"""
9+
10+
import os
11+
import sys
12+
13+
sys.path.append(os.path.abspath(os.path.join(__file__, "../../../..")))
14+
import argparse
15+
import logging
16+
import time
17+
import traceback
18+
19+
import v2.utils.utils as utils
20+
from v2.lib.exceptions import RGWBaseException, TestExecError
21+
from v2.lib.resource_op import Config
22+
from v2.lib.s3.auth import Auth
23+
from v2.lib.s3.write_io_info import BasicIOInfoStructure, IOInfoInitialize
24+
from v2.tests.s3_swift import reusable
25+
from v2.utils.log import configure_logging
26+
from v2.utils.test_desc import AddTestInfo
27+
from v2.utils.utils import RGWService
28+
29+
# Root logger (getLogger is called without a name).
log = logging.getLogger()
# Test data directory path; stays None until the script entry point assigns it.
TEST_DATA_PATH = None
31+
32+
33+
def test_exec(config, ssh_con):
    """
    Disrupt the RGW service while sync is in progress, then verify sync.

    On a multisite cluster: require that sync is currently in progress, stop
    the rgw service named by ``config.test_ops["rgw_service_name"]``, wait ten
    minutes, start it again and run the sync-status check with the
    ``sync_retry`` / ``sync_delay`` values from ``config.test_ops`` (defaults
    25 and 60). On any cluster, finish with a ceph crash check.

    config: test configuration object exposing ``test_ops``
    ssh_con: accepted for interface parity; not used in this function

    Raises AssertionError if sync is not in progress at the start, and
    TestExecError if a ceph daemon crash is found.
    """
    io_structure = BasicIOInfoStructure()
    io_initializer = IOInfoInitialize()
    io_initializer.initialize(io_structure.initial())

    if not utils.is_cluster_multisite():
        log.info("Cluster is not a Multisite!")
    else:
        sync_state = reusable.check_sync_status(return_while_sync_inprogress=True)
        if str(sync_state) != "sync_progress":
            raise AssertionError("sync status is not in progress!!")

        test_ops = config.test_ops
        rgw_service_name = test_ops.get("rgw_service_name")
        reusable.bring_down_all_rgws_in_the_site(rgw_service_name)
        log.info(f"Waiting for 10 min")
        time.sleep(600)
        reusable.bring_up_all_rgws_in_the_site(rgw_service_name)

        sync_retry = test_ops.get("sync_retry", 25)
        sync_delay = test_ops.get("sync_delay", 60)
        reusable.check_sync_status(sync_retry, sync_delay)

    # the crash check runs for multisite and non-multisite clusters alike
    if reusable.check_for_crash():
        raise TestExecError("ceph daemon crash found!")
58+
59+
60+
if __name__ == "__main__":
    # Script entry point: prepare the test data directory, parse the CLI
    # arguments, load the YAML config, run test_exec and exit 0 on success or
    # 1 on any exception.
    test_description = "Test Sync consistency post destruptive rgw services"
    test_info = AddTestInfo(test_description)

    try:
        project_dir = os.path.abspath(os.path.join(__file__, "../../.."))
        test_data_dir = "test_data"
        TEST_DATA_PATH = os.path.join(project_dir, test_data_dir)
        log.info("TEST_DATA_PATH: %s", TEST_DATA_PATH)
        if not os.path.exists(TEST_DATA_PATH):
            log.info("test data dir not exists, creating.. ")
            os.makedirs(TEST_DATA_PATH)

        parser = argparse.ArgumentParser(description=test_description)
        parser.add_argument("-c", dest="config", help=test_description)
        parser.add_argument(
            "-log_level",
            dest="log_level",
            help="Set Log Level [DEBUG, INFO, WARNING, ERROR, CRITICAL]",
            default="info",
        )
        parser.add_argument(
            "--rgw-node", dest="rgw_node", help="RGW Node", default="127.0.0.1"
        )
        args = parser.parse_args()

        yaml_file = args.config
        rgw_node = args.rgw_node
        # a remote connection is opened only for a non-local RGW node
        ssh_con = utils.connect_remote(rgw_node) if rgw_node != "127.0.0.1" else None

        # the log file is named after the YAML file, minus its extension
        yaml_stem, _ = os.path.splitext(yaml_file)
        log_f_name = os.path.basename(yaml_stem)
        configure_logging(f_name=log_f_name, set_level=args.log_level.upper())

        config = Config(yaml_file)
        config.read(ssh_con)
        test_exec(config, ssh_con)
        test_info.success_status("test passed")
        sys.exit(0)
    except (RGWBaseException, Exception) as e:
        log.error(e)
        log.error(traceback.format_exc())
        test_info.failed_status("test failed")
        sys.exit(1)

0 commit comments

Comments
 (0)