Skip to content

Commit 681a619

Browse files
committed
fix(redis): cluster架构不要重复发起自愈 #10042
1. 自愈添加消息独立推送(忽略自愈、自愈发起、自愈失败) 2. 自愈修复cluster 集群类型,在自愈进行时再次发起自愈单子 3. 自愈添加自愈状态监控
1 parent d16b72e commit 681a619

File tree

6 files changed

+148
-7
lines changed

6 files changed

+148
-7
lines changed

dbm-ui/backend/db_periodic_task/local_tasks/redis_autofix.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,26 @@
88
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
99
specific language governing permissions and limitations under the License.
1010
"""
11+
import json
1112
import logging
1213

1314
from celery.schedules import crontab
15+
from django.utils import timezone
1416
from django.utils import timezone as datetime
17+
from django.utils.translation import ugettext as _
1518

1619
from backend.db_periodic_task.local_tasks.register import register_periodic_task
1720
from backend.db_services.redis.autofix.bill import generate_autofix_ticket
1821
from backend.db_services.redis.autofix.enums import AutofixItem, AutofixStatus
22+
from backend.db_services.redis.autofix.message import send_msg_2_qywx
1923
from backend.db_services.redis.autofix.models import RedisAutofixCore, RedisAutofixCtl
2024
from backend.db_services.redis.autofix.watcher import (
2125
get_4_next_watch_ID,
2226
save_swithed_host_by_cluster,
2327
watcher_get_by_hosts,
2428
)
29+
from backend.ticket.constants import TicketStatus
30+
from backend.ticket.models import Ticket
2531
from backend.utils.time import datetime2str
2632

2733
logger = logging.getLogger("root")
@@ -74,3 +80,49 @@ def start_autofix_flow():
7480
return
7581

7682
generate_autofix_ticket(fixlists)
83+
84+
85+
@register_periodic_task(run_every=crontab(minute="*/1"))
def watch_autofix_flow():
    """Sync in-flight autofix records with the status of their tickets.

    Scheduled every minute. Each ``RedisAutofixCore`` row whose ``deal_status``
    is AF_WFLOW, AF_REUSE or AF_RUNNING is matched against its ticket
    (``flow.ticket_id``) and moved to AF_RUNNING, AF_SUCC or AF_FAIL.
    When a row turns AF_FAIL a failure message is pushed through
    ``send_msg_2_qywx``. If the status sync itself raises, the row is parked
    as AF_UNKOWN with the error text stored in ``status_version``.
    """
    # filter() never raises DoesNotExist; an empty queryset is the "nothing to do" case.
    fixlists = RedisAutofixCore.objects.filter(
        deal_status__in=[
            AutofixStatus.AF_WFLOW.value,
            AutofixStatus.AF_REUSE.value,
            AutofixStatus.AF_RUNNING.value,
        ]
    )
    if not fixlists:
        logger.info("waiting other flow items ... ")
        return

    for flow in fixlists:
        failed = False
        try:
            ticket_obj = Ticket.objects.get(id=flow.ticket_id)

            if ticket_obj.status == TicketStatus.RUNNING.value:
                flow.deal_status = AutofixStatus.AF_RUNNING.value
            elif ticket_obj.status == TicketStatus.SUCCEEDED.value:
                flow.deal_status = AutofixStatus.AF_SUCC.value
            else:
                # NOTE(review): every ticket status other than RUNNING/SUCCEEDED is
                # treated as a failure here - confirm that no "pending"-like status
                # can reach this branch.
                flow.deal_status = AutofixStatus.AF_FAIL.value
                flow.status_version = ticket_obj.status
                failed = True

            # `datetime` is django.utils.timezone imported under that alias in this module.
            flow.update_at = datetime2str(datetime.datetime.now(timezone.utc))
            flow.save(update_fields=["status_version", "deal_status", "update_at"])
        except Exception as e:  # boundary of a periodic task: record the error, keep looping
            logger.exception("watch autofix flow failed, ticket_id=%s", flow.ticket_id)
            flow.deal_status = AutofixStatus.AF_UNKOWN.value
            # Store the text of the error, not the exception object itself.
            flow.status_version = str(e)
            flow.update_at = datetime2str(datetime.datetime.now(timezone.utc))
            flow.save(update_fields=["status_version", "deal_status", "update_at"])
            continue

        if not failed:
            continue

        # The notification is best-effort and happens after the state is persisted,
        # so a messaging error can no longer turn a detected AF_FAIL into AF_UNKOWN.
        try:
            # Translate the template first, then interpolate, so the msgid is stable.
            title = _("{} - 自愈失败☹️").format(flow.immute_domain)
            msgs = {}
            msgs["BKID"] = flow.bk_biz_id
            msgs[_("集群类型")] = flow.cluster_type
            msgs[_("故障机器")] = json.dumps(flow.fault_machines)
            msgs[_("失败原因")] = flow.status_version
            send_msg_2_qywx(title, msgs)
        except Exception:
            logger.exception("send autofix failure message failed, ticket_id=%s", flow.ticket_id)

dbm-ui/backend/db_periodic_task/local_tasks/redis_clusternodes_update/task.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -385,13 +385,15 @@ def start_autofix_flow(self, cluster: Cluster):
385385
if all([slave_obj.status == InstanceStatus.UNAVAILABLE.value for slave_obj in slave_objs]):
386386
# 该机器所有实例都unrunning了
387387
# 继续检测该机器 10 分钟内没被加入到自愈流程中
388-
mins10_ago = datetime.now(timezone.utc) - timedelta(minutes=10)
388+
# 需要根据 时间 + 单据状态 (10分钟不够)
389+
mins10_ago = datetime.now(timezone.utc) - timedelta(minutes=600)
389390
rows = RedisAutofixCore.objects.filter(cluster_id=cluster.id, create_at__gt=mins10_ago)
390391
exists = False
391392
for row in rows:
392393
if ip in [item["ip"] for item in json.loads(row.fault_machines)]:
393-
exists = True
394-
break
394+
if row.deal_status in [AutofixStatus.AF_RUNNING.value, AutofixStatus.AF_TICKET.value]:
395+
exists = True
396+
break
395397
if not exists:
396398
fault_machines.append({"instance_type": slave_objs[0].machine_type, "ip": ip})
397399
if len(fault_machines) > 0:

dbm-ui/backend/db_services/redis/autofix/bill.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from backend.utils.time import datetime2str
3434

3535
from .enums import AutofixStatus
36+
from .message import send_msg_2_qywx
3637
from .models import RedisAutofixCore
3738

3839
logger = logging.getLogger("root")
@@ -151,6 +152,14 @@ def create_ticket(cluster: RedisAutofixCore, cluster_ids: list, redis_proxies: l
151152

152153
# 初始化builder类
153154
try:
155+
msgs, title = {}, _("{} - 发起自愈".format(cluster.immute_domain))
156+
msgs[_("BKID")] = cluster.bk_biz_id
157+
msgs[_("流程ID")] = ticket.id
158+
msgs[_("集群IDS")] = cluster_ids
159+
msgs[_("集群类型")] = cluster.cluster_type
160+
msgs[_("故障机S")] = json.dumps(ips)
161+
send_msg_2_qywx(title, msgs)
162+
154163
builder = BuilderFactory.create_builder(ticket)
155164
builder.patch_ticket_detail()
156165
builder.init_ticket_flows()

dbm-ui/backend/db_services/redis/autofix/enums.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class AutofixItem(str, StructuredEnum):
2727

2828
AUTOFIX_ENABLE = EnumField("enable", _("自愈开关"))
2929
DBHA_ID = EnumField("last_id", _("监控到的id"))
30+
CHAT_IDS = EnumField("chat_ids", _("群消息IDS"))
3031
IGNORE_APPS = EnumField("ignore_apps", _("忽略自愈的APP列表"))
3132
IGNORE_DOMAINS = EnumField("ignore_domains", _("忽略自愈的集群列表"))
3233

@@ -36,8 +37,12 @@ class AutofixStatus(str, StructuredEnum):
3637

3738
AF_INIT = EnumField("initautofix", _("初始化"))
3839
AF_TICKET = EnumField("initticket", _("创建单据"))
39-
AF_SFLOW = EnumField("startflow", _("发起flow流程"))
40-
AF_WFLOW = EnumField("watchflow", _("监控流程完成状态"))
41-
AF_IGNORE = EnumField("ignore", _("不支持自愈"))
40+
AF_REUSE = EnumField("reuse_host", _("尝试复用"))
41+
AF_REUSE_SUCCESS = EnumField("reuse_succ", _("复用搞定"))
42+
AF_REUSE_FAILED = EnumField("reuse_fail", _("尝试失败"))
43+
AF_WFLOW = EnumField("watchflow", _("监控状态"))
44+
AF_IGNORE = EnumField("ignore", _("不理解类型"))
4245
AF_SUCC = EnumField("success", _("自愈成功"))
46+
AF_RUNNING = EnumField("running", _("拼命干活中"))
4347
AF_FAIL = EnumField("fail", _("自愈失败"))
48+
AF_UNKOWN = EnumField("unkown", _("未至之境"))
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
4+
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
5+
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at https://opensource.org/licenses/MIT
7+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
8+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
9+
specific language governing permissions and limitations under the License.
10+
"""
11+
import datetime
12+
import json
13+
import logging
14+
15+
from django.utils import timezone
16+
from django.utils.translation import ugettext as _
17+
18+
from backend.configuration.constants import DBType
19+
from backend.configuration.models.dba import DBAdministrator
20+
from backend.core.notify.handlers import CmsiHandler
21+
from backend.db_meta.models import AppCache
22+
from backend.utils.time import date2str
23+
24+
from .enums import AutofixItem
25+
from .models import RedisAutofixCtl
26+
27+
logger = logging.getLogger("root")
28+
29+
30+
def send_msg_2_qywx(sub_title: str, msgs):
    """Push an autofix notification to the configured WeCom robot chats.

    The receiver chat ids are read from the ``RedisAutofixCtl`` row named
    ``AutofixItem.CHAT_IDS``. If that row does not exist it is created with an
    empty list. Nothing is sent when no chat ids are configured.

    Args:
        sub_title: headline placed after the ``Tendis-`` prefix.
        msgs: mapping of label -> value rendered one per line. It must contain
            the key ``"BKID"`` (business id), which is expanded into the
            business info and business DBA lines.

    Raises:
        KeyError: if ``msgs`` has no ``"BKID"`` entry.
        AppCache.DoesNotExist: if the business id is unknown to ``AppCache``.
    """
    msg_ids = []
    try:
        msg_item = RedisAutofixCtl.objects.filter(ctl_name=AutofixItem.CHAT_IDS.value).get()
        msg_ids = json.loads(msg_item.ctl_value)
        # Rows created by the previous implementation hold a JSON *string* ("[]")
        # instead of a JSON list; unwrap it once so such rows stay usable.
        if isinstance(msg_ids, str):
            msg_ids = json.loads(msg_ids)
    except RedisAutofixCtl.DoesNotExist:
        # create() already persists the row; the default value is a real empty JSON list.
        RedisAutofixCtl.objects.create(
            bk_cloud_id=0, bk_biz_id=0, ctl_value=json.dumps([]), ctl_name=AutofixItem.CHAT_IDS.value
        )

    if not msg_ids:
        return

    bk_biz_id = msgs["BKID"]
    redis_DBA = DBAdministrator.get_biz_db_type_admins(bk_biz_id=bk_biz_id, db_type=DBType.Redis.value)
    app_info = AppCache.objects.get(bk_biz_id=bk_biz_id)
    # A business without a configured Redis DBA must not break the notification.
    main_dba = redis_DBA[0] if redis_DBA else "-"

    # Templates are translated first and interpolated afterwards so the msgids are stable.
    lines = [_(">> Tendis-{}\n").format(sub_title)]
    for k, v in msgs.items():
        if k == "BKID":
            lines.append(
                _("业务信息 : {}(#{},{})\n").format(app_info.bk_biz_name, app_info.bk_biz_id, app_info.db_app_abbr)
            )
            lines.append(_("业务DBA : {}(@{})\n").format(main_dba, main_dba))
        else:
            lines.append(_("{} : {}\n").format(k, v))
    lines.append(
        _("消息时间 : {}\n").format(date2str(datetime.datetime.now(timezone.utc), "%Y-%m-%d %H:%M:%S"))
    )
    content = "".join(lines)

    CmsiHandler(_("Tendis自愈"), content, msg_ids).send_wecom_robot()

dbm-ui/backend/db_services/redis/autofix/watcher.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from django.utils import timezone
1717
from django.utils.crypto import get_random_string
18+
from django.utils.translation import ugettext_lazy as _
1819

1920
from backend.components.hadb.client import HADBApi
2021
from backend.constants import DEFAULT_BK_CLOUD_ID
@@ -25,6 +26,7 @@
2526

2627
from .const import REDIS_SWITCH_WAITER, SWITCH_MAX_WAIT_SECONDS, SWITCH_SMALL, RedisSwitchHost, RedisSwitchWait
2728
from .enums import AutofixItem, AutofixStatus, DBHASwitchResult
29+
from .message import send_msg_2_qywx
2830
from .models import RedisAutofixCore, RedisAutofixCtl, RedisIgnoreAutofix
2931

3032
logger = logging.getLogger("root")
@@ -243,7 +245,7 @@ def save_swithed_host_by_cluster(batch_small: int, switch_hosts: Dict):
243245

244246
# 把需要忽略自愈的保存起来
245247
def save_ignore_host(switched_host: RedisSwitchHost, msg):
246-
RedisIgnoreAutofix.objects.update_or_create(
248+
rst = RedisIgnoreAutofix.objects.update_or_create(
247249
bk_cloud_id=DEFAULT_BK_CLOUD_ID,
248250
bk_biz_id=switched_host.bk_biz_id,
249251
cluster_id=switched_host.cluster_id,
@@ -259,3 +261,18 @@ def save_ignore_host(switched_host: RedisSwitchHost, msg):
259261
sw_result=json.dumps(switched_host.sw_result),
260262
ignore_msg=msg,
261263
)
264+
265+
if switched_host.cluster_type in [
266+
ClusterType.TwemproxyTendisSSDInstance.value,
267+
ClusterType.TendisTwemproxyRedisInstance.value,
268+
ClusterType.TendisPredixyRedisCluster.value,
269+
ClusterType.TendisPredixyTendisplusCluster.value,
270+
ClusterType.TendisRedisInstance.value,
271+
]:
272+
msgs, title = {}, _("{}-忽略自愈".format(switched_host.immute_domain))
273+
msgs[_("BKID")] = switched_host.bk_biz_id
274+
msgs[_("故障IP")] = switched_host.ip
275+
msgs[_("实例类型")] = switched_host.instance_type
276+
msgs[_("ByDBHA")] = json.dumps(switched_host.sw_result)
277+
msgs[_("xxxxxx")] = rst
278+
send_msg_2_qywx(title, msgs)

0 commit comments

Comments
 (0)