Skip to content

Commit 053aa7a

Browse files
Merge pull request #145 from aiven/mark_failed_basebackups
mark basebackups as broken when restore fails #145
2 parents 7378b4d + 54aeb0e commit 053aa7a

File tree

4 files changed

+418
-8
lines changed

4 files changed

+418
-8
lines changed

myhoard/backup_stream.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ class State(TypedDict):
116116
basebackup_errors: int
117117
basebackup_file_metadata: Optional[Dict]
118118
basebackup_info: Dict
119+
broken_info: Dict
119120
closed_info: Dict
120121
completed_info: Dict
121122
backup_reason: Optional["BackupStream.BackupReason"]
@@ -213,6 +214,7 @@ def __init__(
213214
"basebackup_errors": 0,
214215
"basebackup_file_metadata": None,
215216
"basebackup_info": {},
217+
"broken_info": {},
216218
"closed_info": {},
217219
"completed_info": {},
218220
"backup_reason": backup_reason,
@@ -391,6 +393,34 @@ def iterate_remote_binlogs(self, *, reverse: bool = False) -> Iterator[BinlogInf
391393
for binlog in binlogs:
392394
yield binlog
393395

396+
def mark_as_broken(self) -> None:
    """Flag this backup stream as broken, both in local state and in remote storage.

    Idempotent: a second call only logs a warning and returns. Stops the
    stream, records the broken timestamp locally, then best-effort uploads a
    ``broken.json`` marker to the backup site; a failed upload is logged and
    counted but does not raise.
    """
    if self.state["broken_info"]:
        self.log.warning("Stream %s marked as broken multiple times", self.stream_id)
        return

    self.stop()
    self.log.info("Marking stream %s as broken.", self.stream_id)

    info = {
        "broken_at": time.time(),
        "server_id": self.server_id,
    }
    # Persist the broken marker in local state as well as remote storage
    self.state_manager.update_state(broken_info=info)

    try:
        storage = self.file_storage_setup_fn()
        storage.store_file_from_memory(
            self._build_full_name("broken.json"),
            json.dumps(info).encode("utf-8"),
            metadata=make_fs_metadata(info),
        )
    except Exception as ex:  # pylint: disable=broad-except
        # Remote write failure is non-fatal: local state already has the flag
        self.log.exception("Failed to create broken.json")
        self.stats.unexpected_exception(ex=ex, where="BackupStream.mark_as_broken")
        self.state_manager.update_state(remote_write_errors=self.state["remote_write_errors"] + 1)
        self.stats.increase("myhoard.remote_write_errors")
394424
def mark_as_closed(self) -> None:
395425
if self.state["closed_info"]:
396426
self.log.warning("Stream %s marked as closed multiple times", self.stream_id)

myhoard/controller.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class Backup(TypedDict):
5454
basebackup_info: BaseBackup
5555
closed_at: Optional[float]
5656
completed_at: Optional[float]
57+
broken_at: Optional[float]
5758
recovery_site: bool
5859
stream_id: str
5960
resumable: bool
@@ -287,6 +288,10 @@ def restore_backup(
287288
continue
288289
if not backup["basebackup_info"]:
289290
raise ValueError(f"Backup {backup!r} cannot be restored")
291+
292+
if backup.get("broken_at"):
293+
raise ValueError(f"Cannot restore a broken backup: {backup!r}")
294+
290295
if target_time:
291296
if target_time < backup["basebackup_info"]["end_ts"]:
292297
raise ValueError(f"Requested target time {target_time} predates backup completion: {backup!r}")
@@ -559,6 +564,7 @@ def get_backup_list(backup_sites: Dict[str, BackupSiteInfo], *, seen_basebackup_
559564
for site_and_stream_id in streams:
560565
basebackup_compressed_size = None
561566
basebackup_info = {}
567+
broken_info = {}
562568
closed_info = {}
563569
completed_info = {}
564570
for info in file_storage.list_iter(site_and_stream_id):
@@ -573,6 +579,8 @@ def get_backup_list(backup_sites: Dict[str, BackupSiteInfo], *, seen_basebackup_
573579
info_str, _ = file_storage.get_contents_to_string(info["name"])
574580
basebackup_info = json.loads(info_str.decode("utf-8"))
575581
seen_basebackup_infos[site_and_stream_id] = basebackup_info
582+
elif file_name == "broken.json":
583+
broken_info = parse_fs_metadata(info["metadata"])
576584
elif file_name == "closed.json":
577585
closed_info = parse_fs_metadata(info["metadata"])
578586
elif file_name == "completed.json":
@@ -586,6 +594,7 @@ def get_backup_list(backup_sites: Dict[str, BackupSiteInfo], *, seen_basebackup_
586594
backups.append(
587595
{
588596
"basebackup_info": basebackup_info,
597+
"broken_at": broken_info.get("broken_at"),
589598
"closed_at": closed_info["closed_at"] if closed else None,
590599
"completed_at": completed_info["completed_at"] if completed else None,
591600
"recovery_site": site_config.get("recovery_only", False),
@@ -1140,6 +1149,7 @@ def _handle_mode_restore(self):
11401149
self._process_local_binlog_updates()
11411150
self._extend_binlog_stream_list()
11421151
if self.restore_coordinator.phase == RestoreCoordinator.Phase.failed_basebackup:
1152+
self._mark_failed_restore_backup_as_broken()
11431153
self._switch_basebackup_if_possible()
11441154
if self.state["promote_on_restore_completion"] and self.restore_coordinator.is_complete():
11451155
self.state_manager.update_state(
@@ -1151,6 +1161,20 @@ def _handle_mode_restore(self):
11511161
restore_options={},
11521162
)
11531163

1164+
def _mark_failed_restore_backup_as_broken(self) -> None:
    """Mark the backup whose restore just failed as broken in remote storage.

    Looks up the stream id of the failed restore from ``restore_options`` in
    local state, finds the matching entry in the known backups list, and
    delegates to ``BackupStream.mark_as_broken``. Raises if the stream id is
    not present among the known backups.
    """
    failed_stream_id = self.state["restore_options"]["stream_id"]
    backups = self.state["backups"]
    # First backup matching the failed stream id, or None if absent
    broken_backup = next(
        (candidate for candidate in backups if candidate["stream_id"] == failed_stream_id),
        None,
    )
    if not broken_backup:
        raise Exception(f"Stream {failed_stream_id} to be marked as broken not found in completed backups: {backups}")

    self._build_backup_stream(broken_backup).mark_as_broken()
1177+
11541178
def _mark_periodic_backup_requested_if_interval_exceeded(self):
11551179
normalized_backup_time = self._current_normalized_backup_timestamp()
11561180
most_recent_scheduled = None
@@ -1224,21 +1248,37 @@ def _process_removed_binlogs(self, binlogs):
12241248

12251249
def _purge_old_backups(self):
12261250
purgeable = [backup for backup in self.state["backups"] if backup["completed_at"]]
1227-
if len(purgeable) <= self.backup_settings["backup_count_min"]:
1251+
broken_backups_count = sum(backup["broken_at"] is not None for backup in purgeable)
1252+
# do not consider broken backups for the count, they will still be purged
1253+
# but we should only purge when the count of non-broken backups has exceeded the limit.
1254+
non_broken_backups_count = len(purgeable) - broken_backups_count
1255+
1256+
if non_broken_backups_count <= self.backup_settings["backup_count_max"] < len(purgeable):
1257+
self.log.info(
1258+
"Backup count %s is above max allowed, but %s are broken, not dropping",
1259+
len(purgeable),
1260+
broken_backups_count,
1261+
)
1262+
return
1263+
1264+
if non_broken_backups_count <= self.backup_settings["backup_count_min"]:
12281265
return
12291266

12301267
# For simplicity only ever drop one backup here. This function
12311268
# is called repeatedly so if there are for any reason more backups
12321269
# to drop they will be dropped soon enough
12331270
purgeable = sort_completed_backups(purgeable)
12341271
backup = purgeable[0]
1272+
12351273
if not backup["closed_at"]:
12361274
return
12371275

12381276
if time.time() > backup["closed_at"] + self.backup_settings["backup_age_days_max"] * 24 * 60 * 60:
12391277
self.log.info("Backup %r is older than max backup age, dropping it", backup["stream_id"])
1240-
elif len(purgeable) > self.backup_settings["backup_count_max"]:
1241-
self.log.info("Backup count %s is above max allowed, dropping %r", len(purgeable), backup["stream_id"])
1278+
elif non_broken_backups_count > self.backup_settings["backup_count_max"]:
1279+
self.log.info(
1280+
"Non-broken backup count %s is above max allowed, dropping %r", non_broken_backups_count, backup["stream_id"]
1281+
)
12421282
else:
12431283
return
12441284

@@ -1619,7 +1659,9 @@ def _switch_basebackup_if_possible(self):
16191659
for backup in backups:
16201660
if backup["stream_id"] == current_stream_id:
16211661
break
1622-
earlier_backup = backup
1662+
1663+
if not backup["broken_at"]:
1664+
earlier_backup = backup
16231665
else:
16241666
raise Exception(f"Stream {current_stream_id} being restored not found in completed backups: {backups}")
16251667

0 commit comments

Comments
 (0)