From 8c82f7f0c78319bc5b9932c7bc2014f8918801bc Mon Sep 17 00:00:00 2001 From: Adrian Alic <39585474+dist1ll@users.noreply.github.com> Date: Mon, 23 Dec 2024 15:33:31 +0100 Subject: [PATCH] feat(recovery): [CON-1425] Allow skipping nodes during certification downloads (#3188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a way to manually skip nodes during certification downloads when rsync fails. Background: During recoveries, we should download and merge the certification pools of as many nodes on the subnet as possible. Sometimes there are connection issues to individual nodes, and the tool gets stuck trying to download the node’s pool. In those cases we may want to skip the problematic nodes. --- rs/recovery/src/app_subnet_recovery.rs | 9 +++---- rs/recovery/src/error.rs | 2 ++ rs/recovery/src/file_sync_helper.rs | 24 +++++++++++++------ rs/recovery/src/lib.rs | 10 +++++++- .../src/nns_recovery_failover_nodes.rs | 11 +++++---- rs/recovery/src/nns_recovery_same_nodes.rs | 11 +++++---- rs/recovery/src/steps.rs | 4 +++- 7 files changed, 50 insertions(+), 21 deletions(-) diff --git a/rs/recovery/src/app_subnet_recovery.rs b/rs/recovery/src/app_subnet_recovery.rs index b216b03b2d4..7923983735a 100644 --- a/rs/recovery/src/app_subnet_recovery.rs +++ b/rs/recovery/src/app_subnet_recovery.rs @@ -306,10 +306,11 @@ impl RecoveryIterator for AppSubnetRecovery { StepType::DownloadCertifications => { if self.params.pub_key.is_some() { - Ok(Box::new( - self.recovery - .get_download_certs_step(self.params.subnet_id, false), - )) + Ok(Box::new(self.recovery.get_download_certs_step( + self.params.subnet_id, + false, + !self.interactive(), + ))) } else { Err(RecoveryError::StepSkipped) } diff --git a/rs/recovery/src/error.rs b/rs/recovery/src/error.rs index 805a5825422..71a0b085961 100644 --- a/rs/recovery/src/error.rs +++ b/rs/recovery/src/error.rs @@ -26,6 +26,7 @@ pub enum RecoveryError { RegistryError(String), ValidationFailed(String), AgentError(String), + RsyncFailed, StepSkipped, } @@ -101,6 +102,7 @@ impl fmt::Display for RecoveryError { write!(f, "Validation failed: {}", msg) } RecoveryError::AgentError(msg) => write!(f, "ic-agent error: {}", msg), + RecoveryError::RsyncFailed => write!(f, "Rsync command failed"), } } } diff --git a/rs/recovery/src/file_sync_helper.rs b/rs/recovery/src/file_sync_helper.rs index a29de1834d2..ec3b221a866 100644 --- a/rs/recovery/src/file_sync_helper.rs +++ b/rs/recovery/src/file_sync_helper.rs @@ -4,8 +4,8 @@ use crate::{ error::{RecoveryError, RecoveryResult}, ssh_helper, }; -use core::time; use ic_http_utils::file_downloader::FileDownloader; +use ic_replay::consent_given; use ic_types::ReplicaVersion; use slog::{info, warn, Logger}; use std::{ @@ -13,7 +13,6 @@ use std::{ io::Write, path::{Path, PathBuf}, process::Command, - thread, time::Duration, }; @@ -67,6 +66,7 @@ pub async fn download_binary( Ok(file) } +/// If auto-retry is set to false, the user will be prompted for retries on rsync failures. pub fn rsync_with_retries( logger: &Logger, excludes: Vec<&str>, @@ -74,9 +74,10 @@ pub fn rsync_with_retries( target: &str, require_confirmation: bool, key_file: Option<&PathBuf>, - retries: usize, + auto_retry: bool, + max_retries: usize, ) -> RecoveryResult> { - for _ in 0..retries { + for _ in 0..max_retries { match rsync( logger, excludes.clone(), @@ -85,12 +86,21 @@ pub fn rsync_with_retries( require_confirmation, key_file, ) { - Err(e) => warn!(logger, "Rsync failed: {:?}, retrying...", e), + Err(e) => { + warn!(logger, "Rsync failed: {:?}", e); + if auto_retry { + // In non-interactive cases, we wait a short while + // before re-trying rsync. + info!(logger, "Retrying in 10 seconds..."); + std::thread::sleep(Duration::from_secs(10)); + } else if !consent_given("Do you want to retry the download for this node?") { + return Err(RecoveryError::RsyncFailed); + } + } success => return success, } - thread::sleep(time::Duration::from_secs(10)); } - Err(RecoveryError::UnexpectedError("All retries failed".into())) + Err(RecoveryError::RsyncFailed) } /// Copy the files from src to target using [rsync](https://linux.die.net/man/1/rsync) and options `--delete`, `-acP`. diff --git a/rs/recovery/src/lib.rs b/rs/recovery/src/lib.rs index 23b3cd9c052..091ba060bd0 100644 --- a/rs/recovery/src/lib.rs +++ b/rs/recovery/src/lib.rs @@ -304,7 +304,14 @@ impl Recovery { /// Return a [DownloadCertificationsStep] downloading the certification pools of all reachable /// nodes in the given subnet to the recovery data directory using the readonly account. - pub fn get_download_certs_step(&self, subnet_id: SubnetId, admin: bool) -> impl Step { + /// If auto-retry is false, the user will be prompted on what to do (skip or continue). In + /// non-interactive recoveries, auto-retry should be set to true. + pub fn get_download_certs_step( + &self, + subnet_id: SubnetId, + admin: bool, + auto_retry: bool, + ) -> impl Step { DownloadCertificationsStep { logger: self.logger.clone(), subnet_id, @@ -312,6 +319,7 @@ impl Recovery { work_dir: self.work_dir.clone(), require_confirmation: self.ssh_confirmation, key_file: self.key_file.clone(), + auto_retry, admin, } } diff --git a/rs/recovery/src/nns_recovery_failover_nodes.rs b/rs/recovery/src/nns_recovery_failover_nodes.rs index fc7fcae4b60..0a1d7908356 100644 --- a/rs/recovery/src/nns_recovery_failover_nodes.rs +++ b/rs/recovery/src/nns_recovery_failover_nodes.rs @@ -257,10 +257,13 @@ impl RecoveryIterator for NNSRecoveryFailoverNodes { } } - StepType::DownloadCertifications => Ok(Box::new( - self.recovery - .get_download_certs_step(self.params.subnet_id, true), - )), + StepType::DownloadCertifications => { + Ok(Box::new(self.recovery.get_download_certs_step( + self.params.subnet_id, + true, + !self.interactive(), + ))) + } StepType::MergeCertificationPools => { Ok(Box::new(self.recovery.get_merge_certification_pools_step())) diff --git a/rs/recovery/src/nns_recovery_same_nodes.rs b/rs/recovery/src/nns_recovery_same_nodes.rs index ae470547509..60ae170a3a9 100644 --- a/rs/recovery/src/nns_recovery_same_nodes.rs +++ b/rs/recovery/src/nns_recovery_same_nodes.rs @@ -183,10 +183,13 @@ impl RecoveryIterator for NNSRecoverySameNodes { } } - StepType::DownloadCertifications => Ok(Box::new( - self.recovery - .get_download_certs_step(self.params.subnet_id, true), - )), + StepType::DownloadCertifications => { + Ok(Box::new(self.recovery.get_download_certs_step( + self.params.subnet_id, + true, + !self.interactive(), + ))) + } StepType::MergeCertificationPools => { Ok(Box::new(self.recovery.get_merge_certification_pools_step())) diff --git a/rs/recovery/src/steps.rs b/rs/recovery/src/steps.rs index fae2469dfa0..16aeaf57a77 100644 --- a/rs/recovery/src/steps.rs +++ b/rs/recovery/src/steps.rs @@ -66,6 +66,7 @@ pub struct DownloadCertificationsStep { pub registry_helper: RegistryHelper, pub work_dir: PathBuf, pub require_confirmation: bool, + pub auto_retry: bool, pub key_file: Option, pub admin: bool, } @@ -98,9 +99,10 @@ impl Step for DownloadCertificationsStep { &target.display().to_string(), self.require_confirmation, self.key_file.as_ref(), + self.auto_retry, 5, ) - .map_err(|e| warn!(self.logger, "Failed to download certifications: {:?}", e)); + .map_err(|e| warn!(self.logger, "Skipping download: {:?}", e)); success || res.is_ok() });