diff --git a/rs/recovery/src/app_subnet_recovery.rs b/rs/recovery/src/app_subnet_recovery.rs index b216b03b2d4..7923983735a 100644 --- a/rs/recovery/src/app_subnet_recovery.rs +++ b/rs/recovery/src/app_subnet_recovery.rs @@ -306,10 +306,11 @@ impl RecoveryIterator for AppSubnetRecovery { StepType::DownloadCertifications => { if self.params.pub_key.is_some() { - Ok(Box::new( - self.recovery - .get_download_certs_step(self.params.subnet_id, false), - )) + Ok(Box::new(self.recovery.get_download_certs_step( + self.params.subnet_id, + false, + !self.interactive(), + ))) } else { Err(RecoveryError::StepSkipped) } diff --git a/rs/recovery/src/error.rs b/rs/recovery/src/error.rs index 805a5825422..71a0b085961 100644 --- a/rs/recovery/src/error.rs +++ b/rs/recovery/src/error.rs @@ -26,6 +26,7 @@ pub enum RecoveryError { RegistryError(String), ValidationFailed(String), AgentError(String), + RsyncFailed, StepSkipped, } @@ -101,6 +102,7 @@ impl fmt::Display for RecoveryError { write!(f, "Validation failed: {}", msg) } RecoveryError::AgentError(msg) => write!(f, "ic-agent error: {}", msg), + RecoveryError::RsyncFailed => write!(f, "Rsync command failed"), } } } diff --git a/rs/recovery/src/file_sync_helper.rs b/rs/recovery/src/file_sync_helper.rs index a29de1834d2..ec3b221a866 100644 --- a/rs/recovery/src/file_sync_helper.rs +++ b/rs/recovery/src/file_sync_helper.rs @@ -4,8 +4,8 @@ use crate::{ error::{RecoveryError, RecoveryResult}, ssh_helper, }; -use core::time; use ic_http_utils::file_downloader::FileDownloader; +use ic_replay::consent_given; use ic_types::ReplicaVersion; use slog::{info, warn, Logger}; use std::{ @@ -13,7 +13,6 @@ use std::{ io::Write, path::{Path, PathBuf}, process::Command, - thread, time::Duration, }; @@ -67,6 +66,7 @@ pub async fn download_binary( Ok(file) } +/// If auto-retry is set to false, the user will be prompted for retries on rsync failures. pub fn rsync_with_retries( logger: &Logger, excludes: Vec<&str>, @@ -74,9 +74,10 @@ pub fn rsync_with_retries( target: &str, require_confirmation: bool, key_file: Option<&PathBuf>, - retries: usize, + auto_retry: bool, + max_retries: usize, ) -> RecoveryResult> { - for _ in 0..retries { + for _ in 0..max_retries { match rsync( logger, excludes.clone(), @@ -85,12 +86,21 @@ pub fn rsync_with_retries( require_confirmation, key_file, ) { - Err(e) => warn!(logger, "Rsync failed: {:?}, retrying...", e), + Err(e) => { + warn!(logger, "Rsync failed: {:?}", e); + if auto_retry { + // In non-interactive cases, we wait a short while + // before re-trying rsync. + info!(logger, "Retrying in 10 seconds..."); + std::thread::sleep(Duration::from_secs(10)); + } else if !consent_given("Do you want to retry the download for this node?") { + return Err(RecoveryError::RsyncFailed); + } + } success => return success, } - thread::sleep(time::Duration::from_secs(10)); } - Err(RecoveryError::UnexpectedError("All retries failed".into())) + Err(RecoveryError::RsyncFailed) } /// Copy the files from src to target using [rsync](https://linux.die.net/man/1/rsync) and options `--delete`, `-acP`. diff --git a/rs/recovery/src/lib.rs b/rs/recovery/src/lib.rs index 23b3cd9c052..091ba060bd0 100644 --- a/rs/recovery/src/lib.rs +++ b/rs/recovery/src/lib.rs @@ -304,7 +304,14 @@ impl Recovery { /// Return a [DownloadCertificationsStep] downloading the certification pools of all reachable /// nodes in the given subnet to the recovery data directory using the readonly account. - pub fn get_download_certs_step(&self, subnet_id: SubnetId, admin: bool) -> impl Step { + /// If auto-retry is false, the user will be prompted on what to do (skip or continue). In + /// non-interactive recoveries, auto-retry should be set to true. + pub fn get_download_certs_step( + &self, + subnet_id: SubnetId, + admin: bool, + auto_retry: bool, + ) -> impl Step { DownloadCertificationsStep { logger: self.logger.clone(), subnet_id, @@ -312,6 +319,7 @@ impl Recovery { work_dir: self.work_dir.clone(), require_confirmation: self.ssh_confirmation, key_file: self.key_file.clone(), + auto_retry, admin, } } diff --git a/rs/recovery/src/nns_recovery_failover_nodes.rs b/rs/recovery/src/nns_recovery_failover_nodes.rs index fc7fcae4b60..0a1d7908356 100644 --- a/rs/recovery/src/nns_recovery_failover_nodes.rs +++ b/rs/recovery/src/nns_recovery_failover_nodes.rs @@ -257,10 +257,13 @@ impl RecoveryIterator for NNSRecoveryFailoverNodes { } } - StepType::DownloadCertifications => Ok(Box::new( - self.recovery - .get_download_certs_step(self.params.subnet_id, true), - )), + StepType::DownloadCertifications => { + Ok(Box::new(self.recovery.get_download_certs_step( + self.params.subnet_id, + true, + !self.interactive(), + ))) + } StepType::MergeCertificationPools => { Ok(Box::new(self.recovery.get_merge_certification_pools_step())) diff --git a/rs/recovery/src/nns_recovery_same_nodes.rs b/rs/recovery/src/nns_recovery_same_nodes.rs index ae470547509..60ae170a3a9 100644 --- a/rs/recovery/src/nns_recovery_same_nodes.rs +++ b/rs/recovery/src/nns_recovery_same_nodes.rs @@ -183,10 +183,13 @@ impl RecoveryIterator for NNSRecoverySameNodes { } } - StepType::DownloadCertifications => Ok(Box::new( - self.recovery - .get_download_certs_step(self.params.subnet_id, true), - )), + StepType::DownloadCertifications => { + Ok(Box::new(self.recovery.get_download_certs_step( + self.params.subnet_id, + true, + !self.interactive(), + ))) + } StepType::MergeCertificationPools => { Ok(Box::new(self.recovery.get_merge_certification_pools_step())) diff --git a/rs/recovery/src/steps.rs b/rs/recovery/src/steps.rs index fae2469dfa0..16aeaf57a77 100644 --- a/rs/recovery/src/steps.rs +++ b/rs/recovery/src/steps.rs @@ -66,6 +66,7 @@ pub struct DownloadCertificationsStep { pub registry_helper: RegistryHelper, pub work_dir: PathBuf, pub require_confirmation: bool, + pub auto_retry: bool, pub key_file: Option, pub admin: bool, } @@ -98,9 +99,10 @@ impl Step for DownloadCertificationsStep { &target.display().to_string(), self.require_confirmation, self.key_file.as_ref(), + self.auto_retry, 5, ) - .map_err(|e| warn!(self.logger, "Failed to download certifications: {:?}", e)); + .map_err(|e| warn!(self.logger, "Skipping download: {:?}", e)); success || res.is_ok() });