Skip to content

Commit

Permalink
feat(recovery): [CON-1425] Allow skipping nodes during certification …
Browse files Browse the repository at this point in the history
…downloads (#3188)

This PR adds a way to manually skip nodes during certification downloads
when rsync fails.

Background: During recoveries, we should download and merge the
certification pools of as many nodes on the subnet as possible.
Sometimes there are connection issues to individual nodes, and the tool
gets stuck trying to download the node’s pool. In those cases we may
want to skip the problematic nodes.
  • Loading branch information
dist1ll authored Dec 23, 2024
1 parent 0a0b29b commit 8c82f7f
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 21 deletions.
9 changes: 5 additions & 4 deletions rs/recovery/src/app_subnet_recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,10 +306,11 @@ impl RecoveryIterator<StepType, StepTypeIter> for AppSubnetRecovery {

StepType::DownloadCertifications => {
if self.params.pub_key.is_some() {
Ok(Box::new(
self.recovery
.get_download_certs_step(self.params.subnet_id, false),
))
Ok(Box::new(self.recovery.get_download_certs_step(
self.params.subnet_id,
false,
!self.interactive(),
)))
} else {
Err(RecoveryError::StepSkipped)
}
Expand Down
2 changes: 2 additions & 0 deletions rs/recovery/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pub enum RecoveryError {
RegistryError(String),
ValidationFailed(String),
AgentError(String),
RsyncFailed,
StepSkipped,
}

Expand Down Expand Up @@ -101,6 +102,7 @@ impl fmt::Display for RecoveryError {
write!(f, "Validation failed: {}", msg)
}
RecoveryError::AgentError(msg) => write!(f, "ic-agent error: {}", msg),
RecoveryError::RsyncFailed => write!(f, "Rsync command failed"),
}
}
}
Expand Down
24 changes: 17 additions & 7 deletions rs/recovery/src/file_sync_helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ use crate::{
error::{RecoveryError, RecoveryResult},
ssh_helper,
};
use core::time;
use ic_http_utils::file_downloader::FileDownloader;
use ic_replay::consent_given;
use ic_types::ReplicaVersion;
use slog::{info, warn, Logger};
use std::{
fs::{self, File, ReadDir},
io::Write,
path::{Path, PathBuf},
process::Command,
thread,
time::Duration,
};

Expand Down Expand Up @@ -67,16 +66,18 @@ pub async fn download_binary(
Ok(file)
}

/// If auto-retry is set to false, the user will be prompted for retries on rsync failures.
pub fn rsync_with_retries(
logger: &Logger,
excludes: Vec<&str>,
src: &str,
target: &str,
require_confirmation: bool,
key_file: Option<&PathBuf>,
retries: usize,
auto_retry: bool,
max_retries: usize,
) -> RecoveryResult<Option<String>> {
for _ in 0..retries {
for _ in 0..max_retries {
match rsync(
logger,
excludes.clone(),
Expand All @@ -85,12 +86,21 @@ pub fn rsync_with_retries(
require_confirmation,
key_file,
) {
Err(e) => warn!(logger, "Rsync failed: {:?}, retrying...", e),
Err(e) => {
warn!(logger, "Rsync failed: {:?}", e);
if auto_retry {
// In non-interactive cases, we wait a short while
// before re-trying rsync.
info!(logger, "Retrying in 10 seconds...");
std::thread::sleep(Duration::from_secs(10));
} else if !consent_given("Do you want to retry the download for this node?") {
return Err(RecoveryError::RsyncFailed);
}
}
success => return success,
}
thread::sleep(time::Duration::from_secs(10));
}
Err(RecoveryError::UnexpectedError("All retries failed".into()))
Err(RecoveryError::RsyncFailed)
}

/// Copy the files from src to target using [rsync](https://linux.die.net/man/1/rsync) and options `--delete`, `-acP`.
Expand Down
10 changes: 9 additions & 1 deletion rs/recovery/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,14 +304,22 @@ impl Recovery {

/// Return a [DownloadCertificationsStep] downloading the certification pools of all reachable
/// nodes in the given subnet to the recovery data directory using the readonly account.
pub fn get_download_certs_step(&self, subnet_id: SubnetId, admin: bool) -> impl Step {
/// If auto-retry is false, the user will be prompted on what to do (skip or continue). In
/// non-interactive recoveries, auto-retry should be set to true.
pub fn get_download_certs_step(
&self,
subnet_id: SubnetId,
admin: bool,
auto_retry: bool,
) -> impl Step {
DownloadCertificationsStep {
logger: self.logger.clone(),
subnet_id,
registry_helper: self.registry_helper.clone(),
work_dir: self.work_dir.clone(),
require_confirmation: self.ssh_confirmation,
key_file: self.key_file.clone(),
auto_retry,
admin,
}
}
Expand Down
11 changes: 7 additions & 4 deletions rs/recovery/src/nns_recovery_failover_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,13 @@ impl RecoveryIterator<StepType, StepTypeIter> for NNSRecoveryFailoverNodes {
}
}

StepType::DownloadCertifications => Ok(Box::new(
self.recovery
.get_download_certs_step(self.params.subnet_id, true),
)),
StepType::DownloadCertifications => {
Ok(Box::new(self.recovery.get_download_certs_step(
self.params.subnet_id,
true,
!self.interactive(),
)))
}

StepType::MergeCertificationPools => {
Ok(Box::new(self.recovery.get_merge_certification_pools_step()))
Expand Down
11 changes: 7 additions & 4 deletions rs/recovery/src/nns_recovery_same_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,13 @@ impl RecoveryIterator<StepType, StepTypeIter> for NNSRecoverySameNodes {
}
}

StepType::DownloadCertifications => Ok(Box::new(
self.recovery
.get_download_certs_step(self.params.subnet_id, true),
)),
StepType::DownloadCertifications => {
Ok(Box::new(self.recovery.get_download_certs_step(
self.params.subnet_id,
true,
!self.interactive(),
)))
}

StepType::MergeCertificationPools => {
Ok(Box::new(self.recovery.get_merge_certification_pools_step()))
Expand Down
4 changes: 3 additions & 1 deletion rs/recovery/src/steps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ pub struct DownloadCertificationsStep {
pub registry_helper: RegistryHelper,
pub work_dir: PathBuf,
pub require_confirmation: bool,
pub auto_retry: bool,
pub key_file: Option<PathBuf>,
pub admin: bool,
}
Expand Down Expand Up @@ -98,9 +99,10 @@ impl Step for DownloadCertificationsStep {
&target.display().to_string(),
self.require_confirmation,
self.key_file.as_ref(),
self.auto_retry,
5,
)
.map_err(|e| warn!(self.logger, "Failed to download certifications: {:?}", e));
.map_err(|e| warn!(self.logger, "Skipping download: {:?}", e));

success || res.is_ok()
});
Expand Down

0 comments on commit 8c82f7f

Please sign in to comment.