Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenTelemetry OTLP setup for tracing, take 2 #697

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
583d124
feat: tracing with OpenTelemetry OTLP
mzabaluev Oct 8, 2024
57845ba
feat(process-compose):telemetry overlay
mzabaluev Oct 8, 2024
bece93e
feat(tracing)!: break out telemetry
mzabaluev Oct 10, 2024
607ba0b
fix(process-compose): correctly specify enviroment
mzabaluev Oct 10, 2024
56899ca
feat: produce some telemetry spans and events
mzabaluev Oct 10, 2024
f713d71
fix(tracing): work around another shutdown problem
mzabaluev Oct 16, 2024
8ce2e07
feat: spans for each telemetry event
mzabaluev Oct 16, 2024
c915948
feat(suzuka-full-node): execute_block telemetry
mzabaluev Oct 17, 2024
c1ba7fc
chore: comments on some metrics
mzabaluev Oct 17, 2024
372f9b4
feat(opt-executor): emit telemetry on tx failure
mzabaluev Oct 17, 2024
acc202b
feat(suzuka-full-node): telemetry on executed tx
mzabaluev Oct 17, 2024
954c8e4
fix: add feed.
l-monninger Oct 18, 2024
e6144f8
fix: add otlp test.
l-monninger Oct 18, 2024
3fed933
Merge branch 'main' into mikhail/opentelemetry-through-tracing
mzabaluev Oct 21, 2024
f59cb2f
chore(tracing): comment on the guard
mzabaluev Oct 21, 2024
2fab8d7
feat(ci): add telemetry to local tests
mzabaluev Oct 21, 2024
f2af21d
Merge branch 'main' into mikhail/opentelemetry-through-tracing
mzabaluev Oct 21, 2024
f06cf53
feat(docker-compose): add telemetry overlay
mzabaluev Oct 21, 2024
2688dec
Merge branch 'main' into mikhail/opentelemetry-through-tracing
mzabaluev Nov 1, 2024
a2e6653
chore: commit Cargo.lock updates
mzabaluev Nov 1, 2024
1474c53
Merge branch 'main' into mikhail/opentelemetry-through-tracing
mzabaluev Nov 4, 2024
f61c345
Merge branch 'main' into mikhail/opentelemetry-through-tracing
mzabaluev Dec 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/checks-all.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,10 @@ jobs:
env:
CELESTIA_LOG_LEVEL: FATAL # adjust the log level while debugging
run: |
nix develop --command bash -c "just movement-full-node native build.setup.eth-local.celestia-local.test -t=false"
nix develop --command bash -c "just movement-full-node native build.setup.eth-local.celestia-local.test -t=false"
nix develop --command bash \
-c "just movement-full-node native build.setup.eth-local.celestia-local.test.telemetry.test-telemetry -t=false"
nix develop --command bash \
-c "just movement-full-node native build.setup.eth-local.celestia-local.test -t=false"

movement-full-node-malicious:
if: github.event.label.name == 'cicd:movement-full-node-malicious' || github.ref == 'refs/heads/main'
Expand Down
121 changes: 107 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ move-vm-ext = { path = "types/move-vm-ext" }
num-derive = "0.4.2"
num-traits = "0.2.14"
once_cell = "1.8.0"
opentelemetry = { version = "0.26" }
opentelemetry-otlp = { version = "0.26" }
opentelemetry_sdk = { version = "0.26", features = ["rt-tokio"] }
opentelemetry-semantic-conventions = { version = "0.26" }
parking_lot = { version = "0.12.1" }
poem = { version = "=1.3.59", features = ["anyhow", "rustls"] }
poem-openapi = { version = "=2.0.11", features = ["swagger-ui", "url"] }
Expand Down Expand Up @@ -280,7 +284,7 @@ tonic-reflection = "0.12.3"
tonic-web = "0.12.3"
### To try (experimental) std support, add `features = [ "std" ]` to risc0-zkvm
tracing = "0.1.40"
tracing-appender = "0.2"
tracing-opentelemetry = { version = "0.27" }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-test = "0.2.5"
trie-db = "0.28.0"
Expand Down
26 changes: 26 additions & 0 deletions docker/compose/movement-full-node/docker-compose.telemetry.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
services:
jaeger:
image: jaegertracing/all-in-one:1.62.0
container_name: jaeger
environment:
- COLLECTOR_OTLP_ENABLED=true
ports:
- "4317:4317"
- "16686:16686"
healthcheck:
test: curl http://localhost:16686/api/traces
start_period: 3s

movement-full-node:
environment:
- MOVEMENT_OTLP=http://jaeger:4317
depends_on:
- jaeger:
condition: service_healthy

movement-celestia-da-light-node:
environment:
- MOVEMENT_OTLP=http://jaeger:4317
depends_on:
- jaeger:
condition: service_healthy
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ use tokio::sync::RwLock;
use tracing::info;
use url::Url;

const TIMING_LOG_ENV: &str = "SUZUKA_TIMING_LOG";

pub fn get_movement_config(
dot_movement: &DotMovement,
) -> Result<movement_config::Config, anyhow::Error> {
Expand Down Expand Up @@ -393,10 +391,12 @@ pub async fn basic_coin_transfers(

#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
let tracing_config = movement_tracing::Config {
timing_log_path: std::env::var_os(TIMING_LOG_ENV).map(Into::into),
};
let _guard = movement_tracing::init_tracing_subscriber(tracing_config);
let tracing_config = movement_tracing::Config::from_env()?;
let _guard = movement_tracing::init_tracing_subscriber(
env!("CARGO_BIN_NAME"),
env!("CARGO_PKG_VERSION"),
&tracing_config,
)?;

// get the lead dot movement from the environment
let dot_movement = DotMovement::try_from_env()?;
Expand Down
11 changes: 6 additions & 5 deletions networks/movement/movement-full-node/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

use clap::*;
use movement_full_node::MovementFullNode;
const TIMING_LOG_ENV: &str = "SUZUKA_TIMING_LOG";
use std::env;

#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
let tracing_config =
movement_tracing::Config { timing_log_path: env::var_os(TIMING_LOG_ENV).map(Into::into) };
let _guard = movement_tracing::init_tracing_subscriber(tracing_config);
let tracing_config = movement_tracing::Config::from_env()?;
let _guard = movement_tracing::init_tracing_subscriber(
env!("CARGO_BIN_NAME"),
env!("CARGO_PKG_VERSION"),
&tracing_config,
);

let suzuka_util = MovementFullNode::parse();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,10 @@ where
}
};

let block_id_hex = hex::encode(&block_id);
info!(
block_id = %hex::encode(block_id.clone()),
da_height = da_height,
block_id = block_id_hex,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

target: "movement_telemetry" is missing here too. What's the logic for including/not including?

da_height,
time = block_timestamp,
"Processing block from DA"
);
Expand All @@ -130,9 +131,10 @@ where

let block: Block = bcs::from_bytes(&block_bytes[..])?;

// get the transactions
// get the transactions count before the block is consumed
let transactions_count = block.transactions().len();
let span = info_span!(target: "movement_timing", "execute_block", id = ?block_id);
let span =
info_span!(target: "movement_telemetry", "execute_block", block_id = %block_id_hex);
let commitment =
self.execute_block_with_retries(block, block_timestamp).instrument(span).await?;

Expand Down Expand Up @@ -166,7 +168,7 @@ where
}
}
} else {
info!(block_id = ?block_id, "Skipping settlement");
info!(block_id = block_id_hex, "Skipping settlement");

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

target: "movement_telemetry" is missing here, is that intentional?

}

Ok(())
Expand All @@ -185,18 +187,22 @@ where
block: Block,
mut block_timestamp: u64,
) -> anyhow::Result<BlockCommitment> {
for _ in 0..self.execution_extension.block_retry_count {
let retry_count = self.execution_extension.block_retry_count;
for _ in 0..retry_count {
// we have to clone here because the block is supposed to be consumed by the executor
match self.execute_block(block.clone(), block_timestamp).await {
Ok(commitment) => return Ok(commitment),
Ok(commitment) => {
info!(target: "movement_telemetry", "execute_block_succeeded");
return Ok(commitment);
}
Err(e) => {
info!("Failed to execute block: {:?}. Retrying", e);
info!(target: "movement_telemetry", error = %e, "execute_block_failed");
block_timestamp += self.execution_extension.block_retry_increment_microseconds; // increase the timestamp by 5 ms (5000 microseconds)
}
}
}

anyhow::bail!("Failed to execute block after 5 retries")
anyhow::bail!("Failed to execute block after {retry_count} retries")
}

async fn execute_block(
Expand Down Expand Up @@ -228,6 +234,22 @@ where
continue;
}

// Instrumentation for aggregated metrics:
// Transactions per second: https://github.com/movementlabsxyz/movement/discussions/422
// Transaction latency: https://github.com/movementlabsxyz/movement/discussions/423
// Transaction failure rate: https://github.com/movementlabsxyz/movement/discussions/428
//
// TODO: as the block can be attempted to be executed repeatedly,
// collect this data once and export in telemetry
// on the final success or failure.
info!(
target: "movement_telemetry",
tx_hash = %signed_transaction.committed_hash(),
sender = %signed_transaction.sender(),
sequence_number = signed_transaction.sequence_number(),
"executing_transaction"
);

let signature_verified_transaction = SignatureVerifiedTransaction::Valid(
Transaction::UserTransaction(signed_transaction),
);
Expand Down
Loading
Loading