Skip to content

Commit ddc0b9d

Browse files
committed
app: Implement a separate health check server
A separate server provides /live and /ready routes that behave identically to those of the admin server. The existing admin server's routes are not removed.

Background: On some Kubernetes distributions, requests from the control plane may not come from an IP address in a private address range, or even from a consistent IP address. This poses a problem, because the admin server used in a multicluster mesh needs to simultaneously serve the /live and /ready routes to:

* The Kubernetes control plane, for liveness and readiness probes respectively
* Remote clusters, as part of probing for the remote gateway

In order to avoid exposing the other admin routes, the multicluster gateway uses an authorization policy forbidding unauthorized and out-of-cluster requests. This causes the gateway to fail readiness and liveness probes.

Resolution: Implement a separate server in the proxy app that can securely serve /live and /ready routes. The port that server listens on can be used for health check probes internally, without an authorization policy.

See: linkerd/linkerd2#7548

Signed-off-by: Aaron Friel <mayreply@aaronfriel.com>
1 parent 56a4511 commit ddc0b9d

File tree

13 files changed

+578
-10
lines changed

13 files changed

+578
-10
lines changed

Cargo.lock

+16
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,7 @@ dependencies = [
747747
"linkerd-app-admin",
748748
"linkerd-app-core",
749749
"linkerd-app-gateway",
750+
"linkerd-app-health",
750751
"linkerd-app-inbound",
751752
"linkerd-app-outbound",
752753
"linkerd-error",
@@ -855,6 +856,21 @@ dependencies = [
855856
"tracing",
856857
]
857858

859+
[[package]]
860+
name = "linkerd-app-health"
861+
version = "0.1.0"
862+
dependencies = [
863+
"futures",
864+
"http",
865+
"hyper",
866+
"linkerd-app-core",
867+
"linkerd-app-inbound",
868+
"thiserror",
869+
"tokio",
870+
"tower",
871+
"tracing",
872+
]
873+
858874
[[package]]
859875
name = "linkerd-app-inbound"
860876
version = "0.1.0"

linkerd/app/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ allow-loopback = ["linkerd-app-outbound/allow-loopback"]
1717
[dependencies]
1818
futures = { version = "0.3", default-features = false }
1919
linkerd-app-admin = { path = "./admin" }
20+
linkerd-app-health = { path = "./health" }
2021
linkerd-app-core = { path = "./core" }
2122
linkerd-app-gateway = { path = "./gateway" }
2223
linkerd-app-inbound = { path = "./inbound" }

linkerd/app/health/Cargo.toml

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
[package]
2+
name = "linkerd-app-health"
3+
version = "0.1.0"
4+
authors = ["Linkerd Developers <cncf-linkerd-dev@lists.cncf.io>"]
5+
license = "Apache-2.0"
6+
edition = "2021"
7+
publish = false
8+
description = """
9+
The linkerd proxy's health check server.
10+
"""
11+
12+
[dependencies]
13+
http = "0.2"
14+
hyper = { version = "0.14", features = ["http1", "http2"] }
15+
futures = { version = "0.3", default-features = false }
16+
linkerd-app-core = { path = "../core" }
17+
linkerd-app-inbound = { path = "../inbound" }
18+
thiserror = "1"
19+
tokio = { version = "1", features = ["macros", "sync", "parking_lot"]}
20+
tracing = "0.1"
21+
22+
[dependencies.tower]
23+
version = "0.4"
24+
default-features = false
25+
features = [
26+
"buffer",
27+
"make",
28+
"spawn-ready",
29+
"timeout",
30+
"util",
31+
]

linkerd/app/health/src/lib.rs

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#![deny(warnings, rust_2018_idioms)]
2+
#![forbid(unsafe_code)]
3+
4+
mod server;
5+
mod stack;
6+
7+
pub use self::server::{Health, Latch, Readiness};
8+
pub use self::stack::{Config, Task};

linkerd/app/health/src/server.rs

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
//! Serves an HTTP health server.
2+
//!
3+
//! * `GET /ready` -- returns 200 when the proxy is ready to participate in meshed
4+
//! traffic.
5+
//! * `GET /live` -- returns 200 when the proxy is live.
6+
7+
use futures::future;
8+
use http::StatusCode;
9+
use hyper::{
10+
body::{Body, HttpBody},
11+
Request, Response,
12+
};
13+
use linkerd_app_core::Error;
14+
use std::{
15+
future::Future,
16+
pin::Pin,
17+
task::{Context, Poll},
18+
};
19+
20+
mod readiness;
21+
22+
pub use self::readiness::{Latch, Readiness};
23+
24+
#[derive(Clone)]
25+
pub struct Health {
26+
ready: Readiness,
27+
}
28+
29+
pub type ResponseFuture =
30+
Pin<Box<dyn Future<Output = Result<Response<Body>, Error>> + Send + 'static>>;
31+
32+
impl Health {
33+
pub fn new(ready: Readiness) -> Self {
34+
Self { ready }
35+
}
36+
37+
fn ready_rsp(&self) -> Response<Body> {
38+
if self.ready.is_ready() {
39+
Response::builder()
40+
.status(StatusCode::OK)
41+
.header(http::header::CONTENT_TYPE, "text/plain")
42+
.body("ready\n".into())
43+
.expect("builder with known status code must not fail")
44+
} else {
45+
Response::builder()
46+
.status(StatusCode::SERVICE_UNAVAILABLE)
47+
.body("not ready\n".into())
48+
.expect("builder with known status code must not fail")
49+
}
50+
}
51+
52+
fn live_rsp() -> Response<Body> {
53+
Response::builder()
54+
.status(StatusCode::OK)
55+
.header(http::header::CONTENT_TYPE, "text/plain")
56+
.body("live\n".into())
57+
.expect("builder with known status code must not fail")
58+
}
59+
60+
fn not_found() -> Response<Body> {
61+
Response::builder()
62+
.status(http::StatusCode::NOT_FOUND)
63+
.body(Body::empty())
64+
.expect("builder with known status code must not fail")
65+
}
66+
}
67+
68+
impl<B> tower::Service<http::Request<B>> for Health
69+
where
70+
B: HttpBody + Send + Sync + 'static,
71+
B::Error: Into<Error>,
72+
B::Data: Send,
73+
{
74+
type Response = http::Response<Body>;
75+
type Error = Error;
76+
type Future = ResponseFuture;
77+
78+
fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
79+
Poll::Ready(Ok(()))
80+
}
81+
82+
fn call(&mut self, req: Request<B>) -> Self::Future {
83+
match req.uri().path() {
84+
"/live" => Box::pin(future::ok(Self::live_rsp())),
85+
"/ready" => Box::pin(future::ok(self.ready_rsp())),
86+
_ => Box::pin(future::ok(Self::not_found())),
87+
}
88+
}
89+
}
90+
91+
#[cfg(test)]
mod tests {
    use super::*;
    use http::method::Method;
    use std::time::Duration;
    use tokio::time::timeout;
    use tower::util::ServiceExt;

    /// Upper bound on how long a single probe request may take.
    const TIMEOUT: Duration = Duration::from_secs(1);

    /// Issues `GET /ready` against a clone of `health` and returns the
    /// status code of the response.
    async fn ready_status(health: &Health) -> StatusCode {
        let req = Request::builder()
            .method(Method::GET)
            .uri("http://0.0.0.0/ready")
            .body(Body::empty())
            .unwrap();

        timeout(TIMEOUT, health.clone().oneshot(req))
            .await
            .expect("timeout")
            .expect("call")
            .status()
    }

    /// The service must stay unready until every latch clone is dropped.
    #[tokio::test]
    async fn ready_when_latches_dropped() {
        let (readiness, first) = Readiness::new();
        let second = first.clone();
        let health = Health::new(readiness);

        // Both latches are still held.
        assert_eq!(
            ready_status(&health).await,
            StatusCode::SERVICE_UNAVAILABLE
        );

        // One latch remains.
        drop(first);
        assert_eq!(
            ready_status(&health).await,
            StatusCode::SERVICE_UNAVAILABLE
        );

        // No latches remain.
        drop(second);
        assert_eq!(ready_status(&health).await, StatusCode::OK);
    }
}
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
use std::sync::{Arc, Weak};
2+
3+
/// Tracks the process's readiness to serve traffic.
///
/// Readiness is observed through a weak reference to a value that is kept
/// alive by the outstanding [`Latch`]es. Once `is_ready()` returns true, it
/// will never return false again.
#[derive(Clone, Debug)]
pub struct Readiness(Weak<()>);

/// Holds the process in the "not ready" state.
///
/// Latches may be cloned; the process is considered ready only when the
/// original latch and all of its clones have been dropped.
#[derive(Clone, Debug)]
pub struct Latch(Arc<()>);

impl Readiness {
    /// Creates a readiness handle together with the latch that keeps it
    /// unready.
    pub fn new() -> (Readiness, Latch) {
        let latch = Latch(Arc::new(()));
        let readiness = Readiness(Arc::downgrade(&latch.0));
        (readiness, latch)
    }

    /// Returns `true` when no latch is alive any longer.
    pub fn is_ready(&self) -> bool {
        self.0.strong_count() == 0
    }
}

/// Always ready.
impl Default for Readiness {
    fn default() -> Self {
        // The latch is dropped immediately, so the handle starts out ready.
        let (readiness, _latch) = Self::new();
        readiness
    }
}

impl Latch {
    /// Releases this readiness latch.
    ///
    /// This is the same as dropping the latch; clones of it are unaffected.
    pub fn release(self) {
        let Latch(_inner) = self;
    }
}

0 commit comments

Comments
 (0)