Skip to content

Commit 0045fe8

Browse files
committed
Add cgroup v2 support
cgroup v1 uses two special files to control device access, whereas cgroup v2 uses eBPF programs. This change attaches a custom eBPF program that allows run-time reconfiguration, and detaches docker's default program. An eBPF program is detached when the process that attached it dies, which would be dangerous if container-hotplug exited unexpectedly while the container was still running, so we instead pin the program (so that it stays attached after container-hotplug exits) and unpin it once the docker container is down. If container-hotplug exits unexpectedly, stale eBPF programs may be left pinned, but this is safe.
1 parent e945f32 commit 0045fe8

File tree

6 files changed

+194
-36
lines changed

6 files changed

+194
-36
lines changed

Cargo.lock

Lines changed: 71 additions & 20 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ bollard = "0.16"
3030
futures = "0.3"
3131
rustix = { version = "0.38", features = ["fs", "stdio", "termios"] }
3232
bitflags = "2"
33+
aya = { git = "https://github.com/aya-rs/aya.git" }
3334

3435
[build-dependencies]
3536
anyhow = { version = "1", features = ["backtrace"] }

README.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@ Another concern is providing a container with well known paths for the devices.
2525
On bare-metal systems this would usually be achieved with a `SYMLINK` directive in a udev rule.
2626
This program tries to provide a similar functionality for containers, allowing you to specify symlinks for certain devices.
2727

28-
## Limitations
29-
30-
`container-hotplug` needs to be run as root and relies on `cgroup v1`. It does not support `cgroup v2`.
31-
On distributions with `cgroup v2`, you can switch back to `cgroup v1` by setting the [kernel parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters) `systemd.unified_cgroup_hierarchy=0`.
28+
This tool supports both cgroup v1 and v2.
3229

3330
## Example
3431

src/docker/cgroup.rs

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
use anyhow::{ensure, Result};
1+
use anyhow::{ensure, Context, Result};
2+
use aya::maps::{HashMap, MapData};
3+
use aya::programs::{CgroupDevice, Link};
4+
use std::fs::File;
5+
use std::mem::ManuallyDrop;
26
use std::path::PathBuf;
37

48
// The numerical representation below needs to match BPF_DEVCG constants.
@@ -26,6 +30,10 @@ pub trait DeviceAccessController {
2630
minor: u32,
2731
access: Access,
2832
) -> Result<()>;
33+
34+
/// Stop performing access control. This may allow all accesses, so it should only be used when
35+
/// the cgroup is being shut down.
36+
fn stop(self: Box<Self>) -> Result<()>;
2937
}
3038

3139
pub struct DeviceAccessControllerV1 {
@@ -96,4 +104,95 @@ impl DeviceAccessController for DeviceAccessControllerV1 {
96104

97105
Ok(())
98106
}
107+
108+
// The v1 controller performs no cleanup here; it consumes itself and reports success.
fn stop(self: Box<Self>) -> Result<()> {
109+
Ok(())
110+
}
111+
}
112+
113+
/// Key type of the `DEVICE_PERM` BPF map: identifies one device by its type and
/// major/minor numbers. The associated map value is a `u32` holding the access bits.
#[repr(C)] // This is read as POD by the BPF program.
114+
#[derive(Clone, Copy)]
115+
struct Device {
116+
// `DeviceType` cast to `u32` (see `set_permission`).
device_type: u32,
117+
// Device major number.
major: u32,
118+
// Device minor number.
minor: u32,
119+
}
120+
121+
// SAFETY: `Device` is `repr(C)` and consists solely of three `u32` fields, so it has no padding.
122+
unsafe impl aya::Pod for Device {}
123+
124+
/// Device access controller for cgroup v2, backed by an eBPF device filter program
/// whose permissions are reconfigured at run time through a BPF hash map.
pub struct DeviceAccessControllerV2 {
125+
// The `DEVICE_PERM` map of the loaded BPF object: `Device` -> access bits.
map: HashMap<MapData, Device, u32>,
126+
// Path under which the filter program was pinned in `new`; `stop` unpins it from here.
pin: PathBuf,
127+
}
128+
129+
impl DeviceAccessControllerV2 {
130+
/// Takes over device filtering for the docker container identified by `id`.
///
/// Loads the embedded `cgroup_device_filter` BPF object, attaches its `check_device`
/// program to the container's cgroup, pins it at `/sys/fs/bpf/docker-{id}-device-filter`,
/// detaches the programs that were attached beforehand, and finally takes ownership of
/// the `DEVICE_PERM` map used to configure permissions.
///
/// # Errors
///
/// Returns an error if the cgroup directory cannot be opened, if loading, attaching or
/// pinning the program fails, if an existing program cannot be detached, or if the
/// `check_device` program or `DEVICE_PERM` map is missing from the BPF object.
pub fn new(id: &str) -> Result<Self> {
131+
// We want to take control of the device cgroup filtering from docker. To do this, we attach our own
132+
// filter program and detach the one by docker.
133+
// NOTE(review): this path assumes the `system.slice/docker-<id>.scope` cgroup layout —
// confirm it holds for every cgroup driver/configuration that must be supported.
let cgroup_path = format!("/sys/fs/cgroup/system.slice/docker-{id}.scope");
134+
let cgroup = File::open(cgroup_path)?;
135+
136+
// The BPF object is embedded into the binary at compile time from the
// `cgroup_device_filter` build output under the crate's manifest directory.
let mut bpf = aya::Bpf::load(include_bytes!(concat!(
137+
env!("CARGO_MANIFEST_DIR"),
138+
"/cgroup_device_filter/target/bpfel-unknown-none/release/cgroup_device_filter"
139+
)))?;
140+
141+
let program: &mut CgroupDevice = bpf
142+
.program_mut("check_device")
143+
.context("cannot find check_device program")?
144+
.try_into()?;
145+
146+
program.load()?;
147+
148+
// Iterate existing programs. We'll need to detach them later.
149+
// Wrap this inside `ManuallyDrop` to prevent accidental detaching.
150+
// The query happens before our own program is attached, so the result only
// contains the programs that were already attached to the cgroup.
let existing_programs = ManuallyDrop::new(CgroupDevice::query(&cgroup)?);
151+
152+
program.attach(&cgroup)?;
153+
154+
// Pin the program so that if container-hotplug accidentally exits, the filter won't be removed from the docker
155+
// container.
156+
let pin: PathBuf = format!("/sys/fs/bpf/docker-{id}-device-filter").into();
157+
program.pin(&pin)?;
158+
159+
// Now our new filter is attached, detach all docker filters.
160+
for existing_program in ManuallyDrop::into_inner(existing_programs) {
161+
existing_program.detach()?;
162+
}
163+
164+
// `take_map` moves the map out of `bpf`, so it can be stored in the controller.
let map: HashMap<_, Device, u32> = bpf
165+
.take_map("DEVICE_PERM")
166+
.context("cannot find DEVICE_PERM map")?
167+
.try_into()?;
168+
169+
Ok(Self { map, pin })
170+
}
171+
}
172+
173+
impl DeviceAccessController for DeviceAccessControllerV2 {
174+
// Updates the permission entry for the device `(ty, major, minor)` in the BPF map:
// an empty `access` removes the entry, anything else stores the access bits.
fn set_permission(
175+
&mut self,
176+
ty: DeviceType,
177+
major: u32,
178+
minor: u32,
179+
access: Access,
180+
) -> Result<()> {
181+
let device = Device {
182+
device_type: ty as u32,
183+
major,
184+
minor,
185+
};
186+
if access.is_empty() {
187+
// NOTE(review): `remove` presumably fails when the device has no entry in the
// map — confirm whether callers can revoke access that was never granted.
self.map.remove(&device)?;
188+
} else {
189+
// The trailing `0` is the flags argument of the map insert.
self.map.insert(device, access.bits(), 0)?;
190+
}
191+
Ok(())
192+
}
193+
194+
// Re-opens the program from the pin path created in `new` and removes the pin.
fn stop(self: Box<Self>) -> Result<()> {
195+
CgroupDevice::from_pin(&self.pin)?.unpin()?;
196+
Ok(())
197+
}
99198
}

0 commit comments

Comments
 (0)