Skip to content

Commit 3bcfdc0

Browse files
committed
Support containers launched with user namespace
1 parent a6c0dad commit 3bcfdc0

File tree

3 files changed

+97
-19
lines changed

3 files changed

+97
-19
lines changed

shell.nix

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,8 @@ pkgs.mkShell {
88

99
# For llvm-objdump
1010
llvmPackages.bintools
11+
12+
# To aid testing
13+
runc
1114
];
1215
}

src/runc/container.rs

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ use std::fs::File;
22
use std::io::{BufRead, BufReader, Seek};
33
use std::path::Path;
44

5-
use anyhow::{bail, ensure, Context, Result};
6-
use rustix::fs::{FileType, Gid, Mode, Uid};
5+
use anyhow::{bail, Context, Result};
6+
use rustix::fs::{FileType, Mode};
77
use rustix::process::{Pid, Signal};
88
use tokio::io::unix::AsyncFd;
99
use tokio::io::Interest;
@@ -63,8 +63,10 @@ impl CgroupEventNotifier {
6363
}
6464

6565
pub struct Container {
66-
uid: Uid,
67-
gid: Gid,
66+
// Uid and gid of the primary container user.
67+
// Note that they're inside the user namespace (if any).
68+
uid: u32,
69+
gid: u32,
6870
pid: Pid,
6971
wait: tokio::sync::watch::Receiver<bool>,
7072
cgroup_device_filter: Mutex<Box<dyn DeviceAccessController + Send>>,
@@ -87,11 +89,9 @@ impl Container {
8789
Box::new(DeviceAccessControllerV2::new(&state.cgroup_paths.unified)?)
8890
};
8991

90-
ensure!(config.process.user.uid != u32::MAX && config.process.user.gid != u32::MAX);
91-
9292
Ok(Self {
93-
uid: unsafe { Uid::from_raw(config.process.user.uid) },
94-
gid: unsafe { Gid::from_raw(config.process.user.gid) },
93+
uid: config.process.user.uid,
94+
gid: config.process.user.gid,
9595
pid: Pid::from_raw(state.init_process_pid.try_into()?).context("Invalid PID")?,
9696
wait: recv,
9797
cgroup_device_filter: Mutex::new(cgroup_device_filter),
@@ -113,7 +113,8 @@ impl Container {
113113
}
114114

115115
pub async fn mknod(&self, node: &Path, (major, minor): (u32, u32)) -> Result<()> {
116-
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
116+
let ns = crate::util::namespace::MntNamespace::of_pid(self.pid)?;
117+
ns.enter(|| {
117118
if let Some(parent) = node.parent() {
118119
let _ = std::fs::create_dir_all(parent);
119120
}
@@ -125,9 +126,7 @@ impl Container {
125126
Mode::from(0o644),
126127
rustix::fs::makedev(major, minor),
127128
)?;
128-
if !self.uid.is_root() {
129-
rustix::fs::chown(node, Some(self.uid), Some(self.gid))?;
130-
}
129+
std::os::unix::fs::chown(node, Some(ns.uid(self.uid)?), Some(ns.gid(self.gid)?))?;
131130
Ok(())
132131
})?
133132
}

src/util/namespace.rs

Lines changed: 83 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,70 @@
11
use std::fs::File;
22
use std::os::fd::AsFd;
3+
use std::path::Path;
34

4-
use anyhow::Result;
5+
use anyhow::{Context, Result};
6+
use rustix::fs::{Gid, Uid};
57
use rustix::process::Pid;
6-
use rustix::thread::{LinkNameSpaceType, UnshareFlags};
8+
use rustix::thread::{CapabilitiesSecureBits, LinkNameSpaceType, UnshareFlags};
9+
10+
pub struct IdMap {
11+
map: Vec<(u32, u32, u32)>,
12+
}
13+
14+
impl IdMap {
15+
fn read(path: &Path) -> Result<Self> {
16+
Self::parse(&std::fs::read_to_string(path)?)
17+
}
18+
19+
fn parse(content: &str) -> Result<Self> {
20+
let mut map = Vec::new();
21+
for line in content.lines() {
22+
let mut words = line.split_ascii_whitespace();
23+
let inside = words.next().context("unexpected id_map")?.parse()?;
24+
let outside = words.next().context("unexpected id_map")?.parse()?;
25+
let count = words.next().context("unexpected id_map")?.parse()?;
26+
map.push((inside, outside, count));
27+
}
28+
Ok(Self { map })
29+
}
30+
31+
fn translate(&self, id: u32) -> Option<u32> {
32+
for &(inside, outside, count) in self.map.iter() {
33+
if (inside..inside.checked_add(count)?).contains(&id) {
34+
return (id - inside).checked_add(outside);
35+
}
36+
}
37+
None
38+
}
39+
}
740

841
pub struct MntNamespace {
9-
fd: File,
42+
mnt_fd: File,
43+
uid_map: IdMap,
44+
gid_map: IdMap,
1045
}
1146

1247
impl MntNamespace {
1348
/// Open the mount namespace of a process.
1449
pub fn of_pid(pid: Pid) -> Result<MntNamespace> {
15-
let path = format!("/proc/{}/ns/mnt", pid.as_raw_nonzero());
16-
let fd = File::open(path)?;
17-
Ok(MntNamespace { fd })
50+
let mnt_fd = File::open(format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()))?;
51+
let uid_map = IdMap::read(format!("/proc/{}/uid_map", pid.as_raw_nonzero()).as_ref())?;
52+
let gid_map = IdMap::read(format!("/proc/{}/gid_map", pid.as_raw_nonzero()).as_ref())?;
53+
Ok(MntNamespace {
54+
mnt_fd,
55+
uid_map,
56+
gid_map,
57+
})
58+
}
59+
60+
/// Translate a user ID inside the namespace into the corresponding UID outside of it.
61+
pub fn uid(&self, uid: u32) -> Result<u32> {
62+
Ok(self.uid_map.translate(uid).context("UID overflows")?)
63+
}
64+
65+
/// Translate a group ID inside the namespace into the corresponding GID outside of it.
66+
pub fn gid(&self, gid: u32) -> Result<u32> {
67+
Ok(self.gid_map.translate(gid).context("GID overflows")?)
1868
}
1969

2070
/// Enter the mount namespace.
@@ -30,9 +80,35 @@ impl MntNamespace {
3080

3181
// Switch this particular thread to the container's mount namespace.
3282
rustix::thread::move_into_link_name_space(
33-
self.fd.as_fd(),
83+
self.mnt_fd.as_fd(),
3484
Some(LinkNameSpaceType::Mount),
3585
)?;
86+
87+
// If a user namespace is used, we must act like the root user *inside*
88+
// the namespace to be able to create files properly (otherwise EOVERFLOW
89+
// will be returned when creating a file).
90+
//
91+
// Entering the user namespace turns out to be problematic.
92+
// The reason seems to be this line [1],
93+
// which means `CAP_MKNOD` capability of the *init* namespace is needed.
94+
// However, a task's associated security context is all relative to its current
95+
// user namespace [2], so once you enter a user namespace there's no way of getting
96+
// back `CAP_MKNOD` of the init namespace anymore.
97+
// (Yes this means that even if CAP_MKNOD is granted to the container, you cannot
98+
// create device nodes within it.)
99+
//
100+
// [1]: https://elixir.bootlin.com/linux/v6.11.1/source/fs/namei.c#L4073
101+
// [2]: https://elixir.bootlin.com/linux/v6.11.1/source/include/linux/cred.h#L111
102+
103+
// By default `setuid` will drop capabilities when transitioning from root
104+
// to non-root user. This bit prevents it so our code still has superpowers.
105+
rustix::thread::set_capabilities_secure_bits(
106+
CapabilitiesSecureBits::NO_SETUID_FIXUP,
107+
)?;
108+
109+
rustix::thread::set_thread_uid(unsafe { Uid::from_raw(self.uid(0)?) })?;
110+
rustix::thread::set_thread_gid(unsafe { Gid::from_raw(self.gid(0)?) })?;
111+
36112
Ok(f())
37113
})
38114
.join()

0 commit comments

Comments
 (0)