diff --git a/cmd/nvidia-cdi-hook/update-ldcache/container-root.go b/cmd/nvidia-cdi-hook/update-ldcache/container-root.go
new file mode 100644
index 000000000..f62b3ae9a
--- /dev/null
+++ b/cmd/nvidia-cdi-hook/update-ldcache/container-root.go
@@ -0,0 +1,76 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package ldcache
+
+import (
+ "os"
+ "path/filepath"
+
+ "github.com/moby/sys/symlink"
+)
+
+// A containerRoot represents the root filesystem of a container.
+type containerRoot string
+
+// hasPath checks whether the specified path exists in the root.
+func (r containerRoot) hasPath(path string) bool {
+ resolved, err := r.resolve(path)
+ if err != nil {
+ return false
+ }
+ if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) {
+ return false
+ }
+ return true
+}
+
+// globFiles matches the specified pattern in the root.
+// The files that match must be regular files.
+func (r containerRoot) globFiles(pattern string) ([]string, error) {
+ patternPath, err := r.resolve(pattern)
+ if err != nil {
+ return nil, err
+ }
+ matches, err := filepath.Glob(patternPath)
+ if err != nil {
+ return nil, err
+ }
+ var files []string
+ for _, match := range matches {
+ info, err := os.Lstat(match)
+ if err != nil {
+ return nil, err
+ }
+ // Ignore symlinks.
+ if info.Mode()&os.ModeSymlink != 0 {
+ continue
+ }
+ // Ignore directories.
+ if info.IsDir() {
+ continue
+ }
+ files = append(files, match)
+ }
+ return files, nil
+}
+
+// resolve returns the absolute path including root path.
+// Symlinks are resolved, but are guaranteed to resolve in the root.
+func (r containerRoot) resolve(path string) (string, error) {
+ absolute := filepath.Clean(filepath.Join(string(r), path))
+ return symlink.FollowSymlinkInScope(absolute, string(r))
+}
diff --git a/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go
new file mode 100644
index 000000000..01e0d8772
--- /dev/null
+++ b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go
@@ -0,0 +1,55 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package ldcache
+
+import (
+ "fmt"
+ "os"
+ "strconv"
+ "syscall"
+
+ "github.com/opencontainers/runc/libcontainer/dmz"
+)
+
+func SafeExec(path string, args []string, envv []string) error {
+ safeExe, err := cloneBinary(path)
+ if err != nil {
+ return fmt.Errorf("failed to clone binary: %w", err)
+ }
+ defer safeExe.Close()
+
+ exePath := "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
+
+ //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
+ return syscall.Exec(exePath, args, envv)
+}
+
+func cloneBinary(path string) (*os.File, error) {
+ exe, err := os.Open(path)
+ if err != nil {
+ return nil, fmt.Errorf("opening current binary: %w", err)
+ }
+ defer exe.Close()
+
+ stat, err := exe.Stat()
+ if err != nil {
+ return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
+ }
+ size := stat.Size()
+
+ return dmz.CloneBinary(exe, size, path, os.TempDir())
+}
diff --git a/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go
new file mode 100644
index 000000000..d11f20a7f
--- /dev/null
+++ b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go
@@ -0,0 +1,26 @@
+//go:build !linux
+// +build !linux
+
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package ldcache
+
+import "syscall"
+
+func SafeExec(path string, args []string, envv []string) error {
+ return syscall.Exec(path, args, envv)
+}
diff --git a/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go b/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go
index 7128aa6b5..abe100cc4 100644
--- a/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go
+++ b/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go
@@ -22,7 +22,6 @@ import (
"os"
"path/filepath"
"strings"
- "syscall"
"github.com/urfave/cli/v2"
@@ -131,9 +130,7 @@ func (m command) run(c *cli.Context, cfg *options) error {
// Explicitly specify using /etc/ld.so.conf since the host's ldconfig may
// be configured to use a different config file by default.
args = append(args, "-f", "/etc/ld.so.conf")
-
- //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
- return syscall.Exec(ldconfigPath, args, nil)
+ return SafeExec(ldconfigPath, args, nil)
}
type root string
diff --git a/go.mod b/go.mod
index 4766aea61..c577e53af 100644
--- a/go.mod
+++ b/go.mod
@@ -6,6 +6,7 @@ require (
github.com/NVIDIA/go-nvlib v0.7.1
github.com/NVIDIA/go-nvml v0.12.4-1
github.com/moby/sys/symlink v0.3.0
+ github.com/opencontainers/runc v1.2.5
github.com/opencontainers/runtime-spec v1.2.0
github.com/pelletier/go-toml v1.9.5
github.com/sirupsen/logrus v1.9.3
@@ -19,13 +20,13 @@ require (
require (
github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect
+ github.com/cyphar/filepath-securejoin v0.4.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
- github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
diff --git a/go.sum b/go.sum
index 228c2aa98..e1ec49d98 100644
--- a/go.sum
+++ b/go.sum
@@ -7,6 +7,8 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
+github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s=
+github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -31,6 +33,8 @@ github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34
github.com/moby/sys/symlink v0.3.0 h1:GZX89mEZ9u53f97npBy4Rc3vJKj7JBDj/PN2I22GrNU=
github.com/moby/sys/symlink v0.3.0/go.mod h1:3eNdhduHmYPcgsJtZXW1W4XUJdZGBIkttZ8xKqPUJq0=
github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
+github.com/opencontainers/runc v1.2.5 h1:8KAkq3Wrem8bApgOHyhRI/8IeLXIfmZ6Qaw6DNSLnA4=
+github.com/opencontainers/runc v1.2.5/go.mod h1:dOQeFo29xZKBNeRBI0B19mJtfHv68YgCTh1X+YphA+4=
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
new file mode 100644
index 000000000..ca0e3c62c
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
@@ -0,0 +1,256 @@
+# Changelog #
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/)
+and this project adheres to [Semantic Versioning](http://semver.org/).
+
+## [Unreleased] ##
+
+## [0.4.1] - 2025-01-28 ##
+
+### Fixed ###
+- The restrictions added for `root` paths passed to `SecureJoin` in 0.4.0 was
+ found to be too strict and caused some regressions when folks tried to
+ update, so this restriction has been relaxed to only return an error if the
+ path contains a `..` component. We still recommend users use `filepath.Clean`
+ (and even `filepath.EvalSymlinks`) on the `root` path they are using, but at
+ least you will no longer be punished for "trivial" unclean paths.
+
+## [0.4.0] - 2025-01-13 ##
+
+### Breaking ####
+- `SecureJoin(VFS)` will now return an error if the provided `root` is not a
+ `filepath.Clean`'d path.
+
+ While it is ultimately the responsibility of the caller to ensure the root is
+ a safe path to use, passing a path like `/symlink/..` as a root would result
+ in the `SecureJoin`'d path being placed in `/` even though `/symlink/..`
+ might be a different directory, and so we should more strongly discourage
+ such usage.
+
+ All major users of `securejoin.SecureJoin` already ensure that the paths they
+ provide are safe (and this is ultimately a question of user error), but
+ removing this foot-gun is probably a good idea. Of course, this is
+ necessarily a breaking API change (though we expect no real users to be
+ affected by it).
+
+ Thanks to [Erik Sjölund](https://github.com/eriksjolund), who initially
+ reported this issue as a possible security issue.
+
+- `MkdirAll` and `MkdirHandle` now take an `os.FileMode`-style mode argument
+ instead of a raw `unix.S_*`-style mode argument, which may cause compile-time
+ type errors depending on how you use `filepath-securejoin`. For most users,
+ there will be no change in behaviour aside from the type change (as the
+ bottom `0o777` bits are the same in both formats, and most users are probably
+ only using those bits).
+
+ However, if you were using `unix.S_ISVTX` to set the sticky bit with
+ `MkdirAll(Handle)` you will need to switch to `os.ModeSticky` otherwise you
+ will get a runtime error with this update. In addition, the error message you
+ will get from passing `unix.S_ISUID` and `unix.S_ISGID` will be different as
+ they are treated as invalid bits now (note that previously passing said bits
+ was also an error).
+
+## [0.3.6] - 2024-12-17 ##
+
+### Compatibility ###
+- The minimum Go version requirement for `filepath-securejoin` is now Go 1.18
+ (we use generics internally).
+
+ For reference, `filepath-securejoin@v0.3.0` somewhat-arbitrarily bumped the
+ Go version requirement to 1.21.
+
+ While we did make some use of Go 1.21 stdlib features (and in principle Go
+ versions <= 1.21 are no longer even supported by upstream anymore), some
+ downstreams have complained that the version bump has meant that they have to
+ do workarounds when backporting fixes that use the new `filepath-securejoin`
+ API onto old branches. This is not an ideal situation, but since using this
+ library is probably better for most downstreams than a hand-rolled
+ workaround, we now have compatibility shims that allow us to build on older
+ Go versions.
+- Lower minimum version requirement for `golang.org/x/sys` to `v0.18.0` (we
+ need the wrappers for `fsconfig(2)`), which should also make backporting
+ patches to older branches easier.
+
+## [0.3.5] - 2024-12-06 ##
+
+### Fixed ###
+- `MkdirAll` will now no longer return an `EEXIST` error if two racing
+ processes are creating the same directory. We will still verify that the path
+ is a directory, but this will avoid spurious errors when multiple threads or
+ programs are trying to `MkdirAll` the same path. opencontainers/runc#4543
+
+## [0.3.4] - 2024-10-09 ##
+
+### Fixed ###
+- Previously, some testing mocks we had resulted in us doing `import "testing"`
+ in non-`_test.go` code, which made some downstreams like Kubernetes unhappy.
+ This has been fixed. (#32)
+
+## [0.3.3] - 2024-09-30 ##
+
+### Fixed ###
+- The mode and owner verification logic in `MkdirAll` has been removed. This
+ was originally intended to protect against some theoretical attacks but upon
+ further consideration these protections don't actually buy us anything and
+ they were causing spurious errors with more complicated filesystem setups.
+- The "is the created directory empty" logic in `MkdirAll` has also been
+ removed. This was not causing us issues yet, but some pseudofilesystems (such
+ as `cgroup`) create non-empty directories and so this logic would've been
+ wrong for such cases.
+
+## [0.3.2] - 2024-09-13 ##
+
+### Changed ###
+- Passing the `S_ISUID` or `S_ISGID` modes to `MkdirAllInRoot` will now return
+ an explicit error saying that those bits are ignored by `mkdirat(2)`. In the
+ past a different error was returned, but since the silent ignoring behaviour
+ is codified in the man pages a more explicit error seems apt. While silently
+ ignoring these bits would be the most compatible option, it could lead to
+ users thinking their code sets these bits when it doesn't. Programs that need
+ to deal with compatibility can mask the bits themselves. (#23, #25)
+
+### Fixed ###
+- If a directory has `S_ISGID` set, then all child directories will have
+ `S_ISGID` set when created and a different gid will be used for any inode
+ created under the directory. Previously, the "expected owner and mode"
+ validation in `securejoin.MkdirAll` did not correctly handle this. We now
+ correctly handle this case. (#24, #25)
+
+## [0.3.1] - 2024-07-23 ##
+
+### Changed ###
+- By allowing `Open(at)InRoot` to opt-out of the extra work done by `MkdirAll`
+ to do the necessary "partial lookups", `Open(at)InRoot` now does less work
+ for both implementations (resulting in a many-fold decrease in the number of
+ operations for `openat2`, and a modest improvement for non-`openat2`) and is
+ far more guaranteed to match the correct `openat2(RESOLVE_IN_ROOT)`
+ behaviour.
+- We now use `readlinkat(fd, "")` where possible. For `Open(at)InRoot` this
+ effectively just means that we no longer risk getting spurious errors during
+ rename races. However, for our hardened procfs handler, this in theory should
+ prevent mount attacks from tricking us when doing magic-link readlinks (even
+ when using the unsafe host `/proc` handle). Unfortunately `Reopen` is still
+ potentially vulnerable to those kinds of somewhat-esoteric attacks.
+
+ Technically this [will only work on post-2.6.39 kernels][linux-readlinkat-emptypath]
+ but it seems incredibly unlikely anyone is using `filepath-securejoin` on a
+ pre-2011 kernel.
+
+### Fixed ###
+- Several improvements were made to the errors returned by `Open(at)InRoot` and
+ `MkdirAll` when dealing with invalid paths under the emulated (ie.
+ non-`openat2`) implementation. Previously, some paths would return the wrong
+ error (`ENOENT` when the last component was a non-directory), and other paths
+ would be returned as though they were acceptable (trailing-slash components
+ after a non-directory would be ignored by `Open(at)InRoot`).
+
+ These changes were done to match `openat2`'s behaviour and purely is a
+ consistency fix (most users are going to be using `openat2` anyway).
+
+[linux-readlinkat-emptypath]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=65cfc6722361570bfe255698d9cd4dccaf47570d
+
+## [0.3.0] - 2024-07-11 ##
+
+### Added ###
+- A new set of `*os.File`-based APIs have been added. These are adapted from
+ [libpathrs][] and we strongly suggest using them if possible (as they provide
+ far more protection against attacks than `SecureJoin`):
+
+ - `Open(at)InRoot` resolves a path inside a rootfs and returns an `*os.File`
+ handle to the path. Note that the handle returned is an `O_PATH` handle,
+ which cannot be used for reading or writing (as well as some other
+ operations -- [see open(2) for more details][open.2])
+
+ - `Reopen` takes an `O_PATH` file handle and safely re-opens it to upgrade
+ it to a regular handle. This can also be used with non-`O_PATH` handles,
+ but `O_PATH` is the most obvious application.
+
+ - `MkdirAll` is an implementation of `os.MkdirAll` that is safe to use to
+ create a directory tree within a rootfs.
+
+ As these are new APIs, they may change in the future. However, they should be
+ safe to start migrating to as we have extensive tests ensuring they behave
+ correctly and are safe against various races and other attacks.
+
+[libpathrs]: https://github.com/openSUSE/libpathrs
+[open.2]: https://www.man7.org/linux/man-pages/man2/open.2.html
+
+## [0.2.5] - 2024-05-03 ##
+
+### Changed ###
+- Some minor changes were made to how lexical components (like `..` and `.`)
+ are handled during path generation in `SecureJoin`. There is no behaviour
+ change as a result of this fix (the resulting paths are the same).
+
+### Fixed ###
+- The error returned when we hit a symlink loop now references the correct
+ path. (#10)
+
+## [0.2.4] - 2023-09-06 ##
+
+### Security ###
+- This release fixes a potential security issue in filepath-securejoin when
+ used on Windows ([GHSA-6xv5-86q9-7xr8][], which could be used to generate
+ paths outside of the provided rootfs in certain cases), as well as improving
+ the overall behaviour of filepath-securejoin when dealing with Windows paths
+ that contain volume names. Thanks to Paulo Gomes for discovering and fixing
+ these issues.
+
+### Fixed ###
+- Switch to GitHub Actions for CI so we can test on Windows as well as Linux
+ and MacOS.
+
+[GHSA-6xv5-86q9-7xr8]: https://github.com/advisories/GHSA-6xv5-86q9-7xr8
+
+## [0.2.3] - 2021-06-04 ##
+
+### Changed ###
+- Switch to Go 1.13-style `%w` error wrapping, letting us drop the dependency
+ on `github.com/pkg/errors`.
+
+## [0.2.2] - 2018-09-05 ##
+
+### Changed ###
+- Use `syscall.ELOOP` as the base error for symlink loops, rather than our own
+ (internal) error. This allows callers to more easily use `errors.Is` to check
+ for this case.
+
+## [0.2.1] - 2018-09-05 ##
+
+### Fixed ###
+- Use our own `IsNotExist` implementation, which lets us handle `ENOTDIR`
+ properly within `SecureJoin`.
+
+## [0.2.0] - 2017-07-19 ##
+
+We now have 100% test coverage!
+
+### Added ###
+- Add a `SecureJoinVFS` API that can be used for mocking (as we do in our new
+ tests) or for implementing custom handling of lookup operations (such as for
+ rootless containers, where work is necessary to access directories with weird
+ modes because we don't have `CAP_DAC_READ_SEARCH` or `CAP_DAC_OVERRIDE`).
+
+## 0.1.0 - 2017-07-19
+
+This is our first release of `github.com/cyphar/filepath-securejoin`,
+containing a full implementation with a coverage of 93.5% (the only missing
+cases are the error cases, which are hard to mocktest at the moment).
+
+[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...HEAD
+[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1
+[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0
+[0.3.6]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.5...v0.3.6
+[0.3.5]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.4...v0.3.5
+[0.3.4]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.3...v0.3.4
+[0.3.3]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.2...v0.3.3
+[0.3.2]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.1...v0.3.2
+[0.3.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.0...v0.3.1
+[0.3.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.5...v0.3.0
+[0.2.5]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.4...v0.2.5
+[0.2.4]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.3...v0.2.4
+[0.2.3]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.2...v0.2.3
+[0.2.2]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.1...v0.2.2
+[0.2.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.0...v0.2.1
+[0.2.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.1.0...v0.2.0
diff --git a/vendor/github.com/cyphar/filepath-securejoin/LICENSE b/vendor/github.com/cyphar/filepath-securejoin/LICENSE
new file mode 100644
index 000000000..cb1ab88da
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/LICENSE
@@ -0,0 +1,28 @@
+Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+Copyright (C) 2017-2024 SUSE LLC. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/cyphar/filepath-securejoin/README.md b/vendor/github.com/cyphar/filepath-securejoin/README.md
new file mode 100644
index 000000000..eaeb53fcd
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/README.md
@@ -0,0 +1,169 @@
+## `filepath-securejoin` ##
+
+[data:image/s3,"s3://crabby-images/12dd2/12dd2fc9cee6fada97fdab1a16f7bff619458a00" alt="Go Documentation"](https://pkg.go.dev/github.com/cyphar/filepath-securejoin)
+[data:image/s3,"s3://crabby-images/8e42c/8e42c4394d1a9d15aaf0efeb38641b349e721126" alt="Build Status"](https://github.com/cyphar/filepath-securejoin/actions/workflows/ci.yml)
+
+### Old API ###
+
+This library was originally just an implementation of `SecureJoin` which was
+[intended to be included in the Go standard library][go#20126] as a safer
+`filepath.Join` that would restrict the path lookup to be inside a root
+directory.
+
+The implementation was based on code that existed in several container
+runtimes. Unfortunately, this API is **fundamentally unsafe** against attackers
+that can modify path components after `SecureJoin` returns and before the
+caller uses the path, allowing for some fairly trivial TOCTOU attacks.
+
+`SecureJoin` (and `SecureJoinVFS`) are still provided by this library to
+support legacy users, but new users are strongly suggested to avoid using
+`SecureJoin` and instead use the [new api](#new-api) or switch to
+[libpathrs][libpathrs].
+
+With the above limitations in mind, this library guarantees the following:
+
+* If no error is set, the resulting string **must** be a child path of
+ `root` and will not contain any symlink path components (they will all be
+ expanded).
+
+* When expanding symlinks, all symlink path components **must** be resolved
+ relative to the provided root. In particular, this can be considered a
+ userspace implementation of how `chroot(2)` operates on file paths. Note that
+ these symlinks will **not** be expanded lexically (`filepath.Clean` is not
+ called on the input before processing).
+
+* Non-existent path components are unaffected by `SecureJoin` (similar to
+ `filepath.EvalSymlinks`'s semantics).
+
+* The returned path will always be `filepath.Clean`ed and thus not contain any
+ `..` components.
+
+A (trivial) implementation of this function on GNU/Linux systems could be done
+with the following (note that this requires root privileges and is far more
+opaque than the implementation in this library, and also requires that
+`readlink` is inside the `root` path and is trustworthy):
+
+```go
+package securejoin
+
+import (
+ "os/exec"
+ "path/filepath"
+)
+
+func SecureJoin(root, unsafePath string) (string, error) {
+ unsafePath = string(filepath.Separator) + unsafePath
+ cmd := exec.Command("chroot", root,
+ "readlink", "--canonicalize-missing", "--no-newline", unsafePath)
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return "", err
+ }
+ expanded := string(output)
+ return filepath.Join(root, expanded), nil
+}
+```
+
+[libpathrs]: https://github.com/openSUSE/libpathrs
+[go#20126]: https://github.com/golang/go/issues/20126
+
+### New API ###
+
+While we recommend users switch to [libpathrs][libpathrs] as soon as it has a
+stable release, some methods implemented by libpathrs have been ported to this
+library to ease the transition. These APIs are only supported on Linux.
+
+These APIs are implemented such that `filepath-securejoin` will
+opportunistically use certain newer kernel APIs that make these operations far
+more secure. In particular:
+
+* All of the lookup operations will use [`openat2`][openat2.2] on new enough
+ kernels (Linux 5.6 or later) to restrict lookups through magic-links and
+ bind-mounts (for certain operations) and to make use of `RESOLVE_IN_ROOT` to
+ efficiently resolve symlinks within a rootfs.
+
+* The APIs provide hardening against a malicious `/proc` mount to either detect
+ or avoid being tricked by a `/proc` that is not legitimate. This is done
+ using [`openat2`][openat2.2] for all users, and privileged users will also be
+ further protected by using [`fsopen`][fsopen.2] and [`open_tree`][open_tree.2]
+ (Linux 5.2 or later).
+
+[openat2.2]: https://www.man7.org/linux/man-pages/man2/openat2.2.html
+[fsopen.2]: https://github.com/brauner/man-pages-md/blob/main/fsopen.md
+[open_tree.2]: https://github.com/brauner/man-pages-md/blob/main/open_tree.md
+
+#### `OpenInRoot` ####
+
+```go
+func OpenInRoot(root, unsafePath string) (*os.File, error)
+func OpenatInRoot(root *os.File, unsafePath string) (*os.File, error)
+func Reopen(handle *os.File, flags int) (*os.File, error)
+```
+
+`OpenInRoot` is a much safer version of
+
+```go
+path, err := securejoin.SecureJoin(root, unsafePath)
+file, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC)
+```
+
+that protects against various race attacks that could lead to serious security
+issues, depending on the application. Note that the returned `*os.File` is an
+`O_PATH` file descriptor, which is quite restricted. Callers will probably need
+to use `Reopen` to get a more usable handle (this split is done to provide
+useful features like PTY spawning and to avoid users accidentally opening bad
+inodes that could cause a DoS).
+
+Callers need to be careful in how they use the returned `*os.File`. Usually it
+is only safe to operate on the handle directly, and it is very easy to create a
+security issue. [libpathrs][libpathrs] provides far more helpers to make using
+these handles safer -- there is currently no plan to port them to
+`filepath-securejoin`.
+
+`OpenatInRoot` is like `OpenInRoot` except that the root is provided using an
+`*os.File`. This allows you to ensure that multiple `OpenatInRoot` (or
+`MkdirAllHandle`) calls are operating on the same rootfs.
+
+> **NOTE**: Unlike `SecureJoin`, `OpenInRoot` will error out as soon as it hits
+> a dangling symlink or non-existent path. This is in contrast to `SecureJoin`
+> which treated non-existent components as though they were real directories,
+> and would allow for partial resolution of dangling symlinks. These behaviours
+> are at odds with how Linux treats non-existent paths and dangling symlinks,
+> and so these are no longer allowed.
+
+#### `MkdirAll` ####
+
+```go
+func MkdirAll(root, unsafePath string, mode int) error
+func MkdirAllHandle(root *os.File, unsafePath string, mode int) (*os.File, error)
+```
+
+`MkdirAll` is a much safer version of
+
+```go
+path, err := securejoin.SecureJoin(root, unsafePath)
+err = os.MkdirAll(path, mode)
+```
+
+that protects against the same kinds of races that `OpenInRoot` protects
+against.
+
+`MkdirAllHandle` is like `MkdirAll` except that the root is provided using an
+`*os.File` (the reason for this is the same as with `OpenatInRoot`) and an
+`*os.File` of the final created directory is returned (this directory is
+guaranteed to be effectively identical to the directory created by
+`MkdirAllHandle`, which is not possible to ensure by just using `OpenatInRoot`
+after `MkdirAll`).
+
+> **NOTE**: Unlike `SecureJoin`, `MkdirAll` will error out as soon as it hits
+> a dangling symlink or non-existent path. This is in contrast to `SecureJoin`
+> which treated non-existent components as though they were real directories,
+> and would allow for partial resolution of dangling symlinks. These behaviours
+> are at odds with how Linux treats non-existent paths and dangling symlinks,
+> and so these are no longer allowed. This means that `MkdirAll` will not
+> create non-existent directories referenced by a dangling symlink.
+
+### License ###
+
+The license of this project is the same as Go, which is a BSD 3-clause license
+available in the `LICENSE` file.
diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION
new file mode 100644
index 000000000..267577d47
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION
@@ -0,0 +1 @@
+0.4.1
diff --git a/vendor/github.com/cyphar/filepath-securejoin/doc.go b/vendor/github.com/cyphar/filepath-securejoin/doc.go
new file mode 100644
index 000000000..1ec7d065e
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/doc.go
@@ -0,0 +1,39 @@
+// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+// Copyright (C) 2017-2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package securejoin implements a set of helpers to make it easier to write Go
+// code that is safe against symlink-related escape attacks. The primary idea
+// is to let you resolve a path within a rootfs directory as if the rootfs was
+// a chroot.
+//
+// securejoin has two APIs, a "legacy" API and a "modern" API.
+//
+// The legacy API is [SecureJoin] and [SecureJoinVFS]. These methods are
+// **not** safe against race conditions where an attacker changes the
+// filesystem after (or during) the [SecureJoin] operation.
+//
+// The new API is made up of [OpenInRoot] and [MkdirAll] (and derived
+// functions). These are safe against racing attackers and have several other
+// protections that are not provided by the legacy API. There are many more
+// operations that most programs expect to be able to do safely, but we do not
+// provide explicit support for them because we want to encourage users to
+// switch to [libpathrs](https://github.com/openSUSE/libpathrs) which is a
+// cross-language next-generation library that is entirely designed around
+// operating on paths safely.
+//
+// securejoin has been used by several container runtimes (Docker, runc,
+// Kubernetes, etc) for quite a few years as a de-facto standard for operating
+// on container filesystem paths "safely". However, most users still use the
+// legacy API which is unsafe against various attacks (there is a fairly long
+// history of CVEs in dependent as a result). Users should switch to the modern
+// API as soon as possible (or even better, switch to libpathrs).
+//
+// This project was initially intended to be included in the Go standard
+// library, but [it was rejected](https://go.dev/issue/20126). There is now a
+// [new Go proposal](https://go.dev/issue/67002) for a safe path resolution API
+// that shares some of the goals of filepath-securejoin. However, that design
+// is intended to work like `openat2(RESOLVE_BENEATH)` which does not fit the
+// usecase of container runtimes and most system tools.
+package securejoin
diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_go120.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_go120.go
new file mode 100644
index 000000000..42452bbf9
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_go120.go
@@ -0,0 +1,18 @@
+//go:build linux && go1.20
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "fmt"
+)
+
+// wrapBaseError is a helper that is equivalent to fmt.Errorf("%w: %w"), except
+// that on pre-1.20 Go versions only errors.Is() works properly (errors.Unwrap)
+// is only guaranteed to give you baseErr.
+func wrapBaseError(baseErr, extraErr error) error {
+ return fmt.Errorf("%w: %w", extraErr, baseErr)
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_unsupported.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_unsupported.go
new file mode 100644
index 000000000..e7adca3fd
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_unsupported.go
@@ -0,0 +1,38 @@
+//go:build linux && !go1.20
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "fmt"
+)
+
+type wrappedError struct {
+ inner error
+ isError error
+}
+
+func (err wrappedError) Is(target error) bool {
+ return err.isError == target
+}
+
+func (err wrappedError) Unwrap() error {
+ return err.inner
+}
+
+func (err wrappedError) Error() string {
+ return fmt.Sprintf("%v: %v", err.isError, err.inner)
+}
+
+// wrapBaseError is a helper that is equivalent to fmt.Errorf("%w: %w"), except
+// that on pre-1.20 Go versions only errors.Is() works properly (errors.Unwrap)
+// is only guaranteed to give you baseErr.
+func wrapBaseError(baseErr, extraErr error) error {
+ return wrappedError{
+ inner: baseErr,
+ isError: extraErr,
+ }
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_go121.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_go121.go
new file mode 100644
index 000000000..ddd6fa9a4
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_go121.go
@@ -0,0 +1,32 @@
+//go:build linux && go1.21
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "slices"
+ "sync"
+)
+
+func slices_DeleteFunc[S ~[]E, E any](slice S, delFn func(E) bool) S {
+ return slices.DeleteFunc(slice, delFn)
+}
+
+func slices_Contains[S ~[]E, E comparable](slice S, val E) bool {
+ return slices.Contains(slice, val)
+}
+
+func slices_Clone[S ~[]E, E any](slice S) S {
+ return slices.Clone(slice)
+}
+
+func sync_OnceValue[T any](f func() T) func() T {
+ return sync.OnceValue(f)
+}
+
+func sync_OnceValues[T1, T2 any](f func() (T1, T2)) func() (T1, T2) {
+ return sync.OnceValues(f)
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_unsupported.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_unsupported.go
new file mode 100644
index 000000000..f1e6fe7e7
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_unsupported.go
@@ -0,0 +1,124 @@
+//go:build linux && !go1.21
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "sync"
+)
+
+// These are very minimal implementations of functions that appear in Go 1.21's
+// stdlib, included so that we can build on older Go versions. Most are
+// borrowed directly from the stdlib, and a few are modified to be "obviously
+// correct" without needing to copy too many other helpers.
+
+// clearSlice is equivalent to the builtin clear from Go 1.21.
+// Copied from the Go 1.24 stdlib implementation.
+func clearSlice[S ~[]E, E any](slice S) {
+ var zero E
+ for i := range slice {
+ slice[i] = zero
+ }
+}
+
+// Copied from the Go 1.24 stdlib implementation.
+func slices_IndexFunc[S ~[]E, E any](s S, f func(E) bool) int {
+ for i := range s {
+ if f(s[i]) {
+ return i
+ }
+ }
+ return -1
+}
+
+// Copied from the Go 1.24 stdlib implementation.
+func slices_DeleteFunc[S ~[]E, E any](s S, del func(E) bool) S {
+ i := slices_IndexFunc(s, del)
+ if i == -1 {
+ return s
+ }
+ // Don't start copying elements until we find one to delete.
+ for j := i + 1; j < len(s); j++ {
+ if v := s[j]; !del(v) {
+ s[i] = v
+ i++
+ }
+ }
+ clearSlice(s[i:]) // zero/nil out the obsolete elements, for GC
+ return s[:i]
+}
+
+// Similar to the stdlib slices.Contains, except that we don't have
+// slices.Index so we need to use slices.IndexFunc for this non-Func helper.
+func slices_Contains[S ~[]E, E comparable](s S, v E) bool {
+ return slices_IndexFunc(s, func(e E) bool { return e == v }) >= 0
+}
+
+// Copied from the Go 1.24 stdlib implementation.
+func slices_Clone[S ~[]E, E any](s S) S {
+ // Preserve nil in case it matters.
+ if s == nil {
+ return nil
+ }
+ return append(S([]E{}), s...)
+}
+
+// Copied from the Go 1.24 stdlib implementation.
+func sync_OnceValue[T any](f func() T) func() T {
+ var (
+ once sync.Once
+ valid bool
+ p any
+ result T
+ )
+ g := func() {
+ defer func() {
+ p = recover()
+ if !valid {
+ panic(p)
+ }
+ }()
+ result = f()
+ f = nil
+ valid = true
+ }
+ return func() T {
+ once.Do(g)
+ if !valid {
+ panic(p)
+ }
+ return result
+ }
+}
+
+// Copied from the Go 1.24 stdlib implementation.
+func sync_OnceValues[T1, T2 any](f func() (T1, T2)) func() (T1, T2) {
+ var (
+ once sync.Once
+ valid bool
+ p any
+ r1 T1
+ r2 T2
+ )
+ g := func() {
+ defer func() {
+ p = recover()
+ if !valid {
+ panic(p)
+ }
+ }()
+ r1, r2 = f()
+ f = nil
+ valid = true
+ }
+ return func() (T1, T2) {
+ once.Do(g)
+ if !valid {
+ panic(p)
+ }
+ return r1, r2
+ }
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/join.go b/vendor/github.com/cyphar/filepath-securejoin/join.go
new file mode 100644
index 000000000..e6634d477
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/join.go
@@ -0,0 +1,166 @@
+// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+// Copyright (C) 2017-2025 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "errors"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+)
+
+const maxSymlinkLimit = 255
+
+// IsNotExist tells you if err is an error that implies that either the path
+// accessed does not exist (or path components don't exist). This is
+// effectively a more broad version of [os.IsNotExist].
+func IsNotExist(err error) bool {
+ // Check that it's not actually an ENOTDIR, which in some cases is a more
+ // convoluted case of ENOENT (usually involving weird paths).
+ return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOTDIR) || errors.Is(err, syscall.ENOENT)
+}
+
+// errUnsafeRoot is returned if the user provides SecureJoinVFS with a path
+// that contains ".." components.
+var errUnsafeRoot = errors.New("root path provided to SecureJoin contains '..' components")
+
+// stripVolume just gets rid of the Windows volume included in a path. Based on
+// some godbolt tests, the Go compiler is smart enough to make this a no-op on
+// Linux.
+func stripVolume(path string) string {
+ return path[len(filepath.VolumeName(path)):]
+}
+
+// hasDotDot checks if the path contains ".." components in a platform-agnostic
+// way.
+func hasDotDot(path string) bool {
+ // If we are on Windows, strip any volume letters. It turns out that
+ // C:..\foo may (or may not) be a valid pathname and we need to handle that
+ // leading "..".
+ path = stripVolume(path)
+ // Look for "/../" in the path, but we need to handle leading and trailing
+ // ".."s by adding separators. Doing this with filepath.Separator is ugly
+ // so just convert to Unix-style "/" first.
+ path = filepath.ToSlash(path)
+ return strings.Contains("/"+path+"/", "/../")
+}
+
+// SecureJoinVFS joins the two given path components (similar to [filepath.Join]) except
+// that the returned path is guaranteed to be scoped inside the provided root
+// path (when evaluated). Any symbolic links in the path are evaluated with the
+// given root treated as the root of the filesystem, similar to a chroot. The
+// filesystem state is evaluated through the given [VFS] interface (if nil, the
+// standard [os].* family of functions are used).
+//
+// Note that the guarantees provided by this function only apply if the path
+// components in the returned string are not modified (in other words are not
+// replaced with symlinks on the filesystem) after this function has returned.
+// Such a symlink race is necessarily out-of-scope of SecureJoinVFS.
+//
+// NOTE: Due to the above limitation, Linux users are strongly encouraged to
+// use [OpenInRoot] instead, which does safely protect against these kinds of
+// attacks. There is no way to solve this problem with SecureJoinVFS because
+// the API is fundamentally wrong (you cannot return a "safe" path string and
+// guarantee it won't be modified afterwards).
+//
+// Volume names in unsafePath are always discarded, regardless if they are
+// provided via direct input or when evaluating symlinks. Therefore:
+//
+// "C:\Temp" + "D:\path\to\file.txt" results in "C:\Temp\path\to\file.txt"
+//
+// If the provided root is not [filepath.Clean] then an error will be returned,
+// as such root paths are bordering on somewhat unsafe and using such paths is
+// not best practice. We also strongly suggest that any root path is first
+// fully resolved using [filepath.EvalSymlinks] or otherwise constructed to
+// avoid containing symlink components. Of course, the root also *must not* be
+// attacker-controlled.
+func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) {
+ // The root path must not contain ".." components, otherwise when we join
+ // the subpath we will end up with a weird path. We could work around this
+ // in other ways but users shouldn't be giving us non-lexical root paths in
+ // the first place.
+ if hasDotDot(root) {
+ return "", errUnsafeRoot
+ }
+
+ // Use the os.* VFS implementation if none was specified.
+ if vfs == nil {
+ vfs = osVFS{}
+ }
+
+ unsafePath = filepath.FromSlash(unsafePath)
+ var (
+ currentPath string
+ remainingPath = unsafePath
+ linksWalked int
+ )
+ for remainingPath != "" {
+ // On Windows, if we managed to end up at a path referencing a volume,
+ // drop the volume to make sure we don't end up with broken paths or
+ // escaping the root volume.
+ remainingPath = stripVolume(remainingPath)
+
+ // Get the next path component.
+ var part string
+ if i := strings.IndexRune(remainingPath, filepath.Separator); i == -1 {
+ part, remainingPath = remainingPath, ""
+ } else {
+ part, remainingPath = remainingPath[:i], remainingPath[i+1:]
+ }
+
+ // Apply the component lexically to the path we are building.
+ // currentPath does not contain any symlinks, and we are lexically
+ // dealing with a single component, so it's okay to do a filepath.Clean
+ // here.
+ nextPath := filepath.Join(string(filepath.Separator), currentPath, part)
+ if nextPath == string(filepath.Separator) {
+ currentPath = ""
+ continue
+ }
+ fullPath := root + string(filepath.Separator) + nextPath
+
+ // Figure out whether the path is a symlink.
+ fi, err := vfs.Lstat(fullPath)
+ if err != nil && !IsNotExist(err) {
+ return "", err
+ }
+ // Treat non-existent path components the same as non-symlinks (we
+ // can't do any better here).
+ if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 {
+ currentPath = nextPath
+ continue
+ }
+
+ // It's a symlink, so get its contents and expand it by prepending it
+ // to the yet-unparsed path.
+ linksWalked++
+ if linksWalked > maxSymlinkLimit {
+ return "", &os.PathError{Op: "SecureJoin", Path: root + string(filepath.Separator) + unsafePath, Err: syscall.ELOOP}
+ }
+
+ dest, err := vfs.Readlink(fullPath)
+ if err != nil {
+ return "", err
+ }
+ remainingPath = dest + string(filepath.Separator) + remainingPath
+ // Absolute symlinks reset any work we've already done.
+ if filepath.IsAbs(dest) {
+ currentPath = ""
+ }
+ }
+
+ // There should be no lexical components like ".." left in the path here,
+ // but for safety clean up the path before joining it to the root.
+ finalPath := filepath.Join(string(filepath.Separator), currentPath)
+ return filepath.Join(root, finalPath), nil
+}
+
+// SecureJoin is a wrapper around [SecureJoinVFS] that just uses the [os].* library
+// of functions as the [VFS]. If in doubt, use this function over [SecureJoinVFS].
+func SecureJoin(root, unsafePath string) (string, error) {
+ return SecureJoinVFS(root, unsafePath, nil)
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/lookup_linux.go b/vendor/github.com/cyphar/filepath-securejoin/lookup_linux.go
new file mode 100644
index 000000000..be81e498d
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/lookup_linux.go
@@ -0,0 +1,388 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path"
+ "path/filepath"
+ "strings"
+
+ "golang.org/x/sys/unix"
+)
+
+type symlinkStackEntry struct {
+ // (dir, remainingPath) is what we would've returned if the link didn't
+ // exist. This matches what openat2(RESOLVE_IN_ROOT) would return in
+ // this case.
+ dir *os.File
+ remainingPath string
+ // linkUnwalked is the remaining path components from the original
+ // Readlink which we have yet to walk. When this slice is empty, we
+ // drop the link from the stack.
+ linkUnwalked []string
+}
+
+func (se symlinkStackEntry) String() string {
+ return fmt.Sprintf("<%s>/%s [->%s]", se.dir.Name(), se.remainingPath, strings.Join(se.linkUnwalked, "/"))
+}
+
+func (se symlinkStackEntry) Close() {
+ _ = se.dir.Close()
+}
+
+type symlinkStack []*symlinkStackEntry
+
+func (s *symlinkStack) IsEmpty() bool {
+ return s == nil || len(*s) == 0
+}
+
+func (s *symlinkStack) Close() {
+ if s != nil {
+ for _, link := range *s {
+ link.Close()
+ }
+ // TODO: Switch to clear once we switch to Go 1.21.
+ *s = nil
+ }
+}
+
+var (
+ errEmptyStack = errors.New("[internal] stack is empty")
+ errBrokenSymlinkStack = errors.New("[internal error] broken symlink stack")
+)
+
+func (s *symlinkStack) popPart(part string) error {
+ if s == nil || s.IsEmpty() {
+ // If there is nothing in the symlink stack, then the part was from the
+ // real path provided by the user, and this is a no-op.
+ return errEmptyStack
+ }
+ if part == "." {
+ // "." components are no-ops -- we drop them when doing SwapLink.
+ return nil
+ }
+
+ tailEntry := (*s)[len(*s)-1]
+
+ // Double-check that we are popping the component we expect.
+ if len(tailEntry.linkUnwalked) == 0 {
+ return fmt.Errorf("%w: trying to pop component %q of empty stack entry %s", errBrokenSymlinkStack, part, tailEntry)
+ }
+ headPart := tailEntry.linkUnwalked[0]
+ if headPart != part {
+ return fmt.Errorf("%w: trying to pop component %q but the last stack entry is %s (%q)", errBrokenSymlinkStack, part, tailEntry, headPart)
+ }
+
+ // Drop the component, but keep the entry around in case we are dealing
+ // with a "tail-chained" symlink.
+ tailEntry.linkUnwalked = tailEntry.linkUnwalked[1:]
+ return nil
+}
+
+func (s *symlinkStack) PopPart(part string) error {
+ if err := s.popPart(part); err != nil {
+ if errors.Is(err, errEmptyStack) {
+ // Skip empty stacks.
+ err = nil
+ }
+ return err
+ }
+
+ // Clean up any of the trailing stack entries that are empty.
+ for lastGood := len(*s) - 1; lastGood >= 0; lastGood-- {
+ entry := (*s)[lastGood]
+ if len(entry.linkUnwalked) > 0 {
+ break
+ }
+ entry.Close()
+ (*s) = (*s)[:lastGood]
+ }
+ return nil
+}
+
+func (s *symlinkStack) push(dir *os.File, remainingPath, linkTarget string) error {
+ if s == nil {
+ return nil
+ }
+ // Split the link target and clean up any "" parts.
+ linkTargetParts := slices_DeleteFunc(
+ strings.Split(linkTarget, "/"),
+ func(part string) bool { return part == "" || part == "." })
+
+ // Copy the directory so the caller doesn't close our copy.
+ dirCopy, err := dupFile(dir)
+ if err != nil {
+ return err
+ }
+
+ // Add to the stack.
+ *s = append(*s, &symlinkStackEntry{
+ dir: dirCopy,
+ remainingPath: remainingPath,
+ linkUnwalked: linkTargetParts,
+ })
+ return nil
+}
+
+func (s *symlinkStack) SwapLink(linkPart string, dir *os.File, remainingPath, linkTarget string) error {
+ // If we are currently inside a symlink resolution, remove the symlink
+ // component from the last symlink entry, but don't remove the entry even
+ // if it's empty. If we are a "tail-chained" symlink (a trailing symlink we
+ // hit during a symlink resolution) we need to keep the old symlink until
+ // we finish the resolution.
+ if err := s.popPart(linkPart); err != nil {
+ if !errors.Is(err, errEmptyStack) {
+ return err
+ }
+ // Push the component regardless of whether the stack was empty.
+ }
+ return s.push(dir, remainingPath, linkTarget)
+}
+
+func (s *symlinkStack) PopTopSymlink() (*os.File, string, bool) {
+ if s == nil || s.IsEmpty() {
+ return nil, "", false
+ }
+ tailEntry := (*s)[0]
+ *s = (*s)[1:]
+ return tailEntry.dir, tailEntry.remainingPath, true
+}
+
+// partialLookupInRoot tries to lookup as much of the request path as possible
+// within the provided root (a-la RESOLVE_IN_ROOT) and opens the final existing
+// component of the requested path, returning a file handle to the final
+// existing component and a string containing the remaining path components.
+func partialLookupInRoot(root *os.File, unsafePath string) (*os.File, string, error) {
+ return lookupInRoot(root, unsafePath, true)
+}
+
+func completeLookupInRoot(root *os.File, unsafePath string) (*os.File, error) {
+ handle, remainingPath, err := lookupInRoot(root, unsafePath, false)
+ if remainingPath != "" && err == nil {
+ // should never happen
+ err = fmt.Errorf("[bug] non-empty remaining path when doing a non-partial lookup: %q", remainingPath)
+ }
+ // lookupInRoot(partial=false) will always close the handle if an error is
+ // returned, so no need to double-check here.
+ return handle, err
+}
+
+func lookupInRoot(root *os.File, unsafePath string, partial bool) (Handle *os.File, _ string, _ error) {
+ unsafePath = filepath.ToSlash(unsafePath) // noop
+
+ // This is very similar to SecureJoin, except that we operate on the
+ // components using file descriptors. We then return the last component we
+ // managed open, along with the remaining path components not opened.
+
+ // Try to use openat2 if possible.
+ if hasOpenat2() {
+ return lookupOpenat2(root, unsafePath, partial)
+ }
+
+ // Get the "actual" root path from /proc/self/fd. This is necessary if the
+ // root is some magic-link like /proc/$pid/root, in which case we want to
+ // make sure when we do checkProcSelfFdPath that we are using the correct
+ // root path.
+ logicalRootPath, err := procSelfFdReadlink(root)
+ if err != nil {
+ return nil, "", fmt.Errorf("get real root path: %w", err)
+ }
+
+ currentDir, err := dupFile(root)
+ if err != nil {
+ return nil, "", fmt.Errorf("clone root fd: %w", err)
+ }
+ defer func() {
+ // If a handle is not returned, close the internal handle.
+ if Handle == nil {
+ _ = currentDir.Close()
+ }
+ }()
+
+ // symlinkStack is used to emulate how openat2(RESOLVE_IN_ROOT) treats
+ // dangling symlinks. If we hit a non-existent path while resolving a
+ // symlink, we need to return the (dir, remainingPath) that we had when we
+ // hit the symlink (treating the symlink as though it were a regular file).
+ // The set of (dir, remainingPath) sets is stored within the symlinkStack
+ // and we add and remove parts when we hit symlink and non-symlink
+ // components respectively. We need a stack because of recursive symlinks
+ // (symlinks that contain symlink components in their target).
+ //
+ // Note that the stack is ONLY used for book-keeping. All of the actual
+ // path walking logic is still based on currentPath/remainingPath and
+ // currentDir (as in SecureJoin).
+ var symStack *symlinkStack
+ if partial {
+ symStack = new(symlinkStack)
+ defer symStack.Close()
+ }
+
+ var (
+ linksWalked int
+ currentPath string
+ remainingPath = unsafePath
+ )
+ for remainingPath != "" {
+ // Save the current remaining path so if the part is not real we can
+ // return the path including the component.
+ oldRemainingPath := remainingPath
+
+ // Get the next path component.
+ var part string
+ if i := strings.IndexByte(remainingPath, '/'); i == -1 {
+ part, remainingPath = remainingPath, ""
+ } else {
+ part, remainingPath = remainingPath[:i], remainingPath[i+1:]
+ }
+ // If we hit an empty component, we need to treat it as though it is
+ // "." so that trailing "/" and "//" components on a non-directory
+ // correctly return the right error code.
+ if part == "" {
+ part = "."
+ }
+
+ // Apply the component lexically to the path we are building.
+ // currentPath does not contain any symlinks, and we are lexically
+ // dealing with a single component, so it's okay to do a filepath.Clean
+ // here.
+ nextPath := path.Join("/", currentPath, part)
+ // If we logically hit the root, just clone the root rather than
+ // opening the part and doing all of the other checks.
+ if nextPath == "/" {
+ if err := symStack.PopPart(part); err != nil {
+ return nil, "", fmt.Errorf("walking into root with part %q failed: %w", part, err)
+ }
+ // Jump to root.
+ rootClone, err := dupFile(root)
+ if err != nil {
+ return nil, "", fmt.Errorf("clone root fd: %w", err)
+ }
+ _ = currentDir.Close()
+ currentDir = rootClone
+ currentPath = nextPath
+ continue
+ }
+
+ // Try to open the next component.
+ nextDir, err := openatFile(currentDir, part, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+ switch {
+ case err == nil:
+ st, err := nextDir.Stat()
+ if err != nil {
+ _ = nextDir.Close()
+ return nil, "", fmt.Errorf("stat component %q: %w", part, err)
+ }
+
+ switch st.Mode() & os.ModeType {
+ case os.ModeSymlink:
+ // readlinkat implies AT_EMPTY_PATH since Linux 2.6.39. See
+ // Linux commit 65cfc6722361 ("readlinkat(), fchownat() and
+ // fstatat() with empty relative pathnames").
+ linkDest, err := readlinkatFile(nextDir, "")
+ // We don't need the handle anymore.
+ _ = nextDir.Close()
+ if err != nil {
+ return nil, "", err
+ }
+
+ linksWalked++
+ if linksWalked > maxSymlinkLimit {
+ return nil, "", &os.PathError{Op: "securejoin.lookupInRoot", Path: logicalRootPath + "/" + unsafePath, Err: unix.ELOOP}
+ }
+
+ // Swap out the symlink's component for the link entry itself.
+ if err := symStack.SwapLink(part, currentDir, oldRemainingPath, linkDest); err != nil {
+ return nil, "", fmt.Errorf("walking into symlink %q failed: push symlink: %w", part, err)
+ }
+
+ // Update our logical remaining path.
+ remainingPath = linkDest + "/" + remainingPath
+ // Absolute symlinks reset any work we've already done.
+ if path.IsAbs(linkDest) {
+ // Jump to root.
+ rootClone, err := dupFile(root)
+ if err != nil {
+ return nil, "", fmt.Errorf("clone root fd: %w", err)
+ }
+ _ = currentDir.Close()
+ currentDir = rootClone
+ currentPath = "/"
+ }
+
+ default:
+ // If we are dealing with a directory, simply walk into it.
+ _ = currentDir.Close()
+ currentDir = nextDir
+ currentPath = nextPath
+
+ // The part was real, so drop it from the symlink stack.
+ if err := symStack.PopPart(part); err != nil {
+ return nil, "", fmt.Errorf("walking into directory %q failed: %w", part, err)
+ }
+
+ // If we are operating on a .., make sure we haven't escaped.
+ // We only have to check for ".." here because walking down
+ // into a regular component component cannot cause you to
+ // escape. This mirrors the logic in RESOLVE_IN_ROOT, except we
+ // have to check every ".." rather than only checking after a
+ // rename or mount on the system.
+ if part == ".." {
+ // Make sure the root hasn't moved.
+ if err := checkProcSelfFdPath(logicalRootPath, root); err != nil {
+ return nil, "", fmt.Errorf("root path moved during lookup: %w", err)
+ }
+ // Make sure the path is what we expect.
+ fullPath := logicalRootPath + nextPath
+ if err := checkProcSelfFdPath(fullPath, currentDir); err != nil {
+ return nil, "", fmt.Errorf("walking into %q had unexpected result: %w", part, err)
+ }
+ }
+ }
+
+ default:
+ if !partial {
+ return nil, "", err
+ }
+ // If there are any remaining components in the symlink stack, we
+ // are still within a symlink resolution and thus we hit a dangling
+ // symlink. So pretend that the first symlink in the stack we hit
+ // was an ENOENT (to match openat2).
+ if oldDir, remainingPath, ok := symStack.PopTopSymlink(); ok {
+ _ = currentDir.Close()
+ return oldDir, remainingPath, err
+ }
+ // We have hit a final component that doesn't exist, so we have our
+ // partial open result. Note that we have to use the OLD remaining
+ // path, since the lookup failed.
+ return currentDir, oldRemainingPath, err
+ }
+ }
+
+ // If the unsafePath had a trailing slash, we need to make sure we try to
+ // do a relative "." open so that we will correctly return an error when
+ // the final component is a non-directory (to match openat2). In the
+ // context of openat2, a trailing slash and a trailing "/." are completely
+ // equivalent.
+ if strings.HasSuffix(unsafePath, "/") {
+ nextDir, err := openatFile(currentDir, ".", unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+ if err != nil {
+ if !partial {
+ _ = currentDir.Close()
+ currentDir = nil
+ }
+ return currentDir, "", err
+ }
+ _ = currentDir.Close()
+ currentDir = nextDir
+ }
+
+ // All of the components existed!
+ return currentDir, "", nil
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/mkdir_linux.go b/vendor/github.com/cyphar/filepath-securejoin/mkdir_linux.go
new file mode 100644
index 000000000..a17ae3b03
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/mkdir_linux.go
@@ -0,0 +1,236 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "golang.org/x/sys/unix"
+)
+
+var (
+ errInvalidMode = errors.New("invalid permission mode")
+ errPossibleAttack = errors.New("possible attack detected")
+)
+
+// modePermExt is like os.ModePerm except that it also includes the set[ug]id
+// and sticky bits.
+const modePermExt = os.ModePerm | os.ModeSetuid | os.ModeSetgid | os.ModeSticky
+
+//nolint:cyclop // this function needs to handle a lot of cases
+func toUnixMode(mode os.FileMode) (uint32, error) {
+ sysMode := uint32(mode.Perm())
+ if mode&os.ModeSetuid != 0 {
+ sysMode |= unix.S_ISUID
+ }
+ if mode&os.ModeSetgid != 0 {
+ sysMode |= unix.S_ISGID
+ }
+ if mode&os.ModeSticky != 0 {
+ sysMode |= unix.S_ISVTX
+ }
+ // We don't allow file type bits.
+ if mode&os.ModeType != 0 {
+ return 0, fmt.Errorf("%w %+.3o (%s): type bits not permitted", errInvalidMode, mode, mode)
+ }
+ // We don't allow other unknown modes.
+ if mode&^modePermExt != 0 || sysMode&unix.S_IFMT != 0 {
+ return 0, fmt.Errorf("%w %+.3o (%s): unknown mode bits", errInvalidMode, mode, mode)
+ }
+ return sysMode, nil
+}
+
+// MkdirAllHandle is equivalent to [MkdirAll], except that it is safer to use
+// in two respects:
+//
+// - The caller provides the root directory as an *[os.File] (preferably O_PATH)
+// handle. This means that the caller can be sure which root directory is
+// being used. Note that this can be emulated by using /proc/self/fd/... as
+// the root path with [os.MkdirAll].
+//
+// - Once all of the directories have been created, an *[os.File] O_PATH handle
+// to the directory at unsafePath is returned to the caller. This is done in
+// an effectively-race-free way (an attacker would only be able to swap the
+// final directory component), which is not possible to emulate with
+// [MkdirAll].
+//
+// In addition, the returned handle is obtained far more efficiently than doing
+// a brand new lookup of unsafePath (such as with [SecureJoin] or openat2) after
+// doing [MkdirAll]. If you intend to open the directory after creating it, you
+// should use MkdirAllHandle.
+func MkdirAllHandle(root *os.File, unsafePath string, mode os.FileMode) (_ *os.File, Err error) {
+ unixMode, err := toUnixMode(mode)
+ if err != nil {
+ return nil, err
+ }
+ // On Linux, mkdirat(2) (and os.Mkdir) silently ignore the suid and sgid
+ // bits. We could also silently ignore them but since we have very few
+ // users it seems more prudent to return an error so users notice that
+ // these bits will not be set.
+ if unixMode&^0o1777 != 0 {
+ return nil, fmt.Errorf("%w for mkdir %+.3o: suid and sgid are ignored by mkdir", errInvalidMode, mode)
+ }
+
+ // Try to open as much of the path as possible.
+ currentDir, remainingPath, err := partialLookupInRoot(root, unsafePath)
+ defer func() {
+ if Err != nil {
+ _ = currentDir.Close()
+ }
+ }()
+ if err != nil && !errors.Is(err, unix.ENOENT) {
+ return nil, fmt.Errorf("find existing subpath of %q: %w", unsafePath, err)
+ }
+
+ // If there is an attacker deleting directories as we walk into them,
+ // detect this proactively. Note this is guaranteed to detect if the
+ // attacker deleted any part of the tree up to currentDir.
+ //
+ // Once we walk into a dead directory, partialLookupInRoot would not be
+ // able to walk further down the tree (directories must be empty before
+ // they are deleted), and if the attacker has removed the entire tree we
+ // can be sure that anything that was originally inside a dead directory
+ // must also be deleted and thus is a dead directory in its own right.
+ //
+ // This is mostly a quality-of-life check, because mkdir will simply fail
+ // later if the attacker deletes the tree after this check.
+ if err := isDeadInode(currentDir); err != nil {
+ return nil, fmt.Errorf("finding existing subpath of %q: %w", unsafePath, err)
+ }
+
+ // Re-open the path to match the O_DIRECTORY reopen loop later (so that we
+ // always return a non-O_PATH handle). We also check that we actually got a
+ // directory.
+ if reopenDir, err := Reopen(currentDir, unix.O_DIRECTORY|unix.O_CLOEXEC); errors.Is(err, unix.ENOTDIR) {
+ return nil, fmt.Errorf("cannot create subdirectories in %q: %w", currentDir.Name(), unix.ENOTDIR)
+ } else if err != nil {
+ return nil, fmt.Errorf("re-opening handle to %q: %w", currentDir.Name(), err)
+ } else {
+ _ = currentDir.Close()
+ currentDir = reopenDir
+ }
+
+ remainingParts := strings.Split(remainingPath, string(filepath.Separator))
+ if slices_Contains(remainingParts, "..") {
+ // The path contained ".." components after the end of the "real"
+ // components. We could try to safely resolve ".." here but that would
+ // add a bunch of extra logic for something that it's not clear even
+ // needs to be supported. So just return an error.
+ //
+ // If we do filepath.Clean(remainingPath) then we end up with the
+ // problem that ".." can erase a trailing dangling symlink and produce
+ // a path that doesn't quite match what the user asked for.
+ return nil, fmt.Errorf("%w: yet-to-be-created path %q contains '..' components", unix.ENOENT, remainingPath)
+ }
+
+ // Create the remaining components.
+ for _, part := range remainingParts {
+ switch part {
+ case "", ".":
+ // Skip over no-op paths.
+ continue
+ }
+
+ // NOTE: mkdir(2) will not follow trailing symlinks, so we can safely
+ // create the final component without worrying about symlink-exchange
+ // attacks.
+ //
+ // If we get -EEXIST, it's possible that another program created the
+ // directory at the same time as us. In that case, just continue on as
+ // if we created it (if the created inode is not a directory, the
+ // following open call will fail).
+ if err := unix.Mkdirat(int(currentDir.Fd()), part, unixMode); err != nil && !errors.Is(err, unix.EEXIST) {
+ err = &os.PathError{Op: "mkdirat", Path: currentDir.Name() + "/" + part, Err: err}
+ // Make the error a bit nicer if the directory is dead.
+ if deadErr := isDeadInode(currentDir); deadErr != nil {
+ // TODO: Once we bump the minimum Go version to 1.20, we can use
+ // multiple %w verbs for this wrapping. For now we need to use a
+ // compatibility shim for older Go versions.
+ //err = fmt.Errorf("%w (%w)", err, deadErr)
+ err = wrapBaseError(err, deadErr)
+ }
+ return nil, err
+ }
+
+ // Get a handle to the next component. O_DIRECTORY means we don't need
+ // to use O_PATH.
+ var nextDir *os.File
+ if hasOpenat2() {
+ nextDir, err = openat2File(currentDir, part, &unix.OpenHow{
+ Flags: unix.O_NOFOLLOW | unix.O_DIRECTORY | unix.O_CLOEXEC,
+ Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_SYMLINKS | unix.RESOLVE_NO_XDEV,
+ })
+ } else {
+ nextDir, err = openatFile(currentDir, part, unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+ }
+ if err != nil {
+ return nil, err
+ }
+ _ = currentDir.Close()
+ currentDir = nextDir
+
+ // It's possible that the directory we just opened was swapped by an
+ // attacker. Unfortunately there isn't much we can do to protect
+ // against this, and MkdirAll's behaviour is that we will reuse
+ // existing directories anyway so the need to protect against this is
+ // incredibly limited (and arguably doesn't even deserve mention here).
+ //
+ // Ideally we might want to check that the owner and mode match what we
+ // would've created -- unfortunately, it is non-trivial to verify that
+ // the owner and mode of the created directory match. While plain Unix
+ // DAC rules seem simple enough to emulate, there are a bunch of other
+ // factors that can change the mode or owner of created directories
+ // (default POSIX ACLs, mount options like uid=1,gid=2,umask=0 on
+ // filesystems like vfat, etc etc). We used to try to verify this but
+ // it just lead to a series of spurious errors.
+ //
+ // We could also check that the directory is non-empty, but
+ // unfortunately some pseduofilesystems (like cgroupfs) create
+ // non-empty directories, which would result in different spurious
+ // errors.
+ }
+ return currentDir, nil
+}
+
+// MkdirAll is a race-safe alternative to the [os.MkdirAll] function,
+// where the new directory is guaranteed to be within the root directory (if an
+// attacker can move directories from inside the root to outside the root, the
+// created directory tree might be outside of the root but the key constraint
+// is that at no point will we walk outside of the directory tree we are
+// creating).
+//
+// Effectively, MkdirAll(root, unsafePath, mode) is equivalent to
+//
+// path, _ := securejoin.SecureJoin(root, unsafePath)
+// err := os.MkdirAll(path, mode)
+//
+// But is much safer. The above implementation is unsafe because if an attacker
+// can modify the filesystem tree between [SecureJoin] and [os.MkdirAll], it is
+// possible for MkdirAll to resolve unsafe symlink components and create
+// directories outside of the root.
+//
+// If you plan to open the directory after you have created it or want to use
+// an open directory handle as the root, you should use [MkdirAllHandle] instead.
+// This function is a wrapper around [MkdirAllHandle].
+func MkdirAll(root, unsafePath string, mode os.FileMode) error {
+ rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return err
+ }
+ defer rootDir.Close()
+
+ f, err := MkdirAllHandle(rootDir, unsafePath, mode)
+ if err != nil {
+ return err
+ }
+ _ = f.Close()
+ return nil
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/open_linux.go b/vendor/github.com/cyphar/filepath-securejoin/open_linux.go
new file mode 100644
index 000000000..230be73f0
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/open_linux.go
@@ -0,0 +1,103 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "fmt"
+ "os"
+ "strconv"
+
+ "golang.org/x/sys/unix"
+)
+
+// OpenatInRoot is equivalent to [OpenInRoot], except that the root is provided
+// using an *[os.File] handle, to ensure that the correct root directory is used.
+func OpenatInRoot(root *os.File, unsafePath string) (*os.File, error) {
+ handle, err := completeLookupInRoot(root, unsafePath)
+ if err != nil {
+ return nil, &os.PathError{Op: "securejoin.OpenInRoot", Path: unsafePath, Err: err}
+ }
+ return handle, nil
+}
+
+// OpenInRoot safely opens the provided unsafePath within the root.
+// Effectively, OpenInRoot(root, unsafePath) is equivalent to
+//
+// path, _ := securejoin.SecureJoin(root, unsafePath)
+// handle, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC)
+//
+// But is much safer. The above implementation is unsafe because if an attacker
+// can modify the filesystem tree between [SecureJoin] and [os.OpenFile], it is
+// possible for the returned file to be outside of the root.
+//
+// Note that the returned handle is an O_PATH handle, meaning that only a very
+// limited set of operations will work on the handle. This is done to avoid
+// accidentally opening an untrusted file that could cause issues (such as a
+// disconnected TTY that could cause a DoS, or some other issue). In order to
+// use the returned handle, you can "upgrade" it to a proper handle using
+// [Reopen].
+func OpenInRoot(root, unsafePath string) (*os.File, error) {
+ rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return nil, err
+ }
+ defer rootDir.Close()
+ return OpenatInRoot(rootDir, unsafePath)
+}
+
+// Reopen takes an *[os.File] handle and re-opens it through /proc/self/fd.
+// Reopen(file, flags) is effectively equivalent to
+//
+// fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd())
+// os.OpenFile(fdPath, flags|unix.O_CLOEXEC)
+//
+// But with some extra hardenings to ensure that we are not tricked by a
+// maliciously-configured /proc mount. While this attack scenario is not
+// common, in container runtimes it is possible for higher-level runtimes to be
+// tricked into configuring an unsafe /proc that can be used to attack file
+// operations. See [CVE-2019-19921] for more details.
+//
+// [CVE-2019-19921]: https://github.com/advisories/GHSA-fh74-hm69-rqjw
+func Reopen(handle *os.File, flags int) (*os.File, error) {
+ procRoot, err := getProcRoot()
+ if err != nil {
+ return nil, err
+ }
+
+ // We can't operate on /proc/thread-self/fd/$n directly when doing a
+ // re-open, so we need to open /proc/thread-self/fd and then open a single
+ // final component.
+ procFdDir, closer, err := procThreadSelf(procRoot, "fd/")
+ if err != nil {
+ return nil, fmt.Errorf("get safe /proc/thread-self/fd handle: %w", err)
+ }
+ defer procFdDir.Close()
+ defer closer()
+
+ // Try to detect if there is a mount on top of the magic-link we are about
+ // to open. If we are using unsafeHostProcRoot(), this could change after
+ // we check it (and there's nothing we can do about that) but for
+ // privateProcRoot() this should be guaranteed to be safe (at least since
+ // Linux 5.12[1], when anonymous mount namespaces were completely isolated
+ // from external mounts including mount propagation events).
+ //
+ // [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts
+ // onto targets that reside on shared mounts").
+ fdStr := strconv.Itoa(int(handle.Fd()))
+ if err := checkSymlinkOvermount(procRoot, procFdDir, fdStr); err != nil {
+ return nil, fmt.Errorf("check safety of /proc/thread-self/fd/%s magiclink: %w", fdStr, err)
+ }
+
+ flags |= unix.O_CLOEXEC
+ // Rather than just wrapping openatFile, open-code it so we can copy
+ // handle.Name().
+ reopenFd, err := unix.Openat(int(procFdDir.Fd()), fdStr, flags, 0)
+ if err != nil {
+ return nil, fmt.Errorf("reopen fd %d: %w", handle.Fd(), err)
+ }
+ return os.NewFile(uintptr(reopenFd), handle.Name()), nil
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/openat2_linux.go
new file mode 100644
index 000000000..f7a13e69c
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/openat2_linux.go
@@ -0,0 +1,127 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "golang.org/x/sys/unix"
+)
+
+var hasOpenat2 = sync_OnceValue(func() bool {
+ fd, err := unix.Openat2(unix.AT_FDCWD, ".", &unix.OpenHow{
+ Flags: unix.O_PATH | unix.O_CLOEXEC,
+ Resolve: unix.RESOLVE_NO_SYMLINKS | unix.RESOLVE_IN_ROOT,
+ })
+ if err != nil {
+ return false
+ }
+ _ = unix.Close(fd)
+ return true
+})
+
+func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
+ // RESOLVE_IN_ROOT (and RESOLVE_BENEATH) can return -EAGAIN if we resolve
+ // ".." while a mount or rename occurs anywhere on the system. This could
+ // happen spuriously, or as the result of an attacker trying to mess with
+ // us during lookup.
+ //
+ // In addition, scoped lookups have a "safety check" at the end of
+ // complete_walk which will return -EXDEV if the final path is not in the
+ // root.
+ return how.Resolve&(unix.RESOLVE_IN_ROOT|unix.RESOLVE_BENEATH) != 0 &&
+ (errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
+}
+
+const scopedLookupMaxRetries = 10
+
+func openat2File(dir *os.File, path string, how *unix.OpenHow) (*os.File, error) {
+ fullPath := dir.Name() + "/" + path
+ // Make sure we always set O_CLOEXEC.
+ how.Flags |= unix.O_CLOEXEC
+ var tries int
+ for tries < scopedLookupMaxRetries {
+ fd, err := unix.Openat2(int(dir.Fd()), path, how)
+ if err != nil {
+ if scopedLookupShouldRetry(how, err) {
+ // We retry a couple of times to avoid the spurious errors, and
+ // if we are being attacked then returning -EAGAIN is the best
+ // we can do.
+ tries++
+ continue
+ }
+ return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: err}
+ }
+ // If we are using RESOLVE_IN_ROOT, the name we generated may be wrong.
+ // NOTE: The procRoot code MUST NOT use RESOLVE_IN_ROOT, otherwise
+ // you'll get infinite recursion here.
+ if how.Resolve&unix.RESOLVE_IN_ROOT == unix.RESOLVE_IN_ROOT {
+ if actualPath, err := rawProcSelfFdReadlink(fd); err == nil {
+ fullPath = actualPath
+ }
+ }
+ return os.NewFile(uintptr(fd), fullPath), nil
+ }
+ return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: errPossibleAttack}
+}
+
+func lookupOpenat2(root *os.File, unsafePath string, partial bool) (*os.File, string, error) {
+ if !partial {
+ file, err := openat2File(root, unsafePath, &unix.OpenHow{
+ Flags: unix.O_PATH | unix.O_CLOEXEC,
+ Resolve: unix.RESOLVE_IN_ROOT | unix.RESOLVE_NO_MAGICLINKS,
+ })
+ return file, "", err
+ }
+ return partialLookupOpenat2(root, unsafePath)
+}
+
+// partialLookupOpenat2 is an alternative implementation of
+// partialLookupInRoot, using openat2(RESOLVE_IN_ROOT) to more safely get a
+// handle to the deepest existing child of the requested path within the root.
+func partialLookupOpenat2(root *os.File, unsafePath string) (*os.File, string, error) {
+ // TODO: Implement this as a git-bisect-like binary search.
+
+ unsafePath = filepath.ToSlash(unsafePath) // noop
+ endIdx := len(unsafePath)
+ var lastError error
+ for endIdx > 0 {
+ subpath := unsafePath[:endIdx]
+
+ handle, err := openat2File(root, subpath, &unix.OpenHow{
+ Flags: unix.O_PATH | unix.O_CLOEXEC,
+ Resolve: unix.RESOLVE_IN_ROOT | unix.RESOLVE_NO_MAGICLINKS,
+ })
+ if err == nil {
+ // Jump over the slash if we have a non-"" remainingPath.
+ if endIdx < len(unsafePath) {
+ endIdx += 1
+ }
+ // We found a subpath!
+ return handle, unsafePath[endIdx:], lastError
+ }
+ if errors.Is(err, unix.ENOENT) || errors.Is(err, unix.ENOTDIR) {
+ // That path doesn't exist, let's try the next directory up.
+ endIdx = strings.LastIndexByte(subpath, '/')
+ lastError = err
+ continue
+ }
+ return nil, "", fmt.Errorf("open subpath: %w", err)
+ }
+ // If we couldn't open anything, the whole subpath is missing. Return a
+ // copy of the root fd so that the caller doesn't close this one by
+ // accident.
+ rootClone, err := dupFile(root)
+ if err != nil {
+ return nil, "", err
+ }
+ return rootClone, unsafePath, lastError
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/openat_linux.go b/vendor/github.com/cyphar/filepath-securejoin/openat_linux.go
new file mode 100644
index 000000000..949fb5f2d
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/openat_linux.go
@@ -0,0 +1,59 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "os"
+ "path/filepath"
+
+ "golang.org/x/sys/unix"
+)
+
+func dupFile(f *os.File) (*os.File, error) {
+ fd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
+ if err != nil {
+ return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
+ }
+ return os.NewFile(uintptr(fd), f.Name()), nil
+}
+
+func openatFile(dir *os.File, path string, flags int, mode int) (*os.File, error) {
+ // Make sure we always set O_CLOEXEC.
+ flags |= unix.O_CLOEXEC
+ fd, err := unix.Openat(int(dir.Fd()), path, flags, uint32(mode))
+ if err != nil {
+ return nil, &os.PathError{Op: "openat", Path: dir.Name() + "/" + path, Err: err}
+ }
+ // All of the paths we use with openatFile(2) are guaranteed to be
+ // lexically safe, so we can use path.Join here.
+ fullPath := filepath.Join(dir.Name(), path)
+ return os.NewFile(uintptr(fd), fullPath), nil
+}
+
+func fstatatFile(dir *os.File, path string, flags int) (unix.Stat_t, error) {
+ var stat unix.Stat_t
+ if err := unix.Fstatat(int(dir.Fd()), path, &stat, flags); err != nil {
+ return stat, &os.PathError{Op: "fstatat", Path: dir.Name() + "/" + path, Err: err}
+ }
+ return stat, nil
+}
+
+func readlinkatFile(dir *os.File, path string) (string, error) {
+ size := 4096
+ for {
+ linkBuf := make([]byte, size)
+ n, err := unix.Readlinkat(int(dir.Fd()), path, linkBuf)
+ if err != nil {
+ return "", &os.PathError{Op: "readlinkat", Path: dir.Name() + "/" + path, Err: err}
+ }
+ if n != size {
+ return string(linkBuf[:n]), nil
+ }
+ // Possible truncation, resize the buffer.
+ size *= 2
+ }
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/procfs_linux.go b/vendor/github.com/cyphar/filepath-securejoin/procfs_linux.go
new file mode 100644
index 000000000..809a579cb
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/procfs_linux.go
@@ -0,0 +1,452 @@
+//go:build linux
+
+// Copyright (C) 2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "runtime"
+ "strconv"
+
+ "golang.org/x/sys/unix"
+)
+
+func fstat(f *os.File) (unix.Stat_t, error) {
+ var stat unix.Stat_t
+ if err := unix.Fstat(int(f.Fd()), &stat); err != nil {
+ return stat, &os.PathError{Op: "fstat", Path: f.Name(), Err: err}
+ }
+ return stat, nil
+}
+
+func fstatfs(f *os.File) (unix.Statfs_t, error) {
+ var statfs unix.Statfs_t
+ if err := unix.Fstatfs(int(f.Fd()), &statfs); err != nil {
+ return statfs, &os.PathError{Op: "fstatfs", Path: f.Name(), Err: err}
+ }
+ return statfs, nil
+}
+
+// The kernel guarantees that the root inode of a procfs mount has an
+// f_type of PROC_SUPER_MAGIC and st_ino of PROC_ROOT_INO.
+const (
+ procSuperMagic = 0x9fa0 // PROC_SUPER_MAGIC
+ procRootIno = 1 // PROC_ROOT_INO
+)
+
+func verifyProcRoot(procRoot *os.File) error {
+ if statfs, err := fstatfs(procRoot); err != nil {
+ return err
+ } else if statfs.Type != procSuperMagic {
+ return fmt.Errorf("%w: incorrect procfs root filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
+ }
+ if stat, err := fstat(procRoot); err != nil {
+ return err
+ } else if stat.Ino != procRootIno {
+ return fmt.Errorf("%w: incorrect procfs root inode number %d", errUnsafeProcfs, stat.Ino)
+ }
+ return nil
+}
+
+var hasNewMountApi = sync_OnceValue(func() bool {
+ // All of the pieces of the new mount API we use (fsopen, fsconfig,
+ // fsmount, open_tree) were added together in Linux 5.1[1,2], so we can
+ // just check for one of the syscalls and the others should also be
+ // available.
+ //
+ // Just try to use open_tree(2) to open a file without OPEN_TREE_CLONE.
+ // This is equivalent to openat(2), but tells us if open_tree is
+ // available (and thus all of the other basic new mount API syscalls).
+ // open_tree(2) is most light-weight syscall to test here.
+ //
+ // [1]: merge commit 400913252d09
+ // [2]:
+ fd, err := unix.OpenTree(-int(unix.EBADF), "/", unix.OPEN_TREE_CLOEXEC)
+ if err != nil {
+ return false
+ }
+ _ = unix.Close(fd)
+ return true
+})
+
+func fsopen(fsName string, flags int) (*os.File, error) {
+ // Make sure we always set O_CLOEXEC.
+ flags |= unix.FSOPEN_CLOEXEC
+ fd, err := unix.Fsopen(fsName, flags)
+ if err != nil {
+ return nil, os.NewSyscallError("fsopen "+fsName, err)
+ }
+ return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
+}
+
+func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
+ // Make sure we always set O_CLOEXEC.
+ flags |= unix.FSMOUNT_CLOEXEC
+ fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
+ if err != nil {
+ return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
+ }
+ return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
+}
+
+func newPrivateProcMount() (*os.File, error) {
+ procfsCtx, err := fsopen("proc", unix.FSOPEN_CLOEXEC)
+ if err != nil {
+ return nil, err
+ }
+ defer procfsCtx.Close()
+
+ // Try to configure hidepid=ptraceable,subset=pid if possible, but ignore errors.
+ _ = unix.FsconfigSetString(int(procfsCtx.Fd()), "hidepid", "ptraceable")
+ _ = unix.FsconfigSetString(int(procfsCtx.Fd()), "subset", "pid")
+
+ // Get an actual handle.
+ if err := unix.FsconfigCreate(int(procfsCtx.Fd())); err != nil {
+ return nil, os.NewSyscallError("fsconfig create procfs", err)
+ }
+ return fsmount(procfsCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID)
+}
+
+func openTree(dir *os.File, path string, flags uint) (*os.File, error) {
+ dirFd := -int(unix.EBADF)
+ dirName := "."
+ if dir != nil {
+ dirFd = int(dir.Fd())
+ dirName = dir.Name()
+ }
+ // Make sure we always set O_CLOEXEC.
+ flags |= unix.OPEN_TREE_CLOEXEC
+ fd, err := unix.OpenTree(dirFd, path, flags)
+ if err != nil {
+ return nil, &os.PathError{Op: "open_tree", Path: path, Err: err}
+ }
+ return os.NewFile(uintptr(fd), dirName+"/"+path), nil
+}
+
+func clonePrivateProcMount() (_ *os.File, Err error) {
+ // Try to make a clone without using AT_RECURSIVE if we can. If this works,
+ // we can be sure there are no over-mounts and so if the root is valid then
+ // we're golden. Otherwise, we have to deal with over-mounts.
+ procfsHandle, err := openTree(nil, "/proc", unix.OPEN_TREE_CLONE)
+ if err != nil || hookForcePrivateProcRootOpenTreeAtRecursive(procfsHandle) {
+ procfsHandle, err = openTree(nil, "/proc", unix.OPEN_TREE_CLONE|unix.AT_RECURSIVE)
+ }
+ if err != nil {
+ return nil, fmt.Errorf("creating a detached procfs clone: %w", err)
+ }
+ defer func() {
+ if Err != nil {
+ _ = procfsHandle.Close()
+ }
+ }()
+ if err := verifyProcRoot(procfsHandle); err != nil {
+ return nil, err
+ }
+ return procfsHandle, nil
+}
+
+func privateProcRoot() (*os.File, error) {
+ if !hasNewMountApi() || hookForceGetProcRootUnsafe() {
+ return nil, fmt.Errorf("new mount api: %w", unix.ENOTSUP)
+ }
+ // Try to create a new procfs mount from scratch if we can. This ensures we
+ // can get a procfs mount even if /proc is fake (for whatever reason).
+ procRoot, err := newPrivateProcMount()
+ if err != nil || hookForcePrivateProcRootOpenTree(procRoot) {
+ // Try to clone /proc then...
+ procRoot, err = clonePrivateProcMount()
+ }
+ return procRoot, err
+}
+
+func unsafeHostProcRoot() (_ *os.File, Err error) {
+ procRoot, err := os.OpenFile("/proc", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return nil, err
+ }
+ defer func() {
+ if Err != nil {
+ _ = procRoot.Close()
+ }
+ }()
+ if err := verifyProcRoot(procRoot); err != nil {
+ return nil, err
+ }
+ return procRoot, nil
+}
+
+func doGetProcRoot() (*os.File, error) {
+ procRoot, err := privateProcRoot()
+ if err != nil {
+ // Fall back to using a /proc handle if making a private mount failed.
+ // If we have openat2, at least we can avoid some kinds of over-mount
+ // attacks, but without openat2 there's not much we can do.
+ procRoot, err = unsafeHostProcRoot()
+ }
+ return procRoot, err
+}
+
+var getProcRoot = sync_OnceValues(func() (*os.File, error) {
+ return doGetProcRoot()
+})
+
+var hasProcThreadSelf = sync_OnceValue(func() bool {
+ return unix.Access("/proc/thread-self/", unix.F_OK) == nil
+})
+
+var errUnsafeProcfs = errors.New("unsafe procfs detected")
+
+type procThreadSelfCloser func()
+
+// procThreadSelf returns a handle to /proc/thread-self/ (or an
+// equivalent handle on older kernels where /proc/thread-self doesn't exist).
+// Once finished with the handle, you must call the returned closer function
+// (runtime.UnlockOSThread). You must not pass the returned *os.File to other
+// Go threads or use the handle after calling the closer.
+//
+// This is similar to ProcThreadSelf from runc, but with extra hardening
+// applied and using *os.File.
+func procThreadSelf(procRoot *os.File, subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) {
+ // We need to lock our thread until the caller is done with the handle
+ // because between getting the handle and using it we could get interrupted
+ // by the Go runtime and hit the case where the underlying thread is
+ // swapped out and the original thread is killed, resulting in
+ // pull-your-hair-out-hard-to-debug issues in the caller.
+ runtime.LockOSThread()
+ defer func() {
+ if Err != nil {
+ runtime.UnlockOSThread()
+ }
+ }()
+
+ // Figure out what prefix we want to use.
+ threadSelf := "thread-self/"
+ if !hasProcThreadSelf() || hookForceProcSelfTask() {
+ /// Pre-3.17 kernels don't have /proc/thread-self, so do it manually.
+ threadSelf = "self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+ if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil || hookForceProcSelf() {
+ // In this case, we running in a pid namespace that doesn't match
+ // the /proc mount we have. This can happen inside runc.
+ //
+ // Unfortunately, there is no nice way to get the correct TID to
+ // use here because of the age of the kernel, so we have to just
+ // use /proc/self and hope that it works.
+ threadSelf = "self/"
+ }
+ }
+
+ // Grab the handle.
+ var (
+ handle *os.File
+ err error
+ )
+ if hasOpenat2() {
+ // We prefer being able to use RESOLVE_NO_XDEV if we can, to be
+ // absolutely sure we are operating on a clean /proc handle that
+ // doesn't have any cheeky overmounts that could trick us (including
+ // symlink mounts on top of /proc/thread-self). RESOLVE_BENEATH isn't
+ // strictly needed, but just use it since we have it.
+ //
+ // NOTE: /proc/self is technically a magic-link (the contents of the
+ // symlink are generated dynamically), but it doesn't use
+ // nd_jump_link() so RESOLVE_NO_MAGICLINKS allows it.
+ //
+ // NOTE: We MUST NOT use RESOLVE_IN_ROOT here, as openat2File uses
+ // procSelfFdReadlink to clean up the returned f.Name() if we use
+ // RESOLVE_IN_ROOT (which would lead to an infinite recursion).
+ handle, err = openat2File(procRoot, threadSelf+subpath, &unix.OpenHow{
+ Flags: unix.O_PATH | unix.O_NOFOLLOW | unix.O_CLOEXEC,
+ Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_MAGICLINKS,
+ })
+ if err != nil {
+ // TODO: Once we bump the minimum Go version to 1.20, we can use
+ // multiple %w verbs for this wrapping. For now we need to use a
+ // compatibility shim for older Go versions.
+ //err = fmt.Errorf("%w: %w", errUnsafeProcfs, err)
+ return nil, nil, wrapBaseError(err, errUnsafeProcfs)
+ }
+ } else {
+ handle, err = openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+ if err != nil {
+ // TODO: Once we bump the minimum Go version to 1.20, we can use
+ // multiple %w verbs for this wrapping. For now we need to use a
+ // compatibility shim for older Go versions.
+ //err = fmt.Errorf("%w: %w", errUnsafeProcfs, err)
+ return nil, nil, wrapBaseError(err, errUnsafeProcfs)
+ }
+ defer func() {
+ if Err != nil {
+ _ = handle.Close()
+ }
+ }()
+ // We can't detect bind-mounts of different parts of procfs on top of
+ // /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
+ // aren't on the wrong filesystem here.
+ if statfs, err := fstatfs(handle); err != nil {
+ return nil, nil, err
+ } else if statfs.Type != procSuperMagic {
+ return nil, nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
+ }
+ }
+ return handle, runtime.UnlockOSThread, nil
+}
+
+// STATX_MNT_ID_UNIQUE is provided in golang.org/x/sys@v0.20.0, but in order to
+// avoid bumping the requirement for a single constant we can just define it
+// ourselves.
+const STATX_MNT_ID_UNIQUE = 0x4000
+
+var hasStatxMountId = sync_OnceValue(func() bool {
+ var (
+ stx unix.Statx_t
+ // We don't care which mount ID we get. The kernel will give us the
+ // unique one if it is supported.
+ wantStxMask uint32 = STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
+ )
+ err := unix.Statx(-int(unix.EBADF), "/", 0, int(wantStxMask), &stx)
+ return err == nil && stx.Mask&wantStxMask != 0
+})
+
+func getMountId(dir *os.File, path string) (uint64, error) {
+ // If we don't have statx(STATX_MNT_ID*) support, we can't do anything.
+ if !hasStatxMountId() {
+ return 0, nil
+ }
+
+ var (
+ stx unix.Statx_t
+ // We don't care which mount ID we get. The kernel will give us the
+ // unique one if it is supported.
+ wantStxMask uint32 = STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
+ )
+
+ err := unix.Statx(int(dir.Fd()), path, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW, int(wantStxMask), &stx)
+ if stx.Mask&wantStxMask == 0 {
+ // It's not a kernel limitation, for some reason we couldn't get a
+ // mount ID. Assume it's some kind of attack.
+ err = fmt.Errorf("%w: could not get mount id", errUnsafeProcfs)
+ }
+ if err != nil {
+ return 0, &os.PathError{Op: "statx(STATX_MNT_ID_...)", Path: dir.Name() + "/" + path, Err: err}
+ }
+ return stx.Mnt_id, nil
+}
+
+func checkSymlinkOvermount(procRoot *os.File, dir *os.File, path string) error {
+ // Get the mntId of our procfs handle.
+ expectedMountId, err := getMountId(procRoot, "")
+ if err != nil {
+ return err
+ }
+ // Get the mntId of the target magic-link.
+ gotMountId, err := getMountId(dir, path)
+ if err != nil {
+ return err
+ }
+ // As long as the directory mount is alive, even with wrapping mount IDs,
+ // we would expect to see a different mount ID here. (Of course, if we're
+ // using unsafeHostProcRoot() then an attaker could change this after we
+ // did this check.)
+ if expectedMountId != gotMountId {
+ return fmt.Errorf("%w: symlink %s/%s has an overmount obscuring the real link (mount ids do not match %d != %d)", errUnsafeProcfs, dir.Name(), path, expectedMountId, gotMountId)
+ }
+ return nil
+}
+
+func doRawProcSelfFdReadlink(procRoot *os.File, fd int) (string, error) {
+ fdPath := fmt.Sprintf("fd/%d", fd)
+ procFdLink, closer, err := procThreadSelf(procRoot, fdPath)
+ if err != nil {
+ return "", fmt.Errorf("get safe /proc/thread-self/%s handle: %w", fdPath, err)
+ }
+ defer procFdLink.Close()
+ defer closer()
+
+ // Try to detect if there is a mount on top of the magic-link. Since we use the handle directly
+ // provide to the closure. If the closure uses the handle directly, this
+ // should be safe in general (a mount on top of the path afterwards would
+ // not affect the handle itself) and will definitely be safe if we are
+ // using privateProcRoot() (at least since Linux 5.12[1], when anonymous
+ // mount namespaces were completely isolated from external mounts including
+ // mount propagation events).
+ //
+ // [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts
+ // onto targets that reside on shared mounts").
+ if err := checkSymlinkOvermount(procRoot, procFdLink, ""); err != nil {
+ return "", fmt.Errorf("check safety of /proc/thread-self/fd/%d magiclink: %w", fd, err)
+ }
+
+ // readlinkat implies AT_EMPTY_PATH since Linux 2.6.39. See Linux commit
+ // 65cfc6722361 ("readlinkat(), fchownat() and fstatat() with empty
+ // relative pathnames").
+ return readlinkatFile(procFdLink, "")
+}
+
+func rawProcSelfFdReadlink(fd int) (string, error) {
+ procRoot, err := getProcRoot()
+ if err != nil {
+ return "", err
+ }
+ return doRawProcSelfFdReadlink(procRoot, fd)
+}
+
+func procSelfFdReadlink(f *os.File) (string, error) {
+ return rawProcSelfFdReadlink(int(f.Fd()))
+}
+
+var (
+ errPossibleBreakout = errors.New("possible breakout detected")
+ errInvalidDirectory = errors.New("wandered into deleted directory")
+ errDeletedInode = errors.New("cannot verify path of deleted inode")
+)
+
+func isDeadInode(file *os.File) error {
+ // If the nlink of a file drops to 0, there is an attacker deleting
+ // directories during our walk, which could result in weird /proc values.
+ // It's better to error out in this case.
+ stat, err := fstat(file)
+ if err != nil {
+ return fmt.Errorf("check for dead inode: %w", err)
+ }
+ if stat.Nlink == 0 {
+ err := errDeletedInode
+ if stat.Mode&unix.S_IFMT == unix.S_IFDIR {
+ err = errInvalidDirectory
+ }
+ return fmt.Errorf("%w %q", err, file.Name())
+ }
+ return nil
+}
+
+func checkProcSelfFdPath(path string, file *os.File) error {
+ if err := isDeadInode(file); err != nil {
+ return err
+ }
+ actualPath, err := procSelfFdReadlink(file)
+ if err != nil {
+ return fmt.Errorf("get path of handle: %w", err)
+ }
+ if actualPath != path {
+ return fmt.Errorf("%w: handle path %q doesn't match expected path %q", errPossibleBreakout, actualPath, path)
+ }
+ return nil
+}
+
+// Test hooks used in the procfs tests to verify that the fallback logic works.
+// See testing_mocks_linux_test.go and procfs_linux_test.go for more details.
+var (
+ hookForcePrivateProcRootOpenTree = hookDummyFile
+ hookForcePrivateProcRootOpenTreeAtRecursive = hookDummyFile
+ hookForceGetProcRootUnsafe = hookDummy
+
+ hookForceProcSelfTask = hookDummy
+ hookForceProcSelf = hookDummy
+)
+
+func hookDummy() bool { return false }
+func hookDummyFile(_ *os.File) bool { return false }
diff --git a/vendor/github.com/cyphar/filepath-securejoin/vfs.go b/vendor/github.com/cyphar/filepath-securejoin/vfs.go
new file mode 100644
index 000000000..36373f8c5
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/vfs.go
@@ -0,0 +1,35 @@
+// Copyright (C) 2017-2024 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import "os"
+
+// In future this should be moved into a separate package, because now there
+// are several projects (umoci and go-mtree) that are using this sort of
+// interface.
+
+// VFS is the minimal interface necessary to use [SecureJoinVFS]. A nil VFS is
+// equivalent to using the standard [os].* family of functions. This is mainly
+// used for the purposes of mock testing, but also can be used to otherwise use
+// [SecureJoinVFS] with VFS-like system.
+type VFS interface {
+ // Lstat returns an [os.FileInfo] describing the named file. If the
+ // file is a symbolic link, the returned [os.FileInfo] describes the
+ // symbolic link. Lstat makes no attempt to follow the link.
+ // The semantics are identical to [os.Lstat].
+ Lstat(name string) (os.FileInfo, error)
+
+ // Readlink returns the destination of the named symbolic link.
+ // The semantics are identical to [os.Readlink].
+ Readlink(name string) (string, error)
+}
+
+// osVFS is the "nil" VFS, in that it just passes everything through to the os
+// module.
+type osVFS struct{}
+
+func (o osVFS) Lstat(name string) (os.FileInfo, error) { return os.Lstat(name) }
+
+func (o osVFS) Readlink(name string) (string, error) { return os.Readlink(name) }
diff --git a/vendor/github.com/opencontainers/runc/LICENSE b/vendor/github.com/opencontainers/runc/LICENSE
new file mode 100644
index 000000000..27448585a
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/LICENSE
@@ -0,0 +1,191 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ Copyright 2014 Docker, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/vendor/github.com/opencontainers/runc/NOTICE b/vendor/github.com/opencontainers/runc/NOTICE
new file mode 100644
index 000000000..c29775c0d
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/NOTICE
@@ -0,0 +1,17 @@
+runc
+
+Copyright 2012-2015 Docker, Inc.
+
+This product includes software developed at Docker, Inc. (http://www.docker.com).
+
+The following is courtesy of our legal counsel:
+
+
+Use and transfer of Docker may be subject to certain restrictions by the
+United States and other governments.
+It is your responsibility to ensure that your use and/or transfer does not
+violate applicable laws.
+
+For more information, please see http://www.bis.doc.gov
+
+See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go
new file mode 100644
index 000000000..1c034e4e6
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go
@@ -0,0 +1,258 @@
+package dmz
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "strconv"
+
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+
+ "github.com/opencontainers/runc/libcontainer/system"
+)
+
+type SealFunc func(**os.File) error
+
+var (
+ _ SealFunc = sealMemfd
+ _ SealFunc = sealFile
+)
+
+func isExecutable(f *os.File) bool {
+ if err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err == nil {
+ return true
+ } else if err == unix.EACCES {
+ return false
+ }
+ path := "/proc/self/fd/" + strconv.Itoa(int(f.Fd()))
+ if err := unix.Access(path, unix.X_OK); err == nil {
+ return true
+ } else if err == unix.EACCES {
+ return false
+ }
+ // Cannot check -- assume it's executable (if not, exec will fail).
+ logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
+ return true
+}
+
+const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
+
+func sealMemfd(f **os.File) error {
+ if err := (*f).Chmod(0o511); err != nil {
+ return err
+ }
+ // Try to set the newer memfd sealing flags, but we ignore
+ // errors because they are not needed and we want to continue
+ // to work on older kernels.
+ fd := (*f).Fd()
+ // F_SEAL_FUTURE_WRITE -- Linux 5.1
+ _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
+ // F_SEAL_EXEC -- Linux 6.3
+ const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
+ _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
+ // Apply all original memfd seals.
+ _, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
+ return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
+}
+
+// Memfd creates a sealable executable memfd (supported since Linux 3.17).
+func Memfd(comment string) (*os.File, SealFunc, error) {
+ file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
+ return file, sealMemfd, err
+}
+
+func sealFile(f **os.File) error {
+ // When sealing an O_TMPFILE-style descriptor we need to
+ // re-open the path as O_PATH to clear the existing write
+ // handle we have.
+ opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return fmt.Errorf("reopen tmpfile: %w", err)
+ }
+ _ = (*f).Close()
+ *f = opath
+ return nil
+}
+
+// otmpfile creates an open(O_TMPFILE) file in the given directory (supported
+// since Linux 3.11).
+func otmpfile(dir string) (*os.File, SealFunc, error) {
+ file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
+ if err != nil {
+ return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
+ }
+ // Make sure we actually got an unlinked O_TMPFILE descriptor.
+ var stat unix.Stat_t
+ if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
+ file.Close()
+ return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
+ } else if stat.Nlink != 0 {
+ file.Close()
+ return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
+ }
+ return file, sealFile, err
+}
+
+// mktemp creates a classic unlinked file in the given directory.
+func mktemp(dir string) (*os.File, SealFunc, error) {
+ file, err := os.CreateTemp(dir, "runc.")
+ if err != nil {
+ return nil, nil, err
+ }
+ // Unlink the file and verify it was unlinked.
+ if err := os.Remove(file.Name()); err != nil {
+ return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
+ }
+ if err := file.Chmod(0o511); err != nil {
+ return nil, nil, fmt.Errorf("chmod classic tmpfile: %w", err)
+ }
+ var stat unix.Stat_t
+ if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
+ return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
+ } else if stat.Nlink != 0 {
+ return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
+ }
+ return file, sealFile, err
+}
+
+func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
+ // First, try an executable memfd (supported since Linux 3.17).
+ file, sealFn, err = Memfd(comment)
+ if err == nil {
+ return
+ }
+ logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+
+ // The tmpDir here (c.root) might be mounted noexec, so we need a couple of
+ // fallbacks to try. It's possible that none of these are writable and
+ // executable, in which case there's nothing we can practically do (other
+ // than mounting our own executable tmpfs, which would have its own
+ // issues).
+ tmpDirs := []string{
+ tmpDir,
+ os.TempDir(),
+ "/tmp",
+ ".",
+ "/bin",
+ "/",
+ }
+
+ // Try to fallback to O_TMPFILE (supported since Linux 3.11).
+ for _, dir := range tmpDirs {
+ file, sealFn, err = otmpfile(dir)
+ if err != nil {
+ continue
+ }
+ if !isExecutable(file) {
+ logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+ file.Close()
+ continue
+ }
+ return
+ }
+ logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
+ // Finally, try a classic unlinked temporary file.
+ for _, dir := range tmpDirs {
+ file, sealFn, err = mktemp(dir)
+ if err != nil {
+ continue
+ }
+ if !isExecutable(file) {
+ logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+ file.Close()
+ continue
+ }
+ return
+ }
+ return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
+}
+
+// CloneBinary creates a "sealed" clone of a given binary, which can be used to
+// thwart attempts by the container process to gain access to host binaries
+// through procfs magic-link shenanigans. For more details on why this is
+// necessary, see CVE-2019-5736.
+func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
+ logrus.Debugf("cloning %s binary (%d bytes)", name, size)
+ file, sealFn, err := getSealableFile(name, tmpDir)
+ if err != nil {
+ return nil, err
+ }
+ copied, err := system.Copy(file, src)
+ if err != nil {
+ file.Close()
+ return nil, fmt.Errorf("copy binary: %w", err)
+ } else if copied != size {
+ file.Close()
+ return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
+ }
+ if err := sealFn(&file); err != nil {
+ file.Close()
+ return nil, fmt.Errorf("could not seal fd: %w", err)
+ }
+ return file, nil
+}
+
+// IsCloned returns whether the given file can be guaranteed to be a safe exe.
+func IsCloned(exe *os.File) bool {
+ seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
+ if err != nil {
+ // /proc/self/exe is probably not a memfd
+ logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
+ return false
+ }
+ // The memfd must have all of the base seals applied.
+ logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
+ return seals&baseMemfdSeals == baseMemfdSeals
+}
+
+// CloneSelfExe makes a clone of the current process's binary (through
+// /proc/self/exe). This binary can then be used for "runc init" in order to
+// make sure the container process can never resolve the original runc binary.
+// For more details on why this is necessary, see CVE-2019-5736.
+func CloneSelfExe(tmpDir string) (*os.File, error) {
+ // Try to create a temporary overlayfs to produce a readonly version of
+ // /proc/self/exe that cannot be "unwrapped" by the container. In contrast
+ // to CloneBinary, this technique does not require any extra memory usage
+ // and does not have the (fairly noticeable) performance impact of copying
+ // a large binary file into a memfd.
+ //
+ // Based on some basic performance testing, the overlayfs approach has
+ // effectively no performance overhead (it is on par with both
+ // MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
+ // around ~60% overhead during container startup.
+ overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
+ if err == nil {
+ logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
+ return overlayFile, nil
+ }
+ logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
+
+ selfExe, err := os.Open("/proc/self/exe")
+ if err != nil {
+ return nil, fmt.Errorf("opening current binary: %w", err)
+ }
+ defer selfExe.Close()
+
+ stat, err := selfExe.Stat()
+ if err != nil {
+ return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
+ }
+ size := stat.Size()
+
+ return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
+}
+
+// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
+// be guaranteed to be safe. This means that it must be a sealed memfd. Other
+// types of clones cannot be completely verified as safe.
+func IsSelfExeCloned() bool {
+ selfExe, err := os.Open("/proc/self/exe")
+ if err != nil {
+ logrus.Debugf("open /proc/self/exe failed: %v", err)
+ return false
+ }
+ defer selfExe.Close()
+ return IsCloned(selfExe)
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go
new file mode 100644
index 000000000..b81b70258
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go
@@ -0,0 +1,122 @@
+package dmz
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "runtime"
+ "strings"
+
+ "golang.org/x/sys/unix"
+
+ "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+func fsopen(fsName string, flags int) (*os.File, error) {
+ // Make sure we always set O_CLOEXEC.
+ flags |= unix.FSOPEN_CLOEXEC
+ fd, err := unix.Fsopen(fsName, flags)
+ if err != nil {
+ return nil, os.NewSyscallError("fsopen "+fsName, err)
+ }
+ return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
+}
+
+func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
+ // Make sure we always set O_CLOEXEC.
+ flags |= unix.FSMOUNT_CLOEXEC
+ fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
+ if err != nil {
+ return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
+ }
+ runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
+ return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
+}
+
+func escapeOverlayLowerDir(path string) string {
+ // If the lowerdir path contains ":" we need to escape them, and if there
+ // were any escape characters already (\) we need to escape those first.
+ return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`)
+}
+
+// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
+// uses the directory containing the binary as a lowerdir and a temporary tmpfs
+// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
+// and so we can create a safe zero-copy sealed version of /proc/self/exe.
+// This only works for privileged users and on kernels with overlayfs and
+// fsopen() enabled.
+//
+// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
+// it is technically possible to create an overlayfs even for rootless
+// containers. Unfortunately, this would require some ugly manual CGo+fork
+// magic so we can do this later if we feel it's really needed.
+func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
+ // Try to do the superblock creation first to bail out early if we can't
+ // use this method.
+ overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
+ if err != nil {
+ return nil, err
+ }
+ defer overlayCtx.Close()
+
+ // binPath is going to be /proc/self/exe, so do a readlink to get the real
+ // path. overlayfs needs the real underlying directory for this protection
+ // mode to work properly.
+ if realPath, err := os.Readlink(binPath); err == nil {
+ binPath = realPath
+ }
+ binLowerDirPath, binName := filepath.Split(binPath)
+ // Escape any ":"s or "\"s in the path.
+ binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
+
+ // Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
+ // where writes are completely blocked. Ideally we would create a dummy
+ // tmpfs for this, but it turns out that overlayfs doesn't allow for
+ // anonymous mountns paths.
+ // NOTE: I'm working on a patch to fix this but it won't be backported.
+ dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
+
+ // Configure the lowerdirs. The binary lowerdir needs to be on the top to
+ // ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
+ // mask the binary.
+ lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
+ if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
+ return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
+ }
+
+ // We don't care about xino (Linux 4.17) but it will be auto-enabled on
+ // some systems (if /run/runc and /usr/bin are on different filesystems)
+ // and this produces spurious dmesg log entries. We can safely ignore
+ // errors when disabling this because we don't actually care about the
+ // setting and we're just opportunistically disabling it.
+ _ = unix.FsconfigSetString(int(overlayCtx.Fd()), "xino", "off")
+
+ // Get an actual handle to the overlayfs.
+ if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
+ return nil, os.NewSyscallError("fsconfig create overlayfs", err)
+ }
+ overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
+ if err != nil {
+ return nil, err
+ }
+ defer overlayFd.Close()
+
+ // Grab a handle to the binary through overlayfs.
+ exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
+ }
+ // NOTE: We would like to check that exeFile is the same as /proc/self/exe,
+ // except this is a little difficult. Depending on what filesystems the
+ // layers are on, overlayfs can remap the inode numbers (and it always
+ // creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
+ // basic stat-based check. The only reasonable option would be to hash both
+ // files and compare them, but this would require fully reading both files
+ // which would produce a similar performance overhead to memfd cloning.
+ //
+ // Ultimately, there isn't a real attack to be worried about here. An
+ // attacker would need to be able to modify files in /usr/sbin (or wherever
+ // runc lives), at which point they could just replace the runc binary with
+ // something malicious anyway.
+ return exeFile, nil
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
new file mode 100644
index 000000000..7bbf92a3d
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
@@ -0,0 +1,216 @@
+//go:build linux
+
+package system
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "strconv"
+ "syscall"
+ "unsafe"
+
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+type ParentDeathSignal int
+
+func (p ParentDeathSignal) Restore() error {
+ if p == 0 {
+ return nil
+ }
+ current, err := GetParentDeathSignal()
+ if err != nil {
+ return err
+ }
+ if p == current {
+ return nil
+ }
+ return p.Set()
+}
+
+func (p ParentDeathSignal) Set() error {
+ return SetParentDeathSignal(uintptr(p))
+}
+
+func Exec(cmd string, args []string, env []string) error {
+ for {
+ err := unix.Exec(cmd, args, env)
+ if err != unix.EINTR {
+ return &os.PathError{Op: "exec", Path: cmd, Err: err}
+ }
+ }
+}
+
+func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
+ pathnamep, err := syscall.BytePtrFromString(pathname)
+ if err != nil {
+ return err
+ }
+
+ argvp, err := syscall.SlicePtrFromStrings(args)
+ if err != nil {
+ return err
+ }
+
+ envp, err := syscall.SlicePtrFromStrings(env)
+ if err != nil {
+ return err
+ }
+
+ _, _, errno := syscall.Syscall6(
+ unix.SYS_EXECVEAT,
+ fd,
+ uintptr(unsafe.Pointer(pathnamep)),
+ uintptr(unsafe.Pointer(&argvp[0])),
+ uintptr(unsafe.Pointer(&envp[0])),
+ uintptr(flags),
+ 0,
+ )
+ return errno
+}
+
+func Fexecve(fd uintptr, args []string, env []string) error {
+ var err error
+ for {
+ err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+ if err != unix.EINTR { // nolint:errorlint // unix errors are bare
+ break
+ }
+ }
+ if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
+ // Fallback to classic /proc/self/fd/... exec.
+ return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
+ }
+ return os.NewSyscallError("execveat", err)
+}
+
+func SetParentDeathSignal(sig uintptr) error {
+ if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
+ return err
+ }
+ return nil
+}
+
+func GetParentDeathSignal() (ParentDeathSignal, error) {
+ var sig int
+ if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
+ return -1, err
+ }
+ return ParentDeathSignal(sig), nil
+}
+
+func SetKeepCaps() error {
+ if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func ClearKeepCaps() error {
+ if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func Setctty() error {
+ if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
+ return err
+ }
+ return nil
+}
+
+// SetSubreaper sets the value i as the subreaper setting for the calling process
+func SetSubreaper(i int) error {
+ return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
+}
+
+// GetSubreaper returns the subreaper setting for the calling process
+func GetSubreaper() (int, error) {
+ var i uintptr
+
+ if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
+ return -1, err
+ }
+
+ return int(i), nil
+}
+
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+ // Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
+ // flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
+ // executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
+ // The original vm.memfd_noexec=2 implementation incorrectly silently
+ // allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
+ // kernels, we will get -EACCES if we try to use MFD_EXEC with
+ // vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
+ //
+ // The upshot is we only need to retry without MFD_EXEC on -EINVAL because
+ // it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
+ // kernels where -EINVAL is actually a security denial.
+ memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+ if err == unix.EINVAL {
+ memfd, err = unix.MemfdCreate(comment, flags)
+ }
+ if err != nil {
+ if err == unix.EACCES {
+ logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+ }
+ err := os.NewSyscallError("memfd_create", err)
+ return nil, fmt.Errorf("failed to create executable memfd: %w", err)
+ }
+ return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
+}
+
+// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
+// both (*os.File) as an optimisation to make copies faster.
+func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
+ dstFile, _ := dst.(*os.File)
+ srcFile, _ := src.(*os.File)
+
+ if dstFile != nil && srcFile != nil {
+ fi, err := srcFile.Stat()
+ if err != nil {
+ goto fallback
+ }
+ size := fi.Size()
+ for size > 0 {
+ n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
+ if n > 0 {
+ size -= int64(n)
+ copied += int64(n)
+ }
+ if err == unix.EINTR {
+ continue
+ }
+ if err != nil {
+ if copied == 0 {
+ // If we haven't copied anything so far, we can safely just
+ // fallback to io.Copy. We could always do the fallback but
+ // it's safer to error out in the case of a partial copy
+ // followed by an error (which should never happen).
+ goto fallback
+ }
+ return copied, fmt.Errorf("partial sendfile copy: %w", err)
+ }
+ }
+ return copied, nil
+ }
+
+fallback:
+ return io.Copy(dst, src)
+}
+
+// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation.
+// checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion.
+func SetLinuxPersonality(personality int) error {
+ _, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0)
+ if errno != 0 {
+ return &os.SyscallError{Syscall: "set_personality", Err: errno}
+ }
+ return nil
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
new file mode 100644
index 000000000..774443ec9
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
@@ -0,0 +1,127 @@
+package system
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+)
+
+// State is the status of a process.
+type State rune
+
+const ( // Only values for Linux 3.14 and later are listed here
+ Dead State = 'X'
+ DiskSleep State = 'D'
+ Running State = 'R'
+ Sleeping State = 'S'
+ Stopped State = 'T'
+ TracingStop State = 't'
+ Zombie State = 'Z'
+ Parked State = 'P'
+ Idle State = 'I'
+)
+
+// String forms of the state from proc(5)'s documentation for
+// /proc/[pid]/status' "State" field.
+func (s State) String() string {
+ switch s {
+ case Dead:
+ return "dead"
+ case DiskSleep:
+ return "disk sleep"
+ case Running:
+ return "running"
+ case Sleeping:
+ return "sleeping"
+ case Stopped:
+ return "stopped"
+ case TracingStop:
+ return "tracing stop"
+ case Zombie:
+ return "zombie"
+ case Parked:
+ return "parked"
+ case Idle:
+ return "idle" // kernel thread
+ default:
+ return fmt.Sprintf("unknown (%c)", s)
+ }
+}
+
+// Stat_t represents the information from /proc/[pid]/stat, as
+// described in proc(5) with names based on the /proc/[pid]/status
+// fields.
+type Stat_t struct {
+ // Name is the command run by the process.
+ Name string
+
+ // State is the state of the process.
+ State State
+
+ // StartTime is the number of clock ticks after system boot (since
+ // Linux 2.6).
+ StartTime uint64
+}
+
+// Stat returns a Stat_t instance for the specified process.
+func Stat(pid int) (stat Stat_t, err error) {
+ bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
+ if err != nil {
+ return stat, err
+ }
+ return parseStat(string(bytes))
+}
+
+func parseStat(data string) (stat Stat_t, err error) {
+ // Example:
+ // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // The fields are space-separated, see full description in proc(5).
+ //
+ // We are only interested in:
+ // * field 2: process name. It is the only field enclosed into
+ // parenthesis, as it can contain spaces (and parenthesis) inside.
+ // * field 3: process state, a single character (%c)
+ // * field 22: process start time, a long unsigned integer (%llu).
+
+ // 1. Look for the first '(' and the last ')' first, what's in between is Name.
+ // We expect at least 20 fields and a space after the last one.
+
+ const minAfterName = 20*2 + 1 // the min field is '0 '.
+
+ first := strings.IndexByte(data, '(')
+ if first < 0 || first+minAfterName >= len(data) {
+ return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
+ }
+
+ last := strings.LastIndexByte(data, ')')
+ if last <= first || last+minAfterName >= len(data) {
+ return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
+ }
+
+ stat.Name = data[first+1 : last]
+
+ // 2. Remove fields 1 and 2 and a space after. State is right after.
+ data = data[last+2:]
+ stat.State = State(data[0])
+
+ // 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces.
+ skipSpaces := 22 - 3
+ for first = 0; skipSpaces > 0 && first < len(data); first++ {
+ if data[first] == ' ' {
+ skipSpaces--
+ }
+ }
+ // Now first points to StartTime; look for space right after.
+ i := strings.IndexByte(data[first:], ' ')
+ if i < 0 {
+ return stat, fmt.Errorf("invalid stat data (too short): %q", data)
+ }
+ stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64)
+ if err != nil {
+ return stat, fmt.Errorf("invalid stat data (bad start time): %w", err)
+ }
+
+ return stat, nil
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
new file mode 100644
index 000000000..4595fa82a
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
@@ -0,0 +1,15 @@
+//go:build go1.23
+
+package system
+
+import (
+ "syscall"
+)
+
+// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. The argument
+// is process RLIMIT_NOFILE values. Relies on go.dev/cl/588076.
+func ClearRlimitNofileCache(lim *syscall.Rlimit) {
+ // Ignore the return values since we only need to clean the cache,
+ // the limit is going to be set via unix.Prlimit elsewhere.
+ _ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim)
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
new file mode 100644
index 000000000..865d18022
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
@@ -0,0 +1,27 @@
+//go:build !go1.23
+
+// TODO: remove this file once go 1.22 is no longer supported.
+
+package system
+
+import (
+ "sync/atomic"
+ "syscall"
+ _ "unsafe" // Needed for go:linkname to work.
+)
+
+//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile
+var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit]
+
+// ClearRlimitNofileCache clears go runtime's nofile rlimit cache.
+// The argument is process RLIMIT_NOFILE values.
+func ClearRlimitNofileCache(_ *syscall.Rlimit) {
+ // As reported in issue #4195, the new version of go runtime(since 1.19)
+ // will cache rlimit-nofile. Before executing execve, the rlimit-nofile
+ // of the process will be restored with the cache. In runc, this will
+ // cause the rlimit-nofile setting by the parent process for the container
+ // to become invalid. It can be solved by clearing this cache. But
+ // unfortunately, go stdlib doesn't provide such function, so we need to
+ // link to the private var `origRlimitNofile` in package syscall to hack.
+ syscallOrigRlimitNofile.Store(nil)
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
new file mode 100644
index 000000000..2edd1417a
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@@ -0,0 +1,119 @@
+package utils
+
+/*
+ * Copyright 2016, 2017 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+
+ "golang.org/x/sys/unix"
+)
+
+// MaxNameLen is the maximum length of the name of a file descriptor being sent
+// using SendFile. The name of the file handle returned by RecvFile will never be
+// larger than this value.
+const MaxNameLen = 4096
+
+// oobSpace is the size of the oob slice required to store a single FD. Note
+// that unix.UnixRights appears to make the assumption that fd is always int32,
+// so sizeof(fd) = 4.
+var oobSpace = unix.CmsgSpace(4)
+
+// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
+// socket. The file name of the remote file descriptor will be recreated
+// locally (it is sent as non-auxiliary data in the same payload).
+func RecvFile(socket *os.File) (_ *os.File, Err error) {
+ name := make([]byte, MaxNameLen)
+ oob := make([]byte, oobSpace)
+
+ sockfd := socket.Fd()
+ n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
+ if err != nil {
+ return nil, err
+ }
+ if n >= MaxNameLen || oobn != oobSpace {
+ return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
+ }
+ // Truncate.
+ name = name[:n]
+ oob = oob[:oobn]
+
+ scms, err := unix.ParseSocketControlMessage(oob)
+ if err != nil {
+ return nil, err
+ }
+
+ // We cannot control how many SCM_RIGHTS we receive, and upon receiving
+ // them all of the descriptors are installed in our fd table, so we need to
+ // parse all of the SCM_RIGHTS we received in order to close all of the
+ // descriptors on error.
+ var fds []int
+ defer func() {
+ for i, fd := range fds {
+ if i == 0 && Err == nil {
+ // Only close the first one on error.
+ continue
+ }
+ // Always close extra ones.
+ _ = unix.Close(fd)
+ }
+ }()
+ var lastErr error
+ for _, scm := range scms {
+ if scm.Header.Type == unix.SCM_RIGHTS {
+ scmFds, err := unix.ParseUnixRights(&scm)
+ if err != nil {
+ lastErr = err
+ } else {
+ fds = append(fds, scmFds...)
+ }
+ }
+ }
+ if lastErr != nil {
+ return nil, lastErr
+ }
+
+ // We do this after collecting the fds to make sure we close them all when
+ // returning an error here.
+ if len(scms) != 1 {
+ return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
+ }
+ if len(fds) != 1 {
+ return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
+ }
+ return os.NewFile(uintptr(fds[0]), string(name)), nil
+}
+
+// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
+// included so that if the other end uses RecvFile, the file will have the same
+// name information.
+func SendFile(socket *os.File, file *os.File) error {
+ name := file.Name()
+ if len(name) >= MaxNameLen {
+ return fmt.Errorf("sendfd: filename too long: %s", name)
+ }
+ err := SendRawFd(socket, name, file.Fd())
+ runtime.KeepAlive(file)
+ return err
+}
+
+// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
+func SendRawFd(socket *os.File, msg string, fd uintptr) error {
+ oob := unix.UnixRights(int(fd))
+ return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
new file mode 100644
index 000000000..db420ea68
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@@ -0,0 +1,115 @@
+package utils
+
+import (
+ "encoding/json"
+ "io"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ exitSignalOffset = 128
+)
+
+// ExitStatus returns the correct exit status for a process based on if it
+// was signaled or exited cleanly
+func ExitStatus(status unix.WaitStatus) int {
+ if status.Signaled() {
+ return exitSignalOffset + int(status.Signal())
+ }
+ return status.ExitStatus()
+}
+
+// WriteJSON writes the provided struct v to w using standard json marshaling
+// without a trailing newline. This is used instead of json.Encoder because
+// there might be a problem in json decoder in some cases, see:
+// https://github.com/docker/docker/issues/14203#issuecomment-174177790
+func WriteJSON(w io.Writer, v interface{}) error {
+ data, err := json.Marshal(v)
+ if err != nil {
+ return err
+ }
+ _, err = w.Write(data)
+ return err
+}
+
+// CleanPath makes a path safe for use with filepath.Join. This is done by not
+// only cleaning the path, but also (if the path is relative) adding a leading
+// '/' and cleaning it (then removing the leading '/'). This ensures that a
+// path resulting from prepending another path will always resolve to lexically
+// be a subdirectory of the prefixed path. This is all done lexically, so paths
+// that include symlinks won't be safe as a result of using CleanPath.
+func CleanPath(path string) string {
+ // Deal with empty strings nicely.
+ if path == "" {
+ return ""
+ }
+
+ // Ensure that all paths are cleaned (especially problematic ones like
+ // "/../../../../../" which can cause lots of issues).
+ path = filepath.Clean(path)
+
+ // If the path isn't absolute, we need to do more processing to fix paths
+ // such as "../../../..//some/path". We also shouldn't convert absolute
+ // paths to relative ones.
+ if !filepath.IsAbs(path) {
+ path = filepath.Clean(string(os.PathSeparator) + path)
+ // This can't fail, as (by definition) all paths are relative to root.
+ path, _ = filepath.Rel(string(os.PathSeparator), path)
+ }
+
+ // Clean the path again for good measure.
+ return filepath.Clean(path)
+}
+
+// stripRoot returns the passed path, stripping the root path if it was
+// (lexicially) inside it. Note that both passed paths will always be treated
+// as absolute, and the returned path will also always be absolute. In
+// addition, the paths are cleaned before stripping the root.
+func stripRoot(root, path string) string {
+ // Make the paths clean and absolute.
+ root, path = CleanPath("/"+root), CleanPath("/"+path)
+ switch {
+ case path == root:
+ path = "/"
+ case root == "/":
+ // do nothing
+ case strings.HasPrefix(path, root+"/"):
+ path = strings.TrimPrefix(path, root+"/")
+ }
+ return CleanPath("/" + path)
+}
+
+// SearchLabels searches through a list of key=value pairs for a given key,
+// returning its value, and the binary flag telling whether the key exist.
+func SearchLabels(labels []string, key string) (string, bool) {
+ key += "="
+ for _, s := range labels {
+ if strings.HasPrefix(s, key) {
+ return s[len(key):], true
+ }
+ }
+ return "", false
+}
+
+// Annotations returns the bundle path and user defined annotations from the
+// libcontainer state. We need to remove the bundle because that is a label
+// added by libcontainer.
+func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
+ userAnnotations = make(map[string]string)
+ for _, l := range labels {
+ name, value, ok := strings.Cut(l, "=")
+ if !ok {
+ continue
+ }
+ if name == "bundle" {
+ bundle = value
+ } else {
+ userAnnotations[name] = value
+ }
+ }
+ return
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
new file mode 100644
index 000000000..8f179b6aa
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@@ -0,0 +1,360 @@
+//go:build !windows
+
+package utils
+
+import (
+ "fmt"
+ "math"
+ "os"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "strings"
+ "sync"
+ _ "unsafe" // for go:linkname
+
+ securejoin "github.com/cyphar/filepath-securejoin"
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+// EnsureProcHandle returns whether or not the given file handle is on procfs.
+func EnsureProcHandle(fh *os.File) error {
+ var buf unix.Statfs_t
+ if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
+ return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
+ }
+ if buf.Type != unix.PROC_SUPER_MAGIC {
+ return fmt.Errorf("%s is not on procfs", fh.Name())
+ }
+ return nil
+}
+
+var (
+ haveCloseRangeCloexecBool bool
+ haveCloseRangeCloexecOnce sync.Once
+)
+
+func haveCloseRangeCloexec() bool {
+ haveCloseRangeCloexecOnce.Do(func() {
+ // Make sure we're not closing a random file descriptor.
+ tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
+ if err != nil {
+ return
+ }
+ defer unix.Close(tmpFd)
+
+ err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
+ // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
+ // -ENOSYS and -EINVAL ultimately mean we don't have support, but any
+ // other potential error would imply that even the most basic close
+ // operation wouldn't work.
+ haveCloseRangeCloexecBool = err == nil
+ })
+ return haveCloseRangeCloexecBool
+}
+
+type fdFunc func(fd int)
+
+// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
+// the current process.
+func fdRangeFrom(minFd int, fn fdFunc) error {
+ procSelfFd, closer := ProcThreadSelf("fd")
+ defer closer()
+
+ fdDir, err := os.Open(procSelfFd)
+ if err != nil {
+ return err
+ }
+ defer fdDir.Close()
+
+ if err := EnsureProcHandle(fdDir); err != nil {
+ return err
+ }
+
+ fdList, err := fdDir.Readdirnames(-1)
+ if err != nil {
+ return err
+ }
+ for _, fdStr := range fdList {
+ fd, err := strconv.Atoi(fdStr)
+ // Ignore non-numeric file names.
+ if err != nil {
+ continue
+ }
+ // Ignore descriptors lower than our specified minimum.
+ if fd < minFd {
+ continue
+ }
+ // Ignore the file descriptor we used for readdir, as it will be closed
+ // when we return.
+ if uintptr(fd) == fdDir.Fd() {
+ continue
+ }
+ // Run the closure.
+ fn(fd)
+ }
+ return nil
+}
+
+// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
+// equal to minFd in the current process.
+func CloseExecFrom(minFd int) error {
+ // Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
+ if haveCloseRangeCloexec() {
+ err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
+ return os.NewSyscallError("close_range", err)
+ }
+ // Otherwise, fall back to the standard loop.
+ return fdRangeFrom(minFd, unix.CloseOnExec)
+}
+
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
+
+// In order to make sure we do not close the internal epoll descriptors the Go
+// runtime uses, we need to ensure that we skip descriptors that match
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
+// unfortunately there's no other way to be sure we're only keeping the file
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
+func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
+
+// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
+// current process, except for those critical to Go's runtime (such as the
+// netpoll management descriptors).
+//
+// NOTE: That this function is incredibly dangerous to use in most Go code, as
+// closing file descriptors from underneath *os.File handles can lead to very
+// bad behaviour (the closed file descriptor can be re-used and then any
+// *os.File operations would apply to the wrong file). This function is only
+// intended to be called from the last stage of runc init.
+func UnsafeCloseFrom(minFd int) error {
+ // We cannot use close_range(2) even if it is available, because we must
+ // not close some file descriptors.
+ return fdRangeFrom(minFd, func(fd int) {
+ if runtime_IsPollDescriptor(uintptr(fd)) {
+ // These are the Go runtimes internal netpoll file descriptors.
+ // These file descriptors are operated on deep in the Go scheduler,
+ // and closing those files from underneath Go can result in panics.
+ // There is no issue with keeping them because they are not
+ // executable and are not useful to an attacker anyway. Also we
+ // don't have any choice.
+ return
+ }
+ // There's nothing we can do about errors from close(2), and the
+ // only likely error to be seen is EBADF which indicates the fd was
+ // already closed (in which case, we got what we wanted).
+ _ = unix.Close(fd)
+ })
+}
+
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
+func NewSockPair(name string) (parent, child *os.File, err error) {
+ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+ if err != nil {
+ return nil, nil, err
+ }
+ return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
+}
+
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+ // Remove the root then forcefully resolve inside the root.
+ unsafePath = stripRoot(root, unsafePath)
+ path, err := securejoin.SecureJoin(root, unsafePath)
+ if err != nil {
+ return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+ }
+
+ procSelfFd, closer := ProcThreadSelf("fd/")
+ defer closer()
+
+ // Open the target path.
+ fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return fmt.Errorf("open o_path procfd: %w", err)
+ }
+ defer fh.Close()
+
+ procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+ // Double-check the path is the one we expected.
+ if realpath, err := os.Readlink(procfd); err != nil {
+ return fmt.Errorf("procfd verification failed: %w", err)
+ } else if realpath != path {
+ return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+ }
+
+ return fn(procfd)
+}
+
+type ProcThreadSelfCloser func()
+
+var (
+ haveProcThreadSelf bool
+ haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+ haveProcThreadSelfOnce.Do(func() {
+ if _, err := os.Stat("/proc/thread-self/"); err == nil {
+ haveProcThreadSelf = true
+ } else {
+ logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err)
+ }
+ })
+
+ // We need to lock our thread until the caller is done with the path string
+ // because any non-atomic operation on the path (such as opening a file,
+ // then reading it) could be interrupted by the Go runtime where the
+ // underlying thread is swapped out and the original thread is killed,
+ // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+ // addition, the pre-3.17 fallback makes everything non-atomic because the
+ // same thing could happen between unix.Gettid() and the path operations.
+ //
+ // In theory, we don't need to lock in the atomic user case when using
+ // /proc/thread-self/, but it's better to be safe than sorry (and there are
+ // only one or two truly atomic users of /proc/thread-self/).
+ runtime.LockOSThread()
+
+ threadSelf := "/proc/thread-self/"
+ if !haveProcThreadSelf {
+ // Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+ threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+ if _, err := os.Stat(threadSelf); err != nil {
+ // Unfortunately, this code is called from rootfs_linux.go where we
+ // are running inside the pid namespace of the container but /proc
+ // is the host's procfs. Unfortunately there is no real way to get
+ // the correct tid to use here (the kernel age means we cannot do
+ // things like set up a private fsopen("proc") -- even scanning
+ // NSpid in all of the tasks in /proc/self/task/*/status requires
+ // Linux 4.1).
+ //
+ // So, we just have to assume that /proc/self is acceptable in this
+ // one specific case.
+ if os.Getpid() == 1 {
+ logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+ } else {
+ // This should never happen, but the fallback should work in most cases...
+ logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+ }
+ threadSelf = "/proc/self/"
+ }
+ }
+ return threadSelf + subpath, runtime.UnlockOSThread
+}
+
+// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to
+// create a /proc/thread-self handle for given file descriptor.
+//
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
+// without using fmt.Sprintf to avoid unneeded overhead.
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
+ return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
+}
+
+// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
+// but properly handling the case where path or root are "/".
+//
+// NOTE: The return value only make sense if the path doesn't contain "..".
+func IsLexicallyInRoot(root, path string) bool {
+ if root != "/" {
+ root += "/"
+ }
+ if path != "/" {
+ path += "/"
+ }
+ return strings.HasPrefix(path, root)
+}
+
+// MkdirAllInRootOpen attempts to make
+//
+// path, _ := securejoin.SecureJoin(root, unsafePath)
+// os.MkdirAll(path, mode)
+// os.Open(path)
+//
+// safer against attacks where components in the path are changed between
+// SecureJoin returning and MkdirAll (or Open) being called. In particular, we
+// try to detect any symlink components in the path while we are doing the
+// MkdirAll.
+//
+// NOTE: If unsafePath is a subpath of root, we assume that you have already
+// called SecureJoin and so we use the provided path verbatim without resolving
+// any symlinks (this is done in a way that avoids symlink-exchange races).
+// This means that the path also must not contain ".." elements, otherwise an
+// error will occur.
+//
+// This uses securejoin.MkdirAllHandle under the hood, but it has special
+// handling if unsafePath has already been scoped within the rootfs (this is
+// needed for a lot of runc callers and fixing this would require reworking a
+// lot of path logic).
+func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) {
+ // If the path is already "within" the root, get the path relative to the
+ // root and use that as the unsafe path. This is necessary because a lot of
+ // MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
+ // all of them to stop using these SecureJoin'd paths would require a fair
+ // amount of work.
+ // TODO(cyphar): Do the refactor to libpathrs once it's ready.
+ if IsLexicallyInRoot(root, unsafePath) {
+ subPath, err := filepath.Rel(root, unsafePath)
+ if err != nil {
+ return nil, err
+ }
+ unsafePath = subPath
+ }
+
+ // Check for any silly mode bits.
+ if mode&^0o7777 != 0 {
+ return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
+ }
+ // Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
+ // passed. While it would make sense to return an error in that case (since
+ // the user has asked for a mode that won't be applied), for compatibility
+ // reasons we have to ignore these bits.
+ if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
+ logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
+ mode &= 0o1777
+ }
+
+ rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return nil, fmt.Errorf("open root handle: %w", err)
+ }
+ defer rootDir.Close()
+
+ return securejoin.MkdirAllHandle(rootDir, unsafePath, mode)
+}
+
+// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
+// returned handle, for callers that don't need to use it.
+func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error {
+ f, err := MkdirAllInRootOpen(root, unsafePath, mode)
+ if err == nil {
+ _ = f.Close()
+ }
+ return err
+}
+
+// Openat is a Go-friendly openat(2) wrapper.
+func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
+ dirFd := unix.AT_FDCWD
+ if dir != nil {
+ dirFd = int(dir.Fd())
+ }
+ flags |= unix.O_CLOEXEC
+
+ fd, err := unix.Openat(dirFd, path, flags, mode)
+ if err != nil {
+ return nil, &os.PathError{Op: "openat", Path: path, Err: err}
+ }
+ return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index babcea307..f68615ce1 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -14,6 +14,9 @@ github.com/NVIDIA/go-nvml/pkg/nvml/mock
# github.com/cpuguy83/go-md2man/v2 v2.0.5
## explicit; go 1.11
github.com/cpuguy83/go-md2man/v2/md2man
+# github.com/cyphar/filepath-securejoin v0.4.1
+## explicit; go 1.18
+github.com/cyphar/filepath-securejoin
# github.com/davecgh/go-spew v1.1.1
## explicit
github.com/davecgh/go-spew/spew
@@ -30,6 +33,11 @@ github.com/google/uuid
# github.com/moby/sys/symlink v0.3.0
## explicit; go 1.17
github.com/moby/sys/symlink
+# github.com/opencontainers/runc v1.2.5
+## explicit; go 1.22
+github.com/opencontainers/runc/libcontainer/dmz
+github.com/opencontainers/runc/libcontainer/system
+github.com/opencontainers/runc/libcontainer/utils
# github.com/opencontainers/runtime-spec v1.2.0
## explicit
github.com/opencontainers/runtime-spec/specs-go
@@ -38,8 +46,6 @@ github.com/opencontainers/runtime-spec/specs-go
github.com/opencontainers/runtime-tools/generate
github.com/opencontainers/runtime-tools/generate/seccomp
github.com/opencontainers/runtime-tools/validate/capabilities
-# github.com/opencontainers/selinux v1.11.0
-## explicit; go 1.19
# github.com/pelletier/go-toml v1.9.5
## explicit; go 1.12
github.com/pelletier/go-toml