diff --git a/cmd/nvidia-cdi-hook/update-ldcache/container-root.go b/cmd/nvidia-cdi-hook/update-ldcache/container-root.go new file mode 100644 index 000000000..f62b3ae9a --- /dev/null +++ b/cmd/nvidia-cdi-hook/update-ldcache/container-root.go @@ -0,0 +1,76 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package ldcache + +import ( + "os" + "path/filepath" + + "github.com/moby/sys/symlink" +) + +// A containerRoot represents the root filesystem of a container. +type containerRoot string + +// hasPath checks whether the specified path exists in the root. +func (r containerRoot) hasPath(path string) bool { + resolved, err := r.resolve(path) + if err != nil { + return false + } + if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) { + return false + } + return true +} + +// globFiles matches the specified pattern in the root. +// The files that match must be regular files. +func (r containerRoot) globFiles(pattern string) ([]string, error) { + patternPath, err := r.resolve(pattern) + if err != nil { + return nil, err + } + matches, err := filepath.Glob(patternPath) + if err != nil { + return nil, err + } + var files []string + for _, match := range matches { + info, err := os.Lstat(match) + if err != nil { + return nil, err + } + // Ignore symlinks. + if info.Mode()&os.ModeSymlink != 0 { + continue + } + // Ignore directories. + if info.IsDir() { + continue + } + files = append(files, match) + } + return files, nil +} + +// resolve returns the absolute path including root path. +// Symlinks are resolved, but are guaranteed to resolve in the root. +func (r containerRoot) resolve(path string) (string, error) { + absolute := filepath.Clean(filepath.Join(string(r), path)) + return symlink.FollowSymlinkInScope(absolute, string(r)) +} diff --git a/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go new file mode 100644 index 000000000..01e0d8772 --- /dev/null +++ b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go @@ -0,0 +1,55 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package ldcache + +import ( + "fmt" + "os" + "strconv" + "syscall" + + "github.com/opencontainers/runc/libcontainer/dmz" +) + +func SafeExec(path string, args []string, envv []string) error { + safeExe, err := cloneBinary(path) + if err != nil { + return fmt.Errorf("failed to clone binary: %w", err) + } + defer safeExe.Close() + + exePath := "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) + + //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection + return syscall.Exec(exePath, args, envv) +} + +func cloneBinary(path string) (*os.File, error) { + exe, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("opening current binary: %w", err) + } + defer exe.Close() + + stat, err := exe.Stat() + if err != nil { + return nil, fmt.Errorf("checking /proc/self/exe size: %w", err) + } + size := stat.Size() + + return dmz.CloneBinary(exe, size, path, os.TempDir()) +} diff --git a/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go new file mode 100644 index 000000000..d11f20a7f --- /dev/null +++ b/cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go @@ -0,0 +1,26 @@ +//go:build !linux +// +build !linux + +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package ldcache + +import "syscall" + +func SafeExec(path string, args []string, envv []string) error { + return syscall.Exec(path, args, envv) +} diff --git a/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go b/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go index 7128aa6b5..abe100cc4 100644 --- a/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go +++ b/cmd/nvidia-cdi-hook/update-ldcache/update-ldcache.go @@ -22,7 +22,6 @@ import ( "os" "path/filepath" "strings" - "syscall" "github.com/urfave/cli/v2" @@ -131,9 +130,7 @@ func (m command) run(c *cli.Context, cfg *options) error { // Explicitly specify using /etc/ld.so.conf since the host's ldconfig may // be configured to use a different config file by default. args = append(args, "-f", "/etc/ld.so.conf") - - //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection - return syscall.Exec(ldconfigPath, args, nil) + return SafeExec(ldconfigPath, args, nil) } type root string diff --git a/go.mod b/go.mod index 4766aea61..c577e53af 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/NVIDIA/go-nvlib v0.7.1 github.com/NVIDIA/go-nvml v0.12.4-1 github.com/moby/sys/symlink v0.3.0 + github.com/opencontainers/runc v1.2.5 github.com/opencontainers/runtime-spec v1.2.0 github.com/pelletier/go-toml v1.9.5 github.com/sirupsen/logrus v1.9.3 @@ -19,13 +20,13 @@ require ( require ( github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect + github.com/cyphar/filepath-securejoin v0.4.1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect - github.com/opencontainers/selinux v1.11.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect diff --git a/go.sum b/go.sum index 228c2aa98..e1ec49d98 100644 --- a/go.sum +++ b/go.sum @@ -7,6 +7,8 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= +github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -31,6 +33,8 @@ github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34 github.com/moby/sys/symlink v0.3.0 h1:GZX89mEZ9u53f97npBy4Rc3vJKj7JBDj/PN2I22GrNU= github.com/moby/sys/symlink v0.3.0/go.mod h1:3eNdhduHmYPcgsJtZXW1W4XUJdZGBIkttZ8xKqPUJq0= github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/opencontainers/runc v1.2.5 h1:8KAkq3Wrem8bApgOHyhRI/8IeLXIfmZ6Qaw6DNSLnA4= +github.com/opencontainers/runc v1.2.5/go.mod h1:dOQeFo29xZKBNeRBI0B19mJtfHv68YgCTh1X+YphA+4= github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md new file mode 100644 index 000000000..ca0e3c62c --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md @@ -0,0 +1,256 @@ +# Changelog # +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/) +and this project adheres to [Semantic Versioning](http://semver.org/). + +## [Unreleased] ## + +## [0.4.1] - 2025-01-28 ## + +### Fixed ### +- The restrictions added for `root` paths passed to `SecureJoin` in 0.4.0 was + found to be too strict and caused some regressions when folks tried to + update, so this restriction has been relaxed to only return an error if the + path contains a `..` component. We still recommend users use `filepath.Clean` + (and even `filepath.EvalSymlinks`) on the `root` path they are using, but at + least you will no longer be punished for "trivial" unclean paths. + +## [0.4.0] - 2025-01-13 ## + +### Breaking #### +- `SecureJoin(VFS)` will now return an error if the provided `root` is not a + `filepath.Clean`'d path. + + While it is ultimately the responsibility of the caller to ensure the root is + a safe path to use, passing a path like `/symlink/..` as a root would result + in the `SecureJoin`'d path being placed in `/` even though `/symlink/..` + might be a different directory, and so we should more strongly discourage + such usage. + + All major users of `securejoin.SecureJoin` already ensure that the paths they + provide are safe (and this is ultimately a question of user error), but + removing this foot-gun is probably a good idea. Of course, this is + necessarily a breaking API change (though we expect no real users to be + affected by it). + + Thanks to [Erik Sjölund](https://github.com/eriksjolund), who initially + reported this issue as a possible security issue. + +- `MkdirAll` and `MkdirHandle` now take an `os.FileMode`-style mode argument + instead of a raw `unix.S_*`-style mode argument, which may cause compile-time + type errors depending on how you use `filepath-securejoin`. For most users, + there will be no change in behaviour aside from the type change (as the + bottom `0o777` bits are the same in both formats, and most users are probably + only using those bits). + + However, if you were using `unix.S_ISVTX` to set the sticky bit with + `MkdirAll(Handle)` you will need to switch to `os.ModeSticky` otherwise you + will get a runtime error with this update. In addition, the error message you + will get from passing `unix.S_ISUID` and `unix.S_ISGID` will be different as + they are treated as invalid bits now (note that previously passing said bits + was also an error). + +## [0.3.6] - 2024-12-17 ## + +### Compatibility ### +- The minimum Go version requirement for `filepath-securejoin` is now Go 1.18 + (we use generics internally). + + For reference, `filepath-securejoin@v0.3.0` somewhat-arbitrarily bumped the + Go version requirement to 1.21. + + While we did make some use of Go 1.21 stdlib features (and in principle Go + versions <= 1.21 are no longer even supported by upstream anymore), some + downstreams have complained that the version bump has meant that they have to + do workarounds when backporting fixes that use the new `filepath-securejoin` + API onto old branches. This is not an ideal situation, but since using this + library is probably better for most downstreams than a hand-rolled + workaround, we now have compatibility shims that allow us to build on older + Go versions. +- Lower minimum version requirement for `golang.org/x/sys` to `v0.18.0` (we + need the wrappers for `fsconfig(2)`), which should also make backporting + patches to older branches easier. + +## [0.3.5] - 2024-12-06 ## + +### Fixed ### +- `MkdirAll` will now no longer return an `EEXIST` error if two racing + processes are creating the same directory. We will still verify that the path + is a directory, but this will avoid spurious errors when multiple threads or + programs are trying to `MkdirAll` the same path. opencontainers/runc#4543 + +## [0.3.4] - 2024-10-09 ## + +### Fixed ### +- Previously, some testing mocks we had resulted in us doing `import "testing"` + in non-`_test.go` code, which made some downstreams like Kubernetes unhappy. + This has been fixed. (#32) + +## [0.3.3] - 2024-09-30 ## + +### Fixed ### +- The mode and owner verification logic in `MkdirAll` has been removed. This + was originally intended to protect against some theoretical attacks but upon + further consideration these protections don't actually buy us anything and + they were causing spurious errors with more complicated filesystem setups. +- The "is the created directory empty" logic in `MkdirAll` has also been + removed. This was not causing us issues yet, but some pseudofilesystems (such + as `cgroup`) create non-empty directories and so this logic would've been + wrong for such cases. + +## [0.3.2] - 2024-09-13 ## + +### Changed ### +- Passing the `S_ISUID` or `S_ISGID` modes to `MkdirAllInRoot` will now return + an explicit error saying that those bits are ignored by `mkdirat(2)`. In the + past a different error was returned, but since the silent ignoring behaviour + is codified in the man pages a more explicit error seems apt. While silently + ignoring these bits would be the most compatible option, it could lead to + users thinking their code sets these bits when it doesn't. Programs that need + to deal with compatibility can mask the bits themselves. (#23, #25) + +### Fixed ### +- If a directory has `S_ISGID` set, then all child directories will have + `S_ISGID` set when created and a different gid will be used for any inode + created under the directory. Previously, the "expected owner and mode" + validation in `securejoin.MkdirAll` did not correctly handle this. We now + correctly handle this case. (#24, #25) + +## [0.3.1] - 2024-07-23 ## + +### Changed ### +- By allowing `Open(at)InRoot` to opt-out of the extra work done by `MkdirAll` + to do the necessary "partial lookups", `Open(at)InRoot` now does less work + for both implementations (resulting in a many-fold decrease in the number of + operations for `openat2`, and a modest improvement for non-`openat2`) and is + far more guaranteed to match the correct `openat2(RESOLVE_IN_ROOT)` + behaviour. +- We now use `readlinkat(fd, "")` where possible. For `Open(at)InRoot` this + effectively just means that we no longer risk getting spurious errors during + rename races. However, for our hardened procfs handler, this in theory should + prevent mount attacks from tricking us when doing magic-link readlinks (even + when using the unsafe host `/proc` handle). Unfortunately `Reopen` is still + potentially vulnerable to those kinds of somewhat-esoteric attacks. + + Technically this [will only work on post-2.6.39 kernels][linux-readlinkat-emptypath] + but it seems incredibly unlikely anyone is using `filepath-securejoin` on a + pre-2011 kernel. + +### Fixed ### +- Several improvements were made to the errors returned by `Open(at)InRoot` and + `MkdirAll` when dealing with invalid paths under the emulated (ie. + non-`openat2`) implementation. Previously, some paths would return the wrong + error (`ENOENT` when the last component was a non-directory), and other paths + would be returned as though they were acceptable (trailing-slash components + after a non-directory would be ignored by `Open(at)InRoot`). + + These changes were done to match `openat2`'s behaviour and purely is a + consistency fix (most users are going to be using `openat2` anyway). + +[linux-readlinkat-emptypath]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=65cfc6722361570bfe255698d9cd4dccaf47570d + +## [0.3.0] - 2024-07-11 ## + +### Added ### +- A new set of `*os.File`-based APIs have been added. These are adapted from + [libpathrs][] and we strongly suggest using them if possible (as they provide + far more protection against attacks than `SecureJoin`): + + - `Open(at)InRoot` resolves a path inside a rootfs and returns an `*os.File` + handle to the path. Note that the handle returned is an `O_PATH` handle, + which cannot be used for reading or writing (as well as some other + operations -- [see open(2) for more details][open.2]) + + - `Reopen` takes an `O_PATH` file handle and safely re-opens it to upgrade + it to a regular handle. This can also be used with non-`O_PATH` handles, + but `O_PATH` is the most obvious application. + + - `MkdirAll` is an implementation of `os.MkdirAll` that is safe to use to + create a directory tree within a rootfs. + + As these are new APIs, they may change in the future. However, they should be + safe to start migrating to as we have extensive tests ensuring they behave + correctly and are safe against various races and other attacks. + +[libpathrs]: https://github.com/openSUSE/libpathrs +[open.2]: https://www.man7.org/linux/man-pages/man2/open.2.html + +## [0.2.5] - 2024-05-03 ## + +### Changed ### +- Some minor changes were made to how lexical components (like `..` and `.`) + are handled during path generation in `SecureJoin`. There is no behaviour + change as a result of this fix (the resulting paths are the same). + +### Fixed ### +- The error returned when we hit a symlink loop now references the correct + path. (#10) + +## [0.2.4] - 2023-09-06 ## + +### Security ### +- This release fixes a potential security issue in filepath-securejoin when + used on Windows ([GHSA-6xv5-86q9-7xr8][], which could be used to generate + paths outside of the provided rootfs in certain cases), as well as improving + the overall behaviour of filepath-securejoin when dealing with Windows paths + that contain volume names. Thanks to Paulo Gomes for discovering and fixing + these issues. + +### Fixed ### +- Switch to GitHub Actions for CI so we can test on Windows as well as Linux + and MacOS. + +[GHSA-6xv5-86q9-7xr8]: https://github.com/advisories/GHSA-6xv5-86q9-7xr8 + +## [0.2.3] - 2021-06-04 ## + +### Changed ### +- Switch to Go 1.13-style `%w` error wrapping, letting us drop the dependency + on `github.com/pkg/errors`. + +## [0.2.2] - 2018-09-05 ## + +### Changed ### +- Use `syscall.ELOOP` as the base error for symlink loops, rather than our own + (internal) error. This allows callers to more easily use `errors.Is` to check + for this case. + +## [0.2.1] - 2018-09-05 ## + +### Fixed ### +- Use our own `IsNotExist` implementation, which lets us handle `ENOTDIR` + properly within `SecureJoin`. + +## [0.2.0] - 2017-07-19 ## + +We now have 100% test coverage! + +### Added ### +- Add a `SecureJoinVFS` API that can be used for mocking (as we do in our new + tests) or for implementing custom handling of lookup operations (such as for + rootless containers, where work is necessary to access directories with weird + modes because we don't have `CAP_DAC_READ_SEARCH` or `CAP_DAC_OVERRIDE`). + +## 0.1.0 - 2017-07-19 + +This is our first release of `github.com/cyphar/filepath-securejoin`, +containing a full implementation with a coverage of 93.5% (the only missing +cases are the error cases, which are hard to mocktest at the moment). + +[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...HEAD +[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1 +[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0 +[0.3.6]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.5...v0.3.6 +[0.3.5]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.4...v0.3.5 +[0.3.4]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.3...v0.3.4 +[0.3.3]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.2...v0.3.3 +[0.3.2]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.1...v0.3.2 +[0.3.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.0...v0.3.1 +[0.3.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.5...v0.3.0 +[0.2.5]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.4...v0.2.5 +[0.2.4]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.3...v0.2.4 +[0.2.3]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.2...v0.2.3 +[0.2.2]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.1...v0.2.2 +[0.2.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.2.0...v0.2.1 +[0.2.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.1.0...v0.2.0 diff --git a/vendor/github.com/cyphar/filepath-securejoin/LICENSE b/vendor/github.com/cyphar/filepath-securejoin/LICENSE new file mode 100644 index 000000000..cb1ab88da --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/LICENSE @@ -0,0 +1,28 @@ +Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved. +Copyright (C) 2017-2024 SUSE LLC. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/cyphar/filepath-securejoin/README.md b/vendor/github.com/cyphar/filepath-securejoin/README.md new file mode 100644 index 000000000..eaeb53fcd --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/README.md @@ -0,0 +1,169 @@ +## `filepath-securejoin` ## + +[![Go Documentation](https://pkg.go.dev/badge/github.com/cyphar/filepath-securejoin.svg)](https://pkg.go.dev/github.com/cyphar/filepath-securejoin) +[![Build Status](https://github.com/cyphar/filepath-securejoin/actions/workflows/ci.yml/badge.svg)](https://github.com/cyphar/filepath-securejoin/actions/workflows/ci.yml) + +### Old API ### + +This library was originally just an implementation of `SecureJoin` which was +[intended to be included in the Go standard library][go#20126] as a safer +`filepath.Join` that would restrict the path lookup to be inside a root +directory. + +The implementation was based on code that existed in several container +runtimes. Unfortunately, this API is **fundamentally unsafe** against attackers +that can modify path components after `SecureJoin` returns and before the +caller uses the path, allowing for some fairly trivial TOCTOU attacks. + +`SecureJoin` (and `SecureJoinVFS`) are still provided by this library to +support legacy users, but new users are strongly suggested to avoid using +`SecureJoin` and instead use the [new api](#new-api) or switch to +[libpathrs][libpathrs]. + +With the above limitations in mind, this library guarantees the following: + +* If no error is set, the resulting string **must** be a child path of + `root` and will not contain any symlink path components (they will all be + expanded). + +* When expanding symlinks, all symlink path components **must** be resolved + relative to the provided root. In particular, this can be considered a + userspace implementation of how `chroot(2)` operates on file paths. Note that + these symlinks will **not** be expanded lexically (`filepath.Clean` is not + called on the input before processing). + +* Non-existent path components are unaffected by `SecureJoin` (similar to + `filepath.EvalSymlinks`'s semantics). + +* The returned path will always be `filepath.Clean`ed and thus not contain any + `..` components. + +A (trivial) implementation of this function on GNU/Linux systems could be done +with the following (note that this requires root privileges and is far more +opaque than the implementation in this library, and also requires that +`readlink` is inside the `root` path and is trustworthy): + +```go +package securejoin + +import ( + "os/exec" + "path/filepath" +) + +func SecureJoin(root, unsafePath string) (string, error) { + unsafePath = string(filepath.Separator) + unsafePath + cmd := exec.Command("chroot", root, + "readlink", "--canonicalize-missing", "--no-newline", unsafePath) + output, err := cmd.CombinedOutput() + if err != nil { + return "", err + } + expanded := string(output) + return filepath.Join(root, expanded), nil +} +``` + +[libpathrs]: https://github.com/openSUSE/libpathrs +[go#20126]: https://github.com/golang/go/issues/20126 + +### New API ### + +While we recommend users switch to [libpathrs][libpathrs] as soon as it has a +stable release, some methods implemented by libpathrs have been ported to this +library to ease the transition. These APIs are only supported on Linux. + +These APIs are implemented such that `filepath-securejoin` will +opportunistically use certain newer kernel APIs that make these operations far +more secure. In particular: + +* All of the lookup operations will use [`openat2`][openat2.2] on new enough + kernels (Linux 5.6 or later) to restrict lookups through magic-links and + bind-mounts (for certain operations) and to make use of `RESOLVE_IN_ROOT` to + efficiently resolve symlinks within a rootfs. + +* The APIs provide hardening against a malicious `/proc` mount to either detect + or avoid being tricked by a `/proc` that is not legitimate. This is done + using [`openat2`][openat2.2] for all users, and privileged users will also be + further protected by using [`fsopen`][fsopen.2] and [`open_tree`][open_tree.2] + (Linux 5.2 or later). + +[openat2.2]: https://www.man7.org/linux/man-pages/man2/openat2.2.html +[fsopen.2]: https://github.com/brauner/man-pages-md/blob/main/fsopen.md +[open_tree.2]: https://github.com/brauner/man-pages-md/blob/main/open_tree.md + +#### `OpenInRoot` #### + +```go +func OpenInRoot(root, unsafePath string) (*os.File, error) +func OpenatInRoot(root *os.File, unsafePath string) (*os.File, error) +func Reopen(handle *os.File, flags int) (*os.File, error) +``` + +`OpenInRoot` is a much safer version of + +```go +path, err := securejoin.SecureJoin(root, unsafePath) +file, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC) +``` + +that protects against various race attacks that could lead to serious security +issues, depending on the application. Note that the returned `*os.File` is an +`O_PATH` file descriptor, which is quite restricted. Callers will probably need +to use `Reopen` to get a more usable handle (this split is done to provide +useful features like PTY spawning and to avoid users accidentally opening bad +inodes that could cause a DoS). + +Callers need to be careful in how they use the returned `*os.File`. Usually it +is only safe to operate on the handle directly, and it is very easy to create a +security issue. [libpathrs][libpathrs] provides far more helpers to make using +these handles safer -- there is currently no plan to port them to +`filepath-securejoin`. + +`OpenatInRoot` is like `OpenInRoot` except that the root is provided using an +`*os.File`. This allows you to ensure that multiple `OpenatInRoot` (or +`MkdirAllHandle`) calls are operating on the same rootfs. + +> **NOTE**: Unlike `SecureJoin`, `OpenInRoot` will error out as soon as it hits +> a dangling symlink or non-existent path. This is in contrast to `SecureJoin` +> which treated non-existent components as though they were real directories, +> and would allow for partial resolution of dangling symlinks. These behaviours +> are at odds with how Linux treats non-existent paths and dangling symlinks, +> and so these are no longer allowed. + +#### `MkdirAll` #### + +```go +func MkdirAll(root, unsafePath string, mode int) error +func MkdirAllHandle(root *os.File, unsafePath string, mode int) (*os.File, error) +``` + +`MkdirAll` is a much safer version of + +```go +path, err := securejoin.SecureJoin(root, unsafePath) +err = os.MkdirAll(path, mode) +``` + +that protects against the same kinds of races that `OpenInRoot` protects +against. + +`MkdirAllHandle` is like `MkdirAll` except that the root is provided using an +`*os.File` (the reason for this is the same as with `OpenatInRoot`) and an +`*os.File` of the final created directory is returned (this directory is +guaranteed to be effectively identical to the directory created by +`MkdirAllHandle`, which is not possible to ensure by just using `OpenatInRoot` +after `MkdirAll`). + +> **NOTE**: Unlike `SecureJoin`, `MkdirAll` will error out as soon as it hits +> a dangling symlink or non-existent path. This is in contrast to `SecureJoin` +> which treated non-existent components as though they were real directories, +> and would allow for partial resolution of dangling symlinks. These behaviours +> are at odds with how Linux treats non-existent paths and dangling symlinks, +> and so these are no longer allowed. This means that `MkdirAll` will not +> create non-existent directories referenced by a dangling symlink. + +### License ### + +The license of this project is the same as Go, which is a BSD 3-clause license +available in the `LICENSE` file. diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION new file mode 100644 index 000000000..267577d47 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION @@ -0,0 +1 @@ +0.4.1 diff --git a/vendor/github.com/cyphar/filepath-securejoin/doc.go b/vendor/github.com/cyphar/filepath-securejoin/doc.go new file mode 100644 index 000000000..1ec7d065e --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/doc.go @@ -0,0 +1,39 @@ +// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved. +// Copyright (C) 2017-2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package securejoin implements a set of helpers to make it easier to write Go +// code that is safe against symlink-related escape attacks. The primary idea +// is to let you resolve a path within a rootfs directory as if the rootfs was +// a chroot. +// +// securejoin has two APIs, a "legacy" API and a "modern" API. +// +// The legacy API is [SecureJoin] and [SecureJoinVFS]. These methods are +// **not** safe against race conditions where an attacker changes the +// filesystem after (or during) the [SecureJoin] operation. +// +// The new API is made up of [OpenInRoot] and [MkdirAll] (and derived +// functions). These are safe against racing attackers and have several other +// protections that are not provided by the legacy API. There are many more +// operations that most programs expect to be able to do safely, but we do not +// provide explicit support for them because we want to encourage users to +// switch to [libpathrs](https://github.com/openSUSE/libpathrs) which is a +// cross-language next-generation library that is entirely designed around +// operating on paths safely. +// +// securejoin has been used by several container runtimes (Docker, runc, +// Kubernetes, etc) for quite a few years as a de-facto standard for operating +// on container filesystem paths "safely". However, most users still use the +// legacy API which is unsafe against various attacks (there is a fairly long +// history of CVEs in dependent as a result). Users should switch to the modern +// API as soon as possible (or even better, switch to libpathrs). +// +// This project was initially intended to be included in the Go standard +// library, but [it was rejected](https://go.dev/issue/20126). There is now a +// [new Go proposal](https://go.dev/issue/67002) for a safe path resolution API +// that shares some of the goals of filepath-securejoin. However, that design +// is intended to work like `openat2(RESOLVE_BENEATH)` which does not fit the +// usecase of container runtimes and most system tools. +package securejoin diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_go120.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_go120.go new file mode 100644 index 000000000..42452bbf9 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_go120.go @@ -0,0 +1,18 @@ +//go:build linux && go1.20 + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "fmt" +) + +// wrapBaseError is a helper that is equivalent to fmt.Errorf("%w: %w"), except +// that on pre-1.20 Go versions only errors.Is() works properly (errors.Unwrap) +// is only guaranteed to give you baseErr. +func wrapBaseError(baseErr, extraErr error) error { + return fmt.Errorf("%w: %w", extraErr, baseErr) +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_unsupported.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_unsupported.go new file mode 100644 index 000000000..e7adca3fd --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_errors_unsupported.go @@ -0,0 +1,38 @@ +//go:build linux && !go1.20 + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "fmt" +) + +type wrappedError struct { + inner error + isError error +} + +func (err wrappedError) Is(target error) bool { + return err.isError == target +} + +func (err wrappedError) Unwrap() error { + return err.inner +} + +func (err wrappedError) Error() string { + return fmt.Sprintf("%v: %v", err.isError, err.inner) +} + +// wrapBaseError is a helper that is equivalent to fmt.Errorf("%w: %w"), except +// that on pre-1.20 Go versions only errors.Is() works properly (errors.Unwrap) +// is only guaranteed to give you baseErr. +func wrapBaseError(baseErr, extraErr error) error { + return wrappedError{ + inner: baseErr, + isError: extraErr, + } +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_go121.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_go121.go new file mode 100644 index 000000000..ddd6fa9a4 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_go121.go @@ -0,0 +1,32 @@ +//go:build linux && go1.21 + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "slices" + "sync" +) + +func slices_DeleteFunc[S ~[]E, E any](slice S, delFn func(E) bool) S { + return slices.DeleteFunc(slice, delFn) +} + +func slices_Contains[S ~[]E, E comparable](slice S, val E) bool { + return slices.Contains(slice, val) +} + +func slices_Clone[S ~[]E, E any](slice S) S { + return slices.Clone(slice) +} + +func sync_OnceValue[T any](f func() T) func() T { + return sync.OnceValue(f) +} + +func sync_OnceValues[T1, T2 any](f func() (T1, T2)) func() (T1, T2) { + return sync.OnceValues(f) +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_unsupported.go b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_unsupported.go new file mode 100644 index 000000000..f1e6fe7e7 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/gocompat_generics_unsupported.go @@ -0,0 +1,124 @@ +//go:build linux && !go1.21 + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "sync" +) + +// These are very minimal implementations of functions that appear in Go 1.21's +// stdlib, included so that we can build on older Go versions. Most are +// borrowed directly from the stdlib, and a few are modified to be "obviously +// correct" without needing to copy too many other helpers. + +// clearSlice is equivalent to the builtin clear from Go 1.21. +// Copied from the Go 1.24 stdlib implementation. +func clearSlice[S ~[]E, E any](slice S) { + var zero E + for i := range slice { + slice[i] = zero + } +} + +// Copied from the Go 1.24 stdlib implementation. +func slices_IndexFunc[S ~[]E, E any](s S, f func(E) bool) int { + for i := range s { + if f(s[i]) { + return i + } + } + return -1 +} + +// Copied from the Go 1.24 stdlib implementation. +func slices_DeleteFunc[S ~[]E, E any](s S, del func(E) bool) S { + i := slices_IndexFunc(s, del) + if i == -1 { + return s + } + // Don't start copying elements until we find one to delete. + for j := i + 1; j < len(s); j++ { + if v := s[j]; !del(v) { + s[i] = v + i++ + } + } + clearSlice(s[i:]) // zero/nil out the obsolete elements, for GC + return s[:i] +} + +// Similar to the stdlib slices.Contains, except that we don't have +// slices.Index so we need to use slices.IndexFunc for this non-Func helper. +func slices_Contains[S ~[]E, E comparable](s S, v E) bool { + return slices_IndexFunc(s, func(e E) bool { return e == v }) >= 0 +} + +// Copied from the Go 1.24 stdlib implementation. +func slices_Clone[S ~[]E, E any](s S) S { + // Preserve nil in case it matters. + if s == nil { + return nil + } + return append(S([]E{}), s...) +} + +// Copied from the Go 1.24 stdlib implementation. +func sync_OnceValue[T any](f func() T) func() T { + var ( + once sync.Once + valid bool + p any + result T + ) + g := func() { + defer func() { + p = recover() + if !valid { + panic(p) + } + }() + result = f() + f = nil + valid = true + } + return func() T { + once.Do(g) + if !valid { + panic(p) + } + return result + } +} + +// Copied from the Go 1.24 stdlib implementation. +func sync_OnceValues[T1, T2 any](f func() (T1, T2)) func() (T1, T2) { + var ( + once sync.Once + valid bool + p any + r1 T1 + r2 T2 + ) + g := func() { + defer func() { + p = recover() + if !valid { + panic(p) + } + }() + r1, r2 = f() + f = nil + valid = true + } + return func() (T1, T2) { + once.Do(g) + if !valid { + panic(p) + } + return r1, r2 + } +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/join.go b/vendor/github.com/cyphar/filepath-securejoin/join.go new file mode 100644 index 000000000..e6634d477 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/join.go @@ -0,0 +1,166 @@ +// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved. +// Copyright (C) 2017-2025 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "errors" + "os" + "path/filepath" + "strings" + "syscall" +) + +const maxSymlinkLimit = 255 + +// IsNotExist tells you if err is an error that implies that either the path +// accessed does not exist (or path components don't exist). This is +// effectively a more broad version of [os.IsNotExist]. +func IsNotExist(err error) bool { + // Check that it's not actually an ENOTDIR, which in some cases is a more + // convoluted case of ENOENT (usually involving weird paths). + return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOTDIR) || errors.Is(err, syscall.ENOENT) +} + +// errUnsafeRoot is returned if the user provides SecureJoinVFS with a path +// that contains ".." components. +var errUnsafeRoot = errors.New("root path provided to SecureJoin contains '..' components") + +// stripVolume just gets rid of the Windows volume included in a path. Based on +// some godbolt tests, the Go compiler is smart enough to make this a no-op on +// Linux. +func stripVolume(path string) string { + return path[len(filepath.VolumeName(path)):] +} + +// hasDotDot checks if the path contains ".." components in a platform-agnostic +// way. +func hasDotDot(path string) bool { + // If we are on Windows, strip any volume letters. It turns out that + // C:..\foo may (or may not) be a valid pathname and we need to handle that + // leading "..". + path = stripVolume(path) + // Look for "/../" in the path, but we need to handle leading and trailing + // ".."s by adding separators. Doing this with filepath.Separator is ugly + // so just convert to Unix-style "/" first. + path = filepath.ToSlash(path) + return strings.Contains("/"+path+"/", "/../") +} + +// SecureJoinVFS joins the two given path components (similar to [filepath.Join]) except +// that the returned path is guaranteed to be scoped inside the provided root +// path (when evaluated). Any symbolic links in the path are evaluated with the +// given root treated as the root of the filesystem, similar to a chroot. The +// filesystem state is evaluated through the given [VFS] interface (if nil, the +// standard [os].* family of functions are used). +// +// Note that the guarantees provided by this function only apply if the path +// components in the returned string are not modified (in other words are not +// replaced with symlinks on the filesystem) after this function has returned. +// Such a symlink race is necessarily out-of-scope of SecureJoinVFS. +// +// NOTE: Due to the above limitation, Linux users are strongly encouraged to +// use [OpenInRoot] instead, which does safely protect against these kinds of +// attacks. There is no way to solve this problem with SecureJoinVFS because +// the API is fundamentally wrong (you cannot return a "safe" path string and +// guarantee it won't be modified afterwards). +// +// Volume names in unsafePath are always discarded, regardless if they are +// provided via direct input or when evaluating symlinks. Therefore: +// +// "C:\Temp" + "D:\path\to\file.txt" results in "C:\Temp\path\to\file.txt" +// +// If the provided root is not [filepath.Clean] then an error will be returned, +// as such root paths are bordering on somewhat unsafe and using such paths is +// not best practice. We also strongly suggest that any root path is first +// fully resolved using [filepath.EvalSymlinks] or otherwise constructed to +// avoid containing symlink components. Of course, the root also *must not* be +// attacker-controlled. +func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) { + // The root path must not contain ".." components, otherwise when we join + // the subpath we will end up with a weird path. We could work around this + // in other ways but users shouldn't be giving us non-lexical root paths in + // the first place. + if hasDotDot(root) { + return "", errUnsafeRoot + } + + // Use the os.* VFS implementation if none was specified. + if vfs == nil { + vfs = osVFS{} + } + + unsafePath = filepath.FromSlash(unsafePath) + var ( + currentPath string + remainingPath = unsafePath + linksWalked int + ) + for remainingPath != "" { + // On Windows, if we managed to end up at a path referencing a volume, + // drop the volume to make sure we don't end up with broken paths or + // escaping the root volume. + remainingPath = stripVolume(remainingPath) + + // Get the next path component. + var part string + if i := strings.IndexRune(remainingPath, filepath.Separator); i == -1 { + part, remainingPath = remainingPath, "" + } else { + part, remainingPath = remainingPath[:i], remainingPath[i+1:] + } + + // Apply the component lexically to the path we are building. + // currentPath does not contain any symlinks, and we are lexically + // dealing with a single component, so it's okay to do a filepath.Clean + // here. + nextPath := filepath.Join(string(filepath.Separator), currentPath, part) + if nextPath == string(filepath.Separator) { + currentPath = "" + continue + } + fullPath := root + string(filepath.Separator) + nextPath + + // Figure out whether the path is a symlink. + fi, err := vfs.Lstat(fullPath) + if err != nil && !IsNotExist(err) { + return "", err + } + // Treat non-existent path components the same as non-symlinks (we + // can't do any better here). + if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 { + currentPath = nextPath + continue + } + + // It's a symlink, so get its contents and expand it by prepending it + // to the yet-unparsed path. + linksWalked++ + if linksWalked > maxSymlinkLimit { + return "", &os.PathError{Op: "SecureJoin", Path: root + string(filepath.Separator) + unsafePath, Err: syscall.ELOOP} + } + + dest, err := vfs.Readlink(fullPath) + if err != nil { + return "", err + } + remainingPath = dest + string(filepath.Separator) + remainingPath + // Absolute symlinks reset any work we've already done. + if filepath.IsAbs(dest) { + currentPath = "" + } + } + + // There should be no lexical components like ".." left in the path here, + // but for safety clean up the path before joining it to the root. + finalPath := filepath.Join(string(filepath.Separator), currentPath) + return filepath.Join(root, finalPath), nil +} + +// SecureJoin is a wrapper around [SecureJoinVFS] that just uses the [os].* library +// of functions as the [VFS]. If in doubt, use this function over [SecureJoinVFS]. +func SecureJoin(root, unsafePath string) (string, error) { + return SecureJoinVFS(root, unsafePath, nil) +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/lookup_linux.go b/vendor/github.com/cyphar/filepath-securejoin/lookup_linux.go new file mode 100644 index 000000000..be81e498d --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/lookup_linux.go @@ -0,0 +1,388 @@ +//go:build linux + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "errors" + "fmt" + "os" + "path" + "path/filepath" + "strings" + + "golang.org/x/sys/unix" +) + +type symlinkStackEntry struct { + // (dir, remainingPath) is what we would've returned if the link didn't + // exist. This matches what openat2(RESOLVE_IN_ROOT) would return in + // this case. + dir *os.File + remainingPath string + // linkUnwalked is the remaining path components from the original + // Readlink which we have yet to walk. When this slice is empty, we + // drop the link from the stack. + linkUnwalked []string +} + +func (se symlinkStackEntry) String() string { + return fmt.Sprintf("<%s>/%s [->%s]", se.dir.Name(), se.remainingPath, strings.Join(se.linkUnwalked, "/")) +} + +func (se symlinkStackEntry) Close() { + _ = se.dir.Close() +} + +type symlinkStack []*symlinkStackEntry + +func (s *symlinkStack) IsEmpty() bool { + return s == nil || len(*s) == 0 +} + +func (s *symlinkStack) Close() { + if s != nil { + for _, link := range *s { + link.Close() + } + // TODO: Switch to clear once we switch to Go 1.21. + *s = nil + } +} + +var ( + errEmptyStack = errors.New("[internal] stack is empty") + errBrokenSymlinkStack = errors.New("[internal error] broken symlink stack") +) + +func (s *symlinkStack) popPart(part string) error { + if s == nil || s.IsEmpty() { + // If there is nothing in the symlink stack, then the part was from the + // real path provided by the user, and this is a no-op. + return errEmptyStack + } + if part == "." { + // "." components are no-ops -- we drop them when doing SwapLink. + return nil + } + + tailEntry := (*s)[len(*s)-1] + + // Double-check that we are popping the component we expect. + if len(tailEntry.linkUnwalked) == 0 { + return fmt.Errorf("%w: trying to pop component %q of empty stack entry %s", errBrokenSymlinkStack, part, tailEntry) + } + headPart := tailEntry.linkUnwalked[0] + if headPart != part { + return fmt.Errorf("%w: trying to pop component %q but the last stack entry is %s (%q)", errBrokenSymlinkStack, part, tailEntry, headPart) + } + + // Drop the component, but keep the entry around in case we are dealing + // with a "tail-chained" symlink. + tailEntry.linkUnwalked = tailEntry.linkUnwalked[1:] + return nil +} + +func (s *symlinkStack) PopPart(part string) error { + if err := s.popPart(part); err != nil { + if errors.Is(err, errEmptyStack) { + // Skip empty stacks. + err = nil + } + return err + } + + // Clean up any of the trailing stack entries that are empty. + for lastGood := len(*s) - 1; lastGood >= 0; lastGood-- { + entry := (*s)[lastGood] + if len(entry.linkUnwalked) > 0 { + break + } + entry.Close() + (*s) = (*s)[:lastGood] + } + return nil +} + +func (s *symlinkStack) push(dir *os.File, remainingPath, linkTarget string) error { + if s == nil { + return nil + } + // Split the link target and clean up any "" parts. + linkTargetParts := slices_DeleteFunc( + strings.Split(linkTarget, "/"), + func(part string) bool { return part == "" || part == "." }) + + // Copy the directory so the caller doesn't close our copy. + dirCopy, err := dupFile(dir) + if err != nil { + return err + } + + // Add to the stack. + *s = append(*s, &symlinkStackEntry{ + dir: dirCopy, + remainingPath: remainingPath, + linkUnwalked: linkTargetParts, + }) + return nil +} + +func (s *symlinkStack) SwapLink(linkPart string, dir *os.File, remainingPath, linkTarget string) error { + // If we are currently inside a symlink resolution, remove the symlink + // component from the last symlink entry, but don't remove the entry even + // if it's empty. If we are a "tail-chained" symlink (a trailing symlink we + // hit during a symlink resolution) we need to keep the old symlink until + // we finish the resolution. + if err := s.popPart(linkPart); err != nil { + if !errors.Is(err, errEmptyStack) { + return err + } + // Push the component regardless of whether the stack was empty. + } + return s.push(dir, remainingPath, linkTarget) +} + +func (s *symlinkStack) PopTopSymlink() (*os.File, string, bool) { + if s == nil || s.IsEmpty() { + return nil, "", false + } + tailEntry := (*s)[0] + *s = (*s)[1:] + return tailEntry.dir, tailEntry.remainingPath, true +} + +// partialLookupInRoot tries to lookup as much of the request path as possible +// within the provided root (a-la RESOLVE_IN_ROOT) and opens the final existing +// component of the requested path, returning a file handle to the final +// existing component and a string containing the remaining path components. +func partialLookupInRoot(root *os.File, unsafePath string) (*os.File, string, error) { + return lookupInRoot(root, unsafePath, true) +} + +func completeLookupInRoot(root *os.File, unsafePath string) (*os.File, error) { + handle, remainingPath, err := lookupInRoot(root, unsafePath, false) + if remainingPath != "" && err == nil { + // should never happen + err = fmt.Errorf("[bug] non-empty remaining path when doing a non-partial lookup: %q", remainingPath) + } + // lookupInRoot(partial=false) will always close the handle if an error is + // returned, so no need to double-check here. + return handle, err +} + +func lookupInRoot(root *os.File, unsafePath string, partial bool) (Handle *os.File, _ string, _ error) { + unsafePath = filepath.ToSlash(unsafePath) // noop + + // This is very similar to SecureJoin, except that we operate on the + // components using file descriptors. We then return the last component we + // managed open, along with the remaining path components not opened. + + // Try to use openat2 if possible. + if hasOpenat2() { + return lookupOpenat2(root, unsafePath, partial) + } + + // Get the "actual" root path from /proc/self/fd. This is necessary if the + // root is some magic-link like /proc/$pid/root, in which case we want to + // make sure when we do checkProcSelfFdPath that we are using the correct + // root path. + logicalRootPath, err := procSelfFdReadlink(root) + if err != nil { + return nil, "", fmt.Errorf("get real root path: %w", err) + } + + currentDir, err := dupFile(root) + if err != nil { + return nil, "", fmt.Errorf("clone root fd: %w", err) + } + defer func() { + // If a handle is not returned, close the internal handle. + if Handle == nil { + _ = currentDir.Close() + } + }() + + // symlinkStack is used to emulate how openat2(RESOLVE_IN_ROOT) treats + // dangling symlinks. If we hit a non-existent path while resolving a + // symlink, we need to return the (dir, remainingPath) that we had when we + // hit the symlink (treating the symlink as though it were a regular file). + // The set of (dir, remainingPath) sets is stored within the symlinkStack + // and we add and remove parts when we hit symlink and non-symlink + // components respectively. We need a stack because of recursive symlinks + // (symlinks that contain symlink components in their target). + // + // Note that the stack is ONLY used for book-keeping. All of the actual + // path walking logic is still based on currentPath/remainingPath and + // currentDir (as in SecureJoin). + var symStack *symlinkStack + if partial { + symStack = new(symlinkStack) + defer symStack.Close() + } + + var ( + linksWalked int + currentPath string + remainingPath = unsafePath + ) + for remainingPath != "" { + // Save the current remaining path so if the part is not real we can + // return the path including the component. + oldRemainingPath := remainingPath + + // Get the next path component. + var part string + if i := strings.IndexByte(remainingPath, '/'); i == -1 { + part, remainingPath = remainingPath, "" + } else { + part, remainingPath = remainingPath[:i], remainingPath[i+1:] + } + // If we hit an empty component, we need to treat it as though it is + // "." so that trailing "/" and "//" components on a non-directory + // correctly return the right error code. + if part == "" { + part = "." + } + + // Apply the component lexically to the path we are building. + // currentPath does not contain any symlinks, and we are lexically + // dealing with a single component, so it's okay to do a filepath.Clean + // here. + nextPath := path.Join("/", currentPath, part) + // If we logically hit the root, just clone the root rather than + // opening the part and doing all of the other checks. + if nextPath == "/" { + if err := symStack.PopPart(part); err != nil { + return nil, "", fmt.Errorf("walking into root with part %q failed: %w", part, err) + } + // Jump to root. + rootClone, err := dupFile(root) + if err != nil { + return nil, "", fmt.Errorf("clone root fd: %w", err) + } + _ = currentDir.Close() + currentDir = rootClone + currentPath = nextPath + continue + } + + // Try to open the next component. + nextDir, err := openatFile(currentDir, part, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + switch { + case err == nil: + st, err := nextDir.Stat() + if err != nil { + _ = nextDir.Close() + return nil, "", fmt.Errorf("stat component %q: %w", part, err) + } + + switch st.Mode() & os.ModeType { + case os.ModeSymlink: + // readlinkat implies AT_EMPTY_PATH since Linux 2.6.39. See + // Linux commit 65cfc6722361 ("readlinkat(), fchownat() and + // fstatat() with empty relative pathnames"). + linkDest, err := readlinkatFile(nextDir, "") + // We don't need the handle anymore. + _ = nextDir.Close() + if err != nil { + return nil, "", err + } + + linksWalked++ + if linksWalked > maxSymlinkLimit { + return nil, "", &os.PathError{Op: "securejoin.lookupInRoot", Path: logicalRootPath + "/" + unsafePath, Err: unix.ELOOP} + } + + // Swap out the symlink's component for the link entry itself. + if err := symStack.SwapLink(part, currentDir, oldRemainingPath, linkDest); err != nil { + return nil, "", fmt.Errorf("walking into symlink %q failed: push symlink: %w", part, err) + } + + // Update our logical remaining path. + remainingPath = linkDest + "/" + remainingPath + // Absolute symlinks reset any work we've already done. + if path.IsAbs(linkDest) { + // Jump to root. + rootClone, err := dupFile(root) + if err != nil { + return nil, "", fmt.Errorf("clone root fd: %w", err) + } + _ = currentDir.Close() + currentDir = rootClone + currentPath = "/" + } + + default: + // If we are dealing with a directory, simply walk into it. + _ = currentDir.Close() + currentDir = nextDir + currentPath = nextPath + + // The part was real, so drop it from the symlink stack. + if err := symStack.PopPart(part); err != nil { + return nil, "", fmt.Errorf("walking into directory %q failed: %w", part, err) + } + + // If we are operating on a .., make sure we haven't escaped. + // We only have to check for ".." here because walking down + // into a regular component component cannot cause you to + // escape. This mirrors the logic in RESOLVE_IN_ROOT, except we + // have to check every ".." rather than only checking after a + // rename or mount on the system. + if part == ".." { + // Make sure the root hasn't moved. + if err := checkProcSelfFdPath(logicalRootPath, root); err != nil { + return nil, "", fmt.Errorf("root path moved during lookup: %w", err) + } + // Make sure the path is what we expect. + fullPath := logicalRootPath + nextPath + if err := checkProcSelfFdPath(fullPath, currentDir); err != nil { + return nil, "", fmt.Errorf("walking into %q had unexpected result: %w", part, err) + } + } + } + + default: + if !partial { + return nil, "", err + } + // If there are any remaining components in the symlink stack, we + // are still within a symlink resolution and thus we hit a dangling + // symlink. So pretend that the first symlink in the stack we hit + // was an ENOENT (to match openat2). + if oldDir, remainingPath, ok := symStack.PopTopSymlink(); ok { + _ = currentDir.Close() + return oldDir, remainingPath, err + } + // We have hit a final component that doesn't exist, so we have our + // partial open result. Note that we have to use the OLD remaining + // path, since the lookup failed. + return currentDir, oldRemainingPath, err + } + } + + // If the unsafePath had a trailing slash, we need to make sure we try to + // do a relative "." open so that we will correctly return an error when + // the final component is a non-directory (to match openat2). In the + // context of openat2, a trailing slash and a trailing "/." are completely + // equivalent. + if strings.HasSuffix(unsafePath, "/") { + nextDir, err := openatFile(currentDir, ".", unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + if !partial { + _ = currentDir.Close() + currentDir = nil + } + return currentDir, "", err + } + _ = currentDir.Close() + currentDir = nextDir + } + + // All of the components existed! + return currentDir, "", nil +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/mkdir_linux.go b/vendor/github.com/cyphar/filepath-securejoin/mkdir_linux.go new file mode 100644 index 000000000..a17ae3b03 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/mkdir_linux.go @@ -0,0 +1,236 @@ +//go:build linux + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + + "golang.org/x/sys/unix" +) + +var ( + errInvalidMode = errors.New("invalid permission mode") + errPossibleAttack = errors.New("possible attack detected") +) + +// modePermExt is like os.ModePerm except that it also includes the set[ug]id +// and sticky bits. +const modePermExt = os.ModePerm | os.ModeSetuid | os.ModeSetgid | os.ModeSticky + +//nolint:cyclop // this function needs to handle a lot of cases +func toUnixMode(mode os.FileMode) (uint32, error) { + sysMode := uint32(mode.Perm()) + if mode&os.ModeSetuid != 0 { + sysMode |= unix.S_ISUID + } + if mode&os.ModeSetgid != 0 { + sysMode |= unix.S_ISGID + } + if mode&os.ModeSticky != 0 { + sysMode |= unix.S_ISVTX + } + // We don't allow file type bits. + if mode&os.ModeType != 0 { + return 0, fmt.Errorf("%w %+.3o (%s): type bits not permitted", errInvalidMode, mode, mode) + } + // We don't allow other unknown modes. + if mode&^modePermExt != 0 || sysMode&unix.S_IFMT != 0 { + return 0, fmt.Errorf("%w %+.3o (%s): unknown mode bits", errInvalidMode, mode, mode) + } + return sysMode, nil +} + +// MkdirAllHandle is equivalent to [MkdirAll], except that it is safer to use +// in two respects: +// +// - The caller provides the root directory as an *[os.File] (preferably O_PATH) +// handle. This means that the caller can be sure which root directory is +// being used. Note that this can be emulated by using /proc/self/fd/... as +// the root path with [os.MkdirAll]. +// +// - Once all of the directories have been created, an *[os.File] O_PATH handle +// to the directory at unsafePath is returned to the caller. This is done in +// an effectively-race-free way (an attacker would only be able to swap the +// final directory component), which is not possible to emulate with +// [MkdirAll]. +// +// In addition, the returned handle is obtained far more efficiently than doing +// a brand new lookup of unsafePath (such as with [SecureJoin] or openat2) after +// doing [MkdirAll]. If you intend to open the directory after creating it, you +// should use MkdirAllHandle. +func MkdirAllHandle(root *os.File, unsafePath string, mode os.FileMode) (_ *os.File, Err error) { + unixMode, err := toUnixMode(mode) + if err != nil { + return nil, err + } + // On Linux, mkdirat(2) (and os.Mkdir) silently ignore the suid and sgid + // bits. We could also silently ignore them but since we have very few + // users it seems more prudent to return an error so users notice that + // these bits will not be set. + if unixMode&^0o1777 != 0 { + return nil, fmt.Errorf("%w for mkdir %+.3o: suid and sgid are ignored by mkdir", errInvalidMode, mode) + } + + // Try to open as much of the path as possible. + currentDir, remainingPath, err := partialLookupInRoot(root, unsafePath) + defer func() { + if Err != nil { + _ = currentDir.Close() + } + }() + if err != nil && !errors.Is(err, unix.ENOENT) { + return nil, fmt.Errorf("find existing subpath of %q: %w", unsafePath, err) + } + + // If there is an attacker deleting directories as we walk into them, + // detect this proactively. Note this is guaranteed to detect if the + // attacker deleted any part of the tree up to currentDir. + // + // Once we walk into a dead directory, partialLookupInRoot would not be + // able to walk further down the tree (directories must be empty before + // they are deleted), and if the attacker has removed the entire tree we + // can be sure that anything that was originally inside a dead directory + // must also be deleted and thus is a dead directory in its own right. + // + // This is mostly a quality-of-life check, because mkdir will simply fail + // later if the attacker deletes the tree after this check. + if err := isDeadInode(currentDir); err != nil { + return nil, fmt.Errorf("finding existing subpath of %q: %w", unsafePath, err) + } + + // Re-open the path to match the O_DIRECTORY reopen loop later (so that we + // always return a non-O_PATH handle). We also check that we actually got a + // directory. + if reopenDir, err := Reopen(currentDir, unix.O_DIRECTORY|unix.O_CLOEXEC); errors.Is(err, unix.ENOTDIR) { + return nil, fmt.Errorf("cannot create subdirectories in %q: %w", currentDir.Name(), unix.ENOTDIR) + } else if err != nil { + return nil, fmt.Errorf("re-opening handle to %q: %w", currentDir.Name(), err) + } else { + _ = currentDir.Close() + currentDir = reopenDir + } + + remainingParts := strings.Split(remainingPath, string(filepath.Separator)) + if slices_Contains(remainingParts, "..") { + // The path contained ".." components after the end of the "real" + // components. We could try to safely resolve ".." here but that would + // add a bunch of extra logic for something that it's not clear even + // needs to be supported. So just return an error. + // + // If we do filepath.Clean(remainingPath) then we end up with the + // problem that ".." can erase a trailing dangling symlink and produce + // a path that doesn't quite match what the user asked for. + return nil, fmt.Errorf("%w: yet-to-be-created path %q contains '..' components", unix.ENOENT, remainingPath) + } + + // Create the remaining components. + for _, part := range remainingParts { + switch part { + case "", ".": + // Skip over no-op paths. + continue + } + + // NOTE: mkdir(2) will not follow trailing symlinks, so we can safely + // create the final component without worrying about symlink-exchange + // attacks. + // + // If we get -EEXIST, it's possible that another program created the + // directory at the same time as us. In that case, just continue on as + // if we created it (if the created inode is not a directory, the + // following open call will fail). + if err := unix.Mkdirat(int(currentDir.Fd()), part, unixMode); err != nil && !errors.Is(err, unix.EEXIST) { + err = &os.PathError{Op: "mkdirat", Path: currentDir.Name() + "/" + part, Err: err} + // Make the error a bit nicer if the directory is dead. + if deadErr := isDeadInode(currentDir); deadErr != nil { + // TODO: Once we bump the minimum Go version to 1.20, we can use + // multiple %w verbs for this wrapping. For now we need to use a + // compatibility shim for older Go versions. + //err = fmt.Errorf("%w (%w)", err, deadErr) + err = wrapBaseError(err, deadErr) + } + return nil, err + } + + // Get a handle to the next component. O_DIRECTORY means we don't need + // to use O_PATH. + var nextDir *os.File + if hasOpenat2() { + nextDir, err = openat2File(currentDir, part, &unix.OpenHow{ + Flags: unix.O_NOFOLLOW | unix.O_DIRECTORY | unix.O_CLOEXEC, + Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_SYMLINKS | unix.RESOLVE_NO_XDEV, + }) + } else { + nextDir, err = openatFile(currentDir, part, unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + } + if err != nil { + return nil, err + } + _ = currentDir.Close() + currentDir = nextDir + + // It's possible that the directory we just opened was swapped by an + // attacker. Unfortunately there isn't much we can do to protect + // against this, and MkdirAll's behaviour is that we will reuse + // existing directories anyway so the need to protect against this is + // incredibly limited (and arguably doesn't even deserve mention here). + // + // Ideally we might want to check that the owner and mode match what we + // would've created -- unfortunately, it is non-trivial to verify that + // the owner and mode of the created directory match. While plain Unix + // DAC rules seem simple enough to emulate, there are a bunch of other + // factors that can change the mode or owner of created directories + // (default POSIX ACLs, mount options like uid=1,gid=2,umask=0 on + // filesystems like vfat, etc etc). We used to try to verify this but + // it just lead to a series of spurious errors. + // + // We could also check that the directory is non-empty, but + // unfortunately some pseduofilesystems (like cgroupfs) create + // non-empty directories, which would result in different spurious + // errors. + } + return currentDir, nil +} + +// MkdirAll is a race-safe alternative to the [os.MkdirAll] function, +// where the new directory is guaranteed to be within the root directory (if an +// attacker can move directories from inside the root to outside the root, the +// created directory tree might be outside of the root but the key constraint +// is that at no point will we walk outside of the directory tree we are +// creating). +// +// Effectively, MkdirAll(root, unsafePath, mode) is equivalent to +// +// path, _ := securejoin.SecureJoin(root, unsafePath) +// err := os.MkdirAll(path, mode) +// +// But is much safer. The above implementation is unsafe because if an attacker +// can modify the filesystem tree between [SecureJoin] and [os.MkdirAll], it is +// possible for MkdirAll to resolve unsafe symlink components and create +// directories outside of the root. +// +// If you plan to open the directory after you have created it or want to use +// an open directory handle as the root, you should use [MkdirAllHandle] instead. +// This function is a wrapper around [MkdirAllHandle]. +func MkdirAll(root, unsafePath string, mode os.FileMode) error { + rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + if err != nil { + return err + } + defer rootDir.Close() + + f, err := MkdirAllHandle(rootDir, unsafePath, mode) + if err != nil { + return err + } + _ = f.Close() + return nil +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/open_linux.go b/vendor/github.com/cyphar/filepath-securejoin/open_linux.go new file mode 100644 index 000000000..230be73f0 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/open_linux.go @@ -0,0 +1,103 @@ +//go:build linux + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "fmt" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +// OpenatInRoot is equivalent to [OpenInRoot], except that the root is provided +// using an *[os.File] handle, to ensure that the correct root directory is used. +func OpenatInRoot(root *os.File, unsafePath string) (*os.File, error) { + handle, err := completeLookupInRoot(root, unsafePath) + if err != nil { + return nil, &os.PathError{Op: "securejoin.OpenInRoot", Path: unsafePath, Err: err} + } + return handle, nil +} + +// OpenInRoot safely opens the provided unsafePath within the root. +// Effectively, OpenInRoot(root, unsafePath) is equivalent to +// +// path, _ := securejoin.SecureJoin(root, unsafePath) +// handle, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC) +// +// But is much safer. The above implementation is unsafe because if an attacker +// can modify the filesystem tree between [SecureJoin] and [os.OpenFile], it is +// possible for the returned file to be outside of the root. +// +// Note that the returned handle is an O_PATH handle, meaning that only a very +// limited set of operations will work on the handle. This is done to avoid +// accidentally opening an untrusted file that could cause issues (such as a +// disconnected TTY that could cause a DoS, or some other issue). In order to +// use the returned handle, you can "upgrade" it to a proper handle using +// [Reopen]. +func OpenInRoot(root, unsafePath string) (*os.File, error) { + rootDir, err := os.OpenFile(root, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + if err != nil { + return nil, err + } + defer rootDir.Close() + return OpenatInRoot(rootDir, unsafePath) +} + +// Reopen takes an *[os.File] handle and re-opens it through /proc/self/fd. +// Reopen(file, flags) is effectively equivalent to +// +// fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd()) +// os.OpenFile(fdPath, flags|unix.O_CLOEXEC) +// +// But with some extra hardenings to ensure that we are not tricked by a +// maliciously-configured /proc mount. While this attack scenario is not +// common, in container runtimes it is possible for higher-level runtimes to be +// tricked into configuring an unsafe /proc that can be used to attack file +// operations. See [CVE-2019-19921] for more details. +// +// [CVE-2019-19921]: https://github.com/advisories/GHSA-fh74-hm69-rqjw +func Reopen(handle *os.File, flags int) (*os.File, error) { + procRoot, err := getProcRoot() + if err != nil { + return nil, err + } + + // We can't operate on /proc/thread-self/fd/$n directly when doing a + // re-open, so we need to open /proc/thread-self/fd and then open a single + // final component. + procFdDir, closer, err := procThreadSelf(procRoot, "fd/") + if err != nil { + return nil, fmt.Errorf("get safe /proc/thread-self/fd handle: %w", err) + } + defer procFdDir.Close() + defer closer() + + // Try to detect if there is a mount on top of the magic-link we are about + // to open. If we are using unsafeHostProcRoot(), this could change after + // we check it (and there's nothing we can do about that) but for + // privateProcRoot() this should be guaranteed to be safe (at least since + // Linux 5.12[1], when anonymous mount namespaces were completely isolated + // from external mounts including mount propagation events). + // + // [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts + // onto targets that reside on shared mounts"). + fdStr := strconv.Itoa(int(handle.Fd())) + if err := checkSymlinkOvermount(procRoot, procFdDir, fdStr); err != nil { + return nil, fmt.Errorf("check safety of /proc/thread-self/fd/%s magiclink: %w", fdStr, err) + } + + flags |= unix.O_CLOEXEC + // Rather than just wrapping openatFile, open-code it so we can copy + // handle.Name(). + reopenFd, err := unix.Openat(int(procFdDir.Fd()), fdStr, flags, 0) + if err != nil { + return nil, fmt.Errorf("reopen fd %d: %w", handle.Fd(), err) + } + return os.NewFile(uintptr(reopenFd), handle.Name()), nil +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/openat2_linux.go new file mode 100644 index 000000000..f7a13e69c --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/openat2_linux.go @@ -0,0 +1,127 @@ +//go:build linux + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + + "golang.org/x/sys/unix" +) + +var hasOpenat2 = sync_OnceValue(func() bool { + fd, err := unix.Openat2(unix.AT_FDCWD, ".", &unix.OpenHow{ + Flags: unix.O_PATH | unix.O_CLOEXEC, + Resolve: unix.RESOLVE_NO_SYMLINKS | unix.RESOLVE_IN_ROOT, + }) + if err != nil { + return false + } + _ = unix.Close(fd) + return true +}) + +func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool { + // RESOLVE_IN_ROOT (and RESOLVE_BENEATH) can return -EAGAIN if we resolve + // ".." while a mount or rename occurs anywhere on the system. This could + // happen spuriously, or as the result of an attacker trying to mess with + // us during lookup. + // + // In addition, scoped lookups have a "safety check" at the end of + // complete_walk which will return -EXDEV if the final path is not in the + // root. + return how.Resolve&(unix.RESOLVE_IN_ROOT|unix.RESOLVE_BENEATH) != 0 && + (errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV)) +} + +const scopedLookupMaxRetries = 10 + +func openat2File(dir *os.File, path string, how *unix.OpenHow) (*os.File, error) { + fullPath := dir.Name() + "/" + path + // Make sure we always set O_CLOEXEC. + how.Flags |= unix.O_CLOEXEC + var tries int + for tries < scopedLookupMaxRetries { + fd, err := unix.Openat2(int(dir.Fd()), path, how) + if err != nil { + if scopedLookupShouldRetry(how, err) { + // We retry a couple of times to avoid the spurious errors, and + // if we are being attacked then returning -EAGAIN is the best + // we can do. + tries++ + continue + } + return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: err} + } + // If we are using RESOLVE_IN_ROOT, the name we generated may be wrong. + // NOTE: The procRoot code MUST NOT use RESOLVE_IN_ROOT, otherwise + // you'll get infinite recursion here. + if how.Resolve&unix.RESOLVE_IN_ROOT == unix.RESOLVE_IN_ROOT { + if actualPath, err := rawProcSelfFdReadlink(fd); err == nil { + fullPath = actualPath + } + } + return os.NewFile(uintptr(fd), fullPath), nil + } + return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: errPossibleAttack} +} + +func lookupOpenat2(root *os.File, unsafePath string, partial bool) (*os.File, string, error) { + if !partial { + file, err := openat2File(root, unsafePath, &unix.OpenHow{ + Flags: unix.O_PATH | unix.O_CLOEXEC, + Resolve: unix.RESOLVE_IN_ROOT | unix.RESOLVE_NO_MAGICLINKS, + }) + return file, "", err + } + return partialLookupOpenat2(root, unsafePath) +} + +// partialLookupOpenat2 is an alternative implementation of +// partialLookupInRoot, using openat2(RESOLVE_IN_ROOT) to more safely get a +// handle to the deepest existing child of the requested path within the root. +func partialLookupOpenat2(root *os.File, unsafePath string) (*os.File, string, error) { + // TODO: Implement this as a git-bisect-like binary search. + + unsafePath = filepath.ToSlash(unsafePath) // noop + endIdx := len(unsafePath) + var lastError error + for endIdx > 0 { + subpath := unsafePath[:endIdx] + + handle, err := openat2File(root, subpath, &unix.OpenHow{ + Flags: unix.O_PATH | unix.O_CLOEXEC, + Resolve: unix.RESOLVE_IN_ROOT | unix.RESOLVE_NO_MAGICLINKS, + }) + if err == nil { + // Jump over the slash if we have a non-"" remainingPath. + if endIdx < len(unsafePath) { + endIdx += 1 + } + // We found a subpath! + return handle, unsafePath[endIdx:], lastError + } + if errors.Is(err, unix.ENOENT) || errors.Is(err, unix.ENOTDIR) { + // That path doesn't exist, let's try the next directory up. + endIdx = strings.LastIndexByte(subpath, '/') + lastError = err + continue + } + return nil, "", fmt.Errorf("open subpath: %w", err) + } + // If we couldn't open anything, the whole subpath is missing. Return a + // copy of the root fd so that the caller doesn't close this one by + // accident. + rootClone, err := dupFile(root) + if err != nil { + return nil, "", err + } + return rootClone, unsafePath, lastError +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/openat_linux.go b/vendor/github.com/cyphar/filepath-securejoin/openat_linux.go new file mode 100644 index 000000000..949fb5f2d --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/openat_linux.go @@ -0,0 +1,59 @@ +//go:build linux + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +func dupFile(f *os.File) (*os.File, error) { + fd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err) + } + return os.NewFile(uintptr(fd), f.Name()), nil +} + +func openatFile(dir *os.File, path string, flags int, mode int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.O_CLOEXEC + fd, err := unix.Openat(int(dir.Fd()), path, flags, uint32(mode)) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: dir.Name() + "/" + path, Err: err} + } + // All of the paths we use with openatFile(2) are guaranteed to be + // lexically safe, so we can use path.Join here. + fullPath := filepath.Join(dir.Name(), path) + return os.NewFile(uintptr(fd), fullPath), nil +} + +func fstatatFile(dir *os.File, path string, flags int) (unix.Stat_t, error) { + var stat unix.Stat_t + if err := unix.Fstatat(int(dir.Fd()), path, &stat, flags); err != nil { + return stat, &os.PathError{Op: "fstatat", Path: dir.Name() + "/" + path, Err: err} + } + return stat, nil +} + +func readlinkatFile(dir *os.File, path string) (string, error) { + size := 4096 + for { + linkBuf := make([]byte, size) + n, err := unix.Readlinkat(int(dir.Fd()), path, linkBuf) + if err != nil { + return "", &os.PathError{Op: "readlinkat", Path: dir.Name() + "/" + path, Err: err} + } + if n != size { + return string(linkBuf[:n]), nil + } + // Possible truncation, resize the buffer. + size *= 2 + } +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/procfs_linux.go b/vendor/github.com/cyphar/filepath-securejoin/procfs_linux.go new file mode 100644 index 000000000..809a579cb --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/procfs_linux.go @@ -0,0 +1,452 @@ +//go:build linux + +// Copyright (C) 2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import ( + "errors" + "fmt" + "os" + "runtime" + "strconv" + + "golang.org/x/sys/unix" +) + +func fstat(f *os.File) (unix.Stat_t, error) { + var stat unix.Stat_t + if err := unix.Fstat(int(f.Fd()), &stat); err != nil { + return stat, &os.PathError{Op: "fstat", Path: f.Name(), Err: err} + } + return stat, nil +} + +func fstatfs(f *os.File) (unix.Statfs_t, error) { + var statfs unix.Statfs_t + if err := unix.Fstatfs(int(f.Fd()), &statfs); err != nil { + return statfs, &os.PathError{Op: "fstatfs", Path: f.Name(), Err: err} + } + return statfs, nil +} + +// The kernel guarantees that the root inode of a procfs mount has an +// f_type of PROC_SUPER_MAGIC and st_ino of PROC_ROOT_INO. +const ( + procSuperMagic = 0x9fa0 // PROC_SUPER_MAGIC + procRootIno = 1 // PROC_ROOT_INO +) + +func verifyProcRoot(procRoot *os.File) error { + if statfs, err := fstatfs(procRoot); err != nil { + return err + } else if statfs.Type != procSuperMagic { + return fmt.Errorf("%w: incorrect procfs root filesystem type 0x%x", errUnsafeProcfs, statfs.Type) + } + if stat, err := fstat(procRoot); err != nil { + return err + } else if stat.Ino != procRootIno { + return fmt.Errorf("%w: incorrect procfs root inode number %d", errUnsafeProcfs, stat.Ino) + } + return nil +} + +var hasNewMountApi = sync_OnceValue(func() bool { + // All of the pieces of the new mount API we use (fsopen, fsconfig, + // fsmount, open_tree) were added together in Linux 5.1[1,2], so we can + // just check for one of the syscalls and the others should also be + // available. + // + // Just try to use open_tree(2) to open a file without OPEN_TREE_CLONE. + // This is equivalent to openat(2), but tells us if open_tree is + // available (and thus all of the other basic new mount API syscalls). + // open_tree(2) is most light-weight syscall to test here. + // + // [1]: merge commit 400913252d09 + // [2]: + fd, err := unix.OpenTree(-int(unix.EBADF), "/", unix.OPEN_TREE_CLOEXEC) + if err != nil { + return false + } + _ = unix.Close(fd) + return true +}) + +func fsopen(fsName string, flags int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSOPEN_CLOEXEC + fd, err := unix.Fsopen(fsName, flags) + if err != nil { + return nil, os.NewSyscallError("fsopen "+fsName, err) + } + return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil +} + +func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSMOUNT_CLOEXEC + fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs) + if err != nil { + return nil, os.NewSyscallError("fsmount "+ctx.Name(), err) + } + return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil +} + +func newPrivateProcMount() (*os.File, error) { + procfsCtx, err := fsopen("proc", unix.FSOPEN_CLOEXEC) + if err != nil { + return nil, err + } + defer procfsCtx.Close() + + // Try to configure hidepid=ptraceable,subset=pid if possible, but ignore errors. + _ = unix.FsconfigSetString(int(procfsCtx.Fd()), "hidepid", "ptraceable") + _ = unix.FsconfigSetString(int(procfsCtx.Fd()), "subset", "pid") + + // Get an actual handle. + if err := unix.FsconfigCreate(int(procfsCtx.Fd())); err != nil { + return nil, os.NewSyscallError("fsconfig create procfs", err) + } + return fsmount(procfsCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID) +} + +func openTree(dir *os.File, path string, flags uint) (*os.File, error) { + dirFd := -int(unix.EBADF) + dirName := "." + if dir != nil { + dirFd = int(dir.Fd()) + dirName = dir.Name() + } + // Make sure we always set O_CLOEXEC. + flags |= unix.OPEN_TREE_CLOEXEC + fd, err := unix.OpenTree(dirFd, path, flags) + if err != nil { + return nil, &os.PathError{Op: "open_tree", Path: path, Err: err} + } + return os.NewFile(uintptr(fd), dirName+"/"+path), nil +} + +func clonePrivateProcMount() (_ *os.File, Err error) { + // Try to make a clone without using AT_RECURSIVE if we can. If this works, + // we can be sure there are no over-mounts and so if the root is valid then + // we're golden. Otherwise, we have to deal with over-mounts. + procfsHandle, err := openTree(nil, "/proc", unix.OPEN_TREE_CLONE) + if err != nil || hookForcePrivateProcRootOpenTreeAtRecursive(procfsHandle) { + procfsHandle, err = openTree(nil, "/proc", unix.OPEN_TREE_CLONE|unix.AT_RECURSIVE) + } + if err != nil { + return nil, fmt.Errorf("creating a detached procfs clone: %w", err) + } + defer func() { + if Err != nil { + _ = procfsHandle.Close() + } + }() + if err := verifyProcRoot(procfsHandle); err != nil { + return nil, err + } + return procfsHandle, nil +} + +func privateProcRoot() (*os.File, error) { + if !hasNewMountApi() || hookForceGetProcRootUnsafe() { + return nil, fmt.Errorf("new mount api: %w", unix.ENOTSUP) + } + // Try to create a new procfs mount from scratch if we can. This ensures we + // can get a procfs mount even if /proc is fake (for whatever reason). + procRoot, err := newPrivateProcMount() + if err != nil || hookForcePrivateProcRootOpenTree(procRoot) { + // Try to clone /proc then... + procRoot, err = clonePrivateProcMount() + } + return procRoot, err +} + +func unsafeHostProcRoot() (_ *os.File, Err error) { + procRoot, err := os.OpenFile("/proc", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + if err != nil { + return nil, err + } + defer func() { + if Err != nil { + _ = procRoot.Close() + } + }() + if err := verifyProcRoot(procRoot); err != nil { + return nil, err + } + return procRoot, nil +} + +func doGetProcRoot() (*os.File, error) { + procRoot, err := privateProcRoot() + if err != nil { + // Fall back to using a /proc handle if making a private mount failed. + // If we have openat2, at least we can avoid some kinds of over-mount + // attacks, but without openat2 there's not much we can do. + procRoot, err = unsafeHostProcRoot() + } + return procRoot, err +} + +var getProcRoot = sync_OnceValues(func() (*os.File, error) { + return doGetProcRoot() +}) + +var hasProcThreadSelf = sync_OnceValue(func() bool { + return unix.Access("/proc/thread-self/", unix.F_OK) == nil +}) + +var errUnsafeProcfs = errors.New("unsafe procfs detected") + +type procThreadSelfCloser func() + +// procThreadSelf returns a handle to /proc/thread-self/ (or an +// equivalent handle on older kernels where /proc/thread-self doesn't exist). +// Once finished with the handle, you must call the returned closer function +// (runtime.UnlockOSThread). You must not pass the returned *os.File to other +// Go threads or use the handle after calling the closer. +// +// This is similar to ProcThreadSelf from runc, but with extra hardening +// applied and using *os.File. +func procThreadSelf(procRoot *os.File, subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) { + // We need to lock our thread until the caller is done with the handle + // because between getting the handle and using it we could get interrupted + // by the Go runtime and hit the case where the underlying thread is + // swapped out and the original thread is killed, resulting in + // pull-your-hair-out-hard-to-debug issues in the caller. + runtime.LockOSThread() + defer func() { + if Err != nil { + runtime.UnlockOSThread() + } + }() + + // Figure out what prefix we want to use. + threadSelf := "thread-self/" + if !hasProcThreadSelf() || hookForceProcSelfTask() { + /// Pre-3.17 kernels don't have /proc/thread-self, so do it manually. + threadSelf = "self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil || hookForceProcSelf() { + // In this case, we running in a pid namespace that doesn't match + // the /proc mount we have. This can happen inside runc. + // + // Unfortunately, there is no nice way to get the correct TID to + // use here because of the age of the kernel, so we have to just + // use /proc/self and hope that it works. + threadSelf = "self/" + } + } + + // Grab the handle. + var ( + handle *os.File + err error + ) + if hasOpenat2() { + // We prefer being able to use RESOLVE_NO_XDEV if we can, to be + // absolutely sure we are operating on a clean /proc handle that + // doesn't have any cheeky overmounts that could trick us (including + // symlink mounts on top of /proc/thread-self). RESOLVE_BENEATH isn't + // strictly needed, but just use it since we have it. + // + // NOTE: /proc/self is technically a magic-link (the contents of the + // symlink are generated dynamically), but it doesn't use + // nd_jump_link() so RESOLVE_NO_MAGICLINKS allows it. + // + // NOTE: We MUST NOT use RESOLVE_IN_ROOT here, as openat2File uses + // procSelfFdReadlink to clean up the returned f.Name() if we use + // RESOLVE_IN_ROOT (which would lead to an infinite recursion). + handle, err = openat2File(procRoot, threadSelf+subpath, &unix.OpenHow{ + Flags: unix.O_PATH | unix.O_NOFOLLOW | unix.O_CLOEXEC, + Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_MAGICLINKS, + }) + if err != nil { + // TODO: Once we bump the minimum Go version to 1.20, we can use + // multiple %w verbs for this wrapping. For now we need to use a + // compatibility shim for older Go versions. + //err = fmt.Errorf("%w: %w", errUnsafeProcfs, err) + return nil, nil, wrapBaseError(err, errUnsafeProcfs) + } + } else { + handle, err = openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + // TODO: Once we bump the minimum Go version to 1.20, we can use + // multiple %w verbs for this wrapping. For now we need to use a + // compatibility shim for older Go versions. + //err = fmt.Errorf("%w: %w", errUnsafeProcfs, err) + return nil, nil, wrapBaseError(err, errUnsafeProcfs) + } + defer func() { + if Err != nil { + _ = handle.Close() + } + }() + // We can't detect bind-mounts of different parts of procfs on top of + // /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we + // aren't on the wrong filesystem here. + if statfs, err := fstatfs(handle); err != nil { + return nil, nil, err + } else if statfs.Type != procSuperMagic { + return nil, nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type) + } + } + return handle, runtime.UnlockOSThread, nil +} + +// STATX_MNT_ID_UNIQUE is provided in golang.org/x/sys@v0.20.0, but in order to +// avoid bumping the requirement for a single constant we can just define it +// ourselves. +const STATX_MNT_ID_UNIQUE = 0x4000 + +var hasStatxMountId = sync_OnceValue(func() bool { + var ( + stx unix.Statx_t + // We don't care which mount ID we get. The kernel will give us the + // unique one if it is supported. + wantStxMask uint32 = STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID + ) + err := unix.Statx(-int(unix.EBADF), "/", 0, int(wantStxMask), &stx) + return err == nil && stx.Mask&wantStxMask != 0 +}) + +func getMountId(dir *os.File, path string) (uint64, error) { + // If we don't have statx(STATX_MNT_ID*) support, we can't do anything. + if !hasStatxMountId() { + return 0, nil + } + + var ( + stx unix.Statx_t + // We don't care which mount ID we get. The kernel will give us the + // unique one if it is supported. + wantStxMask uint32 = STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID + ) + + err := unix.Statx(int(dir.Fd()), path, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW, int(wantStxMask), &stx) + if stx.Mask&wantStxMask == 0 { + // It's not a kernel limitation, for some reason we couldn't get a + // mount ID. Assume it's some kind of attack. + err = fmt.Errorf("%w: could not get mount id", errUnsafeProcfs) + } + if err != nil { + return 0, &os.PathError{Op: "statx(STATX_MNT_ID_...)", Path: dir.Name() + "/" + path, Err: err} + } + return stx.Mnt_id, nil +} + +func checkSymlinkOvermount(procRoot *os.File, dir *os.File, path string) error { + // Get the mntId of our procfs handle. + expectedMountId, err := getMountId(procRoot, "") + if err != nil { + return err + } + // Get the mntId of the target magic-link. + gotMountId, err := getMountId(dir, path) + if err != nil { + return err + } + // As long as the directory mount is alive, even with wrapping mount IDs, + // we would expect to see a different mount ID here. (Of course, if we're + // using unsafeHostProcRoot() then an attaker could change this after we + // did this check.) + if expectedMountId != gotMountId { + return fmt.Errorf("%w: symlink %s/%s has an overmount obscuring the real link (mount ids do not match %d != %d)", errUnsafeProcfs, dir.Name(), path, expectedMountId, gotMountId) + } + return nil +} + +func doRawProcSelfFdReadlink(procRoot *os.File, fd int) (string, error) { + fdPath := fmt.Sprintf("fd/%d", fd) + procFdLink, closer, err := procThreadSelf(procRoot, fdPath) + if err != nil { + return "", fmt.Errorf("get safe /proc/thread-self/%s handle: %w", fdPath, err) + } + defer procFdLink.Close() + defer closer() + + // Try to detect if there is a mount on top of the magic-link. Since we use the handle directly + // provide to the closure. If the closure uses the handle directly, this + // should be safe in general (a mount on top of the path afterwards would + // not affect the handle itself) and will definitely be safe if we are + // using privateProcRoot() (at least since Linux 5.12[1], when anonymous + // mount namespaces were completely isolated from external mounts including + // mount propagation events). + // + // [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts + // onto targets that reside on shared mounts"). + if err := checkSymlinkOvermount(procRoot, procFdLink, ""); err != nil { + return "", fmt.Errorf("check safety of /proc/thread-self/fd/%d magiclink: %w", fd, err) + } + + // readlinkat implies AT_EMPTY_PATH since Linux 2.6.39. See Linux commit + // 65cfc6722361 ("readlinkat(), fchownat() and fstatat() with empty + // relative pathnames"). + return readlinkatFile(procFdLink, "") +} + +func rawProcSelfFdReadlink(fd int) (string, error) { + procRoot, err := getProcRoot() + if err != nil { + return "", err + } + return doRawProcSelfFdReadlink(procRoot, fd) +} + +func procSelfFdReadlink(f *os.File) (string, error) { + return rawProcSelfFdReadlink(int(f.Fd())) +} + +var ( + errPossibleBreakout = errors.New("possible breakout detected") + errInvalidDirectory = errors.New("wandered into deleted directory") + errDeletedInode = errors.New("cannot verify path of deleted inode") +) + +func isDeadInode(file *os.File) error { + // If the nlink of a file drops to 0, there is an attacker deleting + // directories during our walk, which could result in weird /proc values. + // It's better to error out in this case. + stat, err := fstat(file) + if err != nil { + return fmt.Errorf("check for dead inode: %w", err) + } + if stat.Nlink == 0 { + err := errDeletedInode + if stat.Mode&unix.S_IFMT == unix.S_IFDIR { + err = errInvalidDirectory + } + return fmt.Errorf("%w %q", err, file.Name()) + } + return nil +} + +func checkProcSelfFdPath(path string, file *os.File) error { + if err := isDeadInode(file); err != nil { + return err + } + actualPath, err := procSelfFdReadlink(file) + if err != nil { + return fmt.Errorf("get path of handle: %w", err) + } + if actualPath != path { + return fmt.Errorf("%w: handle path %q doesn't match expected path %q", errPossibleBreakout, actualPath, path) + } + return nil +} + +// Test hooks used in the procfs tests to verify that the fallback logic works. +// See testing_mocks_linux_test.go and procfs_linux_test.go for more details. +var ( + hookForcePrivateProcRootOpenTree = hookDummyFile + hookForcePrivateProcRootOpenTreeAtRecursive = hookDummyFile + hookForceGetProcRootUnsafe = hookDummy + + hookForceProcSelfTask = hookDummy + hookForceProcSelf = hookDummy +) + +func hookDummy() bool { return false } +func hookDummyFile(_ *os.File) bool { return false } diff --git a/vendor/github.com/cyphar/filepath-securejoin/vfs.go b/vendor/github.com/cyphar/filepath-securejoin/vfs.go new file mode 100644 index 000000000..36373f8c5 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/vfs.go @@ -0,0 +1,35 @@ +// Copyright (C) 2017-2024 SUSE LLC. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package securejoin + +import "os" + +// In future this should be moved into a separate package, because now there +// are several projects (umoci and go-mtree) that are using this sort of +// interface. + +// VFS is the minimal interface necessary to use [SecureJoinVFS]. A nil VFS is +// equivalent to using the standard [os].* family of functions. This is mainly +// used for the purposes of mock testing, but also can be used to otherwise use +// [SecureJoinVFS] with VFS-like system. +type VFS interface { + // Lstat returns an [os.FileInfo] describing the named file. If the + // file is a symbolic link, the returned [os.FileInfo] describes the + // symbolic link. Lstat makes no attempt to follow the link. + // The semantics are identical to [os.Lstat]. + Lstat(name string) (os.FileInfo, error) + + // Readlink returns the destination of the named symbolic link. + // The semantics are identical to [os.Readlink]. + Readlink(name string) (string, error) +} + +// osVFS is the "nil" VFS, in that it just passes everything through to the os +// module. +type osVFS struct{} + +func (o osVFS) Lstat(name string) (os.FileInfo, error) { return os.Lstat(name) } + +func (o osVFS) Readlink(name string) (string, error) { return os.Readlink(name) } diff --git a/vendor/github.com/opencontainers/runc/LICENSE b/vendor/github.com/opencontainers/runc/LICENSE new file mode 100644 index 000000000..27448585a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2014 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/opencontainers/runc/NOTICE b/vendor/github.com/opencontainers/runc/NOTICE new file mode 100644 index 000000000..c29775c0d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/NOTICE @@ -0,0 +1,17 @@ +runc + +Copyright 2012-2015 Docker, Inc. + +This product includes software developed at Docker, Inc. (http://www.docker.com). + +The following is courtesy of our legal counsel: + + +Use and transfer of Docker may be subject to certain restrictions by the +United States and other governments. +It is your responsibility to ensure that your use and/or transfer does not +violate applicable laws. + +For more information, please see http://www.bis.doc.gov + +See also http://www.apache.org/dev/crypto.html and/or seek legal counsel. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go new file mode 100644 index 000000000..1c034e4e6 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go @@ -0,0 +1,258 @@ +package dmz + +import ( + "errors" + "fmt" + "io" + "os" + "strconv" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/system" +) + +type SealFunc func(**os.File) error + +var ( + _ SealFunc = sealMemfd + _ SealFunc = sealFile +) + +func isExecutable(f *os.File) bool { + if err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err == nil { + return true + } else if err == unix.EACCES { + return false + } + path := "/proc/self/fd/" + strconv.Itoa(int(f.Fd())) + if err := unix.Access(path, unix.X_OK); err == nil { + return true + } else if err == unix.EACCES { + return false + } + // Cannot check -- assume it's executable (if not, exec will fail). + logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name()) + return true +} + +const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE + +func sealMemfd(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return err + } + // Try to set the newer memfd sealing flags, but we ignore + // errors because they are not needed and we want to continue + // to work on older kernels. + fd := (*f).Fd() + // F_SEAL_FUTURE_WRITE -- Linux 5.1 + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE) + // F_SEAL_EXEC -- Linux 6.3 + const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC) + // Apply all original memfd seals. + _, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals) + return os.NewSyscallError("fcntl(F_ADD_SEALS)", err) +} + +// Memfd creates a sealable executable memfd (supported since Linux 3.17). +func Memfd(comment string) (*os.File, SealFunc, error) { + file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC) + return file, sealMemfd, err +} + +func sealFile(f **os.File) error { + // When sealing an O_TMPFILE-style descriptor we need to + // re-open the path as O_PATH to clear the existing write + // handle we have. + opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("reopen tmpfile: %w", err) + } + _ = (*f).Close() + *f = opath + return nil +} + +// otmpfile creates an open(O_TMPFILE) file in the given directory (supported +// since Linux 3.11). +func otmpfile(dir string) (*os.File, SealFunc, error) { + file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700) + if err != nil { + return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err) + } + // Make sure we actually got an unlinked O_TMPFILE descriptor. + var stat unix.Stat_t + if err := unix.Fstat(int(file.Fd()), &stat); err != nil { + file.Close() + return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err) + } else if stat.Nlink != 0 { + file.Close() + return nil, nil, errors.New("O_TMPFILE has non-zero nlink") + } + return file, sealFile, err +} + +// mktemp creates a classic unlinked file in the given directory. +func mktemp(dir string) (*os.File, SealFunc, error) { + file, err := os.CreateTemp(dir, "runc.") + if err != nil { + return nil, nil, err + } + // Unlink the file and verify it was unlinked. + if err := os.Remove(file.Name()); err != nil { + return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err) + } + if err := file.Chmod(0o511); err != nil { + return nil, nil, fmt.Errorf("chmod classic tmpfile: %w", err) + } + var stat unix.Stat_t + if err := unix.Fstat(int(file.Fd()), &stat); err != nil { + return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err) + } else if stat.Nlink != 0 { + return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name()) + } + return file, sealFile, err +} + +func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) { + // First, try an executable memfd (supported since Linux 3.17). + file, sealFn, err = Memfd(comment) + if err == nil { + return + } + logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err) + + // The tmpDir here (c.root) might be mounted noexec, so we need a couple of + // fallbacks to try. It's possible that none of these are writable and + // executable, in which case there's nothing we can practically do (other + // than mounting our own executable tmpfs, which would have its own + // issues). + tmpDirs := []string{ + tmpDir, + os.TempDir(), + "/tmp", + ".", + "/bin", + "/", + } + + // Try to fallback to O_TMPFILE (supported since Linux 3.11). + for _, dir := range tmpDirs { + file, sealFn, err = otmpfile(dir) + if err != nil { + continue + } + if !isExecutable(file) { + logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir) + file.Close() + continue + } + return + } + logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err) + // Finally, try a classic unlinked temporary file. + for _, dir := range tmpDirs { + file, sealFn, err = mktemp(dir) + if err != nil { + continue + } + if !isExecutable(file) { + logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir) + file.Close() + continue + } + return + } + return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err) +} + +// CloneBinary creates a "sealed" clone of a given binary, which can be used to +// thwart attempts by the container process to gain access to host binaries +// through procfs magic-link shenanigans. For more details on why this is +// necessary, see CVE-2019-5736. +func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) { + logrus.Debugf("cloning %s binary (%d bytes)", name, size) + file, sealFn, err := getSealableFile(name, tmpDir) + if err != nil { + return nil, err + } + copied, err := system.Copy(file, src) + if err != nil { + file.Close() + return nil, fmt.Errorf("copy binary: %w", err) + } else if copied != size { + file.Close() + return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size) + } + if err := sealFn(&file); err != nil { + file.Close() + return nil, fmt.Errorf("could not seal fd: %w", err) + } + return file, nil +} + +// IsCloned returns whether the given file can be guaranteed to be a safe exe. +func IsCloned(exe *os.File) bool { + seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0) + if err != nil { + // /proc/self/exe is probably not a memfd + logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err) + return false + } + // The memfd must have all of the base seals applied. + logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals) + return seals&baseMemfdSeals == baseMemfdSeals +} + +// CloneSelfExe makes a clone of the current process's binary (through +// /proc/self/exe). This binary can then be used for "runc init" in order to +// make sure the container process can never resolve the original runc binary. +// For more details on why this is necessary, see CVE-2019-5736. +func CloneSelfExe(tmpDir string) (*os.File, error) { + // Try to create a temporary overlayfs to produce a readonly version of + // /proc/self/exe that cannot be "unwrapped" by the container. In contrast + // to CloneBinary, this technique does not require any extra memory usage + // and does not have the (fairly noticeable) performance impact of copying + // a large binary file into a memfd. + // + // Based on some basic performance testing, the overlayfs approach has + // effectively no performance overhead (it is on par with both + // MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds + // around ~60% overhead during container startup. + overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir) + if err == nil { + logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests + return overlayFile, nil + } + logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy") + + selfExe, err := os.Open("/proc/self/exe") + if err != nil { + return nil, fmt.Errorf("opening current binary: %w", err) + } + defer selfExe.Close() + + stat, err := selfExe.Stat() + if err != nil { + return nil, fmt.Errorf("checking /proc/self/exe size: %w", err) + } + size := stat.Size() + + return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir) +} + +// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can +// be guaranteed to be safe. This means that it must be a sealed memfd. Other +// types of clones cannot be completely verified as safe. +func IsSelfExeCloned() bool { + selfExe, err := os.Open("/proc/self/exe") + if err != nil { + logrus.Debugf("open /proc/self/exe failed: %v", err) + return false + } + defer selfExe.Close() + return IsCloned(selfExe) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go new file mode 100644 index 000000000..b81b70258 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go @@ -0,0 +1,122 @@ +package dmz + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +func fsopen(fsName string, flags int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSOPEN_CLOEXEC + fd, err := unix.Fsopen(fsName, flags) + if err != nil { + return nil, os.NewSyscallError("fsopen "+fsName, err) + } + return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil +} + +func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSMOUNT_CLOEXEC + fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs) + if err != nil { + return nil, os.NewSyscallError("fsmount "+ctx.Name(), err) + } + runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used + return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil +} + +func escapeOverlayLowerDir(path string) string { + // If the lowerdir path contains ":" we need to escape them, and if there + // were any escape characters already (\) we need to escape those first. + return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`) +} + +// sealedOverlayfs will create an internal overlayfs mount using fsopen() that +// uses the directory containing the binary as a lowerdir and a temporary tmpfs +// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY) +// and so we can create a safe zero-copy sealed version of /proc/self/exe. +// This only works for privileged users and on kernels with overlayfs and +// fsopen() enabled. +// +// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so +// it is technically possible to create an overlayfs even for rootless +// containers. Unfortunately, this would require some ugly manual CGo+fork +// magic so we can do this later if we feel it's really needed. +func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) { + // Try to do the superblock creation first to bail out early if we can't + // use this method. + overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC) + if err != nil { + return nil, err + } + defer overlayCtx.Close() + + // binPath is going to be /proc/self/exe, so do a readlink to get the real + // path. overlayfs needs the real underlying directory for this protection + // mode to work properly. + if realPath, err := os.Readlink(binPath); err == nil { + binPath = realPath + } + binLowerDirPath, binName := filepath.Split(binPath) + // Escape any ":"s or "\"s in the path. + binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath) + + // Overlayfs requires two lowerdirs in order to run in "lower-only" mode, + // where writes are completely blocked. Ideally we would create a dummy + // tmpfs for this, but it turns out that overlayfs doesn't allow for + // anonymous mountns paths. + // NOTE: I'm working on a patch to fix this but it won't be backported. + dummyLowerDirPath := escapeOverlayLowerDir(tmpDir) + + // Configure the lowerdirs. The binary lowerdir needs to be on the top to + // ensure that a file called "runc" (binName) in the dummy lowerdir doesn't + // mask the binary. + lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath + if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil { + return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err) + } + + // We don't care about xino (Linux 4.17) but it will be auto-enabled on + // some systems (if /run/runc and /usr/bin are on different filesystems) + // and this produces spurious dmesg log entries. We can safely ignore + // errors when disabling this because we don't actually care about the + // setting and we're just opportunistically disabling it. + _ = unix.FsconfigSetString(int(overlayCtx.Fd()), "xino", "off") + + // Get an actual handle to the overlayfs. + if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil { + return nil, os.NewSyscallError("fsconfig create overlayfs", err) + } + overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID) + if err != nil { + return nil, err + } + defer overlayFd.Close() + + // Grab a handle to the binary through overlayfs. + exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err) + } + // NOTE: We would like to check that exeFile is the same as /proc/self/exe, + // except this is a little difficult. Depending on what filesystems the + // layers are on, overlayfs can remap the inode numbers (and it always + // creates its own device numbers -- see ovl_map_dev_ino) so we can't do a + // basic stat-based check. The only reasonable option would be to hash both + // files and compare them, but this would require fully reading both files + // which would produce a similar performance overhead to memfd cloning. + // + // Ultimately, there isn't a real attack to be worried about here. An + // attacker would need to be able to modify files in /usr/sbin (or wherever + // runc lives), at which point they could just replace the runc binary with + // something malicious anyway. + return exeFile, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go new file mode 100644 index 000000000..7bbf92a3d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -0,0 +1,216 @@ +//go:build linux + +package system + +import ( + "fmt" + "io" + "os" + "strconv" + "syscall" + "unsafe" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type ParentDeathSignal int + +func (p ParentDeathSignal) Restore() error { + if p == 0 { + return nil + } + current, err := GetParentDeathSignal() + if err != nil { + return err + } + if p == current { + return nil + } + return p.Set() +} + +func (p ParentDeathSignal) Set() error { + return SetParentDeathSignal(uintptr(p)) +} + +func Exec(cmd string, args []string, env []string) error { + for { + err := unix.Exec(cmd, args, env) + if err != unix.EINTR { + return &os.PathError{Op: "exec", Path: cmd, Err: err} + } + } +} + +func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error { + pathnamep, err := syscall.BytePtrFromString(pathname) + if err != nil { + return err + } + + argvp, err := syscall.SlicePtrFromStrings(args) + if err != nil { + return err + } + + envp, err := syscall.SlicePtrFromStrings(env) + if err != nil { + return err + } + + _, _, errno := syscall.Syscall6( + unix.SYS_EXECVEAT, + fd, + uintptr(unsafe.Pointer(pathnamep)), + uintptr(unsafe.Pointer(&argvp[0])), + uintptr(unsafe.Pointer(&envp[0])), + uintptr(flags), + 0, + ) + return errno +} + +func Fexecve(fd uintptr, args []string, env []string) error { + var err error + for { + err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH) + if err != unix.EINTR { // nolint:errorlint // unix errors are bare + break + } + } + if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare + // Fallback to classic /proc/self/fd/... exec. + return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env) + } + return os.NewSyscallError("execveat", err) +} + +func SetParentDeathSignal(sig uintptr) error { + if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil { + return err + } + return nil +} + +func GetParentDeathSignal() (ParentDeathSignal, error) { + var sig int + if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil { + return -1, err + } + return ParentDeathSignal(sig), nil +} + +func SetKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func ClearKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func Setctty() error { + if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil { + return err + } + return nil +} + +// SetSubreaper sets the value i as the subreaper setting for the calling process +func SetSubreaper(i int) error { + return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) +} + +// GetSubreaper returns the subreaper setting for the calling process +func GetSubreaper() (int, error) { + var i uintptr + + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + + return int(i), nil +} + +func ExecutableMemfd(comment string, flags int) (*os.File, error) { + // Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this + // flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an + // executable memfd. For vm.memfd_noexec=2 this is a bit more complicated. + // The original vm.memfd_noexec=2 implementation incorrectly silently + // allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer + // kernels, we will get -EACCES if we try to use MFD_EXEC with + // vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value). + // + // The upshot is we only need to retry without MFD_EXEC on -EINVAL because + // it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on + // kernels where -EINVAL is actually a security denial. + memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC) + if err == unix.EINVAL { + memfd, err = unix.MemfdCreate(comment, flags) + } + if err != nil { + if err == unix.EACCES { + logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE") + } + err := os.NewSyscallError("memfd_create", err) + return nil, fmt.Errorf("failed to create executable memfd: %w", err) + } + return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil +} + +// Copy is like io.Copy except it uses sendfile(2) if the source and sink are +// both (*os.File) as an optimisation to make copies faster. +func Copy(dst io.Writer, src io.Reader) (copied int64, err error) { + dstFile, _ := dst.(*os.File) + srcFile, _ := src.(*os.File) + + if dstFile != nil && srcFile != nil { + fi, err := srcFile.Stat() + if err != nil { + goto fallback + } + size := fi.Size() + for size > 0 { + n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size)) + if n > 0 { + size -= int64(n) + copied += int64(n) + } + if err == unix.EINTR { + continue + } + if err != nil { + if copied == 0 { + // If we haven't copied anything so far, we can safely just + // fallback to io.Copy. We could always do the fallback but + // it's safer to error out in the case of a partial copy + // followed by an error (which should never happen). + goto fallback + } + return copied, fmt.Errorf("partial sendfile copy: %w", err) + } + } + return copied, nil + } + +fallback: + return io.Copy(dst, src) +} + +// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation. +// checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion. +func SetLinuxPersonality(personality int) error { + _, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0) + if errno != 0 { + return &os.SyscallError{Syscall: "set_personality", Err: errno} + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go new file mode 100644 index 000000000..774443ec9 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go @@ -0,0 +1,127 @@ +package system + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" +) + +// State is the status of a process. +type State rune + +const ( // Only values for Linux 3.14 and later are listed here + Dead State = 'X' + DiskSleep State = 'D' + Running State = 'R' + Sleeping State = 'S' + Stopped State = 'T' + TracingStop State = 't' + Zombie State = 'Z' + Parked State = 'P' + Idle State = 'I' +) + +// String forms of the state from proc(5)'s documentation for +// /proc/[pid]/status' "State" field. +func (s State) String() string { + switch s { + case Dead: + return "dead" + case DiskSleep: + return "disk sleep" + case Running: + return "running" + case Sleeping: + return "sleeping" + case Stopped: + return "stopped" + case TracingStop: + return "tracing stop" + case Zombie: + return "zombie" + case Parked: + return "parked" + case Idle: + return "idle" // kernel thread + default: + return fmt.Sprintf("unknown (%c)", s) + } +} + +// Stat_t represents the information from /proc/[pid]/stat, as +// described in proc(5) with names based on the /proc/[pid]/status +// fields. +type Stat_t struct { + // Name is the command run by the process. + Name string + + // State is the state of the process. + State State + + // StartTime is the number of clock ticks after system boot (since + // Linux 2.6). + StartTime uint64 +} + +// Stat returns a Stat_t instance for the specified process. +func Stat(pid int) (stat Stat_t, err error) { + bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) + if err != nil { + return stat, err + } + return parseStat(string(bytes)) +} + +func parseStat(data string) (stat Stat_t, err error) { + // Example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + // The fields are space-separated, see full description in proc(5). + // + // We are only interested in: + // * field 2: process name. It is the only field enclosed into + // parenthesis, as it can contain spaces (and parenthesis) inside. + // * field 3: process state, a single character (%c) + // * field 22: process start time, a long unsigned integer (%llu). + + // 1. Look for the first '(' and the last ')' first, what's in between is Name. + // We expect at least 20 fields and a space after the last one. + + const minAfterName = 20*2 + 1 // the min field is '0 '. + + first := strings.IndexByte(data, '(') + if first < 0 || first+minAfterName >= len(data) { + return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data) + } + + last := strings.LastIndexByte(data, ')') + if last <= first || last+minAfterName >= len(data) { + return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data) + } + + stat.Name = data[first+1 : last] + + // 2. Remove fields 1 and 2 and a space after. State is right after. + data = data[last+2:] + stat.State = State(data[0]) + + // 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces. + skipSpaces := 22 - 3 + for first = 0; skipSpaces > 0 && first < len(data); first++ { + if data[first] == ' ' { + skipSpaces-- + } + } + // Now first points to StartTime; look for space right after. + i := strings.IndexByte(data[first:], ' ') + if i < 0 { + return stat, fmt.Errorf("invalid stat data (too short): %q", data) + } + stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64) + if err != nil { + return stat, fmt.Errorf("invalid stat data (bad start time): %w", err) + } + + return stat, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go new file mode 100644 index 000000000..4595fa82a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go @@ -0,0 +1,15 @@ +//go:build go1.23 + +package system + +import ( + "syscall" +) + +// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. The argument +// is process RLIMIT_NOFILE values. Relies on go.dev/cl/588076. +func ClearRlimitNofileCache(lim *syscall.Rlimit) { + // Ignore the return values since we only need to clean the cache, + // the limit is going to be set via unix.Prlimit elsewhere. + _ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go new file mode 100644 index 000000000..865d18022 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go @@ -0,0 +1,27 @@ +//go:build !go1.23 + +// TODO: remove this file once go 1.22 is no longer supported. + +package system + +import ( + "sync/atomic" + "syscall" + _ "unsafe" // Needed for go:linkname to work. +) + +//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile +var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit] + +// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. +// The argument is process RLIMIT_NOFILE values. +func ClearRlimitNofileCache(_ *syscall.Rlimit) { + // As reported in issue #4195, the new version of go runtime(since 1.19) + // will cache rlimit-nofile. Before executing execve, the rlimit-nofile + // of the process will be restored with the cache. In runc, this will + // cause the rlimit-nofile setting by the parent process for the container + // to become invalid. It can be solved by clearing this cache. But + // unfortunately, go stdlib doesn't provide such function, so we need to + // link to the private var `origRlimitNofile` in package syscall to hack. + syscallOrigRlimitNofile.Store(nil) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go new file mode 100644 index 000000000..2edd1417a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -0,0 +1,119 @@ +package utils + +/* + * Copyright 2016, 2017 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import ( + "fmt" + "os" + "runtime" + + "golang.org/x/sys/unix" +) + +// MaxNameLen is the maximum length of the name of a file descriptor being sent +// using SendFile. The name of the file handle returned by RecvFile will never be +// larger than this value. +const MaxNameLen = 4096 + +// oobSpace is the size of the oob slice required to store a single FD. Note +// that unix.UnixRights appears to make the assumption that fd is always int32, +// so sizeof(fd) = 4. +var oobSpace = unix.CmsgSpace(4) + +// RecvFile waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFile(socket *os.File) (_ *os.File, Err error) { + name := make([]byte, MaxNameLen) + oob := make([]byte, oobSpace) + + sockfd := socket.Fd() + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC) + if err != nil { + return nil, err + } + if n >= MaxNameLen || oobn != oobSpace { + return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + // Truncate. + name = name[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, err + } + + // We cannot control how many SCM_RIGHTS we receive, and upon receiving + // them all of the descriptors are installed in our fd table, so we need to + // parse all of the SCM_RIGHTS we received in order to close all of the + // descriptors on error. + var fds []int + defer func() { + for i, fd := range fds { + if i == 0 && Err == nil { + // Only close the first one on error. + continue + } + // Always close extra ones. + _ = unix.Close(fd) + } + }() + var lastErr error + for _, scm := range scms { + if scm.Header.Type == unix.SCM_RIGHTS { + scmFds, err := unix.ParseUnixRights(&scm) + if err != nil { + lastErr = err + } else { + fds = append(fds, scmFds...) + } + } + } + if lastErr != nil { + return nil, lastErr + } + + // We do this after collecting the fds to make sure we close them all when + // returning an error here. + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + if len(fds) != 1 { + return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) + } + return os.NewFile(uintptr(fds[0]), string(name)), nil +} + +// SendFile sends a file over the given AF_UNIX socket. file.Name() is also +// included so that if the other end uses RecvFile, the file will have the same +// name information. +func SendFile(socket *os.File, file *os.File) error { + name := file.Name() + if len(name) >= MaxNameLen { + return fmt.Errorf("sendfd: filename too long: %s", name) + } + err := SendRawFd(socket, name, file.Fd()) + runtime.KeepAlive(file) + return err +} + +// SendRawFd sends a specific file descriptor over the given AF_UNIX socket. +func SendRawFd(socket *os.File, msg string, fd uintptr) error { + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go new file mode 100644 index 000000000..db420ea68 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -0,0 +1,115 @@ +package utils + +import ( + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + + "golang.org/x/sys/unix" +) + +const ( + exitSignalOffset = 128 +) + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status unix.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +// without a trailing newline. This is used instead of json.Encoder because +// there might be a problem in json decoder in some cases, see: +// https://github.com/docker/docker/issues/14203#issuecomment-174177790 +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// stripRoot returns the passed path, stripping the root path if it was +// (lexicially) inside it. Note that both passed paths will always be treated +// as absolute, and the returned path will also always be absolute. In +// addition, the paths are cleaned before stripping the root. +func stripRoot(root, path string) string { + // Make the paths clean and absolute. + root, path = CleanPath("/"+root), CleanPath("/"+path) + switch { + case path == root: + path = "/" + case root == "/": + // do nothing + case strings.HasPrefix(path, root+"/"): + path = strings.TrimPrefix(path, root+"/") + } + return CleanPath("/" + path) +} + +// SearchLabels searches through a list of key=value pairs for a given key, +// returning its value, and the binary flag telling whether the key exist. +func SearchLabels(labels []string, key string) (string, bool) { + key += "=" + for _, s := range labels { + if strings.HasPrefix(s, key) { + return s[len(key):], true + } + } + return "", false +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + name, value, ok := strings.Cut(l, "=") + if !ok { + continue + } + if name == "bundle" { + bundle = value + } else { + userAnnotations[name] = value + } + } + return +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go new file mode 100644 index 000000000..8f179b6aa --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -0,0 +1,360 @@ +//go:build !windows + +package utils + +import ( + "fmt" + "math" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync" + _ "unsafe" // for go:linkname + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// EnsureProcHandle returns whether or not the given file handle is on procfs. +func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +var ( + haveCloseRangeCloexecBool bool + haveCloseRangeCloexecOnce sync.Once +) + +func haveCloseRangeCloexec() bool { + haveCloseRangeCloexecOnce.Do(func() { + // Make sure we're not closing a random file descriptor. + tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return + } + defer unix.Close(tmpFd) + + err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) + // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). + // -ENOSYS and -EINVAL ultimately mean we don't have support, but any + // other potential error would imply that even the most basic close + // operation wouldn't work. + haveCloseRangeCloexecBool = err == nil + }) + return haveCloseRangeCloexecBool +} + +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { + procSelfFd, closer := ProcThreadSelf("fd") + defer closer() + + fdDir, err := os.Open(procSelfFd) + if err != nil { + return err + } + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. + if err != nil { + continue + } + // Ignore descriptors lower than our specified minimum. + if fd < minFd { + continue + } + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) + } + return nil +} + +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. + if haveCloseRangeCloexec() { + err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) + return os.NewSyscallError("close_range", err) + } + // Otherwise, fall back to the standard loop. + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We cannot use close_range(2) even if it is available, because we must + // not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + +// NewSockPair returns a new SOCK_STREAM unix socket pair. +func NewSockPair(name string) (parent, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil +} + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + procSelfFd, closer := ProcThreadSelf("fd/") + defer closer() + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) + // Double-check the path is the one we expected. + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + return fn(procfd) +} + +type ProcThreadSelfCloser func() + +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +// ProcThreadSelf returns a string that is equivalent to +// /proc/thread-self/, with a graceful fallback on older kernels where +// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, +// meaning that the passed string needs to be trusted. The caller _must_ call +// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) +// *only once* after it has finished using the returned path string. +func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { + haveProcThreadSelfOnce.Do(func() { + if _, err := os.Stat("/proc/thread-self/"); err == nil { + haveProcThreadSelf = true + } else { + logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) + } + }) + + // We need to lock our thread until the caller is done with the path string + // because any non-atomic operation on the path (such as opening a file, + // then reading it) could be interrupted by the Go runtime where the + // underlying thread is swapped out and the original thread is killed, + // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In + // addition, the pre-3.17 fallback makes everything non-atomic because the + // same thing could happen between unix.Gettid() and the path operations. + // + // In theory, we don't need to lock in the atomic user case when using + // /proc/thread-self/, but it's better to be safe than sorry (and there are + // only one or two truly atomic users of /proc/thread-self/). + runtime.LockOSThread() + + threadSelf := "/proc/thread-self/" + if !haveProcThreadSelf { + // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. + threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := os.Stat(threadSelf); err != nil { + // Unfortunately, this code is called from rootfs_linux.go where we + // are running inside the pid namespace of the container but /proc + // is the host's procfs. Unfortunately there is no real way to get + // the correct tid to use here (the kernel age means we cannot do + // things like set up a private fsopen("proc") -- even scanning + // NSpid in all of the tasks in /proc/self/task/*/status requires + // Linux 4.1). + // + // So, we just have to assume that /proc/self is acceptable in this + // one specific case. + if os.Getpid() == 1 { + logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) + } else { + // This should never happen, but the fallback should work in most cases... + logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) + } + threadSelf = "/proc/self/" + } + } + return threadSelf + subpath, runtime.UnlockOSThread +} + +// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to +// create a /proc/thread-self handle for given file descriptor. +// +// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but +// without using fmt.Sprintf to avoid unneeded overhead. +func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { + return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) +} + +// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"), +// but properly handling the case where path or root are "/". +// +// NOTE: The return value only make sense if the path doesn't contain "..". +func IsLexicallyInRoot(root, path string) bool { + if root != "/" { + root += "/" + } + if path != "/" { + path += "/" + } + return strings.HasPrefix(path, root) +} + +// MkdirAllInRootOpen attempts to make +// +// path, _ := securejoin.SecureJoin(root, unsafePath) +// os.MkdirAll(path, mode) +// os.Open(path) +// +// safer against attacks where components in the path are changed between +// SecureJoin returning and MkdirAll (or Open) being called. In particular, we +// try to detect any symlink components in the path while we are doing the +// MkdirAll. +// +// NOTE: If unsafePath is a subpath of root, we assume that you have already +// called SecureJoin and so we use the provided path verbatim without resolving +// any symlinks (this is done in a way that avoids symlink-exchange races). +// This means that the path also must not contain ".." elements, otherwise an +// error will occur. +// +// This uses securejoin.MkdirAllHandle under the hood, but it has special +// handling if unsafePath has already been scoped within the rootfs (this is +// needed for a lot of runc callers and fixing this would require reworking a +// lot of path logic). +func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) { + // If the path is already "within" the root, get the path relative to the + // root and use that as the unsafe path. This is necessary because a lot of + // MkdirAllInRootOpen callers have already done SecureJoin, and refactoring + // all of them to stop using these SecureJoin'd paths would require a fair + // amount of work. + // TODO(cyphar): Do the refactor to libpathrs once it's ready. + if IsLexicallyInRoot(root, unsafePath) { + subPath, err := filepath.Rel(root, unsafePath) + if err != nil { + return nil, err + } + unsafePath = subPath + } + + // Check for any silly mode bits. + if mode&^0o7777 != 0 { + return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode) + } + // Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if + // passed. While it would make sense to return an error in that case (since + // the user has asked for a mode that won't be applied), for compatibility + // reasons we have to ignore these bits. + if ignoredBits := mode &^ 0o1777; ignoredBits != 0 { + logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits) + mode &= 0o1777 + } + + rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open root handle: %w", err) + } + defer rootDir.Close() + + return securejoin.MkdirAllHandle(rootDir, unsafePath, mode) +} + +// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the +// returned handle, for callers that don't need to use it. +func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error { + f, err := MkdirAllInRootOpen(root, unsafePath, mode) + if err == nil { + _ = f.Close() + } + return err +} + +// Openat is a Go-friendly openat(2) wrapper. +func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { + dirFd := unix.AT_FDCWD + if dir != nil { + dirFd = int(dir.Fd()) + } + flags |= unix.O_CLOEXEC + + fd, err := unix.Openat(dirFd, path, flags, mode) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: path, Err: err} + } + return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index babcea307..f68615ce1 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -14,6 +14,9 @@ github.com/NVIDIA/go-nvml/pkg/nvml/mock # github.com/cpuguy83/go-md2man/v2 v2.0.5 ## explicit; go 1.11 github.com/cpuguy83/go-md2man/v2/md2man +# github.com/cyphar/filepath-securejoin v0.4.1 +## explicit; go 1.18 +github.com/cyphar/filepath-securejoin # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew @@ -30,6 +33,11 @@ github.com/google/uuid # github.com/moby/sys/symlink v0.3.0 ## explicit; go 1.17 github.com/moby/sys/symlink +# github.com/opencontainers/runc v1.2.5 +## explicit; go 1.22 +github.com/opencontainers/runc/libcontainer/dmz +github.com/opencontainers/runc/libcontainer/system +github.com/opencontainers/runc/libcontainer/utils # github.com/opencontainers/runtime-spec v1.2.0 ## explicit github.com/opencontainers/runtime-spec/specs-go @@ -38,8 +46,6 @@ github.com/opencontainers/runtime-spec/specs-go github.com/opencontainers/runtime-tools/generate github.com/opencontainers/runtime-tools/generate/seccomp github.com/opencontainers/runtime-tools/validate/capabilities -# github.com/opencontainers/selinux v1.11.0 -## explicit; go 1.19 # github.com/pelletier/go-toml v1.9.5 ## explicit; go 1.12 github.com/pelletier/go-toml