diff --git a/book/src/kernel/linux-compatibility/README.md b/book/src/kernel/linux-compatibility/README.md index f2dfca238..72fb90f4a 100644 --- a/book/src/kernel/linux-compatibility/README.md +++ b/book/src/kernel/linux-compatibility/README.md @@ -175,7 +175,7 @@ which are summarized in the table below. | 152 | munlockall | ❌ | N/A | | 153 | vhangup | ❌ | N/A | | 154 | modify_ldt | ❌ | N/A | -| 155 | pivot_root | ❌ | N/A | +| 155 | pivot_root | ✅ | 💯 | | 156 | _sysctl | ❌ | N/A | | 157 | prctl | ✅ | [⚠️](syscall-flag-coverage/namespaces-cgroups-and-security/#prctl) | | 158 | arch_prctl | ✅ | [⚠️](syscall-flag-coverage/system-information-and-misc/#arch_prctl) | diff --git a/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml b/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml index 3ca04832f..c71dfc052 100644 --- a/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml +++ b/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml @@ -14,3 +14,6 @@ chroot(path); // Remove a watch from an inotify instance inotify_rm_watch(fd, wd); + +// Change the root mount in the mount namespace of the calling thread +pivot_root(new_root, put_old); diff --git a/distro/etc_nixos/modules/container.nix b/distro/etc_nixos/modules/container.nix index b85578612..7721fdd5c 100644 --- a/distro/etc_nixos/modules/container.nix +++ b/distro/etc_nixos/modules/container.nix @@ -31,7 +31,7 @@ engine = { cgroup_manager = "cgroupfs"; events_logger = "none"; - no_pivot_root = true; + no_pivot_root = false; runtime = "runc"; }; }; diff --git a/kernel/src/fs/path/mod.rs b/kernel/src/fs/path/mod.rs index ea28eb5c0..499b5b784 100644 --- a/kernel/src/fs/path/mod.rs +++ b/kernel/src/fs/path/mod.rs @@ -159,7 +159,7 @@ impl Path { /// For example, first `mount /dev/sda1 /mnt` and then `mount 
/dev/sda2 /mnt`. /// After the second mount is completed, the content of the first mount will be overridden. /// We need to recursively obtain the top `Path`. - pub(super) fn get_top_path(mut self) -> Self { + fn get_top_path(mut self) -> Self { while self.dentry.is_mountpoint() { if let Some(child_mount) = self.mount.get(&self.dentry) { let inner = child_mount.root_dentry().clone(); @@ -173,13 +173,47 @@ impl Path { } /// Finds the corresponding `Path` in the given mount namespace. - pub(super) fn find_corresponding_mount(&self, mnt_ns: &Arc) -> Option { + fn find_corresponding_mount(&self, mnt_ns: &Arc) -> Option { let corresponding_mount = self.mount.find_corresponding_mount(mnt_ns)?; let corresponding_path = Self::new(corresponding_mount, self.dentry.clone()); Some(corresponding_path) } + /// Checks if this path is reachable from the given `root` path. + /// + /// A path is considered reachable if it is the same as or a descendant + /// of the `root` path. The check traverses upwards from the current path, + /// crossing mount point boundaries as necessary, until it either finds + /// the `root` path or reaches the global root. + fn is_reachable_from(&self, root: &Path) -> bool { + let mut owned; + let mut current = self; + + loop { + if current.mount.id() != root.mount.id() { + let Some(parent_mount) = current.mount.parent().and_then(|mount| mount.upgrade()) + else { + return false; + }; + + owned = Path::new( + parent_mount, + current + .mount + .mountpoint() + .expect("Mounts with parents must have a mount point") + .clone(), + ); + current = &owned; + + continue; + } + + return current.dentry.is_equal_or_descendant_of(&root.dentry); + } + } + /// Returns true if the `Path` represents a pseudo file. 
fn is_pseudo(&self) -> bool { self.dentry.is_pseudo() @@ -333,7 +367,7 @@ impl Path { } let new_mount = self.mount.clone_mount_tree(&self.dentry, None, recursive); - new_mount.graft_mount_tree(dst_path)?; + new_mount.graft_mount_tree(dst_path); Ok(()) } @@ -369,7 +403,9 @@ impl Path { ); } - self.mount.graft_mount_tree(dst_path) + self.mount.graft_mount_tree(dst_path); + + Ok(()) } /// Sets the propagation type of the mount of this `Path`. diff --git a/kernel/src/fs/path/mount.rs b/kernel/src/fs/path/mount.rs index 73e10d7c7..c6c6cae86 100644 --- a/kernel/src/fs/path/mount.rs +++ b/kernel/src/fs/path/mount.rs @@ -420,10 +420,9 @@ impl Mount { } /// Grafts the mount node tree to the mountpoint. - pub(super) fn graft_mount_tree(&self, target_path: &Path) -> Result<()> { + pub(super) fn graft_mount_tree(&self, target_path: &Path) { self.detach_from_parent(); self.attach_to_path(target_path); - Ok(()) } /// Gets a child mount node from the mountpoint if any. diff --git a/kernel/src/fs/path/resolver.rs b/kernel/src/fs/path/resolver.rs index c36ef2096..1e4dfca2b 100644 --- a/kernel/src/fs/path/resolver.rs +++ b/kernel/src/fs/path/resolver.rs @@ -12,7 +12,7 @@ use crate::{ utils::{InodeType, NAME_MAX, PATH_MAX, Permission, SYMLINKS_MAX, SymbolicLink}, }, prelude::*, - process::posix_thread::AsThreadLocal, + process::posix_thread::{AsPosixThread, AsThreadLocal, thread_table::ThreadTable}, }; /// The file descriptor of the current working directory. @@ -228,6 +228,121 @@ impl PathResolver { Ok(()) } + + /// Changes the root mount in the mount namespace of the calling thread. + /// + /// This function moves the original root mount of the calling thread to `put_old_path` and makes + /// `new_root_path` the new root mount. For other threads in the current mount namespace, if their + /// root directory or current working directory is the same as the current thread's root directory, + /// it will also be changed to `new_root_path`. 
+ // + // TODO: this method should only iterate threads in the current PID namespace instead of + // the whole thread table. + pub fn pivot_root( + &mut self, + new_root_path: FsPath, + put_old_path: FsPath, + thread_table: &ThreadTable, + ctx: &Context, + ) -> Result<()> { + let new_root_path = self.lookup(&new_root_path)?; + let put_old_path = self.lookup(&put_old_path)?; + + if new_root_path.type_() != InodeType::Dir || put_old_path.type_() != InodeType::Dir { + return_errno_with_message!( + Errno::ENOTDIR, + "`new_root` or `put_old` is not a directory" + ); + } + if self.root.mount.id() == new_root_path.mount.id() + || self.root.mount.id() == put_old_path.mount.id() + { + return_errno_with_message!( + Errno::EBUSY, + "`new_root` or `put_old` is on the current root mount" + ); + } + if !new_root_path.is_mount_root() || !self.root.is_mount_root() { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` or the current root is not a mount point" + ); + } + if new_root_path.mount.parent().is_none() || self.root.mount.parent().is_none() { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` or the current root is on the rootfs mount" + ); + } + + let current_ns_proxy = ctx.thread_local.borrow_ns_proxy(); + let current_mnt_ns = current_ns_proxy.unwrap().mnt_ns(); + if !current_mnt_ns.owns(&new_root_path.mount) || !current_mnt_ns.owns(&put_old_path.mount) { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` or `put_old` is not in the current mount namespace" + ); + } + + if !put_old_path.is_reachable_from(&new_root_path) { + return_errno_with_message!( + Errno::EINVAL, + "`put_old` is not at or underneath `new_root`" + ); + } + if !new_root_path.is_reachable_from(&self.root) { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` is not underneath the current root" + ); + } + + // TODO: Check the following once we support `MS_SHARED`: + // "The propagation type of the parent mount of `new_root` and the + // parent mount of the current 
root directory must not be + // `MS_SHARED`; similarly, if `put_old` is an existing mount point, + // its propagation type must not be `MS_SHARED`." + + let parent_path = { + let parent_mount = self.root.mount.parent().unwrap().upgrade().unwrap(); + let mountpoint = self.root.mount.mountpoint().unwrap(); + Path::new(parent_mount, mountpoint) + }; + + self.root.mount.graft_mount_tree(&put_old_path); + new_root_path.mount.graft_mount_tree(&parent_path); + + for thread in thread_table.values() { + let posix_thread = thread.as_posix_thread().unwrap(); + let ns_proxy = posix_thread.ns_proxy().lock(); + let Some(ns_proxy) = ns_proxy.as_ref() else { + continue; + }; + let mnt_ns = ns_proxy.mnt_ns(); + if !Arc::ptr_eq(mnt_ns, current_mnt_ns) { + continue; + } + let fs = posix_thread.read_fs(); + if Arc::ptr_eq(&fs, &ctx.thread_local.borrow_fs()) { + continue; + } + + let mut fs_resolver = fs.resolver().write(); + if fs_resolver.root() == &self.root { + fs_resolver.set_root(new_root_path.clone()); + } + if fs_resolver.cwd() == &self.root { + fs_resolver.set_cwd(new_root_path.clone()); + } + } + + if self.cwd == self.root { + self.cwd = new_root_path.clone(); + } + self.root = new_root_path; + + Ok(()) + } } /// The result of resolving an absolute path name. diff --git a/kernel/src/process/posix_thread/thread_table.rs b/kernel/src/process/posix_thread/thread_table.rs index 61d4be387..4069dfb81 100644 --- a/kernel/src/process/posix_thread/thread_table.rs +++ b/kernel/src/process/posix_thread/thread_table.rs @@ -3,20 +3,22 @@ use super::{Thread, Tid}; use crate::{prelude::*, process::posix_thread::AsPosixThread}; -static THREAD_TABLE: SpinLock>> = SpinLock::new(BTreeMap::new()); +pub type ThreadTable = BTreeMap>; -/// Adds a posix thread to global thread table +static THREAD_TABLE: Mutex = Mutex::new(BTreeMap::new()); + +/// Adds a POSIX thread to the global thread table. 
pub fn add_thread(tid: Tid, thread: Arc) { debug_assert_eq!(tid, thread.as_posix_thread().unwrap().tid()); THREAD_TABLE.lock().insert(tid, thread); } -/// Removes a posix thread to global thread table +/// Removes a POSIX thread from the global thread table. pub fn remove_thread(tid: Tid) { THREAD_TABLE.lock().remove(&tid); } -/// Gets a posix thread from the global thread table +/// Gets a POSIX thread from the global thread table. pub fn get_thread(tid: Tid) -> Option> { THREAD_TABLE.lock().get(&tid).cloned() } @@ -47,3 +49,12 @@ pub(in crate::process) fn make_current_main_thread(ctx: &Context) { let thread = thread_table.remove(&old_tid).unwrap(); thread_table.insert(pid, thread); } + +/// Applies the given function to the global thread table. +pub fn with_global_threads(f: F) -> R +where + F: FnOnce(&ThreadTable) -> R, +{ + let table = THREAD_TABLE.lock(); + f(&table) +} diff --git a/kernel/src/syscall/arch/generic.rs b/kernel/src/syscall/arch/generic.rs index 309fc37eb..e25ea4b4c 100644 --- a/kernel/src/syscall/arch/generic.rs +++ b/kernel/src/syscall/arch/generic.rs @@ -84,6 +84,7 @@ macro_rules! import_generic_syscall_entries { pidfd_open::sys_pidfd_open, pidfd_send_signal::sys_pidfd_send_signal, pipe::sys_pipe2, + pivot_root::sys_pivot_root, ppoll::sys_ppoll, prctl::sys_prctl, pread64::sys_pread64, @@ -234,6 +235,7 @@ macro_rules! 
define_syscalls_with_generic_syscall_table { SYS_LINKAT = 37 => sys_linkat(args[..5]); SYS_UMOUNT = 39 => sys_umount(args[..2]); SYS_MOUNT = 40 => sys_mount(args[..5]); + SYS_PIVOT_ROOT = 41 => sys_pivot_root(args[..2]); SYS_STATFS = 43 => sys_statfs(args[..2]); SYS_FSTATFS = 44 => sys_fstatfs(args[..2]); SYS_TRUNCATE = 45 => sys_truncate(args[..2]); diff --git a/kernel/src/syscall/arch/x86.rs b/kernel/src/syscall/arch/x86.rs index b1a8cddb4..0ab9bf122 100644 --- a/kernel/src/syscall/arch/x86.rs +++ b/kernel/src/syscall/arch/x86.rs @@ -85,6 +85,7 @@ use super::{ pidfd_open::sys_pidfd_open, pidfd_send_signal::sys_pidfd_send_signal, pipe::{sys_pipe, sys_pipe2}, + pivot_root::sys_pivot_root, poll::sys_poll, ppoll::sys_ppoll, prctl::sys_prctl, @@ -309,6 +310,7 @@ impl_syscall_nums_and_dispatch_fn! { SYS_SCHED_GETSCHEDULER = 145 => sys_sched_getscheduler(args[..1]); SYS_SCHED_GET_PRIORITY_MAX = 146 => sys_sched_get_priority_max(args[..1]); SYS_SCHED_GET_PRIORITY_MIN = 147 => sys_sched_get_priority_min(args[..1]); + SYS_PIVOT_ROOT = 155 => sys_pivot_root(args[..2]); SYS_PRCTL = 157 => sys_prctl(args[..5]); SYS_ARCH_PRCTL = 158 => sys_arch_prctl(args[..2], &mut user_ctx); SYS_SETRLIMIT = 160 => sys_setrlimit(args[..2]); diff --git a/kernel/src/syscall/mod.rs b/kernel/src/syscall/mod.rs index a41f85c27..75527aeb1 100644 --- a/kernel/src/syscall/mod.rs +++ b/kernel/src/syscall/mod.rs @@ -97,6 +97,7 @@ mod pidfd_getfd; mod pidfd_open; mod pidfd_send_signal; mod pipe; +mod pivot_root; mod poll; mod ppoll; mod prctl; diff --git a/kernel/src/syscall/pivot_root.rs b/kernel/src/syscall/pivot_root.rs new file mode 100644 index 000000000..94384623b --- /dev/null +++ b/kernel/src/syscall/pivot_root.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MPL-2.0 + +use super::SyscallReturn; +use crate::{ + fs::path::FsPath, prelude::*, process::posix_thread::thread_table::with_global_threads, + syscall::constants::MAX_FILENAME_LEN, +}; + +pub fn sys_pivot_root( + new_root_ptr: Vaddr, + 
put_old_ptr: Vaddr, + ctx: &Context, +) -> Result { + let new_root_name = ctx + .user_space() + .read_cstring(new_root_ptr, MAX_FILENAME_LEN)?; + let put_old_name = ctx + .user_space() + .read_cstring(put_old_ptr, MAX_FILENAME_LEN)?; + + debug!( + "pivot_root: new_root = {:?}, put_old = {:?}", + new_root_name, put_old_name + ); + + let new_root_name = new_root_name.to_string_lossy(); + let new_root_path = FsPath::try_from(new_root_name.as_ref())?; + let put_old_name = put_old_name.to_string_lossy(); + let put_old_path = FsPath::try_from(put_old_name.as_ref())?; + + // TODO: Locking the global thread table here is a workaround. We need to use a more + // suitable lock (i.e. the global mount lock or the namespace lock) to avoid deadlock. + with_global_threads(move |table| { + let fs_ref = ctx.thread_local.borrow_fs(); + let mut fs_resolver = fs_ref.resolver().write(); + + fs_resolver.pivot_root(new_root_path, put_old_path, table, ctx) + })?; + + Ok(SyscallReturn::Return(0)) +} diff --git a/test/initramfs/src/apps/Makefile b/test/initramfs/src/apps/Makefile index eef78efe3..c8b7f2d20 100644 --- a/test/initramfs/src/apps/Makefile +++ b/test/initramfs/src/apps/Makefile @@ -15,7 +15,6 @@ TEST_BUILD_DIR ?= $(INITRAMFS)/test TEST_APPS := \ alarm \ capability \ - chroot \ clone3 \ cpu_affinity \ devfs \ @@ -27,6 +26,7 @@ TEST_APPS := \ fdatasync \ file_io \ fork_c \ + fs_isolation \ getcpu \ getpid \ hello_pie \ diff --git a/test/initramfs/src/apps/chroot/Makefile b/test/initramfs/src/apps/fs_isolation/Makefile similarity index 100% rename from test/initramfs/src/apps/chroot/Makefile rename to test/initramfs/src/apps/fs_isolation/Makefile diff --git a/test/initramfs/src/apps/chroot/chroot_jail.c b/test/initramfs/src/apps/fs_isolation/chroot.c similarity index 100% rename from test/initramfs/src/apps/chroot/chroot_jail.c rename to test/initramfs/src/apps/fs_isolation/chroot.c diff --git a/test/initramfs/src/apps/fs_isolation/pivot_root.c 
b/test/initramfs/src/apps/fs_isolation/pivot_root.c new file mode 100644 index 000000000..02dfea461 --- /dev/null +++ b/test/initramfs/src/apps/fs_isolation/pivot_root.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: MPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../test.h" + +// --- Test Configuration --- +#define CHROOT_DIR "/new_root" +#define PIVOT_TARGET_DIR "/second_root" +#define PUT_OLD_DIR_NAME "old_root" + +// Marker directories and files to verify the pivot operation. +// These will be backed by tmpfs to ensure they are on distinct filesystems. +#define OLD_ROOT_MARKER_MNT "/old_root_marker_mnt" +#define OLD_ROOT_MARKER_FILE OLD_ROOT_MARKER_MNT "/old.txt" + +#define NEW_ROOT_MARKER_MNT "/new_root_marker_mnt" +#define NEW_ROOT_MARKER_FILE NEW_ROOT_MARKER_MNT "/new.txt" + +// Helper to create a directory if it doesn't exist. +static void ensure_dir(const char *path) +{ + CHECK_WITH(mkdir(path, 0755), errno == 0 || errno == EEXIST); +} + +// Helper to create a marker file on a tmpfs mount. +static void create_marker(const char *mount_point, const char *file_path) +{ + ensure_dir(mount_point); + CHECK(mount("tmpfs", mount_point, "tmpfs", 0, "")); + int fd = CHECK(open(file_path, O_CREAT | O_WRONLY, 0644)); + CHECK(close(fd)); +} + +FN_TEST(pivot_root_test) +{ + // --- Phase 1: Setup a chroot environment with a nested bind mount --- + + TEST_SUCC(unshare(CLONE_NEWNS)); + ensure_dir(CHROOT_DIR); + TEST_SUCC(mount("/", CHROOT_DIR, NULL, MS_BIND | MS_REC, NULL)); + + char pivot_target_full_path[256]; + snprintf(pivot_target_full_path, sizeof(pivot_target_full_path), "%s%s", + CHROOT_DIR, PIVOT_TARGET_DIR); + ensure_dir(pivot_target_full_path); + + // Negative test: pivot_root should fail with EINVAL while the current root is still on the rootfs mount. 
+ TEST_ERRNO(syscall(SYS_pivot_root, pivot_target_full_path, + pivot_target_full_path), + EINVAL); + + TEST_SUCC(chroot(CHROOT_DIR)); + TEST_SUCC(chdir("/")); // Change to the `new_root` after chroot. + + // --- Phase 2: Prepare for and execute pivot_root --- + + TEST_SUCC(mount("/", PIVOT_TARGET_DIR, NULL, MS_BIND | MS_REC, NULL)); + + // Create the directory where the old root will be placed. + char put_old_full_path[256]; + snprintf(put_old_full_path, sizeof(put_old_full_path), "%s/%s", + PIVOT_TARGET_DIR, PUT_OLD_DIR_NAME); + TEST_SUCC(mkdir(put_old_full_path, 0755)); + // Mount a tmpfs on `put_old_full_path`. This makes it a different mount + // from `new_root`. + TEST_SUCC(mount("tmpfs", put_old_full_path, "tmpfs", 0, NULL)); + + // Create marker files on separate tmpfs mounts to verify the pivot. + create_marker(OLD_ROOT_MARKER_MNT, OLD_ROOT_MARKER_FILE); + + char new_root_marker_mnt_full_path[256]; + snprintf(new_root_marker_mnt_full_path, + sizeof(new_root_marker_mnt_full_path), "%s%s", + PIVOT_TARGET_DIR, NEW_ROOT_MARKER_MNT); + char new_root_marker_file_full_path[256]; + snprintf(new_root_marker_file_full_path, + sizeof(new_root_marker_file_full_path), "%s%s", + PIVOT_TARGET_DIR, NEW_ROOT_MARKER_FILE); + create_marker(new_root_marker_mnt_full_path, + new_root_marker_file_full_path); + + // --- Phase 3: Negative tests for pivot_root --- + + // pivot_root fails with ENOTDIR if the `new_root` or `put_old` is not a directory. + TEST_ERRNO(syscall(SYS_pivot_root, new_root_marker_file_full_path, + put_old_full_path), + ENOTDIR); + // pivot_root fails with EINVAL if the `put_old` is not underneath the `new_root`. + TEST_ERRNO(syscall(SYS_pivot_root, "./proc", put_old_full_path), + EINVAL); + // pivot_root fails with EINVAL if the `new_root` is not a mount root. + TEST_ERRNO(syscall(SYS_pivot_root, "./sys/fs", "./sys/fs"), EINVAL); + // pivot_root fails with EBUSY if the `new_root` is the current root. 
+ TEST_ERRNO(syscall(SYS_pivot_root, ".", "./bin"), EBUSY); + + // --- Phase 4: Do pivot_root and verification --- + + // Perform the pivot_root operation. + TEST_SUCC(syscall(SYS_pivot_root, PIVOT_TARGET_DIR, put_old_full_path)); + + // After pivot, the cwd is changed to the `new_root`. + char cwd[1024]; + TEST_RES(syscall(SYS_getcwd, cwd, sizeof(cwd)), strcmp(cwd, "/") == 0); + + // Verify that the `new_root` is active by checking for its marker file. + TEST_SUCC(access(NEW_ROOT_MARKER_FILE, F_OK)); + + // Verify that the old root has been moved. + char old_marker_in_new_path[256]; + snprintf(old_marker_in_new_path, sizeof(old_marker_in_new_path), + "/%s%s", PUT_OLD_DIR_NAME, OLD_ROOT_MARKER_FILE); + TEST_SUCC(access(old_marker_in_new_path, F_OK)); + + // Verify that the old root marker is no longer at the root. + TEST_ERRNO(access(OLD_ROOT_MARKER_FILE, F_OK), ENOENT); + + // --- Phase 5: Cleanup --- + + char old_root_path[256]; + snprintf(old_root_path, sizeof(old_root_path), "/%s", PUT_OLD_DIR_NAME); + TEST_SUCC(umount2(old_root_path, MNT_DETACH)); + TEST_SUCC(umount(old_root_path)); + TEST_SUCC(rmdir(old_root_path)); +} +END_TEST() \ No newline at end of file diff --git a/test/initramfs/src/apps/scripts/process.sh b/test/initramfs/src/apps/scripts/process.sh index 773de695b..0ebdd6b4e 100755 --- a/test/initramfs/src/apps/scripts/process.sh +++ b/test/initramfs/src/apps/scripts/process.sh @@ -10,7 +10,6 @@ cd ${SCRIPT_DIR}/.. echo "Start process test......" # These test programs are sorted by name. tests=" -chroot/chroot_jail clone3/clone_exit_signal clone3/clone_files clone3/clone_no_exit_signal @@ -26,6 +25,8 @@ exit/exit_procfs eventfd2/eventfd2 fork/fork fork_c/fork +fs_isolation/chroot +fs_isolation/pivot_root getcpu/getcpu getpid/getpid hello_pie/hello