From 6ba1a84ae90da614fe7d63528ca47174415a1246 Mon Sep 17 00:00:00 2001 From: Wang Siyuan Date: Thu, 30 Oct 2025 07:54:37 +0000 Subject: [PATCH] Support sealing memfd files --- kernel/src/syscall/fcntl.rs | 38 +++++ kernel/src/syscall/memfd_create.rs | 6 +- kernel/src/vm/memfd.rs | 141 +++++++++++++++++- kernel/src/vm/vmar/mod.rs | 13 +- test/src/syscall/gvisor/blocklists/memfd_test | 4 +- 5 files changed, 192 insertions(+), 10 deletions(-) diff --git a/kernel/src/syscall/fcntl.rs b/kernel/src/syscall/fcntl.rs index 0c75aae9b..ea89a5c88 100644 --- a/kernel/src/syscall/fcntl.rs +++ b/kernel/src/syscall/fcntl.rs @@ -9,6 +9,7 @@ use crate::{ }, prelude::*, process::{process_table, Pid}, + vm::memfd::{FileSeals, MemfdFile}, }; pub fn sys_fcntl(fd: FileDesc, cmd: i32, arg: u64, ctx: &Context) -> Result { @@ -29,6 +30,8 @@ pub fn sys_fcntl(fd: FileDesc, cmd: i32, arg: u64, ctx: &Context) -> Result handle_getown(fd, ctx), FcntlCmd::F_SETOWN => handle_setown(fd, arg, ctx), + FcntlCmd::F_ADD_SEALS => handle_addseal(fd, arg, ctx), + FcntlCmd::F_GET_SEALS => handle_getseal(fd, ctx), } } @@ -154,6 +157,39 @@ fn handle_setown(fd: FileDesc, arg: u64, ctx: &Context) -> Result Ok(SyscallReturn::Return(0)) } +fn handle_addseal(fd: FileDesc, arg: u64, ctx: &Context) -> Result { + let new_seals = FileSeals::from_bits(arg as u32) + .ok_or_else(|| Error::with_message(Errno::EINVAL, "invalid seals"))?; + + let mut file_table = ctx.thread_local.borrow_file_table_mut(); + let file = get_file_fast!(&mut file_table, fd); + let memfd_file = file.downcast_ref::().ok_or_else(|| { + Error::with_message( + Errno::EINVAL, + "file seals can only be applied to memfd files", + ) + })?; + + memfd_file.add_seals(new_seals)?; + + Ok(SyscallReturn::Return(0)) +} + +fn handle_getseal(fd: FileDesc, ctx: &Context) -> Result { + let mut file_table = ctx.thread_local.borrow_file_table_mut(); + let file = get_file_fast!(&mut file_table, fd); + let memfd_file = file.downcast_ref::().ok_or_else(|| { + Error::with_message( + Errno::EINVAL, + "file seals can only be applied to memfd files", + ) + })?; + + let file_seals = memfd_file.get_seals(); + + Ok(SyscallReturn::Return(file_seals.bits() as _)) +} + #[repr(i32)] #[derive(Debug, Clone, Copy, TryFromInt)] #[expect(non_camel_case_types)] @@ -169,6 +205,8 @@ enum FcntlCmd { F_SETOWN = 8, F_GETOWN = 9, F_DUPFD_CLOEXEC = 1030, + F_ADD_SEALS = 1033, + F_GET_SEALS = 1034, } #[expect(non_camel_case_types)] diff --git a/kernel/src/syscall/memfd_create.rs b/kernel/src/syscall/memfd_create.rs index 63fee3cc4..61eff9ec3 100644 --- a/kernel/src/syscall/memfd_create.rs +++ b/kernel/src/syscall/memfd_create.rs @@ -26,9 +26,9 @@ pub fn sys_memfd_create(name_addr: Vaddr, flags: u32, ctx: &Context) -> Result, +} + +impl MemfdInode { + pub fn add_seals(&self, mut new_seals: FileSeals) -> Result<()> { + let mut seals = self.seals.lock(); + + if seals.contains(FileSeals::F_SEAL_SEAL) { + return_errno_with_message!(Errno::EPERM, "the file is sealed against sealing"); + } + + // Reference: + if new_seals.contains(FileSeals::F_SEAL_EXEC) + && self + .mode() + .unwrap() + .intersects(InodeMode::from_bits_truncate(0o111)) + { + new_seals |= FileSeals::F_SEAL_SHRINK + | FileSeals::F_SEAL_GROW + | FileSeals::F_SEAL_WRITE + | FileSeals::F_SEAL_FUTURE_WRITE; + } + + if new_seals.contains(FileSeals::F_SEAL_WRITE) { + let page_cache = self.page_cache().unwrap(); + page_cache + .writable_mapping_status() + .as_ref() + .unwrap() + .deny()?; + } + + *seals |= new_seals; + + Ok(()) + } + + pub fn get_seals(&self) -> FileSeals { + *self.seals.lock() + } + + /// Checks whether writing to this memfd inode is allowed. + /// + /// This method restricts the `may_perms` if needed. + pub fn check_writable(&self, perms: VmPerms, may_perms: &mut VmPerms) -> Result<()> { + let seals = self.seals.lock(); + if seals.intersects(FileSeals::F_SEAL_WRITE | FileSeals::F_SEAL_FUTURE_WRITE) { + if perms.contains(VmPerms::WRITE) { + return_errno_with_message!(Errno::EPERM, "the file is sealed against writing"); + } + // Reference: + may_perms.remove(VmPerms::MAY_WRITE); + } + Ok(()) + } } #[inherit_methods(from = "self.inode")] @@ -85,18 +141,64 @@ impl Inode for MemfdInode { fn remove_xattr(&self, name: XattrName) -> Result<()>; fn write_at(&self, offset: usize, reader: &mut VmReader) -> Result { + if !reader.has_remain() { + return Ok(0); + } + + let seals = self.seals.lock(); + if seals.intersects(FileSeals::F_SEAL_WRITE | FileSeals::F_SEAL_FUTURE_WRITE) { + return_errno_with_message!(Errno::EPERM, "the file is sealed against writing"); + } + + if seals.contains(FileSeals::F_SEAL_GROW) { + let file_size = self.inode.size(); + if offset >= file_size { + return_errno_with_message!(Errno::EPERM, "the file is sealed against growing"); + } else { + reader.limit(file_size - offset); + } + } + self.inode.write_at(offset, reader) } fn resize(&self, new_size: usize) -> Result<()> { + let seals = self.seals.lock(); + if seals.contains(FileSeals::F_SEAL_SHRINK) && new_size < self.inode.size() { + return_errno_with_message!(Errno::EPERM, "the file is sealed against shrinking"); + } + if seals.contains(FileSeals::F_SEAL_GROW) && new_size > self.inode.size() { + return_errno_with_message!(Errno::EPERM, "the file is sealed against growing"); + } + self.inode.resize(new_size) } fn set_mode(&self, mode: InodeMode) -> Result<()> { + let seals = self.seals.lock(); + if seals.contains(FileSeals::F_SEAL_EXEC) + && (self.mode().unwrap() ^ mode).intersects(InodeMode::from_bits_truncate(0o111)) + { + return_errno_with_message!( + Errno::EPERM, + "the file is sealed against modifying executable bits" + ); + } + self.inode.set_mode(mode) } fn fallocate(&self, mode: FallocMode, offset: usize, len: usize) -> Result<()> { + let seals = self.seals.lock(); + if seals.contains(FileSeals::F_SEAL_GROW) && offset + len > self.inode.size() { + return_errno_with_message!(Errno::EPERM, "the file is sealed against growing"); + } + if seals.intersects(FileSeals::F_SEAL_WRITE | FileSeals::F_SEAL_FUTURE_WRITE) + && mode == FallocMode::PunchHoleKeepSize + { + return_errno_with_message!(Errno::EPERM, "the file is sealed against writing"); + } + self.inode.fallocate(mode, offset, len) } @@ -138,9 +240,18 @@ impl MemfdFile { let ram_inode = new_detached_inode_in_memfd(weak_self, mode, Uid::new_root(), Gid::new_root()); + let mut seals = FileSeals::empty(); + if !allow_sealing { + seals |= FileSeals::F_SEAL_SEAL; + } + if !executable { + seals |= FileSeals::F_SEAL_EXEC; + } + MemfdInode { inode: ram_inode, name, + seals: Mutex::new(seals), } }); @@ -152,6 +263,17 @@ impl MemfdFile { }) } + pub fn add_seals(&self, new_seals: FileSeals) -> Result<()> { + if !self.access_mode.is_writable() { + return_errno_with_message!(Errno::EPERM, "the file is not opened writable"); + } + self.memfd_inode().add_seals(new_seals) + } + + pub fn get_seals(&self) -> FileSeals { + self.memfd_inode().get_seals() + } + fn memfd_inode(&self) -> &MemfdInode { self.memfd_inode.downcast_ref::().unwrap() } @@ -252,3 +374,20 @@ bitflags! { const MFD_EXEC = 1 << 4; } } + +bitflags! { + pub struct FileSeals: u32 { + /// Prevent further seals from being set. + const F_SEAL_SEAL = 0x0001; + /// Prevent file from shrinking. + const F_SEAL_SHRINK = 0x0002; + /// Prevent file from growing. + const F_SEAL_GROW = 0x0004; + /// Prevent writes. + const F_SEAL_WRITE = 0x0008; + /// Prevent future writes while mapped. + const F_SEAL_FUTURE_WRITE = 0x0010; + /// Prevent chmod modifying exec bits. + const F_SEAL_EXEC = 0x0020; + } +} diff --git a/kernel/src/vm/vmar/mod.rs b/kernel/src/vm/vmar/mod.rs index e032334af..12e472499 100644 --- a/kernel/src/vm/vmar/mod.rs +++ b/kernel/src/vm/vmar/mod.rs @@ -1128,7 +1128,7 @@ impl<'a> VmarMapOptions<'a> { vmo, mappable, perms, - may_perms, + mut may_perms, vmo_offset, size: map_size, offset, @@ -1183,9 +1183,16 @@ impl<'a> VmarMapOptions<'a> { // Handle the memory backed by device or page cache. match mappable { Mappable::Inode(inode) => { - let is_writable_tracked = inode.downcast_ref::().is_some() + let is_writable_tracked = if let Some(memfd_inode) = + inode.downcast_ref::() && is_shared - && may_perms.contains(VmPerms::MAY_WRITE); + && may_perms.contains(VmPerms::MAY_WRITE) + { + memfd_inode.check_writable(perms, &mut may_perms)?; + true + } else { + false + }; // Since `Mappable::Inode` is provided, it is // reasonable to assume that the VMO is provided. diff --git a/test/src/syscall/gvisor/blocklists/memfd_test b/test/src/syscall/gvisor/blocklists/memfd_test index f374cd972..cede6717a 100644 --- a/test/src/syscall/gvisor/blocklists/memfd_test +++ b/test/src/syscall/gvisor/blocklists/memfd_test @@ -1,6 +1,4 @@ MemfdTest.Name -MemfdTest.Seal* -MemfdTest.NoPartialSealApplicationWhenWriteSealRejected -MemfdTest.TmpfsFilesHaveSealSeal MemfdTest.CanOpenFromProcfs +MemfdTest.SealGrowPartialWriteTruncatedSamePage MemfdTest.MemfdMustBeWritableToModifySeals \ No newline at end of file