diff --git a/docs/src/kernel/linux-compatibility.md b/docs/src/kernel/linux-compatibility.md index 0ccf22197..83d977bc7 100644 --- a/docs/src/kernel/linux-compatibility.md +++ b/docs/src/kernel/linux-compatibility.md @@ -15,7 +15,7 @@ support the loading of Linux kernel modules. ## System Calls At the time of writing, -Asterinas implements 214 out of the 336 system calls +Asterinas implements 219 out of the 336 system calls provided by Linux on x86-64 architecture. | Numbers | Names | Is Implemented | @@ -337,6 +337,7 @@ provided by Linux on x86-64 architecture. | 314 | sched_setattr | ✅ | | 315 | sched_getattr | ✅ | | 318 | getrandom | ✅ | +| 319 | memfd_create | ✅ | | 322 | execveat | ✅ | | 327 | preadv2 | ✅ | | 328 | pwritev2 | ✅ | diff --git a/kernel/src/fs/inode_handle/mod.rs b/kernel/src/fs/inode_handle/mod.rs index 4d1409e05..aa647888a 100644 --- a/kernel/src/fs/inode_handle/mod.rs +++ b/kernel/src/fs/inode_handle/mod.rs @@ -18,9 +18,9 @@ use crate::{ file_handle::FileLike, path::Dentry, utils::{ - AccessMode, DirentVisitor, FallocMode, FileRange, FlockItem, FlockList, InodeMode, - InodeType, IoctlCmd, Metadata, RangeLockItem, RangeLockItemBuilder, RangeLockList, - RangeLockType, SeekFrom, StatusFlags, OFFSET_MAX, + AccessMode, DirentVisitor, FallocMode, FileRange, FlockItem, FlockList, Inode, + InodeMode, InodeType, IoctlCmd, Metadata, RangeLockItem, RangeLockItemBuilder, + RangeLockList, RangeLockType, SeekFrom, StatusFlags, OFFSET_MAX, }, }, prelude::*, @@ -114,32 +114,7 @@ impl InodeHandle_ { } pub fn seek(&self, pos: SeekFrom) -> Result { - let mut offset = self.offset.lock(); - let new_offset: isize = match pos { - SeekFrom::Start(off /* as usize */) => { - if off > isize::MAX as usize { - return_errno_with_message!(Errno::EINVAL, "file offset is too large"); - } - off as isize - } - SeekFrom::End(off /* as isize */) => { - let file_size = self.dentry.size() as isize; - assert!(file_size >= 0); - file_size - .checked_add(off) - .ok_or_else(|| 
Error::with_message(Errno::EOVERFLOW, "file offset overflow"))? - } - SeekFrom::Current(off /* as isize */) => (*offset as isize) - .checked_add(off) - .ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))?, - }; - if new_offset < 0 { - return_errno_with_message!(Errno::EINVAL, "file offset must not be negative"); - } - // Invariant: 0 <= new_offset <= isize::MAX - let new_offset = new_offset as usize; - *offset = new_offset; - Ok(new_offset) + do_seek_util(self.dentry.inode(), &self.offset, pos) } pub fn offset(&self) -> usize { @@ -148,10 +123,7 @@ impl InodeHandle_ { } pub fn resize(&self, new_size: usize) -> Result<()> { - if self.status_flags().contains(StatusFlags::O_APPEND) { - return_errno_with_message!(Errno::EPERM, "can not resize append-only file"); - } - self.dentry.resize(new_size) + do_resize_util(self.dentry.inode(), self.status_flags(), new_size) } pub fn access_mode(&self) -> AccessMode { @@ -184,27 +156,7 @@ impl InodeHandle_ { } fn fallocate(&self, mode: FallocMode, offset: usize, len: usize) -> Result<()> { - let status_flags = self.status_flags(); - if status_flags.contains(StatusFlags::O_APPEND) - && (mode == FallocMode::PunchHoleKeepSize - || mode == FallocMode::CollapseRange - || mode == FallocMode::InsertRange) - { - return_errno_with_message!( - Errno::EPERM, - "the flags do not work on the append-only file" - ); - } - if status_flags.contains(StatusFlags::O_DIRECT) - || status_flags.contains(StatusFlags::O_PATH) - { - return_errno_with_message!( - Errno::EBADF, - "currently fallocate file with O_DIRECT or O_PATH is not supported" - ); - } - - self.dentry.inode().fallocate(mode, offset, len) + do_fallocate_util(self.dentry.inode(), self.status_flags(), mode, offset, len) } fn ioctl(&self, cmd: IoctlCmd, arg: usize) -> Result { @@ -388,3 +340,71 @@ pub trait FileIo: Pollable + Send + Sync + 'static { return_errno_with_message!(Errno::EINVAL, "ioctl is not supported"); } } + +pub fn do_seek_util(inode: &Arc, offset: 
&Mutex, pos: SeekFrom) -> Result { + let mut offset = offset.lock(); + let new_offset: isize = match pos { + SeekFrom::Start(off /* as usize */) => { + if off > isize::MAX as usize { + return_errno_with_message!(Errno::EINVAL, "file offset is too large"); + } + off as isize + } + SeekFrom::End(off /* as isize */) => { + let file_size = inode.size() as isize; + assert!(file_size >= 0); + file_size + .checked_add(off) + .ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))? + } + SeekFrom::Current(off /* as isize */) => (*offset as isize) + .checked_add(off) + .ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))?, + }; + if new_offset < 0 { + return_errno_with_message!(Errno::EINVAL, "file offset must not be negative"); + } + // Invariant: 0 <= new_offset <= isize::MAX + let new_offset = new_offset as usize; + *offset = new_offset; + Ok(new_offset) +} + +pub fn do_fallocate_util( + inode: &Arc, + status_flags: StatusFlags, + mode: FallocMode, + offset: usize, + len: usize, +) -> Result<()> { + if status_flags.contains(StatusFlags::O_APPEND) + && (mode == FallocMode::PunchHoleKeepSize + || mode == FallocMode::CollapseRange + || mode == FallocMode::InsertRange) + { + return_errno_with_message!( + Errno::EPERM, + "the flags do not work on the append-only file" + ); + } + if status_flags.contains(StatusFlags::O_DIRECT) || status_flags.contains(StatusFlags::O_PATH) { + return_errno_with_message!( + Errno::EBADF, + "currently fallocate file with O_DIRECT or O_PATH is not supported" + ); + } + + inode.fallocate(mode, offset, len) +} + +pub fn do_resize_util( + inode: &Arc, + status_flags: StatusFlags, + new_size: usize, +) -> Result<()> { + if status_flags.contains(StatusFlags::O_APPEND) { + // FIXME: It's allowed to `ftruncate` an append-only file on Linux. 
+ return_errno_with_message!(Errno::EPERM, "can not resize append-only file"); + } + inode.resize(new_size) +} diff --git a/kernel/src/fs/ramfs/fs.rs b/kernel/src/fs/ramfs/fs.rs index 365eff1b6..e2a6485d4 100644 --- a/kernel/src/fs/ramfs/fs.rs +++ b/kernel/src/fs/ramfs/fs.rs @@ -420,6 +420,19 @@ impl RamInode { }) } + fn new_file_detached(mode: InodeMode, uid: Uid, gid: Gid) -> Arc { + Arc::new_cyclic(|weak_self| RamInode { + inner: Inner::new_file(weak_self.clone()), + metadata: SpinLock::new(InodeMeta::new(mode, uid, gid)), + ino: weak_self.as_ptr() as u64, + typ: InodeType::File, + this: weak_self.clone(), + fs: Weak::new(), + extension: Extension::new(), + xattr: RamXattr::new(), + }) + } + fn new_symlink(fs: &Arc, mode: InodeMode, uid: Uid, gid: Gid) -> Arc { Arc::new_cyclic(|weak_self| RamInode { inner: Inner::new_symlink(), @@ -1236,6 +1249,13 @@ impl Inode for RamInode { } } +/// Creates a RAM inode that is detached from any `RamFS`. +/// +// TODO: Add "anonymous inode fs" and link the inode to it. +pub fn new_detached_inode(mode: InodeMode, uid: Uid, gid: Gid) -> Arc { + RamInode::new_file_detached(mode, uid, gid) +} + fn write_lock_two_direntries_by_ino<'a>( this: (u64, &'a RwLock), other: (u64, &'a RwLock), diff --git a/kernel/src/fs/ramfs/mod.rs b/kernel/src/fs/ramfs/mod.rs index 96a7acbb9..7e9a55a13 100644 --- a/kernel/src/fs/ramfs/mod.rs +++ b/kernel/src/fs/ramfs/mod.rs @@ -4,7 +4,7 @@ use alloc::sync::Arc; -pub use fs::RamFS; +pub use fs::{new_detached_inode, RamFS}; use crate::fs::ramfs::fs::RamFsType; diff --git a/kernel/src/syscall/arch/loongarch.rs b/kernel/src/syscall/arch/loongarch.rs index 2664d2c3d..8375361fb 100644 --- a/kernel/src/syscall/arch/loongarch.rs +++ b/kernel/src/syscall/arch/loongarch.rs @@ -57,6 +57,7 @@ use super::{ listen::sys_listen, lseek::sys_lseek, madvise::sys_madvise, + memfd_create::sys_memfd_create, mkdir::sys_mkdirat, mknod::sys_mknodat, mmap::sys_mmap, @@ -293,6 +294,7 @@ impl_syscall_nums_and_dispatch_fn! 
{ SYS_SCHED_SETATTR = 274 => sys_sched_setattr(args[..3]); SYS_SCHED_GETATTR = 275 => sys_sched_getattr(args[..4]); SYS_GETRANDOM = 278 => sys_getrandom(args[..3]); + SYS_MEMFD_CREATE = 279 => sys_memfd_create(args[..2]); SYS_EXECVEAT = 281 => sys_execveat(args[..5], &mut user_ctx); SYS_PREADV2 = 286 => sys_preadv2(args[..5]); SYS_PWRITEV2 = 287 => sys_pwritev2(args[..5]); diff --git a/kernel/src/syscall/arch/riscv.rs b/kernel/src/syscall/arch/riscv.rs index fdc21b84f..9c7bf9e82 100644 --- a/kernel/src/syscall/arch/riscv.rs +++ b/kernel/src/syscall/arch/riscv.rs @@ -59,6 +59,7 @@ use super::{ listen::sys_listen, lseek::sys_lseek, madvise::sys_madvise, + memfd_create::sys_memfd_create, mkdir::sys_mkdirat, mknod::sys_mknodat, mmap::sys_mmap, @@ -300,6 +301,7 @@ impl_syscall_nums_and_dispatch_fn! { SYS_SCHED_SETATTR = 274 => sys_sched_setattr(args[..3]); SYS_SCHED_GETATTR = 275 => sys_sched_getattr(args[..4]); SYS_GETRANDOM = 278 => sys_getrandom(args[..3]); + SYS_MEMFD_CREATE = 279 => sys_memfd_create(args[..2]); SYS_EXECVEAT = 281 => sys_execveat(args[..5], &mut user_ctx); SYS_PREADV2 = 286 => sys_preadv2(args[..5]); SYS_PWRITEV2 = 287 => sys_pwritev2(args[..5]); diff --git a/kernel/src/syscall/arch/x86.rs b/kernel/src/syscall/arch/x86.rs index 756462020..dce649622 100644 --- a/kernel/src/syscall/arch/x86.rs +++ b/kernel/src/syscall/arch/x86.rs @@ -68,6 +68,7 @@ use super::{ listxattr::{sys_flistxattr, sys_listxattr, sys_llistxattr}, lseek::sys_lseek, madvise::sys_madvise, + memfd_create::sys_memfd_create, mkdir::{sys_mkdir, sys_mkdirat}, mknod::{sys_mknod, sys_mknodat}, mmap::sys_mmap, @@ -377,6 +378,7 @@ impl_syscall_nums_and_dispatch_fn! 
{ SYS_SCHED_SETATTR = 314 => sys_sched_setattr(args[..3]); SYS_SCHED_GETATTR = 315 => sys_sched_getattr(args[..4]); SYS_GETRANDOM = 318 => sys_getrandom(args[..3]); + SYS_MEMFD_CREATE = 319 => sys_memfd_create(args[..2]); SYS_EXECVEAT = 322 => sys_execveat(args[..5], &mut user_ctx); SYS_PREADV2 = 327 => sys_preadv2(args[..5]); SYS_PWRITEV2 = 328 => sys_pwritev2(args[..5]); diff --git a/kernel/src/syscall/memfd_create.rs b/kernel/src/syscall/memfd_create.rs new file mode 100644 index 000000000..0bce23e90 --- /dev/null +++ b/kernel/src/syscall/memfd_create.rs @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MPL-2.0 + +use bitflags::bitflags; + +use super::SyscallReturn; +use crate::{ + fs::file_table::FdFlags, + prelude::*, + vm::memfd::{MemfdFile, MAX_MEMFD_NAME_LEN}, +}; + +bitflags! { + struct MemfdFlags: u32 { + /// Close on exec. + const MFD_CLOEXEC = 1 << 0; + /// Allow sealing operations on this file. + const MFD_ALLOW_SEALING = 1 << 1; + /// Create in the hugetlbfs. + const MFD_HUGETLB = 1 << 2; + } +} + +pub fn sys_memfd_create(name_addr: Vaddr, flags: u32, ctx: &Context) -> Result { + // FIXME: When `name` is too long, `read_cstring` returns `EFAULT`. However, + // according to the `memfd_create(2)` man page, + // we should return `EINVAL` in this case. + let name = ctx + .user_space() + .read_cstring(name_addr, MAX_MEMFD_NAME_LEN + 1)?; + debug!("sys_memfd_create: name = {:?}, flags = {}", name, flags); + + let memfd_file = MemfdFile::new(name.to_string_lossy().as_ref())?; + + let fd = { + let memfd_flags = MemfdFlags::from_bits(flags).ok_or(Errno::EINVAL)?; + let fd_flags = if memfd_flags.contains(MemfdFlags::MFD_CLOEXEC) { + FdFlags::CLOEXEC + } else { + FdFlags::empty() + }; + let file_table = ctx.thread_local.borrow_file_table(); + let mut file_table_locked = file_table.unwrap().write(); + + // FIXME: Support `MFD_ALLOW_SEALING` and `MFD_HUGETLB`. 
+ if memfd_flags.contains(MemfdFlags::MFD_ALLOW_SEALING) { + warn!("sealing not supported"); + } + file_table_locked.insert(Arc::new(memfd_file), fd_flags) + }; + + Ok(SyscallReturn::Return(fd as _)) +} diff --git a/kernel/src/syscall/mod.rs b/kernel/src/syscall/mod.rs index b0cbea97d..3cbe6e341 100644 --- a/kernel/src/syscall/mod.rs +++ b/kernel/src/syscall/mod.rs @@ -79,6 +79,7 @@ mod listen; mod listxattr; mod lseek; mod madvise; +mod memfd_create; mod mkdir; mod mknod; mod mmap; diff --git a/kernel/src/vm/memfd.rs b/kernel/src/vm/memfd.rs new file mode 100644 index 000000000..5b0986723 --- /dev/null +++ b/kernel/src/vm/memfd.rs @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MPL-2.0 + +//! Memfd Implementation. + +use alloc::format; +use core::sync::atomic::{AtomicU32, Ordering}; + +use inherit_methods_macro::inherit_methods; + +use crate::{ + events::IoEvents, + fs::{ + file_handle::FileLike, + inode_handle::{do_fallocate_util, do_resize_util, do_seek_util}, + ramfs::new_detached_inode, + utils::{ + AccessMode, FallocMode, Inode, InodeMode, IoctlCmd, Metadata, SeekFrom, StatusFlags, + }, + }, + prelude::*, + process::{ + signal::{PollHandle, Pollable}, + Gid, Uid, + }, +}; + +/// Maximum file name length for `memfd_create`, excluding the final `\0` byte. +/// +/// See `MFD_NAME_MAX_LEN` in Linux's `mm/memfd.c`. +pub const MAX_MEMFD_NAME_LEN: usize = 249; + +pub struct MemfdFile { + inode: Arc, + #[expect(dead_code)] + name: String, + offset: Mutex, + access_mode: AccessMode, + status_flags: AtomicU32, +} + +impl MemfdFile { + pub fn new(name: &str) -> Result { + if name.len() > MAX_MEMFD_NAME_LEN { + return_errno_with_message!(Errno::EINVAL, "MemfdManager: `name` is too long."); + } + + // When Linux performs `memfd_create`, it first creates a RAM inode in a ramfs, + // then immediately unlinks it, and finally returns only the file descriptor. + // Therefore, when using `readlink("/proc/<pid>/fd/<fd>", ...)` to get the file + // path of a `memfd` file, the result will have a `(deleted)` suffix. 
We stay + // consistent with Linux here. + // + // Reference: the `memfd_create` implementation in Linux's `mm/memfd.c`. + let name = format!("/memfd:{} (deleted)", name); + let inode = new_detached_inode( + InodeMode::from_bits_truncate(0o777), + Uid::new_root(), + Gid::new_root(), + ); + + Ok(Self { + inode, + name, + offset: Mutex::new(0), + access_mode: AccessMode::O_RDWR, + status_flags: AtomicU32::new(0), + }) + } +} + +impl Pollable for MemfdFile { + fn poll(&self, mask: IoEvents, _poller: Option<&mut PollHandle>) -> IoEvents { + (IoEvents::IN | IoEvents::OUT) & mask + } +} + +#[inherit_methods(from = "self.inode")] +impl FileLike for MemfdFile { + fn read_at(&self, offset: usize, writer: &mut VmWriter) -> Result; + fn ioctl(&self, cmd: IoctlCmd, arg: usize) -> Result; + fn metadata(&self) -> Metadata; + fn mode(&self) -> Result; + fn set_mode(&self, mode: InodeMode) -> Result<()>; + fn owner(&self) -> Result; + fn set_owner(&self, uid: Uid) -> Result<()>; + fn group(&self) -> Result; + fn set_group(&self, gid: Gid) -> Result<()>; + + fn read(&self, writer: &mut VmWriter) -> Result { + let mut offset = self.offset.lock(); + + let len = self.read_at(*offset, writer)?; + *offset += len; + + Ok(len) + } + + fn write(&self, reader: &mut VmReader) -> Result { + let mut offset = self.offset.lock(); + + if self.status_flags().contains(StatusFlags::O_APPEND) { + *offset = self.inode.size(); + } + + let len = self.write_at(*offset, reader)?; + *offset += len; + + Ok(len) + } + + fn write_at(&self, mut offset: usize, reader: &mut VmReader) -> Result { + if self.status_flags().contains(StatusFlags::O_APPEND) { + // If the file has the O_APPEND flag, the offset is ignored + offset = self.inode.size(); + } + + self.inode.write_at(offset, reader) + } + + fn resize(&self, new_size: usize) -> Result<()> { + do_resize_util(&self.inode, self.status_flags(), new_size) + } + + fn status_flags(&self) -> StatusFlags { + let bits = self.status_flags.load(Ordering::Relaxed); + StatusFlags::from_bits(bits).unwrap() + } + + fn 
set_status_flags(&self, new_status_flags: StatusFlags) -> Result<()> { + self.status_flags + .store(new_status_flags.bits(), Ordering::Relaxed); + Ok(()) + } + + fn access_mode(&self) -> AccessMode { + self.access_mode + } + + fn seek(&self, pos: SeekFrom) -> Result { + do_seek_util(&self.inode, &self.offset, pos) + } + + fn fallocate(&self, mode: FallocMode, offset: usize, len: usize) -> Result<()> { + do_fallocate_util(&self.inode, self.status_flags(), mode, offset, len) + } + + fn inode(&self) -> Option<&Arc> { + Some(&self.inode) + } +} diff --git a/kernel/src/vm/mod.rs b/kernel/src/vm/mod.rs index 48f274dfa..270ae5e57 100644 --- a/kernel/src/vm/mod.rs +++ b/kernel/src/vm/mod.rs @@ -19,6 +19,7 @@ use osdk_frame_allocator::FrameAllocator; use osdk_heap_allocator::{type_from_layout, HeapAllocator}; +pub mod memfd; pub mod page_fault_handler; pub mod perms; pub mod util; diff --git a/test/src/syscall/gvisor/Makefile b/test/src/syscall/gvisor/Makefile index b47ce3a38..fbe30b8d1 100644 --- a/test/src/syscall/gvisor/Makefile +++ b/test/src/syscall/gvisor/Makefile @@ -21,6 +21,7 @@ TESTS ?= \ ioctl_test \ link_test \ lseek_test \ + memfd_test \ mkdir_test \ mknod_test \ mmap_test \ diff --git a/test/src/syscall/gvisor/blocklists/memfd_test b/test/src/syscall/gvisor/blocklists/memfd_test new file mode 100644 index 000000000..f374cd972 --- /dev/null +++ b/test/src/syscall/gvisor/blocklists/memfd_test @@ -0,0 +1,6 @@ +MemfdTest.Name +MemfdTest.Seal* +MemfdTest.NoPartialSealApplicationWhenWriteSealRejected +MemfdTest.TmpfsFilesHaveSealSeal +MemfdTest.CanOpenFromProcfs +MemfdTest.MemfdMustBeWritableToModifySeals \ No newline at end of file