Support the system call `memfd_create`

This commit is contained in:
Wang Siyuan 2025-07-30 09:00:08 +00:00 committed by Tate, Hongliang Tian
parent 59ceb6d27c
commit 48fa40bbf2
13 changed files with 314 additions and 56 deletions

View File

@ -15,7 +15,7 @@ support the loading of Linux kernel modules.
## System Calls
At the time of writing,
Asterinas implements 214 out of the 336 system calls
Asterinas implements 219 out of the 336 system calls
provided by Linux on x86-64 architecture.
| Numbers | Names | Is Implemented |
@ -337,6 +337,7 @@ provided by Linux on x86-64 architecture.
| 314 | sched_setattr | ✅ |
| 315 | sched_getattr | ✅ |
| 318 | getrandom | ✅ |
| 319 | memfd_create | ✅ |
| 322 | execveat | ✅ |
| 327 | preadv2 | ✅ |
| 328 | pwritev2 | ✅ |

View File

@ -18,9 +18,9 @@ use crate::{
file_handle::FileLike,
path::Dentry,
utils::{
AccessMode, DirentVisitor, FallocMode, FileRange, FlockItem, FlockList, InodeMode,
InodeType, IoctlCmd, Metadata, RangeLockItem, RangeLockItemBuilder, RangeLockList,
RangeLockType, SeekFrom, StatusFlags, OFFSET_MAX,
AccessMode, DirentVisitor, FallocMode, FileRange, FlockItem, FlockList, Inode,
InodeMode, InodeType, IoctlCmd, Metadata, RangeLockItem, RangeLockItemBuilder,
RangeLockList, RangeLockType, SeekFrom, StatusFlags, OFFSET_MAX,
},
},
prelude::*,
@ -114,32 +114,7 @@ impl InodeHandle_ {
}
pub fn seek(&self, pos: SeekFrom) -> Result<usize> {
let mut offset = self.offset.lock();
let new_offset: isize = match pos {
SeekFrom::Start(off /* as usize */) => {
if off > isize::MAX as usize {
return_errno_with_message!(Errno::EINVAL, "file offset is too large");
}
off as isize
}
SeekFrom::End(off /* as isize */) => {
let file_size = self.dentry.size() as isize;
assert!(file_size >= 0);
file_size
.checked_add(off)
.ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))?
}
SeekFrom::Current(off /* as isize */) => (*offset as isize)
.checked_add(off)
.ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))?,
};
if new_offset < 0 {
return_errno_with_message!(Errno::EINVAL, "file offset must not be negative");
}
// Invariant: 0 <= new_offset <= isize::MAX
let new_offset = new_offset as usize;
*offset = new_offset;
Ok(new_offset)
do_seek_util(self.dentry.inode(), &self.offset, pos)
}
pub fn offset(&self) -> usize {
@ -148,10 +123,7 @@ impl InodeHandle_ {
}
pub fn resize(&self, new_size: usize) -> Result<()> {
if self.status_flags().contains(StatusFlags::O_APPEND) {
return_errno_with_message!(Errno::EPERM, "can not resize append-only file");
}
self.dentry.resize(new_size)
do_resize_util(self.dentry.inode(), self.status_flags(), new_size)
}
pub fn access_mode(&self) -> AccessMode {
@ -184,27 +156,7 @@ impl InodeHandle_ {
}
fn fallocate(&self, mode: FallocMode, offset: usize, len: usize) -> Result<()> {
let status_flags = self.status_flags();
if status_flags.contains(StatusFlags::O_APPEND)
&& (mode == FallocMode::PunchHoleKeepSize
|| mode == FallocMode::CollapseRange
|| mode == FallocMode::InsertRange)
{
return_errno_with_message!(
Errno::EPERM,
"the flags do not work on the append-only file"
);
}
if status_flags.contains(StatusFlags::O_DIRECT)
|| status_flags.contains(StatusFlags::O_PATH)
{
return_errno_with_message!(
Errno::EBADF,
"currently fallocate file with O_DIRECT or O_PATH is not supported"
);
}
self.dentry.inode().fallocate(mode, offset, len)
do_fallocate_util(self.dentry.inode(), self.status_flags(), mode, offset, len)
}
fn ioctl(&self, cmd: IoctlCmd, arg: usize) -> Result<i32> {
@ -388,3 +340,71 @@ pub trait FileIo: Pollable + Send + Sync + 'static {
return_errno_with_message!(Errno::EINVAL, "ioctl is not supported");
}
}
pub fn do_seek_util(inode: &Arc<dyn Inode>, offset: &Mutex<usize>, pos: SeekFrom) -> Result<usize> {
let mut offset = offset.lock();
let new_offset: isize = match pos {
SeekFrom::Start(off /* as usize */) => {
if off > isize::MAX as usize {
return_errno_with_message!(Errno::EINVAL, "file offset is too large");
}
off as isize
}
SeekFrom::End(off /* as isize */) => {
let file_size = inode.size() as isize;
assert!(file_size >= 0);
file_size
.checked_add(off)
.ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))?
}
SeekFrom::Current(off /* as isize */) => (*offset as isize)
.checked_add(off)
.ok_or_else(|| Error::with_message(Errno::EOVERFLOW, "file offset overflow"))?,
};
if new_offset < 0 {
return_errno_with_message!(Errno::EINVAL, "file offset must not be negative");
}
// Invariant: 0 <= new_offset <= isize::MAX
let new_offset = new_offset as usize;
*offset = new_offset;
Ok(new_offset)
}
/// Validates the open-file status flags for an `fallocate` request and then
/// forwards the request to `inode`.
///
/// # Errors
///
/// - `EPERM` if the file is opened with `O_APPEND` and `mode` is one of
///   `PunchHoleKeepSize`, `CollapseRange` or `InsertRange`.
/// - `EBADF` if the file is opened with `O_DIRECT` or `O_PATH`.
/// - Any error returned by the inode's own `fallocate`.
pub fn do_fallocate_util(
    inode: &Arc<dyn Inode>,
    status_flags: StatusFlags,
    mode: FallocMode,
    offset: usize,
    len: usize,
) -> Result<()> {
    let is_append_only = status_flags.contains(StatusFlags::O_APPEND);
    let mode_rejected_on_append = matches!(
        mode,
        FallocMode::PunchHoleKeepSize | FallocMode::CollapseRange | FallocMode::InsertRange
    );
    if is_append_only && mode_rejected_on_append {
        return_errno_with_message!(
            Errno::EPERM,
            "the flags do not work on the append-only file"
        );
    }

    let is_direct = status_flags.contains(StatusFlags::O_DIRECT);
    let is_path_only = status_flags.contains(StatusFlags::O_PATH);
    if is_direct || is_path_only {
        return_errno_with_message!(
            Errno::EBADF,
            "currently fallocate file with O_DIRECT or O_PATH is not supported"
        );
    }

    inode.fallocate(mode, offset, len)
}
/// Resizes `inode` to `new_size` on behalf of an open file with the given
/// status flags.
///
/// # Errors
///
/// - `EPERM` if the file is opened with `O_APPEND`.
/// - Any error returned by the inode's own `resize`.
pub fn do_resize_util(
    inode: &Arc<dyn Inode>,
    status_flags: StatusFlags,
    new_size: usize,
) -> Result<()> {
    let is_append_only = status_flags.contains(StatusFlags::O_APPEND);
    if is_append_only {
        // FIXME: It's allowed to `ftruncate` an append-only file on Linux.
        return_errno_with_message!(Errno::EPERM, "can not resize append-only file");
    }

    inode.resize(new_size)
}

View File

@ -420,6 +420,19 @@ impl RamInode {
})
}
fn new_file_detached(mode: InodeMode, uid: Uid, gid: Gid) -> Arc<Self> {
Arc::new_cyclic(|weak_self| RamInode {
inner: Inner::new_file(weak_self.clone()),
metadata: SpinLock::new(InodeMeta::new(mode, uid, gid)),
ino: weak_self.as_ptr() as u64,
typ: InodeType::File,
this: weak_self.clone(),
fs: Weak::new(),
extension: Extension::new(),
xattr: RamXattr::new(),
})
}
fn new_symlink(fs: &Arc<RamFS>, mode: InodeMode, uid: Uid, gid: Gid) -> Arc<Self> {
Arc::new_cyclic(|weak_self| RamInode {
inner: Inner::new_symlink(),
@ -1236,6 +1249,13 @@ impl Inode for RamInode {
}
}
/// Creates a RAM inode that is detached from any `RamFS`.
///
// TODO: Add "anonymous inode fs" and link the inode to it.
pub fn new_detached_inode(mode: InodeMode, uid: Uid, gid: Gid) -> Arc<dyn Inode> {
RamInode::new_file_detached(mode, uid, gid)
}
fn write_lock_two_direntries_by_ino<'a>(
this: (u64, &'a RwLock<DirEntry>),
other: (u64, &'a RwLock<DirEntry>),

View File

@ -4,7 +4,7 @@
use alloc::sync::Arc;
pub use fs::RamFS;
pub use fs::{new_detached_inode, RamFS};
use crate::fs::ramfs::fs::RamFsType;

View File

@ -57,6 +57,7 @@ use super::{
listen::sys_listen,
lseek::sys_lseek,
madvise::sys_madvise,
memfd_create::sys_memfd_create,
mkdir::sys_mkdirat,
mknod::sys_mknodat,
mmap::sys_mmap,
@ -293,6 +294,7 @@ impl_syscall_nums_and_dispatch_fn! {
SYS_SCHED_SETATTR = 274 => sys_sched_setattr(args[..3]);
SYS_SCHED_GETATTR = 275 => sys_sched_getattr(args[..4]);
SYS_GETRANDOM = 278 => sys_getrandom(args[..3]);
SYS_MEMFD_CREATE = 279 => sys_memfd_create(args[..2]);
SYS_EXECVEAT = 281 => sys_execveat(args[..5], &mut user_ctx);
SYS_PREADV2 = 286 => sys_preadv2(args[..5]);
SYS_PWRITEV2 = 287 => sys_pwritev2(args[..5]);

View File

@ -59,6 +59,7 @@ use super::{
listen::sys_listen,
lseek::sys_lseek,
madvise::sys_madvise,
memfd_create::sys_memfd_create,
mkdir::sys_mkdirat,
mknod::sys_mknodat,
mmap::sys_mmap,
@ -300,6 +301,7 @@ impl_syscall_nums_and_dispatch_fn! {
SYS_SCHED_SETATTR = 274 => sys_sched_setattr(args[..3]);
SYS_SCHED_GETATTR = 275 => sys_sched_getattr(args[..4]);
SYS_GETRANDOM = 278 => sys_getrandom(args[..3]);
SYS_MEMFD_CREATE = 279 => sys_memfd_create(args[..2]);
SYS_EXECVEAT = 281 => sys_execveat(args[..5], &mut user_ctx);
SYS_PREADV2 = 286 => sys_preadv2(args[..5]);
SYS_PWRITEV2 = 287 => sys_pwritev2(args[..5]);

View File

@ -68,6 +68,7 @@ use super::{
listxattr::{sys_flistxattr, sys_listxattr, sys_llistxattr},
lseek::sys_lseek,
madvise::sys_madvise,
memfd_create::sys_memfd_create,
mkdir::{sys_mkdir, sys_mkdirat},
mknod::{sys_mknod, sys_mknodat},
mmap::sys_mmap,
@ -377,6 +378,7 @@ impl_syscall_nums_and_dispatch_fn! {
SYS_SCHED_SETATTR = 314 => sys_sched_setattr(args[..3]);
SYS_SCHED_GETATTR = 315 => sys_sched_getattr(args[..4]);
SYS_GETRANDOM = 318 => sys_getrandom(args[..3]);
SYS_MEMFD_CREATE = 319 => sys_memfd_create(args[..2]);
SYS_EXECVEAT = 322 => sys_execveat(args[..5], &mut user_ctx);
SYS_PREADV2 = 327 => sys_preadv2(args[..5]);
SYS_PWRITEV2 = 328 => sys_pwritev2(args[..5]);

View File

@ -0,0 +1,52 @@
// SPDX-License-Identifier: MPL-2.0
use bitflags::bitflags;
use super::SyscallReturn;
use crate::{
fs::file_table::FdFlags,
prelude::*,
vm::memfd::{MemfdFile, MAX_MEMFD_NAME_LEN},
};
bitflags! {
    /// Flags accepted by the `memfd_create` system call.
    struct MemfdFlags: u32 {
        /// Set the close-on-exec flag on the new file descriptor.
        const MFD_CLOEXEC = 0x0001;
        /// Allow sealing operations on this file.
        const MFD_ALLOW_SEALING = 0x0002;
        /// Create the file in the hugetlbfs.
        const MFD_HUGETLB = 0x0004;
    }
}
/// Handles the `memfd_create` system call.
///
/// Creates a memory-backed file named after the C string at `name_addr`,
/// installs it into the calling thread's file table and returns the new
/// file descriptor.
///
/// # Errors
///
/// - `EINVAL` if `flags` contains bits that are not defined in `MemfdFlags`.
/// - Any error from reading the name from user space.
/// - Any error returned by `MemfdFile::new`.
pub fn sys_memfd_create(name_addr: Vaddr, flags: u32, ctx: &Context) -> Result<SyscallReturn> {
    // Validate the flags before touching user memory or allocating anything,
    // so that an invalid request is rejected without side effects.
    let memfd_flags = MemfdFlags::from_bits(flags).ok_or(Errno::EINVAL)?;

    // FIXME: When `name` is too long, `read_cstring` returns `EFAULT`. However,
    // according to <https://man7.org/linux/man-pages/man2/memfd_create.2.html>,
    // we should return `EINVAL` in this case.
    let name = ctx
        .user_space()
        .read_cstring(name_addr, MAX_MEMFD_NAME_LEN + 1)?;

    debug!("sys_memfd_create: name = {:?}, flags = {}", name, flags);

    // FIXME: Support `MFD_ALLOW_SEALING` and `MFD_HUGETLB`.
    // The warnings are emitted before the file table lock is taken.
    if memfd_flags.contains(MemfdFlags::MFD_ALLOW_SEALING) {
        warn!("sealing not supported");
    }
    if memfd_flags.contains(MemfdFlags::MFD_HUGETLB) {
        warn!("MFD_HUGETLB is not supported and is ignored");
    }

    let memfd_file = MemfdFile::new(name.to_string_lossy().as_ref())?;

    let fd_flags = if memfd_flags.contains(MemfdFlags::MFD_CLOEXEC) {
        FdFlags::CLOEXEC
    } else {
        FdFlags::empty()
    };

    // Keep the write lock scope limited to the insertion itself.
    let fd = {
        let file_table = ctx.thread_local.borrow_file_table();
        let mut file_table_locked = file_table.unwrap().write();
        file_table_locked.insert(Arc::new(memfd_file), fd_flags)
    };

    Ok(SyscallReturn::Return(fd as _))
}

View File

@ -79,6 +79,7 @@ mod listen;
mod listxattr;
mod lseek;
mod madvise;
mod memfd_create;
mod mkdir;
mod mknod;
mod mmap;

150
kernel/src/vm/memfd.rs Normal file
View File

@ -0,0 +1,150 @@
// SPDX-License-Identifier: MPL-2.0
//! Memfd Implementation.
use alloc::format;
use core::sync::atomic::{AtomicU32, Ordering};
use inherit_methods_macro::inherit_methods;
use crate::{
events::IoEvents,
fs::{
file_handle::FileLike,
inode_handle::{do_fallocate_util, do_resize_util, do_seek_util},
ramfs::new_detached_inode,
utils::{
AccessMode, FallocMode, Inode, InodeMode, IoctlCmd, Metadata, SeekFrom, StatusFlags,
},
},
prelude::*,
process::{
signal::{PollHandle, Pollable},
Gid, Uid,
},
};
/// Maximum length in bytes of the name passed to `memfd_create`, not counting
/// the terminating `\0` byte.
///
/// Linux derives this limit as `NAME_MAX` (255) minus the length of the
/// `memfd:` prefix it prepends to the name, which gives 249.
///
/// See <https://man7.org/linux/man-pages/man2/memfd_create.2.html>
pub const MAX_MEMFD_NAME_LEN: usize = 255 - "memfd:".len();
/// An open file created by `memfd_create`.
///
/// The file owns its backing inode together with the per-open-file state:
/// the current offset, the access mode and the status flags.
pub struct MemfdFile {
    /// The inode that holds the file's contents and metadata.
    inode: Arc<dyn Inode>,
    /// The display name of the file; stored but not read anywhere yet.
    #[expect(dead_code)]
    name: String,
    /// The current file offset used by `read`, `write` and `seek`.
    offset: Mutex<usize>,
    /// The access mode reported by `access_mode()`.
    access_mode: AccessMode,
    /// The raw bits of the file's `StatusFlags`.
    status_flags: AtomicU32,
}
impl MemfdFile {
    /// Creates a new memfd file with the given user-supplied `name`.
    ///
    /// The file starts at offset 0, is opened read-write, has no status flags
    /// set, and is backed by a freshly created detached RAM inode with mode
    /// `0o777` owned by the root user and group.
    ///
    /// # Errors
    ///
    /// Returns `EINVAL` if `name` is longer than `MAX_MEMFD_NAME_LEN` bytes.
    pub fn new(name: &str) -> Result<Self> {
        if name.len() > MAX_MEMFD_NAME_LEN {
            return_errno_with_message!(Errno::EINVAL, "MemfdManager: `name` is too long.");
        }

        let mode = InodeMode::from_bits_truncate(0o777);
        let inode = new_detached_inode(mode, Uid::new_root(), Gid::new_root());

        // When Linux performs `memfd_create`, it first creates a RAM inode in a ramfs,
        // then immediately unlinks it, and finally returns only the file descriptor.
        // Therefore, when using `readlink("/proc/<pid>/fd/<fd>", ...)` to get the file
        // path of a `memfd` file, the result will have a `(deleted)` suffix. We stay
        // consistent with Linux here.
        //
        // Reference: <https://github.com/torvalds/linux/blob/379f604cc3dc2c865dc2b13d81faa166b6df59ec/mm/shmem.c#L5803-L5837>
        let display_name = format!("/memfd:{} (deleted)", name);

        Ok(Self {
            inode,
            name: display_name,
            offset: Mutex::new(0),
            access_mode: AccessMode::O_RDWR,
            status_flags: AtomicU32::new(0),
        })
    }
}
impl Pollable for MemfdFile {
    /// Reports the file as both readable and writable, restricted to the
    /// events requested in `mask`. The poller is not registered anywhere.
    fn poll(&self, mask: IoEvents, _poller: Option<&mut PollHandle>) -> IoEvents {
        let ready = IoEvents::IN | IoEvents::OUT;
        mask & ready
    }
}
#[inherit_methods(from = "self.inode")]
impl FileLike for MemfdFile {
fn read_at(&self, offset: usize, writer: &mut VmWriter) -> Result<usize>;
fn ioctl(&self, cmd: IoctlCmd, arg: usize) -> Result<i32>;
fn metadata(&self) -> Metadata;
fn mode(&self) -> Result<InodeMode>;
fn set_mode(&self, mode: InodeMode) -> Result<()>;
fn owner(&self) -> Result<Uid>;
fn set_owner(&self, uid: Uid) -> Result<()>;
fn group(&self) -> Result<Gid>;
fn set_group(&self, gid: Gid) -> Result<()>;
fn read(&self, writer: &mut VmWriter) -> Result<usize> {
let mut offset = self.offset.lock();
let len = self.read_at(*offset, writer)?;
*offset += len;
Ok(len)
}
fn write(&self, reader: &mut VmReader) -> Result<usize> {
let mut offset = self.offset.lock();
if self.status_flags().contains(StatusFlags::O_APPEND) {
*offset = self.inode.size();
}
let len = self.write_at(*offset, reader)?;
*offset += len;
Ok(len)
}
fn write_at(&self, mut offset: usize, reader: &mut VmReader) -> Result<usize> {
if self.status_flags().contains(StatusFlags::O_APPEND) {
// If the file has the O_APPEND flag, the offset is ignored
offset = self.inode.size();
}
self.inode.write_at(offset, reader)
}
fn resize(&self, new_size: usize) -> Result<()> {
do_resize_util(&self.inode, self.status_flags(), new_size)
}
fn status_flags(&self) -> StatusFlags {
let bits = self.status_flags.load(Ordering::Relaxed);
StatusFlags::from_bits(bits).unwrap()
}
fn set_status_flags(&self, new_status_flags: StatusFlags) -> Result<()> {
self.status_flags
.store(new_status_flags.bits(), Ordering::Relaxed);
Ok(())
}
fn access_mode(&self) -> AccessMode {
self.access_mode
}
fn seek(&self, pos: SeekFrom) -> Result<usize> {
do_seek_util(&self.inode, &self.offset, pos)
}
fn fallocate(&self, mode: FallocMode, offset: usize, len: usize) -> Result<()> {
do_fallocate_util(&self.inode, self.status_flags(), mode, offset, len)
}
fn inode(&self) -> Option<&Arc<dyn Inode>> {
Some(&self.inode)
}
}

View File

@ -19,6 +19,7 @@
use osdk_frame_allocator::FrameAllocator;
use osdk_heap_allocator::{type_from_layout, HeapAllocator};
pub mod memfd;
pub mod page_fault_handler;
pub mod perms;
pub mod util;

View File

@ -21,6 +21,7 @@ TESTS ?= \
ioctl_test \
link_test \
lseek_test \
memfd_test \
mkdir_test \
mknod_test \
mmap_test \

View File

@ -0,0 +1,6 @@
MemfdTest.Name
MemfdTest.Seal*
MemfdTest.NoPartialSealApplicationWhenWriteSealRejected
MemfdTest.TmpfsFilesHaveSealSeal
MemfdTest.CanOpenFromProcfs
MemfdTest.MemfdMustBeWritableToModifySeals