Add CLONE_PIDFD & pidfd_open support

This commit is contained in:
jiangjianfeng 2025-06-05 07:25:07 +00:00 committed by Tate, Hongliang Tian
parent e75cbb0545
commit 9219207b97
19 changed files with 279 additions and 40 deletions

View File

@ -341,6 +341,7 @@ provided by Linux on x86-64 architecture.
| 327 | preadv2 | ✅ |
| 328 | pwritev2 | ✅ |
| 332 | statx | ✅ |
| 434 | pidfd_open | ✅ |
| 435 | clone3 | ✅ |
| 436 | close_range | ✅ |
| 439 | faccessat2 | ✅ |

View File

@ -123,6 +123,7 @@ enum DirEntryFileType {
impl From<InodeType> for DirEntryFileType {
fn from(inode_type: InodeType) -> Self {
match inode_type {
InodeType::Unknown => DirEntryFileType::Unknown,
InodeType::NamedPipe => Self::Fifo,
InodeType::CharDevice => Self::Char,
InodeType::Dir => Self::Dir,

View File

@ -25,6 +25,7 @@ use crate::{
#[repr(u16)]
#[derive(Copy, Clone, Debug, Eq, PartialEq, TryFromInt)]
pub enum InodeType {
Unknown = 0o000000,
NamedPipe = 0o010000,
CharDevice = 0o020000,
Dir = 0o040000,

View File

@ -15,9 +15,12 @@ use super::{
use crate::{
cpu::LinuxAbi,
current_userspace,
fs::{file_table::FileTable, thread_info::ThreadFsInfo},
fs::{
file_table::{FdFlags, FileTable},
thread_info::ThreadFsInfo,
},
prelude::*,
process::posix_thread::allocate_posix_tid,
process::{pid_file::PidFile, posix_thread::allocate_posix_tid},
sched::Nice,
thread::{AsThread, Tid},
};
@ -79,7 +82,7 @@ bitflags! {
#[derive(Debug, Clone, Copy, Default)]
pub struct CloneArgs {
pub flags: CloneFlags,
pub _pidfd: Option<u64>,
pub pidfd: Option<Vaddr>,
pub child_tid: Vaddr,
pub parent_tid: Option<Vaddr>,
pub exit_signal: Option<SigNum>,
@ -111,7 +114,7 @@ impl CloneArgs {
flags.contains(CloneFlags::CLONE_PARENT_SETTID),
) {
(false, false) => (None, None),
(true, false) => (Some(parent_tid as u64), None),
(true, false) => (Some(parent_tid), None),
(false, true) => (None, Some(parent_tid)),
(true, true) => {
return_errno_with_message!(
@ -123,7 +126,7 @@ impl CloneArgs {
Ok(Self {
flags,
_pidfd: pidfd,
pidfd,
child_tid,
parent_tid,
exit_signal: (exit_signal != 0).then(|| SigNum::from_u8(exit_signal as u8)),
@ -163,6 +166,7 @@ impl CloneFlags {
| CloneFlags::CLONE_FS
| CloneFlags::CLONE_FILES
| CloneFlags::CLONE_SIGHAND
| CloneFlags::CLONE_PIDFD
| CloneFlags::CLONE_THREAD
| CloneFlags::CLONE_SYSVSEM
| CloneFlags::CLONE_SETTLS
@ -230,6 +234,13 @@ fn clone_child_task(
);
}
if clone_flags.contains(CloneFlags::CLONE_PIDFD) {
return_errno_with_message!(
Errno::EINVAL,
"`CLONE_THREAD` cannot be used together with `CLONE_PIDFD`"
);
}
let Context {
process,
thread_local,
@ -387,6 +398,8 @@ fn clone_child_process(
)
};
clone_pidfd(ctx, &child, clone_flags, clone_args.pidfd)?;
if let Some(sig) = clone_args.exit_signal {
child.set_exit_signal(sig);
};
@ -523,6 +536,38 @@ fn clone_sysvsem(clone_flags: CloneFlags) -> Result<()> {
Ok(())
}
fn clone_pidfd(
ctx: &Context,
child: &Arc<Process>,
clone_flags: CloneFlags,
pidfd_addr: Option<Vaddr>,
) -> Result<()> {
if !clone_flags.contains(CloneFlags::CLONE_PIDFD) {
return Ok(());
}
let pidfd_addr = pidfd_addr.unwrap();
let fd = {
let pid_file = PidFile::new(child.clone(), false);
let file_table = ctx.thread_local.borrow_file_table();
let mut file_table_locked = file_table.unwrap().write();
file_table_locked.insert(Arc::new(pid_file), FdFlags::CLOEXEC)
};
// Since `write_val` may sleep, we cannot hold the file table lock during its execution.
// FIXME: Should we remove the file from the file table if the write operation fails?
match ctx.user_space().write_val(pidfd_addr, &fd) {
Ok(()) => Ok(()),
Err(e) => {
let file_table = ctx.thread_local.borrow_file_table();
let mut file_table_locked = file_table.unwrap().write();
file_table_locked.close_file(fd);
Err(e)
}
}
}
#[expect(clippy::too_many_arguments)]
fn create_child_process(
pid: Pid,

View File

@ -3,7 +3,7 @@
use core::sync::atomic::Ordering;
use super::{process_table, Pid, Process};
use crate::{prelude::*, process::signal::signals::kernel::KernelSignal};
use crate::{events::IoEvents, prelude::*, process::signal::signals::kernel::KernelSignal};
/// Exits the current POSIX process.
///
@ -19,6 +19,8 @@ pub(super) fn exit_process(current_process: &Process) {
// Drop fields in `Process`.
current_process.lock_root_vmar().set_vmar(None);
current_process.pidfile_pollee.notify(IoEvents::IN);
send_parent_death_signal(current_process);
move_children_to_reaper_process(current_process);

View File

@ -4,6 +4,7 @@ mod clone;
pub mod credentials;
mod exit;
mod kill;
mod pid_file;
pub mod posix_thread;
#[expect(clippy::module_inception)]
mod process;
@ -22,6 +23,7 @@ mod wait;
pub use clone::{clone_child, CloneArgs, CloneFlags};
pub use credentials::{Credentials, Gid, Uid};
pub use kill::{kill, kill_all, kill_group, tgkill};
pub use pid_file::PidFile;
pub use process::{
broadcast_signal_async, enqueue_signal_async, spawn_init_process, ExitCode, JobControl, Pgid,
Pid, Process, ProcessGroup, Session, Sid, Terminal,

View File

@ -0,0 +1,120 @@
// SPDX-License-Identifier: MPL-2.0
use core::sync::atomic::{AtomicBool, Ordering};
use crate::{
events::IoEvents,
fs::{
file_handle::FileLike,
utils::{InodeMode, InodeType, Metadata, StatusFlags},
},
prelude::*,
process::{
signal::{PollHandle, Pollable},
Gid, Process, Uid,
},
time::clocks::RealTimeClock,
};
/// A file-like object that refers to a process.
///
/// It reports `IoEvents::IN` to pollers once the referenced process is a zombie.
pub struct PidFile {
/// The process that this PID file refers to.
process: Arc<Process>,
/// Whether the file is in nonblocking mode; mirrors the `O_NONBLOCK` status flag.
is_nonblocking: AtomicBool,
}
impl Debug for PidFile {
    /// Prints the PID of the referenced process and the current nonblocking flag.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let pid = self.process.pid();
        let nonblocking = self.is_nonblocking.load(Ordering::Relaxed);

        let mut builder = f.debug_struct("PidFile");
        builder.field("process", &pid);
        builder.field("is_nonblocking", &nonblocking);
        builder.finish_non_exhaustive()
    }
}
impl PidFile {
    /// Creates a PID file that refers to `process`, with the given initial
    /// nonblocking mode.
    pub fn new(process: Arc<Process>, is_nonblocking: bool) -> Self {
        let is_nonblocking = AtomicBool::new(is_nonblocking);
        Self {
            process,
            is_nonblocking,
        }
    }

    /// Returns the I/O events that are currently pending on this PID file.
    fn check_io_events(&self) -> IoEvents {
        // "A PID file descriptor can be monitored using poll(2), select(2),
        // and epoll(7). When the process that it refers to terminates, these
        // interfaces indicate the file descriptor as readable."
        // Reference: <https://man7.org/linux/man-pages/man2/pidfd_open.2.html>.
        let is_zombie = self.process.status().is_zombie();
        if !is_zombie {
            return IoEvents::empty();
        }
        IoEvents::IN
    }

    /// Returns whether the PID file is currently in nonblocking mode.
    pub(super) fn is_nonblocking(&self) -> bool {
        let flag = &self.is_nonblocking;
        flag.load(Ordering::Relaxed)
    }

    /// Returns the process that this PID file refers to.
    pub(super) fn process(&self) -> &Arc<Process> {
        let Self { process, .. } = self;
        process
    }
}
impl FileLike for PidFile {
    /// Always fails with `EINVAL`.
    fn read(&self, _writer: &mut VmWriter) -> Result<usize> {
        return_errno_with_message!(Errno::EINVAL, "PID file cannot be read");
    }

    /// Always fails with `EINVAL`.
    fn write(&self, _reader: &mut VmReader) -> Result<usize> {
        return_errno_with_message!(Errno::EINVAL, "PID file cannot be written");
    }

    /// Builds the metadata of the PID file.
    ///
    /// All three timestamps are set to the current real time, and the owner is root.
    fn metadata(&self) -> Metadata {
        let timestamp = RealTimeClock::get().read_time();
        let mode = InodeMode::from_bits_truncate(0o600);
        // FIXME: Should we use the process's UID and GID here?
        let (uid, gid) = (Uid::new_root(), Gid::new_root());

        Metadata {
            dev: 0,
            ino: 0,
            size: 0,
            blk_size: 4096,
            blocks: 0,
            atime: timestamp,
            mtime: timestamp,
            ctime: timestamp,
            type_: InodeType::Unknown,
            mode,
            nlinks: 1,
            uid,
            gid,
            rdev: 0,
        }
    }

    /// Updates the nonblocking mode from `new_flags`; only `O_NONBLOCK` is examined.
    fn set_status_flags(&self, new_flags: StatusFlags) -> Result<()> {
        let wants_nonblocking = new_flags.contains(StatusFlags::O_NONBLOCK);
        self.is_nonblocking
            .store(wants_nonblocking, Ordering::Relaxed);
        Ok(())
    }

    /// Returns `O_NONBLOCK` if the file is in nonblocking mode, or no flags otherwise.
    fn status_flags(&self) -> StatusFlags {
        if !self.is_nonblocking() {
            return StatusFlags::empty();
        }
        StatusFlags::O_NONBLOCK
    }
}
impl Pollable for PidFile {
    /// Polls the PID file through the `pidfile_pollee` of the referenced process,
    /// using `check_io_events` to compute the currently pending events.
    fn poll(&self, mask: IoEvents, poller: Option<&mut PollHandle>) -> IoEvents {
        let pollee = &self.process.pidfile_pollee;
        let pending_events = || self.check_io_events();
        pollee.poll_with(mask, poller, pending_events)
    }
}

View File

@ -18,7 +18,7 @@ use super::{
};
use crate::{
prelude::*,
process::{status::StopWaitStatus, WaitOptions},
process::{signal::Pollee, status::StopWaitStatus, WaitOptions},
sched::{AtomicNice, Nice},
thread::{AsThread, Thread},
time::clocks::ProfClock,
@ -64,6 +64,7 @@ pub struct Process {
process_vm: ProcessVm,
/// Wait for child status changed
children_wait_queue: WaitQueue,
pub(super) pidfile_pollee: Pollee,
// Mutable Part
/// The executable path.
@ -205,6 +206,7 @@ impl Process {
executable_path: RwLock::new(executable_path),
process_vm,
children_wait_queue,
pidfile_pollee: Pollee::new(),
status: ProcessStatus::default(),
parent: ParentProcess::new(parent),
children: Mutex::new(BTreeMap::new()),

View File

@ -1,18 +1,19 @@
// SPDX-License-Identifier: MPL-2.0
use super::{Pgid, Pid};
use crate::prelude::*;
use crate::{fs::file_table::get_file_fast, prelude::*, process::PidFile};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Clone)]
pub enum ProcessFilter {
Any,
WithPid(Pid),
WithPgid(Pgid),
WithPidfd(Arc<PidFile>),
}
impl ProcessFilter {
// For `waitpid`.
pub fn from_which_and_id(which: u64, id: u32) -> Result<Self> {
pub fn from_which_and_id(which: u64, id: u32, ctx: &Context) -> Result<Self> {
// Reference:
// <https://elixir.bootlin.com/linux/v6.14.4/source/include/uapi/linux/wait.h#L16-L20>
const P_ALL: u64 = 0;
@ -25,11 +26,14 @@ impl ProcessFilter {
P_PID => Ok(ProcessFilter::WithPid(id)),
P_PGID => Ok(ProcessFilter::WithPgid(id)),
P_PIDFD => {
warn!("the process filter `P_PIDFD` is not supported");
return_errno_with_message!(
Errno::EINVAL,
"the process filter `P_PIDFD` is not supported"
);
let file = {
let mut file_table = ctx.thread_local.borrow_file_table_mut();
get_file_fast!(&mut file_table, id.cast_signed()).into_owned()
};
let pid_file = Arc::downcast(file).map_err(|_| {
Error::with_message(Errno::EINVAL, "the file is not a PID file")
})?;
Ok(ProcessFilter::WithPidfd(pid_file))
}
_ => return_errno_with_message!(Errno::EINVAL, "the process filter is invalid"),
}

View File

@ -66,6 +66,12 @@ pub fn do_wait(
) -> Result<Option<WaitStatus>> {
wait_options.check()?;
let is_nonblocking = if let ProcessFilter::WithPidfd(pid_file) = &child_filter {
pid_file.is_nonblocking()
} else {
false
};
let zombie_child = with_sigmask_changed(
ctx,
|sigmask| sigmask + SIGCHLD,
@ -78,10 +84,13 @@ pub fn do_wait(
let unwaited_children = children_lock
.values()
.filter(|child| match child_filter {
.filter(|child| match &child_filter {
ProcessFilter::Any => true,
ProcessFilter::WithPid(pid) => child.pid() == pid,
ProcessFilter::WithPgid(pgid) => child.pgid() == pgid,
ProcessFilter::WithPid(pid) => child.pid() == *pid,
ProcessFilter::WithPgid(pgid) => child.pgid() == *pgid,
ProcessFilter::WithPidfd(pid_file) => {
Arc::ptr_eq(pid_file.process(), *child)
}
})
.collect::<Box<_>>();
@ -107,6 +116,13 @@ pub fn do_wait(
return Some(Ok(None));
}
if is_nonblocking {
return Some(Err(Error::with_message(
Errno::EAGAIN,
"the PID file is nonblocking and the child has not terminated",
)));
}
// wait
None
})

View File

@ -69,6 +69,7 @@ use super::{
munmap::sys_munmap,
nanosleep::{sys_clock_nanosleep, sys_nanosleep},
open::sys_openat,
pidfd_open::sys_pidfd_open,
pipe::sys_pipe2,
prctl::sys_prctl,
pread64::sys_pread64,
@ -311,6 +312,7 @@ impl_syscall_nums_and_dispatch_fn! {
SYS_TIMERFD_SETTIME = 411 => sys_timerfd_settime(args[..4]);
SYS_UTIMENSAT = 412 => sys_utimensat(args[..4]);
SYS_SEMTIMEDOP = 420 => sys_semtimedop(args[..4]);
SYS_PIDFD_OPEN = 434 => sys_pidfd_open(args[..2]);
SYS_CLONE3 = 435 => sys_clone3(args[..2], &user_ctx);
SYS_CLOSE_RANGE = 436 => sys_close_range(args[..3]);
SYS_FACCESSAT2 = 439 => sys_faccessat2(args[..4]);

View File

@ -79,6 +79,7 @@ use super::{
nanosleep::{sys_clock_nanosleep, sys_nanosleep},
open::{sys_creat, sys_open, sys_openat},
pause::sys_pause,
pidfd_open::sys_pidfd_open,
pipe::{sys_pipe, sys_pipe2},
poll::sys_poll,
ppoll::sys_ppoll,
@ -380,6 +381,7 @@ impl_syscall_nums_and_dispatch_fn! {
SYS_PREADV2 = 327 => sys_preadv2(args[..5]);
SYS_PWRITEV2 = 328 => sys_pwritev2(args[..5]);
SYS_STATX = 332 => sys_statx(args[..5]);
SYS_PIDFD_OPEN = 434 => sys_pidfd_open(args[..2]);
SYS_CLONE3 = 435 => sys_clone3(args[..2], &user_ctx);
SYS_CLOSE_RANGE = 436 => sys_close_range(args[..3]);
SYS_FACCESSAT2 = 439 => sys_faccessat2(args[..4]);

View File

@ -83,11 +83,7 @@ struct Clone3Args {
impl From<Clone3Args> for CloneArgs {
fn from(value: Clone3Args) -> Self {
// TODO: deal with pidfd, set_tid, set_tid_size, cgroup
if value.pidfd != 0 {
warn!("pidfd is not supported");
}
// TODO: Deal with set_tid, set_tid_size, cgroup
if value.set_tid != 0 || value.set_tid_size != 0 {
warn!("set_tid is not supported");
}
@ -98,7 +94,7 @@ impl From<Clone3Args> for CloneArgs {
Self {
flags: CloneFlags::from_bits_truncate(value.flags as u32),
_pidfd: Some(value.pidfd),
pidfd: Some(value.pidfd as Vaddr),
child_tid: value.child_tid as _,
parent_tid: Some(value.parent_tid as _),
exit_signal: (value.exit_signal != 0).then(|| SigNum::from_u8(value.exit_signal as u8)),

View File

@ -233,7 +233,6 @@ impl DirentSerializer for Dirent64 {
#[repr(u8)]
#[derive(Debug, Clone, Copy)]
enum DirentType {
#[expect(dead_code)]
DT_UNKNOWN = 0,
DT_FIFO = 1,
DT_CHR = 2,
@ -249,6 +248,7 @@ enum DirentType {
impl From<InodeType> for DirentType {
fn from(type_: InodeType) -> Self {
match type_ {
InodeType::Unknown => DirentType::DT_UNKNOWN,
InodeType::File => DirentType::DT_REG,
InodeType::Dir => DirentType::DT_DIR,
InodeType::SymLink => DirentType::DT_LNK,

View File

@ -39,6 +39,7 @@ pub fn do_sys_kill(filter: ProcessFilter, sig_num: Option<SigNum>, ctx: &Context
ProcessFilter::Any => kill_all(signal, ctx)?,
ProcessFilter::WithPid(pid) => kill(pid, signal, ctx)?,
ProcessFilter::WithPgid(pgid) => kill_group(pgid, signal, ctx)?,
ProcessFilter::WithPidfd(_) => unreachable!(),
}
Ok(())
}

View File

@ -87,6 +87,7 @@ mod munmap;
mod nanosleep;
mod open;
mod pause;
mod pidfd_open;
mod pipe;
mod poll;
mod ppoll;

View File

@ -0,0 +1,41 @@
// SPDX-License-Identifier: MPL-2.0
use crate::{
fs::{file_table::FdFlags, utils::StatusFlags},
prelude::*,
process::{process_table, Pid, PidFile},
syscall::SyscallReturn,
};
/// Handles the `pidfd_open` system call.
///
/// Creates a [`PidFile`] that refers to the process identified by `pid`, inserts it
/// into the current thread's file table with the close-on-exec flag set, and returns
/// the new file descriptor.
///
/// # Errors
///
/// - `EINVAL` if `flags` contains bits other than `PIDFD_NONBLOCK`;
/// - `EINVAL` if `pid` is zero or negative;
/// - `ESRCH` if no process with the given `pid` exists.
pub fn sys_pidfd_open(pid: Pid, flags: u32, ctx: &Context) -> Result<SyscallReturn> {
    let is_nonblocking = {
        let flags = PidfdFlags::from_bits(flags)
            .ok_or_else(|| Error::with_message(Errno::EINVAL, "invalid flags"))?;
        debug!("pid = {}, flags = {:?}", pid, flags);
        flags.contains(PidfdFlags::PIDFD_NONBLOCK)
    };

    // Linux rejects every non-positive PID with `EINVAL`. Zero must be caught here as
    // well; otherwise it would fall through to the lookup below and yield `ESRCH`.
    if pid.cast_signed() <= 0 {
        return_errno_with_message!(Errno::EINVAL, "the PID must be a positive number");
    }

    let process = process_table::get_process(pid)
        .ok_or_else(|| Error::with_message(Errno::ESRCH, "the process does not exist"))?;

    let pid_fd = {
        let pid_file = Arc::new(PidFile::new(process, is_nonblocking));
        let file_table = ctx.thread_local.borrow_file_table();
        let mut file_table_locked = file_table.unwrap().write();
        // "the close-on-exec flag is set on the file descriptor."
        // Reference: <https://man7.org/linux/man-pages/man2/pidfd_open.2.html>.
        file_table_locked.insert(pid_file, FdFlags::CLOEXEC)
    };

    Ok(SyscallReturn::Return(pid_fd as _))
}
// Flags accepted by `sys_pidfd_open`; any other bit is rejected with `EINVAL`.
bitflags! {
struct PidfdFlags: u32 {
// Opens the PID file in nonblocking mode; shares its bit value with `O_NONBLOCK`.
const PIDFD_NONBLOCK = StatusFlags::O_NONBLOCK.bits();
}
}

View File

@ -22,7 +22,7 @@ pub fn sys_waitid(
ctx: &Context,
) -> Result<SyscallReturn> {
// FIXME: what does rusage use for?
let process_filter = ProcessFilter::from_which_and_id(which, upid as _)?;
let process_filter = ProcessFilter::from_which_and_id(which, upid as _, ctx)?;
let wait_options = WaitOptions::from_bits(options as u32)
.ok_or(Error::with_message(Errno::EINVAL, "invalid options"))?;
@ -46,19 +46,21 @@ pub fn sys_waitid(
return Ok(SyscallReturn::Return(0));
};
let siginfo = {
let (si_code, si_status) = calculate_si_code_and_si_status(&wait_status);
let pid = wait_status.pid();
let uid = wait_status.uid();
if infoq_addr != 0 {
let siginfo = {
let (si_code, si_status) = calculate_si_code_and_si_status(&wait_status);
let pid = wait_status.pid();
let uid = wait_status.uid();
let mut siginfo = siginfo_t::new(SIGCHLD, si_code);
siginfo.set_pid_uid(pid, uid);
siginfo.set_status(si_status);
let mut siginfo = siginfo_t::new(SIGCHLD, si_code);
siginfo.set_pid_uid(pid, uid);
siginfo.set_status(si_status);
siginfo
};
siginfo
};
ctx.user_space().write_val(infoq_addr as usize, &siginfo)?;
ctx.user_space().write_val(infoq_addr as usize, &siginfo)?;
}
Ok(SyscallReturn::Return(0))
}

View File

@ -1014,10 +1014,10 @@ pathconf01
# pidfd_getfd01
# pidfd_getfd02
# pidfd_open01
# pidfd_open02
# pidfd_open03
# pidfd_open04
pidfd_open01
pidfd_open02
pidfd_open03
pidfd_open04
# pidfd_send_signal01
# pidfd_send_signal02