// asterinas/kernel/src/process/posix_thread/mod.rs

// SPDX-License-Identifier: MPL-2.0

use core::sync::atomic::{AtomicU32, AtomicU64, Ordering};

use aster_rights::{ReadDupOp, ReadOp, WriteOp};
use ostd::{
    sync::{RoArc, RwMutexReadGuard, Waker},
    task::Task,
};

use super::{
    Credentials, Process,
    signal::{sig_mask::AtomicSigMask, sig_num::SigNum, sig_queues::SigQueues, signals::Signal},
};
use crate::{
    events::IoEvents,
    fs::{file_table::FileTable, thread_info::ThreadFsInfo},
    prelude::*,
    process::{
        Pid,
        namespace::nsproxy::NsProxy,
        signal::{PauseReason, PollHandle},
    },
    thread::{Thread, Tid},
    time::{Timer, TimerManager, clocks::ProfClock, timer::TimerGuard},
};

mod builder;
mod exit;
pub mod futex;
mod name;
mod posix_thread_ext;
pub mod ptrace;
mod robust_list;
mod thread_local;
pub mod thread_table;

pub use builder::PosixThreadBuilder;
pub(super) use exit::sigkill_other_threads;
pub use exit::{do_exit, do_exit_group};
pub use name::{MAX_THREAD_NAME_LEN, ThreadName};
pub use posix_thread_ext::AsPosixThread;
pub use robust_list::RobustListHead;
pub use thread_local::{AsThreadLocal, FileTableRefMut, ThreadLocal};

/// The POSIX-specific state of a thread.
///
/// It bundles the thread's identity (process, task, thread ID and name) with
/// its credentials, file-system information, file table, signal state,
/// CPU-time clock and timers, I/O priority, namespaces and timer slack.
pub struct PosixThread {
    // Immutable part
    /// A weak reference to the process of this thread.
    process: Weak<Process>,
    /// A weak reference to the task of this thread.
    task: Weak<Task>,

    // Mutable part
    /// The thread ID. It is stored atomically because `set_main` may
    /// overwrite it.
    tid: AtomicU32,

    /// The name of the thread.
    name: Mutex<ThreadName>,

    /// Process credentials. At the kernel level, credentials are a per-thread attribute.
    credentials: Credentials,

    /// The file system information of the thread.
    fs: RwMutex<Arc<ThreadFsInfo>>,

    // Files
    /// The file table of the thread, if any.
    file_table: Mutex<Option<RoArc<FileTable>>>,

    // Signal
    /// Blocked signals
    sig_mask: AtomicSigMask,
    /// Thread-directed sigqueue
    sig_queues: SigQueues,
    /// The per-thread signal [`Waker`], which will be used to wake up the thread
    /// when enqueuing a signal, along with the reason why the thread is paused.
    signalled_waker: SpinLock<Option<(Arc<Waker>, PauseReason)>>,

    /// A profiling clock measures the user CPU time and kernel CPU time in the thread.
    prof_clock: Arc<ProfClock>,

    /// A manager that manages timers based on the user CPU time of the current thread.
    virtual_timer_manager: Arc<TimerManager>,

    /// A manager that manages timers based on the profiling clock of the current thread.
    prof_timer_manager: Arc<TimerManager>,

    /// I/O Scheduling priority value
    io_priority: AtomicU32,

    /// The namespaces that the thread belongs to, if any.
    ns_proxy: Mutex<Option<Arc<NsProxy>>>,

    /// The current timer slack value for this thread, in nanoseconds.
    timer_slack_ns: AtomicU64,
    /// The default timer slack value for this thread, in nanoseconds.
    default_timer_slack_ns: AtomicU64,
}

impl PosixThread {
    /// Returns a strong reference to the process of this thread.
    ///
    /// # Panics
    ///
    /// Panics if the weak reference to the process can no longer be upgraded,
    /// i.e. the process has already been dropped.
    pub fn process(&self) -> Arc<Process> {
        let process = self.process.upgrade();
        process.unwrap()
    }

    /// Returns the weak reference to the process of this thread.
    ///
    /// Unlike [`Self::process`], this does not upgrade the reference and
    /// therefore never panics.
    pub fn weak_process(&self) -> &Weak<Process> {
        &self.process
    }

    /// Returns the thread ID.
    ///
    /// The ID is read with a relaxed atomic load. It is not constant over the
    /// lifetime of the thread: `set_main` may replace it.
    pub fn tid(&self) -> Tid {
        self.tid.load(Ordering::Relaxed)
    }

    /// Promotes this thread to the main thread of its process.
    ///
    /// The promotion is done by overwriting the thread ID with `pid`. In debug
    /// builds, this checks that `pid` is the ID of the thread's process and
    /// that the thread does not already carry that ID.
    pub(super) fn set_main(&self, pid: Pid) {
        debug_assert_eq!(pid, self.process().pid());
        debug_assert_ne!(pid, self.tid());

        self.tid.store(pid, Ordering::Relaxed);
    }

    /// Returns the mutex that protects the name of the thread.
    ///
    /// Callers lock it to read or update the name.
    pub fn thread_name(&self) -> &Mutex<ThreadName> {
        &self.name
    }

    /// Returns a read guard to the filesystem information of the thread.
    ///
    /// The read lock is held for as long as the returned guard is alive.
    pub fn read_fs(&self) -> RwMutexReadGuard<'_, Arc<ThreadFsInfo>> {
        self.fs.read()
    }

    /// Replaces the filesystem information of the thread with `new_fs`.
    ///
    /// The write lock is taken only for the duration of the assignment.
    pub(in crate::process) fn set_fs(&self, new_fs: Arc<ThreadFsInfo>) {
        *self.fs.write() = new_fs;
    }

    /// Returns the mutex that protects the (optional) file table of the thread.
    pub fn file_table(&self) -> &Mutex<Option<RoArc<FileTable>>> {
        &self.file_table
    }

    /// Gets the reference to the signal mask of the thread.
    ///
    /// The mask is atomic, so it can be modified through the returned shared
    /// reference. However, it is not sound for callers other than the current
    /// thread to modify the signal mask. They may only read the signal mask.
    pub fn sig_mask(&self) -> &AtomicSigMask {
        &self.sig_mask
    }

    /// Returns the queues of thread-directed signals.
    pub(super) fn sig_queues(&self) -> &SigQueues {
        &self.sig_queues
    }

    /// Checks whether `signum` is currently contained in the signal mask of
    /// the thread, i.e. whether the thread blocks that signal.
    pub fn has_signal_blocked(&self, signum: SigNum) -> bool {
        // FIXME: Some signals cannot be blocked, even set in sig_mask.
        let mask = self.sig_mask();
        mask.contains(signum, Ordering::Relaxed)
    }

    /// Sets the input [`Waker`] as the signalled waker of this thread,
    /// along with the reason why the thread is paused.
    ///
    /// This approach can collaborate with signal-aware wait methods.
    /// Once a signalled waker is set for a thread, it cannot be set again
    /// until it is cleared (see [`Self::clear_signalled_waker`]).
    ///
    /// # Panics
    ///
    /// This method panics if a signalled waker is already set, i.e. if it is
    /// called again before the previous waker has been cleared.
    pub fn set_signalled_waker(&self, waker: Arc<Waker>, reason: PauseReason) {
        let mut signalled_waker = self.signalled_waker.lock();
        // Installing a second waker would silently drop the first one.
        assert!(signalled_waker.is_none());
        *signalled_waker = Some((waker, reason));
    }

    /// Removes the signalled waker of this thread, if one is set.
    ///
    /// After this call, a new waker can be installed with
    /// [`Self::set_signalled_waker`] again.
    pub fn clear_signalled_waker(&self) {
        let mut slot = self.signalled_waker.lock();
        *slot = None;
    }

    /// Returns the sleeping state of this thread.
    ///
    /// The state is derived from two observations that are both made while
    /// holding the `signalled_waker` lock: whether a signalled waker is set
    /// (and with which [`PauseReason`]), and whether the task's scheduling
    /// info currently records no CPU.
    ///
    /// # Panics
    ///
    /// Panics if the weak reference to the task can no longer be upgraded.
    pub fn sleeping_state(&self) -> SleepingState {
        // This implementation prevents a thread (let's call it `threadA`) that is
        // sleeping in an interruptible wait from being mistakenly reported as
        // sleeping in an uninterruptible wait due to a race condition, where another
        // thread (`threadB`) may observe that its `task.schedule_info().cpu` is
        // `AtomicCpuId::NONE` and its `signalled_waker` is `None` (not set yet or
        // already cleared).
        //
        // When `threadA` enters an interruptible wait, it executes the following steps:
        // ```
        // A1: Acquire signalled_waker.lock |
        // A2: set signalled_waker to Some  |-- critical section #1
        // A3: Release signalled_waker.lock |
        // A4: cpu.set_to_none(Relaxed)
        // A5: cpu.set_if_is_none(cpuid, Relaxed)
        // A6: Acquire signalled_waker.lock |
        // A7: set signalled_waker to None  |-- critical section #2
        // A8: Release signalled_waker.lock |
        // ```
        //
        // When `threadB` calls `threadA.sleeping_state()`, it executes the following steps:
        // ```
        // B1: Acquire threadA.signalled_waker.lock |
        // B2: check threadA.signalled_waker        |-- critical section #3
        // B3: check threadA.cpu.get(Relaxed)       |
        // B4: Release threadA.signalled_waker.lock |
        // ```
        //
        // We can see that:
        //  - If #3 happens before #1, B3 cannot observe the effect of A4 due to the
        //    release-acquire pair B4-A1.
        //  - If #3 happens between #1 and #2, B2 will always see a `Some`.
        //  - If #3 happens after #2, B3 can observe the effect of A5 due to the
        //    release-acquire pair A8-B1.
        // Therefore, the condition where both B2 and B3 see `None` will never happen.
        //
        // Similarly, this implementation prevents a process that has been stopped by
        // a signal or ptrace from being incorrectly reported as sleeping in an
        // (un)interruptible wait.
        //
        // FIXME: This implementation cannot prevent a stopped process from being
        // reported as running when `crate::process::signal::handle_pending_signal`
        // is called, but the pending signal is not a `SIGCONT`. However, is this
        // actually a problem? We considered an approach to fix this issue, but it
        // does not fully resolve it and has some drawbacks. For more details, see
        // <https://github.com/asterinas/asterinas/pull/2491#issuecomment-3527958970>.

        // The lock must be taken before the CPU is read (steps B1-B3 above);
        // do not reorder these statements.
        let signalled_waker = self.signalled_waker.lock();
        let task = self.task.upgrade().unwrap();
        match (
            signalled_waker.as_ref(),
            task.schedule_info().cpu.get().is_none(),
        ) {
            (Some((_, PauseReason::Sleep)), true) => SleepingState::Interruptible,
            (Some((_, PauseReason::StopBySignal)), true) => SleepingState::StopBySignal,
            (Some((_, PauseReason::StopByPtrace)), true) => SleepingState::StopByPtrace,
            // Off-CPU without a signalled waker: the wait cannot be interrupted by signals.
            (None, true) => SleepingState::Uninterruptible,
            // A CPU is recorded for the task, so it is reported as running
            // regardless of the waker.
            (_, false) => SleepingState::Running,
        }
    }

    /// Wakes up the signalled waker of this thread, if one is set.
    ///
    /// The `signalled_waker` lock is held while the waker is being woken up.
    pub fn wake_signalled_waker(&self) {
        let slot = self.signalled_waker.lock();
        if let Some((waker, _)) = slot.as_ref() {
            waker.wake_up();
        }
    }

    /// Enqueues a thread-directed signal.
    ///
    /// After the signal has been put into the thread's signal queues, the
    /// signalled waker (if any) is woken up so that the thread can notice it.
    ///
    /// This method does not perform permission checks on user signals.
    /// Therefore, unless the caller can ensure that there are no permission issues,
    /// this method should be used to enqueue kernel signals or fault signals.
    pub fn enqueue_signal(&self, signal: Box<dyn Signal>) {
        // Enqueue first, then wake: the woken thread must find the signal queued.
        self.sig_queues.enqueue(signal);
        self.wake_signalled_waker();
    }

    /// Registers `poller` for the events in `mask` on both the signal queues
    /// of this thread and the signal queues of its process.
    ///
    /// # Panics
    ///
    /// Panics if the process of this thread has already been dropped.
    pub fn register_signalfd_poller(&self, poller: &mut PollHandle, mask: IoEvents) {
        // The thread's own queues come first, then the queues of the process.
        self.sig_queues.register_signalfd_poller(poller, mask);

        let process = self.process();
        process.sig_queues().register_signalfd_poller(poller, mask);
    }

    /// Returns a reference to the profiling clock of the current thread.
    ///
    /// The reference points at the shared `Arc`; clone it to keep the clock
    /// beyond the borrow of `self`.
    pub fn prof_clock(&self) -> &Arc<ProfClock> {
        &self.prof_clock
    }

    /// Creates a timer based on the profiling CPU clock of the current thread.
    ///
    /// The timer is created by the thread's `prof_timer_manager`, and `func`
    /// is handed to that manager as the timer's callback.
    pub fn create_prof_timer<F>(&self, func: F) -> Arc<Timer>
    where
        F: Fn(TimerGuard) + Send + Sync + 'static,
    {
        self.prof_timer_manager.create_timer(func)
    }

    /// Creates a timer based on the user CPU clock of the current thread.
    ///
    /// The timer is created by the thread's `virtual_timer_manager`, and `func`
    /// is handed to that manager as the timer's callback.
    pub fn create_virtual_timer<F>(&self, func: F) -> Arc<Timer>
    where
        F: Fn(TimerGuard) + Send + Sync + 'static,
    {
        self.virtual_timer_manager.create_timer(func)
    }

    /// Checks the `TimerCallback`s that are managed by the `prof_timer_manager`.
    /// If any have timed out, call the corresponding callback functions.
    ///
    /// Note that this method only involves the `prof_timer_manager`; the
    /// `virtual_timer_manager` is not touched here.
    pub fn process_expired_timers(&self) {
        self.prof_timer_manager.process_expired_timers();
    }

    /// Returns a handle to the thread's credentials that is restricted to
    /// read access ([`ReadOp`]).
    pub fn credentials(&self) -> Credentials<ReadOp> {
        let duplicate = self.credentials.dup();
        duplicate.restrict()
    }

    /// Returns a handle to the thread's credentials that is restricted to
    /// read access and can itself be duplicated ([`ReadDupOp`]).
    pub fn credentials_dup(&self) -> Credentials<ReadDupOp> {
        let duplicate = self.credentials.dup();
        duplicate.restrict()
    }

    /// Returns a handle to the current thread's credentials that is
    /// restricted to write access ([`WriteOp`]).
    ///
    /// Only the current thread may mutate its own credentials; doing so from
    /// any other thread is illegal. To keep this cheap, the requirement is
    /// verified with a debug assertion only.
    pub fn credentials_mut(&self) -> Credentials<WriteOp> {
        debug_assert!(core::ptr::eq(
            current_thread!().as_posix_thread().unwrap(),
            self
        ));

        let duplicate = self.credentials.dup();
        duplicate.restrict()
    }

    /// Returns the I/O priority value of the thread.
    ///
    /// The value is exposed as the underlying atomic, so callers can both
    /// load and store it through the returned reference.
    pub fn io_priority(&self) -> &AtomicU32 {
        &self.io_priority
    }

    /// Returns the namespaces which the thread belongs to.
    ///
    /// The namespaces are optional and protected by a mutex; callers lock the
    /// returned mutex to inspect or replace them.
    pub fn ns_proxy(&self) -> &Mutex<Option<Arc<NsProxy>>> {
        &self.ns_proxy
    }

    /// Returns the current timer slack value in nanoseconds.
    ///
    /// The value is read with a relaxed atomic load.
    pub fn timer_slack_ns(&self) -> u64 {
        self.timer_slack_ns.load(Ordering::Relaxed)
    }

    /// Sets the current timer slack value in nanoseconds.
    ///
    /// Only the current value is changed; the default timer slack is left
    /// untouched. The value is written with a relaxed atomic store.
    pub fn set_timer_slack_ns(&self, slack_ns: u64) {
        self.timer_slack_ns.store(slack_ns, Ordering::Relaxed);
    }

    /// Overwrites the current timer slack with the thread's default timer
    /// slack value.
    pub fn reset_timer_slack_to_default(&self) {
        self.set_timer_slack_ns(self.default_timer_slack_ns.load(Ordering::Relaxed));
    }
}

/// The counter from which new thread IDs are drawn.
///
/// It holds the next ID to hand out and starts at 1.
static POSIX_TID_ALLOCATOR: AtomicU32 = AtomicU32::new(1);

/// Allocates a fresh tid for a new POSIX thread.
///
/// Tids are handed out in increasing order from a global counter. If the
/// allocated tid is not below [`PID_MAX`], a warning is logged, but the tid is
/// returned nevertheless.
pub fn allocate_posix_tid() -> Tid {
    let new_tid = POSIX_TID_ALLOCATOR.fetch_add(1, Ordering::SeqCst);
    if new_tid < PID_MAX {
        return new_tid;
    }

    // When the kernel's next PID value reaches `PID_MAX`,
    // it should wrap back to a minimum PID value.
    // PIDs with a value of `PID_MAX` or larger should not be allocated.
    // Reference: <https://docs.kernel.org/admin-guide/sysctl/kernel.html#pid-max>.
    //
    // FIXME: Currently, we cannot determine which PID is recycled,
    // so we are unable to allocate smaller PIDs.
    warn!("the allocated ID is greater than the maximum allowed PID");
    new_tid
}

/// Returns the last allocated tid.
///
/// The allocator stores the *next* tid to hand out, so the last allocated
/// tid is that value minus one. Before any tid has been allocated, this
/// returns 0.
///
/// The subtraction wraps on purpose: the allocator counter wraps around to
/// zero after handing out `u32::MAX`, and a plain `- 1` would then overflow
/// (and panic when overflow checks are enabled). With wrapping arithmetic,
/// the result in that case is `u32::MAX`, which is indeed the tid that was
/// handed out last.
pub fn last_tid() -> Tid {
    POSIX_TID_ALLOCATOR
        .load(Ordering::SeqCst)
        .wrapping_sub(1)
}

/// The upper limit for process IDs.
///
/// The limit is exclusive: [`allocate_posix_tid`] treats every ID that is
/// greater than or equal to this value as out of range and logs a warning
/// when it allocates one.
//
// FIXME: The current value is chosen arbitrarily.
// This value can be modified by the user by writing to `/proc/sys/kernel/pid_max`.
pub const PID_MAX: u32 = u32::MAX / 2;

/// The sleeping state of a thread.
///
/// This is the value reported by `PosixThread::sleeping_state`. It is a plain
/// `Copy` value, and two states can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SleepingState {
    /// The thread is running.
    Running,
    /// The thread is sleeping in an interruptible wait.
    Interruptible,
    /// The thread is sleeping in an uninterruptible wait.
    Uninterruptible,
    /// The thread is stopped by a signal.
    StopBySignal,
    /// The thread is stopped by ptrace.
    StopByPtrace,
}