diff --git a/kernel/src/net/socket/ip/datagram/mod.rs b/kernel/src/net/socket/ip/datagram/mod.rs index 1c8303e92..b88dfea86 100644 --- a/kernel/src/net/socket/ip/datagram/mod.rs +++ b/kernel/src/net/socket/ip/datagram/mod.rs @@ -176,7 +176,7 @@ impl Socket for DatagramSocket { let MessageHeader { addr, - control_message, + control_messages, } = message_header; let endpoint = match addr { @@ -184,7 +184,7 @@ impl Socket for DatagramSocket { None => None, }; - if control_message.is_some() { + if !control_messages.is_empty() { // TODO: Support sending control message warn!("sending control message is not supported"); } @@ -208,7 +208,7 @@ impl Socket for DatagramSocket { // TODO: Receive control message - let message_header = MessageHeader::new(Some(peer_addr), None); + let message_header = MessageHeader::new(Some(peer_addr), Vec::new()); Ok((received_bytes, message_header)) } diff --git a/kernel/src/net/socket/ip/stream/mod.rs b/kernel/src/net/socket/ip/stream/mod.rs index 3054b00ad..438cc0910 100644 --- a/kernel/src/net/socket/ip/stream/mod.rs +++ b/kernel/src/net/socket/ip/stream/mod.rs @@ -541,14 +541,14 @@ impl Socket for StreamSocket { } let MessageHeader { - control_message, .. + control_messages, .. } = message_header; // According to the Linux man pages, `EISCONN` _may_ be returned when the destination // address is specified for a connection-mode socket. In practice, the destination address // is simply ignored. We follow the same behavior as the Linux implementation to ignore it. - if control_message.is_some() { + if !control_messages.is_empty() { // TODO: Support sending control message warn!("sending control message is not supported"); } @@ -574,7 +574,7 @@ impl Socket for StreamSocket { // According to , // peer address is ignored for connected socket. - let message_header = MessageHeader::new(None, None); + let message_header = MessageHeader::new(None, Vec::new()); Ok((received_bytes, message_header)) } diff --git a/kernel/src/net/socket/mod.rs b/kernel/src/net/socket/mod.rs index 90479c7e0..3aed22095 100644 --- a/kernel/src/net/socket/mod.rs +++ b/kernel/src/net/socket/mod.rs @@ -139,10 +139,7 @@ impl FileLike for T { // TODO: Set correct flags self.sendmsg( reader, - MessageHeader { - addr: None, - control_message: None, - }, + MessageHeader::new(None, Vec::new()), SendRecvFlags::empty(), ) } diff --git a/kernel/src/net/socket/netlink/common/mod.rs b/kernel/src/net/socket/netlink/common/mod.rs index 329456909..303df0da1 100644 --- a/kernel/src/net/socket/netlink/common/mod.rs +++ b/kernel/src/net/socket/netlink/common/mod.rs @@ -129,7 +129,7 @@ where ) -> Result { let MessageHeader { addr, - control_message, + control_messages, } = message_header; let remote = match addr { @@ -137,7 +137,7 @@ where Some(addr) => Some(addr.try_into()?), }; - if control_message.is_some() { + if !control_messages.is_empty() { // TODO: Support sending control message warn!("sending control message is not supported"); } @@ -160,7 +160,7 @@ where // TODO: Receive control message - let message_header = MessageHeader::new(Some(addr), None); + let message_header = MessageHeader::new(Some(addr), Vec::new()); Ok((received_len, message_header)) } diff --git a/kernel/src/net/socket/unix/ctrl_msg.rs b/kernel/src/net/socket/unix/ctrl_msg.rs new file mode 100644 index 000000000..cda41dd50 --- /dev/null +++ b/kernel/src/net/socket/unix/ctrl_msg.rs @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MPL-2.0 + +use core::fmt; + +use ostd::task::Task; + +use super::UnixStreamSocket; +use crate::{ + fs::{ + file_handle::FileLike, + file_table::{get_file_fast, FdFlags}, + }, + net::socket::util::{CControlHeader, ControlMessage}, + prelude::*, + process::{credentials::capabilities::CapSet, posix_thread::AsPosixThread}, + util::net::CSocketOptionLevel, +}; + +#[derive(Debug)] +pub struct UnixControlMessage(Message); + +#[derive(Debug)] +enum Message { + Files(FileMessage), +} + +impl UnixControlMessage { + pub fn read_from(header: &CControlHeader, reader: &mut VmReader) -> Result> { + debug_assert_eq!(header.level(), Some(CSocketOptionLevel::SOL_SOCKET)); + + let Ok(type_) = CControlType::try_from(header.type_()) else { + warn!("unsupported control message type in {:?}", header); + reader.skip(header.payload_len()); + return Ok(None); + }; + + match type_ { + CControlType::SCM_RIGHTS => { + let msg = FileMessage::read_from(header, reader)?; + Ok(Some(Self(Message::Files(msg)))) + } + _ => { + warn!("unsupported control message type in {:?}", header); + reader.skip(header.payload_len()); + Ok(None) + } + } + } + + pub fn write_to(&self, writer: &mut VmWriter) -> Result { + match &self.0 { + Message::Files(msg) => msg.write_to(writer), + } + } +} + +struct FileMessage { + files: Vec>, +} + +impl fmt::Debug for FileMessage { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FileMessage") + .field("len", &self.files.len()) + .finish_non_exhaustive() + } +} + +/// The maximum number of the file descriptors in the control messages. +/// +/// Reference: . +const MAX_NR_FILES: usize = 253; + +impl FileMessage { + fn read_from(header: &CControlHeader, reader: &mut VmReader) -> Result { + let payload_len = header.payload_len(); + if payload_len % size_of::() != 0 { + return_errno_with_message!(Errno::EINVAL, "the SCM_RIGHTS message is invalid"); + } + let nfiles = payload_len / size_of::(); + + // "Attempting to send an array larger than this limit causes sendmsg(2) to fail with the + // error EINVAL." -- Reference: . + if nfiles > MAX_NR_FILES { + return_errno_with_message!(Errno::EINVAL, "the SCM_RIGHTS message is too large"); + } + // TODO: "[the ETOOMANYREFS error] occurs if the number of "in-flight" file descriptors + // exceeds the RLIMIT_NOFILE resource limit and the caller does not have the + // CAP_SYS_RESOURCE capability." + + let mut files = Vec::with_capacity(nfiles); + + let current = Task::current().unwrap(); + let mut file_table = current.as_thread_local().unwrap().borrow_file_table_mut(); + for _ in 0..nfiles { + let fd = reader.read_val::()?; + let file = get_file_fast!(&mut file_table, fd).into_owned(); + files.push(file); + } + + Ok(FileMessage { files }) + } + + fn write_to(&self, writer: &mut VmWriter) -> Result { + let nfiles = self + .files + .len() + .min(CControlHeader::payload_len_from_total(writer.avail())? / size_of::()); + if nfiles == 0 { + return_errno_with_message!(Errno::EINVAL, "the control message buffer is too small"); + } + if nfiles < self.files.len() { + warn!("setting MSG_CTRUNC is not supported"); + } + + let header = CControlHeader::new( + CSocketOptionLevel::SOL_SOCKET, + CControlType::SCM_RIGHTS as i32, + nfiles * size_of::(), + ); + writer.write_val::(&header)?; + + let current = Task::current().unwrap(); + let file_table = current.as_thread_local().unwrap().borrow_file_table(); + for file in self.files[..nfiles].iter() { + // TODO: Deal with the `O_CLOEXEC` flag. + let fd = file_table + .unwrap() + .write() + .insert(file.clone(), FdFlags::empty()); + // Perhaps we should remove the inserted files from the file table if we cannot write + // the file descriptor back to user space? However, even Linux cannot handle every + // corner case (https://elixir.bootlin.com/linux/v6.15.2/source/net/core/scm.c#L357). + writer.write_val::(&fd)?; + } + + Ok(header) + } +} + +/// Control message types. +/// +/// Reference: . +#[repr(i32)] +#[derive(Debug, Clone, Copy, TryFromInt, PartialEq, Eq)] +#[expect(non_camel_case_types)] +enum CControlType { + SCM_RIGHTS = 1, + SCM_CREDENTIALS = 2, + SCM_SECURITY = 3, + SCM_PIDFD = 4, +} + +/// Auxiliary data associated with UNIX messages. +/// +/// In UNIX sockets, one can send payload bytes with multiple control messages. If these control +/// messages need to be sent to a remote endpoint, they are packaged in this type and transmitted. +/// +/// We use this type instead of transmitting control messages directly to the remote endpoint +/// because control messages of the same type (e.g., files) can be merged and missing control +/// messages of certain types (e.g., credentials) can be supplied automatically according to socket +/// option settings. +#[derive(Default)] +pub(super) struct AuxiliaryData { + files: Vec>, +} + +impl AuxiliaryData { + /// Builds the auxiliary data from the control messages. + pub(super) fn from_control(ctrl_msgs: Vec) -> Result { + let mut files = Vec::new(); + + for ctrl_msg in ctrl_msgs.into_iter() { + let ControlMessage::Unix(unix_ctrl_msg) = ctrl_msg; + // TODO: What should we do if there are control messages of other protocols? + + match unix_ctrl_msg.0 { + Message::Files(FileMessage { + files: mut msg_files, + }) => { + if msg_files.len() > MAX_NR_FILES - files.len() { + return_errno_with_message!( + Errno::EINVAL, + "the SCM_RIGHTS message is too large" + ); + } + files.append(&mut msg_files); + } // TODO: Deal with other kinds of UNIX control messages. + } + } + + // FIXME: Sending UNIX sockets over UNIX sockets can easily lead to circular references and + // memory leaks. Linux uses a complex garbage collection algorithm to address these issues. + // See also . + if files + .iter() + .any(|file| (&**file as &dyn Any).is::()) + { + warn!("UNIX sockets in SCM_RIGHTS messages can leak kernel resource"); + + let credentials = current_thread!().as_posix_thread().unwrap().credentials(); + if !credentials.euid().is_root() + && !credentials.effective_capset().contains(CapSet::SYS_ADMIN) + { + return_errno_with_message!( + Errno::EPERM, + "UNIX sockets in SCM_RIGHTS messages can leak kernel resource" + ) + } + } + + Ok(Self { files }) + } + + /// Converts the auxiliary data back to the control messages. + pub(super) fn into_control(self) -> Vec { + let mut ctrl_msgs = Vec::new(); + + let Self { files } = self; + + if !files.is_empty() { + let unix_ctrl_msg = UnixControlMessage(Message::Files(FileMessage { files })); + ctrl_msgs.push(ControlMessage::Unix(unix_ctrl_msg)); + } + + ctrl_msgs + } + + /// Returns whether the auxiliary data contains nothing. + pub(super) fn is_empty(&self) -> bool { + self.files.is_empty() + } +} diff --git a/kernel/src/net/socket/unix/mod.rs b/kernel/src/net/socket/unix/mod.rs index d135bba83..1184d464f 100644 --- a/kernel/src/net/socket/unix/mod.rs +++ b/kernel/src/net/socket/unix/mod.rs @@ -2,10 +2,12 @@ mod addr; mod cred; +mod ctrl_msg; mod ns; mod stream; pub use addr::UnixSocketAddr; pub use cred::CUserCred; +pub(super) use ctrl_msg::UnixControlMessage; pub use stream::UnixStreamSocket; pub(super) use stream::UNIX_STREAM_DEFAULT_BUF_SIZE; diff --git a/kernel/src/net/socket/unix/stream/connected.rs b/kernel/src/net/socket/unix/stream/connected.rs index 34c12055d..c23389978 100644 --- a/kernel/src/net/socket/unix/stream/connected.rs +++ b/kernel/src/net/socket/unix/stream/connected.rs @@ -1,11 +1,18 @@ // SPDX-License-Identifier: MPL-2.0 +use core::{ + num::Wrapping, + sync::atomic::{AtomicBool, Ordering}, +}; + use crate::{ events::IoEvents, fs::utils::{Endpoint, EndpointState}, net::socket::{ - unix::{addr::UnixSocketAddrBound, cred::SocketCred, UnixSocketAddr}, - util::SockShutdownCmd, + unix::{ + addr::UnixSocketAddrBound, cred::SocketCred, ctrl_msg::AuxiliaryData, UnixSocketAddr, + }, + util::{ControlMessage, SockShutdownCmd}, }, prelude::*, process::signal::Pollee, @@ -37,10 +44,14 @@ impl Connected { let this_inner = Inner { addr: SpinLock::new(addr), state, + all_aux: Mutex::new(VecDeque::new()), + has_aux: AtomicBool::new(false), }; let peer_inner = Inner { addr: SpinLock::new(peer_addr), state: peer_state, + all_aux: Mutex::new(VecDeque::new()), + has_aux: AtomicBool::new(false), }; let (this_inner, peer_inner) = Endpoint::new_pair(this_inner, peer_inner); @@ -82,23 +93,64 @@ impl Connected { Ok(()) } - pub(super) fn try_read(&self, writer: &mut dyn MultiWrite) -> Result { + pub(super) fn try_read( + &self, + writer: &mut dyn MultiWrite, + ) -> Result<(usize, Vec)> { if writer.is_empty() { if self.reader.lock().is_empty() { return_errno_with_message!(Errno::EAGAIN, "the channel is empty"); } - return Ok(0); + return Ok((0, Vec::new())); } - let read = || { - let mut reader = self.reader.lock(); - reader.read_fallible(writer) + let mut reader = self.reader.lock(); + // `reader.len()` is an `Acquire` operation. So it can guarantee that the `has_aux` + // check below sees the up-to-date value. + let no_aux_len = reader.len(); + + let peer_end = self.inner.peer_end(); + + // Fast path: There are no auxiliary data to receive. + if !peer_end.has_aux.load(Ordering::Relaxed) { + let read_len = self + .inner + .read_with(move || reader.read_fallible_with_max_len(writer, no_aux_len))?; + return Ok((read_len, Vec::new())); + } + + let mut all_aux = peer_end.all_aux.lock(); + + let read_start = reader.head(); + let (len_to_aux, len_to_aux_end) = if let Some(front) = all_aux.front() { + ((front.start - read_start).0, (front.end - read_start).0) + } else { + (usize::MAX, usize::MAX) }; - self.inner.read_with(read) + // It is not allowed to receive two sets of auxiliary data in one `recvmsg`. So we cannot + // read more than `len_to_aux_end` bytes. + let read_len = self + .inner + .read_with(move || reader.read_fallible_with_max_len(writer, len_to_aux_end))?; + if read_len <= len_to_aux { + return Ok((read_len, Vec::new())); + } + + // We have received the first set of auxiliary data. + let ctrl_msgs = all_aux.pop_front().unwrap().data.into_control(); + peer_end + .has_aux + .store(!all_aux.is_empty(), Ordering::Relaxed); + + Ok((read_len, ctrl_msgs)) } - pub(super) fn try_write(&self, reader: &mut dyn MultiRead) -> Result { + pub(super) fn try_write( + &self, + reader: &mut dyn MultiRead, + aux_data: &mut AuxiliaryData, + ) -> Result { if reader.is_empty() { if self.inner.is_shutdown() { return_errno_with_message!(Errno::EPIPE, "the channel is shut down"); @@ -106,12 +158,40 @@ impl Connected { return Ok(0); } - let write = || { + // Fast path: There are no auxiliary data to transmit. + if aux_data.is_empty() { let mut writer = self.writer.lock(); - writer.write_fallible(reader) + return self.inner.write_with(move || writer.write_fallible(reader)); + } + + let this_end = self.inner.this_end(); + let mut all_aux = this_end.all_aux.lock(); + + // No matter we succeed later or not, set the flag first to ensure that the auxiliary + // data are always visible to `try_recv`. + this_end.has_aux.store(true, Ordering::Relaxed); + + let (write_start, write_res) = { + let mut writer = self.writer.lock(); + let write_start = writer.tail(); + let write_res = self.inner.write_with(move || writer.write_fallible(reader)); + (write_start, write_res) + }; + let Ok(write_len) = write_res else { + this_end + .has_aux + .store(!all_aux.is_empty(), Ordering::Relaxed); + return write_res; }; - self.inner.write_with(write) + let aux_range = RangedAuxiliaryData { + data: core::mem::take(aux_data), + start: write_start, + end: write_start + Wrapping(write_len), + }; + all_aux.push_back(aux_range); + + Ok(write_len) } pub(super) fn shutdown(&self, cmd: SockShutdownCmd) { @@ -165,6 +245,9 @@ impl Drop for Connected { struct Inner { addr: SpinLock>, state: EndpointState, + // Lock order: `reader` -> `all_aux` & `all_aux` -> `writer` + all_aux: Mutex>, + has_aux: AtomicBool, } impl AsRef for Inner { @@ -173,4 +256,10 @@ impl AsRef for Inner { } } +struct RangedAuxiliaryData { + data: AuxiliaryData, + start: Wrapping, // inclusive + end: Wrapping, // exclusive +} + pub(in crate::net) const UNIX_STREAM_DEFAULT_BUF_SIZE: usize = 65536; diff --git a/kernel/src/net/socket/unix/stream/socket.rs b/kernel/src/net/socket/unix/stream/socket.rs index 6ed269fad..273ff9ae6 100644 --- a/kernel/src/net/socket/unix/stream/socket.rs +++ b/kernel/src/net/socket/unix/stream/socket.rs @@ -17,10 +17,10 @@ use crate::{ net::socket::{ options::{PeerCred, PeerGroups, SocketOption}, private::SocketPrivate, - unix::{cred::SocketCred, CUserCred, UnixSocketAddr}, + unix::{cred::SocketCred, ctrl_msg::AuxiliaryData, CUserCred, UnixSocketAddr}, util::{ options::{GetSocketLevelOption, SetSocketLevelOption, SocketOptionSet}, - MessageHeader, SendRecvFlags, SockShutdownCmd, SocketAddr, + ControlMessage, MessageHeader, SendRecvFlags, SockShutdownCmd, SocketAddr, }, Socket, }, @@ -176,16 +176,25 @@ impl UnixStreamSocket { ) } - fn try_send(&self, buf: &mut dyn MultiRead, _flags: SendRecvFlags) -> Result { + fn try_send( + &self, + buf: &mut dyn MultiRead, + aux_data: &mut AuxiliaryData, + _flags: SendRecvFlags, + ) -> Result { match self.state.read().as_ref() { - State::Connected(connected) => connected.try_write(buf), + State::Connected(connected) => connected.try_write(buf, aux_data), State::Init(_) | State::Listen(_) => { return_errno_with_message!(Errno::ENOTCONN, "the socket is not connected") } } } - fn try_recv(&self, buf: &mut dyn MultiWrite, _flags: SendRecvFlags) -> Result { + fn try_recv( + &self, + buf: &mut dyn MultiWrite, + _flags: SendRecvFlags, + ) -> Result<(usize, Vec)> { match self.state.read().as_ref() { State::Connected(connected) => connected.try_read(buf), State::Init(_) | State::Listen(_) => { @@ -371,7 +380,7 @@ impl Socket for UnixStreamSocket { } let MessageHeader { - control_message, + control_messages, addr, } = message_header; @@ -390,13 +399,11 @@ impl Socket for UnixStreamSocket { ), } } + let mut auxiliary_data = AuxiliaryData::from_control(control_messages)?; - if control_message.is_some() { - // TODO: Support sending control message - warn!("sending control message is not supported"); - } - - self.block_on(IoEvents::OUT, || self.try_send(reader, flags)) + self.block_on(IoEvents::OUT, || { + self.try_send(reader, &mut auxiliary_data, flags) + }) } fn recvmsg( @@ -409,11 +416,10 @@ impl Socket for UnixStreamSocket { warn!("unsupported flags: {:?}", flags); } - let received_bytes = self.block_on(IoEvents::IN, || self.try_recv(writer, flags))?; + let (received_bytes, control_messages) = + self.block_on(IoEvents::IN, || self.try_recv(writer, flags))?; - // TODO: Receive control message - - let message_header = MessageHeader::new(None, None); + let message_header = MessageHeader::new(None, control_messages); Ok((received_bytes, message_header)) } diff --git a/kernel/src/net/socket/util/message_header.rs b/kernel/src/net/socket/util/message_header.rs index 3f70eafbc..45f360dfa 100644 --- a/kernel/src/net/socket/util/message_header.rs +++ b/kernel/src/net/socket/util/message_header.rs @@ -1,21 +1,23 @@ // SPDX-License-Identifier: MPL-2.0 +use align_ext::AlignExt; + use super::SocketAddr; -use crate::prelude::*; +use crate::{net::socket::unix::UnixControlMessage, prelude::*, util::net::CSocketOptionLevel}; /// Message header used for sendmsg/recvmsg. #[derive(Debug)] pub struct MessageHeader { pub(in crate::net) addr: Option, - pub(in crate::net) control_message: Option, + pub(in crate::net) control_messages: Vec, } impl MessageHeader { /// Creates a new `MessageHeader`. - pub const fn new(addr: Option, control_message: Option) -> Self { + pub const fn new(addr: Option, control_messages: Vec) -> Self { Self { addr, - control_message, + control_messages, } } @@ -23,10 +25,175 @@ impl MessageHeader { pub fn addr(&self) -> Option<&SocketAddr> { self.addr.as_ref() } + + /// Returns the control messages. + pub fn control_messages(&self) -> &Vec { + &self.control_messages + } } -/// Control message carried by MessageHeader. -/// -/// TODO: Implement the struct. The struct is empty now. +/// Control messages in [`MessageHeader`]. #[derive(Debug)] -pub struct ControlMessage; +pub enum ControlMessage { + Unix(UnixControlMessage), +} + +impl ControlMessage { + pub fn read_all_from(reader: &mut VmReader) -> Result> { + // FIXME: This method may exhaust kernel memory and cause a panic if the program is + // malicious and attempts to send too many control messages. To prevent this, we limit the + // number of control messages, but this limit does not have a Linux equivalent. + const MAX_NR_MSGS: usize = 32; + + let mut msgs = Vec::new(); + + while reader.has_remain() && msgs.len() < MAX_NR_MSGS { + let header = reader.read_val::()?; + if header.len <= size_of::() || header.payload_len() > reader.remain() { + return_errno_with_message!( + Errno::EINVAL, + "the size of the control message is invalid" + ); + } + + if let Some(msg) = Self::read_from(&header, reader)? { + msgs.push(msg); + } + + let padding_len = header.padding_len().min(reader.remain()); + reader.skip(padding_len); + } + + if reader.has_remain() { + warn!("excessive control messages are currently not permitted"); + return_errno_with_message!( + Errno::E2BIG, + "excessive control messages are currently not permitted" + ); + } + + Ok(msgs) + } + + fn read_from(header: &CControlHeader, reader: &mut VmReader) -> Result> { + let Some(level) = header.level() else { + warn!("unsupported control message level in {:?}", header); + reader.skip(header.payload_len()); + return Ok(None); + }; + + match level { + CSocketOptionLevel::SOL_SOCKET => { + // Linux manual pages say (https://man7.org/linux/man-pages/man7/unix.7.html): + // "For historical reasons, the ancillary message types listed below are specified + // with a SOL_SOCKET type even though they are AF_UNIX specific." + let msg = UnixControlMessage::read_from(header, reader)?; + Ok(msg.map(Self::Unix)) + } + _ => { + warn!("unsupported control message level in {:?}", header); + reader.skip(header.payload_len()); + Ok(None) + } + } + } + + pub fn write_all_to(msgs: &[Self], writer: &mut VmWriter) -> usize { + let mut len = 0; + + for msg in msgs.iter() { + let header = match msg.write_to(writer) { + Ok(header) => header, + // This occurs when the buffer is too short or when some page faults cannot be + // handled. However, at this point, there is no good way to report the errors to + // user space. According to the Linux implementation, it seems okay to silently + // ignore errors here. + Err(_) => { + warn!("setting MSG_CTRUNC is not supported"); + break; + } + }; + + len += header.total_len(); + + let padding_len = header.padding_len().min(writer.avail()); + writer.skip(padding_len); + len += padding_len; + } + + len + } + + fn write_to(&self, writer: &mut VmWriter) -> Result { + match self { + Self::Unix(msg) => msg.write_to(writer), + } + } +} + +/// `cmsghdr` in Linux. +/// +/// Reference: . +#[repr(C)] +#[derive(Debug, Clone, Copy, Pod)] +pub struct CControlHeader { + /// Data byte count, including hdr + len: usize, + /// Originating protocol + level: i32, + /// Protocol-specific type + type_: i32, +} + +/// Alignment of control messages. +/// +/// Reference: . +const CMSG_ALIGN: usize = size_of::(); + +impl CControlHeader { + /// Creates a control message header with the level, type, and payload length. + pub fn new(level: CSocketOptionLevel, type_: i32, payload_len: usize) -> Self { + Self { + len: payload_len + size_of::(), + level: level as i32, + type_, + } + } + + /// Computes the payload length from the total length. + pub fn payload_len_from_total(total_len: usize) -> Result { + total_len.checked_sub(size_of::()).ok_or_else(|| { + Error::with_message(Errno::EINVAL, "the control message buffer is too small") + }) + } + + /// Returns the level of the control message. + pub fn level(&self) -> Option { + CSocketOptionLevel::try_from(self.level).ok() + } + + /// Returns the type of the control message. + pub fn type_(&self) -> i32 { + self.type_ + } + + /// Returns the payload length of the control message. + pub fn payload_len(&self) -> usize { + self.len - size_of::() + } + + /// Returns the length of the control message (payload + header, excluding paddings). + pub fn total_len(&self) -> usize { + self.len + } + + /// Returns the length of the padding bytes for the control message. + pub(self) fn padding_len(&self) -> usize { + self.total_len_with_padding() - self.total_len() + } + + /// Returns the length of the control message (payload + header, including paddings). + fn total_len_with_padding(&self) -> usize { + self.len.align_up(CMSG_ALIGN) + } +} diff --git a/kernel/src/net/socket/util/mod.rs b/kernel/src/net/socket/util/mod.rs index 782e7708a..b847868cf 100644 --- a/kernel/src/net/socket/util/mod.rs +++ b/kernel/src/net/socket/util/mod.rs @@ -9,7 +9,8 @@ mod shutdown_cmd; mod socket_addr; pub use linger_option::LingerOption; -pub use message_header::MessageHeader; +pub(super) use message_header::CControlHeader; +pub use message_header::{ControlMessage, MessageHeader}; pub use send_recv_flags::SendRecvFlags; pub use shutdown_cmd::SockShutdownCmd; pub use socket_addr::SocketAddr; diff --git a/kernel/src/net/socket/vsock/stream/socket.rs b/kernel/src/net/socket/vsock/stream/socket.rs index 546bb32ac..6b984a192 100644 --- a/kernel/src/net/socket/vsock/stream/socket.rs +++ b/kernel/src/net/socket/vsock/stream/socket.rs @@ -254,10 +254,10 @@ impl Socket for VsockStreamSocket { } let MessageHeader { - control_message, .. + control_messages, .. } = message_header; - if control_message.is_some() { + if !control_messages.is_empty() { // TODO: Support sending control message warn!("sending control message is not supported"); } @@ -279,7 +279,7 @@ impl Socket for VsockStreamSocket { // TODO: Receive control message - let messsge_header = MessageHeader::new(None, None); + let messsge_header = MessageHeader::new(None, Vec::new()); Ok((received_bytes, messsge_header)) } diff --git a/kernel/src/syscall/recvmsg.rs b/kernel/src/syscall/recvmsg.rs index 647b686e1..db30f8bcd 100644 --- a/kernel/src/syscall/recvmsg.rs +++ b/kernel/src/syscall/recvmsg.rs @@ -14,7 +14,8 @@ pub fn sys_recvmsg( flags: i32, ctx: &Context, ) -> Result { - let c_user_msghdr: CUserMsgHdr = ctx.user_space().read_val(user_msghdr_ptr)?; + let user_space = ctx.user_space(); + let mut c_user_msghdr: CUserMsgHdr = user_space.read_val(user_msghdr_ptr)?; let flags = SendRecvFlags::from_bits_truncate(flags); debug!( @@ -27,7 +28,6 @@ pub fn sys_recvmsg( let socket = file.as_socket_or_err()?; let (total_bytes, message_header) = { - let user_space = ctx.user_space(); let mut io_vec_writer = c_user_msghdr.copy_writer_array_from_user(&user_space)?; socket .recvmsg(&mut io_vec_writer, flags) @@ -38,13 +38,18 @@ pub fn sys_recvmsg( })? }; - if let Some(addr) = message_header.addr() { - c_user_msghdr.write_socket_addr_to_user(addr)?; - } + // Writing control messages may access the file table, so it should be called after dropping + // the file table borrow. + drop(file_table); - if c_user_msghdr.msg_control != 0 { - warn!("receiving control message is not supported"); - } + let addr = message_header.addr(); + c_user_msghdr.msg_namelen = c_user_msghdr.write_socket_addr_to_user(addr)?; + + let control_messages = message_header.control_messages(); + c_user_msghdr.msg_controllen = + c_user_msghdr.write_control_messages_to_user(control_messages, &user_space)?; + + user_space.write_val(user_msghdr_ptr, &c_user_msghdr)?; Ok(SyscallReturn::Return(total_bytes as _)) } diff --git a/kernel/src/syscall/sendmsg.rs b/kernel/src/syscall/sendmsg.rs index e702605c6..f571153b8 100644 --- a/kernel/src/syscall/sendmsg.rs +++ b/kernel/src/syscall/sendmsg.rs @@ -14,7 +14,8 @@ pub fn sys_sendmsg( flags: i32, ctx: &Context, ) -> Result { - let c_user_msghdr: CUserMsgHdr = ctx.user_space().read_val(user_msghdr_ptr)?; + let user_space = ctx.user_space(); + let c_user_msghdr: CUserMsgHdr = user_space.read_val(user_msghdr_ptr)?; let flags = SendRecvFlags::from_bits_truncate(flags); debug!( @@ -22,26 +23,19 @@ pub fn sys_sendmsg( sockfd, c_user_msghdr, flags ); + let message_header = { + let addr = c_user_msghdr.read_socket_addr_from_user()?; + // Reading control messages may access the file table, so it should be called before + // `borrow_file_table_mut`. + let control_messages = c_user_msghdr.read_control_messages_from_user(&user_space)?; + MessageHeader::new(addr, control_messages) + }; + let mut io_vec_reader = c_user_msghdr.copy_reader_array_from_user(&user_space)?; + let mut file_table = ctx.thread_local.borrow_file_table_mut(); let file = get_file_fast!(&mut file_table, sockfd); let socket = file.as_socket_or_err()?; - let user_space = ctx.user_space(); - let (mut io_vec_reader, message_header) = { - let addr = c_user_msghdr.read_socket_addr_from_user()?; - let io_vec_reader = c_user_msghdr.copy_reader_array_from_user(&user_space)?; - - let control_message = { - if c_user_msghdr.msg_control != 0 { - // TODO: support sending control message - warn!("control message is not supported now"); - } - None - }; - - (io_vec_reader, MessageHeader::new(addr, control_message)) - }; - let total_bytes = socket .sendmsg(&mut io_vec_reader, message_header, flags) .map_err(|err| match err.error() { diff --git a/kernel/src/syscall/sendto.rs b/kernel/src/syscall/sendto.rs index c31fb5a25..555ca84e8 100644 --- a/kernel/src/syscall/sendto.rs +++ b/kernel/src/syscall/sendto.rs @@ -30,7 +30,7 @@ pub fn sys_sendto( let file = get_file_fast!(&mut file_table, sockfd); let socket = file.as_socket_or_err()?; - let message_header = MessageHeader::new(socket_addr, None); + let message_header = MessageHeader::new(socket_addr, Vec::new()); let user_space = ctx.user_space(); let mut reader = user_space.reader(buf, len)?; diff --git a/kernel/src/util/net/options/mod.rs b/kernel/src/util/net/options/mod.rs index 117bc16c2..51d71eea3 100644 --- a/kernel/src/util/net/options/mod.rs +++ b/kernel/src/util/net/options/mod.rs @@ -173,7 +173,11 @@ pub fn new_raw_socket_option( } } -/// Sock Opt level. The definition is from https://elixir.bootlin.com/linux/v6.0.9/source/include/linux/socket.h#L343 +/// Socket level. +/// +/// This can refer to either a socket option or a control message level. +/// +/// Reference: . #[repr(i32)] #[derive(Debug, Clone, Copy, TryFromInt, PartialEq, Eq)] #[expect(non_camel_case_types)] diff --git a/kernel/src/util/net/socket.rs b/kernel/src/util/net/socket.rs index 8ca7af693..2217edef5 100644 --- a/kernel/src/util/net/socket.rs +++ b/kernel/src/util/net/socket.rs @@ -2,7 +2,7 @@ use super::read_socket_addr_from_user; use crate::{ - net::socket::util::SocketAddr, + net::socket::util::{ControlMessage, SocketAddr}, prelude::*, util::{net::write_socket_addr_with_max_len, VmReaderArray, VmWriterArray}, }; @@ -103,13 +103,51 @@ impl CUserMsgHdr { Ok(Some(socket_addr)) } - pub fn write_socket_addr_to_user(&self, addr: &SocketAddr) -> Result<()> { + pub fn write_socket_addr_to_user(&self, addr: Option<&SocketAddr>) -> Result { if self.msg_name == 0 { - return Ok(()); + // The length field will not be touched if the name pointer is NULL. + // See . + return Ok(self.msg_namelen); } - write_socket_addr_with_max_len(addr, self.msg_name, self.msg_namelen)?; - Ok(()) + let actual_len = if let Some(addr) = addr { + write_socket_addr_with_max_len(addr, self.msg_name, self.msg_namelen)? + } else { + 0 + }; + Ok(actual_len) + } + + pub fn read_control_messages_from_user( + &self, + user_space: &CurrentUserSpace, + ) -> Result> { + if self.msg_control == 0 { + return Ok(Vec::new()); + } + + let mut reader = user_space.reader(self.msg_control, self.msg_controllen as usize)?; + let control_messages = ControlMessage::read_all_from(&mut reader)?; + Ok(control_messages) + } + + pub fn write_control_messages_to_user( + &self, + control_messages: &[ControlMessage], + user_space: &CurrentUserSpace, + ) -> Result { + if self.msg_control == 0 { + if !control_messages.is_empty() { + warn!("setting MSG_CTRUNC is not supported"); + } + // The length field will be set even if the control message pointer is NULL. + // See . + return Ok(0); + } + + let mut writer = user_space.writer(self.msg_control, self.msg_controllen as usize)?; + let write_len = ControlMessage::write_all_to(control_messages, &mut writer) as u32; + Ok(write_len) } pub fn copy_reader_array_from_user<'a>( diff --git a/kernel/src/util/ring_buffer.rs b/kernel/src/util/ring_buffer.rs index 92c38b13a..fcf7c5973 100644 --- a/kernel/src/util/ring_buffer.rs +++ b/kernel/src/util/ring_buffer.rs @@ -307,8 +307,19 @@ impl>> Producer { /// /// Returns the number of bytes written. pub fn write_fallible(&mut self, reader: &mut dyn MultiRead) -> Result { + self.write_fallible_with_max_len(reader, usize::MAX) + } + + /// Writes data from the `VmReader` to the `RingBuffer` with the maximum length. + /// + /// Returns the number of bytes written. + pub fn write_fallible_with_max_len( + &mut self, + reader: &mut dyn MultiRead, + max_len: usize, + ) -> Result { let rb = &self.rb; - let free_len = rb.free_len(); + let free_len = rb.free_len().min(max_len); let tail = rb.tail(); let offset = tail.0 & (rb.capacity - 1); @@ -410,8 +421,19 @@ impl>> Consumer { /// /// Returns the number of bytes read. pub fn read_fallible(&mut self, writer: &mut dyn MultiWrite) -> Result { + self.read_fallible_with_max_len(writer, usize::MAX) + } + + /// Reads data from the `VmWriter` to the `RingBuffer` with the maximum length. + /// + /// Returns the number of bytes read. + pub fn read_fallible_with_max_len( + &mut self, + writer: &mut dyn MultiWrite, + max_len: usize, + ) -> Result { let rb = &self.rb; - let len = rb.len(); + let len = rb.len().min(max_len); let head = rb.head(); let offset = head.0 & (rb.capacity - 1); diff --git a/test/src/apps/network/unix_err.c b/test/src/apps/network/unix_err.c index 3ab9e41cf..36e8f7eaa 100644 --- a/test/src/apps/network/unix_err.c +++ b/test/src/apps/network/unix_err.c @@ -729,3 +729,98 @@ FN_TEST(zero_sends_may_fail) TEST_SUCC(close(fildes[1])); } END_TEST() + +FN_TEST(scm_rights) +{ + int fildes[2]; + char buf[20] = "abcdefg"; + char cbuf[CMSG_SPACE(sizeof(int) * 3)]; + struct iovec iov; + struct msghdr mhdr; + struct cmsghdr *chdr; + int *cdata; + int cfds[2]; + + TEST_SUCC(socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, fildes)); + + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cbuf; + mhdr.msg_controllen = CMSG_SPACE(sizeof(int) * 3); + + iov.iov_base = buf; + iov.iov_len = 1; + + chdr = CMSG_FIRSTHDR(&mhdr); + chdr->cmsg_level = SOL_SOCKET; + chdr->cmsg_type = SCM_RIGHTS; + chdr->cmsg_len = CMSG_SPACE(sizeof(int) * 3); + + cdata = (int *)CMSG_DATA(chdr); + TEST_SUCC(pipe(cfds)); + cdata[0] = cfds[0]; + cdata[1] = cfds[0]; + cdata[2] = cfds[1]; + + // Sending control messages with zero bytes to a stream socket + // seems to "succeed". However, no data or control messages can + // be transmitted. + mhdr.msg_iovlen = 0; + TEST_SUCC(sendmsg(fildes[0], &mhdr, 0)); + mhdr.msg_iovlen = 1; + + // > (1) sendmsg(2) of four bytes, with no ancillary data. + // > (2) sendmsg(2) of one byte, with ancillary data. + // > (3) sendmsg(2) of four bytes, with no ancillary data. + // -- https://man7.org/linux/man-pages/man7/unix.7.html + TEST_RES(send(fildes[0], buf, 4, 0), _ret == 4); + TEST_RES(sendmsg(fildes[0], &mhdr, 0), _ret == 1); + TEST_RES(send(fildes[0], buf, 4, 0), _ret == 4); + + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cbuf; + mhdr.msg_controllen = CMSG_SPACE(sizeof(int)); + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + memset(cbuf, 0, sizeof(cbuf)); + + // > Suppose that the receiver now performs recvmsg(2) calls each with + // > a buffer size of 20 bytes. The first call will receive five bytes + // > of data, along with the ancillary data sent by the second + // > sendmsg(2) call. + TEST_RES(recvmsg(fildes[1], &mhdr, 0), + _ret == 5 && + mhdr.msg_controllen == CMSG_SPACE(sizeof(int) * 2) && + (chdr = CMSG_FIRSTHDR(&mhdr)) && + chdr->cmsg_level == SOL_SOCKET && + chdr->cmsg_type == SCM_RIGHTS && + chdr->cmsg_len == CMSG_SPACE(sizeof(int) * 2) && + (cdata = (int *)CMSG_DATA(chdr)) && + cdata[0] == cfds[1] + 1 && cdata[1] == cfds[1] + 2); + // > The next call will receive the remaining four + // > bytes of data. + TEST_RES(recv(fildes[1], buf, sizeof(buf), 0), _ret == 4); + + // The purpose of the tests below is to verify that the received file + // descriptors are functional. + TEST_RES(write(cfds[1], "x", 1), _ret == 1); + TEST_RES(read(cdata[0], buf, 1), _ret == 1 && buf[0] == 'x'); + TEST_RES(write(cfds[1], "y", 1), _ret == 1); + TEST_RES(read(cdata[1], buf, 1), _ret == 1 && buf[0] == 'y'); + + TEST_SUCC(close(cdata[0])); + TEST_SUCC(close(cdata[1])); + TEST_SUCC(close(cfds[0])); + + TEST_ERRNO(write(cfds[1], "y", 1), EPIPE); + TEST_SUCC(close(cfds[1])); + + TEST_SUCC(close(fildes[0])); + TEST_SUCC(close(fildes[1])); +} +END_TEST()