Send and receive UNIX control messages

This commit is contained in:
Ruihan Li 2025-06-13 11:55:18 +08:00 committed by Jianfeng Jiang
parent bc7515389b
commit 4d8078166e
18 changed files with 740 additions and 87 deletions

View File

@ -176,7 +176,7 @@ impl Socket for DatagramSocket {
let MessageHeader {
addr,
control_message,
control_messages,
} = message_header;
let endpoint = match addr {
@ -184,7 +184,7 @@ impl Socket for DatagramSocket {
None => None,
};
if control_message.is_some() {
if !control_messages.is_empty() {
// TODO: Support sending control message
warn!("sending control message is not supported");
}
@ -208,7 +208,7 @@ impl Socket for DatagramSocket {
// TODO: Receive control message
let message_header = MessageHeader::new(Some(peer_addr), None);
let message_header = MessageHeader::new(Some(peer_addr), Vec::new());
Ok((received_bytes, message_header))
}

View File

@ -541,14 +541,14 @@ impl Socket for StreamSocket {
}
let MessageHeader {
control_message, ..
control_messages, ..
} = message_header;
// According to the Linux man pages, `EISCONN` _may_ be returned when the destination
// address is specified for a connection-mode socket. In practice, the destination address
// is simply ignored. We follow the same behavior as the Linux implementation to ignore it.
if control_message.is_some() {
if !control_messages.is_empty() {
// TODO: Support sending control message
warn!("sending control message is not supported");
}
@ -574,7 +574,7 @@ impl Socket for StreamSocket {
// According to <https://elixir.bootlin.com/linux/v6.0.9/source/net/ipv4/tcp.c#L2645>,
// peer address is ignored for connected socket.
let message_header = MessageHeader::new(None, None);
let message_header = MessageHeader::new(None, Vec::new());
Ok((received_bytes, message_header))
}

View File

@ -139,10 +139,7 @@ impl<T: Socket + 'static> FileLike for T {
// TODO: Set correct flags
self.sendmsg(
reader,
MessageHeader {
addr: None,
control_message: None,
},
MessageHeader::new(None, Vec::new()),
SendRecvFlags::empty(),
)
}

View File

@ -129,7 +129,7 @@ where
) -> Result<usize> {
let MessageHeader {
addr,
control_message,
control_messages,
} = message_header;
let remote = match addr {
@ -137,7 +137,7 @@ where
Some(addr) => Some(addr.try_into()?),
};
if control_message.is_some() {
if !control_messages.is_empty() {
// TODO: Support sending control message
warn!("sending control message is not supported");
}
@ -160,7 +160,7 @@ where
// TODO: Receive control message
let message_header = MessageHeader::new(Some(addr), None);
let message_header = MessageHeader::new(Some(addr), Vec::new());
Ok((received_len, message_header))
}

View File

@ -0,0 +1,233 @@
// SPDX-License-Identifier: MPL-2.0
use core::fmt;
use ostd::task::Task;
use super::UnixStreamSocket;
use crate::{
fs::{
file_handle::FileLike,
file_table::{get_file_fast, FdFlags},
},
net::socket::util::{CControlHeader, ControlMessage},
prelude::*,
process::{credentials::capabilities::CapSet, posix_thread::AsPosixThread},
util::net::CSocketOptionLevel,
};
#[derive(Debug)]
pub struct UnixControlMessage(Message);
#[derive(Debug)]
enum Message {
Files(FileMessage),
}
impl UnixControlMessage {
pub fn read_from(header: &CControlHeader, reader: &mut VmReader) -> Result<Option<Self>> {
debug_assert_eq!(header.level(), Some(CSocketOptionLevel::SOL_SOCKET));
let Ok(type_) = CControlType::try_from(header.type_()) else {
warn!("unsupported control message type in {:?}", header);
reader.skip(header.payload_len());
return Ok(None);
};
match type_ {
CControlType::SCM_RIGHTS => {
let msg = FileMessage::read_from(header, reader)?;
Ok(Some(Self(Message::Files(msg))))
}
_ => {
warn!("unsupported control message type in {:?}", header);
reader.skip(header.payload_len());
Ok(None)
}
}
}
pub fn write_to(&self, writer: &mut VmWriter) -> Result<CControlHeader> {
match &self.0 {
Message::Files(msg) => msg.write_to(writer),
}
}
}
struct FileMessage {
files: Vec<Arc<dyn FileLike>>,
}
impl fmt::Debug for FileMessage {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("FileMessage")
.field("len", &self.files.len())
.finish_non_exhaustive()
}
}
/// The maximum number of the file descriptors in the control messages.
///
/// Reference: <https://elixir.bootlin.com/linux/v6.15/source/include/net/scm.h#L18>.
const MAX_NR_FILES: usize = 253;
impl FileMessage {
fn read_from(header: &CControlHeader, reader: &mut VmReader) -> Result<Self> {
let payload_len = header.payload_len();
if payload_len % size_of::<i32>() != 0 {
return_errno_with_message!(Errno::EINVAL, "the SCM_RIGHTS message is invalid");
}
let nfiles = payload_len / size_of::<i32>();
// "Attempting to send an array larger than this limit causes sendmsg(2) to fail with the
// error EINVAL." -- Reference: <https://man7.org/linux/man-pages/man7/unix.7.html>.
if nfiles > MAX_NR_FILES {
return_errno_with_message!(Errno::EINVAL, "the SCM_RIGHTS message is too large");
}
// TODO: "[the ETOOMANYREFS error] occurs if the number of "in-flight" file descriptors
// exceeds the RLIMIT_NOFILE resource limit and the caller does not have the
// CAP_SYS_RESOURCE capability."
let mut files = Vec::with_capacity(nfiles);
let current = Task::current().unwrap();
let mut file_table = current.as_thread_local().unwrap().borrow_file_table_mut();
for _ in 0..nfiles {
let fd = reader.read_val::<i32>()?;
let file = get_file_fast!(&mut file_table, fd).into_owned();
files.push(file);
}
Ok(FileMessage { files })
}
fn write_to(&self, writer: &mut VmWriter) -> Result<CControlHeader> {
let nfiles = self
.files
.len()
.min(CControlHeader::payload_len_from_total(writer.avail())? / size_of::<i32>());
if nfiles == 0 {
return_errno_with_message!(Errno::EINVAL, "the control message buffer is too small");
}
if nfiles < self.files.len() {
warn!("setting MSG_CTRUNC is not supported");
}
let header = CControlHeader::new(
CSocketOptionLevel::SOL_SOCKET,
CControlType::SCM_RIGHTS as i32,
nfiles * size_of::<i32>(),
);
writer.write_val::<CControlHeader>(&header)?;
let current = Task::current().unwrap();
let file_table = current.as_thread_local().unwrap().borrow_file_table();
for file in self.files[..nfiles].iter() {
// TODO: Deal with the `O_CLOEXEC` flag.
let fd = file_table
.unwrap()
.write()
.insert(file.clone(), FdFlags::empty());
// Perhaps we should remove the inserted files from the file table if we cannot write
// the file descriptor back to user space? However, even Linux cannot handle every
// corner case (https://elixir.bootlin.com/linux/v6.15.2/source/net/core/scm.c#L357).
writer.write_val::<i32>(&fd)?;
}
Ok(header)
}
}
/// Control message types.
///
/// Reference: <https://elixir.bootlin.com/linux/v6.13/source/include/linux/socket.h#L178>.
#[repr(i32)]
#[derive(Debug, Clone, Copy, TryFromInt, PartialEq, Eq)]
#[expect(non_camel_case_types)]
enum CControlType {
SCM_RIGHTS = 1,
SCM_CREDENTIALS = 2,
SCM_SECURITY = 3,
SCM_PIDFD = 4,
}
/// Auxiliary data associated with UNIX messages.
///
/// In UNIX sockets, one can send payload bytes with multiple control messages. If these control
/// messages need to be sent to a remote endpoint, they are packaged in this type and transmitted.
///
/// We use this type instead of transmitting control messages directly to the remote endpoint
/// because control messages of the same type (e.g., files) can be merged and missing control
/// messages of certain types (e.g., credentials) can be supplied automatically according to socket
/// option settings.
#[derive(Default)]
pub(super) struct AuxiliaryData {
files: Vec<Arc<dyn FileLike>>,
}
impl AuxiliaryData {
/// Builds the auxiliary data from the control messages.
pub(super) fn from_control(ctrl_msgs: Vec<ControlMessage>) -> Result<Self> {
let mut files = Vec::new();
for ctrl_msg in ctrl_msgs.into_iter() {
let ControlMessage::Unix(unix_ctrl_msg) = ctrl_msg;
// TODO: What should we do if there are control messages of other protocols?
match unix_ctrl_msg.0 {
Message::Files(FileMessage {
files: mut msg_files,
}) => {
if msg_files.len() > MAX_NR_FILES - files.len() {
return_errno_with_message!(
Errno::EINVAL,
"the SCM_RIGHTS message is too large"
);
}
files.append(&mut msg_files);
} // TODO: Deal with other kinds of UNIX control messages.
}
}
// FIXME: Sending UNIX sockets over UNIX sockets can easily lead to circular references and
// memory leaks. Linux uses a complex garbage collection algorithm to address these issues.
// See also <https://elixir.bootlin.com/linux/v6.15/source/net/unix/garbage.c#L592>.
if files
.iter()
.any(|file| (&**file as &dyn Any).is::<UnixStreamSocket>())
{
warn!("UNIX sockets in SCM_RIGHTS messages can leak kernel resource");
let credentials = current_thread!().as_posix_thread().unwrap().credentials();
if !credentials.euid().is_root()
&& !credentials.effective_capset().contains(CapSet::SYS_ADMIN)
{
return_errno_with_message!(
Errno::EPERM,
"UNIX sockets in SCM_RIGHTS messages can leak kernel resource"
)
}
}
Ok(Self { files })
}
/// Converts the auxiliary data back to the control messages.
pub(super) fn into_control(self) -> Vec<ControlMessage> {
let mut ctrl_msgs = Vec::new();
let Self { files } = self;
if !files.is_empty() {
let unix_ctrl_msg = UnixControlMessage(Message::Files(FileMessage { files }));
ctrl_msgs.push(ControlMessage::Unix(unix_ctrl_msg));
}
ctrl_msgs
}
/// Returns whether the auxiliary data contains nothing.
pub(super) fn is_empty(&self) -> bool {
self.files.is_empty()
}
}

View File

@ -2,10 +2,12 @@
mod addr;
mod cred;
mod ctrl_msg;
mod ns;
mod stream;
pub use addr::UnixSocketAddr;
pub use cred::CUserCred;
pub(super) use ctrl_msg::UnixControlMessage;
pub use stream::UnixStreamSocket;
pub(super) use stream::UNIX_STREAM_DEFAULT_BUF_SIZE;

View File

@ -1,11 +1,18 @@
// SPDX-License-Identifier: MPL-2.0
use core::{
num::Wrapping,
sync::atomic::{AtomicBool, Ordering},
};
use crate::{
events::IoEvents,
fs::utils::{Endpoint, EndpointState},
net::socket::{
unix::{addr::UnixSocketAddrBound, cred::SocketCred, UnixSocketAddr},
util::SockShutdownCmd,
unix::{
addr::UnixSocketAddrBound, cred::SocketCred, ctrl_msg::AuxiliaryData, UnixSocketAddr,
},
util::{ControlMessage, SockShutdownCmd},
},
prelude::*,
process::signal::Pollee,
@ -37,10 +44,14 @@ impl Connected {
let this_inner = Inner {
addr: SpinLock::new(addr),
state,
all_aux: Mutex::new(VecDeque::new()),
has_aux: AtomicBool::new(false),
};
let peer_inner = Inner {
addr: SpinLock::new(peer_addr),
state: peer_state,
all_aux: Mutex::new(VecDeque::new()),
has_aux: AtomicBool::new(false),
};
let (this_inner, peer_inner) = Endpoint::new_pair(this_inner, peer_inner);
@ -82,23 +93,64 @@ impl Connected {
Ok(())
}
pub(super) fn try_read(&self, writer: &mut dyn MultiWrite) -> Result<usize> {
pub(super) fn try_read(
&self,
writer: &mut dyn MultiWrite,
) -> Result<(usize, Vec<ControlMessage>)> {
if writer.is_empty() {
if self.reader.lock().is_empty() {
return_errno_with_message!(Errno::EAGAIN, "the channel is empty");
}
return Ok(0);
return Ok((0, Vec::new()));
}
let read = || {
let mut reader = self.reader.lock();
reader.read_fallible(writer)
let mut reader = self.reader.lock();
// `reader.len()` is an `Acquire` operation. So it can guarantee that the `has_aux`
// check below sees the up-to-date value.
let no_aux_len = reader.len();
let peer_end = self.inner.peer_end();
// Fast path: There are no auxiliary data to receive.
if !peer_end.has_aux.load(Ordering::Relaxed) {
let read_len = self
.inner
.read_with(move || reader.read_fallible_with_max_len(writer, no_aux_len))?;
return Ok((read_len, Vec::new()));
}
let mut all_aux = peer_end.all_aux.lock();
let read_start = reader.head();
let (len_to_aux, len_to_aux_end) = if let Some(front) = all_aux.front() {
((front.start - read_start).0, (front.end - read_start).0)
} else {
(usize::MAX, usize::MAX)
};
self.inner.read_with(read)
// It is not allowed to receive two sets of auxiliary data in one `recvmsg`. So we cannot
// read more than `len_to_aux_end` bytes.
let read_len = self
.inner
.read_with(move || reader.read_fallible_with_max_len(writer, len_to_aux_end))?;
if read_len <= len_to_aux {
return Ok((read_len, Vec::new()));
}
// We have received the first set of auxiliary data.
let ctrl_msgs = all_aux.pop_front().unwrap().data.into_control();
peer_end
.has_aux
.store(!all_aux.is_empty(), Ordering::Relaxed);
Ok((read_len, ctrl_msgs))
}
pub(super) fn try_write(&self, reader: &mut dyn MultiRead) -> Result<usize> {
pub(super) fn try_write(
&self,
reader: &mut dyn MultiRead,
aux_data: &mut AuxiliaryData,
) -> Result<usize> {
if reader.is_empty() {
if self.inner.is_shutdown() {
return_errno_with_message!(Errno::EPIPE, "the channel is shut down");
@ -106,12 +158,40 @@ impl Connected {
return Ok(0);
}
let write = || {
// Fast path: There are no auxiliary data to transmit.
if aux_data.is_empty() {
let mut writer = self.writer.lock();
writer.write_fallible(reader)
return self.inner.write_with(move || writer.write_fallible(reader));
}
let this_end = self.inner.this_end();
let mut all_aux = this_end.all_aux.lock();
// No matter we succeed later or not, set the flag first to ensure that the auxiliary
// data are always visible to `try_recv`.
this_end.has_aux.store(true, Ordering::Relaxed);
let (write_start, write_res) = {
let mut writer = self.writer.lock();
let write_start = writer.tail();
let write_res = self.inner.write_with(move || writer.write_fallible(reader));
(write_start, write_res)
};
let Ok(write_len) = write_res else {
this_end
.has_aux
.store(!all_aux.is_empty(), Ordering::Relaxed);
return write_res;
};
self.inner.write_with(write)
let aux_range = RangedAuxiliaryData {
data: core::mem::take(aux_data),
start: write_start,
end: write_start + Wrapping(write_len),
};
all_aux.push_back(aux_range);
Ok(write_len)
}
pub(super) fn shutdown(&self, cmd: SockShutdownCmd) {
@ -165,6 +245,9 @@ impl Drop for Connected {
struct Inner {
addr: SpinLock<Option<UnixSocketAddrBound>>,
state: EndpointState,
// Lock order: `reader` -> `all_aux` & `all_aux` -> `writer`
all_aux: Mutex<VecDeque<RangedAuxiliaryData>>,
has_aux: AtomicBool,
}
impl AsRef<EndpointState> for Inner {
@ -173,4 +256,10 @@ impl AsRef<EndpointState> for Inner {
}
}
struct RangedAuxiliaryData {
data: AuxiliaryData,
start: Wrapping<usize>, // inclusive
end: Wrapping<usize>, // exclusive
}
pub(in crate::net) const UNIX_STREAM_DEFAULT_BUF_SIZE: usize = 65536;

View File

@ -17,10 +17,10 @@ use crate::{
net::socket::{
options::{PeerCred, PeerGroups, SocketOption},
private::SocketPrivate,
unix::{cred::SocketCred, CUserCred, UnixSocketAddr},
unix::{cred::SocketCred, ctrl_msg::AuxiliaryData, CUserCred, UnixSocketAddr},
util::{
options::{GetSocketLevelOption, SetSocketLevelOption, SocketOptionSet},
MessageHeader, SendRecvFlags, SockShutdownCmd, SocketAddr,
ControlMessage, MessageHeader, SendRecvFlags, SockShutdownCmd, SocketAddr,
},
Socket,
},
@ -176,16 +176,25 @@ impl UnixStreamSocket {
)
}
fn try_send(&self, buf: &mut dyn MultiRead, _flags: SendRecvFlags) -> Result<usize> {
fn try_send(
&self,
buf: &mut dyn MultiRead,
aux_data: &mut AuxiliaryData,
_flags: SendRecvFlags,
) -> Result<usize> {
match self.state.read().as_ref() {
State::Connected(connected) => connected.try_write(buf),
State::Connected(connected) => connected.try_write(buf, aux_data),
State::Init(_) | State::Listen(_) => {
return_errno_with_message!(Errno::ENOTCONN, "the socket is not connected")
}
}
}
fn try_recv(&self, buf: &mut dyn MultiWrite, _flags: SendRecvFlags) -> Result<usize> {
fn try_recv(
&self,
buf: &mut dyn MultiWrite,
_flags: SendRecvFlags,
) -> Result<(usize, Vec<ControlMessage>)> {
match self.state.read().as_ref() {
State::Connected(connected) => connected.try_read(buf),
State::Init(_) | State::Listen(_) => {
@ -371,7 +380,7 @@ impl Socket for UnixStreamSocket {
}
let MessageHeader {
control_message,
control_messages,
addr,
} = message_header;
@ -390,13 +399,11 @@ impl Socket for UnixStreamSocket {
),
}
}
let mut auxiliary_data = AuxiliaryData::from_control(control_messages)?;
if control_message.is_some() {
// TODO: Support sending control message
warn!("sending control message is not supported");
}
self.block_on(IoEvents::OUT, || self.try_send(reader, flags))
self.block_on(IoEvents::OUT, || {
self.try_send(reader, &mut auxiliary_data, flags)
})
}
fn recvmsg(
@ -409,11 +416,10 @@ impl Socket for UnixStreamSocket {
warn!("unsupported flags: {:?}", flags);
}
let received_bytes = self.block_on(IoEvents::IN, || self.try_recv(writer, flags))?;
let (received_bytes, control_messages) =
self.block_on(IoEvents::IN, || self.try_recv(writer, flags))?;
// TODO: Receive control message
let message_header = MessageHeader::new(None, None);
let message_header = MessageHeader::new(None, control_messages);
Ok((received_bytes, message_header))
}

View File

@ -1,21 +1,23 @@
// SPDX-License-Identifier: MPL-2.0
use align_ext::AlignExt;
use super::SocketAddr;
use crate::prelude::*;
use crate::{net::socket::unix::UnixControlMessage, prelude::*, util::net::CSocketOptionLevel};
/// Message header used for sendmsg/recvmsg.
#[derive(Debug)]
pub struct MessageHeader {
pub(in crate::net) addr: Option<SocketAddr>,
pub(in crate::net) control_message: Option<ControlMessage>,
pub(in crate::net) control_messages: Vec<ControlMessage>,
}
impl MessageHeader {
/// Creates a new `MessageHeader`.
pub const fn new(addr: Option<SocketAddr>, control_message: Option<ControlMessage>) -> Self {
pub const fn new(addr: Option<SocketAddr>, control_messages: Vec<ControlMessage>) -> Self {
Self {
addr,
control_message,
control_messages,
}
}
@ -23,10 +25,175 @@ impl MessageHeader {
pub fn addr(&self) -> Option<&SocketAddr> {
self.addr.as_ref()
}
/// Returns the control messages.
pub fn control_messages(&self) -> &Vec<ControlMessage> {
&self.control_messages
}
}
/// Control message carried by MessageHeader.
///
/// TODO: Implement the struct. The struct is empty now.
/// Control messages in [`MessageHeader`].
#[derive(Debug)]
pub struct ControlMessage;
pub enum ControlMessage {
Unix(UnixControlMessage),
}
impl ControlMessage {
pub fn read_all_from(reader: &mut VmReader) -> Result<Vec<Self>> {
// FIXME: This method may exhaust kernel memory and cause a panic if the program is
// malicious and attempts to send too many control messages. To prevent this, we limit the
// number of control messages, but this limit does not have a Linux equivalent.
const MAX_NR_MSGS: usize = 32;
let mut msgs = Vec::new();
while reader.has_remain() && msgs.len() < MAX_NR_MSGS {
let header = reader.read_val::<CControlHeader>()?;
if header.len <= size_of::<CControlHeader>() || header.payload_len() > reader.remain() {
return_errno_with_message!(
Errno::EINVAL,
"the size of the control message is invalid"
);
}
if let Some(msg) = Self::read_from(&header, reader)? {
msgs.push(msg);
}
let padding_len = header.padding_len().min(reader.remain());
reader.skip(padding_len);
}
if reader.has_remain() {
warn!("excessive control messages are currently not permitted");
return_errno_with_message!(
Errno::E2BIG,
"excessive control messages are currently not permitted"
);
}
Ok(msgs)
}
fn read_from(header: &CControlHeader, reader: &mut VmReader) -> Result<Option<Self>> {
let Some(level) = header.level() else {
warn!("unsupported control message level in {:?}", header);
reader.skip(header.payload_len());
return Ok(None);
};
match level {
CSocketOptionLevel::SOL_SOCKET => {
// Linux manual pages say (https://man7.org/linux/man-pages/man7/unix.7.html):
// "For historical reasons, the ancillary message types listed below are specified
// with a SOL_SOCKET type even though they are AF_UNIX specific."
let msg = UnixControlMessage::read_from(header, reader)?;
Ok(msg.map(Self::Unix))
}
_ => {
warn!("unsupported control message level in {:?}", header);
reader.skip(header.payload_len());
Ok(None)
}
}
}
pub fn write_all_to(msgs: &[Self], writer: &mut VmWriter) -> usize {
let mut len = 0;
for msg in msgs.iter() {
let header = match msg.write_to(writer) {
Ok(header) => header,
// This occurs when the buffer is too short or when some page faults cannot be
// handled. However, at this point, there is no good way to report the errors to
// user space. According to the Linux implementation, it seems okay to silently
// ignore errors here.
Err(_) => {
warn!("setting MSG_CTRUNC is not supported");
break;
}
};
len += header.total_len();
let padding_len = header.padding_len().min(writer.avail());
writer.skip(padding_len);
len += padding_len;
}
len
}
fn write_to(&self, writer: &mut VmWriter) -> Result<CControlHeader> {
match self {
Self::Unix(msg) => msg.write_to(writer),
}
}
}
/// `cmsghdr` in Linux.
///
/// Reference: <https://elixir.bootlin.com/linux/v6.13/source/include/linux/socket.h#L105>.
#[repr(C)]
#[derive(Debug, Clone, Copy, Pod)]
pub struct CControlHeader {
/// Data byte count, including hdr
len: usize,
/// Originating protocol
level: i32,
/// Protocol-specific type
type_: i32,
}
/// Alignment of control messages.
///
/// Reference: <https://elixir.bootlin.com/linux/v6.13/source/include/linux/socket.h#L119>.
const CMSG_ALIGN: usize = size_of::<usize>();
impl CControlHeader {
/// Creates a control message header with the level, type, and payload length.
pub fn new(level: CSocketOptionLevel, type_: i32, payload_len: usize) -> Self {
Self {
len: payload_len + size_of::<Self>(),
level: level as i32,
type_,
}
}
/// Computes the payload length from the total length.
pub fn payload_len_from_total(total_len: usize) -> Result<usize> {
total_len.checked_sub(size_of::<Self>()).ok_or_else(|| {
Error::with_message(Errno::EINVAL, "the control message buffer is too small")
})
}
/// Returns the level of the control message.
pub fn level(&self) -> Option<CSocketOptionLevel> {
CSocketOptionLevel::try_from(self.level).ok()
}
/// Returns the type of the control message.
pub fn type_(&self) -> i32 {
self.type_
}
/// Returns the payload length of the control message.
pub fn payload_len(&self) -> usize {
self.len - size_of::<Self>()
}
/// Returns the length of the control message (payload + header, excluding paddings).
pub fn total_len(&self) -> usize {
self.len
}
/// Returns the length of the padding bytes for the control message.
pub(self) fn padding_len(&self) -> usize {
self.total_len_with_padding() - self.total_len()
}
/// Returns the length of the control message (payload + header, including paddings).
fn total_len_with_padding(&self) -> usize {
self.len.align_up(CMSG_ALIGN)
}
}

View File

@ -9,7 +9,8 @@ mod shutdown_cmd;
mod socket_addr;
pub use linger_option::LingerOption;
pub use message_header::MessageHeader;
pub(super) use message_header::CControlHeader;
pub use message_header::{ControlMessage, MessageHeader};
pub use send_recv_flags::SendRecvFlags;
pub use shutdown_cmd::SockShutdownCmd;
pub use socket_addr::SocketAddr;

View File

@ -254,10 +254,10 @@ impl Socket for VsockStreamSocket {
}
let MessageHeader {
control_message, ..
control_messages, ..
} = message_header;
if control_message.is_some() {
if !control_messages.is_empty() {
// TODO: Support sending control message
warn!("sending control message is not supported");
}
@ -279,7 +279,7 @@ impl Socket for VsockStreamSocket {
// TODO: Receive control message
let messsge_header = MessageHeader::new(None, None);
let messsge_header = MessageHeader::new(None, Vec::new());
Ok((received_bytes, messsge_header))
}

View File

@ -14,7 +14,8 @@ pub fn sys_recvmsg(
flags: i32,
ctx: &Context,
) -> Result<SyscallReturn> {
let c_user_msghdr: CUserMsgHdr = ctx.user_space().read_val(user_msghdr_ptr)?;
let user_space = ctx.user_space();
let mut c_user_msghdr: CUserMsgHdr = user_space.read_val(user_msghdr_ptr)?;
let flags = SendRecvFlags::from_bits_truncate(flags);
debug!(
@ -27,7 +28,6 @@ pub fn sys_recvmsg(
let socket = file.as_socket_or_err()?;
let (total_bytes, message_header) = {
let user_space = ctx.user_space();
let mut io_vec_writer = c_user_msghdr.copy_writer_array_from_user(&user_space)?;
socket
.recvmsg(&mut io_vec_writer, flags)
@ -38,13 +38,18 @@ pub fn sys_recvmsg(
})?
};
if let Some(addr) = message_header.addr() {
c_user_msghdr.write_socket_addr_to_user(addr)?;
}
// Writing control messages may access the file table, so it should be called after dropping
// the file table borrow.
drop(file_table);
if c_user_msghdr.msg_control != 0 {
warn!("receiving control message is not supported");
}
let addr = message_header.addr();
c_user_msghdr.msg_namelen = c_user_msghdr.write_socket_addr_to_user(addr)?;
let control_messages = message_header.control_messages();
c_user_msghdr.msg_controllen =
c_user_msghdr.write_control_messages_to_user(control_messages, &user_space)?;
user_space.write_val(user_msghdr_ptr, &c_user_msghdr)?;
Ok(SyscallReturn::Return(total_bytes as _))
}

View File

@ -14,7 +14,8 @@ pub fn sys_sendmsg(
flags: i32,
ctx: &Context,
) -> Result<SyscallReturn> {
let c_user_msghdr: CUserMsgHdr = ctx.user_space().read_val(user_msghdr_ptr)?;
let user_space = ctx.user_space();
let c_user_msghdr: CUserMsgHdr = user_space.read_val(user_msghdr_ptr)?;
let flags = SendRecvFlags::from_bits_truncate(flags);
debug!(
@ -22,26 +23,19 @@ pub fn sys_sendmsg(
sockfd, c_user_msghdr, flags
);
let message_header = {
let addr = c_user_msghdr.read_socket_addr_from_user()?;
// Reading control messages may access the file table, so it should be called before
// `borrow_file_table_mut`.
let control_messages = c_user_msghdr.read_control_messages_from_user(&user_space)?;
MessageHeader::new(addr, control_messages)
};
let mut io_vec_reader = c_user_msghdr.copy_reader_array_from_user(&user_space)?;
let mut file_table = ctx.thread_local.borrow_file_table_mut();
let file = get_file_fast!(&mut file_table, sockfd);
let socket = file.as_socket_or_err()?;
let user_space = ctx.user_space();
let (mut io_vec_reader, message_header) = {
let addr = c_user_msghdr.read_socket_addr_from_user()?;
let io_vec_reader = c_user_msghdr.copy_reader_array_from_user(&user_space)?;
let control_message = {
if c_user_msghdr.msg_control != 0 {
// TODO: support sending control message
warn!("control message is not supported now");
}
None
};
(io_vec_reader, MessageHeader::new(addr, control_message))
};
let total_bytes = socket
.sendmsg(&mut io_vec_reader, message_header, flags)
.map_err(|err| match err.error() {

View File

@ -30,7 +30,7 @@ pub fn sys_sendto(
let file = get_file_fast!(&mut file_table, sockfd);
let socket = file.as_socket_or_err()?;
let message_header = MessageHeader::new(socket_addr, None);
let message_header = MessageHeader::new(socket_addr, Vec::new());
let user_space = ctx.user_space();
let mut reader = user_space.reader(buf, len)?;

View File

@ -173,7 +173,11 @@ pub fn new_raw_socket_option(
}
}
/// Sock Opt level. The definition is from https://elixir.bootlin.com/linux/v6.0.9/source/include/linux/socket.h#L343
/// Socket level.
///
/// This can refer to either a socket option or a control message level.
///
/// Reference: <https://elixir.bootlin.com/linux/v6.0.9/source/include/linux/socket.h#L343>.
#[repr(i32)]
#[derive(Debug, Clone, Copy, TryFromInt, PartialEq, Eq)]
#[expect(non_camel_case_types)]

View File

@ -2,7 +2,7 @@
use super::read_socket_addr_from_user;
use crate::{
net::socket::util::SocketAddr,
net::socket::util::{ControlMessage, SocketAddr},
prelude::*,
util::{net::write_socket_addr_with_max_len, VmReaderArray, VmWriterArray},
};
@ -103,13 +103,51 @@ impl CUserMsgHdr {
Ok(Some(socket_addr))
}
pub fn write_socket_addr_to_user(&self, addr: &SocketAddr) -> Result<()> {
pub fn write_socket_addr_to_user(&self, addr: Option<&SocketAddr>) -> Result<i32> {
if self.msg_name == 0 {
return Ok(());
// The length field will not be touched if the name pointer is NULL.
// See <https://elixir.bootlin.com/linux/v6.15.6/source/net/socket.c#L2792>.
return Ok(self.msg_namelen);
}
write_socket_addr_with_max_len(addr, self.msg_name, self.msg_namelen)?;
Ok(())
let actual_len = if let Some(addr) = addr {
write_socket_addr_with_max_len(addr, self.msg_name, self.msg_namelen)?
} else {
0
};
Ok(actual_len)
}
pub fn read_control_messages_from_user(
&self,
user_space: &CurrentUserSpace,
) -> Result<Vec<ControlMessage>> {
if self.msg_control == 0 {
return Ok(Vec::new());
}
let mut reader = user_space.reader(self.msg_control, self.msg_controllen as usize)?;
let control_messages = ControlMessage::read_all_from(&mut reader)?;
Ok(control_messages)
}
pub fn write_control_messages_to_user(
&self,
control_messages: &[ControlMessage],
user_space: &CurrentUserSpace,
) -> Result<u32> {
if self.msg_control == 0 {
if !control_messages.is_empty() {
warn!("setting MSG_CTRUNC is not supported");
}
// The length field will be set even if the control message pointer is NULL.
// See <https://elixir.bootlin.com/linux/v6.15.6/source/net/socket.c#L2807>.
return Ok(0);
}
let mut writer = user_space.writer(self.msg_control, self.msg_controllen as usize)?;
let write_len = ControlMessage::write_all_to(control_messages, &mut writer) as u32;
Ok(write_len)
}
pub fn copy_reader_array_from_user<'a>(

View File

@ -307,8 +307,19 @@ impl<R: Deref<Target = RingBuffer<u8>>> Producer<u8, R> {
///
/// Returns the number of bytes written.
pub fn write_fallible(&mut self, reader: &mut dyn MultiRead) -> Result<usize> {
self.write_fallible_with_max_len(reader, usize::MAX)
}
/// Writes data from the `VmReader` to the `RingBuffer` with the maximum length.
///
/// Returns the number of bytes written.
pub fn write_fallible_with_max_len(
&mut self,
reader: &mut dyn MultiRead,
max_len: usize,
) -> Result<usize> {
let rb = &self.rb;
let free_len = rb.free_len();
let free_len = rb.free_len().min(max_len);
let tail = rb.tail();
let offset = tail.0 & (rb.capacity - 1);
@ -410,8 +421,19 @@ impl<R: Deref<Target = RingBuffer<u8>>> Consumer<u8, R> {
///
/// Returns the number of bytes read.
pub fn read_fallible(&mut self, writer: &mut dyn MultiWrite) -> Result<usize> {
self.read_fallible_with_max_len(writer, usize::MAX)
}
/// Reads data from the `VmWriter` to the `RingBuffer` with the maximum length.
///
/// Returns the number of bytes read.
pub fn read_fallible_with_max_len(
&mut self,
writer: &mut dyn MultiWrite,
max_len: usize,
) -> Result<usize> {
let rb = &self.rb;
let len = rb.len();
let len = rb.len().min(max_len);
let head = rb.head();
let offset = head.0 & (rb.capacity - 1);

View File

@ -729,3 +729,98 @@ FN_TEST(zero_sends_may_fail)
TEST_SUCC(close(fildes[1]));
}
END_TEST()
FN_TEST(scm_rights)
{
int fildes[2];
char buf[20] = "abcdefg";
char cbuf[CMSG_SPACE(sizeof(int) * 3)];
struct iovec iov;
struct msghdr mhdr;
struct cmsghdr *chdr;
int *cdata;
int cfds[2];
TEST_SUCC(socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, fildes));
memset(&mhdr, 0, sizeof(mhdr));
mhdr.msg_iov = &iov;
mhdr.msg_iovlen = 1;
mhdr.msg_control = cbuf;
mhdr.msg_controllen = CMSG_SPACE(sizeof(int) * 3);
iov.iov_base = buf;
iov.iov_len = 1;
chdr = CMSG_FIRSTHDR(&mhdr);
chdr->cmsg_level = SOL_SOCKET;
chdr->cmsg_type = SCM_RIGHTS;
chdr->cmsg_len = CMSG_SPACE(sizeof(int) * 3);
cdata = (int *)CMSG_DATA(chdr);
TEST_SUCC(pipe(cfds));
cdata[0] = cfds[0];
cdata[1] = cfds[0];
cdata[2] = cfds[1];
// Sending control messages with zero bytes to a stream socket
// seems to "succeed". However, no data or control messages can
// be transmitted.
mhdr.msg_iovlen = 0;
TEST_SUCC(sendmsg(fildes[0], &mhdr, 0));
mhdr.msg_iovlen = 1;
// > (1) sendmsg(2) of four bytes, with no ancillary data.
// > (2) sendmsg(2) of one byte, with ancillary data.
// > (3) sendmsg(2) of four bytes, with no ancillary data.
// -- https://man7.org/linux/man-pages/man7/unix.7.html
TEST_RES(send(fildes[0], buf, 4, 0), _ret == 4);
TEST_RES(sendmsg(fildes[0], &mhdr, 0), _ret == 1);
TEST_RES(send(fildes[0], buf, 4, 0), _ret == 4);
memset(&mhdr, 0, sizeof(mhdr));
mhdr.msg_iov = &iov;
mhdr.msg_iovlen = 1;
mhdr.msg_control = cbuf;
mhdr.msg_controllen = CMSG_SPACE(sizeof(int));
iov.iov_base = buf;
iov.iov_len = sizeof(buf);
memset(cbuf, 0, sizeof(cbuf));
// > Suppose that the receiver now performs recvmsg(2) calls each with
// > a buffer size of 20 bytes. The first call will receive five bytes
// > of data, along with the ancillary data sent by the second
// > sendmsg(2) call.
TEST_RES(recvmsg(fildes[1], &mhdr, 0),
_ret == 5 &&
mhdr.msg_controllen == CMSG_SPACE(sizeof(int) * 2) &&
(chdr = CMSG_FIRSTHDR(&mhdr)) &&
chdr->cmsg_level == SOL_SOCKET &&
chdr->cmsg_type == SCM_RIGHTS &&
chdr->cmsg_len == CMSG_SPACE(sizeof(int) * 2) &&
(cdata = (int *)CMSG_DATA(chdr)) &&
cdata[0] == cfds[1] + 1 && cdata[1] == cfds[1] + 2);
// > The next call will receive the remaining four
// > bytes of data.
TEST_RES(recv(fildes[1], buf, sizeof(buf), 0), _ret == 4);
// The purpose of the tests below is to verify that the received file
// descriptors are functional.
TEST_RES(write(cfds[1], "x", 1), _ret == 1);
TEST_RES(read(cdata[0], buf, 1), _ret == 1 && buf[0] == 'x');
TEST_RES(write(cfds[1], "y", 1), _ret == 1);
TEST_RES(read(cdata[1], buf, 1), _ret == 1 && buf[0] == 'y');
TEST_SUCC(close(cdata[0]));
TEST_SUCC(close(cdata[1]));
TEST_SUCC(close(cfds[0]));
TEST_ERRNO(write(cfds[1], "y", 1), EPIPE);
TEST_SUCC(close(cfds[1]));
TEST_SUCC(close(fildes[0]));
TEST_SUCC(close(fildes[1]));
}
END_TEST()