From 7be20254a743be4f02414b9d56cc3fe5f84e6500 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Sep 2025 04:25:22 -0600 Subject: [PATCH 01/68] io_uring: unify task_work cancelation checks Rather than do per-tw checking, which needs to dip into the task_struct for checking flags, do it upfront before running task_work. This places a 'cancel' member in io_tw_token_t, which is assigned before running task_work for that given ctx. This is both more efficient in doing it upfront rather than for every task_work, and it means that io_should_terminate_tw() can be made private in io_uring.c rather than need to be called by various callbacks of task_work. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 27 ++++++++++++++++++++------- io_uring/io_uring.h | 13 ------------- io_uring/poll.c | 2 +- io_uring/timeout.c | 2 +- io_uring/uring_cmd.c | 2 +- 6 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c2ea6280901d..25ee982eb435 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -474,6 +474,7 @@ struct io_ring_ctx { * ONLY core io_uring.c should instantiate this struct. */ struct io_tw_state { + bool cancel; }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ typedef struct io_tw_state io_tw_token_t; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 820ef0527666..c397118da85e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -265,6 +265,20 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. 
+ */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -275,8 +289,10 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) { + ts.cancel = io_should_terminate_tw(req->ctx); req->io_task_work.func(req, ts); + } io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -1147,6 +1163,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, @@ -1205,11 +1222,6 @@ struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, { struct llist_node *node; - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx, true); - return NULL; - } - node = llist_del_all(&tctx->task_list); if (node) { node = llist_reverse_order(node); @@ -1399,6 +1411,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: + tw.cancel = io_should_terminate_tw(ctx); min_events -= ret; ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); if (ctx->retry_llist.first) @@ -1458,7 +1471,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); - if (unlikely(io_should_terminate_tw(ctx))) + if (unlikely(tw.cancel)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 46d9141d772a..78777bf1ea4b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -558,19 +558,6 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } -/* - * Terminate the request if either of these conditions are true: - * - * 1) It's being executed by the original task, but that task is marked - * with PF_EXITING as it's exiting. - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is - * our fallback task_work. 
- */ -static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) -{ - return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); -} - static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); diff --git a/io_uring/poll.c b/io_uring/poll.c index b9681d0f9f13..c403e751841a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw(req->ctx))) + if (unlikely(tw.cancel)) return -ECANCELED; do { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 17e3aab0af36..444142ba9d04 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) int ret; if (prev) { - if (!io_should_terminate_tw(req->ctx)) { + if (!tw.cancel) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index d1e3ba62ee8e..1225f8124e4b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (io_should_terminate_tw(req->ctx)) + if (unlikely(tw.cancel)) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ From a48c0cbf28c03f6c590a14ceb31bf6e619c2f6da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Oct 2025 07:33:36 -0600 Subject: [PATCH 02/68] io_uring/waitid: have io_waitid_complete() remove wait queue entry Both callers of this need the entry potentially removed, so shift the removal into the completion side and kill it from the two callers. While at it, add a helper for removing the wait_queue_entry based on the passed in io_kiocb. Signed-off-by: Jens Axboe --- io_uring/waitid.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index f25110fb1b12..ebe3769c54dc 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -109,6 +109,22 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) return ret; } +static void io_waitid_remove_wq(struct io_kiocb *req) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct wait_queue_head *head; + + head = READ_ONCE(iw->head); + if (head) { + struct io_waitid_async *iwa = req->async_data; + + iw->head = NULL; + spin_lock_irq(&head->lock); + list_del_init(&iwa->wo.child_wait.entry); + spin_unlock_irq(&head->lock); + } +} + static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); @@ -119,6 +135,7 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); + io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); if (ret < 0) @@ -129,7 +146,8 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_waitid_async *iwa = req->async_data; + + lockdep_assert_held(&req->ctx->uring_lock); /* * Mark us canceled regardless of ownership. 
This will prevent a @@ -141,9 +159,6 @@ static bool __io_waitid_cancel(struct io_kiocb *req) if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; - spin_lock_irq(&iw->head->lock); - list_del_init(&iwa->wo.child_wait.entry); - spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); io_req_queue_tw_complete(req, -ECANCELED); return true; @@ -209,8 +224,7 @@ static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) io_waitid_drop_issue_ref(req); return; } - - remove_wait_queue(iw->head, &iwa->wo.child_wait); + /* fall through to complete, will kill waitqueue */ } } From ab673c1bcaf20ac70352eeb6bf5b828462676693 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Oct 2025 10:55:08 -0600 Subject: [PATCH 03/68] io_uring/waitid: use io_waitid_remove_wq() consistently Use it everywhere that the wait_queue_entry is removed from the head, and be a bit more cautious in zeroing out iw->head whenever the entry is removed from the list. Signed-off-by: Jens Axboe --- io_uring/waitid.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index ebe3769c54dc..c5e0d979903a 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -179,18 +179,18 @@ bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_waitid_async *iwa = req->async_data; if (!atomic_sub_return(1, &iw->refs)) return false; + io_waitid_remove_wq(req); + /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); - remove_wait_queue(iw->head, &iwa->wo.child_wait); return true; } @@ -245,6 +245,7 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, return 0; list_del_init(&wait->entry); + iw->head = NULL; /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) @@ -271,6 +272,7 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); + iw->head = NULL; iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); return 0; } @@ -301,11 +303,16 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) * callback. */ io_ring_submit_lock(ctx, issue_flags); + + /* + * iw->head is valid under the ring lock, and as long as the request + * is on the waitid_list where cancelations may find it. + */ + iw->head = ¤t->signal->wait_chldexit; hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->tctx->task; - iw->head = ¤t->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); @@ -328,7 +335,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) } hlist_del_init(&req->hash_node); - remove_wait_queue(iw->head, &iwa->wo.child_wait); + io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); From 12aced0a551e18f2162091e388c3a36ea75ccb13 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Oct 2025 14:23:17 +0100 Subject: [PATCH 04/68] io_uring: deduplicate array_size in io_allocate_scq_urings A minor cleanup precomputing the sq size first instead of branching array_size() in io_allocate_scq_urings(). 
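
For illustration, a minimal standalone sketch of the sizing pattern this
patch settles on (not part of the change; array_size_sk(), SQE_SIZE and
SETUP_SQE128 are stand-ins for the kernel's array_size(),
sizeof(struct io_uring_sqe) and IORING_SETUP_SQE128):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	#define SQE_SIZE	64u		/* stand-in for sizeof(struct io_uring_sqe) */
	#define SETUP_SQE128	(1u << 10)	/* stand-in for IORING_SETUP_SQE128 */

	/* stand-in for the kernel's array_size(): saturate on overflow */
	static size_t array_size_sk(size_t a, size_t b)
	{
		size_t bytes;

		if (__builtin_mul_overflow(a, b, &bytes))
			return SIZE_MAX;
		return bytes;
	}

	static size_t sq_bytes(unsigned int flags, size_t sq_entries)
	{
		/* precompute the per-SQE size once instead of branching the call */
		size_t sqe_size = SQE_SIZE;

		if (flags & SETUP_SQE128)
			sqe_size *= 2;
		return array_size_sk(sqe_size, sq_entries);
	}

	int main(void)
	{
		printf("%zu\n", sq_bytes(SETUP_SQE128, 128));		/* 16384 */
		printf("%d\n", sq_bytes(0, SIZE_MAX / 2) == SIZE_MAX);	/* 1: overflow */
		return 0;
	}

The caller still rejects SIZE_MAX with -EOVERFLOW before any allocation
happens.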
Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c397118da85e..31602e0e5f37 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3610,6 +3610,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; + size_t sqe_size; int ret; /* make sure these are sane, as we already accounted them */ @@ -3639,10 +3640,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; + sqe_size = sizeof(struct io_uring_sqe); if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + sqe_size *= 2; + + size = array_size(sqe_size, p->sq_entries); if (size == SIZE_MAX) { io_rings_free(ctx); return -EOVERFLOW; From 284306f6e6045e3f7b932914d1368df90033e87e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Oct 2025 14:23:18 +0100 Subject: [PATCH 05/68] io_uring: sanity check sizes before attempting allocation It's a good practice to validate parameters before doing any heavy stuff like queue allocations. Do that for io_allocate_scq_urings(). Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 31602e0e5f37..e4ede0bad36f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3609,21 +3609,27 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, { struct io_uring_region_desc rd; struct io_rings *rings; - size_t size, sq_array_offset; - size_t sqe_size; + size_t sq_array_offset; + size_t sq_size, cq_size, sqe_size; int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, + sqe_size = sizeof(struct io_uring_sqe); + if (p->flags & IORING_SETUP_SQE128) + sqe_size *= 2; + sq_size = array_size(sqe_size, p->sq_entries); + if (sq_size == SIZE_MAX) + return -EOVERFLOW; + cq_size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, &sq_array_offset); - if (size == SIZE_MAX) + if (cq_size == SIZE_MAX) return -EOVERFLOW; memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); + rd.size = PAGE_ALIGN(cq_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; @@ -3640,18 +3646,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - sqe_size = sizeof(struct io_uring_sqe); - if (p->flags & IORING_SETUP_SQE128) - sqe_size *= 2; - - size = array_size(sqe_size, p->sq_entries); - if (size == SIZE_MAX) { - io_rings_free(ctx); - return -EOVERFLOW; - } - memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); + rd.size = PAGE_ALIGN(sq_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; From 4c53e392a194f2bb37403a5b9bcf8e77401234cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Oct 2025 14:23:19 +0100 Subject: 
[PATCH 06/68] io_uring: use no mmap safe region helpers on resizing

io_create_region_mmap_safe() is only needed when the created region is
exposed to userspace via mmap. io_register_resize_rings() creates its
regions locally on the stack, so the non-mmap-safe version of the helper
is enough.

Signed-off-by: Pavel Begunkov
Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 io_uring/register.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/register.c b/io_uring/register.c
index 2e4717f1357c..a809d95153e4 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -432,7 +432,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		rd.user_addr = p.cq_off.user_addr;
 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
-	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
 	if (ret) {
 		io_register_free_rings(ctx, &p, &n);
 		return ret;
@@ -472,7 +472,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		rd.user_addr = p.sq_off.user_addr;
 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
-	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
+	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
 	if (ret) {
 		io_register_free_rings(ctx, &p, &n);
 		return ret;

From 0c89dbbcadf126920e6f9ebfa64e2538af84fef3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Oct 2025 14:23:20 +0100
Subject: [PATCH 07/68] io_uring: remove extra args from io_register_free_rings

io_register_free_rings() doesn't use its "struct io_uring_params"
parameter, so remove it.

Signed-off-by: Pavel Begunkov
Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 io_uring/register.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/io_uring/register.c b/io_uring/register.c
index a809d95153e4..f7f71f035b0d 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -379,7 +379,6 @@ struct io_ring_ctx_rings {
 };
 
 static void io_register_free_rings(struct io_ring_ctx *ctx,
-				   struct io_uring_params *p,
 				   struct io_ring_ctx_rings *r)
 {
 	io_free_region(ctx, &r->sq_region);
@@ -434,7 +433,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	}
 	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
 	if (ret) {
-		io_register_free_rings(ctx, &p, &n);
+		io_register_free_rings(ctx, &n);
 		return ret;
 	}
 	n.rings = io_region_get_ptr(&n.ring_region);
@@ -453,7 +452,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
 
 	if (copy_to_user(arg, &p, sizeof(p))) {
-		io_register_free_rings(ctx, &p, &n);
+		io_register_free_rings(ctx, &n);
 		return -EFAULT;
 	}
 
@@ -462,7 +461,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	else
 		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
 	if (size == SIZE_MAX) {
-		io_register_free_rings(ctx, &p, &n);
+		io_register_free_rings(ctx, &n);
 		return -EOVERFLOW;
 	}
 
@@ -474,7 +473,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	}
 	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
 	if (ret) {
-		io_register_free_rings(ctx, &p, &n);
+		io_register_free_rings(ctx, &n);
 		return ret;
 	}
 	n.sq_sqes = io_region_get_ptr(&n.sq_region);
@@ -564,7 +563,7 @@ overflow:
 out:
 	spin_unlock(&ctx->completion_lock);
 	mutex_unlock(&ctx->mmap_lock);
-	io_register_free_rings(ctx, &p, to_free);
+	io_register_free_rings(ctx, to_free);
 
 	if (ctx->sq_data)
 		io_sq_thread_unpark(ctx->sq_data);

From 6e9752977caa47c200f88d7df1ff114955a03bad Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Oct 2025 14:23:21 +0100
Subject: [PATCH 08/68] io_uring: don't free never created regions

io_free_region() tolerates empty regions, but there is no reason to rely
on that either. If the first io_create_region() in
io_register_resize_rings() fails, just return the error without
attempting any cleanup.

Signed-off-by: Pavel Begunkov
Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 io_uring/register.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/io_uring/register.c b/io_uring/register.c
index f7f71f035b0d..b11550ed940c 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -432,10 +432,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
 	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
-	if (ret) {
-		io_register_free_rings(ctx, &n);
+	if (ret)
 		return ret;
-	}
+
 	n.rings = io_region_get_ptr(&n.ring_region);
 
 	/*

From dec10a1ad1d5f9d46e7f6e7c8b414a805e00717c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Oct 2025 14:23:22 +0100
Subject: [PATCH 09/68] io_uring/kbuf: use io_create_region for kbuf creation

The kbuf ring is published by io_buffer_add_list(), which correctly
protects it with the mmap_lock, so there is no need to use
io_create_region_mmap_safe() beforehand: the region is not yet exposed
to userspace via mmap.

Signed-off-by: Pavel Begunkov
Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 io_uring/kbuf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index aad655e38672..e271b44ff73e 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -630,7 +630,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		rd.user_addr = reg.ring_addr;
 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
-	ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
+	ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
 	if (ret)
 		goto fail;
 	br = io_region_get_ptr(&bl->region);

From 5b6d8a032e807c48a843fb81d9e3d74391f731ea Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Oct 2025 14:23:23 +0100
Subject: [PATCH 10/68] io_uring: only publish fully handled mem region

io_register_mem_region() can try to remove a region right after
publishing it. This non-atomicity is annoying. Do it in two steps
instead: create the region first, and only publish it once the rest of
the handling is done. Remove the now unused io_create_region_mmap_safe(),
which was assumed to be a temporary solution from day one.
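
For illustration, a minimal userspace sketch of the create-then-publish
pattern this patch adopts (not part of the change; region_setup(),
register_region() and the lock here are stand-ins, not kernel APIs):

	#include <pthread.h>
	#include <stdlib.h>

	struct region {
		void *ptr;
		size_t len;
	};

	static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;
	/* lookups (the mmap side) read this under mmap_lock */
	static struct region published_region;

	static int region_setup(struct region *r, size_t len)
	{
		r->ptr = calloc(1, len);
		if (!r->ptr)
			return -1;
		r->len = len;
		return 0;
	}

	static int register_region(size_t len)
	{
		/* step 1: build the region privately, nothing can observe it */
		struct region tmp = { 0 };

		if (region_setup(&tmp, len))
			return -1;
		/*
		 * Any further failure handling would free tmp right here,
		 * without ever touching shared state - there is no
		 * publish-then-unpublish window.
		 */

		/* step 2: publish under the lock, as io_region_publish() does */
		pthread_mutex_lock(&mmap_lock);
		published_region = tmp;
		pthread_mutex_unlock(&mmap_lock);
		return 0;
	}

	int main(void)
	{
		return register_region(4096) ? 1 : 0;
	}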
Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/memmap.c | 21 --------------------- io_uring/memmap.h | 12 ++++++++++++ io_uring/register.c | 11 ++++++----- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 2e99dffddfc5..aa388ecd4754 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -234,27 +234,6 @@ out_free: return ret; } -int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg, - unsigned long mmap_offset) -{ - struct io_mapped_region tmp_mr; - int ret; - - memcpy(&tmp_mr, mr, sizeof(tmp_mr)); - ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); - if (ret) - return ret; - - /* - * Once published mmap can find it without holding only the ->mmap_lock - * and not ->uring_lock. - */ - guard(mutex)(&ctx->mmap_lock); - memcpy(mr, &tmp_mr, sizeof(tmp_mr)); - return 0; -} - static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 08419684e4bc..58002976e0c3 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -36,4 +36,16 @@ static inline bool io_region_is_set(struct io_mapped_region *mr) return !!mr->nr_pages; } +static inline void io_region_publish(struct io_ring_ctx *ctx, + struct io_mapped_region *src_region, + struct io_mapped_region *dst_region) +{ + /* + * Once published mmap can find it without holding only the ->mmap_lock + * and not ->uring_lock. + */ + guard(mutex)(&ctx->mmap_lock); + *dst_region = *src_region; +} + #endif diff --git a/io_uring/register.c b/io_uring/register.c index b11550ed940c..43eb02004824 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -576,6 +576,7 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) struct io_uring_mem_region_reg reg; struct io_uring_region_desc __user *rd_uptr; struct io_uring_region_desc rd; + struct io_mapped_region region = {}; int ret; if (io_region_is_set(&ctx->param_region)) @@ -599,20 +600,20 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; - ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, - IORING_MAP_OFF_PARAM_REGION); + ret = io_create_region(ctx, ®ion, &rd, IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { - guard(mutex)(&ctx->mmap_lock); - io_free_region(ctx, &ctx->param_region); + io_free_region(ctx, ®ion); return -EFAULT; } if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { - ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); + ctx->cq_wait_arg = io_region_get_ptr(®ion); ctx->cq_wait_size = rd.size; } + + io_region_publish(ctx, ®ion, &ctx->param_region); return 0; } From 1cba30bf9fdd6c982708f3587f609a30c370d889 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 16 Oct 2025 11:09:38 -0700 Subject: [PATCH 11/68] io_uring: add support for IORING_SETUP_SQE_MIXED Normal rings support 64b SQEs for posting submissions, while certain features require the ring to be configured with IORING_SETUP_SQE128, as they need to convey more information per submission. This, in turn, makes ALL the SQEs be 128b in size. This is somewhat wasteful and inefficient, particularly when only certain SQEs need to be of the bigger variant. This adds support for setting up a ring with mixed SQE sizes, using IORING_SETUP_SQE_MIXED. 
When set up in this mode, SQEs posted to the ring may be either 64b or
128b in size. If a SQE is 128b in size, then its opcode will be a 128b
variant to indicate that this is the case. Any other non-128b opcode
will assume the SQ's default size.

SQEs on these types of mixed rings may also utilize NOP with skip
success set. This can happen if the ring is one (small) SQE entry away
from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
set. The kernel will process this as a normal NOP, just without posting
a CQE.

Signed-off-by: Keith Busch
[axboe: {} style fix and assign sqe before opcode read]
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  8 ++++++++
 io_uring/fdinfo.c             | 34 +++++++++++++++++++++++++-------
 io_uring/io_uring.c           | 37 +++++++++++++++++++++++++++++++----
 io_uring/io_uring.h           | 14 ++-----------
 io_uring/opdef.c              | 26 ++++++++++++++++++++++++
 io_uring/opdef.h              |  2 ++
 io_uring/register.c           |  2 +-
 io_uring/uring_cmd.c          | 17 ++++++++++++++--
 8 files changed, 114 insertions(+), 26 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473e..04797a9b76bc 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_CQE_MIXED	(1U << 18)
 
+/*
+ * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
+ * a 128b opcode.
+ */
+#define IORING_SETUP_SQE_MIXED	(1U << 19)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -295,6 +301,8 @@ enum io_uring_op {
 	IORING_OP_READV_FIXED,
 	IORING_OP_WRITEV_FIXED,
 	IORING_OP_PIPE,
+	IORING_OP_NOP128,
+	IORING_OP_URING_CMD128,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..1a806ad16840 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -14,6 +14,7 @@
 #include "fdinfo.h"
 #include "cancel.h"
 #include "rsrc.h"
+#include "opdef.h"
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
@@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	unsigned int cq_head = READ_ONCE(r->cq.head);
 	unsigned int cq_tail = READ_ONCE(r->cq.tail);
 	unsigned int sq_shift = 0;
-	unsigned int sq_entries;
 	int sq_pid = -1, sq_cpu = -1;
 	u64 sq_total_time = 0, sq_work_time = 0;
 	unsigned int i;
@@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "CqTail:\t%u\n", cq_tail);
 	seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
 	seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
-	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
-	for (i = 0; i < sq_entries; i++) {
-		unsigned int entry = i + sq_head;
+	while (sq_head < sq_tail) {
 		struct io_uring_sqe *sqe;
 		unsigned int sq_idx;
+		bool sqe128 = false;
+		u8 opcode;
 
 		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
 			break;
-		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+		sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
+
 		sqe = &ctx->sq_sqes[sq_idx << sq_shift];
+		opcode = READ_ONCE(sqe->opcode);
+		if (sq_shift) {
+			sqe128 = true;
+		} else if (io_issue_defs[opcode].is_128) {
+			if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
+				seq_printf(m,
+					   "%5u: invalid sqe, 128B entry on non-mixed sq\n",
+					   sq_idx);
+				break;
+			}
+			if ((++sq_head & sq_mask) == 0) {
+
seq_printf(m, + "%5u: corrupted sqe, wrapping 128B entry\n", + sq_idx); + break; + } + sqe128 = true; + } seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " "addr:0x%llx, rw_flags:0x%x, buf_index:%d " "user_data:%llu", - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, + sq_idx, io_uring_get_opcode(opcode), sqe->fd, sqe->flags, (unsigned long long) sqe->off, (unsigned long long) sqe->addr, sqe->rw_flags, sqe->buf_index, sqe->user_data); - if (sq_shift) { + if (sqe128) { u64 *sqeb = (void *) (sqe + 1); int size = sizeof(struct io_uring_sqe) / sizeof(u64); int j; @@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) } } seq_printf(m, "\n"); + sq_head++; } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); while (cq_head < cq_tail) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e4ede0bad36f..be44d636fe1f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2164,7 +2164,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err) } static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { const struct io_issue_def *def; @@ -2190,6 +2190,24 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, opcode = array_index_nospec(opcode, IORING_OP_LAST); def = &io_issue_defs[opcode]; + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { + /* + * A 128b op on a non-128b SQ requires mixed SQE support as + * well as 2 contiguous entries. + */ + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || + !(ctx->cached_sq_head & (ctx->sq_entries - 1))) + return io_init_fail_req(req, -EINVAL); + /* + * A 128b operation on a mixed SQ uses two entries, so we have + * to increment the head and cached refs, and decrement what's + * left. + */ + current->io_uring->cached_refs++; + ctx->cached_sq_head++; + (*left)--; + } + if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -2299,13 +2317,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; - ret = io_init_req(ctx, req, sqe); + ret = io_init_req(ctx, req, sqe, left); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); @@ -2457,7 +2475,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ - if (unlikely(io_submit_sqe(ctx, req, sqe)) && + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; @@ -2802,6 +2820,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, if (cq_entries < 2) return SIZE_MAX; } + if (flags & IORING_SETUP_SQE_MIXED) { + if (sq_entries < 2) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -3726,6 +3748,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p) if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) return -EINVAL; + /* + * Nonsensical to ask for SQE128 and mixed SQE support, it's not + * supported to post 64b SQEs on a ring setup with SQE128. 
+ */ + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) + return -EINVAL; return 0; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 78777bf1ea4b..44b8091c7fcd 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -54,7 +54,8 @@ IORING_SETUP_REGISTERED_FD_ONLY |\ IORING_SETUP_NO_SQARRAY |\ IORING_SETUP_HYBRID_IOPOLL |\ - IORING_SETUP_CQE_MIXED) + IORING_SETUP_CQE_MIXED |\ + IORING_SETUP_SQE_MIXED) #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ IORING_ENTER_SQ_WAKEUP |\ @@ -565,17 +566,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) io_req_task_work_add(req); } -/* - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each - * slot. - */ -static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) -{ - if (ctx->flags & IORING_SETUP_SQE128) - return 2 * sizeof(struct io_uring_sqe); - return sizeof(struct io_uring_sqe); -} - static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 932319633eac..df52d760240e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_pipe_prep, .issue = io_pipe, }, + [IORING_OP_NOP128] = { + .audit_skip = 1, + .iopoll = 1, + .is_128 = 1, + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_URING_CMD128] = { + .buffer_select = 1, + .needs_file = 1, + .plug = 1, + .iopoll = 1, + .iopoll_queue = 1, + .is_128 = 1, + .async_size = sizeof(struct io_async_cmd), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_PIPE] = { .name = "PIPE", }, + [IORING_OP_NOP128] = { + .name = "NOP128", + }, + [IORING_OP_URING_CMD128] = { + .name = "URING_CMD128", + .sqe_copy = io_uring_cmd_sqe_copy, + .cleanup = io_uring_cmd_cleanup, + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/opdef.h b/io_uring/opdef.h index c2f0907ed78c..aa37846880ff 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -27,6 +27,8 @@ struct io_issue_def { unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* set to 1 if this opcode uses 128b sqes in a mixed sq */ + unsigned is_128 : 1; /* size of async data needed, if any */ unsigned short async_size; diff --git a/io_uring/register.c b/io_uring/register.c index 43eb02004824..1a3e05be6e7b 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -394,7 +394,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ - IORING_SETUP_CQE_MIXED) + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 1225f8124e4b..9d67a2a721aa 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +/* + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each + * slot. 
+ */
+static inline size_t uring_sqe_size(struct io_kiocb *req)
+{
+	if (req->ctx->flags & IORING_SETUP_SQE128 ||
+	    req->opcode == IORING_OP_URING_CMD128)
+		return 2 * sizeof(struct io_uring_sqe);
+	return sizeof(struct io_uring_sqe);
+}
+
 void io_uring_cmd_sqe_copy(struct io_kiocb *req)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
 	/* Should not happen, as REQ_F_SQE_COPIED covers this */
 	if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
 		return;
-	memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+	memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
 	ioucmd->sqe = ac->sqes;
 }
 
@@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	if (ret)
 		return ret;
 
-	if (ctx->flags & IORING_SETUP_SQE128)
+	if (ctx->flags & IORING_SETUP_SQE128 ||
+	    req->opcode == IORING_OP_URING_CMD128)
 		issue_flags |= IO_URING_F_SQE128;
 	if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
 		issue_flags |= IO_URING_F_CQE32;

From dde92a5026d81df1a146e9c243d09b27d1bf04bf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 16 Oct 2025 12:20:31 +0100
Subject: [PATCH 12/68] io_uring: check for user passing 0 nr_submit

io_submit_sqes() shouldn't be stepping into its main loop when there is
nothing to submit, i.e. nr=0. Fix the zero-entries check so that it
comes after all truncation of the user-passed submission count.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index be44d636fe1f..93a1cc2bf383 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2453,10 +2453,11 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	unsigned int left;
 	int ret;
 
+	entries = min(nr, entries);
 	if (unlikely(!entries))
 		return 0;
-	/* make sure SQ entry isn't read before tail */
-	ret = left = min(nr, entries);
+
+	ret = left = entries;
 	io_get_task_refs(left);
 	io_submit_state_start(&ctx->submit_state, left);
 

From 0ecf0e6748120842700efc5dbf22a18580f7efcf Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 22 Oct 2025 13:56:07 -0700
Subject: [PATCH 13/68] io_uring/fdinfo: show SQEs for no array setup

When the IORING_SETUP_NO_SQARRAY option is used, sq_head directly
indicates the index in the submission queue, so use it rather than skip
showing the entries entirely.

Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 io_uring/fdinfo.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 1a806ad16840..a3ce92183540 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -96,8 +96,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		u8 opcode;
 
 		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
-			break;
-		sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
+			sq_idx = sq_head & sq_mask;
+		else
+			sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
+
 		if (sq_idx > sq_mask)
 			continue;
 

From 101e596e7404d07a85b38358a392009503aad797 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 27 Oct 2025 19:09:28 -0600
Subject: [PATCH 14/68] io_uring/fdinfo: cap SQ iteration at max SQ entries

A previous commit changed the logic around how SQ entries are iterated,
and introduced a few bugs as a result. One is that it fully trusts the
SQ head and tail, which are user exposed. Another is that it fails to
increment the SQ head if the SQ index is out of range.
Fix both of those up, reverting to the previous logic of how to iterate
SQ entries.

Link: https://lore.kernel.org/io-uring/68ffdf18.050a0220.3344a1.039e.GAE@google.com/
Fixes: 1cba30bf9fdd ("io_uring: add support for IORING_SETUP_SQE_MIXED")
Reported-by: syzbot+10a9b495f54a17b607a6@syzkaller.appspotmail.com
Tested-by: syzbot+10a9b495f54a17b607a6@syzkaller.appspotmail.com
Reviewed-by: Keith Busch
Signed-off-by: Jens Axboe
---
 io_uring/fdinfo.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index a3ce92183540..248006424cab 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -67,6 +67,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	unsigned int cq_head = READ_ONCE(r->cq.head);
 	unsigned int cq_tail = READ_ONCE(r->cq.tail);
 	unsigned int sq_shift = 0;
+	unsigned int sq_entries;
 	int sq_pid = -1, sq_cpu = -1;
 	u64 sq_total_time = 0, sq_work_time = 0;
 	unsigned int i;
@@ -89,17 +90,18 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "CqTail:\t%u\n", cq_tail);
 	seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
 	seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
-	while (sq_head < sq_tail) {
+	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+	for (i = 0; i < sq_entries; i++) {
+		unsigned int entry = i + sq_head;
 		struct io_uring_sqe *sqe;
 		unsigned int sq_idx;
 		bool sqe128 = false;
 		u8 opcode;
 
 		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
-			sq_idx = sq_head & sq_mask;
+			sq_idx = entry & sq_mask;
 		else
-			sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
-
+			sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
 
@@ -141,7 +143,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		}
 	}
 	seq_printf(m, "\n");
-		sq_head++;
 	}
 	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
 	while (cq_head < cq_tail) {

From 8cd5a59e4d512c6e1df47bf8ce60f7d16e4b3c18 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 30 Oct 2025 17:02:13 -0600
Subject: [PATCH 15/68] io_uring/fdinfo: validate opcode before checking if
 it's a 128b one

The mixed SQE support assumes that userspace always passes valid data;
that is not the case. Validate the opcode properly before indexing the
io_issue_defs[] array, and pass it through the nospec indexing as well,
since it's a user-provided value indexing a kernel array.

Fixes: 1cba30bf9fdd ("io_uring: add support for IORING_SETUP_SQE_MIXED")
Reported-by: syzbot+b883b008a0b1067d5833@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 io_uring/fdinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 248006424cab..ac6e7edc7027 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <linux/nospec.h>
 #include
 #include
 
@@ -107,6 +108,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 
 		sqe = &ctx->sq_sqes[sq_idx << sq_shift];
 		opcode = READ_ONCE(sqe->opcode);
+		if (opcode >= IORING_OP_LAST)
+			continue;
+		opcode = array_index_nospec(opcode, IORING_OP_LAST);
 		if (sq_shift) {
 			sqe128 = true;
 		} else if (io_issue_defs[opcode].is_128) {

From 4531d165ee39edb315b42a4a43e29339fa068e51 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Fri, 31 Oct 2025 14:34:28 -0600
Subject: [PATCH 16/68] io_uring: only call io_should_terminate_tw() once for
 ctx

io_fallback_req_func() calls io_should_terminate_tw() on each req's ctx.
But since the reqs all come from the ctx's fallback_llist, req->ctx will be ctx for all of the reqs. Therefore, compute ts.cancel as io_should_terminate_tw(ctx) just once, outside the loop. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93a1cc2bf383..4e6676ac4662 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,10 +289,9 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) { - ts.cancel = io_should_terminate_tw(req->ctx); + ts.cancel = io_should_terminate_tw(ctx); + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, ts); - } io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); From c33e779aba6804778c1440192a8033a145ba588d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 31 Oct 2025 14:34:29 -0600 Subject: [PATCH 17/68] io_uring: add wrapper type for io_req_tw_func_t arg In preparation for uring_cmd implementations to implement functions with the io_req_tw_func_t signature, introduce a wrapper struct io_tw_req to hide the struct io_kiocb * argument. The intention is for only the io_uring core to access the inner struct io_kiocb *. uring_cmd implementations should instead call a helper from io_uring/cmd.h to convert struct io_tw_req to struct io_uring_cmd *. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++++- io_uring/futex.c | 16 +++++++++------- io_uring/io_uring.c | 21 ++++++++++++--------- io_uring/io_uring.h | 4 ++-- io_uring/msg_ring.c | 3 ++- io_uring/notif.c | 5 +++-- io_uring/poll.c | 11 ++++++----- io_uring/poll.h | 2 +- io_uring/rw.c | 5 +++-- io_uring/rw.h | 2 +- io_uring/timeout.c | 18 +++++++++++------- io_uring/uring_cmd.c | 3 ++- io_uring/waitid.c | 7 ++++--- 13 files changed, 61 insertions(+), 42 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 25ee982eb435..f064a438ce43 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -615,7 +615,11 @@ enum { REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); +struct io_tw_req { + struct io_kiocb *req; +}; + +typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw); struct io_task_work { struct llist_node node; diff --git a/io_uring/futex.c b/io_uring/futex.c index 64f3bd51c84c..4e022c76236d 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -41,24 +41,26 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) +static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - hlist_del_init(&req->hash_node); - io_req_task_complete(req, tw); + hlist_del_init(&tw_req.req->hash_node); + io_req_task_complete(tw_req, tw); } -static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); io_cache_free(&ctx->futex_cache, req->async_data); io_req_async_data_clear(req, 0); - __io_futex_complete(req, tw); + 
__io_futex_complete(tw_req, tw); } -static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; @@ -73,7 +75,7 @@ static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) } io_req_async_data_free(req); - __io_futex_complete(req, tw); + __io_futex_complete(tw_req, tw); } static bool io_futexv_claim(struct io_futex *iof) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4e6676ac4662..01631b6ff442 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -291,7 +291,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) mutex_lock(&ctx->uring_lock); ts.cancel = io_should_terminate_tw(ctx); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, ts); + req->io_task_work.func((struct io_tw_req){req}, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -539,9 +539,9 @@ static void io_queue_iowq(struct io_kiocb *req) io_wq_enqueue(tctx->io_wq, &req->work); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { - io_queue_iowq(req); + io_queue_iowq(tw_req.req); } void io_req_queue_iowq(struct io_kiocb *req) @@ -1166,7 +1166,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + (struct io_tw_req){req}, ts); node = next; (*count)++; if (unlikely(need_resched())) { @@ -1389,7 +1389,7 @@ static int __io_run_local_work_loop(struct llist_node **node, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, tw); + (struct io_tw_req){req}, tw); *node = next; if (++ret >= events) break; @@ -1459,14 +1459,17 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, return ret; } -static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); @@ -1702,9 +1705,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) return 0; } -void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - io_req_complete_defer(req); + io_req_complete_defer(tw_req.req); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 44b8091c7fcd..f97356ce29d0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -149,9 +149,9 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); void io_req_task_queue(struct io_kiocb *req); -void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void 
io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 5e5b94236d72..7063ea7964e7 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -70,8 +70,9 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); diff --git a/io_uring/notif.c b/io_uring/notif.c index d8ba1165c949..9960bb2a32d5 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -11,8 +11,9 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) +static void io_notif_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *notif = tw_req.req; struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -34,7 +35,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) } nd = nd->next; - io_req_task_complete(notif, tw); + io_req_task_complete((struct io_tw_req){notif}, tw); } while (nd); } diff --git a/io_uring/poll.c b/io_uring/poll.c index c403e751841a..8aa4e3a31e73 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -310,8 +310,9 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) +void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; int ret; ret = io_poll_check_events(req, tw); @@ -332,7 +333,7 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, tw); + io_req_task_submit(tw_req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -340,14 +341,14 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } else { io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, tw); + io_req_task_submit(tw_req, tw); else io_req_defer_failed(req, ret); } diff --git a/io_uring/poll.h b/io_uring/poll.h index c8438286dfa0..5647c5138932 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -46,4 +46,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); +void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw); diff --git a/io_uring/rw.c b/io_uring/rw.c index 5b2241a5813c..828ac4f902b4 100644 --- a/io_uring/rw.c +++ 
b/io_uring/rw.c @@ -564,8 +564,9 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) +void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -581,7 +582,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/rw.h b/io_uring/rw.h index 129a53fe5482..9bd7fbf70ea9 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -46,7 +46,7 @@ int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); +void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 444142ba9d04..d8fbbaf31cf3 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -68,8 +68,9 @@ static inline bool io_timeout_finish(struct io_timeout *timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; struct io_ring_ctx *ctx = req->ctx; @@ -85,7 +86,7 @@ static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) } } - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) @@ -157,8 +158,10 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) +static void io_req_tw_fail_links(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *link = tw_req.req; + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; @@ -168,7 +171,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, tw); + io_req_task_complete((struct io_tw_req){link}, tw); link = nxt; } } @@ -317,8 +320,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret; @@ -335,11 +339,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); 
io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9d67a2a721aa..c09b99e91c86 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -113,8 +113,9 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) +static void io_uring_cmd_work(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index c5e0d979903a..62f7f1f004a5 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); +static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -194,8 +194,9 @@ static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) return true; } -static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) +static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; @@ -229,7 +230,7 @@ static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) } io_waitid_complete(req, ret); - io_req_task_complete(req, tw); + io_req_task_complete(tw_req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, From 20fb3d05a34b55c8ec28ec3d3555e70c5bc0c72d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 31 Oct 2025 14:34:30 -0600 Subject: [PATCH 18/68] io_uring/uring_cmd: avoid double indirect call in task work dispatch io_uring task work dispatch makes an indirect call to struct io_kiocb's io_task_work.func field to allow running arbitrary task work functions. In the uring_cmd case, this calls io_uring_cmd_work(), which immediately makes another indirect call to struct io_uring_cmd's task_work_cb field. Change the uring_cmd task work callbacks to functions whose signatures match io_req_tw_func_t. Add a function io_uring_cmd_from_tw() to convert from the task work's struct io_tw_req argument to struct io_uring_cmd *. Define a constant IO_URING_CMD_TASK_WORK_ISSUE_FLAGS to avoid manufacturing issue_flags in the uring_cmd task work callbacks. Now uring_cmd task work dispatch makes a single indirect call to the uring_cmd implementation's callback. This also allows removing the task_work_cb field from struct io_uring_cmd, freeing up 8 bytes for future storage. Since fuse_uring_send_in_task() now has access to the io_tw_token_t, check its cancel field directly instead of relying on the IO_URING_F_TASK_DEAD issue flag. 
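
For illustration, a standalone sketch of the two dispatch shapes (types
are simplified stand-ins; the real signatures are io_req_tw_func_t and
the uring_cmd callbacks):

	#include <stdio.h>

	struct req;
	typedef void (*tw_func_t)(struct req *r);

	struct req {
		tw_func_t tw_func;		/* io_task_work.func */
		void (*cmd_cb)(struct req *r);	/* the old task_work_cb field */
	};

	/* before: core -> trampoline -> driver, two indirect calls */
	static void driver_cb(struct req *r)
	{
		printf("driver completion\n");
	}

	static void uring_cmd_trampoline(struct req *r)
	{
		r->cmd_cb(r);	/* second indirect call */
	}

	/*
	 * after: the driver callback itself has the tw signature, so the
	 * core's single indirect call lands directly in the driver; the
	 * real code recovers the command via io_uring_cmd_from_tw()
	 */
	static void driver_cb_tw(struct req *r)
	{
		printf("driver completion\n");
	}

	int main(void)
	{
		struct req before = { .tw_func = uring_cmd_trampoline,
				      .cmd_cb = driver_cb };
		struct req after = { .tw_func = driver_cb_tw };

		before.tw_func(&before);	/* two indirect calls */
		after.tw_func(&after);		/* one indirect call */
		return 0;
	}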
Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 6 ++++-- drivers/block/ublk_drv.c | 22 +++++++++++----------- drivers/nvme/host/ioctl.c | 7 ++++--- fs/btrfs/ioctl.c | 5 +++-- fs/fuse/dev_uring.c | 7 ++++--- include/linux/io_uring/cmd.h | 22 +++++++++++++--------- include/linux/io_uring_types.h | 1 - io_uring/uring_cmd.c | 18 ++---------------- 8 files changed, 41 insertions(+), 47 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index d7489a56b33c..4ed17c5a4acc 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -769,14 +769,16 @@ struct blk_iou_cmd { bool nowait; }; -static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else - io_uring_cmd_done(cmd, bic->res, issue_flags); + io_uring_cmd_done(cmd, bic->res, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static void bio_cmd_bio_end_io(struct bio *bio) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0c74a41a6753..e0c601128efa 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1302,10 +1302,9 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, return true; } -static void ublk_dispatch_req(struct ublk_queue *ubq, - struct request *req, - unsigned int issue_flags) +static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; @@ -1348,13 +1347,13 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } -static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - ublk_dispatch_req(ubq, pdu->req, issue_flags); + ublk_dispatch_req(ubq, pdu->req); } static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) @@ -1366,9 +1365,9 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); } -static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct request *rq = pdu->req_list; struct request *next; @@ -1376,7 +1375,7 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, do { next = rq->rq_next; rq->rq_next = NULL; - ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags); + ublk_dispatch_req(rq->mq_hctx->driver_data, rq); rq = next; } while (rq); } @@ -2523,9 +2522,10 @@ fail_put: return NULL; } -static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); if (ret != -EIOCBQUEUED) 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index c212fa952c0f..4fa8400a5627 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -398,14 +398,15 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu); } -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, - unsigned issue_flags) +static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) blk_rq_unmap_user(pdu->bio); - io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8cb7d5a462ef..3171d9df0246 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4649,8 +4649,9 @@ struct io_btrfs_cmd { struct btrfs_uring_priv *priv; }; -static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); @@ -4695,7 +4696,7 @@ out: btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - io_uring_cmd_done(cmd, ret, issue_flags); + io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f6b12aebb8bb..f8c93dc45768 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1209,14 +1209,15 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, * User buffers are not mapped yet - the application does not have permission * to write to it - this has to be executed in ring task context. 
*/ -static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue = ent->queue; int err; - if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + if (!tw.cancel) { err = fuse_uring_prepare_send(ent, ent->fuse_req); if (err) { fuse_uring_next_fuse_req(ent, queue, issue_flags); diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 7509025b4071..375fd048c4cb 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -11,17 +11,13 @@ /* io_uring_cmd is being issued again */ #define IORING_URING_CMD_REISSUE (1U << 31) -typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, - unsigned issue_flags); - struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; - /* callback to defer completions to task context */ - io_uring_cmd_tw_t task_work_cb; u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ + u8 unused[8]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) @@ -60,7 +56,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, unsigned issue_flags, bool is_cqe32); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, + io_req_tw_func_t task_work_cb, unsigned flags); /* @@ -109,7 +105,7 @@ static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, unsigned flags) + io_req_tw_func_t task_work_cb, unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, @@ -132,15 +128,23 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, } #endif +static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) +{ + return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd); +} + +/* task_work executor checks the deferred list completion */ +#define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f064a438ce43..92780764d5fa 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -39,7 +39,6 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), - IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index c09b99e91c86..197474911f04 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -113,21 +113,8 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_tw_req tw_req, 
io_tw_token_t tw) -{ - struct io_kiocb *req = tw_req.req; - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned int flags = IO_URING_F_COMPLETE_DEFER; - - if (unlikely(tw.cancel)) - flags |= IO_URING_F_TASK_DEAD; - - /* task_work executor checks the deffered list completion */ - ioucmd->task_work_cb(ioucmd, flags); -} - void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, + io_req_tw_func_t task_work_cb, unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -135,8 +122,7 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) return; - ioucmd->task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; + req->io_task_work.func = task_work_cb; __io_req_task_work_add(req, flags); } EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); From adb395c457a6a202240ebbb3255bf41b19d08a0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 10:51:39 -0700 Subject: [PATCH 19/68] io_uring/slist: remove unused wq list splice helpers Nobody is using those helpers anymore, get rid of them. Signed-off-by: Jens Axboe --- io_uring/slist.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/io_uring/slist.h b/io_uring/slist.h index 0eb194817242..7ef747442754 100644 --- a/io_uring/slist.h +++ b/io_uring/slist.h @@ -67,24 +67,6 @@ static inline void wq_list_cut(struct io_wq_work_list *list, last->next = NULL; } -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - static inline void wq_stack_add_head(struct io_wq_work_node *node, struct io_wq_work_node *stack) { From 3615e3f7947a3c1cb15d362da921ac46d771e02c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 11:02:54 -0700 Subject: [PATCH 20/68] io_uring/rsrc: use get/put_user() for integer copy It's just getting an integer from userspace, installing a file, then copying the output direct descriptor back. No need to use the full copy_to/from_user() for that. Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d787c16dc1c3..4cc38eb56758 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -454,7 +454,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, return -ENXIO; for (done = 0; done < up->nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + if (get_user(fd, &fds[done])) { ret = -EFAULT; break; } @@ -468,7 +468,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + if (put_user(ret, &fds[done])) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; From bc82b02218204d89f26fd1fde5aed265f40453d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 11:11:27 -0700 Subject: [PATCH 21/68] io_uring/memmap: remove dead io_create_region_mmap_safe() declaration No longer used and doesn't even exist, kill it from the memmap header file. 
Signed-off-by: Jens Axboe --- io_uring/memmap.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 58002976e0c3..f9e94458c01f 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -21,11 +21,6 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset); -int io_create_region_mmap_safe(struct io_ring_ctx *ctx, - struct io_mapped_region *mr, - struct io_uring_region_desc *reg, - unsigned long mmap_offset); - static inline void *io_region_get_ptr(struct io_mapped_region *mr) { return mr->ptr; From 0d677936d67774f1b4ebfb3b26f207320f0fe3c6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 11:21:39 -0700 Subject: [PATCH 22/68] io_uring/cancel: move request/task cancelation logic into cancel.c Move io_match_task_safe() and helpers into cancel.c, where it belongs. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 38 ++++++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 2 ++ io_uring/io_uring.c | 38 -------------------------------------- io_uring/io_uring.h | 3 --- 4 files changed, 40 insertions(+), 41 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 64b51e82baa2..2754ea80e288 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -384,3 +384,41 @@ int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, io_ring_submit_unlock(ctx, issue_flags); return nr ?: -ENOENT; } + +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* + * As io_match_task() but protected against racing with linked timeouts. + * User must not hold timeout_lock. + */ +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, + bool cancel_all) +{ + bool matched; + + if (tctx && head->tctx != tctx) + return false; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + raw_spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + raw_spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 43e9bb74e9d1..6d5208e9d7a6 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -23,6 +23,8 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, + bool cancel_all); bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, struct hlist_head *list, bool cancel_all, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 01631b6ff442..75bd049a1efd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -207,44 +207,6 @@ static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. 
- */ -bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, - bool cancel_all) -{ - bool matched; - - if (tctx && head->tctx != tctx) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - raw_spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - raw_spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f97356ce29d0..2f4d43e69648 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -174,9 +174,6 @@ void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); -bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, - bool cancel_all); - void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) From 01e019b2a30df41c485f602a5246124ea911071b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 11:23:23 -0700 Subject: [PATCH 23/68] io_uring/cancel: move __io_uring_cancel() into cancel.c Yet another function that should be in cancel.c, move it over. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 6 ++++++ io_uring/io_uring.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 2754ea80e288..3ba82a1bfe80 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -422,3 +422,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, } return matched; } + +void __io_uring_cancel(bool cancel_all) +{ + io_uring_unreg_ringfd(); + io_uring_cancel_generic(cancel_all, NULL); +} diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75bd049a1efd..b3be305b99be 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3336,12 +3336,6 @@ end_wait: } } -void __io_uring_cancel(bool cancel_all) -{ - io_uring_unreg_ringfd(); - io_uring_cancel_generic(cancel_all, NULL); -} - static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { From ffce324364318220acf83e576eac06549cbf9911 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 12:39:05 -0700 Subject: [PATCH 24/68] io_uring/cancel: move cancelation code from io_uring.c to cancel.c There's a bunch of code strictly dealing with cancelations, and that code really belongs in cancel.c rather than in the core io_uring.c file. Move the code there. Mostly mechanical, only real oddity here is that struct io_defer_entry now needs to be visible across both io_uring.c and cancel.c. 
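For reference, the structure that now needs cross-file visibility is
tiny; as the hunks below show, it simply moves from io_uring.c into
io_uring.h:

    struct io_defer_entry {
            struct list_head list;
            struct io_kiocb *req;
    };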
Signed-off-by: Jens Axboe --- io_uring/cancel.c | 226 ++++++++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 6 +- io_uring/io_uring.c | 244 +------------------------------------------- io_uring/io_uring.h | 10 +- io_uring/sqpoll.c | 1 + 5 files changed, 245 insertions(+), 242 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 3ba82a1bfe80..ca12ac10c0ae 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -14,6 +14,8 @@ #include "filetable.h" #include "io_uring.h" #include "tctx.h" +#include "sqpoll.h" +#include "uring_cmd.h" #include "poll.h" #include "timeout.h" #include "waitid.h" @@ -428,3 +430,227 @@ void __io_uring_cancel(bool cancel_all) io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } + +struct io_task_cancel { + struct io_uring_task *tctx; + bool all; +}; + +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_task_cancel *cancel = data; + + return io_match_task_safe(req, cancel->tctx, cancel->all); +} + +static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all) +{ + struct io_defer_entry *de; + LIST_HEAD(list); + + list_for_each_entry_reverse(de, &ctx->defer_list, list) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { + list_cut_position(&list, &ctx->defer_list, &de->list); + break; + } + } + if (list_empty(&list)) + return false; + + while (!list_empty(&list)) { + de = list_first_entry(&list, struct io_defer_entry, list); + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); + io_req_task_queue_fail(de->req, -ECANCELED); + kfree(de); + } + return true; +} + +__cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + +static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +{ + struct io_tctx_node *node; + enum io_wq_cancel cret; + bool ret = false; + + mutex_lock(&ctx->uring_lock); + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + /* + * io_wq will stay alive while we hold uring_lock, because it's + * killed after ctx nodes, which requires to take the lock. + */ + if (!tctx || !tctx->io_wq) + continue; + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +__cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all, bool is_sqpoll_thread) +{ + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; + enum io_wq_cancel cret; + bool ret = false; + + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return false; + + if (!tctx) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. 
+ */ + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + + /* SQPOLL thread does its own polling */ + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || + is_sqpoll_thread) { + while (!wq_list_empty(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + cond_resched(); + } + } + + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; + mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); + mutex_unlock(&ctx->uring_lock); + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + if (tctx) + ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); + return ret; +} + +static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) +{ + if (tracked) + return atomic_read(&tctx->inflight_tracked); + return percpu_counter_sum(&tctx->inflight); +} + +/* + * Find any io_uring ctx that this task has registered or done IO on, and cancel + * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. + */ +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; + s64 inflight; + DEFINE_WAIT(wait); + + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); + + if (!current->io_uring) + return; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + + atomic_inc(&tctx->in_cancel); + do { + bool loop = false; + + io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + + /* read completions before cancelations */ + inflight = tctx_inflight(tctx, false); + if (!inflight) + break; + + if (!sqd) { + xa_for_each(&tctx->xa, index, node) { + /* sqpoll task will cancel all its requests */ + if (node->ctx->sq_data) + continue; + loop |= io_uring_try_cancel_requests(node->ctx, + current->io_uring, + cancel_all, + false); + } + } else { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + loop |= io_uring_try_cancel_requests(ctx, + current->io_uring, + cancel_all, + true); + } + + if (loop) { + cond_resched(); + continue; + } + + prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); + io_run_task_work(); + io_uring_drop_tctx_refs(current); + xa_for_each(&tctx->xa, index, node) { + if (io_local_work_pending(node->ctx)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } + /* + * If we've seen completions, retry without waiting. This + * avoids a race where a completion comes in before we did + * prepare_to_wait(). + */ + if (inflight == tctx_inflight(tctx, !cancel_all)) + schedule(); +end_wait: + finish_wait(&tctx->wait, &wait); + } while (1); + + io_uring_clean_tctx(tctx); + if (cancel_all) { + /* + * We shouldn't run task_works after cancel, so just leave + * ->in_cancel set for normal exit. 
+ */ + atomic_dec(&tctx->in_cancel); + /* for exec all current's requests should be gone, kill tctx */ + __io_uring_free(current); + } +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 6d5208e9d7a6..6783961ede1b 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -29,10 +29,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, struct hlist_head *list, bool cancel_all, bool (*cancel)(struct io_kiocb *)); - int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags, struct hlist_head *list, bool (*cancel)(struct io_kiocb *)); +__cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all, bool is_sqpoll_thread); +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); +__cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b3be305b99be..3f0489261d11 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -124,11 +124,6 @@ #define IO_REQ_ALLOC_BATCH 8 #define IO_LOCAL_TW_DEFAULT_MAX 20 -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; -}; - /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) @@ -140,11 +135,6 @@ struct io_defer_entry { /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) -static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all, - bool is_sqpoll_thread); - static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); static void __io_req_caches_free(struct io_ring_ctx *ctx); @@ -512,7 +502,7 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static unsigned io_linked_nr(struct io_kiocb *req) +unsigned io_linked_nr(struct io_kiocb *req) { struct io_kiocb *tmp; unsigned nr = 0; @@ -681,7 +671,7 @@ void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +__cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -1409,8 +1399,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } -static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, - int max_events) +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) { struct io_tw_state ts = {}; int ret; @@ -1564,7 +1553,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) * We can't just wait for polled events to come to us, we have to actively * find and complete them. 
*/ -static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +__cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; @@ -2978,13 +2967,6 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } -static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); @@ -3118,224 +3100,6 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -struct io_task_cancel { - struct io_uring_task *tctx; - bool all; -}; - -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_task_cancel *cancel = data; - - return io_match_task_safe(req, cancel->tctx, cancel->all); -} - -static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all) -{ - struct io_defer_entry *de; - LIST_HEAD(list); - - list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, tctx, cancel_all)) { - list_cut_position(&list, &ctx->defer_list, &de->list); - break; - } - } - if (list_empty(&list)) - return false; - - while (!list_empty(&list)) { - de = list_first_entry(&list, struct io_defer_entry, list); - list_del_init(&de->list); - ctx->nr_drained -= io_linked_nr(de->req); - io_req_task_queue_fail(de->req, -ECANCELED); - kfree(de); - } - return true; -} - -static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) -{ - struct io_tctx_node *node; - enum io_wq_cancel cret; - bool ret = false; - - mutex_lock(&ctx->uring_lock); - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - /* - * io_wq will stay alive while we hold uring_lock, because it's - * killed after ctx nodes, which requires to take the lock. - */ - if (!tctx || !tctx->io_wq) - continue; - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - mutex_unlock(&ctx->uring_lock); - - return ret; -} - -static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all, - bool is_sqpoll_thread) -{ - struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; - enum io_wq_cancel cret; - bool ret = false; - - /* set it so io_req_local_work_add() would wake us up */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); - } - - /* failed during ring init, it couldn't have issued any requests */ - if (!ctx->rings) - return false; - - if (!tctx) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. 
- */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - is_sqpoll_thread) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - cond_resched(); - } - } - - if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - mutex_lock(&ctx->uring_lock); - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); - ret |= io_poll_remove_all(ctx, tctx, cancel_all); - ret |= io_waitid_remove_all(ctx, tctx, cancel_all); - ret |= io_futex_remove_all(ctx, tctx, cancel_all); - ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); - mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, tctx, cancel_all); - if (tctx) - ret |= io_run_task_work() > 0; - else - ret |= flush_delayed_work(&ctx->fallback_work); - return ret; -} - -static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) -{ - if (tracked) - return atomic_read(&tctx->inflight_tracked); - return percpu_counter_sum(&tctx->inflight); -} - -/* - * Find any io_uring ctx that this task has registered or done IO on, and cancel - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. - */ -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_ring_ctx *ctx; - struct io_tctx_node *node; - unsigned long index; - s64 inflight; - DEFINE_WAIT(wait); - - WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); - - if (!current->io_uring) - return; - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); - - atomic_inc(&tctx->in_cancel); - do { - bool loop = false; - - io_uring_drop_tctx_refs(current); - if (!tctx_inflight(tctx, !cancel_all)) - break; - - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, false); - if (!inflight) - break; - - if (!sqd) { - xa_for_each(&tctx->xa, index, node) { - /* sqpoll task will cancel all its requests */ - if (node->ctx->sq_data) - continue; - loop |= io_uring_try_cancel_requests(node->ctx, - current->io_uring, - cancel_all, - false); - } - } else { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - loop |= io_uring_try_cancel_requests(ctx, - current->io_uring, - cancel_all, - true); - } - - if (loop) { - cond_resched(); - continue; - } - - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); - io_run_task_work(); - io_uring_drop_tctx_refs(current); - xa_for_each(&tctx->xa, index, node) { - if (io_local_work_pending(node->ctx)) { - WARN_ON_ONCE(node->ctx->submitter_task && - node->ctx->submitter_task != current); - goto end_wait; - } - } - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, !cancel_all)) - schedule(); -end_wait: - finish_wait(&tctx->wait, &wait); - } while (1); - - io_uring_clean_tctx(tctx); - if (cancel_all) { - /* - * We shouldn't run task_works after cancel, so just leave - * ->in_cancel set for normal exit. 
- */ - atomic_dec(&tctx->in_cancel); - /* for exec all current's requests should be gone, kill tctx */ - __io_uring_free(current); - } -} - static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2f4d43e69648..23c268ab1c8f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -96,6 +96,11 @@ enum { IOU_REQUEUE = -3072, }; +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; +}; + struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; @@ -134,6 +139,7 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, int io_uring_fill_params(unsigned entries, struct io_uring_params *p); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -141,6 +147,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +unsigned io_linked_nr(struct io_kiocb *req); void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, @@ -155,7 +162,7 @@ void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); +__cold void io_uring_drop_tctx_refs(struct task_struct *task); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); @@ -164,6 +171,7 @@ void io_req_queue_iowq(struct io_kiocb *req); int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +__cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx); void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index a3f11349ce06..e82997d26ebb 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -18,6 +18,7 @@ #include "io_uring.h" #include "tctx.h" #include "napi.h" +#include "cancel.h" #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 From 4b25b75c30d90a2ad45eb6c79d4c71fdbb06bb4e Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 15 Oct 2025 11:25:54 -0600 Subject: [PATCH 25/68] io_uring/memmap: return bool from io_mem_alloc_compound() io_mem_alloc_compound() returns either ERR_PTR(-ENOMEM) or a virtual address for the allocated memory, but its caller just checks whether the result is an error. Return a bool success value instead. 
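Condensed from the diff below, the caller-side simplification this
buys:

    /* before: ERR_PTR-style return, but only success/failure is used */
    p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
    if (!IS_ERR(p)) {
            mr->flags |= IO_REGION_F_SINGLE_REF;
            goto done;
    }

    /* after: a plain bool */
    if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) {
            mr->flags |= IO_REGION_F_SINGLE_REF;
            goto done;
    }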
Signed-off-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 io_uring/memmap.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index aa388ecd4754..67b7b17ece31 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -15,26 +15,26 @@
 #include "rsrc.h"
 #include "zcrx.h"

-static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
-				   size_t size, gfp_t gfp)
+static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
+				  size_t size, gfp_t gfp)
 {
 	struct page *page;
 	int i, order;

 	order = get_order(size);
 	if (order > MAX_PAGE_ORDER)
-		return ERR_PTR(-ENOMEM);
+		return false;
 	else if (order)
 		gfp |= __GFP_COMP;

 	page = alloc_pages(gfp, order);
 	if (!page)
-		return ERR_PTR(-ENOMEM);
+		return false;

 	for (i = 0; i < nr_pages; i++)
 		pages[i] = page + i;

-	return page_address(page);
+	return true;
 }

 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
@@ -159,14 +159,12 @@ static int io_region_allocate_pages(struct io_ring_ctx *ctx,
 	size_t size = (size_t) mr->nr_pages << PAGE_SHIFT;
 	unsigned long nr_allocated;
 	struct page **pages;
-	void *p;

 	pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
 	if (!pages)
 		return -ENOMEM;

-	p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
-	if (!IS_ERR(p)) {
+	if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) {
 		mr->flags |= IO_REGION_F_SINGLE_REF;
 		goto done;
 	}

From 59f44afbe8cfe7904e8cf8d2bb67eb86b79e58da Mon Sep 17 00:00:00 2001
From: Alok Tiwari
Date: Tue, 4 Nov 2025 21:01:09 -0800
Subject: [PATCH 26/68] io_uring: fix typos and comment wording

Corrected spelling mistakes in comments ("reuqests" -> "requests",
"noifications" -> "notifications", "seperately" -> "separately").
Fixed a small grammar issue ("then" -> "than"). Updated "flag" ->
"flags" in fdinfo.c.

Signed-off-by: Alok Tiwari
Signed-off-by: Jens Axboe
---
 io_uring/fdinfo.c   | 2 +-
 io_uring/io_uring.c | 4 ++--
 io_uring/notif.c    | 2 +-
 io_uring/rw.c       | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ac6e7edc7027..7bc985bcc56d 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -156,7 +156,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		cqe = &r->cqes[(cq_head & cq_mask)];
 		if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32)
 			cqe32 = true;
-		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
+		seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x",
 			   cq_head & cq_mask, cqe->user_data, cqe->res,
 			   cqe->flags);
 		if (cqe32)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3f0489261d11..2d49b48568ab 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -882,7 +882,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 }

 /*
- * Must be called from inline task_work so we now a flush will happen later,
+ * Must be called from inline task_work so we know a flush will happen later,
  * and obviously with ctx->uring_lock held (tw always has that).
  */
 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
@@ -1209,7 +1209,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 	BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);

 	/*
-	 * We don't know how many reuqests is there in the link and whether
+	 * We don't know how many requests there are in the link and whether
 	 * they can even be queued lazily, fall back to non-lazy.
*/ if (req->flags & IO_REQ_LINK_FLAGS) diff --git a/io_uring/notif.c b/io_uring/notif.c index 9960bb2a32d5..f476775ba44b 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -93,7 +93,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); prev_notif = cmd_to_io_kiocb(prev_nd); - /* make sure all noifications can be finished in the same task_work */ + /* make sure all notifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || notif->tctx != prev_notif->tctx)) return -EEXIST; diff --git a/io_uring/rw.c b/io_uring/rw.c index 828ac4f902b4..a943f879bccd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -186,7 +186,7 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) * This is really a bug in the core code that does this, any issue * path should assume that a successful (or -EIOCBQUEUED) return can * mean that the underlying data can be gone at any time. But that - * should be fixed seperately, and then this check could be killed. + * should be fixed separately, and then this check could be killed. */ if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; @@ -349,7 +349,7 @@ static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* * Have to do this validation here, as this is in io_read() rw->len - * might have chanaged due to buffer selection + * might have changed due to buffer selection */ return io_iov_buffer_select_prep(req); } @@ -1020,7 +1020,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: - /* it's faster to check here then delegate to kfree */ + /* it's faster to check here than delegate to kfree */ return ret; } From 88559f8b2a25a0293d7679907ffe3a58151662ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jun 2025 08:58:39 -0600 Subject: [PATCH 27/68] io_uring/futex: move futexv async data handling to struct io_futexv_data Rather than alloc an array of struct futex_vector for the futexv wait handling, wrap it in a struct io_futexv_data struct, similar to what the non-vectored futex wait handling does. No functional changes in this patch. 
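The wrapper is a flexible-array container sized and allocated in one
shot; condensed from the diff below:

    struct io_futexv_data {
            struct futex_vector futexv[];
    };

    /* one allocation covers the header plus futex_nr vector entries */
    ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv,
                                iof->futex_nr), GFP_KERNEL);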
Signed-off-by: Jens Axboe --- io_uring/futex.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 4e022c76236d..bb3ae3e9c956 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -28,6 +28,10 @@ struct io_futex_data { struct io_kiocb *req; }; +struct io_futexv_data { + struct futex_vector futexv[]; +}; + #define IO_FUTEX_ALLOC_CACHE_MAX 32 bool io_futex_cache_init(struct io_ring_ctx *ctx) @@ -62,14 +66,14 @@ static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_kiocb *req = tw_req.req; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv = req->async_data; + struct io_futexv_data *ifd = req->async_data; io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; - res = futex_unqueue_multiple(futexv, iof->futex_nr); + res = futex_unqueue_multiple(ifd->futexv, iof->futex_nr); if (res != -1) io_req_set_res(req, res, 0); } @@ -169,7 +173,7 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv; + struct io_futexv_data *ifd; int ret; /* No flags or mask supported for waitv */ @@ -182,14 +186,15 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) return -EINVAL; - futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); - if (!futexv) + ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr), + GFP_KERNEL); + if (!ifd) return -ENOMEM; - ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, + ret = futex_parse_waitv(ifd->futexv, iof->uaddr, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { - kfree(futexv); + kfree(ifd); return ret; } @@ -198,7 +203,7 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; - req->async_data = futexv; + req->async_data = ifd; return 0; } @@ -218,13 +223,13 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv = req->async_data; + struct io_futexv_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret, woken = -1; io_ring_submit_lock(ctx, issue_flags); - ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); + ret = futex_wait_multiple_setup(ifd->futexv, iof->futex_nr, &woken); /* * Error case, ret is < 0. Mark the request as failed. From 92469795363454aee7b79bd26650a44c3669b9c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jun 2025 09:02:08 -0600 Subject: [PATCH 28/68] io_uring/futex: move futexv owned status to struct io_futexv_data Free up a bit of space in the shared futex opcode private data, by moving the futexv specific futexv_owned out of there and into the struct specific to vectored futexes. 
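Both the wake and cancel paths race to claim the request, so the bit
now sits next to the vector it guards; condensed from the diff below
(the plain test_bit() is a cheap pre-check before the atomic claim):

    static bool io_futexv_claim(struct io_futexv_data *ifd)
    {
            if (test_bit(0, &ifd->owned) ||
                test_and_set_bit_lock(0, &ifd->owned))
                    return false;
            return true;
    }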
Signed-off-by: Jens Axboe
---
 io_uring/futex.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/io_uring/futex.c b/io_uring/futex.c
index bb3ae3e9c956..11bfff5a80df 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -17,7 +17,6 @@ struct io_futex {
 	void __user *uaddr;
 	unsigned long futex_val;
 	unsigned long futex_mask;
-	unsigned long futexv_owned;
 	u32 futex_flags;
 	unsigned int futex_nr;
 	bool futexv_unqueued;
@@ -29,6 +28,7 @@ struct io_futex_data {
 };

 struct io_futexv_data {
+	unsigned long owned;
 	struct futex_vector futexv[];
 };

@@ -82,10 +82,9 @@ static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw)
 	__io_futex_complete(tw_req, tw);
 }

-static bool io_futexv_claim(struct io_futex *iof)
+static bool io_futexv_claim(struct io_futexv_data *ifd)
 {
-	if (test_bit(0, &iof->futexv_owned) ||
-	    test_and_set_bit_lock(0, &iof->futexv_owned))
+	if (test_bit(0, &ifd->owned) || test_and_set_bit_lock(0, &ifd->owned))
 		return false;
 	return true;
 }
@@ -100,9 +99,9 @@ static bool __io_futex_cancel(struct io_kiocb *req)
 			return false;
 		req->io_task_work.func = io_futex_complete;
 	} else {
-		struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+		struct io_futexv_data *ifd = req->async_data;

-		if (!io_futexv_claim(iof))
+		if (!io_futexv_claim(ifd))
 			return false;
 		req->io_task_work.func = io_futexv_complete;
 	}
@@ -158,9 +157,9 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
 {
 	struct io_kiocb *req = q->wake_data;
-	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+	struct io_futexv_data *ifd = req->async_data;

-	if (!io_futexv_claim(iof))
+	if (!io_futexv_claim(ifd))
 		return;
 	if (unlikely(!__futex_wake_mark(q)))
 		return;
@@ -200,7 +199,6 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	/* Mark as inflight, so file exit cancelation will find it */
 	io_req_track_inflight(req);

-	iof->futexv_owned = 0;
 	iof->futexv_unqueued = 0;
 	req->flags |= REQ_F_ASYNC_DATA;
 	req->async_data = ifd;

From a5af56a9020c0dd27bc6ab2b58d1820b01621612 Mon Sep 17 00:00:00 2001
From: David Wei
Date: Tue, 4 Nov 2025 14:44:52 -0800
Subject: [PATCH 29/68] io_uring/memmap: remove unneeded io_ring_ctx arg

Remove the unused io_ring_ctx arg from io_region_pin_pages() and
io_region_allocate_pages().
Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/memmap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 67b7b17ece31..11be347a49d7 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -131,9 +131,8 @@ static int io_region_init_ptr(struct io_mapped_region *mr) return 0; } -static int io_region_pin_pages(struct io_ring_ctx *ctx, - struct io_mapped_region *mr, - struct io_uring_region_desc *reg) +static int io_region_pin_pages(struct io_mapped_region *mr, + struct io_uring_region_desc *reg) { unsigned long size = mr->nr_pages << PAGE_SHIFT; struct page **pages; @@ -150,8 +149,7 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, return 0; } -static int io_region_allocate_pages(struct io_ring_ctx *ctx, - struct io_mapped_region *mr, +static int io_region_allocate_pages(struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) { @@ -217,9 +215,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, mr->nr_pages = nr_pages; if (reg->flags & IORING_MEM_REGION_TYPE_USER) - ret = io_region_pin_pages(ctx, mr, reg); + ret = io_region_pin_pages(mr, reg); else - ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); + ret = io_region_allocate_pages(mr, reg, mmap_offset); if (ret) goto out_free; From 1fa7a34131110e3c41a13b19127da132dea32dcd Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Nov 2025 14:44:53 -0800 Subject: [PATCH 30/68] io_uring/memmap: refactor io_free_region() to take user_struct param Refactor io_free_region() to take user_struct directly, instead of accessing it from the ring ctx. Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++--- io_uring/kbuf.c | 4 ++-- io_uring/memmap.c | 8 ++++---- io_uring/memmap.h | 2 +- io_uring/register.c | 6 +++--- io_uring/zcrx.c | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2d49b48568ab..85081d7453b7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2751,8 +2751,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, static void io_rings_free(struct io_ring_ctx *ctx) { - io_free_region(ctx, &ctx->sq_region); - io_free_region(ctx, &ctx->ring_region); + io_free_region(ctx->user, &ctx->sq_region); + io_free_region(ctx->user, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -2837,7 +2837,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); io_destroy_buffers(ctx); - io_free_region(ctx, &ctx->param_region); + io_free_region(ctx->user, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e271b44ff73e..f1d644189068 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -417,7 +417,7 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { if (bl->flags & IOBL_BUF_RING) - io_free_region(ctx, &bl->region); + io_free_region(ctx->user, &bl->region); else io_remove_buffers_legacy(ctx, bl, -1U); @@ -661,7 +661,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_buffer_add_list(ctx, bl, reg.bgid); return 0; fail: - io_free_region(ctx, &bl->region); + io_free_region(ctx->user, &bl->region); kfree(bl); return ret; } diff 
--git a/io_uring/memmap.c b/io_uring/memmap.c index 11be347a49d7..24da17a5f08f 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -88,7 +88,7 @@ enum { IO_REGION_F_SINGLE_REF = 4, }; -void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) +void io_free_region(struct user_struct *user, struct io_mapped_region *mr) { if (mr->pages) { long nr_refs = mr->nr_pages; @@ -105,8 +105,8 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) vunmap(mr->ptr); - if (mr->nr_pages && ctx->user) - __io_unaccount_mem(ctx->user, mr->nr_pages); + if (mr->nr_pages && user) + __io_unaccount_mem(user, mr->nr_pages); memset(mr, 0, sizeof(*mr)); } @@ -226,7 +226,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, goto out_free; return 0; out_free: - io_free_region(ctx, mr); + io_free_region(ctx->user, mr); return ret; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index f9e94458c01f..a6c63ca2c6f1 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -16,7 +16,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); int io_uring_mmap(struct file *file, struct vm_area_struct *vma); -void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); +void io_free_region(struct user_struct *user, struct io_mapped_region *mr); int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset); diff --git a/io_uring/register.c b/io_uring/register.c index 1a3e05be6e7b..023f5e7a18da 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -381,8 +381,8 @@ struct io_ring_ctx_rings { static void io_register_free_rings(struct io_ring_ctx *ctx, struct io_ring_ctx_rings *r) { - io_free_region(ctx, &r->sq_region); - io_free_region(ctx, &r->ring_region); + io_free_region(ctx->user, &r->sq_region); + io_free_region(ctx->user, &r->ring_region); } #define swap_old(ctx, o, n, field) \ @@ -604,7 +604,7 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { - io_free_region(ctx, ®ion); + io_free_region(ctx->user, ®ion); return -EFAULT; } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a816f5902091..d15453884004 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -378,7 +378,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx, &ifq->region); + io_free_region(ifq->ctx->user, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } From 6ab39b392e7973ffc45bf7ab523d8777904c4128 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Nov 2025 14:44:54 -0800 Subject: [PATCH 31/68] io_uring/rsrc: refactor io_{un}account_mem() to take {user,mm}_struct param Refactor io_{un}account_mem() to take user_struct and mm_struct directly, instead of accessing it from the ring ctx. 
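Call sites spell the pair out explicitly now; a representative
before/after from the diff below:

    /* before */
    io_unaccount_mem(ctx, imu->acct_pages);

    /* after */
    io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);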
Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 26 ++++++++++++++------------ io_uring/rsrc.h | 6 ++++-- io_uring/zcrx.c | 5 +++-- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4cc38eb56758..4053d104bf4c 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -56,27 +56,29 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } -void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages) { - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); + if (user) + __io_unaccount_mem(user, nr_pages); - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); + if (mm_account) + atomic64_sub(nr_pages, &mm_account->pinned_vm); } -int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages) { int ret; - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); + if (user) { + ret = __io_account_mem(user, nr_pages); if (ret) return ret; } - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + if (mm_account) + atomic64_add(nr_pages, &mm_account->pinned_vm); return 0; } @@ -145,7 +147,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) } if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); + io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); imu->release(imu->priv); io_free_imu(ctx, imu); } @@ -684,7 +686,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, if (!imu->acct_pages) return 0; - ret = io_account_mem(ctx, imu->acct_pages); + ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index a3ca6ba66596..d603f6a47f5e 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -120,8 +120,10 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int __io_account_mem(struct user_struct *user, unsigned long nr_pages); -int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); -void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); +int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages); +void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages); static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index d15453884004..30d3a7b3c407 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -200,7 +200,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, } mem->account_pages = io_count_account_pages(pages, nr_pages); - ret = io_account_mem(ifq->ctx, mem->account_pages); + ret = io_account_mem(ifq->ctx->user, ifq->ctx->mm_account, mem->account_pages); if (ret < 0) mem->account_pages = 0; @@ -389,7 +389,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) io_release_area_mem(&area->mem); if (area->mem.account_pages) - io_unaccount_mem(area->ifq->ctx, area->mem.account_pages); + io_unaccount_mem(area->ifq->ctx->user, area->ifq->ctx->mm_account, + area->mem.account_pages); kvfree(area->freelist); 
kvfree(area->nia.niovs); From edd706ede85fc9a563556945069f87dbec769e07 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Nov 2025 14:44:55 -0800 Subject: [PATCH 32/68] io_uring/zcrx: add io_zcrx_ifq arg to io_zcrx_free_area() Add io_zcrx_ifq arg to io_zcrx_free_area(). A QOL change to reduce line widths. Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 30d3a7b3c407..5c90404283ff 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -383,9 +383,10 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) ifq->rqes = NULL; } -static void io_zcrx_free_area(struct io_zcrx_area *area) +static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) { - io_zcrx_unmap_area(area->ifq, area); + io_zcrx_unmap_area(ifq, area); io_release_area_mem(&area->mem); if (area->mem.account_pages) @@ -464,7 +465,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, return 0; err: if (area) - io_zcrx_free_area(area); + io_zcrx_free_area(ifq, area); return ret; } @@ -523,7 +524,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) io_close_queue(ifq); if (ifq->area) - io_zcrx_free_area(ifq->area); + io_zcrx_free_area(ifq, ifq->area); if (ifq->dev) put_device(ifq->dev); From 5c686456a4e83ef06c74d40be05c21a0ef136684 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Nov 2025 14:44:56 -0800 Subject: [PATCH 33/68] io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq In preparation for removing ifq->ctx and making ifq lifetime independent of ring ctx, add user_struct and mm_struct to io_zcrx_ifq. In the ifq cleanup path, these are the only fields used from the main ring ctx to do accounting. Taking a copy in the ifq allows ifq->ctx to be removed later, including the ctx->refs held by the ifq. 
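Condensed, the pairing this sets up (note that the mmgrab()/mmdrop() pair pins the mm_struct itself rather than its address space, which is all the accounting needs):

	/* registration: take stable references from the ring ctx */
	if (ctx->user)
		ifq->user = get_uid(ctx->user);
	if (ctx->mm_account) {
		mmgrab(ctx->mm_account);
		ifq->mm_account = ctx->mm_account;
	}

	/* teardown, io_zcrx_ifq_free(): drop them again */
	free_uid(ifq->user);		/* NULL-safe */
	if (ifq->mm_account)
		mmdrop(ifq->mm_account);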
Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 24 ++++++++++++++++++------ io_uring/zcrx.h | 2 ++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 5c90404283ff..774efbce8cb6 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -200,7 +200,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, } mem->account_pages = io_count_account_pages(pages, nr_pages); - ret = io_account_mem(ifq->ctx->user, ifq->ctx->mm_account, mem->account_pages); + ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); if (ret < 0) mem->account_pages = 0; @@ -344,7 +344,8 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } -static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, +static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, + struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd, u32 id) @@ -362,7 +363,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, mmap_offset = IORING_MAP_OFF_ZCRX_REGION; mmap_offset += id << IORING_OFF_PBUF_SHIFT; - ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); + ret = io_create_region(ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; @@ -378,7 +379,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx->user, &ifq->region); + io_free_region(ifq->user, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -390,7 +391,7 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, io_release_area_mem(&area->mem); if (area->mem.account_pages) - io_unaccount_mem(area->ifq->ctx->user, area->ifq->ctx->mm_account, + io_unaccount_mem(ifq->user, ifq->mm_account, area->mem.account_pages); kvfree(area->freelist); @@ -525,6 +526,9 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) if (ifq->area) io_zcrx_free_area(ifq, ifq->area); + free_uid(ifq->user); + if (ifq->mm_account) + mmdrop(ifq->mm_account); if (ifq->dev) put_device(ifq->dev); @@ -588,6 +592,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ifq = io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; + if (ctx->user) { + get_uid(ctx->user); + ifq->user = ctx->user; + } + if (ctx->mm_account) { + mmgrab(ctx->mm_account); + ifq->mm_account = ctx->mm_account; + } ifq->rq_entries = reg.rq_entries; scoped_guard(mutex, &ctx->mmap_lock) { @@ -597,7 +609,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto ifq_free; } - ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); + ret = io_allocate_rbuf_ring(ctx, ifq, ®, &rd, id); if (ret) goto err; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 33ef61503092..8d828dc9b0e4 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -42,6 +42,8 @@ struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct io_zcrx_area *area; unsigned niov_shift; + struct user_struct *user; + struct mm_struct *mm_account; spinlock_t rq_lock ____cacheline_aligned_in_smp; struct io_uring *rq_ring; From 1bd95163dae80b940ea4b7bfa0720d3cc538a68b Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Nov 2025 14:44:57 -0800 Subject: [PATCH 34/68] io_uring/zcrx: move io_unregister_zcrx_ifqs() down In preparation for removing the ref on ctx->refs held by an ifq and removing io_shutdown_zcrx_ifqs(), move io_unregister_zcrx_ifqs() down such that it can call io_zcrx_scrub(). 
Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 774efbce8cb6..b3f3d55d2f63 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -662,28 +662,6 @@ ifq_free: return ret; } -void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) -{ - struct io_zcrx_ifq *ifq; - - lockdep_assert_held(&ctx->uring_lock); - - while (1) { - scoped_guard(mutex, &ctx->mmap_lock) { - unsigned long id = 0; - - ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); - if (ifq) - xa_erase(&ctx->zcrx_ctxs, id); - } - if (!ifq) - break; - io_zcrx_ifq_free(ifq); - } - - xa_destroy(&ctx->zcrx_ctxs); -} - static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) { unsigned niov_idx; @@ -749,6 +727,28 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) } } +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + lockdep_assert_held(&ctx->uring_lock); + + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + unsigned long id = 0; + + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } + + xa_destroy(&ctx->zcrx_ctxs); +} + static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) { u32 entries; From 75c299a917e4547dfe640ce7fd83c8a14d8409d0 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Nov 2025 14:44:58 -0800 Subject: [PATCH 35/68] io_uring/zcrx: reverse ifq refcount Add a refcount to struct io_zcrx_ifq to reverse the refcounting relationship, i.e. rings now reference ifqs instead. As a result of this, remove the ctx->refs reference that an ifq holds on a ring via the page pool memory provider. The new ref, ifq->refs, is held by internal users of an ifq, namely rings and the page pool memory provider associated with an ifq. This is needed to keep the ifq around until the page pool is destroyed. Since ifqs no longer hold refs to the ring ctx, there isn't a need to split the cleanup of ifqs into two: io_shutdown_zcrx_ifqs() in io_ring_exit_work() while waiting for ctx->refs to drop to 0, and io_unregister_zcrx_ifqs() after. Remove io_shutdown_zcrx_ifqs().
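A sketch of the resulting ownership, for orientation: ifq->refs counts internal users, and every owner drops through the same helper:

	refcount_set(&ifq->refs, 1);	/* the ring's reference, taken at registration */
	refcount_inc(&ifq->refs);	/* the page pool's reference, io_pp_zc_init() */

	io_put_zcrx_ifq(ifq);		/* ring unregister drops its ref */
	io_put_zcrx_ifq(ifq);		/* pp destroy drops the last ref and frees */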
Signed-off-by: David Wei Co-developed-by: Pavel Begunkov Signed-off-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ----- io_uring/zcrx.c | 33 ++++++++++++++------------------- io_uring/zcrx.h | 6 +----- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 85081d7453b7..e493d4358ab7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2988,11 +2988,6 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (!xa_empty(&ctx->zcrx_ctxs)) { - mutex_lock(&ctx->uring_lock); - io_shutdown_zcrx_ifqs(ctx); - mutex_unlock(&ctx->uring_lock); - } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b3f3d55d2f63..5752ff9a103f 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -479,9 +479,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) return NULL; ifq->if_rxq = -1; - ifq->ctx = ctx; spin_lock_init(&ifq->rq_lock); mutex_init(&ifq->pp_lock); + refcount_set(&ifq->refs, 1); return ifq; } @@ -537,6 +537,12 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) kfree(ifq); } +static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) +{ + if (refcount_dec_and_test(&ifq->refs)) + io_zcrx_ifq_free(ifq); +} + struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { @@ -592,6 +598,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ifq = io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; + if (ctx->user) { get_uid(ctx->user); ifq->user = ctx->user; @@ -714,19 +721,6 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) } } -void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) -{ - struct io_zcrx_ifq *ifq; - unsigned long index; - - lockdep_assert_held(&ctx->uring_lock); - - xa_for_each(&ctx->zcrx_ctxs, index, ifq) { - io_zcrx_scrub(ifq); - io_close_queue(ifq); - } -} - void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; @@ -743,7 +737,10 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) } if (!ifq) break; - io_zcrx_ifq_free(ifq); + + io_close_queue(ifq); + io_zcrx_scrub(ifq); + io_put_zcrx_ifq(ifq); } xa_destroy(&ctx->zcrx_ctxs); @@ -894,15 +891,13 @@ static int io_pp_zc_init(struct page_pool *pp) if (ret) return ret; - percpu_ref_get(&ifq->ctx->refs); + refcount_inc(&ifq->refs); return 0; } static void io_pp_zc_destroy(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); - - percpu_ref_put(&ifq->ctx->refs); + io_put_zcrx_ifq(io_pp_to_ifq(pp)); } static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 8d828dc9b0e4..45e3e71448ff 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -39,7 +39,6 @@ struct io_zcrx_area { }; struct io_zcrx_ifq { - struct io_ring_ctx *ctx; struct io_zcrx_area *area; unsigned niov_shift; struct user_struct *user; @@ -55,6 +54,7 @@ struct io_zcrx_ifq { struct device *dev; struct net_device *netdev; netdevice_tracker netdev_tracker; + refcount_t refs; /* * Page pool and net configuration lock, can be taken deeper in the @@ -70,7 +70,6 @@ int io_zcrx_return_bufs(struct io_ring_ctx *ctx, int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); -void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 
struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); @@ -85,9 +84,6 @@ static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { } -static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) -{ -} static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len) From 93e197e524b14d185d011813b72773a1a49d932d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Nov 2025 12:58:19 +0000 Subject: [PATCH 36/68] io_uring: use WRITE_ONCE for user shared memory IORING_SETUP_NO_MMAP rings remain user accessible even before the ctx setup is finalised, so use WRITE_ONCE consistently when initialising rings. Fixes: 03d89a2de25bb ("io_uring: support for user allocated memory for rings/sqes") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e493d4358ab7..d11d0e9723a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3381,10 +3381,6 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); - rings->sq_ring_mask = p->sq_entries - 1; - rings->cq_ring_mask = p->cq_entries - 1; - rings->sq_ring_entries = p->sq_entries; - rings->cq_ring_entries = p->cq_entries; memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(sq_size); @@ -3398,6 +3394,12 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return ret; } ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); + + memset(rings, 0, sizeof(*rings)); + WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); + WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); + WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); + WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); return 0; } From c07a491c1b735e0c27454ea5c27a446d43401b1e Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 31 Oct 2025 19:24:48 -0700 Subject: [PATCH 37/68] net: export netdev_get_by_index_lock() Need to call netdev_get_by_index_lock() from io_uring/zcrx.c, but it is currently private to net. Export the function in linux/netdevice.h. 
Signed-off-by: David Wei Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/linux/netdevice.h | 1 + net/core/dev.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a687444b27..77c46a2823ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3401,6 +3401,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, diff --git a/net/core/dev.h b/net/core/dev.h index 900880e8b5b4..df8a90fe89f8 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -29,7 +29,6 @@ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, From b6c5f9454ef34fd2753ba7843ef4d9a295c43eee Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 31 Oct 2025 19:24:49 -0700 Subject: [PATCH 38/68] io_uring/zcrx: call netdev_queue_get_dma_dev() under instance lock netdev ops must be called under the instance lock or rtnl_lock, but io_register_zcrx_ifq() isn't doing this for netdev_queue_get_dma_dev(). Fix this by taking the instance lock using netdev_get_by_index_lock(). Extend the locked section to also cover attaching the memory provider. io_zcrx_create_area() could not be moved outside of it, since the IORING_ZCRX_AREA_DMABUF codepath requires ifq->dev.
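Condensed, the locking in the registration path now looks like this (error handling elided; everything between the lookup and netdev_unlock() runs under the instance lock):

	ifq->netdev = netdev_get_by_index_lock(net, reg.if_idx);	/* returns locked */
	netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
	ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
	ret = io_zcrx_create_area(ifq, &area);
	ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
	netdev_unlock(ifq->netdev);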
Fixes: 59b8b32ac8d4 ("io_uring/zcrx: add support for custom DMA devices") Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Reviewed-by: Jakub Kicinski Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 5752ff9a103f..d80cb9060735 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -620,29 +620,30 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; - ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, - &ifq->netdev_tracker, GFP_KERNEL); + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx); if (!ifq->netdev) { ret = -ENODEV; goto err; } + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); if (!ifq->dev) { ret = -EOPNOTSUPP; - goto err; + goto netdev_put_unlock; } get_device(ifq->dev); ret = io_zcrx_create_area(ifq, &area); if (ret) - goto err; + goto netdev_put_unlock; mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; - ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); + ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); if (ret) - goto err; + goto netdev_put_unlock; + netdev_unlock(ifq->netdev); ifq->if_rxq = reg.if_rxq; reg.zcrx_id = id; @@ -661,6 +662,9 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; } return 0; +netdev_put_unlock: + netdev_put(ifq->netdev, &ifq->netdev_tracker); + netdev_unlock(ifq->netdev); err: scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->zcrx_ctxs, id); From 21bd7b14a32de35bc6c4fff7a739dc5d33ce04f1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Nov 2025 12:31:56 +0000 Subject: [PATCH 39/68] io_uring/query: buffer size calculations with a union Instead of having an array of a calculated size as a buffer, put all query uapi structures into a union and pass that around. That way everything is well typed, and the compiler will prevent an opcode handler from using a structure that isn't accounted for in the buffer size.
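In miniature, the difference (the "before" form is what allowed a size mismatch to slip through):

	/* before: any handler could be passed a buffer its struct didn't fit */
	static ssize_t io_query_ops(void *data);

	/* after: handlers are typed against the union that sizes the buffer,
	 * so a new query struct must be added to io_query_data to be usable */
	static ssize_t io_query_ops(union io_query_data *data);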
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/query.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/io_uring/query.c b/io_uring/query.c index 645301bd2c82..6cf732936b3d 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -5,14 +5,16 @@ #include "query.h" #include "io_uring.h" -#define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) +union io_query_data { + struct io_uring_query_opcode opcodes; +}; + +#define IO_MAX_QUERY_SIZE sizeof(union io_query_data) #define IO_MAX_QUERY_ENTRIES 1000 -static ssize_t io_query_ops(void *data) +static ssize_t io_query_ops(union io_query_data *data) { - struct io_uring_query_opcode *e = data; - - BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); + struct io_uring_query_opcode *e = &data->opcodes; e->nr_request_opcodes = IORING_OP_LAST; e->nr_register_opcodes = IORING_REGISTER_LAST; @@ -24,7 +26,7 @@ static ssize_t io_query_ops(void *data) } static int io_handle_query_entry(struct io_ring_ctx *ctx, - void *data, void __user *uhdr, + union io_query_data *data, void __user *uhdr, u64 *next_entry) { struct io_uring_query_hdr hdr; @@ -73,11 +75,11 @@ out: int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { - char entry_buffer[IO_MAX_QUERY_SIZE]; + union io_query_data entry_buffer; void __user *uhdr = arg; int ret, nr = 0; - memset(entry_buffer, 0, sizeof(entry_buffer)); + memset(&entry_buffer, 0, sizeof(entry_buffer)); if (nr_args) return -EINVAL; @@ -85,7 +87,7 @@ int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) while (uhdr) { u64 next_hdr; - ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); + ret = io_handle_query_entry(ctx, &entry_buffer, uhdr, &next_hdr); if (ret) return ret; uhdr = u64_to_user_ptr(next_hdr); From 4aed5b4e6d276d2308d0ea8932b0c6ebfd3d19f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 10 Nov 2025 13:04:49 +0000 Subject: [PATCH 40/68] io_uring: add helper calculating region byte size There have been type-related issues with region size calculations; add a utility helper function that returns the size and handles the type conversions correctly.
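The class of bug being closed, assuming nr_pages is a 32-bit unsigned field: C performs the shift in the operand's type before any widening, so the cast has to come first:

	unsigned int nr_pages = 1U << 20;		/* 4GB worth of 4K pages */
	size_t bad  = nr_pages << PAGE_SHIFT;		/* shifts in 32 bits: wraps to 0 */
	size_t good = (size_t)nr_pages << PAGE_SHIFT;	/* widen first, then shift */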
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/memmap.c | 4 ++-- io_uring/memmap.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 24da17a5f08f..dc4bfc5b6fb8 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -134,7 +134,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr) static int io_region_pin_pages(struct io_mapped_region *mr, struct io_uring_region_desc *reg) { - unsigned long size = mr->nr_pages << PAGE_SHIFT; + size_t size = io_region_size(mr); struct page **pages; int nr_pages; @@ -154,7 +154,7 @@ static int io_region_allocate_pages(struct io_mapped_region *mr, unsigned long mmap_offset) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - size_t size = (size_t) mr->nr_pages << PAGE_SHIFT; + size_t size = io_region_size(mr); unsigned long nr_allocated; struct page **pages; diff --git a/io_uring/memmap.h b/io_uring/memmap.h index a6c63ca2c6f1..a39d9e518905 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -43,4 +43,9 @@ static inline void io_region_publish(struct io_ring_ctx *ctx, *dst_region = *src_region; } +static inline size_t io_region_size(struct io_mapped_region *mr) +{ + return (size_t) mr->nr_pages << PAGE_SHIFT; +} + #endif From 7bb21a52e2d435b9edffe3b40264ab4462e338e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 10 Nov 2025 13:04:50 +0000 Subject: [PATCH 41/68] io_uring: pass sq entries in the params struct There is no need to pass the user requested number of SQ entries separately from the main parameter structure io_uring_params. Initialise it at the beginning and stop passing it in favour of struct io_uring_params::sq_entries. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++---- io_uring/io_uring.h | 2 +- io_uring/register.c | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d11d0e9723a1..023b0e3a829c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3479,8 +3479,10 @@ static int io_uring_sanitise_params(struct io_uring_params *p) return 0; } -int io_uring_fill_params(unsigned entries, struct io_uring_params *p) +int io_uring_fill_params(struct io_uring_params *p) { + unsigned entries = p->sq_entries; + if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3542,7 +3544,7 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) return 0; } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, +static __cold int io_uring_create(struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; @@ -3554,7 +3556,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) return ret; - ret = io_uring_fill_params(entries, p); + ret = io_uring_fill_params(p); if (unlikely(ret)) return ret; @@ -3693,7 +3695,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~IORING_SETUP_FLAGS) return -EINVAL; - return io_uring_create(entries, &p, params); + p.sq_entries = entries; + return io_uring_create(&p, params); } static inline int io_uring_allowed(void) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 23c268ab1c8f..b2251446497a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -136,7 +136,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int 
cq_entries, size_t *sq_offset); -int io_uring_fill_params(unsigned entries, struct io_uring_params *p); +int io_uring_fill_params(struct io_uring_params *p); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); diff --git a/io_uring/register.c b/io_uring/register.c index 023f5e7a18da..afb924ceb9b6 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -416,7 +416,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) /* properties that are always inherited */ p.flags |= (ctx->flags & COPY_FLAGS); - ret = io_uring_fill_params(p.sq_entries, &p); + ret = io_uring_fill_params(&p); if (unlikely(ret)) return ret; From 01405895c1e7d950964bebc8e4b0fc7aa77de24c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 10 Nov 2025 13:04:51 +0000 Subject: [PATCH 42/68] io_uring: use mem_is_zero to check ring params mem_is_zero() does the job without hand-rolled loops; use it to verify the reserved fields of the ring params. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 023b0e3a829c..af7b4cbe9850 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3684,14 +3684,12 @@ err_fput: static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; - int i; if (copy_from_user(&p, params, sizeof(p))) return -EFAULT; - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { - if (p.resv[i]) - return -EINVAL; - } + + if (!mem_is_zero(&p.resv, sizeof(p.resv))) + return -EINVAL; if (p.flags & ~IORING_SETUP_FLAGS) return -EINVAL; From 712fbe97c3322cb7a6ae1112e67a680e7ff1b206 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 10 Nov 2025 13:04:52 +0000 Subject: [PATCH 43/68] io_uring: move flags check to io_uring_sanitise_params io_uring_sanitise_params() sanitises most of the setup flags invariants; move the IORING_SETUP_FLAGS check from io_uring_setup() into it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index af7b4cbe9850..7e069d56b8a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3430,6 +3430,9 @@ static int io_uring_sanitise_params(struct io_uring_params *p) { unsigned flags = p->flags; + if (flags & ~IORING_SETUP_FLAGS) + return -EINVAL; + /* There is no way to mmap rings without a real fd */ if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(flags & IORING_SETUP_NO_MMAP)) @@ -3691,8 +3694,6 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (!mem_is_zero(&p.resv, sizeof(p.resv))) return -EINVAL; - if (p.flags & ~IORING_SETUP_FLAGS) - return -EINVAL; p.sq_entries = entries; return io_uring_create(&p, params); } From e279bb4b4c4d012808fb21ff41183a2e76c26679 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:53 +0000 Subject: [PATCH 44/68] io_uring: refactor rings_size nosqarray handling A preparation patch inverting the IORING_SETUP_NO_SQARRAY check; this way there is only one successful return path from the function, which will be helpful later.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c7535159ad6b..c1dc4bf3cf62 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2761,7 +2761,9 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; - size_t off, sq_array_size; + size_t off; + + *sq_offset = SIZE_MAX; off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) @@ -2785,20 +2787,18 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return SIZE_MAX; #endif - if (flags & IORING_SETUP_NO_SQARRAY) { - *sq_offset = SIZE_MAX; - return off; + if (!(flags & IORING_SETUP_NO_SQARRAY)) { + size_t sq_array_size; + + *sq_offset = off; + + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; } - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - return off; } From 94cd832916521d8d51b25b40691354c24831c655 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:54 +0000 Subject: [PATCH 45/68] io_uring: use size_add helpers for ring offsets Use the size_add() / size_mul() set of functions for rings_size() calculations. It's more consistent with struct_size(), and errors are preserved across a series of calculations, so intermediate result checks can be omitted. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c1dc4bf3cf62..bd8dfa919b61 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2765,13 +2765,6 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { *sq_offset = SIZE_MAX; - off = struct_size(rings, cqes, cq_entries); - if (off == SIZE_MAX) - return SIZE_MAX; - if (flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } if (flags & IORING_SETUP_CQE_MIXED) { if (cq_entries < 2) return SIZE_MAX; @@ -2781,6 +2774,12 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return SIZE_MAX; } + off = struct_size(rings, cqes, cq_entries); + if (flags & IORING_SETUP_CQE32) + off = size_mul(off, 2); + if (off == SIZE_MAX) + return SIZE_MAX; + #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) @@ -2793,9 +2792,8 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, *sq_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - if (check_add_overflow(off, sq_array_size, &off)) + off = size_add(off, sq_array_size); + if (off == SIZE_MAX) return SIZE_MAX; } From 929dbbb699110c9377da721ed7b44a660bb4ee01 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:55 +0000 Subject: [PATCH 46/68] io_uring: convert params to pointer in ring resize The parameters in io_register_resize_rings() will be moved into another structure in a later patch. In preparation for that, convert the params variable to a pointer, but still store the data on the stack.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/register.c | 48 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index ec13ff876a38..b5c2275d5ccc 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -402,33 +402,33 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; unsigned i, tail, old_head; - struct io_uring_params p; + struct io_uring_params __p, *p = &__p; int ret; /* limited to DEFER_TASKRUN for now */ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; - if (copy_from_user(&p, arg, sizeof(p))) + if (copy_from_user(p, arg, sizeof(*p))) return -EFAULT; - if (p.flags & ~RESIZE_FLAGS) + if (p->flags & ~RESIZE_FLAGS) return -EINVAL; /* properties that are always inherited */ - p.flags |= (ctx->flags & COPY_FLAGS); + p->flags |= (ctx->flags & COPY_FLAGS); - ret = io_uring_fill_params(&p); + ret = io_uring_fill_params(p); if (unlikely(ret)) return ret; - size = rings_size(p.flags, p.sq_entries, p.cq_entries, + size = rings_size(p->flags, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(size); - if (p.flags & IORING_SETUP_NO_MMAP) { - rd.user_addr = p.cq_off.user_addr; + if (p->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); @@ -445,20 +445,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) * intent... Use read/write once helpers from here on to indicate the * shared nature of it. 
*/ - WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); - WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); - WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); - WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); + WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1); + WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1); + WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries); + WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries); - if (copy_to_user(arg, &p, sizeof(p))) { + if (copy_to_user(arg, p, sizeof(*p))) { io_register_free_rings(ctx, &n); return -EFAULT; } - if (p.flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); + if (p->flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); else - size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_register_free_rings(ctx, &n); return -EOVERFLOW; @@ -466,8 +466,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(size); - if (p.flags & IORING_SETUP_NO_MMAP) { - rd.user_addr = p.sq_off.user_addr; + if (p->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES); @@ -508,11 +508,11 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) */ tail = READ_ONCE(o.rings->sq.tail); old_head = READ_ONCE(o.rings->sq.head); - if (tail - old_head > p.sq_entries) + if (tail - old_head > p->sq_entries) goto overflow; for (i = old_head; i < tail; i++) { unsigned src_head = i & (ctx->sq_entries - 1); - unsigned dst_head = i & (p.sq_entries - 1); + unsigned dst_head = i & (p->sq_entries - 1); n.sq_sqes[dst_head] = o.sq_sqes[src_head]; } @@ -521,7 +521,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) tail = READ_ONCE(o.rings->cq.tail); old_head = READ_ONCE(o.rings->cq.head); - if (tail - old_head > p.cq_entries) { + if (tail - old_head > p->cq_entries) { overflow: /* restore old rings, and return -EOVERFLOW via cleanup path */ ctx->rings = o.rings; @@ -532,7 +532,7 @@ overflow: } for (i = old_head; i < tail; i++) { unsigned src_head = i & (ctx->cq_entries - 1); - unsigned dst_head = i & (p.cq_entries - 1); + unsigned dst_head = i & (p->cq_entries - 1); n.rings->cqes[dst_head] = o.rings->cqes[src_head]; } @@ -550,8 +550,8 @@ overflow: /* all done, store old pointers and assign new ones */ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); - ctx->sq_entries = p.sq_entries; - ctx->cq_entries = p.cq_entries; + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; From 0f4b537363cb66c78e97bb58c26986af62856356 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:56 +0000 Subject: [PATCH 47/68] io_uring: introduce struct io_ctx_config There will be more information needed during ctx setup, and instead of passing a handful of pointers around, wrap them all into a new structure. Add a helper for encapsulating all configuration checks and preparation, which is also reused for ring resizing. Note, it indirectly adds an io_uring_sanitise_params() check to ring resizing, which is a good thing.
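A minimal sketch of the resulting setup flow, as wired up below (the copy_from_user() of the params is elided):

	struct io_ctx_config config = { };

	config.p.sq_entries = entries;		/* from the syscall argument */
	config.uptr = params;			/* where results are copied back */
	ret = io_uring_create(&config);		/* runs io_prepare_config() first */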
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++------------- io_uring/io_uring.h | 8 +++++++- io_uring/register.c | 7 +++++-- 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bd8dfa919b61..40dfb851d46b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3480,7 +3480,7 @@ static int io_uring_sanitise_params(struct io_uring_params *p) return 0; } -int io_uring_fill_params(struct io_uring_params *p) +static int io_uring_fill_params(struct io_uring_params *p) { unsigned entries = p->sq_entries; @@ -3545,12 +3545,9 @@ int io_uring_fill_params(struct io_uring_params *p) return 0; } -static __cold int io_uring_create(struct io_uring_params *p, - struct io_uring_params __user *params) +int io_prepare_config(struct io_ctx_config *config) { - struct io_ring_ctx *ctx; - struct io_uring_task *tctx; - struct file *file; + struct io_uring_params *p = &config->p; int ret; ret = io_uring_sanitise_params(p); @@ -3558,7 +3555,22 @@ static __cold int io_uring_create(struct io_uring_params *p, return ret; ret = io_uring_fill_params(p); - if (unlikely(ret)) + if (ret) + return ret; + + return 0; +} + +static __cold int io_uring_create(struct io_ctx_config *config) +{ + struct io_uring_params *p = &config->p; + struct io_ring_ctx *ctx; + struct io_uring_task *tctx; + struct file *file; + int ret; + + ret = io_prepare_config(config); + if (ret) return ret; ctx = io_ring_ctx_alloc(p); @@ -3631,7 +3643,7 @@ static __cold int io_uring_create(struct io_uring_params *p, p->features = IORING_FEAT_FLAGS; - if (copy_to_user(params, p, sizeof(*p))) { + if (copy_to_user(config->uptr, p, sizeof(*p))) { ret = -EFAULT; goto err; } @@ -3684,16 +3696,19 @@ err_fput: */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { - struct io_uring_params p; + struct io_ctx_config config; - if (copy_from_user(&p, params, sizeof(p))) + memset(&config, 0, sizeof(config)); + + if (copy_from_user(&config.p, params, sizeof(config.p))) return -EFAULT; - if (!mem_is_zero(&p.resv, sizeof(p.resv))) + if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) return -EINVAL; - p.sq_entries = entries; - return io_uring_create(&p, params); + config.p.sq_entries = entries; + config.uptr = params; + return io_uring_create(&config); } static inline int io_uring_allowed(void) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b2251446497a..d8bc44acb9fa 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -17,6 +17,11 @@ #include #endif +struct io_ctx_config { + struct io_uring_params p; + struct io_uring_params __user *uptr; +}; + #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ IORING_FEAT_NODROP |\ IORING_FEAT_SUBMIT_STABLE |\ @@ -136,7 +141,8 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset); -int io_uring_fill_params(struct io_uring_params *p); +int io_prepare_config(struct io_ctx_config *config); + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); diff --git a/io_uring/register.c b/io_uring/register.c index b5c2275d5ccc..6b0024c20ce7 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -398,13 +398,16 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, static 
int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { + struct io_ctx_config config; struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; unsigned i, tail, old_head; - struct io_uring_params __p, *p = &__p; + struct io_uring_params *p = &config.p; int ret; + memset(&config, 0, sizeof(config)); + /* limited to DEFER_TASKRUN for now */ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; @@ -416,7 +419,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) /* properties that are always inherited */ p->flags |= (ctx->flags & COPY_FLAGS); - ret = io_uring_fill_params(p); + ret = io_prepare_config(&config); if (unlikely(ret)) return ret; From 001b76b7e755767d847e9aebf1fd6e525f1e58c8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:57 +0000 Subject: [PATCH 48/68] io_uring: keep ring layout in a structure Add a structure keeping the SQ/CQ sizes and offsets. For now it only records the data previously returned from rings_size() and the SQ size. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 54 +++++++++++++++++++++++---------------------- io_uring/io_uring.h | 12 ++++++++-- io_uring/register.c | 24 ++++++-------------- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 40dfb851d46b..58e0c0ece6f1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2757,47 +2757,57 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } -unsigned long rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) +int rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, struct io_rings_layout *rl) { struct io_rings *rings; + size_t sqe_size; size_t off; - *sq_offset = SIZE_MAX; - if (flags & IORING_SETUP_CQE_MIXED) { if (cq_entries < 2) - return SIZE_MAX; + return -EOVERFLOW; } if (flags & IORING_SETUP_SQE_MIXED) { if (sq_entries < 2) - return SIZE_MAX; + return -EOVERFLOW; } + rl->sq_array_offset = SIZE_MAX; + + sqe_size = sizeof(struct io_uring_sqe); + if (flags & IORING_SETUP_SQE128) + sqe_size *= 2; + + rl->sq_size = array_size(sqe_size, sq_entries); + if (rl->sq_size == SIZE_MAX) + return -EOVERFLOW; + off = struct_size(rings, cqes, cq_entries); if (flags & IORING_SETUP_CQE32) off = size_mul(off, 2); if (off == SIZE_MAX) - return SIZE_MAX; + return -EOVERFLOW; #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) - return SIZE_MAX; + return -EOVERFLOW; #endif if (!(flags & IORING_SETUP_NO_SQARRAY)) { size_t sq_array_size; - *sq_offset = off; + rl->sq_array_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); off = size_add(off, sq_array_size); if (off == SIZE_MAX) - return SIZE_MAX; + return -EOVERFLOW; } - return off; + rl->rings_size = off; + return 0; } static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) @@ -3346,28 +3356,20 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { struct io_uring_region_desc rd; + struct io_rings_layout __rl, *rl = &__rl; struct io_rings *rings; - size_t sq_array_offset; - size_t sq_size, cq_size, sqe_size; int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - sqe_size = sizeof(struct io_uring_sqe); - if (p->flags & IORING_SETUP_SQE128) - sqe_size *= 2; - sq_size =
array_size(sqe_size, p->sq_entries); - if (sq_size == SIZE_MAX) - return -EOVERFLOW; - cq_size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, - &sq_array_offset); - if (cq_size == SIZE_MAX) - return -EOVERFLOW; + ret = rings_size(ctx->flags, p->sq_entries, p->cq_entries, rl); + if (ret) + return ret; memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(cq_size); + rd.size = PAGE_ALIGN(rl->rings_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; @@ -3378,10 +3380,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(sq_size); + rd.size = PAGE_ALIGN(rl->sq_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d8bc44acb9fa..5e544c2d27c8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -17,6 +17,14 @@ #include #endif +struct io_rings_layout { + /* size of CQ + headers + SQ offset array */ + size_t rings_size; + size_t sq_size; + + size_t sq_array_offset; +}; + struct io_ctx_config { struct io_uring_params p; struct io_uring_params __user *uptr; @@ -139,8 +147,8 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -unsigned long rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset); +int rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, struct io_rings_layout *rl); int io_prepare_config(struct io_ctx_config *config); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); diff --git a/io_uring/register.c b/io_uring/register.c index 6b0024c20ce7..fa245c87978a 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -401,9 +401,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) struct io_ctx_config config; struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; - size_t size, sq_array_offset; unsigned i, tail, old_head; struct io_uring_params *p = &config.p; + struct io_rings_layout __rl, *rl = &__rl; int ret; memset(&config, 0, sizeof(config)); @@ -423,13 +423,12 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (unlikely(ret)) return ret; - size = rings_size(p->flags, p->sq_entries, p->cq_entries, - &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, rl); + if (ret) + return ret; memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); + rd.size = PAGE_ALIGN(rl->rings_size); if (p->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; @@ -458,17 +457,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) return -EFAULT; } - if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) { - io_register_free_rings(ctx, &n); - return -EOVERFLOW; - } - memset(&rd, 0, sizeof(rd)); - rd.size = 
PAGE_ALIGN(size); + rd.size = PAGE_ALIGN(rl->sq_size); if (p->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; @@ -551,7 +541,7 @@ overflow: /* all done, store old pointers and assign new ones */ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); + ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; From eb76ff6a6829a9a54a385804cc9dbe4460f156d6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:58 +0000 Subject: [PATCH 49/68] io_uring: pre-calculate scq layout Move ring layouts calculations into io_prepare_config(), so that more misconfiguration checking can be done earlier before creating a ctx. It also deduplicates some code with ring resizing. And as a bonus, now it initialises params->sq_off.array, which is closer to all other user offset init, and also applies it to ring resizing, which was previously missing it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 27 ++++++++++++++------------- io_uring/io_uring.h | 3 +-- io_uring/register.c | 4 ---- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 58e0c0ece6f1..ea5d9e26a10f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2757,8 +2757,8 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } -int rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, struct io_rings_layout *rl) +static int rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, struct io_rings_layout *rl) { struct io_rings *rings; size_t sqe_size; @@ -3353,10 +3353,11 @@ bool io_is_uring_fops(struct file *file) } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) + struct io_ctx_config *config) { + struct io_uring_params *p = &config->p; + struct io_rings_layout *rl = &config->layout; struct io_uring_region_desc rd; - struct io_rings_layout __rl, *rl = &__rl; struct io_rings *rings; int ret; @@ -3364,10 +3365,6 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - ret = rings_size(ctx->flags, p->sq_entries, p->cq_entries, rl); - if (ret) - return ret; - memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(rl->rings_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { @@ -3378,7 +3375,6 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (ret) return ret; ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); @@ -3560,6 +3556,14 @@ int io_prepare_config(struct io_ctx_config *config) if (ret) return ret; + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, + &config->layout); + if (ret) + return ret; + + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = config->layout.sq_array_offset; + return 0; } @@ -3632,13 +3636,10 @@ static __cold int io_uring_create(struct io_ctx_config *config) mmgrab(current->mm); ctx->mm_account = current->mm; - ret = io_allocate_scq_urings(ctx, p); + ret = io_allocate_scq_urings(ctx, config); if (ret) goto err; - if (!(p->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - ret = io_sq_offload_create(ctx, p); if (ret) goto err; diff --git 
a/io_uring/io_uring.h b/io_uring/io_uring.h index 5e544c2d27c8..a790c16854d3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -27,6 +27,7 @@ struct io_ctx_config { struct io_uring_params p; + struct io_rings_layout layout; struct io_uring_params __user *uptr; }; @@ -147,8 +148,6 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -int rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, struct io_rings_layout *rl); int io_prepare_config(struct io_ctx_config *config); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); diff --git a/io_uring/register.c b/io_uring/register.c index fa245c87978a..334a457da3f7 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -423,10 +423,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (unlikely(ret)) return ret; - ret = rings_size(p->flags, p->sq_entries, p->cq_entries, rl); - if (ret) - return ret; - memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(rl->rings_size); if (p->flags & IORING_SETUP_NO_MMAP) { From d741c6255524f0691aea53381219fadcd2b38408 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Nov 2025 12:45:59 +0000 Subject: [PATCH 50/68] io_uring: move cq/sq user offset init around Move the user SQ/CQ offset initialisation to the end of io_prepare_config(), where all the information needed to set it properly has already been calculated. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ea5d9e26a10f..1e58fc1d5667 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3519,27 +3519,6 @@ static int io_uring_fill_params(struct io_uring_params *p) p->cq_entries = 2 * p->sq_entries; } - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - p->sq_off.resv1 = 0; - if (!(p->flags & IORING_SETUP_NO_MMAP)) - p->sq_off.user_addr = 0; - - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - p->cq_off.resv1 = 0; - if (!(p->flags & IORING_SETUP_NO_MMAP)) - p->cq_off.user_addr = 0; - return 0; } @@ -3561,6 +3540,26 @@ int io_prepare_config(struct io_ctx_config *config) if (ret) return ret; + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; + + p->cq_off.head = offsetof(struct io_rings,
cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; if (!(p->flags & IORING_SETUP_NO_SQARRAY)) p->sq_off.array = config->layout.sq_array_offset; From 2647e2ecc096d2330d6b6a34a3a1f0a99828c14c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:48:57 +0000 Subject: [PATCH 51/68] io_uring/query: introduce zcrx query Add a new query type IO_URING_QUERY_ZCRX returning some basic information about the interface to the user: the allowed flags for areas and registration, and the supported IORING_REGISTER_ZCRX_CTRL subcodes. There is also a chicken-and-egg problem with user provided refill queue memory: offset and size information is returned after registration, but to properly allocate the memory you need to know it beforehand, which is why userspace currently has to guess the RQ header size and severely overestimate it. Return the size information. It's split into "size" and "alignment" fields because for the default placement modes the user is interested in the aligned size; however, if support for more flexible placement is added, it'll need to know only the actual header size. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 16 ++++++++++++++++ io_uring/query.c | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 3539ccbfd064..fc0cb1580e47 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -18,6 +18,7 @@ struct io_uring_query_hdr { enum { IO_URING_QUERY_OPCODES = 0, + IO_URING_QUERY_ZCRX = 1, __IO_URING_QUERY_MAX, }; @@ -41,4 +42,19 @@ struct io_uring_query_opcode { __u32 __pad; }; +struct io_uring_query_zcrx { + /* Bitmask of supported ZCRX_REG_* flags, */ + __u64 register_flags; + /* Bitmask of all supported IORING_ZCRX_AREA_* flags */ + __u64 area_flags; + /* The number of supported ZCRX_CTRL_* opcodes */ + __u32 nr_ctrl_opcodes; + __u32 __resv1; + /* The refill ring header size */ + __u32 rq_hdr_size; + /* The alignment for the header */ + __u32 rq_hdr_alignment; + __u64 __resv2; +}; + #endif diff --git a/io_uring/query.c b/io_uring/query.c index e1435cdc2665..6f9fa5153903 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -4,9 +4,11 @@ #include "query.h" #include "io_uring.h" +#include "zcrx.h" union io_query_data { struct io_uring_query_opcode opcodes; + struct io_uring_query_zcrx zcrx; }; #define IO_MAX_QUERY_SIZE sizeof(union io_query_data) @@ -27,6 +29,20 @@ static ssize_t io_query_ops(union io_query_data *data) return sizeof(*e); } +static ssize_t io_query_zcrx(union io_query_data *data) +{ + struct io_uring_query_zcrx *e = &data->zcrx; + + e->register_flags = ZCRX_REG_IMPORT; + e->area_flags = IORING_ZCRX_AREA_DMABUF; + e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; + e->rq_hdr_size = sizeof(struct io_uring); + e->rq_hdr_alignment = L1_CACHE_BYTES; + e->__resv1 = 0; + e->__resv2 = 0; + return sizeof(*e); +} + static int
io_handle_query_entry(struct io_ring_ctx *ctx, union io_query_data *data, void __user *uhdr, u64 *next_entry) @@ -55,6 +71,9 @@ static int io_handle_query_entry(struct io_ring_ctx *ctx, case IO_URING_QUERY_OPCODES: ret = io_query_ops(data); break; + case IO_URING_QUERY_ZCRX: + ret = io_query_zcrx(data); + break; } if (ret >= 0) { From 4aaa9bc4d5921363490d95fe66c4db086a915799 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:48:58 +0000 Subject: [PATCH 52/68] io_uring/query: introduce rings info query Same problem as with zcrx in the previous patch: the user needs to know the SQ/CQ header sizes to allocate memory before setup in order to use it for user provided rings, i.e. IORING_SETUP_NO_MMAP; however, that information is only returned after registration, hence the user is left guessing kernel implementation details. Return the header size and alignment, split into two fields with the same motivation: to let the user know the real structure size without alignment in case there are more flexible placement schemes in the future. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 8 ++++++++ io_uring/query.c | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index fc0cb1580e47..2456e6c5ebb5 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -19,6 +19,7 @@ struct io_uring_query_hdr { enum { IO_URING_QUERY_OPCODES = 0, IO_URING_QUERY_ZCRX = 1, + IO_URING_QUERY_SCQ = 2, __IO_URING_QUERY_MAX, }; @@ -57,4 +58,11 @@ struct io_uring_query_zcrx { __u64 __resv2; }; +struct io_uring_query_scq { + /* The SQ/CQ rings header size */ + __u64 hdr_size; + /* The alignment for the header */ + __u64 hdr_alignment; +}; + #endif diff --git a/io_uring/query.c b/io_uring/query.c index 6f9fa5153903..e61b6221f87f 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -9,6 +9,7 @@ union io_query_data { struct io_uring_query_opcode opcodes; struct io_uring_query_zcrx zcrx; + struct io_uring_query_scq scq; }; #define IO_MAX_QUERY_SIZE sizeof(union io_query_data) @@ -43,6 +44,15 @@ static ssize_t io_query_zcrx(union io_query_data *data) return sizeof(*e); } +static ssize_t io_query_scq(union io_query_data *data) +{ + struct io_uring_query_scq *e = &data->scq; + + e->hdr_size = sizeof(struct io_rings); + e->hdr_alignment = SMP_CACHE_BYTES; + return sizeof(*e); +} + static int io_handle_query_entry(struct io_ring_ctx *ctx, union io_query_data *data, void __user *uhdr, u64 *next_entry) @@ -74,6 +84,9 @@ static int io_handle_query_entry(struct io_ring_ctx *ctx, case IO_URING_QUERY_ZCRX: ret = io_query_zcrx(data); break; + case IO_URING_QUERY_SCQ: + ret = io_query_scq(data); + break; } if (ret >= 0) { From f0243d2b86b97a575a7a013370e934f70ee77dd3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:09 +0000 Subject: [PATCH 53/68] io_uring/zcrx: convert to use netmem_desc Convert zcrx to struct netmem_desc, and use struct net_iov::desc to access its fields instead of the struct net_iov inner union aliases. zcrx only directly reads niov->pp, so with this patch it doesn't depend on the union anymore.
Signed-off-by: Pavel Begunkov Reviewed-by: Byungchul Park Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index c57ab332acbd..635ee4eb5d8d 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -696,12 +696,12 @@ static void io_zcrx_return_niov(struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); - if (!niov->pp) { + if (!niov->desc.pp) { /* copy fallback allocated niovs */ io_zcrx_return_niov_freelist(niov); return; } - page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false); + page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); } static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) @@ -815,7 +815,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp, if (!page_pool_unref_and_test(netmem)) continue; - if (unlikely(niov->pp != pp)) { + if (unlikely(niov->desc.pp != pp)) { io_zcrx_return_niov(niov); continue; } @@ -1082,13 +1082,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, const skb_frag_t *frag, int off, int len) { struct net_iov *niov; + struct page_pool *pp; if (unlikely(!skb_frag_is_net_iov(frag))) return io_zcrx_copy_frag(req, ifq, frag, off, len); niov = netmem_to_net_iov(frag->netmem); - if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || - io_pp_to_ifq(niov->pp) != ifq) + pp = niov->desc.pp; + + if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq) return -EFAULT; if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) From a0169c3a62875d1bafa0caffa42e1d1cf6aa40e6 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Thu, 13 Nov 2025 10:46:10 +0000 Subject: [PATCH 54/68] io_uring/zcrx: use folio_nr_pages() instead of shift operation folio_nr_pages() is a faster helper function to get the number of pages when NR_PAGES_IN_LARGE_FOLIO is enabled. Signed-off-by: Pedro Demarchi Gomes Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 635ee4eb5d8d..149bf9d5b983 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -170,7 +170,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pag if (folio == last_folio) continue; last_folio = folio; - res += 1UL << folio_order(folio); + res += folio_nr_pages(folio); } return res; } From 1b8b5d0316da7468ae4d40f6c2102d559d9e3ca2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:11 +0000 Subject: [PATCH 55/68] io_uring/zcrx: elide passing msg flags zcrx's sqe->msg_flags has never been defined and is checked to be zero. It doesn't need to be a MSG_* bitmask. Keep it undefined, don't mix it with MSG_DONTWAIT, and don't pass it into io_zcrx_recv() as it's ignored anyway. 
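For reference, a userspace sketch of preparing a zero-copy receive request under this contract might look as follows; it assumes liburing and the upstream IORING_OP_RECV_ZC SQE layout (zcrx_ifq_idx selecting the registered ifq), so treat the field usage as illustrative rather than as settled liburing API:

static void prep_recvzc(struct io_uring *ring, int sockfd, unsigned int ifq_id)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	/* no user buffer: data lands in the registered zcrx area */
	io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, 0, 0);
	sqe->zcrx_ifq_idx = ifq_id;		/* registered zcrx ifq to use */
	sqe->ioprio = IORING_RECV_MULTISHOT;	/* recvzc is multishot */
	/* sqe->msg_flags is left at zero: anything else gets -EINVAL */
}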
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index a95cc9ca2a4d..69f901fa3040 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -110,7 +110,6 @@ enum sr_retry_flags { struct io_recvzc { struct file *file; - unsigned msg_flags; u16 flags; u32 len; struct io_zcrx_ifq *ifq; @@ -1253,8 +1252,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); - zc->msg_flags = READ_ONCE(sqe->msg_flags); - if (zc->msg_flags) + if (READ_ONCE(sqe->msg_flags)) return -EINVAL; if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) return -EINVAL; @@ -1283,8 +1281,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; len = zc->len; - ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, - issue_flags, &zc->len); + ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len); if (len && zc->len == 0) { io_req_set_res(req, 0, 0); From d663976dad68de9b2e3df59cc31f0a24ee4c4511 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:12 +0000 Subject: [PATCH 56/68] io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL It would be annoying and take a fair amount of boilerplate code to implement each new zcrx feature as a separate io_uring register opcode. Introduce IORING_REGISTER_ZCRX_CTRL that will multiplex such calls to zcrx. Note, there are no real users of the opcode in this patch. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 13 +++++++++++++ io_uring/register.c | 3 +++ io_uring/zcrx.c | 21 +++++++++++++++++++++ io_uring/zcrx.h | 6 ++++++ 4 files changed, 43 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e96080db3e4d..0e1d353fab1d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -697,6 +697,9 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, + /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ + IORING_REGISTER_ZCRX_CTRL = 36, + /* this goes last */ IORING_REGISTER_LAST, @@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; +enum zcrx_ctrl_op { + __ZCRX_CTRL_LAST, +}; + +struct zcrx_ctrl { + __u32 zcrx_id; + __u32 op; /* see enum zcrx_ctrl_op */ + __u64 __resv[8]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/register.c b/io_uring/register.c index 334a457da3f7..fc66a5364483 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_QUERY: ret = io_query(ctx, arg, nr_args); break; + case IORING_REGISTER_ZCRX_CTRL: + ret = io_zcrx_ctrl(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 149bf9d5b983..0b5f4320c7a9 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -941,6 +941,27 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct zcrx_ctrl ctrl; + struct io_zcrx_ifq *zcrx; + + if (nr_args) + return -EINVAL; + if (copy_from_user(&ctrl, arg, sizeof(ctrl))) + return -EFAULT; + if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv))) + return -EFAULT; + + zcrx = xa_load(&ctx->zcrx_ctxs, 
ctrl.zcrx_id); + if (!zcrx) + return -ENXIO; + if (ctrl.op >= __ZCRX_CTRL_LAST) + return -EOPNOTSUPP; + + return -EINVAL; +} + static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index c9b9bfae0547..f29edc22c91f 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -65,6 +65,7 @@ struct io_zcrx_ifq { }; #if defined(CONFIG_IO_URING_ZCRX) +int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -93,6 +94,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } +static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + return -EOPNOTSUPP; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); From 475eb39b00478b1898bc9080344dcd8e86c53c7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:13 +0000 Subject: [PATCH 57/68] io_uring/zcrx: add sync refill queue flushing Add a zcrx interface via IORING_REGISTER_ZCRX_CTRL that forces the kernel to flush / consume entries from the refill queue. Just as with the IORING_REGISTER_ZCRX_REFILL attempt, the motivation is to address cases where the refill queue becomes full, and the user can't return buffers and needs to stash them. It's still a slow path, and the user should size the refill queue appropriately, but it should be helpful for handling temporary traffic spikes and other unpredictable conditions. The interface is simpler compared to ZCRX_REFILL as it doesn't need temporary refill entry arrays and gives natural batching, whereas ZCRX_REFILL requires even more user logic to be somewhat efficient. Also, add a structure for the operation. It's not currently used but can serve for future improvements like limiting the number of buffers to process, etc. 
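As a usage sketch, flushing the refill queue from userspace could look like the following; it goes through the raw register syscall, and per io_zcrx_ctrl() in the previous patch nr_args must be zero and all reserved fields clear:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int zcrx_flush_refill(int ring_fd, __u32 zcrx_id)
{
	struct zcrx_ctrl ctrl = {
		.zcrx_id = zcrx_id,		/* which zcrx instance */
		.op = ZCRX_CTRL_FLUSH_RQ,
		/* __resv and zc_flush reserved fields stay zero */
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
}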
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 ++++- io_uring/zcrx.c | 74 +++++++++++++++++++++++++++++++++-- 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0e1d353fab1d..db47fced2cc6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1082,13 +1082,21 @@ struct io_uring_zcrx_ifq_reg { }; enum zcrx_ctrl_op { + ZCRX_CTRL_FLUSH_RQ, + __ZCRX_CTRL_LAST, }; +struct zcrx_ctrl_flush_rq { + __u64 __resv[6]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ - __u64 __resv[8]; + __u64 __resv[2]; + + struct zcrx_ctrl_flush_rq zc_flush; }; #ifdef __cplusplus diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 0b5f4320c7a9..08c103af69bc 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -941,6 +941,71 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, + struct io_zcrx_ifq *zcrx) +{ + unsigned int mask = zcrx->rq_entries - 1; + unsigned int i; + + guard(spinlock_bh)(&zcrx->rq_lock); + + nr = min(nr, io_zcrx_rqring_entries(zcrx)); + for (i = 0; i < nr; i++) { + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); + struct net_iov *niov; + + if (!io_parse_rqe(rqe, zcrx, &niov)) + break; + netmem_array[i] = net_iov_to_netmem(niov); + } + + smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); + return i; +} + +#define ZCRX_FLUSH_BATCH 32 + +static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) { + netmem_ref netmem = netmems[i]; + struct net_iov *niov = netmem_to_net_iov(netmem); + + if (!io_zcrx_put_niov_uref(niov)) + continue; + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, + struct zcrx_ctrl *ctrl) +{ + struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; + netmem_ref netmems[ZCRX_FLUSH_BATCH]; + unsigned total = 0; + unsigned nr; + + if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) + return -EINVAL; + + do { + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); + + zcrx_return_buffers(netmems, nr); + total += nr; + + if (fatal_signal_pending(current)) + break; + cond_resched(); + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); + + return 0; +} + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct zcrx_ctrl ctrl; @@ -956,10 +1021,13 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); if (!zcrx) return -ENXIO; - if (ctrl.op >= __ZCRX_CTRL_LAST) - return -EOPNOTSUPP; - return -EINVAL; + switch (ctrl.op) { + case ZCRX_CTRL_FLUSH_RQ: + return zcrx_flush_rq(ctx, zcrx, &ctrl); + } + + return -EOPNOTSUPP; } static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, From 39c9676f789eb71ce1005a22eebe2be80a00de6a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:14 +0000 Subject: [PATCH 58/68] io_uring/zcrx: count zcrx users zcrx tries to detach ifq / terminate page pools when the io_uring ctx owning it is being destroyed. There will be multiple io_uring instances attached to it in the future, so add a separate counter to track the users. 
Note, refs can't be reused for this purpose as it is only used to prevent zcrx and ring destruction, and is also used by page pools to keep it alive. Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 7 +++++-- io_uring/zcrx.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 08c103af69bc..2335f140ff19 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -482,6 +482,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) spin_lock_init(&ifq->rq_lock); mutex_init(&ifq->pp_lock); refcount_set(&ifq->refs, 1); + refcount_set(&ifq->user_refs, 1); return ifq; } @@ -742,8 +743,10 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) if (!ifq) break; - io_close_queue(ifq); - io_zcrx_scrub(ifq); + if (refcount_dec_and_test(&ifq->user_refs)) { + io_close_queue(ifq); + io_zcrx_scrub(ifq); + } io_put_zcrx_ifq(ifq); } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index f29edc22c91f..32ab95b2cb81 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -55,6 +55,8 @@ struct io_zcrx_ifq { struct net_device *netdev; netdevice_tracker netdev_tracker; refcount_t refs; + /* counts userspace facing users like io_uring */ + refcount_t user_refs; /* * Page pool and net configuration lock, can be taken deeper in the From 742cb2e14ecb059cd4a77b92aa4945c20f85d414 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 13 Nov 2025 10:46:15 +0000 Subject: [PATCH 59/68] io_uring/zcrx: move io_zcrx_scrub() and dependencies up In preparation for adding zcrx ifq exporting and importing, move io_zcrx_scrub() and its dependencies up the file to be closer to io_close_queue(). Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 84 ++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 2335f140ff19..e60c5c00a611 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -544,6 +544,48 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) io_zcrx_ifq_free(ifq); } +static void io_zcrx_return_niov_freelist(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + spin_lock_bh(&area->freelist_lock); + area->freelist[area->free_count++] = net_iov_idx(niov); + spin_unlock_bh(&area->freelist_lock); +} + +static void io_zcrx_return_niov(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + if (!niov->desc.pp) { + /* copy fallback allocated niovs */ + io_zcrx_return_niov_freelist(niov); + return; + } + page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); +} + +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + int i; + + if (!area) + return; + + /* Reclaim back all buffers given to the user space. 
*/ + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + int nr; + + if (!atomic_read(io_get_user_counter(niov))) + continue; + nr = atomic_xchg(io_get_user_counter(niov), 0); + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) + io_zcrx_return_niov(niov); + } +} + struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { @@ -684,48 +726,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) return &area->nia.niovs[niov_idx]; } -static void io_zcrx_return_niov_freelist(struct net_iov *niov) -{ - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - - spin_lock_bh(&area->freelist_lock); - area->freelist[area->free_count++] = net_iov_idx(niov); - spin_unlock_bh(&area->freelist_lock); -} - -static void io_zcrx_return_niov(struct net_iov *niov) -{ - netmem_ref netmem = net_iov_to_netmem(niov); - - if (!niov->desc.pp) { - /* copy fallback allocated niovs */ - io_zcrx_return_niov_freelist(niov); - return; - } - page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); -} - -static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) -{ - struct io_zcrx_area *area = ifq->area; - int i; - - if (!area) - return; - - /* Reclaim back all buffers given to the user space. */ - for (i = 0; i < area->nia.num_niovs; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - int nr; - - if (!atomic_read(io_get_user_counter(niov))) - continue; - nr = atomic_xchg(io_get_user_counter(niov), 0); - if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) - io_zcrx_return_niov(niov); - } -} - void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; From d7af80b213e5675664b14f12240cb282e81773d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:16 +0000 Subject: [PATCH 60/68] io_uring/zcrx: export zcrx via a file Add an option to wrap a zcrx instance into a file and expose it to user space. Currently, users can't do anything meaningful with the file, but it'll be used in the next patch to import it into another io_uring instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the IORING_REGISTER_ZCRX_CTRL registration opcode. 
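A minimal usage sketch (raw register syscall; zc_export must be zeroed on input, and on success the kernel writes the new fd back into the struct, as zcrx_export() below shows):

static int zcrx_export_fd(int ring_fd, __u32 zcrx_id)
{
	struct zcrx_ctrl ctrl = {
		.zcrx_id = zcrx_id,
		.op = ZCRX_CTRL_EXPORT,
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_CTRL, &ctrl, 0) < 0)
		return -1;
	/* O_CLOEXEC anon fd that keeps the ifq alive until closed */
	return ctrl.zc_export.zcrx_fd;
}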
Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 11 +++++- io_uring/zcrx.c | 68 +++++++++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index db47fced2cc6..4bedc0310a55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1083,6 +1083,7 @@ struct io_uring_zcrx_ifq_reg { enum zcrx_ctrl_op { ZCRX_CTRL_FLUSH_RQ, + ZCRX_CTRL_EXPORT, __ZCRX_CTRL_LAST, }; @@ -1091,12 +1092,20 @@ struct zcrx_ctrl_flush_rq { __u64 __resv[6]; }; +struct zcrx_ctrl_export { + __u32 zcrx_fd; + __u32 __resv1[11]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ __u64 __resv[2]; - struct zcrx_ctrl_flush_rq zc_flush; + union { + struct zcrx_ctrl_export zc_export; + struct zcrx_ctrl_flush_rq zc_flush; + }; }; #ifdef __cplusplus diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e60c5c00a611..815992aff246 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -8,6 +8,7 @@ #include #include #include +#include <linux/anon_inodes.h> #include #include @@ -586,6 +587,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) } } +static void zcrx_unregister(struct io_zcrx_ifq *ifq) +{ + if (refcount_dec_and_test(&ifq->user_refs)) { + io_close_queue(ifq); + io_zcrx_scrub(ifq); + } + io_put_zcrx_ifq(ifq); +} + struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { @@ -596,6 +606,55 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, return ifq ? &ifq->region : NULL; } +static int zcrx_box_release(struct inode *inode, struct file *file) +{ + struct io_zcrx_ifq *ifq = file->private_data; + + if (WARN_ON_ONCE(!ifq)) + return -EFAULT; + zcrx_unregister(ifq); + return 0; +} + +static const struct file_operations zcrx_box_fops = { + .owner = THIS_MODULE, + .release = zcrx_box_release, +}; + +static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, + struct zcrx_ctrl *ctrl, void __user *arg) +{ + struct zcrx_ctrl_export *ce = &ctrl->zc_export; + struct file *file; + int fd = -1; + + if (!mem_is_zero(ce, sizeof(*ce))) + return -EINVAL; + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + ce->zcrx_fd = fd; + if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { + put_unused_fd(fd); + return -EFAULT; + } + + refcount_inc(&ifq->refs); + refcount_inc(&ifq->user_refs); + + file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, + ifq, O_CLOEXEC, NULL); + if (IS_ERR(file)) { + put_unused_fd(fd); + zcrx_unregister(ifq); + return PTR_ERR(file); + } + + fd_install(fd, file); + return 0; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -742,12 +801,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) } if (!ifq) break; - - if (refcount_dec_and_test(&ifq->user_refs)) { - io_close_queue(ifq); - io_zcrx_scrub(ifq); - } - io_put_zcrx_ifq(ifq); + zcrx_unregister(ifq); } xa_destroy(&ctx->zcrx_ctxs); @@ -1028,6 +1082,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) switch (ctrl.op) { case ZCRX_CTRL_FLUSH_RQ: return zcrx_flush_rq(ctx, zcrx, &ctrl); + case ZCRX_CTRL_EXPORT: + return zcrx_export(ctx, zcrx, &ctrl, arg); } return -EOPNOTSUPP; From 0926f94ab36a6d76d07fa8f0934e65f5f66647ec Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 13 Nov 2025 10:46:17 +0000 Subject: [PATCH 61/68] io_uring/zcrx: add io_fill_zcrx_offsets() Add a helper io_fill_zcrx_offsets() that sets the 
constant offsets in struct io_uring_zcrx_offsets returned to userspace. Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 815992aff246..da7e556c349e 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -345,6 +345,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } +static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets) +{ + offsets->head = offsetof(struct io_uring, head); + offsets->tail = offsetof(struct io_uring, tail); + offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); +} + static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, @@ -356,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, void *ptr; int ret; - off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); + io_fill_zcrx_offsets(&reg->offsets); + off = reg->offsets.rqes; size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; if (size > rd->size) return -EINVAL; @@ -372,9 +380,6 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); - reg->offsets.head = offsetof(struct io_uring, head); - reg->offsets.tail = offsetof(struct io_uring, tail); - reg->offsets.rqes = off; return 0; } From 00d91481279fb2df8c46d19090578afd523ca630 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 13 Nov 2025 10:46:18 +0000 Subject: [PATCH 62/68] io_uring/zcrx: share an ifq between rings Add a way to share an ifq from a src ring that is real (i.e. bound to a HW RX queue) with other rings. This is done by passing a new flag ZCRX_REG_IMPORT in the registration struct io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq. 
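A sketch of the importing side follows; assumptions are the raw register syscall with nr_args == 1 as for a normal ifq registration, and the uapi field layout, while the hard requirements (IORING_SETUP_DEFER_TASKRUN plus CQE32 or mixed CQEs on the importing ring) come from import_zcrx() below:

static int zcrx_import(int ring_fd, int exported_fd, __u32 *zcrx_id)
{
	struct io_uring_zcrx_ifq_reg reg = {
		.if_idx = exported_fd,	/* reused to carry the zcrx fd */
		.flags = ZCRX_REG_IMPORT,
		/* if_rxq, rq_entries, area_ptr, region_ptr must be zero */
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_IFQ, &reg, 1) < 0)
		return -1;
	*zcrx_id = reg.zcrx_id;	/* id and offsets are written back */
	return 0;
}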
Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 +++ io_uring/zcrx.c | 63 +++++++++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4bedc0310a55..deb772222b6d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg { __u64 __resv2[2]; }; +enum zcrx_reg_flags { + ZCRX_REG_IMPORT = 1, +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index da7e556c349e..b99cf2c6670a 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -660,6 +660,63 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, return 0; } +static int import_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg, + struct io_uring_zcrx_ifq_reg *reg) +{ + struct io_zcrx_ifq *ifq; + struct file *file; + int fd, ret; + u32 id; + + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) + return -EINVAL; + + fd = reg->if_idx; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + file = fd_file(f); + if (file->f_op != &zcrx_box_fops || !file->private_data) + return -EBADF; + + ifq = file->private_data; + refcount_inc(&ifq->refs); + refcount_inc(&ifq->user_refs); + + scoped_guard(mutex, &ctx->mmap_lock) { + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto err; + } + + reg->zcrx_id = id; + io_fill_zcrx_offsets(&reg->offsets); + if (copy_to_user(arg, reg, sizeof(*reg))) { + ret = -EFAULT; + goto err_xa_erase; + } + + scoped_guard(mutex, &ctx->mmap_lock) { + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err_xa_erase; + } + + return 0; +err_xa_erase: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +err: + zcrx_unregister(ifq); + return ret; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -685,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EINVAL; if (copy_from_user(&reg, arg, sizeof(reg))) return -EFAULT; if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.__resv2 || reg.zcrx_id) return -EINVAL; + if (reg.flags & ZCRX_REG_IMPORT) + return import_zcrx(ctx, arg, &reg); + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) return -EINVAL; if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { From f779ac0b8784858c3700f6660d606f436c62157a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Nov 2025 19:30:45 -0700 Subject: [PATCH 63/68] io_uring/register: use correct location for io_rings_layout A previous patch consolidated the ring size etc. calculations into io_prepare_config(), but missed updating io_register_resize_rings() to use the calculated values. As a result, it ended up using on-stack uninitialized values, and hence either failed validating the size correctly, or just failed resizing because the sizes were random. This caused failures in the liburing regression tests: [...] 
Running test resize-rings.t resize=-7 test_basic 3000 failed Test resize-rings.t failed with ret 1 Running test resize-rings.t /dev/sda resize=-7 test_basic 3000 failed Test resize-rings.t failed with ret 1 Running test resize-rings.t /dev/nvme1n1 resize=-7 test_basic 3000 failed Test resize-rings.t failed with ret 1 Running test resize-rings.t /dev/dm-0 resize=-7 test_basic 3000 failed Test resize-rings.t failed with ret 1 because io_create_region() would return -E2BIG because of uninitialized reg->size values. Adjust the struct io_rings_layout rl pointer to point to the correct location, and remove the (now dead) on-stack __rl struct. Fixes: eb76ff6a6829 ("io_uring: pre-calculate scq layout") Signed-off-by: Jens Axboe --- io_uring/register.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/register.c b/io_uring/register.c index fc66a5364483..db42f98562c4 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -403,7 +403,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; unsigned i, tail, old_head; struct io_uring_params *p = &config.p; - struct io_rings_layout __rl, *rl = &__rl; + struct io_rings_layout *rl = &config.layout; int ret; memset(&config, 0, sizeof(config)); From 84692a1519b32d61ff882cf24a9eda900961acad Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Nov 2025 11:15:56 -0800 Subject: [PATCH 64/68] io_uring/kbuf: remove obsolete buf_nr_pages and update comments The buf_nr_pages field in io_buffer_list was previously used to determine whether the buffer list uses ring-provided buffers or classic provided buffers. This is now determined by checking the IOBL_BUF_RING flag. Remove the buf_nr_pages field and update related comments. Signed-off-by: Joanne Koong Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 4 ++-- io_uring/kbuf.h | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 92780764d5fa..e1adb0d20a0a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -327,8 +327,8 @@ struct io_ring_ctx { /* * Modifications are protected by ->uring_lock and ->mmap_lock. - * The flags, buf_pages and buf_nr_pages fields should be stable - * once published. + * The buffer list's io mapped region should be stable once + * published. */ struct xarray io_bl_xa; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index ada382ff38d7..bf15e26520d3 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -14,8 +14,8 @@ enum { struct io_buffer_list { /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. + * If the IOBL_BUF_RING flag is set, then buf_ring is used. If not, then + * these are classic provided buffers and ->buf_list is used. */ union { struct list_head buf_list; @@ -27,7 +27,6 @@ struct io_buffer_list { __u16 bgid; /* below is for ring provided buffers */ - __u16 buf_nr_pages; __u16 nr_entries; __u16 head; __u16 mask; From 1e93de9205b4d5c0f06507e9e1c398574a07fb80 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 25 Nov 2025 17:59:34 -0700 Subject: [PATCH 65/68] io_uring/query: drop unused io_handle_query_entry() ctx arg io_handle_query_entry() doesn't use its struct io_ring_ctx *ctx argument. So remove it from the function and its callers. 
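For context, issuing a query from userspace might look like this sketch; the io_uring_query_hdr field names (next_entry, query_data, query_op, size, result) are assumptions based on linux/io_uring/query.h, which is only partially visible in these diffs, as is the nr_args == 0 convention:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring/query.h>

static int query_zcrx_caps(int ring_fd, struct io_uring_query_zcrx *zq)
{
	struct io_uring_query_hdr hdr = {
		.query_op = IO_URING_QUERY_ZCRX,
		.query_data = (__u64)(unsigned long)zq,
		.size = sizeof(*zq),
		/* .next_entry == 0 ends the chain of query entries */
	};

	memset(zq, 0, sizeof(*zq));
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_QUERY, &hdr, 0) < 0)
		return -1;
	return hdr.result;	/* assumed per-entry status */
}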
Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/query.c | 7 +++---- io_uring/query.h | 2 +- io_uring/register.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/io_uring/query.c b/io_uring/query.c index e61b6221f87f..abdd6f3e1223 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -53,8 +53,7 @@ static ssize_t io_query_scq(union io_query_data *data) return sizeof(*e); } -static int io_handle_query_entry(struct io_ring_ctx *ctx, - union io_query_data *data, void __user *uhdr, +static int io_handle_query_entry(union io_query_data *data, void __user *uhdr, u64 *next_entry) { struct io_uring_query_hdr hdr; @@ -107,7 +106,7 @@ out: return 0; } -int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +int io_query(void __user *arg, unsigned nr_args) { union io_query_data entry_buffer; void __user *uhdr = arg; @@ -121,7 +120,7 @@ int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) while (uhdr) { u64 next_hdr; - ret = io_handle_query_entry(ctx, &entry_buffer, uhdr, &next_hdr); + ret = io_handle_query_entry(&entry_buffer, uhdr, &next_hdr); if (ret) return ret; uhdr = u64_to_user_ptr(next_hdr); diff --git a/io_uring/query.h b/io_uring/query.h index 171d47ccaaba..b35eb52f0ea8 100644 --- a/io_uring/query.h +++ b/io_uring/query.h @@ -4,6 +4,6 @@ #include -int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); +int io_query(void __user *arg, unsigned nr_args); #endif diff --git a/io_uring/register.c b/io_uring/register.c index db42f98562c4..62d39b3ff317 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -813,7 +813,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_register_mem_region(ctx, arg); break; case IORING_REGISTER_QUERY: - ret = io_query(ctx, arg, nr_args); + ret = io_query(arg, nr_args); break; case IORING_REGISTER_ZCRX_CTRL: ret = io_zcrx_ctrl(ctx, arg, nr_args); @@ -888,7 +888,7 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, case IORING_REGISTER_SEND_MSG_RING: return io_uring_register_send_msg_ring(arg, nr_args); case IORING_REGISTER_QUERY: - return io_query(NULL, arg, nr_args); + return io_query(arg, nr_args); } return -EINVAL; } From 4677e78800bbde62a9edce0eb3b40c775ec55e0d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 25 Nov 2025 16:17:59 -0500 Subject: [PATCH 66/68] socket: Unify getsockname and getpeername implementation They are already implemented by the same get_name hook at the protocol level. Bring the unification one level up to reduce code duplication in preparation for supporting these as io_uring operations. 
Reviewed-by: Kuniyuki Iwashima Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/socket.h | 4 +-- net/compat.c | 4 +-- net/socket.c | 55 ++++++++++-------------------------------- 3 files changed, 16 insertions(+), 47 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 3b262487ec06..937fe331ff1e 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -454,9 +454,7 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len); -extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len); + int __user *usockaddr_len, int peer); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); diff --git a/net/compat.c b/net/compat.c index 485db8ee9b28..2c9bd0edac99 100644 --- a/net/compat.c +++ b/net/compat.c @@ -460,10 +460,10 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETSOCKNAME: - ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETPEERNAME: - ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1); break; case SYS_SOCKETPAIR: ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); diff --git a/net/socket.c b/net/socket.c index e8892b218708..208d92ccf0fb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2128,12 +2128,11 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, } /* - * Get the local address ('name') of a socket object. Move the obtained - * name to user space. + * Get the remote or local address ('name') of a socket object. Move the + * obtained name to user space. */ - int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len) + int __user *usockaddr_len, int peer) { struct socket *sock; struct sockaddr_storage address; @@ -2146,11 +2145,14 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, if (unlikely(!sock)) return -ENOTSOCK; - err = security_socket_getsockname(sock); + if (peer) + err = security_socket_getpeername(sock); + else + err = security_socket_getsockname(sock); if (err) return err; - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); if (err < 0) return err; @@ -2161,44 +2163,13 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { - return __sys_getsockname(fd, usockaddr, usockaddr_len); -} - -/* - * Get the remote address ('name') of a socket object. Move the obtained - * name to user space. 
- */ -int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len) -{ - struct socket *sock; - struct sockaddr_storage address; - CLASS(fd, f)(fd); - int err; - - if (fd_empty(f)) - return -EBADF; - sock = sock_from_file(fd_file(f)); - if (unlikely(!sock)) - return -ENOTSOCK; - - err = security_socket_getpeername(sock); - if (err) - return err; - - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1); - if (err < 0) - return err; - - /* "err" is actually length in this case */ - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); + return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { - return __sys_getpeername(fd, usockaddr, usockaddr_len); + return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); } /* @@ -3162,12 +3133,12 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + (int __user *)a[2], 0); break; case SYS_GETPEERNAME: err = - __sys_getpeername(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + __sys_getsockname(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 1); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); From d73c1677087391379441c0bb444c7fb4238fc6e7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 25 Nov 2025 16:18:00 -0500 Subject: [PATCH 67/68] socket: Split out a getsockname helper for io_uring Similar to getsockopt, split out from the main handler a helper that checks security and issues the operation, which can then be used by io_uring. Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Kuniyuki Iwashima Signed-off-by: Jens Axboe --- include/linux/socket.h | 2 ++ net/socket.c | 36 ++++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 937fe331ff1e..8d580074ddea 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -453,6 +453,8 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); +extern int do_getsockname(struct socket *sock, int peer, + struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len, int peer); extern int __sys_socketpair(int family, int type, int protocol, diff --git a/net/socket.c b/net/socket.c index 208d92ccf0fb..89bac0a17e5a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2127,6 +2127,25 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, return __sys_connect(fd, uservaddr, addrlen); } +int do_getsockname(struct socket *sock, int peer, + struct sockaddr __user *usockaddr, int __user *usockaddr_len) +{ + struct sockaddr_storage address; + int err; + + if (peer) + err = security_socket_getpeername(sock); + else + err = security_socket_getsockname(sock); + if (err) + return err; + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); + if (err < 0) + return err; + /* "err" is actually length in this case */ + return move_addr_to_user(&address, err, usockaddr, usockaddr_len); +} + /* * Get the remote or local address ('name') of a socket object. Move the * obtained name to user space. 
@@ -2135,29 +2154,14 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len, int peer) { struct socket *sock; - struct sockaddr_storage address; CLASS(fd, f)(fd); - int err; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; - - if (peer) - err = security_socket_getpeername(sock); - else - err = security_socket_getsockname(sock); - if (err) - return err; - - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); - if (err < 0) - return err; - - /* "err" is actually length in this case */ - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); + return do_getsockname(sock, peer, usockaddr, usockaddr_len); } SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, From 5d24321e4c159088604512d7a5c5cf634d23e01a Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 25 Nov 2025 16:18:01 -0500 Subject: [PATCH 68/68] io_uring: Introduce getsockname io_uring cmd Introduce a socket-specific io_uring_cmd to support getsockname/getpeername via io_uring. I made this an io_uring_cmd instead of a new operation to avoid polluting the command namespace with what is exclusively a socket operation. In addition, since we don't need to conform to existing interfaces, this merges getsockname/getpeername into a single operation, as the implementation is pretty much the same. This has been frequently requested, for instance at [1] and more recently in the project Discord channel. The main use-case is to support fixed socket file descriptors. [1] https://github.com/axboe/liburing/issues/1356 Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/cmd_net.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index deb772222b6d..b5b23c0d5283 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1009,6 +1009,7 @@ enum io_uring_socket_op { SOCKET_URING_OP_GETSOCKOPT, SOCKET_URING_OP_SETSOCKOPT, SOCKET_URING_OP_TX_TIMESTAMP, + SOCKET_URING_OP_GETSOCKNAME, }; /* diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 27a09aa4c9d0..5d11caf5509c 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -132,6 +132,26 @@ static int io_uring_cmd_timestamp(struct socket *sock, return -EAGAIN; } +static int io_uring_cmd_getsockname(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct sockaddr __user *uaddr; + unsigned int peer; + int __user *ulen; + + if (sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ulen = u64_to_user_ptr(sqe->addr3); + peer = READ_ONCE(sqe->optlen); + if (peer > 1) + return -EINVAL; + return do_getsockname(sock, peer, uaddr, ulen); +} + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; @@ -159,6 +179,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) return io_uring_cmd_setsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_TX_TIMESTAMP: return io_uring_cmd_timestamp(sock, cmd, issue_flags); + case SOCKET_URING_OP_GETSOCKNAME: + return io_uring_cmd_getsockname(sock, cmd, issue_flags); default: return -EOPNOTSUPP; }
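As a closing usage sketch (assuming liburing; the SQE mapping mirrors io_uring_cmd_getsockname() above: addr carries the sockaddr buffer, addr3 the length pointer, and optlen selects local vs peer):

#include <liburing.h>
#include <sys/socket.h>

static void prep_getsockname(struct io_uring_sqe *sqe, int sockfd,
			     struct sockaddr_storage *ss, int *len, int peer)
{
	*len = sizeof(*ss);	/* in/out length, as with getsockname(2) */
	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, sockfd, NULL, 0, 0);
	sqe->cmd_op = SOCKET_URING_OP_GETSOCKNAME;
	sqe->addr = (__u64)(unsigned long)ss;
	sqe->addr3 = (__u64)(unsigned long)len;
	sqe->optlen = peer;	/* 0 = local name, 1 = peer; > 1 is -EINVAL */
}

Combined with IOSQE_FIXED_FILE on the SQE, this also covers the fixed socket file descriptor use-case mentioned above.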