2022-06-13 13:12:45 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/nospec.h>
|
|
|
|
#include <linux/hugetlb.h>
|
|
|
|
#include <linux/compat.h>
|
|
|
|
#include <linux/io_uring.h>
|
2025-02-27 22:39:14 +00:00
|
|
|
#include <linux/io_uring/cmd.h>
|
2022-06-13 13:12:45 +00:00
|
|
|
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
|
|
|
|
|
|
#include "io_uring.h"
|
|
|
|
#include "openclose.h"
|
|
|
|
#include "rsrc.h"
|
2024-03-27 20:59:09 +00:00
|
|
|
#include "memmap.h"
|
io_uring: add IORING_REGISTER_COPY_BUFFERS method
Buffers can get registered with io_uring, which allows to skip the
repeated pin_pages, unpin/unref pages for each O_DIRECT operation. This
reduces the overhead of O_DIRECT IO.
However, registrering buffers can take some time. Normally this isn't an
issue as it's done at initialization time (and hence less critical), but
for cases where rings can be created and destroyed as part of an IO
thread pool, registering the same buffers for multiple rings become a
more time sensitive proposition. As an example, let's say an application
has an IO memory pool of 500G. Initial registration takes:
Got 500 huge pages (each 1024MB)
Registered 500 pages in 409 msec
or about 0.4 seconds. If we go higher to 900 1GB huge pages being
registered:
Registered 900 pages in 738 msec
which is, as expected, a fully linear scaling.
Rather than have each ring pin/map/register the same buffer pool,
provide an io_uring_register(2) opcode to simply duplicate the buffers
that are registered with another ring. Adding the same 900GB of
registered buffers to the target ring can then be accomplished in:
Copied 900 pages in 17 usec
While timing differs a bit, this provides around a 25,000-40,000x
speedup for this use case.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11 19:56:08 +00:00
|
|
|
#include "register.h"
|
2022-06-13 13:12:45 +00:00
|
|
|
|
|
|
|
/*
 * Per-request state for a resource (file/buffer) update operation,
 * parsed from the SQE at prep time.
 */
struct io_rsrc_update {
	struct file *file;
	u64 arg;	/* user pointer to the update payload (fds or tags) */
	u32 nr_args;	/* number of entries to update */
	u32 offset;	/* first slot in the resource table to update */
};
|
|
|
|
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
|
|
|
struct iovec *iov, struct page **last_hpage);
|
2022-06-13 13:12:45 +00:00
|
|
|
|
|
|
|
/* only define max */
|
|
|
|
#define IORING_MAX_FIXED_FILES (1U << 20)
|
|
|
|
#define IORING_MAX_REG_BUFFERS (1U << 14)
|
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
#define IO_CACHED_BVECS_SEGS 32
|
|
|
|
|
2022-07-25 09:52:05 +00:00
|
|
|
/*
 * Charge @nr_pages of pinned memory against @user's RLIMIT_MEMLOCK budget.
 *
 * Returns 0 on success, -ENOMEM if the charge would exceed the limit.
 * The charge is applied with a lockless cmpxchg loop so concurrent
 * callers never over- or under-account.
 */
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
		/* retry if someone else changed locked_vm under us */
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}
|
|
|
|
|
|
|
|
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
|
|
|
|
{
|
|
|
|
if (ctx->user)
|
|
|
|
__io_unaccount_mem(ctx->user, nr_pages);
|
|
|
|
|
|
|
|
if (ctx->mm_account)
|
|
|
|
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (ctx->user) {
|
|
|
|
ret = __io_account_mem(ctx->user, nr_pages);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ctx->mm_account)
|
|
|
|
atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2025-05-01 12:17:14 +00:00
|
|
|
/*
 * Sanity check a user buffer range before registration: reject empty or
 * oversized lengths, and make sure the page-aligned range doesn't wrap
 * around the address space.
 *
 * Returns 0 if the range is usable, -EFAULT or -EOVERFLOW otherwise.
 */
int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long start = (unsigned long)uaddr;
	unsigned long aligned_len = (unsigned long)PAGE_ALIGN(ulen);
	unsigned long end;

	/* arbitrary limit, but we need something */
	if (!ulen || ulen > SZ_1G)
		return -EFAULT;
	/* reject ranges whose page-aligned end wraps the address space */
	if (check_add_overflow(start, aligned_len, &end))
		return -EOVERFLOW;
	return 0;
}
|
|
|
|
|
|
|
|
static int io_buffer_validate(struct iovec *iov)
|
|
|
|
{
|
2022-06-13 13:12:45 +00:00
|
|
|
/*
|
|
|
|
* Don't impose further limits on the size and buffer
|
|
|
|
* constraints here, we'll -EINVAL later when IO is
|
|
|
|
* submitted if they are wrong.
|
|
|
|
*/
|
|
|
|
if (!iov->iov_base)
|
|
|
|
return iov->iov_len ? -EFAULT : 0;
|
|
|
|
|
2025-05-01 12:17:14 +00:00
|
|
|
return io_validate_user_buf_range((unsigned long)iov->iov_base,
|
|
|
|
iov->iov_len);
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
2025-02-27 22:39:14 +00:00
|
|
|
static void io_release_ubuf(void *priv)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
2025-02-27 22:39:14 +00:00
|
|
|
struct io_mapped_ubuf *imu = priv;
|
2022-06-13 13:12:45 +00:00
|
|
|
unsigned int i;
|
|
|
|
|
2025-02-24 21:31:06 +00:00
|
|
|
for (i = 0; i < imu->nr_bvecs; i++)
|
|
|
|
unpin_user_page(imu->bvec[i].bv_page);
|
2025-02-27 22:39:14 +00:00
|
|
|
}
|
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
|
|
|
|
int nr_bvecs)
|
|
|
|
{
|
|
|
|
if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
|
|
|
|
return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
|
|
|
|
return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
|
|
|
|
GFP_KERNEL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
|
|
|
|
{
|
2025-03-04 19:48:12 +00:00
|
|
|
if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
|
|
|
|
io_cache_free(&ctx->imu_cache, imu);
|
|
|
|
else
|
2025-02-27 22:39:16 +00:00
|
|
|
kvfree(imu);
|
|
|
|
}
|
|
|
|
|
2025-02-27 22:39:14 +00:00
|
|
|
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
|
|
|
|
{
|
|
|
|
if (!refcount_dec_and_test(&imu->refs))
|
|
|
|
return;
|
|
|
|
|
2025-02-24 21:31:06 +00:00
|
|
|
if (imu->acct_pages)
|
|
|
|
io_unaccount_mem(ctx, imu->acct_pages);
|
2025-02-27 22:39:14 +00:00
|
|
|
imu->release(imu->priv);
|
2025-02-27 22:39:16 +00:00
|
|
|
io_free_imu(ctx, imu);
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
|
2023-04-04 12:39:52 +00:00
|
|
|
{
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
struct io_rsrc_node *node;
|
2022-06-13 13:12:45 +00:00
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
|
2024-10-26 12:43:44 +00:00
|
|
|
if (node) {
|
2024-11-07 11:01:35 +00:00
|
|
|
node->type = type;
|
2024-10-26 12:43:44 +00:00
|
|
|
node->refs = 1;
|
2025-02-27 22:39:16 +00:00
|
|
|
node->tag = 0;
|
|
|
|
node->file_ptr = 0;
|
2023-04-04 12:39:54 +00:00
|
|
|
}
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
return node;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
|
|
|
|
{
|
|
|
|
const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
|
|
|
|
IO_CACHED_BVECS_SEGS);
|
|
|
|
const int node_size = sizeof(struct io_rsrc_node);
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
|
|
|
|
node_size, 0);
|
|
|
|
ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
|
|
|
|
imu_cache_size, 0);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void io_rsrc_cache_free(struct io_ring_ctx *ctx)
|
|
|
|
{
|
|
|
|
io_alloc_cache_free(&ctx->node_cache, kfree);
|
|
|
|
io_alloc_cache_free(&ctx->imu_cache, kfree);
|
|
|
|
}
|
|
|
|
|
2025-04-04 14:46:34 +00:00
|
|
|
static void io_clear_table_tags(struct io_rsrc_data *data)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < data->nr; i++) {
|
|
|
|
struct io_rsrc_node *node = data->nodes[i];
|
|
|
|
|
|
|
|
if (node)
|
|
|
|
node->tag = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
|
|
|
|
struct io_rsrc_data *data)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
2024-10-26 20:50:13 +00:00
|
|
|
if (!data->nr)
|
|
|
|
return;
|
|
|
|
while (data->nr--) {
|
|
|
|
if (data->nodes[data->nr])
|
2024-11-07 11:01:34 +00:00
|
|
|
io_put_rsrc_node(ctx, data->nodes[data->nr]);
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
kvfree(data->nodes);
|
2024-10-26 20:50:13 +00:00
|
|
|
data->nodes = NULL;
|
|
|
|
data->nr = 0;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
2024-10-26 20:50:13 +00:00
|
|
|
__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
|
2024-10-26 20:50:13 +00:00
|
|
|
GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
|
|
|
if (data->nodes) {
|
|
|
|
data->nr = nr;
|
|
|
|
return 0;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
2024-10-26 20:50:13 +00:00
|
|
|
return -ENOMEM;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
|
|
|
|
struct io_uring_rsrc_update2 *up,
|
|
|
|
unsigned nr_args)
|
|
|
|
{
|
|
|
|
u64 __user *tags = u64_to_user_ptr(up->tags);
|
|
|
|
__s32 __user *fds = u64_to_user_ptr(up->data);
|
|
|
|
int fd, i, err = 0;
|
|
|
|
unsigned int done;
|
|
|
|
|
2024-10-26 20:50:13 +00:00
|
|
|
if (!ctx->file_table.data.nr)
|
2022-06-13 13:12:45 +00:00
|
|
|
return -ENXIO;
|
2024-10-26 20:50:13 +00:00
|
|
|
if (up->offset + nr_args > ctx->file_table.data.nr)
|
2022-06-13 13:12:45 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
for (done = 0; done < nr_args; done++) {
|
|
|
|
u64 tag = 0;
|
|
|
|
|
|
|
|
if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
|
|
|
|
copy_from_user(&fd, &fds[done], sizeof(fd))) {
|
|
|
|
err = -EFAULT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
|
|
|
|
err = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (fd == IORING_REGISTER_FILES_SKIP)
|
|
|
|
continue;
|
|
|
|
|
2024-10-27 15:08:31 +00:00
|
|
|
i = up->offset + done;
|
2024-11-07 11:01:34 +00:00
|
|
|
if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
|
2022-06-13 13:12:45 +00:00
|
|
|
io_file_bitmap_clear(&ctx->file_table, i);
|
2024-10-29 15:02:38 +00:00
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
if (fd != -1) {
|
2023-06-20 11:32:35 +00:00
|
|
|
struct file *file = fget(fd);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
struct io_rsrc_node *node;
|
2023-06-20 11:32:35 +00:00
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
if (!file) {
|
|
|
|
err = -EBADF;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/*
|
2023-12-19 19:36:34 +00:00
|
|
|
* Don't allow io_uring instances to be registered.
|
2022-06-13 13:12:45 +00:00
|
|
|
*/
|
|
|
|
if (io_is_uring_fops(file)) {
|
|
|
|
fput(file);
|
|
|
|
err = -EBADF;
|
|
|
|
break;
|
|
|
|
}
|
2025-02-27 22:39:16 +00:00
|
|
|
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion are both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
if (!node) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
fput(file);
|
|
|
|
break;
|
|
|
|
}
|
2024-10-26 20:50:13 +00:00
|
|
|
ctx->file_table.data.nodes[i] = node;
|
2024-10-26 16:41:51 +00:00
|
|
|
if (tag)
|
|
|
|
node->tag = tag;
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
io_fixed_file_set(node, file);
|
2022-06-13 13:12:45 +00:00
|
|
|
io_file_bitmap_set(&ctx->file_table, i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return done ? done : err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Update already-registered buffers, starting at slot up->offset, with
 * nr_args replacement buffers described by an array of iovecs at
 * up->data. Optional per-slot tags are read from up->tags and stored in
 * the new resource node.
 *
 * Returns the number of slots successfully updated, or a negative error
 * code if no slot could be updated at all (partial success reports the
 * count, not the error).
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	/* no buffer table registered at all */
	if (!ctx->buf_table.nr)
		return -ENXIO;
	/* the updated range must fit within the existing table */
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		/* import exactly one iovec from user memory */
		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			/*
			 * NOTE(review): a NULL (non-error) node presumably
			 * represents an empty/sparse slot; attaching a tag
			 * to such a slot is rejected.
			 */
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		/* clamp the index against speculative out-of-bounds access */
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		/* release whatever node currently occupies this slot */
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		/* compat tasks pass 32-bit iovecs, so the stride differs */
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}
|
|
|
|
|
|
|
|
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
|
|
|
|
struct io_uring_rsrc_update2 *up,
|
|
|
|
unsigned nr_args)
|
|
|
|
{
|
|
|
|
__u32 tmp;
|
|
|
|
|
2023-04-11 11:06:04 +00:00
|
|
|
lockdep_assert_held(&ctx->uring_lock);
|
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
if (check_add_overflow(up->offset, nr_args, &tmp))
|
|
|
|
return -EOVERFLOW;
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case IORING_RSRC_FILE:
|
|
|
|
return __io_sqe_files_update(ctx, up, nr_args);
|
|
|
|
case IORING_RSRC_BUFFER:
|
|
|
|
return __io_sqe_buffers_update(ctx, up, nr_args);
|
|
|
|
}
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Legacy file-update registration interface: userspace passes the
 * original, smaller struct io_uring_rsrc_update, which is widened into
 * a zeroed struct io_uring_rsrc_update2 before dispatch.
 */
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up2;

	if (!nr_args)
		return -EINVAL;

	/* only the legacy prefix comes from userspace; zero the rest */
	memset(&up2, 0, sizeof(up2));
	if (copy_from_user(&up2, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up2.resv || up2.resv2)
		return -EINVAL;

	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, nr_args);
}
|
|
|
|
|
|
|
|
/*
 * Update interface taking the full struct io_uring_rsrc_update2.
 * The size must match exactly so the ABI stays extendible.
 */
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 update;

	if (size != sizeof(update))
		return -EINVAL;
	if (copy_from_user(&update, arg, sizeof(update)))
		return -EFAULT;
	/* a zero count or any set reserved field is invalid */
	if (!update.nr || update.resv || update.resv2)
		return -EINVAL;

	return __io_register_rsrc_update(ctx, type, &update, update.nr);
}
|
|
|
|
|
|
|
|
/*
 * Register a file or buffer table with the ring. With
 * IORING_RSRC_REGISTER_SPARSE set, an empty table of rr.nr slots is
 * requested, in which case no data pointer may be supplied.
 */
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;
	bool sparse_with_data;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	/* a sparse registration must not also provide data to copy in */
	sparse_with_data = (rr.flags & IORING_RSRC_REGISTER_SPARSE) && rr.data;
	if (sparse_with_data)
		return -EINVAL;

	if (type == IORING_RSRC_FILE)
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	if (type == IORING_RSRC_BUFFER)
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));

	return -EINVAL;
}
|
|
|
|
|
2022-09-01 10:54:02 +00:00
|
|
|
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
|
2022-06-13 13:12:45 +00:00
|
|
|
|
|
|
|
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
|
|
|
|
return -EINVAL;
|
|
|
|
if (sqe->rw_flags || sqe->splice_fd_in)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
up->offset = READ_ONCE(sqe->off);
|
|
|
|
up->nr_args = READ_ONCE(sqe->len);
|
|
|
|
if (!up->nr_args)
|
|
|
|
return -EINVAL;
|
|
|
|
up->arg = READ_ONCE(sqe->addr);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * IORING_OP_FILES_UPDATE with offset == IORING_FILE_INDEX_ALLOC:
 * install each fd from the user array into a kernel-chosen free slot of
 * the fixed file table, writing the allocated slot index back to the
 * same user array entry.
 *
 * Returns the number of fds installed, or a negative error code if none
 * were installed (partial success reports the count, not the error).
 */
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	/* no fixed file table registered */
	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		/* on success, ret is the slot index that was allocated */
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			/*
			 * Couldn't report the slot back to userspace; undo
			 * the install so the slot isn't silently leaked.
			 */
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}
|
|
|
|
|
2022-09-01 10:54:02 +00:00
|
|
|
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
|
2022-06-13 13:12:45 +00:00
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
struct io_uring_rsrc_update2 up2;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
up2.offset = up->offset;
|
|
|
|
up2.data = up->arg;
|
|
|
|
up2.nr = 0;
|
|
|
|
up2.tags = 0;
|
|
|
|
up2.resv = 0;
|
|
|
|
up2.resv2 = 0;
|
|
|
|
|
|
|
|
if (up->offset == IORING_FILE_INDEX_ALLOC) {
|
|
|
|
ret = io_files_update_with_index_alloc(req, issue_flags);
|
|
|
|
} else {
|
|
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
|
|
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
|
|
|
|
&up2, up->nr_args);
|
|
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
req_set_fail(req);
|
|
|
|
io_req_set_res(req, ret, 0);
|
2025-05-08 20:48:33 +00:00
|
|
|
return IOU_COMPLETE;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
2024-11-07 11:01:34 +00:00
|
|
|
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
if (node->tag)
|
2024-11-03 15:17:28 +00:00
|
|
|
io_post_aux_cqe(ctx, node->tag, 0, 0);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
|
2024-11-07 11:01:35 +00:00
|
|
|
switch (node->type) {
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
case IORING_RSRC_FILE:
|
2025-02-28 23:59:14 +00:00
|
|
|
fput(io_slot_file(node));
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
break;
|
|
|
|
case IORING_RSRC_BUFFER:
|
2025-02-28 23:59:14 +00:00
|
|
|
io_buffer_unmap(ctx, node->buf);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
break;
|
2023-04-04 12:39:52 +00:00
|
|
|
}
|
2022-06-13 13:12:45 +00:00
|
|
|
|
2025-03-04 19:48:12 +00:00
|
|
|
io_cache_free(&ctx->node_cache, node);
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tear down the ring's registered (fixed) file table.
 *
 * Returns -ENXIO if no file table is currently registered, 0 on success.
 * Freeing the table drops the table's reference on each file's resource
 * node; the underlying struct file is put when its node's refcount hits
 * zero (requests in flight may still hold node references).
 */
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	/* data.nr == 0 means no fixed file table was ever registered */
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	/* reset the fixed-file alloc hint range to empty (no table to allocate from) */
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}
|
|
|
|
|
|
|
|
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
|
|
|
|
unsigned nr_args, u64 __user *tags)
|
|
|
|
{
|
|
|
|
__s32 __user *fds = (__s32 __user *) arg;
|
|
|
|
struct file *file;
|
|
|
|
int fd, ret;
|
|
|
|
unsigned i;
|
|
|
|
|
2024-10-26 20:50:13 +00:00
|
|
|
if (ctx->file_table.data.nr)
|
2022-06-13 13:12:45 +00:00
|
|
|
return -EBUSY;
|
|
|
|
if (!nr_args)
|
|
|
|
return -EINVAL;
|
|
|
|
if (nr_args > IORING_MAX_FIXED_FILES)
|
|
|
|
return -EMFILE;
|
|
|
|
if (nr_args > rlimit(RLIMIT_NOFILE))
|
|
|
|
return -EMFILE;
|
2024-11-07 11:01:34 +00:00
|
|
|
if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
|
2022-06-13 13:12:45 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2024-10-26 20:50:13 +00:00
|
|
|
for (i = 0; i < nr_args; i++) {
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
struct io_rsrc_node *node;
|
|
|
|
u64 tag = 0;
|
2022-06-13 13:12:45 +00:00
|
|
|
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
ret = -EFAULT;
|
|
|
|
if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
|
|
|
|
goto fail;
|
|
|
|
if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
|
2022-06-13 13:12:45 +00:00
|
|
|
goto fail;
|
|
|
|
/* allow sparse sets */
|
|
|
|
if (!fds || fd == -1) {
|
|
|
|
ret = -EINVAL;
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
if (tag)
|
2022-06-13 13:12:45 +00:00
|
|
|
goto fail;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
file = fget(fd);
|
|
|
|
ret = -EBADF;
|
|
|
|
if (unlikely(!file))
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
/*
|
2023-12-19 19:36:34 +00:00
|
|
|
* Don't allow io_uring instances to be registered.
|
2022-06-13 13:12:45 +00:00
|
|
|
*/
|
|
|
|
if (io_is_uring_fops(file)) {
|
|
|
|
fput(file);
|
|
|
|
goto fail;
|
|
|
|
}
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
ret = -ENOMEM;
|
2025-02-27 22:39:16 +00:00
|
|
|
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
if (!node) {
|
|
|
|
fput(file);
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
if (tag)
|
|
|
|
node->tag = tag;
|
2024-10-26 20:50:13 +00:00
|
|
|
ctx->file_table.data.nodes[i] = node;
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
io_fixed_file_set(node, file);
|
2022-06-13 13:12:45 +00:00
|
|
|
io_file_bitmap_set(&ctx->file_table, i);
|
|
|
|
}
|
|
|
|
|
2022-06-25 10:55:38 +00:00
|
|
|
/* default it to the whole table */
|
2024-10-26 20:50:13 +00:00
|
|
|
io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
|
2022-06-13 13:12:45 +00:00
|
|
|
return 0;
|
|
|
|
fail:
|
2025-04-04 14:46:34 +00:00
|
|
|
io_clear_table_tags(&ctx->file_table.data);
|
2024-10-26 20:50:13 +00:00
|
|
|
io_sqe_files_unregister(ctx);
|
2022-06-13 13:12:45 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
|
|
|
|
{
|
2024-10-26 20:50:13 +00:00
|
|
|
if (!ctx->buf_table.nr)
|
2022-06-13 13:12:45 +00:00
|
|
|
return -ENXIO;
|
2024-11-07 11:01:34 +00:00
|
|
|
io_rsrc_data_free(ctx, &ctx->buf_table);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
return 0;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Not super efficient, but this is just a registration time. And we do cache
|
|
|
|
* the last compound head, so generally we'll only do a full search if we don't
|
|
|
|
* match that one.
|
|
|
|
*
|
|
|
|
* We check if the given compound head page has already been accounted, to
|
|
|
|
* avoid double accounting it. This allows us to account the full size of the
|
|
|
|
* page, not just the constituent pages of a huge page.
|
|
|
|
*/
|
|
|
|
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
|
|
|
|
int nr_pages, struct page *hpage)
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
/* check current page array */
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
if (!PageCompound(pages[i]))
|
|
|
|
continue;
|
|
|
|
if (compound_head(pages[i]) == hpage)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check previously registered pages */
|
2024-10-26 20:50:13 +00:00
|
|
|
for (i = 0; i < ctx->buf_table.nr; i++) {
|
|
|
|
struct io_rsrc_node *node = ctx->buf_table.nodes[i];
|
2024-10-30 15:51:58 +00:00
|
|
|
struct io_mapped_ubuf *imu;
|
2022-06-13 13:12:45 +00:00
|
|
|
|
2024-10-30 15:51:58 +00:00
|
|
|
if (!node)
|
|
|
|
continue;
|
|
|
|
imu = node->buf;
|
2022-06-13 13:12:45 +00:00
|
|
|
for (j = 0; j < imu->nr_bvecs; j++) {
|
|
|
|
if (!PageCompound(imu->bvec[j].bv_page))
|
|
|
|
continue;
|
|
|
|
if (compound_head(imu->bvec[j].bv_page) == hpage)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
|
|
|
|
int nr_pages, struct io_mapped_ubuf *imu,
|
|
|
|
struct page **last_hpage)
|
|
|
|
{
|
|
|
|
int i, ret;
|
|
|
|
|
|
|
|
imu->acct_pages = 0;
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
if (!PageCompound(pages[i])) {
|
|
|
|
imu->acct_pages++;
|
|
|
|
} else {
|
|
|
|
struct page *hpage;
|
|
|
|
|
|
|
|
hpage = compound_head(pages[i]);
|
|
|
|
if (hpage == *last_hpage)
|
|
|
|
continue;
|
|
|
|
*last_hpage = hpage;
|
|
|
|
if (headpage_already_acct(ctx, pages, i, hpage))
|
|
|
|
continue;
|
|
|
|
imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!imu->acct_pages)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
ret = io_account_mem(ctx, imu->acct_pages);
|
|
|
|
if (ret)
|
|
|
|
imu->acct_pages = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:23 +00:00
|
|
|
/*
 * Shrink the pinned page array down to one entry per folio, so the buffer
 * can be described with one bvec per folio instead of one per PAGE_SIZE
 * page. All but one pin reference on each folio is dropped; the remaining
 * single pin keeps the whole folio resident.
 *
 * On success, *pages is replaced with a freshly allocated head-page array
 * (the old array is freed) and *nr_pages becomes data->nr_folios; returns
 * true. Returns false, leaving *pages/*nr_pages untouched, if the smaller
 * array cannot be allocated.
 *
 * Caller must have validated the layout via io_check_coalesce_buffer(),
 * which fills @data.
 */
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		/*
		 * Past the first folio, every folio must start at its head
		 * page, or the pre-check was violated.
		 */
		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		/* Head folio may be partial; mid/tail folios span nr_pages_mid. */
		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	/* All original pages must have been consumed exactly. */
	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}
|
|
|
|
|
2024-11-29 13:34:23 +00:00
|
|
|
/*
 * Decide whether a pinned page array can be coalesced to one entry per
 * folio. Returns true and fills @data (folio shift, per-folio page counts,
 * folio count) when every page is contiguous within its folio, every folio
 * after the first starts at page index 0, and all folios share the same
 * size — only the first folio may contribute fewer pages (a partial head)
 * and the last may be cut short by the mapping length.
 *
 * Returns false as soon as any folio violates that layout; @data may be
 * partially written in that case and must not be used.
 */
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	/* First folio sets the expected folio geometry. */
	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		/* Still walking consecutive pages of the current folio? */
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		/* Folio boundary: validate the folio we just finished. */
		if (nr_folios == 1) {
			/* Head folio must end on its last page to abut the next. */
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			/* Mid folios must be fully covered. */
			return false;
		}

		/* Next folio must match the size and start at page 0. */
		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	/* Single-folio buffer: the whole run is the "head" count. */
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
|
|
|
|
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
|
|
|
struct iovec *iov,
|
|
|
|
struct page **last_hpage)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
|
|
|
struct io_mapped_ubuf *imu = NULL;
|
|
|
|
struct page **pages = NULL;
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
struct io_rsrc_node *node;
|
2022-06-13 13:12:45 +00:00
|
|
|
unsigned long off;
|
|
|
|
size_t size;
|
|
|
|
int ret, nr_pages, i;
|
2024-07-31 09:01:33 +00:00
|
|
|
struct io_imu_folio_data data;
|
2024-11-29 13:34:23 +00:00
|
|
|
bool coalesced = false;
|
2022-06-13 13:12:45 +00:00
|
|
|
|
2024-10-26 20:50:13 +00:00
|
|
|
if (!iov->iov_base)
|
2024-10-30 15:51:58 +00:00
|
|
|
return NULL;
|
2024-10-26 20:50:13 +00:00
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
if (!node)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
ret = -ENOMEM;
|
|
|
|
pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
|
|
|
|
&nr_pages);
|
|
|
|
if (IS_ERR(pages)) {
|
|
|
|
ret = PTR_ERR(pages);
|
|
|
|
pages = NULL;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
2024-07-31 09:01:33 +00:00
|
|
|
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
|
2024-12-08 21:46:01 +00:00
|
|
|
if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
|
|
|
|
if (data.nr_pages_mid != 1)
|
|
|
|
coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
|
|
|
|
}
|
2023-02-22 14:36:51 +00:00
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
imu = io_alloc_imu(ctx, nr_pages);
|
2022-06-13 13:12:45 +00:00
|
|
|
if (!imu)
|
|
|
|
goto done;
|
|
|
|
|
2025-02-27 22:39:16 +00:00
|
|
|
imu->nr_bvecs = nr_pages;
|
2022-06-13 13:12:45 +00:00
|
|
|
ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
|
|
|
|
if (ret) {
|
|
|
|
unpin_user_pages(pages, nr_pages);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
size = iov->iov_len;
|
2023-02-22 14:36:51 +00:00
|
|
|
/* store original address for later verification */
|
|
|
|
imu->ubuf = (unsigned long) iov->iov_base;
|
2024-09-15 14:53:45 +00:00
|
|
|
imu->len = iov->iov_len;
|
2024-07-31 09:01:32 +00:00
|
|
|
imu->folio_shift = PAGE_SHIFT;
|
2025-02-27 22:39:14 +00:00
|
|
|
imu->release = io_release_ubuf;
|
|
|
|
imu->priv = imu;
|
|
|
|
imu->is_kbuf = false;
|
|
|
|
imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
|
2024-09-15 14:51:20 +00:00
|
|
|
if (coalesced)
|
2024-07-31 09:01:33 +00:00
|
|
|
imu->folio_shift = data.folio_shift;
|
2024-09-11 19:54:32 +00:00
|
|
|
refcount_set(&imu->refs, 1);
|
2024-09-15 14:51:20 +00:00
|
|
|
off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
node->buf = imu;
|
2023-02-22 14:36:51 +00:00
|
|
|
ret = 0;
|
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
size_t vec_len;
|
|
|
|
|
2024-07-31 09:01:33 +00:00
|
|
|
vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
|
2023-02-03 15:06:29 +00:00
|
|
|
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
|
2022-06-13 13:12:45 +00:00
|
|
|
off = 0;
|
|
|
|
size -= vec_len;
|
|
|
|
}
|
|
|
|
done:
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
if (ret) {
|
2025-02-27 22:39:16 +00:00
|
|
|
if (imu)
|
|
|
|
io_free_imu(ctx, imu);
|
2025-03-04 19:48:12 +00:00
|
|
|
io_cache_free(&ctx->node_cache, node);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
node = ERR_PTR(ret);
|
|
|
|
}
|
2022-06-13 13:12:45 +00:00
|
|
|
kvfree(pages);
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
return node;
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Register an array of user buffers with the ring's buffer table.
 *
 * @arg points to @nr_args struct iovec entries (struct compat_iovec for
 * compat tasks); a NULL @arg registers a fully sparse table instead.
 * @tags, if non-NULL, supplies one u64 tag per slot; a non-zero tag may
 * only be attached to a non-sparse entry.
 *
 * Returns 0 on success or a negative error code. On failure, anything
 * registered so far is torn down again before returning.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	/* only one registered buffer table may exist at a time */
	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	/* no iovec array given: every slot uses this zeroed (sparse) iovec */
	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			/* copy in one (possibly compat) iovec and validate it */
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			/* a sparse (NULL) slot must not carry a tag */
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	/*
	 * Install the table even if registration failed part way, so the
	 * unregister path below can tear down whatever did get registered.
	 */
	ctx->buf_table = data;
	if (ret) {
		/*
		 * Tags are cleared before teardown — presumably so the
		 * failed registration doesn't post tag completions like a
		 * user-driven unregister would; confirm against callers.
		 */
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}
|
2022-06-20 00:25:59 +00:00
|
|
|
|
2025-02-27 22:39:14 +00:00
|
|
|
/*
 * Register the bio_vecs of block request @rq as a fixed kernel buffer at
 * slot @index of the ring's buffer table (driver API, e.g. for zero-copy
 * passthrough). @release is stored in the mapping and invoked when the
 * final reference to the buffer is dropped.
 *
 * The update is serialized against other table mutations via the ring
 * submit lock. Returns 0 on success, -EINVAL for a bad index, -EBUSY if
 * the slot is occupied, or -ENOMEM on allocation failure.
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	/* clamp speculation on the user-controlled index */
	index = array_index_nospec(index, data->nr);

	/* the target slot must be empty */
	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		/* node not yet published, plain kfree is sufficient */
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	/* kernel buffer: no user virtual address, no memory accounting */
	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	/* only permit I/O in the direction the request itself moves data */
	imu->dir = 1 << rq_data_dir(rq);

	/* snapshot the request's bvecs into the mapping */
	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
|
|
|
|
|
2025-02-28 23:14:31 +00:00
|
|
|
/*
 * Remove a kernel (bvec) buffer previously installed at slot @index via
 * io_buffer_register_bvec(). Only kernel buffers may be removed this way;
 * a user-registered buffer occupying the slot yields -EBUSY.
 *
 * Dropping the table's reference may trigger the mapping's ->release()
 * callback once the final reference to the buffer goes away.
 */
int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	/* clamp speculation on the user-controlled index */
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	/* refuse to unregister a user buffer through the kbuf API */
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	/* drop the table's reference and clear the slot */
	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
|
|
|
|
|
2025-03-25 13:51:50 +00:00
|
|
|
/*
 * Check that [buf_addr, buf_addr + len) is a sane range fully contained
 * inside the registered mapping @imu. Returns 0 if acceptable, -EFAULT
 * on arithmetic overflow, a range outside the mapping, or an oversized
 * length.
 */
static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 end;

	/* addr + len must not wrap around */
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &end)))
		return -EFAULT;
	/* range must start and finish inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf))
		return -EFAULT;
	if (unlikely(end > imu->ubuf + imu->len))
		return -EFAULT;
	/* cap a single import at the kernel-wide I/O size limit */
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}
|
|
|
|
|
2025-04-16 22:48:26 +00:00
|
|
|
/*
 * Set up @iter over a kernel-registered (bvec) buffer mapping @imu,
 * importing @len bytes starting @offset bytes into the mapping.
 * Always succeeds; the range was validated by the caller.
 */
static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *seg;
		size_t rem = len;

		/*
		 * The import covers only part of the mapping: shrink
		 * nr_segs to just the segments the range touches.
		 */
		for (seg = iter->bvec; rem > seg->bv_len; seg++)
			rem -= seg->bv_len;
		iter->nr_segs = seg - iter->bvec + 1;
	}
	return 0;
}
|
|
|
|
|
2025-03-25 13:51:50 +00:00
|
|
|
/*
 * Set up @iter over the registered buffer @imu for an I/O of @len bytes
 * at user address @buf_addr. Validates the range and the allowed data
 * direction, then builds a bvec iterator that starts at the right
 * segment/offset without walking the whole mapping.
 *
 * Returns 0 on success, -EFAULT on a bad range or disallowed direction.
 */
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	/* buffer may be restricted to one data direction (kbuf case) */
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/* byte offset of the I/O into the mapped region */
	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		/* remaining segments are folio-sized, so index directly */
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	/* number of folio-sized segments the range spans, rounded up */
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}
|
io_uring: add IORING_REGISTER_COPY_BUFFERS method
Buffers can get registered with io_uring, which allows to skip the
repeated pin_pages, unpin/unref pages for each O_DIRECT operation. This
reduces the overhead of O_DIRECT IO.
However, registrering buffers can take some time. Normally this isn't an
issue as it's done at initialization time (and hence less critical), but
for cases where rings can be created and destroyed as part of an IO
thread pool, registering the same buffers for multiple rings become a
more time sensitive proposition. As an example, let's say an application
has an IO memory pool of 500G. Initial registration takes:
Got 500 huge pages (each 1024MB)
Registered 500 pages in 409 msec
or about 0.4 seconds. If we go higher to 900 1GB huge pages being
registered:
Registered 900 pages in 738 msec
which is, as expected, a fully linear scaling.
Rather than have each ring pin/map/register the same buffer pool,
provide an io_uring_register(2) opcode to simply duplicate the buffers
that are registered with another ring. Adding the same 900GB of
registered buffers to the target ring can then be accomplished in:
Copied 900 pages in 17 usec
While timing differs a bit, this provides around a 25,000-40,000x
speedup for this use case.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11 19:56:08 +00:00
|
|
|
|
2025-03-01 00:16:07 +00:00
|
|
|
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
|
|
|
|
unsigned issue_flags)
|
2025-02-24 21:31:10 +00:00
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
struct io_rsrc_node *node;
|
|
|
|
|
|
|
|
if (req->flags & REQ_F_BUF_NODE)
|
|
|
|
return req->buf_node;
|
2025-04-16 19:25:03 +00:00
|
|
|
req->flags |= REQ_F_BUF_NODE;
|
2025-02-24 21:31:10 +00:00
|
|
|
|
|
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
|
|
node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
|
2025-04-16 19:25:03 +00:00
|
|
|
if (node) {
|
|
|
|
node->refs++;
|
|
|
|
req->buf_node = node;
|
|
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
req->flags &= ~REQ_F_BUF_NODE;
|
2025-02-24 21:31:10 +00:00
|
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
2025-04-16 19:25:03 +00:00
|
|
|
return NULL;
|
2025-02-24 21:31:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
|
|
|
|
u64 buf_addr, size_t len, int ddir,
|
|
|
|
unsigned issue_flags)
|
|
|
|
{
|
|
|
|
struct io_rsrc_node *node;
|
|
|
|
|
|
|
|
node = io_find_buf_node(req, issue_flags);
|
|
|
|
if (!node)
|
|
|
|
return -EFAULT;
|
|
|
|
return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
|
|
|
|
}
|
|
|
|
|
2025-01-15 20:26:03 +00:00
|
|
|
/* Lock two rings at once. The rings must be different! */
|
|
|
|
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
|
|
|
|
{
|
|
|
|
if (ctx1 > ctx2)
|
|
|
|
swap(ctx1, ctx2);
|
|
|
|
mutex_lock(&ctx1->uring_lock);
|
|
|
|
mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Clone (a range of) the registered buffers from @src_ctx into @ctx.
 *
 * Builds a new resource table in @data, referencing the shared
 * io_mapped_ubuf of each source node rather than re-pinning pages, then
 * installs it as ctx->buf_table. On any failure the partially built
 * table is freed and a negative errno is returned.
 *
 * Both rings are locked by the caller.
 */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	/* Default to cloning all source buffers; cap nr to what exists. */
	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	/* nbufs becomes the required size of the destination table. */
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;

	/* Table must hold both the cloned range and any existing dst slots. */
	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	/*
	 * NOTE(review): the checks below partially repeat the validation done
	 * before the allocation; they look redundant but are kept byte-for-byte
	 * as the goto targets differ (out_free vs direct return).
	 */
	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	/* Copy arg->nr nodes, starting at src_off in src, dst_off in data. */
	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			/* Sparse source slot: mirrored as a sparse dst slot. */
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			/* Share the mapped buffer; bump its refcount. */
			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data.nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	/* Drops references taken above on both copied dst and new nodes. */
	io_rsrc_data_free(ctx, &data);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	/* Only the two defined flags are accepted. */
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	/* Destination already has buffers: caller must ask for REPLACE. */
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	/* Reserved padding must be zero for future extensibility. */
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	/* src_fd may be a normal fd or a registered ring fd. */
	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/*
	 * ctx->uring_lock is held on entry (register path). For a different
	 * source ring, drop it and re-acquire both locks in a stable order
	 * to avoid ABBA deadlock between two rings cloning from each other.
	 */
	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	/* ctx->uring_lock stays held for the caller; only drop the source's. */
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}
|
2025-03-07 16:00:29 +00:00
|
|
|
|
|
|
|
void io_vec_free(struct iou_vec *iv)
|
|
|
|
{
|
|
|
|
if (!iv->iovec)
|
|
|
|
return;
|
|
|
|
kfree(iv->iovec);
|
|
|
|
iv->iovec = NULL;
|
|
|
|
iv->nr = 0;
|
|
|
|
}
|
2025-03-07 16:00:30 +00:00
|
|
|
|
|
|
|
int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
|
|
|
|
{
|
|
|
|
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
|
|
|
|
struct iovec *iov;
|
|
|
|
|
|
|
|
iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
|
|
|
|
if (!iov)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
io_vec_free(iv);
|
|
|
|
iv->iovec = iov;
|
|
|
|
iv->nr = nr_entries;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Translate a user iovec array, resolved against the registered buffer
 * @imu, into the bio_vec array of @vec, then initialize @iter over it.
 *
 * Each iovec entry is validated against the registered range, then split
 * into per-folio segments referencing the pages already pinned in
 * imu->bvec. Returns 0 on success or a negative errno.
 */
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
				struct io_mapped_ubuf *imu,
				struct iovec *iovec, unsigned nr_iovs,
				struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	/* Address of the folio containing the start of the registered buf. */
	u64 folio_addr = imu->ubuf & ~folio_mask;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		/* Segment must lie entirely within the registered buffer. */
		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		/* Zero-length segments are rejected rather than skipped. */
		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* by using folio address it also accounts for bvec offset */
		offset = buf_addr - folio_addr;
		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		/* Emit one bvec per folio touched; only the first has offset. */
		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	/* Cap the total transfer like a regular read/write would be. */
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}
|
|
|
|
|
|
|
|
static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
|
|
|
|
struct io_mapped_ubuf *imu)
|
|
|
|
{
|
|
|
|
unsigned shift = imu->folio_shift;
|
|
|
|
size_t max_segs = 0;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_iovs; i++)
|
|
|
|
max_segs += (iov[i].iov_len >> shift) + 2;
|
|
|
|
return max_segs;
|
|
|
|
}
|
|
|
|
|
2025-03-25 13:51:52 +00:00
|
|
|
/*
 * Build @vec->bvec from a kernel-registered (is_kbuf) buffer. Here each
 * iovec's iov_base encodes a byte offset into @imu's bvec table rather
 * than a user address: the table is sliced at that offset via
 * bvec_iter_advance() and the covered segments copied out, then @iter
 * is pointed at the result.
 *
 * Always returns 0; range/overflow validation happened earlier in
 * io_kern_bvec_size().  NOTE(review): the 0 return presumably exists to
 * mirror io_vec_fill_bvec()'s signature — confirm against callers.
 */
static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		/* iov_base is an offset into the registered buffer, not a pointer */
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		/* iterator window ends at offset + len; advance skips the head */
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		/* copy out every (multi-page) segment the window covers */
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Count how many entries of @imu's bvec table are covered by @iov,
 * where iov_base encodes a byte offset into the registered kernel
 * buffer. On success stores the count in *nr_seg and returns 0;
 * returns a negative error if the range falls outside @imu.
 */
static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	/*
	 * Walk the table until the end of the requested range, remembering
	 * the index of the segment containing the starting offset. The loop
	 * leaves i one past the last covered segment, so i - start is the
	 * covered segment count.
	 */
	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}
|
|
|
|
|
|
|
|
static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
|
|
|
|
struct io_mapped_ubuf *imu, unsigned *nr_segs)
|
|
|
|
{
|
|
|
|
unsigned max_segs = 0;
|
|
|
|
size_t total_len = 0;
|
|
|
|
unsigned i;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
*nr_segs = 0;
|
|
|
|
for (i = 0; i < nr_iovs; i++) {
|
|
|
|
if (unlikely(!iov[i].iov_len))
|
|
|
|
return -EFAULT;
|
|
|
|
if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
|
|
|
|
&total_len)))
|
|
|
|
return -EOVERFLOW;
|
|
|
|
ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
|
|
|
|
if (unlikely(ret))
|
|
|
|
return ret;
|
|
|
|
*nr_segs += max_segs;
|
|
|
|
}
|
|
|
|
if (total_len > MAX_RW_COUNT)
|
|
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2025-03-07 16:00:30 +00:00
|
|
|
/*
 * Import a vectored fixed buffer for @req: resolve the registered buffer
 * node, size and (if needed) grow @vec so its bvec array can hold the
 * expanded segments, then fill @iter from the iovecs stored at the tail
 * of @vec (see io_prep_reg_iovec() for the tail layout).
 *
 * Returns 0 on success; -EFAULT if no buffer node is found or the buffer
 * doesn't permit @ddir; other negative errors from sizing, reallocation,
 * or the fill helpers.
 */
int io_import_reg_vec(int ddir, struct iov_iter *iter,
			struct io_kiocb *req, struct iou_vec *vec,
			unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	/* buffer must have been registered for this transfer direction */
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/* the source iovecs live at the tail of the vector */
	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	/* kbuf sizing is exact (and validates); ubuf sizing is an upper bound */
	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	/*
	 * nr_segs counts bio_vecs but the vector is allocated in iovec
	 * units; if bio_vec is larger, convert to iovec units (rounding
	 * up) and reserve room for the nr_iovs tail entries too.
	 */
	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	/* grow the vector, preserving the iovecs at the (new) tail */
	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}
|
2025-03-08 18:21:15 +00:00
|
|
|
|
|
|
|
int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
|
|
|
|
const struct iovec __user *uvec, size_t uvec_segs)
|
|
|
|
{
|
|
|
|
struct iovec *iov;
|
|
|
|
int iovec_off, ret;
|
|
|
|
void *res;
|
|
|
|
|
|
|
|
if (uvec_segs > iv->nr) {
|
|
|
|
ret = io_vec_realloc(iv, uvec_segs);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
req->flags |= REQ_F_NEED_CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pad iovec to the right */
|
|
|
|
iovec_off = iv->nr - uvec_segs;
|
|
|
|
iov = iv->iovec + iovec_off;
|
|
|
|
res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
|
|
|
|
io_is_compat(req->ctx));
|
|
|
|
if (IS_ERR(res))
|
|
|
|
return PTR_ERR(res);
|
|
|
|
|
|
|
|
req->flags |= REQ_F_IMPORT_BUFFER;
|
|
|
|
return 0;
|
|
|
|
}
|