tcp: defer skb freeing after socket lock is released

JIRA: https://issues.redhat.com/browse/RHEL-9145

Conflicts:
* include/net/tcp.h: we already have
  7a26dc9e7b43 ("net: tcp: add skb drop reasons to tcp_add_backlog()")
* net/ipv4/tcp.c: we already have
  67b688aecd ("tcp: fix tcp_cleanup_rbuf() for tcp_read_skb()")
  0240ed7c51 ("tcp: allow again tcp_disconnect() when threads are waiting")
  0d5e52df56 ("bpf: net: Avoid do_tcp_getsockopt() taking sk lock when called from bpf")

commit f35f821935d8df76f9c92e2431a225bdff938169
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Nov 15 11:02:46 2021 -0800

    tcp: defer skb freeing after socket lock is released

    tcp recvmsg() (or rx zerocopy) spends a fair amount of time
    freeing skbs after their payload has been consumed.

    A typical ~64KB GRO packet has to release ~45 page
    references, eventually going to page allocator
    for each of them.

    Currently, this freeing is performed while socket lock
    is held, meaning that there is a high chance that
    BH handler has to queue incoming packets to tcp socket backlog.

    This can cause additional latencies, because the user
    thread has to process the backlog at release_sock() time,
    and while doing so, additional frames can be added
    by BH handler.

    This patch adds logic to defer these frees after socket
    lock is released, or directly from BH handler if possible.

    Being able to free these skbs from BH handler helps a lot,
    because this avoids the usual alloc/free asymmetry,
    when BH handler and user thread do not run on same cpu or
    NUMA node.

    One cpu can now be fully utilized for the kernel->user copy,
    and another cpu is handling BH processing and skb/page
    allocs/frees (assuming RFS is not forcing use of a single CPU)

    Tested:
     100Gbit NIC
     Max throughput for one TCP_STREAM flow, over 10 runs

    MTU : 1500
    Before: 55 Gbit
    After:  66 Gbit

    MTU : 4096+(headers)
    Before: 82 Gbit
    After:  95 Gbit

    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
4 changed files with 37 additions and 2 deletions
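
For orientation, this is the calling convention the patch establishes: consume skbs while holding the socket lock, let them pile up on sk->defer_list, and pay the freeing cost only once the lock is gone. Below is a minimal sketch, not part of the patch; consume_queue_locked() is a made-up stand-in for the real locked work (tcp_recvmsg_locked(), tcp_zerocopy_receive(), ...), while lock_sock()/release_sock() and the sk_defer_free_flush() helper are the APIs actually used in the hunks that follow.

#include <net/sock.h>
#include <net/tcp.h>

/* Hypothetical stand-in for the locked consumer (tcp_recvmsg_locked() etc.);
 * in the real patch this is where tcp_eat_recv_skb() queues consumed skbs on
 * sk->defer_list instead of freeing them.
 */
static int consume_queue_locked(struct sock *sk, struct msghdr *msg,
				size_t len, int flags)
{
	return 0;
}

static int recv_then_flush(struct sock *sk, struct msghdr *msg,
			   size_t len, int flags)
{
	int ret;

	lock_sock(sk);
	ret = consume_queue_locked(sk, msg, len, flags);
	release_sock(sk);

	/* Socket lock dropped: free the deferred skbs now. The llist_empty()
	 * test inside the inline keeps this a near no-op when nothing was
	 * deferred.
	 */
	sk_defer_free_flush(sk);
	return ret;
}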

include/net/tcp.h

@@ -1400,6 +1400,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason);

void __sk_defer_free_flush(struct sock *sk);

static inline void sk_defer_free_flush(struct sock *sk)
{
	if (llist_empty(&sk->defer_list))
		return;
	__sk_defer_free_flush(sk);
}

int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);

net/ipv4/tcp.c

@@ -1652,14 +1652,34 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
	__tcp_cleanup_rbuf(sk, copied);
}

void __sk_defer_free_flush(struct sock *sk)
{
	struct llist_node *head;
	struct sk_buff *skb, *n;

	head = llist_del_all(&sk->defer_list);
	llist_for_each_entry_safe(skb, n, head, ll_node) {
		prefetch(n);
		skb_mark_not_on_list(skb);
		__kfree_skb(skb);
	}
}
EXPORT_SYMBOL(__sk_defer_free_flush);

static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_receive_queue);
	if (likely(skb->destructor == sock_rfree)) {
		sock_rfree(skb);
		skb->destructor = NULL;
		skb->sk = NULL;
		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    !llist_empty(&sk->defer_list)) {
			llist_add(&skb->ll_node, &sk->defer_list);
			return;
		}
	}
	__kfree_skb(skb);
}

struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
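
The two helpers above lean on the lockless llist API: producers push with llist_add() and a single consumer detaches the whole chain with llist_del_all(), so sk->defer_list needs no extra locking and can be drained either by the user thread after release_sock() or from BH before the socket lock is taken. A rough sketch of that pattern, assuming the sk_buff ll_node member and the sock defer_list field introduced elsewhere in this series:

#include <linux/llist.h>
#include <linux/prefetch.h>
#include <linux/skbuff.h>

/* Illustration only (not part of the patch): demo_defer_list stands in for
 * sk->defer_list, and skb->ll_node is the sk_buff member this series assumes.
 */
static LLIST_HEAD(demo_defer_list);

static void demo_defer(struct sk_buff *skb)
{
	/* Lockless push; safe even if a flush runs concurrently. */
	llist_add(&skb->ll_node, &demo_defer_list);
}

static void demo_flush(void)
{
	struct llist_node *head = llist_del_all(&demo_defer_list);
	struct sk_buff *skb, *n;

	llist_for_each_entry_safe(skb, n, head, ll_node) {
		prefetch(n);
		skb_mark_not_on_list(skb);
		__kfree_skb(skb);
	}
}
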
@@ -2624,6 +2644,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
			release_sock(sk);
			lock_sock(sk);
		} else {
			sk_defer_free_flush(sk);
			err = sk_wait_data(sk, &timeo, last);
			if (err < 0) {
				err = copied ? : err;
@@ -2746,6 +2767,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
	ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
				 &cmsg_flags);
	release_sock(sk);
	sk_defer_free_flush(sk);

	if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
		if (cmsg_flags & TCP_CMSG_TS)
@@ -3278,7 +3300,7 @@ int tcp_disconnect(struct sock *sk, int flags)
		sk->sk_frag.page = NULL;
		sk->sk_frag.offset = 0;
	}
	sk_defer_free_flush(sk);
	sk_error_report(sk);
	return 0;
}
@@ -4408,6 +4430,7 @@ int do_tcp_getsockopt(struct sock *sk, int level,
		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
							   &zc, &len, err);
		sockopt_release_sock(sk);
		sk_defer_free_flush(sk);
		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
			goto zerocopy_rcv_cmsg;
		switch (len) {

net/ipv4/tcp_ipv4.c

@@ -2077,6 +2077,7 @@ process:
	sk_incoming_cpu_update(sk);

	sk_defer_free_flush(sk);
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;

net/ipv6/tcp_ipv6.c

@@ -1732,6 +1732,7 @@ process:
	sk_incoming_cpu_update(sk);

	sk_defer_free_flush(sk);
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;