2019-05-27 06:55:01 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-08-12 12:26:18 +00:00
|
|
|
/*
|
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
|
*
|
|
|
|
|
* Generic INET6 transport hashtables
|
|
|
|
|
*
|
2005-12-14 07:25:44 +00:00
|
|
|
* Authors: Lotsa people, from code originally in tcp, generalised here
|
2014-08-24 20:53:10 +00:00
|
|
|
* by Arnaldo Carvalho de Melo <acme@mandriva.com>
|
2005-08-12 12:26:18 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
2005-12-14 07:25:44 +00:00
|
|
|
#include <linux/random.h>
|
2005-08-12 12:26:18 +00:00
|
|
|
|
2016-02-10 16:50:40 +00:00
|
|
|
#include <net/addrconf.h>
|
2024-03-06 16:00:28 +00:00
|
|
|
#include <net/hotdata.h>
|
2005-08-12 12:26:18 +00:00
|
|
|
#include <net/inet_connection_sock.h>
|
|
|
|
|
#include <net/inet_hashtables.h>
|
|
|
|
|
#include <net/inet6_hashtables.h>
|
2011-08-04 03:50:44 +00:00
|
|
|
#include <net/secure_seq.h>
|
2005-12-14 07:25:44 +00:00
|
|
|
#include <net/ip.h>
|
2016-02-10 16:50:40 +00:00
|
|
|
#include <net/sock_reuseport.h>
|
2024-05-07 16:41:40 +00:00
|
|
|
#include <net/tcp.h>
|
2005-08-12 12:26:18 +00:00
|
|
|
|
2015-03-18 21:05:35 +00:00
|
|
|
/* Compute the established-hash value for an IPv6 4-tuple.
 *
 * Note the local port is added *outside* the hash: for any fixed
 * (laddr, faddr, fport),
 *	inet6_ehashfn(laddr, lport, ...) == inet6_ehashfn(laddr, 0, ...) + lport
 * This monotonic property lets __inet_hash_connect() derive each
 * candidate bucket with a single addition instead of re-hashing.
 */
u32 inet6_ehashfn(const struct net *net,
		  const struct in6_addr *laddr, const u16 lport,
		  const struct in6_addr *faddr, const __be16 fport)
{
	u32 lhash, fhash;

	/* Lazily initialize both hash secrets on first use. */
	net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
	net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret));

	/* Only the low 32 bits of the local address feed the hash;
	 * the foreign address is jhashed in full with its own secret.
	 */
	lhash = (__force u32)laddr->s6_addr32[3];
	fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret);

	/* lport passed as 0 to __inet6_ehashfn(), then added on top,
	 * preserving the additivity property described above.
	 */
	return lport + __inet6_ehashfn(lhash, 0, fhash, fport,
				       inet6_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet6_ehashfn);
|
2013-10-19 19:48:52 +00:00
|
|
|
|
2006-04-10 05:48:59 +00:00
|
|
|
/*
|
|
|
|
|
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
|
|
|
|
|
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
|
|
|
|
|
*
|
|
|
|
|
* The sockhash lock must be held as a reader here.
|
|
|
|
|
*/
|
2024-08-02 13:40:28 +00:00
|
|
|
/* Lockless (RCU) lookup of an established (or TIME_WAIT/SYN_RECV) socket
 * matching the given 4-tuple.  Returns the socket with its refcount taken,
 * or NULL.  Caller must hold rcu_read_lock().
 */
struct sock *__inet6_lookup_established(const struct net *net,
					const struct in6_addr *saddr,
					const __be16 sport,
					const struct in6_addr *daddr,
					const u16 hnum,
					const int dif, const int sdif)
{
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	const struct hlist_nulls_node *node;
	struct inet_ehash_bucket *head;
	struct inet_hashinfo *hashinfo;
	unsigned int hash, slot;
	struct sock *sk;

	hashinfo = net->ipv4.tcp_death_row.hashinfo;
	hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
	slot = hash & hashinfo->ehash_mask;
	head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		/* Cheap hash comparison first, full 4-tuple match second. */
		if (sk->sk_hash != hash)
			continue;
		if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
			continue;
		/* Socket may be freed/reused concurrently: only proceed if
		 * we can take a reference (refcount not already zero).
		 */
		if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
			goto out;

		/* Re-check the match after taking the reference: the socket
		 * could have changed identity between match and refcount.
		 * If so, drop the reference and restart the walk.
		 */
		if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) {
			sock_gen_put(sk);
			goto begin;
		}
		goto found;
	}
	/* A nulls value different from our slot means the entry moved to
	 * another chain mid-walk; restart from the bucket head.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL(__inet6_lookup_established);
|
|
|
|
|
|
2024-08-02 13:40:25 +00:00
|
|
|
static inline int compute_score(struct sock *sk, const struct net *net,
|
2008-11-24 01:22:55 +00:00
|
|
|
const unsigned short hnum,
|
|
|
|
|
const struct in6_addr *daddr,
|
2020-08-31 06:26:10 +00:00
|
|
|
const int dif, const int sdif)
|
2008-11-24 01:22:55 +00:00
|
|
|
{
|
|
|
|
|
int score = -1;
|
|
|
|
|
|
2009-10-15 06:30:45 +00:00
|
|
|
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
|
2008-11-24 01:22:55 +00:00
|
|
|
sk->sk_family == PF_INET6) {
|
2018-12-12 21:15:36 +00:00
|
|
|
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
|
|
|
|
|
return -1;
|
2008-11-24 01:22:55 +00:00
|
|
|
|
2018-12-12 21:15:36 +00:00
|
|
|
if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
|
2018-11-07 15:36:03 +00:00
|
|
|
return -1;
|
2017-08-07 15:44:21 +00:00
|
|
|
|
2021-10-05 13:03:42 +00:00
|
|
|
score = sk->sk_bound_dev_if ? 2 : 1;
|
2019-10-30 20:00:04 +00:00
|
|
|
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
|
2015-10-09 02:33:21 +00:00
|
|
|
score++;
|
2008-11-24 01:22:55 +00:00
|
|
|
}
|
|
|
|
|
return score;
|
|
|
|
|
}
|
|
|
|
|
|
2023-07-20 15:30:09 +00:00
|
|
|
/**
|
|
|
|
|
* inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary.
|
|
|
|
|
* @net: network namespace.
|
|
|
|
|
* @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
|
|
|
|
|
* @skb: context for a potential SK_REUSEPORT program.
|
|
|
|
|
* @doff: header offset.
|
|
|
|
|
* @saddr: source address.
|
|
|
|
|
* @sport: source port.
|
|
|
|
|
* @daddr: destination address.
|
|
|
|
|
* @hnum: destination port in host byte order.
|
|
|
|
|
* @ehashfn: hash function used to generate the fallback hash.
|
|
|
|
|
*
|
|
|
|
|
* Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
|
|
|
|
|
* the selected sock or an error.
|
|
|
|
|
*/
|
2024-08-02 13:40:28 +00:00
|
|
|
struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk,
|
2023-07-20 15:30:07 +00:00
|
|
|
struct sk_buff *skb, int doff,
|
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
|
__be16 sport,
|
|
|
|
|
const struct in6_addr *daddr,
|
2023-07-20 15:30:08 +00:00
|
|
|
unsigned short hnum,
|
|
|
|
|
inet6_ehashfn_t *ehashfn)
|
2020-07-17 10:35:26 +00:00
|
|
|
{
|
|
|
|
|
struct sock *reuse_sk = NULL;
|
|
|
|
|
u32 phash;
|
|
|
|
|
|
|
|
|
|
if (sk->sk_reuseport) {
|
2023-07-20 15:30:08 +00:00
|
|
|
phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn,
|
|
|
|
|
net, daddr, hnum, saddr, sport);
|
2020-07-17 10:35:26 +00:00
|
|
|
reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
|
|
|
|
|
}
|
|
|
|
|
return reuse_sk;
|
|
|
|
|
}
|
2023-07-20 15:30:07 +00:00
|
|
|
EXPORT_SYMBOL_GPL(inet6_lookup_reuseport);
|
2020-07-17 10:35:26 +00:00
|
|
|
|
2016-04-01 15:52:17 +00:00
|
|
|
/* called with rcu_read_lock() */
/* Walk one lhash2 bucket and return the best-scoring listener, after
 * giving SO_REUSEPORT group selection a chance on each new best match.
 */
static struct sock *inet6_lhash2_lookup(const struct net *net,
		struct inet_listen_hashbucket *ilb2,
		struct sk_buff *skb, int doff,
		const struct in6_addr *saddr,
		const __be16 sport, const struct in6_addr *daddr,
		const unsigned short hnum, const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			/* A reuseport selection (possibly BPF-driven) wins
			 * outright; otherwise remember this candidate and
			 * keep scanning for a better score.
			 */
			result = inet6_lookup_reuseport(net, sk, skb, doff,
							saddr, sport, daddr, hnum, inet6_ehashfn);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}
|
|
|
|
|
|
2024-08-02 13:40:28 +00:00
|
|
|
struct sock *inet6_lookup_run_sk_lookup(const struct net *net,
|
2023-07-20 15:30:10 +00:00
|
|
|
int protocol,
|
|
|
|
|
struct sk_buff *skb, int doff,
|
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
|
const __be16 sport,
|
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
|
const u16 hnum, const int dif,
|
|
|
|
|
inet6_ehashfn_t *ehashfn)
|
2020-07-17 10:35:27 +00:00
|
|
|
{
|
|
|
|
|
struct sock *sk, *reuse_sk;
|
|
|
|
|
bool no_reuseport;
|
|
|
|
|
|
2023-07-20 15:30:10 +00:00
|
|
|
no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport,
|
2021-11-10 11:10:15 +00:00
|
|
|
daddr, hnum, dif, &sk);
|
2020-07-17 10:35:27 +00:00
|
|
|
if (no_reuseport || IS_ERR_OR_NULL(sk))
|
|
|
|
|
return sk;
|
|
|
|
|
|
2023-07-20 15:30:08 +00:00
|
|
|
reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
|
2023-07-20 15:30:10 +00:00
|
|
|
saddr, sport, daddr, hnum, ehashfn);
|
2020-07-17 10:35:27 +00:00
|
|
|
if (reuse_sk)
|
|
|
|
|
sk = reuse_sk;
|
|
|
|
|
return sk;
|
|
|
|
|
}
|
2023-07-20 15:30:10 +00:00
|
|
|
EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup);
|
2020-07-17 10:35:27 +00:00
|
|
|
|
2024-08-02 13:40:28 +00:00
|
|
|
/* Find a listening socket for an incoming IPv6 packet.
 *
 * Lookup order: (1) BPF sk_lookup redirect, (2) lhash2 bucket keyed by
 * the exact destination address, (3) lhash2 bucket keyed by in6addr_any
 * (wildcard listeners).  An ERR_PTR result (reuseport/BPF error) is
 * mapped to NULL.  Caller must hold rcu_read_lock().
 */
struct sock *inet6_lookup_listener(const struct net *net,
		struct sk_buff *skb, int doff,
		const struct in6_addr *saddr,
		const __be16 sport,
		const struct in6_addr *daddr,
		const unsigned short hnum,
		const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct inet_hashinfo *hashinfo;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
						    saddr, sport, daddr, hnum, dif,
						    inet6_ehashfn);
		if (result)
			goto done;
	}

	hashinfo = net->ipv4.tcp_death_row.hashinfo;
	/* Exact destination address first. */
	hash2 = ipv6_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
				     saddr, sport, daddr, hnum,
				     dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with in6addr_any */
	hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
				     saddr, sport, &in6addr_any, hnum,
				     dif, sdif);
done:
	/* Reuseport/BPF selection can return an error pointer; callers
	 * expect a valid socket or NULL.
	 */
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
|
|
|
|
|
|
2024-08-02 13:40:28 +00:00
|
|
|
struct sock *inet6_lookup(const struct net *net,
|
2016-02-10 16:50:38 +00:00
|
|
|
struct sk_buff *skb, int doff,
|
2006-11-08 08:20:00 +00:00
|
|
|
const struct in6_addr *saddr, const __be16 sport,
|
|
|
|
|
const struct in6_addr *daddr, const __be16 dport,
|
2005-08-12 12:26:18 +00:00
|
|
|
const int dif)
|
|
|
|
|
{
|
|
|
|
|
struct sock *sk;
|
2016-04-01 15:52:17 +00:00
|
|
|
bool refcounted;
|
2005-08-12 12:26:18 +00:00
|
|
|
|
2025-08-22 19:06:59 +00:00
|
|
|
sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr,
|
2017-08-07 15:44:21 +00:00
|
|
|
ntohs(dport), dif, 0, &refcounted);
|
2017-06-30 10:08:01 +00:00
|
|
|
if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
|
2016-04-01 15:52:17 +00:00
|
|
|
sk = NULL;
|
2005-08-12 12:26:18 +00:00
|
|
|
return sk;
|
|
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet6_lookup);
|
2005-12-14 07:25:44 +00:00
|
|
|
|
|
|
|
|
static int __inet6_check_established(struct inet_timewait_death_row *death_row,
|
|
|
|
|
struct sock *sk, const __u16 lport,
|
2025-03-02 12:42:37 +00:00
|
|
|
struct inet_timewait_sock **twp,
|
inet: call inet6_ehashfn() once from inet6_hash_connect()
inet6_ehashfn() being called from __inet6_check_established()
has a big impact on performance, as shown in the Tested section.
After prior patch, we can compute the hash for port 0
from inet6_hash_connect(), and derive each hash in
__inet_hash_connect() from this initial hash:
hash(saddr, lport, daddr, dport) == hash(saddr, 0, daddr, dport) + lport
Apply the same principle for __inet_check_established(),
although inet_ehashfn() has a smaller cost.
Tested:
Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog
Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server
Before this patch:
utime_start=0.286131
utime_end=4.378886
stime_start=11.952556
stime_end=1991.655533
num_transactions=1446830
latency_min=0.001061085
latency_max=12.075275028
latency_mean=0.376375302
latency_stddev=1.361969596
num_samples=306383
throughput=151866.56
perf top:
50.01% [kernel] [k] __inet6_check_established
20.65% [kernel] [k] __inet_hash_connect
15.81% [kernel] [k] inet6_ehashfn
2.92% [kernel] [k] rcu_all_qs
2.34% [kernel] [k] __cond_resched
0.50% [kernel] [k] _raw_spin_lock
0.34% [kernel] [k] sched_balance_trigger
0.24% [kernel] [k] queued_spin_lock_slowpath
After this patch:
utime_start=0.315047
utime_end=9.257617
stime_start=7.041489
stime_end=1923.688387
num_transactions=3057968
latency_min=0.003041375
latency_max=7.056589232
latency_mean=0.141075048 # Better latency metrics
latency_stddev=0.526900516
num_samples=312996
throughput=320677.21 # 111 % increase, and 229 % for the series
perf top: inet6_ehashfn is no longer seen.
39.67% [kernel] [k] __inet_hash_connect
37.06% [kernel] [k] __inet6_check_established
4.79% [kernel] [k] rcu_all_qs
3.82% [kernel] [k] __cond_resched
1.76% [kernel] [k] sched_balance_domains
0.82% [kernel] [k] _raw_spin_lock
0.81% [kernel] [k] sched_balance_rq
0.81% [kernel] [k] sched_balance_trigger
0.76% [kernel] [k] queued_spin_lock_slowpath
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250305034550.879255-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-03-05 03:45:50 +00:00
|
|
|
bool rcu_lookup,
|
|
|
|
|
u32 hash)
|
2005-12-14 07:25:44 +00:00
|
|
|
{
|
|
|
|
|
struct inet_hashinfo *hinfo = death_row->hashinfo;
|
2006-03-13 22:26:12 +00:00
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 22:42:29 +00:00
|
|
|
const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
|
|
|
|
|
const struct in6_addr *saddr = &sk->sk_v6_daddr;
|
2005-12-14 07:25:44 +00:00
|
|
|
const int dif = sk->sk_bound_dev_if;
|
2008-06-17 00:13:48 +00:00
|
|
|
struct net *net = sock_net(sk);
|
2017-08-07 15:44:21 +00:00
|
|
|
const int sdif = l3mdev_master_ifindex_by_index(net, dif);
|
|
|
|
|
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
|
2005-12-14 07:25:44 +00:00
|
|
|
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 07:22:02 +00:00
|
|
|
struct inet_timewait_sock *tw = NULL;
|
2025-03-02 12:42:34 +00:00
|
|
|
const struct hlist_nulls_node *node;
|
|
|
|
|
struct sock *sk2;
|
|
|
|
|
spinlock_t *lock;
|
|
|
|
|
|
2025-03-02 12:42:37 +00:00
|
|
|
if (rcu_lookup) {
|
|
|
|
|
sk_nulls_for_each(sk2, node, &head->chain) {
|
|
|
|
|
if (sk2->sk_hash != hash ||
|
|
|
|
|
!inet6_match(net, sk2, saddr, daddr,
|
|
|
|
|
ports, dif, sdif))
|
|
|
|
|
continue;
|
|
|
|
|
if (sk2->sk_state == TCP_TIME_WAIT)
|
|
|
|
|
break;
|
|
|
|
|
return -EADDRNOTAVAIL;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
2025-03-02 12:42:34 +00:00
|
|
|
}
|
2005-12-14 07:25:44 +00:00
|
|
|
|
2025-03-02 12:42:34 +00:00
|
|
|
lock = inet_ehash_lockp(hinfo, hash);
|
2008-11-21 04:39:09 +00:00
|
|
|
spin_lock(lock);
|
2005-12-14 07:25:44 +00:00
|
|
|
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 07:22:02 +00:00
|
|
|
sk_nulls_for_each(sk2, node, &head->chain) {
|
2012-11-30 09:49:27 +00:00
|
|
|
if (sk2->sk_hash != hash)
|
|
|
|
|
continue;
|
2005-12-14 07:25:44 +00:00
|
|
|
|
2022-05-13 18:55:49 +00:00
|
|
|
if (likely(inet6_match(net, sk2, saddr, daddr, ports,
|
2017-08-07 15:44:21 +00:00
|
|
|
dif, sdif))) {
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 22:42:29 +00:00
|
|
|
if (sk2->sk_state == TCP_TIME_WAIT) {
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 07:22:02 +00:00
|
|
|
tw = inet_twsk(sk2);
|
2025-08-22 19:06:56 +00:00
|
|
|
if (tcp_twsk_unique(sk, sk2, twp))
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 22:42:29 +00:00
|
|
|
break;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 07:22:02 +00:00
|
|
|
}
|
2005-12-14 07:25:44 +00:00
|
|
|
goto not_unique;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 22:42:29 +00:00
|
|
|
}
|
2005-12-14 07:25:44 +00:00
|
|
|
}
|
|
|
|
|
|
2006-03-13 22:26:12 +00:00
|
|
|
/* Must record num and sport now. Otherwise we will see
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 22:42:29 +00:00
|
|
|
* in hash table socket with a funny identity.
|
|
|
|
|
*/
|
2009-10-15 06:30:45 +00:00
|
|
|
inet->inet_num = lport;
|
|
|
|
|
inet->inet_sport = htons(lport);
|
2009-12-02 22:31:19 +00:00
|
|
|
sk->sk_hash = hash;
|
2008-07-26 04:43:18 +00:00
|
|
|
WARN_ON(!sk_unhashed(sk));
|
2008-11-17 03:40:17 +00:00
|
|
|
__sk_nulls_add_node_rcu(sk, &head->chain);
|
2009-12-02 22:31:19 +00:00
|
|
|
if (tw) {
|
2015-07-08 21:28:29 +00:00
|
|
|
sk_nulls_del_node_init_rcu((struct sock *)tw);
|
2016-04-27 23:44:39 +00:00
|
|
|
__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
|
2009-12-02 22:31:19 +00:00
|
|
|
}
|
2008-11-21 04:39:09 +00:00
|
|
|
spin_unlock(lock);
|
2008-04-01 02:41:46 +00:00
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
|
2005-12-14 07:25:44 +00:00
|
|
|
|
2009-12-02 22:31:19 +00:00
|
|
|
if (twp) {
|
2005-12-14 07:25:44 +00:00
|
|
|
*twp = tw;
|
2009-12-02 22:31:19 +00:00
|
|
|
} else if (tw) {
|
2005-12-14 07:25:44 +00:00
|
|
|
/* Silly. Should hash-dance instead... */
|
2015-07-08 21:28:30 +00:00
|
|
|
inet_twsk_deschedule_put(tw);
|
2005-12-14 07:25:44 +00:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
not_unique:
|
2008-11-21 04:39:09 +00:00
|
|
|
spin_unlock(lock);
|
2005-12-14 07:25:44 +00:00
|
|
|
return -EADDRNOTAVAIL;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-02 08:46:08 +00:00
|
|
|
static u64 inet6_sk_port_offset(const struct sock *sk)
|
2005-12-14 07:25:44 +00:00
|
|
|
{
|
|
|
|
|
const struct inet_sock *inet = inet_sk(sk);
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 22:42:29 +00:00
|
|
|
|
|
|
|
|
return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32,
|
|
|
|
|
sk->sk_v6_daddr.s6_addr32,
|
2009-10-15 06:30:45 +00:00
|
|
|
inet->inet_dport);
|
2005-12-14 07:25:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
|
|
|
|
|
struct sock *sk)
|
|
|
|
|
{
|
inet: call inet6_ehashfn() once from inet6_hash_connect()
inet6_ehashfn() being called from __inet6_check_established()
has a big impact on performance, as shown in the Tested section.
After prior patch, we can compute the hash for port 0
from inet6_hash_connect(), and derive each hash in
__inet_hash_connect() from this initial hash:
hash(saddr, lport, daddr, dport) == hash(saddr, 0, daddr, dport) + lport
Apply the same principle for __inet_check_established(),
although inet_ehashfn() has a smaller cost.
Tested:
Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog
Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server
Before this patch:
utime_start=0.286131
utime_end=4.378886
stime_start=11.952556
stime_end=1991.655533
num_transactions=1446830
latency_min=0.001061085
latency_max=12.075275028
latency_mean=0.376375302
latency_stddev=1.361969596
num_samples=306383
throughput=151866.56
perf top:
50.01% [kernel] [k] __inet6_check_established
20.65% [kernel] [k] __inet_hash_connect
15.81% [kernel] [k] inet6_ehashfn
2.92% [kernel] [k] rcu_all_qs
2.34% [kernel] [k] __cond_resched
0.50% [kernel] [k] _raw_spin_lock
0.34% [kernel] [k] sched_balance_trigger
0.24% [kernel] [k] queued_spin_lock_slowpath
After this patch:
utime_start=0.315047
utime_end=9.257617
stime_start=7.041489
stime_end=1923.688387
num_transactions=3057968
latency_min=0.003041375
latency_max=7.056589232
latency_mean=0.141075048 # Better latency metrics
latency_stddev=0.526900516
num_samples=312996
throughput=320677.21 # 111 % increase, and 229 % for the series
perf top: inet6_ehashfn is no longer seen.
39.67% [kernel] [k] __inet_hash_connect
37.06% [kernel] [k] __inet6_check_established
4.79% [kernel] [k] rcu_all_qs
3.82% [kernel] [k] __cond_resched
1.76% [kernel] [k] sched_balance_domains
0.82% [kernel] [k] _raw_spin_lock
0.81% [kernel] [k] sched_balance_rq
0.81% [kernel] [k] sched_balance_trigger
0.76% [kernel] [k] queued_spin_lock_slowpath
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250305034550.879255-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-03-05 03:45:50 +00:00
|
|
|
const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
|
|
|
|
|
const struct in6_addr *saddr = &sk->sk_v6_daddr;
|
|
|
|
|
const struct inet_sock *inet = inet_sk(sk);
|
|
|
|
|
const struct net *net = sock_net(sk);
|
2022-05-02 08:46:08 +00:00
|
|
|
u64 port_offset = 0;
|
inet: call inet6_ehashfn() once from inet6_hash_connect()
inet6_ehashfn() being called from __inet6_check_established()
has a big impact on performance, as shown in the Tested section.
After prior patch, we can compute the hash for port 0
from inet6_hash_connect(), and derive each hash in
__inet_hash_connect() from this initial hash:
hash(saddr, lport, daddr, dport) == hash(saddr, 0, daddr, dport) + lport
Apply the same principle for __inet_check_established(),
although inet_ehashfn() has a smaller cost.
Tested:
Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog
Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server
Before this patch:
utime_start=0.286131
utime_end=4.378886
stime_start=11.952556
stime_end=1991.655533
num_transactions=1446830
latency_min=0.001061085
latency_max=12.075275028
latency_mean=0.376375302
latency_stddev=1.361969596
num_samples=306383
throughput=151866.56
perf top:
50.01% [kernel] [k] __inet6_check_established
20.65% [kernel] [k] __inet_hash_connect
15.81% [kernel] [k] inet6_ehashfn
2.92% [kernel] [k] rcu_all_qs
2.34% [kernel] [k] __cond_resched
0.50% [kernel] [k] _raw_spin_lock
0.34% [kernel] [k] sched_balance_trigger
0.24% [kernel] [k] queued_spin_lock_slowpath
After this patch:
utime_start=0.315047
utime_end=9.257617
stime_start=7.041489
stime_end=1923.688387
num_transactions=3057968
latency_min=0.003041375
latency_max=7.056589232
latency_mean=0.141075048 # Better latency metrics
latency_stddev=0.526900516
num_samples=312996
throughput=320677.21 # 111 % increase, and 229 % for the series
perf top: inet6_ehashfn is no longer seen.
39.67% [kernel] [k] __inet_hash_connect
37.06% [kernel] [k] __inet6_check_established
4.79% [kernel] [k] rcu_all_qs
3.82% [kernel] [k] __cond_resched
1.76% [kernel] [k] sched_balance_domains
0.82% [kernel] [k] _raw_spin_lock
0.81% [kernel] [k] sched_balance_rq
0.81% [kernel] [k] sched_balance_trigger
0.76% [kernel] [k] queued_spin_lock_slowpath
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250305034550.879255-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-03-05 03:45:50 +00:00
|
|
|
u32 hash_port0;
|
2015-05-27 17:46:02 +00:00
|
|
|
|
|
|
|
|
if (!inet_sk(sk)->inet_num)
|
|
|
|
|
port_offset = inet6_sk_port_offset(sk);
|
inet: call inet6_ehashfn() once from inet6_hash_connect()
inet6_ehashfn() being called from __inet6_check_established()
has a big impact on performance, as shown in the Tested section.
After prior patch, we can compute the hash for port 0
from inet6_hash_connect(), and derive each hash in
__inet_hash_connect() from this initial hash:
hash(saddr, lport, daddr, dport) == hash(saddr, 0, daddr, dport) + lport
Apply the same principle for __inet_check_established(),
although inet_ehashfn() has a smaller cost.
Tested:
Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog
Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server
Before this patch:
utime_start=0.286131
utime_end=4.378886
stime_start=11.952556
stime_end=1991.655533
num_transactions=1446830
latency_min=0.001061085
latency_max=12.075275028
latency_mean=0.376375302
latency_stddev=1.361969596
num_samples=306383
throughput=151866.56
perf top:
50.01% [kernel] [k] __inet6_check_established
20.65% [kernel] [k] __inet_hash_connect
15.81% [kernel] [k] inet6_ehashfn
2.92% [kernel] [k] rcu_all_qs
2.34% [kernel] [k] __cond_resched
0.50% [kernel] [k] _raw_spin_lock
0.34% [kernel] [k] sched_balance_trigger
0.24% [kernel] [k] queued_spin_lock_slowpath
After this patch:
utime_start=0.315047
utime_end=9.257617
stime_start=7.041489
stime_end=1923.688387
num_transactions=3057968
latency_min=0.003041375
latency_max=7.056589232
latency_mean=0.141075048 # Better latency metrics
latency_stddev=0.526900516
num_samples=312996
throughput=320677.21 # 111 % increase, and 229 % for the series
perf top: inet6_ehashfn is no longer seen.
39.67% [kernel] [k] __inet_hash_connect
37.06% [kernel] [k] __inet6_check_established
4.79% [kernel] [k] rcu_all_qs
3.82% [kernel] [k] __cond_resched
1.76% [kernel] [k] sched_balance_domains
0.82% [kernel] [k] _raw_spin_lock
0.81% [kernel] [k] sched_balance_rq
0.81% [kernel] [k] sched_balance_trigger
0.76% [kernel] [k] queued_spin_lock_slowpath
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250305034550.879255-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-03-05 03:45:50 +00:00
|
|
|
|
|
|
|
|
hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport);
|
|
|
|
|
|
|
|
|
|
return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
|
2015-03-18 21:05:37 +00:00
|
|
|
__inet6_check_established);
|
2005-12-14 07:25:44 +00:00
|
|
|
}
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet6_hash_connect);
|