/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Authors:	Lotsa people, from code originally in tcp
 */
#ifndef _INET_HASHTABLES_H
#define _INET_HASHTABLES_H

#include <linux/interrupt.h>
#include <linux/ipv6.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/netns/hash.h>

#include <linux/refcount.h>
#include <asm/byteorder.h>
/* This is for all connections with a full identity, no wildcards.
 * The 'e' prefix stands for Establish, but we really put all sockets
 * into the ehash table anyway.
 */
struct inet_ehash_bucket {
	struct hlist_nulls_head chain;
};
/* There are a few simple rules, which allow for local port reuse by
 * an application.  In essence:
 *
 *	1) Sockets bound to different interfaces may share a local port.
 *	   Failing that, goto test 2.
 *	2) If all sockets have sk->sk_reuse set, and none of them are in
 *	   TCP_LISTEN state, the port may be shared.
 *	   Failing that, goto test 3.
 *	3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
 *	   address, and none of them are the same, the port may be
 *	   shared.
 *	   Failing this, the port cannot be shared.
 *
 * The interesting point is test #2.  This is what an FTP server does
 * all day.  To optimize this case we use a specific flag bit defined
 * below.  As we add sockets to a bind bucket list, we perform a
 * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
 * As long as all sockets added to a bind bucket pass this test,
 * the flag bit will be set.
 * The resulting situation is that tcp_v[46]_verify_bind() can just check
 * for this flag bit, if it is set and the socket trying to bind has
 * sk->sk_reuse set, we don't even have to walk the owners list at all,
 * we return that it is ok to bind this socket to the requested local port.
 *
 * Sounds like a lot of work, but it is worth it.  In a more naive
 * implementation (ie. current FreeBSD etc.) the entire list of ports
 * must be walked for each data port opened by an ftp server.  Needless
 * to say, this does not scale at all.  With a couple thousand FTP
 * users logged onto your box, isn't it nice to know that new data
 * ports are created in O(1) time?  I thought so. ;-)	-DaveM
 */
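/* Illustrative sketch (not a verbatim copy of the bind path): the fast
 * path described above boils down to checking the cached bucket state
 * instead of walking the owner list.  Given a bind bucket *tb and a
 * socket *sk attempting to bind:
 *
 *	if (tb->fastreuse > 0 && sk->sk_reuse &&
 *	    sk->sk_state != TCP_LISTEN)
 *		goto success;	(no owner-list walk needed)
 *
 * Only when the cached flag is clear must the bind code examine every
 * socket already bound to the port.
 */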
#define FASTREUSEPORT_ANY	1
#define FASTREUSEPORT_STRICT	2

struct inet_bind_bucket {
	possible_net_t		ib_net;
	signed char		fastreuse;
	signed char		fastreuseport;
#if IS_ENABLED(CONFIG_IPV6)
	struct in6_addr		fast_v6_rcv_saddr;
#endif
	__be32			fast_rcv_saddr;
	unsigned short		fast_sk_family;
	struct hlist_node	node;
	struct hlist_head	bhash2;
};
struct inet_bind2_bucket {
	possible_net_t		ib_net;
#if IS_ENABLED(CONFIG_IPV6)
	unsigned short		addr_type;
	struct in6_addr		v6_rcv_saddr;
#define rcv_saddr		v6_rcv_saddr.s6_addr32[3]
#else
	__be32			rcv_saddr;
#endif
	/* Node in the bhash2 inet_bind_hashbucket chain */
	struct hlist_node	node;
	struct hlist_node	bhash_node;
	/* List of sockets hashed to this bucket */
	struct hlist_head	owners;
};
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
{
	return read_pnet(&ib->ib_net);
}

static inline struct net *ib2_net(const struct inet_bind2_bucket *ib)
{
	return read_pnet(&ib->ib_net);
}

#define inet_bind_bucket_for_each(tb, head) \
	hlist_for_each_entry(tb, head, node)

struct inet_bind_hashbucket {
	struct hlist_head	chain;
};
/* Sockets can be hashed in established or listening table.
 * We must use different 'nulls' end-of-chain value for all hash buckets :
 * A socket might transition from ESTABLISH to LISTEN state without
 * RCU grace period. A lookup in ehash table needs to handle this case.
 */
#define LISTENING_NULLS_BASE (1U << 29)
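/* A minimal sketch of why per-chain nulls values matter: an RCU lookup
 * must check that it ended on the nulls marker belonging to the slot it
 * started from, otherwise the socket it was walking got rehashed to a
 * different chain mid-scan and the walk must restart (the pattern below
 * mirrors __inet_lookup_established() in net/ipv4/inet_hashtables.c):
 *
 *	begin:
 *	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 *		... try to match sk ...
 *	}
 *	if (get_nulls_value(node) != slot)
 *		goto begin;
 */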
struct inet_listen_hashbucket {
	struct hlist_nulls_head	nulls_head;
};

/* This is for listening sockets, thus all sockets which possess wildcards. */
#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 */
	struct inet_ehash_bucket	*ehash;
	spinlock_t			*ehash_locks;
	unsigned int			ehash_mask;
	unsigned int			ehash_locks_mask;

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct kmem_cache		*bind_bucket_cachep;
	/* This bind table is hashed by local port */
	struct inet_bind_hashbucket	*bhash;
	struct kmem_cache		*bind2_bucket_cachep;
	/* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4)
	 * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used
	 * primarily for expediting bind conflict resolution.
	 */
	struct inet_bind_hashbucket	*bhash2;
	unsigned int			bhash_size;

	/* The 2nd listener table hashed by local port and address */
	unsigned int			lhash2_mask;
	struct inet_listen_hashbucket	*lhash2;
} ____cacheline_aligned_in_smp;
static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IP_DCCP)
	return sk->sk_prot->h.hashinfo ? :
		sock_net(sk)->ipv4.tcp_death_row.hashinfo;
#else
	return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
#endif
}
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
	return &h->lhash2[hash & h->lhash2_mask];
}

static inline struct inet_ehash_bucket *inet_ehash_bucket(
	struct inet_hashinfo *hashinfo,
	unsigned int hash)
{
	return &hashinfo->ehash[hash & hashinfo->ehash_mask];
}

static inline spinlock_t *inet_ehash_lockp(
	struct inet_hashinfo *hashinfo,
	unsigned int hash)
{
	return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
}
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);

static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
{
	kfree(h->lhash2);
	h->lhash2 = NULL;
}

static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
	kvfree(hashinfo->ehash_locks);
	hashinfo->ehash_locks = NULL;
}

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
						 unsigned int ehash_entries);
void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo);
struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
			struct inet_bind_hashbucket *head,
			const unsigned short snum, int l3mdev);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
			      struct inet_bind_bucket *tb);

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
			    const struct net *net, unsigned short port,
			    int l3mdev);

struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
			 struct inet_bind_hashbucket *head,
			 struct inet_bind_bucket *tb,
			 const struct sock *sk);
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
			       struct inet_bind2_bucket *tb);

struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head,
		       const struct net *net,
		       unsigned short port, int l3mdev,
		       const struct sock *sk);

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb,
				      const struct net *net, unsigned short port,
				      int l3mdev, const struct sock *sk);
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
			       const u32 bhash_size)
{
	return (lport + net_hash_mix(net)) & (bhash_size - 1);
}
static inline struct inet_bind_hashbucket *
inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
		      const struct net *net, unsigned short port)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
	else
#endif
		hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);

/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or
 * sk_v6_rcv_saddr (ipv6) changes after it has been bound. The socket's
 * rcv_saddr field should already have been updated when this is called.
 */
int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family);
void inet_bhash2_reset_saddr(struct sock *sk);
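/* A hedged usage sketch of the ordering the comment above prescribes:
 * the caller writes the new source address into the socket first, then
 * asks bhash2 to rehash (new_saddr here is illustrative):
 *
 *	sk->sk_rcv_saddr = new_saddr;
 *	err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
 *	if (err)
 *		... handle failure, e.g. via inet_bhash2_reset_saddr() ...
 */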
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    struct inet_bind2_bucket *tb2, unsigned short port);

/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);

void inet_put_port(struct sock *sk);

void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
			 unsigned long numentries, int scale,
			 unsigned long low_limit,
			 unsigned long high_limit);
int inet_hashinfo2_init_mod(struct inet_hashinfo *h);

bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
			 bool *found_dup_sk);

int __inet_hash(struct sock *sk, struct sock *osk);
int inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);
struct sock *__inet_lookup_listener(const struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, const __be16 sport,
				    const __be32 daddr,
				    const unsigned short hnum,
				    const int dif, const int sdif);

static inline struct sock *inet_lookup_listener(struct net *net,
		struct inet_hashinfo *hashinfo,
		struct sk_buff *skb, int doff,
		__be32 saddr, __be16 sport,
		__be32 daddr, __be16 dport, int dif, int sdif)
{
	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
				      daddr, ntohs(dport), dif, sdif);
}
/* Socket demux engine toys. */
/* What happens here is ugly; there's a pair of adjacent fields in
   struct inet_sock; __be16 dport followed by __u16 num.  We want to
   search by pair, so we combine the keys into a single 32bit value
   and compare with 32bit value read from &...->dport.  Let's at least
   make sure that it's not mixed with anything else...
   On 64bit targets we combine comparisons with pair of adjacent __be32
   fields in the same way.
*/

#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
	((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
#else /* __LITTLE_ENDIAN */
#define INET_COMBINED_PORTS(__sport, __dport) \
	((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
#endif

#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
	const __addrpair __name = (__force __addrpair) ( \
				   (((__force __u64)(__be32)(__saddr)) << 32) | \
				   ((__force __u64)(__be32)(__daddr)))
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
	const __addrpair __name = (__force __addrpair) ( \
				   (((__force __u64)(__be32)(__daddr)) << 32) | \
				   ((__force __u64)(__be32)(__saddr)))
#endif /* __BIG_ENDIAN */
static inline bool inet_match(const struct net *net, const struct sock *sk,
			      const __addrpair cookie, const __portpair ports,
			      int dif, int sdif)
{
	if (!net_eq(sock_net(sk), net) ||
	    sk->sk_portpair != ports ||
	    sk->sk_addrpair != cookie)
		return false;

	/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
	return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
				    sdif);
}
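/* Putting the pieces together: a lookup loop builds the address cookie
 * and port pair once, then compares each candidate with a single wide
 * load per field.  A minimal sketch in the spirit of
 * __inet_lookup_established() (saddr/daddr/sport/hnum/head are the
 * caller's lookup keys):
 *
 *	INET_ADDR_COOKIE(acookie, saddr, daddr);
 *	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
 *	const struct hlist_nulls_node *node;
 *	struct sock *sk;
 *
 *	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 *		if (inet_match(net, sk, acookie, ports, dif, sdif))
 *			return sk;
 *	}
 */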
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
 * not check it for lookups anymore, thanks Alexey. -DaveM
 */
struct sock *__inet_lookup_established(const struct net *net,
				       struct inet_hashinfo *hashinfo,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const u16 hnum,
				       const int dif, const int sdif);

typedef u32 (inet_ehashfn_t)(const struct net *net,
			     const __be32 laddr, const __u16 lport,
			     const __be32 faddr, const __be16 fport);

inet_ehashfn_t inet_ehashfn;

INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn);
struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
				   struct sk_buff *skb, int doff,
				   __be32 saddr, __be16 sport,
				   __be32 daddr, unsigned short hnum,
				   inet_ehashfn_t *ehashfn);
struct sock *inet_lookup_run_sk_lookup(const struct net *net,
				       int protocol,
				       struct sk_buff *skb, int doff,
				       __be32 saddr, __be16 sport,
				       __be32 daddr, u16 hnum, const int dif,
				       inet_ehashfn_t *ehashfn);
static inline struct sock *
inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
			const __be32 saddr, const __be16 sport,
			const __be32 daddr, const __be16 dport,
			const int dif)
{
	return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
					 ntohs(dport), dif, 0);
}
static inline struct sock *__inet_lookup(struct net *net,
					 struct inet_hashinfo *hashinfo,
					 struct sk_buff *skb, int doff,
					 const __be32 saddr, const __be16 sport,
					 const __be32 daddr, const __be16 dport,
					 const int dif, const int sdif,
					 bool *refcounted)
{
	u16 hnum = ntohs(dport);
	struct sock *sk;

	sk = __inet_lookup_established(net, hashinfo, saddr, sport,
				       daddr, hnum, dif, sdif);
	*refcounted = true;
	if (sk)
		return sk;
	*refcounted = false;
	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
				      sport, daddr, hnum, dif, sdif);
}
static inline struct sock *inet_lookup(struct net *net,
				       struct inet_hashinfo *hashinfo,
				       struct sk_buff *skb, int doff,
				       const __be32 saddr, const __be16 sport,
				       const __be32 daddr, const __be16 dport,
				       const int dif)
{
	struct sock *sk;
	bool refcounted;

	sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
			   dport, dif, 0, &refcounted);

	if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
		sk = NULL;
	return sk;
}
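/* A minimal usage sketch: a non-NULL socket returned by inet_lookup()
 * always carries a reference (the helper takes one when the lookup path
 * did not), so callers pair it with sock_put().  The argument values
 * below are illustrative:
 *
 *	struct sock *sk;
 *
 *	sk = inet_lookup(net, hashinfo, skb, doff, saddr, sport,
 *			 daddr, dport, inet_iif(skb));
 *	if (sk) {
 *		... use sk ...
 *		sock_put(sk);
 *	}
 */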
static inline
struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff,
			     const __be32 saddr, const __be16 sport,
			     const __be32 daddr, const __be16 dport,
			     bool *refcounted, inet_ehashfn_t *ehashfn)
{
	struct sock *sk, *reuse_sk;
	bool prefetched;

	sk = skb_steal_sock(skb, refcounted, &prefetched);
	if (!sk)
		return NULL;

	if (!prefetched || !sk_fullsock(sk))
		return sk;

	if (sk->sk_protocol == IPPROTO_TCP) {
		if (sk->sk_state != TCP_LISTEN)
			return sk;
	} else if (sk->sk_protocol == IPPROTO_UDP) {
		if (sk->sk_state != TCP_CLOSE)
			return sk;
	} else {
		return sk;
	}

	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff,
					 saddr, sport, daddr, ntohs(dport),
					 ehashfn);
	if (!reuse_sk)
		return sk;

	/* We've chosen a new reuseport sock which is never refcounted. This
	 * implies that sk also isn't refcounted.
	 */
	WARN_ON_ONCE(*refcounted);

	return reuse_sk;
}
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
					     struct sk_buff *skb,
					     int doff,
					     const __be16 sport,
					     const __be16 dport,
					     const int sdif,
					     bool *refcounted)
{
	struct net *net = dev_net(skb_dst(skb)->dev);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *sk;

	sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport,
			     refcounted, inet_ehashfn);
	if (IS_ERR(sk))
		return NULL;
	if (sk)
		return sk;

	return __inet_lookup(net, hashinfo, skb,
			     doff, iph->saddr, sport,
			     iph->daddr, dport, inet_iif(skb), sdif,
			     refcounted);
}
static inline void sk_daddr_set(struct sock *sk, __be32 addr)
{
	sk->sk_daddr = addr; /* alias of inet_daddr */
#if IS_ENABLED(CONFIG_IPV6)
	ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr);
#endif
}

static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
{
	sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */
#if IS_ENABLED(CONFIG_IPV6)
	ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr);
#endif
}
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
			struct sock *sk, u64 port_offset,
			int (*check_established)(struct inet_timewait_death_row *,
						 struct sock *, __u16,
						 struct inet_timewait_sock **));

int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk);

#endif /* _INET_HASHTABLES_H */