// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/inet_hashtables.h>
#include <net/transp_v6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
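/* tcp_hashinfo is the global lookup table for TCP sockets: it holds the
 * bound/listening buckets and the established/TIME_WAIT ehash that
 * __inet_lookup_established() and friends search for every incoming segment.
 */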
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
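/* Both helpers above are hooked into the af-independent request handling
 * (see tcp_request_sock_ipv4_ops below): the initial sequence number and the
 * per-connection timestamp offset are derived from the address/port 4-tuple
 * with a keyed hash (secure_tcp_seq()/secure_tcp_ts_off()), so they are hard
 * to predict off-path yet stable for a given flow.
 */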
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
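/* A non-zero return from tcp_twsk_unique() tells the connect() path that a
 * TIME-WAIT socket occupying the same 4-tuple may be reused: the new
 * connection starts with a write_seq well above the old one and inherits the
 * remembered timestamps, so stale duplicates cannot be confused with new data.
 */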
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
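/* tcp_v4_connect() is reached from the connect(2) system call via
 * __inet_stream_connect(), with the socket lock already held.
 */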
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
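/* tcp_v4_mtu_reduced() runs either directly from tcp_v4_err() or deferred via
 * tcp_release_cb() when the ICMP arrived while the socket was owned by user
 * context (see the TCP_MTU_REDUCED_DEFERRED flag used below).
 */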
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
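/* For TCP_NEW_SYN_RECV sockets the error cannot be reported through accept(),
 * so tcp_req_err() either drops the request (abort == true, hard errors) or
 * ignores the ICMP and lets the SYN-ACK retransmit timer deal with it.
 */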
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
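/* The RTO-revert logic in tcp_v4_err() above implements
 * draft-zimmermann-tcp-lcd: a net/host unreachable ICMP matching the oldest
 * unacked segment undoes one backoff step and rearms (or immediately fires)
 * the retransmit timer.
 */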
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
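/* Only the pseudo-header checksum is seeded here; the NIC or
 * skb_checksum_help() completes it later using the csum_start/csum_offset
 * values set above (CHECKSUM_PARTIAL).
 */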
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
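/* tcp_v4_send_reset() answers from the per-cpu control sockets
 * (net->ipv4.tcp_sk) rather than from the receiving socket, because a reset
 * may have to be generated when no local socket exists at all.
 */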
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
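/* tcp_v4_send_ack() serves the two "stateless" ACK cases below: replying on
 * behalf of a TIME-WAIT socket and replying for a request socket that is
 * still in SYN-RECV.
 */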
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
1004 /* Find the Key structure for an address. */
1005 struct tcp_md5sig_key
*__tcp_md5_do_lookup(const struct sock
*sk
, int l3index
,
1006 const union tcp_md5_addr
*addr
,
1009 const struct tcp_sock
*tp
= tcp_sk(sk
);
1010 struct tcp_md5sig_key
*key
;
1011 const struct tcp_md5sig_info
*md5sig
;
1013 struct tcp_md5sig_key
*best_match
= NULL
;
1016 /* caller either holds rcu_read_lock() or socket lock */
1017 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
1018 lockdep_sock_is_held(sk
));
1022 hlist_for_each_entry_rcu(key
, &md5sig
->head
, node
) {
1023 if (key
->family
!= family
)
1025 if (key
->l3index
&& key
->l3index
!= l3index
)
1027 if (family
== AF_INET
) {
1028 mask
= inet_make_mask(key
->prefixlen
);
1029 match
= (key
->addr
.a4
.s_addr
& mask
) ==
1030 (addr
->a4
.s_addr
& mask
);
1031 #if IS_ENABLED(CONFIG_IPV6)
1032 } else if (family
== AF_INET6
) {
1033 match
= ipv6_prefix_equal(&key
->addr
.a6
, &addr
->a6
,
1040 if (match
&& (!best_match
||
1041 key
->prefixlen
> best_match
->prefixlen
))
1046 EXPORT_SYMBOL(__tcp_md5_do_lookup
);
1048 static struct tcp_md5sig_key
*tcp_md5_do_lookup_exact(const struct sock
*sk
,
1049 const union tcp_md5_addr
*addr
,
1050 int family
, u8 prefixlen
,
1053 const struct tcp_sock
*tp
= tcp_sk(sk
);
1054 struct tcp_md5sig_key
*key
;
1055 unsigned int size
= sizeof(struct in_addr
);
1056 const struct tcp_md5sig_info
*md5sig
;
1058 /* caller either holds rcu_read_lock() or socket lock */
1059 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
1060 lockdep_sock_is_held(sk
));
1063 #if IS_ENABLED(CONFIG_IPV6)
1064 if (family
== AF_INET6
)
1065 size
= sizeof(struct in6_addr
);
1067 hlist_for_each_entry_rcu(key
, &md5sig
->head
, node
) {
1068 if (key
->family
!= family
)
1070 if (key
->l3index
&& key
->l3index
!= l3index
)
1072 if (!memcmp(&key
->addr
, addr
, size
) &&
1073 key
->prefixlen
== prefixlen
)
1079 struct tcp_md5sig_key
*tcp_v4_md5_lookup(const struct sock
*sk
,
1080 const struct sock
*addr_sk
)
1082 const union tcp_md5_addr
*addr
;
1085 l3index
= l3mdev_master_ifindex_by_index(sock_net(sk
),
1086 addr_sk
->sk_bound_dev_if
);
1087 addr
= (const union tcp_md5_addr
*)&addr_sk
->sk_daddr
;
1088 return tcp_md5_do_lookup(sk
, l3index
, addr
, AF_INET
);
1090 EXPORT_SYMBOL(tcp_v4_md5_lookup
);
1092 /* This can be called on a newly created socket, from other files */
1093 int tcp_md5_do_add(struct sock
*sk
, const union tcp_md5_addr
*addr
,
1094 int family
, u8 prefixlen
, int l3index
,
1095 const u8
*newkey
, u8 newkeylen
, gfp_t gfp
)
1097 /* Add Key to the list */
1098 struct tcp_md5sig_key
*key
;
1099 struct tcp_sock
*tp
= tcp_sk(sk
);
1100 struct tcp_md5sig_info
*md5sig
;
1102 key
= tcp_md5_do_lookup_exact(sk
, addr
, family
, prefixlen
, l3index
);
1104 /* Pre-existing entry - just update that one. */
1105 memcpy(key
->key
, newkey
, newkeylen
);
1106 key
->keylen
= newkeylen
;
1110 md5sig
= rcu_dereference_protected(tp
->md5sig_info
,
1111 lockdep_sock_is_held(sk
));
1113 md5sig
= kmalloc(sizeof(*md5sig
), gfp
);
1117 sk_nocaps_add(sk
, NETIF_F_GSO_MASK
);
1118 INIT_HLIST_HEAD(&md5sig
->head
);
1119 rcu_assign_pointer(tp
->md5sig_info
, md5sig
);
1122 key
= sock_kmalloc(sk
, sizeof(*key
), gfp
);
1125 if (!tcp_alloc_md5sig_pool()) {
1126 sock_kfree_s(sk
, key
, sizeof(*key
));
1130 memcpy(key
->key
, newkey
, newkeylen
);
1131 key
->keylen
= newkeylen
;
1132 key
->family
= family
;
1133 key
->prefixlen
= prefixlen
;
1134 key
->l3index
= l3index
;
1135 memcpy(&key
->addr
, addr
,
1136 (family
== AF_INET6
) ? sizeof(struct in6_addr
) :
1137 sizeof(struct in_addr
));
1138 hlist_add_head_rcu(&key
->node
, &md5sig
->head
);
1141 EXPORT_SYMBOL(tcp_md5_do_add
);
1143 int tcp_md5_do_del(struct sock
*sk
, const union tcp_md5_addr
*addr
, int family
,
1144 u8 prefixlen
, int l3index
)
1146 struct tcp_md5sig_key
*key
;
1148 key
= tcp_md5_do_lookup_exact(sk
, addr
, family
, prefixlen
, l3index
);
1151 hlist_del_rcu(&key
->node
);
1152 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1153 kfree_rcu(key
, rcu
);
1156 EXPORT_SYMBOL(tcp_md5_do_del
);
1158 static void tcp_clear_md5_list(struct sock
*sk
)
1160 struct tcp_sock
*tp
= tcp_sk(sk
);
1161 struct tcp_md5sig_key
*key
;
1162 struct hlist_node
*n
;
1163 struct tcp_md5sig_info
*md5sig
;
1165 md5sig
= rcu_dereference_protected(tp
->md5sig_info
, 1);
1167 hlist_for_each_entry_safe(key
, n
, &md5sig
->head
, node
) {
1168 hlist_del_rcu(&key
->node
);
1169 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1170 kfree_rcu(key
, rcu
);
1174 static int tcp_v4_parse_md5_keys(struct sock
*sk
, int optname
,
1175 char __user
*optval
, int optlen
)
1177 struct tcp_md5sig cmd
;
1178 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&cmd
.tcpm_addr
;
1179 const union tcp_md5_addr
*addr
;
1183 if (optlen
< sizeof(cmd
))
1186 if (copy_from_user(&cmd
, optval
, sizeof(cmd
)))
1189 if (sin
->sin_family
!= AF_INET
)
1192 if (optname
== TCP_MD5SIG_EXT
&&
1193 cmd
.tcpm_flags
& TCP_MD5SIG_FLAG_PREFIX
) {
1194 prefixlen
= cmd
.tcpm_prefixlen
;
1199 if (optname
== TCP_MD5SIG_EXT
&&
1200 cmd
.tcpm_flags
& TCP_MD5SIG_FLAG_IFINDEX
) {
1201 struct net_device
*dev
;
1204 dev
= dev_get_by_index_rcu(sock_net(sk
), cmd
.tcpm_ifindex
);
1205 if (dev
&& netif_is_l3_master(dev
))
1206 l3index
= dev
->ifindex
;
1210 /* ok to reference set/not set outside of rcu;
1211 * right now device MUST be an L3 master
1213 if (!dev
|| !l3index
)
1217 addr
= (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
;
1219 if (!cmd
.tcpm_keylen
)
1220 return tcp_md5_do_del(sk
, addr
, AF_INET
, prefixlen
, l3index
);
1222 if (cmd
.tcpm_keylen
> TCP_MD5SIG_MAXKEYLEN
)
1225 return tcp_md5_do_add(sk
, addr
, AF_INET
, prefixlen
, l3index
,
1226 cmd
.tcpm_key
, cmd
.tcpm_keylen
, GFP_KERNEL
);
1229 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool
*hp
,
1230 __be32 daddr
, __be32 saddr
,
1231 const struct tcphdr
*th
, int nbytes
)
1233 struct tcp4_pseudohdr
*bp
;
1234 struct scatterlist sg
;
1241 bp
->protocol
= IPPROTO_TCP
;
1242 bp
->len
= cpu_to_be16(nbytes
);
1244 _th
= (struct tcphdr
*)(bp
+ 1);
1245 memcpy(_th
, th
, sizeof(*th
));
1248 sg_init_one(&sg
, bp
, sizeof(*bp
) + sizeof(*th
));
1249 ahash_request_set_crypt(hp
->md5_req
, &sg
, NULL
,
1250 sizeof(*bp
) + sizeof(*th
));
1251 return crypto_ahash_update(hp
->md5_req
);
1254 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1255 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
)
1257 struct tcp_md5sig_pool
*hp
;
1258 struct ahash_request
*req
;
1260 hp
= tcp_get_md5sig_pool();
1262 goto clear_hash_noput
;
1265 if (crypto_ahash_init(req
))
1267 if (tcp_v4_md5_hash_headers(hp
, daddr
, saddr
, th
, th
->doff
<< 2))
1269 if (tcp_md5_hash_key(hp
, key
))
1271 ahash_request_set_crypt(req
, NULL
, md5_hash
, 0);
1272 if (crypto_ahash_final(req
))
1275 tcp_put_md5sig_pool();
1279 tcp_put_md5sig_pool();
1281 memset(md5_hash
, 0, 16);
1285 int tcp_v4_md5_hash_skb(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1286 const struct sock
*sk
,
1287 const struct sk_buff
*skb
)
1289 struct tcp_md5sig_pool
*hp
;
1290 struct ahash_request
*req
;
1291 const struct tcphdr
*th
= tcp_hdr(skb
);
1292 __be32 saddr
, daddr
;
1294 if (sk
) { /* valid for establish/request sockets */
1295 saddr
= sk
->sk_rcv_saddr
;
1296 daddr
= sk
->sk_daddr
;
1298 const struct iphdr
*iph
= ip_hdr(skb
);
1303 hp
= tcp_get_md5sig_pool();
1305 goto clear_hash_noput
;
1308 if (crypto_ahash_init(req
))
1311 if (tcp_v4_md5_hash_headers(hp
, daddr
, saddr
, th
, skb
->len
))
1313 if (tcp_md5_hash_skb_data(hp
, skb
, th
->doff
<< 2))
1315 if (tcp_md5_hash_key(hp
, key
))
1317 ahash_request_set_crypt(req
, NULL
, md5_hash
, 0);
1318 if (crypto_ahash_final(req
))
1321 tcp_put_md5sig_pool();
1325 tcp_put_md5sig_pool();
1327 memset(md5_hash
, 0, 16);
1330 EXPORT_SYMBOL(tcp_v4_md5_hash_skb
);
1334 /* Called with rcu_read_lock() */
1335 static bool tcp_v4_inbound_md5_hash(const struct sock
*sk
,
1336 const struct sk_buff
*skb
,
1339 #ifdef CONFIG_TCP_MD5SIG
1341 * This gets called for each TCP segment that arrives
1342 * so we want to be efficient.
1343 * We have 3 drop cases:
1344 * o No MD5 hash and one expected.
1345 * o MD5 hash and we're not expecting one.
1346 * o MD5 hash and its wrong.
1348 const __u8
*hash_location
= NULL
;
1349 struct tcp_md5sig_key
*hash_expected
;
1350 const struct iphdr
*iph
= ip_hdr(skb
);
1351 const struct tcphdr
*th
= tcp_hdr(skb
);
1352 const union tcp_md5_addr
*addr
;
1353 unsigned char newhash
[16];
1354 int genhash
, l3index
;
1356 /* sdif set, means packet ingressed via a device
1357 * in an L3 domain and dif is set to the l3mdev
1359 l3index
= sdif
? dif
: 0;
1361 addr
= (union tcp_md5_addr
*)&iph
->saddr
;
1362 hash_expected
= tcp_md5_do_lookup(sk
, l3index
, addr
, AF_INET
);
1363 hash_location
= tcp_parse_md5sig_option(th
);
1365 /* We've parsed the options - do we have a hash? */
1366 if (!hash_expected
&& !hash_location
)
1369 if (hash_expected
&& !hash_location
) {
1370 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5NOTFOUND
);
1374 if (!hash_expected
&& hash_location
) {
1375 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5UNEXPECTED
);
1379 /* Okay, so this is hash_expected and hash_location -
1380 * so we need to calculate the checksum.
1382 genhash
= tcp_v4_md5_hash_skb(newhash
,
1386 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0) {
1387 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5FAILURE
);
1388 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1389 &iph
->saddr
, ntohs(th
->source
),
1390 &iph
->daddr
, ntohs(th
->dest
),
1391 genhash
? " tcp_v4_calc_md5_hash failed"
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
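/* tcp_v4_conn_request() only filters out broadcast/multicast SYNs; the real
 * work (SYN cookies, request sock allocation, SYN-ACK transmission) is done
 * by the af-independent tcp_conn_request() driven by the two ops tables
 * defined above.
 */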
1462 * The three way handshake has completed - we got a valid synack -
1463 * now create the new socket.
1465 struct sock
*tcp_v4_syn_recv_sock(const struct sock
*sk
, struct sk_buff
*skb
,
1466 struct request_sock
*req
,
1467 struct dst_entry
*dst
,
1468 struct request_sock
*req_unhash
,
1471 struct inet_request_sock
*ireq
;
1472 struct inet_sock
*newinet
;
1473 struct tcp_sock
*newtp
;
1475 #ifdef CONFIG_TCP_MD5SIG
1476 const union tcp_md5_addr
*addr
;
1477 struct tcp_md5sig_key
*key
;
1480 struct ip_options_rcu
*inet_opt
;
1482 if (sk_acceptq_is_full(sk
))
1485 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1489 newsk
->sk_gso_type
= SKB_GSO_TCPV4
;
1490 inet_sk_rx_dst_set(newsk
, skb
);
1492 newtp
= tcp_sk(newsk
);
1493 newinet
= inet_sk(newsk
);
1494 ireq
= inet_rsk(req
);
1495 sk_daddr_set(newsk
, ireq
->ir_rmt_addr
);
1496 sk_rcv_saddr_set(newsk
, ireq
->ir_loc_addr
);
1497 newsk
->sk_bound_dev_if
= ireq
->ir_iif
;
1498 newinet
->inet_saddr
= ireq
->ir_loc_addr
;
1499 inet_opt
= rcu_dereference(ireq
->ireq_opt
);
1500 RCU_INIT_POINTER(newinet
->inet_opt
, inet_opt
);
1501 newinet
->mc_index
= inet_iif(skb
);
1502 newinet
->mc_ttl
= ip_hdr(skb
)->ttl
;
1503 newinet
->rcv_tos
= ip_hdr(skb
)->tos
;
1504 inet_csk(newsk
)->icsk_ext_hdr_len
= 0;
1506 inet_csk(newsk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
1507 newinet
->inet_id
= prandom_u32();
1510 dst
= inet_csk_route_child_sock(sk
, newsk
, req
);
1514 /* syncookie case : see end of cookie_v4_check() */
1516 sk_setup_caps(newsk
, dst
);
1518 tcp_ca_openreq_child(newsk
, dst
);
1520 tcp_sync_mss(newsk
, dst_mtu(dst
));
1521 newtp
->advmss
= tcp_mss_clamp(tcp_sk(sk
), dst_metric_advmss(dst
));
1523 tcp_initialize_rcv_mss(newsk
);
1525 #ifdef CONFIG_TCP_MD5SIG
1526 l3index
= l3mdev_master_ifindex_by_index(sock_net(sk
), ireq
->ir_iif
);
1527 /* Copy over the MD5 key from the original socket */
1528 addr
= (union tcp_md5_addr
*)&newinet
->inet_daddr
;
1529 key
= tcp_md5_do_lookup(sk
, l3index
, addr
, AF_INET
);
1532 * We're using one, so create a matching key
1533 * on the newsk structure. If we fail to get
1534 * memory, then we end up not copying the key
1537 tcp_md5_do_add(newsk
, addr
, AF_INET
, 32, l3index
,
1538 key
->key
, key
->keylen
, GFP_ATOMIC
);
1539 sk_nocaps_add(newsk
, NETIF_F_GSO_MASK
);
1543 if (__inet_inherit_port(sk
, newsk
) < 0)
1545 *own_req
= inet_ehash_nolisten(newsk
, req_to_sk(req_unhash
));
1546 if (likely(*own_req
)) {
1547 tcp_move_syn(newtp
, req
);
1548 ireq
->ireq_opt
= NULL
;
1550 newinet
->inet_opt
= NULL
;
1555 NET_INC_STATS(sock_net(sk
), LINUX_MIB_LISTENOVERFLOWS
);
1562 newinet
->inet_opt
= NULL
;
1563 inet_csk_prepare_forced_close(newsk
);
1567 EXPORT_SYMBOL(tcp_v4_syn_recv_sock
);
1569 static struct sock
*tcp_v4_cookie_check(struct sock
*sk
, struct sk_buff
*skb
)
1571 #ifdef CONFIG_SYN_COOKIES
1572 const struct tcphdr
*th
= tcp_hdr(skb
);
1575 sk
= cookie_v4_check(sk
, skb
);
1580 u16
tcp_v4_get_syncookie(struct sock
*sk
, struct iphdr
*iph
,
1581 struct tcphdr
*th
, u32
*cookie
)
1584 #ifdef CONFIG_SYN_COOKIES
1585 mss
= tcp_get_syncookie_mss(&tcp_request_sock_ops
,
1586 &tcp_request_sock_ipv4_ops
, sk
, th
);
1588 *cookie
= __cookie_v4_init_sequence(iph
, th
, &mss
);
1589 tcp_synq_overflow(sk
);
1595 /* The socket must have it's spinlock held when we get
1596 * here, unless it is a TCP_LISTEN socket.
1598 * We have a potential double-lock case here, so even when
1599 * doing backlog processing we use the BH locking scheme.
1600 * This is because we cannot sleep with the original spinlock
1603 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1607 if (sk
->sk_state
== TCP_ESTABLISHED
) { /* Fast path */
1608 struct dst_entry
*dst
= sk
->sk_rx_dst
;
1610 sock_rps_save_rxhash(sk
, skb
);
1611 sk_mark_napi_id(sk
, skb
);
1613 if (inet_sk(sk
)->rx_dst_ifindex
!= skb
->skb_iif
||
1614 !dst
->ops
->check(dst
, 0)) {
1616 sk
->sk_rx_dst
= NULL
;
1619 tcp_rcv_established(sk
, skb
);
1623 if (tcp_checksum_complete(skb
))
1626 if (sk
->sk_state
== TCP_LISTEN
) {
1627 struct sock
*nsk
= tcp_v4_cookie_check(sk
, skb
);
1632 if (tcp_child_process(sk
, nsk
, skb
)) {
1639 sock_rps_save_rxhash(sk
, skb
);
1641 if (tcp_rcv_state_process(sk
, skb
)) {
1648 tcp_v4_send_reset(rsk
, skb
);
1651 /* Be careful here. If this function gets more complicated and
1652 * gcc suffers from register pressure on the x86, sk (in %ebx)
1653 * might be destroyed here. This current version compiles correctly,
1654 * but you have been warned.
1659 TCP_INC_STATS(sock_net(sk
), TCP_MIB_CSUMERRORS
);
1660 TCP_INC_STATS(sock_net(sk
), TCP_MIB_INERRS
);
1663 EXPORT_SYMBOL(tcp_v4_do_rcv
);
1665 int tcp_v4_early_demux(struct sk_buff
*skb
)
1667 const struct iphdr
*iph
;
1668 const struct tcphdr
*th
;
1671 if (skb
->pkt_type
!= PACKET_HOST
)
1674 if (!pskb_may_pull(skb
, skb_transport_offset(skb
) + sizeof(struct tcphdr
)))
1680 if (th
->doff
< sizeof(struct tcphdr
) / 4)
1683 sk
= __inet_lookup_established(dev_net(skb
->dev
), &tcp_hashinfo
,
1684 iph
->saddr
, th
->source
,
1685 iph
->daddr
, ntohs(th
->dest
),
1686 skb
->skb_iif
, inet_sdif(skb
));
1689 skb
->destructor
= sock_edemux
;
1690 if (sk_fullsock(sk
)) {
1691 struct dst_entry
*dst
= READ_ONCE(sk
->sk_rx_dst
);
1694 dst
= dst_check(dst
, 0);
1696 inet_sk(sk
)->rx_dst_ifindex
== skb
->skb_iif
)
1697 skb_dst_set_noref(skb
, dst
);
1703 bool tcp_add_backlog(struct sock
*sk
, struct sk_buff
*skb
)
1705 u32 limit
= READ_ONCE(sk
->sk_rcvbuf
) + READ_ONCE(sk
->sk_sndbuf
);
1706 struct skb_shared_info
*shinfo
;
1707 const struct tcphdr
*th
;
1708 struct tcphdr
*thtail
;
1709 struct sk_buff
*tail
;
1710 unsigned int hdrlen
;
1715 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1716 * we can fix skb->truesize to its real value to avoid future drops.
1717 * This is valid because skb is not yet charged to the socket.
1718 * It has been noticed pure SACK packets were sometimes dropped
1719 * (if cooked by drivers without copybreak feature).
1725 if (unlikely(tcp_checksum_complete(skb
))) {
1727 __TCP_INC_STATS(sock_net(sk
), TCP_MIB_CSUMERRORS
);
1728 __TCP_INC_STATS(sock_net(sk
), TCP_MIB_INERRS
);
1732 /* Attempt coalescing to last skb in backlog, even if we are
1734 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1736 th
= (const struct tcphdr
*)skb
->data
;
1737 hdrlen
= th
->doff
* 4;
1738 shinfo
= skb_shinfo(skb
);
1740 if (!shinfo
->gso_size
)
1741 shinfo
->gso_size
= skb
->len
- hdrlen
;
1743 if (!shinfo
->gso_segs
)
1744 shinfo
->gso_segs
= 1;
1746 tail
= sk
->sk_backlog
.tail
;
1749 thtail
= (struct tcphdr
*)tail
->data
;
1751 if (TCP_SKB_CB(tail
)->end_seq
!= TCP_SKB_CB(skb
)->seq
||
1752 TCP_SKB_CB(tail
)->ip_dsfield
!= TCP_SKB_CB(skb
)->ip_dsfield
||
1753 ((TCP_SKB_CB(tail
)->tcp_flags
|
1754 TCP_SKB_CB(skb
)->tcp_flags
) & (TCPHDR_SYN
| TCPHDR_RST
| TCPHDR_URG
)) ||
1755 !((TCP_SKB_CB(tail
)->tcp_flags
&
1756 TCP_SKB_CB(skb
)->tcp_flags
) & TCPHDR_ACK
) ||
1757 ((TCP_SKB_CB(tail
)->tcp_flags
^
1758 TCP_SKB_CB(skb
)->tcp_flags
) & (TCPHDR_ECE
| TCPHDR_CWR
)) ||
1759 #ifdef CONFIG_TLS_DEVICE
1760 tail
->decrypted
!= skb
->decrypted
||
1762 thtail
->doff
!= th
->doff
||
1763 memcmp(thtail
+ 1, th
+ 1, hdrlen
- sizeof(*th
)))
1766 __skb_pull(skb
, hdrlen
);
1767 if (skb_try_coalesce(tail
, skb
, &fragstolen
, &delta
)) {
1768 thtail
->window
= th
->window
;
1770 TCP_SKB_CB(tail
)->end_seq
= TCP_SKB_CB(skb
)->end_seq
;
1772 if (after(TCP_SKB_CB(skb
)->ack_seq
, TCP_SKB_CB(tail
)->ack_seq
))
1773 TCP_SKB_CB(tail
)->ack_seq
= TCP_SKB_CB(skb
)->ack_seq
;
1775 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1776 * thtail->fin, so that the fast path in tcp_rcv_established()
1777 * is not entered if we append a packet with a FIN.
1778 * SYN, RST, URG are not present.
1779 * ACK is set on both packets.
1780 * PSH : we do not really care in TCP stack,
1781 * at least for 'GRO' packets.
1783 thtail
->fin
|= th
->fin
;
1784 TCP_SKB_CB(tail
)->tcp_flags
|= TCP_SKB_CB(skb
)->tcp_flags
;
1786 if (TCP_SKB_CB(skb
)->has_rxtstamp
) {
1787 TCP_SKB_CB(tail
)->has_rxtstamp
= true;
1788 tail
->tstamp
= skb
->tstamp
;
1789 skb_hwtstamps(tail
)->hwtstamp
= skb_hwtstamps(skb
)->hwtstamp
;
1792 /* Not as strict as GRO. We only need to carry mss max value */
1793 skb_shinfo(tail
)->gso_size
= max(shinfo
->gso_size
,
1794 skb_shinfo(tail
)->gso_size
);
1796 gso_segs
= skb_shinfo(tail
)->gso_segs
+ shinfo
->gso_segs
;
1797 skb_shinfo(tail
)->gso_segs
= min_t(u32
, gso_segs
, 0xFFFF);
1799 sk
->sk_backlog
.len
+= delta
;
1800 __NET_INC_STATS(sock_net(sk
),
1801 LINUX_MIB_TCPBACKLOGCOALESCE
);
1802 kfree_skb_partial(skb
, fragstolen
);
1805 __skb_push(skb
, hdrlen
);
1808 /* Only socket owner can try to collapse/prune rx queues
1809 * to reduce memory overhead, so add a little headroom here.
1810 * Few sockets backlog are possibly concurrently non empty.
1814 if (unlikely(sk_add_backlog(sk
, skb
, limit
))) {
1816 __NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPBACKLOGDROP
);
1821 EXPORT_SYMBOL(tcp_add_backlog
);
1823 int tcp_filter(struct sock
*sk
, struct sk_buff
*skb
)
1825 struct tcphdr
*th
= (struct tcphdr
*)skb
->data
;
1827 return sk_filter_trim_cap(sk
, skb
, th
->doff
* 4);
1829 EXPORT_SYMBOL(tcp_filter
);
1831 static void tcp_v4_restore_cb(struct sk_buff
*skb
)
1833 memmove(IPCB(skb
), &TCP_SKB_CB(skb
)->header
.h4
,
1834 sizeof(struct inet_skb_parm
));
1837 static void tcp_v4_fill_cb(struct sk_buff
*skb
, const struct iphdr
*iph
,
1838 const struct tcphdr
*th
)
1840 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1841 * barrier() makes sure compiler wont play fool^Waliasing games.
1843 memmove(&TCP_SKB_CB(skb
)->header
.h4
, IPCB(skb
),
1844 sizeof(struct inet_skb_parm
));
1847 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1848 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1849 skb
->len
- th
->doff
* 4);
1850 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1851 TCP_SKB_CB(skb
)->tcp_flags
= tcp_flag_byte(th
);
1852 TCP_SKB_CB(skb
)->tcp_tw_isn
= 0;
1853 TCP_SKB_CB(skb
)->ip_dsfield
= ipv4_get_dsfield(iph
);
1854 TCP_SKB_CB(skb
)->sacked
= 0;
1855 TCP_SKB_CB(skb
)->has_rxtstamp
=
1856 skb
->tstamp
|| skb_hwtstamps(skb
)->hwtstamp
;
1863 int tcp_v4_rcv(struct sk_buff
*skb
)
1865 struct net
*net
= dev_net(skb
->dev
);
1866 struct sk_buff
*skb_to_free
;
1867 int sdif
= inet_sdif(skb
);
1868 int dif
= inet_iif(skb
);
1869 const struct iphdr
*iph
;
1870 const struct tcphdr
*th
;
1875 if (skb
->pkt_type
!= PACKET_HOST
)
1878 /* Count it even if it's bad */
1879 __TCP_INC_STATS(net
, TCP_MIB_INSEGS
);
1881 if (!pskb_may_pull(skb
, sizeof(struct tcphdr
)))
1884 th
= (const struct tcphdr
*)skb
->data
;
1886 if (unlikely(th
->doff
< sizeof(struct tcphdr
) / 4))
1888 if (!pskb_may_pull(skb
, th
->doff
* 4))
1891 /* An explanation is required here, I think.
1892 * Packet length and doff are validated by header prediction,
1893 * provided case of th->doff==0 is eliminated.
1894 * So, we defer the checks. */
1896 if (skb_checksum_init(skb
, IPPROTO_TCP
, inet_compute_pseudo
))
1899 th
= (const struct tcphdr
*)skb
->data
;
1902 sk
= __inet_lookup_skb(&tcp_hashinfo
, skb
, __tcp_hdrlen(th
), th
->source
,
1903 th
->dest
, sdif
, &refcounted
);
1908 if (sk
->sk_state
== TCP_TIME_WAIT
)
1911 if (sk
->sk_state
== TCP_NEW_SYN_RECV
) {
1912 struct request_sock
*req
= inet_reqsk(sk
);
1913 bool req_stolen
= false;
1916 sk
= req
->rsk_listener
;
1917 if (unlikely(tcp_v4_inbound_md5_hash(sk
, skb
, dif
, sdif
))) {
1918 sk_drops_add(sk
, skb
);
1922 if (tcp_checksum_complete(skb
)) {
1926 if (unlikely(sk
->sk_state
!= TCP_LISTEN
)) {
1927 inet_csk_reqsk_queue_drop_and_put(sk
, req
);
1930 /* We own a reference on the listener, increase it again
1931 * as we might lose it too soon.
1936 if (!tcp_filter(sk
, skb
)) {
1937 th
= (const struct tcphdr
*)skb
->data
;
1939 tcp_v4_fill_cb(skb
, iph
, th
);
1940 nsk
= tcp_check_req(sk
, skb
, req
, false, &req_stolen
);
1945 /* Another cpu got exclusive access to req
1946 * and created a full blown socket.
1947 * Try to feed this packet to this socket
1948 * instead of discarding it.
1950 tcp_v4_restore_cb(skb
);
1954 goto discard_and_relse
;
1958 tcp_v4_restore_cb(skb
);
1959 } else if (tcp_child_process(sk
, nsk
, skb
)) {
1960 tcp_v4_send_reset(nsk
, skb
);
1961 goto discard_and_relse
;
1967 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
1968 __NET_INC_STATS(net
, LINUX_MIB_TCPMINTTLDROP
);
1969 goto discard_and_relse
;
1972 if (!xfrm4_policy_check(sk
, XFRM_POLICY_IN
, skb
))
1973 goto discard_and_relse
;
1975 if (tcp_v4_inbound_md5_hash(sk
, skb
, dif
, sdif
))
1976 goto discard_and_relse
;
1980 if (tcp_filter(sk
, skb
))
1981 goto discard_and_relse
;
1982 th
= (const struct tcphdr
*)skb
->data
;
1984 tcp_v4_fill_cb(skb
, iph
, th
);
1988 if (sk
->sk_state
== TCP_LISTEN
) {
1989 ret
= tcp_v4_do_rcv(sk
, skb
);
1990 goto put_and_return
;
1993 sk_incoming_cpu_update(sk
);
1995 bh_lock_sock_nested(sk
);
1996 tcp_segs_in(tcp_sk(sk
), skb
);
1998 if (!sock_owned_by_user(sk
)) {
1999 skb_to_free
= sk
->sk_rx_skb_cache
;
2000 sk
->sk_rx_skb_cache
= NULL
;
2001 ret
= tcp_v4_do_rcv(sk
, skb
);
2003 if (tcp_add_backlog(sk
, skb
))
2004 goto discard_and_relse
;
2009 __kfree_skb(skb_to_free
);
2018 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
))
2021 tcp_v4_fill_cb(skb
, iph
, th
);
2023 if (tcp_checksum_complete(skb
)) {
2025 __TCP_INC_STATS(net
, TCP_MIB_CSUMERRORS
);
2027 __TCP_INC_STATS(net
, TCP_MIB_INERRS
);
2029 tcp_v4_send_reset(NULL
, skb
);
2033 /* Discard frame. */
2038 sk_drops_add(sk
, skb
);
2044 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
)) {
2045 inet_twsk_put(inet_twsk(sk
));
2049 tcp_v4_fill_cb(skb
, iph
, th
);
2051 if (tcp_checksum_complete(skb
)) {
2052 inet_twsk_put(inet_twsk(sk
));
2055 switch (tcp_timewait_state_process(inet_twsk(sk
), skb
, th
)) {
2057 struct sock
*sk2
= inet_lookup_listener(dev_net(skb
->dev
),
2060 iph
->saddr
, th
->source
,
2061 iph
->daddr
, th
->dest
,
2065 inet_twsk_deschedule_put(inet_twsk(sk
));
2067 tcp_v4_restore_cb(skb
);
2075 tcp_v4_timewait_ack(sk
, skb
);
2078 tcp_v4_send_reset(sk
, skb
);
2079 inet_twsk_deschedule_put(inet_twsk(sk
));
2081 case TCP_TW_SUCCESS
:;
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
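/* ipv4_specific is the icsk_af_ops table installed for IPv4 TCP sockets in
 * tcp_v4_init_sock() below; tcp_sock_ipv4_specific supplies the MD5
 * signature hooks when CONFIG_TCP_MD5SIG is enabled.
 */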
2131 /* NOTE: A lot of things set to zero explicitly by call to
2132 * sk_alloc() so need not be done here.
2134 static int tcp_v4_init_sock(struct sock
*sk
)
2136 struct inet_connection_sock
*icsk
= inet_csk(sk
);
2140 icsk
->icsk_af_ops
= &ipv4_specific
;
2142 #ifdef CONFIG_TCP_MD5SIG
2143 tcp_sk(sk
)->af_specific
= &tcp_sock_ipv4_specific
;
2149 void tcp_v4_destroy_sock(struct sock
*sk
)
2151 struct tcp_sock
*tp
= tcp_sk(sk
);
2153 trace_tcp_destroy_sock(sk
);
2155 tcp_clear_xmit_timers(sk
);
2157 tcp_cleanup_congestion_control(sk
);
2159 tcp_cleanup_ulp(sk
);
2161 /* Cleanup up the write buffer. */
2162 tcp_write_queue_purge(sk
);
2164 /* Check if we want to disable active TFO */
2165 tcp_fastopen_active_disable_ofo_check(sk
);
2167 /* Cleans up our, hopefully empty, out_of_order_queue. */
2168 skb_rbtree_purge(&tp
->out_of_order_queue
);
2170 #ifdef CONFIG_TCP_MD5SIG
2171 /* Clean up the MD5 key list, if any */
2172 if (tp
->md5sig_info
) {
2173 tcp_clear_md5_list(sk
);
2174 kfree_rcu(rcu_dereference_protected(tp
->md5sig_info
, 1), rcu
);
2175 tp
->md5sig_info
= NULL
;
2179 /* Clean up a referenced TCP bind bucket. */
2180 if (inet_csk(sk
)->icsk_bind_hash
)
2183 BUG_ON(rcu_access_pointer(tp
->fastopen_rsk
));
2185 /* If socket is aborted during connect operation */
2186 tcp_free_fastopen_req(tp
);
2187 tcp_fastopen_destroy_cipher(sk
);
2188 tcp_saved_syn_free(tp
);
2190 sk_sockets_allocated_dec(sk
);
2192 EXPORT_SYMBOL(tcp_v4_destroy_sock
);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
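
/*
 * Editor's note (illustrative, not kernel code): the iterator above walks the
 * listening hash first and then the established (ehash) table, so a full dump
 * of /proc/net/tcp is conceptually:
 *
 *	for (bucket = 0; bucket < INET_LHTABLE_SIZE; bucket++)
 *		emit every listening socket in bucket;
 *	for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++)
 *		emit every established/TIME_WAIT socket in bucket;
 *
 * st->bucket, st->offset and st->num let tcp_seek_last_pos() resume the walk
 * cheaply when userspace reads the file in several chunks.
 */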
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
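
/*
 * Editor's note (illustrative, values invented): a typical line emitted by
 * get_tcp4_sock() for a listening socket on 127.0.0.1:22 looks roughly like
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  0 0 12345 1 0000000000000000 100 0 0 10 0
 *
 * reading left to right: slot, local addr:port (printed as the raw __be32 and
 * __be16 in hex, so 127.0.0.1 shows up as 0100007F on little-endian hosts),
 * remote addr:port, socket state (0x0A == TCP_LISTEN), tx_queue:rx_queue,
 * timer_active:tm->when, retransmits, uid, probes, inode, refcount, socket
 * pointer, rto, ato, (quick << 1) | pingpong, snd_cwnd, and finally either
 * ssthresh or, for listeners, the fastopen max_qlen.
 */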
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
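
/*
 * Editor's note: a minimal userspace sketch (not kernel code, error handling
 * mostly elided) showing how the per-namespace "tcp" proc entry registered
 * above is typically consumed; the parsing assumes only the row layout
 * printed by tcp4_seq_show():
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f)) {
 *			unsigned int sl, lip, lport, rip, rport, st;
 *
 *			// header line does not match and is skipped
 *			if (sscanf(line, " %u: %x:%x %x:%x %x",
 *				   &sl, &lip, &lport, &rip, &rport, &st) == 6)
 *				printf("sock %u state %#x\n", sl, st);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */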
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
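
/*
 * Editor's note (illustrative, not kernel code): tcp_prot is selected when
 * userspace opens an IPv4 stream socket, and its methods back the familiar
 * socket calls, roughly:
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);	// .init -> tcp_v4_init_sock()
 *	struct sockaddr_in sa = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { htonl(INADDR_LOOPBACK) },
 *	};
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));	// .connect -> tcp_v4_connect()
 *	close(fd);	// eventually .destroy -> tcp_v4_destroy_sock()
 */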
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
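
/*
 * Editor's note: the per-netns choice above (inherit the parent namespace's
 * congestion control, falling back to reno) only sets the default; a single
 * socket may still override it. Minimal userspace sketch (illustrative,
 * error handling elided):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno", strlen("reno"));
 *
 * Reading the option back with getsockopt(TCP_CONGESTION) reports whichever
 * module is actually attached to the socket.
 */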
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}