1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
83 #include <trace/events/tcp.h>
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
87 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
);
90 struct inet_hashinfo tcp_hashinfo
;
91 EXPORT_SYMBOL(tcp_hashinfo
);
93 static u32
tcp_v4_init_seq(const struct sk_buff
*skb
)
95 return secure_tcp_seq(ip_hdr(skb
)->daddr
,
98 tcp_hdr(skb
)->source
);
101 static u32
tcp_v4_init_ts_off(const struct net
*net
, const struct sk_buff
*skb
)
103 return secure_tcp_ts_off(net
, ip_hdr(skb
)->daddr
, ip_hdr(skb
)->saddr
);
106 int tcp_twsk_unique(struct sock
*sk
, struct sock
*sktw
, void *twp
)
108 const struct inet_timewait_sock
*tw
= inet_twsk(sktw
);
109 const struct tcp_timewait_sock
*tcptw
= tcp_twsk(sktw
);
110 struct tcp_sock
*tp
= tcp_sk(sk
);
111 int reuse
= sock_net(sk
)->ipv4
.sysctl_tcp_tw_reuse
;
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
118 bool loopback
= false;
119 if (tw
->tw_bound_dev_if
== LOOPBACK_IFINDEX
)
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw
->tw_family
== AF_INET6
) {
123 if (ipv6_addr_loopback(&tw
->tw_v6_daddr
) ||
124 (ipv6_addr_v4mapped(&tw
->tw_v6_daddr
) &&
125 (tw
->tw_v6_daddr
.s6_addr
[12] == 127)) ||
126 ipv6_addr_loopback(&tw
->tw_v6_rcv_saddr
) ||
127 (ipv6_addr_v4mapped(&tw
->tw_v6_rcv_saddr
) &&
128 (tw
->tw_v6_rcv_saddr
.s6_addr
[12] == 127)))
133 if (ipv4_is_loopback(tw
->tw_daddr
) ||
134 ipv4_is_loopback(tw
->tw_rcv_saddr
))
141 /* With PAWS, it is safe from the viewpoint
142 of data integrity. Even without PAWS it is safe provided sequence
143 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 Actually, the idea is close to VJ's one, only timestamp cache is
146 held not per host, but per port pair and TW bucket is used as state
149 If TW bucket has been already destroyed we fall back to VJ's scheme
150 and use initial timestamp retrieved from peer table.
152 if (tcptw
->tw_ts_recent_stamp
&&
153 (!twp
|| (reuse
&& time_after32(ktime_get_seconds(),
154 tcptw
->tw_ts_recent_stamp
)))) {
155 /* In case of repair and re-using TIME-WAIT sockets we still
156 * want to be sure that it is safe as above but honor the
157 * sequence numbers and time stamps set as part of the repair
160 * Without this check re-using a TIME-WAIT socket with TCP
161 * repair would accumulate a -1 on the repair assigned
162 * sequence number. The first time it is reused the sequence
163 * is -1, the second time -2, etc. This fixes that issue
164 * without appearing to create any others.
166 if (likely(!tp
->repair
)) {
167 tp
->write_seq
= tcptw
->tw_snd_nxt
+ 65535 + 2;
168 if (tp
->write_seq
== 0)
170 tp
->rx_opt
.ts_recent
= tcptw
->tw_ts_recent
;
171 tp
->rx_opt
.ts_recent_stamp
= tcptw
->tw_ts_recent_stamp
;
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique
);
181 static int tcp_v4_pre_connect(struct sock
*sk
, struct sockaddr
*uaddr
,
184 /* This check is replicated from tcp_v4_connect() and intended to
185 * prevent BPF program called below from accessing bytes that are out
186 * of the bound specified by user in addr_len.
188 if (addr_len
< sizeof(struct sockaddr_in
))
191 sock_owned_by_me(sk
);
193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk
, uaddr
);
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
199 struct sockaddr_in
*usin
= (struct sockaddr_in
*)uaddr
;
200 struct inet_sock
*inet
= inet_sk(sk
);
201 struct tcp_sock
*tp
= tcp_sk(sk
);
202 __be16 orig_sport
, orig_dport
;
203 __be32 daddr
, nexthop
;
207 struct ip_options_rcu
*inet_opt
;
208 struct inet_timewait_death_row
*tcp_death_row
= &sock_net(sk
)->ipv4
.tcp_death_row
;
210 if (addr_len
< sizeof(struct sockaddr_in
))
213 if (usin
->sin_family
!= AF_INET
)
214 return -EAFNOSUPPORT
;
216 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
217 inet_opt
= rcu_dereference_protected(inet
->inet_opt
,
218 lockdep_sock_is_held(sk
));
219 if (inet_opt
&& inet_opt
->opt
.srr
) {
222 nexthop
= inet_opt
->opt
.faddr
;
225 orig_sport
= inet
->inet_sport
;
226 orig_dport
= usin
->sin_port
;
227 fl4
= &inet
->cork
.fl
.u
.ip4
;
228 rt
= ip_route_connect(fl4
, nexthop
, inet
->inet_saddr
,
229 RT_CONN_FLAGS(sk
), sk
->sk_bound_dev_if
,
231 orig_sport
, orig_dport
, sk
);
234 if (err
== -ENETUNREACH
)
235 IP_INC_STATS(sock_net(sk
), IPSTATS_MIB_OUTNOROUTES
);
239 if (rt
->rt_flags
& (RTCF_MULTICAST
| RTCF_BROADCAST
)) {
244 if (!inet_opt
|| !inet_opt
->opt
.srr
)
247 if (!inet
->inet_saddr
)
248 inet
->inet_saddr
= fl4
->saddr
;
249 sk_rcv_saddr_set(sk
, inet
->inet_saddr
);
251 if (tp
->rx_opt
.ts_recent_stamp
&& inet
->inet_daddr
!= daddr
) {
252 /* Reset inherited state */
253 tp
->rx_opt
.ts_recent
= 0;
254 tp
->rx_opt
.ts_recent_stamp
= 0;
255 if (likely(!tp
->repair
))
259 inet
->inet_dport
= usin
->sin_port
;
260 sk_daddr_set(sk
, daddr
);
262 inet_csk(sk
)->icsk_ext_hdr_len
= 0;
264 inet_csk(sk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
266 tp
->rx_opt
.mss_clamp
= TCP_MSS_DEFAULT
;
268 /* Socket identity is still unknown (sport may be zero).
269 * However we set state to SYN-SENT and not releasing socket
270 * lock select source port, enter ourselves into the hash tables and
271 * complete initialization after this.
273 tcp_set_state(sk
, TCP_SYN_SENT
);
274 err
= inet_hash_connect(tcp_death_row
, sk
);
280 rt
= ip_route_newports(fl4
, rt
, orig_sport
, orig_dport
,
281 inet
->inet_sport
, inet
->inet_dport
, sk
);
287 /* OK, now commit destination to socket. */
288 sk
->sk_gso_type
= SKB_GSO_TCPV4
;
289 sk_setup_caps(sk
, &rt
->dst
);
292 if (likely(!tp
->repair
)) {
294 tp
->write_seq
= secure_tcp_seq(inet
->inet_saddr
,
298 tp
->tsoffset
= secure_tcp_ts_off(sock_net(sk
),
303 inet
->inet_id
= tp
->write_seq
^ jiffies
;
305 if (tcp_fastopen_defer_connect(sk
, &err
))
310 err
= tcp_connect(sk
);
319 * This unhashes the socket and releases the local port,
322 tcp_set_state(sk
, TCP_CLOSE
);
324 sk
->sk_route_caps
= 0;
325 inet
->inet_dport
= 0;
328 EXPORT_SYMBOL(tcp_v4_connect
);
331 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
332 * It can be called through tcp_release_cb() if socket was owned by user
333 * at the time tcp_v4_err() was called to handle ICMP message.
335 void tcp_v4_mtu_reduced(struct sock
*sk
)
337 struct inet_sock
*inet
= inet_sk(sk
);
338 struct dst_entry
*dst
;
341 if ((1 << sk
->sk_state
) & (TCPF_LISTEN
| TCPF_CLOSE
))
343 mtu
= tcp_sk(sk
)->mtu_info
;
344 dst
= inet_csk_update_pmtu(sk
, mtu
);
348 /* Something is about to be wrong... Remember soft error
349 * for the case, if this connection will not able to recover.
351 if (mtu
< dst_mtu(dst
) && ip_dont_fragment(sk
, dst
))
352 sk
->sk_err_soft
= EMSGSIZE
;
356 if (inet
->pmtudisc
!= IP_PMTUDISC_DONT
&&
357 ip_sk_accept_pmtu(sk
) &&
358 inet_csk(sk
)->icsk_pmtu_cookie
> mtu
) {
359 tcp_sync_mss(sk
, mtu
);
361 /* Resend the TCP packet because it's
362 * clear that the old packet has been
363 * dropped. This is the new "fast" path mtu
366 tcp_simple_retransmit(sk
);
367 } /* else let the usual retransmit timer handle it */
369 EXPORT_SYMBOL(tcp_v4_mtu_reduced
);
371 static void do_redirect(struct sk_buff
*skb
, struct sock
*sk
)
373 struct dst_entry
*dst
= __sk_dst_check(sk
, 0);
376 dst
->ops
->redirect(dst
, sk
, skb
);
380 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
381 void tcp_req_err(struct sock
*sk
, u32 seq
, bool abort
)
383 struct request_sock
*req
= inet_reqsk(sk
);
384 struct net
*net
= sock_net(sk
);
386 /* ICMPs are not backlogged, hence we cannot get
387 * an established socket here.
389 if (seq
!= tcp_rsk(req
)->snt_isn
) {
390 __NET_INC_STATS(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
393 * Still in SYN_RECV, just remove it silently.
394 * There is no good way to pass the error to the newly
395 * created socket, and POSIX does not want network
396 * errors returned from accept().
398 inet_csk_reqsk_queue_drop(req
->rsk_listener
, req
);
399 tcp_listendrop(req
->rsk_listener
);
403 EXPORT_SYMBOL(tcp_req_err
);
406 * This routine is called by the ICMP module when it gets some
407 * sort of error condition. If err < 0 then the socket should
408 * be closed and the error returned to the user. If err > 0
409 * it's just the icmp type << 8 | icmp code. After adjustment
410 * header points to the first 8 bytes of the tcp header. We need
411 * to find the appropriate port.
413 * The locking strategy used here is very "optimistic". When
414 * someone else accesses the socket the ICMP is just dropped
415 * and for some paths there is no check at all.
416 * A more general error queue to queue errors for later handling
417 * is probably better.
421 int tcp_v4_err(struct sk_buff
*icmp_skb
, u32 info
)
423 const struct iphdr
*iph
= (const struct iphdr
*)icmp_skb
->data
;
424 struct tcphdr
*th
= (struct tcphdr
*)(icmp_skb
->data
+ (iph
->ihl
<< 2));
425 struct inet_connection_sock
*icsk
;
427 struct inet_sock
*inet
;
428 const int type
= icmp_hdr(icmp_skb
)->type
;
429 const int code
= icmp_hdr(icmp_skb
)->code
;
432 struct request_sock
*fastopen
;
437 struct net
*net
= dev_net(icmp_skb
->dev
);
439 sk
= __inet_lookup_established(net
, &tcp_hashinfo
, iph
->daddr
,
440 th
->dest
, iph
->saddr
, ntohs(th
->source
),
441 inet_iif(icmp_skb
), 0);
443 __ICMP_INC_STATS(net
, ICMP_MIB_INERRORS
);
446 if (sk
->sk_state
== TCP_TIME_WAIT
) {
447 inet_twsk_put(inet_twsk(sk
));
450 seq
= ntohl(th
->seq
);
451 if (sk
->sk_state
== TCP_NEW_SYN_RECV
) {
452 tcp_req_err(sk
, seq
, type
== ICMP_PARAMETERPROB
||
453 type
== ICMP_TIME_EXCEEDED
||
454 (type
== ICMP_DEST_UNREACH
&&
455 (code
== ICMP_NET_UNREACH
||
456 code
== ICMP_HOST_UNREACH
)));
461 /* If too many ICMPs get dropped on busy
462 * servers this needs to be solved differently.
463 * We do take care of PMTU discovery (RFC1191) special case :
464 * we can receive locally generated ICMP messages while socket is held.
466 if (sock_owned_by_user(sk
)) {
467 if (!(type
== ICMP_DEST_UNREACH
&& code
== ICMP_FRAG_NEEDED
))
468 __NET_INC_STATS(net
, LINUX_MIB_LOCKDROPPEDICMPS
);
470 if (sk
->sk_state
== TCP_CLOSE
)
473 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
474 __NET_INC_STATS(net
, LINUX_MIB_TCPMINTTLDROP
);
480 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
481 fastopen
= tp
->fastopen_rsk
;
482 snd_una
= fastopen
? tcp_rsk(fastopen
)->snt_isn
: tp
->snd_una
;
483 if (sk
->sk_state
!= TCP_LISTEN
&&
484 !between(seq
, snd_una
, tp
->snd_nxt
)) {
485 __NET_INC_STATS(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
491 if (!sock_owned_by_user(sk
))
492 do_redirect(icmp_skb
, sk
);
494 case ICMP_SOURCE_QUENCH
:
495 /* Just silently ignore these. */
497 case ICMP_PARAMETERPROB
:
500 case ICMP_DEST_UNREACH
:
501 if (code
> NR_ICMP_UNREACH
)
504 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
505 /* We are not interested in TCP_LISTEN and open_requests
506 * (SYN-ACKs send out by Linux are always <576bytes so
507 * they should go through unfragmented).
509 if (sk
->sk_state
== TCP_LISTEN
)
513 if (!sock_owned_by_user(sk
)) {
514 tcp_v4_mtu_reduced(sk
);
516 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED
, &sk
->sk_tsq_flags
))
522 err
= icmp_err_convert
[code
].errno
;
523 /* check if icmp_skb allows revert of backoff
524 * (see draft-zimmermann-tcp-lcd) */
525 if (code
!= ICMP_NET_UNREACH
&& code
!= ICMP_HOST_UNREACH
)
527 if (seq
!= tp
->snd_una
|| !icsk
->icsk_retransmits
||
528 !icsk
->icsk_backoff
|| fastopen
)
531 if (sock_owned_by_user(sk
))
534 skb
= tcp_rtx_queue_head(sk
);
535 if (WARN_ON_ONCE(!skb
))
538 icsk
->icsk_backoff
--;
539 icsk
->icsk_rto
= tp
->srtt_us
? __tcp_set_rto(tp
) :
541 icsk
->icsk_rto
= inet_csk_rto_backoff(icsk
, TCP_RTO_MAX
);
544 tcp_mstamp_refresh(tp
);
545 delta_us
= (u32
)(tp
->tcp_mstamp
- tcp_skb_timestamp_us(skb
));
546 remaining
= icsk
->icsk_rto
-
547 usecs_to_jiffies(delta_us
);
550 inet_csk_reset_xmit_timer(sk
, ICSK_TIME_RETRANS
,
551 remaining
, TCP_RTO_MAX
);
553 /* RTO revert clocked out retransmission.
554 * Will retransmit now */
555 tcp_retransmit_timer(sk
);
559 case ICMP_TIME_EXCEEDED
:
566 switch (sk
->sk_state
) {
569 /* Only in fast or simultaneous open. If a fast open socket is
570 * is already accepted it is treated as a connected one below.
572 if (fastopen
&& !fastopen
->sk
)
575 if (!sock_owned_by_user(sk
)) {
578 sk
->sk_error_report(sk
);
582 sk
->sk_err_soft
= err
;
587 /* If we've already connected we will keep trying
588 * until we time out, or the user gives up.
590 * rfc1122 4.2.3.9 allows to consider as hard errors
591 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
592 * but it is obsoleted by pmtu discovery).
594 * Note, that in modern internet, where routing is unreliable
595 * and in each dark corner broken firewalls sit, sending random
596 * errors ordered by their masters even this two messages finally lose
597 * their original sense (even Linux sends invalid PORT_UNREACHs)
599 * Now we are in compliance with RFCs.
604 if (!sock_owned_by_user(sk
) && inet
->recverr
) {
606 sk
->sk_error_report(sk
);
607 } else { /* Only an error on timeout */
608 sk
->sk_err_soft
= err
;
617 void __tcp_v4_send_check(struct sk_buff
*skb
, __be32 saddr
, __be32 daddr
)
619 struct tcphdr
*th
= tcp_hdr(skb
);
621 th
->check
= ~tcp_v4_check(skb
->len
, saddr
, daddr
, 0);
622 skb
->csum_start
= skb_transport_header(skb
) - skb
->head
;
623 skb
->csum_offset
= offsetof(struct tcphdr
, check
);
626 /* This routine computes an IPv4 TCP checksum. */
627 void tcp_v4_send_check(struct sock
*sk
, struct sk_buff
*skb
)
629 const struct inet_sock
*inet
= inet_sk(sk
);
631 __tcp_v4_send_check(skb
, inet
->inet_saddr
, inet
->inet_daddr
);
633 EXPORT_SYMBOL(tcp_v4_send_check
);
636 * This routine will send an RST to the other tcp.
638 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
640 * Answer: if a packet caused RST, it is not for a socket
641 * existing in our system, if it is matched to a socket,
642 * it is just duplicate segment or bug in other side's TCP.
643 * So that we build reply only basing on parameters
644 * arrived with segment.
645 * Exception: precedence violation. We do not implement it in any case.
648 static void tcp_v4_send_reset(const struct sock
*sk
, struct sk_buff
*skb
)
650 const struct tcphdr
*th
= tcp_hdr(skb
);
653 #ifdef CONFIG_TCP_MD5SIG
654 __be32 opt
[(TCPOLEN_MD5SIG_ALIGNED
>> 2)];
657 struct ip_reply_arg arg
;
658 #ifdef CONFIG_TCP_MD5SIG
659 struct tcp_md5sig_key
*key
= NULL
;
660 const __u8
*hash_location
= NULL
;
661 unsigned char newhash
[16];
663 struct sock
*sk1
= NULL
;
668 /* Never send a reset in response to a reset. */
672 /* If sk not NULL, it means we did a successful lookup and incoming
673 * route had to be correct. prequeue might have dropped our dst.
675 if (!sk
&& skb_rtable(skb
)->rt_type
!= RTN_LOCAL
)
678 /* Swap the send and the receive. */
679 memset(&rep
, 0, sizeof(rep
));
680 rep
.th
.dest
= th
->source
;
681 rep
.th
.source
= th
->dest
;
682 rep
.th
.doff
= sizeof(struct tcphdr
) / 4;
686 rep
.th
.seq
= th
->ack_seq
;
689 rep
.th
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
+
690 skb
->len
- (th
->doff
<< 2));
693 memset(&arg
, 0, sizeof(arg
));
694 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
695 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
697 net
= sk
? sock_net(sk
) : dev_net(skb_dst(skb
)->dev
);
698 #ifdef CONFIG_TCP_MD5SIG
700 hash_location
= tcp_parse_md5sig_option(th
);
701 if (sk
&& sk_fullsock(sk
)) {
702 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)
703 &ip_hdr(skb
)->saddr
, AF_INET
);
704 } else if (hash_location
) {
706 * active side is lost. Try to find listening socket through
707 * source port, and then find md5 key through listening socket.
708 * we are not loose security here:
709 * Incoming packet is checked with md5 hash with finding key,
710 * no RST generated if md5 hash doesn't match.
712 sk1
= __inet_lookup_listener(net
, &tcp_hashinfo
, NULL
, 0,
714 th
->source
, ip_hdr(skb
)->daddr
,
715 ntohs(th
->source
), inet_iif(skb
),
717 /* don't send rst if it can't find key */
721 key
= tcp_md5_do_lookup(sk1
, (union tcp_md5_addr
*)
722 &ip_hdr(skb
)->saddr
, AF_INET
);
727 genhash
= tcp_v4_md5_hash_skb(newhash
, key
, NULL
, skb
);
728 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0)
734 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) |
736 (TCPOPT_MD5SIG
<< 8) |
738 /* Update length and the length the header thinks exists */
739 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
740 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
742 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[1],
743 key
, ip_hdr(skb
)->saddr
,
744 ip_hdr(skb
)->daddr
, &rep
.th
);
747 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
748 ip_hdr(skb
)->saddr
, /* XXX */
749 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
750 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
751 arg
.flags
= (sk
&& inet_sk_transparent(sk
)) ? IP_REPLY_ARG_NOSRCCHECK
: 0;
753 /* When socket is gone, all binding information is lost.
754 * routing might fail in this case. No choice here, if we choose to force
755 * input interface, we will misroute in case of asymmetric route.
758 arg
.bound_dev_if
= sk
->sk_bound_dev_if
;
760 trace_tcp_send_reset(sk
, skb
);
763 BUILD_BUG_ON(offsetof(struct sock
, sk_bound_dev_if
) !=
764 offsetof(struct inet_timewait_sock
, tw_bound_dev_if
));
766 arg
.tos
= ip_hdr(skb
)->tos
;
767 arg
.uid
= sock_net_uid(net
, sk
&& sk_fullsock(sk
) ? sk
: NULL
);
769 ctl_sk
= *this_cpu_ptr(net
->ipv4
.tcp_sk
);
771 ctl_sk
->sk_mark
= (sk
->sk_state
== TCP_TIME_WAIT
) ?
772 inet_twsk(sk
)->tw_mark
: sk
->sk_mark
;
773 ip_send_unicast_reply(ctl_sk
,
774 skb
, &TCP_SKB_CB(skb
)->header
.h4
.opt
,
775 ip_hdr(skb
)->saddr
, ip_hdr(skb
)->daddr
,
776 &arg
, arg
.iov
[0].iov_len
);
779 __TCP_INC_STATS(net
, TCP_MIB_OUTSEGS
);
780 __TCP_INC_STATS(net
, TCP_MIB_OUTRSTS
);
783 #ifdef CONFIG_TCP_MD5SIG
789 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
790 outside socket context is ugly, certainly. What can I do?
793 static void tcp_v4_send_ack(const struct sock
*sk
,
794 struct sk_buff
*skb
, u32 seq
, u32 ack
,
795 u32 win
, u32 tsval
, u32 tsecr
, int oif
,
796 struct tcp_md5sig_key
*key
,
797 int reply_flags
, u8 tos
)
799 const struct tcphdr
*th
= tcp_hdr(skb
);
802 __be32 opt
[(TCPOLEN_TSTAMP_ALIGNED
>> 2)
803 #ifdef CONFIG_TCP_MD5SIG
804 + (TCPOLEN_MD5SIG_ALIGNED
>> 2)
808 struct net
*net
= sock_net(sk
);
809 struct ip_reply_arg arg
;
812 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
813 memset(&arg
, 0, sizeof(arg
));
815 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
816 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
818 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16) |
819 (TCPOPT_TIMESTAMP
<< 8) |
821 rep
.opt
[1] = htonl(tsval
);
822 rep
.opt
[2] = htonl(tsecr
);
823 arg
.iov
[0].iov_len
+= TCPOLEN_TSTAMP_ALIGNED
;
826 /* Swap the send and the receive. */
827 rep
.th
.dest
= th
->source
;
828 rep
.th
.source
= th
->dest
;
829 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
830 rep
.th
.seq
= htonl(seq
);
831 rep
.th
.ack_seq
= htonl(ack
);
833 rep
.th
.window
= htons(win
);
835 #ifdef CONFIG_TCP_MD5SIG
837 int offset
= (tsecr
) ? 3 : 0;
839 rep
.opt
[offset
++] = htonl((TCPOPT_NOP
<< 24) |
841 (TCPOPT_MD5SIG
<< 8) |
843 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
844 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
846 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[offset
],
847 key
, ip_hdr(skb
)->saddr
,
848 ip_hdr(skb
)->daddr
, &rep
.th
);
851 arg
.flags
= reply_flags
;
852 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
853 ip_hdr(skb
)->saddr
, /* XXX */
854 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
855 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
857 arg
.bound_dev_if
= oif
;
859 arg
.uid
= sock_net_uid(net
, sk_fullsock(sk
) ? sk
: NULL
);
861 ctl_sk
= *this_cpu_ptr(net
->ipv4
.tcp_sk
);
863 ctl_sk
->sk_mark
= (sk
->sk_state
== TCP_TIME_WAIT
) ?
864 inet_twsk(sk
)->tw_mark
: sk
->sk_mark
;
865 ip_send_unicast_reply(ctl_sk
,
866 skb
, &TCP_SKB_CB(skb
)->header
.h4
.opt
,
867 ip_hdr(skb
)->saddr
, ip_hdr(skb
)->daddr
,
868 &arg
, arg
.iov
[0].iov_len
);
871 __TCP_INC_STATS(net
, TCP_MIB_OUTSEGS
);
875 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
877 struct inet_timewait_sock
*tw
= inet_twsk(sk
);
878 struct tcp_timewait_sock
*tcptw
= tcp_twsk(sk
);
880 tcp_v4_send_ack(sk
, skb
,
881 tcptw
->tw_snd_nxt
, tcptw
->tw_rcv_nxt
,
882 tcptw
->tw_rcv_wnd
>> tw
->tw_rcv_wscale
,
883 tcp_time_stamp_raw() + tcptw
->tw_ts_offset
,
886 tcp_twsk_md5_key(tcptw
),
887 tw
->tw_transparent
? IP_REPLY_ARG_NOSRCCHECK
: 0,
894 static void tcp_v4_reqsk_send_ack(const struct sock
*sk
, struct sk_buff
*skb
,
895 struct request_sock
*req
)
897 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
898 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
900 u32 seq
= (sk
->sk_state
== TCP_LISTEN
) ? tcp_rsk(req
)->snt_isn
+ 1 :
904 * The window field (SEG.WND) of every outgoing segment, with the
905 * exception of <SYN> segments, MUST be right-shifted by
906 * Rcv.Wind.Shift bits:
908 tcp_v4_send_ack(sk
, skb
, seq
,
909 tcp_rsk(req
)->rcv_nxt
,
910 req
->rsk_rcv_wnd
>> inet_rsk(req
)->rcv_wscale
,
911 tcp_time_stamp_raw() + tcp_rsk(req
)->ts_off
,
914 tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&ip_hdr(skb
)->saddr
,
916 inet_rsk(req
)->no_srccheck
? IP_REPLY_ARG_NOSRCCHECK
: 0,
921 * Send a SYN-ACK after having received a SYN.
922 * This still operates on a request_sock only, not on a big
925 static int tcp_v4_send_synack(const struct sock
*sk
, struct dst_entry
*dst
,
927 struct request_sock
*req
,
928 struct tcp_fastopen_cookie
*foc
,
929 enum tcp_synack_type synack_type
)
931 const struct inet_request_sock
*ireq
= inet_rsk(req
);
936 /* First, grab a route. */
937 if (!dst
&& (dst
= inet_csk_route_req(sk
, &fl4
, req
)) == NULL
)
940 skb
= tcp_make_synack(sk
, dst
, req
, foc
, synack_type
);
943 __tcp_v4_send_check(skb
, ireq
->ir_loc_addr
, ireq
->ir_rmt_addr
);
946 err
= ip_build_and_send_pkt(skb
, sk
, ireq
->ir_loc_addr
,
948 rcu_dereference(ireq
->ireq_opt
));
950 err
= net_xmit_eval(err
);
957 * IPv4 request_sock destructor.
959 static void tcp_v4_reqsk_destructor(struct request_sock
*req
)
961 kfree(rcu_dereference_protected(inet_rsk(req
)->ireq_opt
, 1));
964 #ifdef CONFIG_TCP_MD5SIG
966 * RFC2385 MD5 checksumming requires a mapping of
967 * IP address->MD5 Key.
968 * We need to maintain these in the sk structure.
971 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed
);
972 EXPORT_SYMBOL(tcp_md5_needed
);
974 /* Find the Key structure for an address. */
975 struct tcp_md5sig_key
*__tcp_md5_do_lookup(const struct sock
*sk
,
976 const union tcp_md5_addr
*addr
,
979 const struct tcp_sock
*tp
= tcp_sk(sk
);
980 struct tcp_md5sig_key
*key
;
981 const struct tcp_md5sig_info
*md5sig
;
983 struct tcp_md5sig_key
*best_match
= NULL
;
986 /* caller either holds rcu_read_lock() or socket lock */
987 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
988 lockdep_sock_is_held(sk
));
992 hlist_for_each_entry_rcu(key
, &md5sig
->head
, node
) {
993 if (key
->family
!= family
)
996 if (family
== AF_INET
) {
997 mask
= inet_make_mask(key
->prefixlen
);
998 match
= (key
->addr
.a4
.s_addr
& mask
) ==
999 (addr
->a4
.s_addr
& mask
);
1000 #if IS_ENABLED(CONFIG_IPV6)
1001 } else if (family
== AF_INET6
) {
1002 match
= ipv6_prefix_equal(&key
->addr
.a6
, &addr
->a6
,
1009 if (match
&& (!best_match
||
1010 key
->prefixlen
> best_match
->prefixlen
))
1015 EXPORT_SYMBOL(__tcp_md5_do_lookup
);
1017 static struct tcp_md5sig_key
*tcp_md5_do_lookup_exact(const struct sock
*sk
,
1018 const union tcp_md5_addr
*addr
,
1019 int family
, u8 prefixlen
)
1021 const struct tcp_sock
*tp
= tcp_sk(sk
);
1022 struct tcp_md5sig_key
*key
;
1023 unsigned int size
= sizeof(struct in_addr
);
1024 const struct tcp_md5sig_info
*md5sig
;
1026 /* caller either holds rcu_read_lock() or socket lock */
1027 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
1028 lockdep_sock_is_held(sk
));
1031 #if IS_ENABLED(CONFIG_IPV6)
1032 if (family
== AF_INET6
)
1033 size
= sizeof(struct in6_addr
);
1035 hlist_for_each_entry_rcu(key
, &md5sig
->head
, node
) {
1036 if (key
->family
!= family
)
1038 if (!memcmp(&key
->addr
, addr
, size
) &&
1039 key
->prefixlen
== prefixlen
)
1045 struct tcp_md5sig_key
*tcp_v4_md5_lookup(const struct sock
*sk
,
1046 const struct sock
*addr_sk
)
1048 const union tcp_md5_addr
*addr
;
1050 addr
= (const union tcp_md5_addr
*)&addr_sk
->sk_daddr
;
1051 return tcp_md5_do_lookup(sk
, addr
, AF_INET
);
1053 EXPORT_SYMBOL(tcp_v4_md5_lookup
);
1055 /* This can be called on a newly created socket, from other files */
1056 int tcp_md5_do_add(struct sock
*sk
, const union tcp_md5_addr
*addr
,
1057 int family
, u8 prefixlen
, const u8
*newkey
, u8 newkeylen
,
1060 /* Add Key to the list */
1061 struct tcp_md5sig_key
*key
;
1062 struct tcp_sock
*tp
= tcp_sk(sk
);
1063 struct tcp_md5sig_info
*md5sig
;
1065 key
= tcp_md5_do_lookup_exact(sk
, addr
, family
, prefixlen
);
1067 /* Pre-existing entry - just update that one. */
1068 memcpy(key
->key
, newkey
, newkeylen
);
1069 key
->keylen
= newkeylen
;
1073 md5sig
= rcu_dereference_protected(tp
->md5sig_info
,
1074 lockdep_sock_is_held(sk
));
1076 md5sig
= kmalloc(sizeof(*md5sig
), gfp
);
1080 sk_nocaps_add(sk
, NETIF_F_GSO_MASK
);
1081 INIT_HLIST_HEAD(&md5sig
->head
);
1082 rcu_assign_pointer(tp
->md5sig_info
, md5sig
);
1085 key
= sock_kmalloc(sk
, sizeof(*key
), gfp
);
1088 if (!tcp_alloc_md5sig_pool()) {
1089 sock_kfree_s(sk
, key
, sizeof(*key
));
1093 memcpy(key
->key
, newkey
, newkeylen
);
1094 key
->keylen
= newkeylen
;
1095 key
->family
= family
;
1096 key
->prefixlen
= prefixlen
;
1097 memcpy(&key
->addr
, addr
,
1098 (family
== AF_INET6
) ? sizeof(struct in6_addr
) :
1099 sizeof(struct in_addr
));
1100 hlist_add_head_rcu(&key
->node
, &md5sig
->head
);
1103 EXPORT_SYMBOL(tcp_md5_do_add
);
1105 int tcp_md5_do_del(struct sock
*sk
, const union tcp_md5_addr
*addr
, int family
,
1108 struct tcp_md5sig_key
*key
;
1110 key
= tcp_md5_do_lookup_exact(sk
, addr
, family
, prefixlen
);
1113 hlist_del_rcu(&key
->node
);
1114 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1115 kfree_rcu(key
, rcu
);
1118 EXPORT_SYMBOL(tcp_md5_do_del
);
1120 static void tcp_clear_md5_list(struct sock
*sk
)
1122 struct tcp_sock
*tp
= tcp_sk(sk
);
1123 struct tcp_md5sig_key
*key
;
1124 struct hlist_node
*n
;
1125 struct tcp_md5sig_info
*md5sig
;
1127 md5sig
= rcu_dereference_protected(tp
->md5sig_info
, 1);
1129 hlist_for_each_entry_safe(key
, n
, &md5sig
->head
, node
) {
1130 hlist_del_rcu(&key
->node
);
1131 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1132 kfree_rcu(key
, rcu
);
1136 static int tcp_v4_parse_md5_keys(struct sock
*sk
, int optname
,
1137 char __user
*optval
, int optlen
)
1139 struct tcp_md5sig cmd
;
1140 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&cmd
.tcpm_addr
;
1143 if (optlen
< sizeof(cmd
))
1146 if (copy_from_user(&cmd
, optval
, sizeof(cmd
)))
1149 if (sin
->sin_family
!= AF_INET
)
1152 if (optname
== TCP_MD5SIG_EXT
&&
1153 cmd
.tcpm_flags
& TCP_MD5SIG_FLAG_PREFIX
) {
1154 prefixlen
= cmd
.tcpm_prefixlen
;
1159 if (!cmd
.tcpm_keylen
)
1160 return tcp_md5_do_del(sk
, (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
,
1161 AF_INET
, prefixlen
);
1163 if (cmd
.tcpm_keylen
> TCP_MD5SIG_MAXKEYLEN
)
1166 return tcp_md5_do_add(sk
, (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
,
1167 AF_INET
, prefixlen
, cmd
.tcpm_key
, cmd
.tcpm_keylen
,
1171 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool
*hp
,
1172 __be32 daddr
, __be32 saddr
,
1173 const struct tcphdr
*th
, int nbytes
)
1175 struct tcp4_pseudohdr
*bp
;
1176 struct scatterlist sg
;
1183 bp
->protocol
= IPPROTO_TCP
;
1184 bp
->len
= cpu_to_be16(nbytes
);
1186 _th
= (struct tcphdr
*)(bp
+ 1);
1187 memcpy(_th
, th
, sizeof(*th
));
1190 sg_init_one(&sg
, bp
, sizeof(*bp
) + sizeof(*th
));
1191 ahash_request_set_crypt(hp
->md5_req
, &sg
, NULL
,
1192 sizeof(*bp
) + sizeof(*th
));
1193 return crypto_ahash_update(hp
->md5_req
);
1196 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1197 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
)
1199 struct tcp_md5sig_pool
*hp
;
1200 struct ahash_request
*req
;
1202 hp
= tcp_get_md5sig_pool();
1204 goto clear_hash_noput
;
1207 if (crypto_ahash_init(req
))
1209 if (tcp_v4_md5_hash_headers(hp
, daddr
, saddr
, th
, th
->doff
<< 2))
1211 if (tcp_md5_hash_key(hp
, key
))
1213 ahash_request_set_crypt(req
, NULL
, md5_hash
, 0);
1214 if (crypto_ahash_final(req
))
1217 tcp_put_md5sig_pool();
1221 tcp_put_md5sig_pool();
1223 memset(md5_hash
, 0, 16);
1227 int tcp_v4_md5_hash_skb(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1228 const struct sock
*sk
,
1229 const struct sk_buff
*skb
)
1231 struct tcp_md5sig_pool
*hp
;
1232 struct ahash_request
*req
;
1233 const struct tcphdr
*th
= tcp_hdr(skb
);
1234 __be32 saddr
, daddr
;
1236 if (sk
) { /* valid for establish/request sockets */
1237 saddr
= sk
->sk_rcv_saddr
;
1238 daddr
= sk
->sk_daddr
;
1240 const struct iphdr
*iph
= ip_hdr(skb
);
1245 hp
= tcp_get_md5sig_pool();
1247 goto clear_hash_noput
;
1250 if (crypto_ahash_init(req
))
1253 if (tcp_v4_md5_hash_headers(hp
, daddr
, saddr
, th
, skb
->len
))
1255 if (tcp_md5_hash_skb_data(hp
, skb
, th
->doff
<< 2))
1257 if (tcp_md5_hash_key(hp
, key
))
1259 ahash_request_set_crypt(req
, NULL
, md5_hash
, 0);
1260 if (crypto_ahash_final(req
))
1263 tcp_put_md5sig_pool();
1267 tcp_put_md5sig_pool();
1269 memset(md5_hash
, 0, 16);
1272 EXPORT_SYMBOL(tcp_v4_md5_hash_skb
);
1276 /* Called with rcu_read_lock() */
1277 static bool tcp_v4_inbound_md5_hash(const struct sock
*sk
,
1278 const struct sk_buff
*skb
)
1280 #ifdef CONFIG_TCP_MD5SIG
1282 * This gets called for each TCP segment that arrives
1283 * so we want to be efficient.
1284 * We have 3 drop cases:
1285 * o No MD5 hash and one expected.
1286 * o MD5 hash and we're not expecting one.
1287 * o MD5 hash and its wrong.
1289 const __u8
*hash_location
= NULL
;
1290 struct tcp_md5sig_key
*hash_expected
;
1291 const struct iphdr
*iph
= ip_hdr(skb
);
1292 const struct tcphdr
*th
= tcp_hdr(skb
);
1294 unsigned char newhash
[16];
1296 hash_expected
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&iph
->saddr
,
1298 hash_location
= tcp_parse_md5sig_option(th
);
1300 /* We've parsed the options - do we have a hash? */
1301 if (!hash_expected
&& !hash_location
)
1304 if (hash_expected
&& !hash_location
) {
1305 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5NOTFOUND
);
1309 if (!hash_expected
&& hash_location
) {
1310 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5UNEXPECTED
);
1314 /* Okay, so this is hash_expected and hash_location -
1315 * so we need to calculate the checksum.
1317 genhash
= tcp_v4_md5_hash_skb(newhash
,
1321 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0) {
1322 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5FAILURE
);
1323 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1324 &iph
->saddr
, ntohs(th
->source
),
1325 &iph
->daddr
, ntohs(th
->dest
),
1326 genhash
? " tcp_v4_calc_md5_hash failed"
1335 static void tcp_v4_init_req(struct request_sock
*req
,
1336 const struct sock
*sk_listener
,
1337 struct sk_buff
*skb
)
1339 struct inet_request_sock
*ireq
= inet_rsk(req
);
1340 struct net
*net
= sock_net(sk_listener
);
1342 sk_rcv_saddr_set(req_to_sk(req
), ip_hdr(skb
)->daddr
);
1343 sk_daddr_set(req_to_sk(req
), ip_hdr(skb
)->saddr
);
1344 RCU_INIT_POINTER(ireq
->ireq_opt
, tcp_v4_save_options(net
, skb
));
1347 static struct dst_entry
*tcp_v4_route_req(const struct sock
*sk
,
1349 const struct request_sock
*req
)
1351 return inet_csk_route_req(sk
, &fl
->u
.ip4
, req
);
1354 struct request_sock_ops tcp_request_sock_ops __read_mostly
= {
1356 .obj_size
= sizeof(struct tcp_request_sock
),
1357 .rtx_syn_ack
= tcp_rtx_synack
,
1358 .send_ack
= tcp_v4_reqsk_send_ack
,
1359 .destructor
= tcp_v4_reqsk_destructor
,
1360 .send_reset
= tcp_v4_send_reset
,
1361 .syn_ack_timeout
= tcp_syn_ack_timeout
,
1364 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops
= {
1365 .mss_clamp
= TCP_MSS_DEFAULT
,
1366 #ifdef CONFIG_TCP_MD5SIG
1367 .req_md5_lookup
= tcp_v4_md5_lookup
,
1368 .calc_md5_hash
= tcp_v4_md5_hash_skb
,
1370 .init_req
= tcp_v4_init_req
,
1371 #ifdef CONFIG_SYN_COOKIES
1372 .cookie_init_seq
= cookie_v4_init_sequence
,
1374 .route_req
= tcp_v4_route_req
,
1375 .init_seq
= tcp_v4_init_seq
,
1376 .init_ts_off
= tcp_v4_init_ts_off
,
1377 .send_synack
= tcp_v4_send_synack
,
1380 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1382 /* Never answer to SYNs send to broadcast or multicast */
1383 if (skb_rtable(skb
)->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))
1386 return tcp_conn_request(&tcp_request_sock_ops
,
1387 &tcp_request_sock_ipv4_ops
, sk
, skb
);
1393 EXPORT_SYMBOL(tcp_v4_conn_request
);
1397 * The three way handshake has completed - we got a valid synack -
1398 * now create the new socket.
1400 struct sock
*tcp_v4_syn_recv_sock(const struct sock
*sk
, struct sk_buff
*skb
,
1401 struct request_sock
*req
,
1402 struct dst_entry
*dst
,
1403 struct request_sock
*req_unhash
,
1406 struct inet_request_sock
*ireq
;
1407 struct inet_sock
*newinet
;
1408 struct tcp_sock
*newtp
;
1410 #ifdef CONFIG_TCP_MD5SIG
1411 struct tcp_md5sig_key
*key
;
1413 struct ip_options_rcu
*inet_opt
;
1415 if (sk_acceptq_is_full(sk
))
1418 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1422 newsk
->sk_gso_type
= SKB_GSO_TCPV4
;
1423 inet_sk_rx_dst_set(newsk
, skb
);
1425 newtp
= tcp_sk(newsk
);
1426 newinet
= inet_sk(newsk
);
1427 ireq
= inet_rsk(req
);
1428 sk_daddr_set(newsk
, ireq
->ir_rmt_addr
);
1429 sk_rcv_saddr_set(newsk
, ireq
->ir_loc_addr
);
1430 newsk
->sk_bound_dev_if
= ireq
->ir_iif
;
1431 newinet
->inet_saddr
= ireq
->ir_loc_addr
;
1432 inet_opt
= rcu_dereference(ireq
->ireq_opt
);
1433 RCU_INIT_POINTER(newinet
->inet_opt
, inet_opt
);
1434 newinet
->mc_index
= inet_iif(skb
);
1435 newinet
->mc_ttl
= ip_hdr(skb
)->ttl
;
1436 newinet
->rcv_tos
= ip_hdr(skb
)->tos
;
1437 inet_csk(newsk
)->icsk_ext_hdr_len
= 0;
1439 inet_csk(newsk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
1440 newinet
->inet_id
= newtp
->write_seq
^ jiffies
;
1443 dst
= inet_csk_route_child_sock(sk
, newsk
, req
);
1447 /* syncookie case : see end of cookie_v4_check() */
1449 sk_setup_caps(newsk
, dst
);
1451 tcp_ca_openreq_child(newsk
, dst
);
1453 tcp_sync_mss(newsk
, dst_mtu(dst
));
1454 newtp
->advmss
= tcp_mss_clamp(tcp_sk(sk
), dst_metric_advmss(dst
));
1456 tcp_initialize_rcv_mss(newsk
);
1458 #ifdef CONFIG_TCP_MD5SIG
1459 /* Copy over the MD5 key from the original socket */
1460 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&newinet
->inet_daddr
,
1464 * We're using one, so create a matching key
1465 * on the newsk structure. If we fail to get
1466 * memory, then we end up not copying the key
1469 tcp_md5_do_add(newsk
, (union tcp_md5_addr
*)&newinet
->inet_daddr
,
1470 AF_INET
, 32, key
->key
, key
->keylen
, GFP_ATOMIC
);
1471 sk_nocaps_add(newsk
, NETIF_F_GSO_MASK
);
1475 if (__inet_inherit_port(sk
, newsk
) < 0)
1477 *own_req
= inet_ehash_nolisten(newsk
, req_to_sk(req_unhash
));
1478 if (likely(*own_req
)) {
1479 tcp_move_syn(newtp
, req
);
1480 ireq
->ireq_opt
= NULL
;
1482 newinet
->inet_opt
= NULL
;
1487 NET_INC_STATS(sock_net(sk
), LINUX_MIB_LISTENOVERFLOWS
);
1494 newinet
->inet_opt
= NULL
;
1495 inet_csk_prepare_forced_close(newsk
);
1499 EXPORT_SYMBOL(tcp_v4_syn_recv_sock
);
1501 static struct sock
*tcp_v4_cookie_check(struct sock
*sk
, struct sk_buff
*skb
)
1503 #ifdef CONFIG_SYN_COOKIES
1504 const struct tcphdr
*th
= tcp_hdr(skb
);
1507 sk
= cookie_v4_check(sk
, skb
);
1512 /* The socket must have it's spinlock held when we get
1513 * here, unless it is a TCP_LISTEN socket.
1515 * We have a potential double-lock case here, so even when
1516 * doing backlog processing we use the BH locking scheme.
1517 * This is because we cannot sleep with the original spinlock
1520 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1524 if (sk
->sk_state
== TCP_ESTABLISHED
) { /* Fast path */
1525 struct dst_entry
*dst
= sk
->sk_rx_dst
;
1527 sock_rps_save_rxhash(sk
, skb
);
1528 sk_mark_napi_id(sk
, skb
);
1530 if (inet_sk(sk
)->rx_dst_ifindex
!= skb
->skb_iif
||
1531 !dst
->ops
->check(dst
, 0)) {
1533 sk
->sk_rx_dst
= NULL
;
1536 tcp_rcv_established(sk
, skb
);
1540 if (tcp_checksum_complete(skb
))
1543 if (sk
->sk_state
== TCP_LISTEN
) {
1544 struct sock
*nsk
= tcp_v4_cookie_check(sk
, skb
);
1549 if (tcp_child_process(sk
, nsk
, skb
)) {
1556 sock_rps_save_rxhash(sk
, skb
);
1558 if (tcp_rcv_state_process(sk
, skb
)) {
1565 tcp_v4_send_reset(rsk
, skb
);
1568 /* Be careful here. If this function gets more complicated and
1569 * gcc suffers from register pressure on the x86, sk (in %ebx)
1570 * might be destroyed here. This current version compiles correctly,
1571 * but you have been warned.
1576 TCP_INC_STATS(sock_net(sk
), TCP_MIB_CSUMERRORS
);
1577 TCP_INC_STATS(sock_net(sk
), TCP_MIB_INERRS
);
1580 EXPORT_SYMBOL(tcp_v4_do_rcv
);
1582 int tcp_v4_early_demux(struct sk_buff
*skb
)
1584 const struct iphdr
*iph
;
1585 const struct tcphdr
*th
;
1588 if (skb
->pkt_type
!= PACKET_HOST
)
1591 if (!pskb_may_pull(skb
, skb_transport_offset(skb
) + sizeof(struct tcphdr
)))
1597 if (th
->doff
< sizeof(struct tcphdr
) / 4)
1600 sk
= __inet_lookup_established(dev_net(skb
->dev
), &tcp_hashinfo
,
1601 iph
->saddr
, th
->source
,
1602 iph
->daddr
, ntohs(th
->dest
),
1603 skb
->skb_iif
, inet_sdif(skb
));
1606 skb
->destructor
= sock_edemux
;
1607 if (sk_fullsock(sk
)) {
1608 struct dst_entry
*dst
= READ_ONCE(sk
->sk_rx_dst
);
1611 dst
= dst_check(dst
, 0);
1613 inet_sk(sk
)->rx_dst_ifindex
== skb
->skb_iif
)
1614 skb_dst_set_noref(skb
, dst
);
1620 bool tcp_add_backlog(struct sock
*sk
, struct sk_buff
*skb
)
1622 u32 limit
= sk
->sk_rcvbuf
+ sk
->sk_sndbuf
;
1623 struct skb_shared_info
*shinfo
;
1624 const struct tcphdr
*th
;
1625 struct tcphdr
*thtail
;
1626 struct sk_buff
*tail
;
1627 unsigned int hdrlen
;
1632 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1633 * we can fix skb->truesize to its real value to avoid future drops.
1634 * This is valid because skb is not yet charged to the socket.
1635 * It has been noticed pure SACK packets were sometimes dropped
1636 * (if cooked by drivers without copybreak feature).
1642 if (unlikely(tcp_checksum_complete(skb
))) {
1644 __TCP_INC_STATS(sock_net(sk
), TCP_MIB_CSUMERRORS
);
1645 __TCP_INC_STATS(sock_net(sk
), TCP_MIB_INERRS
);
1649 /* Attempt coalescing to last skb in backlog, even if we are
1651 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1653 th
= (const struct tcphdr
*)skb
->data
;
1654 hdrlen
= th
->doff
* 4;
1655 shinfo
= skb_shinfo(skb
);
1657 if (!shinfo
->gso_size
)
1658 shinfo
->gso_size
= skb
->len
- hdrlen
;
1660 if (!shinfo
->gso_segs
)
1661 shinfo
->gso_segs
= 1;
1663 tail
= sk
->sk_backlog
.tail
;
1666 thtail
= (struct tcphdr
*)tail
->data
;
1668 if (TCP_SKB_CB(tail
)->end_seq
!= TCP_SKB_CB(skb
)->seq
||
1669 TCP_SKB_CB(tail
)->ip_dsfield
!= TCP_SKB_CB(skb
)->ip_dsfield
||
1670 ((TCP_SKB_CB(tail
)->tcp_flags
|
1671 TCP_SKB_CB(skb
)->tcp_flags
) & (TCPHDR_SYN
| TCPHDR_RST
| TCPHDR_URG
)) ||
1672 !((TCP_SKB_CB(tail
)->tcp_flags
&
1673 TCP_SKB_CB(skb
)->tcp_flags
) & TCPHDR_ACK
) ||
1674 ((TCP_SKB_CB(tail
)->tcp_flags
^
1675 TCP_SKB_CB(skb
)->tcp_flags
) & (TCPHDR_ECE
| TCPHDR_CWR
)) ||
1676 #ifdef CONFIG_TLS_DEVICE
1677 tail
->decrypted
!= skb
->decrypted
||
1679 thtail
->doff
!= th
->doff
||
1680 memcmp(thtail
+ 1, th
+ 1, hdrlen
- sizeof(*th
)))
1683 __skb_pull(skb
, hdrlen
);
1684 if (skb_try_coalesce(tail
, skb
, &fragstolen
, &delta
)) {
1685 thtail
->window
= th
->window
;
1687 TCP_SKB_CB(tail
)->end_seq
= TCP_SKB_CB(skb
)->end_seq
;
1689 if (after(TCP_SKB_CB(skb
)->ack_seq
, TCP_SKB_CB(tail
)->ack_seq
))
1690 TCP_SKB_CB(tail
)->ack_seq
= TCP_SKB_CB(skb
)->ack_seq
;
1692 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1693 * thtail->fin, so that the fast path in tcp_rcv_established()
1694 * is not entered if we append a packet with a FIN.
1695 * SYN, RST, URG are not present.
1696 * ACK is set on both packets.
1697 * PSH : we do not really care in TCP stack,
1698 * at least for 'GRO' packets.
1700 thtail
->fin
|= th
->fin
;
1701 TCP_SKB_CB(tail
)->tcp_flags
|= TCP_SKB_CB(skb
)->tcp_flags
;
1703 if (TCP_SKB_CB(skb
)->has_rxtstamp
) {
1704 TCP_SKB_CB(tail
)->has_rxtstamp
= true;
1705 tail
->tstamp
= skb
->tstamp
;
1706 skb_hwtstamps(tail
)->hwtstamp
= skb_hwtstamps(skb
)->hwtstamp
;
1709 /* Not as strict as GRO. We only need to carry mss max value */
1710 skb_shinfo(tail
)->gso_size
= max(shinfo
->gso_size
,
1711 skb_shinfo(tail
)->gso_size
);
1713 gso_segs
= skb_shinfo(tail
)->gso_segs
+ shinfo
->gso_segs
;
1714 skb_shinfo(tail
)->gso_segs
= min_t(u32
, gso_segs
, 0xFFFF);
1716 sk
->sk_backlog
.len
+= delta
;
1717 __NET_INC_STATS(sock_net(sk
),
1718 LINUX_MIB_TCPBACKLOGCOALESCE
);
1719 kfree_skb_partial(skb
, fragstolen
);
1722 __skb_push(skb
, hdrlen
);
1725 /* Only socket owner can try to collapse/prune rx queues
1726 * to reduce memory overhead, so add a little headroom here.
1727 * Few sockets backlog are possibly concurrently non empty.
1731 if (unlikely(sk_add_backlog(sk
, skb
, limit
))) {
1733 __NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPBACKLOGDROP
);
1738 EXPORT_SYMBOL(tcp_add_backlog
);
1740 int tcp_filter(struct sock
*sk
, struct sk_buff
*skb
)
1742 struct tcphdr
*th
= (struct tcphdr
*)skb
->data
;
1744 return sk_filter_trim_cap(sk
, skb
, th
->doff
* 4);
1746 EXPORT_SYMBOL(tcp_filter
);
1748 static void tcp_v4_restore_cb(struct sk_buff
*skb
)
1750 memmove(IPCB(skb
), &TCP_SKB_CB(skb
)->header
.h4
,
1751 sizeof(struct inet_skb_parm
));
1754 static void tcp_v4_fill_cb(struct sk_buff
*skb
, const struct iphdr
*iph
,
1755 const struct tcphdr
*th
)
1757 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1758 * barrier() makes sure compiler wont play fool^Waliasing games.
1760 memmove(&TCP_SKB_CB(skb
)->header
.h4
, IPCB(skb
),
1761 sizeof(struct inet_skb_parm
));
1764 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1765 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1766 skb
->len
- th
->doff
* 4);
1767 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1768 TCP_SKB_CB(skb
)->tcp_flags
= tcp_flag_byte(th
);
1769 TCP_SKB_CB(skb
)->tcp_tw_isn
= 0;
1770 TCP_SKB_CB(skb
)->ip_dsfield
= ipv4_get_dsfield(iph
);
1771 TCP_SKB_CB(skb
)->sacked
= 0;
1772 TCP_SKB_CB(skb
)->has_rxtstamp
=
1773 skb
->tstamp
|| skb_hwtstamps(skb
)->hwtstamp
;
1780 int tcp_v4_rcv(struct sk_buff
*skb
)
1782 struct net
*net
= dev_net(skb
->dev
);
1783 struct sk_buff
*skb_to_free
;
1784 int sdif
= inet_sdif(skb
);
1785 const struct iphdr
*iph
;
1786 const struct tcphdr
*th
;
1791 if (skb
->pkt_type
!= PACKET_HOST
)
1794 /* Count it even if it's bad */
1795 __TCP_INC_STATS(net
, TCP_MIB_INSEGS
);
1797 if (!pskb_may_pull(skb
, sizeof(struct tcphdr
)))
1800 th
= (const struct tcphdr
*)skb
->data
;
1802 if (unlikely(th
->doff
< sizeof(struct tcphdr
) / 4))
1804 if (!pskb_may_pull(skb
, th
->doff
* 4))
1807 /* An explanation is required here, I think.
1808 * Packet length and doff are validated by header prediction,
1809 * provided case of th->doff==0 is eliminated.
1810 * So, we defer the checks. */
1812 if (skb_checksum_init(skb
, IPPROTO_TCP
, inet_compute_pseudo
))
1815 th
= (const struct tcphdr
*)skb
->data
;
1818 sk
= __inet_lookup_skb(&tcp_hashinfo
, skb
, __tcp_hdrlen(th
), th
->source
,
1819 th
->dest
, sdif
, &refcounted
);
1824 if (sk
->sk_state
== TCP_TIME_WAIT
)
1827 if (sk
->sk_state
== TCP_NEW_SYN_RECV
) {
1828 struct request_sock
*req
= inet_reqsk(sk
);
1829 bool req_stolen
= false;
1832 sk
= req
->rsk_listener
;
1833 if (unlikely(tcp_v4_inbound_md5_hash(sk
, skb
))) {
1834 sk_drops_add(sk
, skb
);
1838 if (tcp_checksum_complete(skb
)) {
1842 if (unlikely(sk
->sk_state
!= TCP_LISTEN
)) {
1843 inet_csk_reqsk_queue_drop_and_put(sk
, req
);
1846 /* We own a reference on the listener, increase it again
1847 * as we might lose it too soon.
1852 if (!tcp_filter(sk
, skb
)) {
1853 th
= (const struct tcphdr
*)skb
->data
;
1855 tcp_v4_fill_cb(skb
, iph
, th
);
1856 nsk
= tcp_check_req(sk
, skb
, req
, false, &req_stolen
);
1861 /* Another cpu got exclusive access to req
1862 * and created a full blown socket.
1863 * Try to feed this packet to this socket
1864 * instead of discarding it.
1866 tcp_v4_restore_cb(skb
);
1870 goto discard_and_relse
;
1874 tcp_v4_restore_cb(skb
);
1875 } else if (tcp_child_process(sk
, nsk
, skb
)) {
1876 tcp_v4_send_reset(nsk
, skb
);
1877 goto discard_and_relse
;
1883 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
1884 __NET_INC_STATS(net
, LINUX_MIB_TCPMINTTLDROP
);
1885 goto discard_and_relse
;
1888 if (!xfrm4_policy_check(sk
, XFRM_POLICY_IN
, skb
))
1889 goto discard_and_relse
;
1891 if (tcp_v4_inbound_md5_hash(sk
, skb
))
1892 goto discard_and_relse
;
1896 if (tcp_filter(sk
, skb
))
1897 goto discard_and_relse
;
1898 th
= (const struct tcphdr
*)skb
->data
;
1900 tcp_v4_fill_cb(skb
, iph
, th
);
1904 if (sk
->sk_state
== TCP_LISTEN
) {
1905 ret
= tcp_v4_do_rcv(sk
, skb
);
1906 goto put_and_return
;
1909 sk_incoming_cpu_update(sk
);
1911 bh_lock_sock_nested(sk
);
1912 tcp_segs_in(tcp_sk(sk
), skb
);
1914 if (!sock_owned_by_user(sk
)) {
1915 skb_to_free
= sk
->sk_rx_skb_cache
;
1916 sk
->sk_rx_skb_cache
= NULL
;
1917 ret
= tcp_v4_do_rcv(sk
, skb
);
1919 if (tcp_add_backlog(sk
, skb
))
1920 goto discard_and_relse
;
1925 __kfree_skb(skb_to_free
);
1934 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
))
1937 tcp_v4_fill_cb(skb
, iph
, th
);
1939 if (tcp_checksum_complete(skb
)) {
1941 __TCP_INC_STATS(net
, TCP_MIB_CSUMERRORS
);
1943 __TCP_INC_STATS(net
, TCP_MIB_INERRS
);
1945 tcp_v4_send_reset(NULL
, skb
);
1949 /* Discard frame. */
1954 sk_drops_add(sk
, skb
);
1960 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
)) {
1961 inet_twsk_put(inet_twsk(sk
));
1965 tcp_v4_fill_cb(skb
, iph
, th
);
1967 if (tcp_checksum_complete(skb
)) {
1968 inet_twsk_put(inet_twsk(sk
));
1971 switch (tcp_timewait_state_process(inet_twsk(sk
), skb
, th
)) {
1973 struct sock
*sk2
= inet_lookup_listener(dev_net(skb
->dev
),
1976 iph
->saddr
, th
->source
,
1977 iph
->daddr
, th
->dest
,
1981 inet_twsk_deschedule_put(inet_twsk(sk
));
1983 tcp_v4_restore_cb(skb
);
1991 tcp_v4_timewait_ack(sk
, skb
);
1994 tcp_v4_send_reset(sk
, skb
);
1995 inet_twsk_deschedule_put(inet_twsk(sk
));
1997 case TCP_TW_SUCCESS
:;
2002 static struct timewait_sock_ops tcp_timewait_sock_ops
= {
2003 .twsk_obj_size
= sizeof(struct tcp_timewait_sock
),
2004 .twsk_unique
= tcp_twsk_unique
,
2005 .twsk_destructor
= tcp_twsk_destructor
,
2008 void inet_sk_rx_dst_set(struct sock
*sk
, const struct sk_buff
*skb
)
2010 struct dst_entry
*dst
= skb_dst(skb
);
2012 if (dst
&& dst_hold_safe(dst
)) {
2013 sk
->sk_rx_dst
= dst
;
2014 inet_sk(sk
)->rx_dst_ifindex
= skb
->skb_iif
;
2017 EXPORT_SYMBOL(inet_sk_rx_dst_set
);
2019 const struct inet_connection_sock_af_ops ipv4_specific
= {
2020 .queue_xmit
= ip_queue_xmit
,
2021 .send_check
= tcp_v4_send_check
,
2022 .rebuild_header
= inet_sk_rebuild_header
,
2023 .sk_rx_dst_set
= inet_sk_rx_dst_set
,
2024 .conn_request
= tcp_v4_conn_request
,
2025 .syn_recv_sock
= tcp_v4_syn_recv_sock
,
2026 .net_header_len
= sizeof(struct iphdr
),
2027 .setsockopt
= ip_setsockopt
,
2028 .getsockopt
= ip_getsockopt
,
2029 .addr2sockaddr
= inet_csk_addr2sockaddr
,
2030 .sockaddr_len
= sizeof(struct sockaddr_in
),
2031 #ifdef CONFIG_COMPAT
2032 .compat_setsockopt
= compat_ip_setsockopt
,
2033 .compat_getsockopt
= compat_ip_getsockopt
,
2035 .mtu_reduced
= tcp_v4_mtu_reduced
,
2037 EXPORT_SYMBOL(ipv4_specific
);
2039 #ifdef CONFIG_TCP_MD5SIG
2040 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific
= {
2041 .md5_lookup
= tcp_v4_md5_lookup
,
2042 .calc_md5_hash
= tcp_v4_md5_hash_skb
,
2043 .md5_parse
= tcp_v4_parse_md5_keys
,
2047 /* NOTE: A lot of things set to zero explicitly by call to
2048 * sk_alloc() so need not be done here.
2050 static int tcp_v4_init_sock(struct sock
*sk
)
2052 struct inet_connection_sock
*icsk
= inet_csk(sk
);
2056 icsk
->icsk_af_ops
= &ipv4_specific
;
2058 #ifdef CONFIG_TCP_MD5SIG
2059 tcp_sk(sk
)->af_specific
= &tcp_sock_ipv4_specific
;
2065 void tcp_v4_destroy_sock(struct sock
*sk
)
2067 struct tcp_sock
*tp
= tcp_sk(sk
);
2069 trace_tcp_destroy_sock(sk
);
2071 tcp_clear_xmit_timers(sk
);
2073 tcp_cleanup_congestion_control(sk
);
2075 tcp_cleanup_ulp(sk
);
2077 /* Cleanup up the write buffer. */
2078 tcp_write_queue_purge(sk
);
2080 /* Check if we want to disable active TFO */
2081 tcp_fastopen_active_disable_ofo_check(sk
);
2083 /* Cleans up our, hopefully empty, out_of_order_queue. */
2084 skb_rbtree_purge(&tp
->out_of_order_queue
);
2086 #ifdef CONFIG_TCP_MD5SIG
2087 /* Clean up the MD5 key list, if any */
2088 if (tp
->md5sig_info
) {
2089 tcp_clear_md5_list(sk
);
2090 kfree_rcu(rcu_dereference_protected(tp
->md5sig_info
, 1), rcu
);
2091 tp
->md5sig_info
= NULL
;
2095 /* Clean up a referenced TCP bind bucket. */
2096 if (inet_csk(sk
)->icsk_bind_hash
)
2099 BUG_ON(tp
->fastopen_rsk
);
2101 /* If socket is aborted during connect operation */
2102 tcp_free_fastopen_req(tp
);
2103 tcp_fastopen_destroy_cipher(sk
);
2104 tcp_saved_syn_free(tp
);
2106 sk_sockets_allocated_dec(sk
);
2108 EXPORT_SYMBOL(tcp_v4_destroy_sock
);
2110 #ifdef CONFIG_PROC_FS
2111 /* Proc filesystem TCP sock list dumping. */
2114 * Get next listener socket follow cur. If cur is NULL, get first socket
2115 * starting from bucket given in st->bucket; when st->bucket is zero the
2116 * very first socket in the hash table is returned.
2118 static void *listening_get_next(struct seq_file
*seq
, void *cur
)
2120 struct tcp_seq_afinfo
*afinfo
= PDE_DATA(file_inode(seq
->file
));
2121 struct tcp_iter_state
*st
= seq
->private;
2122 struct net
*net
= seq_file_net(seq
);
2123 struct inet_listen_hashbucket
*ilb
;
2124 struct sock
*sk
= cur
;
2128 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2129 spin_lock(&ilb
->lock
);
2130 sk
= sk_head(&ilb
->head
);
2134 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2140 sk_for_each_from(sk
) {
2141 if (!net_eq(sock_net(sk
), net
))
2143 if (sk
->sk_family
== afinfo
->family
)
2146 spin_unlock(&ilb
->lock
);
2148 if (++st
->bucket
< INET_LHTABLE_SIZE
)
2153 static void *listening_get_idx(struct seq_file
*seq
, loff_t
*pos
)
2155 struct tcp_iter_state
*st
= seq
->private;
2160 rc
= listening_get_next(seq
, NULL
);
2162 while (rc
&& *pos
) {
2163 rc
= listening_get_next(seq
, rc
);
2169 static inline bool empty_bucket(const struct tcp_iter_state
*st
)
2171 return hlist_nulls_empty(&tcp_hashinfo
.ehash
[st
->bucket
].chain
);
2175 * Get first established socket starting from bucket given in st->bucket.
2176 * If st->bucket is zero, the very first socket in the hash is returned.
2178 static void *established_get_first(struct seq_file
*seq
)
2180 struct tcp_seq_afinfo
*afinfo
= PDE_DATA(file_inode(seq
->file
));
2181 struct tcp_iter_state
*st
= seq
->private;
2182 struct net
*net
= seq_file_net(seq
);
2186 for (; st
->bucket
<= tcp_hashinfo
.ehash_mask
; ++st
->bucket
) {
2188 struct hlist_nulls_node
*node
;
2189 spinlock_t
*lock
= inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
);
2191 /* Lockless fast path for the common case of empty buckets */
2192 if (empty_bucket(st
))
2196 sk_nulls_for_each(sk
, node
, &tcp_hashinfo
.ehash
[st
->bucket
].chain
) {
2197 if (sk
->sk_family
!= afinfo
->family
||
2198 !net_eq(sock_net(sk
), net
)) {
2204 spin_unlock_bh(lock
);
2210 static void *established_get_next(struct seq_file
*seq
, void *cur
)
2212 struct tcp_seq_afinfo
*afinfo
= PDE_DATA(file_inode(seq
->file
));
2213 struct sock
*sk
= cur
;
2214 struct hlist_nulls_node
*node
;
2215 struct tcp_iter_state
*st
= seq
->private;
2216 struct net
*net
= seq_file_net(seq
);
2221 sk
= sk_nulls_next(sk
);
2223 sk_nulls_for_each_from(sk
, node
) {
2224 if (sk
->sk_family
== afinfo
->family
&&
2225 net_eq(sock_net(sk
), net
))
2229 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2231 return established_get_first(seq
);
2234 static void *established_get_idx(struct seq_file
*seq
, loff_t pos
)
2236 struct tcp_iter_state
*st
= seq
->private;
2240 rc
= established_get_first(seq
);
2243 rc
= established_get_next(seq
, rc
);
2249 static void *tcp_get_idx(struct seq_file
*seq
, loff_t pos
)
2252 struct tcp_iter_state
*st
= seq
->private;
2254 st
->state
= TCP_SEQ_STATE_LISTENING
;
2255 rc
= listening_get_idx(seq
, &pos
);
2258 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2259 rc
= established_get_idx(seq
, pos
);
2265 static void *tcp_seek_last_pos(struct seq_file
*seq
)
2267 struct tcp_iter_state
*st
= seq
->private;
2268 int offset
= st
->offset
;
2269 int orig_num
= st
->num
;
2272 switch (st
->state
) {
2273 case TCP_SEQ_STATE_LISTENING
:
2274 if (st
->bucket
>= INET_LHTABLE_SIZE
)
2276 st
->state
= TCP_SEQ_STATE_LISTENING
;
2277 rc
= listening_get_next(seq
, NULL
);
2278 while (offset
-- && rc
)
2279 rc
= listening_get_next(seq
, rc
);
2283 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2285 case TCP_SEQ_STATE_ESTABLISHED
:
2286 if (st
->bucket
> tcp_hashinfo
.ehash_mask
)
2288 rc
= established_get_first(seq
);
2289 while (offset
-- && rc
)
2290 rc
= established_get_next(seq
, rc
);
2298 void *tcp_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2300 struct tcp_iter_state
*st
= seq
->private;
2303 if (*pos
&& *pos
== st
->last_pos
) {
2304 rc
= tcp_seek_last_pos(seq
);
2309 st
->state
= TCP_SEQ_STATE_LISTENING
;
2313 rc
= *pos
? tcp_get_idx(seq
, *pos
- 1) : SEQ_START_TOKEN
;
2316 st
->last_pos
= *pos
;
2319 EXPORT_SYMBOL(tcp_seq_start
);
2321 void *tcp_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2323 struct tcp_iter_state
*st
= seq
->private;
2326 if (v
== SEQ_START_TOKEN
) {
2327 rc
= tcp_get_idx(seq
, 0);
2331 switch (st
->state
) {
2332 case TCP_SEQ_STATE_LISTENING
:
2333 rc
= listening_get_next(seq
, v
);
2335 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2338 rc
= established_get_first(seq
);
2341 case TCP_SEQ_STATE_ESTABLISHED
:
2342 rc
= established_get_next(seq
, v
);
2347 st
->last_pos
= *pos
;
2350 EXPORT_SYMBOL(tcp_seq_next
);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
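/* Illustrative only (userspace, not part of this file): a plain read of the
 * /proc/net/tcp file served by the seq_file ops above drives the
 * start/show/next/stop callbacks once per read() chunk.  Assuming nothing
 * beyond the standard C library:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *fp = fopen("/proc/net/tcp", "r");
 *
 *		if (!fp)
 *			return 1;
 *		while (fgets(line, sizeof(line), fp))
 *			fputs(line, stdout);	// header first, then one socket per line
 *		fclose(fp);
 *		return 0;
 *	}
 */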
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
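/* Request sockets (TCP_NEW_SYN_RECV) are reported with fixed placeholders:
 * zero tx/rx queues, a single pending timer (the SYN-ACK expire timer) and
 * no inode, since no full socket exists yet at this stage of the handshake.
 */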
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
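/* The timer codes emitted above follow the documented /proc/net/tcp format:
 * 0 - no timer pending, 1 - retransmit/loss-probe timer, 2 - keepalive or
 * other sk_timer, 4 - zero window probe timer; TIME_WAIT sockets are printed
 * separately with code 3 by get_timewait4_sock() below.
 */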
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
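/* Per-netns note: tcp4_net_ops registers one "tcp" proc entry per network
 * namespace, and proc_create_net_data() stashes &tcp4_seq_afinfo so the
 * iterators above can recover it via PDE_DATA(file_inode(seq->file)).
 */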
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
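/* Illustrative only (userspace): this ops table is what ultimately services
 * an ordinary IPv4 stream socket; for example, connect(2) on such a socket
 * reaches tcp_v4_connect() through the AF_INET glue (inet_stream_connect ->
 * sk->sk_prot->connect).  The address below is a documentation address, not
 * anything used by this file:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
 *	struct sockaddr_in sa = { .sin_family = AF_INET,
 *				  .sin_port = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));	// -> tcp_v4_connect()
 */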
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
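/* Illustrative only (userspace): the per-netns defaults set above are exposed
 * elsewhere (net/ipv4/sysctl_net_ipv4.c) as sysctls under /proc/sys/net/ipv4/.
 * A minimal reader of one of them, assuming only the standard C library:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int val;
 *		FILE *fp = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
 *
 *		if (fp && fscanf(fp, "%d", &val) == 1)
 *			printf("tcp_syncookies = %d\n", val);	// defaults to 1, per tcp_sk_init()
 *		if (fp)
 *			fclose(fp);
 *		return 0;
 *	}
 */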
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}