net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
83 #include <trace/events/tcp.h>
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 struct tcp_sock *tp = tcp_sk(sk);
111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 if (reuse == 2) {
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
125 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
128 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
129 loopback = true;
130 } else
131 #endif
133 if (ipv4_is_loopback(tw->tw_daddr) ||
134 ipv4_is_loopback(tw->tw_rcv_saddr))
135 loopback = true;
137 if (!loopback)
138 reuse = 0;
141 /* With PAWS, it is safe from the viewpoint
142 of data integrity. Even without PAWS it is safe provided sequence
143 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 145	   Actually, the idea is close to VJ's, only the timestamp cache is
 146	   held not per host but per port pair, and the TW bucket is used as the
 147	   state holder.
149 If TW bucket has been already destroyed we fall back to VJ's scheme
150 and use initial timestamp retrieved from peer table.
152 if (tcptw->tw_ts_recent_stamp &&
153 (!twp || (reuse && time_after32(ktime_get_seconds(),
154 tcptw->tw_ts_recent_stamp)))) {
155 /* In case of repair and re-using TIME-WAIT sockets we still
156 * want to be sure that it is safe as above but honor the
157 * sequence numbers and time stamps set as part of the repair
158 * process.
160 * Without this check re-using a TIME-WAIT socket with TCP
161 * repair would accumulate a -1 on the repair assigned
162 * sequence number. The first time it is reused the sequence
163 * is -1, the second time -2, etc. This fixes that issue
164 * without appearing to create any others.
166 if (likely(!tp->repair)) {
167 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
168 if (tp->write_seq == 0)
169 tp->write_seq = 1;
170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 sock_hold(sktw);
174 return 1;
177 return 0;
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
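/*
 * Editor's illustrative aside (not part of tcp_ipv4.c): tcp_twsk_unique()
 * is what lets a client take over a 4-tuple that is still in TIME-WAIT,
 * under control of the net.ipv4.tcp_tw_reuse sysctl handled above
 * (0 = off, 1 = on, 2 = loopback only).  A minimal userspace sketch of the
 * situation, assuming only the standard sockets API; the helper name is
 * hypothetical.
 */
#if 0	/* illustrative only, never compiled as part of this file */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_from_fixed_port(uint16_t lport, const char *dst_ip,
				   uint16_t dport)
{
	struct sockaddr_in src = { .sin_family = AF_INET,
				   .sin_port = htons(lport) };
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(dport) };
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || inet_pton(AF_INET, dst_ip, &dst.sin_addr) != 1)
		return -1;
	/* Re-binding the same source port right after close() typically needs
	 * SO_REUSEADDR; whether the connect() may then reuse the TIME-WAIT
	 * bucket for the same 4-tuple is decided by tcp_twsk_unique() above.
	 */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0 ||
	    connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif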
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 int addr_len)
184 /* This check is replicated from tcp_v4_connect() and intended to
 185	 * prevent the BPF program called below from accessing bytes that are
 186	 * outside the bound specified by the user in addr_len.
188 if (addr_len < sizeof(struct sockaddr_in))
189 return -EINVAL;
191 sock_owned_by_me(sk);
193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
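/*
 * Editor's illustrative aside (not part of tcp_ipv4.c): the hook invoked
 * above is the BPF_CGROUP_INET4_CONNECT attach point.  A minimal sketch of
 * a program that could be attached there, assuming libbpf-style section
 * naming; the program name and the port-25 policy are hypothetical.
 */
#if 0	/* illustrative only, never compiled as part of this file */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/connect4")
int filter_connect4(struct bpf_sock_addr *ctx)
{
	/* ctx->user_port holds the destination port in network byte order. */
	if (ctx->user_port == bpf_htons(25))
		return 0;	/* veto the connect() before tcp_v4_connect() runs */
	return 1;		/* allow */
}

char _license[] SEC("license") = "GPL";
#endif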
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 struct inet_sock *inet = inet_sk(sk);
201 struct tcp_sock *tp = tcp_sk(sk);
202 __be16 orig_sport, orig_dport;
203 __be32 daddr, nexthop;
204 struct flowi4 *fl4;
205 struct rtable *rt;
206 int err;
207 struct ip_options_rcu *inet_opt;
208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 if (addr_len < sizeof(struct sockaddr_in))
211 return -EINVAL;
213 if (usin->sin_family != AF_INET)
214 return -EAFNOSUPPORT;
216 nexthop = daddr = usin->sin_addr.s_addr;
217 inet_opt = rcu_dereference_protected(inet->inet_opt,
218 lockdep_sock_is_held(sk));
219 if (inet_opt && inet_opt->opt.srr) {
220 if (!daddr)
221 return -EINVAL;
222 nexthop = inet_opt->opt.faddr;
225 orig_sport = inet->inet_sport;
226 orig_dport = usin->sin_port;
227 fl4 = &inet->cork.fl.u.ip4;
228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 IPPROTO_TCP,
231 orig_sport, orig_dport, sk);
232 if (IS_ERR(rt)) {
233 err = PTR_ERR(rt);
234 if (err == -ENETUNREACH)
235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 return err;
239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 ip_rt_put(rt);
241 return -ENETUNREACH;
244 if (!inet_opt || !inet_opt->opt.srr)
245 daddr = fl4->daddr;
247 if (!inet->inet_saddr)
248 inet->inet_saddr = fl4->saddr;
249 sk_rcv_saddr_set(sk, inet->inet_saddr);
251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 /* Reset inherited state */
253 tp->rx_opt.ts_recent = 0;
254 tp->rx_opt.ts_recent_stamp = 0;
255 if (likely(!tp->repair))
256 tp->write_seq = 0;
259 inet->inet_dport = usin->sin_port;
260 sk_daddr_set(sk, daddr);
262 inet_csk(sk)->icsk_ext_hdr_len = 0;
263 if (inet_opt)
264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 /* Socket identity is still unknown (sport may be zero).
 269	 * However we set the state to SYN-SENT and, without releasing the socket
 270	 * lock, select a source port, enter ourselves into the hash tables and
271 * complete initialization after this.
273 tcp_set_state(sk, TCP_SYN_SENT);
274 err = inet_hash_connect(tcp_death_row, sk);
275 if (err)
276 goto failure;
278 sk_set_txhash(sk);
280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 inet->inet_sport, inet->inet_dport, sk);
282 if (IS_ERR(rt)) {
283 err = PTR_ERR(rt);
284 rt = NULL;
285 goto failure;
287 /* OK, now commit destination to socket. */
288 sk->sk_gso_type = SKB_GSO_TCPV4;
289 sk_setup_caps(sk, &rt->dst);
290 rt = NULL;
292 if (likely(!tp->repair)) {
293 if (!tp->write_seq)
294 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
295 inet->inet_daddr,
296 inet->inet_sport,
297 usin->sin_port);
298 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
299 inet->inet_saddr,
300 inet->inet_daddr);
303 inet->inet_id = tp->write_seq ^ jiffies;
305 if (tcp_fastopen_defer_connect(sk, &err))
306 return err;
307 if (err)
308 goto failure;
310 err = tcp_connect(sk);
312 if (err)
313 goto failure;
315 return 0;
317 failure:
319 * This unhashes the socket and releases the local port,
320 * if necessary.
322 tcp_set_state(sk, TCP_CLOSE);
323 ip_rt_put(rt);
324 sk->sk_route_caps = 0;
325 inet->inet_dport = 0;
326 return err;
328 EXPORT_SYMBOL(tcp_v4_connect);
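/*
 * Editor's illustrative aside (not part of tcp_ipv4.c): the
 * tcp_fastopen_defer_connect() call above implements the client side of
 * deferred Fast Open connects, where connect() returns immediately and the
 * SYN only goes out once the first write supplies data.  A hedged userspace
 * sketch, assuming the TCP_FASTOPEN_CONNECT option is available from
 * <netinet/tcp.h>; the helper name and payload are hypothetical.
 */
#if 0	/* illustrative only, never compiled as part of this file */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int tfo_connect_and_send(const char *ip, unsigned short port,
				const char *payload)
{
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(port) };
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || inet_pton(AF_INET, ip, &dst.sin_addr) != 1)
		return -1;
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));
	/* With the option set, connect() returns without emitting a SYN;
	 * the SYN (optionally carrying data and a Fast Open cookie) is sent
	 * on the first write().
	 */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0 ||
	    write(fd, payload, strlen(payload)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif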
331 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
332 * It can be called through tcp_release_cb() if socket was owned by user
333 * at the time tcp_v4_err() was called to handle ICMP message.
335 void tcp_v4_mtu_reduced(struct sock *sk)
337 struct inet_sock *inet = inet_sk(sk);
338 struct dst_entry *dst;
339 u32 mtu;
341 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
342 return;
343 mtu = tcp_sk(sk)->mtu_info;
344 dst = inet_csk_update_pmtu(sk, mtu);
345 if (!dst)
346 return;
348 /* Something is about to be wrong... Remember soft error
 349	 * for the case this connection will not be able to recover.
351 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
352 sk->sk_err_soft = EMSGSIZE;
354 mtu = dst_mtu(dst);
356 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
357 ip_sk_accept_pmtu(sk) &&
358 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
359 tcp_sync_mss(sk, mtu);
361 /* Resend the TCP packet because it's
362 * clear that the old packet has been
363 * dropped. This is the new "fast" path mtu
364 * discovery.
366 tcp_simple_retransmit(sk);
367 } /* else let the usual retransmit timer handle it */
369 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 struct dst_entry *dst = __sk_dst_check(sk, 0);
375 if (dst)
376 dst->ops->redirect(dst, sk, skb);
380 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
381 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 struct request_sock *req = inet_reqsk(sk);
384 struct net *net = sock_net(sk);
386 /* ICMPs are not backlogged, hence we cannot get
387 * an established socket here.
389 if (seq != tcp_rsk(req)->snt_isn) {
390 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
391 } else if (abort) {
393 * Still in SYN_RECV, just remove it silently.
394 * There is no good way to pass the error to the newly
395 * created socket, and POSIX does not want network
396 * errors returned from accept().
398 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
399 tcp_listendrop(req->rsk_listener);
401 reqsk_put(req);
403 EXPORT_SYMBOL(tcp_req_err);
406 * This routine is called by the ICMP module when it gets some
407 * sort of error condition. If err < 0 then the socket should
408 * be closed and the error returned to the user. If err > 0
409 * it's just the icmp type << 8 | icmp code. After adjustment
410 * header points to the first 8 bytes of the tcp header. We need
411 * to find the appropriate port.
413 * The locking strategy used here is very "optimistic". When
414 * someone else accesses the socket the ICMP is just dropped
415 * and for some paths there is no check at all.
416 * A more general error queue to queue errors for later handling
417 * is probably better.
421 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
423 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
424 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
425 struct inet_connection_sock *icsk;
426 struct tcp_sock *tp;
427 struct inet_sock *inet;
428 const int type = icmp_hdr(icmp_skb)->type;
429 const int code = icmp_hdr(icmp_skb)->code;
430 struct sock *sk;
431 struct sk_buff *skb;
432 struct request_sock *fastopen;
433 u32 seq, snd_una;
434 s32 remaining;
435 u32 delta_us;
436 int err;
437 struct net *net = dev_net(icmp_skb->dev);
439 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
440 th->dest, iph->saddr, ntohs(th->source),
441 inet_iif(icmp_skb), 0);
442 if (!sk) {
443 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
444 return -ENOENT;
446 if (sk->sk_state == TCP_TIME_WAIT) {
447 inet_twsk_put(inet_twsk(sk));
448 return 0;
450 seq = ntohl(th->seq);
451 if (sk->sk_state == TCP_NEW_SYN_RECV) {
452 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
453 type == ICMP_TIME_EXCEEDED ||
454 (type == ICMP_DEST_UNREACH &&
455 (code == ICMP_NET_UNREACH ||
456 code == ICMP_HOST_UNREACH)));
457 return 0;
460 bh_lock_sock(sk);
461 /* If too many ICMPs get dropped on busy
462 * servers this needs to be solved differently.
 463	 * We do take care of the PMTU discovery (RFC 1191) special case:
464 * we can receive locally generated ICMP messages while socket is held.
466 if (sock_owned_by_user(sk)) {
467 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
468 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
470 if (sk->sk_state == TCP_CLOSE)
471 goto out;
473 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
474 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
475 goto out;
478 icsk = inet_csk(sk);
479 tp = tcp_sk(sk);
480 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
481 fastopen = tp->fastopen_rsk;
482 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
483 if (sk->sk_state != TCP_LISTEN &&
484 !between(seq, snd_una, tp->snd_nxt)) {
485 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
486 goto out;
489 switch (type) {
490 case ICMP_REDIRECT:
491 if (!sock_owned_by_user(sk))
492 do_redirect(icmp_skb, sk);
493 goto out;
494 case ICMP_SOURCE_QUENCH:
495 /* Just silently ignore these. */
496 goto out;
497 case ICMP_PARAMETERPROB:
498 err = EPROTO;
499 break;
500 case ICMP_DEST_UNREACH:
501 if (code > NR_ICMP_UNREACH)
502 goto out;
504 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
505 /* We are not interested in TCP_LISTEN and open_requests
 506	 * (SYN-ACKs sent out by Linux are always < 576 bytes so
507 * they should go through unfragmented).
509 if (sk->sk_state == TCP_LISTEN)
510 goto out;
512 tp->mtu_info = info;
513 if (!sock_owned_by_user(sk)) {
514 tcp_v4_mtu_reduced(sk);
515 } else {
516 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
517 sock_hold(sk);
519 goto out;
522 err = icmp_err_convert[code].errno;
523 /* check if icmp_skb allows revert of backoff
524 * (see draft-zimmermann-tcp-lcd) */
525 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
526 break;
527 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
528 !icsk->icsk_backoff || fastopen)
529 break;
531 if (sock_owned_by_user(sk))
532 break;
534 skb = tcp_rtx_queue_head(sk);
535 if (WARN_ON_ONCE(!skb))
536 break;
538 icsk->icsk_backoff--;
539 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
540 TCP_TIMEOUT_INIT;
541 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
544 tcp_mstamp_refresh(tp);
545 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
546 remaining = icsk->icsk_rto -
547 usecs_to_jiffies(delta_us);
549 if (remaining > 0) {
550 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
551 remaining, TCP_RTO_MAX);
552 } else {
553 /* RTO revert clocked out retransmission.
554 * Will retransmit now */
555 tcp_retransmit_timer(sk);
558 break;
559 case ICMP_TIME_EXCEEDED:
560 err = EHOSTUNREACH;
561 break;
562 default:
563 goto out;
566 switch (sk->sk_state) {
567 case TCP_SYN_SENT:
568 case TCP_SYN_RECV:
569 /* Only in fast or simultaneous open. If a fast open socket is
 570	 * already accepted it is treated as a connected one below.
572 if (fastopen && !fastopen->sk)
573 break;
575 if (!sock_owned_by_user(sk)) {
576 sk->sk_err = err;
578 sk->sk_error_report(sk);
580 tcp_done(sk);
581 } else {
582 sk->sk_err_soft = err;
584 goto out;
587 /* If we've already connected we will keep trying
588 * until we time out, or the user gives up.
 590	 * RFC 1122 4.2.3.9 allows considering as hard errors
 591	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 592	 * but it is obsoleted by pmtu discovery).
 594	 * Note that in the modern internet, where routing is unreliable
 595	 * and broken firewalls sit in every dark corner sending random
 596	 * errors ordered by their masters, even these two messages finally lose
597 * their original sense (even Linux sends invalid PORT_UNREACHs)
599 * Now we are in compliance with RFCs.
600 * --ANK (980905)
603 inet = inet_sk(sk);
604 if (!sock_owned_by_user(sk) && inet->recverr) {
605 sk->sk_err = err;
606 sk->sk_error_report(sk);
607 } else { /* Only an error on timeout */
608 sk->sk_err_soft = err;
611 out:
612 bh_unlock_sock(sk);
613 sock_put(sk);
614 return 0;
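/*
 * Editor's illustrative aside (not part of tcp_ipv4.c): the comment above
 * tcp_v4_err() notes that a more general error queue would be preferable;
 * userspace can already observe ICMP-derived errors on a connected socket
 * via IP_RECVERR and MSG_ERRQUEUE.  A hedged sketch, assuming only standard
 * Linux headers; the helper name is hypothetical.
 */
#if 0	/* illustrative only, never compiled as part of this file */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void drain_icmp_errors(int fd)
{
	char cbuf[512], dbuf[64];
	struct iovec iov = { .iov_base = dbuf, .iov_len = sizeof(dbuf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = cbuf,
			      .msg_controllen = sizeof(cbuf) };
	struct cmsghdr *cm;
	int one = 1;

	/* Ask for detailed error reporting (ideally set before errors occur). */
	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));
	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee =
				(struct sock_extended_err *)CMSG_DATA(cm);

			if (ee->ee_origin == SO_EE_ORIGIN_ICMP)
				printf("ICMP type %u code %u -> errno %s\n",
				       ee->ee_type, ee->ee_code,
				       strerror(ee->ee_errno));
		}
	}
}
#endif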
617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 struct tcphdr *th = tcp_hdr(skb);
621 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
622 skb->csum_start = skb_transport_header(skb) - skb->head;
623 skb->csum_offset = offsetof(struct tcphdr, check);
626 /* This routine computes an IPv4 TCP checksum. */
627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 const struct inet_sock *inet = inet_sk(sk);
631 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 EXPORT_SYMBOL(tcp_v4_send_check);
636 * This routine will send an RST to the other tcp.
 638	 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 639	 * for the reset.
 640	 * Answer: if a packet caused an RST, it is not for a socket
 641	 * existing in our system; if it is matched to a socket,
 642	 * it is just a duplicate segment or a bug in the other side's TCP.
 643	 * So we build the reply based only on the parameters that
 644	 * arrived with the segment.
645 * Exception: precedence violation. We do not implement it in any case.
648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 const struct tcphdr *th = tcp_hdr(skb);
651 struct {
652 struct tcphdr th;
653 #ifdef CONFIG_TCP_MD5SIG
654 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
655 #endif
656 } rep;
657 struct ip_reply_arg arg;
658 #ifdef CONFIG_TCP_MD5SIG
659 struct tcp_md5sig_key *key = NULL;
660 const __u8 *hash_location = NULL;
661 unsigned char newhash[16];
662 int genhash;
663 struct sock *sk1 = NULL;
664 #endif
665 struct net *net;
666 struct sock *ctl_sk;
668 /* Never send a reset in response to a reset. */
669 if (th->rst)
670 return;
 672	/* If sk is not NULL, it means we did a successful lookup and the incoming
673 * route had to be correct. prequeue might have dropped our dst.
675 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
676 return;
678 /* Swap the send and the receive. */
679 memset(&rep, 0, sizeof(rep));
680 rep.th.dest = th->source;
681 rep.th.source = th->dest;
682 rep.th.doff = sizeof(struct tcphdr) / 4;
683 rep.th.rst = 1;
685 if (th->ack) {
686 rep.th.seq = th->ack_seq;
687 } else {
688 rep.th.ack = 1;
689 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
690 skb->len - (th->doff << 2));
693 memset(&arg, 0, sizeof(arg));
694 arg.iov[0].iov_base = (unsigned char *)&rep;
695 arg.iov[0].iov_len = sizeof(rep.th);
697 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
698 #ifdef CONFIG_TCP_MD5SIG
699 rcu_read_lock();
700 hash_location = tcp_parse_md5sig_option(th);
701 if (sk && sk_fullsock(sk)) {
702 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
703 &ip_hdr(skb)->saddr, AF_INET);
704 } else if (hash_location) {
706 * active side is lost. Try to find listening socket through
707 * source port, and then find md5 key through listening socket.
 708	 * we do not lose security here:
709 * Incoming packet is checked with md5 hash with finding key,
710 * no RST generated if md5 hash doesn't match.
712 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
713 ip_hdr(skb)->saddr,
714 th->source, ip_hdr(skb)->daddr,
715 ntohs(th->source), inet_iif(skb),
716 tcp_v4_sdif(skb));
717 /* don't send rst if it can't find key */
718 if (!sk1)
719 goto out;
721 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
722 &ip_hdr(skb)->saddr, AF_INET);
723 if (!key)
724 goto out;
727 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
728 if (genhash || memcmp(hash_location, newhash, 16) != 0)
729 goto out;
733 if (key) {
734 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
735 (TCPOPT_NOP << 16) |
736 (TCPOPT_MD5SIG << 8) |
737 TCPOLEN_MD5SIG);
738 /* Update length and the length the header thinks exists */
739 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
740 rep.th.doff = arg.iov[0].iov_len / 4;
742 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
743 key, ip_hdr(skb)->saddr,
744 ip_hdr(skb)->daddr, &rep.th);
746 #endif
747 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
748 ip_hdr(skb)->saddr, /* XXX */
749 arg.iov[0].iov_len, IPPROTO_TCP, 0);
750 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
751 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753	/* When the socket is gone, all binding information is lost;
 754	 * routing might fail in this case. No choice here: if we choose to force
 755	 * the input interface, we will misroute in case of an asymmetric route.
757 if (sk) {
758 arg.bound_dev_if = sk->sk_bound_dev_if;
759 if (sk_fullsock(sk))
760 trace_tcp_send_reset(sk, skb);
763 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
764 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
766 arg.tos = ip_hdr(skb)->tos;
767 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
768 local_bh_disable();
769 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
770 if (sk)
771 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
772 inet_twsk(sk)->tw_mark : sk->sk_mark;
773 ip_send_unicast_reply(ctl_sk,
774 skb, &TCP_SKB_CB(skb)->header.h4.opt,
775 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
776 &arg, arg.iov[0].iov_len);
778 ctl_sk->sk_mark = 0;
779 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
780 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
781 local_bh_enable();
783 #ifdef CONFIG_TCP_MD5SIG
784 out:
785 rcu_read_unlock();
786 #endif
 789 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 790    outside socket context, is certainly ugly. What can I do?
793 static void tcp_v4_send_ack(const struct sock *sk,
794 struct sk_buff *skb, u32 seq, u32 ack,
795 u32 win, u32 tsval, u32 tsecr, int oif,
796 struct tcp_md5sig_key *key,
797 int reply_flags, u8 tos)
799 const struct tcphdr *th = tcp_hdr(skb);
800 struct {
801 struct tcphdr th;
802 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
803 #ifdef CONFIG_TCP_MD5SIG
804 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
805 #endif
807 } rep;
808 struct net *net = sock_net(sk);
809 struct ip_reply_arg arg;
810 struct sock *ctl_sk;
812 memset(&rep.th, 0, sizeof(struct tcphdr));
813 memset(&arg, 0, sizeof(arg));
815 arg.iov[0].iov_base = (unsigned char *)&rep;
816 arg.iov[0].iov_len = sizeof(rep.th);
817 if (tsecr) {
818 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
819 (TCPOPT_TIMESTAMP << 8) |
820 TCPOLEN_TIMESTAMP);
821 rep.opt[1] = htonl(tsval);
822 rep.opt[2] = htonl(tsecr);
823 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
826 /* Swap the send and the receive. */
827 rep.th.dest = th->source;
828 rep.th.source = th->dest;
829 rep.th.doff = arg.iov[0].iov_len / 4;
830 rep.th.seq = htonl(seq);
831 rep.th.ack_seq = htonl(ack);
832 rep.th.ack = 1;
833 rep.th.window = htons(win);
835 #ifdef CONFIG_TCP_MD5SIG
836 if (key) {
837 int offset = (tsecr) ? 3 : 0;
839 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
840 (TCPOPT_NOP << 16) |
841 (TCPOPT_MD5SIG << 8) |
842 TCPOLEN_MD5SIG);
843 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
844 rep.th.doff = arg.iov[0].iov_len/4;
846 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
847 key, ip_hdr(skb)->saddr,
848 ip_hdr(skb)->daddr, &rep.th);
850 #endif
851 arg.flags = reply_flags;
852 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
853 ip_hdr(skb)->saddr, /* XXX */
854 arg.iov[0].iov_len, IPPROTO_TCP, 0);
855 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
856 if (oif)
857 arg.bound_dev_if = oif;
858 arg.tos = tos;
859 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
860 local_bh_disable();
861 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
862 if (sk)
863 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
864 inet_twsk(sk)->tw_mark : sk->sk_mark;
865 ip_send_unicast_reply(ctl_sk,
866 skb, &TCP_SKB_CB(skb)->header.h4.opt,
867 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
868 &arg, arg.iov[0].iov_len);
870 ctl_sk->sk_mark = 0;
871 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
872 local_bh_enable();
875 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
877 struct inet_timewait_sock *tw = inet_twsk(sk);
878 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
880 tcp_v4_send_ack(sk, skb,
881 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
882 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
883 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
884 tcptw->tw_ts_recent,
885 tw->tw_bound_dev_if,
886 tcp_twsk_md5_key(tcptw),
887 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
888 tw->tw_tos
891 inet_twsk_put(tw);
894 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
895 struct request_sock *req)
897 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
898 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
900 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
901 tcp_sk(sk)->snd_nxt;
903 /* RFC 7323 2.3
904 * The window field (SEG.WND) of every outgoing segment, with the
905 * exception of <SYN> segments, MUST be right-shifted by
906 * Rcv.Wind.Shift bits:
908 tcp_v4_send_ack(sk, skb, seq,
909 tcp_rsk(req)->rcv_nxt,
910 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
911 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
912 req->ts_recent,
914 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
915 AF_INET),
916 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
917 ip_hdr(skb)->tos);
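/* Editor's note: a worked example of the RFC 7323 shift above, assuming a
 * hypothetical request with rsk_rcv_wnd = 262144 bytes and rcv_wscale = 7:
 * the advertised SEG.WND is 262144 >> 7 = 2048, and the peer scales it back
 * up by the same factor (2048 << 7 = 262144).
 */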
921 * Send a SYN-ACK after having received a SYN.
922 * This still operates on a request_sock only, not on a big
923 * socket.
925 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
926 struct flowi *fl,
927 struct request_sock *req,
928 struct tcp_fastopen_cookie *foc,
929 enum tcp_synack_type synack_type)
931 const struct inet_request_sock *ireq = inet_rsk(req);
932 struct flowi4 fl4;
933 int err = -1;
934 struct sk_buff *skb;
936 /* First, grab a route. */
937 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
938 return -1;
940 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
942 if (skb) {
943 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
945 rcu_read_lock();
946 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
947 ireq->ir_rmt_addr,
948 rcu_dereference(ireq->ireq_opt));
949 rcu_read_unlock();
950 err = net_xmit_eval(err);
953 return err;
957 * IPv4 request_sock destructor.
959 static void tcp_v4_reqsk_destructor(struct request_sock *req)
961 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
964 #ifdef CONFIG_TCP_MD5SIG
966 * RFC2385 MD5 checksumming requires a mapping of
967 * IP address->MD5 Key.
968 * We need to maintain these in the sk structure.
971 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
972 EXPORT_SYMBOL(tcp_md5_needed);
974 /* Find the Key structure for an address. */
975 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
976 const union tcp_md5_addr *addr,
977 int family)
979 const struct tcp_sock *tp = tcp_sk(sk);
980 struct tcp_md5sig_key *key;
981 const struct tcp_md5sig_info *md5sig;
982 __be32 mask;
983 struct tcp_md5sig_key *best_match = NULL;
984 bool match;
986 /* caller either holds rcu_read_lock() or socket lock */
987 md5sig = rcu_dereference_check(tp->md5sig_info,
988 lockdep_sock_is_held(sk));
989 if (!md5sig)
990 return NULL;
992 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
993 if (key->family != family)
994 continue;
996 if (family == AF_INET) {
997 mask = inet_make_mask(key->prefixlen);
998 match = (key->addr.a4.s_addr & mask) ==
999 (addr->a4.s_addr & mask);
1000 #if IS_ENABLED(CONFIG_IPV6)
1001 } else if (family == AF_INET6) {
1002 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1003 key->prefixlen);
1004 #endif
1005 } else {
1006 match = false;
1009 if (match && (!best_match ||
1010 key->prefixlen > best_match->prefixlen))
1011 best_match = key;
1013 return best_match;
1015 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1017 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1018 const union tcp_md5_addr *addr,
1019 int family, u8 prefixlen)
1021 const struct tcp_sock *tp = tcp_sk(sk);
1022 struct tcp_md5sig_key *key;
1023 unsigned int size = sizeof(struct in_addr);
1024 const struct tcp_md5sig_info *md5sig;
1026 /* caller either holds rcu_read_lock() or socket lock */
1027 md5sig = rcu_dereference_check(tp->md5sig_info,
1028 lockdep_sock_is_held(sk));
1029 if (!md5sig)
1030 return NULL;
1031 #if IS_ENABLED(CONFIG_IPV6)
1032 if (family == AF_INET6)
1033 size = sizeof(struct in6_addr);
1034 #endif
1035 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1036 if (key->family != family)
1037 continue;
1038 if (!memcmp(&key->addr, addr, size) &&
1039 key->prefixlen == prefixlen)
1040 return key;
1042 return NULL;
1045 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1046 const struct sock *addr_sk)
1048 const union tcp_md5_addr *addr;
1050 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1051 return tcp_md5_do_lookup(sk, addr, AF_INET);
1053 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1055 /* This can be called on a newly created socket, from other files */
1056 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1057 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1058 gfp_t gfp)
1060 /* Add Key to the list */
1061 struct tcp_md5sig_key *key;
1062 struct tcp_sock *tp = tcp_sk(sk);
1063 struct tcp_md5sig_info *md5sig;
1065 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1066 if (key) {
1067 /* Pre-existing entry - just update that one. */
1068 memcpy(key->key, newkey, newkeylen);
1069 key->keylen = newkeylen;
1070 return 0;
1073 md5sig = rcu_dereference_protected(tp->md5sig_info,
1074 lockdep_sock_is_held(sk));
1075 if (!md5sig) {
1076 md5sig = kmalloc(sizeof(*md5sig), gfp);
1077 if (!md5sig)
1078 return -ENOMEM;
1080 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1081 INIT_HLIST_HEAD(&md5sig->head);
1082 rcu_assign_pointer(tp->md5sig_info, md5sig);
1085 key = sock_kmalloc(sk, sizeof(*key), gfp);
1086 if (!key)
1087 return -ENOMEM;
1088 if (!tcp_alloc_md5sig_pool()) {
1089 sock_kfree_s(sk, key, sizeof(*key));
1090 return -ENOMEM;
1093 memcpy(key->key, newkey, newkeylen);
1094 key->keylen = newkeylen;
1095 key->family = family;
1096 key->prefixlen = prefixlen;
1097 memcpy(&key->addr, addr,
1098 (family == AF_INET6) ? sizeof(struct in6_addr) :
1099 sizeof(struct in_addr));
1100 hlist_add_head_rcu(&key->node, &md5sig->head);
1101 return 0;
1103 EXPORT_SYMBOL(tcp_md5_do_add);
1105 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1106 u8 prefixlen)
1108 struct tcp_md5sig_key *key;
1110 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1111 if (!key)
1112 return -ENOENT;
1113 hlist_del_rcu(&key->node);
1114 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1115 kfree_rcu(key, rcu);
1116 return 0;
1118 EXPORT_SYMBOL(tcp_md5_do_del);
1120 static void tcp_clear_md5_list(struct sock *sk)
1122 struct tcp_sock *tp = tcp_sk(sk);
1123 struct tcp_md5sig_key *key;
1124 struct hlist_node *n;
1125 struct tcp_md5sig_info *md5sig;
1127 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1129 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1130 hlist_del_rcu(&key->node);
1131 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1132 kfree_rcu(key, rcu);
1136 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1137 char __user *optval, int optlen)
1139 struct tcp_md5sig cmd;
1140 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1141 u8 prefixlen = 32;
1143 if (optlen < sizeof(cmd))
1144 return -EINVAL;
1146 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1147 return -EFAULT;
1149 if (sin->sin_family != AF_INET)
1150 return -EINVAL;
1152 if (optname == TCP_MD5SIG_EXT &&
1153 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1154 prefixlen = cmd.tcpm_prefixlen;
1155 if (prefixlen > 32)
1156 return -EINVAL;
1159 if (!cmd.tcpm_keylen)
1160 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1161 AF_INET, prefixlen);
1163 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1164 return -EINVAL;
1166 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1167 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1168 GFP_KERNEL);
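/*
 * Editor's illustrative aside (not part of tcp_ipv4.c): the option parsed
 * above is installed from userspace with setsockopt(TCP_MD5SIG).  A hedged
 * sketch, assuming struct tcp_md5sig from <linux/tcp.h>; the helper name,
 * peer address and key are hypothetical.
 */
#if 0	/* illustrative only, never compiled as part of this file */
#include <arpa/inet.h>
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const char *peer_ip,
			   const void *key, unsigned short keylen)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, peer_ip, &sin->sin_addr);
	md5.tcpm_keylen = keylen;	/* at most TCP_MD5SIG_MAXKEYLEN (80) */
	memcpy(md5.tcpm_key, key, keylen);

	/* Segments to/from peer_ip now carry the RFC 2385 MD5 option;
	 * both ends must configure the same key.
	 */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif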
1171 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1172 __be32 daddr, __be32 saddr,
1173 const struct tcphdr *th, int nbytes)
1175 struct tcp4_pseudohdr *bp;
1176 struct scatterlist sg;
1177 struct tcphdr *_th;
1179 bp = hp->scratch;
1180 bp->saddr = saddr;
1181 bp->daddr = daddr;
1182 bp->pad = 0;
1183 bp->protocol = IPPROTO_TCP;
1184 bp->len = cpu_to_be16(nbytes);
1186 _th = (struct tcphdr *)(bp + 1);
1187 memcpy(_th, th, sizeof(*th));
1188 _th->check = 0;
1190 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1191 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1192 sizeof(*bp) + sizeof(*th));
1193 return crypto_ahash_update(hp->md5_req);
1196 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1197 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1199 struct tcp_md5sig_pool *hp;
1200 struct ahash_request *req;
1202 hp = tcp_get_md5sig_pool();
1203 if (!hp)
1204 goto clear_hash_noput;
1205 req = hp->md5_req;
1207 if (crypto_ahash_init(req))
1208 goto clear_hash;
1209 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1210 goto clear_hash;
1211 if (tcp_md5_hash_key(hp, key))
1212 goto clear_hash;
1213 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1214 if (crypto_ahash_final(req))
1215 goto clear_hash;
1217 tcp_put_md5sig_pool();
1218 return 0;
1220 clear_hash:
1221 tcp_put_md5sig_pool();
1222 clear_hash_noput:
1223 memset(md5_hash, 0, 16);
1224 return 1;
1227 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1228 const struct sock *sk,
1229 const struct sk_buff *skb)
1231 struct tcp_md5sig_pool *hp;
1232 struct ahash_request *req;
1233 const struct tcphdr *th = tcp_hdr(skb);
1234 __be32 saddr, daddr;
1236 if (sk) { /* valid for establish/request sockets */
1237 saddr = sk->sk_rcv_saddr;
1238 daddr = sk->sk_daddr;
1239 } else {
1240 const struct iphdr *iph = ip_hdr(skb);
1241 saddr = iph->saddr;
1242 daddr = iph->daddr;
1245 hp = tcp_get_md5sig_pool();
1246 if (!hp)
1247 goto clear_hash_noput;
1248 req = hp->md5_req;
1250 if (crypto_ahash_init(req))
1251 goto clear_hash;
1253 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1254 goto clear_hash;
1255 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1256 goto clear_hash;
1257 if (tcp_md5_hash_key(hp, key))
1258 goto clear_hash;
1259 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1260 if (crypto_ahash_final(req))
1261 goto clear_hash;
1263 tcp_put_md5sig_pool();
1264 return 0;
1266 clear_hash:
1267 tcp_put_md5sig_pool();
1268 clear_hash_noput:
1269 memset(md5_hash, 0, 16);
1270 return 1;
1272 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1274 #endif
1276 /* Called with rcu_read_lock() */
1277 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1278 const struct sk_buff *skb)
1280 #ifdef CONFIG_TCP_MD5SIG
1282 * This gets called for each TCP segment that arrives
1283 * so we want to be efficient.
1284 * We have 3 drop cases:
1285 * o No MD5 hash and one expected.
1286 * o MD5 hash and we're not expecting one.
 1287	 * o MD5 hash and it's wrong.
1289 const __u8 *hash_location = NULL;
1290 struct tcp_md5sig_key *hash_expected;
1291 const struct iphdr *iph = ip_hdr(skb);
1292 const struct tcphdr *th = tcp_hdr(skb);
1293 int genhash;
1294 unsigned char newhash[16];
1296 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1297 AF_INET);
1298 hash_location = tcp_parse_md5sig_option(th);
1300 /* We've parsed the options - do we have a hash? */
1301 if (!hash_expected && !hash_location)
1302 return false;
1304 if (hash_expected && !hash_location) {
1305 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1306 return true;
1309 if (!hash_expected && hash_location) {
1310 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1311 return true;
1314 /* Okay, so this is hash_expected and hash_location -
1315 * so we need to calculate the checksum.
1317 genhash = tcp_v4_md5_hash_skb(newhash,
1318 hash_expected,
1319 NULL, skb);
1321 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1322 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1323 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1324 &iph->saddr, ntohs(th->source),
1325 &iph->daddr, ntohs(th->dest),
1326 genhash ? " tcp_v4_calc_md5_hash failed"
1327 : "");
1328 return true;
1330 return false;
1331 #endif
1332 return false;
1335 static void tcp_v4_init_req(struct request_sock *req,
1336 const struct sock *sk_listener,
1337 struct sk_buff *skb)
1339 struct inet_request_sock *ireq = inet_rsk(req);
1340 struct net *net = sock_net(sk_listener);
1342 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1343 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1344 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1347 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1348 struct flowi *fl,
1349 const struct request_sock *req)
1351 return inet_csk_route_req(sk, &fl->u.ip4, req);
1354 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1355 .family = PF_INET,
1356 .obj_size = sizeof(struct tcp_request_sock),
1357 .rtx_syn_ack = tcp_rtx_synack,
1358 .send_ack = tcp_v4_reqsk_send_ack,
1359 .destructor = tcp_v4_reqsk_destructor,
1360 .send_reset = tcp_v4_send_reset,
1361 .syn_ack_timeout = tcp_syn_ack_timeout,
1364 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1365 .mss_clamp = TCP_MSS_DEFAULT,
1366 #ifdef CONFIG_TCP_MD5SIG
1367 .req_md5_lookup = tcp_v4_md5_lookup,
1368 .calc_md5_hash = tcp_v4_md5_hash_skb,
1369 #endif
1370 .init_req = tcp_v4_init_req,
1371 #ifdef CONFIG_SYN_COOKIES
1372 .cookie_init_seq = cookie_v4_init_sequence,
1373 #endif
1374 .route_req = tcp_v4_route_req,
1375 .init_seq = tcp_v4_init_seq,
1376 .init_ts_off = tcp_v4_init_ts_off,
1377 .send_synack = tcp_v4_send_synack,
1380 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 1382	/* Never answer to SYNs sent to broadcast or multicast */
1383 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1384 goto drop;
1386 return tcp_conn_request(&tcp_request_sock_ops,
1387 &tcp_request_sock_ipv4_ops, sk, skb);
1389 drop:
1390 tcp_listendrop(sk);
1391 return 0;
1393 EXPORT_SYMBOL(tcp_v4_conn_request);
1397 * The three way handshake has completed - we got a valid synack -
1398 * now create the new socket.
1400 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1401 struct request_sock *req,
1402 struct dst_entry *dst,
1403 struct request_sock *req_unhash,
1404 bool *own_req)
1406 struct inet_request_sock *ireq;
1407 struct inet_sock *newinet;
1408 struct tcp_sock *newtp;
1409 struct sock *newsk;
1410 #ifdef CONFIG_TCP_MD5SIG
1411 struct tcp_md5sig_key *key;
1412 #endif
1413 struct ip_options_rcu *inet_opt;
1415 if (sk_acceptq_is_full(sk))
1416 goto exit_overflow;
1418 newsk = tcp_create_openreq_child(sk, req, skb);
1419 if (!newsk)
1420 goto exit_nonewsk;
1422 newsk->sk_gso_type = SKB_GSO_TCPV4;
1423 inet_sk_rx_dst_set(newsk, skb);
1425 newtp = tcp_sk(newsk);
1426 newinet = inet_sk(newsk);
1427 ireq = inet_rsk(req);
1428 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1429 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1430 newsk->sk_bound_dev_if = ireq->ir_iif;
1431 newinet->inet_saddr = ireq->ir_loc_addr;
1432 inet_opt = rcu_dereference(ireq->ireq_opt);
1433 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1434 newinet->mc_index = inet_iif(skb);
1435 newinet->mc_ttl = ip_hdr(skb)->ttl;
1436 newinet->rcv_tos = ip_hdr(skb)->tos;
1437 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1438 if (inet_opt)
1439 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1440 newinet->inet_id = newtp->write_seq ^ jiffies;
1442 if (!dst) {
1443 dst = inet_csk_route_child_sock(sk, newsk, req);
1444 if (!dst)
1445 goto put_and_exit;
1446 } else {
1447 /* syncookie case : see end of cookie_v4_check() */
1449 sk_setup_caps(newsk, dst);
1451 tcp_ca_openreq_child(newsk, dst);
1453 tcp_sync_mss(newsk, dst_mtu(dst));
1454 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1456 tcp_initialize_rcv_mss(newsk);
1458 #ifdef CONFIG_TCP_MD5SIG
1459 /* Copy over the MD5 key from the original socket */
1460 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1461 AF_INET);
1462 if (key) {
1464 * We're using one, so create a matching key
1465 * on the newsk structure. If we fail to get
1466 * memory, then we end up not copying the key
1467 * across. Shucks.
1469 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1470 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1471 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1473 #endif
1475 if (__inet_inherit_port(sk, newsk) < 0)
1476 goto put_and_exit;
1477 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1478 if (likely(*own_req)) {
1479 tcp_move_syn(newtp, req);
1480 ireq->ireq_opt = NULL;
1481 } else {
1482 newinet->inet_opt = NULL;
1484 return newsk;
1486 exit_overflow:
1487 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488 exit_nonewsk:
1489 dst_release(dst);
1490 exit:
1491 tcp_listendrop(sk);
1492 return NULL;
1493 put_and_exit:
1494 newinet->inet_opt = NULL;
1495 inet_csk_prepare_forced_close(newsk);
1496 tcp_done(newsk);
1497 goto exit;
1499 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1501 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1503 #ifdef CONFIG_SYN_COOKIES
1504 const struct tcphdr *th = tcp_hdr(skb);
1506 if (!th->syn)
1507 sk = cookie_v4_check(sk, skb);
1508 #endif
1509 return sk;
 1512 /* The socket must have its spinlock held when we get
1513 * here, unless it is a TCP_LISTEN socket.
1515 * We have a potential double-lock case here, so even when
1516 * doing backlog processing we use the BH locking scheme.
1517 * This is because we cannot sleep with the original spinlock
1518 * held.
1520 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1522 struct sock *rsk;
1524 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1525 struct dst_entry *dst = sk->sk_rx_dst;
1527 sock_rps_save_rxhash(sk, skb);
1528 sk_mark_napi_id(sk, skb);
1529 if (dst) {
1530 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1531 !dst->ops->check(dst, 0)) {
1532 dst_release(dst);
1533 sk->sk_rx_dst = NULL;
1536 tcp_rcv_established(sk, skb);
1537 return 0;
1540 if (tcp_checksum_complete(skb))
1541 goto csum_err;
1543 if (sk->sk_state == TCP_LISTEN) {
1544 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1546 if (!nsk)
1547 goto discard;
1548 if (nsk != sk) {
1549 if (tcp_child_process(sk, nsk, skb)) {
1550 rsk = nsk;
1551 goto reset;
1553 return 0;
1555 } else
1556 sock_rps_save_rxhash(sk, skb);
1558 if (tcp_rcv_state_process(sk, skb)) {
1559 rsk = sk;
1560 goto reset;
1562 return 0;
1564 reset:
1565 tcp_v4_send_reset(rsk, skb);
1566 discard:
1567 kfree_skb(skb);
1568 /* Be careful here. If this function gets more complicated and
1569 * gcc suffers from register pressure on the x86, sk (in %ebx)
1570 * might be destroyed here. This current version compiles correctly,
1571 * but you have been warned.
1573 return 0;
1575 csum_err:
1576 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1577 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1578 goto discard;
1580 EXPORT_SYMBOL(tcp_v4_do_rcv);
1582 int tcp_v4_early_demux(struct sk_buff *skb)
1584 const struct iphdr *iph;
1585 const struct tcphdr *th;
1586 struct sock *sk;
1588 if (skb->pkt_type != PACKET_HOST)
1589 return 0;
1591 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1592 return 0;
1594 iph = ip_hdr(skb);
1595 th = tcp_hdr(skb);
1597 if (th->doff < sizeof(struct tcphdr) / 4)
1598 return 0;
1600 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1601 iph->saddr, th->source,
1602 iph->daddr, ntohs(th->dest),
1603 skb->skb_iif, inet_sdif(skb));
1604 if (sk) {
1605 skb->sk = sk;
1606 skb->destructor = sock_edemux;
1607 if (sk_fullsock(sk)) {
1608 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1610 if (dst)
1611 dst = dst_check(dst, 0);
1612 if (dst &&
1613 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1614 skb_dst_set_noref(skb, dst);
1617 return 0;
1620 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1622 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1623 struct skb_shared_info *shinfo;
1624 const struct tcphdr *th;
1625 struct tcphdr *thtail;
1626 struct sk_buff *tail;
1627 unsigned int hdrlen;
1628 bool fragstolen;
1629 u32 gso_segs;
1630 int delta;
1632 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1633 * we can fix skb->truesize to its real value to avoid future drops.
1634 * This is valid because skb is not yet charged to the socket.
 1635	 * It has been noticed that pure SACK packets were sometimes dropped
1636 * (if cooked by drivers without copybreak feature).
1638 skb_condense(skb);
1640 skb_dst_drop(skb);
1642 if (unlikely(tcp_checksum_complete(skb))) {
1643 bh_unlock_sock(sk);
1644 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1645 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1646 return true;
1649 /* Attempt coalescing to last skb in backlog, even if we are
1650 * above the limits.
1651 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1653 th = (const struct tcphdr *)skb->data;
1654 hdrlen = th->doff * 4;
1655 shinfo = skb_shinfo(skb);
1657 if (!shinfo->gso_size)
1658 shinfo->gso_size = skb->len - hdrlen;
1660 if (!shinfo->gso_segs)
1661 shinfo->gso_segs = 1;
1663 tail = sk->sk_backlog.tail;
1664 if (!tail)
1665 goto no_coalesce;
1666 thtail = (struct tcphdr *)tail->data;
1668 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1669 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1670 ((TCP_SKB_CB(tail)->tcp_flags |
1671 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1672 !((TCP_SKB_CB(tail)->tcp_flags &
1673 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1674 ((TCP_SKB_CB(tail)->tcp_flags ^
1675 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1676 #ifdef CONFIG_TLS_DEVICE
1677 tail->decrypted != skb->decrypted ||
1678 #endif
1679 thtail->doff != th->doff ||
1680 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1681 goto no_coalesce;
1683 __skb_pull(skb, hdrlen);
1684 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1685 thtail->window = th->window;
1687 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1689 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1690 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1692 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1693 * thtail->fin, so that the fast path in tcp_rcv_established()
1694 * is not entered if we append a packet with a FIN.
1695 * SYN, RST, URG are not present.
1696 * ACK is set on both packets.
1697 * PSH : we do not really care in TCP stack,
1698 * at least for 'GRO' packets.
1700 thtail->fin |= th->fin;
1701 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1703 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1704 TCP_SKB_CB(tail)->has_rxtstamp = true;
1705 tail->tstamp = skb->tstamp;
1706 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1709 /* Not as strict as GRO. We only need to carry mss max value */
1710 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1711 skb_shinfo(tail)->gso_size);
1713 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1714 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1716 sk->sk_backlog.len += delta;
1717 __NET_INC_STATS(sock_net(sk),
1718 LINUX_MIB_TCPBACKLOGCOALESCE);
1719 kfree_skb_partial(skb, fragstolen);
1720 return false;
1722 __skb_push(skb, hdrlen);
1724 no_coalesce:
1725 /* Only socket owner can try to collapse/prune rx queues
1726 * to reduce memory overhead, so add a little headroom here.
 1727	 * Only a few socket backlogs are likely to be concurrently non-empty.
1729 limit += 64*1024;
1731 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1732 bh_unlock_sock(sk);
1733 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1734 return true;
1736 return false;
1738 EXPORT_SYMBOL(tcp_add_backlog);
1740 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1742 struct tcphdr *th = (struct tcphdr *)skb->data;
1744 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1746 EXPORT_SYMBOL(tcp_filter);
1748 static void tcp_v4_restore_cb(struct sk_buff *skb)
1750 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1751 sizeof(struct inet_skb_parm));
1754 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1755 const struct tcphdr *th)
1757 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
 1758	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1760 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1761 sizeof(struct inet_skb_parm));
1762 barrier();
1764 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1765 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1766 skb->len - th->doff * 4);
1767 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1768 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1769 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1770 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1771 TCP_SKB_CB(skb)->sacked = 0;
1772 TCP_SKB_CB(skb)->has_rxtstamp =
1773 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1777 * From tcp_input.c
1780 int tcp_v4_rcv(struct sk_buff *skb)
1782 struct net *net = dev_net(skb->dev);
1783 struct sk_buff *skb_to_free;
1784 int sdif = inet_sdif(skb);
1785 const struct iphdr *iph;
1786 const struct tcphdr *th;
1787 bool refcounted;
1788 struct sock *sk;
1789 int ret;
1791 if (skb->pkt_type != PACKET_HOST)
1792 goto discard_it;
1794 /* Count it even if it's bad */
1795 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1797 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1798 goto discard_it;
1800 th = (const struct tcphdr *)skb->data;
1802 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1803 goto bad_packet;
1804 if (!pskb_may_pull(skb, th->doff * 4))
1805 goto discard_it;
1807 /* An explanation is required here, I think.
1808 * Packet length and doff are validated by header prediction,
1809 * provided case of th->doff==0 is eliminated.
1810 * So, we defer the checks. */
1812 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1813 goto csum_error;
1815 th = (const struct tcphdr *)skb->data;
1816 iph = ip_hdr(skb);
1817 lookup:
1818 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1819 th->dest, sdif, &refcounted);
1820 if (!sk)
1821 goto no_tcp_socket;
1823 process:
1824 if (sk->sk_state == TCP_TIME_WAIT)
1825 goto do_time_wait;
1827 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1828 struct request_sock *req = inet_reqsk(sk);
1829 bool req_stolen = false;
1830 struct sock *nsk;
1832 sk = req->rsk_listener;
1833 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1834 sk_drops_add(sk, skb);
1835 reqsk_put(req);
1836 goto discard_it;
1838 if (tcp_checksum_complete(skb)) {
1839 reqsk_put(req);
1840 goto csum_error;
1842 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1843 inet_csk_reqsk_queue_drop_and_put(sk, req);
1844 goto lookup;
1846 /* We own a reference on the listener, increase it again
1847 * as we might lose it too soon.
1849 sock_hold(sk);
1850 refcounted = true;
1851 nsk = NULL;
1852 if (!tcp_filter(sk, skb)) {
1853 th = (const struct tcphdr *)skb->data;
1854 iph = ip_hdr(skb);
1855 tcp_v4_fill_cb(skb, iph, th);
1856 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1858 if (!nsk) {
1859 reqsk_put(req);
1860 if (req_stolen) {
1861 /* Another cpu got exclusive access to req
1862 * and created a full blown socket.
1863 * Try to feed this packet to this socket
1864 * instead of discarding it.
1866 tcp_v4_restore_cb(skb);
1867 sock_put(sk);
1868 goto lookup;
1870 goto discard_and_relse;
1872 if (nsk == sk) {
1873 reqsk_put(req);
1874 tcp_v4_restore_cb(skb);
1875 } else if (tcp_child_process(sk, nsk, skb)) {
1876 tcp_v4_send_reset(nsk, skb);
1877 goto discard_and_relse;
1878 } else {
1879 sock_put(sk);
1880 return 0;
1883 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1884 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1885 goto discard_and_relse;
1888 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1889 goto discard_and_relse;
1891 if (tcp_v4_inbound_md5_hash(sk, skb))
1892 goto discard_and_relse;
1894 nf_reset(skb);
1896 if (tcp_filter(sk, skb))
1897 goto discard_and_relse;
1898 th = (const struct tcphdr *)skb->data;
1899 iph = ip_hdr(skb);
1900 tcp_v4_fill_cb(skb, iph, th);
1902 skb->dev = NULL;
1904 if (sk->sk_state == TCP_LISTEN) {
1905 ret = tcp_v4_do_rcv(sk, skb);
1906 goto put_and_return;
1909 sk_incoming_cpu_update(sk);
1911 bh_lock_sock_nested(sk);
1912 tcp_segs_in(tcp_sk(sk), skb);
1913 ret = 0;
1914 if (!sock_owned_by_user(sk)) {
1915 skb_to_free = sk->sk_rx_skb_cache;
1916 sk->sk_rx_skb_cache = NULL;
1917 ret = tcp_v4_do_rcv(sk, skb);
1918 } else {
1919 if (tcp_add_backlog(sk, skb))
1920 goto discard_and_relse;
1921 skb_to_free = NULL;
1923 bh_unlock_sock(sk);
1924 if (skb_to_free)
1925 __kfree_skb(skb_to_free);
1927 put_and_return:
1928 if (refcounted)
1929 sock_put(sk);
1931 return ret;
1933 no_tcp_socket:
1934 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1935 goto discard_it;
1937 tcp_v4_fill_cb(skb, iph, th);
1939 if (tcp_checksum_complete(skb)) {
1940 csum_error:
1941 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1942 bad_packet:
1943 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1944 } else {
1945 tcp_v4_send_reset(NULL, skb);
1948 discard_it:
1949 /* Discard frame. */
1950 kfree_skb(skb);
1951 return 0;
1953 discard_and_relse:
1954 sk_drops_add(sk, skb);
1955 if (refcounted)
1956 sock_put(sk);
1957 goto discard_it;
1959 do_time_wait:
1960 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1961 inet_twsk_put(inet_twsk(sk));
1962 goto discard_it;
1965 tcp_v4_fill_cb(skb, iph, th);
1967 if (tcp_checksum_complete(skb)) {
1968 inet_twsk_put(inet_twsk(sk));
1969 goto csum_error;
1971 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1972 case TCP_TW_SYN: {
1973 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1974 &tcp_hashinfo, skb,
1975 __tcp_hdrlen(th),
1976 iph->saddr, th->source,
1977 iph->daddr, th->dest,
1978 inet_iif(skb),
1979 sdif);
1980 if (sk2) {
1981 inet_twsk_deschedule_put(inet_twsk(sk));
1982 sk = sk2;
1983 tcp_v4_restore_cb(skb);
1984 refcounted = false;
1985 goto process;
1988 /* to ACK */
1989 /* fall through */
1990 case TCP_TW_ACK:
1991 tcp_v4_timewait_ack(sk, skb);
1992 break;
1993 case TCP_TW_RST:
1994 tcp_v4_send_reset(sk, skb);
1995 inet_twsk_deschedule_put(inet_twsk(sk));
1996 goto discard_it;
1997 case TCP_TW_SUCCESS:;
1999 goto discard_it;
2002 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2003 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2004 .twsk_unique = tcp_twsk_unique,
2005 .twsk_destructor= tcp_twsk_destructor,
2008 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2010 struct dst_entry *dst = skb_dst(skb);
2012 if (dst && dst_hold_safe(dst)) {
2013 sk->sk_rx_dst = dst;
2014 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2017 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2019 const struct inet_connection_sock_af_ops ipv4_specific = {
2020 .queue_xmit = ip_queue_xmit,
2021 .send_check = tcp_v4_send_check,
2022 .rebuild_header = inet_sk_rebuild_header,
2023 .sk_rx_dst_set = inet_sk_rx_dst_set,
2024 .conn_request = tcp_v4_conn_request,
2025 .syn_recv_sock = tcp_v4_syn_recv_sock,
2026 .net_header_len = sizeof(struct iphdr),
2027 .setsockopt = ip_setsockopt,
2028 .getsockopt = ip_getsockopt,
2029 .addr2sockaddr = inet_csk_addr2sockaddr,
2030 .sockaddr_len = sizeof(struct sockaddr_in),
2031 #ifdef CONFIG_COMPAT
2032 .compat_setsockopt = compat_ip_setsockopt,
2033 .compat_getsockopt = compat_ip_getsockopt,
2034 #endif
2035 .mtu_reduced = tcp_v4_mtu_reduced,
2036 };
2037 EXPORT_SYMBOL(ipv4_specific);
2039 #ifdef CONFIG_TCP_MD5SIG
2040 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2041 .md5_lookup = tcp_v4_md5_lookup,
2042 .calc_md5_hash = tcp_v4_md5_hash_skb,
2043 .md5_parse = tcp_v4_parse_md5_keys,
2044 };
2045 #endif
2047 /* NOTE: A lot of things set to zero explicitly by call to
2048 * sk_alloc() so need not be done here.
2049 */
2050 static int tcp_v4_init_sock(struct sock *sk)
2051 {
2052 struct inet_connection_sock *icsk = inet_csk(sk);
2054 tcp_init_sock(sk);
2056 icsk->icsk_af_ops = &ipv4_specific;
2058 #ifdef CONFIG_TCP_MD5SIG
2059 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2060 #endif
2062 return 0;
2063 }
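/* Tear down all per-socket TCP state: timers, congestion control, ULP,
 * write and out-of-order queues, MD5 keys, the bound port and Fast Open
 * state.
 */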
2065 void tcp_v4_destroy_sock(struct sock *sk)
2066 {
2067 struct tcp_sock *tp = tcp_sk(sk);
2069 trace_tcp_destroy_sock(sk);
2071 tcp_clear_xmit_timers(sk);
2073 tcp_cleanup_congestion_control(sk);
2075 tcp_cleanup_ulp(sk);
2077 /* Clean up the write buffer. */
2078 tcp_write_queue_purge(sk);
2080 /* Check if we want to disable active TFO */
2081 tcp_fastopen_active_disable_ofo_check(sk);
2083 /* Cleans up our, hopefully empty, out_of_order_queue. */
2084 skb_rbtree_purge(&tp->out_of_order_queue);
2086 #ifdef CONFIG_TCP_MD5SIG
2087 /* Clean up the MD5 key list, if any */
2088 if (tp->md5sig_info) {
2089 tcp_clear_md5_list(sk);
2090 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2091 tp->md5sig_info = NULL;
2092 }
2093 #endif
2095 /* Clean up a referenced TCP bind bucket. */
2096 if (inet_csk(sk)->icsk_bind_hash)
2097 inet_put_port(sk);
2099 BUG_ON(tp->fastopen_rsk);
2101 /* If socket is aborted during connect operation */
2102 tcp_free_fastopen_req(tp);
2103 tcp_fastopen_destroy_cipher(sk);
2104 tcp_saved_syn_free(tp);
2106 sk_sockets_allocated_dec(sk);
2107 }
2108 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2110 #ifdef CONFIG_PROC_FS
2111 /* Proc filesystem TCP sock list dumping. */
2113 /*
2114 * Get the next listener socket following cur. If cur is NULL, get the first socket
2115 * starting from the bucket given in st->bucket; when st->bucket is zero the
2116 * very first socket in the hash table is returned.
2117 */
2118 static void *listening_get_next(struct seq_file *seq, void *cur)
2119 {
2120 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2121 struct tcp_iter_state *st = seq->private;
2122 struct net *net = seq_file_net(seq);
2123 struct inet_listen_hashbucket *ilb;
2124 struct sock *sk = cur;
2126 if (!sk) {
2127 get_head:
2128 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2129 spin_lock(&ilb->lock);
2130 sk = sk_head(&ilb->head);
2131 st->offset = 0;
2132 goto get_sk;
2133 }
2134 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2135 ++st->num;
2136 ++st->offset;
2138 sk = sk_next(sk);
2139 get_sk:
2140 sk_for_each_from(sk) {
2141 if (!net_eq(sock_net(sk), net))
2142 continue;
2143 if (sk->sk_family == afinfo->family)
2144 return sk;
2145 }
2146 spin_unlock(&ilb->lock);
2147 st->offset = 0;
2148 if (++st->bucket < INET_LHTABLE_SIZE)
2149 goto get_head;
2150 return NULL;
2151 }
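/* Return the listening socket at index *pos, counting from the first bucket;
 * *pos is decremented as sockets are skipped.
 */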
2153 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2154 {
2155 struct tcp_iter_state *st = seq->private;
2156 void *rc;
2158 st->bucket = 0;
2159 st->offset = 0;
2160 rc = listening_get_next(seq, NULL);
2162 while (rc && *pos) {
2163 rc = listening_get_next(seq, rc);
2164 --*pos;
2165 }
2166 return rc;
2167 }
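/* Lockless check whether the current established-hash bucket is empty. */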
2169 static inline bool empty_bucket(const struct tcp_iter_state *st)
2170 {
2171 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2172 }
2174 /*
2175 * Get first established socket starting from bucket given in st->bucket.
2176 * If st->bucket is zero, the very first socket in the hash is returned.
2177 */
2178 static void *established_get_first(struct seq_file *seq)
2179 {
2180 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2181 struct tcp_iter_state *st = seq->private;
2182 struct net *net = seq_file_net(seq);
2183 void *rc = NULL;
2185 st->offset = 0;
2186 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2187 struct sock *sk;
2188 struct hlist_nulls_node *node;
2189 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2191 /* Lockless fast path for the common case of empty buckets */
2192 if (empty_bucket(st))
2193 continue;
2195 spin_lock_bh(lock);
2196 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2197 if (sk->sk_family != afinfo->family ||
2198 !net_eq(sock_net(sk), net)) {
2199 continue;
2200 }
2201 rc = sk;
2202 goto out;
2203 }
2204 spin_unlock_bh(lock);
2205 }
2206 out:
2207 return rc;
2208 }
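/* Move to the next socket in the established hash, releasing the current
 * bucket lock and advancing to the next bucket when the chain ends.
 */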
2210 static void *established_get_next(struct seq_file *seq, void *cur)
2211 {
2212 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2213 struct sock *sk = cur;
2214 struct hlist_nulls_node *node;
2215 struct tcp_iter_state *st = seq->private;
2216 struct net *net = seq_file_net(seq);
2218 ++st->num;
2219 ++st->offset;
2221 sk = sk_nulls_next(sk);
2223 sk_nulls_for_each_from(sk, node) {
2224 if (sk->sk_family == afinfo->family &&
2225 net_eq(sock_net(sk), net))
2226 return sk;
2227 }
2229 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2230 ++st->bucket;
2231 return established_get_first(seq);
2232 }
2234 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2235 {
2236 struct tcp_iter_state *st = seq->private;
2237 void *rc;
2239 st->bucket = 0;
2240 rc = established_get_first(seq);
2242 while (rc && pos) {
2243 rc = established_get_next(seq, rc);
2244 --pos;
2245 }
2246 return rc;
2247 }
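/* Position the iterator at index pos: listening sockets are walked first,
 * then the established hash.
 */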
2249 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2250 {
2251 void *rc;
2252 struct tcp_iter_state *st = seq->private;
2254 st->state = TCP_SEQ_STATE_LISTENING;
2255 rc = listening_get_idx(seq, &pos);
2257 if (!rc) {
2258 st->state = TCP_SEQ_STATE_ESTABLISHED;
2259 rc = established_get_idx(seq, pos);
2260 }
2262 return rc;
2263 }
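/* Try to continue from the bucket and in-bucket offset left by the previous
 * read instead of rescanning from the start; st->num is preserved.
 */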
2265 static void *tcp_seek_last_pos(struct seq_file *seq)
2266 {
2267 struct tcp_iter_state *st = seq->private;
2268 int offset = st->offset;
2269 int orig_num = st->num;
2270 void *rc = NULL;
2272 switch (st->state) {
2273 case TCP_SEQ_STATE_LISTENING:
2274 if (st->bucket >= INET_LHTABLE_SIZE)
2275 break;
2276 st->state = TCP_SEQ_STATE_LISTENING;
2277 rc = listening_get_next(seq, NULL);
2278 while (offset-- && rc)
2279 rc = listening_get_next(seq, rc);
2280 if (rc)
2281 break;
2282 st->bucket = 0;
2283 st->state = TCP_SEQ_STATE_ESTABLISHED;
2284 /* Fallthrough */
2285 case TCP_SEQ_STATE_ESTABLISHED:
2286 if (st->bucket > tcp_hashinfo.ehash_mask)
2287 break;
2288 rc = established_get_first(seq);
2289 while (offset-- && rc)
2290 rc = established_get_next(seq, rc);
2291 }
2293 st->num = orig_num;
2295 return rc;
2296 }
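/* seq_file ->start(): resume at the saved position when the requested *pos
 * matches the last one handed out, otherwise restart the walk.
 */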
2298 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2299 {
2300 struct tcp_iter_state *st = seq->private;
2301 void *rc;
2303 if (*pos && *pos == st->last_pos) {
2304 rc = tcp_seek_last_pos(seq);
2305 if (rc)
2306 goto out;
2307 }
2309 st->state = TCP_SEQ_STATE_LISTENING;
2310 st->num = 0;
2311 st->bucket = 0;
2312 st->offset = 0;
2313 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2315 out:
2316 st->last_pos = *pos;
2317 return rc;
2318 }
2319 EXPORT_SYMBOL(tcp_seq_start);
2321 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2322 {
2323 struct tcp_iter_state *st = seq->private;
2324 void *rc = NULL;
2326 if (v == SEQ_START_TOKEN) {
2327 rc = tcp_get_idx(seq, 0);
2328 goto out;
2329 }
2331 switch (st->state) {
2332 case TCP_SEQ_STATE_LISTENING:
2333 rc = listening_get_next(seq, v);
2334 if (!rc) {
2335 st->state = TCP_SEQ_STATE_ESTABLISHED;
2336 st->bucket = 0;
2337 st->offset = 0;
2338 rc = established_get_first(seq);
2339 }
2340 break;
2341 case TCP_SEQ_STATE_ESTABLISHED:
2342 rc = established_get_next(seq, v);
2343 break;
2344 }
2345 out:
2346 ++*pos;
2347 st->last_pos = *pos;
2348 return rc;
2349 }
2350 EXPORT_SYMBOL(tcp_seq_next);
2352 void tcp_seq_stop(struct seq_file *seq, void *v)
2353 {
2354 struct tcp_iter_state *st = seq->private;
2356 switch (st->state) {
2357 case TCP_SEQ_STATE_LISTENING:
2358 if (v != SEQ_START_TOKEN)
2359 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2360 break;
2361 case TCP_SEQ_STATE_ESTABLISHED:
2362 if (v)
2363 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2364 break;
2365 }
2366 }
2367 EXPORT_SYMBOL(tcp_seq_stop);
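/* Print one SYN_RECV request socket in /proc/net/tcp format. */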
2369 static void get_openreq4(const struct request_sock *req,
2370 struct seq_file *f, int i)
2371 {
2372 const struct inet_request_sock *ireq = inet_rsk(req);
2373 long delta = req->rsk_timer.expires - jiffies;
2375 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2376 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2378 ireq->ir_loc_addr,
2379 ireq->ir_num,
2380 ireq->ir_rmt_addr,
2381 ntohs(ireq->ir_rmt_port),
2382 TCP_SYN_RECV,
2383 0, 0, /* could print option size, but that is af dependent. */
2384 1, /* timers active (only the expire timer) */
2385 jiffies_delta_to_clock_t(delta),
2386 req->num_timeout,
2387 from_kuid_munged(seq_user_ns(f),
2388 sock_i_uid(req->rsk_listener)),
2389 0, /* non standard timer */
2390 0, /* open_requests have no inode */
2391 0,
2392 req);
2393 }
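/* Print one full TCP socket in /proc/net/tcp format. timer_active encodes
 * the pending timer: 1 retransmit/loss probe, 2 sk_timer (typically
 * keepalive), 4 zero-window probe, 0 none.
 */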
2395 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2396 {
2397 int timer_active;
2398 unsigned long timer_expires;
2399 const struct tcp_sock *tp = tcp_sk(sk);
2400 const struct inet_connection_sock *icsk = inet_csk(sk);
2401 const struct inet_sock *inet = inet_sk(sk);
2402 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2403 __be32 dest = inet->inet_daddr;
2404 __be32 src = inet->inet_rcv_saddr;
2405 __u16 destp = ntohs(inet->inet_dport);
2406 __u16 srcp = ntohs(inet->inet_sport);
2407 int rx_queue;
2408 int state;
2410 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2411 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2412 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2413 timer_active = 1;
2414 timer_expires = icsk->icsk_timeout;
2415 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2416 timer_active = 4;
2417 timer_expires = icsk->icsk_timeout;
2418 } else if (timer_pending(&sk->sk_timer)) {
2419 timer_active = 2;
2420 timer_expires = sk->sk_timer.expires;
2421 } else {
2422 timer_active = 0;
2423 timer_expires = jiffies;
2424 }
2426 state = inet_sk_state_load(sk);
2427 if (state == TCP_LISTEN)
2428 rx_queue = sk->sk_ack_backlog;
2429 else
2430 /* Because we don't lock the socket,
2431 * we might find a transient negative value.
2432 */
2433 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2435 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2436 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2437 i, src, srcp, dest, destp, state,
2438 tp->write_seq - tp->snd_una,
2439 rx_queue,
2440 timer_active,
2441 jiffies_delta_to_clock_t(timer_expires - jiffies),
2442 icsk->icsk_retransmits,
2443 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2444 icsk->icsk_probes_out,
2445 sock_i_ino(sk),
2446 refcount_read(&sk->sk_refcnt), sk,
2447 jiffies_to_clock_t(icsk->icsk_rto),
2448 jiffies_to_clock_t(icsk->icsk_ack.ato),
2449 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2450 tp->snd_cwnd,
2451 state == TCP_LISTEN ?
2452 fastopenq->max_qlen :
2453 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2454 }
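/* Print one TIME-WAIT socket in /proc/net/tcp format. */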
2456 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2457 struct seq_file *f, int i)
2458 {
2459 long delta = tw->tw_timer.expires - jiffies;
2460 __be32 dest, src;
2461 __u16 destp, srcp;
2463 dest = tw->tw_daddr;
2464 src = tw->tw_rcv_saddr;
2465 destp = ntohs(tw->tw_dport);
2466 srcp = ntohs(tw->tw_sport);
2468 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2469 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2470 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2471 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2472 refcount_read(&tw->tw_refcnt), tw);
2473 }
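/* Width of one /proc/net/tcp record; tcp4_seq_show() pads every line to
 * TMPSZ - 1 characters via seq_setwidth()/seq_pad().
 */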
2475 #define TMPSZ 150
2477 static int tcp4_seq_show(struct seq_file *seq, void *v)
2478 {
2479 struct tcp_iter_state *st;
2480 struct sock *sk = v;
2482 seq_setwidth(seq, TMPSZ - 1);
2483 if (v == SEQ_START_TOKEN) {
2484 seq_puts(seq, " sl local_address rem_address st tx_queue "
2485 "rx_queue tr tm->when retrnsmt uid timeout "
2486 "inode");
2487 goto out;
2488 }
2489 st = seq->private;
2491 if (sk->sk_state == TCP_TIME_WAIT)
2492 get_timewait4_sock(v, seq, st->num);
2493 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2494 get_openreq4(v, seq, st->num);
2495 else
2496 get_tcp4_sock(v, seq, st->num);
2497 out:
2498 seq_pad(seq, '\n');
2499 return 0;
2500 }
2502 static const struct seq_operations tcp4_seq_ops = {
2503 .show = tcp4_seq_show,
2504 .start = tcp_seq_start,
2505 .next = tcp_seq_next,
2506 .stop = tcp_seq_stop,
2507 };
2509 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2510 .family = AF_INET,
2511 };
2513 static int __net_init tcp4_proc_init_net(struct net *net)
2514 {
2515 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2516 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2517 return -ENOMEM;
2518 return 0;
2519 }
2521 static void __net_exit tcp4_proc_exit_net(struct net *net)
2522 {
2523 remove_proc_entry("tcp", net->proc_net);
2524 }
2526 static struct pernet_operations tcp4_net_ops = {
2527 .init = tcp4_proc_init_net,
2528 .exit = tcp4_proc_exit_net,
2529 };
2531 int __init tcp4_proc_init(void)
2532 {
2533 return register_pernet_subsys(&tcp4_net_ops);
2534 }
2536 void tcp4_proc_exit(void)
2537 {
2538 unregister_pernet_subsys(&tcp4_net_ops);
2539 }
2540 #endif /* CONFIG_PROC_FS */
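/* The struct proto instance behind IPv4 SOCK_STREAM/IPPROTO_TCP sockets;
 * shared across network namespaces.
 */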
2542 struct proto tcp_prot = {
2543 .name = "TCP",
2544 .owner = THIS_MODULE,
2545 .close = tcp_close,
2546 .pre_connect = tcp_v4_pre_connect,
2547 .connect = tcp_v4_connect,
2548 .disconnect = tcp_disconnect,
2549 .accept = inet_csk_accept,
2550 .ioctl = tcp_ioctl,
2551 .init = tcp_v4_init_sock,
2552 .destroy = tcp_v4_destroy_sock,
2553 .shutdown = tcp_shutdown,
2554 .setsockopt = tcp_setsockopt,
2555 .getsockopt = tcp_getsockopt,
2556 .keepalive = tcp_set_keepalive,
2557 .recvmsg = tcp_recvmsg,
2558 .sendmsg = tcp_sendmsg,
2559 .sendpage = tcp_sendpage,
2560 .backlog_rcv = tcp_v4_do_rcv,
2561 .release_cb = tcp_release_cb,
2562 .hash = inet_hash,
2563 .unhash = inet_unhash,
2564 .get_port = inet_csk_get_port,
2565 .enter_memory_pressure = tcp_enter_memory_pressure,
2566 .leave_memory_pressure = tcp_leave_memory_pressure,
2567 .stream_memory_free = tcp_stream_memory_free,
2568 .sockets_allocated = &tcp_sockets_allocated,
2569 .orphan_count = &tcp_orphan_count,
2570 .memory_allocated = &tcp_memory_allocated,
2571 .memory_pressure = &tcp_memory_pressure,
2572 .sysctl_mem = sysctl_tcp_mem,
2573 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2574 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2575 .max_header = MAX_TCP_HEADER,
2576 .obj_size = sizeof(struct tcp_sock),
2577 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2578 .twsk_prot = &tcp_timewait_sock_ops,
2579 .rsk_prot = &tcp_request_sock_ops,
2580 .h.hashinfo = &tcp_hashinfo,
2581 .no_autobind = true,
2582 #ifdef CONFIG_COMPAT
2583 .compat_setsockopt = compat_tcp_setsockopt,
2584 .compat_getsockopt = compat_tcp_getsockopt,
2585 #endif
2586 .diag_destroy = tcp_abort,
2587 };
2588 EXPORT_SYMBOL(tcp_prot);
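/* Per-namespace teardown: drop the congestion control module reference and
 * free the per-CPU control sockets.
 */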
2590 static void __net_exit tcp_sk_exit(struct net *net)
2591 {
2592 int cpu;
2594 if (net->ipv4.tcp_congestion_control)
2595 module_put(net->ipv4.tcp_congestion_control->owner);
2597 for_each_possible_cpu(cpu)
2598 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2599 free_percpu(net->ipv4.tcp_sk);
2600 }
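/* Per-namespace init: create one control socket per possible CPU (used for
 * sending resets and ACKs) and install the namespace's TCP sysctl defaults.
 */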
2602 static int __net_init tcp_sk_init(struct net *net)
2603 {
2604 int res, cpu, cnt;
2606 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2607 if (!net->ipv4.tcp_sk)
2608 return -ENOMEM;
2610 for_each_possible_cpu(cpu) {
2611 struct sock *sk;
2613 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2614 IPPROTO_TCP, net);
2615 if (res)
2616 goto fail;
2617 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2619 /* Please enforce IP_DF and IPID==0 for RST and
2620 * ACK sent in SYN-RECV and TIME-WAIT state.
2621 */
2622 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2624 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2625 }
2627 net->ipv4.sysctl_tcp_ecn = 2;
2628 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2630 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2631 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2632 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2633 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2635 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2636 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2637 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2639 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2640 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2641 net->ipv4.sysctl_tcp_syncookies = 1;
2642 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2643 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2644 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2645 net->ipv4.sysctl_tcp_orphan_retries = 0;
2646 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2647 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2648 net->ipv4.sysctl_tcp_tw_reuse = 2;
2650 cnt = tcp_hashinfo.ehash_mask + 1;
2651 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2652 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2654 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2655 net->ipv4.sysctl_tcp_sack = 1;
2656 net->ipv4.sysctl_tcp_window_scaling = 1;
2657 net->ipv4.sysctl_tcp_timestamps = 1;
2658 net->ipv4.sysctl_tcp_early_retrans = 3;
2659 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2660 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2661 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2662 net->ipv4.sysctl_tcp_max_reordering = 300;
2663 net->ipv4.sysctl_tcp_dsack = 1;
2664 net->ipv4.sysctl_tcp_app_win = 31;
2665 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2666 net->ipv4.sysctl_tcp_frto = 2;
2667 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2668 /* This limits the percentage of the congestion window which we
2669 * will allow a single TSO frame to consume. Building TSO frames
2670 * which are too large can cause TCP streams to be bursty.
2671 */
2672 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2673 /* Default TSQ limit of 16 TSO segments */
2674 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2675 /* rfc5961 challenge ack rate limiting */
2676 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2677 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2678 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2679 net->ipv4.sysctl_tcp_autocorking = 1;
2680 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2681 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2682 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2683 if (net != &init_net) {
2684 memcpy(net->ipv4.sysctl_tcp_rmem,
2685 init_net.ipv4.sysctl_tcp_rmem,
2686 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2687 memcpy(net->ipv4.sysctl_tcp_wmem,
2688 init_net.ipv4.sysctl_tcp_wmem,
2689 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2690 }
2691 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2692 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2693 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2694 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2695 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2696 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2698 /* Reno is always built in */
2699 if (!net_eq(net, &init_net) &&
2700 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2701 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2702 else
2703 net->ipv4.tcp_congestion_control = &tcp_reno;
2705 return 0;
2706 fail:
2707 tcp_sk_exit(net);
2709 return res;
2710 }
2712 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2713 {
2714 struct net *net;
2716 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2718 list_for_each_entry(net, net_exit_list, exit_list)
2719 tcp_fastopen_ctx_destroy(net);
2720 }
2722 static struct pernet_operations __net_initdata tcp_sk_ops = {
2723 .init = tcp_sk_init,
2724 .exit = tcp_sk_exit,
2725 .exit_batch = tcp_sk_exit_batch,
2726 };
2728 void __init tcp_v4_init(void)
2729 {
2730 if (register_pernet_subsys(&tcp_sk_ops))
2731 panic("Failed to create the TCP control socket.\n");