net/ipv4/tcp_ipv4.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Implementation of the Transmission Control Protocol(TCP).
   8  *
   9  *              IPv4 specific functions
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  */
  18
  19 /*
  20  * Changes:
  21  *              David S. Miller :       New socket lookup architecture.
  22  *                                      This code is dedicated to John Dyson.
  23  *              David S. Miller :       Change semantics of established hash,
  24  *                                      half is devoted to TIME_WAIT sockets
  25  *                                      and the rest go in the other half.
  26  *              Andi Kleen :            Add support for syncookies and fixed
  27  *                                      some bugs: ip options weren't passed to
  28  *                                      the TCP layer, missed a check for an
  29  *                                      ACK bit.
  30  *              Andi Kleen :            Implemented fast path mtu discovery.
  31  *                                      Fixed many serious bugs in the
  32  *                                      request_sock handling and moved
  33  *                                      most of it into the af independent code.
  34  *                                      Added tail drop and some other bugfixes.
  35  *                                      Added new listen semantics.
  36  *              Mike McLagan    :       Routing by source
  37  *      Juan Jose Ciarlante:            ip_dynaddr bits
  38  *              Andi Kleen:             various fixes.
  39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40  *                                      coma.
  41  *      Andi Kleen              :       Fix new listen.
  42  *      Andi Kleen              :       Fix accept error reporting.
  43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45  *                                      a single port at the same time.
  46  */
  47
  48 #define pr_fmt(fmt) "TCP: " fmt
  49
  50 #include <linux/bottom_half.h>
  51 #include <linux/types.h>
  52 #include <linux/fcntl.h>
  53 #include <linux/module.h>
  54 #include <linux/random.h>
  55 #include <linux/cache.h>
  56 #include <linux/jhash.h>
  57 #include <linux/init.h>
  58 #include <linux/times.h>
  59 #include <linux/slab.h>
  60 #include <linux/sched.h>
  61
  62 #include <net/net_namespace.h>
  63 #include <net/icmp.h>
  64 #include <net/inet_hashtables.h>
  65 #include <net/tcp.h>
  66 #include <net/transp_v6.h>
  67 #include <net/ipv6.h>
  68 #include <net/inet_common.h>
  69 #include <net/timewait_sock.h>
  70 #include <net/xfrm.h>
  71 #include <net/secure_seq.h>
  72 #include <net/busy_poll.h>
  73 #include <net/rstreason.h>
  74
  75 #include <linux/inet.h>
  76 #include <linux/ipv6.h>
  77 #include <linux/stddef.h>
  78 #include <linux/proc_fs.h>
  79 #include <linux/seq_file.h>
  80 #include <linux/inetdevice.h>
  81 #include <linux/btf_ids.h>
  82 #include <linux/skbuff_ref.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
  87 #include <trace/events/tcp.h>
  88
  89 #ifdef CONFIG_TCP_MD5SIG
  90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  91                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  92 #endif
  93
  94 struct inet_hashinfo tcp_hashinfo;      /* inet hash state for TCP; exported just below */
  95 EXPORT_SYMBOL(tcp_hashinfo);
  96
     /* One struct sock_bh_locked per CPU; only its bh_lock member is initialised
      * here (as a local lock).
      * NOTE(review): the users of ipv4_tcp_sk are outside this excerpt -
      * presumably the per-CPU control socket used to send replies; confirm.
      */
  97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
  98         .bh_lock = INIT_LOCAL_LOCK(bh_lock),
  99 };
 100
     /* NOTE(review): no user of this mutex is visible in this excerpt; the name
      * suggests it serialises a batched exit path - confirm against its callers.
      */
 101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
 102
 103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 104 {
             const struct iphdr *iph = ip_hdr(skb);
             const struct tcphdr *th = tcp_hdr(skb);

             /* Sequence number computed by secure_tcp_seq() from the addresses
              * and ports of @skb; the destination fields of the received
              * segment are passed ahead of the source fields.
              */
             return secure_tcp_seq(iph->daddr, iph->saddr, th->dest, th->source);
 109 }
 110
 111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 112 {
             const struct iphdr *iph = ip_hdr(skb);

             /* Timestamp offset computed by secure_tcp_ts_off() for @net and the
              * address pair of @skb (destination address first).
              */
             return secure_tcp_ts_off(net, iph->daddr, iph->saddr);
 114 }
 115
 116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 117 {
             /* Decide whether the TIME-WAIT socket @sktw may be taken over by the
              * new connection being set up on @sk.
              *
              * @sk:   socket that wants to reuse the 4-tuple
              * @sktw: TIME-WAIT socket currently occupying it
              * @twp:  when NULL, the tcp_tw_reuse sysctl and the clock test are
              *        skipped and only a recorded timestamp is required
              *
              * Returns 1 when reuse is allowed; in that case a reference on @sktw
              * has been taken and, unless @sk is in repair mode, @sk's write_seq
              * and ts_recent state have been seeded from @sktw.  Returns 0
              * otherwise.
              */
 118         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 119         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 120         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 121         struct tcp_sock *tp = tcp_sk(sk);
 122         int ts_recent_stamp;
 123
             /* A bucket whose substate is still FIN_WAIT2 is never reused via the sysctl. */
 124         if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
 125                 reuse = 0;
 126
             /* Sysctl value 2: reuse is only kept for loopback traffic. */
 127         if (reuse == 2) {
 128                 /* Still does not detect *everything* that goes through
 129                  * lo, since we require a loopback src or dst address
 130                  * or direct binding to 'lo' interface.
 131                  */
 132                 bool loopback = false;
 133                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 134                         loopback = true;
 135 #if IS_ENABLED(CONFIG_IPV6)
 136                 if (tw->tw_family == AF_INET6) {
 137                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 138                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 139                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 140                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 141                                 loopback = true;
 142                 } else
 143 #endif
 144                 {
 145                         if (ipv4_is_loopback(tw->tw_daddr) ||
 146                             ipv4_is_loopback(tw->tw_rcv_saddr))
 147                                 loopback = true;
 148                 }
 149                 if (!loopback)
 150                         reuse = 0;
 151         }
 152
 153         /* With PAWS, it is safe from the viewpoint
 154            of data integrity. Even without PAWS it is safe provided sequence
 155            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 156
 157            Actually, the idea is close to VJ's one, only timestamp cache is
 158            held not per host, but per port pair and TW bucket is used as state
 159            holder.
 160
 161            If TW bucket has been already destroyed we fall back to VJ's scheme
 162            and use initial timestamp retrieved from peer table.
 163          */
             /* Reuse needs a recorded timestamp and then either no @twp, or an
              * enabled sysctl together with a clock (in seconds) that has moved
              * past that timestamp.
              */
 164         ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
 165         if (ts_recent_stamp &&
 166             (!twp || (reuse && time_after32(ktime_get_seconds(),
 167                                             ts_recent_stamp)))) {
 168                 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
 169                  * and releasing the bucket lock.
 170                  */
                     /* A zero refcount is therefore possible here; refuse reuse then. */
 171                 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
 172                         return 0;
 173
 174                 /* In case of repair and re-using TIME-WAIT sockets we still
 175                  * want to be sure that it is safe as above but honor the
 176                  * sequence numbers and time stamps set as part of the repair
 177                  * process.
 178                  *
 179                  * Without this check re-using a TIME-WAIT socket with TCP
 180                  * repair would accumulate a -1 on the repair assigned
 181                  * sequence number. The first time it is reused the sequence
 182                  * is -1, the second time -2, etc. This fixes that issue
 183                  * without appearing to create any others.
 184                  */
 185                 if (likely(!tp->repair)) {
                             /* Start beyond the old connection's snd_nxt. */
 186                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 187
                             /* tcp_v4_connect() treats write_seq == 0 as "not chosen
                              * yet" and would pick a fresh value, so never store 0.
                              */
 188                         if (!seq)
 189                                 seq = 1;
 190                         WRITE_ONCE(tp->write_seq, seq);
 191                         tp->rx_opt.ts_recent       = READ_ONCE(tcptw->tw_ts_recent);
 192                         tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
 193                 }
 194
 195                 return 1;
 196         }
 197
 198         return 0;
 199 }
 200 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 201
 202 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 203                               int addr_len)
 204 {
             int ret = -EINVAL;

             /* Same length test as in tcp_v4_connect(): the BPF program run
              * below must not be able to read bytes beyond the addr_len that
              * the user supplied.
              */
             if (addr_len >= sizeof(struct sockaddr_in)) {
                     sock_owned_by_me(sk);
                     ret = BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
             }

             return ret;
 215 }
 216
 217 /* This will initiate an outgoing connection. */
     /*
      * @sk:       socket to connect
      * @uaddr:    destination, read as a struct sockaddr_in
      * @addr_len: length of @uaddr as supplied by the caller
      *
      * Validates the address, routes to the destination, moves the socket to
      * SYN_SENT, hashes it via inet_hash_connect(), picks the initial sequence
      * number and timestamp offset, and finally calls tcp_connect().
      *
      * Returns 0 on success.  On error returns -EINVAL, -EAFNOSUPPORT,
      * -ENETUNREACH or whatever the failing helper reported; errors that occur
      * after the state change go through the "failure" label, which puts the
      * socket back into TCP_CLOSE.
      */
 218 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 219 {
 220         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 221         struct inet_timewait_death_row *tcp_death_row;
 222         struct inet_sock *inet = inet_sk(sk);
 223         struct tcp_sock *tp = tcp_sk(sk);
 224         struct ip_options_rcu *inet_opt;
 225         struct net *net = sock_net(sk);
 226         __be16 orig_sport, orig_dport;
 227         __be32 daddr, nexthop;
 228         struct flowi4 *fl4;
 229         struct rtable *rt;
 230         int err;
 231
 232         if (addr_len < sizeof(struct sockaddr_in))
 233                 return -EINVAL;
 234
 235         if (usin->sin_family != AF_INET)
 236                 return -EAFNOSUPPORT;
 237
 238         nexthop = daddr = usin->sin_addr.s_addr;
 239         inet_opt = rcu_dereference_protected(inet->inet_opt,
 240                                              lockdep_sock_is_held(sk));
             /* With a source-route option the route is looked up towards the
              * option's faddr instead of the final destination, and a zero
              * destination is rejected.
              */
 241         if (inet_opt && inet_opt->opt.srr) {
 242                 if (!daddr)
 243                         return -EINVAL;
 244                 nexthop = inet_opt->opt.faddr;
 245         }
 246
             /* Remember the ports used for this lookup; ip_route_newports() below
              * is given both these and the final ports.
              */
 247         orig_sport = inet->inet_sport;
 248         orig_dport = usin->sin_port;
 249         fl4 = &inet->cork.fl.u.ip4;
 250         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 251                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 252                               orig_dport, sk);
 253         if (IS_ERR(rt)) {
 254                 err = PTR_ERR(rt);
 255                 if (err == -ENETUNREACH)
 256                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 257                 return err;
 258         }
 259
             /* Routes flagged multicast or broadcast are refused. */
 260         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 261                 ip_rt_put(rt);
 262                 return -ENETUNREACH;
 263         }
 264
             /* Without source routing, use the destination the lookup left in fl4. */
 265         if (!inet_opt || !inet_opt->opt.srr)
 266                 daddr = fl4->daddr;
 267
 268         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 269
             /* No source address set yet: adopt the one chosen by the route lookup. */
 270         if (!inet->inet_saddr) {
 271                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
 272                 if (err) {
 273                         ip_rt_put(rt);
 274                         return err;
 275                 }
 276         } else {
 277                 sk_rcv_saddr_set(sk, inet->inet_saddr);
 278         }
 279
             /* Timestamp state recorded for a different destination is discarded. */
 280         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 281                 /* Reset inherited state */
 282                 tp->rx_opt.ts_recent       = 0;
 283                 tp->rx_opt.ts_recent_stamp = 0;
 284                 if (likely(!tp->repair))
 285                         WRITE_ONCE(tp->write_seq, 0);
 286         }
 287
 288         inet->inet_dport = usin->sin_port;
 289         sk_daddr_set(sk, daddr);
 290
             /* Extension header length is the IP option length, if any. */
 291         inet_csk(sk)->icsk_ext_hdr_len = 0;
 292         if (inet_opt)
 293                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 294
 295         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 296
 297         /* Socket identity is still unknown (sport may be zero).
 298          * However we set state to SYN-SENT and not releasing socket
 299          * lock select source port, enter ourselves into the hash tables and
 300          * complete initialization after this.
 301          */
 302         tcp_set_state(sk, TCP_SYN_SENT);
 303         err = inet_hash_connect(tcp_death_row, sk);
 304         if (err)
 305                 goto failure;
 306
 307         sk_set_txhash(sk);
 308
 309         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 310                                inet->inet_sport, inet->inet_dport, sk);
 311         if (IS_ERR(rt)) {
 312                 err = PTR_ERR(rt);
                     /* Do not hand an ERR_PTR to ip_rt_put() on the failure path. */
 313                 rt = NULL;
 314                 goto failure;
 315         }
 316         tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
 317         /* OK, now commit destination to socket.  */
 318         sk->sk_gso_type = SKB_GSO_TCPV4;
 319         sk_setup_caps(sk, &rt->dst);
             /* rt is cleared so that the failure path below does not put it.
              * NOTE(review): presumably the route reference now belongs to the
              * socket - confirm against sk_setup_caps().
              */
 320         rt = NULL;
 321
             /* In repair mode the sequence number and timestamp offset are left
              * alone; otherwise a zero write_seq means "not chosen yet".
              */
 322         if (likely(!tp->repair)) {
 323                 if (!tp->write_seq)
 324                         WRITE_ONCE(tp->write_seq,
 325                                    secure_tcp_seq(inet->inet_saddr,
 326                                                   inet->inet_daddr,
 327                                                   inet->inet_sport,
 328                                                   usin->sin_port));
 329                 WRITE_ONCE(tp->tsoffset,
 330                            secure_tcp_ts_off(net, inet->inet_saddr,
 331                                              inet->inet_daddr));
 332         }
 333
 334         atomic_set(&inet->inet_id, get_random_u16());
 335
             /* A true return ends the function here with the err value the helper
              * stored, skipping both tcp_connect() and the failure cleanup.
              */
 336         if (tcp_fastopen_defer_connect(sk, &err))
 337                 return err;
 338         if (err)
 339                 goto failure;
 340
 341         err = tcp_connect(sk);
 342
 343         if (err)
 344                 goto failure;
 345
 346         return 0;
 347
 348 failure:
 349         /*
 350          * This unhashes the socket and releases the local port,
 351          * if necessary.
 352          */
 353         tcp_set_state(sk, TCP_CLOSE);
 354         inet_bhash2_reset_saddr(sk);
             /* rt may be NULL here (see the assignments above). */
 355         ip_rt_put(rt);
 356         sk->sk_route_caps = 0;
 357         inet->inet_dport = 0;
 358         return err;
 359 }
 360 EXPORT_SYMBOL(tcp_v4_connect);
 361
 362 /*
 363  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 364  * It can be called through tcp_release_cb() if socket was owned by user
 365  * at the time tcp_v4_err() was called to handle ICMP message.
 366  */
 367 void tcp_v4_mtu_reduced(struct sock *sk)
 368 {
             struct inet_sock *inet = inet_sk(sk);
             u32 reported_mtu, path_mtu;
             struct dst_entry *dst;

             /* Nothing to adjust on listening or closed sockets. */
             if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                     return;

             /* mtu_info holds the value stored by the ICMP handler. */
             reported_mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
             dst = inet_csk_update_pmtu(sk, reported_mtu);
             if (!dst)
                     return;

             /* The route still advertises more than was reported while
              * fragmentation is not allowed: keep EMSGSIZE as a soft error in
              * case the connection never recovers.
              */
             if (reported_mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                     WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

             path_mtu = dst_mtu(dst);

             /* In every other case the usual retransmit timer deals with it. */
             if (inet->pmtudisc == IP_PMTUDISC_DONT ||
                 !ip_sk_accept_pmtu(sk) ||
                 inet_csk(sk)->icsk_pmtu_cookie <= path_mtu)
                     return;

             tcp_sync_mss(sk, path_mtu);

             /* "Fast" path MTU discovery: the old packet has clearly been
              * dropped, so send it again right away.
              */
             tcp_simple_retransmit(sk);
 400 }
 401 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 402
 403 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 404 {
             struct dst_entry *dst = __sk_dst_check(sk, 0);

             /* No dst entry came back for the socket: nothing to redirect. */
             if (!dst)
                     return;

             /* Let the dst's own redirect operation handle the ICMP in @skb. */
             dst->ops->redirect(dst, sk, skb);
 409 }
 410
 411
 412 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 413 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 414 {
             struct request_sock *req = inet_reqsk(sk);

             /* ICMPs are never backlogged, so @sk cannot have turned into an
              * established socket by the time we get here.
              */
             if (seq != tcp_rsk(req)->snt_isn) {
                     /* Quoted sequence number is not our ISN: count it, do nothing. */
                     __NET_INC_STATS(sock_net(sk), LINUX_MIB_OUTOFWINDOWICMPS);
                     goto put;
             }

             if (abort) {
                     /* Still in SYN_RECV, so the request is removed silently:
                      * there is no good way to hand the error to the socket that
                      * would be created, and POSIX does not want network errors
                      * returned from accept().
                      */
                     inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                     tcp_listendrop(req->rsk_listener);
             }

     put:
             /* Drop the request sock reference on every path. */
             reqsk_put(req);
 434 }
 435 EXPORT_SYMBOL(tcp_req_err);
 436
 437 /* TCP-LD (RFC 6069) logic */
     /*
      * @sk:  socket the ICMP error was matched to
      * @seq: sequence number taken from the TCP header quoted in the ICMP
      *
      * Undoes one step of retransmission backoff, recomputes the RTO and
      * either re-arms the retransmit timer with the time that is left or, if
      * none is left, retransmits immediately.  Does nothing when the socket
      * is owned by the user, when @seq is not snd_una, or when there is no
      * retransmission/backoff in progress.
      */
 438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 439 {
 440         struct inet_connection_sock *icsk = inet_csk(sk);
 441         struct tcp_sock *tp = tcp_sk(sk);
 442         struct sk_buff *skb;
 443         s32 remaining;
 444         u32 delta_us;
 445
 446         if (sock_owned_by_user(sk))
 447                 return;
 448
 449         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 450             !icsk->icsk_backoff)
 451                 return;
 452
             /* Retransmissions are pending, so an empty rtx queue is unexpected. */
 453         skb = tcp_rtx_queue_head(sk);
 454         if (WARN_ON_ONCE(!skb))
 455                 return;
 456
             /* One backoff step less, then rebuild the RTO from the base value
              * (the initial timeout when no srtt sample exists yet).
              */
 457         icsk->icsk_backoff--;
 458         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 459         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 460
             /* Time already elapsed since the head skb's timestamp, subtracted
              * from the new RTO; the signed result may be <= 0.
              */
 461         tcp_mstamp_refresh(tp);
 462         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 463         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 464
 465         if (remaining > 0) {
 466                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 467                                           remaining, TCP_RTO_MAX);
 468         } else {
 469                 /* RTO revert clocked out retransmission.
 470                  * Will retransmit now.
 471                  */
 472                 tcp_retransmit_timer(sk);
 473         }
 474 }
 475 EXPORT_SYMBOL(tcp_ld_RTO_revert);
 476
 477 /*
 478  * This routine is called by the ICMP module when it gets some
 479  * sort of error condition.  If err < 0 then the socket should
 480  * be closed and the error returned to the user.  If err > 0
 481  * it's just the icmp type << 8 | icmp code.  After adjustment
 482  * header points to the first 8 bytes of the tcp header.  We need
 483  * to find the appropriate port.
 484  *
 485  * The locking strategy used here is very "optimistic". When
 486  * someone else accesses the socket the ICMP is just dropped
 487  * and for some paths there is no check at all.
 488  * A more general error queue to queue errors for later handling
 489  * is probably better.
 490  *
      * NOTE(review): the "err < 0 / err > 0" wording above does not match the
      * (skb, info) signature below; the ICMP type and code are read from
      * icmp_hdr(skb) instead.
      *
      * @skb:  ICMP packet; skb->data is read as the quoted IP header, followed
      *        by the quoted TCP header
      * @info: value stored in tp->mtu_info for ICMP_FRAG_NEEDED and passed to
      *        ip_icmp_error()
      *
      * Returns -ENOENT when no socket matches the quoted header, 0 otherwise.
 491  */
 492
 493 int tcp_v4_err(struct sk_buff *skb, u32 info)
 494 {
 495         const struct iphdr *iph = (const struct iphdr *)skb->data;
 496         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 497         struct tcp_sock *tp;
 498         const int type = icmp_hdr(skb)->type;
 499         const int code = icmp_hdr(skb)->code;
 500         struct sock *sk;
 501         struct request_sock *fastopen;
 502         u32 seq, snd_una;
 503         int err;
 504         struct net *net = dev_net(skb->dev);
 505
             /* Look the socket up from the addresses and ports of the quoted header. */
 506         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
 507                                        iph->daddr, th->dest, iph->saddr,
 508                                        ntohs(th->source), inet_iif(skb), 0);
 509         if (!sk) {
 510                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 511                 return -ENOENT;
 512         }
             /* TIME_WAIT sockets only get the accounting below; the reference
              * from the lookup is dropped with inet_twsk_put().
              */
 513         if (sk->sk_state == TCP_TIME_WAIT) {
 514                 /* To increase the counter of ignored icmps for TCP-AO */
 515                 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
 516                 inet_twsk_put(inet_twsk(sk));
 517                 return 0;
 518         }
 519         seq = ntohl(th->seq);
             /* Request sockets: tcp_req_err() releases the reference itself.  The
              * third argument selects which ICMP types/codes abort the request.
              */
 520         if (sk->sk_state == TCP_NEW_SYN_RECV) {
 521                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 522                                      type == ICMP_TIME_EXCEEDED ||
 523                                      (type == ICMP_DEST_UNREACH &&
 524                                       (code == ICMP_NET_UNREACH ||
 525                                        code == ICMP_HOST_UNREACH)));
 526                 return 0;
 527         }
 528
 529         if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
 530                 sock_put(sk);
 531                 return 0;
 532         }
 533
 534         bh_lock_sock(sk);
 535         /* If too many ICMPs get dropped on busy
 536          * servers this needs to be solved differently.
 537          * We do take care of PMTU discovery (RFC1191) special case :
 538          * we can receive locally generated ICMP messages while socket is held.
 539          */
             /* This only updates the counter; processing continues, and the
              * branches below test sock_owned_by_user() again where it matters.
              */
 540         if (sock_owned_by_user(sk)) {
 541                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 542                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 543         }
 544         if (sk->sk_state == TCP_CLOSE)
 545                 goto out;
 546
 547         if (static_branch_unlikely(&ip4_min_ttl)) {
 548                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
 549                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 550                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 551                         goto out;
 552                 }
 553         }
 554
 555         tp = tcp_sk(sk);
 556         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 557         fastopen = rcu_dereference(tp->fastopen_rsk);
 558         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
             /* Except for listeners, the quoted sequence number must lie between
              * snd_una and snd_nxt, otherwise the ICMP is counted and ignored.
              */
 559         if (sk->sk_state != TCP_LISTEN &&
 560             !between(seq, snd_una, tp->snd_nxt)) {
 561                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 562                 goto out;
 563         }
 564
             /* First switch: map the ICMP type/code to an errno in err, or finish
              * early for types that are handled completely here.
              */
 565         switch (type) {
 566         case ICMP_REDIRECT:
 567                 if (!sock_owned_by_user(sk))
 568                         do_redirect(skb, sk);
 569                 goto out;
 570         case ICMP_SOURCE_QUENCH:
 571                 /* Just silently ignore these. */
 572                 goto out;
 573         case ICMP_PARAMETERPROB:
 574                 err = EPROTO;
 575                 break;
 576         case ICMP_DEST_UNREACH:
                     /* code indexes icmp_err_convert[] below, so bound it first. */
 577                 if (code > NR_ICMP_UNREACH)
 578                         goto out;
 579
 580                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 581                         /* We are not interested in TCP_LISTEN and open_requests
 582                          * (SYN-ACKs send out by Linux are always <576bytes so
 583                          * they should go through unfragmented).
 584                          */
 585                         if (sk->sk_state == TCP_LISTEN)
 586                                 goto out;
 587
 588                         WRITE_ONCE(tp->mtu_info, info);
 589                         if (!sock_owned_by_user(sk)) {
 590                                 tcp_v4_mtu_reduced(sk);
 591                         } else {
                                     /* Socket is busy: mark the MTU update as deferred.
                                      * An extra reference is taken only when the flag
                                      * was not already set.
                                      * NOTE(review): presumably dropped again by the
                                      * release callback that runs the deferred work -
                                      * confirm.
                                      */
 592                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 593                                         sock_hold(sk);
 594                         }
 595                         goto out;
 596                 }
 597
 598                 err = icmp_err_convert[code].errno;
 599                 /* check if this ICMP message allows revert of backoff.
 600                  * (see RFC 6069)
 601                  */
 602                 if (!fastopen &&
 603                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 604                         tcp_ld_RTO_revert(sk, seq);
 605                 break;
 606         case ICMP_TIME_EXCEEDED:
 607                 err = EHOSTUNREACH;
 608                 break;
 609         default:
 610                 goto out;
 611         }
 612
             /* Second switch: sockets still in the handshake are finished with
              * the error (or, if busy, just remember it); all other states fall
              * through to the handling after the switch.
              */
 613         switch (sk->sk_state) {
 614         case TCP_SYN_SENT:
 615         case TCP_SYN_RECV:
 616                 /* Only in fast or simultaneous open. If a fast open socket is
 617                  * already accepted it is treated as a connected one below.
 618                  */
 619                 if (fastopen && !fastopen->sk)
 620                         break;
 621
 622                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 623
 624                 if (!sock_owned_by_user(sk))
 625                         tcp_done_with_error(sk, err);
 626                 else
 627                         WRITE_ONCE(sk->sk_err_soft, err);
 628                 goto out;
 629         }
 630
 631         /* If we've already connected we will keep trying
 632          * until we time out, or the user gives up.
 633          *
 634          * rfc1122 4.2.3.9 allows to consider as hard errors
 635          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 636          * but it is obsoleted by pmtu discovery).
 637          *
 638          * Note, that in modern internet, where routing is unreliable
 639          * and in each dark corner broken firewalls sit, sending random
 640          * errors ordered by their masters even this two messages finally lose
 641          * their original sense (even Linux sends invalid PORT_UNREACHs)
 642          *
 643          * Now we are in compliance with RFCs.
 644          *                                                      --ANK (980905)
 645          */
 646
             /* Report a hard error only when the socket is not busy and has the
              * RECVERR bit set; otherwise keep it as a soft error.
              */
 647         if (!sock_owned_by_user(sk) &&
 648             inet_test_bit(RECVERR, sk)) {
 649                 WRITE_ONCE(sk->sk_err, err);
 650                 sk_error_report(sk);
 651         } else  { /* Only an error on timeout */
 652                 WRITE_ONCE(sk->sk_err_soft, err);
 653         }
 654
 655 out:
             /* Common exit for full sockets: unlock and drop the lookup reference. */
 656         bh_unlock_sock(sk);
 657         sock_put(sk);
 658         return 0;
 659 }
 660
 661 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 662 {
             /* Record where checksumming starts (the transport header, as an
              * offset from skb->head) and where inside the TCP header the
              * result is stored.
              */
             skb->csum_start = skb_transport_header(skb) - skb->head;
             skb->csum_offset = offsetof(struct tcphdr, check);

             /* Seed the check field with the complemented result of
              * tcp_v4_check() over the segment length and the two addresses,
              * with a zero base sum.
              */
             tcp_hdr(skb)->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 668 }
 669
 670 /* This routine computes an IPv4 TCP checksum. */
 671 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 672 {
             /* Addresses come from the socket; the checksum setup itself is
              * done by __tcp_v4_send_check().
              */
             const __be32 saddr = inet_sk(sk)->inet_saddr;
             const __be32 daddr = inet_sk(sk)->inet_daddr;

             __tcp_v4_send_check(skb, saddr, daddr);
 676 }
 677 EXPORT_SYMBOL(tcp_v4_send_check);
 678
 679 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32)) /* in __be32 words */
 680
 681 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
 682                                  const struct tcp_ao_hdr *aoh,
 683                                  struct ip_reply_arg *arg, struct tcphdr *reply,
 684                                  __be32 reply_options[REPLY_OPTIONS_LEN])
 685 {
             /* Add a TCP-AO option to the reset being built in @reply.
              *
              * @sk:            socket the segment was matched to, passed on to
              *                 tcp_ao_prepare_reset()
              * @skb:           segment that triggered the reset
              * @aoh:           TCP-AO option header found in @skb
              * @arg:           reply descriptor; iov[0].iov_len grows by the
              *                 aligned option length
              * @reply:         TCP header of the reset; its doff is updated
              * @reply_options: option area; word 0 receives kind/length/key ids
              *                 and the hash is written starting at word 1
              *
              * Returns false when the option was added and hashed, true when
              * the caller should drop the reset instead.  Without CONFIG_TCP_AO
              * it always returns true.
              */
 686 #ifdef CONFIG_TCP_AO
 687         int sdif = tcp_v4_sdif(skb);
 688         int dif = inet_iif(skb);
             /* Use the incoming interface as L3 index only when sdif is set. */
 689         int l3index = sdif ? dif : 0;
 690         bool allocated_traffic_key;
 691         struct tcp_ao_key *key;
 692         char *traffic_key;
 693         bool drop = true;
 694         u32 ao_sne = 0;
 695         u8 keyid;
 696
 697         rcu_read_lock();
             /* NOTE(review): on failure we jump to "out", which reads
              * allocated_traffic_key; this relies on tcp_ao_prepare_reset()
              * storing a value there even when it fails - confirm.
              */
 698         if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
 699                                  &key, &traffic_key, &allocated_traffic_key,
 700                                  &keyid, &ao_sne))
 701                 goto out;
 702
             /* Option word 0: kind, option length, aoh->rnext_keyid, keyid. */
 703         reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
 704                                  (aoh->rnext_keyid << 8) | keyid);
 705         arg->iov[0].iov_len += tcp_ao_len_aligned(key);
             /* doff counts 32-bit words, hence the division by 4. */
 706         reply->doff = arg->iov[0].iov_len / 4;
 707
 708         if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
 709                             key, traffic_key,
 710                             (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
 711                             (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
 712                             reply, ao_sne))
 713                 goto out;
 714         drop = false;
 715 out:
 716         rcu_read_unlock();
             /* The traffic key is freed here only when it was flagged as allocated. */
 717         if (allocated_traffic_key)
 718                 kfree(traffic_key);
 719         return drop;
 720 #else
 721         return true;
 722 #endif
 723 }
 724
 725 /*
 726  *      This routine will send an RST to the other tcp.
 727  *
 728  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 729  *                    for reset?
 730  *      Answer: if a packet caused RST, it is not for a socket
 731  *              existing in our system; if it is matched to a socket,
 732  *              it is just a duplicate segment or a bug in the other side's TCP.
 733  *              So we build the reply based only on the parameters
 734  *              that arrived with the segment.
 735  *      Exception: precedence violation. We do not implement it in any case.
 736  */
 737
 738 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
 739                               enum sk_rst_reason reason)
 740 {
 741         const struct tcphdr *th = tcp_hdr(skb);
 742         struct {
 743                 struct tcphdr th;
 744                 __be32 opt[REPLY_OPTIONS_LEN];
 745         } rep;
 746         const __u8 *md5_hash_location = NULL;
 747         const struct tcp_ao_hdr *aoh;
 748         struct ip_reply_arg arg;
 749 #ifdef CONFIG_TCP_MD5SIG
 750         struct tcp_md5sig_key *key = NULL;
 751         unsigned char newhash[16];
 752         struct sock *sk1 = NULL;
 753         int genhash;
 754 #endif
 755         u64 transmit_time = 0;
 756         struct sock *ctl_sk;
 757         struct net *net;
 758         u32 txhash = 0;
 759
 760         /* Never send a reset in response to a reset. */
 761         if (th->rst)
 762                 return;
 763
 764         /* If sk not NULL, it means we did a successful lookup and incoming
 765          * route had to be correct. prequeue might have dropped our dst.
 766          */
 767         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 768                 return;
 769
 770         /* Swap the send and the receive. */
 771         memset(&rep, 0, sizeof(rep));
 772         rep.th.dest   = th->source;
 773         rep.th.source = th->dest;
 774         rep.th.doff   = sizeof(struct tcphdr) / 4;
 775         rep.th.rst    = 1;
 776
 777         if (th->ack) {
 778                 rep.th.seq = th->ack_seq;
 779         } else {
 780                 rep.th.ack = 1;
 781                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 782                                        skb->len - (th->doff << 2));
 783         }
 784
 785         memset(&arg, 0, sizeof(arg));
 786         arg.iov[0].iov_base = (unsigned char *)&rep;
 787         arg.iov[0].iov_len  = sizeof(rep.th);
 788
 789         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 790
 791         /* Invalid TCP option size or twice included auth */
 792         if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
 793                 return;
 794
 795         if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
 796                 return;
 797
 798 #ifdef CONFIG_TCP_MD5SIG
 799         rcu_read_lock();
 800         if (sk && sk_fullsock(sk)) {
 801                 const union tcp_md5_addr *addr;
 802                 int l3index;
 803
 804                 /* sdif set, means packet ingressed via a device
 805                  * in an L3 domain and inet_iif is set to it.
 806                  */
 807                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 808                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 809                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 810         } else if (md5_hash_location) {
 811                 const union tcp_md5_addr *addr;
 812                 int sdif = tcp_v4_sdif(skb);
 813                 int dif = inet_iif(skb);
 814                 int l3index;
 815
 816                 /*
 817                  * active side is lost. Try to find listening socket through
 818                  * source port, and then find md5 key through listening socket.
 819                  * we are not loose security here:
 820                  * Incoming packet is checked with md5 hash with finding key,
 821                  * no RST generated if md5 hash doesn't match.
 822                  */
 823                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
 824                                              NULL, 0, ip_hdr(skb)->saddr,
 825                                              th->source, ip_hdr(skb)->daddr,
 826                                              ntohs(th->source), dif, sdif);
 827                 /* don't send rst if it can't find key */
 828                 if (!sk1)
 829                         goto out;
 830
 831                 /* sdif set, means packet ingressed via a device
 832                  * in an L3 domain and dif is set to it.
 833                  */
 834                 l3index = sdif ? dif : 0;
 835                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 836                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 837                 if (!key)
 838                         goto out;
 839
 840
 841                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 842                 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
 843                         goto out;
 844
 845         }
 846
 847         if (key) {
 848                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 849                                    (TCPOPT_NOP << 16) |
 850                                    (TCPOPT_MD5SIG << 8) |
 851                                    TCPOLEN_MD5SIG);
 852                 /* Update length and the length the header thinks exists */
 853                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 854                 rep.th.doff = arg.iov[0].iov_len / 4;
 855
 856                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 857                                      key, ip_hdr(skb)->saddr,
 858                                      ip_hdr(skb)->daddr, &rep.th);
 859         }
 860 #endif
 861         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 862         if (rep.opt[0] == 0) {
 863                 __be32 mrst = mptcp_reset_option(skb);
 864
 865                 if (mrst) {
 866                         rep.opt[0] = mrst;
 867                         arg.iov[0].iov_len += sizeof(mrst);
 868                         rep.th.doff = arg.iov[0].iov_len / 4;
 869                 }
 870         }
 871
 872         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 873                                       ip_hdr(skb)->saddr, /* XXX */
 874                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 875         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 876         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 877
 878         /* When socket is gone, all binding information is lost.
 879          * routing might fail in this case. No choice here, if we choose to force
 880          * input interface, we will misroute in case of asymmetric route.
 881          */
 882         if (sk)
 883                 arg.bound_dev_if = sk->sk_bound_dev_if;
 884
 885         trace_tcp_send_reset(sk, skb, reason);
 886
 887         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 888                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 889
 890         arg.tos = ip_hdr(skb)->tos;
 891         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 892         local_bh_disable();
 893         local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
 894         ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
 895
 896         sock_net_set(ctl_sk, net);
 897         if (sk) {
 898                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 899                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 900                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 901                                    inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
 902                 transmit_time = tcp_transmit_time(sk);
 903                 xfrm_sk_clone_policy(ctl_sk, sk);
 904                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
 905                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
 906         } else {
 907                 ctl_sk->sk_mark = 0;
 908                 ctl_sk->sk_priority = 0;
 909         }
 910         ip_send_unicast_reply(ctl_sk, sk,
 911                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 912                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 913                               &arg, arg.iov[0].iov_len,
 914                               transmit_time, txhash);
 915
 916         xfrm_sk_free_policy(ctl_sk);
 917         sock_net_set(ctl_sk, &init_net);
 918         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 919         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 920         local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
 921         local_bh_enable();
 922
 923 #ifdef CONFIG_TCP_MD5SIG
 924 out:
 925         rcu_read_unlock();
 926 #endif
 927 }
 928
 929 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 930    outside socket context is ugly, certainly. What can I do?
 931  */
 932
 933 static void tcp_v4_send_ack(const struct sock *sk,
 934                             struct sk_buff *skb, u32 seq, u32 ack,
 935                             u32 win, u32 tsval, u32 tsecr, int oif,
 936                             struct tcp_key *key,
 937                             int reply_flags, u8 tos, u32 txhash)
 938 {
 939         const struct tcphdr *th = tcp_hdr(skb);
 940         struct {
 941                 struct tcphdr th;
 942                 __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
 943         } rep;
 944         struct net *net = sock_net(sk);
 945         struct ip_reply_arg arg;
 946         struct sock *ctl_sk;
 947         u64 transmit_time;
 948
 949         memset(&rep.th, 0, sizeof(struct tcphdr));
 950         memset(&arg, 0, sizeof(arg));
 951
 952         arg.iov[0].iov_base = (unsigned char *)&rep;
 953         arg.iov[0].iov_len  = sizeof(rep.th);
 954         if (tsecr) {
 955                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 956                                    (TCPOPT_TIMESTAMP << 8) |
 957                                    TCPOLEN_TIMESTAMP);
 958                 rep.opt[1] = htonl(tsval);
 959                 rep.opt[2] = htonl(tsecr);
 960                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 961         }
 962
 963         /* Swap the send and the receive. */
 964         rep.th.dest    = th->source;
 965         rep.th.source  = th->dest;
 966         rep.th.doff    = arg.iov[0].iov_len / 4;
 967         rep.th.seq     = htonl(seq);
 968         rep.th.ack_seq = htonl(ack);
 969         rep.th.ack     = 1;
 970         rep.th.window  = htons(win);
 971
 972 #ifdef CONFIG_TCP_MD5SIG
 973         if (tcp_key_is_md5(key)) {
 974                 int offset = (tsecr) ? 3 : 0;
 975
 976                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 977                                           (TCPOPT_NOP << 16) |
 978                                           (TCPOPT_MD5SIG << 8) |
 979                                           TCPOLEN_MD5SIG);
 980                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 981                 rep.th.doff = arg.iov[0].iov_len/4;
 982
 983                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 984                                     key->md5_key, ip_hdr(skb)->saddr,
 985                                     ip_hdr(skb)->daddr, &rep.th);
 986         }
 987 #endif
 988 #ifdef CONFIG_TCP_AO
 989         if (tcp_key_is_ao(key)) {
 990                 int offset = (tsecr) ? 3 : 0;
 991
 992                 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
 993                                           (tcp_ao_len(key->ao_key) << 16) |
 994                                           (key->ao_key->sndid << 8) |
 995                                           key->rcv_next);
 996                 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
 997                 rep.th.doff = arg.iov[0].iov_len / 4;
 998
 999                 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1000                                 key->ao_key, key->traffic_key,
1001                                 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1002                                 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1003                                 &rep.th, key->sne);
1004         }
1005 #endif
1006         arg.flags = reply_flags;
1007         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1008                                       ip_hdr(skb)->saddr, /* XXX */
1009                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1010         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1011         if (oif)
1012                 arg.bound_dev_if = oif;
1013         arg.tos = tos;
1014         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1015         local_bh_disable();
1016         local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1017         ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1018         sock_net_set(ctl_sk, net);
1019         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1020                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1021         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1022                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1023         transmit_time = tcp_transmit_time(sk);
1024         ip_send_unicast_reply(ctl_sk, sk,
1025                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
1026                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1027                               &arg, arg.iov[0].iov_len,
1028                               transmit_time, txhash);
1029
1030         sock_net_set(ctl_sk, &init_net);
1031         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1032         local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1033         local_bh_enable();
1034 }
1035
/*
 * Send an ACK on behalf of the TIME-WAIT socket @sk in reply to @skb, signed
 * with the socket's TCP-AO or MD5 key when one applies.
 *
 * Every path through this function ends with inet_twsk_put() on the timewait
 * socket, including the early return taken when the segment's authentication
 * options fail to parse (in which case no ACK is sent).
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: drop our reference, no ACK. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			/* Pick the key the peer asked for via rnext_keyid. */
			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* key.ao_key can only be non-NULL if ao_info was set above. */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* TCP-AO compiled out: a dead branch keeps the "} else if" below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* All ACK parameters come from state saved in the timewait socket. */
	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1089
/*
 * Send an ACK for the request socket @req in reply to @skb.
 *
 * @sk is the socket the request hangs off; its state selects the sequence
 * number used (see below).  When the request used TCP-AO, a temporary traffic
 * key is allocated for signing and freed again after the ACK has been sent;
 * if no usable AO key can be found or the allocation fails, no ACK is sent.
 * Otherwise an MD5 key for the peer address is used when one is configured.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		/* The request used AO but this segment carries no AO header. */
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		/* First try the key the peer requested via rnext_keyid. */
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Scratch buffer for the per-request traffic key; released
		 * at the end of this function.
		 */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* TCP-AO compiled out: a dead branch keeps the "} else if" below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	/* Only the AO path allocated a traffic key above. */
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
1166
1167 /*
1168  *      Send a SYN-ACK after having received a SYN.
1169  *      This still operates on a request_sock only, not on a big
1170  *      socket.
1171  */
1172 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1173                               struct flowi *fl,
1174                               struct request_sock *req,
1175                               struct tcp_fastopen_cookie *foc,
1176                               enum tcp_synack_type synack_type,
1177                               struct sk_buff *syn_skb)
1178 {
1179         const struct inet_request_sock *ireq = inet_rsk(req);
1180         struct flowi4 fl4;
1181         int err = -1;
1182         struct sk_buff *skb;
1183         u8 tos;
1184
1185         /* First, grab a route. */
1186         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1187                 return -1;
1188
1189         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1190
1191         if (skb) {
1192                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1193
1194                 tos = READ_ONCE(inet_sk(sk)->tos);
1195
1196                 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1197                         tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1198                               (tos & INET_ECN_MASK);
1199
1200                 if (!INET_ECN_is_capable(tos) &&
1201                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1202                         tos |= INET_ECN_ECT_0;
1203
1204                 rcu_read_lock();
1205                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1206                                             ireq->ir_rmt_addr,
1207                                             rcu_dereference(ireq->ireq_opt),
1208                                             tos);
1209                 rcu_read_unlock();
1210                 err = net_xmit_eval(err);
1211         }
1212
1213         return err;
1214 }
1215
1216 /*
1217  *      IPv4 request_sock destructor.
1218  */
1219 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1220 {
1221         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1222 }
1223
1224 #ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Deferred static key, initially false, defined with HZ as its second
 * argument.  It is incremented in tcp_md5_do_add() and tcp_md5_key_copy()
 * when a socket gets its md5sig_info allocated.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);
1233
1234 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1235 {
1236         if (!old)
1237                 return true;
1238
1239         /* l3index always overrides non-l3index */
1240         if (old->l3index && new->l3index == 0)
1241                 return false;
1242         if (old->l3index == 0 && new->l3index)
1243                 return true;
1244
1245         return old->prefixlen < new->prefixlen;
1246 }
1247
1248 /* Find the Key structure for an address.  */
1249 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1250                                            const union tcp_md5_addr *addr,
1251                                            int family, bool any_l3index)
1252 {
1253         const struct tcp_sock *tp = tcp_sk(sk);
1254         struct tcp_md5sig_key *key;
1255         const struct tcp_md5sig_info *md5sig;
1256         __be32 mask;
1257         struct tcp_md5sig_key *best_match = NULL;
1258         bool match;
1259
1260         /* caller either holds rcu_read_lock() or socket lock */
1261         md5sig = rcu_dereference_check(tp->md5sig_info,
1262                                        lockdep_sock_is_held(sk));
1263         if (!md5sig)
1264                 return NULL;
1265
1266         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1267                                  lockdep_sock_is_held(sk)) {
1268                 if (key->family != family)
1269                         continue;
1270                 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1271                     key->l3index != l3index)
1272                         continue;
1273                 if (family == AF_INET) {
1274                         mask = inet_make_mask(key->prefixlen);
1275                         match = (key->addr.a4.s_addr & mask) ==
1276                                 (addr->a4.s_addr & mask);
1277 #if IS_ENABLED(CONFIG_IPV6)
1278                 } else if (family == AF_INET6) {
1279                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1280                                                   key->prefixlen);
1281 #endif
1282                 } else {
1283                         match = false;
1284                 }
1285
1286                 if (match && better_md5_match(best_match, key))
1287                         best_match = key;
1288         }
1289         return best_match;
1290 }
1291 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1292
1293 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1294                                                       const union tcp_md5_addr *addr,
1295                                                       int family, u8 prefixlen,
1296                                                       int l3index, u8 flags)
1297 {
1298         const struct tcp_sock *tp = tcp_sk(sk);
1299         struct tcp_md5sig_key *key;
1300         unsigned int size = sizeof(struct in_addr);
1301         const struct tcp_md5sig_info *md5sig;
1302
1303         /* caller either holds rcu_read_lock() or socket lock */
1304         md5sig = rcu_dereference_check(tp->md5sig_info,
1305                                        lockdep_sock_is_held(sk));
1306         if (!md5sig)
1307                 return NULL;
1308 #if IS_ENABLED(CONFIG_IPV6)
1309         if (family == AF_INET6)
1310                 size = sizeof(struct in6_addr);
1311 #endif
1312         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1313                                  lockdep_sock_is_held(sk)) {
1314                 if (key->family != family)
1315                         continue;
1316                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1317                         continue;
1318                 if (key->l3index != l3index)
1319                         continue;
1320                 if (!memcmp(&key->addr, addr, size) &&
1321                     key->prefixlen == prefixlen)
1322                         return key;
1323         }
1324         return NULL;
1325 }
1326
1327 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1328                                          const struct sock *addr_sk)
1329 {
1330         const union tcp_md5_addr *addr;
1331         int l3index;
1332
1333         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1334                                                  addr_sk->sk_bound_dev_if);
1335         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1336         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1337 }
1338 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1339
1340 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1341 {
1342         struct tcp_sock *tp = tcp_sk(sk);
1343         struct tcp_md5sig_info *md5sig;
1344
1345         md5sig = kmalloc(sizeof(*md5sig), gfp);
1346         if (!md5sig)
1347                 return -ENOMEM;
1348
1349         sk_gso_disable(sk);
1350         INIT_HLIST_HEAD(&md5sig->head);
1351         rcu_assign_pointer(tp->md5sig_info, md5sig);
1352         return 0;
1353 }
1354
1355 /* This can be called on a newly created socket, from other files */
1356 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1357                             int family, u8 prefixlen, int l3index, u8 flags,
1358                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1359 {
1360         /* Add Key to the list */
1361         struct tcp_md5sig_key *key;
1362         struct tcp_sock *tp = tcp_sk(sk);
1363         struct tcp_md5sig_info *md5sig;
1364
1365         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1366         if (key) {
1367                 /* Pre-existing entry - just update that one.
1368                  * Note that the key might be used concurrently.
1369                  * data_race() is telling kcsan that we do not care of
1370                  * key mismatches, since changing MD5 key on live flows
1371                  * can lead to packet drops.
1372                  */
1373                 data_race(memcpy(key->key, newkey, newkeylen));
1374
1375                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1376                  * Also note that a reader could catch new key->keylen value
1377                  * but old key->key[], this is the reason we use __GFP_ZERO
1378                  * at sock_kmalloc() time below these lines.
1379                  */
1380                 WRITE_ONCE(key->keylen, newkeylen);
1381
1382                 return 0;
1383         }
1384
1385         md5sig = rcu_dereference_protected(tp->md5sig_info,
1386                                            lockdep_sock_is_held(sk));
1387
1388         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1389         if (!key)
1390                 return -ENOMEM;
1391
1392         memcpy(key->key, newkey, newkeylen);
1393         key->keylen = newkeylen;
1394         key->family = family;
1395         key->prefixlen = prefixlen;
1396         key->l3index = l3index;
1397         key->flags = flags;
1398         memcpy(&key->addr, addr,
1399                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1400                                                                  sizeof(struct in_addr));
1401         hlist_add_head_rcu(&key->node, &md5sig->head);
1402         return 0;
1403 }
1404
1405 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1406                    int family, u8 prefixlen, int l3index, u8 flags,
1407                    const u8 *newkey, u8 newkeylen)
1408 {
1409         struct tcp_sock *tp = tcp_sk(sk);
1410
1411         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1412                 if (tcp_md5_alloc_sigpool())
1413                         return -ENOMEM;
1414
1415                 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1416                         tcp_md5_release_sigpool();
1417                         return -ENOMEM;
1418                 }
1419
1420                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1421                         struct tcp_md5sig_info *md5sig;
1422
1423                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1424                         rcu_assign_pointer(tp->md5sig_info, NULL);
1425                         kfree_rcu(md5sig, rcu);
1426                         tcp_md5_release_sigpool();
1427                         return -EUSERS;
1428                 }
1429         }
1430
1431         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1432                                 newkey, newkeylen, GFP_KERNEL);
1433 }
1434 EXPORT_SYMBOL(tcp_md5_do_add);
1435
1436 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1437                      int family, u8 prefixlen, int l3index,
1438                      struct tcp_md5sig_key *key)
1439 {
1440         struct tcp_sock *tp = tcp_sk(sk);
1441
1442         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1443                 tcp_md5_add_sigpool();
1444
1445                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1446                         tcp_md5_release_sigpool();
1447                         return -ENOMEM;
1448                 }
1449
1450                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1451                         struct tcp_md5sig_info *md5sig;
1452
1453                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1454                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1455                         rcu_assign_pointer(tp->md5sig_info, NULL);
1456                         kfree_rcu(md5sig, rcu);
1457                         tcp_md5_release_sigpool();
1458                         return -EUSERS;
1459                 }
1460         }
1461
1462         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1463                                 key->flags, key->key, key->keylen,
1464                                 sk_gfp_mask(sk, GFP_ATOMIC));
1465 }
1466 EXPORT_SYMBOL(tcp_md5_key_copy);
1467
1468 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1469                    u8 prefixlen, int l3index, u8 flags)
1470 {
1471         struct tcp_md5sig_key *key;
1472
1473         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1474         if (!key)
1475                 return -ENOENT;
1476         hlist_del_rcu(&key->node);
1477         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1478         kfree_rcu(key, rcu);
1479         return 0;
1480 }
1481 EXPORT_SYMBOL(tcp_md5_do_del);
1482
1483 void tcp_clear_md5_list(struct sock *sk)
1484 {
1485         struct tcp_sock *tp = tcp_sk(sk);
1486         struct tcp_md5sig_key *key;
1487         struct hlist_node *n;
1488         struct tcp_md5sig_info *md5sig;
1489
1490         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1491
1492         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1493                 hlist_del_rcu(&key->node);
1494                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1495                 kfree_rcu(key, rcu);
1496         }
1497 }
1498
1499 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1500                                  sockptr_t optval, int optlen)
1501 {
1502         struct tcp_md5sig cmd;
1503         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1504         const union tcp_md5_addr *addr;
1505         u8 prefixlen = 32;
1506         int l3index = 0;
1507         bool l3flag;
1508         u8 flags;
1509
1510         if (optlen < sizeof(cmd))
1511                 return -EINVAL;
1512
1513         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1514                 return -EFAULT;
1515
1516         if (sin->sin_family != AF_INET)
1517                 return -EINVAL;
1518
1519         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520         l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1521
1522         if (optname == TCP_MD5SIG_EXT &&
1523             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1524                 prefixlen = cmd.tcpm_prefixlen;
1525                 if (prefixlen > 32)
1526                         return -EINVAL;
1527         }
1528
1529         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1530             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1531                 struct net_device *dev;
1532
1533                 rcu_read_lock();
1534                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1535                 if (dev && netif_is_l3_master(dev))
1536                         l3index = dev->ifindex;
1537
1538                 rcu_read_unlock();
1539
1540                 /* ok to reference set/not set outside of rcu;
1541                  * right now device MUST be an L3 master
1542                  */
1543                 if (!dev || !l3index)
1544                         return -EINVAL;
1545         }
1546
1547         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1548
1549         if (!cmd.tcpm_keylen)
1550                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1551
1552         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1553                 return -EINVAL;
1554
1555         /* Don't allow keys for peers that have a matching TCP-AO key.
1556          * See the comment in tcp_ao_add_cmd()
1557          */
1558         if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1559                 return -EKEYREJECTED;
1560
1561         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1562                               cmd.tcpm_key, cmd.tcpm_keylen);
1563 }
1564
1565 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1566                                    __be32 daddr, __be32 saddr,
1567                                    const struct tcphdr *th, int nbytes)
1568 {
1569         struct tcp4_pseudohdr *bp;
1570         struct scatterlist sg;
1571         struct tcphdr *_th;
1572
1573         bp = hp->scratch;
1574         bp->saddr = saddr;
1575         bp->daddr = daddr;
1576         bp->pad = 0;
1577         bp->protocol = IPPROTO_TCP;
1578         bp->len = cpu_to_be16(nbytes);
1579
1580         _th = (struct tcphdr *)(bp + 1);
1581         memcpy(_th, th, sizeof(*th));
1582         _th->check = 0;
1583
1584         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1585         ahash_request_set_crypt(hp->req, &sg, NULL,
1586                                 sizeof(*bp) + sizeof(*th));
1587         return crypto_ahash_update(hp->req);
1588 }
1589
1590 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1591                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1592 {
1593         struct tcp_sigpool hp;
1594
1595         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1596                 goto clear_hash_nostart;
1597
1598         if (crypto_ahash_init(hp.req))
1599                 goto clear_hash;
1600         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1601                 goto clear_hash;
1602         if (tcp_md5_hash_key(&hp, key))
1603                 goto clear_hash;
1604         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1605         if (crypto_ahash_final(hp.req))
1606                 goto clear_hash;
1607
1608         tcp_sigpool_end(&hp);
1609         return 0;
1610
1611 clear_hash:
1612         tcp_sigpool_end(&hp);
1613 clear_hash_nostart:
1614         memset(md5_hash, 0, 16);
1615         return 1;
1616 }
1617
1618 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1619                         const struct sock *sk,
1620                         const struct sk_buff *skb)
1621 {
1622         const struct tcphdr *th = tcp_hdr(skb);
1623         struct tcp_sigpool hp;
1624         __be32 saddr, daddr;
1625
1626         if (sk) { /* valid for establish/request sockets */
1627                 saddr = sk->sk_rcv_saddr;
1628                 daddr = sk->sk_daddr;
1629         } else {
1630                 const struct iphdr *iph = ip_hdr(skb);
1631                 saddr = iph->saddr;
1632                 daddr = iph->daddr;
1633         }
1634
1635         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1636                 goto clear_hash_nostart;
1637
1638         if (crypto_ahash_init(hp.req))
1639                 goto clear_hash;
1640
1641         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1642                 goto clear_hash;
1643         if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1644                 goto clear_hash;
1645         if (tcp_md5_hash_key(&hp, key))
1646                 goto clear_hash;
1647         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1648         if (crypto_ahash_final(hp.req))
1649                 goto clear_hash;
1650
1651         tcp_sigpool_end(&hp);
1652         return 0;
1653
1654 clear_hash:
1655         tcp_sigpool_end(&hp);
1656 clear_hash_nostart:
1657         memset(md5_hash, 0, 16);
1658         return 1;
1659 }
1660 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1661
1662 #endif
1663
1664 static void tcp_v4_init_req(struct request_sock *req,
1665                             const struct sock *sk_listener,
1666                             struct sk_buff *skb)
1667 {
1668         struct inet_request_sock *ireq = inet_rsk(req);
1669         struct net *net = sock_net(sk_listener);
1670
1671         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1672         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1673         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1674 }
1675
1676 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1677                                           struct sk_buff *skb,
1678                                           struct flowi *fl,
1679                                           struct request_sock *req,
1680                                           u32 tw_isn)
1681 {
1682         tcp_v4_init_req(req, sk, skb);
1683
1684         if (security_inet_conn_request(sk, skb, req))
1685                 return NULL;
1686
1687         return inet_csk_route_req(sk, &fl->u.ip4, req);
1688 }
1689
1690 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1691         .family         =       PF_INET,
1692         .obj_size       =       sizeof(struct tcp_request_sock),
1693         .rtx_syn_ack    =       tcp_rtx_synack,
1694         .send_ack       =       tcp_v4_reqsk_send_ack,
1695         .destructor     =       tcp_v4_reqsk_destructor,
1696         .send_reset     =       tcp_v4_send_reset,
1697         .syn_ack_timeout =      tcp_syn_ack_timeout,
1698 };
1699
1700 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1701         .mss_clamp      =       TCP_MSS_DEFAULT,
1702 #ifdef CONFIG_TCP_MD5SIG
1703         .req_md5_lookup =       tcp_v4_md5_lookup,
1704         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1705 #endif
1706 #ifdef CONFIG_TCP_AO
1707         .ao_lookup      =       tcp_v4_ao_lookup_rsk,
1708         .ao_calc_key    =       tcp_v4_ao_calc_key_rsk,
1709         .ao_synack_hash =       tcp_v4_ao_synack_hash,
1710 #endif
1711 #ifdef CONFIG_SYN_COOKIES
1712         .cookie_init_seq =      cookie_v4_init_sequence,
1713 #endif
1714         .route_req      =       tcp_v4_route_req,
1715         .init_seq       =       tcp_v4_init_seq,
1716         .init_ts_off    =       tcp_v4_init_ts_off,
1717         .send_synack    =       tcp_v4_send_synack,
1718 };
1719
1720 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1721 {
1722         /* Never answer to SYNs send to broadcast or multicast */
1723         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1724                 goto drop;
1725
1726         return tcp_conn_request(&tcp_request_sock_ops,
1727                                 &tcp_request_sock_ipv4_ops, sk, skb);
1728
1729 drop:
1730         tcp_listendrop(sk);
1731         return 0;
1732 }
1733 EXPORT_SYMBOL(tcp_v4_conn_request);
1734
1735
1736 /*
1737  * The three way handshake has completed - we got a valid synack -
1738  * now create the new socket.
      *
      * @sk:         listening socket the request was queued on
      * @skb:        segment being processed; its IP header supplies ttl/tos
      * @req:        request sock to turn into a full child socket
      * @dst:        route for the child, or NULL to have one looked up here
      * @req_unhash: passed to inet_ehash_nolisten() together with the child;
      *              NULL in the syncookie case (see the comment near the end)
      * @own_req:    set to the result of inet_ehash_nolisten()
      *
      * Returns the new child socket.  Returns NULL, after tcp_listendrop(sk),
      * when the accept queue is full, the child cannot be created or routed,
      * MD5/AO key copying fails or the port cannot be inherited.  Also returns
      * NULL (without a listen drop) when hashing found a duplicate socket and
      * @req_unhash is NULL.
1739  */
1740 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1741                                   struct request_sock *req,
1742                                   struct dst_entry *dst,
1743                                   struct request_sock *req_unhash,
1744                                   bool *own_req)
1745 {
1746         struct inet_request_sock *ireq;
1747         bool found_dup_sk = false;
1748         struct inet_sock *newinet;
1749         struct tcp_sock *newtp;
1750         struct sock *newsk;
1751 #ifdef CONFIG_TCP_MD5SIG
1752         const union tcp_md5_addr *addr;
1753         struct tcp_md5sig_key *key;
1754         int l3index;
1755 #endif
1756         struct ip_options_rcu *inet_opt;
1757
             /* Listener's accept queue is full: count the overflow and drop. */
1758         if (sk_acceptq_is_full(sk))
1759                 goto exit_overflow;
1760
1761         newsk = tcp_create_openreq_child(sk, req, skb);
1762         if (!newsk)
1763                 goto exit_nonewsk;
1764
1765         newsk->sk_gso_type = SKB_GSO_TCPV4;
1766         inet_sk_rx_dst_set(newsk, skb);
1767
             /* Seed the child's addressing, bound device and IP options from
              * the request sock, and a few per-packet values from the IP
              * header of skb.
              */
1768         newtp                 = tcp_sk(newsk);
1769         newinet               = inet_sk(newsk);
1770         ireq                  = inet_rsk(req);
1771         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1772         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1773         newsk->sk_bound_dev_if = ireq->ir_iif;
1774         newinet->inet_saddr   = ireq->ir_loc_addr;
1775         inet_opt              = rcu_dereference(ireq->ireq_opt);
1776         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1777         newinet->mc_index     = inet_iif(skb);
1778         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1779         newinet->rcv_tos      = ip_hdr(skb)->tos;
             /* Extension header length is the IP option length, if any. */
1780         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1781         if (inet_opt)
1782                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1783         atomic_set(&newinet->inet_id, get_random_u16());
1784
1785         /* Set ToS of the new socket based upon the value of incoming SYN.
1786          * ECT bits are set later in tcp_init_transfer().
1787          */
1788         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1789                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1790
             /* No route supplied by the caller: look one up for the child. */
1791         if (!dst) {
1792                 dst = inet_csk_route_child_sock(sk, newsk, req);
1793                 if (!dst)
1794                         goto put_and_exit;
1795         } else {
1796                 /* syncookie case : see end of cookie_v4_check() */
1797         }
1798         sk_setup_caps(newsk, dst);
1799
1800         tcp_ca_openreq_child(newsk, dst);
1801
             /* Derive MSS state from the route: sync to the path MTU and clamp
              * the advertised MSS using the listener's settings.
              */
1802         tcp_sync_mss(newsk, dst_mtu(dst));
1803         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1804
1805         tcp_initialize_rcv_mss(newsk);
1806
1807 #ifdef CONFIG_TCP_MD5SIG
1808         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1809         /* Copy over the MD5 key from the original socket */
1810         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1811         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
             /* The key is copied only if the request did not use TCP-AO; the
              * child then gets a /32 key for the peer and has GSO disabled.
              */
1812         if (key && !tcp_rsk_used_ao(req)) {
1813                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1814                         goto put_and_exit;
1815                 sk_gso_disable(newsk);
1816         }
1817 #endif
1818 #ifdef CONFIG_TCP_AO
1819         if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1820                 goto put_and_exit; /* OOM, release back memory */
1821 #endif
1822
             /* Inherit the listener's port binding, then hash the child;
              * *own_req reports the outcome of the hashing step.
              */
1823         if (__inet_inherit_port(sk, newsk) < 0)
1824                 goto put_and_exit;
1825         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1826                                        &found_dup_sk);
1827         if (likely(*own_req)) {
1828                 tcp_move_syn(newtp, req);
                     /* The child keeps the options pointer set above, so the
                      * request's copy of the pointer is cleared.
                      */
1829                 ireq->ireq_opt = NULL;
1830         } else {
                     /* Not hashed: the options pointer stays with the request
                      * and the child's copy is cleared instead.
                      */
1831                 newinet->inet_opt = NULL;
1832
1833                 if (!req_unhash && found_dup_sk) {
1834                         /* This code path should only be executed in the
1835                          * syncookie case only
1836                          */
1837                         bh_unlock_sock(newsk);
1838                         sock_put(newsk);
1839                         newsk = NULL;
1840                 }
1841         }
1842         return newsk;
1843
1844 exit_overflow:
1845         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1846 exit_nonewsk:
1847         dst_release(dst);
1848 exit:
1849         tcp_listendrop(sk);
1850         return NULL;
1851 put_and_exit:
             /* Clear the child's copy of the options pointer (still held in
              * ireq->ireq_opt) before the child is torn down.
              * NOTE(review): dst is not released on this path; it is either
              * NULL or was already passed to sk_setup_caps() - confirm that
              * the child owns it at that point.
              */
1852         newinet->inet_opt = NULL;
1853         inet_csk_prepare_forced_close(newsk);
1854         tcp_done(newsk);
1855         goto exit;
1856 }
1857 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1858
/*
 * With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned; a SYN segment, or a kernel
 * built without syncookies, simply gets @sk back.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (tcp_hdr(skb)->syn)
                return sk;

        return cookie_v4_check(sk, skb);
#else
        return sk;
#endif
}
1869
1870 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1871                          struct tcphdr *th, u32 *cookie)
1872 {
1873         u16 mss = 0;
1874 #ifdef CONFIG_SYN_COOKIES
1875         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1876                                     &tcp_request_sock_ipv4_ops, sk, th);
1877         if (mss) {
1878                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1879                 tcp_synq_overflow(sk);
1880         }
1881 #endif
1882         return mss;
1883 }
1884
1885 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1886                                                            u32));
1887 /* The socket must have its spinlock held when we get
1888  * here, unless it is a TCP_LISTEN socket.
1889  *
1890  * We have a potential double-lock case here, so even when
1891  * doing backlog processing we use the BH locking scheme.
1892  * This is because we cannot sleep with the original spinlock
1893  * held.
      *
      * Dispatch of one segment to socket @sk:
      *  - TCP_ESTABLISHED: revalidate the cached rx dst and call
      *    tcp_rcv_established() (no checksum test is done here first);
      *  - TCP_LISTEN: run tcp_v4_cookie_check() and, if it returns another
      *    socket, process the segment on that socket via tcp_child_process();
      *  - otherwise, and for a listener that got itself back from the cookie
      *    check: tcp_rcv_state_process().
      * A non-zero drop reason from the state machine triggers a reset and the
      * skb is dropped with that reason.  A bad checksum on the slow path bumps
      * the CSUMERRORS/INERRS counters and drops the skb.
      *
      * Always returns 0.
1894  */
1895 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1896 {
1897         enum skb_drop_reason reason;
1898         struct sock *rsk;
1899
1900         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1901                 struct dst_entry *dst;
1902
1903                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1904                                                 lockdep_sock_is_held(sk));
1905
1906                 sock_rps_save_rxhash(sk, skb);
1907                 sk_mark_napi_id(sk, skb);
                     /* Forget the cached rx dst if the segment arrived on a
                      * different interface or the dst fails its check.
                      */
1908                 if (dst) {
1909                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1910                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1911                                              dst, 0)) {
1912                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1913                                 dst_release(dst);
1914                         }
1915                 }
1916                 tcp_rcv_established(sk, skb);
1917                 return 0;
1918         }
1919
             /* Slow path: verify the checksum before the state machine runs. */
1920         if (tcp_checksum_complete(skb))
1921                 goto csum_err;
1922
1923         if (sk->sk_state == TCP_LISTEN) {
1924                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1925
                     /* No socket came back: stop without touching skb again. */
1926                 if (!nsk)
1927                         return 0;
                     /* A different socket came back: process the segment on it
                      * as a child of the listener; on failure the reset is
                      * sent for that socket.
                      */
1928                 if (nsk != sk) {
1929                         reason = tcp_child_process(sk, nsk, skb);
1930                         if (reason) {
1931                                 rsk = nsk;
1932                                 goto reset;
1933                         }
1934                         return 0;
1935                 }
1936         } else
1937                 sock_rps_save_rxhash(sk, skb);
1938
1939         reason = tcp_rcv_state_process(sk, skb);
1940         if (reason) {
1941                 rsk = sk;
1942                 goto reset;
1943         }
1944         return 0;
1945
             /* reset falls through into discard: the reset is sent for rsk,
              * while the drop below is always accounted against sk.
              */
1946 reset:
1947         tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1948 discard:
1949         sk_skb_reason_drop(sk, skb, reason);
1950         /* Be careful here. If this function gets more complicated and
1951          * gcc suffers from register pressure on the x86, sk (in %ebx)
1952          * might be destroyed here. This current version compiles correctly,
1953          * but you have been warned.
1954          */
1955         return 0;
1956
1957 csum_err:
1958         reason = SKB_DROP_REASON_TCP_CSUM;
1959         trace_tcp_bad_csum(skb);
1960         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1961         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1962         goto discard;
1963 }
1964 EXPORT_SYMBOL(tcp_v4_do_rcv);
1965
1966 int tcp_v4_early_demux(struct sk_buff *skb)
1967 {
1968         struct net *net = dev_net(skb->dev);
1969         const struct iphdr *iph;
1970         const struct tcphdr *th;
1971         struct sock *sk;
1972
1973         if (skb->pkt_type != PACKET_HOST)
1974                 return 0;
1975
1976         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1977                 return 0;
1978
1979         iph = ip_hdr(skb);
1980         th = tcp_hdr(skb);
1981
1982         if (th->doff < sizeof(struct tcphdr) / 4)
1983                 return 0;
1984
1985         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1986                                        iph->saddr, th->source,
1987                                        iph->daddr, ntohs(th->dest),
1988                                        skb->skb_iif, inet_sdif(skb));
1989         if (sk) {
1990                 skb->sk = sk;
1991                 skb->destructor = sock_edemux;
1992                 if (sk_fullsock(sk)) {
1993                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1994
1995                         if (dst)
1996                                 dst = dst_check(dst, 0);
1997                         if (dst &&
1998                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1999                                 skb_dst_set_noref(skb, dst);
2000                 }
2001         }
2002         return 0;
2003 }
2004
2005 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2006                      enum skb_drop_reason *reason)
2007 {
             /* Queue @skb on the backlog of @sk, first trying to merge it into
              * the last skb already queued there.
              *
              * Returns false when the skb was queued or coalesced.  Returns
              * true when it was refused (bad checksum or backlog over limit);
              * in that case the socket has already been unlocked with
              * bh_unlock_sock() and *@reason holds the drop reason.  The skb
              * is not freed here on the true path - presumably the caller
              * drops it (confirm against callers).
              */
2008         u32 tail_gso_size, tail_gso_segs;
2009         struct skb_shared_info *shinfo;
2010         const struct tcphdr *th;
2011         struct tcphdr *thtail;
2012         struct sk_buff *tail;
2013         unsigned int hdrlen;
2014         bool fragstolen;
2015         u32 gso_segs;
2016         u32 gso_size;
2017         u64 limit;
2018         int delta;
2019
2020         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2021          * we can fix skb->truesize to its real value to avoid future drops.
2022          * This is valid because skb is not yet charged to the socket.
2023          * It has been noticed pure SACK packets were sometimes dropped
2024          * (if cooked by drivers without copybreak feature).
2025          */
2026         skb_condense(skb);
2027
2028         skb_dst_drop(skb);
2029
2030         if (unlikely(tcp_checksum_complete(skb))) {
2031                 bh_unlock_sock(sk);
2032                 trace_tcp_bad_csum(skb);
2033                 *reason = SKB_DROP_REASON_TCP_CSUM;
2034                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2035                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2036                 return true;
2037         }
2038
2039         /* Attempt coalescing to last skb in backlog, even if we are
2040          * above the limits.
2041          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2042          */
2043         th = (const struct tcphdr *)skb->data;
2044         hdrlen = th->doff * 4;
2045
2046         tail = sk->sk_backlog.tail;
2047         if (!tail)
2048                 goto no_coalesce;
2049         thtail = (struct tcphdr *)tail->data;
2050
             /* Merge only when skb starts exactly where tail ends in sequence
              * space, both carry the same DS field, neither has SYN/RST/URG,
              * both have ACK, their ECE/CWR bits agree,
              * tcp_skb_can_collapse_rx() allows it, and the TCP headers have
              * the same length and identical bytes after the base header.
              */
2051         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2052             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2053             ((TCP_SKB_CB(tail)->tcp_flags |
2054               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2055             !((TCP_SKB_CB(tail)->tcp_flags &
2056               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2057             ((TCP_SKB_CB(tail)->tcp_flags ^
2058               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2059             !tcp_skb_can_collapse_rx(tail, skb) ||
2060             thtail->doff != th->doff ||
2061             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2062                 goto no_coalesce;
2063
             /* Pull the TCP header off skb before the merge attempt; it is
              * pushed back below if coalescing fails.
              */
2064         __skb_pull(skb, hdrlen);
2065
             /* Without GSO metadata an skb counts as one segment of its own
              * length.  skb has had its header pulled already; tail has not,
              * hence the hdrlen subtraction for tail.
              */
2066         shinfo = skb_shinfo(skb);
2067         gso_size = shinfo->gso_size ?: skb->len;
2068         gso_segs = shinfo->gso_segs ?: 1;
2069
2070         shinfo = skb_shinfo(tail);
2071         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2072         tail_gso_segs = shinfo->gso_segs ?: 1;
2073
2074         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2075                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2076
                     /* Take over skb's ACK number and window unless its ACK
                      * is older than the one tail already carries.
                      */
2077                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2078                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2079                         thtail->window = th->window;
2080                 }
2081
2082                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2083                  * thtail->fin, so that the fast path in tcp_rcv_established()
2084                  * is not entered if we append a packet with a FIN.
2085                  * SYN, RST, URG are not present.
2086                  * ACK is set on both packets.
2087                  * PSH : we do not really care in TCP stack,
2088                  *       at least for 'GRO' packets.
2089                  */
2090                 thtail->fin |= th->fin;
2091                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2092
                     /* Carry skb's receive timestamps over to tail. */
2093                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2094                         TCP_SKB_CB(tail)->has_rxtstamp = true;
2095                         tail->tstamp = skb->tstamp;
2096                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2097                 }
2098
2099                 /* Not as strict as GRO. We only need to carry mss max value */
                     /* shinfo refers to tail's shared info at this point;
                      * the segment count is capped at 0xFFFF.
                      */
2100                 shinfo->gso_size = max(gso_size, tail_gso_size);
2101                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2102
                     /* Account the size change reported by skb_try_coalesce(). */
2103                 sk->sk_backlog.len += delta;
2104                 __NET_INC_STATS(sock_net(sk),
2105                                 LINUX_MIB_TCPBACKLOGCOALESCE);
2106                 kfree_skb_partial(skb, fragstolen);
2107                 return false;
2108         }
             /* Coalescing failed: restore the header pulled above. */
2109         __skb_push(skb, hdrlen);
2110
2111 no_coalesce:
2112         /* sk->sk_backlog.len is reset only at the end of __release_sock().
2113          * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2114          * sk_rcvbuf in normal conditions.
2115          */
2116         limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2117
2118         limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2119
2120         /* Only socket owner can try to collapse/prune rx queues
2121          * to reduce memory overhead, so add a little headroom here.
2122          * Few sockets backlog are possibly concurrently non empty.
2123          */
2124         limit += 64 * 1024;
2125
             /* The sum is built in 64 bits and then capped at UINT_MAX before
              * it is handed to sk_add_backlog().
              */
2126         limit = min_t(u64, limit, UINT_MAX);
2127
2128         if (unlikely(sk_add_backlog(sk, skb, limit))) {
2129                 bh_unlock_sock(sk);
2130                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2131                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2132                 return true;
2133         }
2134         return false;
2135 }
2136 EXPORT_SYMBOL(tcp_add_backlog);
2137
2138 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2139 {
2140         struct tcphdr *th = (struct tcphdr *)skb->data;
2141
2142         return sk_filter_trim_cap(sk, skb, th->doff * 4);
2143 }
2144 EXPORT_SYMBOL(tcp_filter);
2145
2146 static void tcp_v4_restore_cb(struct sk_buff *skb)
2147 {
2148         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2149                 sizeof(struct inet_skb_parm));
2150 }
2151
2152 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2153                            const struct tcphdr *th)
2154 {
2155         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2156          * barrier() makes sure compiler wont play fool^Waliasing games.
2157          */
2158         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2159                 sizeof(struct inet_skb_parm));
2160         barrier();
2161
2162         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2163         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2164                                     skb->len - th->doff * 4);
2165         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2166         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2167         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2168         TCP_SKB_CB(skb)->sacked  = 0;
2169         TCP_SKB_CB(skb)->has_rxtstamp =
2170                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2171 }
2172
2173 /*
2174  *      From tcp_input.c
2175  */
2176
2177 int tcp_v4_rcv(struct sk_buff *skb)
2178 {
2179         struct net *net = dev_net(skb->dev);
2180         enum skb_drop_reason drop_reason;
2181         int sdif = inet_sdif(skb);
2182         int dif = inet_iif(skb);
2183         const struct iphdr *iph;
2184         const struct tcphdr *th;
2185         struct sock *sk = NULL;
2186         bool refcounted;
2187         int ret;
2188         u32 isn;
2189
2190         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2191         if (skb->pkt_type != PACKET_HOST)
2192                 goto discard_it;
2193
2194         /* Count it even if it's bad */
2195         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2196
2197         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2198                 goto discard_it;
2199
2200         th = (const struct tcphdr *)skb->data;
2201
2202         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2203                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2204                 goto bad_packet;
2205         }
2206         if (!pskb_may_pull(skb, th->doff * 4))
2207                 goto discard_it;
2208
2209         /* An explanation is required here, I think.
2210          * Packet length and doff are validated by header prediction,
2211          * provided case of th->doff==0 is eliminated.
2212          * So, we defer the checks. */
2213
2214         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2215                 goto csum_error;
2216
2217         th = (const struct tcphdr *)skb->data;
2218         iph = ip_hdr(skb);
2219 lookup:
2220         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2221                                skb, __tcp_hdrlen(th), th->source,
2222                                th->dest, sdif, &refcounted);
2223         if (!sk)
2224                 goto no_tcp_socket;
2225
2226         if (sk->sk_state == TCP_TIME_WAIT)
2227                 goto do_time_wait;
2228
2229         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2230                 struct request_sock *req = inet_reqsk(sk);
2231                 bool req_stolen = false;
2232                 struct sock *nsk;
2233
2234                 sk = req->rsk_listener;
2235                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2236                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2237                 else
2238                         drop_reason = tcp_inbound_hash(sk, req, skb,
2239                                                        &iph->saddr, &iph->daddr,
2240                                                        AF_INET, dif, sdif);
2241                 if (unlikely(drop_reason)) {
2242                         sk_drops_add(sk, skb);
2243                         reqsk_put(req);
2244                         goto discard_it;
2245                 }
2246                 if (tcp_checksum_complete(skb)) {
2247                         reqsk_put(req);
2248                         goto csum_error;
2249                 }
2250                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2251                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2252                         if (!nsk) {
2253                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2254                                 goto lookup;
2255                         }
2256                         sk = nsk;
2257                         /* reuseport_migrate_sock() has already held one sk_refcnt
2258                          * before returning.
2259                          */
2260                 } else {
2261                         /* We own a reference on the listener, increase it again
2262                          * as we might lose it too soon.
2263                          */
2264                         sock_hold(sk);
2265                 }
2266                 refcounted = true;
2267                 nsk = NULL;
2268                 if (!tcp_filter(sk, skb)) {
2269                         th = (const struct tcphdr *)skb->data;
2270                         iph = ip_hdr(skb);
2271                         tcp_v4_fill_cb(skb, iph, th);
2272                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2273                 } else {
2274                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2275                 }
2276                 if (!nsk) {
2277                         reqsk_put(req);
2278                         if (req_stolen) {
2279                                 /* Another cpu got exclusive access to req
2280                                  * and created a full blown socket.
2281                                  * Try to feed this packet to this socket
2282                                  * instead of discarding it.
2283                                  */
2284                                 tcp_v4_restore_cb(skb);
2285                                 sock_put(sk);
2286                                 goto lookup;
2287                         }
2288                         goto discard_and_relse;
2289                 }
2290                 nf_reset_ct(skb);
2291                 if (nsk == sk) {
2292                         reqsk_put(req);
2293                         tcp_v4_restore_cb(skb);
2294                 } else {
2295                         drop_reason = tcp_child_process(sk, nsk, skb);
2296                         if (drop_reason) {
2297                                 enum sk_rst_reason rst_reason;
2298
2299                                 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2300                                 tcp_v4_send_reset(nsk, skb, rst_reason);
2301                                 goto discard_and_relse;
2302                         }
2303                         sock_put(sk);
2304                         return 0;
2305                 }
2306         }
2307
2308 process:
2309         if (static_branch_unlikely(&ip4_min_ttl)) {
2310                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2311                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2312                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2313                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2314                         goto discard_and_relse;
2315                 }
2316         }
2317
2318         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2319                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2320                 goto discard_and_relse;
2321         }
2322
2323         drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2324                                        AF_INET, dif, sdif);
2325         if (drop_reason)
2326                 goto discard_and_relse;
2327
2328         nf_reset_ct(skb);
2329
2330         if (tcp_filter(sk, skb)) {
2331                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2332                 goto discard_and_relse;
2333         }
2334         th = (const struct tcphdr *)skb->data;
2335         iph = ip_hdr(skb);
2336         tcp_v4_fill_cb(skb, iph, th);
2337
2338         skb->dev = NULL;
2339
2340         if (sk->sk_state == TCP_LISTEN) {
2341                 ret = tcp_v4_do_rcv(sk, skb);
2342                 goto put_and_return;
2343         }
2344
2345         sk_incoming_cpu_update(sk);
2346
2347         bh_lock_sock_nested(sk);
2348         tcp_segs_in(tcp_sk(sk), skb);
2349         ret = 0;
2350         if (!sock_owned_by_user(sk)) {
2351                 ret = tcp_v4_do_rcv(sk, skb);
2352         } else {
2353                 if (tcp_add_backlog(sk, skb, &drop_reason))
2354                         goto discard_and_relse;
2355         }
2356         bh_unlock_sock(sk);
2357
2358 put_and_return:
2359         if (refcounted)
2360                 sock_put(sk);
2361
2362         return ret;
2363
2364 no_tcp_socket:
2365         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2366         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2367                 goto discard_it;
2368
2369         tcp_v4_fill_cb(skb, iph, th);
2370
2371         if (tcp_checksum_complete(skb)) {
2372 csum_error:
2373                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2374                 trace_tcp_bad_csum(skb);
2375                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2376 bad_packet:
2377                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2378         } else {
2379                 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2380         }
2381
2382 discard_it:
2383         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2384         /* Discard frame. */
2385         sk_skb_reason_drop(sk, skb, drop_reason);
2386         return 0;
2387
2388 discard_and_relse:
2389         sk_drops_add(sk, skb);
2390         if (refcounted)
2391                 sock_put(sk);
2392         goto discard_it;
2393
2394 do_time_wait:
2395         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2396                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2397                 inet_twsk_put(inet_twsk(sk));
2398                 goto discard_it;
2399         }
2400
2401         tcp_v4_fill_cb(skb, iph, th);
2402
2403         if (tcp_checksum_complete(skb)) {
2404                 inet_twsk_put(inet_twsk(sk));
2405                 goto csum_error;
2406         }
2407         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2408         case TCP_TW_SYN: {
2409                 struct sock *sk2 = inet_lookup_listener(net,
2410                                                         net->ipv4.tcp_death_row.hashinfo,
2411                                                         skb, __tcp_hdrlen(th),
2412                                                         iph->saddr, th->source,
2413                                                         iph->daddr, th->dest,
2414                                                         inet_iif(skb),
2415                                                         sdif);
2416                 if (sk2) {
2417                         inet_twsk_deschedule_put(inet_twsk(sk));
2418                         sk = sk2;
2419                         tcp_v4_restore_cb(skb);
2420                         refcounted = false;
2421                         __this_cpu_write(tcp_tw_isn, isn);
2422                         goto process;
2423                 }
2424         }
2425                 /* to ACK */
2426                 fallthrough;
2427         case TCP_TW_ACK:
2428                 tcp_v4_timewait_ack(sk, skb);
2429                 break;
2430         case TCP_TW_RST:
2431                 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2432                 inet_twsk_deschedule_put(inet_twsk(sk));
2433                 goto discard_it;
2434         case TCP_TW_SUCCESS:;
2435         }
2436         goto discard_it;
2437 }
2438
2439 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2440         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2441         .twsk_destructor= tcp_twsk_destructor,
2442 };
2443
2444 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2445 {
2446         struct dst_entry *dst = skb_dst(skb);
2447
2448         if (dst && dst_hold_safe(dst)) {
2449                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2450                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2451         }
2452 }
2453 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2454
2455 const struct inet_connection_sock_af_ops ipv4_specific = {
2456         .queue_xmit        = ip_queue_xmit,
2457         .send_check        = tcp_v4_send_check,
2458         .rebuild_header    = inet_sk_rebuild_header,
2459         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2460         .conn_request      = tcp_v4_conn_request,
2461         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2462         .net_header_len    = sizeof(struct iphdr),
2463         .setsockopt        = ip_setsockopt,
2464         .getsockopt        = ip_getsockopt,
2465         .addr2sockaddr     = inet_csk_addr2sockaddr,
2466         .sockaddr_len      = sizeof(struct sockaddr_in),
2467         .mtu_reduced       = tcp_v4_mtu_reduced,
2468 };
2469 EXPORT_SYMBOL(ipv4_specific);
2470
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 handlers for the TCP-AO and TCP-MD5 hooks.  Each group of members
 * is only populated when the matching config option is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_AO
        .ao_lookup              = tcp_v4_ao_lookup,
        .calc_ao_hash           = tcp_v4_ao_hash_skb,
        .ao_parse               = tcp_v4_parse_ao,
        .ao_calc_key_sk         = tcp_v4_ao_calc_key_sk,
#endif
#ifdef CONFIG_TCP_MD5SIG
        .md5_lookup             = tcp_v4_md5_lookup,
        .calc_md5_hash          = tcp_v4_md5_hash_skb,
        .md5_parse              = tcp_v4_parse_md5_keys,
#endif
};
#endif
2486
2487 /* NOTE: A lot of things set to zero explicitly by call to
2488  *       sk_alloc() so need not be done here.
2489  */
2490 static int tcp_v4_init_sock(struct sock *sk)
2491 {
2492         struct inet_connection_sock *icsk = inet_csk(sk);
2493
2494         tcp_init_sock(sk);
2495
2496         icsk->icsk_af_ops = &ipv4_specific;
2497
2498 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2499         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2500 #endif
2501
2502         return 0;
2503 }
2504
#ifdef CONFIG_TCP_MD5SIG
/* RCU callback queued by tcp_v4_destroy_sock(): frees the tcp_md5sig_info
 * that embeds @head, then drops the tcp_md5_needed static branch and
 * releases the MD5 sigpool.
 */
static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
{
        struct tcp_md5sig_info *info =
                container_of(head, struct tcp_md5sig_info, rcu);

        kfree(info);
        static_branch_slow_dec_deferred(&tcp_md5_needed);
        tcp_md5_release_sigpool();
}
#endif
2516
/* Walk every entry left in sk->sk_user_frags and hand it to
 * napi_pp_put_page(), warning once if a put fails.
 * Compiles to nothing without CONFIG_PAGE_POOL.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
        unsigned long idx;
        void *entry;

        xa_for_each(&sk->sk_user_frags, idx, entry) {
                netmem_ref frag = (__force netmem_ref)entry;

                WARN_ON_ONCE(!napi_pp_put_page(frag));
        }
#endif
}
2527
/* Tear down the TCP-specific state of @sk.
 *
 * Releases user frags, stops the transmit timers, detaches congestion
 * control and ULP, purges the write and out-of-order queues, frees the
 * MD5/AO key state, drops the bind bucket and fastopen leftovers, and
 * finally decrements the allocated-sockets accounting.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Put every entry still in sk_user_frags before the xarray
         * itself is destroyed.
         */
        tcp_release_user_frags(sk);

        xa_destroy(&sk->sk_user_frags);

        trace_tcp_destroy_sock(sk);

        tcp_clear_xmit_timers(sk);

        tcp_cleanup_congestion_control(sk);

        tcp_cleanup_ulp(sk);

        /* Clean up the write buffer. */
        tcp_write_queue_purge(sk);

        /* Check if we want to disable active TFO */
        tcp_fastopen_active_disable_ofo_check(sk);

        /* Cleans up our, hopefully empty, out_of_order_queue. */
        skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
        /* Clean up the MD5 key list, if any */
        if (tp->md5sig_info) {
                struct tcp_md5sig_info *md5sig;

                md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
                tcp_clear_md5_list(sk);
                /* The info structure itself is freed from an RCU
                 * callback; the socket's pointer is cleared right away.
                 */
                call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
                rcu_assign_pointer(tp->md5sig_info, NULL);
        }
#endif
        tcp_ao_destroy_sock(sk, false);

        /* Clean up a referenced TCP bind bucket. */
        if (inet_csk(sk)->icsk_bind_hash)
                inet_put_port(sk);

        /* A fastopen request sock must no longer be attached here. */
        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

        /* If socket is aborted during connect operation */
        tcp_free_fastopen_req(tp);
        tcp_fastopen_destroy_cipher(sk);
        tcp_saved_syn_free(tp);

        sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2580
2581 #ifdef CONFIG_PROC_FS
2582 /* Proc filesystem TCP sock list dumping. */
2583
2584 static unsigned short seq_file_family(const struct seq_file *seq);
2585
2586 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2587 {
2588         unsigned short family = seq_file_family(seq);
2589
2590         /* AF_UNSPEC is used as a match all */
2591         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2592                 net_eq(sock_net(sk), seq_file_net(seq)));
2593 }
2594
2595 /* Find a non empty bucket (starting from st->bucket)
2596  * and return the first sk from it.
2597  */
2598 static void *listening_get_first(struct seq_file *seq)
2599 {
2600         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2601         struct tcp_iter_state *st = seq->private;
2602
2603         st->offset = 0;
2604         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2605                 struct inet_listen_hashbucket *ilb2;
2606                 struct hlist_nulls_node *node;
2607                 struct sock *sk;
2608
2609                 ilb2 = &hinfo->lhash2[st->bucket];
2610                 if (hlist_nulls_empty(&ilb2->nulls_head))
2611                         continue;
2612
2613                 spin_lock(&ilb2->lock);
2614                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2615                         if (seq_sk_match(seq, sk))
2616                                 return sk;
2617                 }
2618                 spin_unlock(&ilb2->lock);
2619         }
2620
2621         return NULL;
2622 }
2623
2624 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2625  * If "cur" is the last one in the st->bucket,
2626  * call listening_get_first() to return the first sk of the next
2627  * non empty bucket.
2628  */
2629 static void *listening_get_next(struct seq_file *seq, void *cur)
2630 {
2631         struct tcp_iter_state *st = seq->private;
2632         struct inet_listen_hashbucket *ilb2;
2633         struct hlist_nulls_node *node;
2634         struct inet_hashinfo *hinfo;
2635         struct sock *sk = cur;
2636
2637         ++st->num;
2638         ++st->offset;
2639
2640         sk = sk_nulls_next(sk);
2641         sk_nulls_for_each_from(sk, node) {
2642                 if (seq_sk_match(seq, sk))
2643                         return sk;
2644         }
2645
2646         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2647         ilb2 = &hinfo->lhash2[st->bucket];
2648         spin_unlock(&ilb2->lock);
2649         ++st->bucket;
2650         return listening_get_first(seq);
2651 }
2652
2653 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2654 {
2655         struct tcp_iter_state *st = seq->private;
2656         void *rc;
2657
2658         st->bucket = 0;
2659         st->offset = 0;
2660         rc = listening_get_first(seq);
2661
2662         while (rc && *pos) {
2663                 rc = listening_get_next(seq, rc);
2664                 --*pos;
2665         }
2666         return rc;
2667 }
2668
2669 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2670                                 const struct tcp_iter_state *st)
2671 {
2672         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2673 }
2674
2675 /*
2676  * Get first established socket starting from bucket given in st->bucket.
2677  * If st->bucket is zero, the very first socket in the hash is returned.
2678  */
2679 static void *established_get_first(struct seq_file *seq)
2680 {
2681         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2682         struct tcp_iter_state *st = seq->private;
2683
2684         st->offset = 0;
2685         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2686                 struct sock *sk;
2687                 struct hlist_nulls_node *node;
2688                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2689
2690                 cond_resched();
2691
2692                 /* Lockless fast path for the common case of empty buckets */
2693                 if (empty_bucket(hinfo, st))
2694                         continue;
2695
2696                 spin_lock_bh(lock);
2697                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2698                         if (seq_sk_match(seq, sk))
2699                                 return sk;
2700                 }
2701                 spin_unlock_bh(lock);
2702         }
2703
2704         return NULL;
2705 }
2706
2707 static void *established_get_next(struct seq_file *seq, void *cur)
2708 {
2709         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2710         struct tcp_iter_state *st = seq->private;
2711         struct hlist_nulls_node *node;
2712         struct sock *sk = cur;
2713
2714         ++st->num;
2715         ++st->offset;
2716
2717         sk = sk_nulls_next(sk);
2718
2719         sk_nulls_for_each_from(sk, node) {
2720                 if (seq_sk_match(seq, sk))
2721                         return sk;
2722         }
2723
2724         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2725         ++st->bucket;
2726         return established_get_first(seq);
2727 }
2728
2729 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2730 {
2731         struct tcp_iter_state *st = seq->private;
2732         void *rc;
2733
2734         st->bucket = 0;
2735         rc = established_get_first(seq);
2736
2737         while (rc && pos) {
2738                 rc = established_get_next(seq, rc);
2739                 --pos;
2740         }
2741         return rc;
2742 }
2743
2744 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2745 {
2746         void *rc;
2747         struct tcp_iter_state *st = seq->private;
2748
2749         st->state = TCP_SEQ_STATE_LISTENING;
2750         rc        = listening_get_idx(seq, &pos);
2751
2752         if (!rc) {
2753                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2754                 rc        = established_get_idx(seq, pos);
2755         }
2756
2757         return rc;
2758 }
2759
/* Try to resume the walk at the position recorded in the iterator state
 * (st->state, st->bucket, st->offset) instead of restarting from the
 * beginning of the hash tables.
 *
 * Returns the socket found, or NULL when nothing is left to resume at.
 * st->num is restored to its value on entry before returning.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct tcp_iter_state *st = seq->private;
        /* Snapshot the saved position; the *_get_first()/*_get_next()
         * helpers below modify st->bucket, st->offset and st->num.
         */
        int bucket = st->bucket;
        int offset = st->offset;
        int orig_num = st->num;
        void *rc = NULL;

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
                if (st->bucket > hinfo->lhash2_mask)
                        break;
                rc = listening_get_first(seq);
                /* Skip up to "offset" sockets, but stop as soon as the
                 * walk leaves the bucket it started in.
                 */
                while (offset-- && rc && bucket == st->bucket)
                        rc = listening_get_next(seq, rc);
                if (rc)
                        break;
                /* Listening hash exhausted: continue with the
                 * established hash from its first bucket.
                 */
                st->bucket = 0;
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                fallthrough;
        case TCP_SEQ_STATE_ESTABLISHED:
                if (st->bucket > hinfo->ehash_mask)
                        break;
                rc = established_get_first(seq);
                while (offset-- && rc && bucket == st->bucket)
                        rc = established_get_next(seq, rc);
        }

        /* Undo the st->num increments done by the *_get_next() calls. */
        st->num = orig_num;

        return rc;
}
2793
2794 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2795 {
2796         struct tcp_iter_state *st = seq->private;
2797         void *rc;
2798
2799         if (*pos && *pos == st->last_pos) {
2800                 rc = tcp_seek_last_pos(seq);
2801                 if (rc)
2802                         goto out;
2803         }
2804
2805         st->state = TCP_SEQ_STATE_LISTENING;
2806         st->num = 0;
2807         st->bucket = 0;
2808         st->offset = 0;
2809         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2810
2811 out:
2812         st->last_pos = *pos;
2813         return rc;
2814 }
2815 EXPORT_SYMBOL(tcp_seq_start);
2816
2817 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2818 {
2819         struct tcp_iter_state *st = seq->private;
2820         void *rc = NULL;
2821
2822         if (v == SEQ_START_TOKEN) {
2823                 rc = tcp_get_idx(seq, 0);
2824                 goto out;
2825         }
2826
2827         switch (st->state) {
2828         case TCP_SEQ_STATE_LISTENING:
2829                 rc = listening_get_next(seq, v);
2830                 if (!rc) {
2831                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2832                         st->bucket = 0;
2833                         st->offset = 0;
2834                         rc        = established_get_first(seq);
2835                 }
2836                 break;
2837         case TCP_SEQ_STATE_ESTABLISHED:
2838                 rc = established_get_next(seq, v);
2839                 break;
2840         }
2841 out:
2842         ++*pos;
2843         st->last_pos = *pos;
2844         return rc;
2845 }
2846 EXPORT_SYMBOL(tcp_seq_next);
2847
2848 void tcp_seq_stop(struct seq_file *seq, void *v)
2849 {
2850         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2851         struct tcp_iter_state *st = seq->private;
2852
2853         switch (st->state) {
2854         case TCP_SEQ_STATE_LISTENING:
2855                 if (v != SEQ_START_TOKEN)
2856                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2857                 break;
2858         case TCP_SEQ_STATE_ESTABLISHED:
2859                 if (v)
2860                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2861                 break;
2862         }
2863 }
2864 EXPORT_SYMBOL(tcp_seq_stop);
2865
2866 static void get_openreq4(const struct request_sock *req,
2867                          struct seq_file *f, int i)
2868 {
2869         const struct inet_request_sock *ireq = inet_rsk(req);
2870         long delta = req->rsk_timer.expires - jiffies;
2871
2872         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2873                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2874                 i,
2875                 ireq->ir_loc_addr,
2876                 ireq->ir_num,
2877                 ireq->ir_rmt_addr,
2878                 ntohs(ireq->ir_rmt_port),
2879                 TCP_SYN_RECV,
2880                 0, 0, /* could print option size, but that is af dependent. */
2881                 1,    /* timers active (only the expire timer) */
2882                 jiffies_delta_to_clock_t(delta),
2883                 req->num_timeout,
2884                 from_kuid_munged(seq_user_ns(f),
2885                                  sock_i_uid(req->rsk_listener)),
2886                 0,  /* non standard timer */
2887                 0, /* open_requests have no inode */
2888                 0,
2889                 req);
2890 }
2891
2892 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2893 {
2894         int timer_active;
2895         unsigned long timer_expires;
2896         const struct tcp_sock *tp = tcp_sk(sk);
2897         const struct inet_connection_sock *icsk = inet_csk(sk);
2898         const struct inet_sock *inet = inet_sk(sk);
2899         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2900         __be32 dest = inet->inet_daddr;
2901         __be32 src = inet->inet_rcv_saddr;
2902         __u16 destp = ntohs(inet->inet_dport);
2903         __u16 srcp = ntohs(inet->inet_sport);
2904         u8 icsk_pending;
2905         int rx_queue;
2906         int state;
2907
2908         icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2909         if (icsk_pending == ICSK_TIME_RETRANS ||
2910             icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2911             icsk_pending == ICSK_TIME_LOSS_PROBE) {
2912                 timer_active    = 1;
2913                 timer_expires   = icsk->icsk_timeout;
2914         } else if (icsk_pending == ICSK_TIME_PROBE0) {
2915                 timer_active    = 4;
2916                 timer_expires   = icsk->icsk_timeout;
2917         } else if (timer_pending(&sk->sk_timer)) {
2918                 timer_active    = 2;
2919                 timer_expires   = sk->sk_timer.expires;
2920         } else {
2921                 timer_active    = 0;
2922                 timer_expires = jiffies;
2923         }
2924
2925         state = inet_sk_state_load(sk);
2926         if (state == TCP_LISTEN)
2927                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2928         else
2929                 /* Because we don't lock the socket,
2930                  * we might find a transient negative value.
2931                  */
2932                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2933                                       READ_ONCE(tp->copied_seq), 0);
2934
2935         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2936                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2937                 i, src, srcp, dest, destp, state,
2938                 READ_ONCE(tp->write_seq) - tp->snd_una,
2939                 rx_queue,
2940                 timer_active,
2941                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2942                 icsk->icsk_retransmits,
2943                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2944                 icsk->icsk_probes_out,
2945                 sock_i_ino(sk),
2946                 refcount_read(&sk->sk_refcnt), sk,
2947                 jiffies_to_clock_t(icsk->icsk_rto),
2948                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2949                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2950                 tcp_snd_cwnd(tp),
2951                 state == TCP_LISTEN ?
2952                     fastopenq->max_qlen :
2953                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2954 }
2955
2956 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2957                                struct seq_file *f, int i)
2958 {
2959         long delta = tw->tw_timer.expires - jiffies;
2960         __be32 dest, src;
2961         __u16 destp, srcp;
2962
2963         dest  = tw->tw_daddr;
2964         src   = tw->tw_rcv_saddr;
2965         destp = ntohs(tw->tw_dport);
2966         srcp  = ntohs(tw->tw_sport);
2967
2968         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2969                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2970                 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2971                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2972                 refcount_read(&tw->tw_refcnt), tw);
2973 }
2974
2975 #define TMPSZ 150
2976
2977 static int tcp4_seq_show(struct seq_file *seq, void *v)
2978 {
2979         struct tcp_iter_state *st;
2980         struct sock *sk = v;
2981
2982         seq_setwidth(seq, TMPSZ - 1);
2983         if (v == SEQ_START_TOKEN) {
2984                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2985                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2986                            "inode");
2987                 goto out;
2988         }
2989         st = seq->private;
2990
2991         if (sk->sk_state == TCP_TIME_WAIT)
2992                 get_timewait4_sock(v, seq, st->num);
2993         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2994                 get_openreq4(v, seq, st->num);
2995         else
2996                 get_tcp4_sock(v, seq, st->num);
2997 out:
2998         seq_pad(seq, '\n');
2999         return 0;
3000 }
3001
3002 #ifdef CONFIG_BPF_SYSCALL
/* Private seq_file state of the BPF TCP iterator: the generic iterator
 * state plus a batch of sockets collected from one hash bucket.
 */
struct bpf_tcp_iter_state {
        /* Kept first: seq->private is also read as a struct tcp_iter_state
         * by the shared helpers such as tcp_seek_last_pos().
         */
        struct tcp_iter_state state;
        unsigned int cur_sk;    /* index in batch[] of the current socket */
        unsigned int end_sk;    /* number of filled entries in batch[] */
        unsigned int max_sk;    /* capacity of batch[] */
        struct sock **batch;    /* sockets of the bucket, each with a held ref */
        bool st_bucket_done;    /* all matching sockets of the bucket fit in batch[] */
};
3011
/* Context handed to the BPF program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);             /* iterator metadata */
        __bpf_md_ptr(struct sock_common *, sk_common);          /* socket being visited */
        uid_t uid __aligned(8);                                 /* uid passed in by the caller */
};
3017
3018 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3019                              struct sock_common *sk_common, uid_t uid)
3020 {
3021         struct bpf_iter__tcp ctx;
3022
3023         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3024         ctx.meta = meta;
3025         ctx.sk_common = sk_common;
3026         ctx.uid = uid;
3027         return bpf_iter_run_prog(prog, &ctx);
3028 }
3029
3030 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3031 {
3032         while (iter->cur_sk < iter->end_sk)
3033                 sock_gen_put(iter->batch[iter->cur_sk++]);
3034 }
3035
3036 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3037                                       unsigned int new_batch_sz)
3038 {
3039         struct sock **new_batch;
3040
3041         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3042                              GFP_USER | __GFP_NOWARN);
3043         if (!new_batch)
3044                 return -ENOMEM;
3045
3046         bpf_iter_tcp_put_batch(iter);
3047         kvfree(iter->batch);
3048         iter->batch = new_batch;
3049         iter->max_sk = new_batch_sz;
3050
3051         return 0;
3052 }
3053
3054 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3055                                                  struct sock *start_sk)
3056 {
3057         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3058         struct bpf_tcp_iter_state *iter = seq->private;
3059         struct tcp_iter_state *st = &iter->state;
3060         struct hlist_nulls_node *node;
3061         unsigned int expected = 1;
3062         struct sock *sk;
3063
3064         sock_hold(start_sk);
3065         iter->batch[iter->end_sk++] = start_sk;
3066
3067         sk = sk_nulls_next(start_sk);
3068         sk_nulls_for_each_from(sk, node) {
3069                 if (seq_sk_match(seq, sk)) {
3070                         if (iter->end_sk < iter->max_sk) {
3071                                 sock_hold(sk);
3072                                 iter->batch[iter->end_sk++] = sk;
3073                         }
3074                         expected++;
3075                 }
3076         }
3077         spin_unlock(&hinfo->lhash2[st->bucket].lock);
3078
3079         return expected;
3080 }
3081
3082 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3083                                                    struct sock *start_sk)
3084 {
3085         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3086         struct bpf_tcp_iter_state *iter = seq->private;
3087         struct tcp_iter_state *st = &iter->state;
3088         struct hlist_nulls_node *node;
3089         unsigned int expected = 1;
3090         struct sock *sk;
3091
3092         sock_hold(start_sk);
3093         iter->batch[iter->end_sk++] = start_sk;
3094
3095         sk = sk_nulls_next(start_sk);
3096         sk_nulls_for_each_from(sk, node) {
3097                 if (seq_sk_match(seq, sk)) {
3098                         if (iter->end_sk < iter->max_sk) {
3099                                 sock_hold(sk);
3100                                 iter->batch[iter->end_sk++] = sk;
3101                         }
3102                         expected++;
3103                 }
3104         }
3105         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3106
3107         return expected;
3108 }
3109
/* Fill iter->batch with the sockets of the next bucket to visit and
 * return the first one, or NULL when the iteration is complete.
 *
 * If the batch array turns out to be too small for the bucket, it is
 * grown once and the bucket is collected again.  When that is not
 * possible the partial batch is returned with st_bucket_done left false.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct bpf_tcp_iter_state *iter = seq->private;
        struct tcp_iter_state *st = &iter->state;
        unsigned int expected;
        bool resized = false;
        struct sock *sk;

        /* The st->bucket is done.  Directly advance to the next
         * bucket instead of having the tcp_seek_last_pos() to skip
         * one by one in the current bucket and eventually find out
         * it has to advance to the next bucket.
         */
        if (iter->st_bucket_done) {
                st->offset = 0;
                st->bucket++;
                /* Past the last listening bucket: move on to the
                 * established hash.
                 */
                if (st->state == TCP_SEQ_STATE_LISTENING &&
                    st->bucket > hinfo->lhash2_mask) {
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        st->bucket = 0;
                }
        }

again:
        /* Get a new batch */
        iter->cur_sk = 0;
        iter->end_sk = 0;
        iter->st_bucket_done = false;

        sk = tcp_seek_last_pos(seq);
        if (!sk)
                return NULL; /* Done */

        /* Both helpers release the bucket lock that tcp_seek_last_pos()
         * left held and return how many sockets matched in the bucket.
         */
        if (st->state == TCP_SEQ_STATE_LISTENING)
                expected = bpf_iter_tcp_listening_batch(seq, sk);
        else
                expected = bpf_iter_tcp_established_batch(seq, sk);

        /* Every matching socket made it into the batch. */
        if (iter->end_sk == expected) {
                iter->st_bucket_done = true;
                return sk;
        }

        /* Batch too small: grow it to 1.5x the number of matches and
         * retry, at most once.  A successful realloc also drops the
         * references taken for the partial batch.
         */
        if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
                resized = true;
                goto again;
        }

        return sk;
}
3161
3162 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3163 {
3164         /* bpf iter does not support lseek, so it always
3165          * continue from where it was stop()-ped.
3166          */
3167         if (*pos)
3168                 return bpf_iter_tcp_batch(seq);
3169
3170         return SEQ_START_TOKEN;
3171 }
3172
3173 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3174 {
3175         struct bpf_tcp_iter_state *iter = seq->private;
3176         struct tcp_iter_state *st = &iter->state;
3177         struct sock *sk;
3178
3179         /* Whenever seq_next() is called, the iter->cur_sk is
3180          * done with seq_show(), so advance to the next sk in
3181          * the batch.
3182          */
3183         if (iter->cur_sk < iter->end_sk) {
3184                 /* Keeping st->num consistent in tcp_iter_state.
3185                  * bpf_iter_tcp does not use st->num.
3186                  * meta.seq_num is used instead.
3187                  */
3188                 st->num++;
3189                 /* Move st->offset to the next sk in the bucket such that
3190                  * the future start() will resume at st->offset in
3191                  * st->bucket.  See tcp_seek_last_pos().
3192                  */
3193                 st->offset++;
3194                 sock_gen_put(iter->batch[iter->cur_sk++]);
3195         }
3196
3197         if (iter->cur_sk < iter->end_sk)
3198                 sk = iter->batch[iter->cur_sk];
3199         else
3200                 sk = bpf_iter_tcp_batch(seq);
3201
3202         ++*pos;
3203         /* Keeping st->last_pos consistent in tcp_iter_state.
3204          * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3205          */
3206         st->last_pos = *pos;
3207         return sk;
3208 }
3209
3210 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3211 {
3212         struct bpf_iter_meta meta;
3213         struct bpf_prog *prog;
3214         struct sock *sk = v;
3215         uid_t uid;
3216         int ret;
3217
3218         if (v == SEQ_START_TOKEN)
3219                 return 0;
3220
3221         if (sk_fullsock(sk))
3222                 lock_sock(sk);
3223
3224         if (unlikely(sk_unhashed(sk))) {
3225                 ret = SEQ_SKIP;
3226                 goto unlock;
3227         }
3228
3229         if (sk->sk_state == TCP_TIME_WAIT) {
3230                 uid = 0;
3231         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3232                 const struct request_sock *req = v;
3233
3234                 uid = from_kuid_munged(seq_user_ns(seq),
3235                                        sock_i_uid(req->rsk_listener));
3236         } else {
3237                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3238         }
3239
3240         meta.seq = seq;
3241         prog = bpf_iter_get_info(&meta, false);
3242         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3243
3244 unlock:
3245         if (sk_fullsock(sk))
3246                 release_sock(sk);
3247         return ret;
3248
3249 }
3250
3251 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3252 {
3253         struct bpf_tcp_iter_state *iter = seq->private;
3254         struct bpf_iter_meta meta;
3255         struct bpf_prog *prog;
3256
3257         if (!v) {
3258                 meta.seq = seq;
3259                 prog = bpf_iter_get_info(&meta, true);
3260                 if (prog)
3261                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3262         }
3263
3264         if (iter->cur_sk < iter->end_sk) {
3265                 bpf_iter_tcp_put_batch(iter);
3266                 iter->st_bucket_done = false;
3267         }
3268 }
3269
3270 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3271         .show           = bpf_iter_tcp_seq_show,
3272         .start          = bpf_iter_tcp_seq_start,
3273         .next           = bpf_iter_tcp_seq_next,
3274         .stop           = bpf_iter_tcp_seq_stop,
3275 };
3276 #endif
3277 static unsigned short seq_file_family(const struct seq_file *seq)
3278 {
3279         const struct tcp_seq_afinfo *afinfo;
3280
3281 #ifdef CONFIG_BPF_SYSCALL
3282         /* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3283         if (seq->op == &bpf_iter_tcp_seq_ops)
3284                 return AF_UNSPEC;
3285 #endif
3286
3287         /* Iterated from proc fs */
3288         afinfo = pde_data(file_inode(seq->file));
3289         return afinfo->family;
3290 }
3291
3292 static const struct seq_operations tcp4_seq_ops = {
3293         .show           = tcp4_seq_show,
3294         .start          = tcp_seq_start,
3295         .next           = tcp_seq_next,
3296         .stop           = tcp_seq_stop,
3297 };
3298
3299 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3300         .family         = AF_INET,
3301 };
3302
3303 static int __net_init tcp4_proc_init_net(struct net *net)
3304 {
3305         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3306                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3307                 return -ENOMEM;
3308         return 0;
3309 }
3310
3311 static void __net_exit tcp4_proc_exit_net(struct net *net)
3312 {
3313         remove_proc_entry("tcp", net->proc_net);
3314 }
3315
3316 static struct pernet_operations tcp4_net_ops = {
3317         .init = tcp4_proc_init_net,
3318         .exit = tcp4_proc_exit_net,
3319 };
3320
3321 int __init tcp4_proc_init(void)
3322 {
3323         return register_pernet_subsys(&tcp4_net_ops);
3324 }
3325
3326 void tcp4_proc_exit(void)
3327 {
3328         unregister_pernet_subsys(&tcp4_net_ops);
3329 }
3330 #endif /* CONFIG_PROC_FS */
3331
3332 /* @wake is one when sk_stream_write_space() calls us.
3333  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3334  * This mimics the strategy used in sock_def_write_space().
3335  */
3336 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3337 {
3338         const struct tcp_sock *tp = tcp_sk(sk);
3339         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3340                             READ_ONCE(tp->snd_nxt);
3341
3342         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3343 }
3344 EXPORT_SYMBOL(tcp_stream_memory_free);
3345
3346 struct proto tcp_prot = {
3347         .name                   = "TCP",
3348         .owner                  = THIS_MODULE,
3349         .close                  = tcp_close,
3350         .pre_connect            = tcp_v4_pre_connect,
3351         .connect                = tcp_v4_connect,
3352         .disconnect             = tcp_disconnect,
3353         .accept                 = inet_csk_accept,
3354         .ioctl                  = tcp_ioctl,
3355         .init                   = tcp_v4_init_sock,
3356         .destroy                = tcp_v4_destroy_sock,
3357         .shutdown               = tcp_shutdown,
3358         .setsockopt             = tcp_setsockopt,
3359         .getsockopt             = tcp_getsockopt,
3360         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3361         .keepalive              = tcp_set_keepalive,
3362         .recvmsg                = tcp_recvmsg,
3363         .sendmsg                = tcp_sendmsg,
3364         .splice_eof             = tcp_splice_eof,
3365         .backlog_rcv            = tcp_v4_do_rcv,
3366         .release_cb             = tcp_release_cb,
3367         .hash                   = inet_hash,
3368         .unhash                 = inet_unhash,
3369         .get_port               = inet_csk_get_port,
3370         .put_port               = inet_put_port,
3371 #ifdef CONFIG_BPF_SYSCALL
3372         .psock_update_sk_prot   = tcp_bpf_update_proto,
3373 #endif
3374         .enter_memory_pressure  = tcp_enter_memory_pressure,
3375         .leave_memory_pressure  = tcp_leave_memory_pressure,
3376         .stream_memory_free     = tcp_stream_memory_free,
3377         .sockets_allocated      = &tcp_sockets_allocated,
3378         .orphan_count           = &tcp_orphan_count,
3379
3380         .memory_allocated       = &tcp_memory_allocated,
3381         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3382
3383         .memory_pressure        = &tcp_memory_pressure,
3384         .sysctl_mem             = sysctl_tcp_mem,
3385         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3386         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3387         .max_header             = MAX_TCP_HEADER,
3388         .obj_size               = sizeof(struct tcp_sock),
3389         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3390         .twsk_prot              = &tcp_timewait_sock_ops,
3391         .rsk_prot               = &tcp_request_sock_ops,
3392         .h.hashinfo             = NULL,
3393         .no_autobind            = true,
3394         .diag_destroy           = tcp_abort,
3395 };
3396 EXPORT_SYMBOL(tcp_prot);
3397
3398 static void __net_exit tcp_sk_exit(struct net *net)
3399 {
3400         if (net->ipv4.tcp_congestion_control)
3401                 bpf_module_put(net->ipv4.tcp_congestion_control,
3402                                net->ipv4.tcp_congestion_control->owner);
3403 }
3404
3405 static void __net_init tcp_set_hashinfo(struct net *net)
3406 {
3407         struct inet_hashinfo *hinfo;
3408         unsigned int ehash_entries;
3409         struct net *old_net;
3410
3411         if (net_eq(net, &init_net))
3412                 goto fallback;
3413
3414         old_net = current->nsproxy->net_ns;
3415         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3416         if (!ehash_entries)
3417                 goto fallback;
3418
3419         ehash_entries = roundup_pow_of_two(ehash_entries);
3420         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3421         if (!hinfo) {
3422                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3423                         "for a netns, fallback to the global one\n",
3424                         ehash_entries);
3425 fallback:
3426                 hinfo = &tcp_hashinfo;
3427                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3428         }
3429
3430         net->ipv4.tcp_death_row.hashinfo = hinfo;
3431         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3432         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3433 }
3434
3435 static int __net_init tcp_sk_init(struct net *net)
3436 {
3437         net->ipv4.sysctl_tcp_ecn = 2;
3438         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3439
3440         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3441         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3442         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3443         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3444         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3445
3446         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3447         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3448         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3449
3450         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3451         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3452         net->ipv4.sysctl_tcp_syncookies = 1;
3453         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3454         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3455         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3456         net->ipv4.sysctl_tcp_orphan_retries = 0;
3457         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3458         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3459         net->ipv4.sysctl_tcp_tw_reuse = 2;
3460         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3461
3462         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3463         tcp_set_hashinfo(net);
3464
3465         net->ipv4.sysctl_tcp_sack = 1;
3466         net->ipv4.sysctl_tcp_window_scaling = 1;
3467         net->ipv4.sysctl_tcp_timestamps = 1;
3468         net->ipv4.sysctl_tcp_early_retrans = 3;
3469         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3470         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3471         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3472         net->ipv4.sysctl_tcp_max_reordering = 300;
3473         net->ipv4.sysctl_tcp_dsack = 1;
3474         net->ipv4.sysctl_tcp_app_win = 31;
3475         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3476         net->ipv4.sysctl_tcp_frto = 2;
3477         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3478         /* This limits the percentage of the congestion window which we
3479          * will allow a single TSO frame to consume.  Building TSO frames
3480          * which are too large can cause TCP streams to be bursty.
3481          */
3482         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3483         /* Default TSQ limit of 16 TSO segments */
3484         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3485
3486         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3487         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3488
3489         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3490         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3491         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3492         net->ipv4.sysctl_tcp_autocorking = 1;
3493         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3494         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3495         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3496         if (net != &init_net) {
3497                 memcpy(net->ipv4.sysctl_tcp_rmem,
3498                        init_net.ipv4.sysctl_tcp_rmem,
3499                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3500                 memcpy(net->ipv4.sysctl_tcp_wmem,
3501                        init_net.ipv4.sysctl_tcp_wmem,
3502                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3503         }
3504         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3505         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3506         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3507         net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3508         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3509         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3510         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3511
3512         /* Set default values for PLB */
3513         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3514         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3515         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3516         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3517         /* Default congestion threshold for PLB to mark a round is 50% */
3518         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3519
3520         /* Reno is always built in */
3521         if (!net_eq(net, &init_net) &&
3522             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3523                                init_net.ipv4.tcp_congestion_control->owner))
3524                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3525         else
3526                 net->ipv4.tcp_congestion_control = &tcp_reno;
3527
3528         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3529         net->ipv4.sysctl_tcp_shrink_window = 0;
3530
3531         net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3532         net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3533
3534         return 0;
3535 }
3536
3537 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3538 {
3539         struct net *net;
3540
3541         /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3542          * and failed setup_net error unwinding path are serialized.
3543          *
3544          * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3545          * net_exit_list, the thread that dismantles a particular twsk must
3546          * do so without other thread progressing to refcount_dec_and_test() of
3547          * tcp_death_row.tw_refcount.
3548          */
3549         mutex_lock(&tcp_exit_batch_mutex);
3550
3551         tcp_twsk_purge(net_exit_list);
3552
3553         list_for_each_entry(net, net_exit_list, exit_list) {
3554                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3555                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3556                 tcp_fastopen_ctx_destroy(net);
3557         }
3558
3559         mutex_unlock(&tcp_exit_batch_mutex);
3560 }
3561
3562 static struct pernet_operations __net_initdata tcp_sk_ops = {
3563        .init       = tcp_sk_init,
3564        .exit       = tcp_sk_exit,
3565        .exit_batch = tcp_sk_exit_batch,
3566 };
3567
3568 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" bpf iterator function with (meta, sk_common, uid)
 * arguments.  NOTE(review): presumably these mirror the fields of
 * struct bpf_iter__tcp, whose sk_common member is referenced by
 * tcp_reg_info in this file -- confirm against the macro definition.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
3571
/* Size requested for the socket batch array when an iterator is set up. */
#define INIT_BATCH_SZ 16

/* init_seq_private hook of the bpf tcp iterator.
 *
 * Initializes the netns part of @priv_data, then sizes the batch array
 * for INIT_BATCH_SZ entries.  If the latter fails, the netns
 * initialization is undone.  Returns 0 or the error of the failing step.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err = bpf_iter_init_seq_net(priv_data, aux);

	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err)
		bpf_iter_fini_seq_net(priv_data);

	return err;
}
3591
3592 static void bpf_iter_fini_tcp(void *priv_data)
3593 {
3594         struct bpf_tcp_iter_state *iter = priv_data;
3595
3596         bpf_iter_fini_seq_net(priv_data);
3597         kvfree(iter->batch);
3598 }
3599
3600 static const struct bpf_iter_seq_info tcp_seq_info = {
3601         .seq_ops                = &bpf_iter_tcp_seq_ops,
3602         .init_seq_private       = bpf_iter_init_tcp,
3603         .fini_seq_private       = bpf_iter_fini_tcp,
3604         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3605 };
3606
3607 static const struct bpf_func_proto *
3608 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3609                             const struct bpf_prog *prog)
3610 {
3611         switch (func_id) {
3612         case BPF_FUNC_setsockopt:
3613                 return &bpf_sk_setsockopt_proto;
3614         case BPF_FUNC_getsockopt:
3615                 return &bpf_sk_getsockopt_proto;
3616         default:
3617                 return NULL;
3618         }
3619 }
3620
3621 static struct bpf_iter_reg tcp_reg_info = {
3622         .target                 = "tcp",
3623         .ctx_arg_info_size      = 1,
3624         .ctx_arg_info           = {
3625                 { offsetof(struct bpf_iter__tcp, sk_common),
3626                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3627         },
3628         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3629         .seq_info               = &tcp_seq_info,
3630 };
3631
3632 static void __init bpf_iter_register(void)
3633 {
3634         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3635         if (bpf_iter_reg_target(&tcp_reg_info))
3636                 pr_warn("Warning: could not register bpf iterator tcp\n");
3637 }
3638
3639 #endif
3640
3641 void __init tcp_v4_init(void)
3642 {
3643         int cpu, res;
3644
3645         for_each_possible_cpu(cpu) {
3646                 struct sock *sk;
3647
3648                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3649                                            IPPROTO_TCP, &init_net);
3650                 if (res)
3651                         panic("Failed to create the TCP control socket.\n");
3652                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3653
3654                 /* Please enforce IP_DF and IPID==0 for RST and
3655                  * ACK sent in SYN-RECV and TIME-WAIT state.
3656                  */
3657                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3658
3659                 sk->sk_clockid = CLOCK_MONOTONIC;
3660
3661                 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3662         }
3663         if (register_pernet_subsys(&tcp_sk_ops))
3664                 panic("Failed to create the TCP control socket.\n");
3665
3666 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3667         bpf_iter_register();
3668 #endif
3669 }