net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83 #include <linux/inetdevice.h>
  84
  85 #include <crypto/hash.h>
  86 #include <linux/scatterlist.h>
  87
  88 #include <trace/events/tcp.h>
  89
  90 #ifdef CONFIG_TCP_MD5SIG
     /* Forward declaration: tcp_v4_send_reset() below calls this helper to
      * fill in the MD5 signature option of the reset segment it builds.
      */
  91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  92                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  93 #endif
  94
     /* Hash tables handed to the __inet_lookup_established() and
      * __inet_lookup_listener() calls made in this file.
      */
  95 struct inet_hashinfo tcp_hashinfo;
  96 EXPORT_SYMBOL(tcp_hashinfo);
  97
  98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99 {
             /* Initial sequence number derived from @skb: secure_tcp_seq() is
              * given the packet's addresses (destination, then source)
              * followed by its ports (destination, then source).
              */
 100         return secure_tcp_seq(ip_hdr(skb)->daddr,
 101                               ip_hdr(skb)->saddr,
 102                               tcp_hdr(skb)->dest,
 103                               tcp_hdr(skb)->source);
 104 }
 105
 106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107 {
             /* Timestamp offset for @net as returned by secure_tcp_ts_off(),
              * keyed on the destination and source IPv4 addresses of @skb,
              * in that order.
              */
 108         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109 }
 110
 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112 {
             /* Decide whether the TIME-WAIT socket @sktw may be taken over by
              * the connecting socket @sk.
              *
              * Returns 1 when reuse is allowed; in that case a reference on
              * @sktw is taken with sock_hold() and, unless @sk is in repair
              * mode, @sk inherits its starting sequence number and timestamp
              * state from the TIME-WAIT bucket.  Returns 0 otherwise.
              *
              * A NULL @twp bypasses the tcp_tw_reuse sysctl and the
              * timestamp-age test; only a non-zero tw_ts_recent_stamp is
              * required then.
              */
 113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115         struct tcp_sock *tp = tcp_sk(sk);
 116         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 117
             /* Sysctl value 2 permits reuse for loopback traffic only: it is
              * downgraded to 0 below unless the TIME-WAIT socket is bound to
              * the loopback device or one of its addresses is a loopback one.
              */
 118         if (reuse == 2) {
 119                 /* Still does not detect *everything* that goes through
 120                  * lo, since we require a loopback src or dst address
 121                  * or direct binding to 'lo' interface.
 122                  */
 123                 bool loopback = false;
 124                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 125                         loopback = true;
 126 #if IS_ENABLED(CONFIG_IPV6)
 127                 if (tw->tw_family == AF_INET6) {
                             /* For v4-mapped addresses byte 12 is the first
                              * octet of the embedded IPv4 address, so 127
                              * means 127.x.x.x.
                              */
 128                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 129                             (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 130                              (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 131                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 132                             (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 133                              (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 134                                 loopback = true;
 135                 } else
 136 #endif
 137                 {
 138                         if (ipv4_is_loopback(tw->tw_daddr) ||
 139                             ipv4_is_loopback(tw->tw_rcv_saddr))
 140                                 loopback = true;
 141                 }
 142                 if (!loopback)
 143                         reuse = 0;
 144         }
 145
 146         /* With PAWS, it is safe from the viewpoint
 147            of data integrity. Even without PAWS it is safe provided sequence
 148            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 149
 150            Actually, the idea is close to VJ's one, only timestamp cache is
 151            held not per host, but per port pair and TW bucket is used as state
 152            holder.
 153
 154            If TW bucket has been already destroyed we fall back to VJ's scheme
 155            and use initial timestamp retrieved from peer table.
 156          */
             /* Reuse needs a recorded timestamp on the TIME-WAIT socket and
              * either no @twp, or reuse enabled with the current time (in
              * seconds) already past that recorded stamp.
              */
 157         if (tcptw->tw_ts_recent_stamp &&
 158             (!twp || (reuse && time_after32(ktime_get_seconds(),
 159                                             tcptw->tw_ts_recent_stamp)))) {
 160                 /* In case of repair and re-using TIME-WAIT sockets we still
 161                  * want to be sure that it is safe as above but honor the
 162                  * sequence numbers and time stamps set as part of the repair
 163                  * process.
 164                  *
 165                  * Without this check re-using a TIME-WAIT socket with TCP
 166                  * repair would accumulate a -1 on the repair assigned
 167                  * sequence number. The first time it is reused the sequence
 168                  * is -1, the second time -2, etc. This fixes that issue
 169                  * without appearing to create any others.
 170                  */
 171                 if (likely(!tp->repair)) {
                             /* Start past the old connection's snd_nxt.
                              * NOTE(review): 65535 + 2 presumably stands for
                              * a maximal unscaled window plus a margin -
                              * confirm.  Zero is avoided because
                              * tcp_v4_connect() treats a zero write_seq as
                              * "not chosen yet" and picks a fresh one.
                              */
 172                         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 173                         if (tp->write_seq == 0)
 174                                 tp->write_seq = 1;
 175                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 176                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 177                 }
 178                 sock_hold(sktw);
 179                 return 1;
 180         }
 181
 182         return 0;
 183 }
 184 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 185
 186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 187                               int addr_len)
 188 {
             /* Runs the cgroup BPF INET4 connect hook on @uaddr for @sk.
              *
              * Returns -EINVAL when @addr_len is smaller than a struct
              * sockaddr_in, otherwise whatever
              * BPF_CGROUP_RUN_PROG_INET4_CONNECT() yields.
              */
 189         /* This check is replicated from tcp_v4_connect() and intended to
 190          * prevent BPF program called below from accessing bytes that are out
 191          * of the bound specified by user in addr_len.
 192          */
 193         if (addr_len < sizeof(struct sockaddr_in))
 194                 return -EINVAL;
 195
             /* NOTE(review): presumably asserts that the caller holds the
              * socket lock - confirm against sock_owned_by_me().
              */
 196         sock_owned_by_me(sk);
 197
 198         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 199 }
 200
 201 /* This will initiate an outgoing connection.
      *
      * @sk:       TCP socket to connect; its inet options are read under
      *            lockdep_sock_is_held(), so the socket lock is expected to be
      *            held by the caller
      * @uaddr:    destination, interpreted as a struct sockaddr_in
      * @addr_len: size of @uaddr in bytes
      *
      * Returns 0 on success.  Early validation fails with -EINVAL (address too
      * short, or a source-route option is set while the destination is zero),
      * -EAFNOSUPPORT (family is not AF_INET) or -ENETUNREACH (the route is
      * multicast or broadcast).  Errors from ip_route_connect(),
      * inet_hash_connect(), ip_route_newports(), tcp_fastopen_defer_connect()
      * and tcp_connect() are passed through.  Once the socket has entered
      * SYN-SENT, the failure path puts it back into TCP_CLOSE and clears
      * sk_route_caps and inet_dport.
      */
 202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 203 {
 204         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 205         struct inet_sock *inet = inet_sk(sk);
 206         struct tcp_sock *tp = tcp_sk(sk);
 207         __be16 orig_sport, orig_dport;
 208         __be32 daddr, nexthop;
 209         struct flowi4 *fl4;
 210         struct rtable *rt;
 211         int err;
 212         struct ip_options_rcu *inet_opt;
 213         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 214
 215         if (addr_len < sizeof(struct sockaddr_in))
 216                 return -EINVAL;
 217
 218         if (usin->sin_family != AF_INET)
 219                 return -EAFNOSUPPORT;
 220
 221         nexthop = daddr = usin->sin_addr.s_addr;
 222         inet_opt = rcu_dereference_protected(inet->inet_opt,
 223                                              lockdep_sock_is_held(sk));
             /* With a source-route option the route lookup targets the
              * option's faddr instead of the final destination.
              */
 224         if (inet_opt && inet_opt->opt.srr) {
 225                 if (!daddr)
 226                         return -EINVAL;
 227                 nexthop = inet_opt->opt.faddr;
 228         }
 229
 230         orig_sport = inet->inet_sport;
 231         orig_dport = usin->sin_port;
 232         fl4 = &inet->cork.fl.u.ip4;
 233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 235                               IPPROTO_TCP,
 236                               orig_sport, orig_dport, sk);
 237         if (IS_ERR(rt)) {
 238                 err = PTR_ERR(rt);
 239                 if (err == -ENETUNREACH)
 240                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 241                 return err;
 242         }
 243
 244         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 245                 ip_rt_put(rt);
 246                 return -ENETUNREACH;
 247         }
 248
             /* Without source routing, adopt the destination address that
              * the route lookup left in the flow.
              */
 249         if (!inet_opt || !inet_opt->opt.srr)
 250                 daddr = fl4->daddr;
 251
             /* An unset source address is filled in from the flow. */
 252         if (!inet->inet_saddr)
 253                 inet->inet_saddr = fl4->saddr;
 254         sk_rcv_saddr_set(sk, inet->inet_saddr);
 255
             /* Timestamp state left over for a different destination must not
              * be carried over; a zero write_seq makes the code below choose
              * a fresh initial sequence number.
              */
 256         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 257                 /* Reset inherited state */
 258                 tp->rx_opt.ts_recent       = 0;
 259                 tp->rx_opt.ts_recent_stamp = 0;
 260                 if (likely(!tp->repair))
 261                         tp->write_seq      = 0;
 262         }
 263
 264         inet->inet_dport = usin->sin_port;
 265         sk_daddr_set(sk, daddr);
 266
 267         inet_csk(sk)->icsk_ext_hdr_len = 0;
 268         if (inet_opt)
 269                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 270
 271         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 272
 273         /* Socket identity is still unknown (sport may be zero).
 274          * However we set state to SYN-SENT and not releasing socket
 275          * lock select source port, enter ourselves into the hash tables and
 276          * complete initialization after this.
 277          */
 278         tcp_set_state(sk, TCP_SYN_SENT);
 279         err = inet_hash_connect(tcp_death_row, sk);
 280         if (err)
 281                 goto failure;
 282
 283         sk_set_txhash(sk);
 284
             /* Refresh the route with the ports now stored in the socket,
              * which may differ from the ones used for the first lookup.
              */
 285         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 286                                inet->inet_sport, inet->inet_dport, sk);
 287         if (IS_ERR(rt)) {
 288                 err = PTR_ERR(rt);
                     /* rt holds an error pointer here; clear it so that the
                      * failure path does not hand it to ip_rt_put().
                      */
 289                 rt = NULL;
 290                 goto failure;
 291         }
 292         /* OK, now commit destination to socket.  */
 293         sk->sk_gso_type = SKB_GSO_TCPV4;
 294         sk_setup_caps(sk, &rt->dst);
             /* NOTE(review): presumably the route is owned by the socket from
              * here on, hence no ip_rt_put() on later failures - confirm.
              */
 295         rt = NULL;
 296
             /* In repair mode the sequence number and timestamp offset are
              * left as they are.
              */
 297         if (likely(!tp->repair)) {
 298                 if (!tp->write_seq)
 299                         tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 300                                                        inet->inet_daddr,
 301                                                        inet->inet_sport,
 302                                                        usin->sin_port);
 303                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 304                                                  inet->inet_saddr,
 305                                                  inet->inet_daddr);
 306         }
 307
 308         inet->inet_id = tp->write_seq ^ jiffies;
 309
             /* When the fast-open helper returns true, its error code is
              * returned directly: tcp_connect() is not called and the failure
              * unwinding below is skipped.
              */
 310         if (tcp_fastopen_defer_connect(sk, &err))
 311                 return err;
 312         if (err)
 313                 goto failure;
 314
 315         err = tcp_connect(sk);
 316
 317         if (err)
 318                 goto failure;
 319
 320         return 0;
 321
 322 failure:
 323         /*
 324          * This unhashes the socket and releases the local port,
 325          * if necessary.
 326          */
 327         tcp_set_state(sk, TCP_CLOSE);
 328         ip_rt_put(rt);
 329         sk->sk_route_caps = 0;
 330         inet->inet_dport = 0;
 331         return err;
 332 }
 333 EXPORT_SYMBOL(tcp_v4_connect);
 334
 335 /*
 336  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 337  * It can be called through tcp_release_cb() if socket was owned by user
 338  * at the time tcp_v4_err() was called to handle ICMP message.
      *
      * The new MTU is read from tcp_sk(sk)->mtu_info, which tcp_v4_err() stores
      * before it calls or defers this routine.
 339  */
 340 void tcp_v4_mtu_reduced(struct sock *sk)
 341 {
 342         struct inet_sock *inet = inet_sk(sk);
 343         struct dst_entry *dst;
 344         u32 mtu;
 345
             /* Listening and closed sockets have nothing to adjust. */
 346         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 347                 return;
 348         mtu = tcp_sk(sk)->mtu_info;
 349         dst = inet_csk_update_pmtu(sk, mtu);
 350         if (!dst)
 351                 return;
 352
 353         /* Something is about to be wrong... Remember soft error
 354          * for the case, if this connection will not able to recover.
 355          */
 356         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 357                 sk->sk_err_soft = EMSGSIZE;
 358
             /* From here on use the MTU that the route itself reports. */
 359         mtu = dst_mtu(dst);
 360
             /* Shrink the MSS only when PMTU discovery is not disabled on the
              * socket, the socket accepts PMTU updates and the value kept in
              * icsk_pmtu_cookie is larger than the new MTU.
              */
 361         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 362             ip_sk_accept_pmtu(sk) &&
 363             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 364                 tcp_sync_mss(sk, mtu);
 365
 366                 /* Resend the TCP packet because it's
 367                  * clear that the old packet has been
 368                  * dropped. This is the new "fast" path mtu
 369                  * discovery.
 370                  */
 371                 tcp_simple_retransmit(sk);
 372         } /* else let the usual retransmit timer handle it */
 373 }
 374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 375
 376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 377 {
             /* Hand the ICMP redirect in @skb to the redirect handler of the
              * route returned by __sk_dst_check() for @sk; nothing happens
              * when that check returns no route.
              */
 378         struct dst_entry *dst = __sk_dst_check(sk, 0);
 379
 380         if (dst)
 381                 dst->ops->redirect(dst, sk, skb);
 382 }
 383
 384
 385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
      *
      * @sk:    request socket (accessed through inet_reqsk())
      * @seq:   sequence number supplied by the caller (tcp_v4_err() passes the
      *         one quoted in the ICMP payload); it must equal the request's
      *         snt_isn, otherwise the ICMP is counted as out of window
      * @abort: when true and @seq matches, the request is dropped from its
      *         listener's queue and a listen drop is accounted
      *
      * One reference on the request is always released via reqsk_put().
      */
 386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 387 {
 388         struct request_sock *req = inet_reqsk(sk);
 389         struct net *net = sock_net(sk);
 390
 391         /* ICMPs are not backlogged, hence we cannot get
 392          * an established socket here.
 393          */
 394         if (seq != tcp_rsk(req)->snt_isn) {
 395                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 396         } else if (abort) {
 397                 /*
 398                  * Still in SYN_RECV, just remove it silently.
 399                  * There is no good way to pass the error to the newly
 400                  * created socket, and POSIX does not want network
 401                  * errors returned from accept().
 402                  */
 403                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 404                 tcp_listendrop(req->rsk_listener);
 405         }
 406         reqsk_put(req);
 407 }
 408 EXPORT_SYMBOL(tcp_req_err);
 409
 410 /*
 411  * This routine is called by the ICMP module when it gets some
 412  * sort of error condition.  If err < 0 then the socket should
 413  * be closed and the error returned to the user.  If err > 0
 414  * it's just the icmp type << 8 | icmp code.  After adjustment
 415  * header points to the first 8 bytes of the tcp header.  We need
 416  * to find the appropriate port.
 417  *
 418  * The locking strategy used here is very "optimistic". When
 419  * someone else accesses the socket the ICMP is just dropped
 420  * and for some paths there is no check at all.
 421  * A more general error queue to queue errors for later handling
 422  * is probably better.
 423  *
      * @icmp_skb: the ICMP error; its data pointer is read as the IP header of
      *            the quoted packet, followed by that packet's TCP header
      * @info:     stored in tp->mtu_info when the error is ICMP_FRAG_NEEDED
      *
 424  */
 425
 426 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 427 {
 428         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 429         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 430         struct inet_connection_sock *icsk;
 431         struct tcp_sock *tp;
 432         struct inet_sock *inet;
 433         const int type = icmp_hdr(icmp_skb)->type;
 434         const int code = icmp_hdr(icmp_skb)->code;
 435         struct sock *sk;
 436         struct sk_buff *skb;
 437         struct request_sock *fastopen;
 438         u32 seq, snd_una;
 439         s32 remaining;
 440         u32 delta_us;
 441         int err;
 442         struct net *net = dev_net(icmp_skb->dev);
 443
             /* Find the socket that the quoted packet belongs to, using the
              * addresses and ports found in the quoted headers.
              */
 444         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 445                                        th->dest, iph->saddr, ntohs(th->source),
 446                                        inet_iif(icmp_skb), 0);
 447         if (!sk) {
 448                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 449                 return;
 450         }
             /* TIME-WAIT sockets ignore the error; only the reference is
              * dropped.
              */
 451         if (sk->sk_state == TCP_TIME_WAIT) {
 452                 inet_twsk_put(inet_twsk(sk));
 453                 return;
 454         }
 455         seq = ntohl(th->seq);
             /* Request sockets are handled (and released) by tcp_req_err();
              * the request is aborted for parameter problems, time exceeded
              * and net/host unreachable.
              */
 456         if (sk->sk_state == TCP_NEW_SYN_RECV)
 457                 return tcp_req_err(sk, seq,
 458                                   type == ICMP_PARAMETERPROB ||
 459                                   type == ICMP_TIME_EXCEEDED ||
 460                                   (type == ICMP_DEST_UNREACH &&
 461                                    (code == ICMP_NET_UNREACH ||
 462                                     code == ICMP_HOST_UNREACH)));
 463
 464         bh_lock_sock(sk);
 465         /* If too many ICMPs get dropped on busy
 466          * servers this needs to be solved differently.
 467          * We do take care of PMTU discovery (RFC1191) special case :
 468          * we can receive locally generated ICMP messages while socket is held.
 469          */
             /* Only the statistic is updated here; processing continues and
              * each branch below tests sock_owned_by_user() on its own.
              */
 470         if (sock_owned_by_user(sk)) {
 471                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 472                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 473         }
 474         if (sk->sk_state == TCP_CLOSE)
 475                 goto out;
 476
 477         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 478                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 479                 goto out;
 480         }
 481
 482         icsk = inet_csk(sk);
 483         tp = tcp_sk(sk);
 484         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 485         fastopen = tp->fastopen_rsk;
 486         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
             /* Except for listeners, the quoted sequence number must lie
              * between snd_una and snd_nxt, otherwise the ICMP is ignored.
              */
 487         if (sk->sk_state != TCP_LISTEN &&
 488             !between(seq, snd_una, tp->snd_nxt)) {
 489                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 490                 goto out;
 491         }
 492
             /* First switch: translate the ICMP type into an errno in err, or
              * finish handling right here via "goto out".
              */
 493         switch (type) {
 494         case ICMP_REDIRECT:
 495                 if (!sock_owned_by_user(sk))
 496                         do_redirect(icmp_skb, sk);
 497                 goto out;
 498         case ICMP_SOURCE_QUENCH:
 499                 /* Just silently ignore these. */
 500                 goto out;
 501         case ICMP_PARAMETERPROB:
 502                 err = EPROTO;
 503                 break;
 504         case ICMP_DEST_UNREACH:
                     /* Codes above NR_ICMP_UNREACH are ignored; the code is
                      * used further down as an index into icmp_err_convert[].
                      */
 505                 if (code > NR_ICMP_UNREACH)
 506                         goto out;
 507
 508                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 509                         /* We are not interested in TCP_LISTEN and open_requests
 510                          * (SYN-ACKs send out by Linux are always <576bytes so
 511                          * they should go through unfragmented).
 512                          */
 513                         if (sk->sk_state == TCP_LISTEN)
 514                                 goto out;
 515
                             /* Record the MTU, then either act on it now or,
                              * if the user owns the socket, set the deferred
                              * flag.  The extra reference is taken only when
                              * the flag was not already set.
                              */
 516                         tp->mtu_info = info;
 517                         if (!sock_owned_by_user(sk)) {
 518                                 tcp_v4_mtu_reduced(sk);
 519                         } else {
 520                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 521                                         sock_hold(sk);
 522                         }
 523                         goto out;
 524                 }
 525
 526                 err = icmp_err_convert[code].errno;
 527                 /* check if icmp_skb allows revert of backoff
 528                  * (see draft-zimmermann-tcp-lcd) */
 529                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 530                         break;
                     /* The revert applies only when the ICMP quotes the oldest
                      * unacknowledged sequence number, retransmissions with
                      * backoff are in progress and this is not a fast open
                      * socket.
                      */
 531                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 532                     !icsk->icsk_backoff || fastopen)
 533                         break;
 534
 535                 if (sock_owned_by_user(sk))
 536                         break;
 537
                     /* Undo one backoff step and recompute the RTO from the
                      * smoothed RTT, or from TCP_TIMEOUT_INIT when there is no
                      * RTT sample, before the remaining backoff is reapplied.
                      */
 538                 icsk->icsk_backoff--;
 539                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 540                                                TCP_TIMEOUT_INIT;
 541                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 542
 543                 skb = tcp_rtx_queue_head(sk);
                     /* NOTE(review): an empty retransmit queue here crashes the
                      * kernel through BUG_ON(); consider whether warning and
                      * leaving the switch would be preferable.
                      */
 544                 BUG_ON(!skb);
 545
                     /* Time left on the new RTO: icsk_rto minus the time that
                      * has passed since the head skb's timestamp, converted
                      * from microseconds to jiffies.
                      */
 546                 tcp_mstamp_refresh(tp);
 547                 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 548                 remaining = icsk->icsk_rto -
 549                             usecs_to_jiffies(delta_us);
 550
 551                 if (remaining > 0) {
 552                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 553                                                   remaining, TCP_RTO_MAX);
 554                 } else {
 555                         /* RTO revert clocked out retransmission.
 556                          * Will retransmit now */
 557                         tcp_retransmit_timer(sk);
 558                 }
 559
 560                 break;
 561         case ICMP_TIME_EXCEEDED:
 562                 err = EHOSTUNREACH;
 563                 break;
 564         default:
 565                 goto out;
 566         }
 567
             /* Second switch: a connection still being opened is torn down by
              * the error, unless the user currently owns the socket.
              */
 568         switch (sk->sk_state) {
 569         case TCP_SYN_SENT:
 570         case TCP_SYN_RECV:
 571                 /* Only in fast or simultaneous open. If a fast open socket is
 572                  * already accepted it is treated as a connected one below.
 573                  */
 574                 if (fastopen && !fastopen->sk)
 575                         break;
 576
 577                 if (!sock_owned_by_user(sk)) {
 578                         sk->sk_err = err;
 579
 580                         sk->sk_error_report(sk);
 581
 582                         tcp_done(sk);
 583                 } else {
 584                         sk->sk_err_soft = err;
 585                 }
 586                 goto out;
 587         }
 588
 589         /* If we've already connected we will keep trying
 590          * until we time out, or the user gives up.
 591          *
 592          * rfc1122 4.2.3.9 allows to consider as hard errors
 593          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 594          * but it is obsoleted by pmtu discovery).
 595          *
 596          * Note, that in modern internet, where routing is unreliable
 597          * and in each dark corner broken firewalls sit, sending random
 598          * errors ordered by their masters even these two messages finally lose
 599          * their original sense (even Linux sends invalid PORT_UNREACHs)
 600          *
 601          * Now we are in compliance with RFCs.
 602          *                                                      --ANK (980905)
 603          */
 604
             /* The error is reported immediately only when the socket has
              * recverr set and is not owned by the user; otherwise it is kept
              * as a soft error.
              */
 605         inet = inet_sk(sk);
 606         if (!sock_owned_by_user(sk) && inet->recverr) {
 607                 sk->sk_err = err;
 608                 sk->sk_error_report(sk);
 609         } else  { /* Only an error on timeout */
 610                 sk->sk_err_soft = err;
 611         }
 612
 613 out:
 614         bh_unlock_sock(sk);
 615         sock_put(sk);
 616 }
 617
 618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 619 {
             /* Prepare the TCP checksum field of @skb for the address pair
              * @saddr/@daddr: th->check receives the complement of
              * tcp_v4_check() taken over skb->len with a zero base sum, and
              * csum_start/csum_offset record where the check field lives
              * (transport header offset from skb->head, and the field's offset
              * inside struct tcphdr).
              * NOTE(review): looks like setup for the checksum to be completed
              * later over the segment data - confirm against the callers.
              */
 620         struct tcphdr *th = tcp_hdr(skb);
 621
 622         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 623         skb->csum_start = skb_transport_header(skb) - skb->head;
 624         skb->csum_offset = offsetof(struct tcphdr, check);
 625 }
 626
 627 /* This routine computes an IPv4 TCP checksum.
      *
      * It is a thin wrapper: the source and destination addresses are taken
      * from the inet socket @sk and the work is done by __tcp_v4_send_check()
      * on @skb.
      */
 628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 629 {
 630         const struct inet_sock *inet = inet_sk(sk);
 631
 632         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 633 }
 634 EXPORT_SYMBOL(tcp_v4_send_check);
 635
 636 /*
 637  *      This routine will send an RST to the other tcp.
 638  *
 639  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 640  *                    for reset.
 641  *      Answer: if a packet caused RST, it is not for a socket
 642  *              existing in our system, if it is matched to a socket,
 643  *              it is just duplicate segment or bug in other side's TCP.
 644  *              So that we build reply only basing on parameters
 645  *              arrived with segment.
 646  *      Exception: precedence violation. We do not implement it in any case.
      *
      *      @sk:  socket the segment was matched to, or NULL; it may be a
      *            non-full socket, hence the sk_fullsock() tests below
      *      @skb: the received segment that triggers the reset
 647  */
 648
 649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 650 {
 651         const struct tcphdr *th = tcp_hdr(skb);
             /* Reply segment: a bare TCP header plus, with MD5 support, room
              * for an aligned MD5 signature option.
              */
 652         struct {
 653                 struct tcphdr th;
 654 #ifdef CONFIG_TCP_MD5SIG
 655                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 656 #endif
 657         } rep;
 658         struct ip_reply_arg arg;
 659 #ifdef CONFIG_TCP_MD5SIG
 660         struct tcp_md5sig_key *key = NULL;
 661         const __u8 *hash_location = NULL;
 662         unsigned char newhash[16];
 663         int genhash;
 664         struct sock *sk1 = NULL;
 665 #endif
 666         struct net *net;
 667         struct sock *ctl_sk;
 668
 669         /* Never send a reset in response to a reset. */
 670         if (th->rst)
 671                 return;
 672
 673         /* If sk not NULL, it means we did a successful lookup and incoming
 674          * route had to be correct. prequeue might have dropped our dst.
 675          */
 676         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 677                 return;
 678
 679         /* Swap the send and the receive. */
 680         memset(&rep, 0, sizeof(rep));
 681         rep.th.dest   = th->source;
 682         rep.th.source = th->dest;
 683         rep.th.doff   = sizeof(struct tcphdr) / 4;
 684         rep.th.rst    = 1;
 685
             /* If the offending segment carried an ACK, the reset uses that
              * acknowledgment number as its sequence number.  Otherwise the
              * reset itself carries an ACK covering the segment: its sequence
              * number plus SYN, FIN and the payload length.
              */
 686         if (th->ack) {
 687                 rep.th.seq = th->ack_seq;
 688         } else {
 689                 rep.th.ack = 1;
 690                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 691                                        skb->len - (th->doff << 2));
 692         }
 693
 694         memset(&arg, 0, sizeof(arg));
 695         arg.iov[0].iov_base = (unsigned char *)&rep;
 696         arg.iov[0].iov_len  = sizeof(rep.th);
 697
 698         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 699 #ifdef CONFIG_TCP_MD5SIG
             /* The read-side critical section opened here is closed at the
              * "out" label, which is also the exit taken when no reset may be
              * sent.
              */
 700         rcu_read_lock();
 701         hash_location = tcp_parse_md5sig_option(th);
 702         if (sk && sk_fullsock(sk)) {
 703                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 704                                         &ip_hdr(skb)->saddr, AF_INET);
 705         } else if (hash_location) {
 706                 /*
 707                  * active side is lost. Try to find listening socket through
 708                  * source port, and then find md5 key through listening socket.
 709                  * We do not lose security here:
 710                  * the incoming packet is checked against the md5 hash computed
 711                  * with the key found; no RST is generated if it doesn't match.
 712                  */
                     /* Both port arguments are taken from th->source, in line
                      * with the "through source port" remark above.
                      */
 713                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 714                                              ip_hdr(skb)->saddr,
 715                                              th->source, ip_hdr(skb)->daddr,
 716                                              ntohs(th->source), inet_iif(skb),
 717                                              tcp_v4_sdif(skb));
 718                 /* don't send rst if it can't find key */
 719                 if (!sk1)
 720                         goto out;
 721
 722                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 723                                         &ip_hdr(skb)->saddr, AF_INET);
 724                 if (!key)
 725                         goto out;
 726
 727
                     /* Recompute the signature of the received segment and
                      * stay silent unless it matches the one it carried.
                      */
 728                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 729                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 730                         goto out;
 731
 732         }
 733
             /* With a key, append the option (two NOPs, kind, length, then the
              * digest written at opt[1]) and grow the header length to match.
              */
 734         if (key) {
 735                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 736                                    (TCPOPT_NOP << 16) |
 737                                    (TCPOPT_MD5SIG << 8) |
 738                                    TCPOLEN_MD5SIG);
 739                 /* Update length and the length the header thinks exists */
 740                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 741                 rep.th.doff = arg.iov[0].iov_len / 4;
 742
 743                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 744                                      key, ip_hdr(skb)->saddr,
 745                                      ip_hdr(skb)->daddr, &rep.th);
 746         }
 747 #endif
             /* Partial checksum over the addresses of the received packet
              * (destination passed first), the reply length and the protocol;
              * csumoffset is the position of the check field in 16-bit units.
              */
 748         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 749                                       ip_hdr(skb)->saddr, /* XXX */
 750                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 751         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 752         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753
 754         /* When socket is gone, all binding information is lost.
 755          * routing might fail in this case. No choice here, if we choose to force
 756          * input interface, we will misroute in case of asymmetric route.
 757          */
 758         if (sk) {
 759                 arg.bound_dev_if = sk->sk_bound_dev_if;
 760                 if (sk_fullsock(sk))
 761                         trace_tcp_send_reset(sk, skb);
 762         }
 763
             /* sk_bound_dev_if is read above even when sk is a TIME-WAIT
              * socket, so both structures must keep the field at the same
              * offset.
              */
 764         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 765                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 766
 767         arg.tos = ip_hdr(skb)->tos;
 768         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
             /* The reply is sent through this CPU's control socket with
              * bottom halves disabled; its sk_mark is borrowed from @sk for
              * the send and reset to 0 afterwards.
              */
 769         local_bh_disable();
 770         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 771         if (sk)
 772                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 773                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 774         ip_send_unicast_reply(ctl_sk,
 775                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 776                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 777                               &arg, arg.iov[0].iov_len);
 778
 779         ctl_sk->sk_mark = 0;
 780         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 781         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 782         local_bh_enable();
 783
 784 #ifdef CONFIG_TCP_MD5SIG
 785 out:
 786         rcu_read_unlock();
 787 #endif
 788 }
 789
 790 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 791    outside socket context is ugly, certainly. What can I do?
 792  */
 793
 794 static void tcp_v4_send_ack(const struct sock *sk,
 795                             struct sk_buff *skb, u32 seq, u32 ack,
 796                             u32 win, u32 tsval, u32 tsecr, int oif,
 797                             struct tcp_md5sig_key *key,
 798                             int reply_flags, u8 tos)
 799 {
 800         const struct tcphdr *th = tcp_hdr(skb);
 801         struct {
 802                 struct tcphdr th;
 803                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 804 #ifdef CONFIG_TCP_MD5SIG
 805                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 806 #endif
 807                         ];
 808         } rep;
 809         struct net *net = sock_net(sk);
 810         struct ip_reply_arg arg;
 811         struct sock *ctl_sk;
 812
 813         memset(&rep.th, 0, sizeof(struct tcphdr));
 814         memset(&arg, 0, sizeof(arg));
 815
 816         arg.iov[0].iov_base = (unsigned char *)&rep;
 817         arg.iov[0].iov_len  = sizeof(rep.th);
 818         if (tsecr) {
 819                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 820                                    (TCPOPT_TIMESTAMP << 8) |
 821                                    TCPOLEN_TIMESTAMP);
 822                 rep.opt[1] = htonl(tsval);
 823                 rep.opt[2] = htonl(tsecr);
 824                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 825         }
 826
 827         /* Swap the send and the receive. */
 828         rep.th.dest    = th->source;
 829         rep.th.source  = th->dest;
 830         rep.th.doff    = arg.iov[0].iov_len / 4;
 831         rep.th.seq     = htonl(seq);
 832         rep.th.ack_seq = htonl(ack);
 833         rep.th.ack     = 1;
 834         rep.th.window  = htons(win);
 835
 836 #ifdef CONFIG_TCP_MD5SIG
 837         if (key) {
 838                 int offset = (tsecr) ? 3 : 0;
 839
 840                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 841                                           (TCPOPT_NOP << 16) |
 842                                           (TCPOPT_MD5SIG << 8) |
 843                                           TCPOLEN_MD5SIG);
 844                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 845                 rep.th.doff = arg.iov[0].iov_len/4;
 846
 847                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 848                                     key, ip_hdr(skb)->saddr,
 849                                     ip_hdr(skb)->daddr, &rep.th);
 850         }
 851 #endif
 852         arg.flags = reply_flags;
 853         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 854                                       ip_hdr(skb)->saddr, /* XXX */
 855                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 856         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 857         if (oif)
 858                 arg.bound_dev_if = oif;
 859         arg.tos = tos;
 860         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 861         local_bh_disable();
 862         ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 863         if (sk)
 864                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 865                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
 866         ip_send_unicast_reply(ctl_sk,
 867                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 868                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 869                               &arg, arg.iov[0].iov_len);
 870
 871         ctl_sk->sk_mark = 0;
 872         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 873         local_bh_enable();
 874 }
 875
 876 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 877 {
 878         struct inet_timewait_sock *tw = inet_twsk(sk);
 879         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 880
 881         tcp_v4_send_ack(sk, skb,
 882                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 883                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 884                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 885                         tcptw->tw_ts_recent,
 886                         tw->tw_bound_dev_if,
 887                         tcp_twsk_md5_key(tcptw),
 888                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 889                         tw->tw_tos
 890                         );
 891
 892         inet_twsk_put(tw);
 893 }
 894
 895 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 896                                   struct request_sock *req)
 897 {
 898         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 899          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 900          */
 901         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 902                                              tcp_sk(sk)->snd_nxt;
 903
 904         /* RFC 7323 2.3
 905          * The window field (SEG.WND) of every outgoing segment, with the
 906          * exception of <SYN> segments, MUST be right-shifted by
 907          * Rcv.Wind.Shift bits:
 908          */
 909         tcp_v4_send_ack(sk, skb, seq,
 910                         tcp_rsk(req)->rcv_nxt,
 911                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 912                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 913                         req->ts_recent,
 914                         0,
 915                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 916                                           AF_INET),
 917                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 918                         ip_hdr(skb)->tos);
 919 }
 920
 921 /*
 922  *      Send a SYN-ACK after having received a SYN.
 923  *      This still operates on a request_sock only, not on a big
 924  *      socket.
 925  */
 926 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 927                               struct flowi *fl,
 928                               struct request_sock *req,
 929                               struct tcp_fastopen_cookie *foc,
 930                               enum tcp_synack_type synack_type)
 931 {
 932         const struct inet_request_sock *ireq = inet_rsk(req);
 933         struct flowi4 fl4;
 934         int err = -1;
 935         struct sk_buff *skb;
 936
 937         /* First, grab a route. */
 938         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 939                 return -1;
 940
 941         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 942
 943         if (skb) {
 944                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 945
 946                 rcu_read_lock();
 947                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 948                                             ireq->ir_rmt_addr,
 949                                             rcu_dereference(ireq->ireq_opt));
 950                 rcu_read_unlock();
 951                 err = net_xmit_eval(err);
 952         }
 953
 954         return err;
 955 }
 956
 957 /*
 958  *      IPv4 request_sock destructor.
 959  */
 960 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 961 {
 962         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 963 }
 964
 965 #ifdef CONFIG_TCP_MD5SIG
 966 /*
 967  * RFC2385 MD5 checksumming requires a mapping of
 968  * IP address->MD5 Key.
 969  * We need to maintain these in the sk structure.
 970  */
 971
 972 /* Find the Key structure for an address.  */
 973 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 974                                          const union tcp_md5_addr *addr,
 975                                          int family)
 976 {
 977         const struct tcp_sock *tp = tcp_sk(sk);
 978         struct tcp_md5sig_key *key;
 979         const struct tcp_md5sig_info *md5sig;
 980         __be32 mask;
 981         struct tcp_md5sig_key *best_match = NULL;
 982         bool match;
 983
 984         /* caller either holds rcu_read_lock() or socket lock */
 985         md5sig = rcu_dereference_check(tp->md5sig_info,
 986                                        lockdep_sock_is_held(sk));
 987         if (!md5sig)
 988                 return NULL;
 989
 990         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 991                 if (key->family != family)
 992                         continue;
 993
 994                 if (family == AF_INET) {
 995                         mask = inet_make_mask(key->prefixlen);
 996                         match = (key->addr.a4.s_addr & mask) ==
 997                                 (addr->a4.s_addr & mask);
 998 #if IS_ENABLED(CONFIG_IPV6)
 999                 } else if (family == AF_INET6) {
1000                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1001                                                   key->prefixlen);
1002 #endif
1003                 } else {
1004                         match = false;
1005                 }
1006
1007                 if (match && (!best_match ||
1008                               key->prefixlen > best_match->prefixlen))
1009                         best_match = key;
1010         }
1011         return best_match;
1012 }
1013 EXPORT_SYMBOL(tcp_md5_do_lookup);
1014
1015 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1016                                                       const union tcp_md5_addr *addr,
1017                                                       int family, u8 prefixlen)
1018 {
1019         const struct tcp_sock *tp = tcp_sk(sk);
1020         struct tcp_md5sig_key *key;
1021         unsigned int size = sizeof(struct in_addr);
1022         const struct tcp_md5sig_info *md5sig;
1023
1024         /* caller either holds rcu_read_lock() or socket lock */
1025         md5sig = rcu_dereference_check(tp->md5sig_info,
1026                                        lockdep_sock_is_held(sk));
1027         if (!md5sig)
1028                 return NULL;
1029 #if IS_ENABLED(CONFIG_IPV6)
1030         if (family == AF_INET6)
1031                 size = sizeof(struct in6_addr);
1032 #endif
1033         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1034                 if (key->family != family)
1035                         continue;
1036                 if (!memcmp(&key->addr, addr, size) &&
1037                     key->prefixlen == prefixlen)
1038                         return key;
1039         }
1040         return NULL;
1041 }
1042
1043 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1044                                          const struct sock *addr_sk)
1045 {
1046         const union tcp_md5_addr *addr;
1047
1048         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1049         return tcp_md5_do_lookup(sk, addr, AF_INET);
1050 }
1051 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1052
1053 /* This can be called on a newly created socket, from other files */
1054 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1055                    int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1056                    gfp_t gfp)
1057 {
1058         /* Add Key to the list */
1059         struct tcp_md5sig_key *key;
1060         struct tcp_sock *tp = tcp_sk(sk);
1061         struct tcp_md5sig_info *md5sig;
1062
1063         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1064         if (key) {
1065                 /* Pre-existing entry - just update that one. */
1066                 memcpy(key->key, newkey, newkeylen);
1067                 key->keylen = newkeylen;
1068                 return 0;
1069         }
1070
1071         md5sig = rcu_dereference_protected(tp->md5sig_info,
1072                                            lockdep_sock_is_held(sk));
1073         if (!md5sig) {
1074                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1075                 if (!md5sig)
1076                         return -ENOMEM;
1077
1078                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1079                 INIT_HLIST_HEAD(&md5sig->head);
1080                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1081         }
1082
1083         key = sock_kmalloc(sk, sizeof(*key), gfp);
1084         if (!key)
1085                 return -ENOMEM;
1086         if (!tcp_alloc_md5sig_pool()) {
1087                 sock_kfree_s(sk, key, sizeof(*key));
1088                 return -ENOMEM;
1089         }
1090
1091         memcpy(key->key, newkey, newkeylen);
1092         key->keylen = newkeylen;
1093         key->family = family;
1094         key->prefixlen = prefixlen;
1095         memcpy(&key->addr, addr,
1096                (family == AF_INET6) ? sizeof(struct in6_addr) :
1097                                       sizeof(struct in_addr));
1098         hlist_add_head_rcu(&key->node, &md5sig->head);
1099         return 0;
1100 }
1101 EXPORT_SYMBOL(tcp_md5_do_add);
1102
1103 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1104                    u8 prefixlen)
1105 {
1106         struct tcp_md5sig_key *key;
1107
1108         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1109         if (!key)
1110                 return -ENOENT;
1111         hlist_del_rcu(&key->node);
1112         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1113         kfree_rcu(key, rcu);
1114         return 0;
1115 }
1116 EXPORT_SYMBOL(tcp_md5_do_del);
1117
1118 static void tcp_clear_md5_list(struct sock *sk)
1119 {
1120         struct tcp_sock *tp = tcp_sk(sk);
1121         struct tcp_md5sig_key *key;
1122         struct hlist_node *n;
1123         struct tcp_md5sig_info *md5sig;
1124
1125         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1126
1127         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1128                 hlist_del_rcu(&key->node);
1129                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1130                 kfree_rcu(key, rcu);
1131         }
1132 }
1133
1134 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1135                                  char __user *optval, int optlen)
1136 {
1137         struct tcp_md5sig cmd;
1138         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1139         u8 prefixlen = 32;
1140
1141         if (optlen < sizeof(cmd))
1142                 return -EINVAL;
1143
1144         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1145                 return -EFAULT;
1146
1147         if (sin->sin_family != AF_INET)
1148                 return -EINVAL;
1149
1150         if (optname == TCP_MD5SIG_EXT &&
1151             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1152                 prefixlen = cmd.tcpm_prefixlen;
1153                 if (prefixlen > 32)
1154                         return -EINVAL;
1155         }
1156
1157         if (!cmd.tcpm_keylen)
1158                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1159                                       AF_INET, prefixlen);
1160
1161         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1162                 return -EINVAL;
1163
1164         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1165                               AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1166                               GFP_KERNEL);
1167 }
1168
1169 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1170                                    __be32 daddr, __be32 saddr,
1171                                    const struct tcphdr *th, int nbytes)
1172 {
1173         struct tcp4_pseudohdr *bp;
1174         struct scatterlist sg;
1175         struct tcphdr *_th;
1176
1177         bp = hp->scratch;
1178         bp->saddr = saddr;
1179         bp->daddr = daddr;
1180         bp->pad = 0;
1181         bp->protocol = IPPROTO_TCP;
1182         bp->len = cpu_to_be16(nbytes);
1183
1184         _th = (struct tcphdr *)(bp + 1);
1185         memcpy(_th, th, sizeof(*th));
1186         _th->check = 0;
1187
1188         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1189         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1190                                 sizeof(*bp) + sizeof(*th));
1191         return crypto_ahash_update(hp->md5_req);
1192 }
1193
1194 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1195                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1196 {
1197         struct tcp_md5sig_pool *hp;
1198         struct ahash_request *req;
1199
1200         hp = tcp_get_md5sig_pool();
1201         if (!hp)
1202                 goto clear_hash_noput;
1203         req = hp->md5_req;
1204
1205         if (crypto_ahash_init(req))
1206                 goto clear_hash;
1207         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1208                 goto clear_hash;
1209         if (tcp_md5_hash_key(hp, key))
1210                 goto clear_hash;
1211         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1212         if (crypto_ahash_final(req))
1213                 goto clear_hash;
1214
1215         tcp_put_md5sig_pool();
1216         return 0;
1217
1218 clear_hash:
1219         tcp_put_md5sig_pool();
1220 clear_hash_noput:
1221         memset(md5_hash, 0, 16);
1222         return 1;
1223 }
1224
1225 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1226                         const struct sock *sk,
1227                         const struct sk_buff *skb)
1228 {
1229         struct tcp_md5sig_pool *hp;
1230         struct ahash_request *req;
1231         const struct tcphdr *th = tcp_hdr(skb);
1232         __be32 saddr, daddr;
1233
1234         if (sk) { /* valid for establish/request sockets */
1235                 saddr = sk->sk_rcv_saddr;
1236                 daddr = sk->sk_daddr;
1237         } else {
1238                 const struct iphdr *iph = ip_hdr(skb);
1239                 saddr = iph->saddr;
1240                 daddr = iph->daddr;
1241         }
1242
1243         hp = tcp_get_md5sig_pool();
1244         if (!hp)
1245                 goto clear_hash_noput;
1246         req = hp->md5_req;
1247
1248         if (crypto_ahash_init(req))
1249                 goto clear_hash;
1250
1251         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1252                 goto clear_hash;
1253         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1254                 goto clear_hash;
1255         if (tcp_md5_hash_key(hp, key))
1256                 goto clear_hash;
1257         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1258         if (crypto_ahash_final(req))
1259                 goto clear_hash;
1260
1261         tcp_put_md5sig_pool();
1262         return 0;
1263
1264 clear_hash:
1265         tcp_put_md5sig_pool();
1266 clear_hash_noput:
1267         memset(md5_hash, 0, 16);
1268         return 1;
1269 }
1270 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1271
1272 #endif
1273
1274 /* Called with rcu_read_lock() */
1275 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1276                                     const struct sk_buff *skb)
1277 {
1278 #ifdef CONFIG_TCP_MD5SIG
1279         /*
1280          * This gets called for each TCP segment that arrives
1281          * so we want to be efficient.
1282          * We have 3 drop cases:
1283          * o No MD5 hash and one expected.
1284          * o MD5 hash and we're not expecting one.
1285          * o MD5 hash and its wrong.
1286          */
1287         const __u8 *hash_location = NULL;
1288         struct tcp_md5sig_key *hash_expected;
1289         const struct iphdr *iph = ip_hdr(skb);
1290         const struct tcphdr *th = tcp_hdr(skb);
1291         int genhash;
1292         unsigned char newhash[16];
1293
1294         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1295                                           AF_INET);
1296         hash_location = tcp_parse_md5sig_option(th);
1297
1298         /* We've parsed the options - do we have a hash? */
1299         if (!hash_expected && !hash_location)
1300                 return false;
1301
1302         if (hash_expected && !hash_location) {
1303                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1304                 return true;
1305         }
1306
1307         if (!hash_expected && hash_location) {
1308                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1309                 return true;
1310         }
1311
1312         /* Okay, so this is hash_expected and hash_location -
1313          * so we need to calculate the checksum.
1314          */
1315         genhash = tcp_v4_md5_hash_skb(newhash,
1316                                       hash_expected,
1317                                       NULL, skb);
1318
1319         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1320                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1321                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1322                                      &iph->saddr, ntohs(th->source),
1323                                      &iph->daddr, ntohs(th->dest),
1324                                      genhash ? " tcp_v4_calc_md5_hash failed"
1325                                      : "");
1326                 return true;
1327         }
1328         return false;
1329 #endif
1330         return false;
1331 }
1332
1333 static void tcp_v4_init_req(struct request_sock *req,
1334                             const struct sock *sk_listener,
1335                             struct sk_buff *skb)
1336 {
1337         struct inet_request_sock *ireq = inet_rsk(req);
1338         struct net *net = sock_net(sk_listener);
1339
1340         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1341         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1342         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1343 }
1344
1345 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1346                                           struct flowi *fl,
1347                                           const struct request_sock *req)
1348 {
1349         return inet_csk_route_req(sk, &fl->u.ip4, req);
1350 }
1351
1352 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1353         .family         =       PF_INET,
1354         .obj_size       =       sizeof(struct tcp_request_sock),
1355         .rtx_syn_ack    =       tcp_rtx_synack,
1356         .send_ack       =       tcp_v4_reqsk_send_ack,
1357         .destructor     =       tcp_v4_reqsk_destructor,
1358         .send_reset     =       tcp_v4_send_reset,
1359         .syn_ack_timeout =      tcp_syn_ack_timeout,
1360 };
1361
1362 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1363         .mss_clamp      =       TCP_MSS_DEFAULT,
1364 #ifdef CONFIG_TCP_MD5SIG
1365         .req_md5_lookup =       tcp_v4_md5_lookup,
1366         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1367 #endif
1368         .init_req       =       tcp_v4_init_req,
1369 #ifdef CONFIG_SYN_COOKIES
1370         .cookie_init_seq =      cookie_v4_init_sequence,
1371 #endif
1372         .route_req      =       tcp_v4_route_req,
1373         .init_seq       =       tcp_v4_init_seq,
1374         .init_ts_off    =       tcp_v4_init_ts_off,
1375         .send_synack    =       tcp_v4_send_synack,
1376 };
1377
1378 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1379 {
1380         /* Never answer to SYNs send to broadcast or multicast */
1381         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1382                 goto drop;
1383
1384         return tcp_conn_request(&tcp_request_sock_ops,
1385                                 &tcp_request_sock_ipv4_ops, sk, skb);
1386
1387 drop:
1388         tcp_listendrop(sk);
1389         return 0;
1390 }
1391 EXPORT_SYMBOL(tcp_v4_conn_request);
1392
1393
1394 /*
1395  * The three way handshake has completed - we got a valid synack -
1396  * now create the new socket.
1397  */
1398 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1399                                   struct request_sock *req,
1400                                   struct dst_entry *dst,
1401                                   struct request_sock *req_unhash,
1402                                   bool *own_req)
1403 {
1404         struct inet_request_sock *ireq;
1405         struct inet_sock *newinet;
1406         struct tcp_sock *newtp;
1407         struct sock *newsk;
1408 #ifdef CONFIG_TCP_MD5SIG
1409         struct tcp_md5sig_key *key;
1410 #endif
1411         struct ip_options_rcu *inet_opt;
1412
1413         if (sk_acceptq_is_full(sk))
1414                 goto exit_overflow;
1415
1416         newsk = tcp_create_openreq_child(sk, req, skb);
1417         if (!newsk)
1418                 goto exit_nonewsk;
1419
1420         newsk->sk_gso_type = SKB_GSO_TCPV4;
1421         inet_sk_rx_dst_set(newsk, skb);
1422
1423         newtp                 = tcp_sk(newsk);
1424         newinet               = inet_sk(newsk);
1425         ireq                  = inet_rsk(req);
1426         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1427         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1428         newsk->sk_bound_dev_if = ireq->ir_iif;
1429         newinet->inet_saddr   = ireq->ir_loc_addr;
1430         inet_opt              = rcu_dereference(ireq->ireq_opt);
1431         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1432         newinet->mc_index     = inet_iif(skb);
1433         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1434         newinet->rcv_tos      = ip_hdr(skb)->tos;
1435         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1436         if (inet_opt)
1437                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1438         newinet->inet_id = newtp->write_seq ^ jiffies;
1439
1440         if (!dst) {
1441                 dst = inet_csk_route_child_sock(sk, newsk, req);
1442                 if (!dst)
1443                         goto put_and_exit;
1444         } else {
1445                 /* syncookie case : see end of cookie_v4_check() */
1446         }
1447         sk_setup_caps(newsk, dst);
1448
1449         tcp_ca_openreq_child(newsk, dst);
1450
1451         tcp_sync_mss(newsk, dst_mtu(dst));
1452         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1453
1454         tcp_initialize_rcv_mss(newsk);
1455
1456 #ifdef CONFIG_TCP_MD5SIG
1457         /* Copy over the MD5 key from the original socket */
1458         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1459                                 AF_INET);
1460         if (key) {
1461                 /*
1462                  * We're using one, so create a matching key
1463                  * on the newsk structure. If we fail to get
1464                  * memory, then we end up not copying the key
1465                  * across. Shucks.
1466                  */
1467                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1468                                AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1469                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1470         }
1471 #endif
1472
1473         if (__inet_inherit_port(sk, newsk) < 0)
1474                 goto put_and_exit;
1475         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1476         if (likely(*own_req)) {
1477                 tcp_move_syn(newtp, req);
1478                 ireq->ireq_opt = NULL;
1479         } else {
1480                 newinet->inet_opt = NULL;
1481         }
1482         return newsk;
1483
1484 exit_overflow:
1485         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1486 exit_nonewsk:
1487         dst_release(dst);
1488 exit:
1489         tcp_listendrop(sk);
1490         return NULL;
1491 put_and_exit:
1492         newinet->inet_opt = NULL;
1493         inet_csk_prepare_forced_close(newsk);
1494         tcp_done(newsk);
1495         goto exit;
1496 }
1497 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1498
/* With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned; in every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1509
1510 /* The socket must have it's spinlock held when we get
1511  * here, unless it is a TCP_LISTEN socket.
1512  *
1513  * We have a potential double-lock case here, so even when
1514  * doing backlog processing we use the BH locking scheme.
1515  * This is because we cannot sleep with the original spinlock
1516  * held.
1517  */
1518 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1519 {
1520         struct sock *rsk;
1521
1522         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1523                 struct dst_entry *dst = sk->sk_rx_dst;
1524
1525                 sock_rps_save_rxhash(sk, skb);
1526                 sk_mark_napi_id(sk, skb);
1527                 if (dst) {
1528                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1529                             !dst->ops->check(dst, 0)) {
1530                                 dst_release(dst);
1531                                 sk->sk_rx_dst = NULL;
1532                         }
1533                 }
1534                 tcp_rcv_established(sk, skb);
1535                 return 0;
1536         }
1537
1538         if (tcp_checksum_complete(skb))
1539                 goto csum_err;
1540
1541         if (sk->sk_state == TCP_LISTEN) {
1542                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1543
1544                 if (!nsk)
1545                         goto discard;
1546                 if (nsk != sk) {
1547                         if (tcp_child_process(sk, nsk, skb)) {
1548                                 rsk = nsk;
1549                                 goto reset;
1550                         }
1551                         return 0;
1552                 }
1553         } else
1554                 sock_rps_save_rxhash(sk, skb);
1555
1556         if (tcp_rcv_state_process(sk, skb)) {
1557                 rsk = sk;
1558                 goto reset;
1559         }
1560         return 0;
1561
1562 reset:
1563         tcp_v4_send_reset(rsk, skb);
1564 discard:
1565         kfree_skb(skb);
1566         /* Be careful here. If this function gets more complicated and
1567          * gcc suffers from register pressure on the x86, sk (in %ebx)
1568          * might be destroyed here. This current version compiles correctly,
1569          * but you have been warned.
1570          */
1571         return 0;
1572
1573 csum_err:
1574         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1575         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1576         goto discard;
1577 }
1578 EXPORT_SYMBOL(tcp_v4_do_rcv);
1579
1580 int tcp_v4_early_demux(struct sk_buff *skb)
1581 {
1582         const struct iphdr *iph;
1583         const struct tcphdr *th;
1584         struct sock *sk;
1585
1586         if (skb->pkt_type != PACKET_HOST)
1587                 return 0;
1588
1589         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1590                 return 0;
1591
1592         iph = ip_hdr(skb);
1593         th = tcp_hdr(skb);
1594
1595         if (th->doff < sizeof(struct tcphdr) / 4)
1596                 return 0;
1597
1598         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1599                                        iph->saddr, th->source,
1600                                        iph->daddr, ntohs(th->dest),
1601                                        skb->skb_iif, inet_sdif(skb));
1602         if (sk) {
1603                 skb->sk = sk;
1604                 skb->destructor = sock_edemux;
1605                 if (sk_fullsock(sk)) {
1606                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1607
1608                         if (dst)
1609                                 dst = dst_check(dst, 0);
1610                         if (dst &&
1611                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1612                                 skb_dst_set_noref(skb, dst);
1613                 }
1614         }
1615         return 0;
1616 }
1617
1618 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1619 {
1620         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1621
1622         /* Only socket owner can try to collapse/prune rx queues
1623          * to reduce memory overhead, so add a little headroom here.
1624          * Few sockets backlog are possibly concurrently non empty.
1625          */
1626         limit += 64*1024;
1627
1628         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1629          * we can fix skb->truesize to its real value to avoid future drops.
1630          * This is valid because skb is not yet charged to the socket.
1631          * It has been noticed pure SACK packets were sometimes dropped
1632          * (if cooked by drivers without copybreak feature).
1633          */
1634         skb_condense(skb);
1635
1636         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1637                 bh_unlock_sock(sk);
1638                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1639                 return true;
1640         }
1641         return false;
1642 }
1643 EXPORT_SYMBOL(tcp_add_backlog);
1644
1645 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1646 {
1647         struct tcphdr *th = (struct tcphdr *)skb->data;
1648         unsigned int eaten = skb->len;
1649         int err;
1650
1651         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1652         if (!err) {
1653                 eaten -= skb->len;
1654                 TCP_SKB_CB(skb)->end_seq -= eaten;
1655         }
1656         return err;
1657 }
1658 EXPORT_SYMBOL(tcp_filter);
1659
1660 static void tcp_v4_restore_cb(struct sk_buff *skb)
1661 {
1662         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1663                 sizeof(struct inet_skb_parm));
1664 }
1665
1666 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1667                            const struct tcphdr *th)
1668 {
1669         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1670          * barrier() makes sure compiler wont play fool^Waliasing games.
1671          */
1672         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1673                 sizeof(struct inet_skb_parm));
1674         barrier();
1675
1676         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1677         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1678                                     skb->len - th->doff * 4);
1679         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1680         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1681         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1682         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1683         TCP_SKB_CB(skb)->sacked  = 0;
1684         TCP_SKB_CB(skb)->has_rxtstamp =
1685                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1686 }
1687
1688 /*
1689  *      From tcp_input.c
1690  */
1691
/*
 * tcp_v4_rcv - entry point for an incoming IPv4 TCP segment.
 *
 * Checks the TCP header length and initialises checksum state, looks up
 * the socket for the segment and then dispatches on the socket state:
 *   - TCP_TIME_WAIT sockets are handled at do_time_wait;
 *   - TCP_NEW_SYN_RECV request socks are passed to tcp_check_req() on
 *     behalf of their listener;
 *   - TCP_LISTEN sockets go straight to tcp_v4_do_rcv();
 *   - all other sockets are processed under bh_lock_sock_nested(),
 *     either directly through tcp_v4_do_rcv() or, when the socket is
 *     owned by user context, by queueing via tcp_add_backlog().
 *
 * Returns the value of tcp_v4_do_rcv() when that function was called,
 * and 0 on every other path.  All discard paths free the skb with
 * kfree_skb().
 */
1692 int tcp_v4_rcv(struct sk_buff *skb)
1693 {
1694         struct net *net = dev_net(skb->dev);
1695         int sdif = inet_sdif(skb);
1696         const struct iphdr *iph;
1697         const struct tcphdr *th;
1698         bool refcounted;
1699         struct sock *sk;
1700         int ret;
1701
1702         if (skb->pkt_type != PACKET_HOST)
1703                 goto discard_it;
1704
1705         /* Count it even if it's bad */
1706         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1707
1708         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1709                 goto discard_it;
1710
1711         th = (const struct tcphdr *)skb->data;
1712
1713         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1714                 goto bad_packet;
1715         if (!pskb_may_pull(skb, th->doff * 4))
1716                 goto discard_it;
1717
1718         /* An explanation is required here, I think.
1719          * Packet length and doff are validated by header prediction,
1720          * provided case of th->doff==0 is eliminated.
1721          * So, we defer the checks. */
1722
1723         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1724                 goto csum_error;
1725
             /* Header pointers are re-read from the skb here.
              * NOTE(review): presumably because the pskb_may_pull() calls
              * above may have moved skb->data - confirm.
              */
1726         th = (const struct tcphdr *)skb->data;
1727         iph = ip_hdr(skb);
1728 lookup:
1729         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1730                                th->dest, sdif, &refcounted);
1731         if (!sk)
1732                 goto no_tcp_socket;
1733
1734 process:
1735         if (sk->sk_state == TCP_TIME_WAIT)
1736                 goto do_time_wait;
1737
             /* The lookup returned a request sock: from here on sk is
              * switched to the listener recorded in req->rsk_listener.
              */
1738         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1739                 struct request_sock *req = inet_reqsk(sk);
1740                 bool req_stolen = false;
1741                 struct sock *nsk;
1742
1743                 sk = req->rsk_listener;
1744                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1745                         sk_drops_add(sk, skb);
1746                         reqsk_put(req);
1747                         goto discard_it;
1748                 }
1749                 if (tcp_checksum_complete(skb)) {
1750                         reqsk_put(req);
1751                         goto csum_error;
1752                 }
                     /* Listener is no longer listening: drop the request
                      * and redo the socket lookup for this segment.
                      */
1753                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1754                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1755                         goto lookup;
1756                 }
1757                 /* We own a reference on the listener, increase it again
1758                  * as we might lose it too soon.
1759                  */
1760                 sock_hold(sk);
1761                 refcounted = true;
1762                 nsk = NULL;
1763                 if (!tcp_filter(sk, skb)) {
1764                         th = (const struct tcphdr *)skb->data;
1765                         iph = ip_hdr(skb);
1766                         tcp_v4_fill_cb(skb, iph, th);
1767                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1768                 }
1769                 if (!nsk) {
1770                         reqsk_put(req);
1771                         if (req_stolen) {
1772                                 /* Another cpu got exclusive access to req
1773                                  * and created a full blown socket.
1774                                  * Try to feed this packet to this socket
1775                                  * instead of discarding it.
1776                                  */
1777                                 tcp_v4_restore_cb(skb);
1778                                 sock_put(sk);
1779                                 goto lookup;
1780                         }
1781                         goto discard_and_relse;
1782                 }
                     /* tcp_check_req() handed back the listener itself:
                      * restore the cb and continue with the normal
                      * processing below on the listener.
                      */
1783                 if (nsk == sk) {
1784                         reqsk_put(req);
1785                         tcp_v4_restore_cb(skb);
1786                 } else if (tcp_child_process(sk, nsk, skb)) {
1787                         tcp_v4_send_reset(nsk, skb);
1788                         goto discard_and_relse;
1789                 } else {
1790                         sock_put(sk);
1791                         return 0;
1792                 }
1793         }
1794         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1795                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1796                 goto discard_and_relse;
1797         }
1798
1799         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1800                 goto discard_and_relse;
1801
1802         if (tcp_v4_inbound_md5_hash(sk, skb))
1803                 goto discard_and_relse;
1804
1805         nf_reset(skb);
1806
1807         if (tcp_filter(sk, skb))
1808                 goto discard_and_relse;
1809         th = (const struct tcphdr *)skb->data;
1810         iph = ip_hdr(skb);
1811         tcp_v4_fill_cb(skb, iph, th);
1812
1813         skb->dev = NULL;
1814
             /* Listeners are handed to tcp_v4_do_rcv() without taking
              * the socket lock.
              */
1815         if (sk->sk_state == TCP_LISTEN) {
1816                 ret = tcp_v4_do_rcv(sk, skb);
1817                 goto put_and_return;
1818         }
1819
1820         sk_incoming_cpu_update(sk);
1821
1822         bh_lock_sock_nested(sk);
1823         tcp_segs_in(tcp_sk(sk), skb);
1824         ret = 0;
1825         if (!sock_owned_by_user(sk)) {
1826                 ret = tcp_v4_do_rcv(sk, skb);
1827         } else if (tcp_add_backlog(sk, skb)) {
                     /* tcp_add_backlog() already called bh_unlock_sock()
                      * on its failure path, so no unlock is needed here.
                      */
1828                 goto discard_and_relse;
1829         }
1830         bh_unlock_sock(sk);
1831
1832 put_and_return:
1833         if (refcounted)
1834                 sock_put(sk);
1835
1836         return ret;
1837
     /* No socket matched: a checksum failure is only counted, anything
      * else is answered with a reset before the skb is freed.
      */
1838 no_tcp_socket:
1839         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1840                 goto discard_it;
1841
1842         tcp_v4_fill_cb(skb, iph, th);
1843
1844         if (tcp_checksum_complete(skb)) {
     /* csum_error and bad_packet are also jump targets from the paths
      * above; they share the error accounting of this branch.
      */
1845 csum_error:
1846                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1847 bad_packet:
1848                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1849         } else {
1850                 tcp_v4_send_reset(NULL, skb);
1851         }
1852
1853 discard_it:
1854         /* Discard frame. */
1855         kfree_skb(skb);
1856         return 0;
1857
     /* Account the drop on sk, release our reference if we hold one,
      * then free the skb.
      */
1858 discard_and_relse:
1859         sk_drops_add(sk, skb);
1860         if (refcounted)
1861                 sock_put(sk);
1862         goto discard_it;
1863
1864 do_time_wait:
1865         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1866                 inet_twsk_put(inet_twsk(sk));
1867                 goto discard_it;
1868         }
1869
1870         tcp_v4_fill_cb(skb, iph, th);
1871
1872         if (tcp_checksum_complete(skb)) {
1873                 inet_twsk_put(inet_twsk(sk));
1874                 goto csum_error;
1875         }
1876         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1877         case TCP_TW_SYN: {
1878                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1879                                                         &tcp_hashinfo, skb,
1880                                                         __tcp_hdrlen(th),
1881                                                         iph->saddr, th->source,
1882                                                         iph->daddr, th->dest,
1883                                                         inet_iif(skb),
1884                                                         sdif);
                     /* A listener was found: release the timewait sock
                      * and restart processing on the listener.  With
                      * refcounted cleared, the later sock_put() calls
                      * are skipped for sk2.
                      */
1885                 if (sk2) {
1886                         inet_twsk_deschedule_put(inet_twsk(sk));
1887                         sk = sk2;
1888                         tcp_v4_restore_cb(skb);
1889                         refcounted = false;
1890                         goto process;
1891                 }
1892         }
1893                 /* to ACK */
1894                 /* fall through */
1895         case TCP_TW_ACK:
1896                 tcp_v4_timewait_ack(sk, skb);
1897                 break;
1898         case TCP_TW_RST:
1899                 tcp_v4_send_reset(sk, skb);
1900                 inet_twsk_deschedule_put(inet_twsk(sk));
1901                 goto discard_it;
1902         case TCP_TW_SUCCESS:;
1903         }
1904         goto discard_it;
1905 }
1906
1907 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1908         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1909         .twsk_unique    = tcp_twsk_unique,
1910         .twsk_destructor= tcp_twsk_destructor,
1911 };
1912
1913 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1914 {
1915         struct dst_entry *dst = skb_dst(skb);
1916
1917         if (dst && dst_hold_safe(dst)) {
1918                 sk->sk_rx_dst = dst;
1919                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1920         }
1921 }
1922 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1923
1924 const struct inet_connection_sock_af_ops ipv4_specific = {
1925         .queue_xmit        = ip_queue_xmit,
1926         .send_check        = tcp_v4_send_check,
1927         .rebuild_header    = inet_sk_rebuild_header,
1928         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1929         .conn_request      = tcp_v4_conn_request,
1930         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1931         .net_header_len    = sizeof(struct iphdr),
1932         .setsockopt        = ip_setsockopt,
1933         .getsockopt        = ip_getsockopt,
1934         .addr2sockaddr     = inet_csk_addr2sockaddr,
1935         .sockaddr_len      = sizeof(struct sockaddr_in),
1936 #ifdef CONFIG_COMPAT
1937         .compat_setsockopt = compat_ip_setsockopt,
1938         .compat_getsockopt = compat_ip_getsockopt,
1939 #endif
1940         .mtu_reduced       = tcp_v4_mtu_reduced,
1941 };
1942 EXPORT_SYMBOL(ipv4_specific);
1943
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5-signature callbacks, installed by tcp_v4_init_sock(). */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup     = tcp_v4_md5_lookup,
        .calc_md5_hash  = tcp_v4_md5_hash_skb,
        .md5_parse      = tcp_v4_parse_md5_keys,
};
#endif
1951
1952 /* NOTE: A lot of things set to zero explicitly by call to
1953  *       sk_alloc() so need not be done here.
1954  */
1955 static int tcp_v4_init_sock(struct sock *sk)
1956 {
1957         struct inet_connection_sock *icsk = inet_csk(sk);
1958
1959         tcp_init_sock(sk);
1960
1961         icsk->icsk_af_ops = &ipv4_specific;
1962
1963 #ifdef CONFIG_TCP_MD5SIG
1964         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1965 #endif
1966
1967         return 0;
1968 }
1969
1970 void tcp_v4_destroy_sock(struct sock *sk)
1971 {
1972         struct tcp_sock *tp = tcp_sk(sk);
1973
1974         trace_tcp_destroy_sock(sk);
1975
1976         tcp_clear_xmit_timers(sk);
1977
1978         tcp_cleanup_congestion_control(sk);
1979
1980         tcp_cleanup_ulp(sk);
1981
1982         /* Cleanup up the write buffer. */
1983         tcp_write_queue_purge(sk);
1984
1985         /* Check if we want to disable active TFO */
1986         tcp_fastopen_active_disable_ofo_check(sk);
1987
1988         /* Cleans up our, hopefully empty, out_of_order_queue. */
1989         skb_rbtree_purge(&tp->out_of_order_queue);
1990
1991 #ifdef CONFIG_TCP_MD5SIG
1992         /* Clean up the MD5 key list, if any */
1993         if (tp->md5sig_info) {
1994                 tcp_clear_md5_list(sk);
1995                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1996                 tp->md5sig_info = NULL;
1997         }
1998 #endif
1999
2000         /* Clean up a referenced TCP bind bucket. */
2001         if (inet_csk(sk)->icsk_bind_hash)
2002                 inet_put_port(sk);
2003
2004         BUG_ON(tp->fastopen_rsk);
2005
2006         /* If socket is aborted during connect operation */
2007         tcp_free_fastopen_req(tp);
2008         tcp_fastopen_destroy_cipher(sk);
2009         tcp_saved_syn_free(tp);
2010
2011         sk_sockets_allocated_dec(sk);
2012 }
2013 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2014
2015 #ifdef CONFIG_PROC_FS
2016 /* Proc filesystem TCP sock list dumping. */
2017
2018 /*
2019  * Get next listener socket follow cur.  If cur is NULL, get first socket
2020  * starting from bucket given in st->bucket; when st->bucket is zero the
2021  * very first socket in the hash table is returned.
2022  */
2023 static void *listening_get_next(struct seq_file *seq, void *cur)
2024 {
2025         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2026         struct tcp_iter_state *st = seq->private;
2027         struct net *net = seq_file_net(seq);
2028         struct inet_listen_hashbucket *ilb;
2029         struct sock *sk = cur;
2030
2031         if (!sk) {
2032 get_head:
2033                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2034                 spin_lock(&ilb->lock);
2035                 sk = sk_head(&ilb->head);
2036                 st->offset = 0;
2037                 goto get_sk;
2038         }
2039         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2040         ++st->num;
2041         ++st->offset;
2042
2043         sk = sk_next(sk);
2044 get_sk:
2045         sk_for_each_from(sk) {
2046                 if (!net_eq(sock_net(sk), net))
2047                         continue;
2048                 if (sk->sk_family == afinfo->family)
2049                         return sk;
2050         }
2051         spin_unlock(&ilb->lock);
2052         st->offset = 0;
2053         if (++st->bucket < INET_LHTABLE_SIZE)
2054                 goto get_head;
2055         return NULL;
2056 }
2057
2058 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059 {
2060         struct tcp_iter_state *st = seq->private;
2061         void *rc;
2062
2063         st->bucket = 0;
2064         st->offset = 0;
2065         rc = listening_get_next(seq, NULL);
2066
2067         while (rc && *pos) {
2068                 rc = listening_get_next(seq, rc);
2069                 --*pos;
2070         }
2071         return rc;
2072 }
2073
2074 static inline bool empty_bucket(const struct tcp_iter_state *st)
2075 {
2076         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2077 }
2078
2079 /*
2080  * Get first established socket starting from bucket given in st->bucket.
2081  * If st->bucket is zero, the very first socket in the hash is returned.
2082  */
2083 static void *established_get_first(struct seq_file *seq)
2084 {
2085         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2086         struct tcp_iter_state *st = seq->private;
2087         struct net *net = seq_file_net(seq);
2088         void *rc = NULL;
2089
2090         st->offset = 0;
2091         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2092                 struct sock *sk;
2093                 struct hlist_nulls_node *node;
2094                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2095
2096                 /* Lockless fast path for the common case of empty buckets */
2097                 if (empty_bucket(st))
2098                         continue;
2099
2100                 spin_lock_bh(lock);
2101                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2102                         if (sk->sk_family != afinfo->family ||
2103                             !net_eq(sock_net(sk), net)) {
2104                                 continue;
2105                         }
2106                         rc = sk;
2107                         goto out;
2108                 }
2109                 spin_unlock_bh(lock);
2110         }
2111 out:
2112         return rc;
2113 }
2114
2115 static void *established_get_next(struct seq_file *seq, void *cur)
2116 {
2117         struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2118         struct sock *sk = cur;
2119         struct hlist_nulls_node *node;
2120         struct tcp_iter_state *st = seq->private;
2121         struct net *net = seq_file_net(seq);
2122
2123         ++st->num;
2124         ++st->offset;
2125
2126         sk = sk_nulls_next(sk);
2127
2128         sk_nulls_for_each_from(sk, node) {
2129                 if (sk->sk_family == afinfo->family &&
2130                     net_eq(sock_net(sk), net))
2131                         return sk;
2132         }
2133
2134         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2135         ++st->bucket;
2136         return established_get_first(seq);
2137 }
2138
2139 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2140 {
2141         struct tcp_iter_state *st = seq->private;
2142         void *rc;
2143
2144         st->bucket = 0;
2145         rc = established_get_first(seq);
2146
2147         while (rc && pos) {
2148                 rc = established_get_next(seq, rc);
2149                 --pos;
2150         }
2151         return rc;
2152 }
2153
2154 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2155 {
2156         void *rc;
2157         struct tcp_iter_state *st = seq->private;
2158
2159         st->state = TCP_SEQ_STATE_LISTENING;
2160         rc        = listening_get_idx(seq, &pos);
2161
2162         if (!rc) {
2163                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2164                 rc        = established_get_idx(seq, pos);
2165         }
2166
2167         return rc;
2168 }
2169
2170 static void *tcp_seek_last_pos(struct seq_file *seq)
2171 {
2172         struct tcp_iter_state *st = seq->private;
2173         int offset = st->offset;
2174         int orig_num = st->num;
2175         void *rc = NULL;
2176
2177         switch (st->state) {
2178         case TCP_SEQ_STATE_LISTENING:
2179                 if (st->bucket >= INET_LHTABLE_SIZE)
2180                         break;
2181                 st->state = TCP_SEQ_STATE_LISTENING;
2182                 rc = listening_get_next(seq, NULL);
2183                 while (offset-- && rc)
2184                         rc = listening_get_next(seq, rc);
2185                 if (rc)
2186                         break;
2187                 st->bucket = 0;
2188                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2189                 /* Fallthrough */
2190         case TCP_SEQ_STATE_ESTABLISHED:
2191                 if (st->bucket > tcp_hashinfo.ehash_mask)
2192                         break;
2193                 rc = established_get_first(seq);
2194                 while (offset-- && rc)
2195                         rc = established_get_next(seq, rc);
2196         }
2197
2198         st->num = orig_num;
2199
2200         return rc;
2201 }
2202
2203 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2204 {
2205         struct tcp_iter_state *st = seq->private;
2206         void *rc;
2207
2208         if (*pos && *pos == st->last_pos) {
2209                 rc = tcp_seek_last_pos(seq);
2210                 if (rc)
2211                         goto out;
2212         }
2213
2214         st->state = TCP_SEQ_STATE_LISTENING;
2215         st->num = 0;
2216         st->bucket = 0;
2217         st->offset = 0;
2218         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2219
2220 out:
2221         st->last_pos = *pos;
2222         return rc;
2223 }
2224 EXPORT_SYMBOL(tcp_seq_start);
2225
2226 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2227 {
2228         struct tcp_iter_state *st = seq->private;
2229         void *rc = NULL;
2230
2231         if (v == SEQ_START_TOKEN) {
2232                 rc = tcp_get_idx(seq, 0);
2233                 goto out;
2234         }
2235
2236         switch (st->state) {
2237         case TCP_SEQ_STATE_LISTENING:
2238                 rc = listening_get_next(seq, v);
2239                 if (!rc) {
2240                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2241                         st->bucket = 0;
2242                         st->offset = 0;
2243                         rc        = established_get_first(seq);
2244                 }
2245                 break;
2246         case TCP_SEQ_STATE_ESTABLISHED:
2247                 rc = established_get_next(seq, v);
2248                 break;
2249         }
2250 out:
2251         ++*pos;
2252         st->last_pos = *pos;
2253         return rc;
2254 }
2255 EXPORT_SYMBOL(tcp_seq_next);
2256
2257 void tcp_seq_stop(struct seq_file *seq, void *v)
2258 {
2259         struct tcp_iter_state *st = seq->private;
2260
2261         switch (st->state) {
2262         case TCP_SEQ_STATE_LISTENING:
2263                 if (v != SEQ_START_TOKEN)
2264                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2265                 break;
2266         case TCP_SEQ_STATE_ESTABLISHED:
2267                 if (v)
2268                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269                 break;
2270         }
2271 }
2272 EXPORT_SYMBOL(tcp_seq_stop);
2273
2274 static void get_openreq4(const struct request_sock *req,
2275                          struct seq_file *f, int i)
2276 {
2277         const struct inet_request_sock *ireq = inet_rsk(req);
2278         long delta = req->rsk_timer.expires - jiffies;
2279
2280         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2281                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2282                 i,
2283                 ireq->ir_loc_addr,
2284                 ireq->ir_num,
2285                 ireq->ir_rmt_addr,
2286                 ntohs(ireq->ir_rmt_port),
2287                 TCP_SYN_RECV,
2288                 0, 0, /* could print option size, but that is af dependent. */
2289                 1,    /* timers active (only the expire timer) */
2290                 jiffies_delta_to_clock_t(delta),
2291                 req->num_timeout,
2292                 from_kuid_munged(seq_user_ns(f),
2293                                  sock_i_uid(req->rsk_listener)),
2294                 0,  /* non standard timer */
2295                 0, /* open_requests have no inode */
2296                 0,
2297                 req);
2298 }
2299
2300 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2301 {
2302         int timer_active;
2303         unsigned long timer_expires;
2304         const struct tcp_sock *tp = tcp_sk(sk);
2305         const struct inet_connection_sock *icsk = inet_csk(sk);
2306         const struct inet_sock *inet = inet_sk(sk);
2307         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2308         __be32 dest = inet->inet_daddr;
2309         __be32 src = inet->inet_rcv_saddr;
2310         __u16 destp = ntohs(inet->inet_dport);
2311         __u16 srcp = ntohs(inet->inet_sport);
2312         int rx_queue;
2313         int state;
2314
2315         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2316             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2317             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2318                 timer_active    = 1;
2319                 timer_expires   = icsk->icsk_timeout;
2320         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2321                 timer_active    = 4;
2322                 timer_expires   = icsk->icsk_timeout;
2323         } else if (timer_pending(&sk->sk_timer)) {
2324                 timer_active    = 2;
2325                 timer_expires   = sk->sk_timer.expires;
2326         } else {
2327                 timer_active    = 0;
2328                 timer_expires = jiffies;
2329         }
2330
2331         state = inet_sk_state_load(sk);
2332         if (state == TCP_LISTEN)
2333                 rx_queue = sk->sk_ack_backlog;
2334         else
2335                 /* Because we don't lock the socket,
2336                  * we might find a transient negative value.
2337                  */
2338                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2339
2340         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2341                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2342                 i, src, srcp, dest, destp, state,
2343                 tp->write_seq - tp->snd_una,
2344                 rx_queue,
2345                 timer_active,
2346                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2347                 icsk->icsk_retransmits,
2348                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2349                 icsk->icsk_probes_out,
2350                 sock_i_ino(sk),
2351                 refcount_read(&sk->sk_refcnt), sk,
2352                 jiffies_to_clock_t(icsk->icsk_rto),
2353                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2354                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2355                 tp->snd_cwnd,
2356                 state == TCP_LISTEN ?
2357                     fastopenq->max_qlen :
2358                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2359 }
2360
2361 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2362                                struct seq_file *f, int i)
2363 {
2364         long delta = tw->tw_timer.expires - jiffies;
2365         __be32 dest, src;
2366         __u16 destp, srcp;
2367
2368         dest  = tw->tw_daddr;
2369         src   = tw->tw_rcv_saddr;
2370         destp = ntohs(tw->tw_dport);
2371         srcp  = ntohs(tw->tw_sport);
2372
2373         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2374                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2375                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2376                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2377                 refcount_read(&tw->tw_refcnt), tw);
2378 }
2379
2380 #define TMPSZ 150
2381
2382 static int tcp4_seq_show(struct seq_file *seq, void *v)
2383 {
2384         struct tcp_iter_state *st;
2385         struct sock *sk = v;
2386
2387         seq_setwidth(seq, TMPSZ - 1);
2388         if (v == SEQ_START_TOKEN) {
2389                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2390                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2391                            "inode");
2392                 goto out;
2393         }
2394         st = seq->private;
2395
2396         if (sk->sk_state == TCP_TIME_WAIT)
2397                 get_timewait4_sock(v, seq, st->num);
2398         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2399                 get_openreq4(v, seq, st->num);
2400         else
2401                 get_tcp4_sock(v, seq, st->num);
2402 out:
2403         seq_pad(seq, '\n');
2404         return 0;
2405 }
2406
2407 static const struct seq_operations tcp4_seq_ops = {
2408         .show           = tcp4_seq_show,
2409         .start          = tcp_seq_start,
2410         .next           = tcp_seq_next,
2411         .stop           = tcp_seq_stop,
2412 };
2413
2414 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2415         .family         = AF_INET,
2416 };
2417
2418 static int __net_init tcp4_proc_init_net(struct net *net)
2419 {
2420         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2421                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2422                 return -ENOMEM;
2423         return 0;
2424 }
2425
2426 static void __net_exit tcp4_proc_exit_net(struct net *net)
2427 {
2428         remove_proc_entry("tcp", net->proc_net);
2429 }
2430
2431 static struct pernet_operations tcp4_net_ops = {
2432         .init = tcp4_proc_init_net,
2433         .exit = tcp4_proc_exit_net,
2434 };
2435
2436 int __init tcp4_proc_init(void)
2437 {
2438         return register_pernet_subsys(&tcp4_net_ops);
2439 }
2440
2441 void tcp4_proc_exit(void)
2442 {
2443         unregister_pernet_subsys(&tcp4_net_ops);
2444 }
2445 #endif /* CONFIG_PROC_FS */
2446
2447 struct proto tcp_prot = {
2448         .name                   = "TCP",
2449         .owner                  = THIS_MODULE,
2450         .close                  = tcp_close,
2451         .pre_connect            = tcp_v4_pre_connect,
2452         .connect                = tcp_v4_connect,
2453         .disconnect             = tcp_disconnect,
2454         .accept                 = inet_csk_accept,
2455         .ioctl                  = tcp_ioctl,
2456         .init                   = tcp_v4_init_sock,
2457         .destroy                = tcp_v4_destroy_sock,
2458         .shutdown               = tcp_shutdown,
2459         .setsockopt             = tcp_setsockopt,
2460         .getsockopt             = tcp_getsockopt,
2461         .keepalive              = tcp_set_keepalive,
2462         .recvmsg                = tcp_recvmsg,
2463         .sendmsg                = tcp_sendmsg,
2464         .sendpage               = tcp_sendpage,
2465         .backlog_rcv            = tcp_v4_do_rcv,
2466         .release_cb             = tcp_release_cb,
2467         .hash                   = inet_hash,
2468         .unhash                 = inet_unhash,
2469         .get_port               = inet_csk_get_port,
2470         .enter_memory_pressure  = tcp_enter_memory_pressure,
2471         .leave_memory_pressure  = tcp_leave_memory_pressure,
2472         .stream_memory_free     = tcp_stream_memory_free,
2473         .sockets_allocated      = &tcp_sockets_allocated,
2474         .orphan_count           = &tcp_orphan_count,
2475         .memory_allocated       = &tcp_memory_allocated,
2476         .memory_pressure        = &tcp_memory_pressure,
2477         .sysctl_mem             = sysctl_tcp_mem,
2478         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2479         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2480         .max_header             = MAX_TCP_HEADER,
2481         .obj_size               = sizeof(struct tcp_sock),
2482         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2483         .twsk_prot              = &tcp_timewait_sock_ops,
2484         .rsk_prot               = &tcp_request_sock_ops,
2485         .h.hashinfo             = &tcp_hashinfo,
2486         .no_autobind            = true,
2487 #ifdef CONFIG_COMPAT
2488         .compat_setsockopt      = compat_tcp_setsockopt,
2489         .compat_getsockopt      = compat_tcp_getsockopt,
2490 #endif
2491         .diag_destroy           = tcp_abort,
2492 };
2493 EXPORT_SYMBOL(tcp_prot);
2494
2495 static void __net_exit tcp_sk_exit(struct net *net)
2496 {
2497         int cpu;
2498
2499         module_put(net->ipv4.tcp_congestion_control->owner);
2500
2501         for_each_possible_cpu(cpu)
2502                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2503         free_percpu(net->ipv4.tcp_sk);
2504 }
2505
2506 static int __net_init tcp_sk_init(struct net *net)
2507 {
2508         int res, cpu, cnt;
2509
2510         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2511         if (!net->ipv4.tcp_sk)
2512                 return -ENOMEM;
2513
2514         for_each_possible_cpu(cpu) {
2515                 struct sock *sk;
2516
2517                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2518                                            IPPROTO_TCP, net);
2519                 if (res)
2520                         goto fail;
2521                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2522
2523                 /* Please enforce IP_DF and IPID==0 for RST and
2524                  * ACK sent in SYN-RECV and TIME-WAIT state.
2525                  */
2526                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2527
2528                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2529         }
2530
2531         net->ipv4.sysctl_tcp_ecn = 2;
2532         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2533
2534         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2535         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2536         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2537
2538         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2539         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2540         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2541
2542         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2543         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2544         net->ipv4.sysctl_tcp_syncookies = 1;
2545         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2546         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2547         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2548         net->ipv4.sysctl_tcp_orphan_retries = 0;
2549         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2550         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2551         net->ipv4.sysctl_tcp_tw_reuse = 2;
2552
2553         cnt = tcp_hashinfo.ehash_mask + 1;
2554         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2555         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2556
2557         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2558         net->ipv4.sysctl_tcp_sack = 1;
2559         net->ipv4.sysctl_tcp_window_scaling = 1;
2560         net->ipv4.sysctl_tcp_timestamps = 1;
2561         net->ipv4.sysctl_tcp_early_retrans = 3;
2562         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2563         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2564         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2565         net->ipv4.sysctl_tcp_max_reordering = 300;
2566         net->ipv4.sysctl_tcp_dsack = 1;
2567         net->ipv4.sysctl_tcp_app_win = 31;
2568         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2569         net->ipv4.sysctl_tcp_frto = 2;
2570         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2571         /* This limits the percentage of the congestion window which we
2572          * will allow a single TSO frame to consume.  Building TSO frames
2573          * which are too large can cause TCP streams to be bursty.
2574          */
2575         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2576         /* Default TSQ limit of four TSO segments */
2577         net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2578         /* rfc5961 challenge ack rate limiting */
2579         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2580         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2581         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2582         net->ipv4.sysctl_tcp_autocorking = 1;
2583         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2584         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2585         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2586         if (net != &init_net) {
2587                 memcpy(net->ipv4.sysctl_tcp_rmem,
2588                        init_net.ipv4.sysctl_tcp_rmem,
2589                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2590                 memcpy(net->ipv4.sysctl_tcp_wmem,
2591                        init_net.ipv4.sysctl_tcp_wmem,
2592                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2593         }
2594         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2595         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2596         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2597         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2598         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2599         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2600
2601         /* Reno is always built in */
2602         if (!net_eq(net, &init_net) &&
2603             try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2604                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2605         else
2606                 net->ipv4.tcp_congestion_control = &tcp_reno;
2607
2608         return 0;
2609 fail:
2610         tcp_sk_exit(net);
2611
2612         return res;
2613 }
2614
2615 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616 {
2617         struct net *net;
2618
2619         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2620
2621         list_for_each_entry(net, net_exit_list, exit_list)
2622                 tcp_fastopen_ctx_destroy(net);
2623 }
2624
2625 static struct pernet_operations __net_initdata tcp_sk_ops = {
2626        .init       = tcp_sk_init,
2627        .exit       = tcp_sk_exit,
2628        .exit_batch = tcp_sk_exit_batch,
2629 };
2630
2631 void __init tcp_v4_init(void)
2632 {
2633         if (register_pernet_subsys(&tcp_sk_ops))
2634                 panic("Failed to create the TCP control socket.\n");
2635 }