net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/tcp_memcontrol.h>
  77 #include <net/busy_poll.h>
  78
  79 #include <linux/inet.h>
  80 #include <linux/ipv6.h>
  81 #include <linux/stddef.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/seq_file.h>
  84
  85 #include <linux/crypto.h>
  86 #include <linux/scatterlist.h>
   87
      /* Read by tcp_twsk_unique(): when the caller passes a non-NULL twp, a
       * TIME-WAIT bucket is only handed over if this is non-zero and the bucket's
       * recorded timestamp is more than one second old.
       */
   88 int sysctl_tcp_tw_reuse __read_mostly;
      /* NOTE(review): not read in the part of the file reviewed here; it is
       * exported, so presumably other TCP code consumes it - confirm.
       */
   89 int sysctl_tcp_low_latency __read_mostly;
   90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
   91
   92 #ifdef CONFIG_TCP_MD5SIG
      /* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() call this
       * to fill in the MD5 signature option of the replies they build.
       */
   93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
   94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
   95 #endif
   96
      /* Hash tables handed to the socket lookups in tcp_v4_err() and
       * tcp_v4_send_reset().
       */
   97 struct inet_hashinfo tcp_hashinfo;
   98 EXPORT_SYMBOL(tcp_hashinfo);
  99
 100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 101 {
 102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 103                                           ip_hdr(skb)->saddr,
 104                                           tcp_hdr(skb)->dest,
 105                                           tcp_hdr(skb)->source);
 106 }
 107
 108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 109 {
 110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111         struct tcp_sock *tp = tcp_sk(sk);
 112
 113         /* With PAWS, it is safe from the viewpoint
 114            of data integrity. Even without PAWS it is safe provided sequence
 115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 116
 117            Actually, the idea is close to VJ's one, only timestamp cache is
 118            held not per host, but per port pair and TW bucket is used as state
 119            holder.
 120
 121            If TW bucket has been already destroyed we fall back to VJ's scheme
 122            and use initial timestamp retrieved from peer table.
 123          */
 124         if (tcptw->tw_ts_recent_stamp &&
 125             (!twp || (sysctl_tcp_tw_reuse &&
 126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 128                 if (tp->write_seq == 0)
 129                         tp->write_seq = 1;
 130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 132                 sock_hold(sktw);
 133                 return 1;
 134         }
 135
 136         return 0;
 137 }
 138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 139
  140 /* tcp_v4_connect - initiate an outgoing IPv4 TCP connection.
       * @sk:       socket to connect
       * @uaddr:    destination address, read as a struct sockaddr_in
       * @addr_len: length of @uaddr in bytes
       *
       * Routes towards the destination, fills in the addresses and ports of the
       * socket, moves it to SYN-SENT, hashes it with inet_hash_connect() and
       * finally calls tcp_connect().
       *
       * Returns 0 on success.  Otherwise returns a negative errno: -EINVAL if
       * @addr_len is too short or a source-routed connect targets address 0,
       * -EAFNOSUPPORT if the family is not AF_INET, -ENETUNREACH if the route is
       * multicast or broadcast, or the error reported by the routing, hashing or
       * tcp_connect() step.
       */
  141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  142 {
  143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
  144         struct inet_sock *inet = inet_sk(sk);
  145         struct tcp_sock *tp = tcp_sk(sk);
  146         __be16 orig_sport, orig_dport;
  147         __be32 daddr, nexthop;
  148         struct flowi4 *fl4;
  149         struct rtable *rt;
  150         int err;
  151         struct ip_options_rcu *inet_opt;
  152
  153         if (addr_len < sizeof(struct sockaddr_in))
  154                 return -EINVAL;
  155
  156         if (usin->sin_family != AF_INET)
  157                 return -EAFNOSUPPORT;
  158
              /* By default the route is looked up for the requested address.
               * With a source-route option it is looked up for the option's
               * first hop (faddr) instead.
               */
  159         nexthop = daddr = usin->sin_addr.s_addr;
  160         inet_opt = rcu_dereference_protected(inet->inet_opt,
  161                                              sock_owned_by_user(sk));
  162         if (inet_opt && inet_opt->opt.srr) {
  163                 if (!daddr)
  164                         return -EINVAL;
  165                 nexthop = inet_opt->opt.faddr;
  166         }
  167
              /* Remember the ports used for this lookup: ip_route_newports()
               * below is given both this pair and the final one.
               */
  168         orig_sport = inet->inet_sport;
  169         orig_dport = usin->sin_port;
  170         fl4 = &inet->cork.fl.u.ip4;
  171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
  172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
  173                               IPPROTO_TCP,
  174                               orig_sport, orig_dport, sk);
  175         if (IS_ERR(rt)) {
  176                 err = PTR_ERR(rt);
  177                 if (err == -ENETUNREACH)
  178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
  179                 return err;
  180         }
  181
              /* A multicast or broadcast route cannot carry a TCP connection. */
  182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
  183                 ip_rt_put(rt);
  184                 return -ENETUNREACH;
  185         }
  186
              /* Without source routing, use the destination recorded in the flow
               * by the route lookup.
               */
  187         if (!inet_opt || !inet_opt->opt.srr)
  188                 daddr = fl4->daddr;
  189
              /* No source address on the socket yet: take the one in the flow. */
  190         if (!inet->inet_saddr)
  191                 inet->inet_saddr = fl4->saddr;
  192         sk_rcv_saddr_set(sk, inet->inet_saddr);
  193
              /* Existing timestamp state is only kept when the destination is
               * the one already recorded in inet_daddr.  write_seq is left
               * alone in repair mode.
               */
  194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
  195                 /* Reset inherited state */
  196                 tp->rx_opt.ts_recent       = 0;
  197                 tp->rx_opt.ts_recent_stamp = 0;
  198                 if (likely(!tp->repair))
  199                         tp->write_seq      = 0;
  200         }
  201
              /* With tw_recycle enabled and no timestamp state on the socket,
               * try tcp_fetch_timewait_stamp(); skipped when daddr differs
               * from the routed destination (the source-route case above).
               */
  202         if (tcp_death_row.sysctl_tw_recycle &&
  203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
  204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
  205
  206         inet->inet_dport = usin->sin_port;
  207         sk_daddr_set(sk, daddr);
  208
              /* Account for the length of any IP options on this socket. */
  209         inet_csk(sk)->icsk_ext_hdr_len = 0;
  210         if (inet_opt)
  211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
  212
  213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
  214
  215         /* Socket identity is still unknown (sport may be zero).
  216          * However we set the state to SYN-SENT now and, without releasing
  217          * the socket lock, select a source port, enter ourselves into the
  218          * hash tables and complete initialization after this.
  219          */
  220         tcp_set_state(sk, TCP_SYN_SENT);
  221         err = inet_hash_connect(&tcp_death_row, sk);
  222         if (err)
  223                 goto failure;
  224
  225         sk_set_txhash(sk);
  226
              /* Hand ip_route_newports() the original and the final port pair.
               * On error rt is cleared so the failure path does not pass an
               * error pointer to ip_rt_put().
               */
  227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
  228                                inet->inet_sport, inet->inet_dport, sk);
  229         if (IS_ERR(rt)) {
  230                 err = PTR_ERR(rt);
  231                 rt = NULL;
  232                 goto failure;
  233         }
  234         /* OK, now commit destination to socket.  */
  235         sk->sk_gso_type = SKB_GSO_TCPV4;
  236         sk_setup_caps(sk, &rt->dst);
  237
              /* Pick an initial sequence number unless one is already set (for
               * example by tcp_twsk_unique()) or the socket is in repair mode.
               */
  238         if (!tp->write_seq && likely(!tp->repair))
  239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
  240                                                            inet->inet_daddr,
  241                                                            inet->inet_sport,
  242                                                            usin->sin_port);
  243
  244         inet->inet_id = tp->write_seq ^ jiffies;
  245
  246         err = tcp_connect(sk);
  247
              /* From here on the failure path must not release rt.
               * NOTE(review): presumably sk_setup_caps() above handed the route
               * to the socket - confirm.
               */
  248         rt = NULL;
  249         if (err)
  250                 goto failure;
  251
  252         return 0;
  253
  254 failure:
  255         /*
  256          * This unhashes the socket and releases the local port,
  257          * if necessary.
  258          */
  259         tcp_set_state(sk, TCP_CLOSE);
              /* rt is NULL on some of the paths that reach this point. */
  260         ip_rt_put(rt);
  261         sk->sk_route_caps = 0;
  262         inet->inet_dport = 0;
  263         return err;
  264 }
  265 EXPORT_SYMBOL(tcp_v4_connect);
 266
 267 /*
 268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 269  * It can be called through tcp_release_cb() if socket was owned by user
 270  * at the time tcp_v4_err() was called to handle ICMP message.
 271  */
 272 void tcp_v4_mtu_reduced(struct sock *sk)
 273 {
 274         struct inet_sock *inet = inet_sk(sk);
 275         struct dst_entry *dst;
 276         u32 mtu;
 277
 278         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 279                 return;
 280         mtu = tcp_sk(sk)->mtu_info;
 281         dst = inet_csk_update_pmtu(sk, mtu);
 282         if (!dst)
 283                 return;
 284
 285         /* Something is about to be wrong... Remember soft error
 286          * for the case, if this connection will not able to recover.
 287          */
 288         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 289                 sk->sk_err_soft = EMSGSIZE;
 290
 291         mtu = dst_mtu(dst);
 292
 293         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 294             ip_sk_accept_pmtu(sk) &&
 295             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 296                 tcp_sync_mss(sk, mtu);
 297
 298                 /* Resend the TCP packet because it's
 299                  * clear that the old packet has been
 300                  * dropped. This is the new "fast" path mtu
 301                  * discovery.
 302                  */
 303                 tcp_simple_retransmit(sk);
 304         } /* else let the usual retransmit timer handle it */
 305 }
 306 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 307
 308 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 309 {
 310         struct dst_entry *dst = __sk_dst_check(sk, 0);
 311
 312         if (dst)
 313                 dst->ops->redirect(dst, sk, skb);
 314 }
 315
 316
 317 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 318 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 319 {
 320         struct request_sock *req = inet_reqsk(sk);
 321         struct net *net = sock_net(sk);
 322
 323         /* ICMPs are not backlogged, hence we cannot get
 324          * an established socket here.
 325          */
 326         if (seq != tcp_rsk(req)->snt_isn) {
 327                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 328         } else if (abort) {
 329                 /*
 330                  * Still in SYN_RECV, just remove it silently.
 331                  * There is no good way to pass the error to the newly
 332                  * created socket, and POSIX does not want network
 333                  * errors returned from accept().
 334                  */
 335                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 336                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
 337         }
 338         reqsk_put(req);
 339 }
 340 EXPORT_SYMBOL(tcp_req_err);
 341
  342 /*
  343  * This routine is called by the ICMP module when it gets some
  344  * sort of error condition.  If err < 0 then the socket should
  345  * be closed and the error returned to the user.  If err > 0
  346  * it's just the icmp type << 8 | icmp code.  After adjustment
  347  * header points to the first 8 bytes of the tcp header.  We need
  348  * to find the appropriate port.
       *
       * NOTE(review): the function has no "err" parameter; the ICMP type and
       * code are read from icmp_hdr(icmp_skb), and @info is only used as the
       * MTU stored for ICMP_FRAG_NEEDED.  The sentences above look like the
       * description of an older interface - confirm and update.
  349  *
  350  * The locking strategy used here is very "optimistic". When
  351  * someone else accesses the socket the ICMP is just dropped
  352  * and for some paths there is no check at all.
  353  * A more general error queue to queue errors for later handling
  354  * is probably better.
  355  *
       * @icmp_skb: the ICMP message; its data is read as an IP header followed
       *            by a TCP header
       * @info:     stored in tp->mtu_info for ICMP_FRAG_NEEDED, otherwise unused
       *
       * Every path that found a socket ends with a put: inet_twsk_put(),
       * reqsk_put() inside tcp_req_err(), or sock_put() at the "out" label.
  356  */
  357
  358 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  359 {
  360         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
  361         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
  362         struct inet_connection_sock *icsk;
  363         struct tcp_sock *tp;
  364         struct inet_sock *inet;
  365         const int type = icmp_hdr(icmp_skb)->type;
  366         const int code = icmp_hdr(icmp_skb)->code;
  367         struct sock *sk;
  368         struct sk_buff *skb;
  369         struct request_sock *fastopen;
  370         __u32 seq, snd_una;
  371         __u32 remaining;
  372         int err;
  373         struct net *net = dev_net(icmp_skb->dev);
  374
              /* iph/th describe the packet quoted inside the ICMP message, so
               * the lookup is keyed with that packet's destination first and
               * its source second.  NOTE(review): argument order assumed from
               * the prototype of __inet_lookup_established() - confirm.
               */
  375         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
  376                                        th->dest, iph->saddr, ntohs(th->source),
  377                                        inet_iif(icmp_skb));
  378         if (!sk) {
  379                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
  380                 return;
  381         }
              /* Errors for TIME-WAIT sockets are ignored. */
  382         if (sk->sk_state == TCP_TIME_WAIT) {
  383                 inet_twsk_put(inet_twsk(sk));
  384                 return;
  385         }
  386         seq = ntohl(th->seq);
              /* Request sockets are handled by tcp_req_err().  The third
               * argument is its "abort" flag: true for parameter problem, time
               * exceeded and net/host unreachable.
               */
  387         if (sk->sk_state == TCP_NEW_SYN_RECV)
  388                 return tcp_req_err(sk, seq,
  389                                   type == ICMP_PARAMETERPROB ||
  390                                   type == ICMP_TIME_EXCEEDED ||
  391                                   (type == ICMP_DEST_UNREACH &&
  392                                    (code == ICMP_NET_UNREACH ||
  393                                     code == ICMP_HOST_UNREACH)));
  394
  395         bh_lock_sock(sk);
  396         /* If too many ICMPs get dropped on busy
  397          * servers this needs to be solved differently.
  398          * We do take care of PMTU discovery (RFC1191) special case :
  399          * we can receive locally generated ICMP messages while socket is held.
  400          */
              /* Only the statistic is updated here; processing continues and
               * the individual steps below re-check sock_owned_by_user().
               */
  401         if (sock_owned_by_user(sk)) {
  402                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
  403                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
  404         }
  405         if (sk->sk_state == TCP_CLOSE)
  406                 goto out;
  407
  408         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
  409                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
  410                 goto out;
  411         }
  412
  413         icsk = inet_csk(sk);
  414         tp = tcp_sk(sk);
  415         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
  416         fastopen = tp->fastopen_rsk;
  417         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
              /* Except for listeners, the quoted sequence number must lie
               * between snd_una and snd_nxt, otherwise the ICMP is dropped.
               */
  418         if (sk->sk_state != TCP_LISTEN &&
  419             !between(seq, snd_una, tp->snd_nxt)) {
  420                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
  421                 goto out;
  422         }
  423
              /* First switch: map the ICMP type/code to an errno in "err", or
               * handle the message completely and jump to "out".
               */
  424         switch (type) {
  425         case ICMP_REDIRECT:
  426                 if (!sock_owned_by_user(sk))
  427                         do_redirect(icmp_skb, sk);
  428                 goto out;
  429         case ICMP_SOURCE_QUENCH:
  430                 /* Just silently ignore these. */
  431                 goto out;
  432         case ICMP_PARAMETERPROB:
  433                 err = EPROTO;
  434                 break;
  435         case ICMP_DEST_UNREACH:
                      /* code indexes icmp_err_convert[] below. */
  436                 if (code > NR_ICMP_UNREACH)
  437                         goto out;
  438
  439                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
  440                         /* We are not interested in TCP_LISTEN and open_requests
  441                          * (SYN-ACKs send out by Linux are always <576bytes so
  442                          * they should go through unfragmented).
  443                          */
  444                         if (sk->sk_state == TCP_LISTEN)
  445                                 goto out;
  446
                              /* Apply the new MTU now, or mark it deferred when
                               * the user owns the socket.  A reference is taken
                               * only if the deferred bit was not already set.
                               * NOTE(review): presumably released when the
                               * deferred work runs via tcp_release_cb() - confirm.
                               */
  447                         tp->mtu_info = info;
  448                         if (!sock_owned_by_user(sk)) {
  449                                 tcp_v4_mtu_reduced(sk);
  450                         } else {
  451                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
  452                                         sock_hold(sk);
  453                         }
  454                         goto out;
  455                 }
  456
  457                 err = icmp_err_convert[code].errno;
  458                 /* check if icmp_skb allows revert of backoff
  459                  * (see draft-zimmermann-tcp-lcd) */
  460                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
  461                         break;
                      /* Revert only if the ICMP quotes the oldest unacked
                       * sequence number, a retransmission with backoff is in
                       * progress, and this is not a fast open socket.
                       */
  462                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
  463                     !icsk->icsk_backoff || fastopen)
  464                         break;
  465
  466                 if (sock_owned_by_user(sk))
  467                         break;
  468
                      /* Undo one backoff step and recompute the RTO from it. */
  469                 icsk->icsk_backoff--;
  470                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
  471                                                TCP_TIMEOUT_INIT;
  472                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
  473
  474                 skb = tcp_write_queue_head(sk);
  475                 BUG_ON(!skb);
  476
                      /* Time left on the new RTO, measured from the timestamp
                       * of the head of the write queue; min() keeps it from
                       * going below zero.
                       */
  477                 remaining = icsk->icsk_rto -
  478                             min(icsk->icsk_rto,
  479                                 tcp_time_stamp - tcp_skb_timestamp(skb));
  480
  481                 if (remaining) {
  482                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
  483                                                   remaining, TCP_RTO_MAX);
  484                 } else {
  485                         /* RTO revert clocked out retransmission.
  486                          * Will retransmit now */
  487                         tcp_retransmit_timer(sk);
  488                 }
  489
  490                 break;
  491         case ICMP_TIME_EXCEEDED:
  492                 err = EHOSTUNREACH;
  493                 break;
  494         default:
  495                 goto out;
  496         }
  497
              /* Second switch: connections that are still being set up fail
               * with a hard error.  There is no default case, so every other
               * state continues with the code after the switch.
               */
  498         switch (sk->sk_state) {
  499         case TCP_SYN_SENT:
  500         case TCP_SYN_RECV:
  501                 /* Only in fast or simultaneous open. If a fast open socket is
  502                  * already accepted it is treated as a connected one below.
  503                  */
  504                 if (fastopen && !fastopen->sk)
  505                         break;
  506
  507                 if (!sock_owned_by_user(sk)) {
  508                         sk->sk_err = err;
  509
  510                         sk->sk_error_report(sk);
  511
  512                         tcp_done(sk);
  513                 } else {
  514                         sk->sk_err_soft = err;
  515                 }
  516                 goto out;
  517         }
  518
  519         /* If we've already connected we will keep trying
  520          * until we time out, or the user gives up.
  521          *
  522          * rfc1122 4.2.3.9 allows to consider as hard errors
  523          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  524          * but it is obsoleted by pmtu discovery).
  525          *
  526          * Note, that in modern internet, where routing is unreliable
  527          * and in each dark corner broken firewalls sit, sending random
  528          * errors ordered by their masters even this two messages finally lose
  529          * their original sense (even Linux sends invalid PORT_UNREACHs)
  530          *
  531          * Now we are in compliance with RFCs.
  532          *                                                      --ANK (980905)
  533          */
  534
              /* The error is reported immediately only when the socket has
               * recverr set and is not owned by the user; otherwise it is kept
               * as a soft error.
               */
  535         inet = inet_sk(sk);
  536         if (!sock_owned_by_user(sk) && inet->recverr) {
  537                 sk->sk_err = err;
  538                 sk->sk_error_report(sk);
  539         } else  { /* Only an error on timeout */
  540                 sk->sk_err_soft = err;
  541         }
  542
  543 out:
  544         bh_unlock_sock(sk);
  545         sock_put(sk);
  546 }
 547
 548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 549 {
 550         struct tcphdr *th = tcp_hdr(skb);
 551
 552         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 553                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 554                 skb->csum_start = skb_transport_header(skb) - skb->head;
 555                 skb->csum_offset = offsetof(struct tcphdr, check);
 556         } else {
 557                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 558                                          csum_partial(th,
 559                                                       th->doff << 2,
 560                                                       skb->csum));
 561         }
 562 }
 563
 564 /* This routine computes an IPv4 TCP checksum. */
 565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 566 {
 567         const struct inet_sock *inet = inet_sk(sk);
 568
 569         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 570 }
 571 EXPORT_SYMBOL(tcp_v4_send_check);
 572
  573 /*
  574  *      This routine will send an RST to the other tcp.
  575  *
  576  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  577  *                    for the reset?
  578  *      Answer: if a packet caused an RST, it is not for a socket
  579  *              existing in our system; if it is matched to a socket,
  580  *              it is just a duplicate segment or a bug in the other
  581  *              side's TCP.  So we build the reply based only on the
  582  *              parameters that arrived with the segment.
  583  *      Exception: precedence violation. We do not implement it in any case.
       *
       *      @sk:  socket the segment was matched to, or NULL
       *      @skb: the segment being answered
       *
       *      Nothing is sent when @skb itself carries RST, when there is no
       *      socket and the input route is not RTN_LOCAL, or (CONFIG_TCP_MD5SIG)
       *      when there is no socket and the segment's MD5 option cannot be
       *      verified against a key found through a listener.
  584  */
  585
  586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
  587 {
  588         const struct tcphdr *th = tcp_hdr(skb);
              /* Reply buffer: TCP header, plus room for one MD5 option. */
  589         struct {
  590                 struct tcphdr th;
  591 #ifdef CONFIG_TCP_MD5SIG
  592                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
  593 #endif
  594         } rep;
  595         struct ip_reply_arg arg;
  596 #ifdef CONFIG_TCP_MD5SIG
  597         struct tcp_md5sig_key *key;
  598         const __u8 *hash_location = NULL;
  599         unsigned char newhash[16];
  600         int genhash;
  601         struct sock *sk1 = NULL;
  602 #endif
  603         struct net *net;
  604
  605         /* Never send a reset in response to a reset. */
  606         if (th->rst)
  607                 return;
  608
  609         /* If sk not NULL, it means we did a successful lookup and incoming
  610          * route had to be correct. prequeue might have dropped our dst.
  611          */
  612         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
  613                 return;
  614
  615         /* Swap the send and the receive. */
  616         memset(&rep, 0, sizeof(rep));
  617         rep.th.dest   = th->source;
  618         rep.th.source = th->dest;
  619         rep.th.doff   = sizeof(struct tcphdr) / 4;
  620         rep.th.rst    = 1;
  621
              /* If the segment carried an ACK, the RST takes its sequence
               * number from that ack_seq.  Otherwise the RST acknowledges
               * everything the segment occupied: SYN and FIN count one each,
               * plus the payload length (skb->len minus the TCP header).
               */
  622         if (th->ack) {
  623                 rep.th.seq = th->ack_seq;
  624         } else {
  625                 rep.th.ack = 1;
  626                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
  627                                        skb->len - (th->doff << 2));
  628         }
  629
  630         memset(&arg, 0, sizeof(arg));
  631         arg.iov[0].iov_base = (unsigned char *)&rep;
  632         arg.iov[0].iov_len  = sizeof(rep.th);
  633
              /* Without a socket, take the namespace from the device of the
               * skb's dst.
               */
  634         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
  635 #ifdef CONFIG_TCP_MD5SIG
  636         hash_location = tcp_parse_md5sig_option(th);
  637         if (!sk && hash_location) {
  638                 /*
  639                  * active side is lost. Try to find listening socket through
  640                  * source port, and then find md5 key through listening socket.
  641                  * we are not loose security here:
  642                  * Incoming packet is checked with md5 hash with finding key,
  643                  * no RST generated if md5 hash doesn't match.
  644                  */
                      /* NOTE(review): both port arguments below are derived
                       * from th->source, which matches the "through source
                       * port" wording above - confirm this is intended.
                       */
  645                 sk1 = __inet_lookup_listener(net,
  646                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
  647                                              th->source, ip_hdr(skb)->daddr,
  648                                              ntohs(th->source), inet_iif(skb));
  649                 /* don't send rst if it can't find key */
  650                 if (!sk1)
  651                         return;
                      /* Taken only on this path; released at release_sk1. */
  652                 rcu_read_lock();
  653                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
  654                                         &ip_hdr(skb)->saddr, AF_INET);
  655                 if (!key)
  656                         goto release_sk1;
  657
                      /* Recompute the hash of the incoming segment and compare
                       * it with the one it carries.
                       */
  658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
  659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
  660                         goto release_sk1;
  661         } else {
  662                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
  663                                              &ip_hdr(skb)->saddr,
  664                                              AF_INET) : NULL;
  665         }
  666
              /* Sign the reply: two NOPs, then the MD5 option header in opt[0]
               * and the hash starting at opt[1].
               */
  667         if (key) {
  668                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
  669                                    (TCPOPT_NOP << 16) |
  670                                    (TCPOPT_MD5SIG << 8) |
  671                                    TCPOLEN_MD5SIG);
  672                 /* Update length and the length the header thinks exists */
  673                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
  674                 rep.th.doff = arg.iov[0].iov_len / 4;
  675
  676                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
  677                                      key, ip_hdr(skb)->saddr,
  678                                      ip_hdr(skb)->daddr, &rep.th);
  679         }
  680 #endif
              /* Seed the checksum with the pseudo-header of the reply, i.e. with
               * the addresses of @skb swapped.
               */
  681         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  682                                       ip_hdr(skb)->saddr, /* XXX */
  683                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
  684         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
  685         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
  686         /* When socket is gone, all binding information is lost.
  687          * routing might fail in this case. No choice here, if we choose to force
  688          * input interface, we will misroute in case of asymmetric route.
  689          */
  690         if (sk)
  691                 arg.bound_dev_if = sk->sk_bound_dev_if;
  692
  693         arg.tos = ip_hdr(skb)->tos;
              /* The reply is sent from the namespace's per-cpu tcp_sk socket. */
  694         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  695                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
  696                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  697                               &arg, arg.iov[0].iov_len);
  698
  699         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
  700         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
  701
  702 #ifdef CONFIG_TCP_MD5SIG
              /* sk1 is only non-NULL on the listener path above, which is also
               * the only path that took rcu_read_lock().
               */
  703 release_sk1:
  704         if (sk1) {
  705                 rcu_read_unlock();
  706                 sock_put(sk1);
  707         }
  708 #endif
  709 }
 710
  711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  712    outside of socket context, is certainly ugly. What can I do?
  713  */
  714
      /* tcp_v4_send_ack - build and send a bare ACK in reply to @skb
       * @net:         namespace whose per-cpu tcp_sk socket sends the reply
       * @skb:         segment being answered; its ports and addresses are swapped
       * @seq:         sequence number of the reply, host order
       * @ack:         acknowledgment number of the reply, host order
       * @win:         window value, written with htons()
       * @tsval:       timestamp value, used only when @tsecr is non-zero
       * @tsecr:       timestamp echo; the timestamp option is added only when
       *               this is non-zero
       * @oif:         if non-zero, used as bound_dev_if of the reply
       * @key:         if non-NULL (and CONFIG_TCP_MD5SIG is set), an MD5
       *               signature option is appended
       * @reply_flags: copied to the flags of the ip_reply_arg
       * @tos:         copied to the tos of the ip_reply_arg
       */
  715 static void tcp_v4_send_ack(struct net *net,
  716                             struct sk_buff *skb, u32 seq, u32 ack,
  717                             u32 win, u32 tsval, u32 tsecr, int oif,
  718                             struct tcp_md5sig_key *key,
  719                             int reply_flags, u8 tos)
  720 {
  721         const struct tcphdr *th = tcp_hdr(skb);
              /* Reply buffer: TCP header plus room for a timestamp option and,
               * with CONFIG_TCP_MD5SIG, an MD5 option.
               */
  722         struct {
  723                 struct tcphdr th;
  724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
  725 #ifdef CONFIG_TCP_MD5SIG
  726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
  727 #endif
  728                         ];
  729         } rep;
  730         struct ip_reply_arg arg;
  731
              /* Only the fixed header is cleared.  Option words are written
               * below only when they are used, and iov_len grows only by the
               * options actually written.
               */
  732         memset(&rep.th, 0, sizeof(struct tcphdr));
  733         memset(&arg, 0, sizeof(arg));
  734
  735         arg.iov[0].iov_base = (unsigned char *)&rep;
  736         arg.iov[0].iov_len  = sizeof(rep.th);
              /* Timestamp option: NOP, NOP, kind/length, then tsval and tsecr. */
  737         if (tsecr) {
  738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
  739                                    (TCPOPT_TIMESTAMP << 8) |
  740                                    TCPOLEN_TIMESTAMP);
  741                 rep.opt[1] = htonl(tsval);
  742                 rep.opt[2] = htonl(tsecr);
  743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
  744         }
  745
  746         /* Swap the send and the receive. */
  747         rep.th.dest    = th->source;
  748         rep.th.source  = th->dest;
  749         rep.th.doff    = arg.iov[0].iov_len / 4;
  750         rep.th.seq     = htonl(seq);
  751         rep.th.ack_seq = htonl(ack);
  752         rep.th.ack     = 1;
  753         rep.th.window  = htons(win);
  754
  755 #ifdef CONFIG_TCP_MD5SIG
  756         if (key) {
                      /* The MD5 option goes after the three timestamp words
                       * when they are present, otherwise at opt[0].
                       */
  757                 int offset = (tsecr) ? 3 : 0;
  758
  759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
  760                                           (TCPOPT_NOP << 16) |
  761                                           (TCPOPT_MD5SIG << 8) |
  762                                           TCPOLEN_MD5SIG);
  763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
  764                 rep.th.doff = arg.iov[0].iov_len/4;
  765
                      /* The hash is written right after the option header. */
  766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
  767                                     key, ip_hdr(skb)->saddr,
  768                                     ip_hdr(skb)->daddr, &rep.th);
  769         }
  770 #endif
  771         arg.flags = reply_flags;
              /* Seed the checksum with the pseudo-header of the reply, i.e. with
               * the addresses of @skb swapped.
               */
  772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  773                                       ip_hdr(skb)->saddr, /* XXX */
  774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
  775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
  776         if (oif)
  777                 arg.bound_dev_if = oif;
  778         arg.tos = tos;
  779         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  780                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
  781                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  782                               &arg, arg.iov[0].iov_len);
  783
  784         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
  785 }
  786
      /*
       * Send an ACK in reply to @skb on behalf of the timewait socket @sk.
       *
       * Sequence numbers, the (scaled-down) receive window, the timestamp
       * values, the bound device, the MD5 key and the TOS all come from the
       * timewait sock.  inet_twsk_put() is called on the timewait sock
       * before returning.
       */
  787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
  788 {
  789         struct inet_timewait_sock *tw = inet_twsk(sk);
  790         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
  791
  792         tcp_v4_send_ack(sock_net(sk), skb,
  793                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
  794                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
  795                         tcp_time_stamp + tcptw->tw_ts_offset,
  796                         tcptw->tw_ts_recent,
  797                         tw->tw_bound_dev_if,
  798                         tcp_twsk_md5_key(tcptw),
  799                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
  800                         tw->tw_tos
  801                         );
  802
  803         inet_twsk_put(tw);
  804 }
  805
      /*
       * Send an ACK in reply to @skb on behalf of the request sock @req.
       *
       * The sequence number depends on the state of @sk (see below).  The
       * MD5 key, if any, is looked up on @sk using the source address of
       * @skb.  No output interface is forced (oif is passed as 0).
       */
  806 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
  807                                   struct request_sock *req)
  808 {
  809         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
  810          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
  811          */
  812         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
  813                                              tcp_sk(sk)->snd_nxt;
  814
  815         /* RFC 7323 2.3
  816          * The window field (SEG.WND) of every outgoing segment, with the
  817          * exception of <SYN> segments, MUST be right-shifted by
  818          * Rcv.Wind.Shift bits:
  819          */
  820         tcp_v4_send_ack(sock_net(sk), skb, seq,
  821                         tcp_rsk(req)->rcv_nxt,
  822                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
  823                         tcp_time_stamp,
  824                         req->ts_recent,
  825                         0,
  826                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
  827                                           AF_INET),
  828                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
  829                         ip_hdr(skb)->tos);
  830 }
 831
  832 /*
  833  *      Send a SYN-ACK after having received a SYN.
  834  *      This still operates on a request_sock only, not on a big
  835  *      socket.
       *
       *      If @dst is NULL, a route is looked up with inet_csk_route_req().
       *      Returns -1 when no route or no SYN-ACK skb could be obtained;
       *      otherwise the result of ip_build_and_send_pkt() filtered
       *      through net_xmit_eval().  @fl is not used here; a local flowi4
       *      is used for the route lookup instead.
  836  */
  837 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
  838                               struct flowi *fl,
  839                               struct request_sock *req,
  840                               struct tcp_fastopen_cookie *foc,
  841                                   bool attach_req)
  842 {
  843         const struct inet_request_sock *ireq = inet_rsk(req);
  844         struct flowi4 fl4;
  845         int err = -1;
  846         struct sk_buff *skb;
  847
  848         /* First, grab a route. */
  849         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
  850                 return -1;
  851
  852         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
  853
              /* err keeps its initial -1 if no skb could be built. */
  854         if (skb) {
  855                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
  856
  857                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
  858                                             ireq->ir_rmt_addr,
  859                                             ireq_opt_deref(ireq));
  860                 err = net_xmit_eval(err);
  861         }
  862
  863         return err;
  864 }
 865
  866 /*
  867  *      IPv4 request_sock destructor.
       *
       *      Frees the IP options block referenced by the request's
       *      ireq_opt pointer.  The RCU-protected dereference is done with
       *      a constant-true condition, i.e. no lock is asserted here.
  868  */
  869 static void tcp_v4_reqsk_destructor(struct request_sock *req)
  870 {
  871         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
  872 }
 873
 874
 875 #ifdef CONFIG_TCP_MD5SIG
 876 /*
 877  * RFC2385 MD5 checksumming requires a mapping of
 878  * IP address->MD5 Key.
 879  * We need to maintain these in the sk structure.
 880  */
 881
  882 /* Find the key structure for an address.
       *
       * Walks the MD5 key list of @sk and returns the first key whose family
       * equals @family and whose address matches @addr.  The comparison
       * covers sizeof(struct in_addr) bytes, or sizeof(struct in6_addr) for
       * AF_INET6 when IPv6 support is enabled.
       *
       * Returns NULL if the socket has no md5sig_info or no key matches.
       */
  883 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
  884                                          const union tcp_md5_addr *addr,
  885                                          int family)
  886 {
  887         const struct tcp_sock *tp = tcp_sk(sk);
  888         struct tcp_md5sig_key *key;
  889         unsigned int size = sizeof(struct in_addr);
  890         const struct tcp_md5sig_info *md5sig;
  891
  892         /* caller either holds rcu_read_lock() or socket lock */
  893         md5sig = rcu_dereference_check(tp->md5sig_info,
  894                                        sock_owned_by_user(sk) ||
  895                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
  896         if (!md5sig)
  897                 return NULL;
  898 #if IS_ENABLED(CONFIG_IPV6)
  899         if (family == AF_INET6)
  900                 size = sizeof(struct in6_addr);
  901 #endif
  902         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
  903                 if (key->family != family)
  904                         continue;
  905                 if (!memcmp(&key->addr, addr, size))
  906                         return key;
  907         }
  908         return NULL;
  909 }
  910 EXPORT_SYMBOL(tcp_md5_do_lookup);
  911
      /*
       * Look up the MD5 key configured on @sk for the IPv4 address stored
       * in @addr_sk->sk_daddr.  Returns whatever tcp_md5_do_lookup() returns
       * for AF_INET (the key, or NULL).
       */
  912 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
  913                                          const struct sock *addr_sk)
  914 {
  915         const union tcp_md5_addr *addr;
  916
  917         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
  918         return tcp_md5_do_lookup(sk, addr, AF_INET);
  919 }
  920 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 921
  922 /* This can be called on a newly created socket, from other files.
       *
       * Add an MD5 key for @addr/@family to @sk, or overwrite the existing
       * one in place if a key for that address is already present.
       *
       * On first use the per-socket md5sig_info is allocated and the GSO
       * capabilities are removed from the socket.  New keys are allocated
       * with sock_kmalloc() and inserted at the head of the list.
       *
       * @newkeylen is not range-checked here; tcp_v4_parse_md5_keys()
       * bounds it by TCP_MD5SIG_MAXKEYLEN before calling.
       *
       * Returns 0 on success or -ENOMEM if any allocation fails.
       */
  923 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
  924                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
  925 {
  926         /* Add Key to the list */
  927         struct tcp_md5sig_key *key;
  928         struct tcp_sock *tp = tcp_sk(sk);
  929         struct tcp_md5sig_info *md5sig;
  930
  931         key = tcp_md5_do_lookup(sk, addr, family);
  932         if (key) {
  933                 /* Pre-existing entry - just update that one. */
  934                 memcpy(key->key, newkey, newkeylen);
  935                 key->keylen = newkeylen;
  936                 return 0;
  937         }
  938
  939         md5sig = rcu_dereference_protected(tp->md5sig_info,
  940                                            sock_owned_by_user(sk) ||
  941                                            lockdep_is_held(&sk->sk_lock.slock));
  942         if (!md5sig) {
  943                 md5sig = kmalloc(sizeof(*md5sig), gfp);
  944                 if (!md5sig)
  945                         return -ENOMEM;
  946
  947                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
  948                 INIT_HLIST_HEAD(&md5sig->head);
  949                 rcu_assign_pointer(tp->md5sig_info, md5sig);
  950         }
  951
              /* On failure below, an md5sig_info allocated above stays attached. */
  952         key = sock_kmalloc(sk, sizeof(*key), gfp);
  953         if (!key)
  954                 return -ENOMEM;
  955         if (!tcp_alloc_md5sig_pool()) {
  956                 sock_kfree_s(sk, key, sizeof(*key));
  957                 return -ENOMEM;
  958         }
  959
              /* Fill in the key completely before linking it into the list. */
  960         memcpy(key->key, newkey, newkeylen);
  961         key->keylen = newkeylen;
  962         key->family = family;
  963         memcpy(&key->addr, addr,
  964                (family == AF_INET6) ? sizeof(struct in6_addr) :
  965                                       sizeof(struct in_addr));
  966         hlist_add_head_rcu(&key->node, &md5sig->head);
  967         return 0;
  968 }
  969 EXPORT_SYMBOL(tcp_md5_do_add);
  970
      /*
       * Remove the MD5 key for @addr/@family from @sk.
       *
       * The key is unlinked from the list, its size is subtracted from
       * sk->sk_omem_alloc and it is freed via kfree_rcu().
       *
       * Returns 0 on success or -ENOENT if no such key exists.
       */
  971 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
  972 {
  973         struct tcp_md5sig_key *key;
  974
  975         key = tcp_md5_do_lookup(sk, addr, family);
  976         if (!key)
  977                 return -ENOENT;
  978         hlist_del_rcu(&key->node);
  979         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
  980         kfree_rcu(key, rcu);
  981         return 0;
  982 }
  983 EXPORT_SYMBOL(tcp_md5_do_del);
  984
      /*
       * Unlink and free every MD5 key on the socket's key list, adjusting
       * sk->sk_omem_alloc for each one.
       *
       * tp->md5sig_info is dereferenced without a NULL check, so the caller
       * must only call this when it is set.  The md5sig_info structure
       * itself is not freed here.
       */
  985 static void tcp_clear_md5_list(struct sock *sk)
  986 {
  987         struct tcp_sock *tp = tcp_sk(sk);
  988         struct tcp_md5sig_key *key;
  989         struct hlist_node *n;
  990         struct tcp_md5sig_info *md5sig;
  991
  992         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
  993
              /* _safe iteration: the current entry is removed inside the loop. */
  994         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
  995                 hlist_del_rcu(&key->node);
  996                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
  997                 kfree_rcu(key, rcu);
  998         }
  999 }
 1000
      /*
       * Parse a struct tcp_md5sig supplied by user space in @optval and
       * apply it to @sk.
       *
       * A zero tcpm_keylen deletes the key for the given address; any other
       * length up to TCP_MD5SIG_MAXKEYLEN adds or updates it.
       *
       * Returns -EINVAL if @optlen is smaller than the structure, the
       * address family is not AF_INET or the key is too long; -EFAULT if
       * the copy from user space fails; otherwise the result of
       * tcp_md5_do_del() or tcp_md5_do_add().
       */
 1001 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 1002                                  int optlen)
 1003 {
 1004         struct tcp_md5sig cmd;
 1005         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 1006
 1007         if (optlen < sizeof(cmd))
 1008                 return -EINVAL;
 1009
 1010         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 1011                 return -EFAULT;
 1012
 1013         if (sin->sin_family != AF_INET)
 1014                 return -EINVAL;
 1015
              /* A zero-length key is a request to delete. */
 1016         if (!cmd.tcpm_keylen)
 1017                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
 1018                                       AF_INET);
 1019
 1020         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 1021                 return -EINVAL;
 1022
 1023         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
 1024                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
 1025                               GFP_KERNEL);
 1026 }
 1027
      /*
       * Fill the IPv4 pseudo-header block of the MD5 pool @hp with @saddr,
       * @daddr, IPPROTO_TCP and @nbytes (as the segment length) and feed it
       * to the hash.  Returns the result of crypto_hash_update().
       *
       * Note the parameter order: @daddr comes before @saddr.
       */
 1028 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
 1029                                         __be32 daddr, __be32 saddr, int nbytes)
 1030 {
 1031         struct tcp4_pseudohdr *bp;
 1032         struct scatterlist sg;
 1033
 1034         bp = &hp->md5_blk.ip4;
 1035
 1036         /*
 1037          * 1. the TCP pseudo-header (in the order: source IP address,
 1038          * destination IP address, zero-padded protocol number, and
 1039          * segment length)
 1040          */
 1041         bp->saddr = saddr;
 1042         bp->daddr = daddr;
 1043         bp->pad = 0;
 1044         bp->protocol = IPPROTO_TCP;
 1045         bp->len = cpu_to_be16(nbytes);
 1046
 1047         sg_init_one(&sg, bp, sizeof(*bp));
 1048         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
 1049 }
 1050
      /*
       * Compute the MD5 signature of a header-only segment into @md5_hash.
       *
       * The hash covers the pseudo-header (with the TCP header length,
       * th->doff << 2, as the segment length), the TCP header @th and the
       * key; no payload is hashed.
       *
       * Returns 0 on success.  On any failure the first 16 bytes of
       * @md5_hash are zeroed and 1 is returned.
       */
 1051 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 1052                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
 1053 {
 1054         struct tcp_md5sig_pool *hp;
 1055         struct hash_desc *desc;
 1056
 1057         hp = tcp_get_md5sig_pool();
 1058         if (!hp)
 1059                 goto clear_hash_noput;
 1060         desc = &hp->md5_desc;
 1061
 1062         if (crypto_hash_init(desc))
 1063                 goto clear_hash;
 1064         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
 1065                 goto clear_hash;
 1066         if (tcp_md5_hash_header(hp, th))
 1067                 goto clear_hash;
 1068         if (tcp_md5_hash_key(hp, key))
 1069                 goto clear_hash;
 1070         if (crypto_hash_final(desc, md5_hash))
 1071                 goto clear_hash;
 1072
 1073         tcp_put_md5sig_pool();
 1074         return 0;
 1075
      /* Failure after the pool was obtained: release it, then zero the hash. */
 1076 clear_hash:
 1077         tcp_put_md5sig_pool();
      /* Failure before the pool was obtained: nothing to release. */
 1078 clear_hash_noput:
 1079         memset(md5_hash, 0, 16);
 1080         return 1;
 1081 }
 1082
      /*
       * Compute the MD5 signature of the segment in @skb into @md5_hash.
       *
       * The addresses for the pseudo-header come from @sk when it is
       * non-NULL, otherwise from the IP header of @skb.  The hash covers
       * the pseudo-header (segment length skb->len), the TCP header, the
       * skb data past the TCP header, and the key.
       *
       * Returns 0 on success.  On any failure the first 16 bytes of
       * @md5_hash are zeroed and 1 is returned.
       */
 1083 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
 1084                         const struct sock *sk,
 1085                         const struct sk_buff *skb)
 1086 {
 1087         struct tcp_md5sig_pool *hp;
 1088         struct hash_desc *desc;
 1089         const struct tcphdr *th = tcp_hdr(skb);
 1090         __be32 saddr, daddr;
 1091
 1092         if (sk) { /* valid for establish/request sockets */
 1093                 saddr = sk->sk_rcv_saddr;
 1094                 daddr = sk->sk_daddr;
 1095         } else {
 1096                 const struct iphdr *iph = ip_hdr(skb);
 1097                 saddr = iph->saddr;
 1098                 daddr = iph->daddr;
 1099         }
 1100
 1101         hp = tcp_get_md5sig_pool();
 1102         if (!hp)
 1103                 goto clear_hash_noput;
 1104         desc = &hp->md5_desc;
 1105
 1106         if (crypto_hash_init(desc))
 1107                 goto clear_hash;
 1108
 1109         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
 1110                 goto clear_hash;
 1111         if (tcp_md5_hash_header(hp, th))
 1112                 goto clear_hash;
 1113         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
 1114                 goto clear_hash;
 1115         if (tcp_md5_hash_key(hp, key))
 1116                 goto clear_hash;
 1117         if (crypto_hash_final(desc, md5_hash))
 1118                 goto clear_hash;
 1119
 1120         tcp_put_md5sig_pool();
 1121         return 0;
 1122
      /* Failure after the pool was obtained: release it, then zero the hash. */
 1123 clear_hash:
 1124         tcp_put_md5sig_pool();
      /* Failure before the pool was obtained: nothing to release. */
 1125 clear_hash_noput:
 1126         memset(md5_hash, 0, 16);
 1127         return 1;
 1128 }
 1129 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1130
1131 #endif
1132
 1133 /* Called with rcu_read_lock().
       *
       * Check the MD5 signature option of the inbound segment @skb against
       * the key configured on @sk for the segment's source address.
       *
       * Returns true if the segment must be dropped, false if it is
       * acceptable.  Without CONFIG_TCP_MD5SIG this always returns false.
       */
 1134 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
 1135                                     const struct sk_buff *skb)
 1136 {
 1137 #ifdef CONFIG_TCP_MD5SIG
 1138         /*
 1139          * This gets called for each TCP segment that arrives
 1140          * so we want to be efficient.
 1141          * We have 3 drop cases:
 1142          * o No MD5 hash and one expected.
 1143          * o MD5 hash and we're not expecting one.
 1144          * o MD5 hash and it's wrong.
 1145          */
 1146         const __u8 *hash_location = NULL;
 1147         struct tcp_md5sig_key *hash_expected;
 1148         const struct iphdr *iph = ip_hdr(skb);
 1149         const struct tcphdr *th = tcp_hdr(skb);
 1150         int genhash;
 1151         unsigned char newhash[16];
 1152
 1153         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
 1154                                           AF_INET);
 1155         hash_location = tcp_parse_md5sig_option(th);
 1156
 1157         /* We've parsed the options - do we have a hash? */
 1158         if (!hash_expected && !hash_location)
 1159                 return false;
 1160
 1161         if (hash_expected && !hash_location) {
 1162                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
 1163                 return true;
 1164         }
 1165
 1166         if (!hash_expected && hash_location) {
 1167                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
 1168                 return true;
 1169         }
 1170
 1171         /* Okay, so this is hash_expected and hash_location -
 1172          * so we need to calculate the checksum.
 1173          */
 1174         genhash = tcp_v4_md5_hash_skb(newhash,
 1175                                       hash_expected,
 1176                                       NULL, skb);
 1177
              /* Drop if hashing failed or the computed hash differs. */
 1178         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 1179                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
 1180                                      &iph->saddr, ntohs(th->source),
 1181                                      &iph->daddr, ntohs(th->dest),
 1182                                      genhash ? " tcp_v4_calc_md5_hash failed"
 1183                                      : "");
 1184                 return true;
 1185         }
 1186         return false;
 1187 #endif
 1188         return false;
 1189 }
 1190
      /*
       * Initialise the IPv4-specific parts of the request sock @req from the
       * incoming @skb: the local address is the packet's destination, the
       * remote address is the packet's source, no_srccheck is inherited from
       * the listener's transparent flag, and the IP options returned by
       * tcp_v4_save_options() are stored in ireq_opt.
       */
 1191 static void tcp_v4_init_req(struct request_sock *req,
 1192                             const struct sock *sk_listener,
 1193                             struct sk_buff *skb)
 1194 {
 1195         struct inet_request_sock *ireq = inet_rsk(req);
 1196
 1197         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
 1198         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
 1199         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
 1200         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
 1201 }
 1202
      /*
       * Look up a route for the request sock @req, filling in the IPv4 flow
       * in @fl.  If @strict is non-NULL it is set to whether the flow's
       * destination address equals the request's remote address.
       *
       * Returns the dst from inet_csk_route_req(), unchanged.
       */
 1203 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
 1204                                           struct flowi *fl,
 1205                                           const struct request_sock *req,
 1206                                           bool *strict)
 1207 {
 1208         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
 1209
 1210         if (strict) {
 1211                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
 1212                         *strict = true;
 1213                 else
 1214                         *strict = false;
 1215         }
 1216
 1217         return dst;
 1218 }
 1219
      /*
       * Request sock operations for TCP over IPv4: object size plus the
       * callbacks for SYN-ACK retransmit, ACK, reset, destruction and
       * SYN-ACK timeout.
       */
 1220 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 1221         .family         =       PF_INET,
 1222         .obj_size       =       sizeof(struct tcp_request_sock),
 1223         .rtx_syn_ack    =       tcp_rtx_synack,
 1224         .send_ack       =       tcp_v4_reqsk_send_ack,
 1225         .destructor     =       tcp_v4_reqsk_destructor,
 1226         .send_reset     =       tcp_v4_send_reset,
 1227         .syn_ack_timeout =      tcp_syn_ack_timeout,
 1228 };
 1229
      /*
       * IPv4-specific TCP request sock callbacks, passed to
       * tcp_conn_request() by tcp_v4_conn_request().  The MD5 and SYN-cookie
       * hooks are present only when the matching config option is enabled.
       */
 1230 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 1231         .mss_clamp      =       TCP_MSS_DEFAULT,
 1232 #ifdef CONFIG_TCP_MD5SIG
 1233         .req_md5_lookup =       tcp_v4_md5_lookup,
 1234         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
 1235 #endif
 1236         .init_req       =       tcp_v4_init_req,
 1237 #ifdef CONFIG_SYN_COOKIES
 1238         .cookie_init_seq =      cookie_v4_init_sequence,
 1239 #endif
 1240         .route_req      =       tcp_v4_route_req,
 1241         .init_seq       =       tcp_v4_init_sequence,
 1242         .send_synack    =       tcp_v4_send_synack,
 1243 };
 1244
      /*
       * Handle an incoming connection request @skb on the socket @sk.
       *
       * Packets routed as broadcast or multicast are counted as
       * LINUX_MIB_LISTENDROPS and 0 is returned; the skb is not freed here.
       * Everything else is handed to tcp_conn_request() with the IPv4 ops
       * tables, and its return value is passed through.
       */
 1245 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 1246 {
 1247         /* Never answer to SYNs sent to broadcast or multicast */
 1248         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
 1249                 goto drop;
 1250
 1251         return tcp_conn_request(&tcp_request_sock_ops,
 1252                                 &tcp_request_sock_ipv4_ops, sk, skb);
 1253
 1254 drop:
 1255         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 1256         return 0;
 1257 }
 1258 EXPORT_SYMBOL(tcp_v4_conn_request);
1259
1260
 1261 /*
 1262  * The three way handshake has completed - we got a valid synack -
 1263  * now create the new socket.
       *
       * Creates the child socket for @req, copies the IPv4 addressing, IP
       * options and (with CONFIG_TCP_MD5SIG) the listener's MD5 key for the
       * peer onto it, sets up routing/MSS, and inserts it into the
       * established hash in place of @req_unhash.
       *
       * @dst may be NULL, in which case a route is looked up here.
       * *@own_req is set to the result of inet_ehash_nolisten().
       *
       * Returns the new socket, or NULL on failure.  Every failure path
       * bumps LINUX_MIB_LISTENDROPS; a full accept queue additionally bumps
       * LINUX_MIB_LISTENOVERFLOWS.  @dst is released only on the two
       * failure paths taken before the child exists; failures after that
       * force-close the child via tcp_done().
 1264  */
 1265 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 1266                                   struct request_sock *req,
 1267                                   struct dst_entry *dst,
 1268                                   struct request_sock *req_unhash,
 1269                                   bool *own_req)
 1270 {
 1271         struct inet_request_sock *ireq;
 1272         struct inet_sock *newinet;
 1273         struct tcp_sock *newtp;
 1274         struct sock *newsk;
 1275 #ifdef CONFIG_TCP_MD5SIG
 1276         struct tcp_md5sig_key *key;
 1277 #endif
 1278         struct ip_options_rcu *inet_opt;
 1279
 1280         if (sk_acceptq_is_full(sk))
 1281                 goto exit_overflow;
 1282
 1283         newsk = tcp_create_openreq_child(sk, req, skb);
 1284         if (!newsk)
 1285                 goto exit_nonewsk;
 1286
 1287         newsk->sk_gso_type = SKB_GSO_TCPV4;
 1288         inet_sk_rx_dst_set(newsk, skb);
 1289
 1290         newtp                 = tcp_sk(newsk);
 1291         newinet               = inet_sk(newsk);
 1292         ireq                  = inet_rsk(req);
 1293         sk_daddr_set(newsk, ireq->ir_rmt_addr);
 1294         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
 1295         newinet->inet_saddr   = ireq->ir_loc_addr;
              /* The child shares the request's options pointer until own_req is known. */
 1296         inet_opt              = rcu_dereference(ireq->ireq_opt);
 1297         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
 1298         newinet->mc_index     = inet_iif(skb);
 1299         newinet->mc_ttl       = ip_hdr(skb)->ttl;
 1300         newinet->rcv_tos      = ip_hdr(skb)->tos;
 1301         inet_csk(newsk)->icsk_ext_hdr_len = 0;
 1302         if (inet_opt)
 1303                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 1304         newinet->inet_id = newtp->write_seq ^ jiffies;
 1305
 1306         if (!dst) {
 1307                 dst = inet_csk_route_child_sock(sk, newsk, req);
 1308                 if (!dst)
 1309                         goto put_and_exit;
 1310         } else {
 1311                 /* syncookie case : see end of cookie_v4_check() */
 1312         }
 1313         sk_setup_caps(newsk, dst);
 1314
 1315         tcp_ca_openreq_child(newsk, dst);
 1316
 1317         tcp_sync_mss(newsk, dst_mtu(dst));
 1318         newtp->advmss = dst_metric_advmss(dst);
              /* A smaller user_mss set on the listener caps the advertised MSS. */
 1319         if (tcp_sk(sk)->rx_opt.user_mss &&
 1320             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
 1321                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 1322
 1323         tcp_initialize_rcv_mss(newsk);
 1324
 1325 #ifdef CONFIG_TCP_MD5SIG
 1326         /* Copy over the MD5 key from the original socket */
 1327         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
 1328                                 AF_INET);
 1329         if (key) {
 1330                 /*
 1331                  * We're using one, so create a matching key
 1332                  * on the newsk structure. If we fail to get
 1333                  * memory, then we end up not copying the key
 1334                  * across. Shucks.
 1335                  */
 1336                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
 1337                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
 1338                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
 1339         }
 1340 #endif
 1341
 1342         if (__inet_inherit_port(sk, newsk) < 0)
 1343                 goto put_and_exit;
 1344         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
              /*
               * Exactly one side keeps the options pointer: the child when
               * own_req is true, the request otherwise.
               */
 1345         if (likely(*own_req)) {
 1346                 tcp_move_syn(newtp, req);
 1347                 ireq->ireq_opt = NULL;
 1348         } else {
 1349                 newinet->inet_opt = NULL;
 1350         }
 1351         return newsk;
 1352
 1353 exit_overflow:
 1354         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
 1355 exit_nonewsk:
 1356         dst_release(dst);
 1357 exit:
 1358         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 1359         return NULL;
 1360 put_and_exit:
              /* Detach the shared options pointer from the child before closing it. */
 1361         newinet->inet_opt = NULL;
 1362         inet_csk_prepare_forced_close(newsk);
 1363         tcp_done(newsk);
 1364         goto exit;
 1365 }
 1366 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
 1367
      /*
       * With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
       * cookie_v4_check() and its result is returned.  Otherwise (SYN set,
       * or SYN cookies not configured) @sk is returned unchanged.
       */
 1368 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
 1369 {
 1370 #ifdef CONFIG_SYN_COOKIES
 1371         const struct tcphdr *th = tcp_hdr(skb);
 1372
 1373         if (!th->syn)
 1374                 sk = cookie_v4_check(sk, skb);
 1375 #endif
 1376         return sk;
 1377 }
1378
 1379 /* The socket must have its spinlock held when we get
 1380  * here, unless it is a TCP_LISTEN socket.
 1381  *
 1382  * We have a potential double-lock case here, so even when
 1383  * doing backlog processing we use the BH locking scheme.
 1384  * This is because we cannot sleep with the original spinlock
 1385  * held.
       *
       * Established sockets take the fast path straight into
       * tcp_rcv_established().  For all other states the checksum is
       * verified first; listeners may hand the segment to a child socket
       * returned by the cookie check, everything else goes through
       * tcp_rcv_state_process().  A non-zero result from the state
       * processing sends a reset and frees the skb.
       *
       * Always returns 0.
 1386  */
 1387 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 1388 {
 1389         struct sock *rsk;
 1390
 1391         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
 1392                 struct dst_entry *dst = sk->sk_rx_dst;
 1393
 1394                 sock_rps_save_rxhash(sk, skb);
 1395                 sk_mark_napi_id(sk, skb);
                      /* Drop the cached rx dst if the interface changed or it went stale. */
 1396                 if (dst) {
 1397                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 1398                             !dst->ops->check(dst, 0)) {
 1399                                 dst_release(dst);
 1400                                 sk->sk_rx_dst = NULL;
 1401                         }
 1402                 }
 1403                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
 1404                 return 0;
 1405         }
 1406
 1407         if (tcp_checksum_complete(skb))
 1408                 goto csum_err;
 1409
 1410         if (sk->sk_state == TCP_LISTEN) {
 1411                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
 1412
 1413                 if (!nsk)
 1414                         goto discard;
                      /* A different socket came back: process the segment on it. */
 1415                 if (nsk != sk) {
 1416                         sock_rps_save_rxhash(nsk, skb);
 1417                         sk_mark_napi_id(nsk, skb);
 1418                         if (tcp_child_process(sk, nsk, skb)) {
 1419                                 rsk = nsk;
 1420                                 goto reset;
 1421                         }
 1422                         return 0;
 1423                 }
 1424         } else
 1425                 sock_rps_save_rxhash(sk, skb);
 1426
 1427         if (tcp_rcv_state_process(sk, skb)) {
 1428                 rsk = sk;
 1429                 goto reset;
 1430         }
 1431         return 0;
 1432
 1433 reset:
 1434         tcp_v4_send_reset(rsk, skb);
 1435 discard:
 1436         kfree_skb(skb);
 1437         /* Be careful here. If this function gets more complicated and
 1438          * gcc suffers from register pressure on the x86, sk (in %ebx)
 1439          * might be destroyed here. This current version compiles correctly,
 1440          * but you have been warned.
 1441          */
 1442         return 0;
 1443
 1444 csum_err:
 1445         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
 1446         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 1447         goto discard;
 1448 }
 1449 EXPORT_SYMBOL(tcp_v4_do_rcv);
 1450
      /*
       * Early socket lookup for an incoming TCP packet.
       *
       * Only PACKET_HOST packets with a pullable TCP header and a sane doff
       * are considered.  If an established-hash lookup finds a socket, it
       * is attached to the skb with sock_edemux as destructor.  For full
       * sockets whose cached rx dst passes dst_check() and whose recorded
       * ifindex matches the incoming interface, that dst is set on the skb
       * via skb_dst_set_noref().
       */
 1451 void tcp_v4_early_demux(struct sk_buff *skb)
 1452 {
 1453         const struct iphdr *iph;
 1454         const struct tcphdr *th;
 1455         struct sock *sk;
 1456
 1457         if (skb->pkt_type != PACKET_HOST)
 1458                 return;
 1459
 1460         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
 1461                 return;
 1462
 1463         iph = ip_hdr(skb);
 1464         th = tcp_hdr(skb);
 1465
              /* doff counts 32-bit words; reject headers shorter than the minimum. */
 1466         if (th->doff < sizeof(struct tcphdr) / 4)
 1467                 return;
 1468
 1469         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
 1470                                        iph->saddr, th->source,
 1471                                        iph->daddr, ntohs(th->dest),
 1472                                        skb->skb_iif);
 1473         if (sk) {
 1474                 skb->sk = sk;
 1475                 skb->destructor = sock_edemux;
 1476                 if (sk_fullsock(sk)) {
 1477                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
 1478
 1479                         if (dst)
 1480                                 dst = dst_check(dst, 0);
 1481                         if (dst &&
 1482                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
 1483                                 skb_dst_set_noref(skb, dst);
 1484                 }
 1485         }
 1486 }
1487
 1488 /* Packet is added to VJ-style prequeue for processing in process
 1489  * context, if a reader task is waiting. Apparently, this exciting
 1490  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 1491  * failed somewhere. Latency? Burstiness? Well, at least now we will
 1492  * see, why it failed. 8)8)                               --ANK
 1493  *
       * Returns false, leaving @skb to the caller, when sysctl_tcp_low_latency
       * is set, when no task is recorded in tp->ucopy.task, or when the
       * segment has no data beyond its TCP header and the prequeue is empty.
       * Returns true once @skb has been placed on the prequeue.
       *
       * If the queued memory then exceeds sk->sk_rcvbuf, the whole prequeue
       * is drained through sk_backlog_rcv() and each drained skb is counted
       * as LINUX_MIB_TCPPREQUEUEDROPPED.  If instead @skb is the only entry,
       * the waiting reader is woken and, unless an ACK is already scheduled,
       * the delayed-ACK timer is armed.
 1494  */
 1495 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 1496 {
 1497         struct tcp_sock *tp = tcp_sk(sk);
 1498
 1499         if (sysctl_tcp_low_latency || !tp->ucopy.task)
 1500                 return false;
 1501
 1502         if (skb->len <= tcp_hdrlen(skb) &&
 1503             skb_queue_len(&tp->ucopy.prequeue) == 0)
 1504                 return false;
 1505
 1506         /* Before escaping RCU protected region, we need to take care of skb
 1507          * dst. Prequeue is only enabled for established sockets.
 1508          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
 1509          * Instead of doing full sk_rx_dst validity here, let's perform
 1510          * an optimistic check.
 1511          */
 1512         if (likely(sk->sk_rx_dst))
 1513                 skb_dst_drop(skb);
 1514         else
 1515                 skb_dst_force_safe(skb);
 1516
 1517         __skb_queue_tail(&tp->ucopy.prequeue, skb);
 1518         tp->ucopy.memory += skb->truesize;
 1519         if (tp->ucopy.memory > sk->sk_rcvbuf) {
 1520                 struct sk_buff *skb1;
 1521
 1522                 BUG_ON(sock_owned_by_user(sk));
 1523
 1524                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
 1525                         sk_backlog_rcv(sk, skb1);
 1526                         NET_INC_STATS_BH(sock_net(sk),
 1527                                          LINUX_MIB_TCPPREQUEUEDROPPED);
 1528                 }
 1529
 1530                 tp->ucopy.memory = 0;
 1531         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 1532                 wake_up_interruptible_sync_poll(sk_sleep(sk),
 1533                                            POLLIN | POLLRDNORM | POLLRDBAND);
 1534                 if (!inet_csk_ack_scheduled(sk))
 1535                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 1536                                                   (3 * tcp_rto_min(sk)) / 4,
 1537                                                   TCP_RTO_MAX);
 1538         }
 1539         return true;
 1540 }
 1541 EXPORT_SYMBOL(tcp_prequeue);
 1542
      /*
       * Run the socket filter on @skb via sk_filter_trim_cap(), passing the
       * TCP header length (th->doff * 4) as the cap argument.  skb->data is
       * treated as the start of the TCP header.
       *
       * If the filter call returns 0, end_seq in the skb control block is
       * reduced by the number of bytes by which skb->len shrank.
       *
       * Returns the result of sk_filter_trim_cap().
       */
 1543 int tcp_filter(struct sock *sk, struct sk_buff *skb)
 1544 {
 1545         struct tcphdr *th = (struct tcphdr *)skb->data;
              /* Length before filtering; turned into the trimmed byte count below. */
 1546         unsigned int eaten = skb->len;
 1547         int err;
 1548
 1549         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
 1550         if (!err) {
 1551                 eaten -= skb->len;
 1552                 TCP_SKB_CB(skb)->end_seq -= eaten;
 1553         }
 1554         return err;
 1555 }
 1556 EXPORT_SYMBOL(tcp_filter);
1557
1558 /*
1559  *      From tcp_input.c
1560  */
1561
/*
 * tcp_v4_rcv - receive entry point for one incoming IPv4 TCP segment.
 * @skb: the received buffer.
 *
 * Checks that the packet is addressed to this host, pulls and validates
 * the TCP header (doff), initialises the checksum state and fills in
 * TCP_SKB_CB().  The socket is then looked up in tcp_hashinfo and the
 * segment is dispatched on sk_state:
 *   - TCP_TIME_WAIT:    handled at do_time_wait.
 *   - TCP_NEW_SYN_RECV: resolved against the listener via tcp_check_req().
 *   - TCP_LISTEN:       passed straight to tcp_v4_do_rcv().
 *   - anything else:    processed under bh_lock_sock_nested(), through
 *                       the prequeue, tcp_v4_do_rcv() or the backlog.
 *
 * Paths that reach discard_it free the skb with kfree_skb().
 *
 * Returns the value of tcp_v4_do_rcv() when it is called from the main
 * path, and 0 on every other path.
 */
1562 int tcp_v4_rcv(struct sk_buff *skb)
1563 {
1564         const struct iphdr *iph;
1565         const struct tcphdr *th;
1566         struct sock *sk;
1567         int ret;
1568         struct net *net = dev_net(skb->dev);
1569
1570         if (skb->pkt_type != PACKET_HOST)
1571                 goto discard_it;
1572
1573         /* Count it even if it's bad */
1574         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1575
1576         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1577                 goto discard_it;
1578
1579         th = tcp_hdr(skb);
1580
             /* doff counts 32-bit words; it must cover at least the fixed header. */
1581         if (th->doff < sizeof(struct tcphdr) / 4)
1582                 goto bad_packet;
1583         if (!pskb_may_pull(skb, th->doff * 4))
1584                 goto discard_it;
1585
1586         /* An explanation is required here, I think.
1587          * Packet length and doff are validated by header prediction,
1588          * provided case of th->doff==0 is eliminated.
1589          * So, we defer the checks. */
1590
1591         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1592                 goto csum_error;
1593
             /* Re-read the header pointers after the pulls above. */
1594         th = tcp_hdr(skb);
1595         iph = ip_hdr(skb);
1596         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1597          * barrier() makes sure compiler wont play fool^Waliasing games.
1598          */
1599         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1600                 sizeof(struct inet_skb_parm));
1601         barrier();
1602
             /* end_seq = seq + SYN + FIN + payload length (skb->len minus header). */
1603         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1604         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1605                                     skb->len - th->doff * 4);
1606         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1607         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1608         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1609         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1610         TCP_SKB_CB(skb)->sacked  = 0;
1611
1612 lookup:
1613         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1614         if (!sk)
1615                 goto no_tcp_socket;
1616
1617 process:
1618         if (sk->sk_state == TCP_TIME_WAIT)
1619                 goto do_time_wait;
1620
1621         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1622                 struct request_sock *req = inet_reqsk(sk);
1623                 struct sock *nsk;
1624
                     /* From here on sk is the listener that owns the request. */
1625                 sk = req->rsk_listener;
1626                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1627                         reqsk_put(req);
1628                         goto discard_it;
1629                 }
1630                 if (tcp_checksum_complete(skb)) {
1631                         reqsk_put(req);
1632                         goto csum_error;
1633                 }
                     /* Listener is no longer listening: drop the request, redo the lookup. */
1634                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1635                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1636                         goto lookup;
1637                 }
                     /* Listener reference; released at discard_and_relse, by the
                      * sock_put() below, or at put_and_return.
                      */
1638                 sock_hold(sk);
1639                 nsk = tcp_check_req(sk, skb, req, false);
1640                 if (!nsk) {
1641                         reqsk_put(req);
1642                         goto discard_and_relse;
1643                 }
1644                 if (nsk == sk) {
                             /* Same socket returned: continue below with the listener. */
1645                         reqsk_put(req);
1646                 } else if (tcp_child_process(sk, nsk, skb)) {
1647                         tcp_v4_send_reset(nsk, skb);
1648                         goto discard_and_relse;
1649                 } else {
1650                         sock_put(sk);
1651                         return 0;
1652                 }
1653         }
1654         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1655                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1656                 goto discard_and_relse;
1657         }
1658
1659         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1660                 goto discard_and_relse;
1661
1662         if (tcp_v4_inbound_md5_hash(sk, skb))
1663                 goto discard_and_relse;
1664
1665         nf_reset(skb);
1666
1667         if (tcp_filter(sk, skb))
1668                 goto discard_and_relse;
             /* NOTE(review): th and iph are reloaded after tcp_filter(), presumably
              * because filtering may change the skb data - confirm.
              */
1669         th = (const struct tcphdr *)skb->data;
1670         iph = ip_hdr(skb);
1671
1672         skb->dev = NULL;
1673
             /* Listening sockets are processed without taking the socket lock here. */
1674         if (sk->sk_state == TCP_LISTEN) {
1675                 ret = tcp_v4_do_rcv(sk, skb);
1676                 goto put_and_return;
1677         }
1678
1679         sk_incoming_cpu_update(sk);
1680
1681         bh_lock_sock_nested(sk);
1682         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1683         ret = 0;
             /* Not owned by user: try the prequeue, else process now.
              * Owned by user: queue on the backlog, bounded by rcvbuf + sndbuf.
              */
1684         if (!sock_owned_by_user(sk)) {
1685                 if (!tcp_prequeue(sk, skb))
1686                         ret = tcp_v4_do_rcv(sk, skb);
1687         } else if (unlikely(sk_add_backlog(sk, skb,
1688                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1689                 bh_unlock_sock(sk);
1690                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1691                 goto discard_and_relse;
1692         }
1693         bh_unlock_sock(sk);
1694
1695 put_and_return:
1696         sock_put(sk);
1697
1698         return ret;
1699
1700 no_tcp_socket:
1701         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1702                 goto discard_it;
1703
             /* csum_error and bad_packet are labels inside this if-body: jumping to
              * them bumps the error counters and skips the reset in the else branch.
              */
1704         if (tcp_checksum_complete(skb)) {
1705 csum_error:
1706                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1707 bad_packet:
1708                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1709         } else {
1710                 tcp_v4_send_reset(NULL, skb);
1711         }
1712
1713 discard_it:
1714         /* Discard frame. */
1715         kfree_skb(skb);
1716         return 0;
1717
1718 discard_and_relse:
1719         sock_put(sk);
1720         goto discard_it;
1721
1722 do_time_wait:
             /* sk is a timewait socket here; its reference is dropped with inet_twsk_put(). */
1723         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1724                 inet_twsk_put(inet_twsk(sk));
1725                 goto discard_it;
1726         }
1727
1728         if (tcp_checksum_complete(skb)) {
1729                 inet_twsk_put(inet_twsk(sk));
1730                 goto csum_error;
1731         }
1732         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1733         case TCP_TW_SYN: {
                     /* If a listener matches, retire the timewait socket and
                      * restart processing with the listener.
                      */
1734                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1735                                                         &tcp_hashinfo,
1736                                                         iph->saddr, th->source,
1737                                                         iph->daddr, th->dest,
1738                                                         inet_iif(skb));
1739                 if (sk2) {
1740                         inet_twsk_deschedule_put(inet_twsk(sk));
1741                         sk = sk2;
1742                         goto process;
1743                 }
1744                 /* Fall through to ACK */
1745         }
1746         case TCP_TW_ACK:
1747                 tcp_v4_timewait_ack(sk, skb);
1748                 break;
1749         case TCP_TW_RST:
1750                 goto no_tcp_socket;
1751         case TCP_TW_SUCCESS:;
1752         }
1753         goto discard_it;
1754 }
1755
1756 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1757         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1758         .twsk_unique    = tcp_twsk_unique,
1759         .twsk_destructor= tcp_twsk_destructor,
1760 };
1761
1762 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1763 {
1764         struct dst_entry *dst = skb_dst(skb);
1765
1766         if (dst && dst_hold_safe(dst)) {
1767                 sk->sk_rx_dst = dst;
1768                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1769         }
1770 }
1771 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1772
1773 const struct inet_connection_sock_af_ops ipv4_specific = {
1774         .queue_xmit        = ip_queue_xmit,
1775         .send_check        = tcp_v4_send_check,
1776         .rebuild_header    = inet_sk_rebuild_header,
1777         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1778         .conn_request      = tcp_v4_conn_request,
1779         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1780         .net_header_len    = sizeof(struct iphdr),
1781         .setsockopt        = ip_setsockopt,
1782         .getsockopt        = ip_getsockopt,
1783         .addr2sockaddr     = inet_csk_addr2sockaddr,
1784         .sockaddr_len      = sizeof(struct sockaddr_in),
1785         .bind_conflict     = inet_csk_bind_conflict,
1786 #ifdef CONFIG_COMPAT
1787         .compat_setsockopt = compat_ip_setsockopt,
1788         .compat_getsockopt = compat_ip_getsockopt,
1789 #endif
1790         .mtu_reduced       = tcp_v4_mtu_reduced,
1791 };
1792 EXPORT_SYMBOL(ipv4_specific);
1793
#ifdef CONFIG_TCP_MD5SIG
/* MD5 signature hooks for IPv4; installed by tcp_v4_init_sock(). */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_parse     = tcp_v4_parse_md5_keys,
        .md5_lookup    = tcp_v4_md5_lookup,
        .calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
1801
1802 /* NOTE: A lot of things set to zero explicitly by call to
1803  *       sk_alloc() so need not be done here.
1804  */
1805 static int tcp_v4_init_sock(struct sock *sk)
1806 {
1807         struct inet_connection_sock *icsk = inet_csk(sk);
1808
1809         tcp_init_sock(sk);
1810
1811         icsk->icsk_af_ops = &ipv4_specific;
1812
1813 #ifdef CONFIG_TCP_MD5SIG
1814         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1815 #endif
1816
1817         return 0;
1818 }
1819
1820 void tcp_v4_destroy_sock(struct sock *sk)
1821 {
1822         struct tcp_sock *tp = tcp_sk(sk);
1823
1824         tcp_clear_xmit_timers(sk);
1825
1826         tcp_cleanup_congestion_control(sk);
1827
1828         /* Cleanup up the write buffer. */
1829         tcp_write_queue_purge(sk);
1830
1831         /* Cleans up our, hopefully empty, out_of_order_queue. */
1832         __skb_queue_purge(&tp->out_of_order_queue);
1833
1834 #ifdef CONFIG_TCP_MD5SIG
1835         /* Clean up the MD5 key list, if any */
1836         if (tp->md5sig_info) {
1837                 tcp_clear_md5_list(sk);
1838                 kfree_rcu(tp->md5sig_info, rcu);
1839                 tp->md5sig_info = NULL;
1840         }
1841 #endif
1842
1843         /* Clean prequeue, it must be empty really */
1844         __skb_queue_purge(&tp->ucopy.prequeue);
1845
1846         /* Clean up a referenced TCP bind bucket. */
1847         if (inet_csk(sk)->icsk_bind_hash)
1848                 inet_put_port(sk);
1849
1850         BUG_ON(tp->fastopen_rsk);
1851
1852         /* If socket is aborted during connect operation */
1853         tcp_free_fastopen_req(tp);
1854         tcp_saved_syn_free(tp);
1855
1856         sk_sockets_allocated_dec(sk);
1857         sock_release_memcg(sk);
1858 }
1859 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1860
1861 #ifdef CONFIG_PROC_FS
1862 /* Proc filesystem TCP sock list dumping. */
1863
1864 /*
1865  * Get next listener socket follow cur.  If cur is NULL, get first socket
1866  * starting from bucket given in st->bucket; when st->bucket is zero the
1867  * very first socket in the hash table is returned.
1868  */
1869 static void *listening_get_next(struct seq_file *seq, void *cur)
1870 {
1871         struct inet_connection_sock *icsk;
1872         struct hlist_nulls_node *node;
1873         struct sock *sk = cur;
1874         struct inet_listen_hashbucket *ilb;
1875         struct tcp_iter_state *st = seq->private;
1876         struct net *net = seq_file_net(seq);
1877
1878         if (!sk) {
1879                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1880                 spin_lock_bh(&ilb->lock);
1881                 sk = sk_nulls_head(&ilb->head);
1882                 st->offset = 0;
1883                 goto get_sk;
1884         }
1885         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1886         ++st->num;
1887         ++st->offset;
1888
1889         sk = sk_nulls_next(sk);
1890 get_sk:
1891         sk_nulls_for_each_from(sk, node) {
1892                 if (!net_eq(sock_net(sk), net))
1893                         continue;
1894                 if (sk->sk_family == st->family) {
1895                         cur = sk;
1896                         goto out;
1897                 }
1898                 icsk = inet_csk(sk);
1899         }
1900         spin_unlock_bh(&ilb->lock);
1901         st->offset = 0;
1902         if (++st->bucket < INET_LHTABLE_SIZE) {
1903                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1904                 spin_lock_bh(&ilb->lock);
1905                 sk = sk_nulls_head(&ilb->head);
1906                 goto get_sk;
1907         }
1908         cur = NULL;
1909 out:
1910         return cur;
1911 }
1912
1913 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1914 {
1915         struct tcp_iter_state *st = seq->private;
1916         void *rc;
1917
1918         st->bucket = 0;
1919         st->offset = 0;
1920         rc = listening_get_next(seq, NULL);
1921
1922         while (rc && *pos) {
1923                 rc = listening_get_next(seq, rc);
1924                 --*pos;
1925         }
1926         return rc;
1927 }
1928
1929 static inline bool empty_bucket(const struct tcp_iter_state *st)
1930 {
1931         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1932 }
1933
1934 /*
1935  * Get first established socket starting from bucket given in st->bucket.
1936  * If st->bucket is zero, the very first socket in the hash is returned.
1937  */
1938 static void *established_get_first(struct seq_file *seq)
1939 {
1940         struct tcp_iter_state *st = seq->private;
1941         struct net *net = seq_file_net(seq);
1942         void *rc = NULL;
1943
1944         st->offset = 0;
1945         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1946                 struct sock *sk;
1947                 struct hlist_nulls_node *node;
1948                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1949
1950                 /* Lockless fast path for the common case of empty buckets */
1951                 if (empty_bucket(st))
1952                         continue;
1953
1954                 spin_lock_bh(lock);
1955                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1956                         if (sk->sk_family != st->family ||
1957                             !net_eq(sock_net(sk), net)) {
1958                                 continue;
1959                         }
1960                         rc = sk;
1961                         goto out;
1962                 }
1963                 spin_unlock_bh(lock);
1964         }
1965 out:
1966         return rc;
1967 }
1968
1969 static void *established_get_next(struct seq_file *seq, void *cur)
1970 {
1971         struct sock *sk = cur;
1972         struct hlist_nulls_node *node;
1973         struct tcp_iter_state *st = seq->private;
1974         struct net *net = seq_file_net(seq);
1975
1976         ++st->num;
1977         ++st->offset;
1978
1979         sk = sk_nulls_next(sk);
1980
1981         sk_nulls_for_each_from(sk, node) {
1982                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1983                         return sk;
1984         }
1985
1986         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1987         ++st->bucket;
1988         return established_get_first(seq);
1989 }
1990
1991 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1992 {
1993         struct tcp_iter_state *st = seq->private;
1994         void *rc;
1995
1996         st->bucket = 0;
1997         rc = established_get_first(seq);
1998
1999         while (rc && pos) {
2000                 rc = established_get_next(seq, rc);
2001                 --pos;
2002         }
2003         return rc;
2004 }
2005
2006 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2007 {
2008         void *rc;
2009         struct tcp_iter_state *st = seq->private;
2010
2011         st->state = TCP_SEQ_STATE_LISTENING;
2012         rc        = listening_get_idx(seq, &pos);
2013
2014         if (!rc) {
2015                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2016                 rc        = established_get_idx(seq, pos);
2017         }
2018
2019         return rc;
2020 }
2021
2022 static void *tcp_seek_last_pos(struct seq_file *seq)
2023 {
2024         struct tcp_iter_state *st = seq->private;
2025         int offset = st->offset;
2026         int orig_num = st->num;
2027         void *rc = NULL;
2028
2029         switch (st->state) {
2030         case TCP_SEQ_STATE_LISTENING:
2031                 if (st->bucket >= INET_LHTABLE_SIZE)
2032                         break;
2033                 st->state = TCP_SEQ_STATE_LISTENING;
2034                 rc = listening_get_next(seq, NULL);
2035                 while (offset-- && rc)
2036                         rc = listening_get_next(seq, rc);
2037                 if (rc)
2038                         break;
2039                 st->bucket = 0;
2040                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2041                 /* Fallthrough */
2042         case TCP_SEQ_STATE_ESTABLISHED:
2043                 if (st->bucket > tcp_hashinfo.ehash_mask)
2044                         break;
2045                 rc = established_get_first(seq);
2046                 while (offset-- && rc)
2047                         rc = established_get_next(seq, rc);
2048         }
2049
2050         st->num = orig_num;
2051
2052         return rc;
2053 }
2054
2055 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2056 {
2057         struct tcp_iter_state *st = seq->private;
2058         void *rc;
2059
2060         if (*pos && *pos == st->last_pos) {
2061                 rc = tcp_seek_last_pos(seq);
2062                 if (rc)
2063                         goto out;
2064         }
2065
2066         st->state = TCP_SEQ_STATE_LISTENING;
2067         st->num = 0;
2068         st->bucket = 0;
2069         st->offset = 0;
2070         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2071
2072 out:
2073         st->last_pos = *pos;
2074         return rc;
2075 }
2076
2077 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2078 {
2079         struct tcp_iter_state *st = seq->private;
2080         void *rc = NULL;
2081
2082         if (v == SEQ_START_TOKEN) {
2083                 rc = tcp_get_idx(seq, 0);
2084                 goto out;
2085         }
2086
2087         switch (st->state) {
2088         case TCP_SEQ_STATE_LISTENING:
2089                 rc = listening_get_next(seq, v);
2090                 if (!rc) {
2091                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2092                         st->bucket = 0;
2093                         st->offset = 0;
2094                         rc        = established_get_first(seq);
2095                 }
2096                 break;
2097         case TCP_SEQ_STATE_ESTABLISHED:
2098                 rc = established_get_next(seq, v);
2099                 break;
2100         }
2101 out:
2102         ++*pos;
2103         st->last_pos = *pos;
2104         return rc;
2105 }
2106
2107 static void tcp_seq_stop(struct seq_file *seq, void *v)
2108 {
2109         struct tcp_iter_state *st = seq->private;
2110
2111         switch (st->state) {
2112         case TCP_SEQ_STATE_LISTENING:
2113                 if (v != SEQ_START_TOKEN)
2114                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2115                 break;
2116         case TCP_SEQ_STATE_ESTABLISHED:
2117                 if (v)
2118                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2119                 break;
2120         }
2121 }
2122
2123 int tcp_seq_open(struct inode *inode, struct file *file)
2124 {
2125         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2126         struct tcp_iter_state *s;
2127         int err;
2128
2129         err = seq_open_net(inode, file, &afinfo->seq_ops,
2130                           sizeof(struct tcp_iter_state));
2131         if (err < 0)
2132                 return err;
2133
2134         s = ((struct seq_file *)file->private_data)->private;
2135         s->family               = afinfo->family;
2136         s->last_pos             = 0;
2137         return 0;
2138 }
2139 EXPORT_SYMBOL(tcp_seq_open);
2140
2141 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2142 {
2143         int rc = 0;
2144         struct proc_dir_entry *p;
2145
2146         afinfo->seq_ops.start           = tcp_seq_start;
2147         afinfo->seq_ops.next            = tcp_seq_next;
2148         afinfo->seq_ops.stop            = tcp_seq_stop;
2149
2150         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2151                              afinfo->seq_fops, afinfo);
2152         if (!p)
2153                 rc = -ENOMEM;
2154         return rc;
2155 }
2156 EXPORT_SYMBOL(tcp_proc_register);
2157
2158 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2159 {
2160         remove_proc_entry(afinfo->name, net->proc_net);
2161 }
2162 EXPORT_SYMBOL(tcp_proc_unregister);
2163
2164 static void get_openreq4(const struct request_sock *req,
2165                          struct seq_file *f, int i)
2166 {
2167         const struct inet_request_sock *ireq = inet_rsk(req);
2168         long delta = req->rsk_timer.expires - jiffies;
2169
2170         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2171                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2172                 i,
2173                 ireq->ir_loc_addr,
2174                 ireq->ir_num,
2175                 ireq->ir_rmt_addr,
2176                 ntohs(ireq->ir_rmt_port),
2177                 TCP_SYN_RECV,
2178                 0, 0, /* could print option size, but that is af dependent. */
2179                 1,    /* timers active (only the expire timer) */
2180                 jiffies_delta_to_clock_t(delta),
2181                 req->num_timeout,
2182                 from_kuid_munged(seq_user_ns(f),
2183                                  sock_i_uid(req->rsk_listener)),
2184                 0,  /* non standard timer */
2185                 0, /* open_requests have no inode */
2186                 0,
2187                 req);
2188 }
2189
2190 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2191 {
2192         int timer_active;
2193         unsigned long timer_expires;
2194         const struct tcp_sock *tp = tcp_sk(sk);
2195         const struct inet_connection_sock *icsk = inet_csk(sk);
2196         const struct inet_sock *inet = inet_sk(sk);
2197         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2198         __be32 dest = inet->inet_daddr;
2199         __be32 src = inet->inet_rcv_saddr;
2200         __u16 destp = ntohs(inet->inet_dport);
2201         __u16 srcp = ntohs(inet->inet_sport);
2202         int rx_queue;
2203         int state;
2204
2205         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2206             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2207             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2208                 timer_active    = 1;
2209                 timer_expires   = icsk->icsk_timeout;
2210         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2211                 timer_active    = 4;
2212                 timer_expires   = icsk->icsk_timeout;
2213         } else if (timer_pending(&sk->sk_timer)) {
2214                 timer_active    = 2;
2215                 timer_expires   = sk->sk_timer.expires;
2216         } else {
2217                 timer_active    = 0;
2218                 timer_expires = jiffies;
2219         }
2220
2221         state = sk_state_load(sk);
2222         if (state == TCP_LISTEN)
2223                 rx_queue = sk->sk_ack_backlog;
2224         else
2225                 /* Because we don't lock the socket,
2226                  * we might find a transient negative value.
2227                  */
2228                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2229
2230         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2231                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2232                 i, src, srcp, dest, destp, state,
2233                 tp->write_seq - tp->snd_una,
2234                 rx_queue,
2235                 timer_active,
2236                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2237                 icsk->icsk_retransmits,
2238                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2239                 icsk->icsk_probes_out,
2240                 sock_i_ino(sk),
2241                 atomic_read(&sk->sk_refcnt), sk,
2242                 jiffies_to_clock_t(icsk->icsk_rto),
2243                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2244                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2245                 tp->snd_cwnd,
2246                 state == TCP_LISTEN ?
2247                     fastopenq->max_qlen :
2248                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2249 }
2250
2251 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2252                                struct seq_file *f, int i)
2253 {
2254         long delta = tw->tw_timer.expires - jiffies;
2255         __be32 dest, src;
2256         __u16 destp, srcp;
2257
2258         dest  = tw->tw_daddr;
2259         src   = tw->tw_rcv_saddr;
2260         destp = ntohs(tw->tw_dport);
2261         srcp  = ntohs(tw->tw_sport);
2262
2263         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2264                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2265                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2266                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2267                 atomic_read(&tw->tw_refcnt), tw);
2268 }
2269
2270 #define TMPSZ 150
2271
2272 static int tcp4_seq_show(struct seq_file *seq, void *v)
2273 {
2274         struct tcp_iter_state *st;
2275         struct sock *sk = v;
2276
2277         seq_setwidth(seq, TMPSZ - 1);
2278         if (v == SEQ_START_TOKEN) {
2279                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2280                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2281                            "inode");
2282                 goto out;
2283         }
2284         st = seq->private;
2285
2286         if (sk->sk_state == TCP_TIME_WAIT)
2287                 get_timewait4_sock(v, seq, st->num);
2288         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2289                 get_openreq4(v, seq, st->num);
2290         else
2291                 get_tcp4_sock(v, seq, st->num);
2292 out:
2293         seq_pad(seq, '\n');
2294         return 0;
2295 }
2296
2297 static const struct file_operations tcp_afinfo_seq_fops = {
2298         .owner   = THIS_MODULE,
2299         .open    = tcp_seq_open,
2300         .read    = seq_read,
2301         .llseek  = seq_lseek,
2302         .release = seq_release_net
2303 };
2304
2305 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2306         .name           = "tcp",
2307         .family         = AF_INET,
2308         .seq_fops       = &tcp_afinfo_seq_fops,
2309         .seq_ops        = {
2310                 .show           = tcp4_seq_show,
2311         },
2312 };
2313
2314 static int __net_init tcp4_proc_init_net(struct net *net)
2315 {
2316         return tcp_proc_register(net, &tcp4_seq_afinfo);
2317 }
2318
2319 static void __net_exit tcp4_proc_exit_net(struct net *net)
2320 {
2321         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2322 }
2323
2324 static struct pernet_operations tcp4_net_ops = {
2325         .init = tcp4_proc_init_net,
2326         .exit = tcp4_proc_exit_net,
2327 };
2328
2329 int __init tcp4_proc_init(void)
2330 {
2331         return register_pernet_subsys(&tcp4_net_ops);
2332 }
2333
2334 void tcp4_proc_exit(void)
2335 {
2336         unregister_pernet_subsys(&tcp4_net_ops);
2337 }
2338 #endif /* CONFIG_PROC_FS */
2339
2340 struct proto tcp_prot = {
2341         .name                   = "TCP",
2342         .owner                  = THIS_MODULE,
2343         .close                  = tcp_close,
2344         .connect                = tcp_v4_connect,
2345         .disconnect             = tcp_disconnect,
2346         .accept                 = inet_csk_accept,
2347         .ioctl                  = tcp_ioctl,
2348         .init                   = tcp_v4_init_sock,
2349         .destroy                = tcp_v4_destroy_sock,
2350         .shutdown               = tcp_shutdown,
2351         .setsockopt             = tcp_setsockopt,
2352         .getsockopt             = tcp_getsockopt,
2353         .recvmsg                = tcp_recvmsg,
2354         .sendmsg                = tcp_sendmsg,
2355         .sendpage               = tcp_sendpage,
2356         .backlog_rcv            = tcp_v4_do_rcv,
2357         .release_cb             = tcp_release_cb,
2358         .hash                   = inet_hash,
2359         .unhash                 = inet_unhash,
2360         .get_port               = inet_csk_get_port,
2361         .enter_memory_pressure  = tcp_enter_memory_pressure,
2362         .stream_memory_free     = tcp_stream_memory_free,
2363         .sockets_allocated      = &tcp_sockets_allocated,
2364         .orphan_count           = &tcp_orphan_count,
2365         .memory_allocated       = &tcp_memory_allocated,
2366         .memory_pressure        = &tcp_memory_pressure,
2367         .sysctl_mem             = sysctl_tcp_mem,
2368         .sysctl_wmem            = sysctl_tcp_wmem,
2369         .sysctl_rmem            = sysctl_tcp_rmem,
2370         .max_header             = MAX_TCP_HEADER,
2371         .obj_size               = sizeof(struct tcp_sock),
2372         .slab_flags             = SLAB_DESTROY_BY_RCU,
2373         .twsk_prot              = &tcp_timewait_sock_ops,
2374         .rsk_prot               = &tcp_request_sock_ops,
2375         .h.hashinfo             = &tcp_hashinfo,
2376         .no_autobind            = true,
2377 #ifdef CONFIG_COMPAT
2378         .compat_setsockopt      = compat_tcp_setsockopt,
2379         .compat_getsockopt      = compat_tcp_getsockopt,
2380 #endif
2381 #ifdef CONFIG_MEMCG_KMEM
2382         .init_cgroup            = tcp_init_cgroup,
2383         .destroy_cgroup         = tcp_destroy_cgroup,
2384         .proto_cgroup           = tcp_proto_cgroup,
2385 #endif
2386 };
2387 EXPORT_SYMBOL(tcp_prot);
2388
2389 static void __net_exit tcp_sk_exit(struct net *net)
2390 {
2391         int cpu;
2392
2393         for_each_possible_cpu(cpu)
2394                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2395         free_percpu(net->ipv4.tcp_sk);
2396 }
2397
2398 static int __net_init tcp_sk_init(struct net *net)
2399 {
2400         int res, cpu;
2401
2402         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2403         if (!net->ipv4.tcp_sk)
2404                 return -ENOMEM;
2405
2406         for_each_possible_cpu(cpu) {
2407                 struct sock *sk;
2408
2409                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2410                                            IPPROTO_TCP, net);
2411                 if (res)
2412                         goto fail;
2413                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2414         }
2415
2416         net->ipv4.sysctl_tcp_ecn = 2;
2417         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2418
2419         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2420         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2421         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2422
2423         return 0;
2424 fail:
2425         tcp_sk_exit(net);
2426
2427         return res;
2428 }
2429
2430 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2431 {
2432         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2433 }
2434
2435 static struct pernet_operations __net_initdata tcp_sk_ops = {
2436        .init       = tcp_sk_init,
2437        .exit       = tcp_sk_exit,
2438        .exit_batch = tcp_sk_exit_batch,
2439 };
2440
2441 void __init tcp_v4_init(void)
2442 {
2443         inet_hashinfo_init(&tcp_hashinfo);
2444         if (register_pernet_subsys(&tcp_sk_ops))
2445                 panic("Failed to create the TCP control socket.\n");
2446 }