net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/tcp_memcontrol.h>
  77 #include <net/busy_poll.h>
  78
  79 #include <linux/inet.h>
  80 #include <linux/ipv6.h>
  81 #include <linux/stddef.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/seq_file.h>
  84
  85 #include <linux/crypto.h>
  86 #include <linux/scatterlist.h>
  87
   88 int sysctl_tcp_tw_reuse __read_mostly;      /* read by tcp_twsk_unique(): gates reuse when its twp argument is non-NULL */
   89 int sysctl_tcp_low_latency __read_mostly;   /* exported below; not referenced in this part of the file */
   90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
   91
   92 #ifdef CONFIG_TCP_MD5SIG
      /* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() call
       * this to sign the reply header before its definition appears.
       */
   93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
   94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
   95 #endif
   96
      /* Hash info handed to __inet_lookup_established() and
       * __inet_lookup_listener() by the lookups in this file.
       */
   97 struct inet_hashinfo tcp_hashinfo;
   98 EXPORT_SYMBOL(tcp_hashinfo);
  99
 100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 101 {
 102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 103                                           ip_hdr(skb)->saddr,
 104                                           tcp_hdr(skb)->dest,
 105                                           tcp_hdr(skb)->source);
 106 }
 107
 108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 109 {
 110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111         struct tcp_sock *tp = tcp_sk(sk);
 112
 113         /* With PAWS, it is safe from the viewpoint
 114            of data integrity. Even without PAWS it is safe provided sequence
 115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 116
 117            Actually, the idea is close to VJ's one, only timestamp cache is
 118            held not per host, but per port pair and TW bucket is used as state
 119            holder.
 120
 121            If TW bucket has been already destroyed we fall back to VJ's scheme
 122            and use initial timestamp retrieved from peer table.
 123          */
 124         if (tcptw->tw_ts_recent_stamp &&
 125             (!twp || (sysctl_tcp_tw_reuse &&
 126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 128                 if (tp->write_seq == 0)
 129                         tp->write_seq = 1;
 130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 132                 sock_hold(sktw);
 133                 return 1;
 134         }
 135
 136         return 0;
 137 }
 138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 139
  140 /* This will initiate an outgoing connection.
       *
       * @sk:       socket to connect
       * @uaddr:    destination, interpreted as a struct sockaddr_in
       * @addr_len: length of @uaddr supplied by the caller
       *
       * Resolves a route to the destination, records source and destination
       * on the socket, moves it to TCP_SYN_SENT, hashes it through
       * inet_hash_connect(), picks an initial write sequence when none is
       * set, and finally calls tcp_connect().
       *
       * Returns 0 on success. Returns -EINVAL for a short address or a zero
       * destination combined with a source-route option, -EAFNOSUPPORT for
       * a family other than AF_INET, -ENETUNREACH for a multicast or
       * broadcast route, or the error reported by ip_route_connect(),
       * inet_hash_connect(), ip_route_newports() or tcp_connect().
       */
  141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  142 {
  143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
  144         struct inet_sock *inet = inet_sk(sk);
  145         struct tcp_sock *tp = tcp_sk(sk);
  146         __be16 orig_sport, orig_dport;
  147         __be32 daddr, nexthop;
  148         struct flowi4 *fl4;
  149         struct rtable *rt;
  150         int err;
  151         struct ip_options_rcu *inet_opt;
  152
              /* Reject a too-short address or a non-IPv4 family up front. */
  153         if (addr_len < sizeof(struct sockaddr_in))
  154                 return -EINVAL;
  155
  156         if (usin->sin_family != AF_INET)
  157                 return -EAFNOSUPPORT;
  158
  159         nexthop = daddr = usin->sin_addr.s_addr;
  160         inet_opt = rcu_dereference_protected(inet->inet_opt,
  161                                              sock_owned_by_user(sk));
              /* With a source-route option the route lookup targets the
               * option's faddr instead of the final destination, and a zero
               * destination is refused.
               */
  162         if (inet_opt && inet_opt->opt.srr) {
  163                 if (!daddr)
  164                         return -EINVAL;
  165                 nexthop = inet_opt->opt.faddr;
  166         }
  167
              /* Remember the ports used for this first lookup; they are
               * passed to ip_route_newports() below together with the final
               * ports.
               */
  168         orig_sport = inet->inet_sport;
  169         orig_dport = usin->sin_port;
  170         fl4 = &inet->cork.fl.u.ip4;
  171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
  172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
  173                               IPPROTO_TCP,
  174                               orig_sport, orig_dport, sk);
  175         if (IS_ERR(rt)) {
  176                 err = PTR_ERR(rt);
  177                 if (err == -ENETUNREACH)
  178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
  179                 return err;
  180         }
  181
              /* A multicast or broadcast route is refused; the route
               * reference obtained above is dropped first.
               */
  182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
  183                 ip_rt_put(rt);
  184                 return -ENETUNREACH;
  185         }
  186
              /* Without source routing, use the destination address filled
               * into the flow by the route lookup.
               */
  187         if (!inet_opt || !inet_opt->opt.srr)
  188                 daddr = fl4->daddr;
  189
              /* No source address set on the socket yet: adopt the one from
               * the flow.
               */
  190         if (!inet->inet_saddr)
  191                 inet->inet_saddr = fl4->saddr;
  192         sk_rcv_saddr_set(sk, inet->inet_saddr);
  193
              /* Timestamp state left on the socket belongs to the previous
               * destination; discard it when the destination changes.
               * write_seq is left alone when tp->repair is set.
               */
  194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
  195                 /* Reset inherited state */
  196                 tp->rx_opt.ts_recent       = 0;
  197                 tp->rx_opt.ts_recent_stamp = 0;
  198                 if (likely(!tp->repair))
  199                         tp->write_seq      = 0;
  200         }
  201
  202         if (tcp_death_row.sysctl_tw_recycle &&
  203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
  204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
  205
  206         inet->inet_dport = usin->sin_port;
  207         sk_daddr_set(sk, daddr);
  208
              /* Extension header length is the IP option length, if any. */
  209         inet_csk(sk)->icsk_ext_hdr_len = 0;
  210         if (inet_opt)
  211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
  212
  213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
  214
  215         /* Socket identity is still unknown (sport may be zero).
  216          * However we set state to SYN-SENT and not releasing socket
  217          * lock select source port, enter ourselves into the hash tables and
  218          * complete initialization after this.
  219          */
  220         tcp_set_state(sk, TCP_SYN_SENT);
  221         err = inet_hash_connect(&tcp_death_row, sk);
  222         if (err)
  223                 goto failure;
  224
  225         sk_set_txhash(sk);
  226
              /* Update the route for the ports now stored on the socket.
               * On error rt is cleared so the failure path does not put an
               * error pointer.
               */
  227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
  228                                inet->inet_sport, inet->inet_dport, sk);
  229         if (IS_ERR(rt)) {
  230                 err = PTR_ERR(rt);
  231                 rt = NULL;
  232                 goto failure;
  233         }
  234         /* OK, now commit destination to socket.  */
  235         sk->sk_gso_type = SKB_GSO_TCPV4;
  236         sk_setup_caps(sk, &rt->dst);
  237
              /* Choose an initial sequence number only when write_seq is
               * still zero and tp->repair is not set.
               */
  238         if (!tp->write_seq && likely(!tp->repair))
  239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
  240                                                            inet->inet_daddr,
  241                                                            inet->inet_sport,
  242                                                            usin->sin_port);
  243
  244         inet->inet_id = tp->write_seq ^ jiffies;
  245
  246         err = tcp_connect(sk);
  247
              /* NOTE(review): rt is cleared so the failure path's
               * ip_rt_put() is not given this route again; presumably
               * sk_setup_caps() attached it to the socket above - confirm.
               */
  248         rt = NULL;
  249         if (err)
  250                 goto failure;
  251
  252         return 0;
  253
  254 failure:
  255         /*
  256          * This unhashes the socket and releases the local port,
  257          * if necessary.
  258          */
  259         tcp_set_state(sk, TCP_CLOSE);
  260         ip_rt_put(rt);
  261         sk->sk_route_caps = 0;
  262         inet->inet_dport = 0;
  263         return err;
  264 }
  265 EXPORT_SYMBOL(tcp_v4_connect);
 266
 267 /*
 268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 269  * It can be called through tcp_release_cb() if socket was owned by user
 270  * at the time tcp_v4_err() was called to handle ICMP message.
 271  */
 272 void tcp_v4_mtu_reduced(struct sock *sk)
 273 {
 274         struct inet_sock *inet = inet_sk(sk);
 275         struct dst_entry *dst;
 276         u32 mtu;
 277
 278         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 279                 return;
 280         mtu = tcp_sk(sk)->mtu_info;
 281         dst = inet_csk_update_pmtu(sk, mtu);
 282         if (!dst)
 283                 return;
 284
 285         /* Something is about to be wrong... Remember soft error
 286          * for the case, if this connection will not able to recover.
 287          */
 288         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 289                 sk->sk_err_soft = EMSGSIZE;
 290
 291         mtu = dst_mtu(dst);
 292
 293         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 294             ip_sk_accept_pmtu(sk) &&
 295             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 296                 tcp_sync_mss(sk, mtu);
 297
 298                 /* Resend the TCP packet because it's
 299                  * clear that the old packet has been
 300                  * dropped. This is the new "fast" path mtu
 301                  * discovery.
 302                  */
 303                 tcp_simple_retransmit(sk);
 304         } /* else let the usual retransmit timer handle it */
 305 }
 306 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 307
 308 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 309 {
 310         struct dst_entry *dst = __sk_dst_check(sk, 0);
 311
 312         if (dst)
 313                 dst->ops->redirect(dst, sk, skb);
 314 }
 315
 316
 317 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 318 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 319 {
 320         struct request_sock *req = inet_reqsk(sk);
 321         struct net *net = sock_net(sk);
 322
 323         /* ICMPs are not backlogged, hence we cannot get
 324          * an established socket here.
 325          */
 326         if (seq != tcp_rsk(req)->snt_isn) {
 327                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 328         } else if (abort) {
 329                 /*
 330                  * Still in SYN_RECV, just remove it silently.
 331                  * There is no good way to pass the error to the newly
 332                  * created socket, and POSIX does not want network
 333                  * errors returned from accept().
 334                  */
 335                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 336                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
 337         }
 338         reqsk_put(req);
 339 }
 340 EXPORT_SYMBOL(tcp_req_err);
 341
 342 /*
 343  * This routine is called by the ICMP module when it gets some
 344  * sort of error condition.  If err < 0 then the socket should
 345  * be closed and the error returned to the user.  If err > 0
 346  * it's just the icmp type << 8 | icmp code.  After adjustment
 347  * header points to the first 8 bytes of the tcp header.  We need
 348  * to find the appropriate port.
 349  *
 350  * The locking strategy used here is very "optimistic". When
 351  * someone else accesses the socket the ICMP is just dropped
 352  * and for some paths there is no check at all.
 353  * A more general error queue to queue errors for later handling
 354  * is probably better.
 355  *
 356  */
 357
 358 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 359 {
 360         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 361         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 362         struct inet_connection_sock *icsk;
 363         struct tcp_sock *tp;
 364         struct inet_sock *inet;
 365         const int type = icmp_hdr(icmp_skb)->type;
 366         const int code = icmp_hdr(icmp_skb)->code;
 367         struct sock *sk;
 368         struct sk_buff *skb;
 369         struct request_sock *fastopen;
 370         __u32 seq, snd_una;
 371         __u32 remaining;
 372         int err;
 373         struct net *net = dev_net(icmp_skb->dev);
 374
 375         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 376                                        th->dest, iph->saddr, ntohs(th->source),
 377                                        inet_iif(icmp_skb));
 378         if (!sk) {
 379                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 380                 return;
 381         }
 382         if (sk->sk_state == TCP_TIME_WAIT) {
 383                 inet_twsk_put(inet_twsk(sk));
 384                 return;
 385         }
 386         seq = ntohl(th->seq);
 387         if (sk->sk_state == TCP_NEW_SYN_RECV)
 388                 return tcp_req_err(sk, seq,
 389                                   type == ICMP_PARAMETERPROB ||
 390                                   type == ICMP_TIME_EXCEEDED ||
 391                                   (type == ICMP_DEST_UNREACH &&
 392                                    (code == ICMP_NET_UNREACH ||
 393                                     code == ICMP_HOST_UNREACH)));
 394
 395         bh_lock_sock(sk);
 396         /* If too many ICMPs get dropped on busy
 397          * servers this needs to be solved differently.
 398          * We do take care of PMTU discovery (RFC1191) special case :
 399          * we can receive locally generated ICMP messages while socket is held.
 400          */
 401         if (sock_owned_by_user(sk)) {
 402                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 403                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 404         }
 405         if (sk->sk_state == TCP_CLOSE)
 406                 goto out;
 407
 408         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 409                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 410                 goto out;
 411         }
 412
 413         icsk = inet_csk(sk);
 414         tp = tcp_sk(sk);
 415         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 416         fastopen = tp->fastopen_rsk;
 417         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 418         if (sk->sk_state != TCP_LISTEN &&
 419             !between(seq, snd_una, tp->snd_nxt)) {
 420                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 421                 goto out;
 422         }
 423
 424         switch (type) {
 425         case ICMP_REDIRECT:
 426                 if (!sock_owned_by_user(sk))
 427                         do_redirect(icmp_skb, sk);
 428                 goto out;
 429         case ICMP_SOURCE_QUENCH:
 430                 /* Just silently ignore these. */
 431                 goto out;
 432         case ICMP_PARAMETERPROB:
 433                 err = EPROTO;
 434                 break;
 435         case ICMP_DEST_UNREACH:
 436                 if (code > NR_ICMP_UNREACH)
 437                         goto out;
 438
 439                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 440                         /* We are not interested in TCP_LISTEN and open_requests
 441                          * (SYN-ACKs send out by Linux are always <576bytes so
 442                          * they should go through unfragmented).
 443                          */
 444                         if (sk->sk_state == TCP_LISTEN)
 445                                 goto out;
 446
 447                         tp->mtu_info = info;
 448                         if (!sock_owned_by_user(sk)) {
 449                                 tcp_v4_mtu_reduced(sk);
 450                         } else {
 451                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 452                                         sock_hold(sk);
 453                         }
 454                         goto out;
 455                 }
 456
 457                 err = icmp_err_convert[code].errno;
 458                 /* check if icmp_skb allows revert of backoff
 459                  * (see draft-zimmermann-tcp-lcd) */
 460                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 461                         break;
 462                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 463                     !icsk->icsk_backoff || fastopen)
 464                         break;
 465
 466                 if (sock_owned_by_user(sk))
 467                         break;
 468
 469                 icsk->icsk_backoff--;
 470                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 471                                                TCP_TIMEOUT_INIT;
 472                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 473
 474                 skb = tcp_write_queue_head(sk);
 475                 BUG_ON(!skb);
 476
 477                 remaining = icsk->icsk_rto -
 478                             min(icsk->icsk_rto,
 479                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 480
 481                 if (remaining) {
 482                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 483                                                   remaining, TCP_RTO_MAX);
 484                 } else {
 485                         /* RTO revert clocked out retransmission.
 486                          * Will retransmit now */
 487                         tcp_retransmit_timer(sk);
 488                 }
 489
 490                 break;
 491         case ICMP_TIME_EXCEEDED:
 492                 err = EHOSTUNREACH;
 493                 break;
 494         default:
 495                 goto out;
 496         }
 497
 498         switch (sk->sk_state) {
 499         case TCP_SYN_SENT:
 500         case TCP_SYN_RECV:
 501                 /* Only in fast or simultaneous open. If a fast open socket is
 502                  * is already accepted it is treated as a connected one below.
 503                  */
 504                 if (fastopen && !fastopen->sk)
 505                         break;
 506
 507                 if (!sock_owned_by_user(sk)) {
 508                         sk->sk_err = err;
 509
 510                         sk->sk_error_report(sk);
 511
 512                         tcp_done(sk);
 513                 } else {
 514                         sk->sk_err_soft = err;
 515                 }
 516                 goto out;
 517         }
 518
 519         /* If we've already connected we will keep trying
 520          * until we time out, or the user gives up.
 521          *
 522          * rfc1122 4.2.3.9 allows to consider as hard errors
 523          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 524          * but it is obsoleted by pmtu discovery).
 525          *
 526          * Note, that in modern internet, where routing is unreliable
 527          * and in each dark corner broken firewalls sit, sending random
 528          * errors ordered by their masters even this two messages finally lose
 529          * their original sense (even Linux sends invalid PORT_UNREACHs)
 530          *
 531          * Now we are in compliance with RFCs.
 532          *                                                      --ANK (980905)
 533          */
 534
 535         inet = inet_sk(sk);
 536         if (!sock_owned_by_user(sk) && inet->recverr) {
 537                 sk->sk_err = err;
 538                 sk->sk_error_report(sk);
 539         } else  { /* Only an error on timeout */
 540                 sk->sk_err_soft = err;
 541         }
 542
 543 out:
 544         bh_unlock_sock(sk);
 545         sock_put(sk);
 546 }
 547
 548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 549 {
 550         struct tcphdr *th = tcp_hdr(skb);
 551
 552         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 553                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 554                 skb->csum_start = skb_transport_header(skb) - skb->head;
 555                 skb->csum_offset = offsetof(struct tcphdr, check);
 556         } else {
 557                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 558                                          csum_partial(th,
 559                                                       th->doff << 2,
 560                                                       skb->csum));
 561         }
 562 }
 563
 564 /* This routine computes an IPv4 TCP checksum. */
 565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 566 {
 567         const struct inet_sock *inet = inet_sk(sk);
 568
 569         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 570 }
 571 EXPORT_SYMBOL(tcp_v4_send_check);
 572
  573 /*
  574  *      This routine will send an RST to the other tcp.
  575  *
  576  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
  577  *                    for reset.
  578  *      Answer: if a packet caused RST, it is not for a socket
  579  *              existing in our system, if it is matched to a socket,
  580  *              it is just duplicate segment or bug in other side's TCP.
  581  *              So that we build reply only basing on parameters
  582  *              arrived with segment.
  583  *      Exception: precedence violation. We do not implement it in any case.
       *
       *      @sk:  socket the segment was matched to, or NULL when none
       *      @skb: the received segment being answered
       *
       *      Nothing is sent when the segment itself is a RST, when there
       *      is no socket and the input route is not local, or (with
       *      CONFIG_TCP_MD5SIG) when a signed segment without a socket
       *      cannot be verified against a listener's key.
  584  */
  585
  586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
  587 {
  588         const struct tcphdr *th = tcp_hdr(skb);
              /* Reply buffer: a bare TCP header, plus room for one MD5
               * signature option when that feature is compiled in.
               */
  589         struct {
  590                 struct tcphdr th;
  591 #ifdef CONFIG_TCP_MD5SIG
  592                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
  593 #endif
  594         } rep;
  595         struct ip_reply_arg arg;
  596 #ifdef CONFIG_TCP_MD5SIG
  597         struct tcp_md5sig_key *key;
  598         const __u8 *hash_location = NULL;
  599         unsigned char newhash[16];
  600         int genhash;
  601         struct sock *sk1 = NULL;
  602 #endif
  603         struct net *net;
  604
  605         /* Never send a reset in response to a reset. */
  606         if (th->rst)
  607                 return;
  608
  609         /* If sk not NULL, it means we did a successful lookup and incoming
  610          * route had to be correct. prequeue might have dropped our dst.
  611          */
  612         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
  613                 return;
  614
  615         /* Swap the send and the receive. */
  616         memset(&rep, 0, sizeof(rep));
  617         rep.th.dest   = th->source;
  618         rep.th.source = th->dest;
  619         rep.th.doff   = sizeof(struct tcphdr) / 4;
  620         rep.th.rst    = 1;
  621
              /* If the segment carried an ACK, the RST takes its sequence
               * number from that ACK field. Otherwise the RST acknowledges
               * the segment: its sequence number plus one for SYN, one for
               * FIN, and skb->len minus the TCP header length.
               */
  622         if (th->ack) {
  623                 rep.th.seq = th->ack_seq;
  624         } else {
  625                 rep.th.ack = 1;
  626                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
  627                                        skb->len - (th->doff << 2));
  628         }
  629
  630         memset(&arg, 0, sizeof(arg));
  631         arg.iov[0].iov_base = (unsigned char *)&rep;
  632         arg.iov[0].iov_len  = sizeof(rep.th);
  633
  634         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
  635 #ifdef CONFIG_TCP_MD5SIG
  636         hash_location = tcp_parse_md5sig_option(th);
  637         if (!sk && hash_location) {
  638                 /*
  639                  * active side is lost. Try to find listening socket through
  640                  * source port, and then find md5 key through listening socket.
  641                  * we are not loose security here:
  642                  * Incoming packet is checked with md5 hash with finding key,
  643                  * no RST generated if md5 hash doesn't match.
  644                  */
                      /* NOTE(review): th->source is passed for both port
                       * arguments of this lookup (raw and through ntohs());
                       * th->dest is not used. Confirm this is intended.
                       */
  645                 sk1 = __inet_lookup_listener(net,
  646                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
  647                                              th->source, ip_hdr(skb)->daddr,
  648                                              ntohs(th->source), inet_iif(skb));
  649                 /* don't send rst if it can't find key */
  650                 if (!sk1)
  651                         return;
                      /* Held until release_sk1; every later exit from this
                       * branch goes through that label.
                       */
  652                 rcu_read_lock();
  653                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
  654                                         &ip_hdr(skb)->saddr, AF_INET);
  655                 if (!key)
  656                         goto release_sk1;
  657
                      /* Recompute the signature over the received segment
                       * and stay silent if it cannot be computed or does
                       * not match the one the segment carried.
                       */
  658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
  659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
  660                         goto release_sk1;
  661         } else {
  662                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
  663                                              &ip_hdr(skb)->saddr,
  664                                              AF_INET) : NULL;
  665         }
  666
              /* Sign the reply: option header in opt[0], hash written
               * starting at opt[1].
               */
  667         if (key) {
  668                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
  669                                    (TCPOPT_NOP << 16) |
  670                                    (TCPOPT_MD5SIG << 8) |
  671                                    TCPOLEN_MD5SIG);
  672                 /* Update length and the length the header thinks exists */
  673                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
  674                 rep.th.doff = arg.iov[0].iov_len / 4;
  675
  676                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
  677                                      key, ip_hdr(skb)->saddr,
  678                                      ip_hdr(skb)->daddr, &rep.th);
  679         }
  680 #endif
              /* Seed the checksum with the pseudo-header of the reply (the
               * received addresses swapped). csumoffset is the position of
               * the check field counted in 16-bit units.
               */
  681         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  682                                       ip_hdr(skb)->saddr, /* XXX */
  683                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
  684         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
  685         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
  686         /* When socket is gone, all binding information is lost.
  687          * routing might fail in this case. No choice here, if we choose to force
  688          * input interface, we will misroute in case of asymmetric route.
  689          */
  690         if (sk)
  691                 arg.bound_dev_if = sk->sk_bound_dev_if;
  692
  693         arg.tos = ip_hdr(skb)->tos;
  694         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  695                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
  696                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  697                               &arg, arg.iov[0].iov_len);
  698
  699         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
  700         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
  701
  702 #ifdef CONFIG_TCP_MD5SIG
  703 release_sk1:
              /* sk1 is non-NULL only on the listener-lookup path, where
               * the RCU read lock was taken; undo both here.
               */
  704         if (sk1) {
  705                 rcu_read_unlock();
  706                 sock_put(sk1);
  707         }
  708 #endif
  709 }
 710
  711 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
  712    outside socket context is ugly, certainly. What can I do?
  713  */
  714
      /* Build and send a bare ACK in reply to @skb.
       *
       * @net:         namespace whose per-cpu tcp_sk is used for sending
       * @skb:         received segment; ports and addresses are taken from it
       * @seq:         sequence number placed in the reply
       * @ack:         acknowledgment number placed in the reply
       * @win:         window value placed in the reply
       * @tsval:       timestamp value, used only when @tsecr is non-zero
       * @tsecr:       timestamp echo; zero means no timestamp option is added
       * @oif:         bound device index for the reply, ignored when zero
       * @key:         MD5 key; NULL means the reply is not signed (only
       *               consulted when CONFIG_TCP_MD5SIG is set)
       * @reply_flags: copied into the reply argument's flags
       * @tos:         copied into the reply argument's tos
       */
  715 static void tcp_v4_send_ack(struct net *net,
  716                             struct sk_buff *skb, u32 seq, u32 ack,
  717                             u32 win, u32 tsval, u32 tsecr, int oif,
  718                             struct tcp_md5sig_key *key,
  719                             int reply_flags, u8 tos)
  720 {
  721         const struct tcphdr *th = tcp_hdr(skb);
              /* Reply buffer: TCP header plus room for a timestamp option
               * and, when compiled in, an MD5 signature option.
               */
  722         struct {
  723                 struct tcphdr th;
  724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
  725 #ifdef CONFIG_TCP_MD5SIG
  726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
  727 #endif
  728                         ];
  729         } rep;
  730         struct ip_reply_arg arg;
  731
              /* Only the header is cleared; option words are written
               * explicitly below when they are used.
               */
  732         memset(&rep.th, 0, sizeof(struct tcphdr));
  733         memset(&arg, 0, sizeof(arg));
  734
  735         arg.iov[0].iov_base = (unsigned char *)&rep;
  736         arg.iov[0].iov_len  = sizeof(rep.th);
              /* Timestamp option occupies opt[0..2]. */
  737         if (tsecr) {
  738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
  739                                    (TCPOPT_TIMESTAMP << 8) |
  740                                    TCPOLEN_TIMESTAMP);
  741                 rep.opt[1] = htonl(tsval);
  742                 rep.opt[2] = htonl(tsecr);
  743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
  744         }
  745
  746         /* Swap the send and the receive. */
  747         rep.th.dest    = th->source;
  748         rep.th.source  = th->dest;
  749         rep.th.doff    = arg.iov[0].iov_len / 4;
  750         rep.th.seq     = htonl(seq);
  751         rep.th.ack_seq = htonl(ack);
  752         rep.th.ack     = 1;
  753         rep.th.window  = htons(win);
  754
  755 #ifdef CONFIG_TCP_MD5SIG
  756         if (key) {
                      /* The MD5 option goes after the timestamp option's
                       * three words when that option is present.
                       */
  757                 int offset = (tsecr) ? 3 : 0;
  758
  759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
  760                                           (TCPOPT_NOP << 16) |
  761                                           (TCPOPT_MD5SIG << 8) |
  762                                           TCPOLEN_MD5SIG);
  763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
  764                 rep.th.doff = arg.iov[0].iov_len/4;
  765
  766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
  767                                     key, ip_hdr(skb)->saddr,
  768                                     ip_hdr(skb)->daddr, &rep.th);
  769         }
  770 #endif
  771         arg.flags = reply_flags;
              /* Seed the checksum with the pseudo-header of the reply (the
               * received addresses swapped). csumoffset is the position of
               * the check field counted in 16-bit units.
               */
  772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  773                                       ip_hdr(skb)->saddr, /* XXX */
  774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
  775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
  776         if (oif)
  777                 arg.bound_dev_if = oif;
  778         arg.tos = tos;
  779         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  780                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
  781                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  782                               &arg, arg.iov[0].iov_len);
  783
  784         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
  785 }
 786
 787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 788 {
 789         struct inet_timewait_sock *tw = inet_twsk(sk);
 790         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 791
 792         tcp_v4_send_ack(sock_net(sk), skb,
 793                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 794                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 795                         tcp_time_stamp + tcptw->tw_ts_offset,
 796                         tcptw->tw_ts_recent,
 797                         tw->tw_bound_dev_if,
 798                         tcp_twsk_md5_key(tcptw),
 799                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 800                         tw->tw_tos
 801                         );
 802
 803         inet_twsk_put(tw);
 804 }
 805
 806 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 807                                   struct request_sock *req)
 808 {
 809         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 810          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 811          */
 812         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 813                                              tcp_sk(sk)->snd_nxt;
 814
 815         /* RFC 7323 2.3
 816          * The window field (SEG.WND) of every outgoing segment, with the
 817          * exception of <SYN> segments, MUST be right-shifted by
 818          * Rcv.Wind.Shift bits:
 819          */
 820         tcp_v4_send_ack(sock_net(sk), skb, seq,
 821                         tcp_rsk(req)->rcv_nxt,
 822                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 823                         tcp_time_stamp,
 824                         req->ts_recent,
 825                         0,
 826                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 827                                           AF_INET),
 828                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 829                         ip_hdr(skb)->tos);
 830 }
 831
 832 /*
 833  *      Send a SYN-ACK after having received a SYN.
 834  *      This still operates on a request_sock only, not on a big
 835  *      socket.
 836  */
 837 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 838                               struct flowi *fl,
 839                               struct request_sock *req,
 840                               struct tcp_fastopen_cookie *foc,
 841                                   bool attach_req)
 842 {
 843         const struct inet_request_sock *ireq = inet_rsk(req);
 844         struct flowi4 fl4;
 845         int err = -1;
 846         struct sk_buff *skb;
 847
 848         /* First, grab a route. */
 849         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 850                 return -1;
 851
 852         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
 853
 854         if (skb) {
 855                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 856
 857                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 858                                             ireq->ir_rmt_addr,
 859                                             ireq_opt_deref(ireq));
 860                 err = net_xmit_eval(err);
 861         }
 862
 863         return err;
 864 }
 865
 866 /*
 867  *      IPv4 request_sock destructor.
 868  */
 869 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 870 {
 871         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 872 }
 873
 874
 875 #ifdef CONFIG_TCP_MD5SIG
 876 /*
 877  * RFC2385 MD5 checksumming requires a mapping of
 878  * IP address->MD5 Key.
 879  * We need to maintain these in the sk structure.
 880  */
 881
 882 /* Find the Key structure for an address.  */
 883 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 884                                          const union tcp_md5_addr *addr,
 885                                          int family)
 886 {
 887         const struct tcp_sock *tp = tcp_sk(sk);
 888         struct tcp_md5sig_key *key;
 889         unsigned int size = sizeof(struct in_addr);
 890         const struct tcp_md5sig_info *md5sig;
 891
 892         /* caller either holds rcu_read_lock() or socket lock */
 893         md5sig = rcu_dereference_check(tp->md5sig_info,
 894                                        sock_owned_by_user(sk) ||
 895                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
 896         if (!md5sig)
 897                 return NULL;
 898 #if IS_ENABLED(CONFIG_IPV6)
 899         if (family == AF_INET6)
 900                 size = sizeof(struct in6_addr);
 901 #endif
 902         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 903                 if (key->family != family)
 904                         continue;
 905                 if (!memcmp(&key->addr, addr, size))
 906                         return key;
 907         }
 908         return NULL;
 909 }
 910 EXPORT_SYMBOL(tcp_md5_do_lookup);
 911
 912 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 913                                          const struct sock *addr_sk)
 914 {
 915         const union tcp_md5_addr *addr;
 916
 917         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 918         return tcp_md5_do_lookup(sk, addr, AF_INET);
 919 }
 920 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 921
 922 /* This can be called on a newly created socket, from other files */
 923 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 924                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 925 {
 926         /* Add Key to the list */
 927         struct tcp_md5sig_key *key;
 928         struct tcp_sock *tp = tcp_sk(sk);
 929         struct tcp_md5sig_info *md5sig;
 930
 931         key = tcp_md5_do_lookup(sk, addr, family);
 932         if (key) {
 933                 /* Pre-existing entry - just update that one. */
 934                 memcpy(key->key, newkey, newkeylen);
 935                 key->keylen = newkeylen;
 936                 return 0;
 937         }
 938
 939         md5sig = rcu_dereference_protected(tp->md5sig_info,
 940                                            sock_owned_by_user(sk) ||
 941                                            lockdep_is_held(&sk->sk_lock.slock));
 942         if (!md5sig) {
 943                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 944                 if (!md5sig)
 945                         return -ENOMEM;
 946
 947                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 948                 INIT_HLIST_HEAD(&md5sig->head);
 949                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 950         }
 951
 952         key = sock_kmalloc(sk, sizeof(*key), gfp);
 953         if (!key)
 954                 return -ENOMEM;
 955         if (!tcp_alloc_md5sig_pool()) {
 956                 sock_kfree_s(sk, key, sizeof(*key));
 957                 return -ENOMEM;
 958         }
 959
 960         memcpy(key->key, newkey, newkeylen);
 961         key->keylen = newkeylen;
 962         key->family = family;
 963         memcpy(&key->addr, addr,
 964                (family == AF_INET6) ? sizeof(struct in6_addr) :
 965                                       sizeof(struct in_addr));
 966         hlist_add_head_rcu(&key->node, &md5sig->head);
 967         return 0;
 968 }
 969 EXPORT_SYMBOL(tcp_md5_do_add);
 970
 971 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 972 {
 973         struct tcp_md5sig_key *key;
 974
 975         key = tcp_md5_do_lookup(sk, addr, family);
 976         if (!key)
 977                 return -ENOENT;
 978         hlist_del_rcu(&key->node);
 979         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 980         kfree_rcu(key, rcu);
 981         return 0;
 982 }
 983 EXPORT_SYMBOL(tcp_md5_do_del);
 984
 985 static void tcp_clear_md5_list(struct sock *sk)
 986 {
 987         struct tcp_sock *tp = tcp_sk(sk);
 988         struct tcp_md5sig_key *key;
 989         struct hlist_node *n;
 990         struct tcp_md5sig_info *md5sig;
 991
 992         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
 993
 994         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
 995                 hlist_del_rcu(&key->node);
 996                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 997                 kfree_rcu(key, rcu);
 998         }
 999 }
1000
1001 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1002                                  int optlen)
1003 {
1004         struct tcp_md5sig cmd;
1005         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1006
1007         if (optlen < sizeof(cmd))
1008                 return -EINVAL;
1009
1010         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1011                 return -EFAULT;
1012
1013         if (sin->sin_family != AF_INET)
1014                 return -EINVAL;
1015
1016         if (!cmd.tcpm_keylen)
1017                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018                                       AF_INET);
1019
1020         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1021                 return -EINVAL;
1022
1023         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1024                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1025                               GFP_KERNEL);
1026 }
1027
1028 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1029                                         __be32 daddr, __be32 saddr, int nbytes)
1030 {
1031         struct tcp4_pseudohdr *bp;
1032         struct scatterlist sg;
1033
1034         bp = &hp->md5_blk.ip4;
1035
1036         /*
1037          * 1. the TCP pseudo-header (in the order: source IP address,
1038          * destination IP address, zero-padded protocol number, and
1039          * segment length)
1040          */
1041         bp->saddr = saddr;
1042         bp->daddr = daddr;
1043         bp->pad = 0;
1044         bp->protocol = IPPROTO_TCP;
1045         bp->len = cpu_to_be16(nbytes);
1046
1047         sg_init_one(&sg, bp, sizeof(*bp));
1048         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1049 }
1050
1051 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1052                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1053 {
1054         struct tcp_md5sig_pool *hp;
1055         struct hash_desc *desc;
1056
1057         hp = tcp_get_md5sig_pool();
1058         if (!hp)
1059                 goto clear_hash_noput;
1060         desc = &hp->md5_desc;
1061
1062         if (crypto_hash_init(desc))
1063                 goto clear_hash;
1064         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1065                 goto clear_hash;
1066         if (tcp_md5_hash_header(hp, th))
1067                 goto clear_hash;
1068         if (tcp_md5_hash_key(hp, key))
1069                 goto clear_hash;
1070         if (crypto_hash_final(desc, md5_hash))
1071                 goto clear_hash;
1072
1073         tcp_put_md5sig_pool();
1074         return 0;
1075
1076 clear_hash:
1077         tcp_put_md5sig_pool();
1078 clear_hash_noput:
1079         memset(md5_hash, 0, 16);
1080         return 1;
1081 }
1082
1083 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1084                         const struct sock *sk,
1085                         const struct sk_buff *skb)
1086 {
1087         struct tcp_md5sig_pool *hp;
1088         struct hash_desc *desc;
1089         const struct tcphdr *th = tcp_hdr(skb);
1090         __be32 saddr, daddr;
1091
1092         if (sk) { /* valid for establish/request sockets */
1093                 saddr = sk->sk_rcv_saddr;
1094                 daddr = sk->sk_daddr;
1095         } else {
1096                 const struct iphdr *iph = ip_hdr(skb);
1097                 saddr = iph->saddr;
1098                 daddr = iph->daddr;
1099         }
1100
1101         hp = tcp_get_md5sig_pool();
1102         if (!hp)
1103                 goto clear_hash_noput;
1104         desc = &hp->md5_desc;
1105
1106         if (crypto_hash_init(desc))
1107                 goto clear_hash;
1108
1109         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1110                 goto clear_hash;
1111         if (tcp_md5_hash_header(hp, th))
1112                 goto clear_hash;
1113         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1114                 goto clear_hash;
1115         if (tcp_md5_hash_key(hp, key))
1116                 goto clear_hash;
1117         if (crypto_hash_final(desc, md5_hash))
1118                 goto clear_hash;
1119
1120         tcp_put_md5sig_pool();
1121         return 0;
1122
1123 clear_hash:
1124         tcp_put_md5sig_pool();
1125 clear_hash_noput:
1126         memset(md5_hash, 0, 16);
1127         return 1;
1128 }
1129 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1130
1131 #endif
1132
1133 /* Called with rcu_read_lock() */
1134 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1135                                     const struct sk_buff *skb)
1136 {
1137 #ifdef CONFIG_TCP_MD5SIG
1138         /*
1139          * This gets called for each TCP segment that arrives
1140          * so we want to be efficient.
1141          * We have 3 drop cases:
1142          * o No MD5 hash and one expected.
1143          * o MD5 hash and we're not expecting one.
1144          * o MD5 hash and its wrong.
1145          */
1146         const __u8 *hash_location = NULL;
1147         struct tcp_md5sig_key *hash_expected;
1148         const struct iphdr *iph = ip_hdr(skb);
1149         const struct tcphdr *th = tcp_hdr(skb);
1150         int genhash;
1151         unsigned char newhash[16];
1152
1153         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1154                                           AF_INET);
1155         hash_location = tcp_parse_md5sig_option(th);
1156
1157         /* We've parsed the options - do we have a hash? */
1158         if (!hash_expected && !hash_location)
1159                 return false;
1160
1161         if (hash_expected && !hash_location) {
1162                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1163                 return true;
1164         }
1165
1166         if (!hash_expected && hash_location) {
1167                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1168                 return true;
1169         }
1170
1171         /* Okay, so this is hash_expected and hash_location -
1172          * so we need to calculate the checksum.
1173          */
1174         genhash = tcp_v4_md5_hash_skb(newhash,
1175                                       hash_expected,
1176                                       NULL, skb);
1177
1178         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1179                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1180                                      &iph->saddr, ntohs(th->source),
1181                                      &iph->daddr, ntohs(th->dest),
1182                                      genhash ? " tcp_v4_calc_md5_hash failed"
1183                                      : "");
1184                 return true;
1185         }
1186         return false;
1187 #endif
1188         return false;
1189 }
1190
1191 static void tcp_v4_init_req(struct request_sock *req,
1192                             const struct sock *sk_listener,
1193                             struct sk_buff *skb)
1194 {
1195         struct inet_request_sock *ireq = inet_rsk(req);
1196
1197         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1198         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1199         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1200         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
1201 }
1202
1203 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1204                                           struct flowi *fl,
1205                                           const struct request_sock *req,
1206                                           bool *strict)
1207 {
1208         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1209
1210         if (strict) {
1211                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1212                         *strict = true;
1213                 else
1214                         *strict = false;
1215         }
1216
1217         return dst;
1218 }
1219
1220 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1221         .family         =       PF_INET,
1222         .obj_size       =       sizeof(struct tcp_request_sock),
1223         .rtx_syn_ack    =       tcp_rtx_synack,
1224         .send_ack       =       tcp_v4_reqsk_send_ack,
1225         .destructor     =       tcp_v4_reqsk_destructor,
1226         .send_reset     =       tcp_v4_send_reset,
1227         .syn_ack_timeout =      tcp_syn_ack_timeout,
1228 };
1229
1230 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1231         .mss_clamp      =       TCP_MSS_DEFAULT,
1232 #ifdef CONFIG_TCP_MD5SIG
1233         .req_md5_lookup =       tcp_v4_md5_lookup,
1234         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1235 #endif
1236         .init_req       =       tcp_v4_init_req,
1237 #ifdef CONFIG_SYN_COOKIES
1238         .cookie_init_seq =      cookie_v4_init_sequence,
1239 #endif
1240         .route_req      =       tcp_v4_route_req,
1241         .init_seq       =       tcp_v4_init_sequence,
1242         .send_synack    =       tcp_v4_send_synack,
1243 };
1244
1245 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1246 {
1247         /* Never answer to SYNs send to broadcast or multicast */
1248         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1249                 goto drop;
1250
1251         return tcp_conn_request(&tcp_request_sock_ops,
1252                                 &tcp_request_sock_ipv4_ops, sk, skb);
1253
1254 drop:
1255         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1256         return 0;
1257 }
1258 EXPORT_SYMBOL(tcp_v4_conn_request);
1259
1260
1261 /*
1262  * The three way handshake has completed - we got a valid synack -
1263  * now create the new socket.
      *
      * Creates a child socket for @req off the listener @sk, copies the
      * addressing, IP options and (with CONFIG_TCP_MD5SIG) the peer's MD5 key
      * to it, and inserts it into the established hash in place of
      * @req_unhash.
      *
      * @dst may be NULL, in which case a route is looked up here.
      * *@own_req is set from the result of inet_ehash_nolisten(); it is only
      * written on the path that returns the new socket.
      *
      * Returns the new socket, or NULL on failure.  Every failure path
      * increments LINUX_MIB_LISTENDROPS; a full accept queue additionally
      * increments LINUX_MIB_LISTENOVERFLOWS.
1264  */
1265 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1266                                   struct request_sock *req,
1267                                   struct dst_entry *dst,
1268                                   struct request_sock *req_unhash,
1269                                   bool *own_req)
1270 {
1271         struct inet_request_sock *ireq;
1272         struct inet_sock *newinet;
1273         struct tcp_sock *newtp;
1274         struct sock *newsk;
1275 #ifdef CONFIG_TCP_MD5SIG
1276         struct tcp_md5sig_key *key;
1277 #endif
1278         struct ip_options_rcu *inet_opt;
1279
1280         if (sk_acceptq_is_full(sk))
1281                 goto exit_overflow;
1282
1283         newsk = tcp_create_openreq_child(sk, req, skb);
1284         if (!newsk)
1285                 goto exit_nonewsk;
1286
1287         newsk->sk_gso_type = SKB_GSO_TCPV4;
1288         inet_sk_rx_dst_set(newsk, skb);
1289
1290         newtp                 = tcp_sk(newsk);
1291         newinet               = inet_sk(newsk);
1292         ireq                  = inet_rsk(req);
1293         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1294         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1295         newinet->inet_saddr   = ireq->ir_loc_addr;
             /* From here on the child and the request point at the same
              * options block; exactly one of the two pointers is cleared
              * before this function returns.
              */
1296         inet_opt              = rcu_dereference(ireq->ireq_opt);
1297         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1298         newinet->mc_index     = inet_iif(skb);
1299         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1300         newinet->rcv_tos      = ip_hdr(skb)->tos;
1301         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1302         if (inet_opt)
1303                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1304         newinet->inet_id = newtp->write_seq ^ jiffies;
1305
1306         if (!dst) {
1307                 dst = inet_csk_route_child_sock(sk, newsk, req);
1308                 if (!dst)
1309                         goto put_and_exit;
1310         } else {
1311                 /* syncookie case : see end of cookie_v4_check() */
1312         }
1313         sk_setup_caps(newsk, dst);
1314
1315         tcp_ca_openreq_child(newsk, dst);
1316
1317         tcp_sync_mss(newsk, dst_mtu(dst));
1318         newtp->advmss = dst_metric_advmss(dst);
             /* A smaller user-configured MSS on the listener caps advmss. */
1319         if (tcp_sk(sk)->rx_opt.user_mss &&
1320             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1321                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1322
1323         tcp_initialize_rcv_mss(newsk);
1324
1325 #ifdef CONFIG_TCP_MD5SIG
1326         /* Copy over the MD5 key from the original socket */
1327         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1328                                 AF_INET);
1329         if (key) {
1330                 /*
1331                  * We're using one, so create a matching key
1332                  * on the newsk structure. If we fail to get
1333                  * memory, then we end up not copying the key
1334                  * across. Shucks.
1335                  */
1336                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1338                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1339         }
1340 #endif
1341
1342         if (__inet_inherit_port(sk, newsk) < 0)
1343                 goto put_and_exit;
1344         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1345         if (likely(*own_req)) {
1346                 tcp_move_syn(newtp, req);
                     /* The child keeps the options; detach them from the
                      * request so its destructor does not free them.
                      */
1347                 ireq->ireq_opt = NULL;
1348         } else {
                     /* The request keeps the options; the child must not
                      * reference them.
                      */
1349                 newinet->inet_opt = NULL;
1350         }
1351         return newsk;
1352
1353 exit_overflow:
1354         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1355 exit_nonewsk:
1356         dst_release(dst);
1357 exit:
1358         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1359         return NULL;
1360 put_and_exit:
             /* Drop the child's reference to the shared options before the
              * child is torn down; the request still points at them.
              */
1361         newinet->inet_opt = NULL;
1362         inet_csk_prepare_forced_close(newsk);
1363         tcp_done(newsk);
1364         goto exit;
1365 }
1366 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1367
/*
 * With CONFIG_SYN_COOKIES, run the syncookie check on any segment that
 * does not carry SYN and return its result; otherwise (or for SYN
 * segments) return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (!tcp_hdr(skb)->syn)
                return cookie_v4_check(sk, skb);
#endif
        return sk;
}
1378
1379 /* The socket must have its spinlock held when we get
1380  * here, unless it is a TCP_LISTEN socket.
1381  *
1382  * We have a potential double-lock case here, so even when
1383  * doing backlog processing we use the BH locking scheme.
1384  * This is because we cannot sleep with the original spinlock
1385  * held.
      *
      * Dispatches @skb according to the state of @sk:
      *  - TCP_ESTABLISHED: fast path, straight to tcp_rcv_established();
      *  - TCP_LISTEN: syncookie check, then either child processing or the
      *    generic state machine on the listener itself;
      *  - anything else: the generic state machine.
      * Segments failing the checksum are counted and freed.  When the state
      * machine or child processing reports failure a reset is sent and the
      * skb is freed.  Always returns 0.
1386  */
1387 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1388 {
1389         struct sock *rsk;
1390
1391         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1392                 struct dst_entry *dst = sk->sk_rx_dst;
1393
1394                 sock_rps_save_rxhash(sk, skb);
1395                 sk_mark_napi_id(sk, skb);
                     /* Drop the cached input route if the packet arrived on a
                      * different interface or the route's check callback
                      * rejects it.
                      */
1396                 if (dst) {
1397                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1398                             !dst->ops->check(dst, 0)) {
1399                                 dst_release(dst);
1400                                 sk->sk_rx_dst = NULL;
1401                         }
1402                 }
1403                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1404                 return 0;
1405         }
1406
1407         if (tcp_checksum_complete(skb))
1408                 goto csum_err;
1409
1410         if (sk->sk_state == TCP_LISTEN) {
1411                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412
1413                 if (!nsk)
1414                         goto discard;
                     /* A socket other than the listener came back: process
                      * the segment on that socket instead.
                      */
1415                 if (nsk != sk) {
1416                         sock_rps_save_rxhash(nsk, skb);
1417                         sk_mark_napi_id(nsk, skb);
1418                         if (tcp_child_process(sk, nsk, skb)) {
1419                                 rsk = nsk;
1420                                 goto reset;
1421                         }
1422                         return 0;
1423                 }
1424         } else
1425                 sock_rps_save_rxhash(sk, skb);
1426
1427         if (tcp_rcv_state_process(sk, skb)) {
1428                 rsk = sk;
1429                 goto reset;
1430         }
1431         return 0;
1432
1433 reset:
             /* rsk is the socket on whose behalf the reset is sent. */
1434         tcp_v4_send_reset(rsk, skb);
1435 discard:
1436         kfree_skb(skb);
1437         /* Be careful here. If this function gets more complicated and
1438          * gcc suffers from register pressure on the x86, sk (in %ebx)
1439          * might be destroyed here. This current version compiles correctly,
1440          * but you have been warned.
1441          */
1442         return 0;
1443
1444 csum_err:
1445         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1446         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1447         goto discard;
1448 }
1449 EXPORT_SYMBOL(tcp_v4_do_rcv);
1450
1451 void tcp_v4_early_demux(struct sk_buff *skb)
1452 {
1453         const struct iphdr *iph;
1454         const struct tcphdr *th;
1455         struct sock *sk;
1456
1457         if (skb->pkt_type != PACKET_HOST)
1458                 return;
1459
1460         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1461                 return;
1462
1463         iph = ip_hdr(skb);
1464         th = tcp_hdr(skb);
1465
1466         if (th->doff < sizeof(struct tcphdr) / 4)
1467                 return;
1468
1469         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1470                                        iph->saddr, th->source,
1471                                        iph->daddr, ntohs(th->dest),
1472                                        skb->skb_iif);
1473         if (sk) {
1474                 skb->sk = sk;
1475                 skb->destructor = sock_edemux;
1476                 if (sk_fullsock(sk)) {
1477                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1478
1479                         if (dst)
1480                                 dst = dst_check(dst, 0);
1481                         if (dst &&
1482                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1483                                 skb_dst_set_noref(skb, dst);
1484                 }
1485         }
1486 }
1487
1488 /* Packet is added to VJ-style prequeue for processing in process
1489  * context, if a reader task is waiting. Apparently, this exciting
1490  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1491  * failed somewhere. Latency? Burstiness? Well, at least now we will
1492  * see, why it failed. 8)8)                               --ANK
1493  *
      * Returns false when the skb was not queued: sysctl_tcp_low_latency is
      * set, no task is recorded in tp->ucopy.task, or the segment carries no
      * payload while the prequeue is empty.  Returns true once the skb has
      * been put on the prequeue, including the case where the whole prequeue
      * is flushed right away because it outgrew sk->sk_rcvbuf.
1494  */
1495 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1496 {
1497         struct tcp_sock *tp = tcp_sk(sk);
1498
1499         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1500                 return false;
1501
             /* Header-only segment and nothing queued yet: do not prequeue. */
1502         if (skb->len <= tcp_hdrlen(skb) &&
1503             skb_queue_len(&tp->ucopy.prequeue) == 0)
1504                 return false;
1505
1506         /* Before escaping RCU protected region, we need to take care of skb
1507          * dst. Prequeue is only enabled for established sockets.
1508          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1509          * Instead of doing full sk_rx_dst validity here, let's perform
1510          * an optimistic check.
1511          */
1512         if (likely(sk->sk_rx_dst))
1513                 skb_dst_drop(skb);
1514         else
1515                 skb_dst_force_safe(skb);
1516
1517         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1518         tp->ucopy.memory += skb->truesize;
1519         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1520                 struct sk_buff *skb1;
1521
1522                 BUG_ON(sock_owned_by_user(sk));
1523
                     /* Over budget: process everything queued so far right
                      * here, counting each skb in TCPPREQUEUEDROPPED.
                      */
1524                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1525                         sk_backlog_rcv(sk, skb1);
1526                         NET_INC_STATS_BH(sock_net(sk),
1527                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1528                 }
1529
1530                 tp->ucopy.memory = 0;
1531         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
                     /* First skb on the prequeue: wake the sleeping reader
                      * and, unless an ACK is already scheduled, arm the
                      * delayed-ACK timer.
                      */
1532                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1533                                            POLLIN | POLLRDNORM | POLLRDBAND);
1534                 if (!inet_csk_ack_scheduled(sk))
1535                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1536                                                   (3 * tcp_rto_min(sk)) / 4,
1537                                                   TCP_RTO_MAX);
1538         }
1539         return true;
1540 }
1541 EXPORT_SYMBOL(tcp_prequeue);
1542
1543 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1544 {
1545         struct tcphdr *th = (struct tcphdr *)skb->data;
1546         unsigned int eaten = skb->len;
1547         int err;
1548
1549         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1550         if (!err) {
1551                 eaten -= skb->len;
1552                 TCP_SKB_CB(skb)->end_seq -= eaten;
1553         }
1554         return err;
1555 }
1556 EXPORT_SYMBOL(tcp_filter);
1557
1558 /*
1559  *      From tcp_input.c
1560  */
1561
1562 int tcp_v4_rcv(struct sk_buff *skb)
1563 {
1564         const struct iphdr *iph;
1565         const struct tcphdr *th;
1566         struct sock *sk;
1567         int ret;
1568         struct net *net = dev_net(skb->dev);
1569
1570         if (skb->pkt_type != PACKET_HOST)
1571                 goto discard_it;
1572
1573         /* Count it even if it's bad */
1574         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1575
1576         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1577                 goto discard_it;
1578
1579         th = tcp_hdr(skb);
1580
1581         if (th->doff < sizeof(struct tcphdr) / 4)
1582                 goto bad_packet;
1583         if (!pskb_may_pull(skb, th->doff * 4))
1584                 goto discard_it;
1585
1586         /* An explanation is required here, I think.
1587          * Packet length and doff are validated by header prediction,
1588          * provided case of th->doff==0 is eliminated.
1589          * So, we defer the checks. */
1590
1591         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1592                 goto csum_error;
1593
1594         th = tcp_hdr(skb);
1595         iph = ip_hdr(skb);
1596         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1597          * barrier() makes sure compiler wont play fool^Waliasing games.
1598          */
1599         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1600                 sizeof(struct inet_skb_parm));
1601         barrier();
1602
1603         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1604         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1605                                     skb->len - th->doff * 4);
1606         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1607         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1608         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1609         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1610         TCP_SKB_CB(skb)->sacked  = 0;
1611
1612 lookup:
1613         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1614         if (!sk)
1615                 goto no_tcp_socket;
1616
1617 process:
1618         if (sk->sk_state == TCP_TIME_WAIT)
1619                 goto do_time_wait;
1620
1621         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1622                 struct request_sock *req = inet_reqsk(sk);
1623                 struct sock *nsk;
1624
1625                 sk = req->rsk_listener;
1626                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1627                         reqsk_put(req);
1628                         goto discard_it;
1629                 }
1630                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1631                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1632                         goto lookup;
1633                 }
1634                 sock_hold(sk);
1635                 nsk = tcp_check_req(sk, skb, req, false);
1636                 if (!nsk) {
1637                         reqsk_put(req);
1638                         goto discard_and_relse;
1639                 }
1640                 if (nsk == sk) {
1641                         reqsk_put(req);
1642                 } else if (tcp_child_process(sk, nsk, skb)) {
1643                         tcp_v4_send_reset(nsk, skb);
1644                         goto discard_and_relse;
1645                 } else {
1646                         sock_put(sk);
1647                         return 0;
1648                 }
1649         }
1650         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1651                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1652                 goto discard_and_relse;
1653         }
1654
1655         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1656                 goto discard_and_relse;
1657
1658         if (tcp_v4_inbound_md5_hash(sk, skb))
1659                 goto discard_and_relse;
1660
1661         nf_reset(skb);
1662
1663         if (tcp_filter(sk, skb))
1664                 goto discard_and_relse;
1665         th = (const struct tcphdr *)skb->data;
1666         iph = ip_hdr(skb);
1667
1668         skb->dev = NULL;
1669
1670         if (sk->sk_state == TCP_LISTEN) {
1671                 ret = tcp_v4_do_rcv(sk, skb);
1672                 goto put_and_return;
1673         }
1674
1675         sk_incoming_cpu_update(sk);
1676
1677         bh_lock_sock_nested(sk);
1678         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1679         ret = 0;
1680         if (!sock_owned_by_user(sk)) {
1681                 if (!tcp_prequeue(sk, skb))
1682                         ret = tcp_v4_do_rcv(sk, skb);
1683         } else if (unlikely(sk_add_backlog(sk, skb,
1684                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1685                 bh_unlock_sock(sk);
1686                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1687                 goto discard_and_relse;
1688         }
1689         bh_unlock_sock(sk);
1690
1691 put_and_return:
1692         sock_put(sk);
1693
1694         return ret;
1695
1696 no_tcp_socket:
1697         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1698                 goto discard_it;
1699
1700         if (tcp_checksum_complete(skb)) {
1701 csum_error:
1702                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1703 bad_packet:
1704                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1705         } else {
1706                 tcp_v4_send_reset(NULL, skb);
1707         }
1708
1709 discard_it:
1710         /* Discard frame. */
1711         kfree_skb(skb);
1712         return 0;
1713
1714 discard_and_relse:
1715         sock_put(sk);
1716         goto discard_it;
1717
1718 do_time_wait:
1719         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1720                 inet_twsk_put(inet_twsk(sk));
1721                 goto discard_it;
1722         }
1723
1724         if (tcp_checksum_complete(skb)) {
1725                 inet_twsk_put(inet_twsk(sk));
1726                 goto csum_error;
1727         }
1728         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1729         case TCP_TW_SYN: {
1730                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1731                                                         &tcp_hashinfo,
1732                                                         iph->saddr, th->source,
1733                                                         iph->daddr, th->dest,
1734                                                         inet_iif(skb));
1735                 if (sk2) {
1736                         inet_twsk_deschedule_put(inet_twsk(sk));
1737                         sk = sk2;
1738                         goto process;
1739                 }
1740                 /* Fall through to ACK */
1741         }
1742         case TCP_TW_ACK:
1743                 tcp_v4_timewait_ack(sk, skb);
1744                 break;
1745         case TCP_TW_RST:
1746                 goto no_tcp_socket;
1747         case TCP_TW_SUCCESS:;
1748         }
1749         goto discard_it;
1750 }
1751
1752 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1753         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1754         .twsk_unique    = tcp_twsk_unique,
1755         .twsk_destructor= tcp_twsk_destructor,
1756 };
1757
1758 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1759 {
1760         struct dst_entry *dst = skb_dst(skb);
1761
1762         if (dst && dst_hold_safe(dst)) {
1763                 sk->sk_rx_dst = dst;
1764                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1765         }
1766 }
1767 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1768
1769 const struct inet_connection_sock_af_ops ipv4_specific = {
1770         .queue_xmit        = ip_queue_xmit,
1771         .send_check        = tcp_v4_send_check,
1772         .rebuild_header    = inet_sk_rebuild_header,
1773         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1774         .conn_request      = tcp_v4_conn_request,
1775         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1776         .net_header_len    = sizeof(struct iphdr),
1777         .setsockopt        = ip_setsockopt,
1778         .getsockopt        = ip_getsockopt,
1779         .addr2sockaddr     = inet_csk_addr2sockaddr,
1780         .sockaddr_len      = sizeof(struct sockaddr_in),
1781         .bind_conflict     = inet_csk_bind_conflict,
1782 #ifdef CONFIG_COMPAT
1783         .compat_setsockopt = compat_ip_setsockopt,
1784         .compat_getsockopt = compat_ip_getsockopt,
1785 #endif
1786         .mtu_reduced       = tcp_v4_mtu_reduced,
1787 };
1788 EXPORT_SYMBOL(ipv4_specific);
1789
#ifdef CONFIG_TCP_MD5SIG
/* MD5 signature hooks for IPv4 TCP sockets; only built with MD5SIG support. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
1797
1798 /* NOTE: A lot of things set to zero explicitly by call to
1799  *       sk_alloc() so need not be done here.
1800  */
1801 static int tcp_v4_init_sock(struct sock *sk)
1802 {
1803         struct inet_connection_sock *icsk = inet_csk(sk);
1804
1805         tcp_init_sock(sk);
1806
1807         icsk->icsk_af_ops = &ipv4_specific;
1808
1809 #ifdef CONFIG_TCP_MD5SIG
1810         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1811 #endif
1812
1813         return 0;
1814 }
1815
1816 void tcp_v4_destroy_sock(struct sock *sk)
1817 {
1818         struct tcp_sock *tp = tcp_sk(sk);
1819
1820         tcp_clear_xmit_timers(sk);
1821
1822         tcp_cleanup_congestion_control(sk);
1823
1824         /* Cleanup up the write buffer. */
1825         tcp_write_queue_purge(sk);
1826
1827         /* Cleans up our, hopefully empty, out_of_order_queue. */
1828         __skb_queue_purge(&tp->out_of_order_queue);
1829
1830 #ifdef CONFIG_TCP_MD5SIG
1831         /* Clean up the MD5 key list, if any */
1832         if (tp->md5sig_info) {
1833                 tcp_clear_md5_list(sk);
1834                 kfree_rcu(tp->md5sig_info, rcu);
1835                 tp->md5sig_info = NULL;
1836         }
1837 #endif
1838
1839         /* Clean prequeue, it must be empty really */
1840         __skb_queue_purge(&tp->ucopy.prequeue);
1841
1842         /* Clean up a referenced TCP bind bucket. */
1843         if (inet_csk(sk)->icsk_bind_hash)
1844                 inet_put_port(sk);
1845
1846         BUG_ON(tp->fastopen_rsk);
1847
1848         /* If socket is aborted during connect operation */
1849         tcp_free_fastopen_req(tp);
1850         tcp_saved_syn_free(tp);
1851
1852         sk_sockets_allocated_dec(sk);
1853         sock_release_memcg(sk);
1854 }
1855 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1856
1857 #ifdef CONFIG_PROC_FS
1858 /* Proc filesystem TCP sock list dumping. */
1859
1860 /*
1861  * Get next listener socket follow cur.  If cur is NULL, get first socket
1862  * starting from bucket given in st->bucket; when st->bucket is zero the
1863  * very first socket in the hash table is returned.
1864  */
1865 static void *listening_get_next(struct seq_file *seq, void *cur)
1866 {
1867         struct inet_connection_sock *icsk;
1868         struct hlist_nulls_node *node;
1869         struct sock *sk = cur;
1870         struct inet_listen_hashbucket *ilb;
1871         struct tcp_iter_state *st = seq->private;
1872         struct net *net = seq_file_net(seq);
1873
1874         if (!sk) {
1875                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876                 spin_lock_bh(&ilb->lock);
1877                 sk = sk_nulls_head(&ilb->head);
1878                 st->offset = 0;
1879                 goto get_sk;
1880         }
1881         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1882         ++st->num;
1883         ++st->offset;
1884
1885         sk = sk_nulls_next(sk);
1886 get_sk:
1887         sk_nulls_for_each_from(sk, node) {
1888                 if (!net_eq(sock_net(sk), net))
1889                         continue;
1890                 if (sk->sk_family == st->family) {
1891                         cur = sk;
1892                         goto out;
1893                 }
1894                 icsk = inet_csk(sk);
1895         }
1896         spin_unlock_bh(&ilb->lock);
1897         st->offset = 0;
1898         if (++st->bucket < INET_LHTABLE_SIZE) {
1899                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1900                 spin_lock_bh(&ilb->lock);
1901                 sk = sk_nulls_head(&ilb->head);
1902                 goto get_sk;
1903         }
1904         cur = NULL;
1905 out:
1906         return cur;
1907 }
1908
1909 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1910 {
1911         struct tcp_iter_state *st = seq->private;
1912         void *rc;
1913
1914         st->bucket = 0;
1915         st->offset = 0;
1916         rc = listening_get_next(seq, NULL);
1917
1918         while (rc && *pos) {
1919                 rc = listening_get_next(seq, rc);
1920                 --*pos;
1921         }
1922         return rc;
1923 }
1924
1925 static inline bool empty_bucket(const struct tcp_iter_state *st)
1926 {
1927         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1928 }
1929
1930 /*
1931  * Get first established socket starting from bucket given in st->bucket.
1932  * If st->bucket is zero, the very first socket in the hash is returned.
1933  */
1934 static void *established_get_first(struct seq_file *seq)
1935 {
1936         struct tcp_iter_state *st = seq->private;
1937         struct net *net = seq_file_net(seq);
1938         void *rc = NULL;
1939
1940         st->offset = 0;
1941         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1942                 struct sock *sk;
1943                 struct hlist_nulls_node *node;
1944                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1945
1946                 /* Lockless fast path for the common case of empty buckets */
1947                 if (empty_bucket(st))
1948                         continue;
1949
1950                 spin_lock_bh(lock);
1951                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1952                         if (sk->sk_family != st->family ||
1953                             !net_eq(sock_net(sk), net)) {
1954                                 continue;
1955                         }
1956                         rc = sk;
1957                         goto out;
1958                 }
1959                 spin_unlock_bh(lock);
1960         }
1961 out:
1962         return rc;
1963 }
1964
1965 static void *established_get_next(struct seq_file *seq, void *cur)
1966 {
1967         struct sock *sk = cur;
1968         struct hlist_nulls_node *node;
1969         struct tcp_iter_state *st = seq->private;
1970         struct net *net = seq_file_net(seq);
1971
1972         ++st->num;
1973         ++st->offset;
1974
1975         sk = sk_nulls_next(sk);
1976
1977         sk_nulls_for_each_from(sk, node) {
1978                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1979                         return sk;
1980         }
1981
1982         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1983         ++st->bucket;
1984         return established_get_first(seq);
1985 }
1986
1987 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1988 {
1989         struct tcp_iter_state *st = seq->private;
1990         void *rc;
1991
1992         st->bucket = 0;
1993         rc = established_get_first(seq);
1994
1995         while (rc && pos) {
1996                 rc = established_get_next(seq, rc);
1997                 --pos;
1998         }
1999         return rc;
2000 }
2001
2002 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2003 {
2004         void *rc;
2005         struct tcp_iter_state *st = seq->private;
2006
2007         st->state = TCP_SEQ_STATE_LISTENING;
2008         rc        = listening_get_idx(seq, &pos);
2009
2010         if (!rc) {
2011                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2012                 rc        = established_get_idx(seq, pos);
2013         }
2014
2015         return rc;
2016 }
2017
2018 static void *tcp_seek_last_pos(struct seq_file *seq)
2019 {
2020         struct tcp_iter_state *st = seq->private;
2021         int offset = st->offset;
2022         int orig_num = st->num;
2023         void *rc = NULL;
2024
2025         switch (st->state) {
2026         case TCP_SEQ_STATE_LISTENING:
2027                 if (st->bucket >= INET_LHTABLE_SIZE)
2028                         break;
2029                 st->state = TCP_SEQ_STATE_LISTENING;
2030                 rc = listening_get_next(seq, NULL);
2031                 while (offset-- && rc)
2032                         rc = listening_get_next(seq, rc);
2033                 if (rc)
2034                         break;
2035                 st->bucket = 0;
2036                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2037                 /* Fallthrough */
2038         case TCP_SEQ_STATE_ESTABLISHED:
2039                 if (st->bucket > tcp_hashinfo.ehash_mask)
2040                         break;
2041                 rc = established_get_first(seq);
2042                 while (offset-- && rc)
2043                         rc = established_get_next(seq, rc);
2044         }
2045
2046         st->num = orig_num;
2047
2048         return rc;
2049 }
2050
2051 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2052 {
2053         struct tcp_iter_state *st = seq->private;
2054         void *rc;
2055
2056         if (*pos && *pos == st->last_pos) {
2057                 rc = tcp_seek_last_pos(seq);
2058                 if (rc)
2059                         goto out;
2060         }
2061
2062         st->state = TCP_SEQ_STATE_LISTENING;
2063         st->num = 0;
2064         st->bucket = 0;
2065         st->offset = 0;
2066         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2067
2068 out:
2069         st->last_pos = *pos;
2070         return rc;
2071 }
2072
2073 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076         void *rc = NULL;
2077
2078         if (v == SEQ_START_TOKEN) {
2079                 rc = tcp_get_idx(seq, 0);
2080                 goto out;
2081         }
2082
2083         switch (st->state) {
2084         case TCP_SEQ_STATE_LISTENING:
2085                 rc = listening_get_next(seq, v);
2086                 if (!rc) {
2087                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2088                         st->bucket = 0;
2089                         st->offset = 0;
2090                         rc        = established_get_first(seq);
2091                 }
2092                 break;
2093         case TCP_SEQ_STATE_ESTABLISHED:
2094                 rc = established_get_next(seq, v);
2095                 break;
2096         }
2097 out:
2098         ++*pos;
2099         st->last_pos = *pos;
2100         return rc;
2101 }
2102
2103 static void tcp_seq_stop(struct seq_file *seq, void *v)
2104 {
2105         struct tcp_iter_state *st = seq->private;
2106
2107         switch (st->state) {
2108         case TCP_SEQ_STATE_LISTENING:
2109                 if (v != SEQ_START_TOKEN)
2110                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2111                 break;
2112         case TCP_SEQ_STATE_ESTABLISHED:
2113                 if (v)
2114                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2115                 break;
2116         }
2117 }
2118
2119 int tcp_seq_open(struct inode *inode, struct file *file)
2120 {
2121         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2122         struct tcp_iter_state *s;
2123         int err;
2124
2125         err = seq_open_net(inode, file, &afinfo->seq_ops,
2126                           sizeof(struct tcp_iter_state));
2127         if (err < 0)
2128                 return err;
2129
2130         s = ((struct seq_file *)file->private_data)->private;
2131         s->family               = afinfo->family;
2132         s->last_pos             = 0;
2133         return 0;
2134 }
2135 EXPORT_SYMBOL(tcp_seq_open);
2136
2137 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2138 {
2139         int rc = 0;
2140         struct proc_dir_entry *p;
2141
2142         afinfo->seq_ops.start           = tcp_seq_start;
2143         afinfo->seq_ops.next            = tcp_seq_next;
2144         afinfo->seq_ops.stop            = tcp_seq_stop;
2145
2146         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2147                              afinfo->seq_fops, afinfo);
2148         if (!p)
2149                 rc = -ENOMEM;
2150         return rc;
2151 }
2152 EXPORT_SYMBOL(tcp_proc_register);
2153
2154 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2155 {
2156         remove_proc_entry(afinfo->name, net->proc_net);
2157 }
2158 EXPORT_SYMBOL(tcp_proc_unregister);
2159
2160 static void get_openreq4(const struct request_sock *req,
2161                          struct seq_file *f, int i)
2162 {
2163         const struct inet_request_sock *ireq = inet_rsk(req);
2164         long delta = req->rsk_timer.expires - jiffies;
2165
2166         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2167                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2168                 i,
2169                 ireq->ir_loc_addr,
2170                 ireq->ir_num,
2171                 ireq->ir_rmt_addr,
2172                 ntohs(ireq->ir_rmt_port),
2173                 TCP_SYN_RECV,
2174                 0, 0, /* could print option size, but that is af dependent. */
2175                 1,    /* timers active (only the expire timer) */
2176                 jiffies_delta_to_clock_t(delta),
2177                 req->num_timeout,
2178                 from_kuid_munged(seq_user_ns(f),
2179                                  sock_i_uid(req->rsk_listener)),
2180                 0,  /* non standard timer */
2181                 0, /* open_requests have no inode */
2182                 0,
2183                 req);
2184 }
2185
2186 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2187 {
2188         int timer_active;
2189         unsigned long timer_expires;
2190         const struct tcp_sock *tp = tcp_sk(sk);
2191         const struct inet_connection_sock *icsk = inet_csk(sk);
2192         const struct inet_sock *inet = inet_sk(sk);
2193         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2194         __be32 dest = inet->inet_daddr;
2195         __be32 src = inet->inet_rcv_saddr;
2196         __u16 destp = ntohs(inet->inet_dport);
2197         __u16 srcp = ntohs(inet->inet_sport);
2198         int rx_queue;
2199         int state;
2200
2201         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2202             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2203             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2204                 timer_active    = 1;
2205                 timer_expires   = icsk->icsk_timeout;
2206         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2207                 timer_active    = 4;
2208                 timer_expires   = icsk->icsk_timeout;
2209         } else if (timer_pending(&sk->sk_timer)) {
2210                 timer_active    = 2;
2211                 timer_expires   = sk->sk_timer.expires;
2212         } else {
2213                 timer_active    = 0;
2214                 timer_expires = jiffies;
2215         }
2216
2217         state = sk_state_load(sk);
2218         if (state == TCP_LISTEN)
2219                 rx_queue = sk->sk_ack_backlog;
2220         else
2221                 /* Because we don't lock the socket,
2222                  * we might find a transient negative value.
2223                  */
2224                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2225
2226         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2227                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2228                 i, src, srcp, dest, destp, state,
2229                 tp->write_seq - tp->snd_una,
2230                 rx_queue,
2231                 timer_active,
2232                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2233                 icsk->icsk_retransmits,
2234                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2235                 icsk->icsk_probes_out,
2236                 sock_i_ino(sk),
2237                 atomic_read(&sk->sk_refcnt), sk,
2238                 jiffies_to_clock_t(icsk->icsk_rto),
2239                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2240                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2241                 tp->snd_cwnd,
2242                 state == TCP_LISTEN ?
2243                     fastopenq->max_qlen :
2244                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2245 }
2246
2247 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2248                                struct seq_file *f, int i)
2249 {
2250         long delta = tw->tw_timer.expires - jiffies;
2251         __be32 dest, src;
2252         __u16 destp, srcp;
2253
2254         dest  = tw->tw_daddr;
2255         src   = tw->tw_rcv_saddr;
2256         destp = ntohs(tw->tw_dport);
2257         srcp  = ntohs(tw->tw_sport);
2258
2259         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2260                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2261                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2262                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2263                 atomic_read(&tw->tw_refcnt), tw);
2264 }
2265
2266 #define TMPSZ 150
2267
2268 static int tcp4_seq_show(struct seq_file *seq, void *v)
2269 {
2270         struct tcp_iter_state *st;
2271         struct sock *sk = v;
2272
2273         seq_setwidth(seq, TMPSZ - 1);
2274         if (v == SEQ_START_TOKEN) {
2275                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2276                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2277                            "inode");
2278                 goto out;
2279         }
2280         st = seq->private;
2281
2282         if (sk->sk_state == TCP_TIME_WAIT)
2283                 get_timewait4_sock(v, seq, st->num);
2284         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2285                 get_openreq4(v, seq, st->num);
2286         else
2287                 get_tcp4_sock(v, seq, st->num);
2288 out:
2289         seq_pad(seq, '\n');
2290         return 0;
2291 }
2292
2293 static const struct file_operations tcp_afinfo_seq_fops = {
2294         .owner   = THIS_MODULE,
2295         .open    = tcp_seq_open,
2296         .read    = seq_read,
2297         .llseek  = seq_lseek,
2298         .release = seq_release_net
2299 };
2300
2301 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2302         .name           = "tcp",
2303         .family         = AF_INET,
2304         .seq_fops       = &tcp_afinfo_seq_fops,
2305         .seq_ops        = {
2306                 .show           = tcp4_seq_show,
2307         },
2308 };
2309
2310 static int __net_init tcp4_proc_init_net(struct net *net)
2311 {
2312         return tcp_proc_register(net, &tcp4_seq_afinfo);
2313 }
2314
2315 static void __net_exit tcp4_proc_exit_net(struct net *net)
2316 {
2317         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2318 }
2319
2320 static struct pernet_operations tcp4_net_ops = {
2321         .init = tcp4_proc_init_net,
2322         .exit = tcp4_proc_exit_net,
2323 };
2324
2325 int __init tcp4_proc_init(void)
2326 {
2327         return register_pernet_subsys(&tcp4_net_ops);
2328 }
2329
2330 void tcp4_proc_exit(void)
2331 {
2332         unregister_pernet_subsys(&tcp4_net_ops);
2333 }
2334 #endif /* CONFIG_PROC_FS */
2335
2336 struct proto tcp_prot = {
2337         .name                   = "TCP",
2338         .owner                  = THIS_MODULE,
2339         .close                  = tcp_close,
2340         .connect                = tcp_v4_connect,
2341         .disconnect             = tcp_disconnect,
2342         .accept                 = inet_csk_accept,
2343         .ioctl                  = tcp_ioctl,
2344         .init                   = tcp_v4_init_sock,
2345         .destroy                = tcp_v4_destroy_sock,
2346         .shutdown               = tcp_shutdown,
2347         .setsockopt             = tcp_setsockopt,
2348         .getsockopt             = tcp_getsockopt,
2349         .recvmsg                = tcp_recvmsg,
2350         .sendmsg                = tcp_sendmsg,
2351         .sendpage               = tcp_sendpage,
2352         .backlog_rcv            = tcp_v4_do_rcv,
2353         .release_cb             = tcp_release_cb,
2354         .hash                   = inet_hash,
2355         .unhash                 = inet_unhash,
2356         .get_port               = inet_csk_get_port,
2357         .enter_memory_pressure  = tcp_enter_memory_pressure,
2358         .stream_memory_free     = tcp_stream_memory_free,
2359         .sockets_allocated      = &tcp_sockets_allocated,
2360         .orphan_count           = &tcp_orphan_count,
2361         .memory_allocated       = &tcp_memory_allocated,
2362         .memory_pressure        = &tcp_memory_pressure,
2363         .sysctl_mem             = sysctl_tcp_mem,
2364         .sysctl_wmem            = sysctl_tcp_wmem,
2365         .sysctl_rmem            = sysctl_tcp_rmem,
2366         .max_header             = MAX_TCP_HEADER,
2367         .obj_size               = sizeof(struct tcp_sock),
2368         .slab_flags             = SLAB_DESTROY_BY_RCU,
2369         .twsk_prot              = &tcp_timewait_sock_ops,
2370         .rsk_prot               = &tcp_request_sock_ops,
2371         .h.hashinfo             = &tcp_hashinfo,
2372         .no_autobind            = true,
2373 #ifdef CONFIG_COMPAT
2374         .compat_setsockopt      = compat_tcp_setsockopt,
2375         .compat_getsockopt      = compat_tcp_getsockopt,
2376 #endif
2377 #ifdef CONFIG_MEMCG_KMEM
2378         .init_cgroup            = tcp_init_cgroup,
2379         .destroy_cgroup         = tcp_destroy_cgroup,
2380         .proto_cgroup           = tcp_proto_cgroup,
2381 #endif
2382 };
2383 EXPORT_SYMBOL(tcp_prot);
2384
2385 static void __net_exit tcp_sk_exit(struct net *net)
2386 {
2387         int cpu;
2388
2389         for_each_possible_cpu(cpu)
2390                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2391         free_percpu(net->ipv4.tcp_sk);
2392 }
2393
2394 static int __net_init tcp_sk_init(struct net *net)
2395 {
2396         int res, cpu;
2397
2398         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2399         if (!net->ipv4.tcp_sk)
2400                 return -ENOMEM;
2401
2402         for_each_possible_cpu(cpu) {
2403                 struct sock *sk;
2404
2405                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2406                                            IPPROTO_TCP, net);
2407                 if (res)
2408                         goto fail;
2409                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2410         }
2411
2412         net->ipv4.sysctl_tcp_ecn = 2;
2413         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2414
2415         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2416         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2417         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2418
2419         return 0;
2420 fail:
2421         tcp_sk_exit(net);
2422
2423         return res;
2424 }
2425
2426 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2427 {
2428         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2429 }
2430
2431 static struct pernet_operations __net_initdata tcp_sk_ops = {
2432        .init       = tcp_sk_init,
2433        .exit       = tcp_sk_exit,
2434        .exit_batch = tcp_sk_exit_batch,
2435 };
2436
2437 void __init tcp_v4_init(void)
2438 {
2439         inet_hashinfo_init(&tcp_hashinfo);
2440         if (register_pernet_subsys(&tcp_sk_ops))
2441                 panic("Failed to create the TCP control socket.\n");
2442 }