net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63
  64 #include <net/net_namespace.h>
  65 #include <net/icmp.h>
  66 #include <net/inet_hashtables.h>
  67 #include <net/tcp.h>
  68 #include <net/transp_v6.h>
  69 #include <net/ipv6.h>
  70 #include <net/inet_common.h>
  71 #include <net/timewait_sock.h>
  72 #include <net/xfrm.h>
  73 #include <net/netdma.h>
  74 #include <net/secure_seq.h>
  75
  76 #include <linux/inet.h>
  77 #include <linux/ipv6.h>
  78 #include <linux/stddef.h>
  79 #include <linux/proc_fs.h>
  80 #include <linux/seq_file.h>
  81
  82 #include <linux/crypto.h>
  83 #include <linux/scatterlist.h>
  84
/* Tunable consulted by tcp_twsk_unique(): when a caller passes a non-NULL
 * twp, a TIME-WAIT four-tuple is only handed over if this is non-zero and
 * the recorded timestamp is more than one second old.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* NOTE(review): not referenced in the visible part of this file; presumably
 * a sysctl read elsewhere -- confirm against its users.
 */
int sysctl_tcp_low_latency __read_mostly;


#ifdef CONFIG_TCP_MD5SIG
/* Forward declarations: the RST/ACK reply builders below call these MD5
 * signature helpers before their definitions appear.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
/* Without CONFIG_TCP_MD5SIG the lookup never finds a key. */
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

/* Hash tables handed to inet_lookup() when matching segments to sockets. */
struct inet_hashinfo tcp_hashinfo;
 103
 104 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 105 {
 106         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 107                                           ip_hdr(skb)->saddr,
 108                                           tcp_hdr(skb)->dest,
 109                                           tcp_hdr(skb)->source);
 110 }
 111
 112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 113 {
 114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115         struct tcp_sock *tp = tcp_sk(sk);
 116
 117         /* With PAWS, it is safe from the viewpoint
 118            of data integrity. Even without PAWS it is safe provided sequence
 119            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 120
 121            Actually, the idea is close to VJ's one, only timestamp cache is
 122            held not per host, but per port pair and TW bucket is used as state
 123            holder.
 124
 125            If TW bucket has been already destroyed we fall back to VJ's scheme
 126            and use initial timestamp retrieved from peer table.
 127          */
 128         if (tcptw->tw_ts_recent_stamp &&
 129             (twp == NULL || (sysctl_tcp_tw_reuse &&
 130                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 131                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 132                 if (tp->write_seq == 0)
 133                         tp->write_seq = 1;
 134                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 135                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 136                 sock_hold(sktw);
 137                 return 1;
 138         }
 139
 140         return 0;
 141 }
 142
 143 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 144
 145 /* This will initiate an outgoing connection. */
 146 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 147 {
 148         struct inet_sock *inet = inet_sk(sk);
 149         struct tcp_sock *tp = tcp_sk(sk);
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct rtable *rt;
 152         __be32 daddr, nexthop;
 153         int tmp;
 154         int err;
 155
 156         if (addr_len < sizeof(struct sockaddr_in))
 157                 return -EINVAL;
 158
 159         if (usin->sin_family != AF_INET)
 160                 return -EAFNOSUPPORT;
 161
 162         nexthop = daddr = usin->sin_addr.s_addr;
 163         if (inet->opt && inet->opt->srr) {
 164                 if (!daddr)
 165                         return -EINVAL;
 166                 nexthop = inet->opt->faddr;
 167         }
 168
 169         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 170                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 171                                IPPROTO_TCP,
 172                                inet->sport, usin->sin_port, sk, 1);
 173         if (tmp < 0) {
 174                 if (tmp == -ENETUNREACH)
 175                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 176                 return tmp;
 177         }
 178
 179         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 180                 ip_rt_put(rt);
 181                 return -ENETUNREACH;
 182         }
 183
 184         if (!inet->opt || !inet->opt->srr)
 185                 daddr = rt->rt_dst;
 186
 187         if (!inet->saddr)
 188                 inet->saddr = rt->rt_src;
 189         inet->rcv_saddr = inet->saddr;
 190
 191         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 192                 /* Reset inherited state */
 193                 tp->rx_opt.ts_recent       = 0;
 194                 tp->rx_opt.ts_recent_stamp = 0;
 195                 tp->write_seq              = 0;
 196         }
 197
 198         if (tcp_death_row.sysctl_tw_recycle &&
 199             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 200                 struct inet_peer *peer = rt_get_peer(rt);
 201                 /*
 202                  * VJ's idea. We save last timestamp seen from
 203                  * the destination in peer table, when entering state
 204                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 205                  * when trying new connection.
 206                  */
 207                 if (peer != NULL &&
 208                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
 209                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 210                         tp->rx_opt.ts_recent = peer->tcp_ts;
 211                 }
 212         }
 213
 214         inet->dport = usin->sin_port;
 215         inet->daddr = daddr;
 216
 217         inet_csk(sk)->icsk_ext_hdr_len = 0;
 218         if (inet->opt)
 219                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 220
 221         tp->rx_opt.mss_clamp = 536;
 222
 223         /* Socket identity is still unknown (sport may be zero).
 224          * However we set state to SYN-SENT and not releasing socket
 225          * lock select source port, enter ourselves into the hash tables and
 226          * complete initialization after this.
 227          */
 228         tcp_set_state(sk, TCP_SYN_SENT);
 229         err = inet_hash_connect(&tcp_death_row, sk);
 230         if (err)
 231                 goto failure;
 232
 233         err = ip_route_newports(&rt, IPPROTO_TCP,
 234                                 inet->sport, inet->dport, sk);
 235         if (err)
 236                 goto failure;
 237
 238         /* OK, now commit destination to socket.  */
 239         sk->sk_gso_type = SKB_GSO_TCPV4;
 240         sk_setup_caps(sk, &rt->u.dst);
 241
 242         if (!tp->write_seq)
 243                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 244                                                            inet->daddr,
 245                                                            inet->sport,
 246                                                            usin->sin_port);
 247
 248         inet->id = tp->write_seq ^ jiffies;
 249
 250         err = tcp_connect(sk);
 251         rt = NULL;
 252         if (err)
 253                 goto failure;
 254
 255         return 0;
 256
 257 failure:
 258         /*
 259          * This unhashes the socket and releases the local port,
 260          * if necessary.
 261          */
 262         tcp_set_state(sk, TCP_CLOSE);
 263         ip_rt_put(rt);
 264         sk->sk_route_caps = 0;
 265         inet->dport = 0;
 266         return err;
 267 }
 268
 269 /*
 270  * This routine does path mtu discovery as defined in RFC1191.
 271  */
 272 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 273 {
 274         struct dst_entry *dst;
 275         struct inet_sock *inet = inet_sk(sk);
 276
 277         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 278          * send out by Linux are always <576bytes so they should go through
 279          * unfragmented).
 280          */
 281         if (sk->sk_state == TCP_LISTEN)
 282                 return;
 283
 284         /* We don't check in the destentry if pmtu discovery is forbidden
 285          * on this route. We just assume that no packet_to_big packets
 286          * are send back when pmtu discovery is not active.
 287          * There is a small race when the user changes this flag in the
 288          * route, but I think that's acceptable.
 289          */
 290         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 291                 return;
 292
 293         dst->ops->update_pmtu(dst, mtu);
 294
 295         /* Something is about to be wrong... Remember soft error
 296          * for the case, if this connection will not able to recover.
 297          */
 298         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 299                 sk->sk_err_soft = EMSGSIZE;
 300
 301         mtu = dst_mtu(dst);
 302
 303         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 304             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 305                 tcp_sync_mss(sk, mtu);
 306
 307                 /* Resend the TCP packet because it's
 308                  * clear that the old packet has been
 309                  * dropped. This is the new "fast" path mtu
 310                  * discovery.
 311                  */
 312                 tcp_simple_retransmit(sk);
 313         } /* else let the usual retransmit timer handle it */
 314 }
 315
 316 /*
 317  * This routine is called by the ICMP module when it gets some
 318  * sort of error condition.  If err < 0 then the socket should
 319  * be closed and the error returned to the user.  If err > 0
 320  * it's just the icmp type << 8 | icmp code.  After adjustment
 321  * header points to the first 8 bytes of the tcp header.  We need
 322  * to find the appropriate port.
 323  *
 324  * The locking strategy used here is very "optimistic". When
 325  * someone else accesses the socket the ICMP is just dropped
 326  * and for some paths there is no check at all.
 327  * A more general error queue to queue errors for later handling
 328  * is probably better.
 329  *
 330  */
 331
 332 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 333 {
 334         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 335         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 336         struct inet_connection_sock *icsk;
 337         struct tcp_sock *tp;
 338         struct inet_sock *inet;
 339         const int type = icmp_hdr(icmp_skb)->type;
 340         const int code = icmp_hdr(icmp_skb)->code;
 341         struct sock *sk;
 342         struct sk_buff *skb;
 343         __u32 seq;
 344         __u32 remaining;
 345         int err;
 346         struct net *net = dev_net(icmp_skb->dev);
 347
 348         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 349                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 350                 return;
 351         }
 352
 353         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 354                         iph->saddr, th->source, inet_iif(icmp_skb));
 355         if (!sk) {
 356                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 357                 return;
 358         }
 359         if (sk->sk_state == TCP_TIME_WAIT) {
 360                 inet_twsk_put(inet_twsk(sk));
 361                 return;
 362         }
 363
 364         bh_lock_sock(sk);
 365         /* If too many ICMPs get dropped on busy
 366          * servers this needs to be solved differently.
 367          */
 368         if (sock_owned_by_user(sk))
 369                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 370
 371         if (sk->sk_state == TCP_CLOSE)
 372                 goto out;
 373
 374         icsk = inet_csk(sk);
 375         tp = tcp_sk(sk);
 376         seq = ntohl(th->seq);
 377         if (sk->sk_state != TCP_LISTEN &&
 378             !between(seq, tp->snd_una, tp->snd_nxt)) {
 379                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 380                 goto out;
 381         }
 382
 383         switch (type) {
 384         case ICMP_SOURCE_QUENCH:
 385                 /* Just silently ignore these. */
 386                 goto out;
 387         case ICMP_PARAMETERPROB:
 388                 err = EPROTO;
 389                 break;
 390         case ICMP_DEST_UNREACH:
 391                 if (code > NR_ICMP_UNREACH)
 392                         goto out;
 393
 394                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 395                         if (!sock_owned_by_user(sk))
 396                                 do_pmtu_discovery(sk, iph, info);
 397                         goto out;
 398                 }
 399
 400                 err = icmp_err_convert[code].errno;
 401                 /* check if icmp_skb allows revert of backoff
 402                  * (see draft-zimmermann-tcp-lcd) */
 403                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 404                         break;
 405                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 406                     !icsk->icsk_backoff)
 407                         break;
 408
 409                 if (sock_owned_by_user(sk))
 410                         break;
 411
 412                 icsk->icsk_backoff--;
 413                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 414                                          icsk->icsk_backoff;
 415                 tcp_bound_rto(sk);
 416
 417                 skb = tcp_write_queue_head(sk);
 418                 BUG_ON(!skb);
 419
 420                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 421                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 422
 423                 if (remaining) {
 424                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 425                                                   remaining, TCP_RTO_MAX);
 426                 } else {
 427                         /* RTO revert clocked out retransmission.
 428                          * Will retransmit now */
 429                         tcp_retransmit_timer(sk);
 430                 }
 431
 432                 break;
 433         case ICMP_TIME_EXCEEDED:
 434                 err = EHOSTUNREACH;
 435                 break;
 436         default:
 437                 goto out;
 438         }
 439
 440         switch (sk->sk_state) {
 441                 struct request_sock *req, **prev;
 442         case TCP_LISTEN:
 443                 if (sock_owned_by_user(sk))
 444                         goto out;
 445
 446                 req = inet_csk_search_req(sk, &prev, th->dest,
 447                                           iph->daddr, iph->saddr);
 448                 if (!req)
 449                         goto out;
 450
 451                 /* ICMPs are not backlogged, hence we cannot get
 452                    an established socket here.
 453                  */
 454                 WARN_ON(req->sk);
 455
 456                 if (seq != tcp_rsk(req)->snt_isn) {
 457                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 458                         goto out;
 459                 }
 460
 461                 /*
 462                  * Still in SYN_RECV, just remove it silently.
 463                  * There is no good way to pass the error to the newly
 464                  * created socket, and POSIX does not want network
 465                  * errors returned from accept().
 466                  */
 467                 inet_csk_reqsk_queue_drop(sk, req, prev);
 468                 goto out;
 469
 470         case TCP_SYN_SENT:
 471         case TCP_SYN_RECV:  /* Cannot happen.
 472                                It can f.e. if SYNs crossed.
 473                              */
 474                 if (!sock_owned_by_user(sk)) {
 475                         sk->sk_err = err;
 476
 477                         sk->sk_error_report(sk);
 478
 479                         tcp_done(sk);
 480                 } else {
 481                         sk->sk_err_soft = err;
 482                 }
 483                 goto out;
 484         }
 485
 486         /* If we've already connected we will keep trying
 487          * until we time out, or the user gives up.
 488          *
 489          * rfc1122 4.2.3.9 allows to consider as hard errors
 490          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 491          * but it is obsoleted by pmtu discovery).
 492          *
 493          * Note, that in modern internet, where routing is unreliable
 494          * and in each dark corner broken firewalls sit, sending random
 495          * errors ordered by their masters even this two messages finally lose
 496          * their original sense (even Linux sends invalid PORT_UNREACHs)
 497          *
 498          * Now we are in compliance with RFCs.
 499          *                                                      --ANK (980905)
 500          */
 501
 502         inet = inet_sk(sk);
 503         if (!sock_owned_by_user(sk) && inet->recverr) {
 504                 sk->sk_err = err;
 505                 sk->sk_error_report(sk);
 506         } else  { /* Only an error on timeout */
 507                 sk->sk_err_soft = err;
 508         }
 509
 510 out:
 511         bh_unlock_sock(sk);
 512         sock_put(sk);
 513 }
 514
 515 /* This routine computes an IPv4 TCP checksum. */
 516 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 517 {
 518         struct inet_sock *inet = inet_sk(sk);
 519         struct tcphdr *th = tcp_hdr(skb);
 520
 521         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 522                 th->check = ~tcp_v4_check(len, inet->saddr,
 523                                           inet->daddr, 0);
 524                 skb->csum_start = skb_transport_header(skb) - skb->head;
 525                 skb->csum_offset = offsetof(struct tcphdr, check);
 526         } else {
 527                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 528                                          csum_partial(th,
 529                                                       th->doff << 2,
 530                                                       skb->csum));
 531         }
 532 }
 533
 534 int tcp_v4_gso_send_check(struct sk_buff *skb)
 535 {
 536         const struct iphdr *iph;
 537         struct tcphdr *th;
 538
 539         if (!pskb_may_pull(skb, sizeof(*th)))
 540                 return -EINVAL;
 541
 542         iph = ip_hdr(skb);
 543         th = tcp_hdr(skb);
 544
 545         th->check = 0;
 546         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 547         skb->csum_start = skb_transport_header(skb) - skb->head;
 548         skb->csum_offset = offsetof(struct tcphdr, check);
 549         skb->ip_summed = CHECKSUM_PARTIAL;
 550         return 0;
 551 }
 552
 553 /*
 554  *      This routine will send an RST to the other tcp.
 555  *
 556  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 557  *                    for reset.
 558  *      Answer: if a packet caused RST, it is not for a socket
 559  *              existing in our system, if it is matched to a socket,
 560  *              it is just duplicate segment or bug in other side's TCP.
 561  *              So that we build reply only basing on parameters
 562  *              arrived with segment.
 563  *      Exception: precedence violation. We do not implement it in any case.
 564  */
 565
 566 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 567 {
 568         struct tcphdr *th = tcp_hdr(skb);
 569         struct {
 570                 struct tcphdr th;
 571 #ifdef CONFIG_TCP_MD5SIG
 572                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 573 #endif
 574         } rep;
 575         struct ip_reply_arg arg;
 576 #ifdef CONFIG_TCP_MD5SIG
 577         struct tcp_md5sig_key *key;
 578 #endif
 579         struct net *net;
 580
 581         /* Never send a reset in response to a reset. */
 582         if (th->rst)
 583                 return;
 584
 585         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 586                 return;
 587
 588         /* Swap the send and the receive. */
 589         memset(&rep, 0, sizeof(rep));
 590         rep.th.dest   = th->source;
 591         rep.th.source = th->dest;
 592         rep.th.doff   = sizeof(struct tcphdr) / 4;
 593         rep.th.rst    = 1;
 594
 595         if (th->ack) {
 596                 rep.th.seq = th->ack_seq;
 597         } else {
 598                 rep.th.ack = 1;
 599                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 600                                        skb->len - (th->doff << 2));
 601         }
 602
 603         memset(&arg, 0, sizeof(arg));
 604         arg.iov[0].iov_base = (unsigned char *)&rep;
 605         arg.iov[0].iov_len  = sizeof(rep.th);
 606
 607 #ifdef CONFIG_TCP_MD5SIG
 608         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 609         if (key) {
 610                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 611                                    (TCPOPT_NOP << 16) |
 612                                    (TCPOPT_MD5SIG << 8) |
 613                                    TCPOLEN_MD5SIG);
 614                 /* Update length and the length the header thinks exists */
 615                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 616                 rep.th.doff = arg.iov[0].iov_len / 4;
 617
 618                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 619                                      key, ip_hdr(skb)->saddr,
 620                                      ip_hdr(skb)->daddr, &rep.th);
 621         }
 622 #endif
 623         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 624                                       ip_hdr(skb)->saddr, /* XXX */
 625                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 626         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 627         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 628
 629         net = dev_net(skb_dst(skb)->dev);
 630         ip_send_reply(net->ipv4.tcp_sock, skb,
 631                       &arg, arg.iov[0].iov_len);
 632
 633         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 634         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 635 }
 636
 637 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 638    outside socket context is ugly, certainly. What can I do?
 639  */
 640
 641 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 642                             u32 win, u32 ts, int oif,
 643                             struct tcp_md5sig_key *key,
 644                             int reply_flags)
 645 {
 646         struct tcphdr *th = tcp_hdr(skb);
 647         struct {
 648                 struct tcphdr th;
 649                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 650 #ifdef CONFIG_TCP_MD5SIG
 651                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 652 #endif
 653                         ];
 654         } rep;
 655         struct ip_reply_arg arg;
 656         struct net *net = dev_net(skb_dst(skb)->dev);
 657
 658         memset(&rep.th, 0, sizeof(struct tcphdr));
 659         memset(&arg, 0, sizeof(arg));
 660
 661         arg.iov[0].iov_base = (unsigned char *)&rep;
 662         arg.iov[0].iov_len  = sizeof(rep.th);
 663         if (ts) {
 664                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 665                                    (TCPOPT_TIMESTAMP << 8) |
 666                                    TCPOLEN_TIMESTAMP);
 667                 rep.opt[1] = htonl(tcp_time_stamp);
 668                 rep.opt[2] = htonl(ts);
 669                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 670         }
 671
 672         /* Swap the send and the receive. */
 673         rep.th.dest    = th->source;
 674         rep.th.source  = th->dest;
 675         rep.th.doff    = arg.iov[0].iov_len / 4;
 676         rep.th.seq     = htonl(seq);
 677         rep.th.ack_seq = htonl(ack);
 678         rep.th.ack     = 1;
 679         rep.th.window  = htons(win);
 680
 681 #ifdef CONFIG_TCP_MD5SIG
 682         if (key) {
 683                 int offset = (ts) ? 3 : 0;
 684
 685                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 686                                           (TCPOPT_NOP << 16) |
 687                                           (TCPOPT_MD5SIG << 8) |
 688                                           TCPOLEN_MD5SIG);
 689                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 690                 rep.th.doff = arg.iov[0].iov_len/4;
 691
 692                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 693                                     key, ip_hdr(skb)->saddr,
 694                                     ip_hdr(skb)->daddr, &rep.th);
 695         }
 696 #endif
 697         arg.flags = reply_flags;
 698         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 699                                       ip_hdr(skb)->saddr, /* XXX */
 700                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 701         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 702         if (oif)
 703                 arg.bound_dev_if = oif;
 704
 705         ip_send_reply(net->ipv4.tcp_sock, skb,
 706                       &arg, arg.iov[0].iov_len);
 707
 708         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 709 }
 710
 711 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 712 {
 713         struct inet_timewait_sock *tw = inet_twsk(sk);
 714         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 715
 716         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 717                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 718                         tcptw->tw_ts_recent,
 719                         tw->tw_bound_dev_if,
 720                         tcp_twsk_md5_key(tcptw),
 721                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 722                         );
 723
 724         inet_twsk_put(tw);
 725 }
 726
 727 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 728                                   struct request_sock *req)
 729 {
 730         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 731                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 732                         req->ts_recent,
 733                         0,
 734                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 735                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 736 }
 737
 738 /*
 739  *      Send a SYN-ACK after having received a SYN.
 740  *      This still operates on a request_sock only, not on a big
 741  *      socket.
 742  */
 743 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 744                                 struct dst_entry *dst)
 745 {
 746         const struct inet_request_sock *ireq = inet_rsk(req);
 747         int err = -1;
 748         struct sk_buff * skb;
 749
 750         /* First, grab a route. */
 751         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 752                 return -1;
 753
 754         skb = tcp_make_synack(sk, dst, req);
 755
 756         if (skb) {
 757                 struct tcphdr *th = tcp_hdr(skb);
 758
 759                 th->check = tcp_v4_check(skb->len,
 760                                          ireq->loc_addr,
 761                                          ireq->rmt_addr,
 762                                          csum_partial(th, skb->len,
 763                                                       skb->csum));
 764
 765                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 766                                             ireq->rmt_addr,
 767                                             ireq->opt);
 768                 err = net_xmit_eval(err);
 769         }
 770
 771         dst_release(dst);
 772         return err;
 773 }
 774
 775 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 776 {
 777         return __tcp_v4_send_synack(sk, req, NULL);
 778 }
 779
 780 /*
 781  *      IPv4 request_sock destructor.
 782  */
 783 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 784 {
 785         kfree(inet_rsk(req)->opt);
 786 }
 787
 788 #ifdef CONFIG_SYN_COOKIES
 789 static void syn_flood_warning(struct sk_buff *skb)
 790 {
 791         static unsigned long warntime;
 792
 793         if (time_after(jiffies, (warntime + HZ * 60))) {
 794                 warntime = jiffies;
 795                 printk(KERN_INFO
 796                        "possible SYN flooding on port %d. Sending cookies.\n",
 797                        ntohs(tcp_hdr(skb)->dest));
 798         }
 799 }
 800 #endif
 801
 802 /*
 803  * Save and compile IPv4 options into the request_sock if needed.
 804  */
 805 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 806                                               struct sk_buff *skb)
 807 {
 808         struct ip_options *opt = &(IPCB(skb)->opt);
 809         struct ip_options *dopt = NULL;
 810
 811         if (opt && opt->optlen) {
 812                 int opt_size = optlength(opt);
 813                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 814                 if (dopt) {
 815                         if (ip_options_echo(dopt, skb)) {
 816                                 kfree(dopt);
 817                                 dopt = NULL;
 818                         }
 819                 }
 820         }
 821         return dopt;
 822 }
 823
 824 #ifdef CONFIG_TCP_MD5SIG
 825 /*
 826  * RFC2385 MD5 checksumming requires a mapping of
 827  * IP address->MD5 Key.
 828  * We need to maintain these in the sk structure.
 829  */
 830
 831 /* Find the Key structure for an address.  */
 832 static struct tcp_md5sig_key *
 833                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 834 {
 835         struct tcp_sock *tp = tcp_sk(sk);
 836         int i;
 837
 838         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 839                 return NULL;
 840         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 841                 if (tp->md5sig_info->keys4[i].addr == addr)
 842                         return &tp->md5sig_info->keys4[i].base;
 843         }
 844         return NULL;
 845 }
 846
 847 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 848                                          struct sock *addr_sk)
 849 {
 850         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 851 }
 852
 853 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 854
 855 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 856                                                       struct request_sock *req)
 857 {
 858         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 859 }
 860
 861 /* This can be called on a newly created socket, from other files */
 862 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 863                       u8 *newkey, u8 newkeylen)
 864 {
 865         /* Add Key to the list */
 866         struct tcp_md5sig_key *key;
 867         struct tcp_sock *tp = tcp_sk(sk);
 868         struct tcp4_md5sig_key *keys;
 869
 870         key = tcp_v4_md5_do_lookup(sk, addr);
 871         if (key) {
 872                 /* Pre-existing entry - just update that one. */
 873                 kfree(key->key);
 874                 key->key = newkey;
 875                 key->keylen = newkeylen;
 876         } else {
 877                 struct tcp_md5sig_info *md5sig;
 878
 879                 if (!tp->md5sig_info) {
 880                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 881                                                   GFP_ATOMIC);
 882                         if (!tp->md5sig_info) {
 883                                 kfree(newkey);
 884                                 return -ENOMEM;
 885                         }
 886                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 887                 }
 888                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 889                         kfree(newkey);
 890                         return -ENOMEM;
 891                 }
 892                 md5sig = tp->md5sig_info;
 893
 894                 if (md5sig->alloced4 == md5sig->entries4) {
 895                         keys = kmalloc((sizeof(*keys) *
 896                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 897                         if (!keys) {
 898                                 kfree(newkey);
 899                                 tcp_free_md5sig_pool();
 900                                 return -ENOMEM;
 901                         }
 902
 903                         if (md5sig->entries4)
 904                                 memcpy(keys, md5sig->keys4,
 905                                        sizeof(*keys) * md5sig->entries4);
 906
 907                         /* Free old key list, and reference new one */
 908                         kfree(md5sig->keys4);
 909                         md5sig->keys4 = keys;
 910                         md5sig->alloced4++;
 911                 }
 912                 md5sig->entries4++;
 913                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 914                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 915                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 916         }
 917         return 0;
 918 }
 919
 920 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 921
 922 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 923                                u8 *newkey, u8 newkeylen)
 924 {
 925         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 926                                  newkey, newkeylen);
 927 }
 928
 929 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 930 {
 931         struct tcp_sock *tp = tcp_sk(sk);
 932         int i;
 933
 934         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 935                 if (tp->md5sig_info->keys4[i].addr == addr) {
 936                         /* Free the key */
 937                         kfree(tp->md5sig_info->keys4[i].base.key);
 938                         tp->md5sig_info->entries4--;
 939
 940                         if (tp->md5sig_info->entries4 == 0) {
 941                                 kfree(tp->md5sig_info->keys4);
 942                                 tp->md5sig_info->keys4 = NULL;
 943                                 tp->md5sig_info->alloced4 = 0;
 944                         } else if (tp->md5sig_info->entries4 != i) {
 945                                 /* Need to do some manipulation */
 946                                 memmove(&tp->md5sig_info->keys4[i],
 947                                         &tp->md5sig_info->keys4[i+1],
 948                                         (tp->md5sig_info->entries4 - i) *
 949                                          sizeof(struct tcp4_md5sig_key));
 950                         }
 951                         tcp_free_md5sig_pool();
 952                         return 0;
 953                 }
 954         }
 955         return -ENOENT;
 956 }
 957
 958 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 959
 960 static void tcp_v4_clear_md5_list(struct sock *sk)
 961 {
 962         struct tcp_sock *tp = tcp_sk(sk);
 963
 964         /* Free each key, then the set of key keys,
 965          * the crypto element, and then decrement our
 966          * hold on the last resort crypto.
 967          */
 968         if (tp->md5sig_info->entries4) {
 969                 int i;
 970                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 971                         kfree(tp->md5sig_info->keys4[i].base.key);
 972                 tp->md5sig_info->entries4 = 0;
 973                 tcp_free_md5sig_pool();
 974         }
 975         if (tp->md5sig_info->keys4) {
 976                 kfree(tp->md5sig_info->keys4);
 977                 tp->md5sig_info->keys4 = NULL;
 978                 tp->md5sig_info->alloced4  = 0;
 979         }
 980 }
 981
 982 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 983                                  int optlen)
 984 {
 985         struct tcp_md5sig cmd;
 986         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 987         u8 *newkey;
 988
 989         if (optlen < sizeof(cmd))
 990                 return -EINVAL;
 991
 992         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 993                 return -EFAULT;
 994
 995         if (sin->sin_family != AF_INET)
 996                 return -EINVAL;
 997
 998         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
 999                 if (!tcp_sk(sk)->md5sig_info)
1000                         return -ENOENT;
1001                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1002         }
1003
1004         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1005                 return -EINVAL;
1006
1007         if (!tcp_sk(sk)->md5sig_info) {
1008                 struct tcp_sock *tp = tcp_sk(sk);
1009                 struct tcp_md5sig_info *p;
1010
1011                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1012                 if (!p)
1013                         return -EINVAL;
1014
1015                 tp->md5sig_info = p;
1016                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1017         }
1018
1019         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1020         if (!newkey)
1021                 return -ENOMEM;
1022         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1023                                  newkey, cmd.tcpm_keylen);
1024 }
1025
1026 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1027                                         __be32 daddr, __be32 saddr, int nbytes)
1028 {
1029         struct tcp4_pseudohdr *bp;
1030         struct scatterlist sg;
1031
1032         bp = &hp->md5_blk.ip4;
1033
1034         /*
1035          * 1. the TCP pseudo-header (in the order: source IP address,
1036          * destination IP address, zero-padded protocol number, and
1037          * segment length)
1038          */
1039         bp->saddr = saddr;
1040         bp->daddr = daddr;
1041         bp->pad = 0;
1042         bp->protocol = IPPROTO_TCP;
1043         bp->len = cpu_to_be16(nbytes);
1044
1045         sg_init_one(&sg, bp, sizeof(*bp));
1046         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1047 }
1048
1049 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1050                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1051 {
1052         struct tcp_md5sig_pool *hp;
1053         struct hash_desc *desc;
1054
1055         hp = tcp_get_md5sig_pool();
1056         if (!hp)
1057                 goto clear_hash_noput;
1058         desc = &hp->md5_desc;
1059
1060         if (crypto_hash_init(desc))
1061                 goto clear_hash;
1062         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1063                 goto clear_hash;
1064         if (tcp_md5_hash_header(hp, th))
1065                 goto clear_hash;
1066         if (tcp_md5_hash_key(hp, key))
1067                 goto clear_hash;
1068         if (crypto_hash_final(desc, md5_hash))
1069                 goto clear_hash;
1070
1071         tcp_put_md5sig_pool();
1072         return 0;
1073
1074 clear_hash:
1075         tcp_put_md5sig_pool();
1076 clear_hash_noput:
1077         memset(md5_hash, 0, 16);
1078         return 1;
1079 }
1080
1081 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1082                         struct sock *sk, struct request_sock *req,
1083                         struct sk_buff *skb)
1084 {
1085         struct tcp_md5sig_pool *hp;
1086         struct hash_desc *desc;
1087         struct tcphdr *th = tcp_hdr(skb);
1088         __be32 saddr, daddr;
1089
1090         if (sk) {
1091                 saddr = inet_sk(sk)->saddr;
1092                 daddr = inet_sk(sk)->daddr;
1093         } else if (req) {
1094                 saddr = inet_rsk(req)->loc_addr;
1095                 daddr = inet_rsk(req)->rmt_addr;
1096         } else {
1097                 const struct iphdr *iph = ip_hdr(skb);
1098                 saddr = iph->saddr;
1099                 daddr = iph->daddr;
1100         }
1101
1102         hp = tcp_get_md5sig_pool();
1103         if (!hp)
1104                 goto clear_hash_noput;
1105         desc = &hp->md5_desc;
1106
1107         if (crypto_hash_init(desc))
1108                 goto clear_hash;
1109
1110         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1111                 goto clear_hash;
1112         if (tcp_md5_hash_header(hp, th))
1113                 goto clear_hash;
1114         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_key(hp, key))
1117                 goto clear_hash;
1118         if (crypto_hash_final(desc, md5_hash))
1119                 goto clear_hash;
1120
1121         tcp_put_md5sig_pool();
1122         return 0;
1123
1124 clear_hash:
1125         tcp_put_md5sig_pool();
1126 clear_hash_noput:
1127         memset(md5_hash, 0, 16);
1128         return 1;
1129 }
1130
1131 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132
1133 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1134 {
1135         /*
1136          * This gets called for each TCP segment that arrives
1137          * so we want to be efficient.
1138          * We have 3 drop cases:
1139          * o No MD5 hash and one expected.
1140          * o MD5 hash and we're not expecting one.
1141          * o MD5 hash and its wrong.
1142          */
1143         __u8 *hash_location = NULL;
1144         struct tcp_md5sig_key *hash_expected;
1145         const struct iphdr *iph = ip_hdr(skb);
1146         struct tcphdr *th = tcp_hdr(skb);
1147         int genhash;
1148         unsigned char newhash[16];
1149
1150         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1151         hash_location = tcp_parse_md5sig_option(th);
1152
1153         /* We've parsed the options - do we have a hash? */
1154         if (!hash_expected && !hash_location)
1155                 return 0;
1156
1157         if (hash_expected && !hash_location) {
1158                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1159                 return 1;
1160         }
1161
1162         if (!hash_expected && hash_location) {
1163                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1164                 return 1;
1165         }
1166
1167         /* Okay, so this is hash_expected and hash_location -
1168          * so we need to calculate the checksum.
1169          */
1170         genhash = tcp_v4_md5_hash_skb(newhash,
1171                                       hash_expected,
1172                                       NULL, NULL, skb);
1173
1174         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1175                 if (net_ratelimit()) {
1176                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1177                                &iph->saddr, ntohs(th->source),
1178                                &iph->daddr, ntohs(th->dest),
1179                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1180                 }
1181                 return 1;
1182         }
1183         return 0;
1184 }
1185
1186 #endif
1187
1188 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1189         .family         =       PF_INET,
1190         .obj_size       =       sizeof(struct tcp_request_sock),
1191         .rtx_syn_ack    =       tcp_v4_send_synack,
1192         .send_ack       =       tcp_v4_reqsk_send_ack,
1193         .destructor     =       tcp_v4_reqsk_destructor,
1194         .send_reset     =       tcp_v4_send_reset,
1195 };
1196
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 hooks attached to each request via tcp_rsk(req)->af_specific. */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .calc_md5_hash  = tcp_v4_md5_hash_skb,
        .md5_lookup     = tcp_v4_reqsk_md5_lookup,
};
#endif
1203
1204 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1205         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1206         .twsk_unique    = tcp_twsk_unique,
1207         .twsk_destructor= tcp_twsk_destructor,
1208 };
1209
/*
 * Handle a SYN received on listening socket @sk.
 *
 * Allocates a request_sock, fills it in from the addresses and the TCP/IP
 * options of the SYN, picks the initial sequence number (a syncookie, the
 * value carried in TCP_SKB_CB(skb)->when, or a freshly generated one),
 * sends the SYN-ACK and, unless a syncookie was used, adds the request to
 * the listener's request queue.
 *
 * Every path returns 0, including the drop paths.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        /* Non-zero when the SYN hit a live timewait bucket (see below). */
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                /* Request queue full and no syncookies to fall back on. */
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        /* NOTE(review): 536 looks like the default IPv4 MSS - confirm. */
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        /* A cookie request without a timestamp option forgets the parsed
         * options again (presumably because they could not be recovered
         * later - TODO confirm).
         */
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        /* The request is seen from our side: local = SYN's destination. */
        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
                req->cookie_ts = tmp_opt.tstamp_ok;
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                /* Note: this condition assigns dst and peer as a side
                 * effect; the "else if" below relies on those values.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        /* __tcp_v4_send_synack() releases dst itself, so only the request
         * is left to free here.  A syncookie request is freed even after a
         * successful send: it is never put on the request queue.
         */
        if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
1348
1349
1350 /*
1351  * The three way handshake has completed - we got a valid synack -
1352  * now create the new socket.
1353  */
1354 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1355                                   struct request_sock *req,
1356                                   struct dst_entry *dst)
1357 {
1358         struct inet_request_sock *ireq;
1359         struct inet_sock *newinet;
1360         struct tcp_sock *newtp;
1361         struct sock *newsk;
1362 #ifdef CONFIG_TCP_MD5SIG
1363         struct tcp_md5sig_key *key;
1364 #endif
1365
1366         if (sk_acceptq_is_full(sk))
1367                 goto exit_overflow;
1368
1369         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1370                 goto exit;
1371
1372         newsk = tcp_create_openreq_child(sk, req, skb);
1373         if (!newsk)
1374                 goto exit;
1375
1376         newsk->sk_gso_type = SKB_GSO_TCPV4;
1377         sk_setup_caps(newsk, dst);
1378
1379         newtp                 = tcp_sk(newsk);
1380         newinet               = inet_sk(newsk);
1381         ireq                  = inet_rsk(req);
1382         newinet->daddr        = ireq->rmt_addr;
1383         newinet->rcv_saddr    = ireq->loc_addr;
1384         newinet->saddr        = ireq->loc_addr;
1385         newinet->opt          = ireq->opt;
1386         ireq->opt             = NULL;
1387         newinet->mc_index     = inet_iif(skb);
1388         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1389         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1390         if (newinet->opt)
1391                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1392         newinet->id = newtp->write_seq ^ jiffies;
1393
1394         tcp_mtup_init(newsk);
1395         tcp_sync_mss(newsk, dst_mtu(dst));
1396         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1397         if (tcp_sk(sk)->rx_opt.user_mss &&
1398             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1399                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1400
1401         tcp_initialize_rcv_mss(newsk);
1402
1403 #ifdef CONFIG_TCP_MD5SIG
1404         /* Copy over the MD5 key from the original socket */
1405         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1406                 /*
1407                  * We're using one, so create a matching key
1408                  * on the newsk structure. If we fail to get
1409                  * memory, then we end up not copying the key
1410                  * across. Shucks.
1411                  */
1412                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1413                 if (newkey != NULL)
1414                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1415                                           newkey, key->keylen);
1416                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1417         }
1418 #endif
1419
1420         __inet_hash_nolisten(newsk);
1421         __inet_inherit_port(sk, newsk);
1422
1423         return newsk;
1424
1425 exit_overflow:
1426         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1427 exit:
1428         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1429         dst_release(dst);
1430         return NULL;
1431 }
1432
1433 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1434 {
1435         struct tcphdr *th = tcp_hdr(skb);
1436         const struct iphdr *iph = ip_hdr(skb);
1437         struct sock *nsk;
1438         struct request_sock **prev;
1439         /* Find possible connection requests. */
1440         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1441                                                        iph->saddr, iph->daddr);
1442         if (req)
1443                 return tcp_check_req(sk, skb, req, prev);
1444
1445         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1446                         th->source, iph->daddr, th->dest, inet_iif(skb));
1447
1448         if (nsk) {
1449                 if (nsk->sk_state != TCP_TIME_WAIT) {
1450                         bh_lock_sock(nsk);
1451                         return nsk;
1452                 }
1453                 inet_twsk_put(inet_twsk(nsk));
1454                 return NULL;
1455         }
1456
1457 #ifdef CONFIG_SYN_COOKIES
1458         if (!th->rst && !th->syn && th->ack)
1459                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1460 #endif
1461         return sk;
1462 }
1463
1464 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1465 {
1466         const struct iphdr *iph = ip_hdr(skb);
1467
1468         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1469                 if (!tcp_v4_check(skb->len, iph->saddr,
1470                                   iph->daddr, skb->csum)) {
1471                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1472                         return 0;
1473                 }
1474         }
1475
1476         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1477                                        skb->len, IPPROTO_TCP, 0);
1478
1479         if (skb->len <= 76) {
1480                 return __skb_checksum_complete(skb);
1481         }
1482         return 0;
1483 }
1484
1485
1486 /* The socket must have it's spinlock held when we get
1487  * here.
1488  *
1489  * We have a potential double-lock case here, so even when
1490  * doing backlog processing we use the BH locking scheme.
1491  * This is because we cannot sleep with the original spinlock
1492  * held.
1493  */
1494 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1495 {
1496         struct sock *rsk;
1497 #ifdef CONFIG_TCP_MD5SIG
1498         /*
1499          * We really want to reject the packet as early as possible
1500          * if:
1501          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1502          *  o There is an MD5 option and we're not expecting one
1503          */
1504         if (tcp_v4_inbound_md5_hash(sk, skb))
1505                 goto discard;
1506 #endif
1507
1508         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1509                 TCP_CHECK_TIMER(sk);
1510                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1511                         rsk = sk;
1512                         goto reset;
1513                 }
1514                 TCP_CHECK_TIMER(sk);
1515                 return 0;
1516         }
1517
1518         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1519                 goto csum_err;
1520
1521         if (sk->sk_state == TCP_LISTEN) {
1522                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1523                 if (!nsk)
1524                         goto discard;
1525
1526                 if (nsk != sk) {
1527                         if (tcp_child_process(sk, nsk, skb)) {
1528                                 rsk = nsk;
1529                                 goto reset;
1530                         }
1531                         return 0;
1532                 }
1533         }
1534
1535         TCP_CHECK_TIMER(sk);
1536         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1537                 rsk = sk;
1538                 goto reset;
1539         }
1540         TCP_CHECK_TIMER(sk);
1541         return 0;
1542
1543 reset:
1544         tcp_v4_send_reset(rsk, skb);
1545 discard:
1546         kfree_skb(skb);
1547         /* Be careful here. If this function gets more complicated and
1548          * gcc suffers from register pressure on the x86, sk (in %ebx)
1549          * might be destroyed here. This current version compiles correctly,
1550          * but you have been warned.
1551          */
1552         return 0;
1553
1554 csum_err:
1555         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1556         goto discard;
1557 }
1558
1559 /*
1560  *      From tcp_input.c
1561  */
1562
1563 int tcp_v4_rcv(struct sk_buff *skb)
1564 {
1565         const struct iphdr *iph;
1566         struct tcphdr *th;
1567         struct sock *sk;
1568         int ret;
1569         struct net *net = dev_net(skb->dev);
1570
1571         if (skb->pkt_type != PACKET_HOST)
1572                 goto discard_it;
1573
1574         /* Count it even if it's bad */
1575         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1576
1577         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1578                 goto discard_it;
1579
1580         th = tcp_hdr(skb);
1581
1582         if (th->doff < sizeof(struct tcphdr) / 4)
1583                 goto bad_packet;
1584         if (!pskb_may_pull(skb, th->doff * 4))
1585                 goto discard_it;
1586
1587         /* An explanation is required here, I think.
1588          * Packet length and doff are validated by header prediction,
1589          * provided case of th->doff==0 is eliminated.
1590          * So, we defer the checks. */
1591         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1592                 goto bad_packet;
1593
1594         th = tcp_hdr(skb);
1595         iph = ip_hdr(skb);
1596         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1597         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1598                                     skb->len - th->doff * 4);
1599         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1600         TCP_SKB_CB(skb)->when    = 0;
1601         TCP_SKB_CB(skb)->flags   = iph->tos;
1602         TCP_SKB_CB(skb)->sacked  = 0;
1603
1604         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1605         if (!sk)
1606                 goto no_tcp_socket;
1607
1608 process:
1609         if (sk->sk_state == TCP_TIME_WAIT)
1610                 goto do_time_wait;
1611
1612         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1613                 goto discard_and_relse;
1614         nf_reset(skb);
1615
1616         if (sk_filter(sk, skb))
1617                 goto discard_and_relse;
1618
1619         skb->dev = NULL;
1620
1621         bh_lock_sock_nested(sk);
1622         ret = 0;
1623         if (!sock_owned_by_user(sk)) {
1624 #ifdef CONFIG_NET_DMA
1625                 struct tcp_sock *tp = tcp_sk(sk);
1626                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1627                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1628                 if (tp->ucopy.dma_chan)
1629                         ret = tcp_v4_do_rcv(sk, skb);
1630                 else
1631 #endif
1632                 {
1633                         if (!tcp_prequeue(sk, skb))
1634                                 ret = tcp_v4_do_rcv(sk, skb);
1635                 }
1636         } else
1637                 sk_add_backlog(sk, skb);
1638         bh_unlock_sock(sk);
1639
1640         sock_put(sk);
1641
1642         return ret;
1643
1644 no_tcp_socket:
1645         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1646                 goto discard_it;
1647
1648         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1649 bad_packet:
1650                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1651         } else {
1652                 tcp_v4_send_reset(NULL, skb);
1653         }
1654
1655 discard_it:
1656         /* Discard frame. */
1657         kfree_skb(skb);
1658         return 0;
1659
1660 discard_and_relse:
1661         sock_put(sk);
1662         goto discard_it;
1663
1664 do_time_wait:
1665         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1666                 inet_twsk_put(inet_twsk(sk));
1667                 goto discard_it;
1668         }
1669
1670         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1671                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1672                 inet_twsk_put(inet_twsk(sk));
1673                 goto discard_it;
1674         }
1675         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1676         case TCP_TW_SYN: {
1677                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1678                                                         &tcp_hashinfo,
1679                                                         iph->daddr, th->dest,
1680                                                         inet_iif(skb));
1681                 if (sk2) {
1682                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1683                         inet_twsk_put(inet_twsk(sk));
1684                         sk = sk2;
1685                         goto process;
1686                 }
1687                 /* Fall through to ACK */
1688         }
1689         case TCP_TW_ACK:
1690                 tcp_v4_timewait_ack(sk, skb);
1691                 break;
1692         case TCP_TW_RST:
1693                 goto no_tcp_socket;
1694         case TCP_TW_SUCCESS:;
1695         }
1696         goto discard_it;
1697 }
1698
/* VJ's idea: save the last timestamp seen from this destination and hold it
 * for at least the normal TIME_WAIT interval, so that it can be used to
 * detect duplicate segments in subsequent connections before they enter
 * the synchronized state.
 */
1704
1705 int tcp_v4_remember_stamp(struct sock *sk)
1706 {
1707         struct inet_sock *inet = inet_sk(sk);
1708         struct tcp_sock *tp = tcp_sk(sk);
1709         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1710         struct inet_peer *peer = NULL;
1711         int release_it = 0;
1712
1713         if (!rt || rt->rt_dst != inet->daddr) {
1714                 peer = inet_getpeer(inet->daddr, 1);
1715                 release_it = 1;
1716         } else {
1717                 if (!rt->peer)
1718                         rt_bind_peer(rt, 1);
1719                 peer = rt->peer;
1720         }
1721
1722         if (peer) {
1723                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1724                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1725                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1726                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1727                         peer->tcp_ts = tp->rx_opt.ts_recent;
1728                 }
1729                 if (release_it)
1730                         inet_putpeer(peer);
1731                 return 1;
1732         }
1733
1734         return 0;
1735 }
1736
1737 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1738 {
1739         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1740
1741         if (peer) {
1742                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1743
1744                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1745                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1746                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1747                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1748                         peer->tcp_ts       = tcptw->tw_ts_recent;
1749                 }
1750                 inet_putpeer(peer);
1751                 return 1;
1752         }
1753
1754         return 0;
1755 }
1756
1757 const struct inet_connection_sock_af_ops ipv4_specific = {
1758         .queue_xmit        = ip_queue_xmit,
1759         .send_check        = tcp_v4_send_check,
1760         .rebuild_header    = inet_sk_rebuild_header,
1761         .conn_request      = tcp_v4_conn_request,
1762         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1763         .remember_stamp    = tcp_v4_remember_stamp,
1764         .net_header_len    = sizeof(struct iphdr),
1765         .setsockopt        = ip_setsockopt,
1766         .getsockopt        = ip_getsockopt,
1767         .addr2sockaddr     = inet_csk_addr2sockaddr,
1768         .sockaddr_len      = sizeof(struct sockaddr_in),
1769         .bind_conflict     = inet_csk_bind_conflict,
1770 #ifdef CONFIG_COMPAT
1771         .compat_setsockopt = compat_ip_setsockopt,
1772         .compat_getsockopt = compat_ip_getsockopt,
1773 #endif
1774 };
1775
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 implementations of the TCP MD5 signature hooks.
 * tcp_v4_init_sock() installs this table as tp->af_specific.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        /* Key management */
        .md5_add                = tcp_v4_md5_add_func,
        .md5_lookup             = tcp_v4_md5_lookup,
        .md5_parse              = tcp_v4_parse_md5_keys,

        /* Signature computation */
        .calc_md5_hash          = tcp_v4_md5_hash_skb,
};
#endif
1784
1785 /* NOTE: A lot of things set to zero explicitly by call to
1786  *       sk_alloc() so need not be done here.
1787  */
1788 static int tcp_v4_init_sock(struct sock *sk)
1789 {
1790         struct inet_connection_sock *icsk = inet_csk(sk);
1791         struct tcp_sock *tp = tcp_sk(sk);
1792
1793         skb_queue_head_init(&tp->out_of_order_queue);
1794         tcp_init_xmit_timers(sk);
1795         tcp_prequeue_init(tp);
1796
1797         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1798         tp->mdev = TCP_TIMEOUT_INIT;
1799
1800         /* So many TCP implementations out there (incorrectly) count the
1801          * initial SYN frame in their delayed-ACK and congestion control
1802          * algorithms that we must have the following bandaid to talk
1803          * efficiently to them.  -DaveM
1804          */
1805         tp->snd_cwnd = 2;
1806
1807         /* See draft-stevens-tcpca-spec-01 for discussion of the
1808          * initialization of these values.
1809          */
1810         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1811         tp->snd_cwnd_clamp = ~0;
1812         tp->mss_cache = 536;
1813
1814         tp->reordering = sysctl_tcp_reordering;
1815         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1816
1817         sk->sk_state = TCP_CLOSE;
1818
1819         sk->sk_write_space = sk_stream_write_space;
1820         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1821
1822         icsk->icsk_af_ops = &ipv4_specific;
1823         icsk->icsk_sync_mss = tcp_sync_mss;
1824 #ifdef CONFIG_TCP_MD5SIG
1825         tp->af_specific = &tcp_sock_ipv4_specific;
1826 #endif
1827
1828         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1829         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1830
1831         local_bh_disable();
1832         percpu_counter_inc(&tcp_sockets_allocated);
1833         local_bh_enable();
1834
1835         return 0;
1836 }
1837
1838 void tcp_v4_destroy_sock(struct sock *sk)
1839 {
1840         struct tcp_sock *tp = tcp_sk(sk);
1841
1842         tcp_clear_xmit_timers(sk);
1843
1844         tcp_cleanup_congestion_control(sk);
1845
1846         /* Cleanup up the write buffer. */
1847         tcp_write_queue_purge(sk);
1848
1849         /* Cleans up our, hopefully empty, out_of_order_queue. */
1850         __skb_queue_purge(&tp->out_of_order_queue);
1851
1852 #ifdef CONFIG_TCP_MD5SIG
1853         /* Clean up the MD5 key list, if any */
1854         if (tp->md5sig_info) {
1855                 tcp_v4_clear_md5_list(sk);
1856                 kfree(tp->md5sig_info);
1857                 tp->md5sig_info = NULL;
1858         }
1859 #endif
1860
1861 #ifdef CONFIG_NET_DMA
1862         /* Cleans up our sk_async_wait_queue */
1863         __skb_queue_purge(&sk->sk_async_wait_queue);
1864 #endif
1865
1866         /* Clean prequeue, it must be empty really */
1867         __skb_queue_purge(&tp->ucopy.prequeue);
1868
1869         /* Clean up a referenced TCP bind bucket. */
1870         if (inet_csk(sk)->icsk_bind_hash)
1871                 inet_put_port(sk);
1872
1873         /*
1874          * If sendmsg cached page exists, toss it.
1875          */
1876         if (sk->sk_sndmsg_page) {
1877                 __free_page(sk->sk_sndmsg_page);
1878                 sk->sk_sndmsg_page = NULL;
1879         }
1880
1881         percpu_counter_dec(&tcp_sockets_allocated);
1882 }
1883
1884 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1885
1886 #ifdef CONFIG_PROC_FS
1887 /* Proc filesystem TCP sock list dumping. */
1888
1889 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1890 {
1891         return hlist_nulls_empty(head) ? NULL :
1892                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1893 }
1894
1895 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1896 {
1897         return !is_a_nulls(tw->tw_node.next) ?
1898                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1899 }
1900
1901 static void *listening_get_next(struct seq_file *seq, void *cur)
1902 {
1903         struct inet_connection_sock *icsk;
1904         struct hlist_nulls_node *node;
1905         struct sock *sk = cur;
1906         struct inet_listen_hashbucket *ilb;
1907         struct tcp_iter_state *st = seq->private;
1908         struct net *net = seq_file_net(seq);
1909
1910         if (!sk) {
1911                 st->bucket = 0;
1912                 ilb = &tcp_hashinfo.listening_hash[0];
1913                 spin_lock_bh(&ilb->lock);
1914                 sk = sk_nulls_head(&ilb->head);
1915                 goto get_sk;
1916         }
1917         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1918         ++st->num;
1919
1920         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1921                 struct request_sock *req = cur;
1922
1923                 icsk = inet_csk(st->syn_wait_sk);
1924                 req = req->dl_next;
1925                 while (1) {
1926                         while (req) {
1927                                 if (req->rsk_ops->family == st->family) {
1928                                         cur = req;
1929                                         goto out;
1930                                 }
1931                                 req = req->dl_next;
1932                         }
1933                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1934                                 break;
1935 get_req:
1936                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1937                 }
1938                 sk        = sk_next(st->syn_wait_sk);
1939                 st->state = TCP_SEQ_STATE_LISTENING;
1940                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1941         } else {
1942                 icsk = inet_csk(sk);
1943                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1944                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1945                         goto start_req;
1946                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1947                 sk = sk_next(sk);
1948         }
1949 get_sk:
1950         sk_nulls_for_each_from(sk, node) {
1951                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1952                         cur = sk;
1953                         goto out;
1954                 }
1955                 icsk = inet_csk(sk);
1956                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1957                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1958 start_req:
1959                         st->uid         = sock_i_uid(sk);
1960                         st->syn_wait_sk = sk;
1961                         st->state       = TCP_SEQ_STATE_OPENREQ;
1962                         st->sbucket     = 0;
1963                         goto get_req;
1964                 }
1965                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1966         }
1967         spin_unlock_bh(&ilb->lock);
1968         if (++st->bucket < INET_LHTABLE_SIZE) {
1969                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1970                 spin_lock_bh(&ilb->lock);
1971                 sk = sk_nulls_head(&ilb->head);
1972                 goto get_sk;
1973         }
1974         cur = NULL;
1975 out:
1976         return cur;
1977 }
1978
1979 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1980 {
1981         void *rc = listening_get_next(seq, NULL);
1982
1983         while (rc && *pos) {
1984                 rc = listening_get_next(seq, rc);
1985                 --*pos;
1986         }
1987         return rc;
1988 }
1989
1990 static inline int empty_bucket(struct tcp_iter_state *st)
1991 {
1992         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1993                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1994 }
1995
1996 static void *established_get_first(struct seq_file *seq)
1997 {
1998         struct tcp_iter_state *st = seq->private;
1999         struct net *net = seq_file_net(seq);
2000         void *rc = NULL;
2001
2002         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2003                 struct sock *sk;
2004                 struct hlist_nulls_node *node;
2005                 struct inet_timewait_sock *tw;
2006                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2007
2008                 /* Lockless fast path for the common case of empty buckets */
2009                 if (empty_bucket(st))
2010                         continue;
2011
2012                 spin_lock_bh(lock);
2013                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2014                         if (sk->sk_family != st->family ||
2015                             !net_eq(sock_net(sk), net)) {
2016                                 continue;
2017                         }
2018                         rc = sk;
2019                         goto out;
2020                 }
2021                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2022                 inet_twsk_for_each(tw, node,
2023                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2024                         if (tw->tw_family != st->family ||
2025                             !net_eq(twsk_net(tw), net)) {
2026                                 continue;
2027                         }
2028                         rc = tw;
2029                         goto out;
2030                 }
2031                 spin_unlock_bh(lock);
2032                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2033         }
2034 out:
2035         return rc;
2036 }
2037
2038 static void *established_get_next(struct seq_file *seq, void *cur)
2039 {
2040         struct sock *sk = cur;
2041         struct inet_timewait_sock *tw;
2042         struct hlist_nulls_node *node;
2043         struct tcp_iter_state *st = seq->private;
2044         struct net *net = seq_file_net(seq);
2045
2046         ++st->num;
2047
2048         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2049                 tw = cur;
2050                 tw = tw_next(tw);
2051 get_tw:
2052                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2053                         tw = tw_next(tw);
2054                 }
2055                 if (tw) {
2056                         cur = tw;
2057                         goto out;
2058                 }
2059                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2060                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2061
2062                 /* Look for next non empty bucket */
2063                 while (++st->bucket < tcp_hashinfo.ehash_size &&
2064                                 empty_bucket(st))
2065                         ;
2066                 if (st->bucket >= tcp_hashinfo.ehash_size)
2067                         return NULL;
2068
2069                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2070                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2071         } else
2072                 sk = sk_nulls_next(sk);
2073
2074         sk_nulls_for_each_from(sk, node) {
2075                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2076                         goto found;
2077         }
2078
2079         st->state = TCP_SEQ_STATE_TIME_WAIT;
2080         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2081         goto get_tw;
2082 found:
2083         cur = sk;
2084 out:
2085         return cur;
2086 }
2087
2088 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2089 {
2090         void *rc = established_get_first(seq);
2091
2092         while (rc && pos) {
2093                 rc = established_get_next(seq, rc);
2094                 --pos;
2095         }
2096         return rc;
2097 }
2098
2099 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2100 {
2101         void *rc;
2102         struct tcp_iter_state *st = seq->private;
2103
2104         st->state = TCP_SEQ_STATE_LISTENING;
2105         rc        = listening_get_idx(seq, &pos);
2106
2107         if (!rc) {
2108                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2109                 rc        = established_get_idx(seq, pos);
2110         }
2111
2112         return rc;
2113 }
2114
2115 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2116 {
2117         struct tcp_iter_state *st = seq->private;
2118         st->state = TCP_SEQ_STATE_LISTENING;
2119         st->num = 0;
2120         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2121 }
2122
2123 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2124 {
2125         void *rc = NULL;
2126         struct tcp_iter_state *st;
2127
2128         if (v == SEQ_START_TOKEN) {
2129                 rc = tcp_get_idx(seq, 0);
2130                 goto out;
2131         }
2132         st = seq->private;
2133
2134         switch (st->state) {
2135         case TCP_SEQ_STATE_OPENREQ:
2136         case TCP_SEQ_STATE_LISTENING:
2137                 rc = listening_get_next(seq, v);
2138                 if (!rc) {
2139                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2140                         rc        = established_get_first(seq);
2141                 }
2142                 break;
2143         case TCP_SEQ_STATE_ESTABLISHED:
2144         case TCP_SEQ_STATE_TIME_WAIT:
2145                 rc = established_get_next(seq, v);
2146                 break;
2147         }
2148 out:
2149         ++*pos;
2150         return rc;
2151 }
2152
2153 static void tcp_seq_stop(struct seq_file *seq, void *v)
2154 {
2155         struct tcp_iter_state *st = seq->private;
2156
2157         switch (st->state) {
2158         case TCP_SEQ_STATE_OPENREQ:
2159                 if (v) {
2160                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2161                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2162                 }
2163         case TCP_SEQ_STATE_LISTENING:
2164                 if (v != SEQ_START_TOKEN)
2165                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2166                 break;
2167         case TCP_SEQ_STATE_TIME_WAIT:
2168         case TCP_SEQ_STATE_ESTABLISHED:
2169                 if (v)
2170                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2171                 break;
2172         }
2173 }
2174
2175 static int tcp_seq_open(struct inode *inode, struct file *file)
2176 {
2177         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2178         struct tcp_iter_state *s;
2179         int err;
2180
2181         err = seq_open_net(inode, file, &afinfo->seq_ops,
2182                           sizeof(struct tcp_iter_state));
2183         if (err < 0)
2184                 return err;
2185
2186         s = ((struct seq_file *)file->private_data)->private;
2187         s->family               = afinfo->family;
2188         return 0;
2189 }
2190
2191 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2192 {
2193         int rc = 0;
2194         struct proc_dir_entry *p;
2195
2196         afinfo->seq_fops.open           = tcp_seq_open;
2197         afinfo->seq_fops.read           = seq_read;
2198         afinfo->seq_fops.llseek         = seq_lseek;
2199         afinfo->seq_fops.release        = seq_release_net;
2200
2201         afinfo->seq_ops.start           = tcp_seq_start;
2202         afinfo->seq_ops.next            = tcp_seq_next;
2203         afinfo->seq_ops.stop            = tcp_seq_stop;
2204
2205         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2206                              &afinfo->seq_fops, afinfo);
2207         if (!p)
2208                 rc = -ENOMEM;
2209         return rc;
2210 }
2211
2212 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2213 {
2214         proc_net_remove(net, afinfo->name);
2215 }
2216
2217 static void get_openreq4(struct sock *sk, struct request_sock *req,
2218                          struct seq_file *f, int i, int uid, int *len)
2219 {
2220         const struct inet_request_sock *ireq = inet_rsk(req);
2221         int ttd = req->expires - jiffies;
2222
2223         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2224                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2225                 i,
2226                 ireq->loc_addr,
2227                 ntohs(inet_sk(sk)->sport),
2228                 ireq->rmt_addr,
2229                 ntohs(ireq->rmt_port),
2230                 TCP_SYN_RECV,
2231                 0, 0, /* could print option size, but that is af dependent. */
2232                 1,    /* timers active (only the expire timer) */
2233                 jiffies_to_clock_t(ttd),
2234                 req->retrans,
2235                 uid,
2236                 0,  /* non standard timer */
2237                 0, /* open_requests have no inode */
2238                 atomic_read(&sk->sk_refcnt),
2239                 req,
2240                 len);
2241 }
2242
2243 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2244 {
2245         int timer_active;
2246         unsigned long timer_expires;
2247         struct tcp_sock *tp = tcp_sk(sk);
2248         const struct inet_connection_sock *icsk = inet_csk(sk);
2249         struct inet_sock *inet = inet_sk(sk);
2250         __be32 dest = inet->daddr;
2251         __be32 src = inet->rcv_saddr;
2252         __u16 destp = ntohs(inet->dport);
2253         __u16 srcp = ntohs(inet->sport);
2254
2255         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2256                 timer_active    = 1;
2257                 timer_expires   = icsk->icsk_timeout;
2258         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2259                 timer_active    = 4;
2260                 timer_expires   = icsk->icsk_timeout;
2261         } else if (timer_pending(&sk->sk_timer)) {
2262                 timer_active    = 2;
2263                 timer_expires   = sk->sk_timer.expires;
2264         } else {
2265                 timer_active    = 0;
2266                 timer_expires = jiffies;
2267         }
2268
2269         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2270                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2271                 i, src, srcp, dest, destp, sk->sk_state,
2272                 tp->write_seq - tp->snd_una,
2273                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2274                                              (tp->rcv_nxt - tp->copied_seq),
2275                 timer_active,
2276                 jiffies_to_clock_t(timer_expires - jiffies),
2277                 icsk->icsk_retransmits,
2278                 sock_i_uid(sk),
2279                 icsk->icsk_probes_out,
2280                 sock_i_ino(sk),
2281                 atomic_read(&sk->sk_refcnt), sk,
2282                 jiffies_to_clock_t(icsk->icsk_rto),
2283                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2284                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2285                 tp->snd_cwnd,
2286                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2287                 len);
2288 }
2289
2290 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2291                                struct seq_file *f, int i, int *len)
2292 {
2293         __be32 dest, src;
2294         __u16 destp, srcp;
2295         int ttd = tw->tw_ttd - jiffies;
2296
2297         if (ttd < 0)
2298                 ttd = 0;
2299
2300         dest  = tw->tw_daddr;
2301         src   = tw->tw_rcv_saddr;
2302         destp = ntohs(tw->tw_dport);
2303         srcp  = ntohs(tw->tw_sport);
2304
2305         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2306                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2307                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2308                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2309                 atomic_read(&tw->tw_refcnt), tw, len);
2310 }
2311
2312 #define TMPSZ 150
2313
2314 static int tcp4_seq_show(struct seq_file *seq, void *v)
2315 {
2316         struct tcp_iter_state *st;
2317         int len;
2318
2319         if (v == SEQ_START_TOKEN) {
2320                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2321                            "  sl  local_address rem_address   st tx_queue "
2322                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2323                            "inode");
2324                 goto out;
2325         }
2326         st = seq->private;
2327
2328         switch (st->state) {
2329         case TCP_SEQ_STATE_LISTENING:
2330         case TCP_SEQ_STATE_ESTABLISHED:
2331                 get_tcp4_sock(v, seq, st->num, &len);
2332                 break;
2333         case TCP_SEQ_STATE_OPENREQ:
2334                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2335                 break;
2336         case TCP_SEQ_STATE_TIME_WAIT:
2337                 get_timewait4_sock(v, seq, st->num, &len);
2338                 break;
2339         }
2340         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2341 out:
2342         return 0;
2343 }
2344
2345 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2346         .name           = "tcp",
2347         .family         = AF_INET,
2348         .seq_fops       = {
2349                 .owner          = THIS_MODULE,
2350         },
2351         .seq_ops        = {
2352                 .show           = tcp4_seq_show,
2353         },
2354 };
2355
2356 static int tcp4_proc_init_net(struct net *net)
2357 {
2358         return tcp_proc_register(net, &tcp4_seq_afinfo);
2359 }
2360
2361 static void tcp4_proc_exit_net(struct net *net)
2362 {
2363         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2364 }
2365
2366 static struct pernet_operations tcp4_net_ops = {
2367         .init = tcp4_proc_init_net,
2368         .exit = tcp4_proc_exit_net,
2369 };
2370
2371 int __init tcp4_proc_init(void)
2372 {
2373         return register_pernet_subsys(&tcp4_net_ops);
2374 }
2375
2376 void tcp4_proc_exit(void)
2377 {
2378         unregister_pernet_subsys(&tcp4_net_ops);
2379 }
2380 #endif /* CONFIG_PROC_FS */
2381
2382 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2383 {
2384         struct iphdr *iph = skb_gro_network_header(skb);
2385
2386         switch (skb->ip_summed) {
2387         case CHECKSUM_COMPLETE:
2388                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2389                                   skb->csum)) {
2390                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2391                         break;
2392                 }
2393
2394                 /* fall through */
2395         case CHECKSUM_NONE:
2396                 NAPI_GRO_CB(skb)->flush = 1;
2397                 return NULL;
2398         }
2399
2400         return tcp_gro_receive(head, skb);
2401 }
2402 EXPORT_SYMBOL(tcp4_gro_receive);
2403
2404 int tcp4_gro_complete(struct sk_buff *skb)
2405 {
2406         struct iphdr *iph = ip_hdr(skb);
2407         struct tcphdr *th = tcp_hdr(skb);
2408
2409         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2410                                   iph->saddr, iph->daddr, 0);
2411         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2412
2413         return tcp_gro_complete(skb);
2414 }
2415 EXPORT_SYMBOL(tcp4_gro_complete);
2416
2417 struct proto tcp_prot = {
2418         .name                   = "TCP",
2419         .owner                  = THIS_MODULE,
2420         .close                  = tcp_close,
2421         .connect                = tcp_v4_connect,
2422         .disconnect             = tcp_disconnect,
2423         .accept                 = inet_csk_accept,
2424         .ioctl                  = tcp_ioctl,
2425         .init                   = tcp_v4_init_sock,
2426         .destroy                = tcp_v4_destroy_sock,
2427         .shutdown               = tcp_shutdown,
2428         .setsockopt             = tcp_setsockopt,
2429         .getsockopt             = tcp_getsockopt,
2430         .recvmsg                = tcp_recvmsg,
2431         .backlog_rcv            = tcp_v4_do_rcv,
2432         .hash                   = inet_hash,
2433         .unhash                 = inet_unhash,
2434         .get_port               = inet_csk_get_port,
2435         .enter_memory_pressure  = tcp_enter_memory_pressure,
2436         .sockets_allocated      = &tcp_sockets_allocated,
2437         .orphan_count           = &tcp_orphan_count,
2438         .memory_allocated       = &tcp_memory_allocated,
2439         .memory_pressure        = &tcp_memory_pressure,
2440         .sysctl_mem             = sysctl_tcp_mem,
2441         .sysctl_wmem            = sysctl_tcp_wmem,
2442         .sysctl_rmem            = sysctl_tcp_rmem,
2443         .max_header             = MAX_TCP_HEADER,
2444         .obj_size               = sizeof(struct tcp_sock),
2445         .slab_flags             = SLAB_DESTROY_BY_RCU,
2446         .twsk_prot              = &tcp_timewait_sock_ops,
2447         .rsk_prot               = &tcp_request_sock_ops,
2448         .h.hashinfo             = &tcp_hashinfo,
2449 #ifdef CONFIG_COMPAT
2450         .compat_setsockopt      = compat_tcp_setsockopt,
2451         .compat_getsockopt      = compat_tcp_getsockopt,
2452 #endif
2453 };
2454
2455
2456 static int __net_init tcp_sk_init(struct net *net)
2457 {
2458         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2459                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2460 }
2461
2462 static void __net_exit tcp_sk_exit(struct net *net)
2463 {
2464         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2465         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2466 }
2467
2468 static struct pernet_operations __net_initdata tcp_sk_ops = {
2469        .init = tcp_sk_init,
2470        .exit = tcp_sk_exit,
2471 };
2472
2473 void __init tcp_v4_init(void)
2474 {
2475         inet_hashinfo_init(&tcp_hashinfo);
2476         if (register_pernet_subsys(&tcp_sk_ops))
2477                 panic("Failed to create the TCP control socket.\n");
2478 }
2479
/* Data objects exported from this file. */
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(sysctl_tcp_low_latency);

/* IPv4 TCP entry points exported from this file. */
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_remember_stamp);

#ifdef CONFIG_PROC_FS
/* Proc registration helpers. */
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
2495