net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75 #include <net/secure_seq.h>
  76
  77 #include <linux/inet.h>
  78 #include <linux/ipv6.h>
  79 #include <linux/stddef.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/seq_file.h>
  82
  83 #include <linux/crypto.h>
  84 #include <linux/scatterlist.h>
  85
  86 int sysctl_tcp_tw_reuse __read_mostly;     /* non-zero lets tcp_twsk_unique() reuse a TIME-WAIT entry last stamped more than 1s ago */
  87 int sysctl_tcp_low_latency __read_mostly;  /* not read in this part of the file; exported for users elsewhere */
  88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  89
  90
  91 #ifdef CONFIG_TCP_MD5SIG
     /* Forward declarations of the MD5 signature helpers.  They are called
      * further down (tcp_v4_send_reset(), tcp_v4_send_ack(),
      * tcp_v4_reqsk_send_ack()) before any definition appears in this part
      * of the file.
      */
  92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  93                                                    __be32 addr);
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  96 #else
     /* Without CONFIG_TCP_MD5SIG the key lookup is a stub that never finds a
      * key, so callers can use it unconditionally.
      */
  97 static inline
  98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  99 {
 100         return NULL;
 101 }
 102 #endif
 103
 104 struct inet_hashinfo tcp_hashinfo;         /* hash info handed to inet_lookup() by tcp_v4_err() */
 105 EXPORT_SYMBOL(tcp_hashinfo);
 106
 107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 108 {
 109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 110                                           ip_hdr(skb)->saddr,
 111                                           tcp_hdr(skb)->dest,
 112                                           tcp_hdr(skb)->source);
 113 }
 114
 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 116 {
 117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 118         struct tcp_sock *tp = tcp_sk(sk);
 119
 120         /* With PAWS, it is safe from the viewpoint
 121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 123
 124            Actually, the idea is close to VJ's one, only timestamp cache is
 125            held not per host, but per port pair and TW bucket is used as state
 126            holder.
 127
 128            If TW bucket has been already destroyed we fall back to VJ's scheme
 129            and use initial timestamp retrieved from peer table.
 130          */
 131         if (tcptw->tw_ts_recent_stamp &&
 132             (twp == NULL || (sysctl_tcp_tw_reuse &&
 133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 135                 if (tp->write_seq == 0)
 136                         tp->write_seq = 1;
 137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 139                 sock_hold(sktw);
 140                 return 1;
 141         }
 142
 143         return 0;
 144 }
 145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147 /* This will initiate an outgoing connection.
      *
      * @sk:       socket to connect
      * @uaddr:    destination, interpreted as a struct sockaddr_in
      * @addr_len: length of @uaddr as supplied by the caller
      *
      * Returns 0 once tcp_connect() has succeeded.  Returns -EINVAL when
      * @addr_len is too short or a source-route option is set together with a
      * zero destination, -EAFNOSUPPORT when the family is not AF_INET,
      * -ENETUNREACH when the route is multicast/broadcast, and otherwise the
      * error reported by ip_route_connect(), inet_hash_connect(),
      * ip_route_newports() or tcp_connect().
      *
      * Failures after the switch to TCP_SYN_SENT go through the "failure"
      * label, which puts the socket back into TCP_CLOSE and clears
      * sk_route_caps and inet_dport.
      */
 148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149 {
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct inet_sock *inet = inet_sk(sk);
 152         struct tcp_sock *tp = tcp_sk(sk);
 153         __be16 orig_sport, orig_dport;
 154         __be32 daddr, nexthop;
 155         struct flowi4 *fl4;
 156         struct rtable *rt;
 157         int err;
 158         struct ip_options_rcu *inet_opt;
 159
 160         if (addr_len < sizeof(struct sockaddr_in))
 161                 return -EINVAL;
 162
 163         if (usin->sin_family != AF_INET)
 164                 return -EAFNOSUPPORT;
 165
 166         nexthop = daddr = usin->sin_addr.s_addr;
 167         inet_opt = rcu_dereference_protected(inet->inet_opt,
 168                                              sock_owned_by_user(sk));
             /* With a source-route option the route lookup targets opt.faddr
              * instead of the user-supplied address.
              */
 169         if (inet_opt && inet_opt->opt.srr) {
 170                 if (!daddr)
 171                         return -EINVAL;
 172                 nexthop = inet_opt->opt.faddr;
 173         }
 174
             /* Remember the ports used for the first lookup; they are handed
              * to ip_route_newports() once the source port is final.
              */
 175         orig_sport = inet->inet_sport;
 176         orig_dport = usin->sin_port;
 177         fl4 = &inet->cork.fl.u.ip4;
 178         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 179                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 180                               IPPROTO_TCP,
 181                               orig_sport, orig_dport, sk, true);
 182         if (IS_ERR(rt)) {
 183                 err = PTR_ERR(rt);
 184                 if (err == -ENETUNREACH)
 185                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 186                 return err;
 187         }
 188
 189         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 190                 ip_rt_put(rt);
 191                 return -ENETUNREACH;
 192         }
 193
             /* Without source routing, use the destination recorded in the
              * flow by the lookup.
              */
 194         if (!inet_opt || !inet_opt->opt.srr)
 195                 daddr = fl4->daddr;
 196
             /* An unset source address is filled in from the flow. */
 197         if (!inet->inet_saddr)
 198                 inet->inet_saddr = fl4->saddr;
 199         inet->inet_rcv_saddr = inet->inet_saddr;
 200
 201         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 202                 /* Reset inherited state */
 203                 tp->rx_opt.ts_recent       = 0;
 204                 tp->rx_opt.ts_recent_stamp = 0;
 205                 tp->write_seq              = 0;
 206         }
 207
 208         if (tcp_death_row.sysctl_tw_recycle &&
 209             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
 210                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 211                 /*
 212                  * VJ's idea. We save last timestamp seen from
 213                  * the destination in peer table, when entering state
 214                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 215                  * when trying new connection.
 216                  */
 217                 if (peer) {
 218                         inet_peer_refcheck(peer);
                             /* Only a stamp no older than TCP_PAWS_MSL is used. */
 219                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 220                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 221                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 222                         }
 223                 }
 224         }
 225
 226         inet->inet_dport = usin->sin_port;
 227         inet->inet_daddr = daddr;
 228
             /* Extension header length is the IP option length, or 0. */
 229         inet_csk(sk)->icsk_ext_hdr_len = 0;
 230         if (inet_opt)
 231                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 232
 233         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 234
 235         /* Socket identity is still unknown (sport may be zero).
 236          * However we set state to SYN-SENT and not releasing socket
 237          * lock select source port, enter ourselves into the hash tables and
 238          * complete initialization after this.
 239          */
 240         tcp_set_state(sk, TCP_SYN_SENT);
 241         err = inet_hash_connect(&tcp_death_row, sk);
 242         if (err)
 243                 goto failure;
 244
             /* Redo the route with the final ports.  On error rt is set to
              * NULL before jumping, so the failure path passes NULL to
              * ip_rt_put().
              */
 245         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 246                                inet->inet_sport, inet->inet_dport, sk);
 247         if (IS_ERR(rt)) {
 248                 err = PTR_ERR(rt);
 249                 rt = NULL;
 250                 goto failure;
 251         }
 252         /* OK, now commit destination to socket.  */
 253         sk->sk_gso_type = SKB_GSO_TCPV4;
 254         sk_setup_caps(sk, &rt->dst);
 255
             /* write_seq may already have been seeded (e.g. by
              * tcp_twsk_unique()); only generate one when it is still 0.
              */
 256         if (!tp->write_seq)
 257                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 258                                                            inet->inet_daddr,
 259                                                            inet->inet_sport,
 260                                                            usin->sin_port);
 261
 262         inet->inet_id = tp->write_seq ^ jiffies;
 263
 264         err = tcp_connect(sk);
             /* NOTE(review): rt is cleared before the error check, presumably
              * because the socket owns the route after sk_setup_caps() and the
              * failure path must not put it again -- confirm.
              */
 265         rt = NULL;
 266         if (err)
 267                 goto failure;
 268
 269         return 0;
 270
 271 failure:
 272         /*
 273          * This unhashes the socket and releases the local port,
 274          * if necessary.
 275          */
 276         tcp_set_state(sk, TCP_CLOSE);
 277         ip_rt_put(rt);
 278         sk->sk_route_caps = 0;
 279         inet->inet_dport = 0;
 280         return err;
 281 }
 282 EXPORT_SYMBOL(tcp_v4_connect);
 283
 284 /*
 285  * This routine does path mtu discovery as defined in RFC1191.
 286  */
 287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 288 {
 289         struct dst_entry *dst;
 290         struct inet_sock *inet = inet_sk(sk);
 291
 292         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 293          * send out by Linux are always <576bytes so they should go through
 294          * unfragmented).
 295          */
 296         if (sk->sk_state == TCP_LISTEN)
 297                 return;
 298
 299         /* We don't check in the destentry if pmtu discovery is forbidden
 300          * on this route. We just assume that no packet_to_big packets
 301          * are send back when pmtu discovery is not active.
 302          * There is a small race when the user changes this flag in the
 303          * route, but I think that's acceptable.
 304          */
 305         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 306                 return;
 307
 308         dst->ops->update_pmtu(dst, mtu);
 309
 310         /* Something is about to be wrong... Remember soft error
 311          * for the case, if this connection will not able to recover.
 312          */
 313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 314                 sk->sk_err_soft = EMSGSIZE;
 315
 316         mtu = dst_mtu(dst);
 317
 318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 319             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 320                 tcp_sync_mss(sk, mtu);
 321
 322                 /* Resend the TCP packet because it's
 323                  * clear that the old packet has been
 324                  * dropped. This is the new "fast" path mtu
 325                  * discovery.
 326                  */
 327                 tcp_simple_retransmit(sk);
 328         } /* else let the usual retransmit timer handle it */
 329 }
 330
 331 /*
 332  * This routine is called by the ICMP module when it gets some
 333  * sort of error condition.  If err < 0 then the socket should
 334  * be closed and the error returned to the user.  If err > 0
 335  * it's just the icmp type << 8 | icmp code.  After adjustment
 336  * header points to the first 8 bytes of the tcp header.  We need
 337  * to find the appropriate port.
 338  *
 339  * The locking strategy used here is very "optimistic". When
 340  * someone else accesses the socket the ICMP is just dropped
 341  * and for some paths there is no check at all.
 342  * A more general error queue to queue errors for later handling
 343  * is probably better.
 344  *
      * NOTE(review): the function has no "err" parameter; the sentences about
      * err < 0 / err > 0 above look stale.
      *
      * @icmp_skb: ICMP error packet; its data starts at the IP header of the
      *            offending packet, followed by the start of its TCP header.
      * @info:     only used for ICMP_FRAG_NEEDED, where it is passed to
      *            do_pmtu_discovery() as the MTU.
 345  */
 346
 347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 348 {
 349         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 350         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 351         struct inet_connection_sock *icsk;
 352         struct tcp_sock *tp;
 353         struct inet_sock *inet;
 354         const int type = icmp_hdr(icmp_skb)->type;
 355         const int code = icmp_hdr(icmp_skb)->code;
 356         struct sock *sk;
 357         struct sk_buff *skb;
 358         __u32 seq;
 359         __u32 remaining;
 360         int err;
 361         struct net *net = dev_net(icmp_skb->dev);
 362
             /* Need the embedded IP header plus 8 bytes of TCP header. */
 363         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 365                 return;
 366         }
 367
             /* Look the socket up using the addresses and ports of the embedded
              * (offending) packet.
              */
 368         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 369                         iph->saddr, th->source, inet_iif(icmp_skb));
 370         if (!sk) {
 371                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 372                 return;
 373         }
             /* TIME-WAIT entries only need their reference dropped. */
 374         if (sk->sk_state == TCP_TIME_WAIT) {
 375                 inet_twsk_put(inet_twsk(sk));
 376                 return;
 377         }
 378
 379         bh_lock_sock(sk);
 380         /* If too many ICMPs get dropped on busy
 381          * servers this needs to be solved differently.
 382          */
             /* Only the counter is bumped here; the paths below re-check
              * sock_owned_by_user() before touching socket state.
              */
 383         if (sock_owned_by_user(sk))
 384                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 385
 386         if (sk->sk_state == TCP_CLOSE)
 387                 goto out;
 388
             /* The TTL compared against min_ttl is the one in the embedded
              * IP header.
              */
 389         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 390                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 391                 goto out;
 392         }
 393
 394         icsk = inet_csk(sk);
 395         tp = tcp_sk(sk);
 396         seq = ntohl(th->seq);
             /* Except for listeners, the quoted sequence number must lie
              * between snd_una and snd_nxt.
              */
 397         if (sk->sk_state != TCP_LISTEN &&
 398             !between(seq, tp->snd_una, tp->snd_nxt)) {
 399                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 400                 goto out;
 401         }
 402
             /* Map the ICMP type/code to an errno in err, or leave via "out"
              * for messages that are ignored or fully handled here.
              */
 403         switch (type) {
 404         case ICMP_SOURCE_QUENCH:
 405                 /* Just silently ignore these. */
 406                 goto out;
 407         case ICMP_PARAMETERPROB:
 408                 err = EPROTO;
 409                 break;
 410         case ICMP_DEST_UNREACH:
 411                 if (code > NR_ICMP_UNREACH)
 412                         goto out;
 413
 414                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 415                         if (!sock_owned_by_user(sk))
 416                                 do_pmtu_discovery(sk, iph, info);
 417                         goto out;
 418                 }
 419
 420                 err = icmp_err_convert[code].errno;
 421                 /* check if icmp_skb allows revert of backoff
 422                  * (see draft-zimmermann-tcp-lcd) */
 423                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 424                         break;
 425                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 426                     !icsk->icsk_backoff)
 427                         break;
 428
 429                 if (sock_owned_by_user(sk))
 430                         break;
 431
                     /* Undo one backoff step and recompute the RTO from it. */
 432                 icsk->icsk_backoff--;
 433                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 434                                          icsk->icsk_backoff;
 435                 tcp_bound_rto(sk);
 436
 437                 skb = tcp_write_queue_head(sk);
 438                 BUG_ON(!skb);
 439
                     /* Time left of the new RTO, measured from the ->when stamp
                      * of the head of the write queue; clamped at 0 by min().
                      */
 440                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 441                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 442
 443                 if (remaining) {
 444                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 445                                                   remaining, TCP_RTO_MAX);
 446                 } else {
 447                         /* RTO revert clocked out retransmission.
 448                          * Will retransmit now */
 449                         tcp_retransmit_timer(sk);
 450                 }
 451
 452                 break;
 453         case ICMP_TIME_EXCEEDED:
 454                 err = EHOSTUNREACH;
 455                 break;
 456         default:
 457                 goto out;
 458         }
 459
             /* States handled specially; every other state falls through to
              * the code after the switch.
              */
 460         switch (sk->sk_state) {
 461                 struct request_sock *req, **prev;
 462         case TCP_LISTEN:
 463                 if (sock_owned_by_user(sk))
 464                         goto out;
 465
 466                 req = inet_csk_search_req(sk, &prev, th->dest,
 467                                           iph->daddr, iph->saddr);
 468                 if (!req)
 469                         goto out;
 470
 471                 /* ICMPs are not backlogged, hence we cannot get
 472                    an established socket here.
 473                  */
 474                 WARN_ON(req->sk);
 475
                     /* The quoted sequence number must be the ISN we sent. */
 476                 if (seq != tcp_rsk(req)->snt_isn) {
 477                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 478                         goto out;
 479                 }
 480
 481                 /*
 482                  * Still in SYN_RECV, just remove it silently.
 483                  * There is no good way to pass the error to the newly
 484                  * created socket, and POSIX does not want network
 485                  * errors returned from accept().
 486                  */
 487                 inet_csk_reqsk_queue_drop(sk, req, prev);
 488                 goto out;
 489
 490         case TCP_SYN_SENT:
 491         case TCP_SYN_RECV:  /* Cannot happen.
 492                                It can f.e. if SYNs crossed.
 493                              */
                     /* Hard error if nobody owns the socket: report it and
                      * finish the connection; otherwise only record a soft
                      * error.
                      */
 494                 if (!sock_owned_by_user(sk)) {
 495                         sk->sk_err = err;
 496
 497                         sk->sk_error_report(sk);
 498
 499                         tcp_done(sk);
 500                 } else {
 501                         sk->sk_err_soft = err;
 502                 }
 503                 goto out;
 504         }
 505
 506         /* If we've already connected we will keep trying
 507          * until we time out, or the user gives up.
 508          *
 509          * rfc1122 4.2.3.9 allows to consider as hard errors
 510          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 511          * but it is obsoleted by pmtu discovery).
 512          *
 513          * Note, that in modern internet, where routing is unreliable
 514          * and in each dark corner broken firewalls sit, sending random
 515          * errors ordered by their masters even this two messages finally lose
 516          * their original sense (even Linux sends invalid PORT_UNREACHs)
 517          *
 518          * Now we are in compliance with RFCs.
 519          *                                                      --ANK (980905)
 520          */
 521
 522         inet = inet_sk(sk);
 523         if (!sock_owned_by_user(sk) && inet->recverr) {
 524                 sk->sk_err = err;
 525                 sk->sk_error_report(sk);
 526         } else  { /* Only an error on timeout */
 527                 sk->sk_err_soft = err;
 528         }
 529
 530 out:
             /* Common exit: release the bh lock and drop the reference that
              * was released via inet_twsk_put() on the TIME-WAIT path above.
              */
 531         bh_unlock_sock(sk);
 532         sock_put(sk);
 533 }
 534
 535 static void __tcp_v4_send_check(struct sk_buff *skb,
 536                                 __be32 saddr, __be32 daddr)
 537 {
 538         struct tcphdr *th = tcp_hdr(skb);
 539
 540         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 541                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 542                 skb->csum_start = skb_transport_header(skb) - skb->head;
 543                 skb->csum_offset = offsetof(struct tcphdr, check);
 544         } else {
 545                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 546                                          csum_partial(th,
 547                                                       th->doff << 2,
 548                                                       skb->csum));
 549         }
 550 }
 551
 552 /* This routine computes an IPv4 TCP checksum. */
 553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556
 557         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 558 }
 559 EXPORT_SYMBOL(tcp_v4_send_check);
 560
 561 int tcp_v4_gso_send_check(struct sk_buff *skb)
 562 {
 563         const struct iphdr *iph;
 564         struct tcphdr *th;
 565
 566         if (!pskb_may_pull(skb, sizeof(*th)))
 567                 return -EINVAL;
 568
 569         iph = ip_hdr(skb);
 570         th = tcp_hdr(skb);
 571
 572         th->check = 0;
 573         skb->ip_summed = CHECKSUM_PARTIAL;
 574         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 575         return 0;
 576 }
 577
 578 /*
 579  *      This routine will send an RST to the other tcp.
 580  *
 581  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 582  *                    for reset.
 583  *      Answer: if a packet caused RST, it is not for a socket
 584  *              existing in our system, if it is matched to a socket,
 585  *              it is just duplicate segment or bug in other side's TCP.
 586  *              So that we build reply only basing on parameters
 587  *              arrived with segment.
 588  *      Exception: precedence violation. We do not implement it in any case.
 589  */
 590
 591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 592 {
 593         struct tcphdr *th = tcp_hdr(skb);
 594         struct {
 595                 struct tcphdr th;
 596 #ifdef CONFIG_TCP_MD5SIG
 597                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 598 #endif
 599         } rep;
 600         struct ip_reply_arg arg;
 601 #ifdef CONFIG_TCP_MD5SIG
 602         struct tcp_md5sig_key *key;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 611                 return;
 612
 613         /* Swap the send and the receive. */
 614         memset(&rep, 0, sizeof(rep));
 615         rep.th.dest   = th->source;
 616         rep.th.source = th->dest;
 617         rep.th.doff   = sizeof(struct tcphdr) / 4;
 618         rep.th.rst    = 1;
 619
 620         if (th->ack) {
 621                 rep.th.seq = th->ack_seq;
 622         } else {
 623                 rep.th.ack = 1;
 624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 625                                        skb->len - (th->doff << 2));
 626         }
 627
 628         memset(&arg, 0, sizeof(arg));
 629         arg.iov[0].iov_base = (unsigned char *)&rep;
 630         arg.iov[0].iov_len  = sizeof(rep.th);
 631
 632 #ifdef CONFIG_TCP_MD5SIG
 633         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
 634         if (key) {
 635                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 636                                    (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_MD5SIG << 8) |
 638                                    TCPOLEN_MD5SIG);
 639                 /* Update length and the length the header thinks exists */
 640                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 641                 rep.th.doff = arg.iov[0].iov_len / 4;
 642
 643                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 644                                      key, ip_hdr(skb)->saddr,
 645                                      ip_hdr(skb)->daddr, &rep.th);
 646         }
 647 #endif
 648         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 649                                       ip_hdr(skb)->saddr, /* XXX */
 650                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 651         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 652         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 653         /* When socket is gone, all binding information is lost.
 654          * routing might fail in this case. using iif for oif to
 655          * make sure we can deliver it
 656          */
 657         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
 658
 659         net = dev_net(skb_dst(skb)->dev);
 660         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 661                       &arg, arg.iov[0].iov_len);
 662
 663         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 664         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 665 }
 666
 667 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 668    outside socket context is ugly, certainly. What can I do?
 669  */
 670
 671 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 672                             u32 win, u32 ts, int oif,
 673                             struct tcp_md5sig_key *key,
 674                             int reply_flags)
 675 {
 676         struct tcphdr *th = tcp_hdr(skb);
 677         struct {
 678                 struct tcphdr th;
 679                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 680 #ifdef CONFIG_TCP_MD5SIG
 681                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 682 #endif
 683                         ];
 684         } rep;
 685         struct ip_reply_arg arg;
 686         struct net *net = dev_net(skb_dst(skb)->dev);
 687
 688         memset(&rep.th, 0, sizeof(struct tcphdr));
 689         memset(&arg, 0, sizeof(arg));
 690
 691         arg.iov[0].iov_base = (unsigned char *)&rep;
 692         arg.iov[0].iov_len  = sizeof(rep.th);
 693         if (ts) {
 694                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 695                                    (TCPOPT_TIMESTAMP << 8) |
 696                                    TCPOLEN_TIMESTAMP);
 697                 rep.opt[1] = htonl(tcp_time_stamp);
 698                 rep.opt[2] = htonl(ts);
 699                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 700         }
 701
 702         /* Swap the send and the receive. */
 703         rep.th.dest    = th->source;
 704         rep.th.source  = th->dest;
 705         rep.th.doff    = arg.iov[0].iov_len / 4;
 706         rep.th.seq     = htonl(seq);
 707         rep.th.ack_seq = htonl(ack);
 708         rep.th.ack     = 1;
 709         rep.th.window  = htons(win);
 710
 711 #ifdef CONFIG_TCP_MD5SIG
 712         if (key) {
 713                 int offset = (ts) ? 3 : 0;
 714
 715                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 716                                           (TCPOPT_NOP << 16) |
 717                                           (TCPOPT_MD5SIG << 8) |
 718                                           TCPOLEN_MD5SIG);
 719                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 720                 rep.th.doff = arg.iov[0].iov_len/4;
 721
 722                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 723                                     key, ip_hdr(skb)->saddr,
 724                                     ip_hdr(skb)->daddr, &rep.th);
 725         }
 726 #endif
 727         arg.flags = reply_flags;
 728         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 729                                       ip_hdr(skb)->saddr, /* XXX */
 730                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 731         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 732         if (oif)
 733                 arg.bound_dev_if = oif;
 734
 735         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 736                       &arg, arg.iov[0].iov_len);
 737
 738         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 739 }
 740
 741 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 742 {
 743         struct inet_timewait_sock *tw = inet_twsk(sk);
 744         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 745
 746         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 747                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 748                         tcptw->tw_ts_recent,
 749                         tw->tw_bound_dev_if,
 750                         tcp_twsk_md5_key(tcptw),
 751                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 752                         );
 753
 754         inet_twsk_put(tw);
 755 }
 756
 757 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 758                                   struct request_sock *req)
 759 {
 760         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 761                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 762                         req->ts_recent,
 763                         0,
 764                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 765                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 766 }
 767
 768 /*
 769  *      Send a SYN-ACK after having received a SYN.
 770  *      This still operates on a request_sock only, not on a big
 771  *      socket.
 772  */
 773 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 774                               struct request_sock *req,
 775                               struct request_values *rvp)
 776 {
 777         const struct inet_request_sock *ireq = inet_rsk(req);
 778         struct flowi4 fl4;
 779         int err = -1;
 780         struct sk_buff * skb;
 781
 782         /* First, grab a route. */
 783         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 784                 return -1;
 785
 786         skb = tcp_make_synack(sk, dst, req, rvp);
 787
 788         if (skb) {
 789                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 790
 791                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 792                                             ireq->rmt_addr,
 793                                             ireq->opt);
 794                 err = net_xmit_eval(err);
 795         }
 796
 797         dst_release(dst);
 798         return err;
 799 }
 800
 801 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 802                               struct request_values *rvp)
 803 {
 804         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 805         return tcp_v4_send_synack(sk, NULL, req, rvp);
 806 }
 807
 808 /*
 809  *      IPv4 request_sock destructor.
 810  */
 811 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 812 {
 813         kfree(inet_rsk(req)->opt);
 814 }
 815
 816 static void syn_flood_warning(const struct sk_buff *skb)
 817 {
 818         const char *msg;
 819
 820 #ifdef CONFIG_SYN_COOKIES
 821         if (sysctl_tcp_syncookies)
 822                 msg = "Sending cookies";
 823         else
 824 #endif
 825                 msg = "Dropping request";
 826
 827         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 828                                 ntohs(tcp_hdr(skb)->dest), msg);
 829 }
 830
 831 /*
 832  * Save and compile IPv4 options into the request_sock if needed.
 833  */
 834 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 835                                                   struct sk_buff *skb)
 836 {
 837         const struct ip_options *opt = &(IPCB(skb)->opt);
 838         struct ip_options_rcu *dopt = NULL;
 839
 840         if (opt && opt->optlen) {
 841                 int opt_size = sizeof(*dopt) + opt->optlen;
 842
 843                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 844                 if (dopt) {
 845                         if (ip_options_echo(&dopt->opt, skb)) {
 846                                 kfree(dopt);
 847                                 dopt = NULL;
 848                         }
 849                 }
 850         }
 851         return dopt;
 852 }
 853
 854 #ifdef CONFIG_TCP_MD5SIG
 855 /*
 856  * RFC2385 MD5 checksumming requires a mapping of
 857  * IP address->MD5 Key.
 858  * We need to maintain these in the sk structure.
 859  */
 860
 861 /* Find the Key structure for an address.  */
 862 static struct tcp_md5sig_key *
 863                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 864 {
 865         struct tcp_sock *tp = tcp_sk(sk);
 866         int i;
 867
 868         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 869                 return NULL;
 870         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 871                 if (tp->md5sig_info->keys4[i].addr == addr)
 872                         return &tp->md5sig_info->keys4[i].base;
 873         }
 874         return NULL;
 875 }
 876
 877 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 878                                          struct sock *addr_sk)
 879 {
 880         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 881 }
 882 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 883
 884 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 885                                                       struct request_sock *req)
 886 {
 887         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 888 }
 889
 890 /* This can be called on a newly created socket, from other files */
 891 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 892                       u8 *newkey, u8 newkeylen)
 893 {
 894         /* Add Key to the list */
 895         struct tcp_md5sig_key *key;
 896         struct tcp_sock *tp = tcp_sk(sk);
 897         struct tcp4_md5sig_key *keys;
 898
 899         key = tcp_v4_md5_do_lookup(sk, addr);
 900         if (key) {
 901                 /* Pre-existing entry - just update that one. */
 902                 kfree(key->key);
 903                 key->key = newkey;
 904                 key->keylen = newkeylen;
 905         } else {
 906                 struct tcp_md5sig_info *md5sig;
 907
 908                 if (!tp->md5sig_info) {
 909                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 910                                                   GFP_ATOMIC);
 911                         if (!tp->md5sig_info) {
 912                                 kfree(newkey);
 913                                 return -ENOMEM;
 914                         }
 915                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 916                 }
 917
 918                 md5sig = tp->md5sig_info;
 919                 if (md5sig->entries4 == 0 &&
 920                     tcp_alloc_md5sig_pool(sk) == NULL) {
 921                         kfree(newkey);
 922                         return -ENOMEM;
 923                 }
 924
 925                 if (md5sig->alloced4 == md5sig->entries4) {
 926                         keys = kmalloc((sizeof(*keys) *
 927                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 928                         if (!keys) {
 929                                 kfree(newkey);
 930                                 if (md5sig->entries4 == 0)
 931                                         tcp_free_md5sig_pool();
 932                                 return -ENOMEM;
 933                         }
 934
 935                         if (md5sig->entries4)
 936                                 memcpy(keys, md5sig->keys4,
 937                                        sizeof(*keys) * md5sig->entries4);
 938
 939                         /* Free old key list, and reference new one */
 940                         kfree(md5sig->keys4);
 941                         md5sig->keys4 = keys;
 942                         md5sig->alloced4++;
 943                 }
 944                 md5sig->entries4++;
 945                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 946                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 947                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 948         }
 949         return 0;
 950 }
 951 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 952
 953 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 954                                u8 *newkey, u8 newkeylen)
 955 {
 956         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 957                                  newkey, newkeylen);
 958 }
 959
 960 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 961 {
 962         struct tcp_sock *tp = tcp_sk(sk);
 963         int i;
 964
 965         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 966                 if (tp->md5sig_info->keys4[i].addr == addr) {
 967                         /* Free the key */
 968                         kfree(tp->md5sig_info->keys4[i].base.key);
 969                         tp->md5sig_info->entries4--;
 970
 971                         if (tp->md5sig_info->entries4 == 0) {
 972                                 kfree(tp->md5sig_info->keys4);
 973                                 tp->md5sig_info->keys4 = NULL;
 974                                 tp->md5sig_info->alloced4 = 0;
 975                                 tcp_free_md5sig_pool();
 976                         } else if (tp->md5sig_info->entries4 != i) {
 977                                 /* Need to do some manipulation */
 978                                 memmove(&tp->md5sig_info->keys4[i],
 979                                         &tp->md5sig_info->keys4[i+1],
 980                                         (tp->md5sig_info->entries4 - i) *
 981                                          sizeof(struct tcp4_md5sig_key));
 982                         }
 983                         return 0;
 984                 }
 985         }
 986         return -ENOENT;
 987 }
 988 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 989
 990 static void tcp_v4_clear_md5_list(struct sock *sk)
 991 {
 992         struct tcp_sock *tp = tcp_sk(sk);
 993
 994         /* Free each key, then the set of key keys,
 995          * the crypto element, and then decrement our
 996          * hold on the last resort crypto.
 997          */
 998         if (tp->md5sig_info->entries4) {
 999                 int i;
1000                 for (i = 0; i < tp->md5sig_info->entries4; i++)
1001                         kfree(tp->md5sig_info->keys4[i].base.key);
1002                 tp->md5sig_info->entries4 = 0;
1003                 tcp_free_md5sig_pool();
1004         }
1005         if (tp->md5sig_info->keys4) {
1006                 kfree(tp->md5sig_info->keys4);
1007                 tp->md5sig_info->keys4 = NULL;
1008                 tp->md5sig_info->alloced4  = 0;
1009         }
1010 }
1011
1012 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1013                                  int optlen)
1014 {
1015         struct tcp_md5sig cmd;
1016         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1017         u8 *newkey;
1018
1019         if (optlen < sizeof(cmd))
1020                 return -EINVAL;
1021
1022         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1023                 return -EFAULT;
1024
1025         if (sin->sin_family != AF_INET)
1026                 return -EINVAL;
1027
1028         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1029                 if (!tcp_sk(sk)->md5sig_info)
1030                         return -ENOENT;
1031                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1032         }
1033
1034         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1035                 return -EINVAL;
1036
1037         if (!tcp_sk(sk)->md5sig_info) {
1038                 struct tcp_sock *tp = tcp_sk(sk);
1039                 struct tcp_md5sig_info *p;
1040
1041                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1042                 if (!p)
1043                         return -EINVAL;
1044
1045                 tp->md5sig_info = p;
1046                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1047         }
1048
1049         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1050         if (!newkey)
1051                 return -ENOMEM;
1052         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1053                                  newkey, cmd.tcpm_keylen);
1054 }
1055
1056 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1057                                         __be32 daddr, __be32 saddr, int nbytes)
1058 {
1059         struct tcp4_pseudohdr *bp;
1060         struct scatterlist sg;
1061
1062         bp = &hp->md5_blk.ip4;
1063
1064         /*
1065          * 1. the TCP pseudo-header (in the order: source IP address,
1066          * destination IP address, zero-padded protocol number, and
1067          * segment length)
1068          */
1069         bp->saddr = saddr;
1070         bp->daddr = daddr;
1071         bp->pad = 0;
1072         bp->protocol = IPPROTO_TCP;
1073         bp->len = cpu_to_be16(nbytes);
1074
1075         sg_init_one(&sg, bp, sizeof(*bp));
1076         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1077 }
1078
1079 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1080                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1081 {
1082         struct tcp_md5sig_pool *hp;
1083         struct hash_desc *desc;
1084
1085         hp = tcp_get_md5sig_pool();
1086         if (!hp)
1087                 goto clear_hash_noput;
1088         desc = &hp->md5_desc;
1089
1090         if (crypto_hash_init(desc))
1091                 goto clear_hash;
1092         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1093                 goto clear_hash;
1094         if (tcp_md5_hash_header(hp, th))
1095                 goto clear_hash;
1096         if (tcp_md5_hash_key(hp, key))
1097                 goto clear_hash;
1098         if (crypto_hash_final(desc, md5_hash))
1099                 goto clear_hash;
1100
1101         tcp_put_md5sig_pool();
1102         return 0;
1103
1104 clear_hash:
1105         tcp_put_md5sig_pool();
1106 clear_hash_noput:
1107         memset(md5_hash, 0, 16);
1108         return 1;
1109 }
1110
1111 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1112                         struct sock *sk, struct request_sock *req,
1113                         struct sk_buff *skb)
1114 {
1115         struct tcp_md5sig_pool *hp;
1116         struct hash_desc *desc;
1117         struct tcphdr *th = tcp_hdr(skb);
1118         __be32 saddr, daddr;
1119
1120         if (sk) {
1121                 saddr = inet_sk(sk)->inet_saddr;
1122                 daddr = inet_sk(sk)->inet_daddr;
1123         } else if (req) {
1124                 saddr = inet_rsk(req)->loc_addr;
1125                 daddr = inet_rsk(req)->rmt_addr;
1126         } else {
1127                 const struct iphdr *iph = ip_hdr(skb);
1128                 saddr = iph->saddr;
1129                 daddr = iph->daddr;
1130         }
1131
1132         hp = tcp_get_md5sig_pool();
1133         if (!hp)
1134                 goto clear_hash_noput;
1135         desc = &hp->md5_desc;
1136
1137         if (crypto_hash_init(desc))
1138                 goto clear_hash;
1139
1140         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1141                 goto clear_hash;
1142         if (tcp_md5_hash_header(hp, th))
1143                 goto clear_hash;
1144         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1145                 goto clear_hash;
1146         if (tcp_md5_hash_key(hp, key))
1147                 goto clear_hash;
1148         if (crypto_hash_final(desc, md5_hash))
1149                 goto clear_hash;
1150
1151         tcp_put_md5sig_pool();
1152         return 0;
1153
1154 clear_hash:
1155         tcp_put_md5sig_pool();
1156 clear_hash_noput:
1157         memset(md5_hash, 0, 16);
1158         return 1;
1159 }
1160 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1161
1162 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1163 {
1164         /*
1165          * This gets called for each TCP segment that arrives
1166          * so we want to be efficient.
1167          * We have 3 drop cases:
1168          * o No MD5 hash and one expected.
1169          * o MD5 hash and we're not expecting one.
1170          * o MD5 hash and its wrong.
1171          */
1172         __u8 *hash_location = NULL;
1173         struct tcp_md5sig_key *hash_expected;
1174         const struct iphdr *iph = ip_hdr(skb);
1175         struct tcphdr *th = tcp_hdr(skb);
1176         int genhash;
1177         unsigned char newhash[16];
1178
1179         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1180         hash_location = tcp_parse_md5sig_option(th);
1181
1182         /* We've parsed the options - do we have a hash? */
1183         if (!hash_expected && !hash_location)
1184                 return 0;
1185
1186         if (hash_expected && !hash_location) {
1187                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1188                 return 1;
1189         }
1190
1191         if (!hash_expected && hash_location) {
1192                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1193                 return 1;
1194         }
1195
1196         /* Okay, so this is hash_expected and hash_location -
1197          * so we need to calculate the checksum.
1198          */
1199         genhash = tcp_v4_md5_hash_skb(newhash,
1200                                       hash_expected,
1201                                       NULL, NULL, skb);
1202
1203         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1204                 if (net_ratelimit()) {
1205                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1206                                &iph->saddr, ntohs(th->source),
1207                                &iph->daddr, ntohs(th->dest),
1208                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1209                 }
1210                 return 1;
1211         }
1212         return 0;
1213 }
1214
1215 #endif
1216
/*
 * Request-sock operations for IPv4 TCP.  tcp_v4_conn_request() passes this
 * table to inet_reqsk_alloc() when it creates a request for an incoming
 * SYN; obj_size is the size of a struct tcp_request_sock.
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};
1226
1227 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 hooks for request socks; tcp_v4_conn_request() installs this
 * table as tcp_rsk(req)->af_specific.
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
};
1232 #endif
1233
/*
 * Handle a connection request (SYN) received on listening socket @sk.
 *
 * On the normal path a request_sock is allocated, filled from the options
 * and addresses of @skb, answered with a SYN-ACK and added to the request
 * queue with an initial timeout of TCP_TIMEOUT_INIT.  When the request
 * queue is full and syncookies are enabled, a cookie SYN-ACK is sent
 * instead and the request_sock is freed rather than queued.
 *
 * Every path, including all drop paths, returns 0.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        /* Non-zero when the SYN hit a live timewait bucket (see below). */
        __u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
                if (net_ratelimit())
                        syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

        /*
         * Cookie option handling: taken only when the SYN carried a cookie
         * option together with a timestamp, cookies are not disabled on
         * this socket, and a cookie size is configured either globally or
         * on the socket.
         */
        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
            !tp->rx_opt.cookie_out_never &&
            (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
              tp->cookie_values->cookie_desired > 0))) {
                u8 *c;
                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                        goto drop_and_release;

                /* Secret recipe starts with IP addresses */
                *mess++ ^= (__force u32)daddr;
                *mess++ ^= (__force u32)saddr;

                /* plus variable length Initiator Cookie */
                c = (u8 *)mess;
                while (l-- > 0)
                        *c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
                want_cookie = 0;        /* not our kind of cookie */
#endif
                tmp_ext.cookie_out_never = 0; /* false */
                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
                /* redundant indications, but ensure initialization. */
                tmp_ext.cookie_out_never = 1; /* true */
                tmp_ext.cookie_plus = 0;
        } else {
                /* Cookies are mandatory on this socket but none usable came. */
                goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        /* A syncookie reply without timestamps carries no other options. */
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie || tmp_opt.tstamp_ok)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
                struct inet_peer *peer = NULL;
                struct flowi4 fl4;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
                    fl4.daddr == saddr &&
                    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
                        inet_peer_refcheck(peer);
                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        /*
         * tcp_v4_send_synack() releases dst itself, so from here on the
         * error path must skip dst_release().  A cookie request is never
         * queued: it is freed even when the SYN-ACK went out fine.
         */
        if (tcp_v4_send_synack(sk, dst, req,
                               (struct request_values *)&tmp_ext) ||
            want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
1413
1414
1415 /*
1416  * The three way handshake has completed - we got a valid synack -
1417  * now create the new socket.
1418  */
1419 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1420                                   struct request_sock *req,
1421                                   struct dst_entry *dst)
1422 {
1423         struct inet_request_sock *ireq;
1424         struct inet_sock *newinet;
1425         struct tcp_sock *newtp;
1426         struct sock *newsk;
1427 #ifdef CONFIG_TCP_MD5SIG
1428         struct tcp_md5sig_key *key;
1429 #endif
1430         struct ip_options_rcu *inet_opt;
1431
1432         if (sk_acceptq_is_full(sk))
1433                 goto exit_overflow;
1434
1435         newsk = tcp_create_openreq_child(sk, req, skb);
1436         if (!newsk)
1437                 goto exit_nonewsk;
1438
1439         newsk->sk_gso_type = SKB_GSO_TCPV4;
1440
1441         newtp                 = tcp_sk(newsk);
1442         newinet               = inet_sk(newsk);
1443         ireq                  = inet_rsk(req);
1444         newinet->inet_daddr   = ireq->rmt_addr;
1445         newinet->inet_rcv_saddr = ireq->loc_addr;
1446         newinet->inet_saddr           = ireq->loc_addr;
1447         inet_opt              = ireq->opt;
1448         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1449         ireq->opt             = NULL;
1450         newinet->mc_index     = inet_iif(skb);
1451         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1452         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1453         if (inet_opt)
1454                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1455         newinet->inet_id = newtp->write_seq ^ jiffies;
1456
1457         if (!dst) {
1458                 dst = inet_csk_route_child_sock(sk, newsk, req);
1459                 if (!dst)
1460                         goto put_and_exit;
1461         } else {
1462                 /* syncookie case : see end of cookie_v4_check() */
1463         }
1464         sk_setup_caps(newsk, dst);
1465
1466         tcp_mtup_init(newsk);
1467         tcp_sync_mss(newsk, dst_mtu(dst));
1468         newtp->advmss = dst_metric_advmss(dst);
1469         if (tcp_sk(sk)->rx_opt.user_mss &&
1470             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1471                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1472
1473         tcp_initialize_rcv_mss(newsk);
1474
1475 #ifdef CONFIG_TCP_MD5SIG
1476         /* Copy over the MD5 key from the original socket */
1477         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1478         if (key != NULL) {
1479                 /*
1480                  * We're using one, so create a matching key
1481                  * on the newsk structure. If we fail to get
1482                  * memory, then we end up not copying the key
1483                  * across. Shucks.
1484                  */
1485                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1486                 if (newkey != NULL)
1487                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1488                                           newkey, key->keylen);
1489                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1490         }
1491 #endif
1492
1493         if (__inet_inherit_port(sk, newsk) < 0)
1494                 goto put_and_exit;
1495         __inet_hash_nolisten(newsk, NULL);
1496
1497         return newsk;
1498
1499 exit_overflow:
1500         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1501 exit_nonewsk:
1502         dst_release(dst);
1503 exit:
1504         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1505         return NULL;
1506 put_and_exit:
1507         sock_put(newsk);
1508         goto exit;
1509 }
1510 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1511
1512 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1513 {
1514         struct tcphdr *th = tcp_hdr(skb);
1515         const struct iphdr *iph = ip_hdr(skb);
1516         struct sock *nsk;
1517         struct request_sock **prev;
1518         /* Find possible connection requests. */
1519         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1520                                                        iph->saddr, iph->daddr);
1521         if (req)
1522                 return tcp_check_req(sk, skb, req, prev);
1523
1524         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1525                         th->source, iph->daddr, th->dest, inet_iif(skb));
1526
1527         if (nsk) {
1528                 if (nsk->sk_state != TCP_TIME_WAIT) {
1529                         bh_lock_sock(nsk);
1530                         return nsk;
1531                 }
1532                 inet_twsk_put(inet_twsk(nsk));
1533                 return NULL;
1534         }
1535
1536 #ifdef CONFIG_SYN_COOKIES
1537         if (!th->syn)
1538                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1539 #endif
1540         return sk;
1541 }
1542
1543 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1544 {
1545         const struct iphdr *iph = ip_hdr(skb);
1546
1547         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1548                 if (!tcp_v4_check(skb->len, iph->saddr,
1549                                   iph->daddr, skb->csum)) {
1550                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1551                         return 0;
1552                 }
1553         }
1554
1555         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1556                                        skb->len, IPPROTO_TCP, 0);
1557
1558         if (skb->len <= 76) {
1559                 return __skb_checksum_complete(skb);
1560         }
1561         return 0;
1562 }
1563
1564
1565 /* The socket must have it's spinlock held when we get
1566  * here.
1567  *
1568  * We have a potential double-lock case here, so even when
1569  * doing backlog processing we use the BH locking scheme.
1570  * This is because we cannot sleep with the original spinlock
1571  * held.
1572  */
1573 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1574 {
1575         struct sock *rsk;
1576 #ifdef CONFIG_TCP_MD5SIG
1577         /*
1578          * We really want to reject the packet as early as possible
1579          * if:
1580          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1581          *  o There is an MD5 option and we're not expecting one
1582          */
1583         if (tcp_v4_inbound_md5_hash(sk, skb))
1584                 goto discard;
1585 #endif
1586
1587         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1588                 sock_rps_save_rxhash(sk, skb->rxhash);
1589                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1590                         rsk = sk;
1591                         goto reset;
1592                 }
1593                 return 0;
1594         }
1595
1596         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1597                 goto csum_err;
1598
1599         if (sk->sk_state == TCP_LISTEN) {
1600                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1601                 if (!nsk)
1602                         goto discard;
1603
1604                 if (nsk != sk) {
1605                         sock_rps_save_rxhash(nsk, skb->rxhash);
1606                         if (tcp_child_process(sk, nsk, skb)) {
1607                                 rsk = nsk;
1608                                 goto reset;
1609                         }
1610                         return 0;
1611                 }
1612         } else
1613                 sock_rps_save_rxhash(sk, skb->rxhash);
1614
1615         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1616                 rsk = sk;
1617                 goto reset;
1618         }
1619         return 0;
1620
1621 reset:
1622         tcp_v4_send_reset(rsk, skb);
1623 discard:
1624         kfree_skb(skb);
1625         /* Be careful here. If this function gets more complicated and
1626          * gcc suffers from register pressure on the x86, sk (in %ebx)
1627          * might be destroyed here. This current version compiles correctly,
1628          * but you have been warned.
1629          */
1630         return 0;
1631
1632 csum_err:
1633         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1634         goto discard;
1635 }
1636 EXPORT_SYMBOL(tcp_v4_do_rcv);
1637
1638 /*
1639  *      From tcp_input.c
1640  */
1641
/*
 * tcp_v4_rcv - receive entry point for an incoming IPv4 TCP segment.
 *
 * Validates the TCP header, fills in the TCP control block of the skb,
 * looks up the owning socket and hands the segment to it: directly or
 * via the prequeue when no user context owns the socket, via the backlog
 * otherwise.  TIME_WAIT sockets and segments without any socket are
 * handled by the labelled sections at the bottom.
 *
 * Returns the value of tcp_v4_do_rcv() when the segment is processed
 * directly; every other path returns 0.
 */
1642 int tcp_v4_rcv(struct sk_buff *skb)
1643 {
1644         const struct iphdr *iph;
1645         struct tcphdr *th;
1646         struct sock *sk;
1647         int ret;
1648         struct net *net = dev_net(skb->dev);
1649
             /* Anything that is not PACKET_HOST is dropped without being counted. */
1650         if (skb->pkt_type != PACKET_HOST)
1651                 goto discard_it;
1652
1653         /* Count it even if it's bad */
1654         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1655
1656         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1657                 goto discard_it;
1658
1659         th = tcp_hdr(skb);
1660
             /* A data offset shorter than the fixed header is counted as an error. */
1661         if (th->doff < sizeof(struct tcphdr) / 4)
1662                 goto bad_packet;
1663         if (!pskb_may_pull(skb, th->doff * 4))
1664                 goto discard_it;
1665
1666         /* An explanation is required here, I think.
1667          * Packet length and doff are validated by header prediction,
1668          * provided case of th->doff==0 is eliminated.
1669          * So, we defer the checks. */
1670         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1671                 goto bad_packet;
1672
             /* Re-read the header pointers and fill in the skb's TCP control block. */
1673         th = tcp_hdr(skb);
1674         iph = ip_hdr(skb);
1675         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1676         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1677                                     skb->len - th->doff * 4);
1678         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1679         TCP_SKB_CB(skb)->when    = 0;
1680         TCP_SKB_CB(skb)->flags   = iph->tos;
1681         TCP_SKB_CB(skb)->sacked  = 0;
1682
1683         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1684         if (!sk)
1685                 goto no_tcp_socket;
1686
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
1687 process:
1688         if (sk->sk_state == TCP_TIME_WAIT)
1689                 goto do_time_wait;
1690
             /* Drop segments whose TTL is below the socket's min_ttl. */
1691         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1692                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1693                 goto discard_and_relse;
1694         }
1695
1696         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1697                 goto discard_and_relse;
1698         nf_reset(skb);
1699
1700         if (sk_filter(sk, skb))
1701                 goto discard_and_relse;
1702
1703         skb->dev = NULL;
1704
             /*
              * With the socket BH-locked: if no user context owns it, process
              * the segment now (or leave it on the prequeue); otherwise put it
              * on the backlog and drop it if the backlog refuses it.
              */
1705         bh_lock_sock_nested(sk);
1706         ret = 0;
1707         if (!sock_owned_by_user(sk)) {
1708 #ifdef CONFIG_NET_DMA
1709                 struct tcp_sock *tp = tcp_sk(sk);
1710                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1711                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1712                 if (tp->ucopy.dma_chan)
1713                         ret = tcp_v4_do_rcv(sk, skb);
1714                 else
1715 #endif
1716                 {
1717                         if (!tcp_prequeue(sk, skb))
1718                                 ret = tcp_v4_do_rcv(sk, skb);
1719                 }
1720         } else if (unlikely(sk_add_backlog(sk, skb))) {
1721                 bh_unlock_sock(sk);
1722                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1723                 goto discard_and_relse;
1724         }
1725         bh_unlock_sock(sk);
1726
1727         sock_put(sk);
1728
1729         return ret;
1730
             /*
              * Reached when the lookup found no socket, or when TIME_WAIT
              * processing returned TCP_TW_RST.  Truncated or bad-checksum
              * segments are only counted; anything else is answered with a
              * reset.  bad_packet is also jumped to from the header checks above.
              */
1731 no_tcp_socket:
1732         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1733                 goto discard_it;
1734
1735         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1736 bad_packet:
1737                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1738         } else {
1739                 tcp_v4_send_reset(NULL, skb);
1740         }
1741
1742 discard_it:
1743         /* Discard frame. */
1744         kfree_skb(skb);
1745         return 0;
1746
             /* Release the socket reference, then free the skb. */
1747 discard_and_relse:
1748         sock_put(sk);
1749         goto discard_it;
1750
             /*
              * sk is a TIME_WAIT socket here; it is released with
              * inet_twsk_put() instead of sock_put().
              */
1751 do_time_wait:
1752         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1753                 inet_twsk_put(inet_twsk(sk));
1754                 goto discard_it;
1755         }
1756
1757         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1758                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1759                 inet_twsk_put(inet_twsk(sk));
1760                 goto discard_it;
1761         }
1762         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1763         case TCP_TW_SYN: {
                     /*
                      * If a listener matches the destination, deschedule and
                      * release the timewait socket and restart at "process"
                      * with the listener; otherwise fall through to the ACK case.
                      */
1764                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1765                                                         &tcp_hashinfo,
1766                                                         iph->daddr, th->dest,
1767                                                         inet_iif(skb));
1768                 if (sk2) {
1769                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1770                         inet_twsk_put(inet_twsk(sk));
1771                         sk = sk2;
1772                         goto process;
1773                 }
1774                 /* Fall through to ACK */
1775         }
1776         case TCP_TW_ACK:
1777                 tcp_v4_timewait_ack(sk, skb);
1778                 break;
1779         case TCP_TW_RST:
1780                 goto no_tcp_socket;
1781         case TCP_TW_SUCCESS:;
1782         }
1783         goto discard_it;
1784 }
1785
1786 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1787 {
1788         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1789         struct inet_sock *inet = inet_sk(sk);
1790         struct inet_peer *peer;
1791
1792         if (!rt ||
1793             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1794                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1795                 *release_it = true;
1796         } else {
1797                 if (!rt->peer)
1798                         rt_bind_peer(rt, inet->inet_daddr, 1);
1799                 peer = rt->peer;
1800                 *release_it = false;
1801         }
1802
1803         return peer;
1804 }
1805 EXPORT_SYMBOL(tcp_v4_get_peer);
1806
1807 void *tcp_v4_tw_get_peer(struct sock *sk)
1808 {
1809         struct inet_timewait_sock *tw = inet_twsk(sk);
1810
1811         return inet_getpeer_v4(tw->tw_daddr, 1);
1812 }
1813 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1814
1815 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1816         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1817         .twsk_unique    = tcp_twsk_unique,
1818         .twsk_destructor= tcp_twsk_destructor,
1819         .twsk_getpeer   = tcp_v4_tw_get_peer,
1820 };
1821
1822 const struct inet_connection_sock_af_ops ipv4_specific = {
1823         .queue_xmit        = ip_queue_xmit,
1824         .send_check        = tcp_v4_send_check,
1825         .rebuild_header    = inet_sk_rebuild_header,
1826         .conn_request      = tcp_v4_conn_request,
1827         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1828         .get_peer          = tcp_v4_get_peer,
1829         .net_header_len    = sizeof(struct iphdr),
1830         .setsockopt        = ip_setsockopt,
1831         .getsockopt        = ip_getsockopt,
1832         .addr2sockaddr     = inet_csk_addr2sockaddr,
1833         .sockaddr_len      = sizeof(struct sockaddr_in),
1834         .bind_conflict     = inet_csk_bind_conflict,
1835 #ifdef CONFIG_COMPAT
1836         .compat_setsockopt = compat_ip_setsockopt,
1837         .compat_getsockopt = compat_ip_getsockopt,
1838 #endif
1839 };
1840 EXPORT_SYMBOL(ipv4_specific);
1841
1842 #ifdef CONFIG_TCP_MD5SIG
1843 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1844         .md5_lookup             = tcp_v4_md5_lookup,
1845         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1846         .md5_add                = tcp_v4_md5_add_func,
1847         .md5_parse              = tcp_v4_parse_md5_keys,
1848 };
1849 #endif
1850
1851 /* NOTE: A lot of things set to zero explicitly by call to
1852  *       sk_alloc() so need not be done here.
1853  */
1854 static int tcp_v4_init_sock(struct sock *sk)
1855 {
1856         struct inet_connection_sock *icsk = inet_csk(sk);
1857         struct tcp_sock *tp = tcp_sk(sk);
1858
1859         skb_queue_head_init(&tp->out_of_order_queue);
1860         tcp_init_xmit_timers(sk);
1861         tcp_prequeue_init(tp);
1862
1863         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1864         tp->mdev = TCP_TIMEOUT_INIT;
1865
1866         /* So many TCP implementations out there (incorrectly) count the
1867          * initial SYN frame in their delayed-ACK and congestion control
1868          * algorithms that we must have the following bandaid to talk
1869          * efficiently to them.  -DaveM
1870          */
1871         tp->snd_cwnd = 2;
1872
1873         /* See draft-stevens-tcpca-spec-01 for discussion of the
1874          * initialization of these values.
1875          */
1876         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1877         tp->snd_cwnd_clamp = ~0;
1878         tp->mss_cache = TCP_MSS_DEFAULT;
1879
1880         tp->reordering = sysctl_tcp_reordering;
1881         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1882
1883         sk->sk_state = TCP_CLOSE;
1884
1885         sk->sk_write_space = sk_stream_write_space;
1886         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1887
1888         icsk->icsk_af_ops = &ipv4_specific;
1889         icsk->icsk_sync_mss = tcp_sync_mss;
1890 #ifdef CONFIG_TCP_MD5SIG
1891         tp->af_specific = &tcp_sock_ipv4_specific;
1892 #endif
1893
1894         /* TCP Cookie Transactions */
1895         if (sysctl_tcp_cookie_size > 0) {
1896                 /* Default, cookies without s_data_payload. */
1897                 tp->cookie_values =
1898                         kzalloc(sizeof(*tp->cookie_values),
1899                                 sk->sk_allocation);
1900                 if (tp->cookie_values != NULL)
1901                         kref_init(&tp->cookie_values->kref);
1902         }
1903         /* Presumed zeroed, in order of appearance:
1904          *      cookie_in_always, cookie_out_never,
1905          *      s_data_constant, s_data_in, s_data_out
1906          */
1907         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1908         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1909
1910         local_bh_disable();
1911         percpu_counter_inc(&tcp_sockets_allocated);
1912         local_bh_enable();
1913
1914         return 0;
1915 }
1916
1917 void tcp_v4_destroy_sock(struct sock *sk)
1918 {
1919         struct tcp_sock *tp = tcp_sk(sk);
1920
1921         tcp_clear_xmit_timers(sk);
1922
1923         tcp_cleanup_congestion_control(sk);
1924
1925         /* Cleanup up the write buffer. */
1926         tcp_write_queue_purge(sk);
1927
1928         /* Cleans up our, hopefully empty, out_of_order_queue. */
1929         __skb_queue_purge(&tp->out_of_order_queue);
1930
1931 #ifdef CONFIG_TCP_MD5SIG
1932         /* Clean up the MD5 key list, if any */
1933         if (tp->md5sig_info) {
1934                 tcp_v4_clear_md5_list(sk);
1935                 kfree(tp->md5sig_info);
1936                 tp->md5sig_info = NULL;
1937         }
1938 #endif
1939
1940 #ifdef CONFIG_NET_DMA
1941         /* Cleans up our sk_async_wait_queue */
1942         __skb_queue_purge(&sk->sk_async_wait_queue);
1943 #endif
1944
1945         /* Clean prequeue, it must be empty really */
1946         __skb_queue_purge(&tp->ucopy.prequeue);
1947
1948         /* Clean up a referenced TCP bind bucket. */
1949         if (inet_csk(sk)->icsk_bind_hash)
1950                 inet_put_port(sk);
1951
1952         /*
1953          * If sendmsg cached page exists, toss it.
1954          */
1955         if (sk->sk_sndmsg_page) {
1956                 __free_page(sk->sk_sndmsg_page);
1957                 sk->sk_sndmsg_page = NULL;
1958         }
1959
1960         /* TCP Cookie Transactions */
1961         if (tp->cookie_values != NULL) {
1962                 kref_put(&tp->cookie_values->kref,
1963                          tcp_cookie_values_release);
1964                 tp->cookie_values = NULL;
1965         }
1966
1967         percpu_counter_dec(&tcp_sockets_allocated);
1968 }
1969 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1970
1971 #ifdef CONFIG_PROC_FS
1972 /* Proc filesystem TCP sock list dumping. */
1973
1974 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1975 {
1976         return hlist_nulls_empty(head) ? NULL :
1977                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1978 }
1979
1980 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1981 {
1982         return !is_a_nulls(tw->tw_node.next) ?
1983                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1984 }
1985
1986 /*
1987  * Get the next listening socket after cur.  If cur is NULL, get the first
1988  * socket starting from the bucket given in st->bucket; when st->bucket is
1989  * zero, the very first socket in the hash table is returned.
1990  */
1991 static void *listening_get_next(struct seq_file *seq, void *cur)
1992 {
             /*
              * Returns either a listening socket (st->state LISTENING) or a
              * request_sock (st->state OPENREQ), or NULL when all listening
              * buckets are exhausted.  On a non-NULL return the bucket lock of
              * st->bucket is still held; when a request_sock is returned the
              * syn_wait_lock of st->syn_wait_sk is read-held as well.
              */
1993         struct inet_connection_sock *icsk;
1994         struct hlist_nulls_node *node;
1995         struct sock *sk = cur;
1996         struct inet_listen_hashbucket *ilb;
1997         struct tcp_iter_state *st = seq->private;
1998         struct net *net = seq_file_net(seq);
1999
             /* No current entry: lock the bucket in st->bucket and start at its head. */
2000         if (!sk) {
2001                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2002                 spin_lock_bh(&ilb->lock);
2003                 sk = sk_nulls_head(&ilb->head);
2004                 st->offset = 0;
2005                 goto get_sk;
2006         }
2007         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2008         ++st->num;
2009         ++st->offset;
2010
2011         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /*
                      * cur is a request_sock of st->syn_wait_sk: continue along
                      * its chain, then through the remaining syn_table slots.
                      */
2012                 struct request_sock *req = cur;
2013
2014                 icsk = inet_csk(st->syn_wait_sk);
2015                 req = req->dl_next;
2016                 while (1) {
2017                         while (req) {
2018                                 if (req->rsk_ops->family == st->family) {
2019                                         cur = req;
2020                                         goto out;
2021                                 }
2022                                 req = req->dl_next;
2023                         }
2024                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2025                                 break;
                             /* Also entered from start_req below with st->sbucket == 0. */
2026 get_req:
2027                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2028                 }
                     /* No more requests: resume with the socket after syn_wait_sk. */
2029                 sk        = sk_nulls_next(st->syn_wait_sk);
2030                 st->state = TCP_SEQ_STATE_LISTENING;
2031                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2032         } else {
                     /*
                      * cur is a listening socket: if it has queued requests,
                      * iterate those first (syn_wait_lock stays read-held),
                      * otherwise advance to the next socket.
                      */
2033                 icsk = inet_csk(sk);
2034                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2035                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2036                         goto start_req;
2037                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2038                 sk = sk_nulls_next(sk);
2039         }
2040 get_sk:
2041         sk_nulls_for_each_from(sk, node) {
2042                 if (!net_eq(sock_net(sk), net))
2043                         continue;
2044                 if (sk->sk_family == st->family) {
2045                         cur = sk;
2046                         goto out;
2047                 }
2048                 icsk = inet_csk(sk);
2049                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2050                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2051 start_req:
2052                         st->uid         = sock_i_uid(sk);
2053                         st->syn_wait_sk = sk;
2054                         st->state       = TCP_SEQ_STATE_OPENREQ;
2055                         st->sbucket     = 0;
2056                         goto get_req;
2057                 }
2058                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2059         }
             /* Bucket exhausted: unlock it and move on to the next one, if any. */
2060         spin_unlock_bh(&ilb->lock);
2061         st->offset = 0;
2062         if (++st->bucket < INET_LHTABLE_SIZE) {
2063                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2064                 spin_lock_bh(&ilb->lock);
2065                 sk = sk_nulls_head(&ilb->head);
2066                 goto get_sk;
2067         }
2068         cur = NULL;
2069 out:
2070         return cur;
2071 }
2072
2073 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076         void *rc;
2077
2078         st->bucket = 0;
2079         st->offset = 0;
2080         rc = listening_get_next(seq, NULL);
2081
2082         while (rc && *pos) {
2083                 rc = listening_get_next(seq, rc);
2084                 --*pos;
2085         }
2086         return rc;
2087 }
2088
2089 static inline int empty_bucket(struct tcp_iter_state *st)
2090 {
2091         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2092                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2093 }
2094
2095 /*
2096  * Get first established socket starting from bucket given in st->bucket.
2097  * If st->bucket is zero, the very first socket in the hash is returned.
2098  */
2099 static void *established_get_first(struct seq_file *seq)
2100 {
2101         struct tcp_iter_state *st = seq->private;
2102         struct net *net = seq_file_net(seq);
2103         void *rc = NULL;
2104
2105         st->offset = 0;
2106         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2107                 struct sock *sk;
2108                 struct hlist_nulls_node *node;
2109                 struct inet_timewait_sock *tw;
2110                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2111
2112                 /* Lockless fast path for the common case of empty buckets */
2113                 if (empty_bucket(st))
2114                         continue;
2115
2116                 spin_lock_bh(lock);
2117                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2118                         if (sk->sk_family != st->family ||
2119                             !net_eq(sock_net(sk), net)) {
2120                                 continue;
2121                         }
2122                         rc = sk;
2123                         goto out;
2124                 }
2125                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2126                 inet_twsk_for_each(tw, node,
2127                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2128                         if (tw->tw_family != st->family ||
2129                             !net_eq(twsk_net(tw), net)) {
2130                                 continue;
2131                         }
2132                         rc = tw;
2133                         goto out;
2134                 }
2135                 spin_unlock_bh(lock);
2136                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2137         }
2138 out:
2139         return rc;
2140 }
2141
/*
 * Advance from @cur to the next matching socket in the established hash.
 *
 * Within a bucket the established chain is walked first, then the
 * timewait chain; st->state records which of the two @cur belongs to.
 * When a bucket is exhausted its lock is dropped and the next non-empty
 * bucket is locked.  Returns NULL, with no bucket lock held, once
 * st->bucket runs past ehash_mask; otherwise the lock of st->bucket is
 * still held on return.
 */
2142 static void *established_get_next(struct seq_file *seq, void *cur)
2143 {
2144         struct sock *sk = cur;
2145         struct inet_timewait_sock *tw;
2146         struct hlist_nulls_node *node;
2147         struct tcp_iter_state *st = seq->private;
2148         struct net *net = seq_file_net(seq);
2149
2150         ++st->num;
2151         ++st->offset;
2152
2153         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2154                 tw = cur;
2155                 tw = tw_next(tw);
                     /* Also entered from below with tw set to the head of the twchain. */
2156 get_tw:
                     /* Skip timewait sockets of another family or namespace. */
2157                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2158                         tw = tw_next(tw);
2159                 }
2160                 if (tw) {
2161                         cur = tw;
2162                         goto out;
2163                 }
                     /* Timewait chain exhausted: this bucket is done. */
2164                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2165                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2166
2167                 /* Look for next non empty bucket */
2168                 st->offset = 0;
2169                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2170                                 empty_bucket(st))
2171                         ;
2172                 if (st->bucket > tcp_hashinfo.ehash_mask)
2173                         return NULL;
2174
2175                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2176                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2177         } else
2178                 sk = sk_nulls_next(sk);
2179
2180         sk_nulls_for_each_from(sk, node) {
2181                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2182                         goto found;
2183         }
2184
             /* Established chain exhausted: continue with this bucket's timewait chain. */
2185         st->state = TCP_SEQ_STATE_TIME_WAIT;
2186         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2187         goto get_tw;
2188 found:
2189         cur = sk;
2190 out:
2191         return cur;
2192 }
2193
2194 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196         struct tcp_iter_state *st = seq->private;
2197         void *rc;
2198
2199         st->bucket = 0;
2200         rc = established_get_first(seq);
2201
2202         while (rc && pos) {
2203                 rc = established_get_next(seq, rc);
2204                 --pos;
2205         }
2206         return rc;
2207 }
2208
2209 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2210 {
2211         void *rc;
2212         struct tcp_iter_state *st = seq->private;
2213
2214         st->state = TCP_SEQ_STATE_LISTENING;
2215         rc        = listening_get_idx(seq, &pos);
2216
2217         if (!rc) {
2218                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2219                 rc        = established_get_idx(seq, pos);
2220         }
2221
2222         return rc;
2223 }
2224
2225 static void *tcp_seek_last_pos(struct seq_file *seq)
2226 {
2227         struct tcp_iter_state *st = seq->private;
2228         int offset = st->offset;
2229         int orig_num = st->num;
2230         void *rc = NULL;
2231
2232         switch (st->state) {
2233         case TCP_SEQ_STATE_OPENREQ:
2234         case TCP_SEQ_STATE_LISTENING:
2235                 if (st->bucket >= INET_LHTABLE_SIZE)
2236                         break;
2237                 st->state = TCP_SEQ_STATE_LISTENING;
2238                 rc = listening_get_next(seq, NULL);
2239                 while (offset-- && rc)
2240                         rc = listening_get_next(seq, rc);
2241                 if (rc)
2242                         break;
2243                 st->bucket = 0;
2244                 /* Fallthrough */
2245         case TCP_SEQ_STATE_ESTABLISHED:
2246         case TCP_SEQ_STATE_TIME_WAIT:
2247                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2248                 if (st->bucket > tcp_hashinfo.ehash_mask)
2249                         break;
2250                 rc = established_get_first(seq);
2251                 while (offset-- && rc)
2252                         rc = established_get_next(seq, rc);
2253         }
2254
2255         st->num = orig_num;
2256
2257         return rc;
2258 }
2259
2260 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2261 {
2262         struct tcp_iter_state *st = seq->private;
2263         void *rc;
2264
2265         if (*pos && *pos == st->last_pos) {
2266                 rc = tcp_seek_last_pos(seq);
2267                 if (rc)
2268                         goto out;
2269         }
2270
2271         st->state = TCP_SEQ_STATE_LISTENING;
2272         st->num = 0;
2273         st->bucket = 0;
2274         st->offset = 0;
2275         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2276
2277 out:
2278         st->last_pos = *pos;
2279         return rc;
2280 }
2281
2282 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2283 {
2284         struct tcp_iter_state *st = seq->private;
2285         void *rc = NULL;
2286
2287         if (v == SEQ_START_TOKEN) {
2288                 rc = tcp_get_idx(seq, 0);
2289                 goto out;
2290         }
2291
2292         switch (st->state) {
2293         case TCP_SEQ_STATE_OPENREQ:
2294         case TCP_SEQ_STATE_LISTENING:
2295                 rc = listening_get_next(seq, v);
2296                 if (!rc) {
2297                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2298                         st->bucket = 0;
2299                         st->offset = 0;
2300                         rc        = established_get_first(seq);
2301                 }
2302                 break;
2303         case TCP_SEQ_STATE_ESTABLISHED:
2304         case TCP_SEQ_STATE_TIME_WAIT:
2305                 rc = established_get_next(seq, v);
2306                 break;
2307         }
2308 out:
2309         ++*pos;
2310         st->last_pos = *pos;
2311         return rc;
2312 }
2313
2314 static void tcp_seq_stop(struct seq_file *seq, void *v)
2315 {
2316         struct tcp_iter_state *st = seq->private;
2317
2318         switch (st->state) {
2319         case TCP_SEQ_STATE_OPENREQ:
2320                 if (v) {
2321                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2322                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2323                 }
2324         case TCP_SEQ_STATE_LISTENING:
2325                 if (v != SEQ_START_TOKEN)
2326                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2327                 break;
2328         case TCP_SEQ_STATE_TIME_WAIT:
2329         case TCP_SEQ_STATE_ESTABLISHED:
2330                 if (v)
2331                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2332                 break;
2333         }
2334 }
2335
2336 static int tcp_seq_open(struct inode *inode, struct file *file)
2337 {
2338         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2339         struct tcp_iter_state *s;
2340         int err;
2341
2342         err = seq_open_net(inode, file, &afinfo->seq_ops,
2343                           sizeof(struct tcp_iter_state));
2344         if (err < 0)
2345                 return err;
2346
2347         s = ((struct seq_file *)file->private_data)->private;
2348         s->family               = afinfo->family;
2349         s->last_pos             = 0;
2350         return 0;
2351 }
2352
2353 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2354 {
2355         int rc = 0;
2356         struct proc_dir_entry *p;
2357
2358         afinfo->seq_fops.open           = tcp_seq_open;
2359         afinfo->seq_fops.read           = seq_read;
2360         afinfo->seq_fops.llseek         = seq_lseek;
2361         afinfo->seq_fops.release        = seq_release_net;
2362
2363         afinfo->seq_ops.start           = tcp_seq_start;
2364         afinfo->seq_ops.next            = tcp_seq_next;
2365         afinfo->seq_ops.stop            = tcp_seq_stop;
2366
2367         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2368                              &afinfo->seq_fops, afinfo);
2369         if (!p)
2370                 rc = -ENOMEM;
2371         return rc;
2372 }
2373 EXPORT_SYMBOL(tcp_proc_register);
2374
2375 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2376 {
2377         proc_net_remove(net, afinfo->name);
2378 }
2379 EXPORT_SYMBOL(tcp_proc_unregister);
2380
2381 static void get_openreq4(struct sock *sk, struct request_sock *req,
2382                          struct seq_file *f, int i, int uid, int *len)
2383 {
2384         const struct inet_request_sock *ireq = inet_rsk(req);
2385         int ttd = req->expires - jiffies;
2386
2387         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2389                 i,
2390                 ireq->loc_addr,
2391                 ntohs(inet_sk(sk)->inet_sport),
2392                 ireq->rmt_addr,
2393                 ntohs(ireq->rmt_port),
2394                 TCP_SYN_RECV,
2395                 0, 0, /* could print option size, but that is af dependent. */
2396                 1,    /* timers active (only the expire timer) */
2397                 jiffies_to_clock_t(ttd),
2398                 req->retrans,
2399                 uid,
2400                 0,  /* non standard timer */
2401                 0, /* open_requests have no inode */
2402                 atomic_read(&sk->sk_refcnt),
2403                 req,
2404                 len);
2405 }
2406
2407 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2408 {
2409         int timer_active;
2410         unsigned long timer_expires;
2411         struct tcp_sock *tp = tcp_sk(sk);
2412         const struct inet_connection_sock *icsk = inet_csk(sk);
2413         struct inet_sock *inet = inet_sk(sk);
2414         __be32 dest = inet->inet_daddr;
2415         __be32 src = inet->inet_rcv_saddr;
2416         __u16 destp = ntohs(inet->inet_dport);
2417         __u16 srcp = ntohs(inet->inet_sport);
2418         int rx_queue;
2419
2420         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2421                 timer_active    = 1;
2422                 timer_expires   = icsk->icsk_timeout;
2423         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2424                 timer_active    = 4;
2425                 timer_expires   = icsk->icsk_timeout;
2426         } else if (timer_pending(&sk->sk_timer)) {
2427                 timer_active    = 2;
2428                 timer_expires   = sk->sk_timer.expires;
2429         } else {
2430                 timer_active    = 0;
2431                 timer_expires = jiffies;
2432         }
2433
2434         if (sk->sk_state == TCP_LISTEN)
2435                 rx_queue = sk->sk_ack_backlog;
2436         else
2437                 /*
2438                  * because we dont lock socket, we might find a transient negative value
2439                  */
2440                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441
2442         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2444                 i, src, srcp, dest, destp, sk->sk_state,
2445                 tp->write_seq - tp->snd_una,
2446                 rx_queue,
2447                 timer_active,
2448                 jiffies_to_clock_t(timer_expires - jiffies),
2449                 icsk->icsk_retransmits,
2450                 sock_i_uid(sk),
2451                 icsk->icsk_probes_out,
2452                 sock_i_ino(sk),
2453                 atomic_read(&sk->sk_refcnt), sk,
2454                 jiffies_to_clock_t(icsk->icsk_rto),
2455                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2456                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2457                 tp->snd_cwnd,
2458                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2459                 len);
2460 }
2461
2462 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2463                                struct seq_file *f, int i, int *len)
2464 {
2465         __be32 dest, src;
2466         __u16 destp, srcp;
2467         int ttd = tw->tw_ttd - jiffies;
2468
2469         if (ttd < 0)
2470                 ttd = 0;
2471
2472         dest  = tw->tw_daddr;
2473         src   = tw->tw_rcv_saddr;
2474         destp = ntohs(tw->tw_dport);
2475         srcp  = ntohs(tw->tw_sport);
2476
2477         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2479                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481                 atomic_read(&tw->tw_refcnt), tw, len);
2482 }
2483
2484 #define TMPSZ 150
2485
2486 static int tcp4_seq_show(struct seq_file *seq, void *v)
2487 {
2488         struct tcp_iter_state *st;
2489         int len;
2490
2491         if (v == SEQ_START_TOKEN) {
2492                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2493                            "  sl  local_address rem_address   st tx_queue "
2494                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2495                            "inode");
2496                 goto out;
2497         }
2498         st = seq->private;
2499
2500         switch (st->state) {
2501         case TCP_SEQ_STATE_LISTENING:
2502         case TCP_SEQ_STATE_ESTABLISHED:
2503                 get_tcp4_sock(v, seq, st->num, &len);
2504                 break;
2505         case TCP_SEQ_STATE_OPENREQ:
2506                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2507                 break;
2508         case TCP_SEQ_STATE_TIME_WAIT:
2509                 get_timewait4_sock(v, seq, st->num, &len);
2510                 break;
2511         }
2512         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2513 out:
2514         return 0;
2515 }
2516
2517 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2518         .name           = "tcp",
2519         .family         = AF_INET,
2520         .seq_fops       = {
2521                 .owner          = THIS_MODULE,
2522         },
2523         .seq_ops        = {
2524                 .show           = tcp4_seq_show,
2525         },
2526 };
2527
2528 static int __net_init tcp4_proc_init_net(struct net *net)
2529 {
2530         return tcp_proc_register(net, &tcp4_seq_afinfo);
2531 }
2532
2533 static void __net_exit tcp4_proc_exit_net(struct net *net)
2534 {
2535         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2536 }
2537
2538 static struct pernet_operations tcp4_net_ops = {
2539         .init = tcp4_proc_init_net,
2540         .exit = tcp4_proc_exit_net,
2541 };
2542
2543 int __init tcp4_proc_init(void)
2544 {
2545         return register_pernet_subsys(&tcp4_net_ops);
2546 }
2547
2548 void tcp4_proc_exit(void)
2549 {
2550         unregister_pernet_subsys(&tcp4_net_ops);
2551 }
2552 #endif /* CONFIG_PROC_FS */
2553
2554 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555 {
2556         const struct iphdr *iph = skb_gro_network_header(skb);
2557
2558         switch (skb->ip_summed) {
2559         case CHECKSUM_COMPLETE:
2560                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2561                                   skb->csum)) {
2562                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2563                         break;
2564                 }
2565
2566                 /* fall through */
2567         case CHECKSUM_NONE:
2568                 NAPI_GRO_CB(skb)->flush = 1;
2569                 return NULL;
2570         }
2571
2572         return tcp_gro_receive(head, skb);
2573 }
2574
2575 int tcp4_gro_complete(struct sk_buff *skb)
2576 {
2577         const struct iphdr *iph = ip_hdr(skb);
2578         struct tcphdr *th = tcp_hdr(skb);
2579
2580         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2581                                   iph->saddr, iph->daddr, 0);
2582         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2583
2584         return tcp_gro_complete(skb);
2585 }
2586
2587 struct proto tcp_prot = {
2588         .name                   = "TCP",
2589         .owner                  = THIS_MODULE,
2590         .close                  = tcp_close,
2591         .connect                = tcp_v4_connect,
2592         .disconnect             = tcp_disconnect,
2593         .accept                 = inet_csk_accept,
2594         .ioctl                  = tcp_ioctl,
2595         .init                   = tcp_v4_init_sock,
2596         .destroy                = tcp_v4_destroy_sock,
2597         .shutdown               = tcp_shutdown,
2598         .setsockopt             = tcp_setsockopt,
2599         .getsockopt             = tcp_getsockopt,
2600         .recvmsg                = tcp_recvmsg,
2601         .sendmsg                = tcp_sendmsg,
2602         .sendpage               = tcp_sendpage,
2603         .backlog_rcv            = tcp_v4_do_rcv,
2604         .hash                   = inet_hash,
2605         .unhash                 = inet_unhash,
2606         .get_port               = inet_csk_get_port,
2607         .enter_memory_pressure  = tcp_enter_memory_pressure,
2608         .sockets_allocated      = &tcp_sockets_allocated,
2609         .orphan_count           = &tcp_orphan_count,
2610         .memory_allocated       = &tcp_memory_allocated,
2611         .memory_pressure        = &tcp_memory_pressure,
2612         .sysctl_mem             = sysctl_tcp_mem,
2613         .sysctl_wmem            = sysctl_tcp_wmem,
2614         .sysctl_rmem            = sysctl_tcp_rmem,
2615         .max_header             = MAX_TCP_HEADER,
2616         .obj_size               = sizeof(struct tcp_sock),
2617         .slab_flags             = SLAB_DESTROY_BY_RCU,
2618         .twsk_prot              = &tcp_timewait_sock_ops,
2619         .rsk_prot               = &tcp_request_sock_ops,
2620         .h.hashinfo             = &tcp_hashinfo,
2621         .no_autobind            = true,
2622 #ifdef CONFIG_COMPAT
2623         .compat_setsockopt      = compat_tcp_setsockopt,
2624         .compat_getsockopt      = compat_tcp_getsockopt,
2625 #endif
2626 };
2627 EXPORT_SYMBOL(tcp_prot);
2628
2629
2630 static int __net_init tcp_sk_init(struct net *net)
2631 {
2632         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2633                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2634 }
2635
2636 static void __net_exit tcp_sk_exit(struct net *net)
2637 {
2638         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2639 }
2640
2641 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2642 {
2643         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2644 }
2645
2646 static struct pernet_operations __net_initdata tcp_sk_ops = {
2647        .init       = tcp_sk_init,
2648        .exit       = tcp_sk_exit,
2649        .exit_batch = tcp_sk_exit_batch,
2650 };
2651
2652 void __init tcp_v4_init(void)
2653 {
2654         inet_hashinfo_init(&tcp_hashinfo);
2655         if (register_pernet_subsys(&tcp_sk_ops))
2656                 panic("Failed to create the TCP control socket.\n");
2657 }