net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/udp.h>
  74 #include <linux/skbuff.h>
  75 #include <net/sock.h>
  76 #include <net/arp.h>
  77 #include <net/icmp.h>
  78 #include <net/raw.h>
  79 #include <net/checksum.h>
  80 #include <net/inetpeer.h>
  81 #include <net/checksum.h>
  82 #include <linux/igmp.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/netfilter_bridge.h>
  85 #include <linux/mroute.h>
  86 #include <linux/netlink.h>
  87
  88 /*
  89  *      Shall we try to damage output packets if routing dev changes?
  90  */
  91
  92 int sysctl_ip_dynaddr;	/* zero-initialised; not read in the visible part of this file - presumably set through sysctl, TODO confirm */
  93 int sysctl_ip_default_ttl = IPDEFTTL;	/* starts at IPDEFTTL; not read in the visible part of this file - presumably the system default TTL, TODO confirm */
  94
  95 /* Generate a checksum for an outgoing IP datagram. */
  96 __inline__ void ip_send_check(struct iphdr *iph)
  97 {
  98         iph->check = 0;
  99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 100 }
 101
 102 /* dev_loopback_xmit for use with netfilter. */
 103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 104 {
 105         newskb->mac.raw = newskb->data;
 106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 107         newskb->pkt_type = PACKET_LOOPBACK;
 108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 109         BUG_TRAP(newskb->dst);
 110
 111 #ifdef CONFIG_NETFILTER_DEBUG
 112         nf_debug_ip_loopback_xmit(newskb);
 113 #endif
 114         nf_reset(newskb);
 115         netif_rx(newskb);
 116         return 0;
 117 }
 118
 119 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 120 {
 121         int ttl = inet->uc_ttl;
 122
 123         if (ttl < 0)
 124                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 125         return ttl;
 126 }
 127
 128 /*
 129  *              Add an ip header to a skbuff and send it out.
 130  *
 131  */
 132 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 133                           u32 saddr, u32 daddr, struct ip_options *opt)
 134 {
 135         struct inet_sock *inet = inet_sk(sk);
 136         struct rtable *rt = (struct rtable *)skb->dst;
 137         struct iphdr *iph;
 138
 139         /* Build the IP header. */
 140         if (opt)
 141                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 142         else
 143                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 144
 145         iph->version  = 4;
 146         iph->ihl      = 5;
 147         iph->tos      = inet->tos;
 148         if (ip_dont_fragment(sk, &rt->u.dst))
 149                 iph->frag_off = htons(IP_DF);
 150         else
 151                 iph->frag_off = 0;
 152         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 153         iph->daddr    = rt->rt_dst;
 154         iph->saddr    = rt->rt_src;
 155         iph->protocol = sk->sk_protocol;
 156         iph->tot_len  = htons(skb->len);
 157         ip_select_ident(iph, &rt->u.dst, sk);
 158         skb->nh.iph   = iph;
 159
 160         if (opt && opt->optlen) {
 161                 iph->ihl += opt->optlen>>2;
 162                 ip_options_build(skb, opt, daddr, rt, 0);
 163         }
 164         ip_send_check(iph);
 165
 166         skb->priority = sk->sk_priority;
 167
 168         /* Send it out. */
 169         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 170                        dst_output);
 171 }
 172
 173 static inline int ip_finish_output2(struct sk_buff *skb)
 174 {
 175         struct dst_entry *dst = skb->dst;
 176         struct hh_cache *hh = dst->hh;
 177         struct net_device *dev = dst->dev;
 178         int hh_len = LL_RESERVED_SPACE(dev);
 179
 180         /* Be paranoid, rather than too clever. */
 181         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 182                 struct sk_buff *skb2;
 183
 184                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 185                 if (skb2 == NULL) {
 186                         kfree_skb(skb);
 187                         return -ENOMEM;
 188                 }
 189                 if (skb->sk)
 190                         skb_set_owner_w(skb2, skb->sk);
 191                 kfree_skb(skb);
 192                 skb = skb2;
 193         }
 194
 195 #ifdef CONFIG_NETFILTER_DEBUG
 196         nf_debug_ip_finish_output2(skb);
 197 #endif /*CONFIG_NETFILTER_DEBUG*/
 198
 199         nf_reset(skb);
 200
 201         if (hh) {
 202                 int hh_alen;
 203
 204                 read_lock_bh(&hh->hh_lock);
 205                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 206                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 207                 read_unlock_bh(&hh->hh_lock);
 208                 skb_push(skb, hh->hh_len);
 209                 return hh->hh_output(skb);
 210         } else if (dst->neighbour)
 211                 return dst->neighbour->output(skb);
 212
 213         if (net_ratelimit())
 214                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 215         kfree_skb(skb);
 216         return -EINVAL;
 217 }
 218
 219 int ip_finish_output(struct sk_buff *skb)
 220 {
 221         struct net_device *dev = skb->dst->dev;
 222
 223         skb->dev = dev;
 224         skb->protocol = htons(ETH_P_IP);
 225
 226         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 227                        ip_finish_output2);
 228 }
 229
 230 int ip_mc_output(struct sk_buff *skb)
 231 {
 232         struct sock *sk = skb->sk;
 233         struct rtable *rt = (struct rtable*)skb->dst;
 234         struct net_device *dev = rt->u.dst.dev;
 235
 236         /*
 237          *      If the indicated interface is up and running, send the packet.
 238          */
 239         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 240
 241         skb->dev = dev;
 242         skb->protocol = htons(ETH_P_IP);
 243
 244         /*
 245          *      Multicasts are looped back for other local users
 246          */
 247
 248         if (rt->rt_flags&RTCF_MULTICAST) {
 249                 if ((!sk || inet_sk(sk)->mc_loop)
 250 #ifdef CONFIG_IP_MROUTE
 251                 /* Small optimization: do not loopback not local frames,
 252                    which returned after forwarding; they will be  dropped
 253                    by ip_mr_input in any case.
 254                    Note, that local frames are looped back to be delivered
 255                    to local recipients.
 256
 257                    This check is duplicated in ip_mr_input at the moment.
 258                  */
 259                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 260 #endif
 261                 ) {
 262                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 263                         if (newskb)
 264                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 265                                         newskb->dev,
 266                                         ip_dev_loopback_xmit);
 267                 }
 268
 269                 /* Multicasts with ttl 0 must not go beyond the host */
 270
 271                 if (skb->nh.iph->ttl == 0) {
 272                         kfree_skb(skb);
 273                         return 0;
 274                 }
 275         }
 276
 277         if (rt->rt_flags&RTCF_BROADCAST) {
 278                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 279                 if (newskb)
 280                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 281                                 newskb->dev, ip_dev_loopback_xmit);
 282         }
 283
 284         if (skb->len > dst_mtu(&rt->u.dst))
 285                 return ip_fragment(skb, ip_finish_output);
 286         else
 287                 return ip_finish_output(skb);
 288 }
 289
 290 int ip_output(struct sk_buff *skb)
 291 {
 292         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 293
 294         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
 295                 return ip_fragment(skb, ip_finish_output);
 296         else
 297                 return ip_finish_output(skb);
 298 }
 299
 300 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 301 {
 302         struct sock *sk = skb->sk;
 303         struct inet_sock *inet = inet_sk(sk);
 304         struct ip_options *opt = inet->opt;
 305         struct rtable *rt;
 306         struct iphdr *iph;
 307
 308         /* Skip all of this if the packet is already routed,
 309          * f.e. by something like SCTP.
 310          */
 311         rt = (struct rtable *) skb->dst;
 312         if (rt != NULL)
 313                 goto packet_routed;
 314
 315         /* Make sure we can route this packet. */
 316         rt = (struct rtable *)__sk_dst_check(sk, 0);
 317         if (rt == NULL) {
 318                 u32 daddr;
 319
 320                 /* Use correct destination address if we have options. */
 321                 daddr = inet->daddr;
 322                 if(opt && opt->srr)
 323                         daddr = opt->faddr;
 324
 325                 {
 326                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 327                                             .nl_u = { .ip4_u =
 328                                                       { .daddr = daddr,
 329                                                         .saddr = inet->saddr,
 330                                                         .tos = RT_CONN_FLAGS(sk) } },
 331                                             .proto = sk->sk_protocol,
 332                                             .uli_u = { .ports =
 333                                                        { .sport = inet->sport,
 334                                                          .dport = inet->dport } } };
 335
 336                         /* If this fails, retransmit mechanism of transport layer will
 337                          * keep trying until route appears or the connection times
 338                          * itself out.
 339                          */
 340                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 341                                 goto no_route;
 342                 }
 343                 __sk_dst_set(sk, &rt->u.dst);
 344                 tcp_v4_setup_caps(sk, &rt->u.dst);
 345         }
 346         skb->dst = dst_clone(&rt->u.dst);
 347
 348 packet_routed:
 349         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 350                 goto no_route;
 351
 352         /* OK, we know where to send it, allocate and build IP header. */
 353         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 354         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 355         iph->tot_len = htons(skb->len);
 356         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 357                 iph->frag_off = htons(IP_DF);
 358         else
 359                 iph->frag_off = 0;
 360         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 361         iph->protocol = sk->sk_protocol;
 362         iph->saddr    = rt->rt_src;
 363         iph->daddr    = rt->rt_dst;
 364         skb->nh.iph   = iph;
 365         /* Transport layer set skb->h.foo itself. */
 366
 367         if (opt && opt->optlen) {
 368                 iph->ihl += opt->optlen >> 2;
 369                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 370         }
 371
 372         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 373
 374         /* Add an IP checksum. */
 375         ip_send_check(iph);
 376
 377         skb->priority = sk->sk_priority;
 378
 379         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 380                        dst_output);
 381
 382 no_route:
 383         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 384         kfree_skb(skb);
 385         return -EHOSTUNREACH;
 386 }
 387
 388
 389 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 390 {
 391         to->pkt_type = from->pkt_type;
 392         to->priority = from->priority;
 393         to->protocol = from->protocol;
 394         to->security = from->security;
 395         dst_release(to->dst);
 396         to->dst = dst_clone(from->dst);
 397         to->dev = from->dev;
 398
 399         /* Copy the flags to each fragment. */
 400         IPCB(to)->flags = IPCB(from)->flags;
 401
 402 #ifdef CONFIG_NET_SCHED
 403         to->tc_index = from->tc_index;
 404 #endif
 405 #ifdef CONFIG_NETFILTER
 406         to->nfmark = from->nfmark;
 407         to->nfcache = from->nfcache;
 408         /* Connection association is same as pre-frag packet */
 409         nf_conntrack_put(to->nfct);
 410         to->nfct = from->nfct;
 411         nf_conntrack_get(to->nfct);
 412         to->nfctinfo = from->nfctinfo;
 413 #ifdef CONFIG_BRIDGE_NETFILTER
 414         nf_bridge_put(to->nf_bridge);
 415         to->nf_bridge = from->nf_bridge;
 416         nf_bridge_get(to->nf_bridge);
 417 #endif
 418 #ifdef CONFIG_NETFILTER_DEBUG
 419         to->nf_debug = from->nf_debug;
 420 #endif
 421 #endif
 422 }
 423
 424 /*
 425  *      This IP datagram is too large to be sent in one piece.  Break it up into
 426  *      smaller pieces (each of size equal to IP header plus
 427  *      a block of the data of the original IP data part) that will yet fit in a
 428  *      single device frame, and queue such a frame for sending.
 429  */
 430
 431 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 432 {
 433         struct iphdr *iph;
 434         int raw = 0;
 435         int ptr;
 436         struct net_device *dev;
 437         struct sk_buff *skb2;
 438         unsigned int mtu, hlen, left, len, ll_rs;
 439         int offset;
 440         int not_last_frag;
 441         struct rtable *rt = (struct rtable*)skb->dst;
 442         int err = 0;
 443
 444         dev = rt->u.dst.dev;
 445
 446         /*
 447          *      Point into the IP datagram header.
 448          */
 449
 450         iph = skb->nh.iph;
 451
 452         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 453                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 454                           htonl(dst_mtu(&rt->u.dst)));
 455                 kfree_skb(skb);
 456                 return -EMSGSIZE;
 457         }
 458
 459         /*
 460          *      Setup starting values.
 461          */
 462
 463         hlen = iph->ihl * 4;
 464         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 465
 466         /* When frag_list is given, use it. First, check its validity:
 467          * some transformers could create wrong frag_list or break existing
 468          * one, it is not prohibited. In this case fall back to copying.
 469          *
 470          * LATER: this step can be merged to real generation of fragments,
 471          * we can switch to copy when see the first bad fragment.
 472          */
 473         if (skb_shinfo(skb)->frag_list) {
 474                 struct sk_buff *frag;
 475                 int first_len = skb_pagelen(skb);
 476
 477                 if (first_len - hlen > mtu ||
 478                     ((first_len - hlen) & 7) ||
 479                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 480                     skb_cloned(skb))
 481                         goto slow_path;
 482
 483                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 484                         /* Correct geometry. */
 485                         if (frag->len > mtu ||
 486                             ((frag->len & 7) && frag->next) ||
 487                             skb_headroom(frag) < hlen)
 488                             goto slow_path;
 489
 490                         /* Partially cloned skb? */
 491                         if (skb_shared(frag))
 492                                 goto slow_path;
 493                 }
 494
 495                 /* Everything is OK. Generate! */
 496
 497                 err = 0;
 498                 offset = 0;
 499                 frag = skb_shinfo(skb)->frag_list;
 500                 skb_shinfo(skb)->frag_list = NULL;
 501                 skb->data_len = first_len - skb_headlen(skb);
 502                 skb->len = first_len;
 503                 iph->tot_len = htons(first_len);
 504                 iph->frag_off = htons(IP_MF);
 505                 ip_send_check(iph);
 506
 507                 for (;;) {
 508                         /* Prepare header of the next frame,
 509                          * before previous one went down. */
 510                         if (frag) {
 511                                 frag->ip_summed = CHECKSUM_NONE;
 512                                 frag->h.raw = frag->data;
 513                                 frag->nh.raw = __skb_push(frag, hlen);
 514                                 memcpy(frag->nh.raw, iph, hlen);
 515                                 iph = frag->nh.iph;
 516                                 iph->tot_len = htons(frag->len);
 517                                 ip_copy_metadata(frag, skb);
 518                                 if (offset == 0)
 519                                         ip_options_fragment(frag);
 520                                 offset += skb->len - hlen;
 521                                 iph->frag_off = htons(offset>>3);
 522                                 if (frag->next != NULL)
 523                                         iph->frag_off |= htons(IP_MF);
 524                                 /* Ready, complete checksum */
 525                                 ip_send_check(iph);
 526                         }
 527
 528                         err = output(skb);
 529
 530                         if (err || !frag)
 531                                 break;
 532
 533                         skb = frag;
 534                         frag = skb->next;
 535                         skb->next = NULL;
 536                 }
 537
 538                 if (err == 0) {
 539                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 540                         return 0;
 541                 }
 542
 543                 while (frag) {
 544                         skb = frag->next;
 545                         kfree_skb(frag);
 546                         frag = skb;
 547                 }
 548                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 549                 return err;
 550         }
 551
 552 slow_path:
 553         left = skb->len - hlen;         /* Space per frame */
 554         ptr = raw + hlen;               /* Where to start from */
 555
 556 #ifdef CONFIG_BRIDGE_NETFILTER
 557         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 558          * we need to make room for the encapsulating header */
 559         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 560         mtu -= nf_bridge_pad(skb);
 561 #else
 562         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 563 #endif
 564         /*
 565          *      Fragment the datagram.
 566          */
 567
 568         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 569         not_last_frag = iph->frag_off & htons(IP_MF);
 570
 571         /*
 572          *      Keep copying data until we run out.
 573          */
 574
 575         while(left > 0) {
 576                 len = left;
 577                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 578                 if (len > mtu)
 579                         len = mtu;
 580                 /* IF: we are not sending upto and including the packet end
 581                    then align the next start on an eight byte boundary */
 582                 if (len < left) {
 583                         len &= ~7;
 584                 }
 585                 /*
 586                  *      Allocate buffer.
 587                  */
 588
 589                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 590                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 591                         err = -ENOMEM;
 592                         goto fail;
 593                 }
 594
 595                 /*
 596                  *      Set up data on packet
 597                  */
 598
 599                 ip_copy_metadata(skb2, skb);
 600                 skb_reserve(skb2, ll_rs);
 601                 skb_put(skb2, len + hlen);
 602                 skb2->nh.raw = skb2->data;
 603                 skb2->h.raw = skb2->data + hlen;
 604
 605                 /*
 606                  *      Charge the memory for the fragment to any owner
 607                  *      it might possess
 608                  */
 609
 610                 if (skb->sk)
 611                         skb_set_owner_w(skb2, skb->sk);
 612
 613                 /*
 614                  *      Copy the packet header into the new buffer.
 615                  */
 616
 617                 memcpy(skb2->nh.raw, skb->data, hlen);
 618
 619                 /*
 620                  *      Copy a block of the IP datagram.
 621                  */
 622                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 623                         BUG();
 624                 left -= len;
 625
 626                 /*
 627                  *      Fill in the new header fields.
 628                  */
 629                 iph = skb2->nh.iph;
 630                 iph->frag_off = htons((offset >> 3));
 631
 632                 /* ANK: dirty, but effective trick. Upgrade options only if
 633                  * the segment to be fragmented was THE FIRST (otherwise,
 634                  * options are already fixed) and make it ONCE
 635                  * on the initial skb, so that all the following fragments
 636                  * will inherit fixed options.
 637                  */
 638                 if (offset == 0)
 639                         ip_options_fragment(skb);
 640
 641                 /*
 642                  *      Added AC : If we are fragmenting a fragment that's not the
 643                  *                 last fragment then keep MF on each bit
 644                  */
 645                 if (left > 0 || not_last_frag)
 646                         iph->frag_off |= htons(IP_MF);
 647                 ptr += len;
 648                 offset += len;
 649
 650                 /*
 651                  *      Put this fragment into the sending queue.
 652                  */
 653
 654                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 655
 656                 iph->tot_len = htons(len + hlen);
 657
 658                 ip_send_check(iph);
 659
 660                 err = output(skb2);
 661                 if (err)
 662                         goto fail;
 663         }
 664         kfree_skb(skb);
 665         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 666         return err;
 667
 668 fail:
 669         kfree_skb(skb);
 670         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 671         return err;
 672 }
 673
 674 int
 675 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 676 {
 677         struct iovec *iov = from;
 678
 679         if (skb->ip_summed == CHECKSUM_HW) {
 680                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 681                         return -EFAULT;
 682         } else {
 683                 unsigned int csum = 0;
 684                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 685                         return -EFAULT;
 686                 skb->csum = csum_block_add(skb->csum, csum, odd);
 687         }
 688         return 0;
 689 }
 690
 691 static inline unsigned int
 692 csum_page(struct page *page, int offset, int copy)
 693 {
 694         char *kaddr;
 695         unsigned int csum;
 696         kaddr = kmap(page);
 697         csum = csum_partial(kaddr + offset, copy, 0);
 698         kunmap(page);
 699         return csum;
 700 }
 701
 702 /*
 703  *      ip_append_data() and ip_append_page() can make one large IP datagram
 704  *      from many pieces of data. Each piece will be held on the socket
 705  *      until ip_push_pending_frames() is called. Each piece can be a page
 706  *      or non-page data.
 707  *
 708  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 709  *      this interface potentially.
 710  *
 711  *      LATER: length must be adjusted by pad at tail, when it is required.
 712  */
 713 int ip_append_data(struct sock *sk,
 714                    int getfrag(void *from, char *to, int offset, int len,
 715                                int odd, struct sk_buff *skb),
 716                    void *from, int length, int transhdrlen,
 717                    struct ipcm_cookie *ipc, struct rtable *rt,
 718                    unsigned int flags)
 719 {
 720         struct inet_sock *inet = inet_sk(sk);
 721         struct sk_buff *skb;
 722
 723         struct ip_options *opt = NULL;
 724         int hh_len;
 725         int exthdrlen;
 726         int mtu;
 727         int copy;
 728         int err;
 729         int offset = 0;
 730         unsigned int maxfraglen, fragheaderlen;
 731         int csummode = CHECKSUM_NONE;
 732
 733         if (flags&MSG_PROBE)
 734                 return 0;
 735
 736         if (skb_queue_empty(&sk->sk_write_queue)) {
 737                 /*
 738                  * setup for corking.
 739                  */
 740                 opt = ipc->opt;
 741                 if (opt) {
 742                         if (inet->cork.opt == NULL) {
 743                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 744                                 if (unlikely(inet->cork.opt == NULL))
 745                                         return -ENOBUFS;
 746                         }
 747                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 748                         inet->cork.flags |= IPCORK_OPT;
 749                         inet->cork.addr = ipc->addr;
 750                 }
 751                 dst_hold(&rt->u.dst);
 752                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 753                 inet->cork.rt = rt;
 754                 inet->cork.length = 0;
 755                 sk->sk_sndmsg_page = NULL;
 756                 sk->sk_sndmsg_off = 0;
 757                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 758                         length += exthdrlen;
 759                         transhdrlen += exthdrlen;
 760                 }
 761         } else {
 762                 rt = inet->cork.rt;
 763                 if (inet->cork.flags & IPCORK_OPT)
 764                         opt = inet->cork.opt;
 765
 766                 transhdrlen = 0;
 767                 exthdrlen = 0;
 768                 mtu = inet->cork.fragsize;
 769         }
 770         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 771
 772         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 773         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 774
 775         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 776                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 777                 return -EMSGSIZE;
 778         }
 779
 780         /*
 781          * transhdrlen > 0 means that this is the first fragment and we wish
 782          * it won't be fragmented in the future.
 783          */
 784         if (transhdrlen &&
 785             length + fragheaderlen <= mtu &&
 786             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 787             !exthdrlen)
 788                 csummode = CHECKSUM_HW;
 789
 790         inet->cork.length += length;
 791
 792         /* So, what's going on in the loop below?
 793          *
 794          * We use calculated fragment length to generate chained skb,
 795          * each of segments is IP fragment ready for sending to network after
 796          * adding appropriate IP header.
 797          */
 798
 799         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 800                 goto alloc_new_skb;
 801
 802         while (length > 0) {
 803                 /* Check if the remaining data fits into current packet. */
 804                 copy = mtu - skb->len;
 805                 if (copy < length)
 806                         copy = maxfraglen - skb->len;
 807                 if (copy <= 0) {
 808                         char *data;
 809                         unsigned int datalen;
 810                         unsigned int fraglen;
 811                         unsigned int fraggap;
 812                         unsigned int alloclen;
 813                         struct sk_buff *skb_prev;
 814 alloc_new_skb:
 815                         skb_prev = skb;
 816                         if (skb_prev)
 817                                 fraggap = skb_prev->len - maxfraglen;
 818                         else
 819                                 fraggap = 0;
 820
 821                         /*
 822                          * If remaining data exceeds the mtu,
 823                          * we know we need more fragment(s).
 824                          */
 825                         datalen = length + fraggap;
 826                         if (datalen > mtu - fragheaderlen)
 827                                 datalen = maxfraglen - fragheaderlen;
 828                         fraglen = datalen + fragheaderlen;
 829
 830                         if ((flags & MSG_MORE) &&
 831                             !(rt->u.dst.dev->features&NETIF_F_SG))
 832                                 alloclen = mtu;
 833                         else
 834                                 alloclen = datalen + fragheaderlen;
 835
 836                         /* The last fragment gets additional space at tail.
 837                          * Note, with MSG_MORE we overallocate on fragments,
 838                          * because we have no idea what fragment will be
 839                          * the last.
 840                          */
 841                         if (datalen == length)
 842                                 alloclen += rt->u.dst.trailer_len;
 843
 844                         if (transhdrlen) {
 845                                 skb = sock_alloc_send_skb(sk,
 846                                                 alloclen + hh_len + 15,
 847                                                 (flags & MSG_DONTWAIT), &err);
 848                         } else {
 849                                 skb = NULL;
 850                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 851                                     2 * sk->sk_sndbuf)
 852                                         skb = sock_wmalloc(sk,
 853                                                            alloclen + hh_len + 15, 1,
 854                                                            sk->sk_allocation);
 855                                 if (unlikely(skb == NULL))
 856                                         err = -ENOBUFS;
 857                         }
 858                         if (skb == NULL)
 859                                 goto error;
 860
 861                         /*
 862                          *      Fill in the control structures
 863                          */
 864                         skb->ip_summed = csummode;
 865                         skb->csum = 0;
 866                         skb_reserve(skb, hh_len);
 867
 868                         /*
 869                          *      Find where to start putting bytes.
 870                          */
 871                         data = skb_put(skb, fraglen);
 872                         skb->nh.raw = data + exthdrlen;
 873                         data += fragheaderlen;
 874                         skb->h.raw = data + exthdrlen;
 875
 876                         if (fraggap) {
 877                                 skb->csum = skb_copy_and_csum_bits(
 878                                         skb_prev, maxfraglen,
 879                                         data + transhdrlen, fraggap, 0);
 880                                 skb_prev->csum = csum_sub(skb_prev->csum,
 881                                                           skb->csum);
 882                                 data += fraggap;
 883                                 skb_trim(skb_prev, maxfraglen);
 884                         }
 885
 886                         copy = datalen - transhdrlen - fraggap;
 887                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 888                                 err = -EFAULT;
 889                                 kfree_skb(skb);
 890                                 goto error;
 891                         }
 892
 893                         offset += copy;
 894                         length -= datalen - fraggap;
 895                         transhdrlen = 0;
 896                         exthdrlen = 0;
 897                         csummode = CHECKSUM_NONE;
 898
 899                         /*
 900                          * Put the packet on the pending queue.
 901                          */
 902                         __skb_queue_tail(&sk->sk_write_queue, skb);
 903                         continue;
 904                 }
 905
 906                 if (copy > length)
 907                         copy = length;
 908
 909                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 910                         unsigned int off;
 911
 912                         off = skb->len;
 913                         if (getfrag(from, skb_put(skb, copy),
 914                                         offset, copy, off, skb) < 0) {
 915                                 __skb_trim(skb, off);
 916                                 err = -EFAULT;
 917                                 goto error;
 918                         }
 919                 } else {
 920                         int i = skb_shinfo(skb)->nr_frags;
 921                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 922                         struct page *page = sk->sk_sndmsg_page;
 923                         int off = sk->sk_sndmsg_off;
 924                         unsigned int left;
 925
 926                         if (page && (left = PAGE_SIZE - off) > 0) {
 927                                 if (copy >= left)
 928                                         copy = left;
 929                                 if (page != frag->page) {
 930                                         if (i == MAX_SKB_FRAGS) {
 931                                                 err = -EMSGSIZE;
 932                                                 goto error;
 933                                         }
 934                                         get_page(page);
 935                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 936                                         frag = &skb_shinfo(skb)->frags[i];
 937                                 }
 938                         } else if (i < MAX_SKB_FRAGS) {
 939                                 if (copy > PAGE_SIZE)
 940                                         copy = PAGE_SIZE;
 941                                 page = alloc_pages(sk->sk_allocation, 0);
 942                                 if (page == NULL)  {
 943                                         err = -ENOMEM;
 944                                         goto error;
 945                                 }
 946                                 sk->sk_sndmsg_page = page;
 947                                 sk->sk_sndmsg_off = 0;
 948
 949                                 skb_fill_page_desc(skb, i, page, 0, 0);
 950                                 frag = &skb_shinfo(skb)->frags[i];
 951                                 skb->truesize += PAGE_SIZE;
 952                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
 953                         } else {
 954                                 err = -EMSGSIZE;
 955                                 goto error;
 956                         }
 957                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 958                                 err = -EFAULT;
 959                                 goto error;
 960                         }
 961                         sk->sk_sndmsg_off += copy;
 962                         frag->size += copy;
 963                         skb->len += copy;
 964                         skb->data_len += copy;
 965                 }
 966                 offset += copy;
 967                 length -= copy;
 968         }
 969
 970         return 0;
 971
 972 error:
 973         inet->cork.length -= length;
 974         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 975         return err;
 976 }
 977
     /*
      *      Append size bytes of page, starting at offset, to the data already
      *      pending on the socket's write queue.  The page is attached to the
      *      queued skbs as a page fragment (get_page() is called when a new
      *      fragment slot is used); only bytes that overflow a fragment
      *      boundary are copied, into a freshly allocated skb.
      *
      *      Route, IP options, fragment size and running datagram length all
      *      come from inet->cork, and the write queue must already hold at
      *      least one skb.
      *
      *      Returns 0 on success (and at once for MSG_PROBE), otherwise:
      *        -EPERM       inet->hdrincl is set
      *        -EINVAL      the write queue is empty
      *        -EOPNOTSUPP  the output device lacks NETIF_F_SG
      *        -EMSGSIZE    header plus pending data would exceed 0xFFFF
      *                     bytes, or the tail skb has no fragment slot left
      *        -ENOBUFS     sock_wmalloc() could not supply a new skb
      */
 978 ssize_t ip_append_page(struct sock *sk, struct page *page,
 979                        int offset, size_t size, int flags)
 980 {
 981         struct inet_sock *inet = inet_sk(sk);
 982         struct sk_buff *skb;
 983         struct rtable *rt;
 984         struct ip_options *opt = NULL;
 985         int hh_len;
 986         int mtu;
 987         int len;
 988         int err;
 989         unsigned int maxfraglen, fragheaderlen, fraggap;
 990
 991         if (inet->hdrincl)
 992                 return -EPERM;
 993
 994         if (flags&MSG_PROBE)
 995                 return 0;
 996
             /* Nothing pending: there is no skb to attach the page to. */
 997         if (skb_queue_empty(&sk->sk_write_queue))
 998                 return -EINVAL;
 999
1000         rt = inet->cork.rt;
1001         if (inet->cork.flags & IPCORK_OPT)
1002                 opt = inet->cork.opt;
1003
1004         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1005                 return -EOPNOTSUPP;
1006
1007         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1008         mtu = inet->cork.fragsize;
1009
             /*
              * fragheaderlen: IP header plus any corked options.
              * maxfraglen:    header plus the largest payload that is a
              *                multiple of 8 bytes and still fits in mtu.
              */
1010         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1011         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1012
             /*
              * Header plus everything queued so far plus this request must
              * not exceed 0xFFFF; the failure is also reported to the socket
              * through ip_local_error().
              */
1013         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1014                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1015                 return -EMSGSIZE;
1016         }
1017
             /*
              * NOTE(review): the queue was already found non-empty above, so
              * this second check looks redundant - confirm before removing.
              */
1018         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1019                 return -EINVAL;
1020
             /* Account for the whole request now; the error path gives back
              * whatever was not appended. */
1021         inet->cork.length += size;
1022
1023         while (size > 0) {
1024                 int i;
1025
1026                 /* Check if the remaining data fits into current packet. */
                     /*
                      * len is the room left in the tail skb: up to the full mtu
                      * when all remaining data fits, otherwise only up to
                      * maxfraglen because more fragments will follow.
                      */
1027                 len = mtu - skb->len;
1028                 if (len < size)
1029                         len = maxfraglen - skb->len;
1030                 if (len <= 0) {
1031                         struct sk_buff *skb_prev;
1032                         char *data;
1033                         struct iphdr *iph;
1034                         int alloclen;
1035
                             /*
                              * The tail skb is full.  fraggap is the number of
                              * bytes by which it exceeds maxfraglen; those are
                              * moved into the new skb below.
                              * NOTE(review): skb is never NULL at this point, so
                              * the else arm appears to be unreachable.
                              */
1036                         skb_prev = skb;
1037                         if (skb_prev)
1038                                 fraggap = skb_prev->len - maxfraglen;
1039                         else
1040                                 fraggap = 0;
1041
                             /*
                              * The new skb holds only link-layer headroom, the IP
                              * header area and the moved fraggap bytes (plus 15
                              * bytes of slack); page data is attached as
                              * fragments on later loop iterations.
                              */
1042                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1043                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1044                         if (unlikely(!skb)) {
1045                                 err = -ENOBUFS;
1046                                 goto error;
1047                         }
1048
1049                         /*
1050                          *      Fill in the control structures
1051                          */
1052                         skb->ip_summed = CHECKSUM_NONE;
1053                         skb->csum = 0;
1054                         skb_reserve(skb, hh_len);
1055
1056                         /*
1057                          *      Find where to start putting bytes.
1058                          */
                             /* iph is assigned here but not read again in this
                              * function. */
1059                         data = skb_put(skb, fragheaderlen + fraggap);
1060                         skb->nh.iph = iph = (struct iphdr *)data;
1061                         data += fragheaderlen;
1062                         skb->h.raw = data;
1063
                             /*
                              * Copy the overflowing bytes out of the previous skb,
                              * move their checksum contribution from skb_prev to
                              * the new skb, then cut skb_prev back to maxfraglen.
                              */
1064                         if (fraggap) {
1065                                 skb->csum = skb_copy_and_csum_bits(
1066                                         skb_prev, maxfraglen,
1067                                         data, fraggap, 0);
1068                                 skb_prev->csum = csum_sub(skb_prev->csum,
1069                                                           skb->csum);
1070                                 skb_trim(skb_prev, maxfraglen);
1071                         }
1072
1073                         /*
1074                          * Put the packet on the pending queue.
1075                          */
                             /* Loop again so len is recomputed for the new tail. */
1076                         __skb_queue_tail(&sk->sk_write_queue, skb);
1077                         continue;
1078                 }
1079
1080                 i = skb_shinfo(skb)->nr_frags;
1081                 if (len > size)
1082                         len = size;
                     /*
                      * Grow the last fragment when skb_can_coalesce() allows it,
                      * otherwise take a page reference and fill the next free
                      * fragment slot; fail once all MAX_SKB_FRAGS are in use.
                      */
1083                 if (skb_can_coalesce(skb, i, page, offset)) {
1084                         skb_shinfo(skb)->frags[i-1].size += len;
1085                 } else if (i < MAX_SKB_FRAGS) {
1086                         get_page(page);
1087                         skb_fill_page_desc(skb, i, page, offset, len);
1088                 } else {
1089                         err = -EMSGSIZE;
1090                         goto error;
1091                 }
1092
                     /* Checksum kept in software: add the page bytes at their
                      * position (current skb->len) within the skb. */
1093                 if (skb->ip_summed == CHECKSUM_NONE) {
1094                         unsigned int csum;
1095                         csum = csum_page(page, offset, len);
1096                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1097                 }
1098
1099                 skb->len += len;
1100                 skb->data_len += len;
1101                 offset += len;
1102                 size -= len;
1103         }
1104         return 0;
1105
     /* size now holds the bytes that were not appended; undo their share of
      * the accounting done before the loop. */
1106 error:
1107         inet->cork.length -= size;
1108         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1109         return err;
1110 }
1111
1112 /*
1113  *      Combine all pending IP fragments on the socket into one IP datagram
1114  *      and push them out.
      *
      *      The first queued skb becomes the head; every further skb is linked
      *      onto the head's frag_list and detached from the socket.  The IP
      *      header is then built in the head skb and the packet is passed to
      *      the NF_IP_LOCAL_OUT hook with dst_output as the continuation.
      *
      *      The cork state (options and route) is released on every path,
      *      including an empty queue and a send failure.
      *
      *      Returns 0 if the queue was empty or the packet was handed off, or
      *      a negative error.  A positive result from the hook is converted
      *      with net_xmit_errno() only when inet->recverr is set; otherwise it
      *      is reported as 0.
1115  */
1116 int ip_push_pending_frames(struct sock *sk)
1117 {
1118         struct sk_buff *skb, *tmp_skb;
1119         struct sk_buff **tail_skb;
1120         struct inet_sock *inet = inet_sk(sk);
1121         struct ip_options *opt = NULL;
1122         struct rtable *rt = inet->cork.rt;
1123         struct iphdr *iph;
1124         int df = 0;
1125         __u8 ttl;
1126         int err = 0;
1127
1128         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1129                 goto out;
1130         tail_skb = &(skb_shinfo(skb)->frag_list);
1131
1132         /* move skb->data to ip header from ext header */
1133         if (skb->data < skb->nh.raw)
1134                 __skb_pull(skb, skb->nh.raw - skb->data);
             /*
              * Chain the remaining skbs behind the head.  Each one has the
              * header room pulled off (the length is measured on the head
              * skb), its size is added to the head's totals, and its socket
              * reference and destructor are dropped so that only the head
              * stays associated with the socket.
              */
1135         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1136                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1137                 *tail_skb = tmp_skb;
1138                 tail_skb = &(tmp_skb->next);
1139                 skb->len += tmp_skb->len;
1140                 skb->data_len += tmp_skb->len;
1141                 skb->truesize += tmp_skb->truesize;
1142                 __sock_put(tmp_skb->sk);
1143                 tmp_skb->destructor = NULL;
1144                 tmp_skb->sk = NULL;
1145         }
1146
1147         /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1148          * allow the frame generated here to be fragmented. No matter how
1149          * transforms change the size of the packet, it will come out.
1150          */
1151         if (inet->pmtudisc != IP_PMTUDISC_DO)
1152                 skb->local_df = 1;
1153
1154         /* DF bit is set when we want to see DF on outgoing frames.
1155          * If local_df is set too, we still allow this frame to be
1156          * fragmented locally. */
1157         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1158             (skb->len <= dst_mtu(&rt->u.dst) &&
1159              ip_dont_fragment(sk, &rt->u.dst)))
1160                 df = htons(IP_DF);
1161
1162         if (inet->cork.flags & IPCORK_OPT)
1163                 opt = inet->cork.opt;
1164
1165         if (rt->rt_type == RTN_MULTICAST)
1166                 ttl = inet->mc_ttl;
1167         else
1168                 ttl = ip_select_ttl(inet, &rt->u.dst);
1169
             /*
              * Build the IP header at the start of the head skb.  ihl starts
              * at 5 and grows by optlen/4 when options are present.
              */
1170         iph = (struct iphdr *)skb->data;
1171         iph->version = 4;
1172         iph->ihl = 5;
1173         if (opt) {
1174                 iph->ihl += opt->optlen>>2;
1175                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1176         }
1177         iph->tos = inet->tos;
1178         iph->tot_len = htons(skb->len);
1179         iph->frag_off = df;
             /* Without DF the ident comes from __ip_select_ident(); with DF
              * the per-socket counter inet->id is used and advanced. */
1180         if (!df) {
1181                 __ip_select_ident(iph, &rt->u.dst, 0);
1182         } else {
1183                 iph->id = htons(inet->id++);
1184         }
1185         iph->ttl = ttl;
1186         iph->protocol = sk->sk_protocol;
1187         iph->saddr = rt->rt_src;
1188         iph->daddr = rt->rt_dst;
1189         ip_send_check(iph);
1190
1191         skb->priority = sk->sk_priority;
1192         skb->dst = dst_clone(&rt->u.dst);
1193
1194         /* Netfilter gets the whole, not yet fragmented skb. */
1195         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1196                       skb->dst->dev, dst_output);
             /* Positive results are only surfaced to callers that asked for
              * them via inet->recverr; negative results always are. */
1197         if (err) {
1198                 if (err > 0)
1199                         err = inet->recverr ? net_xmit_errno(err) : 0;
1200                 if (err)
1201                         goto error;
1202         }
1203
     /* Common exit: drop the corked options and route. */
1204 out:
1205         inet->cork.flags &= ~IPCORK_OPT;
1206         if (inet->cork.opt) {
1207                 kfree(inet->cork.opt);
1208                 inet->cork.opt = NULL;
1209         }
1210         if (inet->cork.rt) {
1211                 ip_rt_put(inet->cork.rt);
1212                 inet->cork.rt = NULL;
1213         }
1214         return err;
1215
1216 error:
1217         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1218         goto out;
1219 }
1220
1221 /*
1222  *      Throw away all pending data on the socket.
      *
      *      Every skb on sk->sk_write_queue is freed without being sent, and
      *      the cork state is reset: the IPCORK_OPT flag is cleared, the
      *      corked options are freed and the corked route is released.
1223  */
1224 void ip_flush_pending_frames(struct sock *sk)
1225 {
1226         struct inet_sock *inet = inet_sk(sk);
1227         struct sk_buff *skb;
1228
             /* Drain the queue from the tail end, freeing each skb. */
1229         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1230                 kfree_skb(skb);
1231
             /* Same cork cleanup as the exit path of ip_push_pending_frames(). */
1232         inet->cork.flags &= ~IPCORK_OPT;
1233         if (inet->cork.opt) {
1234                 kfree(inet->cork.opt);
1235                 inet->cork.opt = NULL;
1236         }
1237         if (inet->cork.rt) {
1238                 ip_rt_put(inet->cork.rt);
1239                 inet->cork.rt = NULL;
1240         }
1241 }
1242
1243
1244 /*
1245  *      Fetch data from kernel space and accumulate its checksum.
      *
      *      Passed to ip_append_data() by ip_send_reply() as the callback that
      *      supplies the payload bytes.
      *
      *      dptr    base of the source buffer
      *      to      destination inside the skb being filled
      *      offset  byte offset into dptr to copy from
      *      len     number of bytes to copy
      *      odd     position passed to csum_block_add() when the checksum of
      *              this piece is merged into skb->csum
      *      skb     skb whose running checksum is updated
      *
      *      Always returns 0.
1246  */
1247 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1248                               int len, int odd, struct sk_buff *skb)
1249 {
1250         unsigned int csum;
1251
             /* Copy and checksum in one pass, then merge into the skb total. */
1252         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1253         skb->csum = csum_block_add(skb->csum, csum, odd);
1254         return 0;
1255 }
1256
1257 /*
1258  *      Generic function to send a packet as reply to another packet.
1259  *      Used to send TCP resets so far. ICMP should use this function too.
1260  *
1261  *      Should run single threaded per socket because it uses the sock
1262  *      structure to pass arguments.
1263  *
1264  *      LATER: switch from ip_build_xmit to ip_append_*
      *      NOTE(review): the body below already uses ip_append_data() and
      *      ip_push_pending_frames(), so the line above looks stale.
      *
      *      sk   socket used to carry the reply; its tos, sk_priority and
      *           sk_protocol are overwritten from the received packet and
      *           are not restored here
      *      skb  the received packet being answered
      *      arg  reply payload (arg->iov->iov_base) plus checksum fix-up
      *           data (arg->csum, arg->csumoffset)
      *      len  number of payload bytes to send
      *
      *      Returns nothing.  The reply is silently dropped if the received
      *      options cannot be echoed or no route to the peer is found.
1265  */
1266 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1267                    unsigned int len)
1268 {
1269         struct inet_sock *inet = inet_sk(sk);
             /* ip_options followed by 40 spare bytes; presumably room for the
              * echoed option data - TODO confirm. */
1270         struct {
1271                 struct ip_options       opt;
1272                 char                    data[40];
1273         } replyopts;
1274         struct ipcm_cookie ipc;
1275         u32 daddr;
             /* Starts as the route of the received packet; replaced by the
              * reply route once ip_route_output_key() succeeds. */
1276         struct rtable *rt = (struct rtable*)skb->dst;
1277
1278         if (ip_options_echo(&replyopts.opt, skb))
1279                 return;
1280
             /* Reply goes back to the source of the received packet ... */
1281         daddr = ipc.addr = rt->rt_src;
1282         ipc.opt = NULL;
1283
             /* ... unless the echoed options carry a source route, in which
              * case the route lookup targets opt.faddr instead. */
1284         if (replyopts.opt.optlen) {
1285                 ipc.opt = &replyopts.opt;
1286
1287                 if (ipc.opt->srr)
1288                         daddr = replyopts.opt.faddr;
1289         }
1290
             /* Route lookup key: addresses from the incoming route, ports
              * swapped relative to the received transport header. */
1291         {
1292                 struct flowi fl = { .nl_u = { .ip4_u =
1293                                               { .daddr = daddr,
1294                                                 .saddr = rt->rt_spec_dst,
1295                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1296                                     /* Not quite clean, but right. */
1297                                     .uli_u = { .ports =
1298                                                { .sport = skb->h.th->dest,
1299                                                  .dport = skb->h.th->source } },
1300                                     .proto = sk->sk_protocol };
1301                 if (ip_route_output_key(&rt, &fl))
1302                         return;
1303         }
1304
1305         /* And let IP do all the hard work.
1306
1307            This chunk is not reenterable, hence spinlock.
1308            Note that it uses the fact, that this function is called
1309            with locally disabled BH and that sk cannot be already spinlocked.
1310          */
1311         bh_lock_sock(sk);
1312         inet->tos = skb->nh.iph->tos;
1313         sk->sk_priority = skb->priority;
1314         sk->sk_protocol = skb->nh.iph->protocol;
             /* The return value is not examined; whether anything was queued
              * is determined by peeking at the write queue instead. */
1315         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1316                        &ipc, rt, MSG_DONTWAIT);
             /* From here on skb refers to the queued reply, not the received
              * packet. */
1317         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /* Store the folded checksum csumoffset 16-bit words past
                      * the transport header; a negative offset skips this. */
1318                 if (arg->csumoffset >= 0)
1319                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1320                 skb->ip_summed = CHECKSUM_NONE;
1321                 ip_push_pending_frames(sk);
1322         }
1323
1324         bh_unlock_sock(sk);
1325
             /* Presumably drops the reference obtained by
              * ip_route_output_key() above - TODO confirm. */
1326         ip_rt_put(rt);
1327 }
1328
1329 /*
1330  *      IP protocol layer initialiser
1331  */
1332
     /*
      *      Packet type entry that pairs ETH_P_IP with ip_rcv.  ip_init()
      *      passes it to dev_add_pack().
      */
1333 static struct packet_type ip_packet_type = {
1334         .type = __constant_htons(ETH_P_IP),
1335         .func = ip_rcv,
1336 };
1337
1338 /*
1339  *      IP registers the packet type and then calls the subprotocol initialisers
      *
      *      Order: dev_add_pack() for ip_packet_type, then ip_rt_init() and
      *      inet_initpeers().  igmp_mc_proc_init() is called only when both
      *      CONFIG_IP_MULTICAST and CONFIG_PROC_FS are defined.
1340  */
1341
1342 void __init ip_init(void)
1343 {
1344         dev_add_pack(&ip_packet_type);
1345
1346         ip_rt_init();
1347         inet_initpeers();
1348
1349 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1350         igmp_mc_proc_init();
1351 #endif
1352 }
1353
     /*
      *      Symbols exported from this file.
      */
1354 EXPORT_SYMBOL(ip_finish_output);
1355 EXPORT_SYMBOL(ip_fragment);
1356 EXPORT_SYMBOL(ip_generic_getfrag);
1357 EXPORT_SYMBOL(ip_queue_xmit);
1358 EXPORT_SYMBOL(ip_send_check);
1359
     /* sysctl_ip_default_ttl is exported only when CONFIG_SYSCTL is defined. */
1360 #ifdef CONFIG_SYSCTL
1361 EXPORT_SYMBOL(sysctl_ip_default_ttl);
1362 #endif