net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <linux/igmp.h>
  79 #include <linux/netfilter_ipv4.h>
  80 #include <linux/netfilter_bridge.h>
  81 #include <linux/mroute.h>
  82 #include <linux/netlink.h>
  83 #include <linux/tcp.h>
  84
  /* Default TTL, initialised to IPDEFTTL.  NOTE(review): the name suggests a sysctl-tunable value; its registration and users are not in this part of the file - confirm. */
  85 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  86
  87 /* Generate a checksum for an outgoing IP datagram. */
  88 __inline__ void ip_send_check(struct iphdr *iph)
  89 {
  90         iph->check = 0;
  91         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  92 }
  93
  94 int __ip_local_out(struct sk_buff *skb)
  95 {
  96         struct iphdr *iph = ip_hdr(skb);
  97
  98         iph->tot_len = htons(skb->len);
  99         ip_send_check(iph);
 100         return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
 101                        dst_output);
 102 }
 103
 /*
  * Send a locally generated packet: finish the header and run the
  * LOCAL_OUT hook via __ip_local_out().  A result of 1 from that call
  * means the packet still has to be transmitted here, so it is handed to
  * dst_output(); any other result is returned to the caller unchanged.
  */
 int ip_local_out(struct sk_buff *skb)
 {
         int rc = __ip_local_out(skb);

         if (unlikely(rc != 1))
                 return rc;

         return dst_output(skb);
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 115
 116 /* dev_loopback_xmit for use with netfilter. */
 117 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 118 {
 119         skb_reset_mac_header(newskb);
 120         __skb_pull(newskb, skb_network_offset(newskb));
 121         newskb->pkt_type = PACKET_LOOPBACK;
 122         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 123         BUG_TRAP(newskb->dst);
 124         netif_rx(newskb);
 125         return 0;
 126 }
 127
 128 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 129 {
 130         int ttl = inet->uc_ttl;
 131
 132         if (ttl < 0)
 133                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 134         return ttl;
 135 }
 136
 137 /*
 138  *              Add an ip header to a skbuff and send it out.
 139  *
 140  */
 141 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 142                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 143 {
 144         struct inet_sock *inet = inet_sk(sk);
 145         struct rtable *rt = (struct rtable *)skb->dst;
 146         struct iphdr *iph;
 147
 148         /* Build the IP header. */
 149         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 150         skb_reset_network_header(skb);
 151         iph = ip_hdr(skb);
 152         iph->version  = 4;
 153         iph->ihl      = 5;
 154         iph->tos      = inet->tos;
 155         if (ip_dont_fragment(sk, &rt->u.dst))
 156                 iph->frag_off = htons(IP_DF);
 157         else
 158                 iph->frag_off = 0;
 159         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 160         iph->daddr    = rt->rt_dst;
 161         iph->saddr    = rt->rt_src;
 162         iph->protocol = sk->sk_protocol;
 163         ip_select_ident(iph, &rt->u.dst, sk);
 164
 165         if (opt && opt->optlen) {
 166                 iph->ihl += opt->optlen>>2;
 167                 ip_options_build(skb, opt, daddr, rt, 0);
 168         }
 169
 170         skb->priority = sk->sk_priority;
 171
 172         /* Send it out. */
 173         return ip_local_out(skb);
 174 }
 175
 176 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 177
 178 static inline int ip_finish_output2(struct sk_buff *skb)
 179 {
 180         struct dst_entry *dst = skb->dst;
 181         struct rtable *rt = (struct rtable *)dst;
 182         struct net_device *dev = dst->dev;
 183         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 184
 185         if (rt->rt_type == RTN_MULTICAST)
 186                 IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 187         else if (rt->rt_type == RTN_BROADCAST)
 188                 IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
 189
 190         /* Be paranoid, rather than too clever. */
 191         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 192                 struct sk_buff *skb2;
 193
 194                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 195                 if (skb2 == NULL) {
 196                         kfree_skb(skb);
 197                         return -ENOMEM;
 198                 }
 199                 if (skb->sk)
 200                         skb_set_owner_w(skb2, skb->sk);
 201                 kfree_skb(skb);
 202                 skb = skb2;
 203         }
 204
 205         if (dst->hh)
 206                 return neigh_hh_output(dst->hh, skb);
 207         else if (dst->neighbour)
 208                 return dst->neighbour->output(skb);
 209
 210         if (net_ratelimit())
 211                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 212         kfree_skb(skb);
 213         return -EINVAL;
 214 }
 215
 216 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 217 {
 218         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 219
 220         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 221                skb->dst->dev->mtu : dst_mtu(skb->dst);
 222 }
 223
 224 static int ip_finish_output(struct sk_buff *skb)
 225 {
 226 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 227         /* Policy lookup after SNAT yielded a new policy */
 228         if (skb->dst->xfrm != NULL) {
 229                 IPCB(skb)->flags |= IPSKB_REROUTED;
 230                 return dst_output(skb);
 231         }
 232 #endif
 233         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 234                 return ip_fragment(skb, ip_finish_output2);
 235         else
 236                 return ip_finish_output2(skb);
 237 }
 238
 239 int ip_mc_output(struct sk_buff *skb)
 240 {
 241         struct sock *sk = skb->sk;
 242         struct rtable *rt = (struct rtable*)skb->dst;
 243         struct net_device *dev = rt->u.dst.dev;
 244
 245         /*
 246          *      If the indicated interface is up and running, send the packet.
 247          */
 248         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 249
 250         skb->dev = dev;
 251         skb->protocol = htons(ETH_P_IP);
 252
 253         /*
 254          *      Multicasts are looped back for other local users
 255          */
 256
 257         if (rt->rt_flags&RTCF_MULTICAST) {
 258                 if ((!sk || inet_sk(sk)->mc_loop)
 259 #ifdef CONFIG_IP_MROUTE
 260                 /* Small optimization: do not loopback not local frames,
 261                    which returned after forwarding; they will be  dropped
 262                    by ip_mr_input in any case.
 263                    Note, that local frames are looped back to be delivered
 264                    to local recipients.
 265
 266                    This check is duplicated in ip_mr_input at the moment.
 267                  */
 268                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 269 #endif
 270                 ) {
 271                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 272                         if (newskb)
 273                                 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
 274                                         NULL, newskb->dev,
 275                                         ip_dev_loopback_xmit);
 276                 }
 277
 278                 /* Multicasts with ttl 0 must not go beyond the host */
 279
 280                 if (ip_hdr(skb)->ttl == 0) {
 281                         kfree_skb(skb);
 282                         return 0;
 283                 }
 284         }
 285
 286         if (rt->rt_flags&RTCF_BROADCAST) {
 287                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 288                 if (newskb)
 289                         NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
 290                                 newskb->dev, ip_dev_loopback_xmit);
 291         }
 292
 293         return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 294                             ip_finish_output,
 295                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 296 }
 297
 298 int ip_output(struct sk_buff *skb)
 299 {
 300         struct net_device *dev = skb->dst->dev;
 301
 302         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 303
 304         skb->dev = dev;
 305         skb->protocol = htons(ETH_P_IP);
 306
 307         return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
 308                             ip_finish_output,
 309                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 310 }
 311
 312 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 313 {
 314         struct sock *sk = skb->sk;
 315         struct inet_sock *inet = inet_sk(sk);
 316         struct ip_options *opt = inet->opt;
 317         struct rtable *rt;
 318         struct iphdr *iph;
 319
 320         /* Skip all of this if the packet is already routed,
 321          * f.e. by something like SCTP.
 322          */
 323         rt = (struct rtable *) skb->dst;
 324         if (rt != NULL)
 325                 goto packet_routed;
 326
 327         /* Make sure we can route this packet. */
 328         rt = (struct rtable *)__sk_dst_check(sk, 0);
 329         if (rt == NULL) {
 330                 __be32 daddr;
 331
 332                 /* Use correct destination address if we have options. */
 333                 daddr = inet->daddr;
 334                 if(opt && opt->srr)
 335                         daddr = opt->faddr;
 336
 337                 {
 338                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 339                                             .nl_u = { .ip4_u =
 340                                                       { .daddr = daddr,
 341                                                         .saddr = inet->saddr,
 342                                                         .tos = RT_CONN_FLAGS(sk) } },
 343                                             .proto = sk->sk_protocol,
 344                                             .uli_u = { .ports =
 345                                                        { .sport = inet->sport,
 346                                                          .dport = inet->dport } } };
 347
 348                         /* If this fails, retransmit mechanism of transport layer will
 349                          * keep trying until route appears or the connection times
 350                          * itself out.
 351                          */
 352                         security_sk_classify_flow(sk, &fl);
 353                         if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0))
 354                                 goto no_route;
 355                 }
 356                 sk_setup_caps(sk, &rt->u.dst);
 357         }
 358         skb->dst = dst_clone(&rt->u.dst);
 359
 360 packet_routed:
 361         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 362                 goto no_route;
 363
 364         /* OK, we know where to send it, allocate and build IP header. */
 365         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 366         skb_reset_network_header(skb);
 367         iph = ip_hdr(skb);
 368         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 369         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 370                 iph->frag_off = htons(IP_DF);
 371         else
 372                 iph->frag_off = 0;
 373         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 374         iph->protocol = sk->sk_protocol;
 375         iph->saddr    = rt->rt_src;
 376         iph->daddr    = rt->rt_dst;
 377         /* Transport layer set skb->h.foo itself. */
 378
 379         if (opt && opt->optlen) {
 380                 iph->ihl += opt->optlen >> 2;
 381                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 382         }
 383
 384         ip_select_ident_more(iph, &rt->u.dst, sk,
 385                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 386
 387         skb->priority = sk->sk_priority;
 388
 389         return ip_local_out(skb);
 390
 391 no_route:
 392         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 393         kfree_skb(skb);
 394         return -EHOSTUNREACH;
 395 }
 396
 397
 398 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 399 {
 400         to->pkt_type = from->pkt_type;
 401         to->priority = from->priority;
 402         to->protocol = from->protocol;
 403         dst_release(to->dst);
 404         to->dst = dst_clone(from->dst);
 405         to->dev = from->dev;
 406         to->mark = from->mark;
 407
 408         /* Copy the flags to each fragment. */
 409         IPCB(to)->flags = IPCB(from)->flags;
 410
 411 #ifdef CONFIG_NET_SCHED
 412         to->tc_index = from->tc_index;
 413 #endif
 414         nf_copy(to, from);
 415 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 416     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 417         to->nf_trace = from->nf_trace;
 418 #endif
 419 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 420         to->ipvs_property = from->ipvs_property;
 421 #endif
 422         skb_copy_secmark(to, from);
 423 }
 424
 425 /*
 426  *      This IP datagram is too large to be sent in one piece.  Break it up into
 427  *      smaller pieces (each of size equal to IP header plus
 428  *      a block of the data of the original IP data part) that will yet fit in a
 429  *      single device frame, and queue such a frame for sending.
      *
      *      skb:    the datagram to fragment.  It is consumed on every path:
      *              it is either handed to output() as the first fragment
      *              (frag_list fast path) or freed here.
      *      output: callback invoked once for each fragment that is built.
      *
      *      Returns 0 on success, -EMSGSIZE when DF is set and skb->local_df
      *      is not (an ICMP "fragmentation needed" message is sent first),
      *      -ENOMEM when a fragment cannot be allocated, or the first
      *      non-zero value returned by output().
 430  */
 431
 432 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 433 {
 434         struct iphdr *iph;
 435         int raw = 0;
 436         int ptr;
 437         struct net_device *dev;
 438         struct sk_buff *skb2;
 439         unsigned int mtu, hlen, left, len, ll_rs, pad;
 440         int offset;
 441         __be16 not_last_frag;
 442         struct rtable *rt = (struct rtable*)skb->dst;
 443         int err = 0;
 444
             /* NOTE(review): dev is assigned here but not read again in this function. */
 445         dev = rt->u.dst.dev;
 446
 447         /*
 448          *      Point into the IP datagram header.
 449          */
 450
 451         iph = ip_hdr(skb);
 452
             /* DF is set and skb->local_df is not: report the usable MTU with
              * ICMP_FRAG_NEEDED and drop the packet instead of fragmenting it.
              */
 453         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 454                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 455                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 456                           htonl(ip_skb_dst_mtu(skb)));
 457                 kfree_skb(skb);
 458                 return -EMSGSIZE;
 459         }
 460
 461         /*
 462          *      Setup starting values.
 463          */
 464
 465         hlen = iph->ihl * 4;
 466         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 467         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 468
 469         /* When frag_list is given, use it. First, check its validity:
 470          * some transformers could create wrong frag_list or break existing
 471          * one, it is not prohibited. In this case fall back to copying.
 472          *
 473          * LATER: this step can be merged to real generation of fragments,
 474          * we can switch to copy when see the first bad fragment.
 475          */
 476         if (skb_shinfo(skb)->frag_list) {
 477                 struct sk_buff *frag;
 478                 int first_len = skb_pagelen(skb);
 479
                     /* The head must fit into one fragment, carry a multiple
                      * of 8 payload bytes, not be a fragment itself and not be
                      * cloned; otherwise copy.
                      */
 480                 if (first_len - hlen > mtu ||
 481                     ((first_len - hlen) & 7) ||
 482                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 483                     skb_cloned(skb))
 484                         goto slow_path;
 485
 486                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 487                         /* Correct geometry. */
 488                         if (frag->len > mtu ||
 489                             ((frag->len & 7) && frag->next) ||
 490                             skb_headroom(frag) < hlen)
 491                             goto slow_path;
 492
 493                         /* Partially cloned skb? */
 494                         if (skb_shared(frag))
 495                                 goto slow_path;
 496
 497                         BUG_ON(frag->sk);
                             /* Charge the fragment to the owning socket: take
                              * a socket reference, install sock_wfree as the
                              * destructor and move the fragment's truesize out
                              * of the head skb's accounting.
                              */
 498                         if (skb->sk) {
 499                                 sock_hold(skb->sk);
 500                                 frag->sk = skb->sk;
 501                                 frag->destructor = sock_wfree;
 502                                 skb->truesize -= frag->truesize;
 503                         }
 504                 }
 505
 506                 /* Everything is OK. Generate! */
 507
 508                 err = 0;
 509                 offset = 0;
                     /* Detach the list and trim the head skb down to the first
                      * fragment: offset 0 with "more fragments" set.
                      */
 510                 frag = skb_shinfo(skb)->frag_list;
 511                 skb_shinfo(skb)->frag_list = NULL;
 512                 skb->data_len = first_len - skb_headlen(skb);
 513                 skb->len = first_len;
 514                 iph->tot_len = htons(first_len);
 515                 iph->frag_off = htons(IP_MF);
 516                 ip_send_check(iph);
 517
 518                 for (;;) {
 519                         /* Prepare header of the next frame,
 520                          * before previous one went down. */
 521                         if (frag) {
 522                                 frag->ip_summed = CHECKSUM_NONE;
 523                                 skb_reset_transport_header(frag);
 524                                 __skb_push(frag, hlen);
 525                                 skb_reset_network_header(frag);
 526                                 memcpy(skb_network_header(frag), iph, hlen);
 527                                 iph = ip_hdr(frag);
 528                                 iph->tot_len = htons(frag->len);
 529                                 ip_copy_metadata(frag, skb);
                                     /* offset is still 0 only while preparing
                                      * the second fragment, so the options are
                                      * rewritten once and later fragments copy
                                      * the already rewritten header.
                                      */
 530                                 if (offset == 0)
 531                                         ip_options_fragment(frag);
 532                                 offset += skb->len - hlen;
                                     /* The offset field counts 8-byte units. */
 533                                 iph->frag_off = htons(offset>>3);
 534                                 if (frag->next != NULL)
 535                                         iph->frag_off |= htons(IP_MF);
 536                                 /* Ready, complete checksum */
 537                                 ip_send_check(iph);
 538                         }
 539
 540                         err = output(skb);
 541
 542                         if (!err)
 543                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 544                         if (err || !frag)
 545                                 break;
 546
 547                         skb = frag;
 548                         frag = skb->next;
 549                         skb->next = NULL;
 550                 }
 551
 552                 if (err == 0) {
 553                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 554                         return 0;
 555                 }
 556
                     /* output() failed: free the fragments that were never
                      * handed to it.
                      */
 557                 while (frag) {
 558                         skb = frag->next;
 559                         kfree_skb(frag);
 560                         frag = skb;
 561                 }
 562                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 563                 return err;
 564         }
 565
 566 slow_path:
             /* Copying path: each fragment is a newly allocated skb holding a
              * copy of the header followed by a slice of the payload.
              */
 567         left = skb->len - hlen;         /* Payload bytes still to send */
 568         ptr = raw + hlen;               /* Where to start from (raw is always 0 here) */
 569
 570         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 571          * we need to make room for the encapsulating header
 572          */
 573         pad = nf_bridge_pad(skb);
 574         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 575         mtu -= pad;
 576
 577         /*
 578          *      Fragment the datagram.
 579          */
 580
             /* skb may itself be a fragment: continue from its offset and
              * remember whether it had "more fragments" set.
              */
 581         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 582         not_last_frag = iph->frag_off & htons(IP_MF);
 583
 584         /*
 585          *      Keep copying data until we run out.
 586          */
 587
 588         while (left > 0) {
 589                 len = left;
 590                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 591                 if (len > mtu)
 592                         len = mtu;
 593                 /* IF: we are not sending up to and including the packet end
 594                    then align the next start on an eight byte boundary */
 595                 if (len < left) {
 596                         len &= ~7;
 597                 }
 598                 /*
 599                  *      Allocate buffer.
 600                  */
 601
 602                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 603                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 604                         err = -ENOMEM;
 605                         goto fail;
 606                 }
 607
 608                 /*
 609                  *      Set up data on packet
 610                  */
 611
 612                 ip_copy_metadata(skb2, skb);
 613                 skb_reserve(skb2, ll_rs);
 614                 skb_put(skb2, len + hlen);
 615                 skb_reset_network_header(skb2);
 616                 skb2->transport_header = skb2->network_header + hlen;
 617
 618                 /*
 619                  *      Charge the memory for the fragment to any owner
 620                  *      it might possess
 621                  */
 622
 623                 if (skb->sk)
 624                         skb_set_owner_w(skb2, skb->sk);
 625
 626                 /*
 627                  *      Copy the packet header into the new buffer.
 628                  */
 629
 630                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 631
 632                 /*
 633                  *      Copy a block of the IP datagram.
 634                  */
 635                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 636                         BUG();
 637                 left -= len;
 638
 639                 /*
 640                  *      Fill in the new header fields.
 641                  */
 642                 iph = ip_hdr(skb2);
 643                 iph->frag_off = htons((offset >> 3));
 644
 645                 /* ANK: dirty, but effective trick. Upgrade options only if
 646                  * the segment to be fragmented was THE FIRST (otherwise,
 647                  * options are already fixed) and make it ONCE
 648                  * on the initial skb, so that all the following fragments
 649                  * will inherit fixed options.
 650                  */
 651                 if (offset == 0)
 652                         ip_options_fragment(skb);
 653
 654                 /*
 655                  *      Added AC : If we are fragmenting a fragment that's not the
 656                  *                 last fragment then keep MF on each bit
 657                  */
 658                 if (left > 0 || not_last_frag)
 659                         iph->frag_off |= htons(IP_MF);
 660                 ptr += len;
 661                 offset += len;
 662
 663                 /*
 664                  *      Put this fragment into the sending queue.
 665                  */
 666                 iph->tot_len = htons(len + hlen);
 667
 668                 ip_send_check(iph);
 669
 670                 err = output(skb2);
 671                 if (err)
 672                         goto fail;
 673
 674                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 675         }
             /* Every byte has been copied into fragments; the original is done. */
 676         kfree_skb(skb);
 677         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 678         return err;
 679
 680 fail:
 681         kfree_skb(skb);
 682         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 683         return err;
 684 }
 685
 686 EXPORT_SYMBOL(ip_fragment);
 687
 688 int
 689 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 690 {
 691         struct iovec *iov = from;
 692
 693         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 694                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 695                         return -EFAULT;
 696         } else {
 697                 __wsum csum = 0;
 698                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 699                         return -EFAULT;
 700                 skb->csum = csum_block_add(skb->csum, csum, odd);
 701         }
 702         return 0;
 703 }
 704
 705 static inline __wsum
 706 csum_page(struct page *page, int offset, int copy)
 707 {
 708         char *kaddr;
 709         __wsum csum;
 710         kaddr = kmap(page);
 711         csum = csum_partial(kaddr + offset, copy, 0);
 712         kunmap(page);
 713         return csum;
 714 }
 715
 716 static inline int ip_ufo_append_data(struct sock *sk,
 717                         int getfrag(void *from, char *to, int offset, int len,
 718                                int odd, struct sk_buff *skb),
 719                         void *from, int length, int hh_len, int fragheaderlen,
 720                         int transhdrlen, int mtu,unsigned int flags)
 721 {
 722         struct sk_buff *skb;
 723         int err;
 724
 725         /* There is support for UDP fragmentation offload by network
 726          * device, so create one single skb packet containing complete
 727          * udp datagram
 728          */
 729         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 730                 skb = sock_alloc_send_skb(sk,
 731                         hh_len + fragheaderlen + transhdrlen + 20,
 732                         (flags & MSG_DONTWAIT), &err);
 733
 734                 if (skb == NULL)
 735                         return err;
 736
 737                 /* reserve space for Hardware header */
 738                 skb_reserve(skb, hh_len);
 739
 740                 /* create space for UDP/IP header */
 741                 skb_put(skb,fragheaderlen + transhdrlen);
 742
 743                 /* initialize network header pointer */
 744                 skb_reset_network_header(skb);
 745
 746                 /* initialize protocol header pointer */
 747                 skb->transport_header = skb->network_header + fragheaderlen;
 748
 749                 skb->ip_summed = CHECKSUM_PARTIAL;
 750                 skb->csum = 0;
 751                 sk->sk_sndmsg_off = 0;
 752         }
 753
 754         err = skb_append_datato_frags(sk,skb, getfrag, from,
 755                                (length - transhdrlen));
 756         if (!err) {
 757                 /* specify the length of each IP datagram fragment*/
 758                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 759                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 760                 __skb_queue_tail(&sk->sk_write_queue, skb);
 761
 762                 return 0;
 763         }
 764         /* There is not enough support do UFO ,
 765          * so follow normal path
 766          */
 767         kfree_skb(skb);
 768         return err;
 769 }
 770
 771 /*
 772  *      ip_append_data() and ip_append_page() can make one large IP datagram
 773  *      from many pieces of data. Each piece will be held on the socket
 774  *      until ip_push_pending_frames() is called. Each piece can be a page
 775  *      or non-page data.
 776  *
 777  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 778  *      this interface potentially.
 779  *
 780  *      LATER: length must be adjusted by pad at tail, when it is required.
 781  */
 782 int ip_append_data(struct sock *sk,
 783                    int getfrag(void *from, char *to, int offset, int len,
 784                                int odd, struct sk_buff *skb),
 785                    void *from, int length, int transhdrlen,
 786                    struct ipcm_cookie *ipc, struct rtable *rt,
 787                    unsigned int flags)
 788 {
 789         struct inet_sock *inet = inet_sk(sk);
 790         struct sk_buff *skb;
 791
 792         struct ip_options *opt = NULL;
 793         int hh_len;
 794         int exthdrlen;
 795         int mtu;
 796         int copy;
 797         int err;
 798         int offset = 0;
 799         unsigned int maxfraglen, fragheaderlen;
 800         int csummode = CHECKSUM_NONE;
 801
 802         if (flags&MSG_PROBE)
 803                 return 0;
 804
 805         if (skb_queue_empty(&sk->sk_write_queue)) {
 806                 /*
 807                  * setup for corking.
 808                  */
 809                 opt = ipc->opt;
 810                 if (opt) {
 811                         if (inet->cork.opt == NULL) {
 812                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 813                                 if (unlikely(inet->cork.opt == NULL))
 814                                         return -ENOBUFS;
 815                         }
 816                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 817                         inet->cork.flags |= IPCORK_OPT;
 818                         inet->cork.addr = ipc->addr;
 819                 }
 820                 dst_hold(&rt->u.dst);
 821                 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 822                                             rt->u.dst.dev->mtu :
 823                                             dst_mtu(rt->u.dst.path);
 824                 inet->cork.rt = rt;
 825                 inet->cork.length = 0;
 826                 sk->sk_sndmsg_page = NULL;
 827                 sk->sk_sndmsg_off = 0;
 828                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 829                         length += exthdrlen;
 830                         transhdrlen += exthdrlen;
 831                 }
 832         } else {
 833                 rt = inet->cork.rt;
 834                 if (inet->cork.flags & IPCORK_OPT)
 835                         opt = inet->cork.opt;
 836
 837                 transhdrlen = 0;
 838                 exthdrlen = 0;
 839                 mtu = inet->cork.fragsize;
 840         }
 841         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 842
 843         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 844         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 845
 846         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 847                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 848                 return -EMSGSIZE;
 849         }
 850
 851         /*
 852          * transhdrlen > 0 means that this is the first fragment and we wish
 853          * it won't be fragmented in the future.
 854          */
 855         if (transhdrlen &&
 856             length + fragheaderlen <= mtu &&
 857             rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
 858             !exthdrlen)
 859                 csummode = CHECKSUM_PARTIAL;
 860
 861         inet->cork.length += length;
 862         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 863                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 864
 865                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 866                                          fragheaderlen, transhdrlen, mtu,
 867                                          flags);
 868                 if (err)
 869                         goto error;
 870                 return 0;
 871         }
 872
 873         /* So, what's going on in the loop below?
 874          *
 875          * We use calculated fragment length to generate chained skb,
 876          * each of segments is IP fragment ready for sending to network after
 877          * adding appropriate IP header.
 878          */
 879
 880         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 881                 goto alloc_new_skb;
 882
 883         while (length > 0) {
 884                 /* Check if the remaining data fits into current packet. */
 885                 copy = mtu - skb->len;
 886                 if (copy < length)
 887                         copy = maxfraglen - skb->len;
 888                 if (copy <= 0) {
 889                         char *data;
 890                         unsigned int datalen;
 891                         unsigned int fraglen;
 892                         unsigned int fraggap;
 893                         unsigned int alloclen;
 894                         struct sk_buff *skb_prev;
 895 alloc_new_skb:
 896                         skb_prev = skb;
 897                         if (skb_prev)
 898                                 fraggap = skb_prev->len - maxfraglen;
 899                         else
 900                                 fraggap = 0;
 901
 902                         /*
 903                          * If remaining data exceeds the mtu,
 904                          * we know we need more fragment(s).
 905                          */
 906                         datalen = length + fraggap;
 907                         if (datalen > mtu - fragheaderlen)
 908                                 datalen = maxfraglen - fragheaderlen;
 909                         fraglen = datalen + fragheaderlen;
 910
 911                         if ((flags & MSG_MORE) &&
 912                             !(rt->u.dst.dev->features&NETIF_F_SG))
 913                                 alloclen = mtu;
 914                         else
 915                                 alloclen = datalen + fragheaderlen;
 916
 917                         /* The last fragment gets additional space at tail.
 918                          * Note, with MSG_MORE we overallocate on fragments,
 919                          * because we have no idea what fragment will be
 920                          * the last.
 921                          */
 922                         if (datalen == length + fraggap)
 923                                 alloclen += rt->u.dst.trailer_len;
 924
 925                         if (transhdrlen) {
 926                                 skb = sock_alloc_send_skb(sk,
 927                                                 alloclen + hh_len + 15,
 928                                                 (flags & MSG_DONTWAIT), &err);
 929                         } else {
 930                                 skb = NULL;
 931                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 932                                     2 * sk->sk_sndbuf)
 933                                         skb = sock_wmalloc(sk,
 934                                                            alloclen + hh_len + 15, 1,
 935                                                            sk->sk_allocation);
 936                                 if (unlikely(skb == NULL))
 937                                         err = -ENOBUFS;
 938                         }
 939                         if (skb == NULL)
 940                                 goto error;
 941
 942                         /*
 943                          *      Fill in the control structures
 944                          */
 945                         skb->ip_summed = csummode;
 946                         skb->csum = 0;
 947                         skb_reserve(skb, hh_len);
 948
 949                         /*
 950                          *      Find where to start putting bytes.
 951                          */
 952                         data = skb_put(skb, fraglen);
 953                         skb_set_network_header(skb, exthdrlen);
 954                         skb->transport_header = (skb->network_header +
 955                                                  fragheaderlen);
 956                         data += fragheaderlen;
 957
 958                         if (fraggap) {
 959                                 skb->csum = skb_copy_and_csum_bits(
 960                                         skb_prev, maxfraglen,
 961                                         data + transhdrlen, fraggap, 0);
 962                                 skb_prev->csum = csum_sub(skb_prev->csum,
 963                                                           skb->csum);
 964                                 data += fraggap;
 965                                 pskb_trim_unique(skb_prev, maxfraglen);
 966                         }
 967
 968                         copy = datalen - transhdrlen - fraggap;
 969                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 970                                 err = -EFAULT;
 971                                 kfree_skb(skb);
 972                                 goto error;
 973                         }
 974
 975                         offset += copy;
 976                         length -= datalen - fraggap;
 977                         transhdrlen = 0;
 978                         exthdrlen = 0;
 979                         csummode = CHECKSUM_NONE;
 980
 981                         /*
 982                          * Put the packet on the pending queue.
 983                          */
 984                         __skb_queue_tail(&sk->sk_write_queue, skb);
 985                         continue;
 986                 }
 987
 988                 if (copy > length)
 989                         copy = length;
 990
 991                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 992                         unsigned int off;
 993
 994                         off = skb->len;
 995                         if (getfrag(from, skb_put(skb, copy),
 996                                         offset, copy, off, skb) < 0) {
 997                                 __skb_trim(skb, off);
 998                                 err = -EFAULT;
 999                                 goto error;
1000                         }
1001                 } else {
1002                         int i = skb_shinfo(skb)->nr_frags;
1003                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1004                         struct page *page = sk->sk_sndmsg_page;
1005                         int off = sk->sk_sndmsg_off;
1006                         unsigned int left;
1007
1008                         if (page && (left = PAGE_SIZE - off) > 0) {
1009                                 if (copy >= left)
1010                                         copy = left;
1011                                 if (page != frag->page) {
1012                                         if (i == MAX_SKB_FRAGS) {
1013                                                 err = -EMSGSIZE;
1014                                                 goto error;
1015                                         }
1016                                         get_page(page);
1017                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1018                                         frag = &skb_shinfo(skb)->frags[i];
1019                                 }
1020                         } else if (i < MAX_SKB_FRAGS) {
1021                                 if (copy > PAGE_SIZE)
1022                                         copy = PAGE_SIZE;
1023                                 page = alloc_pages(sk->sk_allocation, 0);
1024                                 if (page == NULL)  {
1025                                         err = -ENOMEM;
1026                                         goto error;
1027                                 }
1028                                 sk->sk_sndmsg_page = page;
1029                                 sk->sk_sndmsg_off = 0;
1030
1031                                 skb_fill_page_desc(skb, i, page, 0, 0);
1032                                 frag = &skb_shinfo(skb)->frags[i];
1033                         } else {
1034                                 err = -EMSGSIZE;
1035                                 goto error;
1036                         }
1037                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1038                                 err = -EFAULT;
1039                                 goto error;
1040                         }
1041                         sk->sk_sndmsg_off += copy;
1042                         frag->size += copy;
1043                         skb->len += copy;
1044                         skb->data_len += copy;
1045                         skb->truesize += copy;
1046                         atomic_add(copy, &sk->sk_wmem_alloc);
1047                 }
1048                 offset += copy;
1049                 length -= copy;
1050         }
1051
1052         return 0;
1053
1054 error:
1055         inet->cork.length -= length;
1056         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1057         return err;
1058 }
1059
1060 ssize_t ip_append_page(struct sock *sk, struct page *page,
1061                        int offset, size_t size, int flags)
1062 {
             /*
              * Append size bytes of page, starting at offset, to the datagram
              * that is already corked on sk. The page is attached to the skb
              * as a fragment (a page reference is taken); its contents are
              * not copied.
              *
              * Returns 0 on success (also for MSG_PROBE, which does nothing)
              * or a negative errno:
              *   -EPERM       inet->hdrincl is set
              *   -EINVAL      nothing is queued on the socket yet
              *   -EOPNOTSUPP  the output device lacks NETIF_F_SG
              *   -EMSGSIZE    0xFFFF limit exceeded, or no fragment slot left
              *   -ENOBUFS     allocating a follow-up skb failed
              */
1063         struct inet_sock *inet = inet_sk(sk);
1064         struct sk_buff *skb;
1065         struct rtable *rt;
1066         struct ip_options *opt = NULL;
1067         int hh_len;
1068         int mtu;
1069         int len;
1070         int err;
1071         unsigned int maxfraglen, fragheaderlen, fraggap;
1072
1073         if (inet->hdrincl)
1074                 return -EPERM;
1075
1076         if (flags&MSG_PROBE)
1077                 return 0;
1078
1079         if (skb_queue_empty(&sk->sk_write_queue))
1080                 return -EINVAL;
1081
             /* Route, options and fragment size come from the cork state. */
1082         rt = inet->cork.rt;
1083         if (inet->cork.flags & IPCORK_OPT)
1084                 opt = inet->cork.opt;
1085
1086         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1087                 return -EOPNOTSUPP;
1088
1089         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1090         mtu = inet->cork.fragsize;
1091
             /*
              * fragheaderlen: IP header plus options carried by every fragment.
              * maxfraglen: largest fragment (header included) whose payload is
              * a multiple of 8 bytes.
              */
1092         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1093         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1094
1095         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1096                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1097                 return -EMSGSIZE;
1098         }
1099
             /*
              * NOTE(review): this looks redundant with the skb_queue_empty()
              * test above - confirm nothing can empty the queue in between.
              */
1100         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1101                 return -EINVAL;
1102
1103         inet->cork.length += size;
             /*
              * UDP on a NETIF_F_UFO device: mark the tail skb for GSO. The loop
              * below then attaches all of the data to that one skb.
              */
1104         if ((sk->sk_protocol == IPPROTO_UDP) &&
1105             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1106                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1107                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1108         }
1109
1110
1111         while (size > 0) {
1112                 int i;
1113
1114                 if (skb_is_gso(skb))
1115                         len = size;
1116                 else {
1117
1118                         /* Check if the remaining data fits into current packet. */
1119                         len = mtu - skb->len;
1120                         if (len < size)
1121                                 len = maxfraglen - skb->len;
1122                 }
1123                 if (len <= 0) {
1124                         struct sk_buff *skb_prev;
1125                         int alloclen;
1126
                             /*
                              * The current skb is full: start a new one whose
                              * linear area holds only the IP header room plus
                              * the fraggap bytes that overhang maxfraglen in
                              * the previous skb.
                              */
1127                         skb_prev = skb;
1128                         fraggap = skb_prev->len - maxfraglen;
1129
1130                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1131                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1132                         if (unlikely(!skb)) {
1133                                 err = -ENOBUFS;
1134                                 goto error;
1135                         }
1136
1137                         /*
1138                          *      Fill in the control structures
1139                          */
1140                         skb->ip_summed = CHECKSUM_NONE;
1141                         skb->csum = 0;
1142                         skb_reserve(skb, hh_len);
1143
1144                         /*
1145                          *      Find where to start putting bytes.
1146                          */
1147                         skb_put(skb, fragheaderlen + fraggap);
1148                         skb_reset_network_header(skb);
1149                         skb->transport_header = (skb->network_header +
1150                                                  fragheaderlen);
                             /*
                              * Move the overhanging bytes across, take their
                              * checksum back out of skb_prev->csum and trim
                              * skb_prev to maxfraglen.
                              */
1151                         if (fraggap) {
1152                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1153                                                                    maxfraglen,
1154                                                     skb_transport_header(skb),
1155                                                                    fraggap, 0);
1156                                 skb_prev->csum = csum_sub(skb_prev->csum,
1157                                                           skb->csum);
1158                                 pskb_trim_unique(skb_prev, maxfraglen);
1159                         }
1160
1161                         /*
1162                          * Put the packet on the pending queue.
1163                          */
1164                         __skb_queue_tail(&sk->sk_write_queue, skb);
1165                         continue;
1166                 }
1167
1168                 i = skb_shinfo(skb)->nr_frags;
1169                 if (len > size)
1170                         len = size;
                     /*
                      * Grow the last fragment when skb_can_coalesce() allows it;
                      * otherwise take a page reference and use a new fragment
                      * slot, failing with -EMSGSIZE once all slots are in use.
                      */
1171                 if (skb_can_coalesce(skb, i, page, offset)) {
1172                         skb_shinfo(skb)->frags[i-1].size += len;
1173                 } else if (i < MAX_SKB_FRAGS) {
1174                         get_page(page);
1175                         skb_fill_page_desc(skb, i, page, offset, len);
1176                 } else {
1177                         err = -EMSGSIZE;
1178                         goto error;
1179                 }
1180
                     /*
                      * Software checksum: fold the checksum of the new page
                      * range into skb->csum at position skb->len.
                      */
1181                 if (skb->ip_summed == CHECKSUM_NONE) {
1182                         __wsum csum;
1183                         csum = csum_page(page, offset, len);
1184                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1185                 }
1186
                     /*
                      * Account for the attached bytes in the skb and in the
                      * socket's write memory.
                      */
1187                 skb->len += len;
1188                 skb->data_len += len;
1189                 skb->truesize += len;
1190                 atomic_add(len, &sk->sk_wmem_alloc);
1191                 offset += len;
1192                 size -= len;
1193         }
1194         return 0;
1195
1196 error:
             /*
              * size now holds only the bytes that were not attached; take just
              * those back out of cork.length.
              */
1197         inet->cork.length -= size;
1198         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1199         return err;
1200 }
1201
1202 static void ip_cork_release(struct inet_sock *inet)
1203 {
1204         inet->cork.flags &= ~IPCORK_OPT;
1205         kfree(inet->cork.opt);
1206         inet->cork.opt = NULL;
1207         if (inet->cork.rt) {
1208                 ip_rt_put(inet->cork.rt);
1209                 inet->cork.rt = NULL;
1210         }
1211 }
1212
1213 /*
1214  *      Combined all pending IP fragments on the socket as one IP datagram
1215  *      and push them out.
1216  */
1217 int ip_push_pending_frames(struct sock *sk)
1218 {
1219         struct sk_buff *skb, *tmp_skb;
1220         struct sk_buff **tail_skb;
1221         struct inet_sock *inet = inet_sk(sk);
1222         struct ip_options *opt = NULL;
1223         struct rtable *rt = inet->cork.rt;
1224         struct iphdr *iph;
1225         __be16 df = 0;
1226         __u8 ttl;
1227         int err = 0;
1228
1229         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1230                 goto out;
1231         tail_skb = &(skb_shinfo(skb)->frag_list);
1232
1233         /* move skb->data to ip header from ext header */
1234         if (skb->data < skb_network_header(skb))
1235                 __skb_pull(skb, skb_network_offset(skb));
1236         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1237                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1238                 *tail_skb = tmp_skb;
1239                 tail_skb = &(tmp_skb->next);
1240                 skb->len += tmp_skb->len;
1241                 skb->data_len += tmp_skb->len;
1242                 skb->truesize += tmp_skb->truesize;
1243                 __sock_put(tmp_skb->sk);
1244                 tmp_skb->destructor = NULL;
1245                 tmp_skb->sk = NULL;
1246         }
1247
1248         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1249          * to fragment the frame generated here. No matter, what transforms
1250          * how transforms change size of the packet, it will come out.
1251          */
1252         if (inet->pmtudisc < IP_PMTUDISC_DO)
1253                 skb->local_df = 1;
1254
1255         /* DF bit is set when we want to see DF on outgoing frames.
1256          * If local_df is set too, we still allow to fragment this frame
1257          * locally. */
1258         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1259             (skb->len <= dst_mtu(&rt->u.dst) &&
1260              ip_dont_fragment(sk, &rt->u.dst)))
1261                 df = htons(IP_DF);
1262
1263         if (inet->cork.flags & IPCORK_OPT)
1264                 opt = inet->cork.opt;
1265
1266         if (rt->rt_type == RTN_MULTICAST)
1267                 ttl = inet->mc_ttl;
1268         else
1269                 ttl = ip_select_ttl(inet, &rt->u.dst);
1270
1271         iph = (struct iphdr *)skb->data;
1272         iph->version = 4;
1273         iph->ihl = 5;
1274         if (opt) {
1275                 iph->ihl += opt->optlen>>2;
1276                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1277         }
1278         iph->tos = inet->tos;
1279         iph->frag_off = df;
1280         ip_select_ident(iph, &rt->u.dst, sk);
1281         iph->ttl = ttl;
1282         iph->protocol = sk->sk_protocol;
1283         iph->saddr = rt->rt_src;
1284         iph->daddr = rt->rt_dst;
1285
1286         skb->priority = sk->sk_priority;
1287         skb->dst = dst_clone(&rt->u.dst);
1288
1289         if (iph->protocol == IPPROTO_ICMP)
1290                 icmp_out_count(((struct icmphdr *)
1291                         skb_transport_header(skb))->type);
1292
1293         /* Netfilter gets whole the not fragmented skb. */
1294         err = ip_local_out(skb);
1295         if (err) {
1296                 if (err > 0)
1297                         err = inet->recverr ? net_xmit_errno(err) : 0;
1298                 if (err)
1299                         goto error;
1300         }
1301
1302 out:
1303         ip_cork_release(inet);
1304         return err;
1305
1306 error:
1307         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1308         goto out;
1309 }
1310
1311 /*
1312  *      Throw away all pending data on the socket.
1313  */
1314 void ip_flush_pending_frames(struct sock *sk)
1315 {
1316         struct sk_buff *skb;
1317
1318         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1319                 kfree_skb(skb);
1320
1321         ip_cork_release(inet_sk(sk));
1322 }
1323
1324
1325 /*
1326  *      Fetch data from kernel space and fill in checksum if needed.
1327  */
1328 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1329                               int len, int odd, struct sk_buff *skb)
1330 {
1331         __wsum csum;
1332
1333         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1334         skb->csum = csum_block_add(skb->csum, csum, odd);
1335         return 0;
1336 }
1337
1338 /*
1339  *      Generic function to send a packet as reply to another packet.
1340  *      Used to send TCP resets so far. ICMP should use this function too.
1341  *
1342  *      Should run single threaded per socket because it uses the sock
1343  *      structure to pass arguments.
1344  */
1345 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1346                    unsigned int len)
1347 {
1348         struct inet_sock *inet = inet_sk(sk);
1349         struct {
1350                 struct ip_options       opt;
1351                 char                    data[40];
1352         } replyopts;
1353         struct ipcm_cookie ipc;
1354         __be32 daddr;
1355         struct rtable *rt = (struct rtable*)skb->dst;
1356
1357         if (ip_options_echo(&replyopts.opt, skb))
1358                 return;
1359
1360         daddr = ipc.addr = rt->rt_src;
1361         ipc.opt = NULL;
1362
1363         if (replyopts.opt.optlen) {
1364                 ipc.opt = &replyopts.opt;
1365
1366                 if (ipc.opt->srr)
1367                         daddr = replyopts.opt.faddr;
1368         }
1369
1370         {
1371                 struct flowi fl = { .oif = arg->bound_dev_if,
1372                                     .nl_u = { .ip4_u =
1373                                               { .daddr = daddr,
1374                                                 .saddr = rt->rt_spec_dst,
1375                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1376                                     /* Not quite clean, but right. */
1377                                     .uli_u = { .ports =
1378                                                { .sport = tcp_hdr(skb)->dest,
1379                                                  .dport = tcp_hdr(skb)->source } },
1380                                     .proto = sk->sk_protocol };
1381                 security_skb_classify_flow(skb, &fl);
1382                 if (ip_route_output_key(sk->sk_net, &rt, &fl))
1383                         return;
1384         }
1385
1386         /* And let IP do all the hard work.
1387
1388            This chunk is not reenterable, hence spinlock.
1389            Note that it uses the fact, that this function is called
1390            with locally disabled BH and that sk cannot be already spinlocked.
1391          */
1392         bh_lock_sock(sk);
1393         inet->tos = ip_hdr(skb)->tos;
1394         sk->sk_priority = skb->priority;
1395         sk->sk_protocol = ip_hdr(skb)->protocol;
1396         sk->sk_bound_dev_if = arg->bound_dev_if;
1397         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1398                        &ipc, rt, MSG_DONTWAIT);
1399         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1400                 if (arg->csumoffset >= 0)
1401                         *((__sum16 *)skb_transport_header(skb) +
1402                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1403                                                                 arg->csum));
1404                 skb->ip_summed = CHECKSUM_NONE;
1405                 ip_push_pending_frames(sk);
1406         }
1407
1408         bh_unlock_sock(sk);
1409
1410         ip_rt_put(rt);
1411 }
1412
1413 void __init ip_init(void)
1414 {
1415         ip_rt_init();
1416         inet_initpeers();
1417
1418 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1419         igmp_mc_proc_init();
1420 #endif
1421 }
1422
1423 EXPORT_SYMBOL(ip_generic_getfrag);
1424 EXPORT_SYMBOL(ip_queue_xmit);
1425 EXPORT_SYMBOL(ip_send_check);