net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
  18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/errno.h>
  32 #include <linux/kernel.h>
  33 #include <linux/string.h>
  34 #include <linux/socket.h>
  35 #include <linux/net.h>
  36 #include <linux/netdevice.h>
  37 #include <linux/if_arp.h>
  38 #include <linux/in6.h>
  39 #include <linux/tcp.h>
  40 #include <linux/route.h>
  41 #include <linux/module.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 int __ip6_local_out(struct sk_buff *skb)
  74 {
  75         int len;
  76
  77         len = skb->len - sizeof(struct ipv6hdr);
  78         if (len > IPV6_MAXPLEN)
  79                 len = 0;
  80         ipv6_hdr(skb)->payload_len = htons(len);
  81
  82         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
  83                        dst_output);
  84 }
  85
  86 int ip6_local_out(struct sk_buff *skb)
  87 {
  88         int err;
  89
  90         err = __ip6_local_out(skb);
  91         if (likely(err == 1))
  92                 err = dst_output(skb);
  93
  94         return err;
  95 }
  96 EXPORT_SYMBOL_GPL(ip6_local_out);
  97
  98 static int ip6_output_finish(struct sk_buff *skb)
  99 {
 100         struct dst_entry *dst = skb->dst;
 101
 102         if (dst->hh)
 103                 return neigh_hh_output(dst->hh, skb);
 104         else if (dst->neighbour)
 105                 return dst->neighbour->output(skb);
 106
 107         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 108         kfree_skb(skb);
 109         return -EINVAL;
 110
 111 }
 112
 113 /* dev_loopback_xmit for use with netfilter. */
 114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 115 {
 116         skb_reset_mac_header(newskb);
 117         __skb_pull(newskb, skb_network_offset(newskb));
 118         newskb->pkt_type = PACKET_LOOPBACK;
 119         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 120         BUG_TRAP(newskb->dst);
 121
 122         netif_rx(newskb);
 123         return 0;
 124 }
 125
 126
 127 static int ip6_output2(struct sk_buff *skb)
 128 {
 129         struct dst_entry *dst = skb->dst;
 130         struct net_device *dev = dst->dev;
 131
 132         skb->protocol = htons(ETH_P_IPV6);
 133         skb->dev = dev;
 134
 135         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 136                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 137                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 138
 139                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 140                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 141                                         &ipv6_hdr(skb)->saddr)) {
 142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 143
 144                         /* Do not check for IFF_ALLMULTI; multicast routing
 145                            is not supported in any case.
 146                          */
 147                         if (newskb)
 148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 149                                         NULL, newskb->dev,
 150                                         ip6_dev_loopback_xmit);
 151
 152                         if (ipv6_hdr(skb)->hop_limit == 0) {
 153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
 154                                 kfree_skb(skb);
 155                                 return 0;
 156                         }
 157                 }
 158
 159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 160         }
 161
 162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 163                        ip6_output_finish);
 164 }
 165
 166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 167 {
 168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 169
 170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 171                skb->dst->dev->mtu : dst_mtu(skb->dst);
 172 }
 173
 174 int ip6_output(struct sk_buff *skb)
 175 {
 176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 177                                 dst_allfrag(skb->dst))
 178                 return ip6_fragment(skb, ip6_output2);
 179         else
 180                 return ip6_output2(skb);
 181 }
 182
 183 /*
 184  *      xmit an sk_buff (used by TCP)
 185  */
 186
 187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 188              struct ipv6_txoptions *opt, int ipfragok)
 189 {
 190         struct ipv6_pinfo *np = inet6_sk(sk);
 191         struct in6_addr *first_hop = &fl->fl6_dst;
 192         struct dst_entry *dst = skb->dst;
 193         struct ipv6hdr *hdr;
 194         u8  proto = fl->proto;
 195         int seg_len = skb->len;
 196         int hlimit, tclass;
 197         u32 mtu;
 198
 199         if (opt) {
 200                 unsigned int head_room;
 201
 202                 /* First: exthdrs may take lots of space (~8K for now)
 203                    MAX_HEADER is not enough.
 204                  */
 205                 head_room = opt->opt_nflen + opt->opt_flen;
 206                 seg_len += head_room;
 207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 208
 209                 if (skb_headroom(skb) < head_room) {
 210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 211                         if (skb2 == NULL) {
 212                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 213                                               IPSTATS_MIB_OUTDISCARDS);
 214                                 kfree_skb(skb);
 215                                 return -ENOBUFS;
 216                         }
 217                         kfree_skb(skb);
 218                         skb = skb2;
 219                         if (sk)
 220                                 skb_set_owner_w(skb, sk);
 221                 }
 222                 if (opt->opt_flen)
 223                         ipv6_push_frag_opts(skb, opt, &proto);
 224                 if (opt->opt_nflen)
 225                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 226         }
 227
 228         skb_push(skb, sizeof(struct ipv6hdr));
 229         skb_reset_network_header(skb);
 230         hdr = ipv6_hdr(skb);
 231
 232         /* Allow local fragmentation. */
 233         if (ipfragok)
 234                 skb->local_df = 1;
 235
 236         /*
 237          *      Fill in the IPv6 header
 238          */
 239
 240         hlimit = -1;
 241         if (np)
 242                 hlimit = np->hop_limit;
 243         if (hlimit < 0)
 244                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
 245         if (hlimit < 0)
 246                 hlimit = ipv6_get_hoplimit(dst->dev);
 247
 248         tclass = -1;
 249         if (np)
 250                 tclass = np->tclass;
 251         if (tclass < 0)
 252                 tclass = 0;
 253
 254         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 255
 256         hdr->payload_len = htons(seg_len);
 257         hdr->nexthdr = proto;
 258         hdr->hop_limit = hlimit;
 259
 260         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 261         ipv6_addr_copy(&hdr->daddr, first_hop);
 262
 263         skb->priority = sk->sk_priority;
 264         skb->mark = sk->sk_mark;
 265
 266         mtu = dst_mtu(dst);
 267         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
 268                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 269                               IPSTATS_MIB_OUTREQUESTS);
 270                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 271                                 dst_output);
 272         }
 273
 274         if (net_ratelimit())
 275                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 276         skb->dev = dst->dev;
 277         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 278         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 279         kfree_skb(skb);
 280         return -EMSGSIZE;
 281 }
 282
 283 EXPORT_SYMBOL(ip6_xmit);
 284
 285 /*
 286  *      To avoid extra problems ND packets are send through this
 287  *      routine. It's code duplication but I really want to avoid
 288  *      extra checks since ipv6_build_header is used by TCP (which
 289  *      is for us performance critical)
 290  */
 291
 292 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 293                struct in6_addr *saddr, struct in6_addr *daddr,
 294                int proto, int len)
 295 {
 296         struct ipv6_pinfo *np = inet6_sk(sk);
 297         struct ipv6hdr *hdr;
 298         int totlen;
 299
 300         skb->protocol = htons(ETH_P_IPV6);
 301         skb->dev = dev;
 302
 303         totlen = len + sizeof(struct ipv6hdr);
 304
 305         skb_reset_network_header(skb);
 306         skb_put(skb, sizeof(struct ipv6hdr));
 307         hdr = ipv6_hdr(skb);
 308
 309         *(__be32*)hdr = htonl(0x60000000);
 310
 311         hdr->payload_len = htons(len);
 312         hdr->nexthdr = proto;
 313         hdr->hop_limit = np->hop_limit;
 314
 315         ipv6_addr_copy(&hdr->saddr, saddr);
 316         ipv6_addr_copy(&hdr->daddr, daddr);
 317
 318         return 0;
 319 }
 320
 321 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 322 {
 323         struct ip6_ra_chain *ra;
 324         struct sock *last = NULL;
 325
 326         read_lock(&ip6_ra_lock);
 327         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 328                 struct sock *sk = ra->sk;
 329                 if (sk && ra->sel == sel &&
 330                     (!sk->sk_bound_dev_if ||
 331                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 332                         if (last) {
 333                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 334                                 if (skb2)
 335                                         rawv6_rcv(last, skb2);
 336                         }
 337                         last = sk;
 338                 }
 339         }
 340
 341         if (last) {
 342                 rawv6_rcv(last, skb);
 343                 read_unlock(&ip6_ra_lock);
 344                 return 1;
 345         }
 346         read_unlock(&ip6_ra_lock);
 347         return 0;
 348 }
 349
 350 static int ip6_forward_proxy_check(struct sk_buff *skb)
 351 {
 352         struct ipv6hdr *hdr = ipv6_hdr(skb);
 353         u8 nexthdr = hdr->nexthdr;
 354         int offset;
 355
 356         if (ipv6_ext_hdr(nexthdr)) {
 357                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 358                 if (offset < 0)
 359                         return 0;
 360         } else
 361                 offset = sizeof(struct ipv6hdr);
 362
 363         if (nexthdr == IPPROTO_ICMPV6) {
 364                 struct icmp6hdr *icmp6;
 365
 366                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 367                                          offset + 1 - skb->data)))
 368                         return 0;
 369
 370                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 371
 372                 switch (icmp6->icmp6_type) {
 373                 case NDISC_ROUTER_SOLICITATION:
 374                 case NDISC_ROUTER_ADVERTISEMENT:
 375                 case NDISC_NEIGHBOUR_SOLICITATION:
 376                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 377                 case NDISC_REDIRECT:
 378                         /* For reaction involving unicast neighbor discovery
 379                          * message destined to the proxied address, pass it to
 380                          * input function.
 381                          */
 382                         return 1;
 383                 default:
 384                         break;
 385                 }
 386         }
 387
 388         /*
 389          * The proxying router can't forward traffic sent to a link-local
 390          * address, so signal the sender and discard the packet. This
 391          * behavior is clarified by the MIPv6 specification.
 392          */
 393         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 394                 dst_link_failure(skb);
 395                 return -1;
 396         }
 397
 398         return 0;
 399 }
 400
 401 static inline int ip6_forward_finish(struct sk_buff *skb)
 402 {
 403         return dst_output(skb);
 404 }
 405
 406 int ip6_forward(struct sk_buff *skb)
 407 {
 408         struct dst_entry *dst = skb->dst;
 409         struct ipv6hdr *hdr = ipv6_hdr(skb);
 410         struct inet6_skb_parm *opt = IP6CB(skb);
 411
 412         if (ipv6_devconf.forwarding == 0)
 413                 goto error;
 414
 415         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 416                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 417                 goto drop;
 418         }
 419
 420         skb_forward_csum(skb);
 421
 422         /*
 423          *      We DO NOT make any processing on
 424          *      RA packets, pushing them to user level AS IS
 425          *      without ane WARRANTY that application will be able
 426          *      to interpret them. The reason is that we
 427          *      cannot make anything clever here.
 428          *
 429          *      We are not end-node, so that if packet contains
 430          *      AH/ESP, we cannot make anything.
 431          *      Defragmentation also would be mistake, RA packets
 432          *      cannot be fragmented, because there is no warranty
 433          *      that different fragments will go along one path. --ANK
 434          */
 435         if (opt->ra) {
 436                 u8 *ptr = skb_network_header(skb) + opt->ra;
 437                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 438                         return 0;
 439         }
 440
 441         /*
 442          *      check and decrement ttl
 443          */
 444         if (hdr->hop_limit <= 1) {
 445                 /* Force OUTPUT device used as source address */
 446                 skb->dev = dst->dev;
 447                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 448                             0, skb->dev);
 449                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 450
 451                 kfree_skb(skb);
 452                 return -ETIMEDOUT;
 453         }
 454
 455         /* XXX: idev->cnf.proxy_ndp? */
 456         if (ipv6_devconf.proxy_ndp &&
 457             pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
 458                 int proxied = ip6_forward_proxy_check(skb);
 459                 if (proxied > 0)
 460                         return ip6_input(skb);
 461                 else if (proxied < 0) {
 462                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 463                         goto drop;
 464                 }
 465         }
 466
 467         if (!xfrm6_route_forward(skb)) {
 468                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 469                 goto drop;
 470         }
 471         dst = skb->dst;
 472
 473         /* IPv6 specs say nothing about it, but it is clear that we cannot
 474            send redirects to source routed frames.
 475            We don't send redirects to frames decapsulated from IPsec.
 476          */
 477         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 478             !skb->sp) {
 479                 struct in6_addr *target = NULL;
 480                 struct rt6_info *rt;
 481                 struct neighbour *n = dst->neighbour;
 482
 483                 /*
 484                  *      incoming and outgoing devices are the same
 485                  *      send a redirect.
 486                  */
 487
 488                 rt = (struct rt6_info *) dst;
 489                 if ((rt->rt6i_flags & RTF_GATEWAY))
 490                         target = (struct in6_addr*)&n->primary_key;
 491                 else
 492                         target = &hdr->daddr;
 493
 494                 /* Limit redirects both by destination (here)
 495                    and by source (inside ndisc_send_redirect)
 496                  */
 497                 if (xrlim_allow(dst, 1*HZ))
 498                         ndisc_send_redirect(skb, n, target);
 499         } else {
 500                 int addrtype = ipv6_addr_type(&hdr->saddr);
 501
 502                 /* This check is security critical. */
 503                 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
 504                         goto error;
 505                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 506                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 507                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 508                         goto error;
 509                 }
 510         }
 511
 512         if (skb->len > dst_mtu(dst)) {
 513                 /* Again, force OUTPUT device used as source address */
 514                 skb->dev = dst->dev;
 515                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 516                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 517                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 518                 kfree_skb(skb);
 519                 return -EMSGSIZE;
 520         }
 521
 522         if (skb_cow(skb, dst->dev->hard_header_len)) {
 523                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 524                 goto drop;
 525         }
 526
 527         hdr = ipv6_hdr(skb);
 528
 529         /* Mangling hops number delayed to point after skb COW */
 530
 531         hdr->hop_limit--;
 532
 533         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 534         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 535                        ip6_forward_finish);
 536
 537 error:
 538         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 539 drop:
 540         kfree_skb(skb);
 541         return -EINVAL;
 542 }
 543
 544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 545 {
 546         to->pkt_type = from->pkt_type;
 547         to->priority = from->priority;
 548         to->protocol = from->protocol;
 549         dst_release(to->dst);
 550         to->dst = dst_clone(from->dst);
 551         to->dev = from->dev;
 552         to->mark = from->mark;
 553
 554 #ifdef CONFIG_NET_SCHED
 555         to->tc_index = from->tc_index;
 556 #endif
 557         nf_copy(to, from);
 558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 560         to->nf_trace = from->nf_trace;
 561 #endif
 562         skb_copy_secmark(to, from);
 563 }
 564
 565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 566 {
 567         u16 offset = sizeof(struct ipv6hdr);
 568         struct ipv6_opt_hdr *exthdr =
 569                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 570         unsigned int packet_len = skb->tail - skb->network_header;
 571         int found_rhdr = 0;
 572         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 573
 574         while (offset + 1 <= packet_len) {
 575
 576                 switch (**nexthdr) {
 577
 578                 case NEXTHDR_HOP:
 579                         break;
 580                 case NEXTHDR_ROUTING:
 581                         found_rhdr = 1;
 582                         break;
 583                 case NEXTHDR_DEST:
 584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 585                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 586                                 break;
 587 #endif
 588                         if (found_rhdr)
 589                                 return offset;
 590                         break;
 591                 default :
 592                         return offset;
 593                 }
 594
 595                 offset += ipv6_optlen(exthdr);
 596                 *nexthdr = &exthdr->nexthdr;
 597                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 598                                                  offset);
 599         }
 600
 601         return offset;
 602 }
 603 EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 604
 605 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 606 {
 607         struct net_device *dev;
 608         struct sk_buff *frag;
 609         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 610         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 611         struct ipv6hdr *tmp_hdr;
 612         struct frag_hdr *fh;
 613         unsigned int mtu, hlen, left, len;
 614         __be32 frag_id = 0;
 615         int ptr, offset = 0, err=0;
 616         u8 *prevhdr, nexthdr = 0;
 617
 618         dev = rt->u.dst.dev;
 619         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 620         nexthdr = *prevhdr;
 621
 622         mtu = ip6_skb_dst_mtu(skb);
 623
 624         /* We must not fragment if the socket is set to force MTU discovery
 625          * or if the skb it not generated by a local socket.  (This last
 626          * check should be redundant, but it's free.)
 627          */
 628         if (!skb->local_df) {
 629                 skb->dev = skb->dst->dev;
 630                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 631                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 632                 kfree_skb(skb);
 633                 return -EMSGSIZE;
 634         }
 635
 636         if (np && np->frag_size < mtu) {
 637                 if (np->frag_size)
 638                         mtu = np->frag_size;
 639         }
 640         mtu -= hlen + sizeof(struct frag_hdr);
 641
 642         if (skb_shinfo(skb)->frag_list) {
 643                 int first_len = skb_pagelen(skb);
 644                 int truesizes = 0;
 645
 646                 if (first_len - hlen > mtu ||
 647                     ((first_len - hlen) & 7) ||
 648                     skb_cloned(skb))
 649                         goto slow_path;
 650
 651                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 652                         /* Correct geometry. */
 653                         if (frag->len > mtu ||
 654                             ((frag->len & 7) && frag->next) ||
 655                             skb_headroom(frag) < hlen)
 656                             goto slow_path;
 657
 658                         /* Partially cloned skb? */
 659                         if (skb_shared(frag))
 660                                 goto slow_path;
 661
 662                         BUG_ON(frag->sk);
 663                         if (skb->sk) {
 664                                 sock_hold(skb->sk);
 665                                 frag->sk = skb->sk;
 666                                 frag->destructor = sock_wfree;
 667                                 truesizes += frag->truesize;
 668                         }
 669                 }
 670
 671                 err = 0;
 672                 offset = 0;
 673                 frag = skb_shinfo(skb)->frag_list;
 674                 skb_shinfo(skb)->frag_list = NULL;
 675                 /* BUILD HEADER */
 676
 677                 *prevhdr = NEXTHDR_FRAGMENT;
 678                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 679                 if (!tmp_hdr) {
 680                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 681                         return -ENOMEM;
 682                 }
 683
 684                 __skb_pull(skb, hlen);
 685                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 686                 __skb_push(skb, hlen);
 687                 skb_reset_network_header(skb);
 688                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 689
 690                 ipv6_select_ident(skb, fh);
 691                 fh->nexthdr = nexthdr;
 692                 fh->reserved = 0;
 693                 fh->frag_off = htons(IP6_MF);
 694                 frag_id = fh->identification;
 695
 696                 first_len = skb_pagelen(skb);
 697                 skb->data_len = first_len - skb_headlen(skb);
 698                 skb->truesize -= truesizes;
 699                 skb->len = first_len;
 700                 ipv6_hdr(skb)->payload_len = htons(first_len -
 701                                                    sizeof(struct ipv6hdr));
 702
 703                 dst_hold(&rt->u.dst);
 704
 705                 for (;;) {
 706                         /* Prepare header of the next frame,
 707                          * before previous one went down. */
 708                         if (frag) {
 709                                 frag->ip_summed = CHECKSUM_NONE;
 710                                 skb_reset_transport_header(frag);
 711                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 712                                 __skb_push(frag, hlen);
 713                                 skb_reset_network_header(frag);
 714                                 memcpy(skb_network_header(frag), tmp_hdr,
 715                                        hlen);
 716                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 717                                 fh->nexthdr = nexthdr;
 718                                 fh->reserved = 0;
 719                                 fh->frag_off = htons(offset);
 720                                 if (frag->next != NULL)
 721                                         fh->frag_off |= htons(IP6_MF);
 722                                 fh->identification = frag_id;
 723                                 ipv6_hdr(frag)->payload_len =
 724                                                 htons(frag->len -
 725                                                       sizeof(struct ipv6hdr));
 726                                 ip6_copy_metadata(frag, skb);
 727                         }
 728
 729                         err = output(skb);
 730                         if(!err)
 731                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
 732
 733                         if (err || !frag)
 734                                 break;
 735
 736                         skb = frag;
 737                         frag = skb->next;
 738                         skb->next = NULL;
 739                 }
 740
 741                 kfree(tmp_hdr);
 742
 743                 if (err == 0) {
 744                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
 745                         dst_release(&rt->u.dst);
 746                         return 0;
 747                 }
 748
 749                 while (frag) {
 750                         skb = frag->next;
 751                         kfree_skb(frag);
 752                         frag = skb;
 753                 }
 754
 755                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
 756                 dst_release(&rt->u.dst);
 757                 return err;
 758         }
 759
 760 slow_path:
 761         left = skb->len - hlen;         /* Space per frame */
 762         ptr = hlen;                     /* Where to start from */
 763
 764         /*
 765          *      Fragment the datagram.
 766          */
 767
 768         *prevhdr = NEXTHDR_FRAGMENT;
 769
 770         /*
 771          *      Keep copying data until we run out.
 772          */
 773         while(left > 0) {
 774                 len = left;
 775                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 776                 if (len > mtu)
 777                         len = mtu;
 778                 /* IF: we are not sending upto and including the packet end
 779                    then align the next start on an eight byte boundary */
 780                 if (len < left) {
 781                         len &= ~7;
 782                 }
 783                 /*
 784                  *      Allocate buffer.
 785                  */
 786
 787                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 788                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 789                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 790                                       IPSTATS_MIB_FRAGFAILS);
 791                         err = -ENOMEM;
 792                         goto fail;
 793                 }
 794
 795                 /*
 796                  *      Set up data on packet
 797                  */
 798
 799                 ip6_copy_metadata(frag, skb);
 800                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 801                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 802                 skb_reset_network_header(frag);
 803                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 804                 frag->transport_header = (frag->network_header + hlen +
 805                                           sizeof(struct frag_hdr));
 806
 807                 /*
 808                  *      Charge the memory for the fragment to any owner
 809                  *      it might possess
 810                  */
 811                 if (skb->sk)
 812                         skb_set_owner_w(frag, skb->sk);
 813
 814                 /*
 815                  *      Copy the packet header into the new buffer.
 816                  */
 817                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 818
 819                 /*
 820                  *      Build fragment header.
 821                  */
 822                 fh->nexthdr = nexthdr;
 823                 fh->reserved = 0;
 824                 if (!frag_id) {
 825                         ipv6_select_ident(skb, fh);
 826                         frag_id = fh->identification;
 827                 } else
 828                         fh->identification = frag_id;
 829
 830                 /*
 831                  *      Copy a block of the IP datagram.
 832                  */
 833                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 834                         BUG();
 835                 left -= len;
 836
 837                 fh->frag_off = htons(offset);
 838                 if (left > 0)
 839                         fh->frag_off |= htons(IP6_MF);
 840                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 841                                                     sizeof(struct ipv6hdr));
 842
 843                 ptr += len;
 844                 offset += len;
 845
 846                 /*
 847                  *      Put this fragment into the sending queue.
 848                  */
 849                 err = output(frag);
 850                 if (err)
 851                         goto fail;
 852
 853                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
 854         }
 855         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 856                       IPSTATS_MIB_FRAGOKS);
 857         kfree_skb(skb);
 858         return err;
 859
 860 fail:
 861         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 862                       IPSTATS_MIB_FRAGFAILS);
 863         kfree_skb(skb);
 864         return err;
 865 }
 866
 867 static inline int ip6_rt_check(struct rt6key *rt_key,
 868                                struct in6_addr *fl_addr,
 869                                struct in6_addr *addr_cache)
 870 {
             /*
              * Tell whether a cached route fails to cover @fl_addr.
              * Returns 0 as soon as one of the two tests below vouches for
              * the address, and 1 when neither of them does.
              */

             /* A /128 route key that is exactly the flow address. */
             if (rt_key->plen == 128 && ipv6_addr_equal(fl_addr, &rt_key->addr))
                     return 0;

             /* The caller-supplied cached address, when there is one. */
             if (addr_cache != NULL && ipv6_addr_equal(fl_addr, addr_cache))
                     return 0;

             return 1;
 873 }
 874
 875 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 876                                           struct dst_entry *dst,
 877                                           struct flowi *fl)
 878 {
             /*
              * Validate a socket-cached route against the flow about to be
              * sent.  Hands @dst back when it is still usable; otherwise the
              * reference on it is dropped and NULL is returned.  A NULL @dst
              * is passed straight through.
              */
 879         struct ipv6_pinfo *np = inet6_sk(sk);
 880         struct rt6_info *rt = (struct rt6_info *)dst;
             int stale;

             if (dst == NULL)
                     return NULL;

             /*
              * Validating a route for a socket that is not connected is
              * not simple.  Routing by source, TOS and MSG_DONTROUTE is
              * not supported here.                  --ANK (980726)
              *
              * 1. ip6_rt_check(): for a host route, the cached destination
              *    must be the current one.  For a network route the check
              *    can still be made through the saved pointer to the last
              *    used address (daddr_cache).  The whole address is not
              *    saved (the main consumer is tcp, which does not need
              *    it), so this second trick only works on connected
              *    sockets.
              * 2. The outgoing interface, when the flow names one, must
              *    match as well.
              *
              * The tests run in this order and stop at the first failure.
              */
             stale = ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache);
 903 #ifdef CONFIG_IPV6_SUBTREES
             if (!stale)
                     stale = ip6_rt_check(&rt->rt6i_src, &fl->fl6_src,
                                          np->saddr_cache);
 905 #endif
             if (!stale && fl->oif)
                     stale = (fl->oif != dst->dev->ifindex);

             if (!stale)
                     return dst;

             dst_release(dst);
             return NULL;
 913 }
 914
 915 static int ip6_dst_lookup_tail(struct sock *sk,
 916                                struct dst_entry **dst, struct flowi *fl)
 917 {
             /*
              * Common tail of the route lookup helpers.
              *
              * On entry *dst is either NULL or a candidate route supplied by
              * the caller.  On a zero return *dst holds the route to use and
              * fl->fl6_src has been filled in if it was unspecified.  On
              * failure the route is released, *dst is set to NULL and the
              * error code is returned.
              */
 918         int err;
 919
             /* No candidate route was handed in: do a fresh lookup. */
 920         if (*dst == NULL)
 921                 *dst = ip6_route_output(sk, fl);
 922
             /* The entry can itself carry an error; hand it to the caller. */
 923         if ((err = (*dst)->error))
 924                 goto out_err_release;
 925
             /* Unspecified source address in the flow: choose one for this route. */
 926         if (ipv6_addr_any(&fl->fl6_src)) {
 927                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 928                 if (err)
 929                         goto out_err_release;
 930         }
 931
 932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 933         /*
 934          * Here if the dst entry we've looked up
 935          * has a neighbour entry that is not in a NUD_VALID
 936          * state and the src address from the flow is
 937          * marked as OPTIMISTIC, we release the found
 938          * dst entry and replace it instead with the
 939          * dst entry of the nexthop router
 940          */
 941         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
 942                 struct inet6_ifaddr *ifp;
 943                 struct flowi fl_gw;
 944                 int redirect;
 945
                     /* Find the address entry for the flow's source on this device. */
 946                 ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
 947                                       (*dst)->dev, 1);
 948
                     /* Read the flag first, then drop the reference on the entry. */
 949                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 950                 if (ifp)
 951                         in6_ifa_put(ifp);
 952
 953                 if (redirect) {
 954                         /*
 955                          * We need to get the dst entry for the
 956                          * default router instead
 957                          */
 958                         dst_release(*dst);
                             /* Same flow, but with an all-zero destination address. */
 959                         memcpy(&fl_gw, fl, sizeof(struct flowi));
 960                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 961                         *dst = ip6_route_output(sk, &fl_gw);
 962                         if ((err = (*dst)->error))
 963                                 goto out_err_release;
 964                 }
 965         }
 966 #endif
 967
 968         return 0;
 969
 970 out_err_release:
             /* Only "network unreachable" is counted as a missing route. */
 971         if (err == -ENETUNREACH)
 972                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
 973         dst_release(*dst);
 974         *dst = NULL;
 975         return err;
 976 }
 977
 978 /**
      *      ip6_dst_lookup - look up the route for a flow
      *      @sk: socket handed through to the route lookup
      *      @dst: where the resulting dst_entry pointer is stored
      *      @fl: flow describing the traffic to be routed
      *
      *      Always performs a lookup for @fl.  Whatever *@dst held on
      *      entry is overwritten, not released.
      *
      *      Returns zero on success, or a standard errno code on error,
      *      in which case *@dst is left NULL.
 987  */
 988 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 989 {
             int err;

             /* Start from an empty slot so there is no candidate to reuse. */
             *dst = NULL;
             err = ip6_dst_lookup_tail(sk, dst, fl);

             return err;
 992 }
 993 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 994
 995 /**
      *      ip6_sk_dst_lookup - route lookup that may reuse the socket's cached route
      *      @sk: socket providing the dst cache and route info (may be NULL)
      *      @dst: where the resulting dst_entry pointer is stored
      *      @fl: flow describing the traffic to be routed
      *
      *      When @sk is given, the route cached in the socket is tried
      *      first and kept if it is still valid for @fl; otherwise a
      *      regular lookup is done.  The socket dst lock is taken while
      *      the dst cache is examined, so this function can only be
      *      used in process context.
      *
      *      Returns zero on success, or a standard errno code on error.
1007  */
1008 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1009 {
             struct dst_entry *cached = NULL;

             if (sk) {
                     /* Fetch the cached route, then check it still fits @fl. */
                     cached = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
                     cached = ip6_sk_dst_check(sk, cached, fl);
             }

             /* A NULL candidate makes the tail do a full lookup. */
             *dst = cached;
             return ip6_dst_lookup_tail(sk, dst, fl);
1017 }
1018 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1019
1020 static inline int ip6_ufo_append_data(struct sock *sk,
1021                         int getfrag(void *from, char *to, int offset, int len,
1022                         int odd, struct sk_buff *skb),
1023                         void *from, int length, int hh_len, int fragheaderlen,
1024                         int transhdrlen, int mtu,unsigned int flags)
1025
1026 {
             /*
              * Append data to the single large skb used when the output
              * device offers UDP fragmentation offload (NETIF_F_UFO).
              *
              * If the write queue is empty a header-only skb is allocated
              * here; otherwise the skb at the tail of the queue is extended.
              * @length - @transhdrlen bytes are attached through
              * skb_append_datato_frags() and the GSO fields are set so the
              * datagram can be cut into IPv6 fragments further down.
              *
              * Returns 0 on success or a negative errno.  On failure an skb
              * allocated by this call is freed; an skb that was already on
              * the write queue stays there for the caller to flush.
              */
1027         struct sk_buff *skb;
             int fresh = 0;  /* set when skb was allocated here, not peeked */
1028         int err;
1029
1030         /* There is support for UDP large send offload by network
1031          * device, so create one single skb packet containing complete
1032          * udp datagram
1033          */
1034         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1035                 skb = sock_alloc_send_skb(sk,
1036                         hh_len + fragheaderlen + transhdrlen + 20,
1037                         (flags & MSG_DONTWAIT), &err);
                     /*
                      * Hand back the code sock_alloc_send_skb() stored in
                      * err; the failure need not be an out-of-memory one
                      * (e.g. a non-blocking send that would have to wait).
                      */
1038                 if (skb == NULL)
1039                         return err;
                     fresh = 1;
1040
1041                 /* reserve space for Hardware header */
1042                 skb_reserve(skb, hh_len);
1043
1044                 /* create space for UDP/IP header */
1045                 skb_put(skb,fragheaderlen + transhdrlen);
1046
1047                 /* initialize network header pointer */
1048                 skb_reset_network_header(skb);
1049
1050                 /* initialize protocol header pointer */
1051                 skb->transport_header = skb->network_header + fragheaderlen;
1052
1053                 skb->ip_summed = CHECKSUM_PARTIAL;
1054                 skb->csum = 0;
1055                 sk->sk_sndmsg_off = 0;
1056         }
1057
1058         err = skb_append_datato_frags(sk,skb, getfrag, from,
1059                                       (length - transhdrlen));
1060         if (!err) {
1061                 struct frag_hdr fhdr;
1062
1063                 /* Specify the length of each IPv6 datagram fragment.
                      * Every fragment but the last must carry a multiple
                      * of 8 octets, so round the size down.
                      */
1064                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1065                                              sizeof(struct frag_hdr)) & ~7;
1066                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1067                 ipv6_select_ident(skb, &fhdr);
1068                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                     /*
                      * A peeked skb is already linked into the write queue;
                      * queueing it a second time would corrupt the list.
                      */
1069                 if (fresh)
                             __skb_queue_tail(&sk->sk_write_queue, skb);
1070
1071                 return 0;
1072         }
1073         /* There is not enough support to do UDP LSO,
1074          * so follow normal path.  Only drop an skb that was allocated
              * above: a peeked one is still owned by the write queue.
1075          */
1076         if (fresh)
                     kfree_skb(skb);
1077
1078         return err;
1079 }
1080
1081 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1082         int offset, int len, int odd, struct sk_buff *skb),
1083         void *from, int length, int transhdrlen,
1084         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1085         struct rt6_info *rt, unsigned int flags)
1086 {
             /*
              * Append @length bytes, fetched through @getfrag, to the skbs
              * pending on sk->sk_write_queue, starting new skbs as the
              * MTU-derived limits are reached.
              *
              * @getfrag:     copies data from @from into a buffer; a negative
              *               return is treated as a fault (-EFAULT)
              * @from:        opaque cookie handed to @getfrag
              * @length:      number of bytes to append
              * @transhdrlen: transport header length; forced to 0 when the
              *               write queue is not empty
              * @hlimit, @tclass, @opt, @fl, @rt: recorded in the cork state
              *               when the queue is empty; when it is not, the
              *               values recorded earlier are used instead
              * @flags:       MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined
              *
              * Returns 0 on success (and immediately for MSG_PROBE) or a
              * negative errno.
              */
1087         struct inet_sock *inet = inet_sk(sk);
1088         struct ipv6_pinfo *np = inet6_sk(sk);
1089         struct sk_buff *skb;
1090         unsigned int maxfraglen, fragheaderlen;
1091         int exthdrlen;
1092         int hh_len;
1093         int mtu;
1094         int copy;
1095         int err;
1096         int offset = 0;
1097         int csummode = CHECKSUM_NONE;
1098
1099         if (flags&MSG_PROBE)
1100                 return 0;
1101         if (skb_queue_empty(&sk->sk_write_queue)) {
1102                 /*
1103                  * setup for corking
1104                  */
1105                 if (opt) {
                             /*
                              * Keep a private copy of the options: allocate a
                              * buffer when there is none, or reuse the existing
                              * one provided it is large enough.
                              */
1106                         if (np->cork.opt == NULL) {
1107                                 np->cork.opt = kmalloc(opt->tot_len,
1108                                                        sk->sk_allocation);
1109                                 if (unlikely(np->cork.opt == NULL))
1110                                         return -ENOBUFS;
1111                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1112                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1113                                 return -EINVAL;
1114                         }
1115                         memcpy(np->cork.opt, opt, opt->tot_len);
1116                         inet->cork.flags |= IPCORK_OPT;
1117                         /* need source address above miyazawa*/
1118                 }
                     /* The cork keeps its own reference on the route. */
1119                 dst_hold(&rt->u.dst);
1120                 np->cork.rt = rt;
1121                 inet->cork.fl = *fl;
1122                 np->cork.hop_limit = hlimit;
1123                 np->cork.tclass = tclass;
                     /*
                      * MTU: the device MTU when probing, else the path MTU of
                      * the route; a non-zero, smaller np->frag_size overrides it.
                      */
1124                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1125                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1126                 if (np->frag_size < mtu) {
1127                         if (np->frag_size)
1128                                 mtu = np->frag_size;
1129                 }
1130                 inet->cork.fragsize = mtu;
1131                 if (dst_allfrag(rt->u.dst.path))
1132                         inet->cork.flags |= IPCORK_ALLFRAG;
1133                 inet->cork.length = 0;
1134                 sk->sk_sndmsg_page = NULL;
1135                 sk->sk_sndmsg_off = 0;
                     /*
                      * Extension header room ahead of the transport header is
                      * accounted as part of both the data and the transport
                      * header for this first call.
                      */
1136                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1137                             rt->rt6i_nfheader_len;
1138                 length += exthdrlen;
1139                 transhdrlen += exthdrlen;
1140         } else {
                     /* Already corked: continue with the recorded parameters. */
1141                 rt = np->cork.rt;
1142                 fl = &inet->cork.fl;
1143                 if (inet->cork.flags & IPCORK_OPT)
1144                         opt = np->cork.opt;
1145                 transhdrlen = 0;
1146                 exthdrlen = 0;
1147                 mtu = inet->cork.fragsize;
1148         }
1149
1150         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1151
             /*
              * fragheaderlen: IPv6 header plus the headers that precede the
              * fragmentable part.  maxfraglen: largest skb length that leaves
              * room for a fragment header and keeps the data past
              * fragheaderlen a multiple of 8 bytes.
              */
1152         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1153                         (opt ? opt->opt_nflen : 0);
1154         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1155
             /* Refuse data that would push the total past the maximum payload. */
1156         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1157                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1158                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1159                         return -EMSGSIZE;
1160                 }
1161         }
1162
1163         /*
1164          * Let's try using as much space as possible.
1165          * Use MTU if total length of the message fits into the MTU.
1166          * Otherwise, we need to reserve fragment header and
1167          * fragment alignment (= 8-15 octets, in total).
1168          *
1169          * Note that we may need to "move" the data from the tail
1170          * of the buffer to the new fragment when we split
1171          * the message.
1172          *
1173          * FIXME: It may be fragmented into multiple chunks
1174          *        at once if non-fragmentable extension headers
1175          *        are too large.
1176          * --yoshfuji
1177          */
1178
1179         inet->cork.length += length;
             /* Oversized UDP data on a UFO-capable device takes the offload path. */
1180         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1181             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1182
1183                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1184                                           fragheaderlen, transhdrlen, mtu,
1185                                           flags);
1186                 if (err)
1187                         goto error;
1188                 return 0;
1189         }
1190
             /* Nothing queued yet: jump straight into the allocation code. */
1191         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1192                 goto alloc_new_skb;
1193
1194         while (length > 0) {
1195                 /* Check if the remaining data fits into current packet. */
1196                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                     /* Not everything fits: this skb may only grow up to maxfraglen. */
1197                 if (copy < length)
1198                         copy = maxfraglen - skb->len;
1199
1200                 if (copy <= 0) {
1201                         char *data;
1202                         unsigned int datalen;
1203                         unsigned int fraglen;
1204                         unsigned int fraggap;
1205                         unsigned int alloclen;
1206                         struct sk_buff *skb_prev;
1207 alloc_new_skb:
1208                         skb_prev = skb;
1209
1210                         /* There's no room in the current skb */
                             /*
                              * fraggap is the amount by which the previous skb
                              * exceeds maxfraglen; those bytes are moved to the
                              * new skb below.
                              */
1211                         if (skb_prev)
1212                                 fraggap = skb_prev->len - maxfraglen;
1213                         else
1214                                 fraggap = 0;
1215
1216                         /*
1217                          * If remaining data exceeds the mtu,
1218                          * we know we need more fragment(s).
1219                          */
1220                         datalen = length + fraggap;
1221                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1222                                 datalen = maxfraglen - fragheaderlen;
1223
1224                         fraglen = datalen + fragheaderlen;
                             /*
                              * More data is announced and the device cannot do
                              * scatter/gather: allocate a full MTU so later
                              * calls can append into the linear area.
                              */
1225                         if ((flags & MSG_MORE) &&
1226                             !(rt->u.dst.dev->features&NETIF_F_SG))
1227                                 alloclen = mtu;
1228                         else
1229                                 alloclen = datalen + fragheaderlen;
1230
1231                         /*
1232                          * The last fragment gets additional space at tail.
1233                          * Note: we overallocate on fragments with MSG_MORE
1234                          * because we have no idea if we're the last one.
1235                          */
1236                         if (datalen == length + fraggap)
1237                                 alloclen += rt->u.dst.trailer_len;
1238
1239                         /*
1240                          * We just reserve space for fragment header.
1241                          * Note: this may be overallocation if the message
1242                          * (without MSG_MORE) fits into the MTU.
1243                          */
1244                         alloclen += sizeof(struct frag_hdr);
1245
                             /*
                              * With a transport header pending, use
                              * sock_alloc_send_skb() (honouring MSG_DONTWAIT).
                              * Otherwise allocate with sock_wmalloc(), but only
                              * while sk_wmem_alloc is at most twice sk_sndbuf.
                              */
1246                         if (transhdrlen) {
1247                                 skb = sock_alloc_send_skb(sk,
1248                                                 alloclen + hh_len,
1249                                                 (flags & MSG_DONTWAIT), &err);
1250                         } else {
1251                                 skb = NULL;
1252                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1253                                     2 * sk->sk_sndbuf)
1254                                         skb = sock_wmalloc(sk,
1255                                                            alloclen + hh_len, 1,
1256                                                            sk->sk_allocation);
1257                                 if (unlikely(skb == NULL))
1258                                         err = -ENOBUFS;
1259                         }
1260                         if (skb == NULL)
1261                                 goto error;
1262                         /*
1263                          *      Fill in the control structures
1264                          */
1265                         skb->ip_summed = csummode;
1266                         skb->csum = 0;
1267                         /* reserve for fragmentation */
1268                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1269
1270                         /*
1271                          *      Find where to start putting bytes
1272                          */
1273                         data = skb_put(skb, fraglen);
1274                         skb_set_network_header(skb, exthdrlen);
1275                         data += fragheaderlen;
1276                         skb->transport_header = (skb->network_header +
1277                                                  fragheaderlen);
1278                         if (fraggap) {
                                     /*
                                      * Move the overhang of the previous skb
                                      * into this one, transfer its checksum
                                      * contribution, and trim the previous skb
                                      * back to maxfraglen.
                                      */
1279                                 skb->csum = skb_copy_and_csum_bits(
1280                                         skb_prev, maxfraglen,
1281                                         data + transhdrlen, fraggap, 0);
1282                                 skb_prev->csum = csum_sub(skb_prev->csum,
1283                                                           skb->csum);
1284                                 data += fraggap;
1285                                 pskb_trim_unique(skb_prev, maxfraglen);
1286                         }
                             /* Bytes still to be fetched from the caller for this skb. */
1287                         copy = datalen - transhdrlen - fraggap;
1288                         if (copy < 0) {
1289                                 err = -EINVAL;
1290                                 kfree_skb(skb);
1291                                 goto error;
1292                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1293                                 err = -EFAULT;
1294                                 kfree_skb(skb);
1295                                 goto error;
1296                         }
1297
1298                         offset += copy;
1299                         length -= datalen - fraggap;
                             /* Header room only applies to the first skb. */
1300                         transhdrlen = 0;
1301                         exthdrlen = 0;
1302                         csummode = CHECKSUM_NONE;
1303
1304                         /*
1305                          * Put the packet on the pending queue
1306                          */
1307                         __skb_queue_tail(&sk->sk_write_queue, skb);
1308                         continue;
1309                 }
1310
1311                 if (copy > length)
1312                         copy = length;
1313
1314                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                             /* No scatter/gather: extend the linear data area. */
1315                         unsigned int off;
1316
1317                         off = skb->len;
1318                         if (getfrag(from, skb_put(skb, copy),
1319                                                 offset, copy, off, skb) < 0) {
                                     /* Undo the skb_put() before bailing out. */
1320                                 __skb_trim(skb, off);
1321                                 err = -EFAULT;
1322                                 goto error;
1323                         }
1324                 } else {
                             /* Scatter/gather: place the data in page fragments. */
1325                         int i = skb_shinfo(skb)->nr_frags;
1326                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1327                         struct page *page = sk->sk_sndmsg_page;
1328                         int off = sk->sk_sndmsg_off;
1329                         unsigned int left;
1330
1331                         if (page && (left = PAGE_SIZE - off) > 0) {
                                     /*
                                      * The socket's current page still has
                                      * room; use it, adding a new fragment
                                      * slot if it is not this skb's last one.
                                      */
1332                                 if (copy >= left)
1333                                         copy = left;
1334                                 if (page != frag->page) {
1335                                         if (i == MAX_SKB_FRAGS) {
1336                                                 err = -EMSGSIZE;
1337                                                 goto error;
1338                                         }
1339                                         get_page(page);
1340                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1341                                         frag = &skb_shinfo(skb)->frags[i];
1342                                 }
1343                         } else if(i < MAX_SKB_FRAGS) {
                                     /* Start a fresh page in the next fragment slot. */
1344                                 if (copy > PAGE_SIZE)
1345                                         copy = PAGE_SIZE;
1346                                 page = alloc_pages(sk->sk_allocation, 0);
1347                                 if (page == NULL) {
1348                                         err = -ENOMEM;
1349                                         goto error;
1350                                 }
1351                                 sk->sk_sndmsg_page = page;
1352                                 sk->sk_sndmsg_off = 0;
1353
1354                                 skb_fill_page_desc(skb, i, page, 0, 0);
1355                                 frag = &skb_shinfo(skb)->frags[i];
1356                         } else {
                                     /* Every fragment slot of this skb is in use. */
1357                                 err = -EMSGSIZE;
1358                                 goto error;
1359                         }
1360                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1361                                 err = -EFAULT;
1362                                 goto error;
1363                         }
                             /* Account the new bytes in the fragment, skb and socket. */
1364                         sk->sk_sndmsg_off += copy;
1365                         frag->size += copy;
1366                         skb->len += copy;
1367                         skb->data_len += copy;
1368                         skb->truesize += copy;
1369                         atomic_add(copy, &sk->sk_wmem_alloc);
1370                 }
1371                 offset += copy;
1372                 length -= copy;
1373         }
1374         return 0;
1375 error:
             /* Take back the part of this request that was not queued. */
1376         inet->cork.length -= length;
1377         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1378         return err;
1379 }
1380
1381 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1382 {
             /*
              * Tear down the corking state: free the saved tx options, drop
              * the route reference held by the cork, clear the matching
              * cork flags and wipe the saved flow.
              */
             struct rt6_info *rt = np->cork.rt;

             /* Saved options and the flag announcing them. */
             kfree(np->cork.opt);
             np->cork.opt = NULL;
             inet->cork.flags &= ~IPCORK_OPT;

             /* Route, if one was recorded, and the flag derived from it. */
             if (rt != NULL) {
                     dst_release(&rt->u.dst);
                     np->cork.rt = NULL;
                     inet->cork.flags &= ~IPCORK_ALLFRAG;
             }

             memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1392 }
1393
1394 int ip6_push_pending_frames(struct sock *sk)
1395 {
             /*
              * Turn everything pending on sk->sk_write_queue into one packet
              * and send it: the first skb becomes the head, the remaining
              * ones are chained on its frag_list, the extension headers and
              * the IPv6 header are built from the cork state, and the result
              * is handed to ip6_local_out().
              *
              * The cork state is released on every path, including the
              * empty-queue case.  Returns 0 or a negative errno; a positive
              * code from ip6_local_out() is converted with net_xmit_errno()
              * when np->recverr is set and reported as 0 otherwise.
              */
1396         struct sk_buff *skb, *tmp_skb;
1397         struct sk_buff **tail_skb;
1398         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1399         struct inet_sock *inet = inet_sk(sk);
1400         struct ipv6_pinfo *np = inet6_sk(sk);
1401         struct ipv6hdr *hdr;
1402         struct ipv6_txoptions *opt = np->cork.opt;
1403         struct rt6_info *rt = np->cork.rt;
1404         struct flowi *fl = &inet->cork.fl;
1405         unsigned char proto = fl->proto;
1406         int err = 0;
1407
1408         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1409                 goto out;
1410         tail_skb = &(skb_shinfo(skb)->frag_list);
1411
1412         /* move skb->data to ip header from ext header */
1413         if (skb->data < skb_network_header(skb))
1414                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain the remaining skbs behind the head.  Each one has its
              * header area pulled off, its sizes added to the head's
              * accounting, and its socket ownership removed.
              */
1415         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1416                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1417                 *tail_skb = tmp_skb;
1418                 tail_skb = &(tmp_skb->next);
1419                 skb->len += tmp_skb->len;
1420                 skb->data_len += tmp_skb->len;
1421                 skb->truesize += tmp_skb->truesize;
1422                 __sock_put(tmp_skb->sk);
1423                 tmp_skb->destructor = NULL;
1424                 tmp_skb->sk = NULL;
1425         }
1426
1427         /* Allow local fragmentation. */
1428         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1429                 skb->local_df = 1;
1430
             /*
              * Start from the flow's destination; ipv6_push_nfrag_opts() is
              * given the pointer and may redirect it.
              */
1431         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1432         __skb_pull(skb, skb_network_header_len(skb));
             /* Push the extension headers; proto is updated as they are added. */
1433         if (opt && opt->opt_flen)
1434                 ipv6_push_frag_opts(skb, opt, &proto);
1435         if (opt && opt->opt_nflen)
1436                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1437
1438         skb_push(skb, sizeof(struct ipv6hdr));
1439         skb_reset_network_header(skb);
1440         hdr = ipv6_hdr(skb);
1441
             /* First 32-bit word: version 6, traffic class, flow label. */
1442         *(__be32*)hdr = fl->fl6_flowlabel |
1443                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1444
1445         hdr->hop_limit = np->cork.hop_limit;
1446         hdr->nexthdr = proto;
1447         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1448         ipv6_addr_copy(&hdr->daddr, final_dst);
1449
1450         skb->priority = sk->sk_priority;
1451         skb->mark = sk->sk_mark;
1452
             /* The packet gets its own reference on the corked route. */
1453         skb->dst = dst_clone(&rt->u.dst);
1454         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1455         if (proto == IPPROTO_ICMPV6) {
1456                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1457
1458                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1459                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1460         }
1461
1462         err = ip6_local_out(skb);
1463         if (err) {
                     /* Positive codes are reported only if the socket asked (recverr). */
1464                 if (err > 0)
1465                         err = np->recverr ? net_xmit_errno(err) : 0;
1466                 if (err)
1467                         goto error;
1468         }
1469
1470 out:
1471         ip6_cork_release(inet, np);
1472         return err;
1473 error:
             /* No extra cleanup on error: share the common exit. */
1474         goto out;
1475 }
1476
1477 void ip6_flush_pending_frames(struct sock *sk)
1478 {
             /*
              * Throw away everything still pending on the socket's write
              * queue, taking skbs from the tail, then release the cork
              * state.  An skb that has a dst attached is counted as an
              * output discard.
              */
1479         struct sk_buff *skb;
1480
             for (;;) {
                     skb = __skb_dequeue_tail(&sk->sk_write_queue);
                     if (skb == NULL)
                             break;

                     if (skb->dst) {
                             IP6_INC_STATS(ip6_dst_idev(skb->dst),
                                           IPSTATS_MIB_OUTDISCARDS);
                     }
                     kfree_skb(skb);
             }
1487
1488         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1489 }