net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40
  41 #include <linux/netfilter.h>
  42 #include <linux/netfilter_ipv6.h>
  43
  44 #include <net/sock.h>
  45 #include <net/snmp.h>
  46
  47 #include <net/ipv6.h>
  48 #include <net/ndisc.h>
  49 #include <net/protocol.h>
  50 #include <net/ip6_route.h>
  51 #include <net/addrconf.h>
  52 #include <net/rawv6.h>
  53 #include <net/icmp.h>
  54 #include <net/xfrm.h>
  55 #include <net/checksum.h>
  56 #include <linux/mroute6.h>
  57
  58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  59
  60 int __ip6_local_out(struct sk_buff *skb)
  61 {
  62         int len;
  63
  64         len = skb->len - sizeof(struct ipv6hdr);
  65         if (len > IPV6_MAXPLEN)
  66                 len = 0;
  67         ipv6_hdr(skb)->payload_len = htons(len);
  68
  69         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
  70                        dst_output);
  71 }
  72
  73 int ip6_local_out(struct sk_buff *skb)
  74 {
  75         int err;
  76
  77         err = __ip6_local_out(skb);
  78         if (likely(err == 1))
  79                 err = dst_output(skb);
  80
  81         return err;
  82 }
  83 EXPORT_SYMBOL_GPL(ip6_local_out);
  84
  85 static int ip6_output_finish(struct sk_buff *skb)
  86 {
  87         struct dst_entry *dst = skb_dst(skb);
  88
  89         if (dst->hh)
  90                 return neigh_hh_output(dst->hh, skb);
  91         else if (dst->neighbour)
  92                 return dst->neighbour->output(skb);
  93
  94         IP6_INC_STATS_BH(dev_net(dst->dev),
  95                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
  96         kfree_skb(skb);
  97         return -EINVAL;
  98
  99 }
 100
 101 /* dev_loopback_xmit for use with netfilter. */
 102 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 103 {
 104         skb_reset_mac_header(newskb);
 105         __skb_pull(newskb, skb_network_offset(newskb));
 106         newskb->pkt_type = PACKET_LOOPBACK;
 107         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 108         WARN_ON(!skb_dst(newskb));
 109
 110         netif_rx(newskb);
 111         return 0;
 112 }
 113
 114
 115 static int ip6_output2(struct sk_buff *skb)
 116 {
 117         struct dst_entry *dst = skb_dst(skb);
 118         struct net_device *dev = dst->dev;
 119
 120         skb->protocol = htons(ETH_P_IPV6);
 121         skb->dev = dev;
 122
 123         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 124                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 125                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 126
 127                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 128                     ((mroute6_socket(dev_net(dev)) &&
 129                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 130                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 131                                          &ipv6_hdr(skb)->saddr))) {
 132                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 133
 134                         /* Do not check for IFF_ALLMULTI; multicast routing
 135                            is not supported in any case.
 136                          */
 137                         if (newskb)
 138                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 139                                         NULL, newskb->dev,
 140                                         ip6_dev_loopback_xmit);
 141
 142                         if (ipv6_hdr(skb)->hop_limit == 0) {
 143                                 IP6_INC_STATS(dev_net(dev), idev,
 144                                               IPSTATS_MIB_OUTDISCARDS);
 145                                 kfree_skb(skb);
 146                                 return 0;
 147                         }
 148                 }
 149
 150                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 151                                 skb->len);
 152         }
 153
 154         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 155                        ip6_output_finish);
 156 }
 157
 158 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 159 {
 160         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 161
 162         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 163                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 164 }
 165
 166 int ip6_output(struct sk_buff *skb)
 167 {
 168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 169         if (unlikely(idev->cnf.disable_ipv6)) {
 170                 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
 171                               IPSTATS_MIB_OUTDISCARDS);
 172                 kfree_skb(skb);
 173                 return 0;
 174         }
 175
 176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 177                                 dst_allfrag(skb_dst(skb)))
 178                 return ip6_fragment(skb, ip6_output2);
 179         else
 180                 return ip6_output2(skb);
 181 }
 182
 183 /*
 184  *      xmit an sk_buff (used by TCP)
 185  */
 186
 187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 188              struct ipv6_txoptions *opt, int ipfragok)
 189 {
 190         struct net *net = sock_net(sk);
 191         struct ipv6_pinfo *np = inet6_sk(sk);
 192         struct in6_addr *first_hop = &fl->fl6_dst;
 193         struct dst_entry *dst = skb_dst(skb);
 194         struct ipv6hdr *hdr;
 195         u8  proto = fl->proto;
 196         int seg_len = skb->len;
 197         int hlimit = -1;
 198         int tclass = 0;
 199         u32 mtu;
 200
 201         if (opt) {
 202                 unsigned int head_room;
 203
 204                 /* First: exthdrs may take lots of space (~8K for now)
 205                    MAX_HEADER is not enough.
 206                  */
 207                 head_room = opt->opt_nflen + opt->opt_flen;
 208                 seg_len += head_room;
 209                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 210
 211                 if (skb_headroom(skb) < head_room) {
 212                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 213                         if (skb2 == NULL) {
 214                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 215                                               IPSTATS_MIB_OUTDISCARDS);
 216                                 kfree_skb(skb);
 217                                 return -ENOBUFS;
 218                         }
 219                         kfree_skb(skb);
 220                         skb = skb2;
 221                         if (sk)
 222                                 skb_set_owner_w(skb, sk);
 223                 }
 224                 if (opt->opt_flen)
 225                         ipv6_push_frag_opts(skb, opt, &proto);
 226                 if (opt->opt_nflen)
 227                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 228         }
 229
 230         skb_push(skb, sizeof(struct ipv6hdr));
 231         skb_reset_network_header(skb);
 232         hdr = ipv6_hdr(skb);
 233
 234         /* Allow local fragmentation. */
 235         if (ipfragok)
 236                 skb->local_df = 1;
 237
 238         /*
 239          *      Fill in the IPv6 header
 240          */
 241         if (np) {
 242                 tclass = np->tclass;
 243                 hlimit = np->hop_limit;
 244         }
 245         if (hlimit < 0)
 246                 hlimit = ip6_dst_hoplimit(dst);
 247
 248         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 249
 250         hdr->payload_len = htons(seg_len);
 251         hdr->nexthdr = proto;
 252         hdr->hop_limit = hlimit;
 253
 254         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 255         ipv6_addr_copy(&hdr->daddr, first_hop);
 256
 257         skb->priority = sk->sk_priority;
 258         skb->mark = sk->sk_mark;
 259
 260         mtu = dst_mtu(dst);
 261         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 262                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 263                               IPSTATS_MIB_OUT, skb->len);
 264                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 265                                 dst_output);
 266         }
 267
 268         if (net_ratelimit())
 269                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 270         skb->dev = dst->dev;
 271         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 272         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 273         kfree_skb(skb);
 274         return -EMSGSIZE;
 275 }
 276
 277 EXPORT_SYMBOL(ip6_xmit);
 278
 279 /*
 280  *      To avoid extra problems ND packets are send through this
 281  *      routine. It's code duplication but I really want to avoid
 282  *      extra checks since ipv6_build_header is used by TCP (which
 283  *      is for us performance critical)
 284  */
 285
 286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 287                const struct in6_addr *saddr, const struct in6_addr *daddr,
 288                int proto, int len)
 289 {
 290         struct ipv6_pinfo *np = inet6_sk(sk);
 291         struct ipv6hdr *hdr;
 292         int totlen;
 293
 294         skb->protocol = htons(ETH_P_IPV6);
 295         skb->dev = dev;
 296
 297         totlen = len + sizeof(struct ipv6hdr);
 298
 299         skb_reset_network_header(skb);
 300         skb_put(skb, sizeof(struct ipv6hdr));
 301         hdr = ipv6_hdr(skb);
 302
 303         *(__be32*)hdr = htonl(0x60000000);
 304
 305         hdr->payload_len = htons(len);
 306         hdr->nexthdr = proto;
 307         hdr->hop_limit = np->hop_limit;
 308
 309         ipv6_addr_copy(&hdr->saddr, saddr);
 310         ipv6_addr_copy(&hdr->daddr, daddr);
 311
 312         return 0;
 313 }
 314
 315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 316 {
 317         struct ip6_ra_chain *ra;
 318         struct sock *last = NULL;
 319
 320         read_lock(&ip6_ra_lock);
 321         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 322                 struct sock *sk = ra->sk;
 323                 if (sk && ra->sel == sel &&
 324                     (!sk->sk_bound_dev_if ||
 325                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 326                         if (last) {
 327                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 328                                 if (skb2)
 329                                         rawv6_rcv(last, skb2);
 330                         }
 331                         last = sk;
 332                 }
 333         }
 334
 335         if (last) {
 336                 rawv6_rcv(last, skb);
 337                 read_unlock(&ip6_ra_lock);
 338                 return 1;
 339         }
 340         read_unlock(&ip6_ra_lock);
 341         return 0;
 342 }
 343
 344 static int ip6_forward_proxy_check(struct sk_buff *skb)
 345 {
 346         struct ipv6hdr *hdr = ipv6_hdr(skb);
 347         u8 nexthdr = hdr->nexthdr;
 348         int offset;
 349
 350         if (ipv6_ext_hdr(nexthdr)) {
 351                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 352                 if (offset < 0)
 353                         return 0;
 354         } else
 355                 offset = sizeof(struct ipv6hdr);
 356
 357         if (nexthdr == IPPROTO_ICMPV6) {
 358                 struct icmp6hdr *icmp6;
 359
 360                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 361                                          offset + 1 - skb->data)))
 362                         return 0;
 363
 364                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 365
 366                 switch (icmp6->icmp6_type) {
 367                 case NDISC_ROUTER_SOLICITATION:
 368                 case NDISC_ROUTER_ADVERTISEMENT:
 369                 case NDISC_NEIGHBOUR_SOLICITATION:
 370                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 371                 case NDISC_REDIRECT:
 372                         /* For reaction involving unicast neighbor discovery
 373                          * message destined to the proxied address, pass it to
 374                          * input function.
 375                          */
 376                         return 1;
 377                 default:
 378                         break;
 379                 }
 380         }
 381
 382         /*
 383          * The proxying router can't forward traffic sent to a link-local
 384          * address, so signal the sender and discard the packet. This
 385          * behavior is clarified by the MIPv6 specification.
 386          */
 387         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 388                 dst_link_failure(skb);
 389                 return -1;
 390         }
 391
 392         return 0;
 393 }
 394
 395 static inline int ip6_forward_finish(struct sk_buff *skb)
 396 {
 397         return dst_output(skb);
 398 }
 399
 400 int ip6_forward(struct sk_buff *skb)
 401 {
 402         struct dst_entry *dst = skb_dst(skb);
 403         struct ipv6hdr *hdr = ipv6_hdr(skb);
 404         struct inet6_skb_parm *opt = IP6CB(skb);
 405         struct net *net = dev_net(dst->dev);
 406
 407         if (net->ipv6.devconf_all->forwarding == 0)
 408                 goto error;
 409
 410         if (skb_warn_if_lro(skb))
 411                 goto drop;
 412
 413         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 414                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 415                 goto drop;
 416         }
 417
 418         skb_forward_csum(skb);
 419
 420         /*
 421          *      We DO NOT make any processing on
 422          *      RA packets, pushing them to user level AS IS
 423          *      without ane WARRANTY that application will be able
 424          *      to interpret them. The reason is that we
 425          *      cannot make anything clever here.
 426          *
 427          *      We are not end-node, so that if packet contains
 428          *      AH/ESP, we cannot make anything.
 429          *      Defragmentation also would be mistake, RA packets
 430          *      cannot be fragmented, because there is no warranty
 431          *      that different fragments will go along one path. --ANK
 432          */
 433         if (opt->ra) {
 434                 u8 *ptr = skb_network_header(skb) + opt->ra;
 435                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 436                         return 0;
 437         }
 438
 439         /*
 440          *      check and decrement ttl
 441          */
 442         if (hdr->hop_limit <= 1) {
 443                 /* Force OUTPUT device used as source address */
 444                 skb->dev = dst->dev;
 445                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 446                             0, skb->dev);
 447                 IP6_INC_STATS_BH(net,
 448                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 449
 450                 kfree_skb(skb);
 451                 return -ETIMEDOUT;
 452         }
 453
 454         /* XXX: idev->cnf.proxy_ndp? */
 455         if (net->ipv6.devconf_all->proxy_ndp &&
 456             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 457                 int proxied = ip6_forward_proxy_check(skb);
 458                 if (proxied > 0)
 459                         return ip6_input(skb);
 460                 else if (proxied < 0) {
 461                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 462                                       IPSTATS_MIB_INDISCARDS);
 463                         goto drop;
 464                 }
 465         }
 466
 467         if (!xfrm6_route_forward(skb)) {
 468                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 469                 goto drop;
 470         }
 471         dst = skb_dst(skb);
 472
 473         /* IPv6 specs say nothing about it, but it is clear that we cannot
 474            send redirects to source routed frames.
 475            We don't send redirects to frames decapsulated from IPsec.
 476          */
 477         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 478             !skb_sec_path(skb)) {
 479                 struct in6_addr *target = NULL;
 480                 struct rt6_info *rt;
 481                 struct neighbour *n = dst->neighbour;
 482
 483                 /*
 484                  *      incoming and outgoing devices are the same
 485                  *      send a redirect.
 486                  */
 487
 488                 rt = (struct rt6_info *) dst;
 489                 if ((rt->rt6i_flags & RTF_GATEWAY))
 490                         target = (struct in6_addr*)&n->primary_key;
 491                 else
 492                         target = &hdr->daddr;
 493
 494                 /* Limit redirects both by destination (here)
 495                    and by source (inside ndisc_send_redirect)
 496                  */
 497                 if (xrlim_allow(dst, 1*HZ))
 498                         ndisc_send_redirect(skb, n, target);
 499         } else {
 500                 int addrtype = ipv6_addr_type(&hdr->saddr);
 501
 502                 /* This check is security critical. */
 503                 if (addrtype == IPV6_ADDR_ANY ||
 504                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 505                         goto error;
 506                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 507                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 508                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 509                         goto error;
 510                 }
 511         }
 512
 513         if (skb->len > dst_mtu(dst) && !skb_is_gso(skb)) {
 514                 /* Again, force OUTPUT device used as source address */
 515                 skb->dev = dst->dev;
 516                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 517                 IP6_INC_STATS_BH(net,
 518                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 519                 IP6_INC_STATS_BH(net,
 520                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 521                 kfree_skb(skb);
 522                 return -EMSGSIZE;
 523         }
 524
 525         if (skb_cow(skb, dst->dev->hard_header_len)) {
 526                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 527                 goto drop;
 528         }
 529
 530         hdr = ipv6_hdr(skb);
 531
 532         /* Mangling hops number delayed to point after skb COW */
 533
 534         hdr->hop_limit--;
 535
 536         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 537         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 538                        ip6_forward_finish);
 539
 540 error:
 541         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 542 drop:
 543         kfree_skb(skb);
 544         return -EINVAL;
 545 }
 546
 547 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 548 {
 549         to->pkt_type = from->pkt_type;
 550         to->priority = from->priority;
 551         to->protocol = from->protocol;
 552         skb_dst_drop(to);
 553         skb_dst_set(to, dst_clone(skb_dst(from)));
 554         to->dev = from->dev;
 555         to->mark = from->mark;
 556
 557 #ifdef CONFIG_NET_SCHED
 558         to->tc_index = from->tc_index;
 559 #endif
 560         nf_copy(to, from);
 561 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 562     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 563         to->nf_trace = from->nf_trace;
 564 #endif
 565         skb_copy_secmark(to, from);
 566 }
 567
 568 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 569 {
 570         u16 offset = sizeof(struct ipv6hdr);
 571         struct ipv6_opt_hdr *exthdr =
 572                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 573         unsigned int packet_len = skb->tail - skb->network_header;
 574         int found_rhdr = 0;
 575         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 576
 577         while (offset + 1 <= packet_len) {
 578
 579                 switch (**nexthdr) {
 580
 581                 case NEXTHDR_HOP:
 582                         break;
 583                 case NEXTHDR_ROUTING:
 584                         found_rhdr = 1;
 585                         break;
 586                 case NEXTHDR_DEST:
 587 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 588                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 589                                 break;
 590 #endif
 591                         if (found_rhdr)
 592                                 return offset;
 593                         break;
 594                 default :
 595                         return offset;
 596                 }
 597
 598                 offset += ipv6_optlen(exthdr);
 599                 *nexthdr = &exthdr->nexthdr;
 600                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 601                                                  offset);
 602         }
 603
 604         return offset;
 605 }
 606
 607 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 608 {
 609         struct sk_buff *frag;
 610         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 611         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 612         struct ipv6hdr *tmp_hdr;
 613         struct frag_hdr *fh;
 614         unsigned int mtu, hlen, left, len;
 615         __be32 frag_id = 0;
 616         int ptr, offset = 0, err=0;
 617         u8 *prevhdr, nexthdr = 0;
 618         struct net *net = dev_net(skb_dst(skb)->dev);
 619
 620         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 621         nexthdr = *prevhdr;
 622
 623         mtu = ip6_skb_dst_mtu(skb);
 624
 625         /* We must not fragment if the socket is set to force MTU discovery
 626          * or if the skb it not generated by a local socket.  (This last
 627          * check should be redundant, but it's free.)
 628          */
 629         if (!skb->local_df) {
 630                 skb->dev = skb_dst(skb)->dev;
 631                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 632                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 633                               IPSTATS_MIB_FRAGFAILS);
 634                 kfree_skb(skb);
 635                 return -EMSGSIZE;
 636         }
 637
 638         if (np && np->frag_size < mtu) {
 639                 if (np->frag_size)
 640                         mtu = np->frag_size;
 641         }
 642         mtu -= hlen + sizeof(struct frag_hdr);
 643
 644         if (skb_has_frags(skb)) {
 645                 int first_len = skb_pagelen(skb);
 646                 struct sk_buff *frag2;
 647
 648                 if (first_len - hlen > mtu ||
 649                     ((first_len - hlen) & 7) ||
 650                     skb_cloned(skb))
 651                         goto slow_path;
 652
 653                 skb_walk_frags(skb, frag) {
 654                         /* Correct geometry. */
 655                         if (frag->len > mtu ||
 656                             ((frag->len & 7) && frag->next) ||
 657                             skb_headroom(frag) < hlen)
 658                                 goto slow_path_clean;
 659
 660                         /* Partially cloned skb? */
 661                         if (skb_shared(frag))
 662                                 goto slow_path_clean;
 663
 664                         BUG_ON(frag->sk);
 665                         if (skb->sk) {
 666                                 frag->sk = skb->sk;
 667                                 frag->destructor = sock_wfree;
 668                         }
 669                         skb->truesize -= frag->truesize;
 670                 }
 671
 672                 err = 0;
 673                 offset = 0;
 674                 frag = skb_shinfo(skb)->frag_list;
 675                 skb_frag_list_init(skb);
 676                 /* BUILD HEADER */
 677
 678                 *prevhdr = NEXTHDR_FRAGMENT;
 679                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 680                 if (!tmp_hdr) {
 681                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 682                                       IPSTATS_MIB_FRAGFAILS);
 683                         return -ENOMEM;
 684                 }
 685
 686                 __skb_pull(skb, hlen);
 687                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 688                 __skb_push(skb, hlen);
 689                 skb_reset_network_header(skb);
 690                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 691
 692                 ipv6_select_ident(fh);
 693                 fh->nexthdr = nexthdr;
 694                 fh->reserved = 0;
 695                 fh->frag_off = htons(IP6_MF);
 696                 frag_id = fh->identification;
 697
 698                 first_len = skb_pagelen(skb);
 699                 skb->data_len = first_len - skb_headlen(skb);
 700                 skb->len = first_len;
 701                 ipv6_hdr(skb)->payload_len = htons(first_len -
 702                                                    sizeof(struct ipv6hdr));
 703
 704                 dst_hold(&rt->u.dst);
 705
 706                 for (;;) {
 707                         /* Prepare header of the next frame,
 708                          * before previous one went down. */
 709                         if (frag) {
 710                                 frag->ip_summed = CHECKSUM_NONE;
 711                                 skb_reset_transport_header(frag);
 712                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 713                                 __skb_push(frag, hlen);
 714                                 skb_reset_network_header(frag);
 715                                 memcpy(skb_network_header(frag), tmp_hdr,
 716                                        hlen);
 717                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 718                                 fh->nexthdr = nexthdr;
 719                                 fh->reserved = 0;
 720                                 fh->frag_off = htons(offset);
 721                                 if (frag->next != NULL)
 722                                         fh->frag_off |= htons(IP6_MF);
 723                                 fh->identification = frag_id;
 724                                 ipv6_hdr(frag)->payload_len =
 725                                                 htons(frag->len -
 726                                                       sizeof(struct ipv6hdr));
 727                                 ip6_copy_metadata(frag, skb);
 728                         }
 729
 730                         err = output(skb);
 731                         if(!err)
 732                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 733                                               IPSTATS_MIB_FRAGCREATES);
 734
 735                         if (err || !frag)
 736                                 break;
 737
 738                         skb = frag;
 739                         frag = skb->next;
 740                         skb->next = NULL;
 741                 }
 742
 743                 kfree(tmp_hdr);
 744
 745                 if (err == 0) {
 746                         IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 747                                       IPSTATS_MIB_FRAGOKS);
 748                         dst_release(&rt->u.dst);
 749                         return 0;
 750                 }
 751
 752                 while (frag) {
 753                         skb = frag->next;
 754                         kfree_skb(frag);
 755                         frag = skb;
 756                 }
 757
 758                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 759                               IPSTATS_MIB_FRAGFAILS);
 760                 dst_release(&rt->u.dst);
 761                 return err;
 762
 763 slow_path_clean:
 764                 skb_walk_frags(skb, frag2) {
 765                         if (frag2 == frag)
 766                                 break;
 767                         frag2->sk = NULL;
 768                         frag2->destructor = NULL;
 769                         skb->truesize += frag2->truesize;
 770                 }
 771         }
 772
 773 slow_path:
 774         left = skb->len - hlen;         /* Space per frame */
 775         ptr = hlen;                     /* Where to start from */
 776
 777         /*
 778          *      Fragment the datagram.
 779          */
 780
 781         *prevhdr = NEXTHDR_FRAGMENT;
 782
 783         /*
 784          *      Keep copying data until we run out.
 785          */
 786         while(left > 0) {
 787                 len = left;
 788                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 789                 if (len > mtu)
 790                         len = mtu;
 791                 /* IF: we are not sending upto and including the packet end
 792                    then align the next start on an eight byte boundary */
 793                 if (len < left) {
 794                         len &= ~7;
 795                 }
 796                 /*
 797                  *      Allocate buffer.
 798                  */
 799
 800                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 801                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 802                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 803                                       IPSTATS_MIB_FRAGFAILS);
 804                         err = -ENOMEM;
 805                         goto fail;
 806                 }
 807
 808                 /*
 809                  *      Set up data on packet
 810                  */
 811
 812                 ip6_copy_metadata(frag, skb);
 813                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 814                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 815                 skb_reset_network_header(frag);
 816                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 817                 frag->transport_header = (frag->network_header + hlen +
 818                                           sizeof(struct frag_hdr));
 819
 820                 /*
 821                  *      Charge the memory for the fragment to any owner
 822                  *      it might possess
 823                  */
 824                 if (skb->sk)
 825                         skb_set_owner_w(frag, skb->sk);
 826
 827                 /*
 828                  *      Copy the packet header into the new buffer.
 829                  */
 830                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 831
 832                 /*
 833                  *      Build fragment header.
 834                  */
 835                 fh->nexthdr = nexthdr;
 836                 fh->reserved = 0;
 837                 if (!frag_id) {
 838                         ipv6_select_ident(fh);
 839                         frag_id = fh->identification;
 840                 } else
 841                         fh->identification = frag_id;
 842
 843                 /*
 844                  *      Copy a block of the IP datagram.
 845                  */
 846                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 847                         BUG();
 848                 left -= len;
 849
 850                 fh->frag_off = htons(offset);
 851                 if (left > 0)
 852                         fh->frag_off |= htons(IP6_MF);
 853                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 854                                                     sizeof(struct ipv6hdr));
 855
 856                 ptr += len;
 857                 offset += len;
 858
 859                 /*
 860                  *      Put this fragment into the sending queue.
 861                  */
 862                 err = output(frag);
 863                 if (err)
 864                         goto fail;
 865
 866                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 867                               IPSTATS_MIB_FRAGCREATES);
 868         }
 869         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 870                       IPSTATS_MIB_FRAGOKS);
 871         kfree_skb(skb);
 872         return err;
 873
 874 fail:
 875         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 876                       IPSTATS_MIB_FRAGFAILS);
 877         kfree_skb(skb);
 878         return err;
 879 }
 880
 881 static inline int ip6_rt_check(struct rt6key *rt_key,
 882                                struct in6_addr *fl_addr,
 883                                struct in6_addr *addr_cache)
 884 {
 885         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 886                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 887 }
 888
 889 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 890                                           struct dst_entry *dst,
 891                                           struct flowi *fl)
 892 {
 893         struct ipv6_pinfo *np = inet6_sk(sk);
 894         struct rt6_info *rt = (struct rt6_info *)dst;
 895
 896         if (!dst)
 897                 goto out;
 898
 899         /* Yes, checking route validity in not connected
 900          * case is not very simple. Take into account,
 901          * that we do not support routing by source, TOS,
 902          * and MSG_DONTROUTE            --ANK (980726)
 903          *
 904          * 1. ip6_rt_check(): If route was host route,
 905          *    check that cached destination is current.
 906          *    If it is network route, we still may
 907          *    check its validity using saved pointer
 908          *    to the last used address: daddr_cache.
 909          *    We do not want to save whole address now,
 910          *    (because main consumer of this service
 911          *    is tcp, which has not this problem),
 912          *    so that the last trick works only on connected
 913          *    sockets.
 914          * 2. oif also should be the same.
 915          */
 916         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 917 #ifdef CONFIG_IPV6_SUBTREES
 918             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 919 #endif
 920             (fl->oif && fl->oif != dst->dev->ifindex)) {
 921                 dst_release(dst);
 922                 dst = NULL;
 923         }
 924
 925 out:
 926         return dst;
 927 }
 928
 929 static int ip6_dst_lookup_tail(struct sock *sk,
 930                                struct dst_entry **dst, struct flowi *fl)
 931 {
 932         int err;
 933         struct net *net = sock_net(sk);
 934
 935         if (*dst == NULL)
 936                 *dst = ip6_route_output(net, sk, fl);
 937
 938         if ((err = (*dst)->error))
 939                 goto out_err_release;
 940
 941         if (ipv6_addr_any(&fl->fl6_src)) {
 942                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
 943                                          &fl->fl6_dst,
 944                                          sk ? inet6_sk(sk)->srcprefs : 0,
 945                                          &fl->fl6_src);
 946                 if (err)
 947                         goto out_err_release;
 948         }
 949
 950 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 951         /*
 952          * Here if the dst entry we've looked up
 953          * has a neighbour entry that is in the INCOMPLETE
 954          * state and the src address from the flow is
 955          * marked as OPTIMISTIC, we release the found
 956          * dst entry and replace it instead with the
 957          * dst entry of the nexthop router
 958          */
 959         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
 960                 struct inet6_ifaddr *ifp;
 961                 struct flowi fl_gw;
 962                 int redirect;
 963
 964                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 965                                       (*dst)->dev, 1);
 966
 967                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 968                 if (ifp)
 969                         in6_ifa_put(ifp);
 970
 971                 if (redirect) {
 972                         /*
 973                          * We need to get the dst entry for the
 974                          * default router instead
 975                          */
 976                         dst_release(*dst);
 977                         memcpy(&fl_gw, fl, sizeof(struct flowi));
 978                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 979                         *dst = ip6_route_output(net, sk, &fl_gw);
 980                         if ((err = (*dst)->error))
 981                                 goto out_err_release;
 982                 }
 983         }
 984 #endif
 985
 986         return 0;
 987
 988 out_err_release:
 989         if (err == -ENETUNREACH)
 990                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 991         dst_release(*dst);
 992         *dst = NULL;
 993         return err;
 994 }
 995
 996 /**
 997  *      ip6_dst_lookup - perform route lookup on flow
 998  *      @sk: socket which provides route info
 999  *      @dst: pointer to dst_entry * for result
1000  *      @fl: flow to lookup
1001  *
1002  *      This function performs a route lookup on the given flow.
1003  *
1004  *      It returns zero on success, or a standard errno code on error.
1005  */
1006 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1007 {
1008         *dst = NULL;
1009         return ip6_dst_lookup_tail(sk, dst, fl);
1010 }
1011 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1012
1013 /**
1014  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1015  *      @sk: socket which provides the dst cache and route info
1016  *      @dst: pointer to dst_entry * for result
1017  *      @fl: flow to lookup
1018  *
1019  *      This function performs a route lookup on the given flow with the
1020  *      possibility of using the cached route in the socket if it is valid.
1021  *      It will take the socket dst lock when operating on the dst cache.
1022  *      As a result, this function can only be used in process context.
1023  *
1024  *      It returns zero on success, or a standard errno code on error.
1025  */
1026 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1027 {
1028         *dst = NULL;
1029         if (sk) {
1030                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1031                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1032         }
1033
1034         return ip6_dst_lookup_tail(sk, dst, fl);
1035 }
1036 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1037
1038 static inline int ip6_ufo_append_data(struct sock *sk,
1039                         int getfrag(void *from, char *to, int offset, int len,
1040                         int odd, struct sk_buff *skb),
1041                         void *from, int length, int hh_len, int fragheaderlen,
1042                         int transhdrlen, int mtu,unsigned int flags)
1043
1044 {
1045         struct sk_buff *skb;
1046         int err;
1047
1048         /* There is support for UDP large send offload by network
1049          * device, so create one single skb packet containing complete
1050          * udp datagram
1051          */
1052         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1053                 skb = sock_alloc_send_skb(sk,
1054                         hh_len + fragheaderlen + transhdrlen + 20,
1055                         (flags & MSG_DONTWAIT), &err);
1056                 if (skb == NULL)
1057                         return -ENOMEM;
1058
1059                 /* reserve space for Hardware header */
1060                 skb_reserve(skb, hh_len);
1061
1062                 /* create space for UDP/IP header */
1063                 skb_put(skb,fragheaderlen + transhdrlen);
1064
1065                 /* initialize network header pointer */
1066                 skb_reset_network_header(skb);
1067
1068                 /* initialize protocol header pointer */
1069                 skb->transport_header = skb->network_header + fragheaderlen;
1070
1071                 skb->ip_summed = CHECKSUM_PARTIAL;
1072                 skb->csum = 0;
1073                 sk->sk_sndmsg_off = 0;
1074         }
1075
1076         err = skb_append_datato_frags(sk,skb, getfrag, from,
1077                                       (length - transhdrlen));
1078         if (!err) {
1079                 struct frag_hdr fhdr;
1080
1081                 /* Specify the length of each IPv6 datagram fragment.
1082                  * It has to be a multiple of 8.
1083                  */
1084                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1085                                              sizeof(struct frag_hdr)) & ~7;
1086                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1087                 ipv6_select_ident(&fhdr);
1088                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1089                 __skb_queue_tail(&sk->sk_write_queue, skb);
1090
1091                 return 0;
1092         }
1093         /* There is not enough support do UPD LSO,
1094          * so follow normal path
1095          */
1096         kfree_skb(skb);
1097
1098         return err;
1099 }
1100
1101 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1102                                                gfp_t gfp)
1103 {
1104         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1105 }
1106
1107 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1108                                                 gfp_t gfp)
1109 {
1110         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1111 }
1112
1113 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1114         int offset, int len, int odd, struct sk_buff *skb),
1115         void *from, int length, int transhdrlen,
1116         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1117         struct rt6_info *rt, unsigned int flags)
1118 {
1119         struct inet_sock *inet = inet_sk(sk);
1120         struct ipv6_pinfo *np = inet6_sk(sk);
1121         struct sk_buff *skb;
1122         unsigned int maxfraglen, fragheaderlen;
1123         int exthdrlen;
1124         int hh_len;
1125         int mtu;
1126         int copy;
1127         int err;
1128         int offset = 0;
1129         int csummode = CHECKSUM_NONE;
1130
1131         if (flags&MSG_PROBE)
1132                 return 0;
1133         if (skb_queue_empty(&sk->sk_write_queue)) {
1134                 /*
1135                  * setup for corking
1136                  */
1137                 if (opt) {
1138                         if (WARN_ON(np->cork.opt))
1139                                 return -EINVAL;
1140
1141                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1142                         if (unlikely(np->cork.opt == NULL))
1143                                 return -ENOBUFS;
1144
1145                         np->cork.opt->tot_len = opt->tot_len;
1146                         np->cork.opt->opt_flen = opt->opt_flen;
1147                         np->cork.opt->opt_nflen = opt->opt_nflen;
1148
1149                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1150                                                             sk->sk_allocation);
1151                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1152                                 return -ENOBUFS;
1153
1154                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1155                                                             sk->sk_allocation);
1156                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1157                                 return -ENOBUFS;
1158
1159                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1160                                                            sk->sk_allocation);
1161                         if (opt->hopopt && !np->cork.opt->hopopt)
1162                                 return -ENOBUFS;
1163
1164                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1165                                                             sk->sk_allocation);
1166                         if (opt->srcrt && !np->cork.opt->srcrt)
1167                                 return -ENOBUFS;
1168
1169                         /* need source address above miyazawa*/
1170                 }
1171                 dst_hold(&rt->u.dst);
1172                 inet->cork.dst = &rt->u.dst;
1173                 inet->cork.fl = *fl;
1174                 np->cork.hop_limit = hlimit;
1175                 np->cork.tclass = tclass;
1176                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1177                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1178                 if (np->frag_size < mtu) {
1179                         if (np->frag_size)
1180                                 mtu = np->frag_size;
1181                 }
1182                 inet->cork.fragsize = mtu;
1183                 if (dst_allfrag(rt->u.dst.path))
1184                         inet->cork.flags |= IPCORK_ALLFRAG;
1185                 inet->cork.length = 0;
1186                 sk->sk_sndmsg_page = NULL;
1187                 sk->sk_sndmsg_off = 0;
1188                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1189                             rt->rt6i_nfheader_len;
1190                 length += exthdrlen;
1191                 transhdrlen += exthdrlen;
1192         } else {
1193                 rt = (struct rt6_info *)inet->cork.dst;
1194                 fl = &inet->cork.fl;
1195                 opt = np->cork.opt;
1196                 transhdrlen = 0;
1197                 exthdrlen = 0;
1198                 mtu = inet->cork.fragsize;
1199         }
1200
1201         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1202
1203         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1204                         (opt ? opt->opt_nflen : 0);
1205         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1206
1207         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1208                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1209                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1210                         return -EMSGSIZE;
1211                 }
1212         }
1213
1214         /*
1215          * Let's try using as much space as possible.
1216          * Use MTU if total length of the message fits into the MTU.
1217          * Otherwise, we need to reserve fragment header and
1218          * fragment alignment (= 8-15 octects, in total).
1219          *
1220          * Note that we may need to "move" the data from the tail of
1221          * of the buffer to the new fragment when we split
1222          * the message.
1223          *
1224          * FIXME: It may be fragmented into multiple chunks
1225          *        at once if non-fragmentable extension headers
1226          *        are too large.
1227          * --yoshfuji
1228          */
1229
1230         inet->cork.length += length;
1231         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1232             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1233
1234                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1235                                           fragheaderlen, transhdrlen, mtu,
1236                                           flags);
1237                 if (err)
1238                         goto error;
1239                 return 0;
1240         }
1241
1242         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1243                 goto alloc_new_skb;
1244
1245         while (length > 0) {
1246                 /* Check if the remaining data fits into current packet. */
1247                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1248                 if (copy < length)
1249                         copy = maxfraglen - skb->len;
1250
1251                 if (copy <= 0) {
1252                         char *data;
1253                         unsigned int datalen;
1254                         unsigned int fraglen;
1255                         unsigned int fraggap;
1256                         unsigned int alloclen;
1257                         struct sk_buff *skb_prev;
1258 alloc_new_skb:
1259                         skb_prev = skb;
1260
1261                         /* There's no room in the current skb */
1262                         if (skb_prev)
1263                                 fraggap = skb_prev->len - maxfraglen;
1264                         else
1265                                 fraggap = 0;
1266
1267                         /*
1268                          * If remaining data exceeds the mtu,
1269                          * we know we need more fragment(s).
1270                          */
1271                         datalen = length + fraggap;
1272                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1273                                 datalen = maxfraglen - fragheaderlen;
1274
1275                         fraglen = datalen + fragheaderlen;
1276                         if ((flags & MSG_MORE) &&
1277                             !(rt->u.dst.dev->features&NETIF_F_SG))
1278                                 alloclen = mtu;
1279                         else
1280                                 alloclen = datalen + fragheaderlen;
1281
1282                         /*
1283                          * The last fragment gets additional space at tail.
1284                          * Note: we overallocate on fragments with MSG_MODE
1285                          * because we have no idea if we're the last one.
1286                          */
1287                         if (datalen == length + fraggap)
1288                                 alloclen += rt->u.dst.trailer_len;
1289
1290                         /*
1291                          * We just reserve space for fragment header.
1292                          * Note: this may be overallocation if the message
1293                          * (without MSG_MORE) fits into the MTU.
1294                          */
1295                         alloclen += sizeof(struct frag_hdr);
1296
1297                         if (transhdrlen) {
1298                                 skb = sock_alloc_send_skb(sk,
1299                                                 alloclen + hh_len,
1300                                                 (flags & MSG_DONTWAIT), &err);
1301                         } else {
1302                                 skb = NULL;
1303                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1304                                     2 * sk->sk_sndbuf)
1305                                         skb = sock_wmalloc(sk,
1306                                                            alloclen + hh_len, 1,
1307                                                            sk->sk_allocation);
1308                                 if (unlikely(skb == NULL))
1309                                         err = -ENOBUFS;
1310                         }
1311                         if (skb == NULL)
1312                                 goto error;
1313                         /*
1314                          *      Fill in the control structures
1315                          */
1316                         skb->ip_summed = csummode;
1317                         skb->csum = 0;
1318                         /* reserve for fragmentation */
1319                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1320
1321                         /*
1322                          *      Find where to start putting bytes
1323                          */
1324                         data = skb_put(skb, fraglen);
1325                         skb_set_network_header(skb, exthdrlen);
1326                         data += fragheaderlen;
1327                         skb->transport_header = (skb->network_header +
1328                                                  fragheaderlen);
1329                         if (fraggap) {
1330                                 skb->csum = skb_copy_and_csum_bits(
1331                                         skb_prev, maxfraglen,
1332                                         data + transhdrlen, fraggap, 0);
1333                                 skb_prev->csum = csum_sub(skb_prev->csum,
1334                                                           skb->csum);
1335                                 data += fraggap;
1336                                 pskb_trim_unique(skb_prev, maxfraglen);
1337                         }
1338                         copy = datalen - transhdrlen - fraggap;
1339                         if (copy < 0) {
1340                                 err = -EINVAL;
1341                                 kfree_skb(skb);
1342                                 goto error;
1343                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1344                                 err = -EFAULT;
1345                                 kfree_skb(skb);
1346                                 goto error;
1347                         }
1348
1349                         offset += copy;
1350                         length -= datalen - fraggap;
1351                         transhdrlen = 0;
1352                         exthdrlen = 0;
1353                         csummode = CHECKSUM_NONE;
1354
1355                         /*
1356                          * Put the packet on the pending queue
1357                          */
1358                         __skb_queue_tail(&sk->sk_write_queue, skb);
1359                         continue;
1360                 }
1361
1362                 if (copy > length)
1363                         copy = length;
1364
1365                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1366                         unsigned int off;
1367
1368                         off = skb->len;
1369                         if (getfrag(from, skb_put(skb, copy),
1370                                                 offset, copy, off, skb) < 0) {
1371                                 __skb_trim(skb, off);
1372                                 err = -EFAULT;
1373                                 goto error;
1374                         }
1375                 } else {
1376                         int i = skb_shinfo(skb)->nr_frags;
1377                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1378                         struct page *page = sk->sk_sndmsg_page;
1379                         int off = sk->sk_sndmsg_off;
1380                         unsigned int left;
1381
1382                         if (page && (left = PAGE_SIZE - off) > 0) {
1383                                 if (copy >= left)
1384                                         copy = left;
1385                                 if (page != frag->page) {
1386                                         if (i == MAX_SKB_FRAGS) {
1387                                                 err = -EMSGSIZE;
1388                                                 goto error;
1389                                         }
1390                                         get_page(page);
1391                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1392                                         frag = &skb_shinfo(skb)->frags[i];
1393                                 }
1394                         } else if(i < MAX_SKB_FRAGS) {
1395                                 if (copy > PAGE_SIZE)
1396                                         copy = PAGE_SIZE;
1397                                 page = alloc_pages(sk->sk_allocation, 0);
1398                                 if (page == NULL) {
1399                                         err = -ENOMEM;
1400                                         goto error;
1401                                 }
1402                                 sk->sk_sndmsg_page = page;
1403                                 sk->sk_sndmsg_off = 0;
1404
1405                                 skb_fill_page_desc(skb, i, page, 0, 0);
1406                                 frag = &skb_shinfo(skb)->frags[i];
1407                         } else {
1408                                 err = -EMSGSIZE;
1409                                 goto error;
1410                         }
1411                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1412                                 err = -EFAULT;
1413                                 goto error;
1414                         }
1415                         sk->sk_sndmsg_off += copy;
1416                         frag->size += copy;
1417                         skb->len += copy;
1418                         skb->data_len += copy;
1419                         skb->truesize += copy;
1420                         atomic_add(copy, &sk->sk_wmem_alloc);
1421                 }
1422                 offset += copy;
1423                 length -= copy;
1424         }
1425         return 0;
1426 error:
1427         inet->cork.length -= length;
1428         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1429         return err;
1430 }
1431
1432 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1433 {
1434         if (np->cork.opt) {
1435                 kfree(np->cork.opt->dst0opt);
1436                 kfree(np->cork.opt->dst1opt);
1437                 kfree(np->cork.opt->hopopt);
1438                 kfree(np->cork.opt->srcrt);
1439                 kfree(np->cork.opt);
1440                 np->cork.opt = NULL;
1441         }
1442
1443         if (inet->cork.dst) {
1444                 dst_release(inet->cork.dst);
1445                 inet->cork.dst = NULL;
1446                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1447         }
1448         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1449 }
1450
1451 int ip6_push_pending_frames(struct sock *sk)
1452 {
1453         struct sk_buff *skb, *tmp_skb;
1454         struct sk_buff **tail_skb;
1455         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1456         struct inet_sock *inet = inet_sk(sk);
1457         struct ipv6_pinfo *np = inet6_sk(sk);
1458         struct net *net = sock_net(sk);
1459         struct ipv6hdr *hdr;
1460         struct ipv6_txoptions *opt = np->cork.opt;
1461         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1462         struct flowi *fl = &inet->cork.fl;
1463         unsigned char proto = fl->proto;
1464         int err = 0;
1465
1466         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1467                 goto out;
1468         tail_skb = &(skb_shinfo(skb)->frag_list);
1469
1470         /* move skb->data to ip header from ext header */
1471         if (skb->data < skb_network_header(skb))
1472                 __skb_pull(skb, skb_network_offset(skb));
1473         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1474                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1475                 *tail_skb = tmp_skb;
1476                 tail_skb = &(tmp_skb->next);
1477                 skb->len += tmp_skb->len;
1478                 skb->data_len += tmp_skb->len;
1479                 skb->truesize += tmp_skb->truesize;
1480                 tmp_skb->destructor = NULL;
1481                 tmp_skb->sk = NULL;
1482         }
1483
1484         /* Allow local fragmentation. */
1485         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1486                 skb->local_df = 1;
1487
1488         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1489         __skb_pull(skb, skb_network_header_len(skb));
1490         if (opt && opt->opt_flen)
1491                 ipv6_push_frag_opts(skb, opt, &proto);
1492         if (opt && opt->opt_nflen)
1493                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1494
1495         skb_push(skb, sizeof(struct ipv6hdr));
1496         skb_reset_network_header(skb);
1497         hdr = ipv6_hdr(skb);
1498
1499         *(__be32*)hdr = fl->fl6_flowlabel |
1500                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1501
1502         hdr->hop_limit = np->cork.hop_limit;
1503         hdr->nexthdr = proto;
1504         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1505         ipv6_addr_copy(&hdr->daddr, final_dst);
1506
1507         skb->priority = sk->sk_priority;
1508         skb->mark = sk->sk_mark;
1509
1510         skb_dst_set(skb, dst_clone(&rt->u.dst));
1511         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1512         if (proto == IPPROTO_ICMPV6) {
1513                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1514
1515                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1516                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1517         }
1518
1519         err = ip6_local_out(skb);
1520         if (err) {
1521                 if (err > 0)
1522                         err = net_xmit_errno(err);
1523                 if (err)
1524                         goto error;
1525         }
1526
1527 out:
1528         ip6_cork_release(inet, np);
1529         return err;
1530 error:
1531         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1532         goto out;
1533 }
1534
1535 void ip6_flush_pending_frames(struct sock *sk)
1536 {
1537         struct sk_buff *skb;
1538
1539         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1540                 if (skb_dst(skb))
1541                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1542                                       IPSTATS_MIB_OUTDISCARDS);
1543                 kfree_skb(skb);
1544         }
1545
1546         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1547 }