net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
   18  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/config.h>
  32 #include <linux/errno.h>
  33 #include <linux/types.h>
  34 #include <linux/string.h>
  35 #include <linux/socket.h>
  36 #include <linux/net.h>
  37 #include <linux/netdevice.h>
  38 #include <linux/if_arp.h>
  39 #include <linux/in6.h>
  40 #include <linux/tcp.h>
  41 #include <linux/route.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 static inline int ip6_output_finish(struct sk_buff *skb)
  74 {
  75
  76         struct dst_entry *dst = skb->dst;
  77         struct hh_cache *hh = dst->hh;
  78
  79         if (hh) {
  80                 int hh_alen;
  81
  82                 read_lock_bh(&hh->hh_lock);
  83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
  84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
  85                 read_unlock_bh(&hh->hh_lock);
  86                 skb_push(skb, hh->hh_len);
  87                 return hh->hh_output(skb);
  88         } else if (dst->neighbour)
  89                 return dst->neighbour->output(skb);
  90
  91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
  92         kfree_skb(skb);
  93         return -EINVAL;
  94
  95 }
  96
  97 /* dev_loopback_xmit for use with netfilter. */
  98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  99 {
 100         newskb->mac.raw = newskb->data;
 101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 102         newskb->pkt_type = PACKET_LOOPBACK;
 103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 104         BUG_TRAP(newskb->dst);
 105
 106         netif_rx(newskb);
 107         return 0;
 108 }
 109
 110
 111 static int ip6_output2(struct sk_buff *skb)
 112 {
 113         struct dst_entry *dst = skb->dst;
 114         struct net_device *dev = dst->dev;
 115
 116         skb->protocol = htons(ETH_P_IPV6);
 117         skb->dev = dev;
 118
 119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
 120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 121
 122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
 124                                 &skb->nh.ipv6h->saddr)) {
 125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 126
 127                         /* Do not check for IFF_ALLMULTI; multicast routing
 128                            is not supported in any case.
 129                          */
 130                         if (newskb)
 131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
 132                                         newskb->dev,
 133                                         ip6_dev_loopback_xmit);
 134
 135                         if (skb->nh.ipv6h->hop_limit == 0) {
 136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 137                                 kfree_skb(skb);
 138                                 return 0;
 139                         }
 140                 }
 141
 142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 143         }
 144
 145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 146 }
 147
 148 int ip6_output(struct sk_buff *skb)
 149 {
 150         if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
 151                 return ip6_fragment(skb, ip6_output2);
 152         else
 153                 return ip6_output2(skb);
 154 }
 155
 156 #ifdef CONFIG_NETFILTER
 157 int ip6_route_me_harder(struct sk_buff *skb)
 158 {
 159         struct ipv6hdr *iph = skb->nh.ipv6h;
 160         struct dst_entry *dst;
 161         struct flowi fl = {
 162                 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
 163                 .nl_u =
 164                 { .ip6_u =
 165                   { .daddr = iph->daddr,
 166                     .saddr = iph->saddr, } },
 167                 .proto = iph->nexthdr,
 168         };
 169
 170         dst = ip6_route_output(skb->sk, &fl);
 171
 172         if (dst->error) {
 173                 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 174                 LIMIT_NETDEBUG(
 175                         printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
 176                 dst_release(dst);
 177                 return -EINVAL;
 178         }
 179
 180         /* Drop old route. */
 181         dst_release(skb->dst);
 182
 183         skb->dst = dst;
 184         return 0;
 185 }
 186 #endif
 187
 /*
  *      Continuation for the LOCAL_OUT hook.  With netfilter configured, a
  *      packet flagged NFC_ALTERED is re-routed first; if that fails the
  *      packet is freed and -EINVAL returned.  Otherwise the packet goes on
  *      to dst_output().
  */
 static inline int ip6_maybe_reroute(struct sk_buff *skb)
 {
 #ifdef CONFIG_NETFILTER
         if ((skb->nfcache & NFC_ALTERED) && ip6_route_me_harder(skb) != 0) {
                 kfree_skb(skb);
                 return -EINVAL;
         }
 #endif /* CONFIG_NETFILTER */
         return dst_output(skb);
 }
 200
 201 /*
 202  *      xmit an sk_buff (used by TCP)
 203  */
 204
 205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 206              struct ipv6_txoptions *opt, int ipfragok)
 207 {
 208         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
 209         struct in6_addr *first_hop = &fl->fl6_dst;
 210         struct dst_entry *dst = skb->dst;
 211         struct ipv6hdr *hdr;
 212         u8  proto = fl->proto;
 213         int seg_len = skb->len;
 214         int hlimit;
 215         u32 mtu;
 216
 217         if (opt) {
 218                 int head_room;
 219
 220                 /* First: exthdrs may take lots of space (~8K for now)
 221                    MAX_HEADER is not enough.
 222                  */
 223                 head_room = opt->opt_nflen + opt->opt_flen;
 224                 seg_len += head_room;
 225                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 226
 227                 if (skb_headroom(skb) < head_room) {
 228                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 229                         kfree_skb(skb);
 230                         skb = skb2;
 231                         if (skb == NULL) {
 232                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 233                                 return -ENOBUFS;
 234                         }
 235                         if (sk)
 236                                 skb_set_owner_w(skb, sk);
 237                 }
 238                 if (opt->opt_flen)
 239                         ipv6_push_frag_opts(skb, opt, &proto);
 240                 if (opt->opt_nflen)
 241                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 242         }
 243
 244         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
 245
 246         /*
 247          *      Fill in the IPv6 header
 248          */
 249
 250         *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
 251         hlimit = -1;
 252         if (np)
 253                 hlimit = np->hop_limit;
 254         if (hlimit < 0)
 255                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
 256         if (hlimit < 0)
 257                 hlimit = ipv6_get_hoplimit(dst->dev);
 258
 259         hdr->payload_len = htons(seg_len);
 260         hdr->nexthdr = proto;
 261         hdr->hop_limit = hlimit;
 262
 263         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 264         ipv6_addr_copy(&hdr->daddr, first_hop);
 265
 266         mtu = dst_mtu(dst);
 267         if ((skb->len <= mtu) || ipfragok) {
 268                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 269                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
 270         }
 271
 272         if (net_ratelimit())
 273                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 274         skb->dev = dst->dev;
 275         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 276         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 277         kfree_skb(skb);
 278         return -EMSGSIZE;
 279 }
 280
 281 /*
 282  *      To avoid extra problems ND packets are send through this
 283  *      routine. It's code duplication but I really want to avoid
 284  *      extra checks since ipv6_build_header is used by TCP (which
 285  *      is for us performance critical)
 286  */
 287
 288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 289                struct in6_addr *saddr, struct in6_addr *daddr,
 290                int proto, int len)
 291 {
 292         struct ipv6_pinfo *np = inet6_sk(sk);
 293         struct ipv6hdr *hdr;
 294         int totlen;
 295
 296         skb->protocol = htons(ETH_P_IPV6);
 297         skb->dev = dev;
 298
 299         totlen = len + sizeof(struct ipv6hdr);
 300
 301         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
 302         skb->nh.ipv6h = hdr;
 303
 304         *(u32*)hdr = htonl(0x60000000);
 305
 306         hdr->payload_len = htons(len);
 307         hdr->nexthdr = proto;
 308         hdr->hop_limit = np->hop_limit;
 309
 310         ipv6_addr_copy(&hdr->saddr, saddr);
 311         ipv6_addr_copy(&hdr->daddr, daddr);
 312
 313         return 0;
 314 }
 315
 316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 317 {
 318         struct ip6_ra_chain *ra;
 319         struct sock *last = NULL;
 320
 321         read_lock(&ip6_ra_lock);
 322         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 323                 struct sock *sk = ra->sk;
 324                 if (sk && ra->sel == sel) {
 325                         if (last) {
 326                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 327                                 if (skb2)
 328                                         rawv6_rcv(last, skb2);
 329                         }
 330                         last = sk;
 331                 }
 332         }
 333
 334         if (last) {
 335                 rawv6_rcv(last, skb);
 336                 read_unlock(&ip6_ra_lock);
 337                 return 1;
 338         }
 339         read_unlock(&ip6_ra_lock);
 340         return 0;
 341 }
 342
 /*
  *      Continuation for the FORWARD hook: pass the packet to dst_output().
  */
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
         return dst_output(skb);
 }
 347
 348 int ip6_forward(struct sk_buff *skb)
 349 {
 350         struct dst_entry *dst = skb->dst;
 351         struct ipv6hdr *hdr = skb->nh.ipv6h;
 352         struct inet6_skb_parm *opt = IP6CB(skb);
 353
 354         if (ipv6_devconf.forwarding == 0)
 355                 goto error;
 356
 357         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 358                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 359                 goto drop;
 360         }
 361
 362         skb->ip_summed = CHECKSUM_NONE;
 363
 364         /*
 365          *      We DO NOT make any processing on
 366          *      RA packets, pushing them to user level AS IS
 367          *      without ane WARRANTY that application will be able
 368          *      to interpret them. The reason is that we
 369          *      cannot make anything clever here.
 370          *
 371          *      We are not end-node, so that if packet contains
 372          *      AH/ESP, we cannot make anything.
 373          *      Defragmentation also would be mistake, RA packets
 374          *      cannot be fragmented, because there is no warranty
 375          *      that different fragments will go along one path. --ANK
 376          */
 377         if (opt->ra) {
 378                 u8 *ptr = skb->nh.raw + opt->ra;
 379                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 380                         return 0;
 381         }
 382
 383         /*
 384          *      check and decrement ttl
 385          */
 386         if (hdr->hop_limit <= 1) {
 387                 /* Force OUTPUT device used as source address */
 388                 skb->dev = dst->dev;
 389                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 390                             0, skb->dev);
 391
 392                 kfree_skb(skb);
 393                 return -ETIMEDOUT;
 394         }
 395
 396         if (!xfrm6_route_forward(skb)) {
 397                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 398                 goto drop;
 399         }
 400         dst = skb->dst;
 401
 402         /* IPv6 specs say nothing about it, but it is clear that we cannot
 403            send redirects to source routed frames.
 404          */
 405         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
 406                 struct in6_addr *target = NULL;
 407                 struct rt6_info *rt;
 408                 struct neighbour *n = dst->neighbour;
 409
 410                 /*
 411                  *      incoming and outgoing devices are the same
 412                  *      send a redirect.
 413                  */
 414
 415                 rt = (struct rt6_info *) dst;
 416                 if ((rt->rt6i_flags & RTF_GATEWAY))
 417                         target = (struct in6_addr*)&n->primary_key;
 418                 else
 419                         target = &hdr->daddr;
 420
 421                 /* Limit redirects both by destination (here)
 422                    and by source (inside ndisc_send_redirect)
 423                  */
 424                 if (xrlim_allow(dst, 1*HZ))
 425                         ndisc_send_redirect(skb, n, target);
 426         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
 427                                                 |IPV6_ADDR_LINKLOCAL)) {
 428                 /* This check is security critical. */
 429                 goto error;
 430         }
 431
 432         if (skb->len > dst_mtu(dst)) {
 433                 /* Again, force OUTPUT device used as source address */
 434                 skb->dev = dst->dev;
 435                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 436                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
 437                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
 438                 kfree_skb(skb);
 439                 return -EMSGSIZE;
 440         }
 441
 442         if (skb_cow(skb, dst->dev->hard_header_len)) {
 443                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 444                 goto drop;
 445         }
 446
 447         hdr = skb->nh.ipv6h;
 448
 449         /* Mangling hops number delayed to point after skb COW */
 450
 451         hdr->hop_limit--;
 452
 453         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
 454         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
 455
 456 error:
 457         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 458 drop:
 459         kfree_skb(skb);
 460         return -EINVAL;
 461 }
 462
 463 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 464 {
 465         to->pkt_type = from->pkt_type;
 466         to->priority = from->priority;
 467         to->protocol = from->protocol;
 468         to->security = from->security;
 469         dst_release(to->dst);
 470         to->dst = dst_clone(from->dst);
 471         to->dev = from->dev;
 472
 473 #ifdef CONFIG_NET_SCHED
 474         to->tc_index = from->tc_index;
 475 #endif
 476 #ifdef CONFIG_NETFILTER
 477         to->nfmark = from->nfmark;
 478         /* Connection association is same as pre-frag packet */
 479         to->nfct = from->nfct;
 480         nf_conntrack_get(to->nfct);
 481         to->nfctinfo = from->nfctinfo;
 482 #ifdef CONFIG_BRIDGE_NETFILTER
 483         nf_bridge_put(to->nf_bridge);
 484         to->nf_bridge = from->nf_bridge;
 485         nf_bridge_get(to->nf_bridge);
 486 #endif
 487 #ifdef CONFIG_NETFILTER_DEBUG
 488         to->nf_debug = from->nf_debug;
 489 #endif
 490 #endif
 491 }
 492
 493 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 494 {
 495         u16 offset = sizeof(struct ipv6hdr);
 496         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
 497         unsigned int packet_len = skb->tail - skb->nh.raw;
 498         int found_rhdr = 0;
 499         *nexthdr = &skb->nh.ipv6h->nexthdr;
 500
 501         while (offset + 1 <= packet_len) {
 502
 503                 switch (**nexthdr) {
 504
 505                 case NEXTHDR_HOP:
 506                 case NEXTHDR_ROUTING:
 507                 case NEXTHDR_DEST:
 508                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
 509                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
 510                         offset += ipv6_optlen(exthdr);
 511                         *nexthdr = &exthdr->nexthdr;
 512                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
 513                         break;
 514                 default :
 515                         return offset;
 516                 }
 517         }
 518
 519         return offset;
 520 }
 521
 522 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 523 {
 524         struct net_device *dev;
 525         struct sk_buff *frag;
 526         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 527         struct ipv6hdr *tmp_hdr;
 528         struct frag_hdr *fh;
 529         unsigned int mtu, hlen, left, len;
 530         u32 frag_id = 0;
 531         int ptr, offset = 0, err=0;
 532         u8 *prevhdr, nexthdr = 0;
 533
 534         dev = rt->u.dst.dev;
 535         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 536         nexthdr = *prevhdr;
 537
 538         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
 539
 540         if (skb_shinfo(skb)->frag_list) {
 541                 int first_len = skb_pagelen(skb);
 542
 543                 if (first_len - hlen > mtu ||
 544                     ((first_len - hlen) & 7) ||
 545                     skb_cloned(skb))
 546                         goto slow_path;
 547
 548                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 549                         /* Correct geometry. */
 550                         if (frag->len > mtu ||
 551                             ((frag->len & 7) && frag->next) ||
 552                             skb_headroom(frag) < hlen)
 553                             goto slow_path;
 554
 555                         /* Correct socket ownership. */
 556                         if (frag->sk == NULL)
 557                                 goto slow_path;
 558
 559                         /* Partially cloned skb? */
 560                         if (skb_shared(frag))
 561                                 goto slow_path;
 562                 }
 563
 564                 err = 0;
 565                 offset = 0;
 566                 frag = skb_shinfo(skb)->frag_list;
 567                 skb_shinfo(skb)->frag_list = NULL;
 568                 /* BUILD HEADER */
 569
 570                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
 571                 if (!tmp_hdr) {
 572                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 573                         return -ENOMEM;
 574                 }
 575
 576                 *prevhdr = NEXTHDR_FRAGMENT;
 577                 memcpy(tmp_hdr, skb->nh.raw, hlen);
 578                 __skb_pull(skb, hlen);
 579                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 580                 skb->nh.raw = __skb_push(skb, hlen);
 581                 memcpy(skb->nh.raw, tmp_hdr, hlen);
 582
 583                 ipv6_select_ident(skb, fh);
 584                 fh->nexthdr = nexthdr;
 585                 fh->reserved = 0;
 586                 fh->frag_off = htons(IP6_MF);
 587                 frag_id = fh->identification;
 588
 589                 first_len = skb_pagelen(skb);
 590                 skb->data_len = first_len - skb_headlen(skb);
 591                 skb->len = first_len;
 592                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 593
 594
 595                 for (;;) {
 596                         /* Prepare header of the next frame,
 597                          * before previous one went down. */
 598                         if (frag) {
 599                                 frag->ip_summed = CHECKSUM_NONE;
 600                                 frag->h.raw = frag->data;
 601                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 602                                 frag->nh.raw = __skb_push(frag, hlen);
 603                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
 604                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 605                                 fh->nexthdr = nexthdr;
 606                                 fh->reserved = 0;
 607                                 fh->frag_off = htons(offset);
 608                                 if (frag->next != NULL)
 609                                         fh->frag_off |= htons(IP6_MF);
 610                                 fh->identification = frag_id;
 611                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 612                                 ip6_copy_metadata(frag, skb);
 613                         }
 614
 615                         err = output(skb);
 616                         if (err || !frag)
 617                                 break;
 618
 619                         skb = frag;
 620                         frag = skb->next;
 621                         skb->next = NULL;
 622                 }
 623
 624                 if (tmp_hdr)
 625                         kfree(tmp_hdr);
 626
 627                 if (err == 0) {
 628                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
 629                         return 0;
 630                 }
 631
 632                 while (frag) {
 633                         skb = frag->next;
 634                         kfree_skb(frag);
 635                         frag = skb;
 636                 }
 637
 638                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 639                 return err;
 640         }
 641
 642 slow_path:
 643         left = skb->len - hlen;         /* Space per frame */
 644         ptr = hlen;                     /* Where to start from */
 645
 646         /*
 647          *      Fragment the datagram.
 648          */
 649
 650         *prevhdr = NEXTHDR_FRAGMENT;
 651
 652         /*
 653          *      Keep copying data until we run out.
 654          */
 655         while(left > 0) {
 656                 len = left;
 657                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 658                 if (len > mtu)
 659                         len = mtu;
 660                 /* IF: we are not sending upto and including the packet end
 661                    then align the next start on an eight byte boundary */
 662                 if (len < left) {
 663                         len &= ~7;
 664                 }
 665                 /*
 666                  *      Allocate buffer.
 667                  */
 668
 669                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 670                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
 671                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 672                         err = -ENOMEM;
 673                         goto fail;
 674                 }
 675
 676                 /*
 677                  *      Set up data on packet
 678                  */
 679
 680                 ip6_copy_metadata(frag, skb);
 681                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 682                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 683                 frag->nh.raw = frag->data;
 684                 fh = (struct frag_hdr*)(frag->data + hlen);
 685                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
 686
 687                 /*
 688                  *      Charge the memory for the fragment to any owner
 689                  *      it might possess
 690                  */
 691                 if (skb->sk)
 692                         skb_set_owner_w(frag, skb->sk);
 693
 694                 /*
 695                  *      Copy the packet header into the new buffer.
 696                  */
 697                 memcpy(frag->nh.raw, skb->data, hlen);
 698
 699                 /*
 700                  *      Build fragment header.
 701                  */
 702                 fh->nexthdr = nexthdr;
 703                 fh->reserved = 0;
 704                 if (frag_id) {
 705                         ipv6_select_ident(skb, fh);
 706                         frag_id = fh->identification;
 707                 } else
 708                         fh->identification = frag_id;
 709
 710                 /*
 711                  *      Copy a block of the IP datagram.
 712                  */
 713                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
 714                         BUG();
 715                 left -= len;
 716
 717                 fh->frag_off = htons(offset);
 718                 if (left > 0)
 719                         fh->frag_off |= htons(IP6_MF);
 720                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 721
 722                 ptr += len;
 723                 offset += len;
 724
 725                 /*
 726                  *      Put this fragment into the sending queue.
 727                  */
 728
 729                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 730
 731                 err = output(frag);
 732                 if (err)
 733                         goto fail;
 734         }
 735         kfree_skb(skb);
 736         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
 737         return err;
 738
 739 fail:
 740         kfree_skb(skb);
 741         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 742         return err;
 743 }
 744
 745 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 746 {
 747         int err = 0;
 748
 749         *dst = NULL;
 750         if (sk) {
 751                 struct ipv6_pinfo *np = inet6_sk(sk);
 752
 753                 *dst = sk_dst_check(sk, np->dst_cookie);
 754                 if (*dst) {
 755                         struct rt6_info *rt = (struct rt6_info*)*dst;
 756
 757                                 /* Yes, checking route validity in not connected
 758                                    case is not very simple. Take into account,
 759                                    that we do not support routing by source, TOS,
 760                                    and MSG_DONTROUTE            --ANK (980726)
 761
 762                                    1. If route was host route, check that
 763                                       cached destination is current.
 764                                       If it is network route, we still may
 765                                       check its validity using saved pointer
 766                                       to the last used address: daddr_cache.
 767                                       We do not want to save whole address now,
 768                                       (because main consumer of this service
 769                                        is tcp, which has not this problem),
 770                                       so that the last trick works only on connected
 771                                       sockets.
 772                                    2. oif also should be the same.
 773                                  */
 774
 775                         if (((rt->rt6i_dst.plen != 128 ||
 776                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
 777                              && (np->daddr_cache == NULL ||
 778                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
 779                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
 780                                 dst_release(*dst);
 781                                 *dst = NULL;
 782                         }
 783                 }
 784         }
 785
 786         if (*dst == NULL)
 787                 *dst = ip6_route_output(sk, fl);
 788
 789         if ((err = (*dst)->error))
 790                 goto out_err_release;
 791
 792         if (ipv6_addr_any(&fl->fl6_src)) {
 793                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 794
 795                 if (err) {
 796 #if IP6_DEBUG >= 2
 797                         printk(KERN_DEBUG "ip6_dst_lookup: "
 798                                "no available source address\n");
 799 #endif
 800                         goto out_err_release;
 801                 }
 802         }
 803
 804         return 0;
 805
 806 out_err_release:
 807         dst_release(*dst);
 808         *dst = NULL;
 809         return err;
 810 }
 811
 812 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
 813                     void *from, int length, int transhdrlen,
 814                     int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
 815                     unsigned int flags)
 816 {
 817         struct inet_sock *inet = inet_sk(sk);
 818         struct ipv6_pinfo *np = inet6_sk(sk);
 819         struct sk_buff *skb;
 820         unsigned int maxfraglen, fragheaderlen;
 821         int exthdrlen;
 822         int hh_len;
 823         int mtu;
 824         int copy;
 825         int err;
 826         int offset = 0;
 827         int csummode = CHECKSUM_NONE;
 828
 829         if (flags&MSG_PROBE)
 830                 return 0;
 831         if (skb_queue_empty(&sk->sk_write_queue)) {
 832                 /*
 833                  * setup for corking
 834                  */
 835                 if (opt) {
 836                         if (np->cork.opt == NULL) {
 837                                 np->cork.opt = kmalloc(opt->tot_len,
 838                                                        sk->sk_allocation);
 839                                 if (unlikely(np->cork.opt == NULL))
 840                                         return -ENOBUFS;
 841                         } else if (np->cork.opt->tot_len < opt->tot_len) {
 842                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
 843                                 return -EINVAL;
 844                         }
 845                         memcpy(np->cork.opt, opt, opt->tot_len);
 846                         inet->cork.flags |= IPCORK_OPT;
 847                         /* need source address above miyazawa*/
 848                 }
 849                 dst_hold(&rt->u.dst);
 850                 np->cork.rt = rt;
 851                 inet->cork.fl = *fl;
 852                 np->cork.hop_limit = hlimit;
 853                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 854                 if (dst_allfrag(rt->u.dst.path))
 855                         inet->cork.flags |= IPCORK_ALLFRAG;
 856                 inet->cork.length = 0;
 857                 sk->sk_sndmsg_page = NULL;
 858                 sk->sk_sndmsg_off = 0;
 859                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
 860                 length += exthdrlen;
 861                 transhdrlen += exthdrlen;
 862         } else {
 863                 rt = np->cork.rt;
 864                 fl = &inet->cork.fl;
 865                 if (inet->cork.flags & IPCORK_OPT)
 866                         opt = np->cork.opt;
 867                 transhdrlen = 0;
 868                 exthdrlen = 0;
 869                 mtu = inet->cork.fragsize;
 870         }
 871
 872         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 873
 874         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
 875         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 876
 877         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
 878                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
 879                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
 880                         return -EMSGSIZE;
 881                 }
 882         }
 883
 884         /*
 885          * Let's try using as much space as possible.
 886          * Use MTU if total length of the message fits into the MTU.
 887          * Otherwise, we need to reserve fragment header and
 888          * fragment alignment (= 8-15 octects, in total).
 889          *
 890          * Note that we may need to "move" the data from the tail of
 891          * of the buffer to the new fragment when we split
 892          * the message.
 893          *
 894          * FIXME: It may be fragmented into multiple chunks
 895          *        at once if non-fragmentable extension headers
 896          *        are too large.
 897          * --yoshfuji
 898          */
 899
 900         inet->cork.length += length;
 901
 902         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 903                 goto alloc_new_skb;
 904
 905         while (length > 0) {
 906                 /* Check if the remaining data fits into current packet. */
 907                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
 908                 if (copy < length)
 909                         copy = maxfraglen - skb->len;
 910
 911                 if (copy <= 0) {
 912                         char *data;
 913                         unsigned int datalen;
 914                         unsigned int fraglen;
 915                         unsigned int fraggap;
 916                         unsigned int alloclen;
 917                         struct sk_buff *skb_prev;
 918 alloc_new_skb:
 919                         skb_prev = skb;
 920
 921                         /* There's no room in the current skb */
 922                         if (skb_prev)
 923                                 fraggap = skb_prev->len - maxfraglen;
 924                         else
 925                                 fraggap = 0;
 926
 927                         /*
 928                          * If remaining data exceeds the mtu,
 929                          * we know we need more fragment(s).
 930                          */
 931                         datalen = length + fraggap;
 932                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
 933                                 datalen = maxfraglen - fragheaderlen;
 934
 935                         fraglen = datalen + fragheaderlen;
 936                         if ((flags & MSG_MORE) &&
 937                             !(rt->u.dst.dev->features&NETIF_F_SG))
 938                                 alloclen = mtu;
 939                         else
 940                                 alloclen = datalen + fragheaderlen;
 941
 942                         /*
 943                          * The last fragment gets additional space at tail.
 944                          * Note: we overallocate on fragments with MSG_MODE
 945                          * because we have no idea if we're the last one.
 946                          */
 947                         if (datalen == length + fraggap)
 948                                 alloclen += rt->u.dst.trailer_len;
 949
 950                         /*
 951                          * We just reserve space for fragment header.
 952                          * Note: this may be overallocation if the message
 953                          * (without MSG_MORE) fits into the MTU.
 954                          */
 955                         alloclen += sizeof(struct frag_hdr);
 956
 957                         if (transhdrlen) {
 958                                 skb = sock_alloc_send_skb(sk,
 959                                                 alloclen + hh_len,
 960                                                 (flags & MSG_DONTWAIT), &err);
 961                         } else {
 962                                 skb = NULL;
 963                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 964                                     2 * sk->sk_sndbuf)
 965                                         skb = sock_wmalloc(sk,
 966                                                            alloclen + hh_len, 1,
 967                                                            sk->sk_allocation);
 968                                 if (unlikely(skb == NULL))
 969                                         err = -ENOBUFS;
 970                         }
 971                         if (skb == NULL)
 972                                 goto error;
 973                         /*
 974                          *      Fill in the control structures
 975                          */
 976                         skb->ip_summed = csummode;
 977                         skb->csum = 0;
 978                         /* reserve for fragmentation */
 979                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
 980
 981                         /*
 982                          *      Find where to start putting bytes
 983                          */
 984                         data = skb_put(skb, fraglen);
 985                         skb->nh.raw = data + exthdrlen;
 986                         data += fragheaderlen;
 987                         skb->h.raw = data + exthdrlen;
 988
 989                         if (fraggap) {
 990                                 skb->csum = skb_copy_and_csum_bits(
 991                                         skb_prev, maxfraglen,
 992                                         data + transhdrlen, fraggap, 0);
 993                                 skb_prev->csum = csum_sub(skb_prev->csum,
 994                                                           skb->csum);
 995                                 data += fraggap;
 996                                 skb_trim(skb_prev, maxfraglen);
 997                         }
 998                         copy = datalen - transhdrlen - fraggap;
 999                         if (copy < 0) {
1000                                 err = -EINVAL;
1001                                 kfree_skb(skb);
1002                                 goto error;
1003                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1004                                 err = -EFAULT;
1005                                 kfree_skb(skb);
1006                                 goto error;
1007                         }
1008
1009                         offset += copy;
1010                         length -= datalen - fraggap;
1011                         transhdrlen = 0;
1012                         exthdrlen = 0;
1013                         csummode = CHECKSUM_NONE;
1014
1015                         /*
1016                          * Put the packet on the pending queue
1017                          */
1018                         __skb_queue_tail(&sk->sk_write_queue, skb);
1019                         continue;
1020                 }
1021
1022                 if (copy > length)
1023                         copy = length;
1024
1025                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1026                         unsigned int off;
1027
1028                         off = skb->len;
1029                         if (getfrag(from, skb_put(skb, copy),
1030                                                 offset, copy, off, skb) < 0) {
1031                                 __skb_trim(skb, off);
1032                                 err = -EFAULT;
1033                                 goto error;
1034                         }
1035                 } else {
1036                         int i = skb_shinfo(skb)->nr_frags;
1037                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1038                         struct page *page = sk->sk_sndmsg_page;
1039                         int off = sk->sk_sndmsg_off;
1040                         unsigned int left;
1041
1042                         if (page && (left = PAGE_SIZE - off) > 0) {
1043                                 if (copy >= left)
1044                                         copy = left;
1045                                 if (page != frag->page) {
1046                                         if (i == MAX_SKB_FRAGS) {
1047                                                 err = -EMSGSIZE;
1048                                                 goto error;
1049                                         }
1050                                         get_page(page);
1051                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1052                                         frag = &skb_shinfo(skb)->frags[i];
1053                                 }
1054                         } else if(i < MAX_SKB_FRAGS) {
1055                                 if (copy > PAGE_SIZE)
1056                                         copy = PAGE_SIZE;
1057                                 page = alloc_pages(sk->sk_allocation, 0);
1058                                 if (page == NULL) {
1059                                         err = -ENOMEM;
1060                                         goto error;
1061                                 }
1062                                 sk->sk_sndmsg_page = page;
1063                                 sk->sk_sndmsg_off = 0;
1064
1065                                 skb_fill_page_desc(skb, i, page, 0, 0);
1066                                 frag = &skb_shinfo(skb)->frags[i];
1067                                 skb->truesize += PAGE_SIZE;
1068                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1069                         } else {
1070                                 err = -EMSGSIZE;
1071                                 goto error;
1072                         }
1073                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1074                                 err = -EFAULT;
1075                                 goto error;
1076                         }
1077                         sk->sk_sndmsg_off += copy;
1078                         frag->size += copy;
1079                         skb->len += copy;
1080                         skb->data_len += copy;
1081                 }
1082                 offset += copy;
1083                 length -= copy;
1084         }
1085         return 0;
1086 error:
1087         inet->cork.length -= length;
1088         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1089         return err;
1090 }
1091
1092 int ip6_push_pending_frames(struct sock *sk)
1093 {
1094         struct sk_buff *skb, *tmp_skb;
1095         struct sk_buff **tail_skb;
1096         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1097         struct inet_sock *inet = inet_sk(sk);
1098         struct ipv6_pinfo *np = inet6_sk(sk);
1099         struct ipv6hdr *hdr;
1100         struct ipv6_txoptions *opt = np->cork.opt;
1101         struct rt6_info *rt = np->cork.rt;
1102         struct flowi *fl = &inet->cork.fl;
1103         unsigned char proto = fl->proto;
1104         int err = 0;
1105
1106         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1107                 goto out;
1108         tail_skb = &(skb_shinfo(skb)->frag_list);
1109
1110         /* move skb->data to ip header from ext header */
1111         if (skb->data < skb->nh.raw)
1112                 __skb_pull(skb, skb->nh.raw - skb->data);
1113         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1114                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1115                 *tail_skb = tmp_skb;
1116                 tail_skb = &(tmp_skb->next);
1117                 skb->len += tmp_skb->len;
1118                 skb->data_len += tmp_skb->len;
1119 #if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
1120                 skb->truesize += tmp_skb->truesize;
1121                 __sock_put(tmp_skb->sk);
1122                 tmp_skb->destructor = NULL;
1123                 tmp_skb->sk = NULL;
1124 #endif
1125         }
1126
1127         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1128         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1129         if (opt && opt->opt_flen)
1130                 ipv6_push_frag_opts(skb, opt, &proto);
1131         if (opt && opt->opt_nflen)
1132                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1133
1134         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1135
1136         *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1137
1138         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1139                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1140         else
1141                 hdr->payload_len = 0;
1142         hdr->hop_limit = np->cork.hop_limit;
1143         hdr->nexthdr = proto;
1144         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1145         ipv6_addr_copy(&hdr->daddr, final_dst);
1146
1147         skb->dst = dst_clone(&rt->u.dst);
1148         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1149         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1150         if (err) {
1151                 if (err > 0)
1152                         err = np->recverr ? net_xmit_errno(err) : 0;
1153                 if (err)
1154                         goto error;
1155         }
1156
1157 out:
1158         inet->cork.flags &= ~IPCORK_OPT;
1159         if (np->cork.opt) {
1160                 kfree(np->cork.opt);
1161                 np->cork.opt = NULL;
1162         }
1163         if (np->cork.rt) {
1164                 dst_release(&np->cork.rt->u.dst);
1165                 np->cork.rt = NULL;
1166                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1167         }
1168         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1169         return err;
1170 error:
1171         goto out;
1172 }
1173
1174 void ip6_flush_pending_frames(struct sock *sk)
1175 {
1176         struct inet_sock *inet = inet_sk(sk);
1177         struct ipv6_pinfo *np = inet6_sk(sk);
1178         struct sk_buff *skb;
1179
1180         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1181                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1182                 kfree_skb(skb);
1183         }
1184
1185         inet->cork.flags &= ~IPCORK_OPT;
1186
1187         if (np->cork.opt) {
1188                 kfree(np->cork.opt);
1189                 np->cork.opt = NULL;
1190         }
1191         if (np->cork.rt) {
1192                 dst_release(&np->cork.rt->u.dst);
1193                 np->cork.rt = NULL;
1194                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1195         }
1196         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1197 }