/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

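/* Final transmit step: loop multicast back to local listeners when
 * required, honor lwtunnel output redirects, then resolve the nexthop
 * neighbour and hand the skb to neigh_output(). Packets with no
 * resolvable neighbour are counted as OUTNOROUTES and dropped.
 */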
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

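/* Runs after POST_ROUTING: gives the cgroup BPF egress program a chance
 * to drop the skb, re-enters dst_output() when an XFRM policy lookup
 * after SNAT attached a new dst, and fragments when the packet exceeds
 * the dst MTU (or conntrack's frag_max_size) and is not GSO.
 */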
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

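/* Output entry point installed as dst->output for unicast routes: tags
 * the skb with protocol and device, drops everything when IPv6 is
 * administratively disabled on the device, and traverses
 * NF_INET_POST_ROUTING, skipping the hook for packets netfilter has
 * already rerouted (IP6SKB_REROUTED).
 */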
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

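/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain for this alert value. Returns 1 if at least one socket
 * consumed the skb, 0 if the caller still owns it.
 */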
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

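/* Decide what to do with a packet whose destination we proxy:
 * returns 1 to hand unicast NDISC messages to the local input path,
 * 0 to forward normally, and -1 when the destination is link-local
 * and must not be forwarded (packet is dropped with a link failure).
 */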
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

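/* Main IPv6 forwarding path: validates the hop limit, hands Router
 * Alert packets to ip6_call_ra_chain(), handles NDISC proxying, emits
 * redirects when the packet leaves through the interface it arrived on,
 * enforces the forwarding MTU, and finally decrements hop_limit (after
 * skb_cow()) before traversing NF_INET_FORWARD.
 */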
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not do any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

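/* Fragment an oversized packet. The fast path reuses an existing
 * frag_list when its geometry already matches fragment boundaries; the
 * slow path allocates and copies one fragment at a time. Every fragment
 * is handed to @output, and FRAGCREATES/FRAGOKS/FRAGFAILS are accounted.
 */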
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

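/* Core of the dst lookup helpers below: selects a source address when
 * the flow has none, performs the routing lookup, and (with optimistic
 * DAD enabled) falls back to the default router's dst while the chosen
 * source address is still OPTIMISTIC and the nexthop is unresolved.
 */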
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

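/* Initialise cork state for ip6_append_data()/ip6_make_skb(): duplicate
 * the tx options so they outlive the caller, pin the route and flow,
 * and derive the fragment size from the path MTU and the socket's
 * pmtudisc/frag_size settings.
 */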
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

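/* Append user data to the queue of pending skbs, growing the tail skb
 * or allocating new ones so that every skb except the last is
 * maxfraglen bytes long and 8-byte aligned, ready to be stitched
 * together by __ip6_make_skb(). Write-memory accounting is batched in
 * wmem_alloc_delta and charged to the socket once at the end.
 */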
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

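/* Public corked-output interface (UDP, raw, ICMPv6 sockets): sets up
 * cork state on the first call and appends to sk_write_queue until the
 * caller flushes with ip6_push_pending_frames() or discards with
 * ip6_flush_pending_frames().
 */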
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

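/* Collapse the queued skbs into one packet (tail skbs become frag_list
 * members of the first), push extension headers and the IPv6 header,
 * and release the cork. Returns the finished skb, or NULL if the queue
 * was empty.
 */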
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

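/* Transmit a packet built by __ip6_make_skb() through ip6_local_out(),
 * mapping qdisc return codes to errnos and accounting drops.
 */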
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

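/* Single-shot variant of ip6_append_data() + push: builds the whole
 * packet on a private queue with caller-provided cork state, so no
 * socket write-queue locking is needed, and returns the finished skb
 * (or an ERR_PTR) without sending it.
 */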
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}