/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
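/* Final output step: resolve the next hop to a neighbour entry and hand the
 * skb to the neighbour layer for transmission. Multicast packets may first
 * be looped back to local listeners, and are dropped when their scope does
 * not allow them off the box.
 */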
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
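/* Post-POST_ROUTING finish handler: run cgroup BPF egress filters, re-route
 * through dst_output() when an XFRM policy installed a new dst after SNAT,
 * and fragment packets that exceed the path MTU (unless GSO will segment
 * them later).
 */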
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
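/* Standard output handler for IPv6 dst entries, called via dst_output().
 * Runs the NF_INET_POST_ROUTING hook unless the packet was marked as
 * re-routed, and drops everything when IPv6 is disabled on the device.
 */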
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
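/* Flow-label policy for this socket: an explicit IPV6_AUTOFLOWLABEL socket
 * option wins; otherwise the per-namespace default applies.
 */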
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
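/* Classify a packet arriving for an address we proxy: returns 1 when it is
 * an NDISC message that must be delivered locally, -1 when it has to be
 * dropped after signalling link failure, and 0 to forward it normally.
 */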
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a reaction involving a unicast neighbor
			 * discovery message destined to the proxied address,
			 * pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}
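/* MTU to honour when forwarding via this dst: a locked RTAX_MTU metric
 * takes precedence, otherwise the egress device's IPv6 MTU is used.
 */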
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
		return false;

	return true;
}
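/* The IPv6 forwarding path: sanity-check the packet, handle router-alert
 * and proxy-NDP cases, send redirects where allowed, enforce the path MTU
 * and finally decrement hop_limit before NF_INET_FORWARD and output.
 */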
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
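/* Copy routing, QoS and netfilter metadata from the original packet to a
 * freshly built fragment.
 */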
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
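/* Split a too-big packet into fragments. The fast path reuses an existing
 * frag_list whose geometry already fits the MTU; otherwise the slow path
 * allocates new skbs and copies the payload block by block.
 */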
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
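/* Nonzero when the cached route cannot be trusted for this flow address:
 * it is neither an exact host route for the address nor validated by the
 * socket's cached last-used address.
 */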
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
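/* Recompute mtu/maxfraglen while appending data: only the first fragment
 * needs to reserve the dst header_len (e.g. an XFRM tunnel header); later
 * fragments may use that space for payload.
 */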
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
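/* Prepare a corked send: duplicate the tx options so they outlive the
 * caller, pin the route in the cork and precompute the fragment size.
 */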
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
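/* Workhorse behind ip6_append_data() and ip6_make_skb(): append user data
 * to the queue, filling the tail skb and starting new fragment-sized skbs
 * (or page frags when the device supports scatter/gather) as needed.
 */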
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ?
			mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu &&
				       !(cork->flags & IPCORK_ALLFRAG) ?
				       mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen -
					  rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
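/* Release everything pinned by the cork: the duplicated tx options and the
 * held route.
 */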
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
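/* Collapse the queued skbs into a single packet carried in a frag_list,
 * push extension headers plus the IPv6 header, then release the cork.
 */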
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
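/* Hand a packet built by __ip6_make_skb() to ip6_local_out() and account
 * for any drop in the MIB counters.
 */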
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
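/* Drop everything still queued when a corked send is aborted, then release
 * the cork.
 */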
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
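/* Uncorked counterpart of ip6_append_data()/ip6_push_pending_frames():
 * build the complete packet on a private queue in one call and return it
 * (or an ERR_PTR) without touching sk_write_queue.
 */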
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}