/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readibility.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

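/*
 * __ip_local_out() finalises the IP header (total length and checksum)
 * and runs the NF_INET_LOCAL_OUT netfilter hook; ip_local_out() then
 * continues to dst_output() when the hook returns an accept verdict (1).
 */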
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

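/*
 * ip_finish_output2() is the last IPv4-level step on the output path: it
 * accounts multicast/broadcast output, makes sure the skb has enough
 * headroom for the link-layer header, and hands the packet to the
 * neighbour layer (cached hardware header if available, otherwise the
 * neighbour's output function).
 */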
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	int res;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	rcu_read_lock();
	if (dst->hh) {
		int res = neigh_hh_output(dst->hh, skb);

		rcu_read_unlock();
		return res;
	} else {
		neigh = dst_get_neighbour(dst);
		if (neigh) {
			res = neigh->output(skb);

			rcu_read_unlock();
			return res;
		}
		rcu_read_unlock();
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

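/*
 * ip_finish_output() runs after the POST_ROUTING hook: packets that grew
 * a new xfrm policy are rerouted through dst_output(), over-MTU packets
 * that are not GSO are fragmented, and everything else goes straight to
 * ip_finish_output2().
 */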
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

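/*
 * ip_mc_output() is the output routine used for multicast/broadcast
 * routes: a clone is looped back through the POST_ROUTING hook so local
 * receivers get a copy (for multicast only when sk_mc_loop() allows it),
 * and the original skb then continues through ip_finish_output().
 */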
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

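/*
 * ip_queue_xmit() is the main transmit entry point for connected sockets
 * (e.g. TCP): it reuses the route already attached to the skb or cached
 * on the socket, looks one up if necessary, builds the IP header from
 * the flow and socket state, and sends via ip_local_out().
 */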
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);

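/*
 * ip_copy_metadata() propagates routing, priority, scheduling and
 * netfilter state from the original skb to a fragment so every fragment
 * is handled identically on the rest of the output path.
 */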
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

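/*
 * ip_generic_getfrag() is the getfrag callback used with ip_append_data()
 * for user iovecs: it copies the requested block and, unless the device
 * will checksum the packet (CHECKSUM_PARTIAL), folds the block's checksum
 * into skb->csum.
 */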
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

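/*
 * __ip_append_data() implements the corking machinery: data handed in by
 * the caller is accumulated on a queue of skbs, each sized so that it can
 * become one IP fragment (or one UFO super-packet) when the pending
 * frames are finally pushed out.
 */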
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

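/*
 * ip_setup_cork() captures the per-datagram state (IP options, route and
 * fragment size) in the cork, so that subsequent ip_append_data() calls
 * keep adding to the same pending datagram.
 */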
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combined all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

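/*
 * ip_send_skb() pushes a datagram assembled by __ip_make_skb() through
 * ip_local_out(), maps positive NET_XMIT_* return codes through
 * net_xmit_errno() and counts discards on failure.
 */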
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets whole the not fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(ip_hdr(skb)->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}