Initial EXPERIMENTAL implementation of device-mapper thin provisioning
[linux-2.6/next.git] / net / ipv4 / ip_output.c
blob77d3eded665ad2e30c03f8e62ea30b29ad1b4bb9
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * The Internet Protocol (IP) output module.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
18 * See ip_input.c for original log
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
90 iph->check = 0;
91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 EXPORT_SYMBOL(ip_send_check);
95 int __ip_local_out(struct sk_buff *skb)
97 struct iphdr *iph = ip_hdr(skb);
99 iph->tot_len = htons(skb->len);
100 ip_send_check(iph);
101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 skb_dst(skb)->dev, dst_output);
/*
 * Send a locally generated packet: run __ip_local_out() and, when it
 * returns 1, hand the packet to dst_output() here.
 *
 * Returns the dst_output() result in that case, otherwise whatever
 * __ip_local_out() returned.
 */
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
120 skb_reset_mac_header(newskb);
121 __skb_pull(newskb, skb_network_offset(newskb));
122 newskb->pkt_type = PACKET_LOOPBACK;
123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 WARN_ON(!skb_dst(newskb));
125 netif_rx_ni(newskb);
126 return 0;
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
131 int ttl = inet->uc_ttl;
133 if (ttl < 0)
134 ttl = ip4_dst_hoplimit(dst);
135 return ttl;
139 * Add an ip header to a skbuff and send it out.
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145 struct inet_sock *inet = inet_sk(sk);
146 struct rtable *rt = skb_rtable(skb);
147 struct iphdr *iph;
149 /* Build the IP header. */
150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 skb_reset_network_header(skb);
152 iph = ip_hdr(skb);
153 iph->version = 4;
154 iph->ihl = 5;
155 iph->tos = inet->tos;
156 if (ip_dont_fragment(sk, &rt->dst))
157 iph->frag_off = htons(IP_DF);
158 else
159 iph->frag_off = 0;
160 iph->ttl = ip_select_ttl(inet, &rt->dst);
161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr;
163 iph->protocol = sk->sk_protocol;
164 ip_select_ident(iph, &rt->dst, sk);
166 if (opt && opt->opt.optlen) {
167 iph->ihl += opt->opt.optlen>>2;
168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
171 skb->priority = sk->sk_priority;
172 skb->mark = sk->sk_mark;
174 /* Send it out. */
175 return ip_local_out(skb);
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
179 static inline int ip_finish_output2(struct sk_buff *skb)
181 struct dst_entry *dst = skb_dst(skb);
182 struct rtable *rt = (struct rtable *)dst;
183 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh;
187 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 } else if (rt->rt_type == RTN_BROADCAST)
190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
192 /* Be paranoid, rather than too clever. */
193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 struct sk_buff *skb2;
196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 if (skb2 == NULL) {
198 kfree_skb(skb);
199 return -ENOMEM;
201 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb);
204 skb = skb2;
207 rcu_read_lock();
208 neigh = dst_get_neighbour(dst);
209 if (neigh) {
210 int res = neigh_output(neigh, skb);
212 rcu_read_unlock();
213 return res;
215 rcu_read_unlock();
217 if (net_ratelimit())
218 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
219 kfree_skb(skb);
220 return -EINVAL;
223 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
231 static int ip_finish_output(struct sk_buff *skb)
233 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 /* Policy lookup after SNAT yielded a new policy */
235 if (skb_dst(skb)->xfrm != NULL) {
236 IPCB(skb)->flags |= IPSKB_REROUTED;
237 return dst_output(skb);
239 #endif
240 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
241 return ip_fragment(skb, ip_finish_output2);
242 else
243 return ip_finish_output2(skb);
246 int ip_mc_output(struct sk_buff *skb)
248 struct sock *sk = skb->sk;
249 struct rtable *rt = skb_rtable(skb);
250 struct net_device *dev = rt->dst.dev;
253 * If the indicated interface is up and running, send the packet.
255 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
257 skb->dev = dev;
258 skb->protocol = htons(ETH_P_IP);
261 * Multicasts are looped back for other local users
264 if (rt->rt_flags&RTCF_MULTICAST) {
265 if (sk_mc_loop(sk)
266 #ifdef CONFIG_IP_MROUTE
267 /* Small optimization: do not loopback not local frames,
268 which returned after forwarding; they will be dropped
269 by ip_mr_input in any case.
270 Note, that local frames are looped back to be delivered
271 to local recipients.
273 This check is duplicated in ip_mr_input at the moment.
276 ((rt->rt_flags & RTCF_LOCAL) ||
277 !(IPCB(skb)->flags & IPSKB_FORWARDED))
278 #endif
280 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit);
287 /* Multicasts with ttl 0 must not go beyond the host */
289 if (ip_hdr(skb)->ttl == 0) {
290 kfree_skb(skb);
291 return 0;
295 if (rt->rt_flags&RTCF_BROADCAST) {
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit);
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 skb->dev, ip_finish_output,
304 !(IPCB(skb)->flags & IPSKB_REROUTED));
307 int ip_output(struct sk_buff *skb)
309 struct net_device *dev = skb_dst(skb)->dev;
311 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
313 skb->dev = dev;
314 skb->protocol = htons(ETH_P_IP);
316 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
317 ip_finish_output,
318 !(IPCB(skb)->flags & IPSKB_REROUTED));
321 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
323 struct sock *sk = skb->sk;
324 struct inet_sock *inet = inet_sk(sk);
325 struct ip_options_rcu *inet_opt;
326 struct flowi4 *fl4;
327 struct rtable *rt;
328 struct iphdr *iph;
329 int res;
331 /* Skip all of this if the packet is already routed,
332 * f.e. by something like SCTP.
334 rcu_read_lock();
335 inet_opt = rcu_dereference(inet->inet_opt);
336 fl4 = &fl->u.ip4;
337 rt = skb_rtable(skb);
338 if (rt != NULL)
339 goto packet_routed;
341 /* Make sure we can route this packet. */
342 rt = (struct rtable *)__sk_dst_check(sk, 0);
343 if (rt == NULL) {
344 __be32 daddr;
346 /* Use correct destination address if we have options. */
347 daddr = inet->inet_daddr;
348 if (inet_opt && inet_opt->opt.srr)
349 daddr = inet_opt->opt.faddr;
351 /* If this fails, retransmit mechanism of transport layer will
352 * keep trying until route appears or the connection times
353 * itself out.
355 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
356 daddr, inet->inet_saddr,
357 inet->inet_dport,
358 inet->inet_sport,
359 sk->sk_protocol,
360 RT_CONN_FLAGS(sk),
361 sk->sk_bound_dev_if);
362 if (IS_ERR(rt))
363 goto no_route;
364 sk_setup_caps(sk, &rt->dst);
366 skb_dst_set_noref(skb, &rt->dst);
368 packet_routed:
369 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
370 goto no_route;
372 /* OK, we know where to send it, allocate and build IP header. */
373 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
374 skb_reset_network_header(skb);
375 iph = ip_hdr(skb);
376 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
377 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
378 iph->frag_off = htons(IP_DF);
379 else
380 iph->frag_off = 0;
381 iph->ttl = ip_select_ttl(inet, &rt->dst);
382 iph->protocol = sk->sk_protocol;
383 iph->saddr = fl4->saddr;
384 iph->daddr = fl4->daddr;
385 /* Transport layer set skb->h.foo itself. */
387 if (inet_opt && inet_opt->opt.optlen) {
388 iph->ihl += inet_opt->opt.optlen >> 2;
389 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
392 ip_select_ident_more(iph, &rt->dst, sk,
393 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
395 skb->priority = sk->sk_priority;
396 skb->mark = sk->sk_mark;
398 res = ip_local_out(skb);
399 rcu_read_unlock();
400 return res;
402 no_route:
403 rcu_read_unlock();
404 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
405 kfree_skb(skb);
406 return -EHOSTUNREACH;
408 EXPORT_SYMBOL(ip_queue_xmit);
411 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
413 to->pkt_type = from->pkt_type;
414 to->priority = from->priority;
415 to->protocol = from->protocol;
416 skb_dst_drop(to);
417 skb_dst_copy(to, from);
418 to->dev = from->dev;
419 to->mark = from->mark;
421 /* Copy the flags to each fragment. */
422 IPCB(to)->flags = IPCB(from)->flags;
424 #ifdef CONFIG_NET_SCHED
425 to->tc_index = from->tc_index;
426 #endif
427 nf_copy(to, from);
428 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
429 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
430 to->nf_trace = from->nf_trace;
431 #endif
432 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
433 to->ipvs_property = from->ipvs_property;
434 #endif
435 skb_copy_secmark(to, from);
439 * This IP datagram is too large to be sent in one piece. Break it up into
440 * smaller pieces (each of size equal to IP header plus
441 * a block of the data of the original IP data part) that will yet fit in a
442 * single device frame, and queue such a frame for sending.
445 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
447 struct iphdr *iph;
448 int ptr;
449 struct net_device *dev;
450 struct sk_buff *skb2;
451 unsigned int mtu, hlen, left, len, ll_rs;
452 int offset;
453 __be16 not_last_frag;
454 struct rtable *rt = skb_rtable(skb);
455 int err = 0;
457 dev = rt->dst.dev;
460 * Point into the IP datagram header.
463 iph = ip_hdr(skb);
465 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
466 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
467 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
468 htonl(ip_skb_dst_mtu(skb)));
469 kfree_skb(skb);
470 return -EMSGSIZE;
474 * Setup starting values.
477 hlen = iph->ihl * 4;
478 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
479 #ifdef CONFIG_BRIDGE_NETFILTER
480 if (skb->nf_bridge)
481 mtu -= nf_bridge_mtu_reduction(skb);
482 #endif
483 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
485 /* When frag_list is given, use it. First, check its validity:
486 * some transformers could create wrong frag_list or break existing
487 * one, it is not prohibited. In this case fall back to copying.
489 * LATER: this step can be merged to real generation of fragments,
490 * we can switch to copy when see the first bad fragment.
492 if (skb_has_frag_list(skb)) {
493 struct sk_buff *frag, *frag2;
494 int first_len = skb_pagelen(skb);
496 if (first_len - hlen > mtu ||
497 ((first_len - hlen) & 7) ||
498 ip_is_fragment(iph) ||
499 skb_cloned(skb))
500 goto slow_path;
502 skb_walk_frags(skb, frag) {
503 /* Correct geometry. */
504 if (frag->len > mtu ||
505 ((frag->len & 7) && frag->next) ||
506 skb_headroom(frag) < hlen)
507 goto slow_path_clean;
509 /* Partially cloned skb? */
510 if (skb_shared(frag))
511 goto slow_path_clean;
513 BUG_ON(frag->sk);
514 if (skb->sk) {
515 frag->sk = skb->sk;
516 frag->destructor = sock_wfree;
518 skb->truesize -= frag->truesize;
521 /* Everything is OK. Generate! */
523 err = 0;
524 offset = 0;
525 frag = skb_shinfo(skb)->frag_list;
526 skb_frag_list_init(skb);
527 skb->data_len = first_len - skb_headlen(skb);
528 skb->len = first_len;
529 iph->tot_len = htons(first_len);
530 iph->frag_off = htons(IP_MF);
531 ip_send_check(iph);
533 for (;;) {
534 /* Prepare header of the next frame,
535 * before previous one went down. */
536 if (frag) {
537 frag->ip_summed = CHECKSUM_NONE;
538 skb_reset_transport_header(frag);
539 __skb_push(frag, hlen);
540 skb_reset_network_header(frag);
541 memcpy(skb_network_header(frag), iph, hlen);
542 iph = ip_hdr(frag);
543 iph->tot_len = htons(frag->len);
544 ip_copy_metadata(frag, skb);
545 if (offset == 0)
546 ip_options_fragment(frag);
547 offset += skb->len - hlen;
548 iph->frag_off = htons(offset>>3);
549 if (frag->next != NULL)
550 iph->frag_off |= htons(IP_MF);
551 /* Ready, complete checksum */
552 ip_send_check(iph);
555 err = output(skb);
557 if (!err)
558 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
559 if (err || !frag)
560 break;
562 skb = frag;
563 frag = skb->next;
564 skb->next = NULL;
567 if (err == 0) {
568 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
569 return 0;
572 while (frag) {
573 skb = frag->next;
574 kfree_skb(frag);
575 frag = skb;
577 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
578 return err;
580 slow_path_clean:
581 skb_walk_frags(skb, frag2) {
582 if (frag2 == frag)
583 break;
584 frag2->sk = NULL;
585 frag2->destructor = NULL;
586 skb->truesize += frag2->truesize;
590 slow_path:
591 left = skb->len - hlen; /* Space per frame */
592 ptr = hlen; /* Where to start from */
594 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
595 * we need to make room for the encapsulating header
597 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
600 * Fragment the datagram.
603 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
604 not_last_frag = iph->frag_off & htons(IP_MF);
607 * Keep copying data until we run out.
610 while (left > 0) {
611 len = left;
612 /* IF: it doesn't fit, use 'mtu' - the data space left */
613 if (len > mtu)
614 len = mtu;
615 /* IF: we are not sending up to and including the packet end
616 then align the next start on an eight byte boundary */
617 if (len < left) {
618 len &= ~7;
621 * Allocate buffer.
624 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
625 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
626 err = -ENOMEM;
627 goto fail;
631 * Set up data on packet
634 ip_copy_metadata(skb2, skb);
635 skb_reserve(skb2, ll_rs);
636 skb_put(skb2, len + hlen);
637 skb_reset_network_header(skb2);
638 skb2->transport_header = skb2->network_header + hlen;
641 * Charge the memory for the fragment to any owner
642 * it might possess
645 if (skb->sk)
646 skb_set_owner_w(skb2, skb->sk);
649 * Copy the packet header into the new buffer.
652 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
655 * Copy a block of the IP datagram.
657 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
658 BUG();
659 left -= len;
662 * Fill in the new header fields.
664 iph = ip_hdr(skb2);
665 iph->frag_off = htons((offset >> 3));
667 /* ANK: dirty, but effective trick. Upgrade options only if
668 * the segment to be fragmented was THE FIRST (otherwise,
669 * options are already fixed) and make it ONCE
670 * on the initial skb, so that all the following fragments
671 * will inherit fixed options.
673 if (offset == 0)
674 ip_options_fragment(skb);
677 * Added AC : If we are fragmenting a fragment that's not the
678 * last fragment then keep MF on each bit
680 if (left > 0 || not_last_frag)
681 iph->frag_off |= htons(IP_MF);
682 ptr += len;
683 offset += len;
686 * Put this fragment into the sending queue.
688 iph->tot_len = htons(len + hlen);
690 ip_send_check(iph);
692 err = output(skb2);
693 if (err)
694 goto fail;
696 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
698 kfree_skb(skb);
699 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
700 return err;
702 fail:
703 kfree_skb(skb);
704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
705 return err;
707 EXPORT_SYMBOL(ip_fragment);
710 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
712 struct iovec *iov = from;
714 if (skb->ip_summed == CHECKSUM_PARTIAL) {
715 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
716 return -EFAULT;
717 } else {
718 __wsum csum = 0;
719 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
720 return -EFAULT;
721 skb->csum = csum_block_add(skb->csum, csum, odd);
723 return 0;
725 EXPORT_SYMBOL(ip_generic_getfrag);
727 static inline __wsum
728 csum_page(struct page *page, int offset, int copy)
730 char *kaddr;
731 __wsum csum;
732 kaddr = kmap(page);
733 csum = csum_partial(kaddr + offset, copy, 0);
734 kunmap(page);
735 return csum;
738 static inline int ip_ufo_append_data(struct sock *sk,
739 struct sk_buff_head *queue,
740 int getfrag(void *from, char *to, int offset, int len,
741 int odd, struct sk_buff *skb),
742 void *from, int length, int hh_len, int fragheaderlen,
743 int transhdrlen, int maxfraglen, unsigned int flags)
745 struct sk_buff *skb;
746 int err;
748 /* There is support for UDP fragmentation offload by network
749 * device, so create one single skb packet containing complete
750 * udp datagram
752 if ((skb = skb_peek_tail(queue)) == NULL) {
753 skb = sock_alloc_send_skb(sk,
754 hh_len + fragheaderlen + transhdrlen + 20,
755 (flags & MSG_DONTWAIT), &err);
757 if (skb == NULL)
758 return err;
760 /* reserve space for Hardware header */
761 skb_reserve(skb, hh_len);
763 /* create space for UDP/IP header */
764 skb_put(skb, fragheaderlen + transhdrlen);
766 /* initialize network header pointer */
767 skb_reset_network_header(skb);
769 /* initialize protocol header pointer */
770 skb->transport_header = skb->network_header + fragheaderlen;
772 skb->ip_summed = CHECKSUM_PARTIAL;
773 skb->csum = 0;
775 /* specify the length of each IP datagram fragment */
776 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
777 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
778 __skb_queue_tail(queue, skb);
781 return skb_append_datato_frags(sk, skb, getfrag, from,
782 (length - transhdrlen));
785 static int __ip_append_data(struct sock *sk,
786 struct flowi4 *fl4,
787 struct sk_buff_head *queue,
788 struct inet_cork *cork,
789 int getfrag(void *from, char *to, int offset,
790 int len, int odd, struct sk_buff *skb),
791 void *from, int length, int transhdrlen,
792 unsigned int flags)
794 struct inet_sock *inet = inet_sk(sk);
795 struct sk_buff *skb;
797 struct ip_options *opt = cork->opt;
798 int hh_len;
799 int exthdrlen;
800 int mtu;
801 int copy;
802 int err;
803 int offset = 0;
804 unsigned int maxfraglen, fragheaderlen;
805 int csummode = CHECKSUM_NONE;
806 struct rtable *rt = (struct rtable *)cork->dst;
808 skb = skb_peek_tail(queue);
810 exthdrlen = !skb ? rt->dst.header_len : 0;
811 mtu = cork->fragsize;
813 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
815 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
816 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
818 if (cork->length + length > 0xFFFF - fragheaderlen) {
819 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
820 mtu-exthdrlen);
821 return -EMSGSIZE;
825 * transhdrlen > 0 means that this is the first fragment and we wish
826 * it won't be fragmented in the future.
828 if (transhdrlen &&
829 length + fragheaderlen <= mtu &&
830 rt->dst.dev->features & NETIF_F_V4_CSUM &&
831 !exthdrlen)
832 csummode = CHECKSUM_PARTIAL;
834 cork->length += length;
835 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
836 (sk->sk_protocol == IPPROTO_UDP) &&
837 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
838 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
839 hh_len, fragheaderlen, transhdrlen,
840 maxfraglen, flags);
841 if (err)
842 goto error;
843 return 0;
846 /* So, what's going on in the loop below?
848 * We use calculated fragment length to generate chained skb,
849 * each of segments is IP fragment ready for sending to network after
850 * adding appropriate IP header.
853 if (!skb)
854 goto alloc_new_skb;
856 while (length > 0) {
857 /* Check if the remaining data fits into current packet. */
858 copy = mtu - skb->len;
859 if (copy < length)
860 copy = maxfraglen - skb->len;
861 if (copy <= 0) {
862 char *data;
863 unsigned int datalen;
864 unsigned int fraglen;
865 unsigned int fraggap;
866 unsigned int alloclen;
867 struct sk_buff *skb_prev;
868 alloc_new_skb:
869 skb_prev = skb;
870 if (skb_prev)
871 fraggap = skb_prev->len - maxfraglen;
872 else
873 fraggap = 0;
876 * If remaining data exceeds the mtu,
877 * we know we need more fragment(s).
879 datalen = length + fraggap;
880 if (datalen > mtu - fragheaderlen)
881 datalen = maxfraglen - fragheaderlen;
882 fraglen = datalen + fragheaderlen;
884 if ((flags & MSG_MORE) &&
885 !(rt->dst.dev->features&NETIF_F_SG))
886 alloclen = mtu;
887 else
888 alloclen = fraglen;
890 alloclen += exthdrlen;
892 /* The last fragment gets additional space at tail.
893 * Note, with MSG_MORE we overallocate on fragments,
894 * because we have no idea what fragment will be
895 * the last.
897 if (datalen == length + fraggap)
898 alloclen += rt->dst.trailer_len;
900 if (transhdrlen) {
901 skb = sock_alloc_send_skb(sk,
902 alloclen + hh_len + 15,
903 (flags & MSG_DONTWAIT), &err);
904 } else {
905 skb = NULL;
906 if (atomic_read(&sk->sk_wmem_alloc) <=
907 2 * sk->sk_sndbuf)
908 skb = sock_wmalloc(sk,
909 alloclen + hh_len + 15, 1,
910 sk->sk_allocation);
911 if (unlikely(skb == NULL))
912 err = -ENOBUFS;
913 else
914 /* only the initial fragment is
915 time stamped */
916 cork->tx_flags = 0;
918 if (skb == NULL)
919 goto error;
922 * Fill in the control structures
924 skb->ip_summed = csummode;
925 skb->csum = 0;
926 skb_reserve(skb, hh_len);
927 skb_shinfo(skb)->tx_flags = cork->tx_flags;
930 * Find where to start putting bytes.
932 data = skb_put(skb, fraglen + exthdrlen);
933 skb_set_network_header(skb, exthdrlen);
934 skb->transport_header = (skb->network_header +
935 fragheaderlen);
936 data += fragheaderlen + exthdrlen;
938 if (fraggap) {
939 skb->csum = skb_copy_and_csum_bits(
940 skb_prev, maxfraglen,
941 data + transhdrlen, fraggap, 0);
942 skb_prev->csum = csum_sub(skb_prev->csum,
943 skb->csum);
944 data += fraggap;
945 pskb_trim_unique(skb_prev, maxfraglen);
948 copy = datalen - transhdrlen - fraggap;
949 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
950 err = -EFAULT;
951 kfree_skb(skb);
952 goto error;
955 offset += copy;
956 length -= datalen - fraggap;
957 transhdrlen = 0;
958 exthdrlen = 0;
959 csummode = CHECKSUM_NONE;
962 * Put the packet on the pending queue.
964 __skb_queue_tail(queue, skb);
965 continue;
968 if (copy > length)
969 copy = length;
971 if (!(rt->dst.dev->features&NETIF_F_SG)) {
972 unsigned int off;
974 off = skb->len;
975 if (getfrag(from, skb_put(skb, copy),
976 offset, copy, off, skb) < 0) {
977 __skb_trim(skb, off);
978 err = -EFAULT;
979 goto error;
981 } else {
982 int i = skb_shinfo(skb)->nr_frags;
983 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
984 struct page *page = cork->page;
985 int off = cork->off;
986 unsigned int left;
988 if (page && (left = PAGE_SIZE - off) > 0) {
989 if (copy >= left)
990 copy = left;
991 if (page != frag->page) {
992 if (i == MAX_SKB_FRAGS) {
993 err = -EMSGSIZE;
994 goto error;
996 get_page(page);
997 skb_fill_page_desc(skb, i, page, off, 0);
998 frag = &skb_shinfo(skb)->frags[i];
1000 } else if (i < MAX_SKB_FRAGS) {
1001 if (copy > PAGE_SIZE)
1002 copy = PAGE_SIZE;
1003 page = alloc_pages(sk->sk_allocation, 0);
1004 if (page == NULL) {
1005 err = -ENOMEM;
1006 goto error;
1008 cork->page = page;
1009 cork->off = 0;
1011 skb_fill_page_desc(skb, i, page, 0, 0);
1012 frag = &skb_shinfo(skb)->frags[i];
1013 } else {
1014 err = -EMSGSIZE;
1015 goto error;
1017 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1018 err = -EFAULT;
1019 goto error;
1021 cork->off += copy;
1022 frag->size += copy;
1023 skb->len += copy;
1024 skb->data_len += copy;
1025 skb->truesize += copy;
1026 atomic_add(copy, &sk->sk_wmem_alloc);
1028 offset += copy;
1029 length -= copy;
1032 return 0;
1034 error:
1035 cork->length -= length;
1036 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1037 return err;
1040 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1041 struct ipcm_cookie *ipc, struct rtable **rtp)
1043 struct inet_sock *inet = inet_sk(sk);
1044 struct ip_options_rcu *opt;
1045 struct rtable *rt;
1048 * setup for corking.
1050 opt = ipc->opt;
1051 if (opt) {
1052 if (cork->opt == NULL) {
1053 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1054 sk->sk_allocation);
1055 if (unlikely(cork->opt == NULL))
1056 return -ENOBUFS;
1058 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1059 cork->flags |= IPCORK_OPT;
1060 cork->addr = ipc->addr;
1062 rt = *rtp;
1063 if (unlikely(!rt))
1064 return -EFAULT;
1066 * We steal reference to this route, caller should not release it
1068 *rtp = NULL;
1069 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1070 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1071 cork->dst = &rt->dst;
1072 cork->length = 0;
1073 cork->tx_flags = ipc->tx_flags;
1074 cork->page = NULL;
1075 cork->off = 0;
1077 return 0;
1081 * ip_append_data() and ip_append_page() can make one large IP datagram
1082 * from many pieces of data. Each pieces will be holded on the socket
1083 * until ip_push_pending_frames() is called. Each piece can be a page
1084 * or non-page data.
1086 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1087 * this interface potentially.
1089 * LATER: length must be adjusted by pad at tail, when it is required.
1091 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1092 int getfrag(void *from, char *to, int offset, int len,
1093 int odd, struct sk_buff *skb),
1094 void *from, int length, int transhdrlen,
1095 struct ipcm_cookie *ipc, struct rtable **rtp,
1096 unsigned int flags)
1098 struct inet_sock *inet = inet_sk(sk);
1099 int err;
1101 if (flags&MSG_PROBE)
1102 return 0;
1104 if (skb_queue_empty(&sk->sk_write_queue)) {
1105 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1106 if (err)
1107 return err;
1108 } else {
1109 transhdrlen = 0;
1112 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1113 from, length, transhdrlen, flags);
1116 ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1117 int offset, size_t size, int flags)
1119 struct inet_sock *inet = inet_sk(sk);
1120 struct sk_buff *skb;
1121 struct rtable *rt;
1122 struct ip_options *opt = NULL;
1123 struct inet_cork *cork;
1124 int hh_len;
1125 int mtu;
1126 int len;
1127 int err;
1128 unsigned int maxfraglen, fragheaderlen, fraggap;
1130 if (inet->hdrincl)
1131 return -EPERM;
1133 if (flags&MSG_PROBE)
1134 return 0;
1136 if (skb_queue_empty(&sk->sk_write_queue))
1137 return -EINVAL;
1139 cork = &inet->cork.base;
1140 rt = (struct rtable *)cork->dst;
1141 if (cork->flags & IPCORK_OPT)
1142 opt = cork->opt;
1144 if (!(rt->dst.dev->features&NETIF_F_SG))
1145 return -EOPNOTSUPP;
1147 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1148 mtu = cork->fragsize;
1150 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1151 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1153 if (cork->length + size > 0xFFFF - fragheaderlen) {
1154 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1155 return -EMSGSIZE;
1158 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1159 return -EINVAL;
1161 cork->length += size;
1162 if ((size + skb->len > mtu) &&
1163 (sk->sk_protocol == IPPROTO_UDP) &&
1164 (rt->dst.dev->features & NETIF_F_UFO)) {
1165 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1166 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1170 while (size > 0) {
1171 int i;
1173 if (skb_is_gso(skb))
1174 len = size;
1175 else {
1177 /* Check if the remaining data fits into current packet. */
1178 len = mtu - skb->len;
1179 if (len < size)
1180 len = maxfraglen - skb->len;
1182 if (len <= 0) {
1183 struct sk_buff *skb_prev;
1184 int alloclen;
1186 skb_prev = skb;
1187 fraggap = skb_prev->len - maxfraglen;
1189 alloclen = fragheaderlen + hh_len + fraggap + 15;
1190 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1191 if (unlikely(!skb)) {
1192 err = -ENOBUFS;
1193 goto error;
1197 * Fill in the control structures
1199 skb->ip_summed = CHECKSUM_NONE;
1200 skb->csum = 0;
1201 skb_reserve(skb, hh_len);
1204 * Find where to start putting bytes.
1206 skb_put(skb, fragheaderlen + fraggap);
1207 skb_reset_network_header(skb);
1208 skb->transport_header = (skb->network_header +
1209 fragheaderlen);
1210 if (fraggap) {
1211 skb->csum = skb_copy_and_csum_bits(skb_prev,
1212 maxfraglen,
1213 skb_transport_header(skb),
1214 fraggap, 0);
1215 skb_prev->csum = csum_sub(skb_prev->csum,
1216 skb->csum);
1217 pskb_trim_unique(skb_prev, maxfraglen);
1221 * Put the packet on the pending queue.
1223 __skb_queue_tail(&sk->sk_write_queue, skb);
1224 continue;
1227 i = skb_shinfo(skb)->nr_frags;
1228 if (len > size)
1229 len = size;
1230 if (skb_can_coalesce(skb, i, page, offset)) {
1231 skb_shinfo(skb)->frags[i-1].size += len;
1232 } else if (i < MAX_SKB_FRAGS) {
1233 get_page(page);
1234 skb_fill_page_desc(skb, i, page, offset, len);
1235 } else {
1236 err = -EMSGSIZE;
1237 goto error;
1240 if (skb->ip_summed == CHECKSUM_NONE) {
1241 __wsum csum;
1242 csum = csum_page(page, offset, len);
1243 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1246 skb->len += len;
1247 skb->data_len += len;
1248 skb->truesize += len;
1249 atomic_add(len, &sk->sk_wmem_alloc);
1250 offset += len;
1251 size -= len;
1253 return 0;
1255 error:
1256 cork->length -= size;
1257 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1258 return err;
1261 static void ip_cork_release(struct inet_cork *cork)
1263 cork->flags &= ~IPCORK_OPT;
1264 kfree(cork->opt);
1265 cork->opt = NULL;
1266 dst_release(cork->dst);
1267 cork->dst = NULL;
1271 * Combined all pending IP fragments on the socket as one IP datagram
1272 * and push them out.
1274 struct sk_buff *__ip_make_skb(struct sock *sk,
1275 struct flowi4 *fl4,
1276 struct sk_buff_head *queue,
1277 struct inet_cork *cork)
1279 struct sk_buff *skb, *tmp_skb;
1280 struct sk_buff **tail_skb;
1281 struct inet_sock *inet = inet_sk(sk);
1282 struct net *net = sock_net(sk);
1283 struct ip_options *opt = NULL;
1284 struct rtable *rt = (struct rtable *)cork->dst;
1285 struct iphdr *iph;
1286 __be16 df = 0;
1287 __u8 ttl;
1289 if ((skb = __skb_dequeue(queue)) == NULL)
1290 goto out;
1291 tail_skb = &(skb_shinfo(skb)->frag_list);
1293 /* move skb->data to ip header from ext header */
1294 if (skb->data < skb_network_header(skb))
1295 __skb_pull(skb, skb_network_offset(skb));
1296 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1297 __skb_pull(tmp_skb, skb_network_header_len(skb));
1298 *tail_skb = tmp_skb;
1299 tail_skb = &(tmp_skb->next);
1300 skb->len += tmp_skb->len;
1301 skb->data_len += tmp_skb->len;
1302 skb->truesize += tmp_skb->truesize;
1303 tmp_skb->destructor = NULL;
1304 tmp_skb->sk = NULL;
1307 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1308 * to fragment the frame generated here. No matter, what transforms
1309 * how transforms change size of the packet, it will come out.
1311 if (inet->pmtudisc < IP_PMTUDISC_DO)
1312 skb->local_df = 1;
1314 /* DF bit is set when we want to see DF on outgoing frames.
1315 * If local_df is set too, we still allow to fragment this frame
1316 * locally. */
1317 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1318 (skb->len <= dst_mtu(&rt->dst) &&
1319 ip_dont_fragment(sk, &rt->dst)))
1320 df = htons(IP_DF);
1322 if (cork->flags & IPCORK_OPT)
1323 opt = cork->opt;
1325 if (rt->rt_type == RTN_MULTICAST)
1326 ttl = inet->mc_ttl;
1327 else
1328 ttl = ip_select_ttl(inet, &rt->dst);
1330 iph = (struct iphdr *)skb->data;
1331 iph->version = 4;
1332 iph->ihl = 5;
1333 iph->tos = inet->tos;
1334 iph->frag_off = df;
1335 ip_select_ident(iph, &rt->dst, sk);
1336 iph->ttl = ttl;
1337 iph->protocol = sk->sk_protocol;
1338 iph->saddr = fl4->saddr;
1339 iph->daddr = fl4->daddr;
1341 if (opt) {
1342 iph->ihl += opt->optlen>>2;
1343 ip_options_build(skb, opt, cork->addr, rt, 0);
1346 skb->priority = sk->sk_priority;
1347 skb->mark = sk->sk_mark;
1349 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1350 * on dst refcount
1352 cork->dst = NULL;
1353 skb_dst_set(skb, &rt->dst);
1355 if (iph->protocol == IPPROTO_ICMP)
1356 icmp_out_count(net, ((struct icmphdr *)
1357 skb_transport_header(skb))->type);
1359 ip_cork_release(cork);
1360 out:
1361 return skb;
1364 int ip_send_skb(struct sk_buff *skb)
1366 struct net *net = sock_net(skb->sk);
1367 int err;
1369 err = ip_local_out(skb);
1370 if (err) {
1371 if (err > 0)
1372 err = net_xmit_errno(err);
1373 if (err)
1374 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1377 return err;
/*
 *	Turn the frames pending on the socket into one skb and send it.
 *
 *	Returns 0 when ip_finish_skb() yields no skb, otherwise the result
 *	of ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented skb. */
	return skb ? ip_send_skb(skb) : 0;
}
/*
 *	Throw away all pending data on @queue and release @cork.
 *
 *	Skbs are removed from the tail of the queue and freed one by one.
 *	@sk is not used in this function.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	for (skb = __skb_dequeue_tail(queue); skb;
	     skb = __skb_dequeue_tail(queue))
		kfree_skb(skb);

	ip_cork_release(cork);
}
1407 void ip_flush_pending_frames(struct sock *sk)
1409 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1412 struct sk_buff *ip_make_skb(struct sock *sk,
1413 struct flowi4 *fl4,
1414 int getfrag(void *from, char *to, int offset,
1415 int len, int odd, struct sk_buff *skb),
1416 void *from, int length, int transhdrlen,
1417 struct ipcm_cookie *ipc, struct rtable **rtp,
1418 unsigned int flags)
1420 struct inet_cork cork;
1421 struct sk_buff_head queue;
1422 int err;
1424 if (flags & MSG_PROBE)
1425 return NULL;
1427 __skb_queue_head_init(&queue);
1429 cork.flags = 0;
1430 cork.addr = 0;
1431 cork.opt = NULL;
1432 err = ip_setup_cork(sk, &cork, ipc, rtp);
1433 if (err)
1434 return ERR_PTR(err);
1436 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1437 from, length, transhdrlen, flags);
1438 if (err) {
1439 __ip_flush_pending_frames(sk, &queue, &cork);
1440 return ERR_PTR(err);
1443 return __ip_make_skb(sk, fl4, &queue, &cork);
1447 * Fetch data from kernel space and fill in checksum if needed.
1449 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1450 int len, int odd, struct sk_buff *skb)
1452 __wsum csum;
1454 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1455 skb->csum = csum_block_add(skb->csum, csum, odd);
1456 return 0;
1460 * Generic function to send a packet as reply to another packet.
1461 * Used to send TCP resets so far. ICMP should use this function too.
1463 * Should run single threaded per socket because it uses the sock
1464 * structure to pass arguments.
1466 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1467 struct ip_reply_arg *arg, unsigned int len)
1469 struct inet_sock *inet = inet_sk(sk);
1470 struct ip_options_data replyopts;
1471 struct ipcm_cookie ipc;
1472 struct flowi4 fl4;
1473 struct rtable *rt = skb_rtable(skb);
1475 if (ip_options_echo(&replyopts.opt.opt, skb))
1476 return;
1478 ipc.addr = daddr;
1479 ipc.opt = NULL;
1480 ipc.tx_flags = 0;
1482 if (replyopts.opt.opt.optlen) {
1483 ipc.opt = &replyopts.opt;
1485 if (replyopts.opt.opt.srr)
1486 daddr = replyopts.opt.opt.faddr;
1489 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1490 RT_TOS(ip_hdr(skb)->tos),
1491 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1492 ip_reply_arg_flowi_flags(arg),
1493 daddr, rt->rt_spec_dst,
1494 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1495 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1496 rt = ip_route_output_key(sock_net(sk), &fl4);
1497 if (IS_ERR(rt))
1498 return;
1500 /* And let IP do all the hard work.
1502 This chunk is not reenterable, hence spinlock.
1503 Note that it uses the fact, that this function is called
1504 with locally disabled BH and that sk cannot be already spinlocked.
1506 bh_lock_sock(sk);
1507 inet->tos = ip_hdr(skb)->tos;
1508 sk->sk_priority = skb->priority;
1509 sk->sk_protocol = ip_hdr(skb)->protocol;
1510 sk->sk_bound_dev_if = arg->bound_dev_if;
1511 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1512 &ipc, &rt, MSG_DONTWAIT);
1513 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1514 if (arg->csumoffset >= 0)
1515 *((__sum16 *)skb_transport_header(skb) +
1516 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1517 arg->csum));
1518 skb->ip_summed = CHECKSUM_NONE;
1519 ip_push_pending_frames(sk, &fl4);
1522 bh_unlock_sock(sk);
1524 ip_rt_put(rt);
1527 void __init ip_init(void)
1529 ip_rt_init();
1530 inet_initpeers();
1532 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1533 igmp_mc_proc_init();
1534 #endif