2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * The Internet Protocol (IP) output module.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
18 * See ip_input.c for original log
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
42 * Hirokazu Takahashi: sendfile() on UDP works now.
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
71 #include <linux/skbuff.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
84 int sysctl_ip_default_ttl __read_mostly
= IPDEFTTL
;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl
);
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__
void ip_send_check(struct iphdr
*iph
)
91 iph
->check
= ip_fast_csum((unsigned char *)iph
, iph
->ihl
);
93 EXPORT_SYMBOL(ip_send_check
);
95 int __ip_local_out(struct sk_buff
*skb
)
97 struct iphdr
*iph
= ip_hdr(skb
);
99 iph
->tot_len
= htons(skb
->len
);
101 return nf_hook(NFPROTO_IPV4
, NF_INET_LOCAL_OUT
, skb
, NULL
,
102 skb_dst(skb
)->dev
, dst_output
);
105 int ip_local_out(struct sk_buff
*skb
)
109 err
= __ip_local_out(skb
);
110 if (likely(err
== 1))
111 err
= dst_output(skb
);
115 EXPORT_SYMBOL_GPL(ip_local_out
);
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff
*newskb
)
120 skb_reset_mac_header(newskb
);
121 __skb_pull(newskb
, skb_network_offset(newskb
));
122 newskb
->pkt_type
= PACKET_LOOPBACK
;
123 newskb
->ip_summed
= CHECKSUM_UNNECESSARY
;
124 WARN_ON(!skb_dst(newskb
));
125 skb_dst_force(newskb
);
130 static inline int ip_select_ttl(struct inet_sock
*inet
, struct dst_entry
*dst
)
132 int ttl
= inet
->uc_ttl
;
135 ttl
= ip4_dst_hoplimit(dst
);
140 * Add an ip header to a skbuff and send it out.
143 int ip_build_and_send_pkt(struct sk_buff
*skb
, struct sock
*sk
,
144 __be32 saddr
, __be32 daddr
, struct ip_options_rcu
*opt
)
146 struct inet_sock
*inet
= inet_sk(sk
);
147 struct rtable
*rt
= skb_rtable(skb
);
150 /* Build the IP header. */
151 skb_push(skb
, sizeof(struct iphdr
) + (opt
? opt
->opt
.optlen
: 0));
152 skb_reset_network_header(skb
);
156 iph
->tos
= inet
->tos
;
157 if (ip_dont_fragment(sk
, &rt
->dst
))
158 iph
->frag_off
= htons(IP_DF
);
161 iph
->ttl
= ip_select_ttl(inet
, &rt
->dst
);
162 iph
->daddr
= (opt
&& opt
->opt
.srr
? opt
->opt
.faddr
: daddr
);
164 iph
->protocol
= sk
->sk_protocol
;
165 ip_select_ident(iph
, &rt
->dst
, sk
);
167 if (opt
&& opt
->opt
.optlen
) {
168 iph
->ihl
+= opt
->opt
.optlen
>>2;
169 ip_options_build(skb
, &opt
->opt
, daddr
, rt
, 0);
172 skb
->priority
= sk
->sk_priority
;
173 skb
->mark
= sk
->sk_mark
;
176 return ip_local_out(skb
);
178 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt
);
180 static inline int ip_finish_output2(struct sk_buff
*skb
)
182 struct dst_entry
*dst
= skb_dst(skb
);
183 struct rtable
*rt
= (struct rtable
*)dst
;
184 struct net_device
*dev
= dst
->dev
;
185 unsigned int hh_len
= LL_RESERVED_SPACE(dev
);
186 struct neighbour
*neigh
;
188 if (rt
->rt_type
== RTN_MULTICAST
) {
189 IP_UPD_PO_STATS(dev_net(dev
), IPSTATS_MIB_OUTMCAST
, skb
->len
);
190 } else if (rt
->rt_type
== RTN_BROADCAST
)
191 IP_UPD_PO_STATS(dev_net(dev
), IPSTATS_MIB_OUTBCAST
, skb
->len
);
193 /* Be paranoid, rather than too clever. */
194 if (unlikely(skb_headroom(skb
) < hh_len
&& dev
->header_ops
)) {
195 struct sk_buff
*skb2
;
197 skb2
= skb_realloc_headroom(skb
, LL_RESERVED_SPACE(dev
));
203 skb_set_owner_w(skb2
, skb
->sk
);
209 neigh
= dst_get_neighbour(dst
);
211 int res
= neigh_output(neigh
, skb
);
219 printk(KERN_DEBUG
"ip_finish_output2: No header cache and no neighbour!\n");
224 static inline int ip_skb_dst_mtu(struct sk_buff
*skb
)
226 struct inet_sock
*inet
= skb
->sk
? inet_sk(skb
->sk
) : NULL
;
228 return (inet
&& inet
->pmtudisc
== IP_PMTUDISC_PROBE
) ?
229 skb_dst(skb
)->dev
->mtu
: dst_mtu(skb_dst(skb
));
232 static int ip_finish_output(struct sk_buff
*skb
)
234 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
235 /* Policy lookup after SNAT yielded a new policy */
236 if (skb_dst(skb
)->xfrm
!= NULL
) {
237 IPCB(skb
)->flags
|= IPSKB_REROUTED
;
238 return dst_output(skb
);
241 if (skb
->len
> ip_skb_dst_mtu(skb
) && !skb_is_gso(skb
))
242 return ip_fragment(skb
, ip_finish_output2
);
244 return ip_finish_output2(skb
);
247 int ip_mc_output(struct sk_buff
*skb
)
249 struct sock
*sk
= skb
->sk
;
250 struct rtable
*rt
= skb_rtable(skb
);
251 struct net_device
*dev
= rt
->dst
.dev
;
254 * If the indicated interface is up and running, send the packet.
256 IP_UPD_PO_STATS(dev_net(dev
), IPSTATS_MIB_OUT
, skb
->len
);
259 skb
->protocol
= htons(ETH_P_IP
);
262 * Multicasts are looped back for other local users
265 if (rt
->rt_flags
&RTCF_MULTICAST
) {
267 #ifdef CONFIG_IP_MROUTE
268 /* Small optimization: do not loopback not local frames,
269 which returned after forwarding; they will be dropped
270 by ip_mr_input in any case.
271 Note, that local frames are looped back to be delivered
274 This check is duplicated in ip_mr_input at the moment.
277 ((rt
->rt_flags
& RTCF_LOCAL
) ||
278 !(IPCB(skb
)->flags
& IPSKB_FORWARDED
))
281 struct sk_buff
*newskb
= skb_clone(skb
, GFP_ATOMIC
);
283 NF_HOOK(NFPROTO_IPV4
, NF_INET_POST_ROUTING
,
284 newskb
, NULL
, newskb
->dev
,
285 ip_dev_loopback_xmit
);
288 /* Multicasts with ttl 0 must not go beyond the host */
290 if (ip_hdr(skb
)->ttl
== 0) {
296 if (rt
->rt_flags
&RTCF_BROADCAST
) {
297 struct sk_buff
*newskb
= skb_clone(skb
, GFP_ATOMIC
);
299 NF_HOOK(NFPROTO_IPV4
, NF_INET_POST_ROUTING
, newskb
,
300 NULL
, newskb
->dev
, ip_dev_loopback_xmit
);
303 return NF_HOOK_COND(NFPROTO_IPV4
, NF_INET_POST_ROUTING
, skb
, NULL
,
304 skb
->dev
, ip_finish_output
,
305 !(IPCB(skb
)->flags
& IPSKB_REROUTED
));
308 int ip_output(struct sk_buff
*skb
)
310 struct net_device
*dev
= skb_dst(skb
)->dev
;
312 IP_UPD_PO_STATS(dev_net(dev
), IPSTATS_MIB_OUT
, skb
->len
);
315 skb
->protocol
= htons(ETH_P_IP
);
317 return NF_HOOK_COND(NFPROTO_IPV4
, NF_INET_POST_ROUTING
, skb
, NULL
, dev
,
319 !(IPCB(skb
)->flags
& IPSKB_REROUTED
));
322 int ip_queue_xmit(struct sk_buff
*skb
, struct flowi
*fl
)
324 struct sock
*sk
= skb
->sk
;
325 struct inet_sock
*inet
= inet_sk(sk
);
326 struct ip_options_rcu
*inet_opt
;
332 /* Skip all of this if the packet is already routed,
333 * f.e. by something like SCTP.
336 inet_opt
= rcu_dereference(inet
->inet_opt
);
338 rt
= skb_rtable(skb
);
342 /* Make sure we can route this packet. */
343 rt
= (struct rtable
*)__sk_dst_check(sk
, 0);
347 /* Use correct destination address if we have options. */
348 daddr
= inet
->inet_daddr
;
349 if (inet_opt
&& inet_opt
->opt
.srr
)
350 daddr
= inet_opt
->opt
.faddr
;
352 /* If this fails, retransmit mechanism of transport layer will
353 * keep trying until route appears or the connection times
356 rt
= ip_route_output_ports(sock_net(sk
), fl4
, sk
,
357 daddr
, inet
->inet_saddr
,
362 sk
->sk_bound_dev_if
);
365 sk_setup_caps(sk
, &rt
->dst
);
367 skb_dst_set_noref(skb
, &rt
->dst
);
370 if (inet_opt
&& inet_opt
->opt
.is_strictroute
&& fl4
->daddr
!= rt
->rt_gateway
)
373 /* OK, we know where to send it, allocate and build IP header. */
374 skb_push(skb
, sizeof(struct iphdr
) + (inet_opt
? inet_opt
->opt
.optlen
: 0));
375 skb_reset_network_header(skb
);
377 *((__be16
*)iph
) = htons((4 << 12) | (5 << 8) | (inet
->tos
& 0xff));
378 if (ip_dont_fragment(sk
, &rt
->dst
) && !skb
->local_df
)
379 iph
->frag_off
= htons(IP_DF
);
382 iph
->ttl
= ip_select_ttl(inet
, &rt
->dst
);
383 iph
->protocol
= sk
->sk_protocol
;
384 iph
->saddr
= fl4
->saddr
;
385 iph
->daddr
= fl4
->daddr
;
386 /* Transport layer set skb->h.foo itself. */
388 if (inet_opt
&& inet_opt
->opt
.optlen
) {
389 iph
->ihl
+= inet_opt
->opt
.optlen
>> 2;
390 ip_options_build(skb
, &inet_opt
->opt
, inet
->inet_daddr
, rt
, 0);
393 ip_select_ident_more(iph
, &rt
->dst
, sk
,
394 (skb_shinfo(skb
)->gso_segs
?: 1) - 1);
396 skb
->priority
= sk
->sk_priority
;
397 skb
->mark
= sk
->sk_mark
;
399 res
= ip_local_out(skb
);
405 IP_INC_STATS(sock_net(sk
), IPSTATS_MIB_OUTNOROUTES
);
407 return -EHOSTUNREACH
;
409 EXPORT_SYMBOL(ip_queue_xmit
);
412 static void ip_copy_metadata(struct sk_buff
*to
, struct sk_buff
*from
)
414 to
->pkt_type
= from
->pkt_type
;
415 to
->priority
= from
->priority
;
416 to
->protocol
= from
->protocol
;
418 skb_dst_copy(to
, from
);
420 to
->mark
= from
->mark
;
422 /* Copy the flags to each fragment. */
423 IPCB(to
)->flags
= IPCB(from
)->flags
;
425 #ifdef CONFIG_NET_SCHED
426 to
->tc_index
= from
->tc_index
;
429 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
430 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
431 to
->nf_trace
= from
->nf_trace
;
433 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
434 to
->ipvs_property
= from
->ipvs_property
;
436 skb_copy_secmark(to
, from
);
440 * This IP datagram is too large to be sent in one piece. Break it up into
441 * smaller pieces (each of size equal to IP header plus
442 * a block of the data of the original IP data part) that will yet fit in a
443 * single device frame, and queue such a frame for sending.
446 int ip_fragment(struct sk_buff
*skb
, int (*output
)(struct sk_buff
*))
450 struct net_device
*dev
;
451 struct sk_buff
*skb2
;
452 unsigned int mtu
, hlen
, left
, len
, ll_rs
;
454 __be16 not_last_frag
;
455 struct rtable
*rt
= skb_rtable(skb
);
461 * Point into the IP datagram header.
466 if (unlikely((iph
->frag_off
& htons(IP_DF
)) && !skb
->local_df
)) {
467 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGFAILS
);
468 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
,
469 htonl(ip_skb_dst_mtu(skb
)));
475 * Setup starting values.
479 mtu
= dst_mtu(&rt
->dst
) - hlen
; /* Size of data space */
480 #ifdef CONFIG_BRIDGE_NETFILTER
482 mtu
-= nf_bridge_mtu_reduction(skb
);
484 IPCB(skb
)->flags
|= IPSKB_FRAG_COMPLETE
;
486 /* When frag_list is given, use it. First, check its validity:
487 * some transformers could create wrong frag_list or break existing
488 * one, it is not prohibited. In this case fall back to copying.
490 * LATER: this step can be merged to real generation of fragments,
491 * we can switch to copy when see the first bad fragment.
493 if (skb_has_frag_list(skb
)) {
494 struct sk_buff
*frag
, *frag2
;
495 int first_len
= skb_pagelen(skb
);
497 if (first_len
- hlen
> mtu
||
498 ((first_len
- hlen
) & 7) ||
499 ip_is_fragment(iph
) ||
503 skb_walk_frags(skb
, frag
) {
504 /* Correct geometry. */
505 if (frag
->len
> mtu
||
506 ((frag
->len
& 7) && frag
->next
) ||
507 skb_headroom(frag
) < hlen
)
508 goto slow_path_clean
;
510 /* Partially cloned skb? */
511 if (skb_shared(frag
))
512 goto slow_path_clean
;
517 frag
->destructor
= sock_wfree
;
519 skb
->truesize
-= frag
->truesize
;
522 /* Everything is OK. Generate! */
526 frag
= skb_shinfo(skb
)->frag_list
;
527 skb_frag_list_init(skb
);
528 skb
->data_len
= first_len
- skb_headlen(skb
);
529 skb
->len
= first_len
;
530 iph
->tot_len
= htons(first_len
);
531 iph
->frag_off
= htons(IP_MF
);
535 /* Prepare header of the next frame,
536 * before previous one went down. */
538 frag
->ip_summed
= CHECKSUM_NONE
;
539 skb_reset_transport_header(frag
);
540 __skb_push(frag
, hlen
);
541 skb_reset_network_header(frag
);
542 memcpy(skb_network_header(frag
), iph
, hlen
);
544 iph
->tot_len
= htons(frag
->len
);
545 ip_copy_metadata(frag
, skb
);
547 ip_options_fragment(frag
);
548 offset
+= skb
->len
- hlen
;
549 iph
->frag_off
= htons(offset
>>3);
550 if (frag
->next
!= NULL
)
551 iph
->frag_off
|= htons(IP_MF
);
552 /* Ready, complete checksum */
559 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGCREATES
);
569 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGOKS
);
578 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGFAILS
);
582 skb_walk_frags(skb
, frag2
) {
586 frag2
->destructor
= NULL
;
587 skb
->truesize
+= frag2
->truesize
;
592 left
= skb
->len
- hlen
; /* Space per frame */
593 ptr
= hlen
; /* Where to start from */
595 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
596 * we need to make room for the encapsulating header
598 ll_rs
= LL_RESERVED_SPACE_EXTRA(rt
->dst
.dev
, nf_bridge_pad(skb
));
601 * Fragment the datagram.
604 offset
= (ntohs(iph
->frag_off
) & IP_OFFSET
) << 3;
605 not_last_frag
= iph
->frag_off
& htons(IP_MF
);
608 * Keep copying data until we run out.
613 /* IF: it doesn't fit, use 'mtu' - the data space left */
616 /* IF: we are not sending up to and including the packet end
617 then align the next start on an eight byte boundary */
625 if ((skb2
= alloc_skb(len
+hlen
+ll_rs
, GFP_ATOMIC
)) == NULL
) {
626 NETDEBUG(KERN_INFO
"IP: frag: no memory for new fragment!\n");
632 * Set up data on packet
635 ip_copy_metadata(skb2
, skb
);
636 skb_reserve(skb2
, ll_rs
);
637 skb_put(skb2
, len
+ hlen
);
638 skb_reset_network_header(skb2
);
639 skb2
->transport_header
= skb2
->network_header
+ hlen
;
642 * Charge the memory for the fragment to any owner
647 skb_set_owner_w(skb2
, skb
->sk
);
650 * Copy the packet header into the new buffer.
653 skb_copy_from_linear_data(skb
, skb_network_header(skb2
), hlen
);
656 * Copy a block of the IP datagram.
658 if (skb_copy_bits(skb
, ptr
, skb_transport_header(skb2
), len
))
663 * Fill in the new header fields.
666 iph
->frag_off
= htons((offset
>> 3));
668 /* ANK: dirty, but effective trick. Upgrade options only if
669 * the segment to be fragmented was THE FIRST (otherwise,
670 * options are already fixed) and make it ONCE
671 * on the initial skb, so that all the following fragments
672 * will inherit fixed options.
675 ip_options_fragment(skb
);
678 * Added AC : If we are fragmenting a fragment that's not the
679 * last fragment then keep MF on each bit
681 if (left
> 0 || not_last_frag
)
682 iph
->frag_off
|= htons(IP_MF
);
687 * Put this fragment into the sending queue.
689 iph
->tot_len
= htons(len
+ hlen
);
697 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGCREATES
);
700 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGOKS
);
705 IP_INC_STATS(dev_net(dev
), IPSTATS_MIB_FRAGFAILS
);
708 EXPORT_SYMBOL(ip_fragment
);
711 ip_generic_getfrag(void *from
, char *to
, int offset
, int len
, int odd
, struct sk_buff
*skb
)
713 struct iovec
*iov
= from
;
715 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
716 if (memcpy_fromiovecend(to
, iov
, offset
, len
) < 0)
720 if (csum_partial_copy_fromiovecend(to
, iov
, offset
, len
, &csum
) < 0)
722 skb
->csum
= csum_block_add(skb
->csum
, csum
, odd
);
726 EXPORT_SYMBOL(ip_generic_getfrag
);
729 csum_page(struct page
*page
, int offset
, int copy
)
734 csum
= csum_partial(kaddr
+ offset
, copy
, 0);
739 static inline int ip_ufo_append_data(struct sock
*sk
,
740 struct sk_buff_head
*queue
,
741 int getfrag(void *from
, char *to
, int offset
, int len
,
742 int odd
, struct sk_buff
*skb
),
743 void *from
, int length
, int hh_len
, int fragheaderlen
,
744 int transhdrlen
, int maxfraglen
, unsigned int flags
)
749 /* There is support for UDP fragmentation offload by network
750 * device, so create one single skb packet containing complete
753 if ((skb
= skb_peek_tail(queue
)) == NULL
) {
754 skb
= sock_alloc_send_skb(sk
,
755 hh_len
+ fragheaderlen
+ transhdrlen
+ 20,
756 (flags
& MSG_DONTWAIT
), &err
);
761 /* reserve space for Hardware header */
762 skb_reserve(skb
, hh_len
);
764 /* create space for UDP/IP header */
765 skb_put(skb
, fragheaderlen
+ transhdrlen
);
767 /* initialize network header pointer */
768 skb_reset_network_header(skb
);
770 /* initialize protocol header pointer */
771 skb
->transport_header
= skb
->network_header
+ fragheaderlen
;
773 skb
->ip_summed
= CHECKSUM_PARTIAL
;
776 /* specify the length of each IP datagram fragment */
777 skb_shinfo(skb
)->gso_size
= maxfraglen
- fragheaderlen
;
778 skb_shinfo(skb
)->gso_type
= SKB_GSO_UDP
;
779 __skb_queue_tail(queue
, skb
);
782 return skb_append_datato_frags(sk
, skb
, getfrag
, from
,
783 (length
- transhdrlen
));
786 static int __ip_append_data(struct sock
*sk
,
788 struct sk_buff_head
*queue
,
789 struct inet_cork
*cork
,
790 int getfrag(void *from
, char *to
, int offset
,
791 int len
, int odd
, struct sk_buff
*skb
),
792 void *from
, int length
, int transhdrlen
,
795 struct inet_sock
*inet
= inet_sk(sk
);
798 struct ip_options
*opt
= cork
->opt
;
805 unsigned int maxfraglen
, fragheaderlen
;
806 int csummode
= CHECKSUM_NONE
;
807 struct rtable
*rt
= (struct rtable
*)cork
->dst
;
809 skb
= skb_peek_tail(queue
);
811 exthdrlen
= !skb
? rt
->dst
.header_len
: 0;
812 mtu
= cork
->fragsize
;
814 hh_len
= LL_RESERVED_SPACE(rt
->dst
.dev
);
816 fragheaderlen
= sizeof(struct iphdr
) + (opt
? opt
->optlen
: 0);
817 maxfraglen
= ((mtu
- fragheaderlen
) & ~7) + fragheaderlen
;
819 if (cork
->length
+ length
> 0xFFFF - fragheaderlen
) {
820 ip_local_error(sk
, EMSGSIZE
, fl4
->daddr
, inet
->inet_dport
,
826 * transhdrlen > 0 means that this is the first fragment and we wish
827 * it won't be fragmented in the future.
830 length
+ fragheaderlen
<= mtu
&&
831 rt
->dst
.dev
->features
& NETIF_F_V4_CSUM
&&
833 csummode
= CHECKSUM_PARTIAL
;
835 cork
->length
+= length
;
836 if (((length
> mtu
) || (skb
&& skb_is_gso(skb
))) &&
837 (sk
->sk_protocol
== IPPROTO_UDP
) &&
838 (rt
->dst
.dev
->features
& NETIF_F_UFO
) && !rt
->dst
.header_len
) {
839 err
= ip_ufo_append_data(sk
, queue
, getfrag
, from
, length
,
840 hh_len
, fragheaderlen
, transhdrlen
,
847 /* So, what's going on in the loop below?
849 * We use calculated fragment length to generate chained skb,
850 * each of segments is IP fragment ready for sending to network after
851 * adding appropriate IP header.
858 /* Check if the remaining data fits into current packet. */
859 copy
= mtu
- skb
->len
;
861 copy
= maxfraglen
- skb
->len
;
864 unsigned int datalen
;
865 unsigned int fraglen
;
866 unsigned int fraggap
;
867 unsigned int alloclen
;
868 struct sk_buff
*skb_prev
;
872 fraggap
= skb_prev
->len
- maxfraglen
;
877 * If remaining data exceeds the mtu,
878 * we know we need more fragment(s).
880 datalen
= length
+ fraggap
;
881 if (datalen
> mtu
- fragheaderlen
)
882 datalen
= maxfraglen
- fragheaderlen
;
883 fraglen
= datalen
+ fragheaderlen
;
885 if ((flags
& MSG_MORE
) &&
886 !(rt
->dst
.dev
->features
&NETIF_F_SG
))
891 alloclen
+= exthdrlen
;
893 /* The last fragment gets additional space at tail.
894 * Note, with MSG_MORE we overallocate on fragments,
895 * because we have no idea what fragment will be
898 if (datalen
== length
+ fraggap
)
899 alloclen
+= rt
->dst
.trailer_len
;
902 skb
= sock_alloc_send_skb(sk
,
903 alloclen
+ hh_len
+ 15,
904 (flags
& MSG_DONTWAIT
), &err
);
907 if (atomic_read(&sk
->sk_wmem_alloc
) <=
909 skb
= sock_wmalloc(sk
,
910 alloclen
+ hh_len
+ 15, 1,
912 if (unlikely(skb
== NULL
))
915 /* only the initial fragment is
923 * Fill in the control structures
925 skb
->ip_summed
= csummode
;
927 skb_reserve(skb
, hh_len
);
928 skb_shinfo(skb
)->tx_flags
= cork
->tx_flags
;
931 * Find where to start putting bytes.
933 data
= skb_put(skb
, fraglen
+ exthdrlen
);
934 skb_set_network_header(skb
, exthdrlen
);
935 skb
->transport_header
= (skb
->network_header
+
937 data
+= fragheaderlen
+ exthdrlen
;
940 skb
->csum
= skb_copy_and_csum_bits(
941 skb_prev
, maxfraglen
,
942 data
+ transhdrlen
, fraggap
, 0);
943 skb_prev
->csum
= csum_sub(skb_prev
->csum
,
946 pskb_trim_unique(skb_prev
, maxfraglen
);
949 copy
= datalen
- transhdrlen
- fraggap
;
950 if (copy
> 0 && getfrag(from
, data
+ transhdrlen
, offset
, copy
, fraggap
, skb
) < 0) {
957 length
-= datalen
- fraggap
;
960 csummode
= CHECKSUM_NONE
;
963 * Put the packet on the pending queue.
965 __skb_queue_tail(queue
, skb
);
972 if (!(rt
->dst
.dev
->features
&NETIF_F_SG
)) {
976 if (getfrag(from
, skb_put(skb
, copy
),
977 offset
, copy
, off
, skb
) < 0) {
978 __skb_trim(skb
, off
);
983 int i
= skb_shinfo(skb
)->nr_frags
;
984 skb_frag_t
*frag
= &skb_shinfo(skb
)->frags
[i
-1];
985 struct page
*page
= cork
->page
;
989 if (page
&& (left
= PAGE_SIZE
- off
) > 0) {
992 if (page
!= skb_frag_page(frag
)) {
993 if (i
== MAX_SKB_FRAGS
) {
997 skb_fill_page_desc(skb
, i
, page
, off
, 0);
998 skb_frag_ref(skb
, i
);
999 frag
= &skb_shinfo(skb
)->frags
[i
];
1001 } else if (i
< MAX_SKB_FRAGS
) {
1002 if (copy
> PAGE_SIZE
)
1004 page
= alloc_pages(sk
->sk_allocation
, 0);
1012 skb_fill_page_desc(skb
, i
, page
, 0, 0);
1013 frag
= &skb_shinfo(skb
)->frags
[i
];
1018 if (getfrag(from
, skb_frag_address(frag
)+skb_frag_size(frag
),
1019 offset
, copy
, skb
->len
, skb
) < 0) {
1024 skb_frag_size_add(frag
, copy
);
1026 skb
->data_len
+= copy
;
1027 skb
->truesize
+= copy
;
1028 atomic_add(copy
, &sk
->sk_wmem_alloc
);
1037 cork
->length
-= length
;
1038 IP_INC_STATS(sock_net(sk
), IPSTATS_MIB_OUTDISCARDS
);
1042 static int ip_setup_cork(struct sock
*sk
, struct inet_cork
*cork
,
1043 struct ipcm_cookie
*ipc
, struct rtable
**rtp
)
1045 struct inet_sock
*inet
= inet_sk(sk
);
1046 struct ip_options_rcu
*opt
;
1050 * setup for corking.
1054 if (cork
->opt
== NULL
) {
1055 cork
->opt
= kmalloc(sizeof(struct ip_options
) + 40,
1057 if (unlikely(cork
->opt
== NULL
))
1060 memcpy(cork
->opt
, &opt
->opt
, sizeof(struct ip_options
) + opt
->opt
.optlen
);
1061 cork
->flags
|= IPCORK_OPT
;
1062 cork
->addr
= ipc
->addr
;
1068 * We steal reference to this route, caller should not release it
1071 cork
->fragsize
= inet
->pmtudisc
== IP_PMTUDISC_PROBE
?
1072 rt
->dst
.dev
->mtu
: dst_mtu(&rt
->dst
);
1073 cork
->dst
= &rt
->dst
;
1075 cork
->tx_flags
= ipc
->tx_flags
;
1083 * ip_append_data() and ip_append_page() can make one large IP datagram
1084 * from many pieces of data. Each piece will be held on the socket
1085 * until ip_push_pending_frames() is called. Each piece can be a page
1088 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1089 * this interface potentially.
1091 * LATER: length must be adjusted by pad at tail, when it is required.
1093 int ip_append_data(struct sock
*sk
, struct flowi4
*fl4
,
1094 int getfrag(void *from
, char *to
, int offset
, int len
,
1095 int odd
, struct sk_buff
*skb
),
1096 void *from
, int length
, int transhdrlen
,
1097 struct ipcm_cookie
*ipc
, struct rtable
**rtp
,
1100 struct inet_sock
*inet
= inet_sk(sk
);
1103 if (flags
&MSG_PROBE
)
1106 if (skb_queue_empty(&sk
->sk_write_queue
)) {
1107 err
= ip_setup_cork(sk
, &inet
->cork
.base
, ipc
, rtp
);
1114 return __ip_append_data(sk
, fl4
, &sk
->sk_write_queue
, &inet
->cork
.base
, getfrag
,
1115 from
, length
, transhdrlen
, flags
);
1118 ssize_t
ip_append_page(struct sock
*sk
, struct flowi4
*fl4
, struct page
*page
,
1119 int offset
, size_t size
, int flags
)
1121 struct inet_sock
*inet
= inet_sk(sk
);
1122 struct sk_buff
*skb
;
1124 struct ip_options
*opt
= NULL
;
1125 struct inet_cork
*cork
;
1130 unsigned int maxfraglen
, fragheaderlen
, fraggap
;
1135 if (flags
&MSG_PROBE
)
1138 if (skb_queue_empty(&sk
->sk_write_queue
))
1141 cork
= &inet
->cork
.base
;
1142 rt
= (struct rtable
*)cork
->dst
;
1143 if (cork
->flags
& IPCORK_OPT
)
1146 if (!(rt
->dst
.dev
->features
&NETIF_F_SG
))
1149 hh_len
= LL_RESERVED_SPACE(rt
->dst
.dev
);
1150 mtu
= cork
->fragsize
;
1152 fragheaderlen
= sizeof(struct iphdr
) + (opt
? opt
->optlen
: 0);
1153 maxfraglen
= ((mtu
- fragheaderlen
) & ~7) + fragheaderlen
;
1155 if (cork
->length
+ size
> 0xFFFF - fragheaderlen
) {
1156 ip_local_error(sk
, EMSGSIZE
, fl4
->daddr
, inet
->inet_dport
, mtu
);
1160 if ((skb
= skb_peek_tail(&sk
->sk_write_queue
)) == NULL
)
1163 cork
->length
+= size
;
1164 if ((size
+ skb
->len
> mtu
) &&
1165 (sk
->sk_protocol
== IPPROTO_UDP
) &&
1166 (rt
->dst
.dev
->features
& NETIF_F_UFO
)) {
1167 skb_shinfo(skb
)->gso_size
= mtu
- fragheaderlen
;
1168 skb_shinfo(skb
)->gso_type
= SKB_GSO_UDP
;
1175 if (skb_is_gso(skb
))
1179 /* Check if the remaining data fits into current packet. */
1180 len
= mtu
- skb
->len
;
1182 len
= maxfraglen
- skb
->len
;
1185 struct sk_buff
*skb_prev
;
1189 fraggap
= skb_prev
->len
- maxfraglen
;
1191 alloclen
= fragheaderlen
+ hh_len
+ fraggap
+ 15;
1192 skb
= sock_wmalloc(sk
, alloclen
, 1, sk
->sk_allocation
);
1193 if (unlikely(!skb
)) {
1199 * Fill in the control structures
1201 skb
->ip_summed
= CHECKSUM_NONE
;
1203 skb_reserve(skb
, hh_len
);
1206 * Find where to start putting bytes.
1208 skb_put(skb
, fragheaderlen
+ fraggap
);
1209 skb_reset_network_header(skb
);
1210 skb
->transport_header
= (skb
->network_header
+
1213 skb
->csum
= skb_copy_and_csum_bits(skb_prev
,
1215 skb_transport_header(skb
),
1217 skb_prev
->csum
= csum_sub(skb_prev
->csum
,
1219 pskb_trim_unique(skb_prev
, maxfraglen
);
1223 * Put the packet on the pending queue.
1225 __skb_queue_tail(&sk
->sk_write_queue
, skb
);
1229 i
= skb_shinfo(skb
)->nr_frags
;
1232 if (skb_can_coalesce(skb
, i
, page
, offset
)) {
1233 skb_frag_size_add(&skb_shinfo(skb
)->frags
[i
-1], len
);
1234 } else if (i
< MAX_SKB_FRAGS
) {
1236 skb_fill_page_desc(skb
, i
, page
, offset
, len
);
1242 if (skb
->ip_summed
== CHECKSUM_NONE
) {
1244 csum
= csum_page(page
, offset
, len
);
1245 skb
->csum
= csum_block_add(skb
->csum
, csum
, skb
->len
);
1249 skb
->data_len
+= len
;
1250 skb
->truesize
+= len
;
1251 atomic_add(len
, &sk
->sk_wmem_alloc
);
1258 cork
->length
-= size
;
1259 IP_INC_STATS(sock_net(sk
), IPSTATS_MIB_OUTDISCARDS
);
1263 static void ip_cork_release(struct inet_cork
*cork
)
1265 cork
->flags
&= ~IPCORK_OPT
;
1268 dst_release(cork
->dst
);
1273 * Combined all pending IP fragments on the socket as one IP datagram
1274 * and push them out.
1276 struct sk_buff
*__ip_make_skb(struct sock
*sk
,
1278 struct sk_buff_head
*queue
,
1279 struct inet_cork
*cork
)
1281 struct sk_buff
*skb
, *tmp_skb
;
1282 struct sk_buff
**tail_skb
;
1283 struct inet_sock
*inet
= inet_sk(sk
);
1284 struct net
*net
= sock_net(sk
);
1285 struct ip_options
*opt
= NULL
;
1286 struct rtable
*rt
= (struct rtable
*)cork
->dst
;
1291 if ((skb
= __skb_dequeue(queue
)) == NULL
)
1293 tail_skb
= &(skb_shinfo(skb
)->frag_list
);
1295 /* move skb->data to ip header from ext header */
1296 if (skb
->data
< skb_network_header(skb
))
1297 __skb_pull(skb
, skb_network_offset(skb
));
1298 while ((tmp_skb
= __skb_dequeue(queue
)) != NULL
) {
1299 __skb_pull(tmp_skb
, skb_network_header_len(skb
));
1300 *tail_skb
= tmp_skb
;
1301 tail_skb
= &(tmp_skb
->next
);
1302 skb
->len
+= tmp_skb
->len
;
1303 skb
->data_len
+= tmp_skb
->len
;
1304 skb
->truesize
+= tmp_skb
->truesize
;
1305 tmp_skb
->destructor
= NULL
;
1309 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1310 * to fragment the frame generated here. No matter how transforms
1311 * change the size of the packet, it will come out.
1313 if (inet
->pmtudisc
< IP_PMTUDISC_DO
)
1316 /* DF bit is set when we want to see DF on outgoing frames.
1317 * If local_df is set too, we still allow to fragment this frame
1319 if (inet
->pmtudisc
>= IP_PMTUDISC_DO
||
1320 (skb
->len
<= dst_mtu(&rt
->dst
) &&
1321 ip_dont_fragment(sk
, &rt
->dst
)))
1324 if (cork
->flags
& IPCORK_OPT
)
1327 if (rt
->rt_type
== RTN_MULTICAST
)
1330 ttl
= ip_select_ttl(inet
, &rt
->dst
);
1332 iph
= (struct iphdr
*)skb
->data
;
1335 iph
->tos
= inet
->tos
;
1337 ip_select_ident(iph
, &rt
->dst
, sk
);
1339 iph
->protocol
= sk
->sk_protocol
;
1340 iph
->saddr
= fl4
->saddr
;
1341 iph
->daddr
= fl4
->daddr
;
1344 iph
->ihl
+= opt
->optlen
>>2;
1345 ip_options_build(skb
, opt
, cork
->addr
, rt
, 0);
1348 skb
->priority
= sk
->sk_priority
;
1349 skb
->mark
= sk
->sk_mark
;
1351 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1355 skb_dst_set(skb
, &rt
->dst
);
1357 if (iph
->protocol
== IPPROTO_ICMP
)
1358 icmp_out_count(net
, ((struct icmphdr
*)
1359 skb_transport_header(skb
))->type
);
1361 ip_cork_release(cork
);
1366 int ip_send_skb(struct sk_buff
*skb
)
1368 struct net
*net
= sock_net(skb
->sk
);
1371 err
= ip_local_out(skb
);
1374 err
= net_xmit_errno(err
);
1376 IP_INC_STATS(net
, IPSTATS_MIB_OUTDISCARDS
);
1382 int ip_push_pending_frames(struct sock
*sk
, struct flowi4
*fl4
)
1384 struct sk_buff
*skb
;
1386 skb
= ip_finish_skb(sk
, fl4
);
1390 /* Netfilter gets the whole, unfragmented skb. */
1391 return ip_send_skb(skb
);
1395 * Throw away all pending data on the socket.
1397 static void __ip_flush_pending_frames(struct sock
*sk
,
1398 struct sk_buff_head
*queue
,
1399 struct inet_cork
*cork
)
1401 struct sk_buff
*skb
;
1403 while ((skb
= __skb_dequeue_tail(queue
)) != NULL
)
1406 ip_cork_release(cork
);
1409 void ip_flush_pending_frames(struct sock
*sk
)
1411 __ip_flush_pending_frames(sk
, &sk
->sk_write_queue
, &inet_sk(sk
)->cork
.base
);
1414 struct sk_buff
*ip_make_skb(struct sock
*sk
,
1416 int getfrag(void *from
, char *to
, int offset
,
1417 int len
, int odd
, struct sk_buff
*skb
),
1418 void *from
, int length
, int transhdrlen
,
1419 struct ipcm_cookie
*ipc
, struct rtable
**rtp
,
1422 struct inet_cork cork
;
1423 struct sk_buff_head queue
;
1426 if (flags
& MSG_PROBE
)
1429 __skb_queue_head_init(&queue
);
1434 err
= ip_setup_cork(sk
, &cork
, ipc
, rtp
);
1436 return ERR_PTR(err
);
1438 err
= __ip_append_data(sk
, fl4
, &queue
, &cork
, getfrag
,
1439 from
, length
, transhdrlen
, flags
);
1441 __ip_flush_pending_frames(sk
, &queue
, &cork
);
1442 return ERR_PTR(err
);
1445 return __ip_make_skb(sk
, fl4
, &queue
, &cork
);
1449 * Fetch data from kernel space and fill in checksum if needed.
1451 static int ip_reply_glue_bits(void *dptr
, char *to
, int offset
,
1452 int len
, int odd
, struct sk_buff
*skb
)
1456 csum
= csum_partial_copy_nocheck(dptr
+offset
, to
, len
, 0);
1457 skb
->csum
= csum_block_add(skb
->csum
, csum
, odd
);
1462 * Generic function to send a packet as reply to another packet.
1463 * Used to send TCP resets so far. ICMP should use this function too.
1465 * Should run single threaded per socket because it uses the sock
1466 * structure to pass arguments.
1468 void ip_send_reply(struct sock
*sk
, struct sk_buff
*skb
, __be32 daddr
,
1469 const struct ip_reply_arg
*arg
, unsigned int len
)
1471 struct inet_sock
*inet
= inet_sk(sk
);
1472 struct ip_options_data replyopts
;
1473 struct ipcm_cookie ipc
;
1475 struct rtable
*rt
= skb_rtable(skb
);
1477 if (ip_options_echo(&replyopts
.opt
.opt
, skb
))
1484 if (replyopts
.opt
.opt
.optlen
) {
1485 ipc
.opt
= &replyopts
.opt
;
1487 if (replyopts
.opt
.opt
.srr
)
1488 daddr
= replyopts
.opt
.opt
.faddr
;
1491 flowi4_init_output(&fl4
, arg
->bound_dev_if
, 0,
1493 RT_SCOPE_UNIVERSE
, sk
->sk_protocol
,
1494 ip_reply_arg_flowi_flags(arg
),
1495 daddr
, rt
->rt_spec_dst
,
1496 tcp_hdr(skb
)->source
, tcp_hdr(skb
)->dest
);
1497 security_skb_classify_flow(skb
, flowi4_to_flowi(&fl4
));
1498 rt
= ip_route_output_key(sock_net(sk
), &fl4
);
1502 /* And let IP do all the hard work.
1504 This chunk is not reentrant, hence spinlock.
1505 Note that it uses the fact, that this function is called
1506 with locally disabled BH and that sk cannot be already spinlocked.
1509 inet
->tos
= arg
->tos
;
1510 sk
->sk_priority
= skb
->priority
;
1511 sk
->sk_protocol
= ip_hdr(skb
)->protocol
;
1512 sk
->sk_bound_dev_if
= arg
->bound_dev_if
;
1513 ip_append_data(sk
, &fl4
, ip_reply_glue_bits
, arg
->iov
->iov_base
, len
, 0,
1514 &ipc
, &rt
, MSG_DONTWAIT
);
1515 if ((skb
= skb_peek(&sk
->sk_write_queue
)) != NULL
) {
1516 if (arg
->csumoffset
>= 0)
1517 *((__sum16
*)skb_transport_header(skb
) +
1518 arg
->csumoffset
) = csum_fold(csum_add(skb
->csum
,
1520 skb
->ip_summed
= CHECKSUM_NONE
;
1521 ip_push_pending_frames(sk
, &fl4
);
1529 void __init
ip_init(void)
1534 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1535 igmp_mc_proc_init();