/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
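
/*
 * Example (an illustrative sketch, not code from this file): any code that
 * rewrites fields of an already built IP header must recompute the header
 * checksum before the packet goes out, e.g.
 *
 *	iph->tos = new_tos;		(new_tos is a hypothetical value)
 *	ip_send_check(iph);
 */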
int __ip_local_out(struct sk_buff *skb)
{
        struct iphdr *iph = ip_hdr(skb);

        iph->tot_len = htons(skb->len);
        ip_send_check(iph);
        return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}
int ip_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));
        netif_rx_ni(newskb);
        return 0;
}
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = ip4_dst_hoplimit(dst);
        return ttl;
}
/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
        iph->saddr    = saddr;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->dst, sk);

        if (opt && opt->opt.optlen) {
                iph->ihl += opt->opt.optlen >> 2;
                ip_options_build(skb, &opt->opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /* Send it out. */
        return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
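
/*
 * Typical caller (a hedged sketch, not taken from this file): a
 * connection-oriented protocol answering a connection request builds the
 * reply skb and hands it straight to this helper, roughly the way the TCP
 * SYN-ACK path does:
 *
 *	err = ip_build_and_send_pkt(skb, sk, inet_rsk(req)->loc_addr,
 *				    inet_rsk(req)->rmt_addr,
 *				    inet_rsk(req)->opt);
 */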
static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;

        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST)
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        rcu_read_lock();
        if (dst->hh) {
                int res = neigh_hh_output(dst->hh, skb);

                rcu_read_unlock();
                return res;
        }
        neigh = dst_get_neighbour(dst);
        if (neigh) {
                int res = neigh->output(skb);

                rcu_read_unlock();
                return res;
        }
        rcu_read_unlock();

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->dst.dev;

        /*
         *	If the indicated interface is up and running, send the packet.
         */
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *	Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loopback not local frames,
                   which returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note, that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    &&
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
                   ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
                                NULL, newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
                            skb->dev, ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_rcu *inet_opt;
        struct flowi4 *fl4;
        struct rtable *rt;
        struct iphdr *iph;
        int res;

        /* Skip all of this if the packet is already routed,
         * f.e. by something like SCTP.
         */
        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        fl4 = &fl->u.ip4;
        rt = skb_rtable(skb);
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->inet_daddr;
                if (inet_opt && inet_opt->opt.srr)
                        daddr = inet_opt->opt.faddr;

                /* If this fails, retransmit mechanism of transport layer will
                 * keep trying until route appears or the connection times
                 * itself out.
                 */
                rt = ip_route_output_ports(sock_net(sk), fl4, sk,
                                           daddr, inet->inet_saddr,
                                           inet->inet_dport,
                                           inet->inet_sport,
                                           sk->sk_protocol,
                                           RT_CONN_FLAGS(sk),
                                           sk->sk_bound_dev_if);
                if (IS_ERR(rt))
                        goto no_route;
                sk_setup_caps(sk, &rt->dst);
        }
        skb_dst_set_noref(skb, &rt->dst);

packet_routed:
        if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = fl4->saddr;
        iph->daddr    = fl4->daddr;
        /* Transport layer set skb->h.foo itself. */

        if (inet_opt && inet_opt->opt.optlen) {
                iph->ihl += inet_opt->opt.optlen >> 2;
                ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        res = ip_local_out(skb);
        rcu_read_unlock();
        return res;

no_route:
        rcu_read_unlock();
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
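
/*
 * For reference (a hedged sketch, not code from this file): connected
 * sockets reach ip_queue_xmit() through their address-family ops rather
 * than calling it directly; TCP over IPv4, for instance, wires it up
 * roughly as
 *
 *	.queue_xmit = ip_queue_xmit,	(in its af_ops table)
 *
 * and passes the socket's cached flow (a struct flowi) as the second
 * argument when transmitting a segment.
 */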
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_copy(to, from);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);
        int err = 0;

        dev = rt->dst.dev;

        /*
         *	Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *	Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
        if (skb->nf_bridge)
                mtu -= nf_bridge_mtu_reduction(skb);
#endif
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create wrong frag_list or break existing
         * one, it is not prohibited. In this case fall back to copying.
         *
         * LATER: this step can be merged to real generation of fragments,
         * we can switch to copy when see the first bad fragment.
         */
        if (skb_has_frag_list(skb)) {
                struct sk_buff *frag, *frag2;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;		/* Space per frame */
        ptr = hlen;			/* Where to start from */

        /* for bridged IP traffic encapsulated inside f.e. a vlan header,
         * we need to make room for the encapsulating header
         */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

        /*
         *	Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *	Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left)
                        len &= ~7;

                /*
                 *	Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *	Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *	Charge the memory for the fragment to any owner
                 *	it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *	Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *	Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *	Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *	Added AC : If we are fragmenting a fragment that's not the
                 *		   last fragment then keep MF on each bit
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *	Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
        return err;
}
EXPORT_SYMBOL(ip_fragment);
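
/*
 * For reference (an illustrative sketch of the encoding used above):
 * frag_off carries the flag bits plus the fragment offset in units of
 * eight bytes, so a middle fragment that starts 1480 bytes into the
 * original payload would be stamped roughly as
 *
 *	iph->frag_off = htons((1480 >> 3) | IP_MF);
 *
 * which is what the offset>>3 and htons(IP_MF) arithmetic in the two
 * paths above computes piecewise.
 */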
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
static __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;

        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int maxfraglen, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP fragmentation offload by network
         * device, so create one single skb packet containing complete
         * udp datagram
         */
        if ((skb = skb_peek_tail(queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(queue, skb);
        }

        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}
static int __ip_append_data(struct sock *sk,
                            struct flowi4 *fl4,
                            struct sk_buff_head *queue,
                            struct inet_cork *cork,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = cork->opt;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt = (struct rtable *)cork->dst;

        skb = skb_peek_tail(queue);

        exthdrlen = !skb ? rt->dst.header_len : 0;
        mtu = cork->fragsize;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (cork->length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
                               mtu - exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it won't be fragmented in the future.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->dst.dev->features & NETIF_F_V4_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        cork->length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
                err = ip_ufo_append_data(sk, queue, getfrag, from, length,
                                         hh_len, fragheaderlen, transhdrlen,
                                         maxfraglen, flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use calculated fragment length to generate chained skb,
         * each of segments is IP fragment ready for sending to network after
         * adding appropriate IP header.
         */

        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = fraglen;

                        alloclen += exthdrlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                                else
                                        /* only the initial fragment is
                                           time stamped */
                                        cork->tx_flags = 0;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *	Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;

                        /*
                         *	Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen + exthdrlen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = cork->page;
                        int off = cork->off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                cork->page = page;
                                cork->off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        cork->off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
                         struct ipcm_cookie *ipc, struct rtable **rtp)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_rcu *opt;
        struct rtable *rt;

        /*
         * setup for corking.
         */
        opt = ipc->opt;
        if (opt) {
                if (cork->opt == NULL) {
                        cork->opt = kmalloc(sizeof(struct ip_options) + 40,
                                            sk->sk_allocation);
                        if (unlikely(cork->opt == NULL))
                                return -ENOBUFS;
                }
                memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
                cork->flags |= IPCORK_OPT;
                cork->addr = ipc->addr;
        }
        rt = *rtp;
        if (unlikely(!rt))
                return -EFAULT;
        /*
         * We steal reference to this route, caller should not release it
         */
        *rtp = NULL;
        cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                         rt->dst.dev->mtu : dst_mtu(&rt->dst);
        cork->dst = &rt->dst;
        cork->length = 0;
        cork->tx_flags = ipc->tx_flags;
        cork->page = NULL;
        cork->off = 0;

        return 0;
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        int err;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
                if (err)
                        return err;
        } else {
                transhdrlen = 0;
        }

        return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
                                from, length, transhdrlen, flags);
}
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        struct inet_cork *cork;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        cork = &inet->cork.base;
        rt = (struct rtable *)cork->dst;
        if (cork->flags & IPCORK_OPT)
                opt = cork->opt;

        if (!(rt->dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
        mtu = cork->fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (cork->length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        cork->length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {
                        /* Check if the remaining data fits into current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *	Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *	Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                skb->truesize += len;
                atomic_add(len, &sk->sk_wmem_alloc);
                offset += len;
                size -= len;
        }
        return 0;

error:
        cork->length -= size;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}
static void ip_cork_release(struct inet_cork *cork)
{
        cork->flags &= ~IPCORK_OPT;
        kfree(cork->opt);
        cork->opt = NULL;
        dst_release(cork->dst);
        cork->dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
                              struct flowi4 *fl4,
                              struct sk_buff_head *queue,
                              struct inet_cork *cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = (struct rtable *)cork->dst;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;

        if ((skb = __skb_dequeue(queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
         * to fragment the frame generated here. No matter, what transforms
         * how transforms change size of the packet, it will come out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow to fragment this frame
         * locally. */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->dst) &&
             ip_dont_fragment(sk, &rt->dst)))
                df = htons(IP_DF);

        if (cork->flags & IPCORK_OPT)
                opt = cork->opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        iph->tos = inet->tos;
        iph->frag_off = df;
        ip_select_ident(iph, &rt->dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = fl4->saddr;
        iph->daddr = fl4->daddr;

        if (opt) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, cork->addr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        /*
         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
         * on dst refcount
         */
        cork->dst = NULL;
        skb_dst_set(skb, &rt->dst);

        if (iph->protocol == IPPROTO_ICMP)
                icmp_out_count(net, ((struct icmphdr *)
                        skb_transport_header(skb))->type);

        ip_cork_release(cork);
out:
        return skb;
}
*skb
)
1374 struct net
*net
= sock_net(skb
->sk
);
1377 err
= ip_local_out(skb
);
1380 err
= net_xmit_errno(err
);
1382 IP_INC_STATS(net
, IPSTATS_MIB_OUTDISCARDS
);
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
        struct sk_buff *skb;

        skb = ip_finish_skb(sk, fl4);
        if (!skb)
                return 0;

        /* Netfilter gets the whole, not yet fragmented skb. */
        return ip_send_skb(skb);
}
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
                                      struct sk_buff_head *queue,
                                      struct inet_cork *cork)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL)
                kfree_skb(skb);

        ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
        __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}
struct sk_buff *ip_make_skb(struct sock *sk,
                            struct flowi4 *fl4,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            struct ipcm_cookie *ipc, struct rtable **rtp,
                            unsigned int flags)
{
        struct inet_cork cork;
        struct sk_buff_head queue;
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        cork.flags = 0;
        cork.addr = 0;
        cork.opt = NULL;
        err = ip_setup_cork(sk, &cork, ipc, rtp);
        if (err)
                return ERR_PTR(err);

        err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
                               from, length, transhdrlen, flags);
        if (err) {
                __ip_flush_pending_frames(sk, &queue, &cork);
                return ERR_PTR(err);
        }

        return __ip_make_skb(sk, fl4, &queue, &cork);
}
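
/*
 * A hedged usage sketch (not code from this file): callers that do not
 * need corking can build the whole datagram in one call and transmit it
 * themselves, after filling in their own transport header, roughly the
 * way an uncorked UDP send does:
 *
 *	skb = ip_make_skb(sk, fl4, getfrag, from, len, transhdrlen,
 *			  &ipc, &rt, msg_flags);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = ip_send_skb(skb);
 */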
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
                   struct ip_reply_arg *arg, unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_data replyopts;
        struct ipcm_cookie ipc;
        struct flowi4 fl4;
        struct rtable *rt = skb_rtable(skb);

        if (ip_options_echo(&replyopts.opt.opt, skb))
                return;

        ipc.addr = daddr;
        ipc.opt = NULL;
        ipc.tx_flags = 0;

        if (replyopts.opt.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (replyopts.opt.opt.srr)
                        daddr = replyopts.opt.opt.faddr;
        }

        flowi4_init_output(&fl4, arg->bound_dev_if, 0,
                           RT_TOS(ip_hdr(skb)->tos),
                           RT_SCOPE_UNIVERSE, sk->sk_protocol,
                           ip_reply_arg_flowi_flags(arg),
                           daddr, rt->rt_spec_dst,
                           tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
        security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
        rt = ip_route_output_key(sock_net(sk), &fl4);
        if (IS_ERR(rt))
                return;

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence spinlock.
           Note that it uses the fact, that this function is called
           with locally disabled BH and that sk cannot be already spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk, &fl4);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}
void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}