2 * linux/net/ipv4/inet_lro.c
4 * Large Receive Offload (ipv4 / tcp)
6 * (C) Copyright IBM Corp. 2007
9 * Jan-Bernd Themann <themann@de.ibm.com>
10 * Christoph Raisch <raisch@de.ibm.com>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 #include <linux/module.h>
30 #include <linux/if_vlan.h>
31 #include <linux/inet_lro.h>
32 #include <net/checksum.h>
34 MODULE_LICENSE("GPL");
35 MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
36 MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
/*
 * Header-length helpers: the doff / ihl fields are scaled by 4 (<< 2).
 * Every macro parameter is parenthesized so that arbitrary pointer
 * expressions (e.g. &hdr) can be passed safely.
 */
#define TCP_HDR_LEN(tcph) ((tcph)->doff << 2)
#define IP_HDR_LEN(iph) ((iph)->ihl << 2)
/* IP total length minus the IP and TCP header lengths. */
#define TCP_PAYLOAD_LENGTH(iph, tcph) \
	(ntohs((iph)->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))

/* Values iph->ihl and tcph->doff are compared against in this file. */
#define IPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_W_TIMESTAMP 8

#define LRO_MAX_PG_HLEN 64

/*
 * Bump one counter in lro_mgr->stats.  Wrapped in do { } while (0) so the
 * macro behaves as a single statement, including in an unbraced if/else.
 */
#define LRO_INC_STATS(lro_mgr, attr) do { (lro_mgr)->stats.attr++; } while (0)
52 * Basic tcp checks whether packet is suitable for LRO
/*
 * lro_tcp_ip_check - run a series of header checks on an IPv4/TCP segment.
 * @iph:      IP header of the segment
 * @tcph:     TCP header of the segment
 * @len:      length compared against ntohs(iph->tot_len)
 * @lro_desc: existing session or NULL; when non-NULL its stored
 *            tcp_rcv_tsval takes part in the timestamp ordering check
 *
 * NOTE(review): this listing has gaps in its embedded numbering; the
 * statement executed when a check fires, the tail of the option-word
 * comparison and the second argument of after() are not visible here.
 * Presumably each failing check rejects the segment - confirm against
 * the full source.
 */
55 static int lro_tcp_ip_check(const struct iphdr
*iph
, const struct tcphdr
*tcph
,
56 int len
, const struct net_lro_desc
*lro_desc
)
58 /* check ip header: don't aggregate padded frames */
59 if (ntohs(iph
->tot_len
) != len
)
/* segments with no TCP payload are singled out */
62 if (TCP_PAYLOAD_LENGTH(iph
, tcph
) == 0)
/* IP header length other than IPH_LEN_WO_OPTIONS is singled out */
65 if (iph
->ihl
!= IPH_LEN_WO_OPTIONS
)
/* any of cwr/ece/urg/rst/syn/fin set, or ack clear */
68 if (tcph
->cwr
|| tcph
->ece
|| tcph
->urg
|| !tcph
->ack
||
69 tcph
->rst
|| tcph
->syn
|| tcph
->fin
)
72 if (INET_ECN_is_ce(ipv4_get_dsfield(iph
)))
/* doff must equal one of the two supported header lengths */
75 if (tcph
->doff
!= TCPH_LEN_WO_OPTIONS
&&
76 tcph
->doff
!= TCPH_LEN_W_TIMESTAMP
)
79 /* check tcp options (only timestamp allowed) */
80 if (tcph
->doff
== TCPH_LEN_W_TIMESTAMP
) {
/* topt points at the first 32-bit word following the TCP header */
81 __be32
*topt
= (__be32
*)(tcph
+ 1);
83 if (*topt
!= htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
84 | (TCPOPT_TIMESTAMP
<< 8)
88 /* timestamp should be in right order */
90 if (lro_desc
&& after(ntohl(lro_desc
->tcp_rcv_tsval
),
94 /* timestamp reply should not be zero */
/*
 * lro_update_tcp_ip_header - write the aggregated session state held in
 * @lro_desc back into the IP and TCP headers it points to (lro_desc->iph,
 * lro_desc->tcph) and refresh both checksums.
 *
 * NOTE(review): the declarations of 'p' and 'tcp_hdr_csum' and at least
 * one statement before the csum_partial() call fall into gaps of this
 * listing and are not visible here.
 */
103 static void lro_update_tcp_ip_header(struct net_lro_desc
*lro_desc
)
105 struct iphdr
*iph
= lro_desc
->iph
;
106 struct tcphdr
*tcph
= lro_desc
->tcph
;
/* copy the stored ack number and window into the TCP header */
110 tcph
->ack_seq
= lro_desc
->tcp_ack
;
111 tcph
->window
= lro_desc
->tcp_window
;
/* with timestamps: overwrite the third 32-bit word after the TCP header */
113 if (lro_desc
->tcp_saw_tstamp
) {
114 p
= (__be32
*)(tcph
+ 1);
115 *(p
+2) = lro_desc
->tcp_rcv_tsecr
;
/* adjust the IP checksum for the tot_len change, then store the new tot_len */
118 csum_replace2(&iph
->check
, iph
->tot_len
, htons(lro_desc
->ip_tot_len
));
119 iph
->tot_len
= htons(lro_desc
->ip_tot_len
);
/* checksum of the TCP header is folded into the accumulated data checksum */
122 tcp_hdr_csum
= csum_partial(tcph
, TCP_HDR_LEN(tcph
), 0);
123 lro_desc
->data_csum
= csum_add(lro_desc
->data_csum
, tcp_hdr_csum
);
/* final TCP checksum over (ip_tot_len - IP header length) bytes */
124 tcph
->check
= csum_tcpudp_magic(iph
->saddr
, iph
->daddr
,
125 lro_desc
->ip_tot_len
-
126 IP_HDR_LEN(iph
), IPPROTO_TCP
,
127 lro_desc
->data_csum
);
/*
 * lro_tcp_data_csum - derive a checksum value from tcph->check by
 * subtracting the TCP header checksum (and one further term) from it.
 * @iph:  IP header (saddr/daddr feed csum_tcpudp_nofold())
 * @tcph: TCP header
 * @len:  added to TCP_HDR_LEN(tcph) as the length for csum_tcpudp_nofold()
 *
 * NOTE(review): the declarations of 'tcp_csum' / 'tcp_hdr_csum' and the
 * trailing arguments of csum_tcpudp_nofold() and of the outer csum_sub()
 * are not visible in this listing; presumably the outer csum_sub()
 * removes tcp_ps_hdr_csum - confirm against the full source.
 */
130 static __wsum
lro_tcp_data_csum(struct iphdr
*iph
, struct tcphdr
*tcph
, int len
)
134 __wsum tcp_ps_hdr_csum
;
/* start from the inverted, unfolded checksum carried in the TCP header */
136 tcp_csum
= ~csum_unfold(tcph
->check
);
137 tcp_hdr_csum
= csum_partial(tcph
, TCP_HDR_LEN(tcph
), tcp_csum
);
139 tcp_ps_hdr_csum
= csum_tcpudp_nofold(iph
->saddr
, iph
->daddr
,
140 len
+ TCP_HDR_LEN(tcph
),
143 return csum_sub(csum_sub(tcp_csum
, tcp_hdr_csum
),
/*
 * lro_init_desc - start a new LRO session in @lro_desc with @skb as its
 * parent, seeding sequence, ack, window, length, timestamp and checksum
 * state from @iph / @tcph.
 *
 * NOTE(review): several lines fall into gaps of this listing (the
 * declarations of 'nr_frags' and 'ptr', at least one assignment between
 * the next_frag and tcph stores, and the trailing arguments of
 * lro_tcp_data_csum()); only the visible assignments are described.
 */
147 static void lro_init_desc(struct net_lro_desc
*lro_desc
, struct sk_buff
*skb
,
148 struct iphdr
*iph
, struct tcphdr
*tcph
)
152 u32 tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
/* next_frag points at the first frag slot past those already used by skb */
154 nr_frags
= skb_shinfo(skb
)->nr_frags
;
155 lro_desc
->parent
= skb
;
156 lro_desc
->next_frag
= &(skb_shinfo(skb
)->frags
[nr_frags
]);
158 lro_desc
->tcph
= tcph
;
/* sequence number expected for the following segment */
159 lro_desc
->tcp_next_seq
= ntohl(tcph
->seq
) + tcp_data_len
;
160 lro_desc
->tcp_ack
= tcph
->ack_seq
;
161 lro_desc
->tcp_window
= tcph
->window
;
163 lro_desc
->pkt_aggr_cnt
= 1;
164 lro_desc
->ip_tot_len
= ntohs(iph
->tot_len
);
/* doff == 8 (same value as TCPH_LEN_W_TIMESTAMP): remember words 1 and 2 after the header */
166 if (tcph
->doff
== 8) {
167 ptr
= (__be32
*)(tcph
+1);
168 lro_desc
->tcp_saw_tstamp
= 1;
169 lro_desc
->tcp_rcv_tsval
= *(ptr
+1);
170 lro_desc
->tcp_rcv_tsecr
= *(ptr
+2);
173 lro_desc
->mss
= tcp_data_len
;
174 lro_desc
->active
= 1;
176 lro_desc
->data_csum
= lro_tcp_data_csum(iph
, tcph
,
180 static inline void lro_clear_desc(struct net_lro_desc
*lro_desc
)
182 memset(lro_desc
, 0, sizeof(struct net_lro_desc
));
/*
 * lro_add_common - bookkeeping shared by the skb and frag aggregation
 * paths: account one more segment of @tcp_data_len payload bytes in
 * @lro_desc and in its parent skb.
 * @iph, @tcph: headers of the segment being added
 *
 * NOTE(review): the declaration of 'topt' and the trailing arguments of
 * lro_tcp_data_csum() / csum_block_add() are not visible in this listing.
 */
185 static void lro_add_common(struct net_lro_desc
*lro_desc
, struct iphdr
*iph
,
186 struct tcphdr
*tcph
, int tcp_data_len
)
188 struct sk_buff
*parent
= lro_desc
->parent
;
191 lro_desc
->pkt_aggr_cnt
++;
192 lro_desc
->ip_tot_len
+= tcp_data_len
;
193 lro_desc
->tcp_next_seq
+= tcp_data_len
;
/* window and ack always follow the most recent segment */
194 lro_desc
->tcp_window
= tcph
->window
;
195 lro_desc
->tcp_ack
= tcph
->ack_seq
;
197 /* don't update tcp_rcv_tsval, would not work with PAWS */
198 if (lro_desc
->tcp_saw_tstamp
) {
199 topt
= (__be32
*) (tcph
+ 1);
200 lro_desc
->tcp_rcv_tsecr
= *(topt
+ 2);
/* merge this segment's checksum contribution into the running data checksum */
203 lro_desc
->data_csum
= csum_block_add(lro_desc
->data_csum
,
204 lro_tcp_data_csum(iph
, tcph
,
208 parent
->len
+= tcp_data_len
;
209 parent
->data_len
+= tcp_data_len
;
/* mss tracks the largest payload seen in this session */
210 if (tcp_data_len
> lro_desc
->mss
)
211 lro_desc
->mss
= tcp_data_len
;
214 static void lro_add_packet(struct net_lro_desc
*lro_desc
, struct sk_buff
*skb
,
215 struct iphdr
*iph
, struct tcphdr
*tcph
)
217 struct sk_buff
*parent
= lro_desc
->parent
;
218 int tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
220 lro_add_common(lro_desc
, iph
, tcph
, tcp_data_len
);
222 skb_pull(skb
, (skb
->len
- tcp_data_len
));
223 parent
->truesize
+= skb
->truesize
;
225 if (lro_desc
->last_skb
)
226 lro_desc
->last_skb
->next
= skb
;
228 skb_shinfo(parent
)->frag_list
= skb
;
230 lro_desc
->last_skb
= skb
;
/*
 * lro_add_frags - add a segment delivered as page fragments to the
 * session in @lro_desc.
 * @len:       not referenced in the visible lines
 * @hlen:      bytes skipped at the start of the first fragment
 * @truesize:  added to the parent skb's truesize
 * @skb_frags: fragments of the incoming segment
 * @iph, @tcph: headers of the incoming segment
 *
 * NOTE(review): at least one line of the copy loop is missing from this
 * listing (presumably the step that advances skb_frags - confirm against
 * the full source).
 */
233 static void lro_add_frags(struct net_lro_desc
*lro_desc
,
234 int len
, int hlen
, int truesize
,
235 struct skb_frag_struct
*skb_frags
,
236 struct iphdr
*iph
, struct tcphdr
*tcph
)
238 struct sk_buff
*skb
= lro_desc
->parent
;
239 int tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
241 lro_add_common(lro_desc
, iph
, tcph
, tcp_data_len
);
243 skb
->truesize
+= truesize
;
/* skip hlen bytes at the front of the first fragment */
245 skb_frags
[0].page_offset
+= hlen
;
246 skb_frag_size_sub(&skb_frags
[0], hlen
);
/* copy fragment descriptors into the parent until the payload is covered */
248 while (tcp_data_len
> 0) {
249 *(lro_desc
->next_frag
) = *skb_frags
;
250 tcp_data_len
-= skb_frag_size(skb_frags
);
251 lro_desc
->next_frag
++;
253 skb_shinfo(skb
)->nr_frags
++;
/*
 * lro_check_tcp_conn - compare the connection stored in @lro_desc with
 * the iph/tcph of a segment: source/destination address and
 * source/destination port.
 *
 * NOTE(review): the remaining parameters and both return statements are
 * not visible in this listing.  The caller in lro_get_desc() tests the
 * negated result, so presumably zero means "same connection" - confirm.
 */
257 static int lro_check_tcp_conn(struct net_lro_desc
*lro_desc
,
261 if ((lro_desc
->iph
->saddr
!= iph
->saddr
) ||
262 (lro_desc
->iph
->daddr
!= iph
->daddr
) ||
263 (lro_desc
->tcph
->source
!= tcph
->source
) ||
264 (lro_desc
->tcph
->dest
!= tcph
->dest
))
/*
 * lro_get_desc - pick a descriptor from @lro_arr for a segment.
 *
 * The first loop looks for an entry whose connection matches (via
 * lro_check_tcp_conn()); the second loop takes the first entry whose
 * 'active' flag is clear.  The no_desc statistic is incremented on the
 * path after both loops.  The result variable starts out as NULL.
 *
 * NOTE(review): the remaining parameters, parts of both loop bodies, the
 * declaration of 'i' and the return statement are not visible in this
 * listing; presumably NULL is returned when neither loop finds an entry -
 * confirm against the full source.
 */
269 static struct net_lro_desc
*lro_get_desc(struct net_lro_mgr
*lro_mgr
,
270 struct net_lro_desc
*lro_arr
,
274 struct net_lro_desc
*lro_desc
= NULL
;
275 struct net_lro_desc
*tmp
;
276 int max_desc
= lro_mgr
->max_desc
;
/* pass 1: look for a descriptor of the same connection */
279 for (i
= 0; i
< max_desc
; i
++) {
282 if (!lro_check_tcp_conn(tmp
, iph
, tcph
)) {
/* pass 2: take the first inactive descriptor */
288 for (i
= 0; i
< max_desc
; i
++) {
289 if (!lro_arr
[i
].active
) {
290 lro_desc
= &lro_arr
[i
];
295 LRO_INC_STATS(lro_mgr
, no_desc
);
300 static void lro_flush(struct net_lro_mgr
*lro_mgr
,
301 struct net_lro_desc
*lro_desc
)
303 if (lro_desc
->pkt_aggr_cnt
> 1)
304 lro_update_tcp_ip_header(lro_desc
);
306 skb_shinfo(lro_desc
->parent
)->gso_size
= lro_desc
->mss
;
308 if (lro_mgr
->features
& LRO_F_NAPI
)
309 netif_receive_skb(lro_desc
->parent
);
311 netif_rx(lro_desc
->parent
);
313 LRO_INC_STATS(lro_mgr
, flushed
);
314 lro_clear_desc(lro_desc
);
/*
 * __lro_proc_skb - try to aggregate @skb into an LRO session of @lro_mgr.
 *
 * Visible flow: obtain iph/tcph through the driver's get_skb_header()
 * callback, require both LRO_IPV4 and LRO_TCP in the returned flags, look
 * up a descriptor, then either start a new session (descriptor inactive)
 * or append the skb to an existing one, flushing when max_aggr packets are
 * reached or the parent length exceeds 0xFFFF - mtu.
 *
 * NOTE(review): remaining parameters, local declarations, goto/return
 * statements and some labels fall into gaps of this listing, so the exact
 * return values and jump targets are not visible here.
 * NOTE(review): the first lro_tcp_ip_check() call passes
 * skb->len - vlan_hdr_len while the second passes plain skb->len; whether
 * that difference is intended should be confirmed.
 */
317 static int __lro_proc_skb(struct net_lro_mgr
*lro_mgr
, struct sk_buff
*skb
,
320 struct net_lro_desc
*lro_desc
;
324 int vlan_hdr_len
= 0;
/* a missing callback or a non-zero callback result takes the same branch */
326 if (!lro_mgr
->get_skb_header
||
327 lro_mgr
->get_skb_header(skb
, (void *)&iph
, (void *)&tcph
,
331 if (!(flags
& LRO_IPV4
) || !(flags
& LRO_TCP
))
334 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
/* 802.1Q frame and LRO_F_EXTRACT_VLAN_ID not set: account VLAN_HLEN bytes */
338 if ((skb
->protocol
== htons(ETH_P_8021Q
)) &&
339 !(lro_mgr
->features
& LRO_F_EXTRACT_VLAN_ID
))
340 vlan_hdr_len
= VLAN_HLEN
;
342 if (!lro_desc
->active
) { /* start new lro session */
343 if (lro_tcp_ip_check(iph
, tcph
, skb
->len
- vlan_hdr_len
, NULL
))
346 skb
->ip_summed
= lro_mgr
->ip_summed_aggr
;
347 lro_init_desc(lro_desc
, skb
, iph
, tcph
);
348 LRO_INC_STATS(lro_mgr
, aggregated
);
/* existing session: segment must carry the expected sequence number */
352 if (lro_desc
->tcp_next_seq
!= ntohl(tcph
->seq
))
355 if (lro_tcp_ip_check(iph
, tcph
, skb
->len
, lro_desc
))
358 lro_add_packet(lro_desc
, skb
, iph
, tcph
);
359 LRO_INC_STATS(lro_mgr
, aggregated
);
/* flush once the packet count or the aggregated length limit is hit */
361 if ((lro_desc
->pkt_aggr_cnt
>= lro_mgr
->max_aggr
) ||
362 lro_desc
->parent
->len
> (0xFFFF - lro_mgr
->dev
->mtu
))
363 lro_flush(lro_mgr
, lro_desc
);
367 out2
: /* send aggregated SKBs to stack */
368 lro_flush(lro_mgr
, lro_desc
);
/*
 * lro_gen_skb - build an skb around a set of page fragments.
 *
 * Visible steps: allocate an skb of hlen + frag_align_pad bytes with
 * netdev_alloc_skb(), reserve frag_align_pad, copy hdr_len = min(len, hlen)
 * header bytes from mac_hdr into the linear area, count fragments while
 * data_len is positive, trim hdr_len bytes off the front of frags[0], then
 * set ip_summed and protocol (via eth_type_trans()).
 *
 * NOTE(review): some parameters (mac_hdr, ip_summed), local declarations,
 * any allocation-failure handling, part of the fragment loop and the
 * return statement fall into gaps of this listing and are not visible.
 */
375 static struct sk_buff
*lro_gen_skb(struct net_lro_mgr
*lro_mgr
,
376 struct skb_frag_struct
*frags
,
377 int len
, int true_size
,
379 int hlen
, __wsum sum
,
383 struct skb_frag_struct
*skb_frags
;
/* never copy more header bytes than the segment holds */
385 int hdr_len
= min(len
, hlen
);
387 skb
= netdev_alloc_skb(lro_mgr
->dev
, hlen
+ lro_mgr
->frag_align_pad
);
391 skb_reserve(skb
, lro_mgr
->frag_align_pad
);
393 skb
->data_len
= len
- hdr_len
;
394 skb
->truesize
+= true_size
;
395 skb
->tail
+= hdr_len
;
/* header bytes go into the linear part of the skb */
397 memcpy(skb
->data
, mac_hdr
, hdr_len
);
399 skb_frags
= skb_shinfo(skb
)->frags
;
400 while (data_len
> 0) {
402 data_len
-= skb_frag_size(frags
);
405 skb_shinfo(skb
)->nr_frags
++;
/* the first fragment no longer covers the bytes copied above */
408 skb_shinfo(skb
)->frags
[0].page_offset
+= hdr_len
;
409 skb_frag_size_sub(&skb_shinfo(skb
)->frags
[0], hdr_len
);
411 skb
->ip_summed
= ip_summed
;
413 skb
->protocol
= eth_type_trans(skb
, lro_mgr
->dev
);
/*
 * __lro_proc_segment - try to aggregate a segment delivered as page
 * fragments into an LRO session of @lro_mgr.
 * @frags:     fragments holding the segment
 * @len:       segment length
 * @true_size: passed on to lro_gen_skb() / lro_add_frags()
 * @priv:      passed through to the get_frag_header() callback
 * @sum:       passed to lro_gen_skb() on the out1 path
 *
 * Visible flow: obtain mac/ip/tcp header pointers through
 * get_frag_header(), require LRO_IPV4 and LRO_TCP, look up a descriptor,
 * then either generate a parent skb and start a new session or append the
 * fragments to an existing one, flushing when the parent's nr_frags
 * reaches max_aggr or its length exceeds 0xFFFF - mtu.  On the out1 path
 * an skb for the original packet is generated with lro_mgr->ip_summed.
 *
 * NOTE(review): local declarations, goto/return statements, some labels
 * and the tail of the tcph assignment fall into gaps of this listing, so
 * return values and jump targets are not visible here.
 */
417 static struct sk_buff
*__lro_proc_segment(struct net_lro_mgr
*lro_mgr
,
418 struct skb_frag_struct
*frags
,
419 int len
, int true_size
,
420 void *priv
, __wsum sum
)
422 struct net_lro_desc
*lro_desc
;
429 int hdr_len
= LRO_MAX_PG_HLEN
;
430 int vlan_hdr_len
= 0;
/* missing callback or non-zero result: fall back to the raw fragment address */
432 if (!lro_mgr
->get_frag_header
||
433 lro_mgr
->get_frag_header(frags
, (void *)&mac_hdr
, (void *)&iph
,
434 (void *)&tcph
, &flags
, priv
)) {
435 mac_hdr
= skb_frag_address(frags
);
439 if (!(flags
& LRO_IPV4
) || !(flags
& LRO_TCP
))
/* hdr_len: mac_hdr up to the end of the TCP header; mac_hdr_len: mac_hdr up to iph */
442 hdr_len
= (int)((void *)(tcph
) + TCP_HDR_LEN(tcph
) - mac_hdr
);
443 mac_hdr_len
= (int)((void *)(iph
) - mac_hdr
);
445 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
449 if (!lro_desc
->active
) { /* start new lro session */
450 if (lro_tcp_ip_check(iph
, tcph
, len
- mac_hdr_len
, NULL
))
453 skb
= lro_gen_skb(lro_mgr
, frags
, len
, true_size
, mac_hdr
,
454 hdr_len
, 0, lro_mgr
->ip_summed_aggr
);
458 if ((skb
->protocol
== htons(ETH_P_8021Q
)) &&
459 !(lro_mgr
->features
& LRO_F_EXTRACT_VLAN_ID
))
460 vlan_hdr_len
= VLAN_HLEN
;
/* re-point iph/tcph into the newly generated skb's data */
462 iph
= (void *)(skb
->data
+ vlan_hdr_len
);
463 tcph
= (void *)((u8
*)skb
->data
+ vlan_hdr_len
466 lro_init_desc(lro_desc
, skb
, iph
, tcph
);
467 LRO_INC_STATS(lro_mgr
, aggregated
);
/* existing session: segment must carry the expected sequence number */
471 if (lro_desc
->tcp_next_seq
!= ntohl(tcph
->seq
))
474 if (lro_tcp_ip_check(iph
, tcph
, len
- mac_hdr_len
, lro_desc
))
477 lro_add_frags(lro_desc
, len
, hdr_len
, true_size
, frags
, iph
, tcph
);
478 LRO_INC_STATS(lro_mgr
, aggregated
);
/* flush once the fragment count or the aggregated length limit is hit */
480 if ((skb_shinfo(lro_desc
->parent
)->nr_frags
>= lro_mgr
->max_aggr
) ||
481 lro_desc
->parent
->len
> (0xFFFF - lro_mgr
->dev
->mtu
))
482 lro_flush(lro_mgr
, lro_desc
);
486 out2
: /* send aggregated packets to the stack */
487 lro_flush(lro_mgr
, lro_desc
);
489 out1
: /* Original packet has to be posted to the stack */
490 skb
= lro_gen_skb(lro_mgr
, frags
, len
, true_size
, mac_hdr
,
491 hdr_len
, sum
, lro_mgr
->ip_summed
);
/*
 * lro_receive_skb - exported entry point for skb-based receive.
 *
 * Runs __lro_proc_skb(); when that returns non-zero the skb is passed on
 * directly, using netif_receive_skb() if LRO_F_NAPI is set in
 * lro_mgr->features.
 *
 * NOTE(review): the remaining parameters and the non-NAPI branch are not
 * visible in this listing.
 */
496 void lro_receive_skb(struct net_lro_mgr
*lro_mgr
,
500 if (__lro_proc_skb(lro_mgr
, skb
, priv
)) {
501 if (lro_mgr
->features
& LRO_F_NAPI
)
502 netif_receive_skb(skb
);
507 EXPORT_SYMBOL(lro_receive_skb
);
/*
 * lro_receive_frags - exported entry point for fragment-based receive.
 * @frags:     page fragments holding the segment
 * @len:       segment length
 * @true_size: forwarded to __lro_proc_segment()
 * @priv:      forwarded to __lro_proc_segment()
 * @sum:       forwarded to __lro_proc_segment()
 *
 * Runs __lro_proc_segment() and, on the visible path, hands the skb it
 * returns to netif_receive_skb() when LRO_F_NAPI is set.
 *
 * NOTE(review): the declaration of 'skb', any check of the returned
 * pointer and the non-NAPI branch are not visible in this listing.
 */
509 void lro_receive_frags(struct net_lro_mgr
*lro_mgr
,
510 struct skb_frag_struct
*frags
,
511 int len
, int true_size
, void *priv
, __wsum sum
)
515 skb
= __lro_proc_segment(lro_mgr
, frags
, len
, true_size
, priv
, sum
);
519 if (lro_mgr
->features
& LRO_F_NAPI
)
520 netif_receive_skb(skb
);
524 EXPORT_SYMBOL(lro_receive_frags
);
526 void lro_flush_all(struct net_lro_mgr
*lro_mgr
)
529 struct net_lro_desc
*lro_desc
= lro_mgr
->lro_arr
;
531 for (i
= 0; i
< lro_mgr
->max_desc
; i
++) {
532 if (lro_desc
[i
].active
)
533 lro_flush(lro_mgr
, &lro_desc
[i
]);
536 EXPORT_SYMBOL(lro_flush_all
);
538 void lro_flush_pkt(struct net_lro_mgr
*lro_mgr
,
539 struct iphdr
*iph
, struct tcphdr
*tcph
)
541 struct net_lro_desc
*lro_desc
;
543 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
544 if (lro_desc
->active
)
545 lro_flush(lro_mgr
, lro_desc
);
547 EXPORT_SYMBOL(lro_flush_pkt
);