/*
 *  linux/net/ipv4/inet_lro.c
 *
 *  Large Receive Offload (ipv4 / tcp)
 *
 *  (C) Copyright IBM Corp. 2007
 *
 *  Authors:
 *       Jan-Bernd Themann <themann@de.ibm.com>
 *       Christoph Raisch <raisch@de.ibm.com>
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
29 #include <linux/module.h>
30 #include <linux/if_vlan.h>
31 #include <linux/inet_lro.h>
33 MODULE_LICENSE("GPL");
34 MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
35 MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
37 #define TCP_HDR_LEN(tcph) (tcph->doff << 2)
38 #define IP_HDR_LEN(iph) (iph->ihl << 2)
39 #define TCP_PAYLOAD_LENGTH(iph, tcph) \
40 (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
42 #define IPH_LEN_WO_OPTIONS 5
43 #define TCPH_LEN_WO_OPTIONS 5
44 #define TCPH_LEN_W_TIMESTAMP 8
46 #define LRO_MAX_PG_HLEN 64
48 #define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
51 * Basic tcp checks whether packet is suitable for LRO
54 static int lro_tcp_ip_check(const struct iphdr
*iph
, const struct tcphdr
*tcph
,
55 int len
, const struct net_lro_desc
*lro_desc
)
57 /* check ip header: don't aggregate padded frames */
58 if (ntohs(iph
->tot_len
) != len
)
61 if (TCP_PAYLOAD_LENGTH(iph
, tcph
) == 0)
64 if (iph
->ihl
!= IPH_LEN_WO_OPTIONS
)
67 if (tcph
->cwr
|| tcph
->ece
|| tcph
->urg
|| !tcph
->ack
||
68 tcph
->rst
|| tcph
->syn
|| tcph
->fin
)
71 if (INET_ECN_is_ce(ipv4_get_dsfield(iph
)))
74 if (tcph
->doff
!= TCPH_LEN_WO_OPTIONS
&&
75 tcph
->doff
!= TCPH_LEN_W_TIMESTAMP
)
78 /* check tcp options (only timestamp allowed) */
79 if (tcph
->doff
== TCPH_LEN_W_TIMESTAMP
) {
80 __be32
*topt
= (__be32
*)(tcph
+ 1);
82 if (*topt
!= htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
83 | (TCPOPT_TIMESTAMP
<< 8)
87 /* timestamp should be in right order */
89 if (lro_desc
&& after(ntohl(lro_desc
->tcp_rcv_tsval
),
93 /* timestamp reply should not be zero */
102 static void lro_update_tcp_ip_header(struct net_lro_desc
*lro_desc
)
104 struct iphdr
*iph
= lro_desc
->iph
;
105 struct tcphdr
*tcph
= lro_desc
->tcph
;
109 tcph
->ack_seq
= lro_desc
->tcp_ack
;
110 tcph
->window
= lro_desc
->tcp_window
;
112 if (lro_desc
->tcp_saw_tstamp
) {
113 p
= (__be32
*)(tcph
+ 1);
114 *(p
+2) = lro_desc
->tcp_rcv_tsecr
;
117 iph
->tot_len
= htons(lro_desc
->ip_tot_len
);
120 iph
->check
= ip_fast_csum((u8
*)lro_desc
->iph
, iph
->ihl
);
123 tcp_hdr_csum
= csum_partial(tcph
, TCP_HDR_LEN(tcph
), 0);
124 lro_desc
->data_csum
= csum_add(lro_desc
->data_csum
, tcp_hdr_csum
);
125 tcph
->check
= csum_tcpudp_magic(iph
->saddr
, iph
->daddr
,
126 lro_desc
->ip_tot_len
-
127 IP_HDR_LEN(iph
), IPPROTO_TCP
,
128 lro_desc
->data_csum
);
131 static __wsum
lro_tcp_data_csum(struct iphdr
*iph
, struct tcphdr
*tcph
, int len
)
135 __wsum tcp_ps_hdr_csum
;
137 tcp_csum
= ~csum_unfold(tcph
->check
);
138 tcp_hdr_csum
= csum_partial(tcph
, TCP_HDR_LEN(tcph
), tcp_csum
);
140 tcp_ps_hdr_csum
= csum_tcpudp_nofold(iph
->saddr
, iph
->daddr
,
141 len
+ TCP_HDR_LEN(tcph
),
144 return csum_sub(csum_sub(tcp_csum
, tcp_hdr_csum
),
148 static void lro_init_desc(struct net_lro_desc
*lro_desc
, struct sk_buff
*skb
,
149 struct iphdr
*iph
, struct tcphdr
*tcph
)
153 u32 tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
155 nr_frags
= skb_shinfo(skb
)->nr_frags
;
156 lro_desc
->parent
= skb
;
157 lro_desc
->next_frag
= &(skb_shinfo(skb
)->frags
[nr_frags
]);
159 lro_desc
->tcph
= tcph
;
160 lro_desc
->tcp_next_seq
= ntohl(tcph
->seq
) + tcp_data_len
;
161 lro_desc
->tcp_ack
= tcph
->ack_seq
;
162 lro_desc
->tcp_window
= tcph
->window
;
164 lro_desc
->pkt_aggr_cnt
= 1;
165 lro_desc
->ip_tot_len
= ntohs(iph
->tot_len
);
167 if (tcph
->doff
== 8) {
168 ptr
= (__be32
*)(tcph
+1);
169 lro_desc
->tcp_saw_tstamp
= 1;
170 lro_desc
->tcp_rcv_tsval
= *(ptr
+1);
171 lro_desc
->tcp_rcv_tsecr
= *(ptr
+2);
174 lro_desc
->mss
= tcp_data_len
;
175 lro_desc
->active
= 1;
177 lro_desc
->data_csum
= lro_tcp_data_csum(iph
, tcph
,
181 static inline void lro_clear_desc(struct net_lro_desc
*lro_desc
)
183 memset(lro_desc
, 0, sizeof(struct net_lro_desc
));
186 static void lro_add_common(struct net_lro_desc
*lro_desc
, struct iphdr
*iph
,
187 struct tcphdr
*tcph
, int tcp_data_len
)
189 struct sk_buff
*parent
= lro_desc
->parent
;
192 lro_desc
->pkt_aggr_cnt
++;
193 lro_desc
->ip_tot_len
+= tcp_data_len
;
194 lro_desc
->tcp_next_seq
+= tcp_data_len
;
195 lro_desc
->tcp_window
= tcph
->window
;
196 lro_desc
->tcp_ack
= tcph
->ack_seq
;
198 /* don't update tcp_rcv_tsval, would not work with PAWS */
199 if (lro_desc
->tcp_saw_tstamp
) {
200 topt
= (__be32
*) (tcph
+ 1);
201 lro_desc
->tcp_rcv_tsecr
= *(topt
+ 2);
204 lro_desc
->data_csum
= csum_block_add(lro_desc
->data_csum
,
205 lro_tcp_data_csum(iph
, tcph
,
209 parent
->len
+= tcp_data_len
;
210 parent
->data_len
+= tcp_data_len
;
211 if (tcp_data_len
> lro_desc
->mss
)
212 lro_desc
->mss
= tcp_data_len
;
215 static void lro_add_packet(struct net_lro_desc
*lro_desc
, struct sk_buff
*skb
,
216 struct iphdr
*iph
, struct tcphdr
*tcph
)
218 struct sk_buff
*parent
= lro_desc
->parent
;
219 int tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
221 lro_add_common(lro_desc
, iph
, tcph
, tcp_data_len
);
223 skb_pull(skb
, (skb
->len
- tcp_data_len
));
224 parent
->truesize
+= skb
->truesize
;
226 if (lro_desc
->last_skb
)
227 lro_desc
->last_skb
->next
= skb
;
229 skb_shinfo(parent
)->frag_list
= skb
;
231 lro_desc
->last_skb
= skb
;
234 static void lro_add_frags(struct net_lro_desc
*lro_desc
,
235 int len
, int hlen
, int truesize
,
236 struct skb_frag_struct
*skb_frags
,
237 struct iphdr
*iph
, struct tcphdr
*tcph
)
239 struct sk_buff
*skb
= lro_desc
->parent
;
240 int tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
242 lro_add_common(lro_desc
, iph
, tcph
, tcp_data_len
);
244 skb
->truesize
+= truesize
;
246 skb_frags
[0].page_offset
+= hlen
;
247 skb_frags
[0].size
-= hlen
;
249 while (tcp_data_len
> 0) {
250 *(lro_desc
->next_frag
) = *skb_frags
;
251 tcp_data_len
-= skb_frags
->size
;
252 lro_desc
->next_frag
++;
254 skb_shinfo(skb
)->nr_frags
++;
258 static int lro_check_tcp_conn(struct net_lro_desc
*lro_desc
,
262 if ((lro_desc
->iph
->saddr
!= iph
->saddr
) ||
263 (lro_desc
->iph
->daddr
!= iph
->daddr
) ||
264 (lro_desc
->tcph
->source
!= tcph
->source
) ||
265 (lro_desc
->tcph
->dest
!= tcph
->dest
))
270 static struct net_lro_desc
*lro_get_desc(struct net_lro_mgr
*lro_mgr
,
271 struct net_lro_desc
*lro_arr
,
275 struct net_lro_desc
*lro_desc
= NULL
;
276 struct net_lro_desc
*tmp
;
277 int max_desc
= lro_mgr
->max_desc
;
280 for (i
= 0; i
< max_desc
; i
++) {
283 if (!lro_check_tcp_conn(tmp
, iph
, tcph
)) {
289 for (i
= 0; i
< max_desc
; i
++) {
290 if (!lro_arr
[i
].active
) {
291 lro_desc
= &lro_arr
[i
];
296 LRO_INC_STATS(lro_mgr
, no_desc
);
301 static void lro_flush(struct net_lro_mgr
*lro_mgr
,
302 struct net_lro_desc
*lro_desc
)
304 if (lro_desc
->pkt_aggr_cnt
> 1)
305 lro_update_tcp_ip_header(lro_desc
);
307 skb_shinfo(lro_desc
->parent
)->gso_size
= lro_desc
->mss
;
309 if (lro_mgr
->features
& LRO_F_NAPI
)
310 netif_receive_skb(lro_desc
->parent
);
312 netif_rx(lro_desc
->parent
);
314 LRO_INC_STATS(lro_mgr
, flushed
);
315 lro_clear_desc(lro_desc
);
318 static int __lro_proc_skb(struct net_lro_mgr
*lro_mgr
, struct sk_buff
*skb
,
321 struct net_lro_desc
*lro_desc
;
325 int vlan_hdr_len
= 0;
327 if (!lro_mgr
->get_skb_header
||
328 lro_mgr
->get_skb_header(skb
, (void *)&iph
, (void *)&tcph
,
332 if (!(flags
& LRO_IPV4
) || !(flags
& LRO_TCP
))
335 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
339 if ((skb
->protocol
== htons(ETH_P_8021Q
)) &&
340 !(lro_mgr
->features
& LRO_F_EXTRACT_VLAN_ID
))
341 vlan_hdr_len
= VLAN_HLEN
;
343 if (!lro_desc
->active
) { /* start new lro session */
344 if (lro_tcp_ip_check(iph
, tcph
, skb
->len
- vlan_hdr_len
, NULL
))
347 skb
->ip_summed
= lro_mgr
->ip_summed_aggr
;
348 lro_init_desc(lro_desc
, skb
, iph
, tcph
);
349 LRO_INC_STATS(lro_mgr
, aggregated
);
353 if (lro_desc
->tcp_next_seq
!= ntohl(tcph
->seq
))
356 if (lro_tcp_ip_check(iph
, tcph
, skb
->len
, lro_desc
))
359 lro_add_packet(lro_desc
, skb
, iph
, tcph
);
360 LRO_INC_STATS(lro_mgr
, aggregated
);
362 if ((lro_desc
->pkt_aggr_cnt
>= lro_mgr
->max_aggr
) ||
363 lro_desc
->parent
->len
> (0xFFFF - lro_mgr
->dev
->mtu
))
364 lro_flush(lro_mgr
, lro_desc
);
368 out2
: /* send aggregated SKBs to stack */
369 lro_flush(lro_mgr
, lro_desc
);
376 static struct sk_buff
*lro_gen_skb(struct net_lro_mgr
*lro_mgr
,
377 struct skb_frag_struct
*frags
,
378 int len
, int true_size
,
380 int hlen
, __wsum sum
,
384 struct skb_frag_struct
*skb_frags
;
386 int hdr_len
= min(len
, hlen
);
388 skb
= netdev_alloc_skb(lro_mgr
->dev
, hlen
+ lro_mgr
->frag_align_pad
);
392 skb_reserve(skb
, lro_mgr
->frag_align_pad
);
394 skb
->data_len
= len
- hdr_len
;
395 skb
->truesize
+= true_size
;
396 skb
->tail
+= hdr_len
;
398 memcpy(skb
->data
, mac_hdr
, hdr_len
);
400 skb_frags
= skb_shinfo(skb
)->frags
;
401 while (data_len
> 0) {
403 data_len
-= frags
->size
;
406 skb_shinfo(skb
)->nr_frags
++;
409 skb_shinfo(skb
)->frags
[0].page_offset
+= hdr_len
;
410 skb_shinfo(skb
)->frags
[0].size
-= hdr_len
;
412 skb
->ip_summed
= ip_summed
;
414 skb
->protocol
= eth_type_trans(skb
, lro_mgr
->dev
);
418 static struct sk_buff
*__lro_proc_segment(struct net_lro_mgr
*lro_mgr
,
419 struct skb_frag_struct
*frags
,
420 int len
, int true_size
,
421 void *priv
, __wsum sum
)
423 struct net_lro_desc
*lro_desc
;
430 int hdr_len
= LRO_MAX_PG_HLEN
;
431 int vlan_hdr_len
= 0;
433 if (!lro_mgr
->get_frag_header
||
434 lro_mgr
->get_frag_header(frags
, (void *)&mac_hdr
, (void *)&iph
,
435 (void *)&tcph
, &flags
, priv
)) {
436 mac_hdr
= page_address(frags
->page
) + frags
->page_offset
;
440 if (!(flags
& LRO_IPV4
) || !(flags
& LRO_TCP
))
443 hdr_len
= (int)((void *)(tcph
) + TCP_HDR_LEN(tcph
) - mac_hdr
);
444 mac_hdr_len
= (int)((void *)(iph
) - mac_hdr
);
446 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
450 if (!lro_desc
->active
) { /* start new lro session */
451 if (lro_tcp_ip_check(iph
, tcph
, len
- mac_hdr_len
, NULL
))
454 skb
= lro_gen_skb(lro_mgr
, frags
, len
, true_size
, mac_hdr
,
455 hdr_len
, 0, lro_mgr
->ip_summed_aggr
);
459 if ((skb
->protocol
== htons(ETH_P_8021Q
)) &&
460 !(lro_mgr
->features
& LRO_F_EXTRACT_VLAN_ID
))
461 vlan_hdr_len
= VLAN_HLEN
;
463 iph
= (void *)(skb
->data
+ vlan_hdr_len
);
464 tcph
= (void *)((u8
*)skb
->data
+ vlan_hdr_len
467 lro_init_desc(lro_desc
, skb
, iph
, tcph
);
468 LRO_INC_STATS(lro_mgr
, aggregated
);
472 if (lro_desc
->tcp_next_seq
!= ntohl(tcph
->seq
))
475 if (lro_tcp_ip_check(iph
, tcph
, len
- mac_hdr_len
, lro_desc
))
478 lro_add_frags(lro_desc
, len
, hdr_len
, true_size
, frags
, iph
, tcph
);
479 LRO_INC_STATS(lro_mgr
, aggregated
);
481 if ((skb_shinfo(lro_desc
->parent
)->nr_frags
>= lro_mgr
->max_aggr
) ||
482 lro_desc
->parent
->len
> (0xFFFF - lro_mgr
->dev
->mtu
))
483 lro_flush(lro_mgr
, lro_desc
);
487 out2
: /* send aggregated packets to the stack */
488 lro_flush(lro_mgr
, lro_desc
);
490 out1
: /* Original packet has to be posted to the stack */
491 skb
= lro_gen_skb(lro_mgr
, frags
, len
, true_size
, mac_hdr
,
492 hdr_len
, sum
, lro_mgr
->ip_summed
);
497 void lro_receive_skb(struct net_lro_mgr
*lro_mgr
,
501 if (__lro_proc_skb(lro_mgr
, skb
, priv
)) {
502 if (lro_mgr
->features
& LRO_F_NAPI
)
503 netif_receive_skb(skb
);
508 EXPORT_SYMBOL(lro_receive_skb
);
510 void lro_receive_frags(struct net_lro_mgr
*lro_mgr
,
511 struct skb_frag_struct
*frags
,
512 int len
, int true_size
, void *priv
, __wsum sum
)
516 skb
= __lro_proc_segment(lro_mgr
, frags
, len
, true_size
, priv
, sum
);
520 if (lro_mgr
->features
& LRO_F_NAPI
)
521 netif_receive_skb(skb
);
525 EXPORT_SYMBOL(lro_receive_frags
);
527 void lro_flush_all(struct net_lro_mgr
*lro_mgr
)
530 struct net_lro_desc
*lro_desc
= lro_mgr
->lro_arr
;
532 for (i
= 0; i
< lro_mgr
->max_desc
; i
++) {
533 if (lro_desc
[i
].active
)
534 lro_flush(lro_mgr
, &lro_desc
[i
]);
537 EXPORT_SYMBOL(lro_flush_all
);
539 void lro_flush_pkt(struct net_lro_mgr
*lro_mgr
,
540 struct iphdr
*iph
, struct tcphdr
*tcph
)
542 struct net_lro_desc
*lro_desc
;
544 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
545 if (lro_desc
->active
)
546 lro_flush(lro_mgr
, lro_desc
);
548 EXPORT_SYMBOL(lro_flush_pkt
);