/*
 *  linux/net/ipv4/inet_lro.c
 *
 *  Large Receive Offload (ipv4 / tcp)
 *
 *  (C) Copyright IBM Corp. 2007
 *
 *  Authors:
 *       Jan-Bernd Themann <themann@de.ibm.com>
 *       Christoph Raisch <raisch@de.ibm.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
29 #include <linux/module.h>
30 #include <linux/if_vlan.h>
31 #include <linux/inet_lro.h>
32 #include <net/checksum.h>
34 MODULE_LICENSE("GPL");
35 MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
36 MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
38 #define TCP_HDR_LEN(tcph) (tcph->doff << 2)
39 #define IP_HDR_LEN(iph) (iph->ihl << 2)
40 #define TCP_PAYLOAD_LENGTH(iph, tcph) \
41 (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
43 #define IPH_LEN_WO_OPTIONS 5
44 #define TCPH_LEN_WO_OPTIONS 5
45 #define TCPH_LEN_W_TIMESTAMP 8
47 #define LRO_MAX_PG_HLEN 64
49 #define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
52 * Basic tcp checks whether packet is suitable for LRO
55 static int lro_tcp_ip_check(const struct iphdr
*iph
, const struct tcphdr
*tcph
,
56 int len
, const struct net_lro_desc
*lro_desc
)
58 /* check ip header: don't aggregate padded frames */
59 if (ntohs(iph
->tot_len
) != len
)
62 if (TCP_PAYLOAD_LENGTH(iph
, tcph
) == 0)
65 if (iph
->ihl
!= IPH_LEN_WO_OPTIONS
)
68 if (tcph
->cwr
|| tcph
->ece
|| tcph
->urg
|| !tcph
->ack
||
69 tcph
->rst
|| tcph
->syn
|| tcph
->fin
)
72 if (INET_ECN_is_ce(ipv4_get_dsfield(iph
)))
75 if (tcph
->doff
!= TCPH_LEN_WO_OPTIONS
&&
76 tcph
->doff
!= TCPH_LEN_W_TIMESTAMP
)
79 /* check tcp options (only timestamp allowed) */
80 if (tcph
->doff
== TCPH_LEN_W_TIMESTAMP
) {
81 __be32
*topt
= (__be32
*)(tcph
+ 1);
83 if (*topt
!= htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16)
84 | (TCPOPT_TIMESTAMP
<< 8)
88 /* timestamp should be in right order */
90 if (lro_desc
&& after(ntohl(lro_desc
->tcp_rcv_tsval
),
94 /* timestamp reply should not be zero */
103 static void lro_update_tcp_ip_header(struct net_lro_desc
*lro_desc
)
105 struct iphdr
*iph
= lro_desc
->iph
;
106 struct tcphdr
*tcph
= lro_desc
->tcph
;
110 tcph
->ack_seq
= lro_desc
->tcp_ack
;
111 tcph
->window
= lro_desc
->tcp_window
;
113 if (lro_desc
->tcp_saw_tstamp
) {
114 p
= (__be32
*)(tcph
+ 1);
115 *(p
+2) = lro_desc
->tcp_rcv_tsecr
;
118 csum_replace2(&iph
->check
, iph
->tot_len
, htons(lro_desc
->ip_tot_len
));
119 iph
->tot_len
= htons(lro_desc
->ip_tot_len
);
122 tcp_hdr_csum
= csum_partial(tcph
, TCP_HDR_LEN(tcph
), 0);
123 lro_desc
->data_csum
= csum_add(lro_desc
->data_csum
, tcp_hdr_csum
);
124 tcph
->check
= csum_tcpudp_magic(iph
->saddr
, iph
->daddr
,
125 lro_desc
->ip_tot_len
-
126 IP_HDR_LEN(iph
), IPPROTO_TCP
,
127 lro_desc
->data_csum
);
130 static __wsum
lro_tcp_data_csum(struct iphdr
*iph
, struct tcphdr
*tcph
, int len
)
134 __wsum tcp_ps_hdr_csum
;
136 tcp_csum
= ~csum_unfold(tcph
->check
);
137 tcp_hdr_csum
= csum_partial(tcph
, TCP_HDR_LEN(tcph
), tcp_csum
);
139 tcp_ps_hdr_csum
= csum_tcpudp_nofold(iph
->saddr
, iph
->daddr
,
140 len
+ TCP_HDR_LEN(tcph
),
143 return csum_sub(csum_sub(tcp_csum
, tcp_hdr_csum
),
147 static void lro_init_desc(struct net_lro_desc
*lro_desc
, struct sk_buff
*skb
,
148 struct iphdr
*iph
, struct tcphdr
*tcph
)
152 u32 tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
154 nr_frags
= skb_shinfo(skb
)->nr_frags
;
155 lro_desc
->parent
= skb
;
156 lro_desc
->next_frag
= &(skb_shinfo(skb
)->frags
[nr_frags
]);
158 lro_desc
->tcph
= tcph
;
159 lro_desc
->tcp_next_seq
= ntohl(tcph
->seq
) + tcp_data_len
;
160 lro_desc
->tcp_ack
= tcph
->ack_seq
;
161 lro_desc
->tcp_window
= tcph
->window
;
163 lro_desc
->pkt_aggr_cnt
= 1;
164 lro_desc
->ip_tot_len
= ntohs(iph
->tot_len
);
166 if (tcph
->doff
== 8) {
167 ptr
= (__be32
*)(tcph
+1);
168 lro_desc
->tcp_saw_tstamp
= 1;
169 lro_desc
->tcp_rcv_tsval
= *(ptr
+1);
170 lro_desc
->tcp_rcv_tsecr
= *(ptr
+2);
173 lro_desc
->mss
= tcp_data_len
;
174 lro_desc
->active
= 1;
176 lro_desc
->data_csum
= lro_tcp_data_csum(iph
, tcph
,
180 static inline void lro_clear_desc(struct net_lro_desc
*lro_desc
)
182 memset(lro_desc
, 0, sizeof(struct net_lro_desc
));
185 static void lro_add_common(struct net_lro_desc
*lro_desc
, struct iphdr
*iph
,
186 struct tcphdr
*tcph
, int tcp_data_len
)
188 struct sk_buff
*parent
= lro_desc
->parent
;
191 lro_desc
->pkt_aggr_cnt
++;
192 lro_desc
->ip_tot_len
+= tcp_data_len
;
193 lro_desc
->tcp_next_seq
+= tcp_data_len
;
194 lro_desc
->tcp_window
= tcph
->window
;
195 lro_desc
->tcp_ack
= tcph
->ack_seq
;
197 /* don't update tcp_rcv_tsval, would not work with PAWS */
198 if (lro_desc
->tcp_saw_tstamp
) {
199 topt
= (__be32
*) (tcph
+ 1);
200 lro_desc
->tcp_rcv_tsecr
= *(topt
+ 2);
203 lro_desc
->data_csum
= csum_block_add(lro_desc
->data_csum
,
204 lro_tcp_data_csum(iph
, tcph
,
208 parent
->len
+= tcp_data_len
;
209 parent
->data_len
+= tcp_data_len
;
210 if (tcp_data_len
> lro_desc
->mss
)
211 lro_desc
->mss
= tcp_data_len
;
214 static void lro_add_packet(struct net_lro_desc
*lro_desc
, struct sk_buff
*skb
,
215 struct iphdr
*iph
, struct tcphdr
*tcph
)
217 struct sk_buff
*parent
= lro_desc
->parent
;
218 int tcp_data_len
= TCP_PAYLOAD_LENGTH(iph
, tcph
);
220 lro_add_common(lro_desc
, iph
, tcph
, tcp_data_len
);
222 skb_pull(skb
, (skb
->len
- tcp_data_len
));
223 parent
->truesize
+= skb
->truesize
;
225 if (lro_desc
->last_skb
)
226 lro_desc
->last_skb
->next
= skb
;
228 skb_shinfo(parent
)->frag_list
= skb
;
230 lro_desc
->last_skb
= skb
;
234 static int lro_check_tcp_conn(struct net_lro_desc
*lro_desc
,
238 if ((lro_desc
->iph
->saddr
!= iph
->saddr
) ||
239 (lro_desc
->iph
->daddr
!= iph
->daddr
) ||
240 (lro_desc
->tcph
->source
!= tcph
->source
) ||
241 (lro_desc
->tcph
->dest
!= tcph
->dest
))
246 static struct net_lro_desc
*lro_get_desc(struct net_lro_mgr
*lro_mgr
,
247 struct net_lro_desc
*lro_arr
,
251 struct net_lro_desc
*lro_desc
= NULL
;
252 struct net_lro_desc
*tmp
;
253 int max_desc
= lro_mgr
->max_desc
;
256 for (i
= 0; i
< max_desc
; i
++) {
259 if (!lro_check_tcp_conn(tmp
, iph
, tcph
)) {
265 for (i
= 0; i
< max_desc
; i
++) {
266 if (!lro_arr
[i
].active
) {
267 lro_desc
= &lro_arr
[i
];
272 LRO_INC_STATS(lro_mgr
, no_desc
);
277 static void lro_flush(struct net_lro_mgr
*lro_mgr
,
278 struct net_lro_desc
*lro_desc
)
280 if (lro_desc
->pkt_aggr_cnt
> 1)
281 lro_update_tcp_ip_header(lro_desc
);
283 skb_shinfo(lro_desc
->parent
)->gso_size
= lro_desc
->mss
;
285 if (lro_mgr
->features
& LRO_F_NAPI
)
286 netif_receive_skb(lro_desc
->parent
);
288 netif_rx(lro_desc
->parent
);
290 LRO_INC_STATS(lro_mgr
, flushed
);
291 lro_clear_desc(lro_desc
);
294 static int __lro_proc_skb(struct net_lro_mgr
*lro_mgr
, struct sk_buff
*skb
,
297 struct net_lro_desc
*lro_desc
;
301 int vlan_hdr_len
= 0;
303 if (!lro_mgr
->get_skb_header
||
304 lro_mgr
->get_skb_header(skb
, (void *)&iph
, (void *)&tcph
,
308 if (!(flags
& LRO_IPV4
) || !(flags
& LRO_TCP
))
311 lro_desc
= lro_get_desc(lro_mgr
, lro_mgr
->lro_arr
, iph
, tcph
);
315 if ((skb
->protocol
== htons(ETH_P_8021Q
)) &&
316 !(lro_mgr
->features
& LRO_F_EXTRACT_VLAN_ID
))
317 vlan_hdr_len
= VLAN_HLEN
;
319 if (!lro_desc
->active
) { /* start new lro session */
320 if (lro_tcp_ip_check(iph
, tcph
, skb
->len
- vlan_hdr_len
, NULL
))
323 skb
->ip_summed
= lro_mgr
->ip_summed_aggr
;
324 lro_init_desc(lro_desc
, skb
, iph
, tcph
);
325 LRO_INC_STATS(lro_mgr
, aggregated
);
329 if (lro_desc
->tcp_next_seq
!= ntohl(tcph
->seq
))
332 if (lro_tcp_ip_check(iph
, tcph
, skb
->len
, lro_desc
))
335 lro_add_packet(lro_desc
, skb
, iph
, tcph
);
336 LRO_INC_STATS(lro_mgr
, aggregated
);
338 if ((lro_desc
->pkt_aggr_cnt
>= lro_mgr
->max_aggr
) ||
339 lro_desc
->parent
->len
> (0xFFFF - lro_mgr
->dev
->mtu
))
340 lro_flush(lro_mgr
, lro_desc
);
344 out2
: /* send aggregated SKBs to stack */
345 lro_flush(lro_mgr
, lro_desc
);
351 void lro_receive_skb(struct net_lro_mgr
*lro_mgr
,
355 if (__lro_proc_skb(lro_mgr
, skb
, priv
)) {
356 if (lro_mgr
->features
& LRO_F_NAPI
)
357 netif_receive_skb(skb
);
362 EXPORT_SYMBOL(lro_receive_skb
);
364 void lro_flush_all(struct net_lro_mgr
*lro_mgr
)
367 struct net_lro_desc
*lro_desc
= lro_mgr
->lro_arr
;
369 for (i
= 0; i
< lro_mgr
->max_desc
; i
++) {
370 if (lro_desc
[i
].active
)
371 lro_flush(lro_mgr
, &lro_desc
[i
]);
374 EXPORT_SYMBOL(lro_flush_all
);