/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int cpu_in_cksum(struct mbuf *, int, int, uint32_t);
#endif
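
/*
 * cpu_in_cksum() sums len bytes of the mbuf chain m, starting off bytes
 * into the chain, and folds initial_sum (typically zero or a precomputed
 * pseudo-header sum) into the result.  The one's complement checksum is
 * returned in the low 16 bits.
 */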

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32bit architectures is using
 * a 32bit accumulator and operating on 16bit operands.
 *
 * The default implementation for 64bit architectures is using
 * a 64bit accumulator and operating on 32bit operands.
 *
 * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
 * of the inner loop. After each iteration of the inner loop, a partial
 * reduction is done to avoid carry in long packets.
 */
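
/*
 * As an illustration of the reduction used below: in one's complement
 * arithmetic a carry out of the low 16 bits is equivalent to adding 1
 * back in at bit 0, so the accumulator can be folded with shifts and
 * adds.  For example, with a 32bit accumulator:
 *
 *	sum = 0x0003fffe;
 *	sum = (sum >> 16) + (sum & 0xffff);	now 0x3 + 0xfffe = 0x10001
 *	sum = (sum >> 16) + (sum & 0xffff);	now 0x1 + 0x0001 = 0x00002
 *	~sum & 0xffff				yields 0xfffd
 */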

#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
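		/*
		 * The unrolled inner loop below consumes the buffer in
		 * aligned 16bit loads, 32 bytes per iteration, prefetching
		 * the next chunk ahead of use.
		 */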
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
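			/*
			 * Once the top bits of partial fill up, apply the
			 * deferred byte swap and fold it into sum so the
			 * next round of 16bit additions cannot overflow.
			 */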
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
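		/*
		 * A trailing odd byte makes the next fragment start in the
		 * middle of a 16bit word; track the parity so that
		 * fragment's partial sum can be byte swapped before it is
		 * merged.
		 */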
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
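	/*
	 * Fold the accumulated 32bit sum to 16 bits and return its
	 * one's complement.
	 */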
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = initial_sum;

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
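		/*
		 * Align to a 32bit boundary so the unrolled loop can use
		 * aligned 32bit loads; with fewer than two bytes left,
		 * skip ahead to the trailing byte handling.
		 */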
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
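			/*
			 * As in the 32bit variant, fold partial into sum
			 * with the deferred byte swap before further 32bit
			 * additions could overflow the accumulator.
			 */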
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
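		/*
		 * trailing_bytes is also the jump target used above when
		 * fewer than two bytes were available for the 32bit
		 * alignment step.
		 */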
 trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
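	/*
	 * Fold the 64bit accumulator down to 16 bits and return its
	 * one's complement.
	 */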
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif