#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* force an immediate operand for the stride to reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
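
/*
 * Building blocks for the unrolled XOR loops below.  Each macro expands to
 * one assembler instruction operating on a 16-byte line of the 256-byte
 * chunk processed per loop iteration:
 *
 *   PF0(x)..PF4(x)     - prefetchnta of line x, 256 bytes ahead, in p1..p5
 *   LD(x, y)           - load line x of p1 into %xmm<y>
 *   XO1(x, y)..XO4(x, y) - XOR line x of p2..p5 into %xmm<y>
 *   ST(x, y)           - store %xmm<y> back to line x of p1
 *   NOP(x)             - expands to nothing; fills the prefetch slot of
 *                        BLK64 when no prefetch is wanted
 */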
#define OFFS(x)     "16*("#x")"
#define PF_OFFS(x)  "256+16*("#x")"
#define PF0(x)      " prefetchnta "PF_OFFS(x)"(%[p1])  ;\n"
#define LD(x, y)    " movaps "OFFS(x)"(%[p1]), %%xmm"#y"  ;\n"
#define ST(x, y)    " movaps %%xmm"#y", "OFFS(x)"(%[p1])  ;\n"
#define PF1(x)      " prefetchnta "PF_OFFS(x)"(%[p2])  ;\n"
#define PF2(x)      " prefetchnta "PF_OFFS(x)"(%[p3])  ;\n"
#define PF3(x)      " prefetchnta "PF_OFFS(x)"(%[p4])  ;\n"
#define PF4(x)      " prefetchnta "PF_OFFS(x)"(%[p5])  ;\n"
#define XO1(x, y)   " xorps "OFFS(x)"(%[p2]), %%xmm"#y"  ;\n"
#define XO2(x, y)   " xorps "OFFS(x)"(%[p3]), %%xmm"#y"  ;\n"
#define XO3(x, y)   " xorps "OFFS(x)"(%[p4]), %%xmm"#y"  ;\n"
#define XO4(x, y)   " xorps "OFFS(x)"(%[p5]), %%xmm"#y"  ;\n"
#define NOP(x)
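
/*
 * BLK64 covers one 64-byte chunk: a single prefetch (or NOP) followed by
 * the same operation on four consecutive 16-byte lines using %xmm0-%xmm3.
 * The "_pf64" functions below are built entirely from these blocks.
 */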
#define BLK64(pf, op, i)        \
        pf(i)                   \
        op(i, 0)                \
        op(i + 1, 1)            \
        op(i + 2, 2)            \
        op(i + 3, 3)
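
/*
 * p1 ^= p2 over "bytes" bytes; only whole 256-byte chunks are processed
 * (lines = bytes >> 8 loop iterations).  Each iteration runs four 64-byte
 * BLOCKs, interleaving loads, XORs and prefetchnta of the next iteration's
 * data so the streamed buffers avoid polluting the caches.  The XMM
 * registers may only be used between kernel_fpu_begin()/kernel_fpu_end().
 */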
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                \
                LD(i, 0)        \
                LD(i + 1, 1)    \
                PF1(i)          \
                PF1(i + 2)      \
                LD(i + 2, 2)    \
                LD(i + 3, 3)    \
                PF0(i + 4)      \
                PF0(i + 6)      \
                XO1(i, 0)       \
                XO1(i + 1, 1)   \
                XO1(i + 2, 2)   \
                XO1(i + 3, 3)   \
                ST(i, 0)        \
                ST(i + 1, 1)    \
                ST(i + 2, 2)    \
                ST(i + 3, 3)

        PF0(0)
        PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
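
/*
 * Same operation as xor_sse_2(), but scheduled from BLK64 blocks: each
 * 64-byte chunk gets one prefetchnta per source right before its load/XOR
 * pass instead of the hand-interleaved schedule above.  Both schedules are
 * exposed as xor templates so the generic code can pick the faster one for
 * the machine at hand.
 */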
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(NOP, ST, i)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
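
/*
 * The 3-, 4- and 5-source versions below repeat the same two patterns,
 * folding each extra source in with XO2/XO3/XO4 (and prefetching it with
 * PF2/PF3/PF4) before the result is written back to p1.
 */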
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                \
                PF1(i)          \
                PF1(i + 2)      \
                LD(i, 0)        \
                LD(i + 1, 1)    \
                LD(i + 2, 2)    \
                LD(i + 3, 3)    \
                PF2(i)          \
                PF2(i + 2)      \
                PF0(i + 4)      \
                PF0(i + 6)      \
                XO1(i, 0)       \
                XO1(i + 1, 1)   \
                XO1(i + 2, 2)   \
                XO1(i + 3, 3)   \
                XO2(i, 0)       \
                XO2(i + 1, 1)   \
                XO2(i + 2, 2)   \
                XO2(i + 3, 3)   \
                ST(i, 0)        \
                ST(i + 1, 1)    \
                ST(i + 2, 2)    \
                ST(i + 3, 3)

        PF0(0)
        PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(NOP, ST, i)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                \
                PF1(i)          \
                PF1(i + 2)      \
                LD(i, 0)        \
                LD(i + 1, 1)    \
                LD(i + 2, 2)    \
                LD(i + 3, 3)    \
                PF2(i)          \
                PF2(i + 2)      \
                XO1(i, 0)       \
                XO1(i + 1, 1)   \
                XO1(i + 2, 2)   \
                XO1(i + 3, 3)   \
                PF3(i)          \
                PF3(i + 2)      \
                PF0(i + 4)      \
                PF0(i + 6)      \
                XO2(i, 0)       \
                XO2(i + 1, 1)   \
                XO2(i + 2, 2)   \
                XO2(i + 3, 3)   \
                XO3(i, 0)       \
                XO3(i + 1, 1)   \
                XO3(i + 2, 2)   \
                XO3(i + 3, 3)   \
                ST(i, 0)        \
                ST(i + 1, 1)    \
                ST(i + 2, 2)    \
                ST(i + 3, 3)

        PF0(0)
        PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(NOP, ST, i)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                \
                PF1(i)          \
                PF1(i + 2)      \
                LD(i, 0)        \
                LD(i + 1, 1)    \
                LD(i + 2, 2)    \
                LD(i + 3, 3)    \
                PF2(i)          \
                PF2(i + 2)      \
                XO1(i, 0)       \
                XO1(i + 1, 1)   \
                XO1(i + 2, 2)   \
                XO1(i + 3, 3)   \
                PF3(i)          \
                PF3(i + 2)      \
                XO2(i, 0)       \
                XO2(i + 1, 1)   \
                XO2(i + 2, 2)   \
                XO2(i + 3, 3)   \
                PF4(i)          \
                PF4(i + 2)      \
                PF0(i + 4)      \
                PF0(i + 6)      \
                XO3(i, 0)       \
                XO3(i + 1, 1)   \
                XO3(i + 2, 2)   \
                XO3(i + 3, 3)   \
                XO4(i, 0)       \
                XO4(i + 1, 1)   \
                XO4(i + 2, 2)   \
                XO4(i + 3, 3)   \
                ST(i, 0)        \
                ST(i + 1, 1)    \
                ST(i + 2, 2)    \
                ST(i + 3, 3)

        PF0(0)
        PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(PF4, XO4, i)      \
                BLK64(NOP, ST, i)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
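
/*
 * Template handed to the generic xor code for the BLK64-scheduled functions
 * above; the plain xor_sse_* functions are referenced by the xor_block_sse /
 * xor_block_pIII_sse templates in asm/xor_64.h and asm/xor_32.h.  Roughly
 * how RAID work reaches these routines, sketched from the generic
 * xor_blocks() interface in crypto/xor.c (call chain quoted from memory,
 * not defined in this file):
 *
 *	void *src = page_address(src_page);
 *	xor_blocks(1, PAGE_SIZE, page_address(dst_page), &src);
 *	  -> active_template->do_2(PAGE_SIZE, dst, src)
 *	  -> e.g. xor_sse_2_pf64(PAGE_SIZE, dst, src)
 */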
static struct xor_block_template xor_block_sse_pf64 = {
        .name = "prefetch64-sse",
        .do_2 = xor_sse_2_pf64,
        .do_3 = xor_sse_3_pf64,
        .do_4 = xor_sse_4_pf64,
        .do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif
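
/*
 * AVX_SELECT() is provided by asm/xor_avx.h (pulled in through the
 * 32/64-bit headers above): when the CPU supports AVX it substitutes the
 * AVX xor template for the benchmark winner, otherwise FASTEST is returned
 * unchanged.
 */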
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */