/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

#define LD(x, y)	" movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y)	" movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x, y)	" pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x, y)	" pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x, y)	" pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y)	" pxor 8*("#x")(%5), %%mm"#y" ;\n"

#include <asm/fpu/api.h>
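
/*
 * In the asm bodies below, %1..%5 are the block pointers p1..p5 and
 * %%mm0..%%mm3 hold four 64-bit words of the destination block p1.
 * LD/ST move quadwords between p1 and the MMX registers, while
 * XO1..XO4 xor in the matching quadword of p2..p5; LD(0, 0), for
 * example, expands to "movq 8*(0)(%1), %%mm0".  Each BLOCK(i) covers
 * four quadwords (32 bytes), so the four BLOCK()s per loop iteration
 * handle 128 bytes, which is why the loop count is bytes >> 7.
 */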
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	ST(i, 0) \
	XO1(i + 1, 1) \
	ST(i + 1, 1) \
	XO1(i + 2, 2) \
	ST(i + 2, 2) \
	XO1(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	ST(i, 0) \
	XO2(i + 1, 1) \
	ST(i + 1, 1) \
	XO2(i + 2, 2) \
	ST(i + 2, 2) \
	XO2(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	ST(i, 0) \
	XO3(i + 1, 1) \
	ST(i + 1, 1) \
	XO3(i + 2, 2) \
	ST(i + 2, 2) \
	XO3(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
	LD(i, 0) \
	LD(i + 1, 1) \
	LD(i + 2, 2) \
	LD(i + 3, 3) \
	XO1(i, 0) \
	XO1(i + 1, 1) \
	XO1(i + 2, 2) \
	XO1(i + 3, 3) \
	XO2(i, 0) \
	XO2(i + 1, 1) \
	XO2(i + 2, 2) \
	XO2(i + 3, 3) \
	XO3(i, 0) \
	XO3(i + 1, 1) \
	XO3(i + 2, 2) \
	XO3(i + 3, 3) \
	XO4(i, 0) \
	ST(i, 0) \
	XO4(i + 1, 1) \
	ST(i + 1, 1) \
	XO4(i + 2, 2) \
	ST(i + 2, 2) \
	XO4(i + 3, 3) \
	ST(i + 3, 3)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" addl $128, %1 ;\n"
	" addl $128, %2 ;\n"
	" addl $128, %3 ;\n"
	" addl $128, %4 ;\n"
	" addl $128, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
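
/*
 * The xor_p5_mmx_* variants below do the same work without the BLOCK
 * macros: loads, pxors and stores are interleaved by hand and each loop
 * iteration covers 64 bytes (hence bytes >> 6 instead of bytes >> 7).
 * The "p5"/"pII" names presumably refer to the Pentium and Pentium II
 * pipelines the two instruction orderings were originally tuned for.
 */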
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};
static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};
static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
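
/*
 * xor_sse_2..xor_sse_5 and xor_block_sse_pf64 are not defined in this
 * file; on 32-bit builds they are presumably supplied by <asm/xor.h>,
 * which defines the shared SSE routines and then includes this header.
 */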
/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
	AVX_XOR_SPEED; \
	if (boot_cpu_has(X86_FEATURE_XMM)) { \
		xor_speed(&xor_block_pIII_sse); \
		xor_speed(&xor_block_sse_pf64); \
	} else if (boot_cpu_has(X86_FEATURE_MMX)) { \
		xor_speed(&xor_block_pII_mmx); \
		xor_speed(&xor_block_p5_mmx); \
	} else { \
		xor_speed(&xor_block_8regs); \
		xor_speed(&xor_block_8regs_p); \
		xor_speed(&xor_block_32regs); \
		xor_speed(&xor_block_32regs_p); \
	} \
} while (0)
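
/*
 * Rough usage sketch (an illustration of the callers, not part of this
 * header): the template picked via XOR_TRY_TEMPLATES/xor_speed() is
 * consumed through xor_blocks() in crypto/xor.c, along the lines of:
 *
 *	void *srcs[2] = { src0, src1 };       // two hypothetical source blocks
 *	xor_blocks(2, PAGE_SIZE, dest, srcs); // xor both sources into dest
 *
 * which dispatches to the fastest template's do_3() hook, e.g.
 * xor_pII_mmx_3(PAGE_SIZE, dest, src0, src1) on an MMX-only CPU.
 */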
#endif /* _ASM_X86_XOR_32_H */