Linux 4.16-rc1
[cris-mirror.git] / lib / raid6 / avx512.c
blob46df7977b9718bdc4b6ca1960792bae62b8442b7
1 /* -*- linux-c -*- --------------------------------------------------------
3 * Copyright (C) 2016 Intel Corporation
5 * Author: Gayatri Kammela <gayatri.kammela@intel.com>
6 * Author: Megha Dey <megha.dey@linux.intel.com>
8 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
9 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
14 * Boston MA 02111-1307, USA; either version 2 of the License, or
15 * (at your option) any later version; incorporated herein by reference.
17 * -----------------------------------------------------------------------
21 * AVX512 implementation of RAID-6 syndrome functions
25 #ifdef CONFIG_AS_AVX512
27 #include <linux/raid/pq.h>
28 #include "x86.h"
30 static const struct raid6_avx512_constants {
31 u64 x1d[8];
32 } raid6_avx512_constants __aligned(512/8) = {
33 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
34 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
35 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
36 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
39 static int raid6_have_avx512(void)
41 return boot_cpu_has(X86_FEATURE_AVX2) &&
42 boot_cpu_has(X86_FEATURE_AVX) &&
43 boot_cpu_has(X86_FEATURE_AVX512F) &&
44 boot_cpu_has(X86_FEATURE_AVX512BW) &&
45 boot_cpu_has(X86_FEATURE_AVX512VL) &&
46 boot_cpu_has(X86_FEATURE_AVX512DQ);
49 static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
51 u8 **dptr = (u8 **)ptrs;
52 u8 *p, *q;
53 int d, z, z0;
55 z0 = disks - 3; /* Highest data disk */
56 p = dptr[z0+1]; /* XOR parity */
57 q = dptr[z0+2]; /* RS syndrome */
59 kernel_fpu_begin();
61 asm volatile("vmovdqa64 %0,%%zmm0\n\t"
62 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
64 : "m" (raid6_avx512_constants.x1d[0]));
66 for (d = 0; d < bytes; d += 64) {
67 asm volatile("prefetchnta %0\n\t"
68 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
69 "prefetchnta %1\n\t"
70 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
71 "vmovdqa64 %1,%%zmm6"
73 : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
74 for (z = z0-2; z >= 0; z--) {
75 asm volatile("prefetchnta %0\n\t"
76 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
77 "vpmovm2b %%k1,%%zmm5\n\t"
78 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
79 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
80 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
81 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
82 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
83 "vmovdqa64 %0,%%zmm6"
85 : "m" (dptr[z][d]));
87 asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
88 "vpmovm2b %%k1,%%zmm5\n\t"
89 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
90 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
91 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
92 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
93 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
94 "vmovntdq %%zmm2,%0\n\t"
95 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
96 "vmovntdq %%zmm4,%1\n\t"
97 "vpxorq %%zmm4,%%zmm4,%%zmm4"
99 : "m" (p[d]), "m" (q[d]));
102 asm volatile("sfence" : : : "memory");
103 kernel_fpu_end();
106 static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
107 size_t bytes, void **ptrs)
109 u8 **dptr = (u8 **)ptrs;
110 u8 *p, *q;
111 int d, z, z0;
113 z0 = stop; /* P/Q right side optimization */
114 p = dptr[disks-2]; /* XOR parity */
115 q = dptr[disks-1]; /* RS syndrome */
117 kernel_fpu_begin();
119 asm volatile("vmovdqa64 %0,%%zmm0"
120 : : "m" (raid6_avx512_constants.x1d[0]));
122 for (d = 0 ; d < bytes ; d += 64) {
123 asm volatile("vmovdqa64 %0,%%zmm4\n\t"
124 "vmovdqa64 %1,%%zmm2\n\t"
125 "vpxorq %%zmm4,%%zmm2,%%zmm2"
127 : "m" (dptr[z0][d]), "m" (p[d]));
128 /* P/Q data pages */
129 for (z = z0-1 ; z >= start ; z--) {
130 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
131 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
132 "vpmovm2b %%k1,%%zmm5\n\t"
133 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
134 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
135 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
136 "vmovdqa64 %0,%%zmm5\n\t"
137 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
138 "vpxorq %%zmm5,%%zmm4,%%zmm4"
140 : "m" (dptr[z][d]));
142 /* P/Q left side optimization */
143 for (z = start-1 ; z >= 0 ; z--) {
144 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
145 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
146 "vpmovm2b %%k1,%%zmm5\n\t"
147 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
148 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
149 "vpxorq %%zmm5,%%zmm4,%%zmm4"
151 : );
153 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
154 /* Don't use movntdq for r/w memory area < cache line */
155 "vmovdqa64 %%zmm4,%0\n\t"
156 "vmovdqa64 %%zmm2,%1"
158 : "m" (q[d]), "m" (p[d]));
161 asm volatile("sfence" : : : "memory");
162 kernel_fpu_end();
165 const struct raid6_calls raid6_avx512x1 = {
166 raid6_avx5121_gen_syndrome,
167 raid6_avx5121_xor_syndrome,
168 raid6_have_avx512,
169 "avx512x1",
170 1 /* Has cache hints */
174 * Unrolled-by-2 AVX512 implementation
176 static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
178 u8 **dptr = (u8 **)ptrs;
179 u8 *p, *q;
180 int d, z, z0;
182 z0 = disks - 3; /* Highest data disk */
183 p = dptr[z0+1]; /* XOR parity */
184 q = dptr[z0+2]; /* RS syndrome */
186 kernel_fpu_begin();
188 asm volatile("vmovdqa64 %0,%%zmm0\n\t"
189 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
191 : "m" (raid6_avx512_constants.x1d[0]));
193 /* We uniformly assume a single prefetch covers at least 64 bytes */
194 for (d = 0; d < bytes; d += 128) {
195 asm volatile("prefetchnta %0\n\t"
196 "prefetchnta %1\n\t"
197 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
198 "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */
199 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
200 "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */
202 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
203 for (z = z0-1; z >= 0; z--) {
204 asm volatile("prefetchnta %0\n\t"
205 "prefetchnta %1\n\t"
206 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
207 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
208 "vpmovm2b %%k1,%%zmm5\n\t"
209 "vpmovm2b %%k2,%%zmm7\n\t"
210 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
211 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
212 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
213 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
214 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
215 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
216 "vmovdqa64 %0,%%zmm5\n\t"
217 "vmovdqa64 %1,%%zmm7\n\t"
218 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
219 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
220 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
221 "vpxorq %%zmm7,%%zmm6,%%zmm6"
223 : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
225 asm volatile("vmovntdq %%zmm2,%0\n\t"
226 "vmovntdq %%zmm3,%1\n\t"
227 "vmovntdq %%zmm4,%2\n\t"
228 "vmovntdq %%zmm6,%3"
230 : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
231 "m" (q[d+64]));
234 asm volatile("sfence" : : : "memory");
235 kernel_fpu_end();
238 static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
239 size_t bytes, void **ptrs)
241 u8 **dptr = (u8 **)ptrs;
242 u8 *p, *q;
243 int d, z, z0;
245 z0 = stop; /* P/Q right side optimization */
246 p = dptr[disks-2]; /* XOR parity */
247 q = dptr[disks-1]; /* RS syndrome */
249 kernel_fpu_begin();
251 asm volatile("vmovdqa64 %0,%%zmm0"
252 : : "m" (raid6_avx512_constants.x1d[0]));
254 for (d = 0 ; d < bytes ; d += 128) {
255 asm volatile("vmovdqa64 %0,%%zmm4\n\t"
256 "vmovdqa64 %1,%%zmm6\n\t"
257 "vmovdqa64 %2,%%zmm2\n\t"
258 "vmovdqa64 %3,%%zmm3\n\t"
259 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
260 "vpxorq %%zmm6,%%zmm3,%%zmm3"
262 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
263 "m" (p[d]), "m" (p[d+64]));
264 /* P/Q data pages */
265 for (z = z0-1 ; z >= start ; z--) {
266 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
267 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
268 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
269 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
270 "vpmovm2b %%k1,%%zmm5\n\t"
271 "vpmovm2b %%k2,%%zmm7\n\t"
272 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
273 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
274 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
275 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
276 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
277 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
278 "vmovdqa64 %0,%%zmm5\n\t"
279 "vmovdqa64 %1,%%zmm7\n\t"
280 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
281 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
282 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
283 "vpxorq %%zmm7,%%zmm6,%%zmm6"
285 : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
287 /* P/Q left side optimization */
288 for (z = start-1 ; z >= 0 ; z--) {
289 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
290 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
291 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
292 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
293 "vpmovm2b %%k1,%%zmm5\n\t"
294 "vpmovm2b %%k2,%%zmm7\n\t"
295 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
296 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
297 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
298 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
299 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
300 "vpxorq %%zmm7,%%zmm6,%%zmm6"
302 : );
304 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
305 "vpxorq %1,%%zmm6,%%zmm6\n\t"
306 /* Don't use movntdq for r/w
307 * memory area < cache line
309 "vmovdqa64 %%zmm4,%0\n\t"
310 "vmovdqa64 %%zmm6,%1\n\t"
311 "vmovdqa64 %%zmm2,%2\n\t"
312 "vmovdqa64 %%zmm3,%3"
314 : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
315 "m" (p[d+64]));
318 asm volatile("sfence" : : : "memory");
319 kernel_fpu_end();
322 const struct raid6_calls raid6_avx512x2 = {
323 raid6_avx5122_gen_syndrome,
324 raid6_avx5122_xor_syndrome,
325 raid6_have_avx512,
326 "avx512x2",
327 1 /* Has cache hints */
330 #ifdef CONFIG_X86_64
333 * Unrolled-by-4 AVX512 implementation
335 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
337 u8 **dptr = (u8 **)ptrs;
338 u8 *p, *q;
339 int d, z, z0;
341 z0 = disks - 3; /* Highest data disk */
342 p = dptr[z0+1]; /* XOR parity */
343 q = dptr[z0+2]; /* RS syndrome */
345 kernel_fpu_begin();
347 asm volatile("vmovdqa64 %0,%%zmm0\n\t"
348 "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */
349 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */
350 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */
351 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */
352 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */
353 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
354 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
355 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
356 "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */
358 : "m" (raid6_avx512_constants.x1d[0]));
360 for (d = 0; d < bytes; d += 256) {
361 for (z = z0; z >= 0; z--) {
362 asm volatile("prefetchnta %0\n\t"
363 "prefetchnta %1\n\t"
364 "prefetchnta %2\n\t"
365 "prefetchnta %3\n\t"
366 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
367 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
368 "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
369 "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
370 "vpmovm2b %%k1,%%zmm5\n\t"
371 "vpmovm2b %%k2,%%zmm7\n\t"
372 "vpmovm2b %%k3,%%zmm13\n\t"
373 "vpmovm2b %%k4,%%zmm15\n\t"
374 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
375 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
376 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
377 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
378 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
379 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
380 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
381 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
382 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
383 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
384 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
385 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
386 "vmovdqa64 %0,%%zmm5\n\t"
387 "vmovdqa64 %1,%%zmm7\n\t"
388 "vmovdqa64 %2,%%zmm13\n\t"
389 "vmovdqa64 %3,%%zmm15\n\t"
390 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
391 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
392 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
393 "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
394 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
395 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
396 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
397 "vpxorq %%zmm15,%%zmm14,%%zmm14"
399 : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
400 "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
402 asm volatile("vmovntdq %%zmm2,%0\n\t"
403 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
404 "vmovntdq %%zmm3,%1\n\t"
405 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
406 "vmovntdq %%zmm10,%2\n\t"
407 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
408 "vmovntdq %%zmm11,%3\n\t"
409 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
410 "vmovntdq %%zmm4,%4\n\t"
411 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
412 "vmovntdq %%zmm6,%5\n\t"
413 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
414 "vmovntdq %%zmm12,%6\n\t"
415 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
416 "vmovntdq %%zmm14,%7\n\t"
417 "vpxorq %%zmm14,%%zmm14,%%zmm14"
419 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
420 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
421 "m" (q[d+128]), "m" (q[d+192]));
424 asm volatile("sfence" : : : "memory");
425 kernel_fpu_end();
428 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
429 size_t bytes, void **ptrs)
431 u8 **dptr = (u8 **)ptrs;
432 u8 *p, *q;
433 int d, z, z0;
435 z0 = stop; /* P/Q right side optimization */
436 p = dptr[disks-2]; /* XOR parity */
437 q = dptr[disks-1]; /* RS syndrome */
439 kernel_fpu_begin();
441 asm volatile("vmovdqa64 %0,%%zmm0"
442 :: "m" (raid6_avx512_constants.x1d[0]));
444 for (d = 0 ; d < bytes ; d += 256) {
445 asm volatile("vmovdqa64 %0,%%zmm4\n\t"
446 "vmovdqa64 %1,%%zmm6\n\t"
447 "vmovdqa64 %2,%%zmm12\n\t"
448 "vmovdqa64 %3,%%zmm14\n\t"
449 "vmovdqa64 %4,%%zmm2\n\t"
450 "vmovdqa64 %5,%%zmm3\n\t"
451 "vmovdqa64 %6,%%zmm10\n\t"
452 "vmovdqa64 %7,%%zmm11\n\t"
453 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
454 "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
455 "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
456 "vpxorq %%zmm14,%%zmm11,%%zmm11"
458 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
459 "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
460 "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
461 "m" (p[d+192]));
462 /* P/Q data pages */
463 for (z = z0-1 ; z >= start ; z--) {
464 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
465 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
466 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
467 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
468 "prefetchnta %0\n\t"
469 "prefetchnta %2\n\t"
470 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
471 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
472 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
473 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
474 "vpmovm2b %%k1,%%zmm5\n\t"
475 "vpmovm2b %%k2,%%zmm7\n\t"
476 "vpmovm2b %%k3,%%zmm13\n\t"
477 "vpmovm2b %%k4,%%zmm15\n\t"
478 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
479 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
480 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
481 "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
482 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
483 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
484 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
485 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
486 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
487 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
488 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
489 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
490 "vmovdqa64 %0,%%zmm5\n\t"
491 "vmovdqa64 %1,%%zmm7\n\t"
492 "vmovdqa64 %2,%%zmm13\n\t"
493 "vmovdqa64 %3,%%zmm15\n\t"
494 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
495 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
496 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
497 "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
498 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
499 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
500 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
501 "vpxorq %%zmm15,%%zmm14,%%zmm14"
503 : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
504 "m" (dptr[z][d+128]),
505 "m" (dptr[z][d+192]));
507 asm volatile("prefetchnta %0\n\t"
508 "prefetchnta %1\n\t"
510 : "m" (q[d]), "m" (q[d+128]));
511 /* P/Q left side optimization */
512 for (z = start-1 ; z >= 0 ; z--) {
513 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
514 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
515 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
516 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
517 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
518 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
519 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
520 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
521 "vpmovm2b %%k1,%%zmm5\n\t"
522 "vpmovm2b %%k2,%%zmm7\n\t"
523 "vpmovm2b %%k3,%%zmm13\n\t"
524 "vpmovm2b %%k4,%%zmm15\n\t"
525 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
526 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
527 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
528 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
529 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
530 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
531 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
532 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
533 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
534 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
535 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
536 "vpxorq %%zmm15,%%zmm14,%%zmm14"
538 : );
540 asm volatile("vmovntdq %%zmm2,%0\n\t"
541 "vmovntdq %%zmm3,%1\n\t"
542 "vmovntdq %%zmm10,%2\n\t"
543 "vmovntdq %%zmm11,%3\n\t"
544 "vpxorq %4,%%zmm4,%%zmm4\n\t"
545 "vpxorq %5,%%zmm6,%%zmm6\n\t"
546 "vpxorq %6,%%zmm12,%%zmm12\n\t"
547 "vpxorq %7,%%zmm14,%%zmm14\n\t"
548 "vmovntdq %%zmm4,%4\n\t"
549 "vmovntdq %%zmm6,%5\n\t"
550 "vmovntdq %%zmm12,%6\n\t"
551 "vmovntdq %%zmm14,%7"
553 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
554 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
555 "m" (q[d+128]), "m" (q[d+192]));
557 asm volatile("sfence" : : : "memory");
558 kernel_fpu_end();
560 const struct raid6_calls raid6_avx512x4 = {
561 raid6_avx5124_gen_syndrome,
562 raid6_avx5124_xor_syndrome,
563 raid6_have_avx512,
564 "avx512x4",
565 1 /* Has cache hints */
567 #endif
569 #endif /* CONFIG_AS_AVX512 */