/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 64-way parallel algorithm (AVX512)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

#define add_le128(out, in, lo_counter, hi_counter1) \
	vpaddq lo_counter, in, out; \
	vpcmpuq $1, lo_counter, out, %k1; \
	kaddb %k1, %k1, %k1; \
	vpaddq hi_counter1, out, out{%k1};
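
/*
 * add_le128() adds a 64-bit increment to 128-bit little-endian counters:
 * the unsigned compare (predicate 1 = "less than") flags the lanes whose
 * low qword wrapped around during the add, kaddb doubles the mask (i.e.
 * shifts each carry flag onto the neighbouring high-qword lane), and the
 * masked vpaddq applies hi_counter1 only where a carry occurred.
 * Roughly, per 128-bit lane (illustrative sketch only):
 *
 *	lo = in.lo + inc;
 *	hi = in.hi + (lo < inc ? 1 : 0);
 */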

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandq x, mask4bit, tmp0; \
	vpandnq x, mask4bit, x; \
	vpshufb tmp0, lo_t, tmp0; \

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
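
/*
 * transpose_4x4() is a standard 4x4 transpose of 32-bit elements built
 * from unpack instructions: the vpunpck{l,h}dq pair interleaves rows 0/1
 * and rows 2/3, and the vpunpck{l,h}qdq pair recombines the halves so
 * that input column i ends up in output register i.  With ZMM operands
 * this transposes four independent 4x4 blocks, one per 128-bit lane.
 */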

#define byteslice_16x16b(a0, b0, c0, d0, \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
	vpshufb a0, d3, a0; \
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
	vmovdqu64 (0 * 64)(rio), x0; \
	vmovdqu64 (1 * 64)(rio), x1; \
	vmovdqu64 (2 * 64)(rio), x2; \
	vmovdqu64 (3 * 64)(rio), x3; \
	vmovdqu64 (4 * 64)(rio), x4; \
	vmovdqu64 (5 * 64)(rio), x5; \
	vmovdqu64 (6 * 64)(rio), x6; \
	vmovdqu64 (7 * 64)(rio), x7; \
	vmovdqu64 (8 * 64)(rio), y0; \
	vmovdqu64 (9 * 64)(rio), y1; \
	vmovdqu64 (10 * 64)(rio), y2; \
	vmovdqu64 (11 * 64)(rio), y3; \
	vmovdqu64 (12 * 64)(rio), y4; \
	vmovdqu64 (13 * 64)(rio), y5; \
	vmovdqu64 (14 * 64)(rio), y6; \
	vmovdqu64 (15 * 64)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
	byteslice_16x16b(x0, x1, x2, x3, \
			 (mem_ab), (mem_cd)); \
	vmovdqu64 x0, 0 * 64(mem_ab); \
	vmovdqu64 x1, 1 * 64(mem_ab); \
	vmovdqu64 x2, 2 * 64(mem_ab); \
	vmovdqu64 x3, 3 * 64(mem_ab); \
	vmovdqu64 x4, 4 * 64(mem_ab); \
	vmovdqu64 x5, 5 * 64(mem_ab); \
	vmovdqu64 x6, 6 * 64(mem_ab); \
	vmovdqu64 x7, 7 * 64(mem_ab); \
	vmovdqu64 y0, 0 * 64(mem_cd); \
	vmovdqu64 y1, 1 * 64(mem_cd); \
	vmovdqu64 y2, 2 * 64(mem_cd); \
	vmovdqu64 y3, 3 * 64(mem_cd); \
	vmovdqu64 y4, 4 * 64(mem_cd); \
	vmovdqu64 y5, 5 * 64(mem_cd); \
	vmovdqu64 y6, 6 * 64(mem_cd); \
	vmovdqu64 y7, 7 * 64(mem_cd);
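
/*
 * Data layout note: inpack16_pre() loads 64 consecutive 16-byte blocks
 * (1 KiB) into sixteen ZMM registers, and inpack16_post() byte-slices
 * them so that register n holds byte n of all 64 blocks.  Every round
 * operation below therefore processes one state byte position across
 * all 64 blocks at once; debyteslice_16x16b() reverses the transform
 * before the results are written back out.
 */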

#define write_output(x0, x1, x2, x3, \
	vmovdqu64 x0, 0 * 64(mem); \
	vmovdqu64 x1, 1 * 64(mem); \
	vmovdqu64 x2, 2 * 64(mem); \
	vmovdqu64 x3, 3 * 64(mem); \
	vmovdqu64 x4, 4 * 64(mem); \
	vmovdqu64 x5, 5 * 64(mem); \
	vmovdqu64 x6, 6 * 64(mem); \
	vmovdqu64 x7, 7 * 64(mem); \
	vmovdqu64 y0, 8 * 64(mem); \
	vmovdqu64 y1, 9 * 64(mem); \
	vmovdqu64 y2, 10 * 64(mem); \
	vmovdqu64 y3, 11 * 64(mem); \
	vmovdqu64 y4, 12 * 64(mem); \
	vmovdqu64 y5, 13 * 64(mem); \
	vmovdqu64 y6, 14 * 64(mem); \
	vmovdqu64 y7, 15 * 64(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
	vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
	vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
	vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
	vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
	vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
	vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
	vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
	vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
	vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
	vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
	vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
	vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
	vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
	vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
	vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
	vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
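
/*
 * aria_ark_16way(): AddRoundKey for the byte-sliced state.  Each of the
 * 16 round-key bytes of 'round' is broadcast from (rk) into a full ZMM
 * register with vpbroadcastb and then XORed into the register holding
 * the corresponding byte position of all 64 blocks (the XORs are
 * interleaved with the broadcasts in the macro body).
 */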

#define aria_ark_16way(x0, x1, x2, x3, \
	vpbroadcastb ((round * 16) + 3)(rk), t0; \
	vpbroadcastb ((round * 16) + 2)(rk), t0; \
	vpbroadcastb ((round * 16) + 1)(rk), t0; \
	vpbroadcastb ((round * 16) + 0)(rk), t0; \
	vpbroadcastb ((round * 16) + 7)(rk), t0; \
	vpbroadcastb ((round * 16) + 6)(rk), t0; \
	vpbroadcastb ((round * 16) + 5)(rk), t0; \
	vpbroadcastb ((round * 16) + 4)(rk), t0; \
	vpbroadcastb ((round * 16) + 11)(rk), t0; \
	vpbroadcastb ((round * 16) + 10)(rk), t0; \
	vpbroadcastb ((round * 16) + 9)(rk), t0; \
	vpbroadcastb ((round * 16) + 8)(rk), t0; \
	vpbroadcastb ((round * 16) + 15)(rk), t0; \
	vpbroadcastb ((round * 16) + 14)(rk), t0; \
	vpbroadcastb ((round * 16) + 13)(rk), t0; \
	vpbroadcastb ((round * 16) + 12)(rk), t0; \

#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7;
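
/*
 * The S-box layer is computed entirely with GFNI: vgf2p8affineinvqb
 * performs an inversion in GF(2^8) (AES polynomial) followed by an 8x8
 * GF(2) affine transform, while vgf2p8affineqb applies the affine
 * transform alone.  Combining the bit matrices and constants defined in
 * .rodata below (plus the identity matrix for a bare field inversion)
 * yields ARIA's S-boxes S1, S2 and their inverses without table lookups;
 * each byte-sliced register gets the S-box its byte position requires.
 */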

#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7; \
	vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
	vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
	vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
	vgf2p8affineinvqb $0, t2, y2, y2; \
	vgf2p8affineinvqb $0, t2, y6, y6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
	vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
	vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
	vgf2p8affineinvqb $0, t2, y3, y3; \
	vgf2p8affineinvqb $0, t2, y7, y7;
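
/*
 * aria_diff_m() realises the word-level diffusion of the reference code,
 *
 *	T = rotr32(X, 8);
 *	X = T ^ rotr32(X, 16);
 *
 * in byte-sliced form: x0..x3 each hold one byte lane of the 32-bit
 * word, so the rotations become a fixed renaming of those registers and
 * the step reduces to XORs between them.  aria_diff_word() then
 * XOR-mixes the four state words T0..T3 with each other; the byte/word
 * reshuffles of aria_diff_byte() are free, being just the permuted
 * register order described in the comments inside
 * aria_fe_gfni()/aria_fo_gfni().
 */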

#define aria_diff_m(x0, x1, x2, x3, \
	/* T = rotr32(X, 8); */ \
	/* X = T ^ rotr(X, 16); */ \

#define aria_diff_word(x0, x1, x2, x3, \

#define aria_fe_gfni(x0, x1, x2, x3, \
		     mem_tmp, rk, round) \
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7, \
	aria_sbox_16way_gfni(x2, x3, x0, x1, \
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
	aria_diff_word(x0, x1, x2, x3, \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	aria_diff_word(x2, x3, x0, x1, \

#define aria_fo_gfni(x0, x1, x2, x3, \
		     mem_tmp, rk, round) \
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7, \
	aria_sbox_16way_gfni(x0, x1, x2, x3, \
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
	aria_diff_word(x0, x1, x2, x3, \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	aria_diff_word(x0, x1, x2, x3, \

#define aria_ff_gfni(x0, x1, x2, x3, \
		     mem_tmp, rk, round, last_round) \
	aria_ark_16way(x0, x1, x2, x3, \
	aria_sbox_16way_gfni(x2, x3, x0, x1, \
	aria_ark_16way(x0, x1, x2, x3, \

.section .rodata.cst64, "aM", @progbits, 64

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.Lcounter16161616_lo:

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

.section .rodata.cst8, "aM", @progbits, 8
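
/*
 * The constants below encode the 8x8 bit matrices and constant vectors
 * used as operands of vgf2p8affineqb/vgf2p8affineinvqb: BV8() packs
 * eight bits into one byte (a0 is bit 0) and BM8X8() packs eight such
 * rows into the 64-bit matrix operand.  For example,
 * BV8(1, 1, 0, 0, 0, 1, 1, 0) == 0x63, the additive constant of the
 * AES S-box affine transform used for tf_aff_const.
 */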

#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
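
/*
 * __aria_gfni_avx512_crypt_64way() runs the ARIA round sequence on 64
 * byte-sliced blocks: odd rounds use aria_fo_gfni() and even rounds
 * aria_fe_gfni(), with the register arguments of consecutive calls
 * permuted to absorb the word shuffle of the previous round.
 * ARIA_CTX_rounds selects how much of the schedule is executed (12, 14
 * or 16 rounds for 128/192/256-bit keys); the final aria_ff_gfni()
 * applies the substitution layer and the last round key, and the state
 * is de-bytesliced before returning.
 */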
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
	 * %zmm0..%zmm15: byte-sliced blocks
	leaq 8 * 64(%rax), %r8;

	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
		      %zmm4, %zmm5, %zmm6, %zmm7,
		      %zmm8, %zmm9, %zmm10, %zmm11,
		      %zmm12, %zmm13, %zmm14,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	cmpl $12, ARIA_CTX_rounds(CTX);
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	cmpl $14, ARIA_CTX_rounds(CTX);
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,

	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
			   %zmm8, %zmm13, %zmm2, %zmm7,
			   %zmm11, %zmm14, %zmm1, %zmm4,
			   %zmm10, %zmm15, %zmm0, %zmm5,

SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,

SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,

SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
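
/*
 * The keystream helper below expands the 128-bit big-endian IV at %r8
 * into 64 consecutive counter blocks: the IV is byte-swapped to little
 * endian, incremented per 128-bit lane using the .Lcounter* constants
 * (falling back to add_le128() when a low-qword carry is possible), and
 * byte-swapped back so the blocks match CTR mode's big-endian counter
 * format before they are encrypted.
 */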

SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
	 * %r8: iv (big endian, 128bit)
	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

	/* load IV and byteswap */
	vbroadcasti64x2 (%r8), %zmm20;
	vpshufb %zmm19, %zmm20, %zmm20;

	/* check need for handling 64-bit overflow and carry */
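	/*
	 * Fast path: %r11 holds the least significant 64 bits of the
	 * counter in native byte order.  If it is at least 64 below the
	 * wrap-around point, none of the 64 increments generated here
	 * can carry into the high qword and plain per-qword vpaddq is
	 * enough; otherwise the add_le128() sequence further below
	 * propagates the carry explicitly.
	 */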
	cmpq $(0xffffffffffffffff - 64), %r11;

	vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
	vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
	vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
	vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
	vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
	vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
	vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
	vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
	vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
	vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
	vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
	vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
	vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
	vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */

	add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
	add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
	add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
	add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
	add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
	add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
	add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
	add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
	add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
	add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
	add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
	add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
	add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
	add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

	/* Byte-swap IVs and update counter. */
	vpshufb %zmm19, %zmm15, %zmm15;
	vpshufb %zmm19, %zmm14, %zmm14;
	vpshufb %zmm19, %zmm13, %zmm13;
	vpshufb %zmm19, %zmm12, %zmm12;
	vpshufb %zmm19, %zmm11, %zmm11;
	vpshufb %zmm19, %zmm10, %zmm10;
	vpshufb %zmm19, %zmm9, %zmm9;
	vpshufb %zmm19, %zmm8, %zmm8;
	vpshufb %zmm19, %zmm7, %zmm7;
	vpshufb %zmm19, %zmm6, %zmm6;
	vpshufb %zmm19, %zmm5, %zmm5;
	vpshufb %zmm19, %zmm4, %zmm4;
	vpshufb %zmm19, %zmm3, %zmm3;
	vpshufb %zmm19, %zmm2, %zmm2;
	vpshufb %zmm19, %zmm1, %zmm1;
	vpshufb %zmm19, %zmm0, %zmm0;

SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
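
/*
 * CTR encryption/decryption: generate 64 counter blocks, run them
 * through the byte-sliced ARIA core with the encryption key schedule,
 * and XOR the resulting keystream into the source data.  The register
 * order of the vpxorq/write_output sequence matches the output order of
 * __aria_gfni_avx512_crypt_64way().
 */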

SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
	 * %r8: iv (big endian, 128bit)
	call __aria_gfni_avx512_ctr_gen_keystream_64way

	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,

SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)