1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * ARIA Cipher 32-way parallel algorithm (AVX2)
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
9 #include <linux/linkage.h>
10 #include <asm/frame.h>
11 #include <asm/asm-offsets.h>
12 #include <linux/cfi_types.h>
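/*
 * Overview: each call processes 32 ARIA blocks (512 bytes).  The input is
 * loaded into sixteen %ymm registers and "byte-sliced" by byteslice_16x16b,
 * so that, in effect, register n holds byte n of every block (16 blocks per
 * 128-bit lane).  In that layout one vector instruction applies the same
 * cipher step to all 32 blocks at once.  The substitution layer is built
 * either from AES-NI (vaesenclast/vaesdeclast with an all-zero round key)
 * or, under CONFIG_AS_GFNI, from the GFNI affine instructions.
 */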
34 #define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
35 ( (((a0) & 1) << 0) | \
44 #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
45 ( ((l7) << (0 * 8)) | \
54 #define inc_le128(x, minus_one, tmp) \
55 vpcmpeqq minus_one, x, tmp; \
56 vpsubq minus_one, x, x; \
57 vpslldq $8, tmp, tmp; \
60 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
61 vpand x, mask4bit, tmp0; \
62 vpandn x, mask4bit, x; \
65 vpshufb tmp0, lo_t, tmp0; \
69 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
70 vpunpckhdq x1, x0, t2; \
71 vpunpckldq x1, x0, x0; \
73 vpunpckldq x3, x2, t1; \
74 vpunpckhdq x3, x2, x2; \
76 vpunpckhqdq t1, x0, x1; \
77 vpunpcklqdq t1, x0, x0; \
79 vpunpckhqdq x2, t2, x3; \
80 vpunpcklqdq x2, t2, x2;
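/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit words and
 * transposes it in place: the dword unpacks interleave the rows pairwise
 * and the qword unpacks regroup the halves, so column j of the input ends
 * up in register j.  byteslice_16x16b below combines four such transposes
 * with a per-lane vpshufb to produce the byte-sliced layout described in
 * the overview above.
 */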
82 #define byteslice_16x16b(a0, b0, c0, d0, \
89 transpose_4x4(a0, a1, a2, a3, d2, d3); \
90 transpose_4x4(b0, b1, b2, b3, d2, d3); \
96 transpose_4x4(c0, c1, c2, c3, a0, a1); \
97 transpose_4x4(d0, d1, d2, d3, a0, a1); \
99 vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
101 vpshufb a0, a2, a2; \
102 vpshufb a0, a3, a3; \
103 vpshufb a0, b0, b0; \
104 vpshufb a0, b1, b1; \
105 vpshufb a0, b2, b2; \
106 vpshufb a0, b3, b3; \
107 vpshufb a0, a1, a1; \
108 vpshufb a0, c0, c0; \
109 vpshufb a0, c1, c1; \
110 vpshufb a0, c2, c2; \
111 vpshufb a0, c3, c3; \
112 vpshufb a0, d0, d0; \
113 vpshufb a0, d1, d1; \
114 vpshufb a0, d2, d2; \
115 vpshufb a0, d3, d3; \
118 vpshufb a0, d3, a0; \
121 transpose_4x4(a0, b0, c0, d0, d2, d3); \
122 transpose_4x4(a1, b1, c1, d1, d2, d3); \
128 transpose_4x4(a2, b2, c2, d2, b0, b1); \
129 transpose_4x4(a3, b3, c3, d3, b0, b1); \
132 /* does not adjust output bytes inside vectors */
134 #define debyteslice_16x16b(a0, b0, c0, d0, \
141 transpose_4x4(a0, a1, a2, a3, d2, d3); \
142 transpose_4x4(b0, b1, b2, b3, d2, d3); \
148 transpose_4x4(c0, c1, c2, c3, a0, a1); \
149 transpose_4x4(d0, d1, d2, d3, a0, a1); \
151 vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
153 vpshufb a0, a2, a2; \
154 vpshufb a0, a3, a3; \
155 vpshufb a0, b0, b0; \
156 vpshufb a0, b1, b1; \
157 vpshufb a0, b2, b2; \
158 vpshufb a0, b3, b3; \
159 vpshufb a0, a1, a1; \
160 vpshufb a0, c0, c0; \
161 vpshufb a0, c1, c1; \
162 vpshufb a0, c2, c2; \
163 vpshufb a0, c3, c3; \
164 vpshufb a0, d0, d0; \
165 vpshufb a0, d1, d1; \
166 vpshufb a0, d2, d2; \
167 vpshufb a0, d3, d3; \
170 vpshufb a0, d3, a0; \
173 transpose_4x4(c0, d0, a0, b0, d2, d3); \
174 transpose_4x4(c1, d1, a1, b1, d2, d3); \
180 transpose_4x4(c2, d2, a2, b2, b0, b1); \
181 transpose_4x4(c3, d3, a3, b3, b0, b1); \
184 /* does not adjust output bytes inside vectors */
186 /* load blocks to registers */
187 #define inpack16_pre(x0, x1, x2, x3, \
192 vmovdqu (0 * 32)(rio), x0; \
193 vmovdqu (1 * 32)(rio), x1; \
194 vmovdqu (2 * 32)(rio), x2; \
195 vmovdqu (3 * 32)(rio), x3; \
196 vmovdqu (4 * 32)(rio), x4; \
197 vmovdqu (5 * 32)(rio), x5; \
198 vmovdqu (6 * 32)(rio), x6; \
199 vmovdqu (7 * 32)(rio), x7; \
200 vmovdqu (8 * 32)(rio), y0; \
201 vmovdqu (9 * 32)(rio), y1; \
202 vmovdqu (10 * 32)(rio), y2; \
203 vmovdqu (11 * 32)(rio), y3; \
204 vmovdqu (12 * 32)(rio), y4; \
205 vmovdqu (13 * 32)(rio), y5; \
206 vmovdqu (14 * 32)(rio), y6; \
207 vmovdqu (15 * 32)(rio), y7;
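/*
 * Each vmovdqu above loads 32 bytes, i.e. two consecutive 16-byte ARIA
 * blocks, so x0..x7 and y0..y7 together hold the whole 32-block batch
 * before byte-slicing.
 */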
209 /* byteslice the loaded blocks and store them to temporary memory */
210 #define inpack16_post(x0, x1, x2, x3, \
215 byteslice_16x16b(x0, x1, x2, x3, \
219 (mem_ab), (mem_cd)); \
221 vmovdqu x0, 0 * 32(mem_ab); \
222 vmovdqu x1, 1 * 32(mem_ab); \
223 vmovdqu x2, 2 * 32(mem_ab); \
224 vmovdqu x3, 3 * 32(mem_ab); \
225 vmovdqu x4, 4 * 32(mem_ab); \
226 vmovdqu x5, 5 * 32(mem_ab); \
227 vmovdqu x6, 6 * 32(mem_ab); \
228 vmovdqu x7, 7 * 32(mem_ab); \
229 vmovdqu y0, 0 * 32(mem_cd); \
230 vmovdqu y1, 1 * 32(mem_cd); \
231 vmovdqu y2, 2 * 32(mem_cd); \
232 vmovdqu y3, 3 * 32(mem_cd); \
233 vmovdqu y4, 4 * 32(mem_cd); \
234 vmovdqu y5, 5 * 32(mem_cd); \
235 vmovdqu y6, 6 * 32(mem_cd); \
236 vmovdqu y7, 7 * 32(mem_cd);
238 #define write_output(x0, x1, x2, x3, \
243 vmovdqu x0, 0 * 32(mem); \
244 vmovdqu x1, 1 * 32(mem); \
245 vmovdqu x2, 2 * 32(mem); \
246 vmovdqu x3, 3 * 32(mem); \
247 vmovdqu x4, 4 * 32(mem); \
248 vmovdqu x5, 5 * 32(mem); \
249 vmovdqu x6, 6 * 32(mem); \
250 vmovdqu x7, 7 * 32(mem); \
251 vmovdqu y0, 8 * 32(mem); \
252 vmovdqu y1, 9 * 32(mem); \
253 vmovdqu y2, 10 * 32(mem); \
254 vmovdqu y3, 11 * 32(mem); \
255 vmovdqu y4, 12 * 32(mem); \
256 vmovdqu y5, 13 * 32(mem); \
257 vmovdqu y6, 14 * 32(mem); \
258 vmovdqu y7, 15 * 32(mem); \
260 #define aria_store_state_8way(x0, x1, x2, x3, \
263 vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
264 vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
265 vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
266 vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
267 vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
268 vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
269 vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
270 vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
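/*
 * Only 16 %ymm registers are available, so the round macros further below
 * process the byte-sliced state in two halves of eight byte positions,
 * using the other eight registers as temporaries; aria_store_state_8way
 * (above) and aria_load_state_8way (below) spill and reload those halves
 * through the scratch area passed as mem_tmp.
 */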
272 #define aria_load_state_8way(x0, x1, x2, x3, \
275 vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
276 vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
277 vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
278 vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
279 vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
280 vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
281 vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
282 vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
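/*
 * aria_ark_8way below is ARIA's AddRoundKey for one half of the sliced
 * state: vpbroadcastb replicates one round-key byte into every byte of a
 * %ymm register, which is then XORed into the matching state register.
 * Since register i holds byte i of all 32 blocks, this is roughly
 * equivalent to the sketch
 *
 *	for (i = 0; i < 8; i++)			// byte positions idx..idx+7
 *		for (b = 0; b < 32; b++)	// each block in the batch
 *			state[i][b] ^= rk[round * 16 + idx + i];
 *
 * with the exact byte-to-register pairing given by the offsets used in the
 * macro.
 */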
284 #define aria_ark_8way(x0, x1, x2, x3, \
286 t0, rk, idx, round) \
288 vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
290 vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
292 vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
294 vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
296 vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
298 vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
300 vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
302 vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
305 #ifdef CONFIG_AS_GFNI
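/*
 * GFNI variant of the substitution layer: vgf2p8affineqb applies an 8x8
 * bit-matrix affine transform to every byte, and vgf2p8affineinvqb first
 * inverts each byte in GF(2^8) and then applies the affine transform.
 * ARIA's four S-boxes are expressed this way: S1 is the AES S-box (field
 * inversion followed by the AES affine), S2 uses its own bit-matrix and
 * constant in the same inverse-then-affine form, and X1/X2 apply a forward
 * affine (tf_inv / tf_x2) followed by a plain field inversion done with
 * the identity bit-matrix defined further down.
 */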
306 #define aria_sbox_8way_gfni(x0, x1, x2, x3, \
310 vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
311 vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
312 vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
313 vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
314 vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
315 vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
316 vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
317 vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
318 vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
319 vgf2p8affineinvqb $0, t2, x2, x2; \
320 vgf2p8affineinvqb $0, t2, x6, x6; \
321 vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
322 vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
323 vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
324 vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
325 vgf2p8affineinvqb $0, t2, x3, x3; \
326 vgf2p8affineinvqb $0, t2, x7, x7
328 #endif /* CONFIG_AS_GFNI */
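/*
 * AES-NI variant: vaesenclast with an all-zero round key performs only
 * ShiftRows and SubBytes, and the .Linv_shift_row shuffle undoes the
 * ShiftRows, leaving pure SubBytes (S1); vaesdeclast plus the .Lshift_row
 * shuffle likewise yields InvSubBytes (X1).  S2 and X2 are then obtained
 * with a per-byte affine transform (filter_8bit), which splits each byte
 * into its two nibbles and combines two vpshufb table lookups.  AES-NI
 * only operates on 128-bit registers, so each %ymm is processed one lane
 * at a time via vextracti128/vinserti128.
 */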
329 #define aria_sbox_8way(x0, x1, x2, x3, \
335 vbroadcasti128 .Linv_shift_row(%rip), t0; \
336 vbroadcasti128 .Lshift_row(%rip), t1; \
337 vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
338 vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
339 vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
340 vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
342 vextracti128 $1, x0, t6##_x; \
343 vaesenclast t7##_x, x0##_x, x0##_x; \
344 vaesenclast t7##_x, t6##_x, t6##_x; \
345 vinserti128 $1, t6##_x, x0, x0; \
347 vextracti128 $1, x4, t6##_x; \
348 vaesenclast t7##_x, x4##_x, x4##_x; \
349 vaesenclast t7##_x, t6##_x, t6##_x; \
350 vinserti128 $1, t6##_x, x4, x4; \
352 vextracti128 $1, x1, t6##_x; \
353 vaesenclast t7##_x, x1##_x, x1##_x; \
354 vaesenclast t7##_x, t6##_x, t6##_x; \
355 vinserti128 $1, t6##_x, x1, x1; \
357 vextracti128 $1, x5, t6##_x; \
358 vaesenclast t7##_x, x5##_x, x5##_x; \
359 vaesenclast t7##_x, t6##_x, t6##_x; \
360 vinserti128 $1, t6##_x, x5, x5; \
362 vextracti128 $1, x2, t6##_x; \
363 vaesdeclast t7##_x, x2##_x, x2##_x; \
364 vaesdeclast t7##_x, t6##_x, t6##_x; \
365 vinserti128 $1, t6##_x, x2, x2; \
367 vextracti128 $1, x6, t6##_x; \
368 vaesdeclast t7##_x, x6##_x, x6##_x; \
369 vaesdeclast t7##_x, t6##_x, t6##_x; \
370 vinserti128 $1, t6##_x, x6, x6; \
372 vpbroadcastd .L0f0f0f0f(%rip), t6; \
374 /* AES inverse shift rows */ \
375 vpshufb t0, x0, x0; \
376 vpshufb t0, x4, x4; \
377 vpshufb t0, x1, x1; \
378 vpshufb t0, x5, x5; \
379 vpshufb t1, x3, x3; \
380 vpshufb t1, x7, x7; \
381 vpshufb t1, x2, x2; \
382 vpshufb t1, x6, x6; \
384 /* affine transformation for S2 */ \
385 filter_8bit(x1, t2, t3, t6, t0); \
386 /* affine transformation for S2 */ \
387 filter_8bit(x5, t2, t3, t6, t0); \
389 /* affine transformation for X2 */ \
390 filter_8bit(x3, t4, t5, t6, t0); \
391 /* affine transformation for X2 */ \
392 filter_8bit(x7, t4, t5, t6, t0); \
395 vextracti128 $1, x3, t6##_x; \
396 vaesdeclast t7##_x, x3##_x, x3##_x; \
397 vaesdeclast t7##_x, t6##_x, t6##_x; \
398 vinserti128 $1, t6##_x, x3, x3; \
400 vextracti128 $1, x7, t6##_x; \
401 vaesdeclast t7##_x, x7##_x, x7##_x; \
402 vaesdeclast t7##_x, t6##_x, t6##_x; \
403 vinserti128 $1, t6##_x, x7, x7; \
405 #define aria_diff_m(x0, x1, x2, x3, \
407 /* T = rotr32(X, 8); */ \
413 /* X = T ^ rotr(X, 16); */ \
420 #define aria_diff_word(x0, x1, x2, x3, \
460 #define aria_fe(x0, x1, x2, x3, \
464 mem_tmp, rk, round) \
465 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
468 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
469 y0, y1, y2, y3, y4, y5, y6, y7); \
471 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
472 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
473 aria_store_state_8way(x0, x1, x2, x3, \
477 aria_load_state_8way(x0, x1, x2, x3, \
480 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
483 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
484 y0, y1, y2, y3, y4, y5, y6, y7); \
486 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
487 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
488 aria_store_state_8way(x0, x1, x2, x3, \
491 aria_load_state_8way(y0, y1, y2, y3, \
494 aria_diff_word(x0, x1, x2, x3, \
498 /* aria_diff_byte() \
499 * T3 = ABCD -> BADC \
500 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
501 * T0 = ABCD -> CDAB \
502 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
503 * T1 = ABCD -> DCBA \
504 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
506 aria_diff_word(x2, x3, x0, x1, \
510 aria_store_state_8way(x3, x2, x1, x0, \
514 #define aria_fo(x0, x1, x2, x3, \
518 mem_tmp, rk, round) \
519 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
522 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
523 y0, y1, y2, y3, y4, y5, y6, y7); \
525 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
526 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
527 aria_store_state_8way(x0, x1, x2, x3, \
531 aria_load_state_8way(x0, x1, x2, x3, \
534 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
537 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
538 y0, y1, y2, y3, y4, y5, y6, y7); \
540 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
541 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
542 aria_store_state_8way(x0, x1, x2, x3, \
545 aria_load_state_8way(y0, y1, y2, y3, \
548 aria_diff_word(x0, x1, x2, x3, \
552 /* aria_diff_byte() \
553 * T1 = ABCD -> BADC \
554 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
555 * T2 = ABCD -> CDAB \
556 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
557 * T3 = ABCD -> DCBA \
558 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
560 aria_diff_word(x0, x1, x2, x3, \
564 aria_store_state_8way(x3, x2, x1, x0, \
568 #define aria_ff(x0, x1, x2, x3, \
572 mem_tmp, rk, round, last_round) \
573 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
576 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
577 y0, y1, y2, y3, y4, y5, y6, y7); \
579 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
580 y0, rk, 8, last_round); \
582 aria_store_state_8way(x0, x1, x2, x3, \
586 aria_load_state_8way(x0, x1, x2, x3, \
589 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
592 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
593 y0, y1, y2, y3, y4, y5, y6, y7); \
595 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
596 y0, rk, 0, last_round); \
598 aria_load_state_8way(y0, y1, y2, y3, \
601 #ifdef CONFIG_AS_GFNI
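/*
 * The *_gfni round macros below mirror aria_fe/aria_fo/aria_ff above and
 * differ only in the substitution macro they call.  aria_fe and aria_fo
 * are the two alternating round flavours: both do AddRoundKey, the
 * substitution layer (with the S-boxes assigned to the byte positions in a
 * different order), and ARIA's diffusion layer, i.e. aria_diff_m (the
 * rotr32-by-8/16 XOR network), aria_diff_word and the byte regrouping
 * described in the aria_diff_byte() comments.  aria_ff is the final round:
 * substitution between two AddRoundKey steps, with no diffusion.
 */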
602 #define aria_fe_gfni(x0, x1, x2, x3, \
606 mem_tmp, rk, round) \
607 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
610 aria_sbox_8way_gfni(x2, x3, x0, x1, \
615 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
616 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
617 aria_store_state_8way(x0, x1, x2, x3, \
621 aria_load_state_8way(x0, x1, x2, x3, \
624 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
627 aria_sbox_8way_gfni(x2, x3, x0, x1, \
632 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
633 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
634 aria_store_state_8way(x0, x1, x2, x3, \
637 aria_load_state_8way(y0, y1, y2, y3, \
640 aria_diff_word(x0, x1, x2, x3, \
644 /* aria_diff_byte() \
645 * T3 = ABCD -> BADC \
646 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
647 * T0 = ABCD -> CDAB \
648 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
649 * T1 = ABCD -> DCBA \
650 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
652 aria_diff_word(x2, x3, x0, x1, \
656 aria_store_state_8way(x3, x2, x1, x0, \
660 #define aria_fo_gfni(x0, x1, x2, x3, \
664 mem_tmp, rk, round) \
665 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
668 aria_sbox_8way_gfni(x0, x1, x2, x3, \
673 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
674 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
675 aria_store_state_8way(x0, x1, x2, x3, \
679 aria_load_state_8way(x0, x1, x2, x3, \
682 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
685 aria_sbox_8way_gfni(x0, x1, x2, x3, \
690 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
691 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
692 aria_store_state_8way(x0, x1, x2, x3, \
695 aria_load_state_8way(y0, y1, y2, y3, \
698 aria_diff_word(x0, x1, x2, x3, \
702 /* aria_diff_byte() \
703 * T1 = ABCD -> BADC \
704 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
705 * T2 = ABCD -> CDAB \
706 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
707 * T3 = ABCD -> DCBA \
708 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
710 aria_diff_word(x0, x1, x2, x3, \
714 aria_store_state_8way(x3, x2, x1, x0, \
718 #define aria_ff_gfni(x0, x1, x2, x3, \
722 mem_tmp, rk, round, last_round) \
723 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
726 aria_sbox_8way_gfni(x2, x3, x0, x1, \
731 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
732 y0, rk, 8, last_round); \
734 aria_store_state_8way(x0, x1, x2, x3, \
738 aria_load_state_8way(x0, x1, x2, x3, \
741 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
744 aria_sbox_8way_gfni(x2, x3, x0, x1, \
749 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
750 y0, rk, 0, last_round); \
752 aria_load_state_8way(y0, y1, y2, y3, \
755 #endif /* CONFIG_AS_GFNI */
757 .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
759 #define SHUFB_BYTES(idx) \
760 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
762 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
763 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
765 .section .rodata.cst16, "aM", @progbits, 16
767 /* For isolating SubBytes from AESENCLAST, inverse shift row */
769 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
770 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
772 .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
773 .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
774 /* For CTR-mode IV byteswap */
776 .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
777 .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
779 /* AES inverse affine and S2 combined:
780 * 1 1 0 0 0 0 0 1 x0 0
781 * 0 1 0 0 1 0 0 0 x1 0
782 * 1 1 0 0 1 1 1 1 x2 0
783 * 0 1 1 0 1 0 0 1 x3 1
784 * 0 1 0 0 1 1 0 0 * x4 + 0
785 * 0 1 0 1 1 0 0 0 x5 0
786 * 0 0 0 0 0 1 0 1 x6 0
787 * 1 1 1 0 0 1 1 1 x7 1
789 .Ltf_lo__inv_aff__and__s2:
790 .octa 0x92172DA81A9FA520B2370D883ABF8500
791 .Ltf_hi__inv_aff__and__s2:
792 .octa 0x2B15FFC1AF917B45E6D8320C625CB688
794 /* X2 and AES forward affine combined:
795 * 1 0 1 1 0 0 0 1 x0 0
796 * 0 1 1 1 1 0 1 1 x1 0
797 * 0 0 0 1 1 0 1 0 x2 1
798 * 0 1 0 0 0 1 0 0 x3 0
799 * 0 0 1 1 1 0 1 1 * x4 + 0
800 * 0 1 0 0 1 0 0 0 x5 0
801 * 1 1 0 1 0 0 1 1 x6 0
802 * 0 1 0 0 1 0 1 0 x7 0
804 .Ltf_lo__x2__and__fwd_aff:
805 .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
806 .Ltf_hi__x2__and__fwd_aff:
807 .octa 0x3F893781E95FE1576CDA64D2BA0CB204
809 #ifdef CONFIG_AS_GFNI
810 .section .rodata.cst8, "aM", @progbits, 8
813 #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
815 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
816 BV8(1, 1, 0, 0, 0, 1, 1, 1),
817 BV8(1, 1, 1, 0, 0, 0, 1, 1),
818 BV8(1, 1, 1, 1, 0, 0, 0, 1),
819 BV8(1, 1, 1, 1, 1, 0, 0, 0),
820 BV8(0, 1, 1, 1, 1, 1, 0, 0),
821 BV8(0, 0, 1, 1, 1, 1, 1, 0),
822 BV8(0, 0, 0, 1, 1, 1, 1, 1))
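/*
 * BV8 packs eight bits LSB-first into one byte and BM8X8 packs eight such
 * rows into the 64-bit bit-matrix operand of the GFNI instructions (l0
 * ends up in the most significant byte).  As a worked example,
 *
 *	tf_aff_const = BV8(1, 1, 0, 0, 0, 1, 1, 0) = 0b01100011 = 0x63,
 *
 * the usual AES affine constant, while tf_inv_const below comes out as
 * 0x05, the AES inverse-affine constant.
 */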
824 /* AES inverse affine: */
825 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
827 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
828 BV8(1, 0, 0, 1, 0, 0, 1, 0),
829 BV8(0, 1, 0, 0, 1, 0, 0, 1),
830 BV8(1, 0, 1, 0, 0, 1, 0, 0),
831 BV8(0, 1, 0, 1, 0, 0, 1, 0),
832 BV8(0, 0, 1, 0, 1, 0, 0, 1),
833 BV8(1, 0, 0, 1, 0, 1, 0, 0),
834 BV8(0, 1, 0, 0, 1, 0, 1, 0))
837 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
839 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
840 BV8(0, 0, 1, 1, 1, 1, 1, 1),
841 BV8(1, 1, 1, 0, 1, 1, 0, 1),
842 BV8(1, 1, 0, 0, 0, 0, 1, 1),
843 BV8(0, 1, 0, 0, 0, 0, 1, 1),
844 BV8(1, 1, 0, 0, 1, 1, 1, 0),
845 BV8(0, 1, 1, 0, 0, 0, 1, 1),
846 BV8(1, 1, 1, 1, 0, 1, 1, 0))
849 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
851 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
852 BV8(0, 0, 1, 0, 0, 1, 1, 0),
853 BV8(0, 0, 0, 0, 1, 0, 1, 0),
854 BV8(1, 1, 1, 0, 0, 0, 1, 1),
855 BV8(1, 1, 1, 0, 1, 1, 0, 0),
856 BV8(0, 1, 1, 0, 1, 0, 1, 1),
857 BV8(1, 0, 1, 1, 1, 1, 0, 1),
858 BV8(1, 0, 0, 1, 0, 0, 1, 1))
860 /* Identity matrix: */
862 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
863 BV8(0, 1, 0, 0, 0, 0, 0, 0),
864 BV8(0, 0, 1, 0, 0, 0, 0, 0),
865 BV8(0, 0, 0, 1, 0, 0, 0, 0),
866 BV8(0, 0, 0, 0, 1, 0, 0, 0),
867 BV8(0, 0, 0, 0, 0, 1, 0, 0),
868 BV8(0, 0, 0, 0, 0, 0, 1, 0),
869 BV8(0, 0, 0, 0, 0, 0, 0, 1))
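/*
 * With the identity matrix and a zero constant, vgf2p8affineinvqb reduces
 * to a plain GF(2^8) inversion of each byte; this is how the X1/X2 paths
 * in aria_sbox_8way_gfni finish after their forward affine step.
 */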
871 #endif /* CONFIG_AS_GFNI */
874 .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
881 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
886 * %ymm0..%ymm15: byte-sliced blocks
892 leaq 8 * 32(%rax), %r8;
894 inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
895 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
897 aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
898 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
900 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
901 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
902 %ymm15, %rax, %r9, 1);
903 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
904 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
906 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
907 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
908 %ymm15, %rax, %r9, 3);
909 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
910 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
912 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
913 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
914 %ymm15, %rax, %r9, 5);
915 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
916 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
918 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
919 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
920 %ymm15, %rax, %r9, 7);
921 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
922 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
924 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
925 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
926 %ymm15, %rax, %r9, 9);
927 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
928 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
930 cmpl $12, ARIA_CTX_rounds(CTX);
932 aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
933 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
934 %ymm15, %rax, %r9, 11, 12);
937 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
938 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
939 %ymm15, %rax, %r9, 11);
940 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
941 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
943 cmpl $14, ARIA_CTX_rounds(CTX);
945 aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
946 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
947 %ymm15, %rax, %r9, 13, 14);
950 aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
951 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
952 %ymm15, %rax, %r9, 13);
953 aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
954 %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
956 aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
957 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
958 %ymm15, %rax, %r9, 15, 16);
960 debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
961 %ymm9, %ymm13, %ymm0, %ymm5,
962 %ymm10, %ymm14, %ymm3, %ymm6,
963 %ymm11, %ymm15, %ymm2, %ymm7,
968 SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
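/*
 * The exported wrappers below differ only in the key schedule they hand to
 * the shared core: encryption uses ARIA_CTX_enc_key, decryption uses
 * ARIA_CTX_dec_key.  In the C glue code they are expected to be declared
 * along the lines of (assumed prototypes, matching the %rdi/%rsi/%rdx
 * usage):
 *
 *	asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx,
 *						      u8 *dst, const u8 *src);
 *	asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx,
 *						      u8 *dst, const u8 *src);
 */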
970 SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
979 leaq ARIA_CTX_enc_key(CTX), %r9;
981 inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
982 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
985 call __aria_aesni_avx2_crypt_32way;
987 write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
988 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
993 SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
995 SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
1004 leaq ARIA_CTX_dec_key(CTX), %r9;
1006 inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1007 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1010 call __aria_aesni_avx2_crypt_32way;
1012 write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1013 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1018 SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
1020 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
1026 * %r8: iv (big endian, 128bit)
1033 vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
1034 vpcmpeqd %ymm0, %ymm0, %ymm0;
1035 vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
1036 vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
1038 /* load IV and byteswap */
1039 vmovdqu (%r8), %xmm7;
1040 vpshufb %xmm6, %xmm7, %xmm7;
1041 vmovdqa %xmm7, %xmm3;
1042 inc_le128(%xmm7, %xmm0, %xmm4);
1043 vinserti128 $1, %xmm7, %ymm3, %ymm3;
1044 vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
1046 /* check need for handling 64-bit overflow and carry */
1047 cmpq $(0xffffffffffffffff - 32), %r11;
1048 ja .Lhandle_ctr_carry;
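/*
 * Fast path: the counter is kept byteswapped (little endian) so it can be
 * advanced with plain 64-bit vector arithmetic.  %ymm5 holds -2 in the low
 * qword of each 128-bit lane, so every vpsubq below advances both counters
 * in %ymm3 by two.  That shortcut is only valid while adding 32 cannot
 * carry out of the low 64 bits; the cmpq above tests this on %r11, the
 * byteswapped low half of the IV loaded earlier in this routine, and falls
 * back to .Lhandle_ctr_carry, where inc_le128 propagates the carry into
 * the high qword.
 */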
1051 vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
1052 vpshufb %ymm6, %ymm3, %ymm9;
1053 vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
1054 vpshufb %ymm6, %ymm3, %ymm10;
1055 vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
1056 vpshufb %ymm6, %ymm3, %ymm11;
1057 vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
1058 vpshufb %ymm6, %ymm3, %ymm12;
1059 vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
1060 vpshufb %ymm6, %ymm3, %ymm13;
1061 vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
1062 vpshufb %ymm6, %ymm3, %ymm14;
1063 vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
1064 vpshufb %ymm6, %ymm3, %ymm15;
1065 vmovdqu %ymm8, (0 * 32)(%rcx);
1066 vmovdqu %ymm9, (1 * 32)(%rcx);
1067 vmovdqu %ymm10, (2 * 32)(%rcx);
1068 vmovdqu %ymm11, (3 * 32)(%rcx);
1069 vmovdqu %ymm12, (4 * 32)(%rcx);
1070 vmovdqu %ymm13, (5 * 32)(%rcx);
1071 vmovdqu %ymm14, (6 * 32)(%rcx);
1072 vmovdqu %ymm15, (7 * 32)(%rcx);
1074 vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
1075 vpshufb %ymm6, %ymm3, %ymm8;
1076 vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
1077 vpshufb %ymm6, %ymm3, %ymm9;
1078 vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
1079 vpshufb %ymm6, %ymm3, %ymm10;
1080 vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
1081 vpshufb %ymm6, %ymm3, %ymm11;
1082 vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
1083 vpshufb %ymm6, %ymm3, %ymm12;
1084 vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
1085 vpshufb %ymm6, %ymm3, %ymm13;
1086 vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
1087 vpshufb %ymm6, %ymm3, %ymm14;
1088 vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
1089 vpshufb %ymm6, %ymm3, %ymm15;
1090 vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
1091 vpshufb %xmm6, %xmm3, %xmm3;
1092 vmovdqu %xmm3, (%r8);
1093 vmovdqu (0 * 32)(%rcx), %ymm0;
1094 vmovdqu (1 * 32)(%rcx), %ymm1;
1095 vmovdqu (2 * 32)(%rcx), %ymm2;
1096 vmovdqu (3 * 32)(%rcx), %ymm3;
1097 vmovdqu (4 * 32)(%rcx), %ymm4;
1098 vmovdqu (5 * 32)(%rcx), %ymm5;
1099 vmovdqu (6 * 32)(%rcx), %ymm6;
1100 vmovdqu (7 * 32)(%rcx), %ymm7;
1101 jmp .Lctr_carry_done;
1105 inc_le128(%ymm3, %ymm0, %ymm4);
1106 inc_le128(%ymm3, %ymm0, %ymm4);
1107 vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
1108 inc_le128(%ymm3, %ymm0, %ymm4);
1109 inc_le128(%ymm3, %ymm0, %ymm4);
1110 vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
1111 inc_le128(%ymm3, %ymm0, %ymm4);
1112 inc_le128(%ymm3, %ymm0, %ymm4);
1113 vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
1114 inc_le128(%ymm3, %ymm0, %ymm4);
1115 inc_le128(%ymm3, %ymm0, %ymm4);
1116 vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
1117 inc_le128(%ymm3, %ymm0, %ymm4);
1118 inc_le128(%ymm3, %ymm0, %ymm4);
1119 vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
1120 inc_le128(%ymm3, %ymm0, %ymm4);
1121 inc_le128(%ymm3, %ymm0, %ymm4);
1122 vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
1123 inc_le128(%ymm3, %ymm0, %ymm4);
1124 inc_le128(%ymm3, %ymm0, %ymm4);
1125 vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
1126 vmovdqu %ymm8, (0 * 32)(%rcx);
1127 vmovdqu %ymm9, (1 * 32)(%rcx);
1128 vmovdqu %ymm10, (2 * 32)(%rcx);
1129 vmovdqu %ymm11, (3 * 32)(%rcx);
1130 vmovdqu %ymm12, (4 * 32)(%rcx);
1131 vmovdqu %ymm13, (5 * 32)(%rcx);
1132 vmovdqu %ymm14, (6 * 32)(%rcx);
1133 vmovdqu %ymm15, (7 * 32)(%rcx);
1135 inc_le128(%ymm3, %ymm0, %ymm4);
1136 inc_le128(%ymm3, %ymm0, %ymm4);
1137 vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
1138 inc_le128(%ymm3, %ymm0, %ymm4);
1139 inc_le128(%ymm3, %ymm0, %ymm4);
1140 vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
1141 inc_le128(%ymm3, %ymm0, %ymm4);
1142 inc_le128(%ymm3, %ymm0, %ymm4);
1143 vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
1144 inc_le128(%ymm3, %ymm0, %ymm4);
1145 inc_le128(%ymm3, %ymm0, %ymm4);
1146 vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
1147 inc_le128(%ymm3, %ymm0, %ymm4);
1148 inc_le128(%ymm3, %ymm0, %ymm4);
1149 vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
1150 inc_le128(%ymm3, %ymm0, %ymm4);
1151 inc_le128(%ymm3, %ymm0, %ymm4);
1152 vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
1153 inc_le128(%ymm3, %ymm0, %ymm4);
1154 inc_le128(%ymm3, %ymm0, %ymm4);
1155 vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
1156 inc_le128(%ymm3, %ymm0, %ymm4);
1157 inc_le128(%ymm3, %ymm0, %ymm4);
1158 vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
1159 inc_le128(%ymm3, %ymm0, %ymm4);
1160 vextracti128 $1, %ymm3, %xmm3;
1161 vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
1162 vmovdqu %xmm3, (%r8);
1163 vmovdqu (0 * 32)(%rcx), %ymm0;
1164 vmovdqu (1 * 32)(%rcx), %ymm1;
1165 vmovdqu (2 * 32)(%rcx), %ymm2;
1166 vmovdqu (3 * 32)(%rcx), %ymm3;
1167 vmovdqu (4 * 32)(%rcx), %ymm4;
1168 vmovdqu (5 * 32)(%rcx), %ymm5;
1169 vmovdqu (6 * 32)(%rcx), %ymm6;
1170 vmovdqu (7 * 32)(%rcx), %ymm7;
1176 SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
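/*
 * CTR mode: the wrapper below expands the IV into 32 counter blocks with
 * the keystream helper above, runs that keystream through the shared
 * encryption core (CTR always uses the encryption key schedule), and then
 * XORs it into the 32 source blocks via the vpxor sequence reading the
 * source through %r11, before writing the result to dst.
 */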
1178 SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
1184 * %r8: iv (big endian, 128bit)
1188 call __aria_aesni_avx2_ctr_gen_keystream_32way;
1194 leaq ARIA_CTX_enc_key(CTX), %r9;
1196 call __aria_aesni_avx2_crypt_32way;
1198 vpxor (0 * 32)(%r11), %ymm1, %ymm1;
1199 vpxor (1 * 32)(%r11), %ymm0, %ymm0;
1200 vpxor (2 * 32)(%r11), %ymm3, %ymm3;
1201 vpxor (3 * 32)(%r11), %ymm2, %ymm2;
1202 vpxor (4 * 32)(%r11), %ymm4, %ymm4;
1203 vpxor (5 * 32)(%r11), %ymm5, %ymm5;
1204 vpxor (6 * 32)(%r11), %ymm6, %ymm6;
1205 vpxor (7 * 32)(%r11), %ymm7, %ymm7;
1206 vpxor (8 * 32)(%r11), %ymm8, %ymm8;
1207 vpxor (9 * 32)(%r11), %ymm9, %ymm9;
1208 vpxor (10 * 32)(%r11), %ymm10, %ymm10;
1209 vpxor (11 * 32)(%r11), %ymm11, %ymm11;
1210 vpxor (12 * 32)(%r11), %ymm12, %ymm12;
1211 vpxor (13 * 32)(%r11), %ymm13, %ymm13;
1212 vpxor (14 * 32)(%r11), %ymm14, %ymm14;
1213 vpxor (15 * 32)(%r11), %ymm15, %ymm15;
1214 write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1215 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1220 SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
1222 #ifdef CONFIG_AS_GFNI
1223 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
1228 * %ymm0..%ymm15: byte-sliced blocks
1234 leaq 8 * 32(%rax), %r8;
1236 inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
1237 %ymm4, %ymm5, %ymm6, %ymm7,
1238 %ymm8, %ymm9, %ymm10, %ymm11,
1239 %ymm12, %ymm13, %ymm14,
1241 aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
1242 %ymm12, %ymm13, %ymm14, %ymm15,
1243 %ymm0, %ymm1, %ymm2, %ymm3,
1244 %ymm4, %ymm5, %ymm6, %ymm7,
1246 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1247 %ymm4, %ymm5, %ymm6, %ymm7,
1248 %ymm8, %ymm9, %ymm10, %ymm11,
1249 %ymm12, %ymm13, %ymm14,
1250 %ymm15, %rax, %r9, 1);
1251 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1252 %ymm12, %ymm13, %ymm14, %ymm15,
1253 %ymm0, %ymm1, %ymm2, %ymm3,
1254 %ymm4, %ymm5, %ymm6, %ymm7,
1256 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1257 %ymm4, %ymm5, %ymm6, %ymm7,
1258 %ymm8, %ymm9, %ymm10, %ymm11,
1259 %ymm12, %ymm13, %ymm14,
1260 %ymm15, %rax, %r9, 3);
1261 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1262 %ymm12, %ymm13, %ymm14, %ymm15,
1263 %ymm0, %ymm1, %ymm2, %ymm3,
1264 %ymm4, %ymm5, %ymm6, %ymm7,
1266 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1267 %ymm4, %ymm5, %ymm6, %ymm7,
1268 %ymm8, %ymm9, %ymm10, %ymm11,
1269 %ymm12, %ymm13, %ymm14,
1270 %ymm15, %rax, %r9, 5);
1271 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1272 %ymm12, %ymm13, %ymm14, %ymm15,
1273 %ymm0, %ymm1, %ymm2, %ymm3,
1274 %ymm4, %ymm5, %ymm6, %ymm7,
1276 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1277 %ymm4, %ymm5, %ymm6, %ymm7,
1278 %ymm8, %ymm9, %ymm10, %ymm11,
1279 %ymm12, %ymm13, %ymm14,
1280 %ymm15, %rax, %r9, 7);
1281 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1282 %ymm12, %ymm13, %ymm14, %ymm15,
1283 %ymm0, %ymm1, %ymm2, %ymm3,
1284 %ymm4, %ymm5, %ymm6, %ymm7,
1286 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1287 %ymm4, %ymm5, %ymm6, %ymm7,
1288 %ymm8, %ymm9, %ymm10, %ymm11,
1289 %ymm12, %ymm13, %ymm14,
1290 %ymm15, %rax, %r9, 9);
1291 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1292 %ymm12, %ymm13, %ymm14, %ymm15,
1293 %ymm0, %ymm1, %ymm2, %ymm3,
1294 %ymm4, %ymm5, %ymm6, %ymm7,
1296 cmpl $12, ARIA_CTX_rounds(CTX);
1297 jne .Laria_gfni_192;
1298 aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1299 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1300 %ymm15, %rax, %r9, 11, 12);
1301 jmp .Laria_gfni_end;
1303 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1304 %ymm4, %ymm5, %ymm6, %ymm7,
1305 %ymm8, %ymm9, %ymm10, %ymm11,
1306 %ymm12, %ymm13, %ymm14,
1307 %ymm15, %rax, %r9, 11);
1308 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1309 %ymm12, %ymm13, %ymm14, %ymm15,
1310 %ymm0, %ymm1, %ymm2, %ymm3,
1311 %ymm4, %ymm5, %ymm6, %ymm7,
1313 cmpl $14, ARIA_CTX_rounds(CTX);
1314 jne .Laria_gfni_256;
1315 aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1316 %ymm4, %ymm5, %ymm6, %ymm7,
1317 %ymm8, %ymm9, %ymm10, %ymm11,
1318 %ymm12, %ymm13, %ymm14,
1319 %ymm15, %rax, %r9, 13, 14);
1320 jmp .Laria_gfni_end;
1322 aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1323 %ymm4, %ymm5, %ymm6, %ymm7,
1324 %ymm8, %ymm9, %ymm10, %ymm11,
1325 %ymm12, %ymm13, %ymm14,
1326 %ymm15, %rax, %r9, 13);
1327 aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
1328 %ymm12, %ymm13, %ymm14, %ymm15,
1329 %ymm0, %ymm1, %ymm2, %ymm3,
1330 %ymm4, %ymm5, %ymm6, %ymm7,
1332 aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
1333 %ymm4, %ymm5, %ymm6, %ymm7,
1334 %ymm8, %ymm9, %ymm10, %ymm11,
1335 %ymm12, %ymm13, %ymm14,
1336 %ymm15, %rax, %r9, 15, 16);
1338 debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
1339 %ymm9, %ymm13, %ymm0, %ymm5,
1340 %ymm10, %ymm14, %ymm3, %ymm6,
1341 %ymm11, %ymm15, %ymm2, %ymm7,
1346 SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
1348 SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
1357 leaq ARIA_CTX_enc_key(CTX), %r9;
1359 inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1360 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1363 call __aria_aesni_avx2_gfni_crypt_32way;
1365 write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1366 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1371 SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
1373 SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
1382 leaq ARIA_CTX_dec_key(CTX), %r9;
1384 inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1385 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1388 call __aria_aesni_avx2_gfni_crypt_32way;
1390 write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1391 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1396 SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
1398 SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
1404 * %r8: iv (big endian, 128bit)
1408 call __aria_aesni_avx2_ctr_gen_keystream_32way;
1414 leaq ARIA_CTX_enc_key(CTX), %r9;
1416 call __aria_aesni_avx2_gfni_crypt_32way;
1418 vpxor (0 * 32)(%r11), %ymm1, %ymm1;
1419 vpxor (1 * 32)(%r11), %ymm0, %ymm0;
1420 vpxor (2 * 32)(%r11), %ymm3, %ymm3;
1421 vpxor (3 * 32)(%r11), %ymm2, %ymm2;
1422 vpxor (4 * 32)(%r11), %ymm4, %ymm4;
1423 vpxor (5 * 32)(%r11), %ymm5, %ymm5;
1424 vpxor (6 * 32)(%r11), %ymm6, %ymm6;
1425 vpxor (7 * 32)(%r11), %ymm7, %ymm7;
1426 vpxor (8 * 32)(%r11), %ymm8, %ymm8;
1427 vpxor (9 * 32)(%r11), %ymm9, %ymm9;
1428 vpxor (10 * 32)(%r11), %ymm10, %ymm10;
1429 vpxor (11 * 32)(%r11), %ymm11, %ymm11;
1430 vpxor (12 * 32)(%r11), %ymm12, %ymm12;
1431 vpxor (13 * 32)(%r11), %ymm13, %ymm13;
1432 vpxor (14 * 32)(%r11), %ymm14, %ymm14;
1433 vpxor (15 * 32)(%r11), %ymm15, %ymm15;
1434 write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1435 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1440 SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
1441 #endif /* CONFIG_AS_GFNI */