1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * ARIA Cipher 16-way parallel algorithm (AVX)
5 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
9 #include <linux/linkage.h>
10 #include <linux/cfi_types.h>
11 #include <asm/asm-offsets.h>
12 #include <asm/frame.h>
/*
 * NOTE(review): this chunk is a sampled excerpt -- the embedded original
 * line numbers jump (18, 19, 28, ...), so interior lines of most macro
 * bodies are missing.  Code below is left byte-identical; comments only.
 * Do not attempt to assemble this fragment as-is.
 */
/* BV8: build a byte constant from 8 individual bits (fragment: 1 of 8 terms). */
18 #define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
19 ( (((a0) & 1) << 0) | \
/* BM8X8: pack 8 row bytes into a 64-bit bit-matrix (fragment). */ \
28 #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
29 ( ((l7) << (0 * 8)) | \
/* inc_le128: 128-bit little-endian increment (fragment; carry-fold tail missing). */ \
38 #define inc_le128(x, minus_one, tmp) \
39 vpcmpeqq minus_one, x, tmp; \
40 vpsubq minus_one, x, x; \
41 vpslldq $8, tmp, tmp; \
/* filter_8bit: nibble-split table lookup via lo/hi 4-bit tables (fragment). */ \
44 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
45 vpand x, mask4bit, tmp0; \
46 vpandn x, mask4bit, x; \
49 vpshufb tmp0, lo_t, tmp0; \
/* transpose_4x4: 4x4 dword transpose of x0..x3; t1/t2 are scratch. */ \
53 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
54 vpunpckhdq x1, x0, t2; \
55 vpunpckldq x1, x0, x0; \
57 vpunpckldq x3, x2, t1; \
58 vpunpckhdq x3, x2, x2; \
60 vpunpckhqdq t1, x0, x1; \
61 vpunpcklqdq t1, x0, x0; \
63 vpunpckhqdq x2, t2, x3; \
64 vpunpcklqdq x2, t2, x2;
/*
 * byteslice_16x16b / debyteslice_16x16b: (de)interleave sixteen 128-bit
 * blocks into/out of byte-sliced form via repeated 4x4 dword transposes
 * and a .Lshufb_16x16b byte shuffle.  NOTE(review): fragments -- several
 * vpshufb lines of the byteslice variant are missing from this excerpt.
 */
66 #define byteslice_16x16b(a0, b0, c0, d0, \
73 transpose_4x4(a0, a1, a2, a3, d2, d3); \
74 transpose_4x4(b0, b1, b2, b3, d2, d3); \
80 transpose_4x4(c0, c1, c2, c3, a0, a1); \
81 transpose_4x4(d0, d1, d2, d3, a0, a1); \
83 vmovdqu .Lshufb_16x16b(%rip), a0; \
102 vpshufb a0, d3, a0; \
105 transpose_4x4(a0, b0, c0, d0, d2, d3); \
106 transpose_4x4(a1, b1, c1, d1, d2, d3); \
112 transpose_4x4(a2, b2, c2, d2, b0, b1); \
113 transpose_4x4(a3, b3, c3, d3, b0, b1); \
116 /* does not adjust output bytes inside vectors */
118 #define debyteslice_16x16b(a0, b0, c0, d0, \
125 transpose_4x4(a0, a1, a2, a3, d2, d3); \
126 transpose_4x4(b0, b1, b2, b3, d2, d3); \
132 transpose_4x4(c0, c1, c2, c3, a0, a1); \
133 transpose_4x4(d0, d1, d2, d3, a0, a1); \
135 vmovdqu .Lshufb_16x16b(%rip), a0; \
137 vpshufb a0, a2, a2; \
138 vpshufb a0, a3, a3; \
139 vpshufb a0, b0, b0; \
140 vpshufb a0, b1, b1; \
141 vpshufb a0, b2, b2; \
142 vpshufb a0, b3, b3; \
143 vpshufb a0, a1, a1; \
144 vpshufb a0, c0, c0; \
145 vpshufb a0, c1, c1; \
146 vpshufb a0, c2, c2; \
147 vpshufb a0, c3, c3; \
148 vpshufb a0, d0, d0; \
149 vpshufb a0, d1, d1; \
150 vpshufb a0, d2, d2; \
151 vpshufb a0, d3, d3; \
154 vpshufb a0, d3, a0; \
157 transpose_4x4(c0, d0, a0, b0, d2, d3); \
158 transpose_4x4(c1, d1, a1, b1, d2, d3); \
164 transpose_4x4(c2, d2, a2, b2, b0, b1); \
165 transpose_4x4(c3, d3, a3, b3, b0, b1); \
168 /* does not adjust output bytes inside vectors */
/*
 * Block load/store helpers.  16 blocks = 16 xmm registers (x0..x7, y0..y7).
 * NOTE(review): the parameter lists of these macros are truncated in this
 * excerpt (interior lines elided); bodies kept byte-identical.
 */
170 /* load blocks to registers and apply pre-whitening */
171 #define inpack16_pre(x0, x1, x2, x3, \
176 vmovdqu (0 * 16)(rio), x0; \
177 vmovdqu (1 * 16)(rio), x1; \
178 vmovdqu (2 * 16)(rio), x2; \
179 vmovdqu (3 * 16)(rio), x3; \
180 vmovdqu (4 * 16)(rio), x4; \
181 vmovdqu (5 * 16)(rio), x5; \
182 vmovdqu (6 * 16)(rio), x6; \
183 vmovdqu (7 * 16)(rio), x7; \
184 vmovdqu (8 * 16)(rio), y0; \
185 vmovdqu (9 * 16)(rio), y1; \
186 vmovdqu (10 * 16)(rio), y2; \
187 vmovdqu (11 * 16)(rio), y3; \
188 vmovdqu (12 * 16)(rio), y4; \
189 vmovdqu (13 * 16)(rio), y5; \
190 vmovdqu (14 * 16)(rio), y6; \
191 vmovdqu (15 * 16)(rio), y7;
193 /* byteslice pre-whitened blocks and store to temporary memory */
194 #define inpack16_post(x0, x1, x2, x3, \
199 byteslice_16x16b(x0, x1, x2, x3, \
203 (mem_ab), (mem_cd)); \
205 vmovdqu x0, 0 * 16(mem_ab); \
206 vmovdqu x1, 1 * 16(mem_ab); \
207 vmovdqu x2, 2 * 16(mem_ab); \
208 vmovdqu x3, 3 * 16(mem_ab); \
209 vmovdqu x4, 4 * 16(mem_ab); \
210 vmovdqu x5, 5 * 16(mem_ab); \
211 vmovdqu x6, 6 * 16(mem_ab); \
212 vmovdqu x7, 7 * 16(mem_ab); \
213 vmovdqu y0, 0 * 16(mem_cd); \
214 vmovdqu y1, 1 * 16(mem_cd); \
215 vmovdqu y2, 2 * 16(mem_cd); \
216 vmovdqu y3, 3 * 16(mem_cd); \
217 vmovdqu y4, 4 * 16(mem_cd); \
218 vmovdqu y5, 5 * 16(mem_cd); \
219 vmovdqu y6, 6 * 16(mem_cd); \
220 vmovdqu y7, 7 * 16(mem_cd);
/* write_output: store all 16 state registers contiguously to mem. */
222 #define write_output(x0, x1, x2, x3, \
227 vmovdqu x0, 0 * 16(mem); \
228 vmovdqu x1, 1 * 16(mem); \
229 vmovdqu x2, 2 * 16(mem); \
230 vmovdqu x3, 3 * 16(mem); \
231 vmovdqu x4, 4 * 16(mem); \
232 vmovdqu x5, 5 * 16(mem); \
233 vmovdqu x6, 6 * 16(mem); \
234 vmovdqu x7, 7 * 16(mem); \
235 vmovdqu y0, 8 * 16(mem); \
236 vmovdqu y1, 9 * 16(mem); \
237 vmovdqu y2, 10 * 16(mem); \
238 vmovdqu y3, 11 * 16(mem); \
239 vmovdqu y4, 12 * 16(mem); \
240 vmovdqu y5, 13 * 16(mem); \
241 vmovdqu y6, 14 * 16(mem); \
242 vmovdqu y7, 15 * 16(mem); \
244 #define aria_store_state_8way(x0, x1, x2, x3, \
247 vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
248 vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
249 vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
250 vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
251 vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
252 vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
253 vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
254 vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
/* aria_load_state_8way: inverse of aria_store_state_8way. */
256 #define aria_load_state_8way(x0, x1, x2, x3, \
259 vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
260 vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
261 vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
262 vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
263 vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
264 vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
265 vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
266 vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
/*
 * aria_ark_8way: add (XOR) a byte-sliced round key.  Key dwords are
 * broadcast from (rk + round*16 + idx) and bytes extracted with
 * vpsrld/vpshufb.  NOTE(review): the vpxor lines that actually apply
 * the key to x0..x7 are missing from this excerpt.
 */
268 #define aria_ark_8way(x0, x1, x2, x3, \
273 vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
274 vpsrld $24, t0, t2; \
275 vpshufb t1, t2, t2; \
277 vpsrld $16, t0, t2; \
278 vpshufb t1, t2, t2; \
281 vpshufb t1, t2, t2; \
283 vpshufb t1, t0, t2; \
285 vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
286 vpsrld $24, t0, t2; \
287 vpshufb t1, t2, t2; \
289 vpsrld $16, t0, t2; \
290 vpshufb t1, t2, t2; \
293 vpshufb t1, t2, t2; \
295 vpshufb t1, t0, t2; \
298 #ifdef CONFIG_AS_GFNI
/* aria_sbox_8way_gfni: ARIA S-layer (S1, S2, X1, X2) via GFNI affine ops. */
299 #define aria_sbox_8way_gfni(x0, x1, x2, x3, \
303 vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
304 vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
305 vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
306 vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
307 vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
308 vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
309 vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
310 vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
311 vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
312 vgf2p8affineinvqb $0, t2, x2, x2; \
313 vgf2p8affineinvqb $0, t2, x6, x6; \
314 vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
315 vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
316 vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
317 vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
318 vgf2p8affineinvqb $0, t2, x3, x3; \
319 vgf2p8affineinvqb $0, t2, x7, x7
321 #endif /* CONFIG_AS_GFNI */
/*
 * aria_sbox_8way: AES-NI based S-layer.  SubBytes is isolated from
 * AESENCLAST/AESDECLAST by undoing ShiftRows with the .L(inv_)shift_row
 * shuffles; S2/X2 are built from AES S-boxes plus affine filters.
 */
323 #define aria_sbox_8way(x0, x1, x2, x3, \
327 vmovdqa .Linv_shift_row(%rip), t0; \
328 vmovdqa .Lshift_row(%rip), t1; \
329 vbroadcastss .L0f0f0f0f(%rip), t6; \
330 vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
331 vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
332 vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
333 vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
335 vaesenclast t7, x0, x0; \
336 vaesenclast t7, x4, x4; \
337 vaesenclast t7, x1, x1; \
338 vaesenclast t7, x5, x5; \
339 vaesdeclast t7, x2, x2; \
340 vaesdeclast t7, x6, x6; \
342 /* AES inverse shift rows */ \
343 vpshufb t0, x0, x0; \
344 vpshufb t0, x4, x4; \
345 vpshufb t0, x1, x1; \
346 vpshufb t0, x5, x5; \
347 vpshufb t1, x3, x3; \
348 vpshufb t1, x7, x7; \
349 vpshufb t1, x2, x2; \
350 vpshufb t1, x6, x6; \
352 /* affine transformation for S2 */ \
353 filter_8bit(x1, t2, t3, t6, t0); \
354 /* affine transformation for S2 */ \
355 filter_8bit(x5, t2, t3, t6, t0); \
357 /* affine transformation for X2 */ \
358 filter_8bit(x3, t4, t5, t6, t0); \
359 /* affine transformation for X2 */ \
360 filter_8bit(x7, t4, t5, t6, t0); \
361 vaesdeclast t7, x3, x3; \
362 vaesdeclast t7, x7, x7;
/* aria_diff_m / aria_diff_word: ARIA diffusion layer (bodies elided here). */
364 #define aria_diff_m(x0, x1, x2, x3, \
366 /* T = rotr32(X, 8); */ \
372 /* X = T ^ rotr(X, 16); */ \
379 #define aria_diff_word(x0, x1, x2, x3, \
/* ARIA round macros: fe (even), fo (odd), ff (final) + GFNI variants. */ \
/* NOTE(review): heavily truncated in this excerpt; kept byte-identical. */ \
419 #define aria_fe(x0, x1, x2, x3, \
423 mem_tmp, rk, round) \
425 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
426 y0, y7, y2, rk, 8, round); \
428 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
429 y0, y1, y2, y3, y4, y5, y6, y7); \
431 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
432 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
433 aria_store_state_8way(x0, x1, x2, x3, \
437 aria_load_state_8way(x0, x1, x2, x3, \
440 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
441 y0, y7, y2, rk, 0, round); \
443 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
444 y0, y1, y2, y3, y4, y5, y6, y7); \
446 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
447 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
448 aria_store_state_8way(x0, x1, x2, x3, \
451 aria_load_state_8way(y0, y1, y2, y3, \
454 aria_diff_word(x0, x1, x2, x3, \
458 /* aria_diff_byte() \
459 * T3 = ABCD -> BADC \
460 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
461 * T0 = ABCD -> CDAB \
462 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
463 * T1 = ABCD -> DCBA \
464 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
466 aria_diff_word(x2, x3, x0, x1, \
470 aria_store_state_8way(x3, x2, x1, x0, \
474 #define aria_fo(x0, x1, x2, x3, \
478 mem_tmp, rk, round) \
480 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
481 y0, y7, y2, rk, 8, round); \
483 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
484 y0, y1, y2, y3, y4, y5, y6, y7); \
486 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
487 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
488 aria_store_state_8way(x0, x1, x2, x3, \
492 aria_load_state_8way(x0, x1, x2, x3, \
495 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
496 y0, y7, y2, rk, 0, round); \
498 aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
499 y0, y1, y2, y3, y4, y5, y6, y7); \
501 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
502 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
503 aria_store_state_8way(x0, x1, x2, x3, \
506 aria_load_state_8way(y0, y1, y2, y3, \
509 aria_diff_word(x0, x1, x2, x3, \
513 /* aria_diff_byte() \
514 * T1 = ABCD -> BADC \
515 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
516 * T2 = ABCD -> CDAB \
517 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
518 * T3 = ABCD -> DCBA \
519 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
521 aria_diff_word(x0, x1, x2, x3, \
525 aria_store_state_8way(x3, x2, x1, x0, \
529 #define aria_ff(x0, x1, x2, x3, \
533 mem_tmp, rk, round, last_round) \
535 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
536 y0, y7, y2, rk, 8, round); \
538 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
539 y0, y1, y2, y3, y4, y5, y6, y7); \
541 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
542 y0, y7, y2, rk, 8, last_round); \
544 aria_store_state_8way(x0, x1, x2, x3, \
548 aria_load_state_8way(x0, x1, x2, x3, \
551 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
552 y0, y7, y2, rk, 0, round); \
554 aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
555 y0, y1, y2, y3, y4, y5, y6, y7); \
557 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
558 y0, y7, y2, rk, 0, last_round); \
560 aria_load_state_8way(y0, y1, y2, y3, \
564 #ifdef CONFIG_AS_GFNI
/* GFNI round variants: same structure, S-layer via aria_sbox_8way_gfni. */
565 #define aria_fe_gfni(x0, x1, x2, x3, \
569 mem_tmp, rk, round) \
571 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
572 y0, y7, y2, rk, 8, round); \
574 aria_sbox_8way_gfni(x2, x3, x0, x1, \
579 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
580 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
581 aria_store_state_8way(x0, x1, x2, x3, \
585 aria_load_state_8way(x0, x1, x2, x3, \
588 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
589 y0, y7, y2, rk, 0, round); \
591 aria_sbox_8way_gfni(x2, x3, x0, x1, \
596 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
597 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
598 aria_store_state_8way(x0, x1, x2, x3, \
601 aria_load_state_8way(y0, y1, y2, y3, \
604 aria_diff_word(x0, x1, x2, x3, \
608 /* aria_diff_byte() \
609 * T3 = ABCD -> BADC \
610 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
611 * T0 = ABCD -> CDAB \
612 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
613 * T1 = ABCD -> DCBA \
614 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
616 aria_diff_word(x2, x3, x0, x1, \
620 aria_store_state_8way(x3, x2, x1, x0, \
624 #define aria_fo_gfni(x0, x1, x2, x3, \
628 mem_tmp, rk, round) \
630 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
631 y0, y7, y2, rk, 8, round); \
633 aria_sbox_8way_gfni(x0, x1, x2, x3, \
638 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
639 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
640 aria_store_state_8way(x0, x1, x2, x3, \
644 aria_load_state_8way(x0, x1, x2, x3, \
647 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
648 y0, y7, y2, rk, 0, round); \
650 aria_sbox_8way_gfni(x0, x1, x2, x3, \
655 aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
656 aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
657 aria_store_state_8way(x0, x1, x2, x3, \
660 aria_load_state_8way(y0, y1, y2, y3, \
663 aria_diff_word(x0, x1, x2, x3, \
667 /* aria_diff_byte() \
668 * T1 = ABCD -> BADC \
669 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
670 * T2 = ABCD -> CDAB \
671 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
672 * T3 = ABCD -> DCBA \
673 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
675 aria_diff_word(x0, x1, x2, x3, \
679 aria_store_state_8way(x3, x2, x1, x0, \
683 #define aria_ff_gfni(x0, x1, x2, x3, \
687 mem_tmp, rk, round, last_round) \
689 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
690 y0, y7, y2, rk, 8, round); \
692 aria_sbox_8way_gfni(x2, x3, x0, x1, \
697 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
698 y0, y7, y2, rk, 8, last_round); \
700 aria_store_state_8way(x0, x1, x2, x3, \
704 aria_load_state_8way(x0, x1, x2, x3, \
707 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
708 y0, y7, y2, rk, 0, round); \
710 aria_sbox_8way_gfni(x2, x3, x0, x1, \
715 aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
716 y0, y7, y2, rk, 0, last_round); \
718 aria_load_state_8way(y0, y1, y2, y3, \
722 #endif /* CONFIG_AS_GFNI */
/*
 * Read-only constant tables: shuffle masks, affine-filter lo/hi nibble
 * tables, and (under CONFIG_AS_GFNI) 8x8 bit-matrices for vgf2p8affine*.
 * NOTE(review): several labels (.Lshufb_16x16b, .Linv_shift_row,
 * .Lshift_row, .Lbswap128_mask, the .Ltf_*_bitmatrix labels, .L0f0f0f0f)
 * referenced by the code appear to have been elided by sampling here.
 */
724 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
725 .section .rodata.cst16, "aM", @progbits, 16
728 #define SHUFB_BYTES(idx) \
729 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
732 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
733 /* For isolating SubBytes from AESENCLAST, inverse shift row */
735 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
736 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
738 .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
739 .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
740 /* For CTR-mode IV byteswap */
742 .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
743 .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
745 /* AES inverse affine and S2 combined:
746 * 1 1 0 0 0 0 0 1 x0 0
747 * 0 1 0 0 1 0 0 0 x1 0
748 * 1 1 0 0 1 1 1 1 x2 0
749 * 0 1 1 0 1 0 0 1 x3 1
750 * 0 1 0 0 1 1 0 0 * x4 + 0
751 * 0 1 0 1 1 0 0 0 x5 0
752 * 0 0 0 0 0 1 0 1 x6 0
753 * 1 1 1 0 0 1 1 1 x7 1
755 .Ltf_lo__inv_aff__and__s2:
756 .octa 0x92172DA81A9FA520B2370D883ABF8500
757 .Ltf_hi__inv_aff__and__s2:
758 .octa 0x2B15FFC1AF917B45E6D8320C625CB688
760 /* X2 and AES forward affine combined:
761 * 1 0 1 1 0 0 0 1 x0 0
762 * 0 1 1 1 1 0 1 1 x1 0
763 * 0 0 0 1 1 0 1 0 x2 1
764 * 0 1 0 0 0 1 0 0 x3 0
765 * 0 0 1 1 1 0 1 1 * x4 + 0
766 * 0 1 0 0 1 0 0 0 x5 0
767 * 1 1 0 1 0 0 1 1 x6 0
768 * 0 1 0 0 1 0 1 0 x7 0
770 .Ltf_lo__x2__and__fwd_aff:
771 .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
772 .Ltf_hi__x2__and__fwd_aff:
773 .octa 0x3F893781E95FE1576CDA64D2BA0CB204
775 #ifdef CONFIG_AS_GFNI
/* GFNI bit-matrices are stored twice (two .quad rows = one 128-bit xmm). */
777 #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
779 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
780 BV8(1, 1, 0, 0, 0, 1, 1, 1),
781 BV8(1, 1, 1, 0, 0, 0, 1, 1),
782 BV8(1, 1, 1, 1, 0, 0, 0, 1),
783 BV8(1, 1, 1, 1, 1, 0, 0, 0),
784 BV8(0, 1, 1, 1, 1, 1, 0, 0),
785 BV8(0, 0, 1, 1, 1, 1, 1, 0),
786 BV8(0, 0, 0, 1, 1, 1, 1, 1))
787 .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
788 BV8(1, 1, 0, 0, 0, 1, 1, 1),
789 BV8(1, 1, 1, 0, 0, 0, 1, 1),
790 BV8(1, 1, 1, 1, 0, 0, 0, 1),
791 BV8(1, 1, 1, 1, 1, 0, 0, 0),
792 BV8(0, 1, 1, 1, 1, 1, 0, 0),
793 BV8(0, 0, 1, 1, 1, 1, 1, 0),
794 BV8(0, 0, 0, 1, 1, 1, 1, 1))
796 /* AES inverse affine: */
797 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
799 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
800 BV8(1, 0, 0, 1, 0, 0, 1, 0),
801 BV8(0, 1, 0, 0, 1, 0, 0, 1),
802 BV8(1, 0, 1, 0, 0, 1, 0, 0),
803 BV8(0, 1, 0, 1, 0, 0, 1, 0),
804 BV8(0, 0, 1, 0, 1, 0, 0, 1),
805 BV8(1, 0, 0, 1, 0, 1, 0, 0),
806 BV8(0, 1, 0, 0, 1, 0, 1, 0))
807 .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
808 BV8(1, 0, 0, 1, 0, 0, 1, 0),
809 BV8(0, 1, 0, 0, 1, 0, 0, 1),
810 BV8(1, 0, 1, 0, 0, 1, 0, 0),
811 BV8(0, 1, 0, 1, 0, 0, 1, 0),
812 BV8(0, 0, 1, 0, 1, 0, 0, 1),
813 BV8(1, 0, 0, 1, 0, 1, 0, 0),
814 BV8(0, 1, 0, 0, 1, 0, 1, 0))
817 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
819 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
820 BV8(0, 0, 1, 1, 1, 1, 1, 1),
821 BV8(1, 1, 1, 0, 1, 1, 0, 1),
822 BV8(1, 1, 0, 0, 0, 0, 1, 1),
823 BV8(0, 1, 0, 0, 0, 0, 1, 1),
824 BV8(1, 1, 0, 0, 1, 1, 1, 0),
825 BV8(0, 1, 1, 0, 0, 0, 1, 1),
826 BV8(1, 1, 1, 1, 0, 1, 1, 0))
827 .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
828 BV8(0, 0, 1, 1, 1, 1, 1, 1),
829 BV8(1, 1, 1, 0, 1, 1, 0, 1),
830 BV8(1, 1, 0, 0, 0, 0, 1, 1),
831 BV8(0, 1, 0, 0, 0, 0, 1, 1),
832 BV8(1, 1, 0, 0, 1, 1, 1, 0),
833 BV8(0, 1, 1, 0, 0, 0, 1, 1),
834 BV8(1, 1, 1, 1, 0, 1, 1, 0))
837 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
839 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
840 BV8(0, 0, 1, 0, 0, 1, 1, 0),
841 BV8(0, 0, 0, 0, 1, 0, 1, 0),
842 BV8(1, 1, 1, 0, 0, 0, 1, 1),
843 BV8(1, 1, 1, 0, 1, 1, 0, 0),
844 BV8(0, 1, 1, 0, 1, 0, 1, 1),
845 BV8(1, 0, 1, 1, 1, 1, 0, 1),
846 BV8(1, 0, 0, 1, 0, 0, 1, 1))
847 .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
848 BV8(0, 0, 1, 0, 0, 1, 1, 0),
849 BV8(0, 0, 0, 0, 1, 0, 1, 0),
850 BV8(1, 1, 1, 0, 0, 0, 1, 1),
851 BV8(1, 1, 1, 0, 1, 1, 0, 0),
852 BV8(0, 1, 1, 0, 1, 0, 1, 1),
853 BV8(1, 0, 1, 1, 1, 1, 0, 1),
854 BV8(1, 0, 0, 1, 0, 0, 1, 1))
856 /* Identity matrix: */
858 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
859 BV8(0, 1, 0, 0, 0, 0, 0, 0),
860 BV8(0, 0, 1, 0, 0, 0, 0, 0),
861 BV8(0, 0, 0, 1, 0, 0, 0, 0),
862 BV8(0, 0, 0, 0, 1, 0, 0, 0),
863 BV8(0, 0, 0, 0, 0, 1, 0, 0),
864 BV8(0, 0, 0, 0, 0, 0, 1, 0),
865 BV8(0, 0, 0, 0, 0, 0, 0, 1))
866 .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
867 BV8(0, 1, 0, 0, 0, 0, 0, 0),
868 BV8(0, 0, 1, 0, 0, 0, 0, 0),
869 BV8(0, 0, 0, 1, 0, 0, 0, 0),
870 BV8(0, 0, 0, 0, 1, 0, 0, 0),
871 BV8(0, 0, 0, 0, 0, 1, 0, 0),
872 BV8(0, 0, 0, 0, 0, 0, 1, 0),
873 BV8(0, 0, 0, 0, 0, 0, 0, 1))
874 #endif /* CONFIG_AS_GFNI */
877 .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
/*
 * __aria_aesni_avx_crypt_16way: full ARIA pass over 16 byte-sliced blocks.
 * Round count is selected at runtime from ARIA_CTX_rounds (12/14/16,
 * i.e. ARIA-128/192/256).  NOTE(review): this excerpt is sampled --
 * FRAME_BEGIN/END, branch targets and the ret are among the elided lines.
 */
884 SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
889 * %xmm0..%xmm15: 16 byte-sliced blocks
895 leaq 8 * 16(%rax), %r8;
897 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
898 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
900 aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
901 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
903 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
904 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
905 %xmm15, %rax, %r9, 1);
906 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
907 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
909 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
910 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
911 %xmm15, %rax, %r9, 3);
912 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
913 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
915 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
916 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
917 %xmm15, %rax, %r9, 5);
918 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
919 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
921 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
922 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
923 %xmm15, %rax, %r9, 7);
924 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
925 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
927 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
928 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
929 %xmm15, %rax, %r9, 9);
930 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
931 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
/* 12 rounds => ARIA-128: final round pair 11/12. */
933 cmpl $12, ARIA_CTX_rounds(CTX);
935 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
936 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
937 %xmm15, %rax, %r9, 11, 12);
940 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
941 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
942 %xmm15, %rax, %r9, 11);
943 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
944 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
/* 14 rounds => ARIA-192: final round pair 13/14; else fall through to 15/16. */
946 cmpl $14, ARIA_CTX_rounds(CTX);
948 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
949 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
950 %xmm15, %rax, %r9, 13, 14);
953 aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
954 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
955 %xmm15, %rax, %r9, 13);
956 aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
957 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
959 aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
960 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
961 %xmm15, %rax, %r9, 15, 16);
963 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
964 %xmm9, %xmm13, %xmm0, %xmm5,
965 %xmm10, %xmm14, %xmm3, %xmm6,
966 %xmm11, %xmm15, %xmm2, %xmm7,
971 SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
/* Public entry: encrypt 16 blocks with the encryption key schedule. */
973 SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
982 leaq ARIA_CTX_enc_key(CTX), %r9;
984 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
985 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
988 call __aria_aesni_avx_crypt_16way;
990 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
991 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
996 SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
/* Public entry: decrypt 16 blocks with the decryption key schedule. */
998 SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
1007 leaq ARIA_CTX_dec_key(CTX), %r9;
1009 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1010 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1013 call __aria_aesni_avx_crypt_16way;
1015 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1016 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1021 SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
/*
 * __aria_aesni_avx_ctr_gen_keystream_16way: materialize 16 big-endian
 * counter blocks from the IV at (%r8), write them to (%rcx), leave the
 * incremented IV back in (%r8) and the first 8 counters in %xmm0..%xmm7.
 * NOTE(review): sampled excerpt -- FRAME_* and ret lines elided.
 */
1023 SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1029 * %r8: iv (big endian, 128bit)
1033 /* load IV and byteswap */
1034 vmovdqu (%r8), %xmm8;
1036 vmovdqa .Lbswap128_mask (%rip), %xmm1;
1037 vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1039 vpcmpeqd %xmm0, %xmm0, %xmm0;
1040 vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
/* counters 1..7 (counter 0 is the original IV in %xmm8) */
1043 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044 vpshufb %xmm1, %xmm3, %xmm9;
1045 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046 vpshufb %xmm1, %xmm3, %xmm10;
1047 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048 vpshufb %xmm1, %xmm3, %xmm11;
1049 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050 vpshufb %xmm1, %xmm3, %xmm12;
1051 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052 vpshufb %xmm1, %xmm3, %xmm13;
1053 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054 vpshufb %xmm1, %xmm3, %xmm14;
1055 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056 vpshufb %xmm1, %xmm3, %xmm15;
1057 vmovdqu %xmm8, (0 * 16)(%rcx);
1058 vmovdqu %xmm9, (1 * 16)(%rcx);
1059 vmovdqu %xmm10, (2 * 16)(%rcx);
1060 vmovdqu %xmm11, (3 * 16)(%rcx);
1061 vmovdqu %xmm12, (4 * 16)(%rcx);
1062 vmovdqu %xmm13, (5 * 16)(%rcx);
1063 vmovdqu %xmm14, (6 * 16)(%rcx);
1064 vmovdqu %xmm15, (7 * 16)(%rcx);
/* counters 8..15, then the next IV (+16 total) written back to (%r8) */
1066 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067 vpshufb %xmm1, %xmm3, %xmm8;
1068 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069 vpshufb %xmm1, %xmm3, %xmm9;
1070 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071 vpshufb %xmm1, %xmm3, %xmm10;
1072 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073 vpshufb %xmm1, %xmm3, %xmm11;
1074 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075 vpshufb %xmm1, %xmm3, %xmm12;
1076 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1077 vpshufb %xmm1, %xmm3, %xmm13;
1078 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1079 vpshufb %xmm1, %xmm3, %xmm14;
1080 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1081 vpshufb %xmm1, %xmm3, %xmm15;
1082 inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1083 vpshufb %xmm1, %xmm3, %xmm4;
1084 vmovdqu %xmm4, (%r8);
1086 vmovdqu (0 * 16)(%rcx), %xmm0;
1087 vmovdqu (1 * 16)(%rcx), %xmm1;
1088 vmovdqu (2 * 16)(%rcx), %xmm2;
1089 vmovdqu (3 * 16)(%rcx), %xmm3;
1090 vmovdqu (4 * 16)(%rcx), %xmm4;
1091 vmovdqu (5 * 16)(%rcx), %xmm5;
1092 vmovdqu (6 * 16)(%rcx), %xmm6;
1093 vmovdqu (7 * 16)(%rcx), %xmm7;
1097 SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
/*
 * CTR mode: encrypt 16 counter blocks, then XOR keystream with the
 * 16 source blocks at (%r11).
 */
1099 SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1105 * %r8: iv (big endian, 128bit)
1109 call __aria_aesni_avx_ctr_gen_keystream_16way;
1115 leaq ARIA_CTX_enc_key(CTX), %r9;
1117 call __aria_aesni_avx_crypt_16way;
1119 vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1120 vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1121 vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1122 vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1123 vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1124 vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1125 vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1126 vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1127 vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1128 vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1129 vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1130 vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1131 vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1132 vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1133 vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1134 vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1135 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1136 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1141 SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
/*
 * GFNI variants of the crypt core and public entry points.  Structure
 * mirrors the AES-NI versions above, with the S-layer done through the
 * *_gfni round macros.  NOTE(review): sampled excerpt -- branch-target
 * labels (.Laria_gfni_192 / .Laria_gfni_256 / .Laria_gfni_end),
 * FRAME_* and ret lines are among the elided lines.
 */
1143 #ifdef CONFIG_AS_GFNI
1144 SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1149 * %xmm0..%xmm15: 16 byte-sliced blocks
1155 leaq 8 * 16(%rax), %r8;
1157 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1158 %xmm4, %xmm5, %xmm6, %xmm7,
1159 %xmm8, %xmm9, %xmm10, %xmm11,
1160 %xmm12, %xmm13, %xmm14,
1162 aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1163 %xmm12, %xmm13, %xmm14, %xmm15,
1164 %xmm0, %xmm1, %xmm2, %xmm3,
1165 %xmm4, %xmm5, %xmm6, %xmm7,
1167 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168 %xmm4, %xmm5, %xmm6, %xmm7,
1169 %xmm8, %xmm9, %xmm10, %xmm11,
1170 %xmm12, %xmm13, %xmm14,
1171 %xmm15, %rax, %r9, 1);
1172 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173 %xmm12, %xmm13, %xmm14, %xmm15,
1174 %xmm0, %xmm1, %xmm2, %xmm3,
1175 %xmm4, %xmm5, %xmm6, %xmm7,
1177 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1178 %xmm4, %xmm5, %xmm6, %xmm7,
1179 %xmm8, %xmm9, %xmm10, %xmm11,
1180 %xmm12, %xmm13, %xmm14,
1181 %xmm15, %rax, %r9, 3);
1182 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1183 %xmm12, %xmm13, %xmm14, %xmm15,
1184 %xmm0, %xmm1, %xmm2, %xmm3,
1185 %xmm4, %xmm5, %xmm6, %xmm7,
1187 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1188 %xmm4, %xmm5, %xmm6, %xmm7,
1189 %xmm8, %xmm9, %xmm10, %xmm11,
1190 %xmm12, %xmm13, %xmm14,
1191 %xmm15, %rax, %r9, 5);
1192 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1193 %xmm12, %xmm13, %xmm14, %xmm15,
1194 %xmm0, %xmm1, %xmm2, %xmm3,
1195 %xmm4, %xmm5, %xmm6, %xmm7,
1197 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1198 %xmm4, %xmm5, %xmm6, %xmm7,
1199 %xmm8, %xmm9, %xmm10, %xmm11,
1200 %xmm12, %xmm13, %xmm14,
1201 %xmm15, %rax, %r9, 7);
1202 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1203 %xmm12, %xmm13, %xmm14, %xmm15,
1204 %xmm0, %xmm1, %xmm2, %xmm3,
1205 %xmm4, %xmm5, %xmm6, %xmm7,
1207 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1208 %xmm4, %xmm5, %xmm6, %xmm7,
1209 %xmm8, %xmm9, %xmm10, %xmm11,
1210 %xmm12, %xmm13, %xmm14,
1211 %xmm15, %rax, %r9, 9);
1212 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1213 %xmm12, %xmm13, %xmm14, %xmm15,
1214 %xmm0, %xmm1, %xmm2, %xmm3,
1215 %xmm4, %xmm5, %xmm6, %xmm7,
/* 12 rounds => ARIA-128: final round pair 11/12. */
1217 cmpl $12, ARIA_CTX_rounds(CTX);
1218 jne .Laria_gfni_192;
1219 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1220 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1221 %xmm15, %rax, %r9, 11, 12);
1222 jmp .Laria_gfni_end;
1224 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1225 %xmm4, %xmm5, %xmm6, %xmm7,
1226 %xmm8, %xmm9, %xmm10, %xmm11,
1227 %xmm12, %xmm13, %xmm14,
1228 %xmm15, %rax, %r9, 11);
1229 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1230 %xmm12, %xmm13, %xmm14, %xmm15,
1231 %xmm0, %xmm1, %xmm2, %xmm3,
1232 %xmm4, %xmm5, %xmm6, %xmm7,
/* 14 rounds => ARIA-192: final round pair 13/14; else ARIA-256 (15/16). */
1234 cmpl $14, ARIA_CTX_rounds(CTX);
1235 jne .Laria_gfni_256;
1236 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1237 %xmm4, %xmm5, %xmm6, %xmm7,
1238 %xmm8, %xmm9, %xmm10, %xmm11,
1239 %xmm12, %xmm13, %xmm14,
1240 %xmm15, %rax, %r9, 13, 14);
1241 jmp .Laria_gfni_end;
1243 aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1244 %xmm4, %xmm5, %xmm6, %xmm7,
1245 %xmm8, %xmm9, %xmm10, %xmm11,
1246 %xmm12, %xmm13, %xmm14,
1247 %xmm15, %rax, %r9, 13);
1248 aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1249 %xmm12, %xmm13, %xmm14, %xmm15,
1250 %xmm0, %xmm1, %xmm2, %xmm3,
1251 %xmm4, %xmm5, %xmm6, %xmm7,
1253 aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1254 %xmm4, %xmm5, %xmm6, %xmm7,
1255 %xmm8, %xmm9, %xmm10, %xmm11,
1256 %xmm12, %xmm13, %xmm14,
1257 %xmm15, %rax, %r9, 15, 16);
1259 debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1260 %xmm9, %xmm13, %xmm0, %xmm5,
1261 %xmm10, %xmm14, %xmm3, %xmm6,
1262 %xmm11, %xmm15, %xmm2, %xmm7,
1267 SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
/* Public GFNI entry: encrypt 16 blocks with the encryption key schedule. */
1269 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1278 leaq ARIA_CTX_enc_key(CTX), %r9;
1280 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1281 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1284 call __aria_aesni_avx_gfni_crypt_16way;
1286 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1287 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1292 SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
/* Public GFNI entry: decrypt 16 blocks with the decryption key schedule. */
1294 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1303 leaq ARIA_CTX_dec_key(CTX), %r9;
1305 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1306 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1309 call __aria_aesni_avx_gfni_crypt_16way;
1311 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1312 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1317 SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
/* Public GFNI entry: CTR mode, keystream XORed with src at (%r11). */
1319 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1325 * %r8: iv (big endian, 128bit)
1329 call __aria_aesni_avx_ctr_gen_keystream_16way
1335 leaq ARIA_CTX_enc_key(CTX), %r9;
1337 call __aria_aesni_avx_gfni_crypt_16way;
1339 vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1340 vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1341 vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1342 vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1343 vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1344 vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1345 vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1346 vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1347 vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1348 vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1349 vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1350 vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1351 vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1352 vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1353 vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1354 vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1355 write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1356 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1361 SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362 #endif /* CONFIG_AS_GFNI */