/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi
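
/*
 * Register aliases: ymmN_x names the low 128-bit (xmm) half of ymmN.
 * They are used through token pasting (reg##_x), so a 256-bit register
 * argument can also be fed to 128-bit AES-NI instructions.
 */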
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
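
/*
 * BV8 packs eight bit arguments into one byte, a0 in bit 0 (LSB).
 * BM8X8 packs eight such rows into a 64-bit bit-matrix, row l0 in the
 * most significant byte, which is the row order consumed by the
 * vgf2p8affineqb/vgf2p8affineinvqb instructions below.
 */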
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)             \
        ( (((a0) & 1) << 0) |                           \
          (((a1) & 1) << 1) |                           \
          (((a2) & 1) << 2) |                           \
          (((a3) & 1) << 3) |                           \
          (((a4) & 1) << 4) |                           \
          (((a5) & 1) << 5) |                           \
          (((a6) & 1) << 6) |                           \
          (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)           \
        ( ((l7) << (0 * 8)) |                           \
          ((l6) << (1 * 8)) |                           \
          ((l5) << (2 * 8)) |                           \
          ((l4) << (3 * 8)) |                           \
          ((l3) << (4 * 8)) |                           \
          ((l2) << (5 * 8)) |                           \
          ((l1) << (6 * 8)) |                           \
          ((l0) << (7 * 8)) )
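
/*
 * Increment a 128-bit little-endian counter in each 128-bit lane of x.
 * minus_one holds -1 in the low qword of each lane (high qword zero):
 * subtracting it adds 1 to the low qword, vpcmpeqq detects low qwords
 * that are about to wrap, and the mask, shifted up into the high qword
 * position, is subtracted to propagate the carry.
 */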
#define inc_le128(x, minus_one, tmp)                    \
        vpcmpeqq minus_one, x, tmp;                     \
        vpsubq minus_one, x, x;                         \
        vpslldq $8, tmp, tmp;                           \
        vpsubq tmp, x, x;
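
/*
 * Apply an 8-bit -> 8-bit function that decomposes into two 4-bit
 * lookups: tmp0 gets the low nibbles, x the high nibbles, each indexes
 * a 16-entry vpshufb table (lo_t/hi_t), and the results are XORed.
 * mask4bit must hold 0x0f in every byte.
 */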
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)      \
        vpand x, mask4bit, tmp0;                        \
        vpandn x, mask4bit, x;                          \
        vpsrld $4, x, x;                                \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;
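
/*
 * Transpose a 4x4 matrix of 32-bit words held one row per register,
 * independently in each 128-bit lane, using dword/qword unpacks.
 * t1 and t2 are clobbered as scratch.
 */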
#define transpose_4x4(x0, x1, x2, x3, t1, t2)           \
        vpunpckhdq x1, x0, t2;                          \
        vpunpckldq x1, x0, x0;                          \
                                                        \
        vpunpckldq x3, x2, t1;                          \
        vpunpckhdq x3, x2, x2;                          \
                                                        \
        vpunpckhqdq t1, x0, x1;                         \
        vpunpcklqdq t1, x0, x0;                         \
                                                        \
        vpunpckhqdq x2, t2, x3;                         \
        vpunpcklqdq x2, t2, x2;
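
/*
 * Byte-slice: transpose a 16x16 byte matrix spread across 16 registers
 * so that each register ends up holding one byte position of every
 * block.  Built from 4x4 dword transposes plus a per-dword byte shuffle
 * (.Lshufb_16x16b); st0/st1 are in-memory spill slots.
 * debyteslice_16x16b below is the inverse arrangement.
 */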
#define byteslice_16x16b(a0, b0, c0, d0,                \
                         a1, b1, c1, d1,                \
                         a2, b2, c2, d2,                \
                         a3, b3, c3, d3,                \
                         st0, st1)                      \
        vmovdqu d2, st0;                                \
        vmovdqu d3, st1;                                \
        transpose_4x4(a0, a1, a2, a3, d2, d3);          \
        transpose_4x4(b0, b1, b2, b3, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu a0, st0;                                \
        vmovdqu a1, st1;                                \
        transpose_4x4(c0, c1, c2, c3, a0, a1);          \
        transpose_4x4(d0, d1, d2, d3, a0, a1);          \
                                                        \
        vbroadcasti128 .Lshufb_16x16b(%rip), a0;        \
        vmovdqu st1, a1;                                \
        vpshufb a0, a2, a2;                             \
        vpshufb a0, a3, a3;                             \
        vpshufb a0, b0, b0;                             \
        vpshufb a0, b1, b1;                             \
        vpshufb a0, b2, b2;                             \
        vpshufb a0, b3, b3;                             \
        vpshufb a0, a1, a1;                             \
        vpshufb a0, c0, c0;                             \
        vpshufb a0, c1, c1;                             \
        vpshufb a0, c2, c2;                             \
        vpshufb a0, c3, c3;                             \
        vpshufb a0, d0, d0;                             \
        vpshufb a0, d1, d1;                             \
        vpshufb a0, d2, d2;                             \
        vpshufb a0, d3, d3;                             \
        vmovdqu d3, st1;                                \
        vmovdqu st0, d3;                                \
        vpshufb a0, d3, a0;                             \
        vmovdqu d2, st0;                                \
                                                        \
        transpose_4x4(a0, b0, c0, d0, d2, d3);          \
        transpose_4x4(a1, b1, c1, d1, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu b0, st0;                                \
        vmovdqu b1, st1;                                \
        transpose_4x4(a2, b2, c2, d2, b0, b1);          \
        transpose_4x4(a3, b3, c3, d3, b0, b1);          \
        vmovdqu st0, b0;                                \
        vmovdqu st1, b1;                                \
        /* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,              \
                           a1, b1, c1, d1,              \
                           a2, b2, c2, d2,              \
                           a3, b3, c3, d3,              \
                           st0, st1)                    \
        vmovdqu d2, st0;                                \
        vmovdqu d3, st1;                                \
        transpose_4x4(a0, a1, a2, a3, d2, d3);          \
        transpose_4x4(b0, b1, b2, b3, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu a0, st0;                                \
        vmovdqu a1, st1;                                \
        transpose_4x4(c0, c1, c2, c3, a0, a1);          \
        transpose_4x4(d0, d1, d2, d3, a0, a1);          \
                                                        \
        vbroadcasti128 .Lshufb_16x16b(%rip), a0;        \
        vmovdqu st1, a1;                                \
        vpshufb a0, a2, a2;                             \
        vpshufb a0, a3, a3;                             \
        vpshufb a0, b0, b0;                             \
        vpshufb a0, b1, b1;                             \
        vpshufb a0, b2, b2;                             \
        vpshufb a0, b3, b3;                             \
        vpshufb a0, a1, a1;                             \
        vpshufb a0, c0, c0;                             \
        vpshufb a0, c1, c1;                             \
        vpshufb a0, c2, c2;                             \
        vpshufb a0, c3, c3;                             \
        vpshufb a0, d0, d0;                             \
        vpshufb a0, d1, d1;                             \
        vpshufb a0, d2, d2;                             \
        vpshufb a0, d3, d3;                             \
        vmovdqu d3, st1;                                \
        vmovdqu st0, d3;                                \
        vpshufb a0, d3, a0;                             \
        vmovdqu d2, st0;                                \
                                                        \
        transpose_4x4(c0, d0, a0, b0, d2, d3);          \
        transpose_4x4(c1, d1, a1, b1, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu b0, st0;                                \
        vmovdqu b1, st1;                                \
        transpose_4x4(c2, d2, a2, b2, b0, b1);          \
        transpose_4x4(c3, d3, a3, b3, b0, b1);          \
        vmovdqu st0, b0;                                \
        vmovdqu st1, b1;                                \
        /* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     rio)                               \
        vmovdqu (0 * 32)(rio), x0;                      \
        vmovdqu (1 * 32)(rio), x1;                      \
        vmovdqu (2 * 32)(rio), x2;                      \
        vmovdqu (3 * 32)(rio), x3;                      \
        vmovdqu (4 * 32)(rio), x4;                      \
        vmovdqu (5 * 32)(rio), x5;                      \
        vmovdqu (6 * 32)(rio), x6;                      \
        vmovdqu (7 * 32)(rio), x7;                      \
        vmovdqu (8 * 32)(rio), y0;                      \
        vmovdqu (9 * 32)(rio), y1;                      \
        vmovdqu (10 * 32)(rio), y2;                     \
        vmovdqu (11 * 32)(rio), y3;                     \
        vmovdqu (12 * 32)(rio), y4;                     \
        vmovdqu (13 * 32)(rio), y5;                     \
        vmovdqu (14 * 32)(rio), y6;                     \
        vmovdqu (15 * 32)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,                   \
                      x4, x5, x6, x7,                   \
                      y0, y1, y2, y3,                   \
                      y4, y5, y6, y7,                   \
                      mem_ab, mem_cd)                   \
        byteslice_16x16b(x0, x1, x2, x3,                \
                         x4, x5, x6, x7,                \
                         y0, y1, y2, y3,                \
                         y4, y5, y6, y7,                \
                         (mem_ab), (mem_cd));           \
                                                        \
        vmovdqu x0, 0 * 32(mem_ab);                     \
        vmovdqu x1, 1 * 32(mem_ab);                     \
        vmovdqu x2, 2 * 32(mem_ab);                     \
        vmovdqu x3, 3 * 32(mem_ab);                     \
        vmovdqu x4, 4 * 32(mem_ab);                     \
        vmovdqu x5, 5 * 32(mem_ab);                     \
        vmovdqu x6, 6 * 32(mem_ab);                     \
        vmovdqu x7, 7 * 32(mem_ab);                     \
        vmovdqu y0, 0 * 32(mem_cd);                     \
        vmovdqu y1, 1 * 32(mem_cd);                     \
        vmovdqu y2, 2 * 32(mem_cd);                     \
        vmovdqu y3, 3 * 32(mem_cd);                     \
        vmovdqu y4, 4 * 32(mem_cd);                     \
        vmovdqu y5, 5 * 32(mem_cd);                     \
        vmovdqu y6, 6 * 32(mem_cd);                     \
        vmovdqu y7, 7 * 32(mem_cd);
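
/* write all 16 state registers back to contiguous memory at mem */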
#define write_output(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem)                               \
        vmovdqu x0, 0 * 32(mem);                        \
        vmovdqu x1, 1 * 32(mem);                        \
        vmovdqu x2, 2 * 32(mem);                        \
        vmovdqu x3, 3 * 32(mem);                        \
        vmovdqu x4, 4 * 32(mem);                        \
        vmovdqu x5, 5 * 32(mem);                        \
        vmovdqu x6, 6 * 32(mem);                        \
        vmovdqu x7, 7 * 32(mem);                        \
        vmovdqu y0, 8 * 32(mem);                        \
        vmovdqu y1, 9 * 32(mem);                        \
        vmovdqu y2, 10 * 32(mem);                       \
        vmovdqu y3, 11 * 32(mem);                       \
        vmovdqu y4, 12 * 32(mem);                       \
        vmovdqu y5, 13 * 32(mem);                       \
        vmovdqu y6, 14 * 32(mem);                       \
        vmovdqu y7, 15 * 32(mem);

#define aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, idx)             \
        vmovdqu x0, ((idx + 0) * 32)(mem_tmp);          \
        vmovdqu x1, ((idx + 1) * 32)(mem_tmp);          \
        vmovdqu x2, ((idx + 2) * 32)(mem_tmp);          \
        vmovdqu x3, ((idx + 3) * 32)(mem_tmp);          \
        vmovdqu x4, ((idx + 4) * 32)(mem_tmp);          \
        vmovdqu x5, ((idx + 5) * 32)(mem_tmp);          \
        vmovdqu x6, ((idx + 6) * 32)(mem_tmp);          \
        vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, idx)              \
        vmovdqu ((idx + 0) * 32)(mem_tmp), x0;          \
        vmovdqu ((idx + 1) * 32)(mem_tmp), x1;          \
        vmovdqu ((idx + 2) * 32)(mem_tmp), x2;          \
        vmovdqu ((idx + 3) * 32)(mem_tmp), x3;          \
        vmovdqu ((idx + 4) * 32)(mem_tmp), x4;          \
        vmovdqu ((idx + 5) * 32)(mem_tmp), x5;          \
        vmovdqu ((idx + 6) * 32)(mem_tmp), x6;          \
        vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
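
/*
 * AddRoundKey on byte-sliced state: each register holds one byte
 * position of every block, so the matching round-key byte is broadcast
 * across the whole register with vpbroadcastb and XORed in.
 */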
#define aria_ark_8way(x0, x1, x2, x3,                   \
                      x4, x5, x6, x7,                   \
                      t0, rk, idx, round)               \
        /* AddRoundKey */                               \
        vpbroadcastb ((round * 16) + idx + 3)(rk), t0;  \
        vpxor t0, x0, x0;                               \
        vpbroadcastb ((round * 16) + idx + 2)(rk), t0;  \
        vpxor t0, x1, x1;                               \
        vpbroadcastb ((round * 16) + idx + 1)(rk), t0;  \
        vpxor t0, x2, x2;                               \
        vpbroadcastb ((round * 16) + idx + 0)(rk), t0;  \
        vpxor t0, x3, x3;                               \
        vpbroadcastb ((round * 16) + idx + 7)(rk), t0;  \
        vpxor t0, x4, x4;                               \
        vpbroadcastb ((round * 16) + idx + 6)(rk), t0;  \
        vpxor t0, x5, x5;                               \
        vpbroadcastb ((round * 16) + idx + 5)(rk), t0;  \
        vpxor t0, x6, x6;                               \
        vpbroadcastb ((round * 16) + idx + 4)(rk), t0;  \
        vpxor t0, x7, x7;
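
/*
 * GFNI variant of the ARIA S-layer: S1 (x0/x4) and S2 (x1/x5) are
 * computed directly with vgf2p8affineinvqb (affine transform applied
 * after GF(2^8) inversion); the inverse boxes X1 (x2/x6) and X2 (x3/x7)
 * apply the corresponding inverse affine first, then invert through the
 * identity bit-matrix.
 */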
#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            t0, t1, t2, t3,             \
                            t4, t5, t6, t7)             \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;       \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;      \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;       \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;      \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;       \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
        vgf2p8affineinvqb $0, t2, x2, x2;               \
        vgf2p8affineinvqb $0, t2, x6, x6;               \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
        vgf2p8affineinvqb $0, t2, x3, x3;               \
        vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
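
/*
 * AES-NI variant of the ARIA S-layer.  vaesenclast with an all-zero
 * round key reduces to ShiftRows+SubBytes, so shuffling with
 * .Linv_shift_row afterwards leaves pure SubBytes (ARIA's S1);
 * vaesdeclast plus .Lshift_row likewise yields InvSubBytes (X1).  S2 is
 * derived by post-filtering SubBytes output with the combined "AES
 * inverse affine and S2" tables, X2 by pre-filtering with the combined
 * "X2 and AES forward affine" tables before InvSubBytes.  AES-NI only
 * operates on 128-bit registers, so each ymm is processed as two
 * halves via vextracti128/vinserti128.
 */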
#define aria_sbox_8way(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       t0, t1, t2, t3,                  \
                       t4, t5, t6, t7)                  \
        vpxor t7, t7, t7;                               \
        vpxor t6, t6, t6;                               \
        vbroadcasti128 .Linv_shift_row(%rip), t0;       \
        vbroadcasti128 .Lshift_row(%rip), t1;           \
        vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
        vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
        vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
        vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
                                                        \
        vextracti128 $1, x0, t6##_x;                    \
        vaesenclast t7##_x, x0##_x, x0##_x;             \
        vaesenclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x0, x0;                 \
                                                        \
        vextracti128 $1, x4, t6##_x;                    \
        vaesenclast t7##_x, x4##_x, x4##_x;             \
        vaesenclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x4, x4;                 \
                                                        \
        vextracti128 $1, x1, t6##_x;                    \
        vaesenclast t7##_x, x1##_x, x1##_x;             \
        vaesenclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x1, x1;                 \
                                                        \
        vextracti128 $1, x5, t6##_x;                    \
        vaesenclast t7##_x, x5##_x, x5##_x;             \
        vaesenclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x5, x5;                 \
                                                        \
        vextracti128 $1, x2, t6##_x;                    \
        vaesdeclast t7##_x, x2##_x, x2##_x;             \
        vaesdeclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x2, x2;                 \
                                                        \
        vextracti128 $1, x6, t6##_x;                    \
        vaesdeclast t7##_x, x6##_x, x6##_x;             \
        vaesdeclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x6, x6;                 \
                                                        \
        vpbroadcastd .L0f0f0f0f(%rip), t6;              \
                                                        \
        /* AES inverse shift rows */                    \
        vpshufb t0, x0, x0;                             \
        vpshufb t0, x4, x4;                             \
        vpshufb t0, x1, x1;                             \
        vpshufb t0, x5, x5;                             \
        vpshufb t1, x3, x3;                             \
        vpshufb t1, x7, x7;                             \
        vpshufb t1, x2, x2;                             \
        vpshufb t1, x6, x6;                             \
                                                        \
        /* affine transformation for S2 */              \
        filter_8bit(x1, t2, t3, t6, t0);                \
        /* affine transformation for S2 */              \
        filter_8bit(x5, t2, t3, t6, t0);                \
                                                        \
        /* affine transformation for X2 */              \
        filter_8bit(x3, t4, t5, t6, t0);                \
        /* affine transformation for X2 */              \
        filter_8bit(x7, t4, t5, t6, t0);                \
                                                        \
        vpxor t6, t6, t6;                               \
        vextracti128 $1, x3, t6##_x;                    \
        vaesdeclast t7##_x, x3##_x, x3##_x;             \
        vaesdeclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x3, x3;                 \
                                                        \
        vextracti128 $1, x7, t6##_x;                    \
        vaesdeclast t7##_x, x7##_x, x7##_x;             \
        vaesdeclast t7##_x, t6##_x, t6##_x;             \
        vinserti128 $1, t6##_x, x7, x7;
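
/*
 * ARIA byte-rotation diffusion on one 32-bit word of the state.  With
 * the state byte-sliced, rotr32 by 8 or 16 bits is just a renaming of
 * the four registers, so the whole map reduces to XORs; the inline
 * comments give the word-level formula.
 */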
#define aria_diff_m(x0, x1, x2, x3,                     \
                    t0, t1, t2, t3)                     \
        /* T = rotr32(X, 8); */                         \
        /* X ^= T */                                    \
        vpxor x0, x3, t0;                               \
        vpxor x1, x0, t1;                               \
        vpxor x2, x1, t2;                               \
        vpxor x3, x2, t3;                               \
        /* X = T ^ rotr(X, 16); */                      \
        vpxor t2, x0, x0;                               \
        vpxor x1, t3, t3;                               \
        vpxor t0, x2, x2;                               \
        vpxor t1, x3, x1;                               \
        vmovdqu t3, x3;
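
/*
 * ARIA word-level diffusion: XOR-mixes the four 32-bit state words
 * T0..T3 (each word held as four byte-sliced registers) as annotated
 * in the inline comments.
 */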
#define aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7)                  \
        /* t1 ^= t2; */                                 \
        vpxor y0, x4, x4;                               \
        vpxor y1, x5, x5;                               \
        vpxor y2, x6, x6;                               \
        vpxor y3, x7, x7;                               \
                                                        \
        /* t2 ^= t3; */                                 \
        vpxor y4, y0, y0;                               \
        vpxor y5, y1, y1;                               \
        vpxor y6, y2, y2;                               \
        vpxor y7, y3, y3;                               \
                                                        \
        /* t0 ^= t1; */                                 \
        vpxor x4, x0, x0;                               \
        vpxor x5, x1, x1;                               \
        vpxor x6, x2, x2;                               \
        vpxor x7, x3, x3;                               \
                                                        \
        /* t3 ^= t1; */                                 \
        vpxor x4, y4, y4;                               \
        vpxor x5, y5, y5;                               \
        vpxor x6, y6, y6;                               \
        vpxor x7, y7, y7;                               \
                                                        \
        /* t2 ^= t0; */                                 \
        vpxor x0, y0, y0;                               \
        vpxor x1, y1, y1;                               \
        vpxor x2, y2, y2;                               \
        vpxor x3, y3, y3;                               \
                                                        \
        /* t1 ^= t2; */                                 \
        vpxor y0, x4, x4;                               \
        vpxor y1, x5, x5;                               \
        vpxor y2, x6, x6;                               \
        vpxor y3, x7, x7;
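
/*
 * aria_fe/aria_fo below run one full even/odd ARIA round over all 32
 * blocks: AddRoundKey and the S-layer are applied to one half of the
 * byte-sliced state (spilled to mem_tmp slot 8) and then to the other
 * half (slot 0), followed by the byte and word diffusion.  Even and
 * odd rounds differ in the S-box ordering (the argument rotation
 * passed to aria_sbox_8way) and in the aria_diff_byte word rotations,
 * which are folded into the argument order of the second
 * aria_diff_word and the final store.
 */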
#define aria_fe(x0, x1, x2, x3,                         \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round)                     \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, round);                \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, round);                \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T3 = ABCD -> BADC                            \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
         * T0 = ABCD -> CDAB                            \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
         * T1 = ABCD -> DCBA                            \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
         */                                             \
        aria_diff_word(x2, x3, x0, x1,                  \
                       x7, x6, x5, x4,                  \
                       y0, y1, y2, y3,                  \
                       y5, y4, y7, y6);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,                         \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round)                     \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, round);                \
                                                        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, round);                \
                                                        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T1 = ABCD -> BADC                            \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
         * T2 = ABCD -> CDAB                            \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
         * T3 = ABCD -> DCBA                            \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
         */                                             \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x5, x4, x7, x6,                  \
                       y2, y3, y0, y1,                  \
                       y7, y6, y5, y4);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);
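
/*
 * Final round: AddRoundKey, S-layer, then the last_round whitening
 * key, with no diffusion layer.
 */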
#define aria_ff(x0, x1, x2, x3,                         \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round, last_round)         \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, round);                \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, last_round);           \
                                                        \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, round);                \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, last_round);           \
                                                        \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);
#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem_tmp, rk, round)                \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, round);                \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, round);                \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T3 = ABCD -> BADC                            \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
         * T0 = ABCD -> CDAB                            \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
         * T1 = ABCD -> DCBA                            \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
         */                                             \
        aria_diff_word(x2, x3, x0, x1,                  \
                       x7, x6, x5, x4,                  \
                       y0, y1, y2, y3,                  \
                       y5, y4, y7, y6);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem_tmp, rk, round)                \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, round);                \
                                                        \
        aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, round);                \
                                                        \
        aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T1 = ABCD -> BADC                            \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
         * T2 = ABCD -> CDAB                            \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
         * T3 = ABCD -> DCBA                            \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
         */                                             \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x5, x4, x7, x6,                  \
                       y2, y3, y0, y1,                  \
                       y7, y6, y5, y4);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,                    \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round, last_round)         \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, round);                \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 8, last_round);           \
                                                        \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, round);                \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, rk, 0, last_round);           \
                                                        \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);
#endif /* CONFIG_AS_GFNI */

.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section        .rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
        .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
        .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
        .octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
        .octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
        .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
        .octa 0x3F893781E95FE1576CDA64D2BA0CB204

#ifdef CONFIG_AS_GFNI
.section        .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))

#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
        /* input:
         *      %r9: rk
         *      %rsi: dst
         *      %rdx: src
         *      %ymm0..%ymm15: byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 32(%rax), %r8;

        inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %r8);
        aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 0);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 1);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 2);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 3);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 4);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 5);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 6);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 7);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 8);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 9);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 10);
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_192;
        aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 11, 12);
        jmp .Laria_end;
.Laria_192:
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 11);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_256;
        aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 13, 14);
        jmp .Laria_end;
.Laria_256:
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 13);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 14);
        aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 15, 16);
.Laria_end:
        debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
                           %ymm9, %ymm13, %ymm0, %ymm5,
                           %ymm10, %ymm14, %ymm3, %ymm6,
                           %ymm11, %ymm15, %ymm2, %ymm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
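
/*
 * Generate 32 CTR keystream blocks.  The big-endian IV is byteswapped
 * so the counter can be incremented with integer/vector arithmetic,
 * the byteswapped counters are written to the keystream buffer, and
 * the incremented IV is stored back.  When the low 64 bits of the
 * counter cannot overflow within the next 32 increments, counters are
 * built two blocks at a time by adding constants; otherwise the code
 * falls back to full 128-bit increments with inc_le128.
 */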
1020 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
1021         /* input:
1022          *      %rdi: ctx
1023          *      %rsi: dst
1024          *      %rdx: src
1025          *      %rcx: keystream
1026          *      %r8: iv (big endian, 128bit)
1027          */
1029         FRAME_BEGIN
1030         movq 8(%r8), %r11;
1031         bswapq %r11;
1033         vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
1034         vpcmpeqd %ymm0, %ymm0, %ymm0;
1035         vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
1036         vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
1038         /* load IV and byteswap */
1039         vmovdqu (%r8), %xmm7;
1040         vpshufb %xmm6, %xmm7, %xmm7;
1041         vmovdqa %xmm7, %xmm3;
1042         inc_le128(%xmm7, %xmm0, %xmm4);
1043         vinserti128 $1, %xmm7, %ymm3, %ymm3;
1044         vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
1046         /* check need for handling 64-bit overflow and carry */
1047         cmpq $(0xffffffffffffffff - 32), %r11;
1048         ja .Lhandle_ctr_carry;
        /* construct IVs */
        vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
        vpshufb %ymm6, %ymm3, %ymm9;
        vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
        vpshufb %ymm6, %ymm3, %ymm10;
        vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
        vpshufb %ymm6, %ymm3, %ymm11;
        vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
        vpshufb %ymm6, %ymm3, %ymm12;
        vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
        vpshufb %ymm6, %ymm3, %ymm13;
        vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
        vpshufb %ymm6, %ymm3, %ymm14;
        vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
        vpshufb %ymm6, %ymm3, %ymm15;
        vmovdqu %ymm8, (0 * 32)(%rcx);
        vmovdqu %ymm9, (1 * 32)(%rcx);
        vmovdqu %ymm10, (2 * 32)(%rcx);
        vmovdqu %ymm11, (3 * 32)(%rcx);
        vmovdqu %ymm12, (4 * 32)(%rcx);
        vmovdqu %ymm13, (5 * 32)(%rcx);
        vmovdqu %ymm14, (6 * 32)(%rcx);
        vmovdqu %ymm15, (7 * 32)(%rcx);
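
        /* Blocks +0..+15 are parked in the keystream buffer so
         * %ymm8..%ymm15 can be reused for blocks +16..+31.
         */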
        vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
        vpshufb %ymm6, %ymm3, %ymm8;
        vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
        vpshufb %ymm6, %ymm3, %ymm9;
        vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
        vpshufb %ymm6, %ymm3, %ymm10;
        vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
        vpshufb %ymm6, %ymm3, %ymm11;
        vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
        vpshufb %ymm6, %ymm3, %ymm12;
        vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
        vpshufb %ymm6, %ymm3, %ymm13;
        vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
        vpshufb %ymm6, %ymm3, %ymm14;
        vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
        vpshufb %ymm6, %ymm3, %ymm15;
        vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
        vpshufb %xmm6, %xmm3, %xmm3;
        vmovdqu %xmm3, (%r8);
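        /* The caller's IV has been advanced by 32 blocks; reload
         * +0..+15 so %ymm0..%ymm15 hold all 32 counter blocks.
         */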
        vmovdqu (0 * 32)(%rcx), %ymm0;
        vmovdqu (1 * 32)(%rcx), %ymm1;
        vmovdqu (2 * 32)(%rcx), %ymm2;
        vmovdqu (3 * 32)(%rcx), %ymm3;
        vmovdqu (4 * 32)(%rcx), %ymm4;
        vmovdqu (5 * 32)(%rcx), %ymm5;
        vmovdqu (6 * 32)(%rcx), %ymm6;
        vmovdqu (7 * 32)(%rcx), %ymm7;
        jmp .Lctr_carry_done;
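
        /* Slow path: an increment may carry out of the low 64 bits, so
         * every step uses inc_le128, which propagates the carry into
         * the high qword of each 128-bit counter.
         */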
.Lhandle_ctr_carry:
        /* construct IVs */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
        vmovdqu %ymm8, (0 * 32)(%rcx);
        vmovdqu %ymm9, (1 * 32)(%rcx);
        vmovdqu %ymm10, (2 * 32)(%rcx);
        vmovdqu %ymm11, (3 * 32)(%rcx);
        vmovdqu %ymm12, (4 * 32)(%rcx);
        vmovdqu %ymm13, (5 * 32)(%rcx);
        vmovdqu %ymm14, (6 * 32)(%rcx);
        vmovdqu %ymm15, (7 * 32)(%rcx);

        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        vextracti128 $1, %ymm3, %xmm3;
        vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
        vmovdqu %xmm3, (%r8);
        vmovdqu (0 * 32)(%rcx), %ymm0;
        vmovdqu (1 * 32)(%rcx), %ymm1;
        vmovdqu (2 * 32)(%rcx), %ymm2;
        vmovdqu (3 * 32)(%rcx), %ymm3;
        vmovdqu (4 * 32)(%rcx), %ymm4;
        vmovdqu (5 * 32)(%rcx), %ymm5;
        vmovdqu (6 * 32)(%rcx), %ymm6;
        vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_aesni_avx2_ctr_gen_keystream_32way;
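
        /* Encrypt the counter blocks already held in %ymm0..%ymm15:
         * stash the caller's dst and src, then run the crypt core with
         * the keystream buffer as its working area.
         */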
        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_aesni_avx2_crypt_32way;
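
        /* XOR the encrypted counter blocks with the plaintext saved in
         * %r11; the result is written to the caller's dst (%r10) below.
         */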
        vpxor (0 * 32)(%r11), %ymm1, %ymm1;
        vpxor (1 * 32)(%r11), %ymm0, %ymm0;
        vpxor (2 * 32)(%r11), %ymm3, %ymm3;
        vpxor (3 * 32)(%r11), %ymm2, %ymm2;
        vpxor (4 * 32)(%r11), %ymm4, %ymm4;
        vpxor (5 * 32)(%r11), %ymm5, %ymm5;
        vpxor (6 * 32)(%r11), %ymm6, %ymm6;
        vpxor (7 * 32)(%r11), %ymm7, %ymm7;
        vpxor (8 * 32)(%r11), %ymm8, %ymm8;
        vpxor (9 * 32)(%r11), %ymm9, %ymm9;
        vpxor (10 * 32)(%r11), %ymm10, %ymm10;
        vpxor (11 * 32)(%r11), %ymm11, %ymm11;
        vpxor (12 * 32)(%r11), %ymm12, %ymm12;
        vpxor (13 * 32)(%r11), %ymm13, %ymm13;
        vpxor (14 * 32)(%r11), %ymm14, %ymm14;
        vpxor (15 * 32)(%r11), %ymm15, %ymm15;
        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
        /* input:
         *      %r9: rk
         *      %rsi: dst
         *      %rdx: src
         *      %ymm0..%ymm15: 32 byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 32(%rax), %r8;
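
        /* %rax = dst and %r8 = dst + 256 are the two memory areas
         * handed to the byte-slicing transposes below.
         */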
        inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
                      %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11,
                      %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %r8);
        aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 0);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 1);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 2);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 3);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 4);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 5);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 6);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 7);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 8);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 9);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 10);
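        /* ARIA-128 runs 12 rounds; 192- and 256-bit keys extend to 14
         * and 16 rounds via the labels below.
         */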
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_192;
        aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 11, 12);
        jmp .Laria_gfni_end;
.Laria_gfni_192:
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 11);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_256;
        aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 13, 14);
        jmp .Laria_gfni_end;
.Laria_gfni_256:
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 13);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 14);
        aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
        debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
                           %ymm9, %ymm13, %ymm0, %ymm5,
                           %ymm10, %ymm14, %ymm3, %ymm6,
                           %ymm11, %ymm15, %ymm2, %ymm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_gfni_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_gfni_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_aesni_avx2_ctr_gen_keystream_32way;
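
        /* As in the AESNI path: stash the caller's dst and src, then
         * run the GFNI crypt core with the keystream buffer as its
         * working area.
         */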
        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_aesni_avx2_gfni_crypt_32way;
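
        /* XOR the encrypted counter blocks with the plaintext saved in
         * %r11; the result is written to the caller's dst (%r10) below.
         */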
        vpxor (0 * 32)(%r11), %ymm1, %ymm1;
        vpxor (1 * 32)(%r11), %ymm0, %ymm0;
        vpxor (2 * 32)(%r11), %ymm3, %ymm3;
        vpxor (3 * 32)(%r11), %ymm2, %ymm2;
        vpxor (4 * 32)(%r11), %ymm4, %ymm4;
        vpxor (5 * 32)(%r11), %ymm5, %ymm5;
        vpxor (6 * 32)(%r11), %ymm6, %ymm6;
        vpxor (7 * 32)(%r11), %ymm7, %ymm7;
        vpxor (8 * 32)(%r11), %ymm8, %ymm8;
        vpxor (9 * 32)(%r11), %ymm9, %ymm9;
        vpxor (10 * 32)(%r11), %ymm10, %ymm10;
        vpxor (11 * 32)(%r11), %ymm11, %ymm11;
        vpxor (12 * 32)(%r11), %ymm12, %ymm12;
        vpxor (13 * 32)(%r11), %ymm13, %ymm13;
        vpxor (14 * 32)(%r11), %ymm14, %ymm14;
        vpxor (15 * 32)(%r11), %ymm15, %ymm15;
        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
#endif /* CONFIG_AS_GFNI */