arch/x86/crypto/aria-aesni-avx-asm_64.S (from the drm/drm-misc.git tree)

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)             \
        ( (((a0) & 1) << 0) |                           \
          (((a1) & 1) << 1) |                           \
          (((a2) & 1) << 2) |                           \
          (((a3) & 1) << 3) |                           \
          (((a4) & 1) << 4) |                           \
          (((a5) & 1) << 5) |                           \
          (((a6) & 1) << 6) |                           \
          (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)           \
        ( ((l7) << (0 * 8)) |                           \
          ((l6) << (1 * 8)) |                           \
          ((l5) << (2 * 8)) |                           \
          ((l4) << (3 * 8)) |                           \
          ((l3) << (4 * 8)) |                           \
          ((l2) << (5 * 8)) |                           \
          ((l1) << (6 * 8)) |                           \
          ((l0) << (7 * 8)) )
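
/*
 * BV8() packs eight bit values, LSB first, into one byte; BM8X8() packs
 * eight such rows into a 64-bit value with row l0 in the most significant
 * byte. These build the 8x8 bit-matrices (and 8-bit affine constants)
 * consumed by the GFNI instructions further below.
 */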

#define inc_le128(x, minus_one, tmp)                    \
        vpcmpeqq minus_one, x, tmp;                     \
        vpsubq minus_one, x, x;                         \
        vpslldq $8, tmp, tmp;                           \
        vpsubq tmp, x, x;
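
/*
 * inc_le128() adds one to a 128-bit little-endian counter. minus_one must
 * hold -1 in the low qword and 0 in the high qword (see the CTR code below),
 * so the vpsubq adds one to the low qword only, while the vpcmpeqq/vpslldq
 * pair detects a wrapping low qword and propagates the carry into the high
 * qword.
 */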

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)      \
        vpand x, mask4bit, tmp0;                        \
        vpandn x, mask4bit, x;                          \
        vpsrld $4, x, x;                                \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;
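
/*
 * filter_8bit() applies a byte-wise lookup/affine transform by splitting
 * each byte into nibbles: the low nibble indexes the lo_t table and the
 * shifted-down high nibble indexes the hi_t table (both via vpshufb), and
 * the two partial results are XORed. mask4bit is expected to hold 0x0f in
 * every byte.
 */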

#define transpose_4x4(x0, x1, x2, x3, t1, t2)           \
        vpunpckhdq x1, x0, t2;                          \
        vpunpckldq x1, x0, x0;                          \
                                                        \
        vpunpckldq x3, x2, t1;                          \
        vpunpckhdq x3, x2, x2;                          \
                                                        \
        vpunpckhqdq t1, x0, x1;                         \
        vpunpcklqdq t1, x0, x0;                         \
                                                        \
        vpunpckhqdq x2, t2, x3;                         \
        vpunpcklqdq x2, t2, x2;
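
/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit words spread across
 * x0..x3 using the dword/qword unpack instructions; t1 and t2 are scratch.
 */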

#define byteslice_16x16b(a0, b0, c0, d0,                \
                         a1, b1, c1, d1,                \
                         a2, b2, c2, d2,                \
                         a3, b3, c3, d3,                \
                         st0, st1)                      \
        vmovdqu d2, st0;                                \
        vmovdqu d3, st1;                                \
        transpose_4x4(a0, a1, a2, a3, d2, d3);          \
        transpose_4x4(b0, b1, b2, b3, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu a0, st0;                                \
        vmovdqu a1, st1;                                \
        transpose_4x4(c0, c1, c2, c3, a0, a1);          \
        transpose_4x4(d0, d1, d2, d3, a0, a1);          \
                                                        \
        vmovdqu .Lshufb_16x16b(%rip), a0;               \
        vmovdqu st1, a1;                                \
        vpshufb a0, a2, a2;                             \
        vpshufb a0, a3, a3;                             \
        vpshufb a0, b0, b0;                             \
        vpshufb a0, b1, b1;                             \
        vpshufb a0, b2, b2;                             \
        vpshufb a0, b3, b3;                             \
        vpshufb a0, a1, a1;                             \
        vpshufb a0, c0, c0;                             \
        vpshufb a0, c1, c1;                             \
        vpshufb a0, c2, c2;                             \
        vpshufb a0, c3, c3;                             \
        vpshufb a0, d0, d0;                             \
        vpshufb a0, d1, d1;                             \
        vpshufb a0, d2, d2;                             \
        vpshufb a0, d3, d3;                             \
        vmovdqu d3, st1;                                \
        vmovdqu st0, d3;                                \
        vpshufb a0, d3, a0;                             \
        vmovdqu d2, st0;                                \
                                                        \
        transpose_4x4(a0, b0, c0, d0, d2, d3);          \
        transpose_4x4(a1, b1, c1, d1, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu b0, st0;                                \
        vmovdqu b1, st1;                                \
        transpose_4x4(a2, b2, c2, d2, b0, b1);          \
        transpose_4x4(a3, b3, c3, d3, b0, b1);          \
        vmovdqu st0, b0;                                \
        vmovdqu st1, b1;                                \
        /* does not adjust output bytes inside vectors */
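
/*
 * byteslice_16x16b() transposes the 16x16 byte matrix formed by 16 blocks of
 * 16 bytes, so that each register ends up holding one byte position of every
 * block; the S-box and diffusion steps can then work on whole registers.
 * debyteslice_16x16b() below undoes that permutation.
 */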

#define debyteslice_16x16b(a0, b0, c0, d0,              \
                           a1, b1, c1, d1,              \
                           a2, b2, c2, d2,              \
                           a3, b3, c3, d3,              \
                           st0, st1)                    \
        vmovdqu d2, st0;                                \
        vmovdqu d3, st1;                                \
        transpose_4x4(a0, a1, a2, a3, d2, d3);          \
        transpose_4x4(b0, b1, b2, b3, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu a0, st0;                                \
        vmovdqu a1, st1;                                \
        transpose_4x4(c0, c1, c2, c3, a0, a1);          \
        transpose_4x4(d0, d1, d2, d3, a0, a1);          \
                                                        \
        vmovdqu .Lshufb_16x16b(%rip), a0;               \
        vmovdqu st1, a1;                                \
        vpshufb a0, a2, a2;                             \
        vpshufb a0, a3, a3;                             \
        vpshufb a0, b0, b0;                             \
        vpshufb a0, b1, b1;                             \
        vpshufb a0, b2, b2;                             \
        vpshufb a0, b3, b3;                             \
        vpshufb a0, a1, a1;                             \
        vpshufb a0, c0, c0;                             \
        vpshufb a0, c1, c1;                             \
        vpshufb a0, c2, c2;                             \
        vpshufb a0, c3, c3;                             \
        vpshufb a0, d0, d0;                             \
        vpshufb a0, d1, d1;                             \
        vpshufb a0, d2, d2;                             \
        vpshufb a0, d3, d3;                             \
        vmovdqu d3, st1;                                \
        vmovdqu st0, d3;                                \
        vpshufb a0, d3, a0;                             \
        vmovdqu d2, st0;                                \
                                                        \
        transpose_4x4(c0, d0, a0, b0, d2, d3);          \
        transpose_4x4(c1, d1, a1, b1, d2, d3);          \
        vmovdqu st0, d2;                                \
        vmovdqu st1, d3;                                \
                                                        \
        vmovdqu b0, st0;                                \
        vmovdqu b1, st1;                                \
        transpose_4x4(c2, d2, a2, b2, b0, b1);          \
        transpose_4x4(c3, d3, a3, b3, b0, b1);          \
        vmovdqu st0, b0;                                \
        vmovdqu st1, b1;                                \
        /* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     rio)                               \
        vmovdqu (0 * 16)(rio), x0;                      \
        vmovdqu (1 * 16)(rio), x1;                      \
        vmovdqu (2 * 16)(rio), x2;                      \
        vmovdqu (3 * 16)(rio), x3;                      \
        vmovdqu (4 * 16)(rio), x4;                      \
        vmovdqu (5 * 16)(rio), x5;                      \
        vmovdqu (6 * 16)(rio), x6;                      \
        vmovdqu (7 * 16)(rio), x7;                      \
        vmovdqu (8 * 16)(rio), y0;                      \
        vmovdqu (9 * 16)(rio), y1;                      \
        vmovdqu (10 * 16)(rio), y2;                     \
        vmovdqu (11 * 16)(rio), y3;                     \
        vmovdqu (12 * 16)(rio), y4;                     \
        vmovdqu (13 * 16)(rio), y5;                     \
        vmovdqu (14 * 16)(rio), y6;                     \
        vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,                   \
                      x4, x5, x6, x7,                   \
                      y0, y1, y2, y3,                   \
                      y4, y5, y6, y7,                   \
                      mem_ab, mem_cd)                   \
        byteslice_16x16b(x0, x1, x2, x3,                \
                         x4, x5, x6, x7,                \
                         y0, y1, y2, y3,                \
                         y4, y5, y6, y7,                \
                         (mem_ab), (mem_cd));           \
                                                        \
        vmovdqu x0, 0 * 16(mem_ab);                     \
        vmovdqu x1, 1 * 16(mem_ab);                     \
        vmovdqu x2, 2 * 16(mem_ab);                     \
        vmovdqu x3, 3 * 16(mem_ab);                     \
        vmovdqu x4, 4 * 16(mem_ab);                     \
        vmovdqu x5, 5 * 16(mem_ab);                     \
        vmovdqu x6, 6 * 16(mem_ab);                     \
        vmovdqu x7, 7 * 16(mem_ab);                     \
        vmovdqu y0, 0 * 16(mem_cd);                     \
        vmovdqu y1, 1 * 16(mem_cd);                     \
        vmovdqu y2, 2 * 16(mem_cd);                     \
        vmovdqu y3, 3 * 16(mem_cd);                     \
        vmovdqu y4, 4 * 16(mem_cd);                     \
        vmovdqu y5, 5 * 16(mem_cd);                     \
        vmovdqu y6, 6 * 16(mem_cd);                     \
        vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem)                               \
        vmovdqu x0, 0 * 16(mem);                        \
        vmovdqu x1, 1 * 16(mem);                        \
        vmovdqu x2, 2 * 16(mem);                        \
        vmovdqu x3, 3 * 16(mem);                        \
        vmovdqu x4, 4 * 16(mem);                        \
        vmovdqu x5, 5 * 16(mem);                        \
        vmovdqu x6, 6 * 16(mem);                        \
        vmovdqu x7, 7 * 16(mem);                        \
        vmovdqu y0, 8 * 16(mem);                        \
        vmovdqu y1, 9 * 16(mem);                        \
        vmovdqu y2, 10 * 16(mem);                       \
        vmovdqu y3, 11 * 16(mem);                       \
        vmovdqu y4, 12 * 16(mem);                       \
        vmovdqu y5, 13 * 16(mem);                       \
        vmovdqu y6, 14 * 16(mem);                       \
        vmovdqu y7, 15 * 16(mem);                       \

#define aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, idx)             \
        vmovdqu x0, ((idx + 0) * 16)(mem_tmp);          \
        vmovdqu x1, ((idx + 1) * 16)(mem_tmp);          \
        vmovdqu x2, ((idx + 2) * 16)(mem_tmp);          \
        vmovdqu x3, ((idx + 3) * 16)(mem_tmp);          \
        vmovdqu x4, ((idx + 4) * 16)(mem_tmp);          \
        vmovdqu x5, ((idx + 5) * 16)(mem_tmp);          \
        vmovdqu x6, ((idx + 6) * 16)(mem_tmp);          \
        vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, idx)              \
        vmovdqu ((idx + 0) * 16)(mem_tmp), x0;          \
        vmovdqu ((idx + 1) * 16)(mem_tmp), x1;          \
        vmovdqu ((idx + 2) * 16)(mem_tmp), x2;          \
        vmovdqu ((idx + 3) * 16)(mem_tmp), x3;          \
        vmovdqu ((idx + 4) * 16)(mem_tmp), x4;          \
        vmovdqu ((idx + 5) * 16)(mem_tmp), x5;          \
        vmovdqu ((idx + 6) * 16)(mem_tmp), x6;          \
        vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3,                   \
                      x4, x5, x6, x7,                   \
                      t0, t1, t2, rk,                   \
                      idx, round)                       \
        /* AddRoundKey */                               \
        vbroadcastss ((round * 16) + idx + 0)(rk), t0;  \
        vpsrld $24, t0, t2;                             \
        vpshufb t1, t2, t2;                             \
        vpxor t2, x0, x0;                               \
        vpsrld $16, t0, t2;                             \
        vpshufb t1, t2, t2;                             \
        vpxor t2, x1, x1;                               \
        vpsrld $8, t0, t2;                              \
        vpshufb t1, t2, t2;                             \
        vpxor t2, x2, x2;                               \
        vpshufb t1, t0, t2;                             \
        vpxor t2, x3, x3;                               \
        vbroadcastss ((round * 16) + idx + 4)(rk), t0;  \
        vpsrld $24, t0, t2;                             \
        vpshufb t1, t2, t2;                             \
        vpxor t2, x4, x4;                               \
        vpsrld $16, t0, t2;                             \
        vpshufb t1, t2, t2;                             \
        vpxor t2, x5, x5;                               \
        vpsrld $8, t0, t2;                              \
        vpshufb t1, t2, t2;                             \
        vpxor t2, x6, x6;                               \
        vpshufb t1, t0, t2;                             \
        vpxor t2, x7, x7;
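
/*
 * aria_ark_8way() broadcasts one 32-bit round-key word at a time with
 * vbroadcastss, isolates each key byte with a shift, replicates it across
 * the register with vpshufb (t1 must be all-zero so every shuffle index
 * selects byte 0), and XORs it into the matching byte-sliced state register.
 */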

#ifdef CONFIG_AS_GFNI
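
/*
 * GFNI S-box layer: vgf2p8affineinvqb computes an affine map of the field
 * inverse (A * x^-1 + c) and vgf2p8affineqb an affine map of x itself, so S1
 * is the AES affine over the inverse, S2 uses its own bit-matrix, and the
 * inverse S-boxes apply the corresponding inverse affine first and finish
 * with a bare inversion through the identity matrix.
 */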
#define aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            t0, t1, t2, t3,             \
                            t4, t5, t6, t7)             \
        vmovdqa .Ltf_s2_bitmatrix(%rip), t0;            \
        vmovdqa .Ltf_inv_bitmatrix(%rip), t1;           \
        vmovdqa .Ltf_id_bitmatrix(%rip), t2;            \
        vmovdqa .Ltf_aff_bitmatrix(%rip), t3;           \
        vmovdqa .Ltf_x2_bitmatrix(%rip), t4;            \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
        vgf2p8affineinvqb $0, t2, x2, x2;               \
        vgf2p8affineinvqb $0, t2, x6, x6;               \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
        vgf2p8affineinvqb $0, t2, x3, x3;               \
        vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
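
/*
 * AES-NI S-box layer: t7 must be all-zero, so vaesenclast reduces to
 * ShiftRows+SubBytes and vaesdeclast to InvShiftRows+InvSubBytes. The
 * .Linv_shift_row/.Lshift_row shuffles cancel the row permutation, leaving
 * only the (inverse) AES S-box, and the filter_8bit() affine fix-ups turn
 * those outputs into ARIA's S2 and X2 S-boxes (see the combined tables in
 * .rodata below).
 */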

#define aria_sbox_8way(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       t0, t1, t2, t3,                  \
                       t4, t5, t6, t7)                  \
        vmovdqa .Linv_shift_row(%rip), t0;              \
        vmovdqa .Lshift_row(%rip), t1;                  \
        vbroadcastss .L0f0f0f0f(%rip), t6;              \
        vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;    \
        vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;    \
        vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;    \
        vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;    \
                                                        \
        vaesenclast t7, x0, x0;                         \
        vaesenclast t7, x4, x4;                         \
        vaesenclast t7, x1, x1;                         \
        vaesenclast t7, x5, x5;                         \
        vaesdeclast t7, x2, x2;                         \
        vaesdeclast t7, x6, x6;                         \
                                                        \
        /* AES inverse shift rows */                    \
        vpshufb t0, x0, x0;                             \
        vpshufb t0, x4, x4;                             \
        vpshufb t0, x1, x1;                             \
        vpshufb t0, x5, x5;                             \
        vpshufb t1, x3, x3;                             \
        vpshufb t1, x7, x7;                             \
        vpshufb t1, x2, x2;                             \
        vpshufb t1, x6, x6;                             \
                                                        \
        /* affine transformation for S2 */              \
        filter_8bit(x1, t2, t3, t6, t0);                \
        /* affine transformation for S2 */              \
        filter_8bit(x5, t2, t3, t6, t0);                \
                                                        \
        /* affine transformation for X2 */              \
        filter_8bit(x3, t4, t5, t6, t0);                \
        /* affine transformation for X2 */              \
        filter_8bit(x7, t4, t5, t6, t0);                \
        vaesdeclast t7, x3, x3;                         \
        vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3,                     \
                    t0, t1, t2, t3)                     \
        /* T = rotr32(X, 8); */                         \
        /* X ^= T */                                    \
        vpxor x0, x3, t0;                               \
        vpxor x1, x0, t1;                               \
        vpxor x2, x1, t2;                               \
        vpxor x3, x2, t3;                               \
        /* X = T ^ rotr(X, 16); */                      \
        vpxor t2, x0, x0;                               \
        vpxor x1, t3, t3;                               \
        vpxor t0, x2, x2;                               \
        vpxor t1, x3, x1;                               \
        vmovdqu t3, x3;
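
/*
 * aria_diff_m() is the per-word diffusion: on byte-sliced data a rotation of
 * each 32-bit word by whole bytes only changes which register holds which
 * byte, so the step reduces to XORs plus one vmovdqu to move the rotated
 * result back into x3.
 */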

#define aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7)                  \
        /* t1 ^= t2; */                                 \
        vpxor y0, x4, x4;                               \
        vpxor y1, x5, x5;                               \
        vpxor y2, x6, x6;                               \
        vpxor y3, x7, x7;                               \
                                                        \
        /* t2 ^= t3; */                                 \
        vpxor y4, y0, y0;                               \
        vpxor y5, y1, y1;                               \
        vpxor y6, y2, y2;                               \
        vpxor y7, y3, y3;                               \
                                                        \
        /* t0 ^= t1; */                                 \
        vpxor x4, x0, x0;                               \
        vpxor x5, x1, x1;                               \
        vpxor x6, x2, x2;                               \
        vpxor x7, x3, x3;                               \
                                                        \
        /* t3 ^= t1; */                                 \
        vpxor x4, y4, y4;                               \
        vpxor x5, y5, y5;                               \
        vpxor x6, y6, y6;                               \
        vpxor x7, y7, y7;                               \
                                                        \
        /* t2 ^= t0; */                                 \
        vpxor x0, y0, y0;                               \
        vpxor x1, y1, y1;                               \
        vpxor x2, y2, y2;                               \
        vpxor x3, y3, y3;                               \
                                                        \
        /* t1 ^= t2; */                                 \
        vpxor y0, x4, x4;                               \
        vpxor y1, x5, x5;                               \
        vpxor y2, x6, x6;                               \
        vpxor y3, x7, x7;
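
/*
 * aria_diff_word() is the word-level half of the diffusion layer: a fixed
 * XOR schedule across the 16 byte-sliced registers (the t0..t3 comments
 * above follow the reference implementation).
 *
 * The round macros below follow the usual ARIA structure: aria_fo() is the
 * odd-round function, aria_fe() the even-round function (same steps, with
 * the inputs pre-swapped for the second S-box layer), and aria_ff() the
 * final round, which wraps the substitution layer in two AddRoundKey steps
 * (round and last_round) instead of running the diffusion. Each macro works
 * on the state as two 8-register halves, spilling one half to mem_tmp while
 * the other is processed.
 */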

#define aria_fe(x0, x1, x2, x3,                         \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round)                     \
        vpxor y7, y7, y7;                               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, round);        \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, round);        \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T3 = ABCD -> BADC                            \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
         * T0 = ABCD -> CDAB                            \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
         * T1 = ABCD -> DCBA                            \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
         */                                             \
        aria_diff_word(x2, x3, x0, x1,                  \
                       x7, x6, x5, x4,                  \
                       y0, y1, y2, y3,                  \
                       y5, y4, y7, y6);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,                         \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round)                     \
        vpxor y7, y7, y7;                               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, round);        \
                                                        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, round);        \
                                                        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T1 = ABCD -> BADC                            \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
         * T2 = ABCD -> CDAB                            \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
         * T3 = ABCD -> DCBA                            \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
         */                                             \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x5, x4, x7, x6,                  \
                       y2, y3, y0, y1,                  \
                       y7, y6, y5, y4);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,                         \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round, last_round)         \
        vpxor y7, y7, y7;                               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, round);        \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, last_round);   \
                                                        \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, round);        \
                                                        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, last_round);   \
                                                        \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);

#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem_tmp, rk, round)                \
        vpxor y7, y7, y7;                               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, round);        \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, round);        \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T3 = ABCD -> BADC                            \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
         * T0 = ABCD -> CDAB                            \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
         * T1 = ABCD -> DCBA                            \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
         */                                             \
        aria_diff_word(x2, x3, x0, x1,                  \
                       x7, x6, x5, x4,                  \
                       y0, y1, y2, y3,                  \
                       y5, y4, y7, y6);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem_tmp, rk, round)                \
        vpxor y7, y7, y7;                               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, round);        \
                                                        \
        aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, round);        \
                                                        \
        aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 0);              \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);               \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T1 = ABCD -> BADC                            \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
         * T2 = ABCD -> CDAB                            \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
         * T3 = ABCD -> DCBA                            \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
         */                                             \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x5, x4, x7, x6,                  \
                       y2, y3, y0, y1,                  \
                       y7, y6, y5, y4);                 \
        aria_store_state_8way(x3, x2, x1, x0,           \
                              x6, x7, x4, x5,           \
                              mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,                    \
                x4, x5, x6, x7,                         \
                y0, y1, y2, y3,                         \
                y4, y5, y6, y7,                         \
                mem_tmp, rk, round, last_round)         \
        vpxor y7, y7, y7;                               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, round);        \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 8, last_round);   \
                                                        \
        aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, 8);              \
                                                        \
        aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, 0);               \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, round);        \
                                                        \
        aria_sbox_8way_gfni(x2, x3, x0, x1,             \
                            x6, x7, x4, x5,             \
                            y0, y1, y2, y3,             \
                            y4, y5, y6, y7);            \
                                                        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
                      y0, y7, y2, rk, 0, last_round);   \
                                                        \
        aria_load_state_8way(y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             mem_tmp, 8);

#endif /* CONFIG_AS_GFNI */

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section        .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
        .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
        .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
        .octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
        .octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
        .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
        .octa 0x3F893781E95FE1576CDA64D2BA0CB204
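
/*
 * Each .octa pair above is a pair of 16-entry, nibble-indexed lookup tables
 * for filter_8bit(): the lo table is indexed by the low nibble and the hi
 * table by the high nibble, and XORing the two lookups realizes the affine
 * maps documented in the matrix comments.
 */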

#ifdef CONFIG_AS_GFNI
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))
#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
        /* input:
        *      %r9: rk
        *      %rsi: dst
        *      %rdx: src
        *      %xmm0..%xmm15: 16 byte-sliced blocks
        */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 16(%rax), %r8;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %r8);
        aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 0);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 1);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 2);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 3);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 4);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 5);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 6);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 7);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 8);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 9);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 10);
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_192;
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11, 12);
        jmp .Laria_end;
.Laria_192:
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_256;
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13, 14);
        jmp .Laria_end;
.Laria_256:
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 14);
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 15, 16);
.Laria_end:
        debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
                           %xmm9, %xmm13, %xmm0, %xmm5,
                           %xmm10, %xmm14, %xmm3, %xmm6,
                           %xmm11, %xmm15, %xmm2, %xmm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
        /* input:
        *      %rdi: ctx, CTX
        *      %rsi: dst
        *      %rdx: src
        */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx);

        call __aria_aesni_avx_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
        /* input:
        *      %rdi: ctx, CTX
        *      %rsi: dst
        *      %rdx: src
        */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx);

        call __aria_aesni_avx_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1023 SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1024         /* input:
1025         *      %rdi: ctx
1026         *      %rsi: dst
1027         *      %rdx: src
1028         *      %rcx: keystream
1029         *      %r8: iv (big endian, 128bit)
1030         */
1032         FRAME_BEGIN
1033         /* load IV and byteswap */
1034         vmovdqu (%r8), %xmm8;
1036         vmovdqa .Lbswap128_mask (%rip), %xmm1;
1037         vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1039         vpcmpeqd %xmm0, %xmm0, %xmm0;
1040         vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1042         /* construct IVs */
1043         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044         vpshufb %xmm1, %xmm3, %xmm9;
1045         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046         vpshufb %xmm1, %xmm3, %xmm10;
1047         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048         vpshufb %xmm1, %xmm3, %xmm11;
1049         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050         vpshufb %xmm1, %xmm3, %xmm12;
1051         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052         vpshufb %xmm1, %xmm3, %xmm13;
1053         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054         vpshufb %xmm1, %xmm3, %xmm14;
1055         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056         vpshufb %xmm1, %xmm3, %xmm15;
1057         vmovdqu %xmm8, (0 * 16)(%rcx);
1058         vmovdqu %xmm9, (1 * 16)(%rcx);
1059         vmovdqu %xmm10, (2 * 16)(%rcx);
1060         vmovdqu %xmm11, (3 * 16)(%rcx);
1061         vmovdqu %xmm12, (4 * 16)(%rcx);
1062         vmovdqu %xmm13, (5 * 16)(%rcx);
1063         vmovdqu %xmm14, (6 * 16)(%rcx);
1064         vmovdqu %xmm15, (7 * 16)(%rcx);
1066         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067         vpshufb %xmm1, %xmm3, %xmm8;
1068         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069         vpshufb %xmm1, %xmm3, %xmm9;
1070         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071         vpshufb %xmm1, %xmm3, %xmm10;
1072         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073         vpshufb %xmm1, %xmm3, %xmm11;
1074         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075         vpshufb %xmm1, %xmm3, %xmm12;
1076         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1077         vpshufb %xmm1, %xmm3, %xmm13;
1078         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1079         vpshufb %xmm1, %xmm3, %xmm14;
1080         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1081         vpshufb %xmm1, %xmm3, %xmm15;
1082         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1083         vpshufb %xmm1, %xmm3, %xmm4;
1084         vmovdqu %xmm4, (%r8);
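        /* %xmm4 holds IV + 16 (byteswapped back to big endian); store it so
         * the caller resumes the counter correctly for the next 16 blocks. */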
1086         vmovdqu (0 * 16)(%rcx), %xmm0;
1087         vmovdqu (1 * 16)(%rcx), %xmm1;
1088         vmovdqu (2 * 16)(%rcx), %xmm2;
1089         vmovdqu (3 * 16)(%rcx), %xmm3;
1090         vmovdqu (4 * 16)(%rcx), %xmm4;
1091         vmovdqu (5 * 16)(%rcx), %xmm5;
1092         vmovdqu (6 * 16)(%rcx), %xmm6;
1093         vmovdqu (7 * 16)(%rcx), %xmm7;
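        /* Reload the spilled blocks: %xmm0..%xmm15 now hold all 16 counter
         * values, ready to be encrypted by the caller. */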
1095         FRAME_END
1096         RET;
1097 SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1099 SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1100         /* input:
1101         *      %rdi: ctx
1102         *      %rsi: dst
1103         *      %rdx: src
1104         *      %rcx: keystream
1105         *      %r8: iv (big endian, 128bit)
1106         */
1107         FRAME_BEGIN
1109         call __aria_aesni_avx_ctr_gen_keystream_16way;
1111         leaq (%rsi), %r10;
1112         leaq (%rdx), %r11;
1113         leaq (%rcx), %rsi;
1114         leaq (%rcx), %rdx;
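        /* Save the caller's dst/src in %r10/%r11 and point %rsi/%rdx at the
         * keystream buffer, which the cipher core uses as scratch memory for
         * byte-slicing.  CTR always runs the forward cipher, hence the
         * encryption key schedule below. */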
1115         leaq ARIA_CTX_enc_key(CTX), %r9;
1117         call __aria_aesni_avx_crypt_16way;
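        /* XOR the encrypted counter blocks with the 16 source blocks; the
         * %xmm1/%xmm0/%xmm3/%xmm2 ordering matches the core's output order
         * used by write_output() below. */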
1119         vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1120         vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1121         vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1122         vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1123         vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1124         vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1125         vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1126         vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1127         vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1128         vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1129         vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1130         vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1131         vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1132         vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1133         vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1134         vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1135         write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1136                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1137                      %xmm15, %r10);
1139         FRAME_END
1140         RET;
1141 SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1143 #ifdef CONFIG_AS_GFNI
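/*
 * GFNI variant: same round structure as above, but the *_gfni round macros
 * compute the ARIA S-box layer with the Galois Field affine instructions
 * instead of the AES-NI based substitution.  Only assembled when the
 * toolchain supports GFNI (CONFIG_AS_GFNI).
 */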
1144 SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1145         /* input:
1146         *      %r9: rk
1147         *      %rsi: dst
1148         *      %rdx: src
1149         *      %xmm0..%xmm15: 16 byte-sliced blocks
1150         */
1152         FRAME_BEGIN
1154         movq %rsi, %rax;
1155         leaq 8 * 16(%rax), %r8;
1157         inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1158                       %xmm4, %xmm5, %xmm6, %xmm7,
1159                       %xmm8, %xmm9, %xmm10, %xmm11,
1160                       %xmm12, %xmm13, %xmm14,
1161                       %xmm15, %rax, %r8);
1162         aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1163                      %xmm12, %xmm13, %xmm14, %xmm15,
1164                      %xmm0, %xmm1, %xmm2, %xmm3,
1165                      %xmm4, %xmm5, %xmm6, %xmm7,
1166                      %rax, %r9, 0);
1167         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168                      %xmm4, %xmm5, %xmm6, %xmm7,
1169                      %xmm8, %xmm9, %xmm10, %xmm11,
1170                      %xmm12, %xmm13, %xmm14,
1171                      %xmm15, %rax, %r9, 1);
1172         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173                      %xmm12, %xmm13, %xmm14, %xmm15,
1174                      %xmm0, %xmm1, %xmm2, %xmm3,
1175                      %xmm4, %xmm5, %xmm6, %xmm7,
1176                      %rax, %r9, 2);
1177         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1178                      %xmm4, %xmm5, %xmm6, %xmm7,
1179                      %xmm8, %xmm9, %xmm10, %xmm11,
1180                      %xmm12, %xmm13, %xmm14,
1181                      %xmm15, %rax, %r9, 3);
1182         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1183                      %xmm12, %xmm13, %xmm14, %xmm15,
1184                      %xmm0, %xmm1, %xmm2, %xmm3,
1185                      %xmm4, %xmm5, %xmm6, %xmm7,
1186                      %rax, %r9, 4);
1187         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1188                      %xmm4, %xmm5, %xmm6, %xmm7,
1189                      %xmm8, %xmm9, %xmm10, %xmm11,
1190                      %xmm12, %xmm13, %xmm14,
1191                      %xmm15, %rax, %r9, 5);
1192         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1193                      %xmm12, %xmm13, %xmm14, %xmm15,
1194                      %xmm0, %xmm1, %xmm2, %xmm3,
1195                      %xmm4, %xmm5, %xmm6, %xmm7,
1196                      %rax, %r9, 6);
1197         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1198                      %xmm4, %xmm5, %xmm6, %xmm7,
1199                      %xmm8, %xmm9, %xmm10, %xmm11,
1200                      %xmm12, %xmm13, %xmm14,
1201                      %xmm15, %rax, %r9, 7);
1202         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1203                      %xmm12, %xmm13, %xmm14, %xmm15,
1204                      %xmm0, %xmm1, %xmm2, %xmm3,
1205                      %xmm4, %xmm5, %xmm6, %xmm7,
1206                      %rax, %r9, 8);
1207         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1208                      %xmm4, %xmm5, %xmm6, %xmm7,
1209                      %xmm8, %xmm9, %xmm10, %xmm11,
1210                      %xmm12, %xmm13, %xmm14,
1211                      %xmm15, %rax, %r9, 9);
1212         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1213                      %xmm12, %xmm13, %xmm14, %xmm15,
1214                      %xmm0, %xmm1, %xmm2, %xmm3,
1215                      %xmm4, %xmm5, %xmm6, %xmm7,
1216                      %rax, %r9, 10);
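        /* Rounds 1..10 are common to every key length.  ARIA_CTX_rounds is 12
         * for ARIA-128, 14 for ARIA-192 and 16 for ARIA-256; aria_ff_gfni()
         * performs the final round together with the last-round-key
         * whitening, which is why it takes two round-key indices. */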
1217         cmpl $12, ARIA_CTX_rounds(CTX);
1218         jne .Laria_gfni_192;
1219         aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1220                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1221                 %xmm15, %rax, %r9, 11, 12);
1222         jmp .Laria_gfni_end;
1223 .Laria_gfni_192:
1224         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1225                      %xmm4, %xmm5, %xmm6, %xmm7,
1226                      %xmm8, %xmm9, %xmm10, %xmm11,
1227                      %xmm12, %xmm13, %xmm14,
1228                      %xmm15, %rax, %r9, 11);
1229         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1230                      %xmm12, %xmm13, %xmm14, %xmm15,
1231                      %xmm0, %xmm1, %xmm2, %xmm3,
1232                      %xmm4, %xmm5, %xmm6, %xmm7,
1233                      %rax, %r9, 12);
1234         cmpl $14, ARIA_CTX_rounds(CTX);
1235         jne .Laria_gfni_256;
1236         aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1237                      %xmm4, %xmm5, %xmm6, %xmm7,
1238                      %xmm8, %xmm9, %xmm10, %xmm11,
1239                      %xmm12, %xmm13, %xmm14,
1240                      %xmm15, %rax, %r9, 13, 14);
1241         jmp .Laria_gfni_end;
1242 .Laria_gfni_256:
1243         aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1244                      %xmm4, %xmm5, %xmm6, %xmm7,
1245                      %xmm8, %xmm9, %xmm10, %xmm11,
1246                      %xmm12, %xmm13, %xmm14,
1247                      %xmm15, %rax, %r9, 13);
1248         aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1249                      %xmm12, %xmm13, %xmm14, %xmm15,
1250                      %xmm0, %xmm1, %xmm2, %xmm3,
1251                      %xmm4, %xmm5, %xmm6, %xmm7,
1252                      %rax, %r9, 14);
1253         aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1254                      %xmm4, %xmm5, %xmm6, %xmm7,
1255                      %xmm8, %xmm9, %xmm10, %xmm11,
1256                      %xmm12, %xmm13, %xmm14,
1257                      %xmm15, %rax, %r9, 15, 16);
1258 .Laria_gfni_end:
1259         debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1260                            %xmm9, %xmm13, %xmm0, %xmm5,
1261                            %xmm10, %xmm14, %xmm3, %xmm6,
1262                            %xmm11, %xmm15, %xmm2, %xmm7,
1263                            (%rax), (%r8));
1265         FRAME_END
1266         RET;
1267 SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
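/*
 * The GFNI entry points below mirror the AES-NI ones above; they differ only
 * in calling the GFNI core.
 */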
1269 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1270         /* input:
1271         *      %rdi: ctx, CTX
1272         *      %rsi: dst
1273         *      %rdx: src
1274         */
1276         FRAME_BEGIN
1278         leaq ARIA_CTX_enc_key(CTX), %r9;
1280         inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1281                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1282                      %xmm15, %rdx);
1284         call __aria_aesni_avx_gfni_crypt_16way;
1286         write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1287                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1288                      %xmm15, %rax);
1290         FRAME_END
1291         RET;
1292 SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1294 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1295         /* input:
1296         *      %rdi: ctx, CTX
1297         *      %rsi: dst
1298         *      %rdx: src
1299         */
1301         FRAME_BEGIN
1303         leaq ARIA_CTX_dec_key(CTX), %r9;
1305         inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1306                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1307                      %xmm15, %rdx);
1309         call __aria_aesni_avx_gfni_crypt_16way;
1311         write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1312                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1313                      %xmm15, %rax);
1315         FRAME_END
1316         RET;
1317 SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1319 SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1320         /* input:
1321         *      %rdi: ctx
1322         *      %rsi: dst
1323         *      %rdx: src
1324         *      %rcx: keystream
1325         *      %r8: iv (big endian, 128bit)
1326         */
1327         FRAME_BEGIN
1329         call __aria_aesni_avx_ctr_gen_keystream_16way;
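        /* The keystream generator is shared with the AES-NI path: counter
         * construction only needs plain AVX shuffles and 128-bit increments,
         * so no GFNI-specific version is required. */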
1331         leaq (%rsi), %r10;
1332         leaq (%rdx), %r11;
1333         leaq (%rcx), %rsi;
1334         leaq (%rcx), %rdx;
1335         leaq ARIA_CTX_enc_key(CTX), %r9;
1337         call __aria_aesni_avx_gfni_crypt_16way;
1339         vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1340         vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1341         vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1342         vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1343         vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1344         vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1345         vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1346         vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1347         vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1348         vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1349         vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1350         vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1351         vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1352         vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1353         vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1354         vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1355         write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1356                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1357                      %xmm15, %r10);
1359         FRAME_END
1360         RET;
1361 SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362 #endif /* CONFIG_AS_GFNI */
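/*
 * For reference, the register comments above imply the following C
 * prototypes under the x86-64 SysV calling convention.  This is only a
 * sketch of how the glue code is expected to declare them; the exact types
 * live on the C side and may differ:
 *
 *   asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
 *                                                const u8 *src);
 *   asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
 *                                                const u8 *src);
 *   asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
 *                                                  const u8 *src,
 *                                                  u8 *keystream, u8 *iv);
 *
 * The *_gfni_* variants under CONFIG_AS_GFNI take the same arguments.
 */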