/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 64-way parallel algorithm (AVX512)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)             \
        ( (((a0) & 1) << 0) |                           \
          (((a1) & 1) << 1) |                           \
          (((a2) & 1) << 2) |                           \
          (((a3) & 1) << 3) |                           \
          (((a4) & 1) << 4) |                           \
          (((a5) & 1) << 5) |                           \
          (((a6) & 1) << 6) |                           \
          (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)           \
        ( ((l7) << (0 * 8)) |                           \
          ((l6) << (1 * 8)) |                           \
          ((l5) << (2 * 8)) |                           \
          ((l4) << (3 * 8)) |                           \
          ((l3) << (4 * 8)) |                           \
          ((l2) << (5 * 8)) |                           \
          ((l1) << (6 * 8)) |                           \
          ((l0) << (7 * 8)) )

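/*
 * 128-bit little-endian add: out = in + lo_counter, with the carry out of the
 * low qword propagated into the high qword of each 128-bit lane via %k1.
 */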
#define add_le128(out, in, lo_counter, hi_counter1)     \
        vpaddq lo_counter, in, out;                     \
        vpcmpuq $1, lo_counter, out, %k1;               \
        kaddb %k1, %k1, %k1;                            \
        vpaddq hi_counter1, out, out{%k1};

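/*
 * Nibble-wise byte substitution: split each byte of x into its low and high
 * nibble, look both up in the 16-byte tables lo_t/hi_t, and XOR the results.
 */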
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)      \
        vpandq x, mask4bit, tmp0;                       \
        vpandnq x, mask4bit, x;                         \
        vpsrld $4, x, x;                                \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxorq tmp0, x, x;

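/* Transpose a 4x4 matrix of 32-bit words (per 128-bit lane) held in x0..x3. */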
#define transpose_4x4(x0, x1, x2, x3, t1, t2)           \
        vpunpckhdq x1, x0, t2;                          \
        vpunpckldq x1, x0, x0;                          \
                                                        \
        vpunpckldq x3, x2, t1;                          \
        vpunpckhdq x3, x2, x2;                          \
                                                        \
        vpunpckhqdq t1, x0, x1;                         \
        vpunpcklqdq t1, x0, x0;                         \
                                                        \
        vpunpckhqdq x2, t2, x3;                         \
        vpunpcklqdq x2, t2, x2;

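/*
 * Byte-slice the sixteen input registers so that each output register holds
 * a single byte position of every 16-byte block.
 */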
#define byteslice_16x16b(a0, b0, c0, d0,                \
                         a1, b1, c1, d1,                \
                         a2, b2, c2, d2,                \
                         a3, b3, c3, d3,                \
                         st0, st1)                      \
        vmovdqu64 d2, st0;                              \
        vmovdqu64 d3, st1;                              \
        transpose_4x4(a0, a1, a2, a3, d2, d3);          \
        transpose_4x4(b0, b1, b2, b3, d2, d3);          \
        vmovdqu64 st0, d2;                              \
        vmovdqu64 st1, d3;                              \
                                                        \
        vmovdqu64 a0, st0;                              \
        vmovdqu64 a1, st1;                              \
        transpose_4x4(c0, c1, c2, c3, a0, a1);          \
        transpose_4x4(d0, d1, d2, d3, a0, a1);          \
                                                        \
        vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;       \
        vmovdqu64 st1, a1;                              \
        vpshufb a0, a2, a2;                             \
        vpshufb a0, a3, a3;                             \
        vpshufb a0, b0, b0;                             \
        vpshufb a0, b1, b1;                             \
        vpshufb a0, b2, b2;                             \
        vpshufb a0, b3, b3;                             \
        vpshufb a0, a1, a1;                             \
        vpshufb a0, c0, c0;                             \
        vpshufb a0, c1, c1;                             \
        vpshufb a0, c2, c2;                             \
        vpshufb a0, c3, c3;                             \
        vpshufb a0, d0, d0;                             \
        vpshufb a0, d1, d1;                             \
        vpshufb a0, d2, d2;                             \
        vpshufb a0, d3, d3;                             \
        vmovdqu64 d3, st1;                              \
        vmovdqu64 st0, d3;                              \
        vpshufb a0, d3, a0;                             \
        vmovdqu64 d2, st0;                              \
                                                        \
        transpose_4x4(a0, b0, c0, d0, d2, d3);          \
        transpose_4x4(a1, b1, c1, d1, d2, d3);          \
        vmovdqu64 st0, d2;                              \
        vmovdqu64 st1, d3;                              \
                                                        \
        vmovdqu64 b0, st0;                              \
        vmovdqu64 b1, st1;                              \
        transpose_4x4(a2, b2, c2, d2, b0, b1);          \
        transpose_4x4(a3, b3, c3, d3, b0, b1);          \
        vmovdqu64 st0, b0;                              \
        vmovdqu64 st1, b1;                              \
        /* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,              \
                           a1, b1, c1, d1,              \
                           a2, b2, c2, d2,              \
                           a3, b3, c3, d3,              \
                           st0, st1)                    \
        vmovdqu64 d2, st0;                              \
        vmovdqu64 d3, st1;                              \
        transpose_4x4(a0, a1, a2, a3, d2, d3);          \
        transpose_4x4(b0, b1, b2, b3, d2, d3);          \
        vmovdqu64 st0, d2;                              \
        vmovdqu64 st1, d3;                              \
                                                        \
        vmovdqu64 a0, st0;                              \
        vmovdqu64 a1, st1;                              \
        transpose_4x4(c0, c1, c2, c3, a0, a1);          \
        transpose_4x4(d0, d1, d2, d3, a0, a1);          \
                                                        \
        vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;       \
        vmovdqu64 st1, a1;                              \
        vpshufb a0, a2, a2;                             \
        vpshufb a0, a3, a3;                             \
        vpshufb a0, b0, b0;                             \
        vpshufb a0, b1, b1;                             \
        vpshufb a0, b2, b2;                             \
        vpshufb a0, b3, b3;                             \
        vpshufb a0, a1, a1;                             \
        vpshufb a0, c0, c0;                             \
        vpshufb a0, c1, c1;                             \
        vpshufb a0, c2, c2;                             \
        vpshufb a0, c3, c3;                             \
        vpshufb a0, d0, d0;                             \
        vpshufb a0, d1, d1;                             \
        vpshufb a0, d2, d2;                             \
        vpshufb a0, d3, d3;                             \
        vmovdqu64 d3, st1;                              \
        vmovdqu64 st0, d3;                              \
        vpshufb a0, d3, a0;                             \
        vmovdqu64 d2, st0;                              \
                                                        \
        transpose_4x4(c0, d0, a0, b0, d2, d3);          \
        transpose_4x4(c1, d1, a1, b1, d2, d3);          \
        vmovdqu64 st0, d2;                              \
        vmovdqu64 st1, d3;                              \
                                                        \
        vmovdqu64 b0, st0;                              \
        vmovdqu64 b1, st1;                              \
        transpose_4x4(c2, d2, a2, b2, b0, b1);          \
        transpose_4x4(c3, d3, a3, b3, b0, b1);          \
        vmovdqu64 st0, b0;                              \
        vmovdqu64 st1, b1;                              \
        /* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     rio)                               \
        vmovdqu64 (0 * 64)(rio), x0;                    \
        vmovdqu64 (1 * 64)(rio), x1;                    \
        vmovdqu64 (2 * 64)(rio), x2;                    \
        vmovdqu64 (3 * 64)(rio), x3;                    \
        vmovdqu64 (4 * 64)(rio), x4;                    \
        vmovdqu64 (5 * 64)(rio), x5;                    \
        vmovdqu64 (6 * 64)(rio), x6;                    \
        vmovdqu64 (7 * 64)(rio), x7;                    \
        vmovdqu64 (8 * 64)(rio), y0;                    \
        vmovdqu64 (9 * 64)(rio), y1;                    \
        vmovdqu64 (10 * 64)(rio), y2;                   \
        vmovdqu64 (11 * 64)(rio), y3;                   \
        vmovdqu64 (12 * 64)(rio), y4;                   \
        vmovdqu64 (13 * 64)(rio), y5;                   \
        vmovdqu64 (14 * 64)(rio), y6;                   \
        vmovdqu64 (15 * 64)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,                   \
                      x4, x5, x6, x7,                   \
                      y0, y1, y2, y3,                   \
                      y4, y5, y6, y7,                   \
                      mem_ab, mem_cd)                   \
        byteslice_16x16b(x0, x1, x2, x3,                \
                         x4, x5, x6, x7,                \
                         y0, y1, y2, y3,                \
                         y4, y5, y6, y7,                \
                         (mem_ab), (mem_cd));           \
                                                        \
        vmovdqu64 x0, 0 * 64(mem_ab);                   \
        vmovdqu64 x1, 1 * 64(mem_ab);                   \
        vmovdqu64 x2, 2 * 64(mem_ab);                   \
        vmovdqu64 x3, 3 * 64(mem_ab);                   \
        vmovdqu64 x4, 4 * 64(mem_ab);                   \
        vmovdqu64 x5, 5 * 64(mem_ab);                   \
        vmovdqu64 x6, 6 * 64(mem_ab);                   \
        vmovdqu64 x7, 7 * 64(mem_ab);                   \
        vmovdqu64 y0, 0 * 64(mem_cd);                   \
        vmovdqu64 y1, 1 * 64(mem_cd);                   \
        vmovdqu64 y2, 2 * 64(mem_cd);                   \
        vmovdqu64 y3, 3 * 64(mem_cd);                   \
        vmovdqu64 y4, 4 * 64(mem_cd);                   \
        vmovdqu64 y5, 5 * 64(mem_cd);                   \
        vmovdqu64 y6, 6 * 64(mem_cd);                   \
        vmovdqu64 y7, 7 * 64(mem_cd);

#define write_output(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     mem)                               \
        vmovdqu64 x0, 0 * 64(mem);                      \
        vmovdqu64 x1, 1 * 64(mem);                      \
        vmovdqu64 x2, 2 * 64(mem);                      \
        vmovdqu64 x3, 3 * 64(mem);                      \
        vmovdqu64 x4, 4 * 64(mem);                      \
        vmovdqu64 x5, 5 * 64(mem);                      \
        vmovdqu64 x6, 6 * 64(mem);                      \
        vmovdqu64 x7, 7 * 64(mem);                      \
        vmovdqu64 y0, 8 * 64(mem);                      \
        vmovdqu64 y1, 9 * 64(mem);                      \
        vmovdqu64 y2, 10 * 64(mem);                     \
        vmovdqu64 y3, 11 * 64(mem);                     \
        vmovdqu64 y4, 12 * 64(mem);                     \
        vmovdqu64 y5, 13 * 64(mem);                     \
        vmovdqu64 y6, 14 * 64(mem);                     \
        vmovdqu64 y7, 15 * 64(mem);

#define aria_store_state_8way(x0, x1, x2, x3,           \
                              x4, x5, x6, x7,           \
                              mem_tmp, idx)             \
        vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp);        \
        vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp);        \
        vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp);        \
        vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp);        \
        vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp);        \
        vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp);        \
        vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp);        \
        vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             mem_tmp, idx)              \
        vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0;        \
        vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1;        \
        vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2;        \
        vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3;        \
        vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4;        \
        vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5;        \
        vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6;        \
        vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;

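/*
 * AddRoundKey for byte-sliced state: broadcast each byte of the 16-byte
 * round key and XOR it into the register holding that byte position.
 */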
#define aria_ark_16way(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7,                  \
                       t0, rk, round)                   \
        /* AddRoundKey */                               \
        vpbroadcastb ((round * 16) + 3)(rk), t0;        \
        vpxorq t0, x0, x0;                              \
        vpbroadcastb ((round * 16) + 2)(rk), t0;        \
        vpxorq t0, x1, x1;                              \
        vpbroadcastb ((round * 16) + 1)(rk), t0;        \
        vpxorq t0, x2, x2;                              \
        vpbroadcastb ((round * 16) + 0)(rk), t0;        \
        vpxorq t0, x3, x3;                              \
        vpbroadcastb ((round * 16) + 7)(rk), t0;        \
        vpxorq t0, x4, x4;                              \
        vpbroadcastb ((round * 16) + 6)(rk), t0;        \
        vpxorq t0, x5, x5;                              \
        vpbroadcastb ((round * 16) + 5)(rk), t0;        \
        vpxorq t0, x6, x6;                              \
        vpbroadcastb ((round * 16) + 4)(rk), t0;        \
        vpxorq t0, x7, x7;                              \
        vpbroadcastb ((round * 16) + 11)(rk), t0;       \
        vpxorq t0, y0, y0;                              \
        vpbroadcastb ((round * 16) + 10)(rk), t0;       \
        vpxorq t0, y1, y1;                              \
        vpbroadcastb ((round * 16) + 9)(rk), t0;        \
        vpxorq t0, y2, y2;                              \
        vpbroadcastb ((round * 16) + 8)(rk), t0;        \
        vpxorq t0, y3, y3;                              \
        vpbroadcastb ((round * 16) + 15)(rk), t0;       \
        vpxorq t0, y4, y4;                              \
        vpbroadcastb ((round * 16) + 14)(rk), t0;       \
        vpxorq t0, y5, y5;                              \
        vpbroadcastb ((round * 16) + 13)(rk), t0;       \
        vpxorq t0, y6, y6;                              \
        vpbroadcastb ((round * 16) + 12)(rk), t0;       \
        vpxorq t0, y7, y7;

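/*
 * ARIA substitution layer via GFNI: the S1/S2 S-boxes and their inverses are
 * computed with GF(2^8) affine transforms (vgf2p8affineqb/vgf2p8affineinvqb).
 */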
#define aria_sbox_8way_gfni(x0, x1, x2, x3,             \
                            x4, x5, x6, x7,             \
                            t0, t1, t2, t3,             \
                            t4, t5, t6, t7)             \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;       \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;      \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;       \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;      \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;       \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
        vgf2p8affineinvqb $0, t2, x2, x2;               \
        vgf2p8affineinvqb $0, t2, x6, x6;               \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
        vgf2p8affineinvqb $0, t2, x3, x3;               \
        vgf2p8affineinvqb $0, t2, x7, x7;

#define aria_sbox_16way_gfni(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             t0, t1, t2, t3,            \
                             t4, t5, t6, t7)            \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;       \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;      \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;       \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;      \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;       \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
        vgf2p8affineinvqb $0, t2, x2, x2;               \
        vgf2p8affineinvqb $0, t2, x6, x6;               \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
        vgf2p8affineinvqb $0, t2, x3, x3;               \
        vgf2p8affineinvqb $0, t2, x7, x7;               \
        vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1;   \
        vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5;   \
        vgf2p8affineqb $(tf_inv_const), t1, y2, y2;     \
        vgf2p8affineqb $(tf_inv_const), t1, y6, y6;     \
        vgf2p8affineinvqb $0, t2, y2, y2;               \
        vgf2p8affineinvqb $0, t2, y6, y6;               \
        vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0;  \
        vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4;  \
        vgf2p8affineqb $(tf_x2_const), t4, y3, y3;      \
        vgf2p8affineqb $(tf_x2_const), t4, y7, y7;      \
        vgf2p8affineinvqb $0, t2, y3, y3;               \
        vgf2p8affineinvqb $0, t2, y7, y7;

#define aria_diff_m(x0, x1, x2, x3,                     \
                    t0, t1, t2, t3)                     \
        /* T = rotr32(X, 8); */                         \
        /* X ^= T */                                    \
        vpxorq x0, x3, t0;                              \
        vpxorq x1, x0, t1;                              \
        vpxorq x2, x1, t2;                              \
        vpxorq x3, x2, t3;                              \
        /* X = T ^ rotr(X, 16); */                      \
        vpxorq t2, x0, x0;                              \
        vpxorq x1, t3, t3;                              \
        vpxorq t0, x2, x2;                              \
        vpxorq t1, x3, x1;                              \
        vmovdqu64 t3, x3;

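/*
 * Word-group diffusion: XOR-mix the four 4-register groups
 * T0 = x0..x3, T1 = x4..x7, T2 = y0..y3, T3 = y4..y7.
 */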
#define aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7)                  \
        /* t1 ^= t2; */                                 \
        vpxorq y0, x4, x4;                              \
        vpxorq y1, x5, x5;                              \
        vpxorq y2, x6, x6;                              \
        vpxorq y3, x7, x7;                              \
                                                        \
        /* t2 ^= t3; */                                 \
        vpxorq y4, y0, y0;                              \
        vpxorq y5, y1, y1;                              \
        vpxorq y6, y2, y2;                              \
        vpxorq y7, y3, y3;                              \
                                                        \
        /* t0 ^= t1; */                                 \
        vpxorq x4, x0, x0;                              \
        vpxorq x5, x1, x1;                              \
        vpxorq x6, x2, x2;                              \
        vpxorq x7, x3, x3;                              \
                                                        \
        /* t3 ^= t1; */                                 \
        vpxorq x4, y4, y4;                              \
        vpxorq x5, y5, y5;                              \
        vpxorq x6, y6, y6;                              \
        vpxorq x7, y7, y7;                              \
                                                        \
        /* t2 ^= t0; */                                 \
        vpxorq x0, y0, y0;                              \
        vpxorq x1, y1, y1;                              \
        vpxorq x2, y2, y2;                              \
        vpxorq x3, y3, y3;                              \
                                                        \
        /* t1 ^= t2; */                                 \
        vpxorq y0, x4, x4;                              \
        vpxorq y1, x5, x5;                              \
        vpxorq y2, x6, x6;                              \
        vpxorq y3, x7, x7;

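/* ARIA even round function (FE): AddRoundKey, substitution layer type 2, diffusion. */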
#define aria_fe_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     z0, z1, z2, z3,                    \
                     z4, z5, z6, z7,                    \
                     mem_tmp, rk, round)                \
        aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,  \
                       y0, y1, y2, y3, y4, y5, y6, y7,  \
                       z0, rk, round);                  \
                                                        \
        aria_sbox_16way_gfni(x2, x3, x0, x1,            \
                             x6, x7, x4, x5,            \
                             y2, y3, y0, y1,            \
                             y6, y7, y4, y5,            \
                             z0, z1, z2, z3,            \
                             z4, z5, z6, z7);           \
                                                        \
        aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);    \
        aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);    \
        aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);    \
        aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);    \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T3 = ABCD -> BADC                            \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
         * T0 = ABCD -> CDAB                            \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
         * T1 = ABCD -> DCBA                            \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
         */                                             \
        aria_diff_word(x2, x3, x0, x1,                  \
                       x7, x6, x5, x4,                  \
                       y0, y1, y2, y3,                  \
                       y5, y4, y7, y6);                 \

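/* ARIA odd round function (FO): AddRoundKey, substitution layer type 1, diffusion. */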
#define aria_fo_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     z0, z1, z2, z3,                    \
                     z4, z5, z6, z7,                    \
                     mem_tmp, rk, round)                \
        aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,  \
                       y0, y1, y2, y3, y4, y5, y6, y7,  \
                       z0, rk, round);                  \
                                                        \
        aria_sbox_16way_gfni(x0, x1, x2, x3,            \
                             x4, x5, x6, x7,            \
                             y0, y1, y2, y3,            \
                             y4, y5, y6, y7,            \
                             z0, z1, z2, z3,            \
                             z4, z5, z6, z7);           \
                                                        \
        aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);    \
        aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);    \
        aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);    \
        aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);    \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7);                 \
        /* aria_diff_byte()                             \
         * T1 = ABCD -> BADC                            \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
         * T2 = ABCD -> CDAB                            \
         * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1        \
         * T3 = ABCD -> DCBA                            \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
         */                                             \
        aria_diff_word(x0, x1, x2, x3,                  \
                       x5, x4, x7, x6,                  \
                       y2, y3, y0, y1,                  \
                       y7, y6, y5, y4);

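/* ARIA final round function (FF): AddRoundKey, substitution layer, then the final AddRoundKey. */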
#define aria_ff_gfni(x0, x1, x2, x3,                    \
                     x4, x5, x6, x7,                    \
                     y0, y1, y2, y3,                    \
                     y4, y5, y6, y7,                    \
                     z0, z1, z2, z3,                    \
                     z4, z5, z6, z7,                    \
                     mem_tmp, rk, round, last_round)    \
        aria_ark_16way(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7,                  \
                       z0, rk, round);                  \
        aria_sbox_16way_gfni(x2, x3, x0, x1,            \
                             x6, x7, x4, x5,            \
                             y2, y3, y0, y1,            \
                             y6, y7, y4, y5,            \
                             z0, z1, z2, z3,            \
                             z4, z5, z6, z7);           \
        aria_ark_16way(x0, x1, x2, x3,                  \
                       x4, x5, x6, x7,                  \
                       y0, y1, y2, y3,                  \
                       y4, y5, y6, y7,                  \
                       z0, rk, last_round);

.section        .rodata.cst64, "aM", @progbits, 64
.align 64
.Lcounter0123_lo:
        .quad 0, 0
        .quad 1, 0
        .quad 2, 0
        .quad 3, 0

.section        .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section        .rodata.cst16, "aM", @progbits, 16
.align 16

.Lcounter4444_lo:
        .quad 4, 0
.Lcounter8888_lo:
        .quad 8, 0
.Lcounter16161616_lo:
        .quad 16, 0
.Lcounter1111_hi:
        .quad 0, 1

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

.section        .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))

.text
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
        /* input:
         *      %r9: rk
         *      %rsi: dst
         *      %rdx: src
         *      %zmm0..%zmm15: byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 64(%rax), %r8;

        inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
                      %zmm4, %zmm5, %zmm6, %zmm7,
                      %zmm8, %zmm9, %zmm10, %zmm11,
                      %zmm12, %zmm13, %zmm14,
                      %zmm15, %rax, %r8);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 0);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 1);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 2);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 3);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 4);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 5);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 6);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 7);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 8);
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 9);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 10);
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_192;
        aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 11, 12);
        jmp .Laria_gfni_end;
.Laria_gfni_192:
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 11);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_256;
        aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 13, 14);
        jmp .Laria_gfni_end;
.Laria_gfni_256:
        aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 13);
        aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
                     %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 14);
        aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
                     %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10,
                     %zmm12, %zmm13, %zmm14, %zmm15,
                     %zmm24, %zmm25, %zmm26, %zmm27,
                     %zmm28, %zmm29, %zmm30, %zmm31,
                     %rax, %r9, 15, 16);
.Laria_gfni_end:
        debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
                           %zmm8, %zmm13, %zmm2, %zmm7,
                           %zmm11, %zmm14, %zmm1, %zmm4,
                           %zmm10, %zmm15, %zmm0, %zmm5,
                           (%rax), (%r8));
        FRAME_END
        RET;
SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rdx);

        call __aria_gfni_avx512_crypt_64way;

        write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
                     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rdx);

        call __aria_gfni_avx512_crypt_64way;

        write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
                     %zmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)

SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */

        FRAME_BEGIN

        vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
        vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
        vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
        vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
        vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
        vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

        /* load IV and byteswap */
        movq 8(%r8), %r11;
        movq (%r8), %r10;
        bswapq %r11;
        bswapq %r10;
        vbroadcasti64x2 (%r8), %zmm20;
        vpshufb %zmm19, %zmm20, %zmm20;

        /* check need for handling 64-bit overflow and carry */
        cmpq $(0xffffffffffffffff - 64), %r11;
        ja .Lload_ctr_carry;

        /* construct IVs */
        vpaddq %zmm21, %zmm20, %zmm0;  /* +0:+1:+2:+3 */
        vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
        vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
        vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
        vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
        vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
        vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
        vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
        vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
        vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
        vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
        vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
        vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
        vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
        vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
        vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
        jmp .Lload_ctr_done;

.Lload_ctr_carry:
        /* construct IVs */
        add_le128(%zmm0, %zmm20, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
        add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
        add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
        add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
        add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
        add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
        add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
        add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
        add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
        add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
        add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
        add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
        add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
        add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
        add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
        add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

.Lload_ctr_done:
        /* Byte-swap IVs and update counter. */
        addq $64, %r11;
        adcq $0, %r10;
        vpshufb %zmm19, %zmm15, %zmm15;
        vpshufb %zmm19, %zmm14, %zmm14;
        vpshufb %zmm19, %zmm13, %zmm13;
        vpshufb %zmm19, %zmm12, %zmm12;
        vpshufb %zmm19, %zmm11, %zmm11;
        vpshufb %zmm19, %zmm10, %zmm10;
        vpshufb %zmm19, %zmm9, %zmm9;
        vpshufb %zmm19, %zmm8, %zmm8;
        bswapq %r11;
        bswapq %r10;
        vpshufb %zmm19, %zmm7, %zmm7;
        vpshufb %zmm19, %zmm6, %zmm6;
        vpshufb %zmm19, %zmm5, %zmm5;
        vpshufb %zmm19, %zmm4, %zmm4;
        vpshufb %zmm19, %zmm3, %zmm3;
        vpshufb %zmm19, %zmm2, %zmm2;
        vpshufb %zmm19, %zmm1, %zmm1;
        vpshufb %zmm19, %zmm0, %zmm0;
        movq %r11, 8(%r8);
        movq %r10, (%r8);

        FRAME_END
        RET;
SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_gfni_avx512_ctr_gen_keystream_64way

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_gfni_avx512_crypt_64way;

        vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
        vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
        vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
        vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
        vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
        vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
        vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
        vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
        vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
        vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
        vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
        vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
        vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
        vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
        vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
        vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
        write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
                     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
                     %zmm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)