1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * x86_64/AVX2/AES-NI assembler implementation of Camellia
5 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
8 #include <linux/linkage.h>
10 #include <asm/nospec-branch.h>
12 #define CAMELLIA_TABLE_BYTE_LEN 272
14 /* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN
22 /**********************************************************************
24 **********************************************************************/
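/*
 * Illustrative C sketch (not part of the original source, names are
 * informal) of the 4-bit split table lookup that filter_8bit() below
 * performs with vpshufb: each byte is split into its low and high nibble,
 * each nibble indexes a 16-entry table, and the two results are combined:
 *
 *	u8 filter_8bit(u8 x, const u8 *lo_t, const u8 *hi_t)
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 *
 * This allows an arbitrary affine 8-bit transform to be applied to all 32
 * bytes of a ymm register with just two shuffles and an XOR.
 */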
25 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
26 vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
51 /**********************************************************************
53 **********************************************************************/
57 * x0..x7: byte-sliced AB state
58 * mem_cd: register pointer storing CD state
59 * key: index for key material
61 * x0..x7: new byte-sliced CD state
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
66 * S-function with AES subbytes \
68 vbroadcasti128 .Linv_shift_row, t4; \
69 vpbroadcastd .L0f0f0f0f, t7; \
70 vbroadcasti128 .Lpre_tf_lo_s1, t5; \
71 vbroadcasti128 .Lpre_tf_hi_s1, t6; \
72 vbroadcasti128 .Lpre_tf_lo_s4, t2; \
73 vbroadcasti128 .Lpre_tf_hi_s4, t3; \
75 /* AES inverse shift rows */ \
	/* prefilter sboxes 1, 2 and 3 (tables t5/t6) and sbox 4 (tables t2/t3) */ \
87 filter_8bit(x0, t5, t6, t7, t4); \
88 filter_8bit(x7, t5, t6, t7, t4); \
89 vextracti128 $1, x0, t0##_x; \
90 vextracti128 $1, x7, t1##_x; \
91 filter_8bit(x3, t2, t3, t7, t4); \
92 filter_8bit(x6, t2, t3, t7, t4); \
93 vextracti128 $1, x3, t3##_x; \
94 vextracti128 $1, x6, t2##_x; \
95 filter_8bit(x2, t5, t6, t7, t4); \
96 filter_8bit(x5, t5, t6, t7, t4); \
97 filter_8bit(x1, t5, t6, t7, t4); \
98 filter_8bit(x4, t5, t6, t7, t4); \
	vpxor t4##_x, t4##_x, t4##_x; /* all-zero AES round key for vaesenclast */ \
102 /* AES subbytes + AES shift rows */ \
103 vextracti128 $1, x2, t6##_x; \
104 vextracti128 $1, x5, t5##_x; \
105 vaesenclast t4##_x, x0##_x, x0##_x; \
106 vaesenclast t4##_x, t0##_x, t0##_x; \
107 vinserti128 $1, t0##_x, x0, x0; \
108 vaesenclast t4##_x, x7##_x, x7##_x; \
109 vaesenclast t4##_x, t1##_x, t1##_x; \
110 vinserti128 $1, t1##_x, x7, x7; \
111 vaesenclast t4##_x, x3##_x, x3##_x; \
112 vaesenclast t4##_x, t3##_x, t3##_x; \
113 vinserti128 $1, t3##_x, x3, x3; \
114 vaesenclast t4##_x, x6##_x, x6##_x; \
115 vaesenclast t4##_x, t2##_x, t2##_x; \
116 vinserti128 $1, t2##_x, x6, x6; \
117 vextracti128 $1, x1, t3##_x; \
118 vextracti128 $1, x4, t2##_x; \
119 vbroadcasti128 .Lpost_tf_lo_s1, t0; \
120 vbroadcasti128 .Lpost_tf_hi_s1, t1; \
121 vaesenclast t4##_x, x2##_x, x2##_x; \
122 vaesenclast t4##_x, t6##_x, t6##_x; \
123 vinserti128 $1, t6##_x, x2, x2; \
124 vaesenclast t4##_x, x5##_x, x5##_x; \
125 vaesenclast t4##_x, t5##_x, t5##_x; \
126 vinserti128 $1, t5##_x, x5, x5; \
127 vaesenclast t4##_x, x1##_x, x1##_x; \
128 vaesenclast t4##_x, t3##_x, t3##_x; \
129 vinserti128 $1, t3##_x, x1, x1; \
130 vaesenclast t4##_x, x4##_x, x4##_x; \
131 vaesenclast t4##_x, t2##_x, t2##_x; \
132 vinserti128 $1, t2##_x, x4, x4; \
134 /* postfilter sboxes 1 and 4 */ \
135 vbroadcasti128 .Lpost_tf_lo_s3, t2; \
136 vbroadcasti128 .Lpost_tf_hi_s3, t3; \
137 filter_8bit(x0, t0, t1, t7, t6); \
138 filter_8bit(x7, t0, t1, t7, t6); \
139 filter_8bit(x3, t0, t1, t7, t6); \
140 filter_8bit(x6, t0, t1, t7, t6); \
142 /* postfilter sbox 3 */ \
143 vbroadcasti128 .Lpost_tf_lo_s2, t4; \
144 vbroadcasti128 .Lpost_tf_hi_s2, t5; \
145 filter_8bit(x2, t2, t3, t7, t6); \
146 filter_8bit(x5, t2, t3, t7, t6); \
148 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
150 /* postfilter sbox 2 */ \
151 filter_8bit(x1, t4, t5, t7, t2); \
152 filter_8bit(x4, t4, t5, t7, t2); \
155 vpsrldq $1, t0, t1; \
156 vpsrldq $2, t0, t2; \
157 vpshufb t7, t1, t1; \
158 vpsrldq $3, t0, t3; \
166 vpshufb t7, t2, t2; \
167 vpsrldq $4, t0, t4; \
168 vpshufb t7, t3, t3; \
169 vpsrldq $5, t0, t5; \
170 vpshufb t7, t4, t4; \
177 vpsrldq $6, t0, t6; \
178 vpshufb t7, t5, t5; \
179 vpshufb t7, t6, t6; \
189 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
191 /* Add key material and result to CD (x becomes new CD) */ \
194 vpxor 5 * 32(mem_cd), x1, x1; \
196 vpsrldq $7, t0, t6; \
197 vpshufb t7, t0, t0; \
198 vpshufb t7, t6, t7; \
201 vpxor 4 * 32(mem_cd), x0, x0; \
204 vpxor 6 * 32(mem_cd), x2, x2; \
207 vpxor 7 * 32(mem_cd), x3, x3; \
210 vpxor 0 * 32(mem_cd), x4, x4; \
213 vpxor 1 * 32(mem_cd), x5, x5; \
216 vpxor 2 * 32(mem_cd), x6, x6; \
219 vpxor 3 * 32(mem_cd), x7, x7;
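/*
 * Illustrative sketch (helper names are informal, not from the kernel
 * sources): roundsm32 evaluates the Camellia s-boxes by sandwiching the
 * AES SubBytes step between two filter_8bit() affine transforms, e.g.:
 *
 *	u8 camellia_s1(u8 x)
 *	{
 *		x = filter_8bit(x, pre_tf_lo_s1, pre_tf_hi_s1);
 *		x = aes_subbytes(x);
 *		return filter_8bit(x, post_tf_lo_s1, post_tf_hi_s1);
 *	}
 *
 * The aes_subbytes() step is what vaesenclast with an all-zero round key
 * provides (ShiftRows + SubBytes); the earlier vpshufb with
 * .Linv_shift_row pre-applies the inverse ShiftRows so that only SubBytes
 * takes effect.  Sboxes 2, 3 and 4 reuse the same AES core with different
 * pre/post filter tables.
 */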
 * Size optimization... with inlined roundsm32 the binary would be over 5
 * times larger and only marginally faster.
226 SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
227 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
228 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
231 SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
234 SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
235 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
236 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
239 SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
243 * x0..x7: byte-sliced AB state preloaded
244 * mem_ab: byte-sliced AB state in memory
 * mem_cd: byte-sliced CD state in memory
247 #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
248 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
249 leaq (key_table + (i) * 8)(CTX), %r9; \
250 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
252 vmovdqu x0, 4 * 32(mem_cd); \
253 vmovdqu x1, 5 * 32(mem_cd); \
254 vmovdqu x2, 6 * 32(mem_cd); \
255 vmovdqu x3, 7 * 32(mem_cd); \
256 vmovdqu x4, 0 * 32(mem_cd); \
257 vmovdqu x5, 1 * 32(mem_cd); \
258 vmovdqu x6, 2 * 32(mem_cd); \
259 vmovdqu x7, 3 * 32(mem_cd); \
261 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
262 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
264 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
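/*
 * Illustrative sketch in scalar notation (per 64-bit half of one block) of
 * what two_roundsm32 computes:
 *
 *	cd ^= F(ab, k[i]);
 *	ab ^= F(cd, k[i + dir]);
 *
 * The first roundsm32 call performs the first line, the second call (with
 * the register roles swapped) the second; dir is +1 for encryption and -1
 * for decryption, which consumes the round keys in reverse order.
 */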
266 #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
268 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
269 /* Store new AB state */ \
270 vmovdqu x4, 4 * 32(mem_ab); \
271 vmovdqu x5, 5 * 32(mem_ab); \
272 vmovdqu x6, 6 * 32(mem_ab); \
273 vmovdqu x7, 7 * 32(mem_ab); \
274 vmovdqu x0, 0 * 32(mem_ab); \
275 vmovdqu x1, 1 * 32(mem_ab); \
276 vmovdqu x2, 2 * 32(mem_ab); \
277 vmovdqu x3, 3 * 32(mem_ab);
279 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
280 y6, y7, mem_ab, mem_cd, i) \
281 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
282 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
283 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
284 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
285 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
286 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
288 #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
289 y6, y7, mem_ab, mem_cd, i) \
290 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
291 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
292 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
293 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
294 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
295 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
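/*
 * The rol32_1_32 macro below rotates four byte-sliced 32-bit words left by
 * one bit.  Scalar equivalent (illustrative):
 *
 *	u32 rol32_1(u32 x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 *
 * In byte-sliced form each byte is doubled and the top bit of the
 * neighbouring byte, extracted as a 0xff mask by the signed
 * vpcmpgtb-against-zero compares, supplies the carry between lanes.
 */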
299 * v0..3: byte-sliced 32-bit integers
303 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
304 vpcmpgtb v0, zero, t0; \
308 vpcmpgtb v1, zero, t1; \
312 vpcmpgtb v2, zero, t2; \
318 vpcmpgtb v3, zero, t0; \
328 * r: byte-sliced AB state in memory
329 * l: byte-sliced CD state in memory
331 * x0..x7: new byte-sliced CD state
333 #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
334 tt1, tt2, tt3, kll, klr, krl, krr) \
338 * lr ^= rol32(t0, 1); \
340 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
341 vpxor tt0, tt0, tt0; \
342 vpshufb tt0, t0, t3; \
343 vpsrldq $1, t0, t0; \
344 vpshufb tt0, t0, t2; \
345 vpsrldq $1, t0, t0; \
346 vpshufb tt0, t0, t1; \
347 vpsrldq $1, t0, t0; \
348 vpshufb tt0, t0, t0; \
355 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
358 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
359 vmovdqu l4, 4 * 32(l); \
361 vmovdqu l5, 5 * 32(l); \
363 vmovdqu l6, 6 * 32(l); \
365 vmovdqu l7, 7 * 32(l); \
373 vpshufb tt0, t0, t3; \
374 vpsrldq $1, t0, t0; \
375 vpshufb tt0, t0, t2; \
376 vpsrldq $1, t0, t0; \
377 vpshufb tt0, t0, t1; \
378 vpsrldq $1, t0, t0; \
379 vpshufb tt0, t0, t0; \
381 vpor 4 * 32(r), t0, t0; \
382 vpor 5 * 32(r), t1, t1; \
383 vpor 6 * 32(r), t2, t2; \
384 vpor 7 * 32(r), t3, t3; \
386 vpxor 0 * 32(r), t0, t0; \
387 vpxor 1 * 32(r), t1, t1; \
388 vpxor 2 * 32(r), t2, t2; \
389 vpxor 3 * 32(r), t3, t3; \
390 vmovdqu t0, 0 * 32(r); \
391 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
392 vmovdqu t1, 1 * 32(r); \
393 vmovdqu t2, 2 * 32(r); \
394 vmovdqu t3, 3 * 32(r); \
399 * rr ^= rol32(t2, 1); \
401 vpshufb tt0, t0, t3; \
402 vpsrldq $1, t0, t0; \
403 vpshufb tt0, t0, t2; \
404 vpsrldq $1, t0, t0; \
405 vpshufb tt0, t0, t1; \
406 vpsrldq $1, t0, t0; \
407 vpshufb tt0, t0, t0; \
409 vpand 0 * 32(r), t0, t0; \
410 vpand 1 * 32(r), t1, t1; \
411 vpand 2 * 32(r), t2, t2; \
412 vpand 3 * 32(r), t3, t3; \
414 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
416 vpxor 4 * 32(r), t0, t0; \
417 vpxor 5 * 32(r), t1, t1; \
418 vpxor 6 * 32(r), t2, t2; \
419 vpxor 7 * 32(r), t3, t3; \
420 vmovdqu t0, 4 * 32(r); \
421 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
422 vmovdqu t1, 5 * 32(r); \
423 vmovdqu t2, 6 * 32(r); \
424 vmovdqu t3, 7 * 32(r); \
432 vpshufb tt0, t0, t3; \
433 vpsrldq $1, t0, t0; \
434 vpshufb tt0, t0, t2; \
435 vpsrldq $1, t0, t0; \
436 vpshufb tt0, t0, t1; \
437 vpsrldq $1, t0, t0; \
438 vpshufb tt0, t0, t0; \
446 vmovdqu l0, 0 * 32(l); \
448 vmovdqu l1, 1 * 32(l); \
450 vmovdqu l2, 2 * 32(l); \
452 vmovdqu l3, 3 * 32(l);
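/*
 * C sketch of the standard Camellia FL/FL^-1 layer that fls32 applies to
 * all 32 blocks at once (function names are illustrative, not from the
 * kernel sources):
 *
 *	void camellia_fl(u32 *ll, u32 *lr, u32 kll, u32 klr)
 *	{
 *		*lr ^= rol32(*ll & kll, 1);
 *		*ll ^= (*lr | klr);
 *	}
 *
 *	void camellia_fl_inv(u32 *rl, u32 *rr, u32 krl, u32 krr)
 *	{
 *		*rl ^= (*rr | krr);
 *		*rr ^= rol32(*rl & krl, 1);
 *	}
 *
 * Each 32-bit subkey word is broadcast with vpbroadcastd and then split
 * into its four bytes with vpsrldq/vpshufb so it can be applied to the
 * byte-sliced state.
 */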
454 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
455 vpunpckhdq x1, x0, t2; \
456 vpunpckldq x1, x0, x0; \
458 vpunpckldq x3, x2, t1; \
459 vpunpckhdq x3, x2, x2; \
461 vpunpckhqdq t1, x0, x1; \
462 vpunpcklqdq t1, x0, x0; \
464 vpunpckhqdq x2, t2, x3; \
465 vpunpcklqdq x2, t2, x2;
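/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit lanes within each
 * 128-bit lane; conceptually (illustrative scalar form):
 *
 *	for (i = 0; i < 4; i++)
 *		for (j = 0; j < 4; j++)
 *			out[j][i] = in[i][j];
 *
 * The vpunpck{l,h}dq level interleaves 32-bit lanes and the
 * vpunpck{l,h}qdq level then interleaves the resulting 64-bit pairs.
 */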
467 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
468 a3, b3, c3, d3, st0, st1) \
471 transpose_4x4(a0, a1, a2, a3, d2, d3); \
472 transpose_4x4(b0, b1, b2, b3, d2, d3); \
478 transpose_4x4(c0, c1, c2, c3, a0, a1); \
479 transpose_4x4(d0, d1, d2, d3, a0, a1); \
481 vbroadcasti128 .Lshufb_16x16b, a0; \
483 vpshufb a0, a2, a2; \
484 vpshufb a0, a3, a3; \
485 vpshufb a0, b0, b0; \
486 vpshufb a0, b1, b1; \
487 vpshufb a0, b2, b2; \
488 vpshufb a0, b3, b3; \
489 vpshufb a0, a1, a1; \
490 vpshufb a0, c0, c0; \
491 vpshufb a0, c1, c1; \
492 vpshufb a0, c2, c2; \
493 vpshufb a0, c3, c3; \
494 vpshufb a0, d0, d0; \
495 vpshufb a0, d1, d1; \
496 vpshufb a0, d2, d2; \
497 vpshufb a0, d3, d3; \
500 vpshufb a0, d3, a0; \
503 transpose_4x4(a0, b0, c0, d0, d2, d3); \
504 transpose_4x4(a1, b1, c1, d1, d2, d3); \
510 transpose_4x4(a2, b2, c2, d2, b0, b1); \
511 transpose_4x4(a3, b3, c3, d3, b0, b1); \
514 /* does not adjust output bytes inside vectors */
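/*
 * Byte-slicing sketch (illustrative): byteslice_16x16b_fast transposes a
 * 16x16 byte matrix per 128-bit lane, so that byte n of every block ends
 * up in vector register n:
 *
 *	for (blk = 0; blk < 16; blk++)
 *		for (n = 0; n < 16; n++)
 *			sliced[n][blk] = block[blk][n];
 *
 * With 256-bit registers the two 128-bit lanes are transposed
 * independently, which is how 32 blocks are processed in parallel.
 */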
516 /* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
519 vpbroadcastq key, x0; \
520 vpshufb .Lpack_bswap, x0, x0; \
522 vpxor 0 * 32(rio), x0, y7; \
523 vpxor 1 * 32(rio), x0, y6; \
524 vpxor 2 * 32(rio), x0, y5; \
525 vpxor 3 * 32(rio), x0, y4; \
526 vpxor 4 * 32(rio), x0, y3; \
527 vpxor 5 * 32(rio), x0, y2; \
528 vpxor 6 * 32(rio), x0, y1; \
529 vpxor 7 * 32(rio), x0, y0; \
530 vpxor 8 * 32(rio), x0, x7; \
531 vpxor 9 * 32(rio), x0, x6; \
532 vpxor 10 * 32(rio), x0, x5; \
533 vpxor 11 * 32(rio), x0, x4; \
534 vpxor 12 * 32(rio), x0, x3; \
535 vpxor 13 * 32(rio), x0, x2; \
536 vpxor 14 * 32(rio), x0, x1; \
537 vpxor 15 * 32(rio), x0, x0;
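/*
 * Pre-whitening sketch (illustrative): the 64-bit key word at 'key' is
 * broadcast, byte-swapped into position by .Lpack_bswap (the 0x80 shuffle
 * indices clear the unused half), and XORed into all 32 input blocks
 * before byte-slicing:
 *
 *	for (i = 0; i < 32; i++)
 *		block[i] ^= packed_key;
 */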
539 /* byteslice pre-whitened blocks and store to temporary memory */
540 #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
541 y6, y7, mem_ab, mem_cd) \
542 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
543 y4, y5, y6, y7, (mem_ab), (mem_cd)); \
545 vmovdqu x0, 0 * 32(mem_ab); \
546 vmovdqu x1, 1 * 32(mem_ab); \
547 vmovdqu x2, 2 * 32(mem_ab); \
548 vmovdqu x3, 3 * 32(mem_ab); \
549 vmovdqu x4, 4 * 32(mem_ab); \
550 vmovdqu x5, 5 * 32(mem_ab); \
551 vmovdqu x6, 6 * 32(mem_ab); \
552 vmovdqu x7, 7 * 32(mem_ab); \
553 vmovdqu y0, 0 * 32(mem_cd); \
554 vmovdqu y1, 1 * 32(mem_cd); \
555 vmovdqu y2, 2 * 32(mem_cd); \
556 vmovdqu y3, 3 * 32(mem_cd); \
557 vmovdqu y4, 4 * 32(mem_cd); \
558 vmovdqu y5, 5 * 32(mem_cd); \
559 vmovdqu y6, 6 * 32(mem_cd); \
560 vmovdqu y7, 7 * 32(mem_cd);
562 /* de-byteslice, apply post-whitening and store blocks */
563 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
564 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
565 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
566 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
568 vmovdqu x0, stack_tmp0; \
570 vpbroadcastq key, x0; \
571 vpshufb .Lpack_bswap, x0, x0; \
588 vpxor stack_tmp0, x0, x0;
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
592 vmovdqu x0, 0 * 32(rio); \
593 vmovdqu x1, 1 * 32(rio); \
594 vmovdqu x2, 2 * 32(rio); \
595 vmovdqu x3, 3 * 32(rio); \
596 vmovdqu x4, 4 * 32(rio); \
597 vmovdqu x5, 5 * 32(rio); \
598 vmovdqu x6, 6 * 32(rio); \
599 vmovdqu x7, 7 * 32(rio); \
600 vmovdqu y0, 8 * 32(rio); \
601 vmovdqu y1, 9 * 32(rio); \
602 vmovdqu y2, 10 * 32(rio); \
603 vmovdqu y3, 11 * 32(rio); \
604 vmovdqu y4, 12 * 32(rio); \
605 vmovdqu y5, 13 * 32(rio); \
606 vmovdqu y6, 14 * 32(rio); \
607 vmovdqu y7, 15 * 32(rio);
610 .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
612 #define SHUFB_BYTES(idx) \
613 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
615 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
616 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
618 .section .rodata.cst32.pack_bswap, "aM", @progbits, 32
621 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
622 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
624 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
625 .section .rodata.cst16, "aM", @progbits, 16
628 /* For CTR-mode IV byteswap */
630 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
633 .Lxts_gf128mul_and_shl1_mask_0:
634 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
635 .Lxts_gf128mul_and_shl1_mask_1:
636 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
639 * pre-SubByte transform
641 * pre-lookup for sbox1, sbox2, sbox3:
642 * swap_bitendianness(
643 * isom_map_camellia_to_aes(
 * swap_bitendianness(in)
650 * (note: '⊕ 0xc5' inside camellia_f())
653 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
654 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
656 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
657 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
660 * pre-SubByte transform
662 * pre-lookup for sbox4:
663 * swap_bitendianness(
664 * isom_map_camellia_to_aes(
 * swap_bitendianness(in <<< 1)
671 * (note: '⊕ 0xc5' inside camellia_f())
674 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
675 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
677 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
678 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
681 * post-SubByte transform
683 * post-lookup for sbox1, sbox4:
684 * swap_bitendianness(
686 * isom_map_aes_to_camellia(
687 * swap_bitendianness(
688 * aes_inverse_affine_transform(in)
694 * (note: '⊕ 0x6e' inside camellia_h())
697 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
698 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
700 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
701 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
704 * post-SubByte transform
706 * post-lookup for sbox2:
707 * swap_bitendianness(
709 * isom_map_aes_to_camellia(
710 * swap_bitendianness(
711 * aes_inverse_affine_transform(in)
717 * (note: '⊕ 0x6e' inside camellia_h())
720 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
721 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
723 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
724 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
727 * post-SubByte transform
729 * post-lookup for sbox3:
730 * swap_bitendianness(
732 * isom_map_aes_to_camellia(
733 * swap_bitendianness(
734 * aes_inverse_affine_transform(in)
740 * (note: '⊕ 0x6e' inside camellia_h())
743 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
744 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
746 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
747 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
749 /* For isolating SubBytes from AESENCLAST, inverse shift row */
751 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
752 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
754 .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
763 SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
766 * %rax: temporary storage, 512 bytes
767 * %ymm0..%ymm15: 32 plaintext blocks
769 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 32(%rax), %rcx; /* %rcx: CD half of the 512-byte scratch area */
776 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
777 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
780 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
781 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
782 %ymm15, %rax, %rcx, 0);
784 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
785 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
787 ((key_table + (8) * 8) + 0)(CTX),
788 ((key_table + (8) * 8) + 4)(CTX),
789 ((key_table + (8) * 8) + 8)(CTX),
790 ((key_table + (8) * 8) + 12)(CTX));
792 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
793 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
794 %ymm15, %rax, %rcx, 8);
796 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
797 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
799 ((key_table + (16) * 8) + 0)(CTX),
800 ((key_table + (16) * 8) + 4)(CTX),
801 ((key_table + (16) * 8) + 8)(CTX),
802 ((key_table + (16) * 8) + 12)(CTX));
804 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
805 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
806 %ymm15, %rax, %rcx, 16);
809 cmpl $16, key_length(CTX);
813 /* load CD for output */
814 vmovdqu 0 * 32(%rcx), %ymm8;
815 vmovdqu 1 * 32(%rcx), %ymm9;
816 vmovdqu 2 * 32(%rcx), %ymm10;
817 vmovdqu 3 * 32(%rcx), %ymm11;
818 vmovdqu 4 * 32(%rcx), %ymm12;
819 vmovdqu 5 * 32(%rcx), %ymm13;
820 vmovdqu 6 * 32(%rcx), %ymm14;
821 vmovdqu 7 * 32(%rcx), %ymm15;
823 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
824 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
825 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
834 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
835 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
837 ((key_table + (24) * 8) + 0)(CTX),
838 ((key_table + (24) * 8) + 4)(CTX),
839 ((key_table + (24) * 8) + 8)(CTX),
840 ((key_table + (24) * 8) + 12)(CTX));
842 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
843 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
844 %ymm15, %rax, %rcx, 24);
847 SYM_FUNC_END(__camellia_enc_blk32)
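/*
 * Round-structure sketch (standard Camellia, illustrative): the encryption
 * path above is
 *
 *	6 rounds; FL/FL^-1; 6 rounds; FL/FL^-1; 6 rounds;
 *	if (key_length > 16)
 *		FL/FL^-1; 6 more rounds;
 *	swap halves and apply output whitening;
 *
 * i.e. 18 rounds for 128-bit keys and 24 rounds otherwise, selected by the
 * key_length compare above.
 */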
850 SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
853 * %rax: temporary storage, 512 bytes
 * %r8d: 24 for 16-byte key, 32 for larger
 * %ymm0..%ymm15: 32 encrypted blocks
 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 32(%rax), %rcx; /* %rcx: CD half of the 512-byte scratch area */
864 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
865 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
872 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
873 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
874 %ymm15, %rax, %rcx, 16);
876 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
877 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
879 ((key_table + (16) * 8) + 8)(CTX),
880 ((key_table + (16) * 8) + 12)(CTX),
881 ((key_table + (16) * 8) + 0)(CTX),
882 ((key_table + (16) * 8) + 4)(CTX));
884 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
885 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
886 %ymm15, %rax, %rcx, 8);
888 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
889 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
891 ((key_table + (8) * 8) + 8)(CTX),
892 ((key_table + (8) * 8) + 12)(CTX),
893 ((key_table + (8) * 8) + 0)(CTX),
894 ((key_table + (8) * 8) + 4)(CTX));
896 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
897 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
898 %ymm15, %rax, %rcx, 0);
900 /* load CD for output */
901 vmovdqu 0 * 32(%rcx), %ymm8;
902 vmovdqu 1 * 32(%rcx), %ymm9;
903 vmovdqu 2 * 32(%rcx), %ymm10;
904 vmovdqu 3 * 32(%rcx), %ymm11;
905 vmovdqu 4 * 32(%rcx), %ymm12;
906 vmovdqu 5 * 32(%rcx), %ymm13;
907 vmovdqu 6 * 32(%rcx), %ymm14;
908 vmovdqu 7 * 32(%rcx), %ymm15;
910 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
911 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
912 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
919 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
920 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
921 %ymm15, %rax, %rcx, 24);
923 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
924 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
926 ((key_table + (24) * 8) + 8)(CTX),
927 ((key_table + (24) * 8) + 12)(CTX),
928 ((key_table + (24) * 8) + 0)(CTX),
929 ((key_table + (24) * 8) + 4)(CTX));
932 SYM_FUNC_END(__camellia_dec_blk32)
934 SYM_FUNC_START(camellia_ecb_enc_32way)
937 * %rsi: dst (32 blocks)
938 * %rdx: src (32 blocks)
944 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
945 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
946 %ymm15, %rdx, (key_table)(CTX));
948 /* now dst can be used as temporary buffer (even in src == dst case) */
951 call __camellia_enc_blk32;
953 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
954 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
961 SYM_FUNC_END(camellia_ecb_enc_32way)
963 SYM_FUNC_START(camellia_ecb_dec_32way)
966 * %rsi: dst (32 blocks)
967 * %rdx: src (32 blocks)
973 cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* 24 for 128-bit keys, 32 otherwise */
978 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
979 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
980 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
982 /* now dst can be used as temporary buffer (even in src == dst case) */
985 call __camellia_dec_blk32;
987 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
988 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
995 SYM_FUNC_END(camellia_ecb_dec_32way)
997 SYM_FUNC_START(camellia_cbc_dec_32way)
1000 * %rsi: dst (32 blocks)
1001 * %rdx: src (32 blocks)
1007 cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* 24 for 128-bit keys, 32 otherwise */
1012 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1013 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1014 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1018 je .Lcbc_dec_use_stack;
1020 /* dst can be used as temporary storage, src is not overwritten. */
1022 jmp .Lcbc_dec_continue;
1024 .Lcbc_dec_use_stack:
1026 * dst still in-use (because dst == src), so use stack for temporary
1029 subq $(16 * 32), %rsp;
1033 call __camellia_dec_blk32;
1035 vmovdqu %ymm7, (%rax);
1036 vpxor %ymm7, %ymm7, %ymm7;
1037 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1038 vpxor (%rax), %ymm7, %ymm7;
1040 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1041 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1042 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1043 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1044 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1045 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1046 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1047 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1048 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1049 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1050 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1051 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1052 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1053 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1054 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1055 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1056 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1063 SYM_FUNC_END(camellia_cbc_dec_32way)
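/*
 * CBC decryption sketch (illustrative): each output block is the decrypted
 * ciphertext XORed with the previous ciphertext block,
 *
 *	dst[i] = D(K, src[i]) ^ src[i - 1];	(i = 1..31)
 *
 * which is what the (n * 32 + 16)(%rdx) loads above implement, a shift by
 * one 16-byte block across the batch.  The IV XOR for the very first block
 * is left to the caller.
 */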
1065 #define inc_le128(x, minus_one, tmp) \
1066 vpcmpeqq minus_one, x, tmp; \
1067 vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
1071 #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1072 vpcmpeqq minus_one, x, tmp1; \
1073 vpcmpeqq minus_two, x, tmp2; \
1074 vpsubq minus_two, x, x; \
1075 vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;
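/*
 * Sketch of the 128-bit little-endian counter increment implemented by
 * inc_le128 (illustrative):
 *
 *	lo += 1;
 *	if (lo == 0)
 *		hi += 1;	(carry out of the low qword)
 *
 * vpsubq with the all-ones constant adds 1 (x - (-1)), vpcmpeqq flags the
 * lanes that are about to wrap, and vpslldq/vpsubq move that flag into the
 * high qword as the carry.  add2_le128 does the same for an increment of
 * two, checking against both -1 and -2.
 */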
1079 SYM_FUNC_START(camellia_ctr_32way)
1082 * %rsi: dst (32 blocks)
1083 * %rdx: src (32 blocks)
1084 * %rcx: iv (little endian, 128bit)
1094 /* dst can be used as temporary storage, src is not overwritten. */
1099 subq $(16 * 32), %rsp;
1103 vpcmpeqd %ymm15, %ymm15, %ymm15;
1104 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1105 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1107 /* load IV and byteswap */
1108 vmovdqu (%rcx), %xmm0;
1109 vmovdqa %xmm0, %xmm1;
1110 inc_le128(%xmm0, %xmm15, %xmm14);
1111 vbroadcasti128 .Lbswap128_mask, %ymm14;
1112 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1113 vpshufb %ymm14, %ymm0, %ymm13;
1114 vmovdqu %ymm13, 15 * 32(%rax);
1117 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1118 vpshufb %ymm14, %ymm0, %ymm13;
1119 vmovdqu %ymm13, 14 * 32(%rax);
1120 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1121 vpshufb %ymm14, %ymm0, %ymm13;
1122 vmovdqu %ymm13, 13 * 32(%rax);
1123 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1124 vpshufb %ymm14, %ymm0, %ymm13;
1125 vmovdqu %ymm13, 12 * 32(%rax);
1126 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1127 vpshufb %ymm14, %ymm0, %ymm13;
1128 vmovdqu %ymm13, 11 * 32(%rax);
1129 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1130 vpshufb %ymm14, %ymm0, %ymm10;
1131 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132 vpshufb %ymm14, %ymm0, %ymm9;
1133 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1134 vpshufb %ymm14, %ymm0, %ymm8;
1135 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1136 vpshufb %ymm14, %ymm0, %ymm7;
1137 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1138 vpshufb %ymm14, %ymm0, %ymm6;
1139 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1140 vpshufb %ymm14, %ymm0, %ymm5;
1141 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1142 vpshufb %ymm14, %ymm0, %ymm4;
1143 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1144 vpshufb %ymm14, %ymm0, %ymm3;
1145 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1146 vpshufb %ymm14, %ymm0, %ymm2;
1147 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1148 vpshufb %ymm14, %ymm0, %ymm1;
1149 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1150 vextracti128 $1, %ymm0, %xmm13;
1151 vpshufb %ymm14, %ymm0, %ymm0;
1152 inc_le128(%xmm13, %xmm15, %xmm14);
1153 vmovdqu %xmm13, (%rcx);
1156 vpbroadcastq (key_table)(CTX), %ymm15;
1157 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1158 vpxor %ymm0, %ymm15, %ymm0;
1159 vpxor %ymm1, %ymm15, %ymm1;
1160 vpxor %ymm2, %ymm15, %ymm2;
1161 vpxor %ymm3, %ymm15, %ymm3;
1162 vpxor %ymm4, %ymm15, %ymm4;
1163 vpxor %ymm5, %ymm15, %ymm5;
1164 vpxor %ymm6, %ymm15, %ymm6;
1165 vpxor %ymm7, %ymm15, %ymm7;
1166 vpxor %ymm8, %ymm15, %ymm8;
1167 vpxor %ymm9, %ymm15, %ymm9;
1168 vpxor %ymm10, %ymm15, %ymm10;
1169 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1170 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1171 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1172 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1173 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1175 call __camellia_enc_blk32;
1179 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1180 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1181 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1182 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1183 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1184 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1185 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1186 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1187 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1188 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1189 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1190 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1191 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1192 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1193 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1194 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1195 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1196 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1203 SYM_FUNC_END(camellia_ctr_32way)
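/*
 * CTR sketch (illustrative): the code above materialises the byte-swapped
 * counter blocks, encrypts the whole batch and XORs the keystream with the
 * source:
 *
 *	for (i = 0; i < 32; i++)
 *		dst[i] = src[i] ^ E(K, bswap128(ctr + i));
 *	ctr += 32;	(written back through %rcx)
 *
 * The counter is kept little endian in memory, hence the .Lbswap128_mask
 * shuffle before it is fed to the cipher.
 */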
1205 #define gf128mul_x_ble(iv, mask, tmp) \
1206 vpsrad $31, iv, tmp; \
1207 vpaddq iv, iv, iv; \
1208 vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;
1212 #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1213 vpsrad $31, iv, tmp0; \
1214 vpaddq iv, iv, tmp1; \
1215 vpsllq $2, iv, iv; \
1216 vpshufd $0x13, tmp0, tmp0; \
1217 vpsrad $31, tmp1, tmp1; \
1218 vpand mask2, tmp0, tmp0; \
1219 vpshufd $0x13, tmp1, tmp1; \
1220 vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;
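/*
 * Sketch of the XTS tweak update, i.e. multiplication by x in GF(2^128)
 * in the low-first "ble" representation (illustrative):
 *
 *	carry = hi >> 63;
 *	hi    = (hi << 1) | (lo >> 63);
 *	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
 *
 * gf128mul_x_ble performs one such doubling, gf128mul_x2_ble two at once;
 * the .Lxts_gf128mul_and_shl1_mask_{0,1} constants carry the reduction
 * value 0x87 (plain and shifted) together with the inter-qword carry bits.
 */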
1225 SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
1228 * %rsi: dst (32 blocks)
1229 * %rdx: src (32 blocks)
1230 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1231 * %r8: index for input whitening key
1232 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1238 subq $(16 * 32), %rsp;
1241 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1243 /* load IV and construct second IV */
1244 vmovdqu (%rcx), %xmm0;
1245 vmovdqa %xmm0, %xmm15;
1246 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1247 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1248 vinserti128 $1, %xmm0, %ymm15, %ymm0;
1249 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1250 vmovdqu %ymm15, 15 * 32(%rax);
1251 vmovdqu %ymm0, 0 * 32(%rsi);
1254 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1255 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1256 vmovdqu %ymm15, 14 * 32(%rax);
1257 vmovdqu %ymm0, 1 * 32(%rsi);
1259 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1260 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1261 vmovdqu %ymm15, 13 * 32(%rax);
1262 vmovdqu %ymm0, 2 * 32(%rsi);
1264 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1265 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1266 vmovdqu %ymm15, 12 * 32(%rax);
1267 vmovdqu %ymm0, 3 * 32(%rsi);
1269 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1270 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1271 vmovdqu %ymm0, 4 * 32(%rsi);
1273 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1274 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1275 vmovdqu %ymm0, 5 * 32(%rsi);
1277 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1278 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1279 vmovdqu %ymm0, 6 * 32(%rsi);
1281 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1282 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1283 vmovdqu %ymm0, 7 * 32(%rsi);
1285 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1286 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1287 vmovdqu %ymm0, 8 * 32(%rsi);
1289 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1290 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1291 vmovdqu %ymm0, 9 * 32(%rsi);
1293 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1294 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1295 vmovdqu %ymm0, 10 * 32(%rsi);
1297 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1298 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1299 vmovdqu %ymm0, 11 * 32(%rsi);
1301 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1302 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1303 vmovdqu %ymm0, 12 * 32(%rsi);
1305 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1306 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1307 vmovdqu %ymm0, 13 * 32(%rsi);
1309 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1310 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1311 vmovdqu %ymm0, 14 * 32(%rsi);
1313 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1314 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1315 vmovdqu %ymm15, 0 * 32(%rax);
1316 vmovdqu %ymm0, 15 * 32(%rsi);
1318 vextracti128 $1, %ymm0, %xmm0;
1319 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1320 vmovdqu %xmm0, (%rcx);
1323 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1324 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1325 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1326 vpxor %ymm1, %ymm15, %ymm1;
1327 vpxor %ymm2, %ymm15, %ymm2;
1328 vpxor %ymm3, %ymm15, %ymm3;
1329 vpxor %ymm4, %ymm15, %ymm4;
1330 vpxor %ymm5, %ymm15, %ymm5;
1331 vpxor %ymm6, %ymm15, %ymm6;
1332 vpxor %ymm7, %ymm15, %ymm7;
1333 vpxor %ymm8, %ymm15, %ymm8;
1334 vpxor %ymm9, %ymm15, %ymm9;
1335 vpxor %ymm10, %ymm15, %ymm10;
1336 vpxor %ymm11, %ymm15, %ymm11;
1337 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1338 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1339 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1340 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1344 addq $(16 * 32), %rsp;
1346 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1347 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1348 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1349 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1350 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1351 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1352 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1353 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1354 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1355 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1356 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1357 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1358 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1359 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1360 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1361 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1362 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1363 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1370 SYM_FUNC_END(camellia_xts_crypt_32way)
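/*
 * XTS sketch (illustrative): with tweak T_i = T_0 * x^i in GF(2^128),
 *
 *	dst[i] = CIPHER(K1, src[i] ^ T_i) ^ T_i;
 *
 * The first pass above prepares src[i] ^ T_i as cipher input and parks the
 * tweaks in dst, so the second pass can XOR them back in after the cipher
 * pointed to by %r9 (__camellia_enc_blk32 or __camellia_dec_blk32) has
 * run.
 */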
1372 SYM_FUNC_START(camellia_xts_enc_32way)
1375 * %rsi: dst (32 blocks)
1376 * %rdx: src (32 blocks)
1377 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1380 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1382 leaq __camellia_enc_blk32, %r9;
1384 jmp camellia_xts_crypt_32way;
1385 SYM_FUNC_END(camellia_xts_enc_32way)
1387 SYM_FUNC_START(camellia_xts_dec_32way)
1390 * %rsi: dst (32 blocks)
1391 * %rdx: src (32 blocks)
1392 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1395 cmpl $16, key_length(CTX);
1398 cmovel %eax, %r8d; /* input whitening key, last for dec */
1400 leaq __camellia_dec_blk32, %r9;
1402 jmp camellia_xts_crypt_32way;
1403 SYM_FUNC_END(camellia_xts_dec_32way)