2 * x86_64/AVX2/AES-NI assembler implementation of Camellia
4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
13 #include <linux/linkage.h>
14 #include <asm/frame.h>
15 #include <asm/nospec-branch.h>
17 #define CAMELLIA_TABLE_BYTE_LEN 272
19 /* struct camellia_ctx: */
21 #define key_length CAMELLIA_TABLE_BYTE_LEN
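/*
 * These offsets are expected to mirror struct camellia_ctx on the C side:
 * the expanded subkey table sits at offset 0 and is CAMELLIA_TABLE_BYTE_LEN
 * (272) bytes long, so the key_length field lands at offset 272.  A rough C
 * sketch of the assumed layout (illustrative only, field names taken from
 * the glue code):
 *
 *	#include <stdint.h>
 *
 *	#define CAMELLIA_TABLE_BYTE_LEN 272
 *
 *	struct camellia_ctx {
 *		// expanded subkeys, addressed as 64-bit words (key_table)
 *		uint64_t key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(uint64_t)];
 *		// key size in bytes: 16, 24 or 32 (key_length)
 *		uint32_t key_length;
 *	};
 */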
27 /**********************************************************************
29 **********************************************************************/
30 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31 vpand x, mask4bit, tmp0; \
32 vpandn x, mask4bit, x; \
35 vpshufb tmp0, lo_t, tmp0; \
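/*
 * filter_8bit() applies an 8-bit affine transform using two 16-entry table
 * lookups: vpshufb indexes a table with the low 4 bits of each byte, so the
 * low and high nibbles are looked up separately and the partial results are
 * xored together (valid because the transform is linear over GF(2), with any
 * additive constant folded into one of the tables).  A scalar C sketch of
 * the same idea (lo_t/hi_t are the 16-byte nibble tables):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_ref(uint8_t x, const uint8_t lo_t[16],
 *				       const uint8_t hi_t[16])
 *	{
 *		uint8_t lo = x & 0x0f;		// low nibble
 *		uint8_t hi = (x >> 4) & 0x0f;	// high nibble
 *		return lo_t[lo] ^ hi_t[hi];	// combine the affine halves
 *	}
 */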
56 /**********************************************************************
58 **********************************************************************/
62 * x0..x7: byte-sliced AB state
63 * mem_cd: register pointer storing CD state
64 * key: index for key material
66 * x0..x7: new byte-sliced CD state
68 #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
71 * S-function with AES subbytes \
73 vbroadcasti128 .Linv_shift_row, t4; \
74 vpbroadcastd .L0f0f0f0f, t7; \
75 vbroadcasti128 .Lpre_tf_lo_s1, t5; \
76 vbroadcasti128 .Lpre_tf_hi_s1, t6; \
77 vbroadcasti128 .Lpre_tf_lo_s4, t2; \
78 vbroadcasti128 .Lpre_tf_hi_s4, t3; \
80 /* AES inverse shift rows */ \
90 /* prefilter sboxes 1, 2 and 3 */ \
91 /* prefilter sbox 4 */ \
92 filter_8bit(x0, t5, t6, t7, t4); \
93 filter_8bit(x7, t5, t6, t7, t4); \
94 vextracti128 $1, x0, t0##_x; \
95 vextracti128 $1, x7, t1##_x; \
96 filter_8bit(x3, t2, t3, t7, t4); \
97 filter_8bit(x6, t2, t3, t7, t4); \
98 vextracti128 $1, x3, t3##_x; \
99 vextracti128 $1, x6, t2##_x; \
100 filter_8bit(x2, t5, t6, t7, t4); \
101 filter_8bit(x5, t5, t6, t7, t4); \
102 filter_8bit(x1, t5, t6, t7, t4); \
103 filter_8bit(x4, t5, t6, t7, t4); \
105 vpxor t4##_x, t4##_x, t4##_x; \
107 /* AES subbytes + AES shift rows */ \
108 vextracti128 $1, x2, t6##_x; \
109 vextracti128 $1, x5, t5##_x; \
110 vaesenclast t4##_x, x0##_x, x0##_x; \
111 vaesenclast t4##_x, t0##_x, t0##_x; \
112 vinserti128 $1, t0##_x, x0, x0; \
113 vaesenclast t4##_x, x7##_x, x7##_x; \
114 vaesenclast t4##_x, t1##_x, t1##_x; \
115 vinserti128 $1, t1##_x, x7, x7; \
116 vaesenclast t4##_x, x3##_x, x3##_x; \
117 vaesenclast t4##_x, t3##_x, t3##_x; \
118 vinserti128 $1, t3##_x, x3, x3; \
119 vaesenclast t4##_x, x6##_x, x6##_x; \
120 vaesenclast t4##_x, t2##_x, t2##_x; \
121 vinserti128 $1, t2##_x, x6, x6; \
122 vextracti128 $1, x1, t3##_x; \
123 vextracti128 $1, x4, t2##_x; \
124 vbroadcasti128 .Lpost_tf_lo_s1, t0; \
125 vbroadcasti128 .Lpost_tf_hi_s1, t1; \
126 vaesenclast t4##_x, x2##_x, x2##_x; \
127 vaesenclast t4##_x, t6##_x, t6##_x; \
128 vinserti128 $1, t6##_x, x2, x2; \
129 vaesenclast t4##_x, x5##_x, x5##_x; \
130 vaesenclast t4##_x, t5##_x, t5##_x; \
131 vinserti128 $1, t5##_x, x5, x5; \
132 vaesenclast t4##_x, x1##_x, x1##_x; \
133 vaesenclast t4##_x, t3##_x, t3##_x; \
134 vinserti128 $1, t3##_x, x1, x1; \
135 vaesenclast t4##_x, x4##_x, x4##_x; \
136 vaesenclast t4##_x, t2##_x, t2##_x; \
137 vinserti128 $1, t2##_x, x4, x4; \
139 /* postfilter sboxes 1 and 4 */ \
140 vbroadcasti128 .Lpost_tf_lo_s3, t2; \
141 vbroadcasti128 .Lpost_tf_hi_s3, t3; \
142 filter_8bit(x0, t0, t1, t7, t6); \
143 filter_8bit(x7, t0, t1, t7, t6); \
144 filter_8bit(x3, t0, t1, t7, t6); \
145 filter_8bit(x6, t0, t1, t7, t6); \
147 /* postfilter sbox 3 */ \
148 vbroadcasti128 .Lpost_tf_lo_s2, t4; \
149 vbroadcasti128 .Lpost_tf_hi_s2, t5; \
150 filter_8bit(x2, t2, t3, t7, t6); \
151 filter_8bit(x5, t2, t3, t7, t6); \
153 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
155 /* postfilter sbox 2 */ \
156 filter_8bit(x1, t4, t5, t7, t2); \
157 filter_8bit(x4, t4, t5, t7, t2); \
160 vpsrldq $1, t0, t1; \
161 vpsrldq $2, t0, t2; \
162 vpshufb t7, t1, t1; \
163 vpsrldq $3, t0, t3; \
171 vpshufb t7, t2, t2; \
172 vpsrldq $4, t0, t4; \
173 vpshufb t7, t3, t3; \
174 vpsrldq $5, t0, t5; \
175 vpshufb t7, t4, t4; \
182 vpsrldq $6, t0, t6; \
183 vpshufb t7, t5, t5; \
184 vpshufb t7, t6, t6; \
194 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
196 /* Add key material and result to CD (x becomes new CD) */ \
199 vpxor 5 * 32(mem_cd), x1, x1; \
201 vpsrldq $7, t0, t6; \
202 vpshufb t7, t0, t0; \
203 vpshufb t7, t6, t7; \
206 vpxor 4 * 32(mem_cd), x0, x0; \
209 vpxor 6 * 32(mem_cd), x2, x2; \
212 vpxor 7 * 32(mem_cd), x3, x3; \
215 vpxor 0 * 32(mem_cd), x4, x4; \
218 vpxor 1 * 32(mem_cd), x5, x5; \
221 vpxor 2 * 32(mem_cd), x6, x6; \
224 vpxor 3 * 32(mem_cd), x7, x7;
227 * Size optimization... with roundsm32 inlined, the binary would be over 5 times
228 * larger but only marginally faster.
231 roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
232 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
233 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
236 ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
239 roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
240 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
241 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
244 ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
248 * x0..x7: byte-sliced AB state preloaded
249 * mem_ab: byte-sliced AB state in memory
250 * mem_cd: byte-sliced CD state in memory
252 #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
254 leaq (key_table + (i) * 8)(CTX), %r9; \
255 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
257 vmovdqu x0, 4 * 32(mem_cd); \
258 vmovdqu x1, 5 * 32(mem_cd); \
259 vmovdqu x2, 6 * 32(mem_cd); \
260 vmovdqu x3, 7 * 32(mem_cd); \
261 vmovdqu x4, 0 * 32(mem_cd); \
262 vmovdqu x5, 1 * 32(mem_cd); \
263 vmovdqu x6, 2 * 32(mem_cd); \
264 vmovdqu x7, 3 * 32(mem_cd); \
266 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
267 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
269 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
271 #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
273 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
274 /* Store new AB state */ \
275 vmovdqu x4, 4 * 32(mem_ab); \
276 vmovdqu x5, 5 * 32(mem_ab); \
277 vmovdqu x6, 6 * 32(mem_ab); \
278 vmovdqu x7, 7 * 32(mem_ab); \
279 vmovdqu x0, 0 * 32(mem_ab); \
280 vmovdqu x1, 1 * 32(mem_ab); \
281 vmovdqu x2, 2 * 32(mem_ab); \
282 vmovdqu x3, 3 * 32(mem_ab);
284 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
285 y6, y7, mem_ab, mem_cd, i) \
286 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
287 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
288 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
289 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
290 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
291 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
293 #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
294 y6, y7, mem_ab, mem_cd, i) \
295 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
296 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
297 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
298 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
299 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
300 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
304 * v0..3: byte-sliced 32-bit integers
308 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
309 vpcmpgtb v0, zero, t0; \
313 vpcmpgtb v1, zero, t1; \
317 vpcmpgtb v2, zero, t2; \
323 vpcmpgtb v3, zero, t0; \
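/*
 * rol32_1_32() rotates byte-sliced 32-bit words left by one bit: the signed
 * byte compare against zero (vpcmpgtb) produces 0xff exactly where a byte's
 * MSB is set, i.e. the carry that has to move into the next more significant
 * byte (and from byte 3 back into byte 0).  The scalar operation being
 * vectorized is simply (illustrative C sketch):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_1(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);	// rotate left by one bit
 *	}
 */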
333 * r: byte-sliced AB state in memory
334 * l: byte-sliced CD state in memory
336 * x0..x7: new byte-sliced CD state
338 #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
339 tt1, tt2, tt3, kll, klr, krl, krr) \
343 * lr ^= rol32(t0, 1); \
345 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
346 vpxor tt0, tt0, tt0; \
347 vpshufb tt0, t0, t3; \
348 vpsrldq $1, t0, t0; \
349 vpshufb tt0, t0, t2; \
350 vpsrldq $1, t0, t0; \
351 vpshufb tt0, t0, t1; \
352 vpsrldq $1, t0, t0; \
353 vpshufb tt0, t0, t0; \
360 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
363 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
364 vmovdqu l4, 4 * 32(l); \
366 vmovdqu l5, 5 * 32(l); \
368 vmovdqu l6, 6 * 32(l); \
370 vmovdqu l7, 7 * 32(l); \
378 vpshufb tt0, t0, t3; \
379 vpsrldq $1, t0, t0; \
380 vpshufb tt0, t0, t2; \
381 vpsrldq $1, t0, t0; \
382 vpshufb tt0, t0, t1; \
383 vpsrldq $1, t0, t0; \
384 vpshufb tt0, t0, t0; \
386 vpor 4 * 32(r), t0, t0; \
387 vpor 5 * 32(r), t1, t1; \
388 vpor 6 * 32(r), t2, t2; \
389 vpor 7 * 32(r), t3, t3; \
391 vpxor 0 * 32(r), t0, t0; \
392 vpxor 1 * 32(r), t1, t1; \
393 vpxor 2 * 32(r), t2, t2; \
394 vpxor 3 * 32(r), t3, t3; \
395 vmovdqu t0, 0 * 32(r); \
396 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
397 vmovdqu t1, 1 * 32(r); \
398 vmovdqu t2, 2 * 32(r); \
399 vmovdqu t3, 3 * 32(r); \
404 * rr ^= rol32(t2, 1); \
406 vpshufb tt0, t0, t3; \
407 vpsrldq $1, t0, t0; \
408 vpshufb tt0, t0, t2; \
409 vpsrldq $1, t0, t0; \
410 vpshufb tt0, t0, t1; \
411 vpsrldq $1, t0, t0; \
412 vpshufb tt0, t0, t0; \
414 vpand 0 * 32(r), t0, t0; \
415 vpand 1 * 32(r), t1, t1; \
416 vpand 2 * 32(r), t2, t2; \
417 vpand 3 * 32(r), t3, t3; \
419 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
421 vpxor 4 * 32(r), t0, t0; \
422 vpxor 5 * 32(r), t1, t1; \
423 vpxor 6 * 32(r), t2, t2; \
424 vpxor 7 * 32(r), t3, t3; \
425 vmovdqu t0, 4 * 32(r); \
426 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
427 vmovdqu t1, 5 * 32(r); \
428 vmovdqu t2, 6 * 32(r); \
429 vmovdqu t3, 7 * 32(r); \
437 vpshufb tt0, t0, t3; \
438 vpsrldq $1, t0, t0; \
439 vpshufb tt0, t0, t2; \
440 vpsrldq $1, t0, t0; \
441 vpshufb tt0, t0, t1; \
442 vpsrldq $1, t0, t0; \
443 vpshufb tt0, t0, t0; \
451 vmovdqu l0, 0 * 32(l); \
453 vmovdqu l1, 1 * 32(l); \
455 vmovdqu l2, 2 * 32(l); \
457 vmovdqu l3, 3 * 32(l);
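/*
 * fls32() evaluates Camellia's FL/FL⁻¹ layer with every 32-bit word held
 * byte-sliced across four registers.  For reference, the scalar FL and FL⁻¹
 * functions (as given in RFC 3713) that the interleaved code above
 * implements look like this in C (illustrative sketch):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_1(uint32_t x) { return (x << 1) | (x >> 31); }
 *
 *	// FL: xl/xr are the 32-bit halves of the data, kl/kr of the subkey
 *	static void camellia_fl(uint32_t *xl, uint32_t *xr,
 *				uint32_t kl, uint32_t kr)
 *	{
 *		*xr ^= rol32_1(*xl & kl);
 *		*xl ^= (*xr | kr);
 *	}
 *
 *	// FL⁻¹: exact inverse, applied to the other half of the state
 *	static void camellia_fl_inv(uint32_t *yl, uint32_t *yr,
 *				    uint32_t kl, uint32_t kr)
 *	{
 *		*yl ^= (*yr | kr);
 *		*yr ^= rol32_1(*yl & kl);
 *	}
 */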
459 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
460 vpunpckhdq x1, x0, t2; \
461 vpunpckldq x1, x0, x0; \
463 vpunpckldq x3, x2, t1; \
464 vpunpckhdq x3, x2, x2; \
466 vpunpckhqdq t1, x0, x1; \
467 vpunpcklqdq t1, x0, x0; \
469 vpunpckhqdq x2, t2, x3; \
470 vpunpcklqdq x2, t2, x2;
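/*
 * transpose_4x4() is a standard 4x4 transpose of 32-bit words (performed
 * independently within each 128-bit lane of the ymm registers), built from
 * dword unpack-low/high followed by qword unpack-low/high.  It is the
 * building block of the byte-slicing transform below.  What it computes on
 * a plain 4x4 matrix (illustrative C sketch):
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4_ref(uint32_t m[4][4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			for (int j = i + 1; j < 4; j++) {
 *				uint32_t t = m[i][j];	// swap across the
 *				m[i][j] = m[j][i];	// main diagonal
 *				m[j][i] = t;
 *			}
 *	}
 */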
472 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
473 a3, b3, c3, d3, st0, st1) \
476 transpose_4x4(a0, a1, a2, a3, d2, d3); \
477 transpose_4x4(b0, b1, b2, b3, d2, d3); \
483 transpose_4x4(c0, c1, c2, c3, a0, a1); \
484 transpose_4x4(d0, d1, d2, d3, a0, a1); \
486 vbroadcasti128 .Lshufb_16x16b, a0; \
488 vpshufb a0, a2, a2; \
489 vpshufb a0, a3, a3; \
490 vpshufb a0, b0, b0; \
491 vpshufb a0, b1, b1; \
492 vpshufb a0, b2, b2; \
493 vpshufb a0, b3, b3; \
494 vpshufb a0, a1, a1; \
495 vpshufb a0, c0, c0; \
496 vpshufb a0, c1, c1; \
497 vpshufb a0, c2, c2; \
498 vpshufb a0, c3, c3; \
499 vpshufb a0, d0, d0; \
500 vpshufb a0, d1, d1; \
501 vpshufb a0, d2, d2; \
502 vpshufb a0, d3, d3; \
505 vpshufb a0, d3, a0; \
508 transpose_4x4(a0, b0, c0, d0, d2, d3); \
509 transpose_4x4(a1, b1, c1, d1, d2, d3); \
515 transpose_4x4(a2, b2, c2, d2, b0, b1); \
516 transpose_4x4(a3, b3, c3, d3, b0, b1); \
519 /* does not adjust output bytes inside vectors */
521 /* load blocks to registers and apply pre-whitening */
522 #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
524 vpbroadcastq key, x0; \
525 vpshufb .Lpack_bswap, x0, x0; \
527 vpxor 0 * 32(rio), x0, y7; \
528 vpxor 1 * 32(rio), x0, y6; \
529 vpxor 2 * 32(rio), x0, y5; \
530 vpxor 3 * 32(rio), x0, y4; \
531 vpxor 4 * 32(rio), x0, y3; \
532 vpxor 5 * 32(rio), x0, y2; \
533 vpxor 6 * 32(rio), x0, y1; \
534 vpxor 7 * 32(rio), x0, y0; \
535 vpxor 8 * 32(rio), x0, x7; \
536 vpxor 9 * 32(rio), x0, x6; \
537 vpxor 10 * 32(rio), x0, x5; \
538 vpxor 11 * 32(rio), x0, x4; \
539 vpxor 12 * 32(rio), x0, x3; \
540 vpxor 13 * 32(rio), x0, x2; \
541 vpxor 14 * 32(rio), x0, x1; \
542 vpxor 15 * 32(rio), x0, x0;
544 /* byteslice pre-whitened blocks and store to temporary memory */
545 #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
546 y6, y7, mem_ab, mem_cd) \
547 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
548 y4, y5, y6, y7, (mem_ab), (mem_cd)); \
550 vmovdqu x0, 0 * 32(mem_ab); \
551 vmovdqu x1, 1 * 32(mem_ab); \
552 vmovdqu x2, 2 * 32(mem_ab); \
553 vmovdqu x3, 3 * 32(mem_ab); \
554 vmovdqu x4, 4 * 32(mem_ab); \
555 vmovdqu x5, 5 * 32(mem_ab); \
556 vmovdqu x6, 6 * 32(mem_ab); \
557 vmovdqu x7, 7 * 32(mem_ab); \
558 vmovdqu y0, 0 * 32(mem_cd); \
559 vmovdqu y1, 1 * 32(mem_cd); \
560 vmovdqu y2, 2 * 32(mem_cd); \
561 vmovdqu y3, 3 * 32(mem_cd); \
562 vmovdqu y4, 4 * 32(mem_cd); \
563 vmovdqu y5, 5 * 32(mem_cd); \
564 vmovdqu y6, 6 * 32(mem_cd); \
565 vmovdqu y7, 7 * 32(mem_cd);
567 /* de-byteslice, apply post-whitening and store blocks */
568 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
569 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
570 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
571 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
573 vmovdqu x0, stack_tmp0; \
575 vpbroadcastq key, x0; \
576 vpshufb .Lpack_bswap, x0, x0; \
593 vpxor stack_tmp0, x0, x0;
595 #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
597 vmovdqu x0, 0 * 32(rio); \
598 vmovdqu x1, 1 * 32(rio); \
599 vmovdqu x2, 2 * 32(rio); \
600 vmovdqu x3, 3 * 32(rio); \
601 vmovdqu x4, 4 * 32(rio); \
602 vmovdqu x5, 5 * 32(rio); \
603 vmovdqu x6, 6 * 32(rio); \
604 vmovdqu x7, 7 * 32(rio); \
605 vmovdqu y0, 8 * 32(rio); \
606 vmovdqu y1, 9 * 32(rio); \
607 vmovdqu y2, 10 * 32(rio); \
608 vmovdqu y3, 11 * 32(rio); \
609 vmovdqu y4, 12 * 32(rio); \
610 vmovdqu y5, 13 * 32(rio); \
611 vmovdqu y6, 14 * 32(rio); \
612 vmovdqu y7, 15 * 32(rio);
615 .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
617 #define SHUFB_BYTES(idx) \
618 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
620 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
621 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
623 .section .rodata.cst32.pack_bswap, "aM", @progbits, 32
626 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
627 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
629 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
630 .section .rodata.cst16, "aM", @progbits, 16
633 /* For CTR-mode IV byteswap */
635 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
638 .Lxts_gf128mul_and_shl1_mask_0:
639 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
640 .Lxts_gf128mul_and_shl1_mask_1:
641 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
644 * pre-SubByte transform
646 * pre-lookup for sbox1, sbox2, sbox3:
647 * swap_bitendianness(
648 * isom_map_camellia_to_aes(
650 * swap_bitendianness(in)
655 * (note: '⊕ 0xc5' inside camellia_f())
658 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
659 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
661 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
662 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
665 * pre-SubByte transform
667 * pre-lookup for sbox4:
668 * swap_bitendianness(
669 * isom_map_camellia_to_aes(
671 * swap_bitendianness(in <<< 1)
676 * (note: '⊕ 0xc5' inside camellia_f())
679 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
680 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
682 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
683 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
686 * post-SubByte transform
688 * post-lookup for sbox1, sbox4:
689 * swap_bitendianness(
691 * isom_map_aes_to_camellia(
692 * swap_bitendianness(
693 * aes_inverse_affine_transform(in)
699 * (note: '⊕ 0x6e' inside camellia_h())
702 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
703 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
705 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
706 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
709 * post-SubByte transform
711 * post-lookup for sbox2:
712 * swap_bitendianness(
714 * isom_map_aes_to_camellia(
715 * swap_bitendianness(
716 * aes_inverse_affine_transform(in)
722 * (note: '⊕ 0x6e' inside camellia_h())
725 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
726 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
728 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
729 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
732 * post-SubByte transform
734 * post-lookup for sbox3:
735 * swap_bitendianness(
737 * isom_map_aes_to_camellia(
738 * swap_bitendianness(
739 * aes_inverse_affine_transform(in)
745 * (note: '⊕ 0x6e' inside camellia_h())
748 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
749 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
751 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
752 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
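/*
 * Taken together, the pre-/post-lookup tables above let a single AES
 * SubBytes evaluate all four Camellia s-boxes: a byte is first mapped into
 * the AES field representation (pre-filter), run through SubBytes, and
 * mapped back (post-filter); the byte rotations that derive s2, s3 and s4
 * from s1 are folded into the table pairs.  Conceptually, per byte
 * (illustrative C sketch with hypothetical helper names; each *_filter()
 * stands for the 8-bit affine map encoded by one lo/hi nibble-table pair):
 *
 *	#include <stdint.h>
 *
 *	extern uint8_t pre_filter_s1(uint8_t x);   // .Lpre_tf_{lo,hi}_s1
 *	extern uint8_t post_filter_s1(uint8_t x);  // .Lpost_tf_{lo,hi}_s1
 *	extern uint8_t aes_sbox(uint8_t x);        // plain AES SubBytes
 *
 *	static uint8_t camellia_s1_via_aes(uint8_t x)
 *	{
 *		return post_filter_s1(aes_sbox(pre_filter_s1(x)));
 *	}
 */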
754 /* For isolating SubBytes from AESENCLAST, inverse shift row */
756 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
757 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
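/*
 * AESENCLAST performs ShiftRows, SubBytes and AddRoundKey (no MixColumns).
 * Permuting the input with the inverse ShiftRows pattern above and using an
 * all-zero round key therefore isolates pure SubBytes, which is how the
 * round function evaluates the isomorphism-mapped Camellia s-boxes.  A C
 * sketch of the trick with AES-NI intrinsics (illustrative only; build with
 * -maes -mssse3):
 *
 *	#include <immintrin.h>
 *
 *	static __m128i aes_subbytes_only(__m128i x)
 *	{
 *		// same byte pattern as .Linv_shift_row
 *		const __m128i inv_shift_row = _mm_setr_epi8(
 *			0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
 *			0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03);
 *
 *		x = _mm_shuffle_epi8(x, inv_shift_row); // undo ShiftRows up front
 *		// SubBytes(ShiftRows(x)) ^ 0 == SubBytes of the original bytes
 *		return _mm_aesenclast_si128(x, _mm_setzero_si128());
 *	}
 */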
759 .section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
768 __camellia_enc_blk32:
771 * %rax: temporary storage, 512 bytes
772 * %ymm0..%ymm15: 32 plaintext blocks
774 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
775 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
779 leaq 8 * 32(%rax), %rcx;
781 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
782 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
785 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
786 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
787 %ymm15, %rax, %rcx, 0);
789 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
790 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
792 ((key_table + (8) * 8) + 0)(CTX),
793 ((key_table + (8) * 8) + 4)(CTX),
794 ((key_table + (8) * 8) + 8)(CTX),
795 ((key_table + (8) * 8) + 12)(CTX));
797 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
798 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
799 %ymm15, %rax, %rcx, 8);
801 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
802 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
804 ((key_table + (16) * 8) + 0)(CTX),
805 ((key_table + (16) * 8) + 4)(CTX),
806 ((key_table + (16) * 8) + 8)(CTX),
807 ((key_table + (16) * 8) + 12)(CTX));
809 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
810 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
811 %ymm15, %rax, %rcx, 16);
814 cmpl $16, key_length(CTX);
818 /* load CD for output */
819 vmovdqu 0 * 32(%rcx), %ymm8;
820 vmovdqu 1 * 32(%rcx), %ymm9;
821 vmovdqu 2 * 32(%rcx), %ymm10;
822 vmovdqu 3 * 32(%rcx), %ymm11;
823 vmovdqu 4 * 32(%rcx), %ymm12;
824 vmovdqu 5 * 32(%rcx), %ymm13;
825 vmovdqu 6 * 32(%rcx), %ymm14;
826 vmovdqu 7 * 32(%rcx), %ymm15;
828 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
829 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
830 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
839 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
840 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
842 ((key_table + (24) * 8) + 0)(CTX),
843 ((key_table + (24) * 8) + 4)(CTX),
844 ((key_table + (24) * 8) + 8)(CTX),
845 ((key_table + (24) * 8) + 12)(CTX));
847 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
848 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
849 %ymm15, %rax, %rcx, 24);
852 ENDPROC(__camellia_enc_blk32)
855 __camellia_dec_blk32:
858 * %rax: temporary storage, 512 bytes
859 * %r8d: 24 for 16-byte key, 32 for larger
860 * %ymm0..%ymm15: 32 encrypted blocks
862 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
863 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
867 leaq 8 * 32(%rax), %rcx;
869 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
870 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
877 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
878 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
879 %ymm15, %rax, %rcx, 16);
881 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
882 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
884 ((key_table + (16) * 8) + 8)(CTX),
885 ((key_table + (16) * 8) + 12)(CTX),
886 ((key_table + (16) * 8) + 0)(CTX),
887 ((key_table + (16) * 8) + 4)(CTX));
889 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
890 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
891 %ymm15, %rax, %rcx, 8);
893 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
894 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
896 ((key_table + (8) * 8) + 8)(CTX),
897 ((key_table + (8) * 8) + 12)(CTX),
898 ((key_table + (8) * 8) + 0)(CTX),
899 ((key_table + (8) * 8) + 4)(CTX));
901 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
902 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
903 %ymm15, %rax, %rcx, 0);
905 /* load CD for output */
906 vmovdqu 0 * 32(%rcx), %ymm8;
907 vmovdqu 1 * 32(%rcx), %ymm9;
908 vmovdqu 2 * 32(%rcx), %ymm10;
909 vmovdqu 3 * 32(%rcx), %ymm11;
910 vmovdqu 4 * 32(%rcx), %ymm12;
911 vmovdqu 5 * 32(%rcx), %ymm13;
912 vmovdqu 6 * 32(%rcx), %ymm14;
913 vmovdqu 7 * 32(%rcx), %ymm15;
915 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
916 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
917 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
924 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
925 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
926 %ymm15, %rax, %rcx, 24);
928 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
929 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
931 ((key_table + (24) * 8) + 8)(CTX),
932 ((key_table + (24) * 8) + 12)(CTX),
933 ((key_table + (24) * 8) + 0)(CTX),
934 ((key_table + (24) * 8) + 4)(CTX));
937 ENDPROC(__camellia_dec_blk32)
939 ENTRY(camellia_ecb_enc_32way)
942 * %rsi: dst (32 blocks)
943 * %rdx: src (32 blocks)
949 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
950 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
951 %ymm15, %rdx, (key_table)(CTX));
953 /* now dst can be used as temporary buffer (even in src == dst case) */
956 call __camellia_enc_blk32;
958 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
959 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
966 ENDPROC(camellia_ecb_enc_32way)
968 ENTRY(camellia_ecb_dec_32way)
971 * %rsi: dst (32 blocks)
972 * %rdx: src (32 blocks)
978 cmpl $16, key_length(CTX);
981 cmovel %eax, %r8d; /* max */
983 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
984 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
985 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
987 /* now dst can be used as temporary buffer (even in src == dst case) */
990 call __camellia_dec_blk32;
992 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
993 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1000 ENDPROC(camellia_ecb_dec_32way)
1002 ENTRY(camellia_cbc_dec_32way)
1005 * %rsi: dst (32 blocks)
1006 * %rdx: src (32 blocks)
1012 cmpl $16, key_length(CTX);
1015 cmovel %eax, %r8d; /* max */
1017 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1018 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1019 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1023 je .Lcbc_dec_use_stack;
1025 /* dst can be used as temporary storage, src is not overwritten. */
1027 jmp .Lcbc_dec_continue;
1029 .Lcbc_dec_use_stack:
1031 * dst still in use (because dst == src), so use stack for temporary storage.
1034 subq $(16 * 32), %rsp;
1038 call __camellia_dec_blk32;
1040 vmovdqu %ymm7, (%rax);
1041 vpxor %ymm7, %ymm7, %ymm7;
1042 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1043 vpxor (%rax), %ymm7, %ymm7;
1045 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1046 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1047 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1048 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1049 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1050 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1051 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1052 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1053 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1054 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1055 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1056 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1057 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1058 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1059 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1060 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1061 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1068 ENDPROC(camellia_cbc_dec_32way)
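/*
 * CBC decryption reference: each plaintext block is the decryption of its
 * ciphertext block xored with the previous ciphertext block.  The code
 * above performs the chaining xor for blocks 1..31 in-line; the xor of
 * block 0 with the IV is left to the caller (no IV argument is passed to
 * this routine).  Scalar sketch of the mode, one 16-byte block at a time
 * (illustrative C; camellia_decrypt_block() is an assumed stand-in for the
 * single-block primitive):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	extern void camellia_decrypt_block(const void *ctx, uint8_t *dst,
 *					   const uint8_t *src);
 *
 *	static void cbc_decrypt_ref(const void *ctx, uint8_t *dst,
 *				    const uint8_t *src, size_t nblocks,
 *				    uint8_t iv[16])
 *	{
 *		uint8_t prev[16], cur[16];
 *
 *		memcpy(prev, iv, 16);
 *		for (size_t i = 0; i < nblocks; i++) {
 *			memcpy(cur, src + 16 * i, 16);	// keep the ciphertext
 *			camellia_decrypt_block(ctx, dst + 16 * i, cur);
 *			for (int j = 0; j < 16; j++)	// chain with previous
 *				dst[16 * i + j] ^= prev[j];
 *			memcpy(prev, cur, 16);
 *		}
 *		memcpy(iv, prev, 16);			// IV for the next call
 *	}
 */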
1070 #define inc_le128(x, minus_one, tmp) \
1071 vpcmpeqq minus_one, x, tmp; \
1072 vpsubq minus_one, x, x; \
1073 vpslldq $8, tmp, tmp; \
1076 #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1077 vpcmpeqq minus_one, x, tmp1; \
1078 vpcmpeqq minus_two, x, tmp2; \
1079 vpsubq minus_two, x, x; \
1080 vpor tmp2, tmp1, tmp1; \
1081 vpslldq $8, tmp1, tmp1; \
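/*
 * The CTR counters are kept as 128-bit little-endian integers (the iv
 * argument below is little endian) and only byte-swapped to big-endian
 * right before encryption.  There is no 128-bit SIMD add, so inc_le128()
 * and add2_le128() detect the low-qword carry with a compare against -1
 * (and also -2 in the two-step variant) and fold it into the high qword.
 * The scalar operation is just a 128-bit increment (illustrative C sketch):
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };	// lo = bytes 0..7, hi = 8..15
 *
 *	static void inc_le128_ref(struct le128 *x)
 *	{
 *		if (++x->lo == 0)	// carry out of the low 64 bits
 *			x->hi++;
 *	}
 *
 *	static void add2_le128_ref(struct le128 *x)
 *	{
 *		uint64_t old = x->lo;
 *
 *		x->lo += 2;
 *		if (x->lo < old)	// wrapped: old was -1 or -2
 *			x->hi++;
 *	}
 */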
1084 ENTRY(camellia_ctr_32way)
1087 * %rsi: dst (32 blocks)
1088 * %rdx: src (32 blocks)
1089 * %rcx: iv (little endian, 128bit)
1099 /* dst can be used as temporary storage, src is not overwritten. */
1104 subq $(16 * 32), %rsp;
1108 vpcmpeqd %ymm15, %ymm15, %ymm15;
1109 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1110 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1112 /* load IV and byteswap */
1113 vmovdqu (%rcx), %xmm0;
1114 vmovdqa %xmm0, %xmm1;
1115 inc_le128(%xmm0, %xmm15, %xmm14);
1116 vbroadcasti128 .Lbswap128_mask, %ymm14;
1117 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1118 vpshufb %ymm14, %ymm0, %ymm13;
1119 vmovdqu %ymm13, 15 * 32(%rax);
1122 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1123 vpshufb %ymm14, %ymm0, %ymm13;
1124 vmovdqu %ymm13, 14 * 32(%rax);
1125 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1126 vpshufb %ymm14, %ymm0, %ymm13;
1127 vmovdqu %ymm13, 13 * 32(%rax);
1128 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1129 vpshufb %ymm14, %ymm0, %ymm13;
1130 vmovdqu %ymm13, 12 * 32(%rax);
1131 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132 vpshufb %ymm14, %ymm0, %ymm13;
1133 vmovdqu %ymm13, 11 * 32(%rax);
1134 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1135 vpshufb %ymm14, %ymm0, %ymm10;
1136 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1137 vpshufb %ymm14, %ymm0, %ymm9;
1138 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1139 vpshufb %ymm14, %ymm0, %ymm8;
1140 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1141 vpshufb %ymm14, %ymm0, %ymm7;
1142 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1143 vpshufb %ymm14, %ymm0, %ymm6;
1144 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1145 vpshufb %ymm14, %ymm0, %ymm5;
1146 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1147 vpshufb %ymm14, %ymm0, %ymm4;
1148 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1149 vpshufb %ymm14, %ymm0, %ymm3;
1150 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1151 vpshufb %ymm14, %ymm0, %ymm2;
1152 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1153 vpshufb %ymm14, %ymm0, %ymm1;
1154 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1155 vextracti128 $1, %ymm0, %xmm13;
1156 vpshufb %ymm14, %ymm0, %ymm0;
1157 inc_le128(%xmm13, %xmm15, %xmm14);
1158 vmovdqu %xmm13, (%rcx);
1161 vpbroadcastq (key_table)(CTX), %ymm15;
1162 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1163 vpxor %ymm0, %ymm15, %ymm0;
1164 vpxor %ymm1, %ymm15, %ymm1;
1165 vpxor %ymm2, %ymm15, %ymm2;
1166 vpxor %ymm3, %ymm15, %ymm3;
1167 vpxor %ymm4, %ymm15, %ymm4;
1168 vpxor %ymm5, %ymm15, %ymm5;
1169 vpxor %ymm6, %ymm15, %ymm6;
1170 vpxor %ymm7, %ymm15, %ymm7;
1171 vpxor %ymm8, %ymm15, %ymm8;
1172 vpxor %ymm9, %ymm15, %ymm9;
1173 vpxor %ymm10, %ymm15, %ymm10;
1174 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1175 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1176 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1177 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1178 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1180 call __camellia_enc_blk32;
1184 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1185 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1186 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1187 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1188 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1189 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1190 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1191 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1192 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1193 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1194 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1195 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1196 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1197 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1198 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1199 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1200 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1201 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1208 ENDPROC(camellia_ctr_32way)
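/*
 * CTR mode reference: every 16 bytes of keystream are the encryption of the
 * big-endian counter block, and the counter is incremented once per block.
 * The 32-way code above materializes 32 byte-swapped counter blocks,
 * encrypts them in one pass and xors the keystream over the source.  Scalar
 * C sketch (camellia_encrypt_block() is an assumed stand-in for the
 * single-block primitive):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	extern void camellia_encrypt_block(const void *ctx, uint8_t *dst,
 *					   const uint8_t *src);
 *
 *	static void ctr_crypt_ref(const void *ctx, uint8_t *dst,
 *				  const uint8_t *src, size_t nblocks,
 *				  uint64_t ctr_hi, uint64_t ctr_lo)
 *	{
 *		uint8_t ctrblk[16], ks[16];
 *
 *		for (size_t i = 0; i < nblocks; i++) {
 *			for (int j = 0; j < 8; j++) {	// big-endian counter
 *				ctrblk[j] = ctr_hi >> (56 - 8 * j);
 *				ctrblk[8 + j] = ctr_lo >> (56 - 8 * j);
 *			}
 *			camellia_encrypt_block(ctx, ks, ctrblk);
 *			for (int j = 0; j < 16; j++)
 *				dst[16 * i + j] = src[16 * i + j] ^ ks[j];
 *			if (++ctr_lo == 0)	// 128-bit counter increment
 *				ctr_hi++;
 *		}
 *	}
 */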
1210 #define gf128mul_x_ble(iv, mask, tmp) \
1211 vpsrad $31, iv, tmp; \
1212 vpaddq iv, iv, iv; \
1213 vpshufd $0x13, tmp, tmp; \
1214 vpand mask, tmp, tmp; \
1217 #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1218 vpsrad $31, iv, tmp0; \
1219 vpaddq iv, iv, tmp1; \
1220 vpsllq $2, iv, iv; \
1221 vpshufd $0x13, tmp0, tmp0; \
1222 vpsrad $31, tmp1, tmp1; \
1223 vpand mask2, tmp0, tmp0; \
1224 vpshufd $0x13, tmp1, tmp1; \
1225 vpxor tmp0, iv, iv; \
1226 vpand mask1, tmp1, tmp1; \
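/*
 * gf128mul_x_ble()/gf128mul_x2_ble() multiply the XTS tweak by x (resp. x²)
 * in GF(2¹²⁸): shift the 128-bit value left by one bit and, if a bit fell
 * off the top, xor the reduction constant 0x87 into the low byte.  The
 * .Lxts_gf128mul_and_shl1_mask_* constants supply both the carry bit that
 * must move between the two 64-bit halves and that 0x87 reduction byte.
 * Scalar C sketch of the single-shift version:
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };	// lo = bytes 0..7, hi = 8..15
 *
 *	static void gf128mul_x_ble_ref(struct le128 *t)
 *	{
 *		uint64_t carry = t->hi >> 63;	// bit 127 about to fall off
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry ? 0x87 : 0);
 *	}
 */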
1230 camellia_xts_crypt_32way:
1233 * %rsi: dst (32 blocks)
1234 * %rdx: src (32 blocks)
1235 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1236 * %r8: index for input whitening key
1237 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1243 subq $(16 * 32), %rsp;
1246 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1248 /* load IV and construct second IV */
1249 vmovdqu (%rcx), %xmm0;
1250 vmovdqa %xmm0, %xmm15;
1251 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1252 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1253 vinserti128 $1, %xmm0, %ymm15, %ymm0;
1254 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1255 vmovdqu %ymm15, 15 * 32(%rax);
1256 vmovdqu %ymm0, 0 * 32(%rsi);
1259 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1260 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1261 vmovdqu %ymm15, 14 * 32(%rax);
1262 vmovdqu %ymm0, 1 * 32(%rsi);
1264 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1265 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1266 vmovdqu %ymm15, 13 * 32(%rax);
1267 vmovdqu %ymm0, 2 * 32(%rsi);
1269 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1270 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1271 vmovdqu %ymm15, 12 * 32(%rax);
1272 vmovdqu %ymm0, 3 * 32(%rsi);
1274 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1275 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1276 vmovdqu %ymm0, 4 * 32(%rsi);
1278 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1279 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1280 vmovdqu %ymm0, 5 * 32(%rsi);
1282 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1283 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1284 vmovdqu %ymm0, 6 * 32(%rsi);
1286 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1287 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1288 vmovdqu %ymm0, 7 * 32(%rsi);
1290 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1291 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1292 vmovdqu %ymm0, 8 * 32(%rsi);
1294 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1295 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1296 vmovdqu %ymm0, 9 * 32(%rsi);
1298 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1299 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1300 vmovdqu %ymm0, 10 * 32(%rsi);
1302 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1303 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1304 vmovdqu %ymm0, 11 * 32(%rsi);
1306 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1307 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1308 vmovdqu %ymm0, 12 * 32(%rsi);
1310 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1311 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1312 vmovdqu %ymm0, 13 * 32(%rsi);
1314 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1315 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1316 vmovdqu %ymm0, 14 * 32(%rsi);
1318 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1319 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1320 vmovdqu %ymm15, 0 * 32(%rax);
1321 vmovdqu %ymm0, 15 * 32(%rsi);
1323 vextracti128 $1, %ymm0, %xmm0;
1324 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1325 vmovdqu %xmm0, (%rcx);
1328 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1329 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1330 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1331 vpxor %ymm1, %ymm15, %ymm1;
1332 vpxor %ymm2, %ymm15, %ymm2;
1333 vpxor %ymm3, %ymm15, %ymm3;
1334 vpxor %ymm4, %ymm15, %ymm4;
1335 vpxor %ymm5, %ymm15, %ymm5;
1336 vpxor %ymm6, %ymm15, %ymm6;
1337 vpxor %ymm7, %ymm15, %ymm7;
1338 vpxor %ymm8, %ymm15, %ymm8;
1339 vpxor %ymm9, %ymm15, %ymm9;
1340 vpxor %ymm10, %ymm15, %ymm10;
1341 vpxor %ymm11, %ymm15, %ymm11;
1342 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1343 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1344 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1345 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1349 addq $(16 * 32), %rsp;
1351 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1352 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1353 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1354 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1355 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1356 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1357 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1358 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1359 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1360 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1361 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1362 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1363 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1364 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1365 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1366 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1367 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1368 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1375 ENDPROC(camellia_xts_crypt_32way)
1377 ENTRY(camellia_xts_enc_32way)
1380 * %rsi: dst (32 blocks)
1381 * %rdx: src (32 blocks)
1382 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1385 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1387 leaq __camellia_enc_blk32, %r9;
1389 jmp camellia_xts_crypt_32way;
1390 ENDPROC(camellia_xts_enc_32way)
1392 ENTRY(camellia_xts_dec_32way)
1395 * %rsi: dst (32 blocks)
1396 * %rdx: src (32 blocks)
1397 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1400 cmpl $16, key_length(CTX);
1403 cmovel %eax, %r8d; /* input whitening key, last for dec */
1405 leaq __camellia_dec_blk32, %r9;
1407 jmp camellia_xts_crypt_32way;
1408 ENDPROC(camellia_xts_dec_32way)