* x86_64/AVX2/AES-NI assembler implementation of Camellia
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
#include <linux/linkage.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
/* struct camellia_ctx: */
#define key_length CAMELLIA_TABLE_BYTE_LEN
/**********************************************************************
**********************************************************************/
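
/*
 * filter_8bit applies an 8-bit affine transform to every byte of x using
 * two 16-entry vpshufb lookups, one indexed by the low nibble and one by
 * the high nibble.  Per byte, roughly:
 *
 *   x = lo_t[x & 0x0f] ^ hi_t[x >> 4]
 *
 * mask4bit must hold 0x0f in every byte position; tmp0 is clobbered.
 */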
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpand x, mask4bit, tmp0; \
vpandn x, mask4bit, x; \
vpshufb tmp0, lo_t, tmp0; \
* AES-NI instructions do not support ymmX registers, so we need splitting and
* merging.
#define vaesenclast256(zero, yreg, tmp) \
vextracti128 $1, yreg, tmp##_x; \
vaesenclast zero##_x, yreg##_x, yreg##_x; \
vaesenclast zero##_x, tmp##_x, tmp##_x; \
vinserti128 $1, tmp##_x, yreg, yreg;
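
/*
 * With a zero round key, AESENCLAST reduces to ShiftRows + SubBytes: the
 * final AddRoundKey XORs in zero.  The .Linv_shift_row shuffle applied
 * before vaesenclast256 pre-applies the inverse ShiftRows, so the net
 * effect is plain AES SubBytes on every byte of the ymm register.
 */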
/**********************************************************************
**********************************************************************/
* x0..x7: byte-sliced AB state
* mem_cd: register pointer storing CD state
* key: index for key material
* x0..x7: new byte-sliced CD state
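
/*
 * One Camellia round on 32 blocks, byte-sliced.  In outline:
 *
 *   z = postfilter(aes_subbytes(prefilter(x)))   (S-function via AES-NI)
 *   x = P(z) ^ roundkey_bytes                    (P-function byte XOR net)
 *   mem_cd ^= x                                  (add result to CD half)
 *
 * The prefilter/postfilter tables map between the Camellia and AES s-box
 * representations (see the .Lpre_tf_*/.Lpost_tf_* tables below).
 */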
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
* S-function with AES subbytes \
vbroadcasti128 .Linv_shift_row, t4; \
vpbroadcastb .L0f0f0f0f, t7; \
vbroadcasti128 .Lpre_tf_lo_s1, t0; \
vbroadcasti128 .Lpre_tf_hi_s1, t1; \
/* AES inverse shift rows */ \
/* prefilter sboxes 1, 2 and 3 */ \
vbroadcasti128 .Lpre_tf_lo_s4, t2; \
vbroadcasti128 .Lpre_tf_hi_s4, t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x1, t0, t1, t7, t6); \
filter_8bit(x4, t0, t1, t7, t6); \
filter_8bit(x2, t0, t1, t7, t6); \
filter_8bit(x5, t0, t1, t7, t6); \
/* prefilter sbox 4 */ \
vpxor t4##_x, t4##_x, t4##_x; \
filter_8bit(x3, t2, t3, t7, t6); \
filter_8bit(x6, t2, t3, t7, t6); \
/* AES subbytes + AES shift rows */ \
vbroadcasti128 .Lpost_tf_lo_s1, t0; \
vbroadcasti128 .Lpost_tf_hi_s1, t1; \
vaesenclast256(t4, x0, t5); \
vaesenclast256(t4, x7, t5); \
vaesenclast256(t4, x1, t5); \
vaesenclast256(t4, x4, t5); \
vaesenclast256(t4, x2, t5); \
vaesenclast256(t4, x5, t5); \
vaesenclast256(t4, x3, t5); \
vaesenclast256(t4, x6, t5); \
/* postfilter sboxes 1 and 4 */ \
vbroadcasti128 .Lpost_tf_lo_s3, t2; \
vbroadcasti128 .Lpost_tf_hi_s3, t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x3, t0, t1, t7, t6); \
filter_8bit(x6, t0, t1, t7, t6); \
/* postfilter sbox 3 */ \
vbroadcasti128 .Lpost_tf_lo_s2, t4; \
vbroadcasti128 .Lpost_tf_hi_s2, t5; \
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
filter_8bit(x4, t4, t5, t7, t2); \
vpsrldq $1, t0, t1; \
vpsrldq $2, t0, t2; \
vpsrldq $3, t0, t3; \
vpsrldq $4, t0, t4; \
vpsrldq $5, t0, t5; \
vpsrldq $6, t0, t6; \
vpsrldq $7, t0, t7; \
vpbroadcastb t0##_x, t0; \
vpbroadcastb t1##_x, t1; \
vpbroadcastb t2##_x, t2; \
vpbroadcastb t3##_x, t3; \
vpbroadcastb t4##_x, t4; \
vpbroadcastb t6##_x, t6; \
vpbroadcastb t5##_x, t5; \
vpbroadcastb t7##_x, t7; \
vpxor x2, x7, x7; /* note: high and low parts swapped */ \
/* Add key material and result to CD (x becomes new CD) */ \
vpxor 4 * 32(mem_cd), x0, x0; \
vpxor 5 * 32(mem_cd), x1, x1; \
vpxor 6 * 32(mem_cd), x2, x2; \
vpxor 7 * 32(mem_cd), x3, x3; \
vpxor 0 * 32(mem_cd), x4, x4; \
vpxor 1 * 32(mem_cd), x5, x5; \
vpxor 2 * 32(mem_cd), x6, x6; \
vpxor 3 * 32(mem_cd), x7, x7;
* Size optimization... with roundsm32 inlined, the binary would be over 5 times
* larger and only marginally faster.
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
* x0..x7: byte-sliced AB state preloaded
* mem_ab: byte-sliced AB state in memory
* mem_cd: byte-sliced CD state in memory
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
leaq (key_table + (i) * 8)(CTX), %r9; \
call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
vmovdqu x0, 4 * 32(mem_cd); \
vmovdqu x1, 5 * 32(mem_cd); \
vmovdqu x2, 6 * 32(mem_cd); \
vmovdqu x3, 7 * 32(mem_cd); \
vmovdqu x4, 0 * 32(mem_cd); \
vmovdqu x5, 1 * 32(mem_cd); \
vmovdqu x6, 2 * 32(mem_cd); \
vmovdqu x7, 3 * 32(mem_cd); \
leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
/* Store new AB state */ \
vmovdqu x4, 4 * 32(mem_ab); \
vmovdqu x5, 5 * 32(mem_ab); \
vmovdqu x6, 6 * 32(mem_ab); \
vmovdqu x7, 7 * 32(mem_ab); \
vmovdqu x0, 0 * 32(mem_ab); \
vmovdqu x1, 1 * 32(mem_ab); \
vmovdqu x2, 2 * 32(mem_ab); \
vmovdqu x3, 3 * 32(mem_ab);
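
/*
 * enc_rounds32/dec_rounds32 below run three of the two-round steps above.
 * Encryption walks the key table upwards (subkey indices i+2, i+4, i+6),
 * decryption walks it downwards (i+7, i+5, i+3).  The last step uses
 * dummy_store because the AB state is still live in the ymm registers for
 * the next stage.
 */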
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, i) \
two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, i) \
two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
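
/*
 * Byte-sliced rotate-left-by-one of 32-bit words: each 32-bit value is
 * spread over four registers (one byte per register).  Roughly, vpcmpgtb
 * against zero extracts the sign bit of every byte as a 0xff/0x00 mask,
 * which becomes the carry added into the neighbouring byte position, with
 * the top byte's carry wrapping around to the bottom byte.
 */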
* v0..3: byte-sliced 32-bit integers
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
vpcmpgtb v0, zero, t0; \
vpcmpgtb v1, zero, t1; \
vpcmpgtb v2, zero, t2; \
vpcmpgtb v3, zero, t0; \
* r: byte-sliced AB state in memory
* l: byte-sliced CD state in memory
* x0..x7: new byte-sliced CD state
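
/*
 * fls32 applies Camellia's FL function to the left half and FL⁻¹ to the
 * right half of the state, four bytes at a time across all 32 blocks:
 *
 *   lr ^= rol32(ll & kll, 1);    ll ^= (lr | klr);
 *   rl ^= (rr | krr);            rr ^= rol32(rl & krl, 1);
 *
 * Each 32-bit subkey is broadcast one byte at a time so the byte-sliced
 * lanes can be masked and rotated with plain SIMD byte operations.
 */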
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
tt1, tt2, tt3, kll, klr, krl, krr) \
* lr ^= rol32(t0, 1); \
vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
vpxor tt0, tt0, tt0; \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
vmovdqu l4, 4 * 32(l); \
vmovdqu l5, 5 * 32(l); \
vmovdqu l6, 6 * 32(l); \
vmovdqu l7, 7 * 32(l); \
vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vpor 4 * 32(r), t0, t0; \
vpor 5 * 32(r), t1, t1; \
vpor 6 * 32(r), t2, t2; \
vpor 7 * 32(r), t3, t3; \
vpxor 0 * 32(r), t0, t0; \
vpxor 1 * 32(r), t1, t1; \
vpxor 2 * 32(r), t2, t2; \
vpxor 3 * 32(r), t3, t3; \
vmovdqu t0, 0 * 32(r); \
vmovdqu t1, 1 * 32(r); \
vmovdqu t2, 2 * 32(r); \
vmovdqu t3, 3 * 32(r); \
* rr ^= rol32(t2, 1); \
vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vpand 0 * 32(r), t0, t0; \
vpand 1 * 32(r), t1, t1; \
vpand 2 * 32(r), t2, t2; \
vpand 3 * 32(r), t3, t3; \
rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
vpxor 4 * 32(r), t0, t0; \
vpxor 5 * 32(r), t1, t1; \
vpxor 6 * 32(r), t2, t2; \
vpxor 7 * 32(r), t3, t3; \
vmovdqu t0, 4 * 32(r); \
vmovdqu t1, 5 * 32(r); \
vmovdqu t2, 6 * 32(r); \
vmovdqu t3, 7 * 32(r); \
vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
vpbroadcastb t0##_x, t3; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t2; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t1; \
vpsrldq $1, t0, t0; \
vpbroadcastb t0##_x, t0; \
vmovdqu l0, 0 * 32(l); \
vmovdqu l1, 1 * 32(l); \
vmovdqu l2, 2 * 32(l); \
vmovdqu l3, 3 * 32(l);
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
vpunpckhdq x1, x0, t2; \
vpunpckldq x1, x0, x0; \
vpunpckldq x3, x2, t1; \
vpunpckhdq x3, x2, x2; \
vpunpckhqdq t1, x0, x1; \
vpunpcklqdq t1, x0, x0; \
vpunpckhqdq x2, t2, x3; \
vpunpcklqdq x2, t2, x2;
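
/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit words held in four
 * registers.  byteslice_16x16b_fast combines such transposes with a
 * per-16-byte vpshufb (.Lshufb_16x16b) to convert 16 registers of whole
 * blocks into byte-sliced form: afterwards register n holds byte n of
 * every block, which is what the AES-NI based S-function above expects.
 */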
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
a3, b3, c3, d3, st0, st1) \
transpose_4x4(a0, a1, a2, a3, d2, d3); \
transpose_4x4(b0, b1, b2, b3, d2, d3); \
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
vbroadcasti128 .Lshufb_16x16b, a0; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
vpshufb a0, b0, b0; \
vpshufb a0, b1, b1; \
vpshufb a0, b2, b2; \
vpshufb a0, b3, b3; \
vpshufb a0, a1, a1; \
vpshufb a0, c0, c0; \
vpshufb a0, c1, c1; \
vpshufb a0, c2, c2; \
vpshufb a0, c3, c3; \
vpshufb a0, d0, d0; \
vpshufb a0, d1, d1; \
vpshufb a0, d2, d2; \
vpshufb a0, d3, d3; \
vpshufb a0, d3, a0; \
transpose_4x4(a0, b0, c0, d0, d2, d3); \
transpose_4x4(a1, b1, c1, d1, d2, d3); \
transpose_4x4(a2, b2, c2, d2, b0, b1); \
transpose_4x4(a3, b3, c3, d3, b0, b1); \
/* does not adjust output bytes inside vectors */
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
vpbroadcastq key, x0; \
vpshufb .Lpack_bswap, x0, x0; \
vpxor 0 * 32(rio), x0, y7; \
vpxor 1 * 32(rio), x0, y6; \
vpxor 2 * 32(rio), x0, y5; \
vpxor 3 * 32(rio), x0, y4; \
vpxor 4 * 32(rio), x0, y3; \
vpxor 5 * 32(rio), x0, y2; \
vpxor 6 * 32(rio), x0, y1; \
vpxor 7 * 32(rio), x0, y0; \
vpxor 8 * 32(rio), x0, x7; \
vpxor 9 * 32(rio), x0, x6; \
vpxor 10 * 32(rio), x0, x5; \
vpxor 11 * 32(rio), x0, x4; \
vpxor 12 * 32(rio), x0, x3; \
vpxor 13 * 32(rio), x0, x2; \
vpxor 14 * 32(rio), x0, x1; \
vpxor 15 * 32(rio), x0, x0;
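
/*
 * Pre-whitening: a 64-bit whitening subkey is broadcast, rearranged and
 * zero-extended via .Lpack_bswap, and XORed into each block as it is
 * loaded.  Note the reversed load order (0 * 32(rio) lands in y7,
 * 15 * 32(rio) in x0), matching the swapped register order used by the
 * round machinery.
 */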
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, mem_ab, mem_cd) \
byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
y4, y5, y6, y7, (mem_ab), (mem_cd)); \
vmovdqu x0, 0 * 32(mem_ab); \
vmovdqu x1, 1 * 32(mem_ab); \
vmovdqu x2, 2 * 32(mem_ab); \
vmovdqu x3, 3 * 32(mem_ab); \
vmovdqu x4, 4 * 32(mem_ab); \
vmovdqu x5, 5 * 32(mem_ab); \
vmovdqu x6, 6 * 32(mem_ab); \
vmovdqu x7, 7 * 32(mem_ab); \
vmovdqu y0, 0 * 32(mem_cd); \
vmovdqu y1, 1 * 32(mem_cd); \
vmovdqu y2, 2 * 32(mem_cd); \
vmovdqu y3, 3 * 32(mem_cd); \
vmovdqu y4, 4 * 32(mem_cd); \
vmovdqu y5, 5 * 32(mem_cd); \
vmovdqu y6, 6 * 32(mem_cd); \
vmovdqu y7, 7 * 32(mem_cd);
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
y5, y6, y7, key, stack_tmp0, stack_tmp1) \
byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
vmovdqu x0, stack_tmp0; \
vpbroadcastq key, x0; \
vpshufb .Lpack_bswap, x0, x0; \
vpxor stack_tmp0, x0, x0;
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
vmovdqu x0, 0 * 32(rio); \
vmovdqu x1, 1 * 32(rio); \
vmovdqu x2, 2 * 32(rio); \
vmovdqu x3, 3 * 32(rio); \
vmovdqu x4, 4 * 32(rio); \
vmovdqu x5, 5 * 32(rio); \
vmovdqu x6, 6 * 32(rio); \
vmovdqu x7, 7 * 32(rio); \
vmovdqu y0, 8 * 32(rio); \
vmovdqu y1, 9 * 32(rio); \
vmovdqu y2, 10 * 32(rio); \
vmovdqu y3, 11 * 32(rio); \
vmovdqu y4, 12 * 32(rio); \
vmovdqu y5, 13 * 32(rio); \
vmovdqu y6, 14 * 32(rio); \
vmovdqu y7, 15 * 32(rio);
#define SHUFB_BYTES(idx) \
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
/* For CTR-mode IV byteswap */
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
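
/*
 * In short, the tables below let the Camellia s-boxes be computed with the
 * AES SubBytes hardware: a prefilter maps each byte into the AES S-box's
 * input domain, AESENCLAST with a zero round key applies the AES S-box,
 * and a postfilter maps the result back to the Camellia representation
 * and folds in Camellia's own affine constants.  Each filter is an affine
 * 8-bit transform, stored as split low-/high-nibble vpshufb tables
 * consumed by filter_8bit().
 */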
* pre-SubByte transform
* pre-lookup for sbox1, sbox2, sbox3:
* swap_bitendianness(
* isom_map_camellia_to_aes(
* swap_bitendianness(in)
* (note: '⊕ 0xc5' inside camellia_f())
.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
* pre-SubByte transform
* pre-lookup for sbox4:
* swap_bitendianness(
* isom_map_camellia_to_aes(
* swap_bitendianness(in <<< 1)
* (note: '⊕ 0xc5' inside camellia_f())
.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
* post-SubByte transform
* post-lookup for sbox1, sbox4:
* swap_bitendianness(
* isom_map_aes_to_camellia(
* swap_bitendianness(
* aes_inverse_affine_transform(in)
* (note: '⊕ 0x6e' inside camellia_h())
.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
* post-SubByte transform
* post-lookup for sbox2:
* swap_bitendianness(
* isom_map_aes_to_camellia(
* swap_bitendianness(
* aes_inverse_affine_transform(in)
* (note: '⊕ 0x6e' inside camellia_h())
.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
* post-SubByte transform
* post-lookup for sbox3:
* swap_bitendianness(
* isom_map_aes_to_camellia(
* swap_bitendianness(
* aes_inverse_affine_transform(in)
* (note: '⊕ 0x6e' inside camellia_h())
.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
__camellia_enc_blk32:
* %rax: temporary storage, 512 bytes
* %ymm0..%ymm15: 32 plaintext blocks
* %ymm0..%ymm15: 32 encrypted blocks, order swapped:
* 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
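
/*
 * Round structure: blocks of six Feistel rounds separated by the FL/FL⁻¹
 * layer (fls32).  For 128-bit keys the cipher stops after 18 rounds (the
 * cmpl $16, key_length(CTX) test below); for 192/256-bit keys one more FL
 * layer and six more rounds are run, and %r8 selects the matching output
 * whitening key offset in the key table.
 */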
leaq 8 * 32(%rax), %rcx;
inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 0);
fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
((key_table + (8) * 8) + 0)(CTX),
((key_table + (8) * 8) + 4)(CTX),
((key_table + (8) * 8) + 8)(CTX),
((key_table + (8) * 8) + 12)(CTX));
enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 8);
fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
((key_table + (16) * 8) + 0)(CTX),
((key_table + (16) * 8) + 4)(CTX),
((key_table + (16) * 8) + 8)(CTX),
((key_table + (16) * 8) + 12)(CTX));
enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 16);
cmpl $16, key_length(CTX);
/* load CD for output */
vmovdqu 0 * 32(%rcx), %ymm8;
vmovdqu 1 * 32(%rcx), %ymm9;
vmovdqu 2 * 32(%rcx), %ymm10;
vmovdqu 3 * 32(%rcx), %ymm11;
vmovdqu 4 * 32(%rcx), %ymm12;
vmovdqu 5 * 32(%rcx), %ymm13;
vmovdqu 6 * 32(%rcx), %ymm14;
vmovdqu 7 * 32(%rcx), %ymm15;
outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
((key_table + (24) * 8) + 0)(CTX),
((key_table + (24) * 8) + 4)(CTX),
((key_table + (24) * 8) + 8)(CTX),
((key_table + (24) * 8) + 12)(CTX));
enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 24);
ENDPROC(__camellia_enc_blk32)
__camellia_dec_blk32:
* %rax: temporary storage, 512 bytes
* %r8d: 24 for 16 byte key, 32 for larger
* %ymm0..%ymm15: 32 encrypted blocks
* %ymm0..%ymm15: 32 plaintext blocks, order swapped:
* 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
leaq 8 * 32(%rax), %rcx;
inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 16);
fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
((key_table + (16) * 8) + 8)(CTX),
((key_table + (16) * 8) + 12)(CTX),
((key_table + (16) * 8) + 0)(CTX),
((key_table + (16) * 8) + 4)(CTX));
dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 8);
fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
((key_table + (8) * 8) + 8)(CTX),
((key_table + (8) * 8) + 12)(CTX),
((key_table + (8) * 8) + 0)(CTX),
((key_table + (8) * 8) + 4)(CTX));
dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 0);
/* load CD for output */
vmovdqu 0 * 32(%rcx), %ymm8;
vmovdqu 1 * 32(%rcx), %ymm9;
vmovdqu 2 * 32(%rcx), %ymm10;
vmovdqu 3 * 32(%rcx), %ymm11;
vmovdqu 4 * 32(%rcx), %ymm12;
vmovdqu 5 * 32(%rcx), %ymm13;
vmovdqu 6 * 32(%rcx), %ymm14;
vmovdqu 7 * 32(%rcx), %ymm15;
outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rax, %rcx, 24);
fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
((key_table + (24) * 8) + 8)(CTX),
((key_table + (24) * 8) + 12)(CTX),
((key_table + (24) * 8) + 0)(CTX),
((key_table + (24) * 8) + 4)(CTX));
ENDPROC(__camellia_dec_blk32)
ENTRY(camellia_ecb_enc_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX));
/* now dst can be used as temporary buffer (even in src == dst case) */
call __camellia_enc_blk32;
write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_ecb_enc_32way)
ENTRY(camellia_ecb_dec_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
cmpl $16, key_length(CTX);
cmovel %eax, %r8d; /* max */
inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX, %r8, 8));
/* now dst can be used as temporary buffer (even in src == dst case) */
call __camellia_dec_blk32;
write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_ecb_dec_32way)
ENTRY(camellia_cbc_dec_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
cmpl $16, key_length(CTX);
cmovel %eax, %r8d; /* max */
inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX, %r8, 8));
je .Lcbc_dec_use_stack;
/* dst can be used as temporary storage, src is not overwritten. */
jmp .Lcbc_dec_continue;
* dst still in use (because dst == src), so use the stack for temporary storage.
subq $(16 * 32), %rsp;
call __camellia_dec_blk32;
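
/*
 * CBC chaining: each decrypted block is XORed with the previous
 * ciphertext block.  %ymm7 holds plaintext blocks 0 and 1; the
 * vinserti128 sequence builds the vector { 0, C0 } so that block 1 is
 * XORed with C0 while block 0 passes through unchanged (its IV XOR is
 * left to the caller).  The remaining registers are XORed with the
 * source stream shifted by one block (offset +16).
 */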
vmovdqu %ymm7, (%rax);
vpxor %ymm7, %ymm7, %ymm7;
vinserti128 $1, (%rdx), %ymm7, %ymm7;
vpxor (%rax), %ymm7, %ymm7;
vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_cbc_dec_32way)
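
/*
 * 128-bit little-endian counter increment.  The minus_one/minus_two
 * constants hold -1/-2 only in the low qword, so vpsubq adds 1 (or 2) to
 * the low 64 bits; vpcmpeqq flags a low qword that is about to wrap, and
 * that mask, shifted into the high qword, is subtracted to propagate the
 * carry.  add2_le128 steps both 128-bit lanes of a ymm register by 2.
 */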
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
vpslldq $8, tmp, tmp; \
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
vpcmpeqq minus_one, x, tmp1; \
vpcmpeqq minus_two, x, tmp2; \
vpsubq minus_two, x, x; \
vpor tmp2, tmp1, tmp1; \
vpslldq $8, tmp1, tmp1; \
ENTRY(camellia_ctr_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
* %rcx: iv (little endian, 128bit)
/* dst can be used as temporary storage, src is not overwritten. */
subq $(16 * 32), %rsp;
vpcmpeqd %ymm15, %ymm15, %ymm15;
vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
/* load IV and byteswap */
vmovdqu (%rcx), %xmm0;
vmovdqa %xmm0, %xmm1;
inc_le128(%xmm0, %xmm15, %xmm14);
vbroadcasti128 .Lbswap128_mask, %ymm14;
vinserti128 $1, %xmm0, %ymm1, %ymm0;
vpshufb %ymm14, %ymm0, %ymm13;
vmovdqu %ymm13, 15 * 32(%rax);
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
vpshufb %ymm14, %ymm0, %ymm13;
vmovdqu %ymm13, 14 * 32(%rax);
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm13;
vmovdqu %ymm13, 13 * 32(%rax);
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm13;
vmovdqu %ymm13, 12 * 32(%rax);
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm13;
vmovdqu %ymm13, 11 * 32(%rax);
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm10;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm9;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm8;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm7;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm6;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm5;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm4;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm3;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm2;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vpshufb %ymm14, %ymm0, %ymm1;
add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
vextracti128 $1, %ymm0, %xmm13;
vpshufb %ymm14, %ymm0, %ymm0;
inc_le128(%xmm13, %xmm15, %xmm14);
vmovdqu %xmm13, (%rcx);
vpbroadcastq (key_table)(CTX), %ymm15;
vpshufb .Lpack_bswap, %ymm15, %ymm15;
vpxor %ymm0, %ymm15, %ymm0;
vpxor %ymm1, %ymm15, %ymm1;
vpxor %ymm2, %ymm15, %ymm2;
vpxor %ymm3, %ymm15, %ymm3;
vpxor %ymm4, %ymm15, %ymm4;
vpxor %ymm5, %ymm15, %ymm5;
vpxor %ymm6, %ymm15, %ymm6;
vpxor %ymm7, %ymm15, %ymm7;
vpxor %ymm8, %ymm15, %ymm8;
vpxor %ymm9, %ymm15, %ymm9;
vpxor %ymm10, %ymm15, %ymm10;
vpxor 11 * 32(%rax), %ymm15, %ymm11;
vpxor 12 * 32(%rax), %ymm15, %ymm12;
vpxor 13 * 32(%rax), %ymm15, %ymm13;
vpxor 14 * 32(%rax), %ymm15, %ymm14;
vpxor 15 * 32(%rax), %ymm15, %ymm15;
call __camellia_enc_blk32;
vpxor 0 * 32(%rdx), %ymm7, %ymm7;
vpxor 1 * 32(%rdx), %ymm6, %ymm6;
vpxor 2 * 32(%rdx), %ymm5, %ymm5;
vpxor 3 * 32(%rdx), %ymm4, %ymm4;
vpxor 4 * 32(%rdx), %ymm3, %ymm3;
vpxor 5 * 32(%rdx), %ymm2, %ymm2;
vpxor 6 * 32(%rdx), %ymm1, %ymm1;
vpxor 7 * 32(%rdx), %ymm0, %ymm0;
vpxor 8 * 32(%rdx), %ymm15, %ymm15;
vpxor 9 * 32(%rdx), %ymm14, %ymm14;
vpxor 10 * 32(%rdx), %ymm13, %ymm13;
vpxor 11 * 32(%rdx), %ymm12, %ymm12;
vpxor 12 * 32(%rdx), %ymm11, %ymm11;
vpxor 13 * 32(%rdx), %ymm10, %ymm10;
vpxor 14 * 32(%rdx), %ymm9, %ymm9;
vpxor 15 * 32(%rdx), %ymm8, %ymm8;
write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_ctr_32way)
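
/*
 * XTS tweak update in "ble" format: multiply the 128-bit tweak by x (or
 * x^2 in the two-blocks-per-register variant) in GF(2^128).  vpsrad $31
 * plus vpshufd $0x13 extracts the bits that fall out of the shift, and
 * the .Lxts_gf128mul_and_shl1_mask_* constants turn them into the
 * cross-qword carry and the 0x87 reduction term that are XORed back in.
 */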
#define gf128mul_x_ble(iv, mask, tmp) \
vpsrad $31, iv, tmp; \
vpaddq iv, iv, iv; \
vpshufd $0x13, tmp, tmp; \
vpand mask, tmp, tmp; \
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
vpsrad $31, iv, tmp0; \
vpaddq iv, iv, tmp1; \
vpsllq $2, iv, iv; \
vpshufd $0x13, tmp0, tmp0; \
vpsrad $31, tmp1, tmp1; \
vpand mask2, tmp0, tmp0; \
vpshufd $0x13, tmp1, tmp1; \
vpxor tmp0, iv, iv; \
vpand mask1, tmp1, tmp1; \
camellia_xts_crypt_32way:
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
* %r8: index for input whitening key
* %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
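
/*
 * Flow: the 32 tweaks are generated from the IV by repeated doubling and
 * written to dst, each source block is XORed with its tweak (spilling a
 * few to the stack), the encryption or decryption core pointed to by %r9
 * is run, and the outputs are XORed once more with the tweaks read back
 * from dst before the final store.  The IV for the next call is written
 * back to (%rcx).
 */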
subq $(16 * 32), %rsp;
vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
/* load IV and construct second IV */
vmovdqu (%rcx), %xmm0;
vmovdqa %xmm0, %xmm15;
gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
vinserti128 $1, %xmm0, %ymm15, %ymm0;
vpxor 0 * 32(%rdx), %ymm0, %ymm15;
vmovdqu %ymm15, 15 * 32(%rax);
vmovdqu %ymm0, 0 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 1 * 32(%rdx), %ymm0, %ymm15;
vmovdqu %ymm15, 14 * 32(%rax);
vmovdqu %ymm0, 1 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 2 * 32(%rdx), %ymm0, %ymm15;
vmovdqu %ymm15, 13 * 32(%rax);
vmovdqu %ymm0, 2 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 3 * 32(%rdx), %ymm0, %ymm15;
vmovdqu %ymm15, 12 * 32(%rax);
vmovdqu %ymm0, 3 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 4 * 32(%rdx), %ymm0, %ymm11;
vmovdqu %ymm0, 4 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 5 * 32(%rdx), %ymm0, %ymm10;
vmovdqu %ymm0, 5 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 6 * 32(%rdx), %ymm0, %ymm9;
vmovdqu %ymm0, 6 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 7 * 32(%rdx), %ymm0, %ymm8;
vmovdqu %ymm0, 7 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 8 * 32(%rdx), %ymm0, %ymm7;
vmovdqu %ymm0, 8 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 9 * 32(%rdx), %ymm0, %ymm6;
vmovdqu %ymm0, 9 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 10 * 32(%rdx), %ymm0, %ymm5;
vmovdqu %ymm0, 10 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 11 * 32(%rdx), %ymm0, %ymm4;
vmovdqu %ymm0, 11 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 12 * 32(%rdx), %ymm0, %ymm3;
vmovdqu %ymm0, 12 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 13 * 32(%rdx), %ymm0, %ymm2;
vmovdqu %ymm0, 13 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 14 * 32(%rdx), %ymm0, %ymm1;
vmovdqu %ymm0, 14 * 32(%rsi);
gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
vpxor 15 * 32(%rdx), %ymm0, %ymm15;
vmovdqu %ymm15, 0 * 32(%rax);
vmovdqu %ymm0, 15 * 32(%rsi);
vextracti128 $1, %ymm0, %xmm0;
gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
vmovdqu %xmm0, (%rcx);
vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
vpshufb .Lpack_bswap, %ymm15, %ymm15;
vpxor 0 * 32(%rax), %ymm15, %ymm0;
vpxor %ymm1, %ymm15, %ymm1;
vpxor %ymm2, %ymm15, %ymm2;
vpxor %ymm3, %ymm15, %ymm3;
vpxor %ymm4, %ymm15, %ymm4;
vpxor %ymm5, %ymm15, %ymm5;
vpxor %ymm6, %ymm15, %ymm6;
vpxor %ymm7, %ymm15, %ymm7;
vpxor %ymm8, %ymm15, %ymm8;
vpxor %ymm9, %ymm15, %ymm9;
vpxor %ymm10, %ymm15, %ymm10;
vpxor %ymm11, %ymm15, %ymm11;
vpxor 12 * 32(%rax), %ymm15, %ymm12;
vpxor 13 * 32(%rax), %ymm15, %ymm13;
vpxor 14 * 32(%rax), %ymm15, %ymm14;
vpxor 15 * 32(%rax), %ymm15, %ymm15;
addq $(16 * 32), %rsp;
vpxor 0 * 32(%rsi), %ymm7, %ymm7;
vpxor 1 * 32(%rsi), %ymm6, %ymm6;
vpxor 2 * 32(%rsi), %ymm5, %ymm5;
vpxor 3 * 32(%rsi), %ymm4, %ymm4;
vpxor 4 * 32(%rsi), %ymm3, %ymm3;
vpxor 5 * 32(%rsi), %ymm2, %ymm2;
vpxor 6 * 32(%rsi), %ymm1, %ymm1;
vpxor 7 * 32(%rsi), %ymm0, %ymm0;
vpxor 8 * 32(%rsi), %ymm15, %ymm15;
vpxor 9 * 32(%rsi), %ymm14, %ymm14;
vpxor 10 * 32(%rsi), %ymm13, %ymm13;
vpxor 11 * 32(%rsi), %ymm12, %ymm12;
vpxor 12 * 32(%rsi), %ymm11, %ymm11;
vpxor 13 * 32(%rsi), %ymm10, %ymm10;
vpxor 14 * 32(%rsi), %ymm9, %ymm9;
vpxor 15 * 32(%rsi), %ymm8, %ymm8;
write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_xts_crypt_32way)
ENTRY(camellia_xts_enc_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
xorl %r8d, %r8d; /* input whitening key, 0 for enc */
leaq __camellia_enc_blk32, %r9;
jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_enc_32way)
ENTRY(camellia_xts_dec_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
cmpl $16, key_length(CTX);
cmovel %eax, %r8d; /* input whitening key, last for dec */
leaq __camellia_dec_blk32, %r9;
jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_dec_32way)