2 * x86_64/AVX2/AES-NI assembler implementation of Camellia
4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
13 #include <linux/linkage.h>
14 #include <asm/frame.h>
16 #define CAMELLIA_TABLE_BYTE_LEN 272
18 /* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN
/**********************************************************************
  helper macros
 **********************************************************************/
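/*
 * filter_8bit applies an 8-bit to 8-bit lookup split across two 16-entry
 * vpshufb tables: lo_t is indexed by the low nibble of each byte and hi_t
 * by the high nibble, and the two results are xored.  Informally, per
 * byte:
 *
 *   out = lo_t[in & 0x0f] ^ hi_t[in >> 4];
 *
 * mask4bit must hold 0x0f in every byte (.L0f0f0f0f); tmp0 is clobbered.
 */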
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpand x, mask4bit, tmp0; \
vpandn x, mask4bit, x; \
vpsrld $4, x, x; \
vpshufb tmp0, lo_t, tmp0; \
vpshufb x, hi_t, x; \
vpxor tmp0, x, x;
/**********************************************************************
  32-way camellia
 **********************************************************************/
61 * x0..x7: byte-sliced AB state
62 * mem_cd: register pointer storing CD state
63 * key: index for key material
65 * x0..x7: new byte-sliced CD state
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
t7, mem_cd, key) \
/* S-function with AES subbytes */ \
72 vbroadcasti128 .Linv_shift_row, t4; \
73 vpbroadcastd .L0f0f0f0f, t7; \
74 vbroadcasti128 .Lpre_tf_lo_s1, t5; \
75 vbroadcasti128 .Lpre_tf_hi_s1, t6; \
76 vbroadcasti128 .Lpre_tf_lo_s4, t2; \
77 vbroadcasti128 .Lpre_tf_hi_s4, t3; \
79 /* AES inverse shift rows */ \
89 /* prefilter sboxes 1, 2 and 3 */ \
90 /* prefilter sbox 4 */ \
91 filter_8bit(x0, t5, t6, t7, t4); \
92 filter_8bit(x7, t5, t6, t7, t4); \
93 vextracti128 $1, x0, t0##_x; \
94 vextracti128 $1, x7, t1##_x; \
95 filter_8bit(x3, t2, t3, t7, t4); \
96 filter_8bit(x6, t2, t3, t7, t4); \
97 vextracti128 $1, x3, t3##_x; \
98 vextracti128 $1, x6, t2##_x; \
99 filter_8bit(x2, t5, t6, t7, t4); \
100 filter_8bit(x5, t5, t6, t7, t4); \
101 filter_8bit(x1, t5, t6, t7, t4); \
102 filter_8bit(x4, t5, t6, t7, t4); \
104 vpxor t4##_x, t4##_x, t4##_x; \
106 /* AES subbytes + AES shift rows */ \
107 vextracti128 $1, x2, t6##_x; \
108 vextracti128 $1, x5, t5##_x; \
109 vaesenclast t4##_x, x0##_x, x0##_x; \
110 vaesenclast t4##_x, t0##_x, t0##_x; \
111 vinserti128 $1, t0##_x, x0, x0; \
112 vaesenclast t4##_x, x7##_x, x7##_x; \
113 vaesenclast t4##_x, t1##_x, t1##_x; \
114 vinserti128 $1, t1##_x, x7, x7; \
115 vaesenclast t4##_x, x3##_x, x3##_x; \
116 vaesenclast t4##_x, t3##_x, t3##_x; \
117 vinserti128 $1, t3##_x, x3, x3; \
118 vaesenclast t4##_x, x6##_x, x6##_x; \
119 vaesenclast t4##_x, t2##_x, t2##_x; \
120 vinserti128 $1, t2##_x, x6, x6; \
121 vextracti128 $1, x1, t3##_x; \
122 vextracti128 $1, x4, t2##_x; \
123 vbroadcasti128 .Lpost_tf_lo_s1, t0; \
124 vbroadcasti128 .Lpost_tf_hi_s1, t1; \
125 vaesenclast t4##_x, x2##_x, x2##_x; \
126 vaesenclast t4##_x, t6##_x, t6##_x; \
127 vinserti128 $1, t6##_x, x2, x2; \
128 vaesenclast t4##_x, x5##_x, x5##_x; \
129 vaesenclast t4##_x, t5##_x, t5##_x; \
130 vinserti128 $1, t5##_x, x5, x5; \
131 vaesenclast t4##_x, x1##_x, x1##_x; \
132 vaesenclast t4##_x, t3##_x, t3##_x; \
133 vinserti128 $1, t3##_x, x1, x1; \
134 vaesenclast t4##_x, x4##_x, x4##_x; \
135 vaesenclast t4##_x, t2##_x, t2##_x; \
136 vinserti128 $1, t2##_x, x4, x4; \
138 /* postfilter sboxes 1 and 4 */ \
139 vbroadcasti128 .Lpost_tf_lo_s3, t2; \
140 vbroadcasti128 .Lpost_tf_hi_s3, t3; \
141 filter_8bit(x0, t0, t1, t7, t6); \
142 filter_8bit(x7, t0, t1, t7, t6); \
143 filter_8bit(x3, t0, t1, t7, t6); \
144 filter_8bit(x6, t0, t1, t7, t6); \
146 /* postfilter sbox 3 */ \
147 vbroadcasti128 .Lpost_tf_lo_s2, t4; \
148 vbroadcasti128 .Lpost_tf_hi_s2, t5; \
149 filter_8bit(x2, t2, t3, t7, t6); \
150 filter_8bit(x5, t2, t3, t7, t6); \
152 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
154 /* postfilter sbox 2 */ \
155 filter_8bit(x1, t4, t5, t7, t2); \
156 filter_8bit(x4, t4, t5, t7, t2); \
159 vpsrldq $1, t0, t1; \
160 vpsrldq $2, t0, t2; \
161 vpshufb t7, t1, t1; \
162 vpsrldq $3, t0, t3; \
170 vpshufb t7, t2, t2; \
171 vpsrldq $4, t0, t4; \
172 vpshufb t7, t3, t3; \
173 vpsrldq $5, t0, t5; \
174 vpshufb t7, t4, t4; \
181 vpsrldq $6, t0, t6; \
182 vpshufb t7, t5, t5; \
183 vpshufb t7, t6, t6; \
193 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
195 /* Add key material and result to CD (x becomes new CD) */ \
198 vpxor 5 * 32(mem_cd), x1, x1; \
200 vpsrldq $7, t0, t6; \
201 vpshufb t7, t0, t0; \
202 vpshufb t7, t6, t7; \
205 vpxor 4 * 32(mem_cd), x0, x0; \
208 vpxor 6 * 32(mem_cd), x2, x2; \
211 vpxor 7 * 32(mem_cd), x3, x3; \
214 vpxor 0 * 32(mem_cd), x4, x4; \
217 vpxor 1 * 32(mem_cd), x5, x5; \
220 vpxor 2 * 32(mem_cd), x6, x6; \
223 vpxor 3 * 32(mem_cd), x7, x7;
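/*
 * In short, roundsm32 above evaluates the Camellia s-boxes through
 * AESENCLAST with an all-zero round key: the .Linv_shift_row shuffle
 * applied beforehand cancels the ShiftRows step, leaving pure AES
 * SubBytes, and the pre-/postfilter tables translate bytes between the
 * Camellia and AES s-box domains.  The vpxor chain that follows is
 * Camellia's byte-wise linear P-function, and the final group of vpxors
 * adds the round-key bytes and folds the result into the CD half loaded
 * from mem_cd.
 */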
/*
 * Size optimization... with inlined roundsm32, the binary would be over 5
 * times larger and only marginally faster.
 */
230 roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
231 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
232 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
235 ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
238 roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
239 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
240 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
243 ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
247 * x0..x7: byte-sliced AB state preloaded
248 * mem_ab: byte-sliced AB state in memory
 * mem_cd: byte-sliced CD state in memory
251 #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
252 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
253 leaq (key_table + (i) * 8)(CTX), %r9; \
254 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
256 vmovdqu x0, 4 * 32(mem_cd); \
257 vmovdqu x1, 5 * 32(mem_cd); \
258 vmovdqu x2, 6 * 32(mem_cd); \
259 vmovdqu x3, 7 * 32(mem_cd); \
260 vmovdqu x4, 0 * 32(mem_cd); \
261 vmovdqu x5, 1 * 32(mem_cd); \
262 vmovdqu x6, 2 * 32(mem_cd); \
263 vmovdqu x7, 3 * 32(mem_cd); \
265 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
266 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
268 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
270 #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
272 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
273 /* Store new AB state */ \
274 vmovdqu x4, 4 * 32(mem_ab); \
275 vmovdqu x5, 5 * 32(mem_ab); \
276 vmovdqu x6, 6 * 32(mem_ab); \
277 vmovdqu x7, 7 * 32(mem_ab); \
278 vmovdqu x0, 0 * 32(mem_ab); \
279 vmovdqu x1, 1 * 32(mem_ab); \
280 vmovdqu x2, 2 * 32(mem_ab); \
281 vmovdqu x3, 3 * 32(mem_ab);
283 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
284 y6, y7, mem_ab, mem_cd, i) \
285 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
286 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
287 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
288 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
289 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
290 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
292 #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
293 y6, y7, mem_ab, mem_cd, i) \
294 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
295 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
296 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
297 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
298 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
299 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
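/*
 * enc_rounds32 and dec_rounds32 each run six Feistel rounds (three calls
 * to two_roundsm32).  The code path is identical; only the direction in
 * which the 64-bit subkey index moves through key_table differs: forward
 * for encryption, backward for decryption.
 */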
303 * v0..3: byte-sliced 32-bit integers
307 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
308 vpcmpgtb v0, zero, t0; \
312 vpcmpgtb v1, zero, t1; \
316 vpcmpgtb v2, zero, t2; \
322 vpcmpgtb v3, zero, t0; \
 * l: part of byte-sliced AB state in memory
 * r: byte-sliced CD state in memory
335 * x0..x7: new byte-sliced CD state
337 #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
338 tt1, tt2, tt3, kll, klr, krl, krr) \
342 * lr ^= rol32(t0, 1); \
344 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
345 vpxor tt0, tt0, tt0; \
346 vpshufb tt0, t0, t3; \
347 vpsrldq $1, t0, t0; \
348 vpshufb tt0, t0, t2; \
349 vpsrldq $1, t0, t0; \
350 vpshufb tt0, t0, t1; \
351 vpsrldq $1, t0, t0; \
352 vpshufb tt0, t0, t0; \
359 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
362 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
363 vmovdqu l4, 4 * 32(l); \
365 vmovdqu l5, 5 * 32(l); \
367 vmovdqu l6, 6 * 32(l); \
369 vmovdqu l7, 7 * 32(l); \
377 vpshufb tt0, t0, t3; \
378 vpsrldq $1, t0, t0; \
379 vpshufb tt0, t0, t2; \
380 vpsrldq $1, t0, t0; \
381 vpshufb tt0, t0, t1; \
382 vpsrldq $1, t0, t0; \
383 vpshufb tt0, t0, t0; \
385 vpor 4 * 32(r), t0, t0; \
386 vpor 5 * 32(r), t1, t1; \
387 vpor 6 * 32(r), t2, t2; \
388 vpor 7 * 32(r), t3, t3; \
390 vpxor 0 * 32(r), t0, t0; \
391 vpxor 1 * 32(r), t1, t1; \
392 vpxor 2 * 32(r), t2, t2; \
393 vpxor 3 * 32(r), t3, t3; \
394 vmovdqu t0, 0 * 32(r); \
395 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
396 vmovdqu t1, 1 * 32(r); \
397 vmovdqu t2, 2 * 32(r); \
398 vmovdqu t3, 3 * 32(r); \
403 * rr ^= rol32(t2, 1); \
405 vpshufb tt0, t0, t3; \
406 vpsrldq $1, t0, t0; \
407 vpshufb tt0, t0, t2; \
408 vpsrldq $1, t0, t0; \
409 vpshufb tt0, t0, t1; \
410 vpsrldq $1, t0, t0; \
411 vpshufb tt0, t0, t0; \
413 vpand 0 * 32(r), t0, t0; \
414 vpand 1 * 32(r), t1, t1; \
415 vpand 2 * 32(r), t2, t2; \
416 vpand 3 * 32(r), t3, t3; \
418 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
420 vpxor 4 * 32(r), t0, t0; \
421 vpxor 5 * 32(r), t1, t1; \
422 vpxor 6 * 32(r), t2, t2; \
423 vpxor 7 * 32(r), t3, t3; \
424 vmovdqu t0, 4 * 32(r); \
425 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
426 vmovdqu t1, 5 * 32(r); \
427 vmovdqu t2, 6 * 32(r); \
428 vmovdqu t3, 7 * 32(r); \
436 vpshufb tt0, t0, t3; \
437 vpsrldq $1, t0, t0; \
438 vpshufb tt0, t0, t2; \
439 vpsrldq $1, t0, t0; \
440 vpshufb tt0, t0, t1; \
441 vpsrldq $1, t0, t0; \
442 vpshufb tt0, t0, t0; \
450 vmovdqu l0, 0 * 32(l); \
452 vmovdqu l1, 1 * 32(l); \
454 vmovdqu l2, 2 * 32(l); \
456 vmovdqu l3, 3 * 32(l);
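/*
 * fls32 above is the byte-sliced FL/FL⁻¹ layer.  In terms of the 32-bit
 * words ll/lr (held in l0..l7) and rl/rr (held in memory at r), it
 * computes, in this order:
 *
 *   lr ^= rol32(ll & kll, 1);
 *   rl ^= (rr | krr);
 *   rr ^= rol32(rl & krl, 1);
 *   ll ^= (lr | klr);
 *
 * with the one-bit rotates of the byte-sliced words done by rol32_1_32.
 */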
458 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
459 vpunpckhdq x1, x0, t2; \
460 vpunpckldq x1, x0, x0; \
462 vpunpckldq x3, x2, t1; \
463 vpunpckhdq x3, x2, x2; \
465 vpunpckhqdq t1, x0, x1; \
466 vpunpcklqdq t1, x0, x0; \
468 vpunpckhqdq x2, t2, x3; \
469 vpunpcklqdq x2, t2, x2;
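/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit elements with the usual
 * unpckldq/unpckhdq followed by unpcklqdq/unpckhqdq pattern, independently
 * within each 128-bit lane.
 */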
471 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
472 a3, b3, c3, d3, st0, st1) \
475 transpose_4x4(a0, a1, a2, a3, d2, d3); \
476 transpose_4x4(b0, b1, b2, b3, d2, d3); \
482 transpose_4x4(c0, c1, c2, c3, a0, a1); \
483 transpose_4x4(d0, d1, d2, d3, a0, a1); \
485 vbroadcasti128 .Lshufb_16x16b, a0; \
487 vpshufb a0, a2, a2; \
488 vpshufb a0, a3, a3; \
489 vpshufb a0, b0, b0; \
490 vpshufb a0, b1, b1; \
491 vpshufb a0, b2, b2; \
492 vpshufb a0, b3, b3; \
493 vpshufb a0, a1, a1; \
494 vpshufb a0, c0, c0; \
495 vpshufb a0, c1, c1; \
496 vpshufb a0, c2, c2; \
497 vpshufb a0, c3, c3; \
498 vpshufb a0, d0, d0; \
499 vpshufb a0, d1, d1; \
500 vpshufb a0, d2, d2; \
501 vpshufb a0, d3, d3; \
504 vpshufb a0, d3, a0; \
507 transpose_4x4(a0, b0, c0, d0, d2, d3); \
508 transpose_4x4(a1, b1, c1, d1, d2, d3); \
514 transpose_4x4(a2, b2, c2, d2, b0, b1); \
515 transpose_4x4(a3, b3, c3, d3, b0, b1); \
518 /* does not adjust output bytes inside vectors */
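/*
 * byteslice_16x16b_fast transposes a 16x16 byte matrix within each 128-bit
 * lane (using transpose_4x4 plus the .Lshufb_16x16b shuffle), so that each
 * of the 16 registers ends up holding a single byte position of every
 * block.  This is the byte-sliced representation the round macros above
 * operate on.
 */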
520 /* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
523 vpbroadcastq key, x0; \
524 vpshufb .Lpack_bswap, x0, x0; \
526 vpxor 0 * 32(rio), x0, y7; \
527 vpxor 1 * 32(rio), x0, y6; \
528 vpxor 2 * 32(rio), x0, y5; \
529 vpxor 3 * 32(rio), x0, y4; \
530 vpxor 4 * 32(rio), x0, y3; \
531 vpxor 5 * 32(rio), x0, y2; \
532 vpxor 6 * 32(rio), x0, y1; \
533 vpxor 7 * 32(rio), x0, y0; \
534 vpxor 8 * 32(rio), x0, x7; \
535 vpxor 9 * 32(rio), x0, x6; \
536 vpxor 10 * 32(rio), x0, x5; \
537 vpxor 11 * 32(rio), x0, x4; \
538 vpxor 12 * 32(rio), x0, x3; \
539 vpxor 13 * 32(rio), x0, x2; \
540 vpxor 14 * 32(rio), x0, x1; \
541 vpxor 15 * 32(rio), x0, x0;
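/*
 * inpack32_pre broadcasts the 64-bit whitening key from key_table, formats
 * it with .Lpack_bswap and xors it into the 32 input blocks (two per ymm
 * register) as they are loaded from rio.  Note the reversed register
 * assignment (y7 receives block 0); this is where the "order swapped"
 * noted for the blk32 routines' output comes from.
 */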
543 /* byteslice pre-whitened blocks and store to temporary memory */
544 #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
545 y6, y7, mem_ab, mem_cd) \
546 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
547 y4, y5, y6, y7, (mem_ab), (mem_cd)); \
549 vmovdqu x0, 0 * 32(mem_ab); \
550 vmovdqu x1, 1 * 32(mem_ab); \
551 vmovdqu x2, 2 * 32(mem_ab); \
552 vmovdqu x3, 3 * 32(mem_ab); \
553 vmovdqu x4, 4 * 32(mem_ab); \
554 vmovdqu x5, 5 * 32(mem_ab); \
555 vmovdqu x6, 6 * 32(mem_ab); \
556 vmovdqu x7, 7 * 32(mem_ab); \
557 vmovdqu y0, 0 * 32(mem_cd); \
558 vmovdqu y1, 1 * 32(mem_cd); \
559 vmovdqu y2, 2 * 32(mem_cd); \
560 vmovdqu y3, 3 * 32(mem_cd); \
561 vmovdqu y4, 4 * 32(mem_cd); \
562 vmovdqu y5, 5 * 32(mem_cd); \
563 vmovdqu y6, 6 * 32(mem_cd); \
564 vmovdqu y7, 7 * 32(mem_cd);
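/*
 * The 512-byte scratch area passed to the blk32 routines is split in half:
 * mem_ab is the first 8 * 32 bytes and mem_cd the second (see the
 * "leaq 8 * 32(%rax), %rcx" at the top of __camellia_enc_blk32 and
 * __camellia_dec_blk32).
 */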
566 /* de-byteslice, apply post-whitening and store blocks */
567 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
568 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
569 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
570 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
572 vmovdqu x0, stack_tmp0; \
574 vpbroadcastq key, x0; \
575 vpshufb .Lpack_bswap, x0, x0; \
592 vpxor stack_tmp0, x0, x0;
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio) \
596 vmovdqu x0, 0 * 32(rio); \
597 vmovdqu x1, 1 * 32(rio); \
598 vmovdqu x2, 2 * 32(rio); \
599 vmovdqu x3, 3 * 32(rio); \
600 vmovdqu x4, 4 * 32(rio); \
601 vmovdqu x5, 5 * 32(rio); \
602 vmovdqu x6, 6 * 32(rio); \
603 vmovdqu x7, 7 * 32(rio); \
604 vmovdqu y0, 8 * 32(rio); \
605 vmovdqu y1, 9 * 32(rio); \
606 vmovdqu y2, 10 * 32(rio); \
607 vmovdqu y3, 11 * 32(rio); \
608 vmovdqu y4, 12 * 32(rio); \
609 vmovdqu y5, 13 * 32(rio); \
610 vmovdqu y6, 14 * 32(rio); \
611 vmovdqu y7, 15 * 32(rio);
616 #define SHUFB_BYTES(idx) \
617 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
620 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
621 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
624 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
625 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
627 /* For CTR-mode IV byteswap */
629 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
632 .Lxts_gf128mul_and_shl1_mask_0:
633 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
634 .Lxts_gf128mul_and_shl1_mask_1:
635 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
638 * pre-SubByte transform
640 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in))))
 *
 * (note: '⊕ 0xc5' inside camellia_f())
652 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
653 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
655 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
656 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
659 * pre-SubByte transform
661 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1))))
 *
 * (note: '⊕ 0xc5' inside camellia_f())
673 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
674 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
676 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
677 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
680 * post-SubByte transform
682 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)))))
 *
 * (note: '⊕ 0x6e' inside camellia_h())
696 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
697 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
699 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
700 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
703 * post-SubByte transform
705 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)))))
 *
 * (note: '⊕ 0x6e' inside camellia_h())
719 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
720 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
722 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
723 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
726 * post-SubByte transform
728 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)))))
 *
 * (note: '⊕ 0x6e' inside camellia_h())
742 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
743 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
745 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
746 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
748 /* For isolating SubBytes from AESENCLAST, inverse shift row */
750 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
751 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
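/*
 * AESENCLAST computes AddRoundKey(SubBytes(ShiftRows(state))).  Feeding it
 * data that was pre-shuffled with this inverse ShiftRows table, together
 * with an all-zero round key, therefore leaves plain AES SubBytes, which
 * the pre-/postfilter tables above map to and from the Camellia s-boxes.
 */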
761 __camellia_enc_blk32:
764 * %rax: temporary storage, 512 bytes
 * %ymm0..%ymm15: 32 plaintext blocks
 * output:
 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
 *  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
772 leaq 8 * 32(%rax), %rcx;
774 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
775 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
778 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
779 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
780 %ymm15, %rax, %rcx, 0);
782 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
783 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
785 ((key_table + (8) * 8) + 0)(CTX),
786 ((key_table + (8) * 8) + 4)(CTX),
787 ((key_table + (8) * 8) + 8)(CTX),
788 ((key_table + (8) * 8) + 12)(CTX));
790 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
791 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
792 %ymm15, %rax, %rcx, 8);
794 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
795 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
797 ((key_table + (16) * 8) + 0)(CTX),
798 ((key_table + (16) * 8) + 4)(CTX),
799 ((key_table + (16) * 8) + 8)(CTX),
800 ((key_table + (16) * 8) + 12)(CTX));
802 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
803 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
804 %ymm15, %rax, %rcx, 16);
807 cmpl $16, key_length(CTX);
811 /* load CD for output */
812 vmovdqu 0 * 32(%rcx), %ymm8;
813 vmovdqu 1 * 32(%rcx), %ymm9;
814 vmovdqu 2 * 32(%rcx), %ymm10;
815 vmovdqu 3 * 32(%rcx), %ymm11;
816 vmovdqu 4 * 32(%rcx), %ymm12;
817 vmovdqu 5 * 32(%rcx), %ymm13;
818 vmovdqu 6 * 32(%rcx), %ymm14;
819 vmovdqu 7 * 32(%rcx), %ymm15;
821 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
822 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
823 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
832 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
833 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
835 ((key_table + (24) * 8) + 0)(CTX),
836 ((key_table + (24) * 8) + 4)(CTX),
837 ((key_table + (24) * 8) + 8)(CTX),
838 ((key_table + (24) * 8) + 12)(CTX));
840 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
841 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
842 %ymm15, %rax, %rcx, 24);
845 ENDPROC(__camellia_enc_blk32)
848 __camellia_dec_blk32:
851 * %rax: temporary storage, 512 bytes
 * %r8d: 24 for 16-byte key, 32 for larger
 * %ymm0..%ymm15: 32 encrypted blocks
 * output:
 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
 *  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
860 leaq 8 * 32(%rax), %rcx;
862 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
863 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
870 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
871 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
872 %ymm15, %rax, %rcx, 16);
874 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
875 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
877 ((key_table + (16) * 8) + 8)(CTX),
878 ((key_table + (16) * 8) + 12)(CTX),
879 ((key_table + (16) * 8) + 0)(CTX),
880 ((key_table + (16) * 8) + 4)(CTX));
882 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
883 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
884 %ymm15, %rax, %rcx, 8);
886 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
887 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
889 ((key_table + (8) * 8) + 8)(CTX),
890 ((key_table + (8) * 8) + 12)(CTX),
891 ((key_table + (8) * 8) + 0)(CTX),
892 ((key_table + (8) * 8) + 4)(CTX));
894 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
895 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
896 %ymm15, %rax, %rcx, 0);
898 /* load CD for output */
899 vmovdqu 0 * 32(%rcx), %ymm8;
900 vmovdqu 1 * 32(%rcx), %ymm9;
901 vmovdqu 2 * 32(%rcx), %ymm10;
902 vmovdqu 3 * 32(%rcx), %ymm11;
903 vmovdqu 4 * 32(%rcx), %ymm12;
904 vmovdqu 5 * 32(%rcx), %ymm13;
905 vmovdqu 6 * 32(%rcx), %ymm14;
906 vmovdqu 7 * 32(%rcx), %ymm15;
908 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
909 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
910 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
917 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
918 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
919 %ymm15, %rax, %rcx, 24);
921 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
922 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
924 ((key_table + (24) * 8) + 8)(CTX),
925 ((key_table + (24) * 8) + 12)(CTX),
926 ((key_table + (24) * 8) + 0)(CTX),
927 ((key_table + (24) * 8) + 4)(CTX));
930 ENDPROC(__camellia_dec_blk32)
932 ENTRY(camellia_ecb_enc_32way)
935 * %rsi: dst (32 blocks)
936 * %rdx: src (32 blocks)
942 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
943 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
944 %ymm15, %rdx, (key_table)(CTX));
946 /* now dst can be used as temporary buffer (even in src == dst case) */
949 call __camellia_enc_blk32;
951 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
952 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
959 ENDPROC(camellia_ecb_enc_32way)
961 ENTRY(camellia_ecb_dec_32way)
964 * %rsi: dst (32 blocks)
965 * %rdx: src (32 blocks)
971 cmpl $16, key_length(CTX);
974 cmovel %eax, %r8d; /* max */
976 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
977 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
978 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
980 /* now dst can be used as temporary buffer (even in src == dst case) */
983 call __camellia_dec_blk32;
985 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
986 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
993 ENDPROC(camellia_ecb_dec_32way)
995 ENTRY(camellia_cbc_dec_32way)
998 * %rsi: dst (32 blocks)
999 * %rdx: src (32 blocks)
1005 cmpl $16, key_length(CTX);
1008 cmovel %eax, %r8d; /* max */
1010 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1011 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1012 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1016 je .Lcbc_dec_use_stack;
1018 /* dst can be used as temporary storage, src is not overwritten. */
1020 jmp .Lcbc_dec_continue;
1022 .Lcbc_dec_use_stack:
/*
 * dst still in-use (because dst == src), so use stack for temporary
 * storage.
 */
1027 subq $(16 * 32), %rsp;
1031 call __camellia_dec_blk32;
1033 vmovdqu %ymm7, (%rax);
1034 vpxor %ymm7, %ymm7, %ymm7;
1035 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1036 vpxor (%rax), %ymm7, %ymm7;
1038 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1039 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1040 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1041 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1042 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1043 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1044 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1045 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1046 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1047 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1048 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1049 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1050 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1051 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1052 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1053 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1054 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1061 ENDPROC(camellia_cbc_dec_32way)
1063 #define inc_le128(x, minus_one, tmp) \
1064 vpcmpeqq minus_one, x, tmp; \
1065 vpsubq minus_one, x, x; \
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;
1069 #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1070 vpcmpeqq minus_one, x, tmp1; \
1071 vpcmpeqq minus_two, x, tmp2; \
1072 vpsubq minus_two, x, x; \
1073 vpor tmp2, tmp1, tmp1; \
vpslldq $8, tmp1, tmp1; \
vpsubq tmp1, x, x;
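/*
 * inc_le128/add2_le128 build a 128-bit little-endian counter increment out
 * of 64-bit SIMD operations: the minus_one/minus_two constants (prepared
 * in camellia_ctr_32way with -1/-2 only in the low qword of each lane) are
 * subtracted to add 1 or 2 to the low qword, vpcmpeqq flags the values
 * that wrap, and that mask, shifted up by 8 bytes, is subtracted from the
 * high qword to apply the carry.
 */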
1077 ENTRY(camellia_ctr_32way)
1080 * %rsi: dst (32 blocks)
1081 * %rdx: src (32 blocks)
1082 * %rcx: iv (little endian, 128bit)
1092 /* dst can be used as temporary storage, src is not overwritten. */
1097 subq $(16 * 32), %rsp;
1101 vpcmpeqd %ymm15, %ymm15, %ymm15;
1102 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1103 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1105 /* load IV and byteswap */
1106 vmovdqu (%rcx), %xmm0;
1107 vmovdqa %xmm0, %xmm1;
1108 inc_le128(%xmm0, %xmm15, %xmm14);
1109 vbroadcasti128 .Lbswap128_mask, %ymm14;
1110 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1111 vpshufb %ymm14, %ymm0, %ymm13;
1112 vmovdqu %ymm13, 15 * 32(%rax);
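/*
 * %ymm0 now holds the counters { IV, IV + 1 } in little-endian form; each
 * add2_le128 below advances both 128-bit lanes by 2, and the vpshufb with
 * .Lbswap128_mask converts every value into the big-endian counter block
 * that actually gets encrypted.
 */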
1115 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1116 vpshufb %ymm14, %ymm0, %ymm13;
1117 vmovdqu %ymm13, 14 * 32(%rax);
1118 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1119 vpshufb %ymm14, %ymm0, %ymm13;
1120 vmovdqu %ymm13, 13 * 32(%rax);
1121 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1122 vpshufb %ymm14, %ymm0, %ymm13;
1123 vmovdqu %ymm13, 12 * 32(%rax);
1124 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1125 vpshufb %ymm14, %ymm0, %ymm13;
1126 vmovdqu %ymm13, 11 * 32(%rax);
1127 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1128 vpshufb %ymm14, %ymm0, %ymm10;
1129 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1130 vpshufb %ymm14, %ymm0, %ymm9;
1131 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132 vpshufb %ymm14, %ymm0, %ymm8;
1133 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1134 vpshufb %ymm14, %ymm0, %ymm7;
1135 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1136 vpshufb %ymm14, %ymm0, %ymm6;
1137 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1138 vpshufb %ymm14, %ymm0, %ymm5;
1139 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1140 vpshufb %ymm14, %ymm0, %ymm4;
1141 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1142 vpshufb %ymm14, %ymm0, %ymm3;
1143 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1144 vpshufb %ymm14, %ymm0, %ymm2;
1145 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1146 vpshufb %ymm14, %ymm0, %ymm1;
1147 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1148 vextracti128 $1, %ymm0, %xmm13;
1149 vpshufb %ymm14, %ymm0, %ymm0;
1150 inc_le128(%xmm13, %xmm15, %xmm14);
1151 vmovdqu %xmm13, (%rcx);
1154 vpbroadcastq (key_table)(CTX), %ymm15;
1155 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1156 vpxor %ymm0, %ymm15, %ymm0;
1157 vpxor %ymm1, %ymm15, %ymm1;
1158 vpxor %ymm2, %ymm15, %ymm2;
1159 vpxor %ymm3, %ymm15, %ymm3;
1160 vpxor %ymm4, %ymm15, %ymm4;
1161 vpxor %ymm5, %ymm15, %ymm5;
1162 vpxor %ymm6, %ymm15, %ymm6;
1163 vpxor %ymm7, %ymm15, %ymm7;
1164 vpxor %ymm8, %ymm15, %ymm8;
1165 vpxor %ymm9, %ymm15, %ymm9;
1166 vpxor %ymm10, %ymm15, %ymm10;
1167 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1168 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1169 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1170 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1171 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1173 call __camellia_enc_blk32;
1177 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1178 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1179 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1180 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1181 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1182 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1183 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1184 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1185 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1186 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1187 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1188 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1189 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1190 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1191 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1192 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1193 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1194 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1201 ENDPROC(camellia_ctr_32way)
1203 #define gf128mul_x_ble(iv, mask, tmp) \
1204 vpsrad $31, iv, tmp; \
1205 vpaddq iv, iv, iv; \
1206 vpshufd $0x13, tmp, tmp; \
vpand mask, tmp, tmp; \
vpxor tmp, iv, iv;
1210 #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1211 vpsrad $31, iv, tmp0; \
1212 vpaddq iv, iv, tmp1; \
1213 vpsllq $2, iv, iv; \
1214 vpshufd $0x13, tmp0, tmp0; \
1215 vpsrad $31, tmp1, tmp1; \
1216 vpand mask2, tmp0, tmp0; \
1217 vpshufd $0x13, tmp1, tmp1; \
1218 vpxor tmp0, iv, iv; \
vpand mask1, tmp1, tmp1; \
vpxor tmp1, iv, iv;
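/*
 * gf128mul_x_ble multiplies the 128-bit XTS tweak by α in GF(2¹²⁸), using
 * the little-endian block representation: vpaddq doubles both 64-bit
 * halves, vpsrad/vpshufd turn bit 63 and bit 127 into byte masks, and the
 * .Lxts_gf128mul_and_shl1_mask constants convert those masks into the
 * carry into byte 8 and the 0x87 reduction into byte 0.  gf128mul_x2_ble
 * multiplies by α² in one step, which is the distance between the two
 * tweaks kept in the low and high lane of one ymm register.
 */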
1223 camellia_xts_crypt_32way:
1226 * %rsi: dst (32 blocks)
1227 * %rdx: src (32 blocks)
1228 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1229 * %r8: index for input whitening key
1230 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1236 subq $(16 * 32), %rsp;
1239 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1241 /* load IV and construct second IV */
1242 vmovdqu (%rcx), %xmm0;
1243 vmovdqa %xmm0, %xmm15;
1244 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1245 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1246 vinserti128 $1, %xmm0, %ymm15, %ymm0;
1247 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1248 vmovdqu %ymm15, 15 * 32(%rax);
1249 vmovdqu %ymm0, 0 * 32(%rsi);
1252 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1253 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1254 vmovdqu %ymm15, 14 * 32(%rax);
1255 vmovdqu %ymm0, 1 * 32(%rsi);
1257 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1258 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1259 vmovdqu %ymm15, 13 * 32(%rax);
1260 vmovdqu %ymm0, 2 * 32(%rsi);
1262 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1263 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1264 vmovdqu %ymm15, 12 * 32(%rax);
1265 vmovdqu %ymm0, 3 * 32(%rsi);
1267 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1268 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1269 vmovdqu %ymm0, 4 * 32(%rsi);
1271 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1272 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1273 vmovdqu %ymm0, 5 * 32(%rsi);
1275 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1276 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1277 vmovdqu %ymm0, 6 * 32(%rsi);
1279 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1280 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1281 vmovdqu %ymm0, 7 * 32(%rsi);
1283 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1284 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1285 vmovdqu %ymm0, 8 * 32(%rsi);
1287 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1288 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1289 vmovdqu %ymm0, 9 * 32(%rsi);
1291 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1292 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1293 vmovdqu %ymm0, 10 * 32(%rsi);
1295 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1296 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1297 vmovdqu %ymm0, 11 * 32(%rsi);
1299 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1300 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1301 vmovdqu %ymm0, 12 * 32(%rsi);
1303 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1304 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1305 vmovdqu %ymm0, 13 * 32(%rsi);
1307 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1308 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1309 vmovdqu %ymm0, 14 * 32(%rsi);
1311 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1312 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1313 vmovdqu %ymm15, 0 * 32(%rax);
1314 vmovdqu %ymm0, 15 * 32(%rsi);
1316 vextracti128 $1, %ymm0, %xmm0;
1317 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1318 vmovdqu %xmm0, (%rcx);
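/*
 * At this point dst holds all 32 tweak values, the tweak-masked source
 * blocks are split between registers and the scratch area at %rax, and the
 * tweak for the next request chunk has been written back to (%rcx).  After
 * the cipher call below, the tweaks are reloaded from dst for the final
 * xor.
 */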
1321 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1322 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1323 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1324 vpxor %ymm1, %ymm15, %ymm1;
1325 vpxor %ymm2, %ymm15, %ymm2;
1326 vpxor %ymm3, %ymm15, %ymm3;
1327 vpxor %ymm4, %ymm15, %ymm4;
1328 vpxor %ymm5, %ymm15, %ymm5;
1329 vpxor %ymm6, %ymm15, %ymm6;
1330 vpxor %ymm7, %ymm15, %ymm7;
1331 vpxor %ymm8, %ymm15, %ymm8;
1332 vpxor %ymm9, %ymm15, %ymm9;
1333 vpxor %ymm10, %ymm15, %ymm10;
1334 vpxor %ymm11, %ymm15, %ymm11;
1335 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1336 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1337 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1338 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1342 addq $(16 * 32), %rsp;
1344 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1345 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1346 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1347 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1348 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1349 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1350 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1351 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1352 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1353 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1354 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1355 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1356 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1357 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1358 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1359 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1360 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1361 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1368 ENDPROC(camellia_xts_crypt_32way)
1370 ENTRY(camellia_xts_enc_32way)
1373 * %rsi: dst (32 blocks)
1374 * %rdx: src (32 blocks)
1375 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1378 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1380 leaq __camellia_enc_blk32, %r9;
1382 jmp camellia_xts_crypt_32way;
1383 ENDPROC(camellia_xts_enc_32way)
1385 ENTRY(camellia_xts_dec_32way)
1388 * %rsi: dst (32 blocks)
1389 * %rdx: src (32 blocks)
1390 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1393 cmpl $16, key_length(CTX);
1396 cmovel %eax, %r8d; /* input whitening key, last for dec */
1398 leaq __camellia_dec_blk32, %r9;
1400 jmp camellia_xts_crypt_32way;
1401 ENDPROC(camellia_xts_dec_32way)