 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_length CAMELLIA_TABLE_BYTE_LEN
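/*
 * The #defines above are byte offsets into the context structure that CTX
 * points at: the subkey table (key_table, used throughout the code below)
 * is assumed to start at offset 0, and the key length field follows the
 * 272-byte table.  A rough C sketch of that layout (an assumption that
 * mirrors these offsets, not a copy of the real kernel header):
 *
 *	#include <stdint.h>
 *
 *	struct camellia_ctx_sketch {
 *		uint64_t key_table[CAMELLIA_TABLE_BYTE_LEN / 8];
 *		uint32_t key_length;	// 16, 24 or 32 (bytes of user key)
 *	};
 *
 * The "cmpl $16, key_length(CTX)" tests further down use key_length to
 * choose between the shorter (128-bit key) and longer key schedules.
 */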
/**********************************************************************
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpshufb tmp0, lo_t, tmp0; \
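/*
 * filter_8bit() applies an arbitrary byte-wise affine transform over GF(2)
 * with two vpshufb lookups: the low nibble indexes lo_t, the high nibble
 * (isolated by mask4bit and, in the lines omitted from this excerpt,
 * shifted down by 4) indexes hi_t, and the two results are XORed.  A
 * scalar C model of the complete operation (sketch only):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_scalar(uint8_t in, const uint8_t lo_t[16],
 *					  const uint8_t hi_t[16])
 *	{
 *		return lo_t[in & 0x0f] ^ hi_t[in >> 4];
 *	}
 *
 * vpshufb performs the 16-entry table lookup in every byte lane of the
 * 256-bit register at once, which is what makes the 32-way slicing cheap.
 */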
/**********************************************************************
 **********************************************************************/

 * x0..x7: byte-sliced AB state
 * mem_cd: register pointer storing CD state
 * key: index for key material
 * x0..x7: new byte-sliced CD state
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
	 * S-function with AES subbytes \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	vpxor 5 * 32(mem_cd), x1, x1; \
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	vpxor 3 * 32(mem_cd), x7, x7;
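/*
 * How the S-layer above works: Camellia's s-boxes and the AES S-box are
 * both built on inversion in GF(2^8), so each byte is mapped into the AES
 * representation (the .Lpre_tf_* filters), pushed through AESENCLAST with
 * an all-zero round key (which performs ShiftRows + SubBytes + AddRoundKey
 * with zero; the .Linv_shift_row shuffle applied beforehand cancels the
 * ShiftRows), and mapped back with the .Lpost_tf_* filters, which also
 * fold in Camellia's own affine constants.  s2, s3 and s4 are byte
 * rotations of s1, handled by the alternative pre/post filter pairs.
 * Per-byte composition in C (a sketch; aes_sbox[] is assumed and filter8()
 * is the scalar model of filter_8bit() given earlier):
 *
 *	extern const uint8_t aes_sbox[256];
 *
 *	static uint8_t camellia_sbox_via_aesni(uint8_t x,
 *					       const uint8_t pre_lo[16],
 *					       const uint8_t pre_hi[16],
 *					       const uint8_t post_lo[16],
 *					       const uint8_t post_hi[16])
 *	{
 *		x = filter8(x, pre_lo, pre_hi);		// into AES basis
 *		x = aes_sbox[x];			// vaesenclast, zero key
 *		return filter8(x, post_lo, post_hi);	// back to Camellia
 *	}
 */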
 * Size optimization... with inlined roundsm32 the binary would be over 5
 * times larger and only marginally faster.
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 * x0..x7: byte-sliced AB state preloaded
 * mem_ab: byte-sliced AB state in memory
 * mem_cd: byte-sliced CD state in memory
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
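/*
 * enc_rounds32()/dec_rounds32() unroll six Feistel rounds as three calls
 * to two_roundsm32(), walking the 64-bit subkeys forwards (indices i+2 up
 * to i+7, dir = 1) for encryption and backwards (i+7 down to i+2,
 * dir = -1) for decryption, while the AB/CD halves alternate between the
 * registers and the mem_ab/mem_cd scratch areas.  Ignoring the
 * byte-slicing, the subkey walk is just (camellia_round() here is a
 * hypothetical one-round helper, not a function in this file):
 *
 *	static void six_rounds(uint64_t state[4], const uint64_t *key_table,
 *			       int i, int dir)
 *	{
 *		int k, idx = (dir > 0) ? i + 2 : i + 7;
 *
 *		for (k = 0; k < 6; k++, idx += dir)
 *			camellia_round(state, key_table[idx]);
 *	}
 */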
 * v0..3: byte-sliced 32-bit integers
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpcmpgtb v1, zero, t1; \
	vpcmpgtb v2, zero, t2; \
	vpcmpgtb v3, zero, t0; \
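/*
 * rol32_1_32() rotates four byte-sliced 32-bit words left by one bit.
 * vpcmpgtb against an all-zero register is a signed "0 > byte" compare,
 * i.e. an all-ones mask exactly where a byte's top bit is set; that mask
 * is the carry which the instructions omitted from this excerpt shift
 * into the neighbouring byte slice while each slice is doubled.  The
 * scalar equivalent is simply:
 *
 *	static inline uint32_t rol32_1(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 */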
 * r: byte-sliced AB state in memory
 * l: byte-sliced CD state in memory
 * x0..x7: new byte-sliced CD state
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
		tt1, tt2, tt3, kll, klr, krl, krr) \
	 * lr ^= rol32(t0, 1); \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vmovdqu l5, 5 * 32(l); \
	vmovdqu l6, 6 * 32(l); \
	vmovdqu l7, 7 * 32(l); \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	 * rr ^= rol32(t2, 1); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vmovdqu l0, 0 * 32(l); \
	vmovdqu l1, 1 * 32(l); \
	vmovdqu l2, 2 * 32(l); \
	vmovdqu l3, 3 * 32(l);
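/*
 * fls32() is Camellia's FL/FL^-1 layer applied to 32 blocks in byte-sliced
 * form: the kll/klr subkey words transform the half kept in registers and
 * written back to (l), while the krl/krr words transform the half held in
 * memory at (r).  Per block it performs the standard FL functions, roughly
 * (a scalar sketch using the rol32_1() helper above; the exact mapping of
 * halves to l/r is abstracted away):
 *
 *	static void camellia_fls(uint32_t *ll, uint32_t *lr,
 *				 uint32_t *rl, uint32_t *rr,
 *				 uint32_t kll, uint32_t klr,
 *				 uint32_t krl, uint32_t krr)
 *	{
 *		// FL on the left half
 *		*lr ^= rol32_1(*ll & kll);
 *		*ll ^= *lr | klr;
 *
 *		// FL^-1 on the right half
 *		*rl ^= *rr | krr;
 *		*rr ^= rol32_1(*rl & krl);
 *	}
 */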
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
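/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit words within each
 * 128-bit lane: the dword unpacks interleave rows pairwise and the qword
 * unpacks finish the job.  On plain arrays the effect is just:
 *
 *	static void transpose_4x4_scalar(uint32_t m[4][4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			for (int j = i + 1; j < 4; j++) {
 *				uint32_t t = m[i][j];
 *				m[i][j] = m[j][i];
 *				m[j][i] = t;
 *			}
 *	}
 *
 * byteslice_16x16b_fast() below builds on this (plus a vpshufb byte
 * permutation) to move the state between the natural block layout and the
 * byte-sliced layout that the round macros operate on.
 */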
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
		a3, b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	\
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	\
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
		y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
		y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16

/* For CTR-mode IV byteswap */
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
 * pre-SubByte transform
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       swap_bitendianness(in)
 * (note: '⊕ 0xc5' inside camellia_f())
.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
 * pre-SubByte transform
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       swap_bitendianness(in <<< 1)
 * (note: '⊕ 0xc5' inside camellia_f())
.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
 * post-SubByte transform
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

 * post-SubByte transform
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

 * post-SubByte transform
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
__camellia_enc_blk32:
	 * %rax: temporary storage, 512 bytes
	 * %ymm0..%ymm15: 32 plaintext blocks
	 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		((key_table + (8) * 8) + 0)(CTX),
		((key_table + (8) * 8) + 4)(CTX),
		((key_table + (8) * 8) + 8)(CTX),
		((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		((key_table + (16) * 8) + 0)(CTX),
		((key_table + (16) * 8) + 4)(CTX),
		((key_table + (16) * 8) + 8)(CTX),
		((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 16);

	cmpl $16, key_length(CTX);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		((key_table + (24) * 8) + 0)(CTX),
		((key_table + (24) * 8) + 4)(CTX),
		((key_table + (24) * 8) + 8)(CTX),
		((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 24);

ENDPROC(__camellia_enc_blk32)
__camellia_dec_blk32:
	 * %rax: temporary storage, 512 bytes
	 * %r8d: 24 for 16 byte key, 32 for larger
	 * %ymm0..%ymm15: 32 encrypted blocks
	 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		((key_table + (16) * 8) + 8)(CTX),
		((key_table + (16) * 8) + 12)(CTX),
		((key_table + (16) * 8) + 0)(CTX),
		((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		((key_table + (8) * 8) + 8)(CTX),
		((key_table + (8) * 8) + 12)(CTX),
		((key_table + (8) * 8) + 0)(CTX),
		((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		((key_table + (24) * 8) + 8)(CTX),
		((key_table + (24) * 8) + 12)(CTX),
		((key_table + (24) * 8) + 0)(CTX),
		((key_table + (24) * 8) + 4)(CTX));

ENDPROC(__camellia_dec_blk32)
ENTRY(camellia_ecb_enc_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_ecb_enc_32way)
ENTRY(camellia_ecb_dec_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_ecb_dec_32way)
ENTRY(camellia_cbc_dec_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rdx, (key_table)(CTX, %r8, 8));

	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	 * dst still in-use (because dst == src), so use stack for temporary
	subq $(16 * 32), %rsp;

	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_cbc_dec_32way)
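/*
 * CBC decryption in scalar terms: P[i] = D(C[i]) ^ C[i-1], with the IV
 * taking the place of C[-1].  Because __camellia_dec_blk32 returns two
 * blocks per ymm register in swapped order, the previous-ciphertext XORs
 * above read %rdx at (n * 32 + 16) byte offsets, and block 0 is only
 * decrypted here; XORing it with the IV is assumed to be left to the
 * caller (glue code not shown in this file).  Sketch, with a hypothetical
 * camellia_decrypt_block() standing in for the 32-way core:
 *
 *	static void cbc_decrypt_sketch(const void *ctx, uint8_t *dst,
 *				       const uint8_t *src, int nblocks)
 *	{
 *		for (int i = nblocks - 1; i > 0; i--) {
 *			camellia_decrypt_block(ctx, dst + i * 16, src + i * 16);
 *			for (int j = 0; j < 16; j++)
 *				dst[i * 16 + j] ^= src[(i - 1) * 16 + j];
 *		}
 *		camellia_decrypt_block(ctx, dst, src);	// block 0: caller adds IV
 *	}
 */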
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
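/*
 * inc_le128()/add2_le128() advance a 128-bit little-endian counter using
 * only 64-bit SIMD arithmetic.  The "minus_one" operand is the per-lane
 * constant { lo = ~0, hi = 0 } set up in camellia_ctr_32way below, so
 * vpsubq adds 1 (or 2) to the low qword only, vpcmpeqq detects the lanes
 * about to wrap, and the byte-shifted mask (plus a final vpsubq omitted
 * from this excerpt) propagates the carry into the high qword.  Scalar
 * model:
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void inc_le128_scalar(struct le128 *x)
 *	{
 *		if (++x->lo == 0)
 *			x->hi++;
 *	}
 */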
ENTRY(camellia_ctr_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	 * %rcx: iv (little endian, 128bit)

	/* dst can be used as temporary storage, src is not overwritten. */
	subq $(16 * 32), %rsp;

	vpcmpeqd %ymm15, %ymm15, %ymm15;
	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm14);
	vbroadcasti128 .Lbswap128_mask, %ymm14;
	vinserti128 $1, %xmm0, %ymm1, %ymm0;
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 15 * 32(%rax);

	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 14 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 13 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 12 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 11 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm10;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm9;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm8;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm7;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm6;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm5;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm4;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm3;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm2;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm1;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vextracti128 $1, %ymm0, %xmm13;
	vpshufb %ymm14, %ymm0, %ymm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	vpbroadcastq (key_table)(CTX), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor %ymm0, %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor 11 * 32(%rax), %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	call __camellia_enc_blk32;

	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_ctr_32way)
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
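/*
 * The XTS tweak is advanced by multiplying it by x (alpha) in GF(2^128),
 * little-endian block convention: shift the 128-bit value left by one bit
 * and, if a bit fell off the top, XOR 0x87 into the low byte.  In the
 * macros above, vpsrad $31 sign-spreads the top bits, vpshufd routes them
 * into place, and the AND with .Lxts_gf128mul_and_shl1_mask_0 (0x87 in the
 * low qword, 1 in the high qword) supplies both the reduction and the
 * cross-qword carry; gf128mul_x2_ble() folds two such doublings into one
 * step using the second mask.  Scalar model of a single doubling:
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void gf128mul_x_ble_scalar(struct le128 *t)
 *	{
 *		uint64_t carry = t->hi >> 63;
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry ? 0x87 : 0);
 *	}
 */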
camellia_xts_crypt_32way:
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 * %r8: index for input whitening key
	 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32

	subq $(16 * 32), %rsp;

	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

	/* load IV and construct second IV */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm15;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
	vinserti128 $1, %xmm0, %ymm15, %ymm0;
	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 15 * 32(%rax);
	vmovdqu %ymm0, 0 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 14 * 32(%rax);
	vmovdqu %ymm0, 1 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 13 * 32(%rax);
	vmovdqu %ymm0, 2 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 12 * 32(%rax);
	vmovdqu %ymm0, 3 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
	vmovdqu %ymm0, 4 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
	vmovdqu %ymm0, 5 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
	vmovdqu %ymm0, 6 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
	vmovdqu %ymm0, 7 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
	vmovdqu %ymm0, 8 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
	vmovdqu %ymm0, 9 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
	vmovdqu %ymm0, 10 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
	vmovdqu %ymm0, 11 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
	vmovdqu %ymm0, 12 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
	vmovdqu %ymm0, 13 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
	vmovdqu %ymm0, 14 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 0 * 32(%rax);
	vmovdqu %ymm0, 15 * 32(%rsi);

	vextracti128 $1, %ymm0, %xmm0;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
	vmovdqu %xmm0, (%rcx);

	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor 0 * 32(%rax), %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor %ymm11, %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	addq $(16 * 32), %rsp;

	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
ENDPROC(camellia_xts_crypt_32way)
ENTRY(camellia_xts_enc_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))

	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk32, %r9;

	jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_enc_32way)

ENTRY(camellia_xts_dec_32way)
	 * %rsi: dst (32 blocks)
	 * %rdx: src (32 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk32, %r9;

	jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_dec_32way)