2 * x86_64/AVX2/AES-NI assembler implementation of Camellia
4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
13 #include <linux/linkage.h>
15 #define CAMELLIA_TABLE_BYTE_LEN 272
17 /* struct camellia_ctx: */
19 #define key_length CAMELLIA_TABLE_BYTE_LEN
25 /**********************************************************************
27 **********************************************************************/
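/*
 * Rough scalar sketch of what the filter_8bit macro below computes per byte
 * (illustrative only, not part of the build): an 8-bit -> 8-bit affine
 * transform done by splitting every byte into its two nibbles and combining
 * two 16-entry vpshufb lookups.  lo_t and hi_t stand for the 16-byte tables
 * that the .Lpre_tf_* / .Lpost_tf_* constants broadcast:
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit(const uint8_t lo_t[16],
 *				   const uint8_t hi_t[16], uint8_t x)
 *	{
 *		// low nibble indexes lo_t, high nibble indexes hi_t; the
 *		// transform is affine, so the two halves combine with XOR
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */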
28 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
29 vpand x, mask4bit, tmp0; \
30 vpandn x, mask4bit, x; \
33 vpshufb tmp0, lo_t, tmp0; \
54 /**********************************************************************
56 **********************************************************************/
60 * x0..x7: byte-sliced AB state
61 * mem_cd: register pointer storing CD state
62 * key: index for key material
64 * x0..x7: new byte-sliced CD state
66 #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
69 * S-function with AES subbytes \
71 vbroadcasti128 .Linv_shift_row, t4; \
72 vpbroadcastd .L0f0f0f0f, t7; \
73 vbroadcasti128 .Lpre_tf_lo_s1, t5; \
74 vbroadcasti128 .Lpre_tf_hi_s1, t6; \
75 vbroadcasti128 .Lpre_tf_lo_s4, t2; \
76 vbroadcasti128 .Lpre_tf_hi_s4, t3; \
78 /* AES inverse shift rows */ \
88 /* prefilter sboxes 1, 2 and 3 */ \
89 /* prefilter sbox 4 */ \
90 filter_8bit(x0, t5, t6, t7, t4); \
91 filter_8bit(x7, t5, t6, t7, t4); \
92 vextracti128 $1, x0, t0##_x; \
93 vextracti128 $1, x7, t1##_x; \
94 filter_8bit(x3, t2, t3, t7, t4); \
95 filter_8bit(x6, t2, t3, t7, t4); \
96 vextracti128 $1, x3, t3##_x; \
97 vextracti128 $1, x6, t2##_x; \
98 filter_8bit(x2, t5, t6, t7, t4); \
99 filter_8bit(x5, t5, t6, t7, t4); \
100 filter_8bit(x1, t5, t6, t7, t4); \
101 filter_8bit(x4, t5, t6, t7, t4); \
103 vpxor t4##_x, t4##_x, t4##_x; \
105 /* AES subbytes + AES shift rows */ \
106 vextracti128 $1, x2, t6##_x; \
107 vextracti128 $1, x5, t5##_x; \
108 vaesenclast t4##_x, x0##_x, x0##_x; \
109 vaesenclast t4##_x, t0##_x, t0##_x; \
110 vinserti128 $1, t0##_x, x0, x0; \
111 vaesenclast t4##_x, x7##_x, x7##_x; \
112 vaesenclast t4##_x, t1##_x, t1##_x; \
113 vinserti128 $1, t1##_x, x7, x7; \
114 vaesenclast t4##_x, x3##_x, x3##_x; \
115 vaesenclast t4##_x, t3##_x, t3##_x; \
116 vinserti128 $1, t3##_x, x3, x3; \
117 vaesenclast t4##_x, x6##_x, x6##_x; \
118 vaesenclast t4##_x, t2##_x, t2##_x; \
119 vinserti128 $1, t2##_x, x6, x6; \
120 vextracti128 $1, x1, t3##_x; \
121 vextracti128 $1, x4, t2##_x; \
122 vbroadcasti128 .Lpost_tf_lo_s1, t0; \
123 vbroadcasti128 .Lpost_tf_hi_s1, t1; \
124 vaesenclast t4##_x, x2##_x, x2##_x; \
125 vaesenclast t4##_x, t6##_x, t6##_x; \
126 vinserti128 $1, t6##_x, x2, x2; \
127 vaesenclast t4##_x, x5##_x, x5##_x; \
128 vaesenclast t4##_x, t5##_x, t5##_x; \
129 vinserti128 $1, t5##_x, x5, x5; \
130 vaesenclast t4##_x, x1##_x, x1##_x; \
131 vaesenclast t4##_x, t3##_x, t3##_x; \
132 vinserti128 $1, t3##_x, x1, x1; \
133 vaesenclast t4##_x, x4##_x, x4##_x; \
134 vaesenclast t4##_x, t2##_x, t2##_x; \
135 vinserti128 $1, t2##_x, x4, x4; \
137 /* postfilter sboxes 1 and 4 */ \
138 vbroadcasti128 .Lpost_tf_lo_s3, t2; \
139 vbroadcasti128 .Lpost_tf_hi_s3, t3; \
140 filter_8bit(x0, t0, t1, t7, t6); \
141 filter_8bit(x7, t0, t1, t7, t6); \
142 filter_8bit(x3, t0, t1, t7, t6); \
143 filter_8bit(x6, t0, t1, t7, t6); \
145 /* postfilter sbox 3 */ \
146 vbroadcasti128 .Lpost_tf_lo_s2, t4; \
147 vbroadcasti128 .Lpost_tf_hi_s2, t5; \
148 filter_8bit(x2, t2, t3, t7, t6); \
149 filter_8bit(x5, t2, t3, t7, t6); \
151 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
153 /* postfilter sbox 2 */ \
154 filter_8bit(x1, t4, t5, t7, t2); \
155 filter_8bit(x4, t4, t5, t7, t2); \
158 vpsrldq $1, t0, t1; \
159 vpsrldq $2, t0, t2; \
160 vpshufb t7, t1, t1; \
161 vpsrldq $3, t0, t3; \
169 vpshufb t7, t2, t2; \
170 vpsrldq $4, t0, t4; \
171 vpshufb t7, t3, t3; \
172 vpsrldq $5, t0, t5; \
173 vpshufb t7, t4, t4; \
180 vpsrldq $6, t0, t6; \
181 vpshufb t7, t5, t5; \
182 vpshufb t7, t6, t6; \
192 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
194 /* Add key material and result to CD (x becomes new CD) */ \
197 vpxor 5 * 32(mem_cd), x1, x1; \
199 vpsrldq $7, t0, t6; \
200 vpshufb t7, t0, t0; \
201 vpshufb t7, t6, t7; \
204 vpxor 4 * 32(mem_cd), x0, x0; \
207 vpxor 6 * 32(mem_cd), x2, x2; \
210 vpxor 7 * 32(mem_cd), x3, x3; \
213 vpxor 0 * 32(mem_cd), x4, x4; \
216 vpxor 1 * 32(mem_cd), x5, x5; \
219 vpxor 2 * 32(mem_cd), x6, x6; \
222 vpxor 3 * 32(mem_cd), x7, x7;
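/*
 * For reference (a sketch, not part of the build): the four Camellia s-boxes
 * are byte rotations of one another, which is why a single AES SubBytes pass
 * plus the affine pre/post filters above covers all of them.  The s1
 * prefilter serves sboxes 1-3, the s4 prefilter folds in sbox 4's input
 * rotation, and the s2/s3 postfilters fold in the output rotations.  With
 * camellia_sbox1[] standing for the 256-byte SBOX1 table from the Camellia
 * specification (not a symbol in this file):
 *
 *	#include <stdint.h>
 *
 *	extern const uint8_t camellia_sbox1[256];
 *
 *	static inline uint8_t rol8(uint8_t v, unsigned int n)
 *	{
 *		return (v << n) | (v >> (8 - n));
 *	}
 *
 *	static void camellia_sboxes(uint8_t x, uint8_t out[4])
 *	{
 *		out[0] = camellia_sbox1[x];		// sbox1
 *		out[1] = rol8(camellia_sbox1[x], 1);	// sbox2 = sbox1 <<< 1
 *		out[2] = rol8(camellia_sbox1[x], 7);	// sbox3 = sbox1 >>> 1
 *		out[3] = camellia_sbox1[rol8(x, 1)];	// sbox4 = sbox1(x <<< 1)
 *	}
 */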
225 * Size optimization... with inlined roundsm32 binary would be over 5 times
226 * larger and would only be marginally faster.
229 roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
230 roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
231 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
234 ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
237 roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
238 roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
239 %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
242 ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
246 * x0..x7: byte-sliced AB state preloaded
247 * mem_ab: byte-sliced AB state in memory
248 * mem_cd: byte-sliced CD state in memory
250 #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
251 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
252 leaq (key_table + (i) * 8)(CTX), %r9; \
253 call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
255 vmovdqu x0, 4 * 32(mem_cd); \
256 vmovdqu x1, 5 * 32(mem_cd); \
257 vmovdqu x2, 6 * 32(mem_cd); \
258 vmovdqu x3, 7 * 32(mem_cd); \
259 vmovdqu x4, 0 * 32(mem_cd); \
260 vmovdqu x5, 1 * 32(mem_cd); \
261 vmovdqu x6, 2 * 32(mem_cd); \
262 vmovdqu x7, 3 * 32(mem_cd); \
264 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
265 call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
267 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
269 #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
271 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
272 /* Store new AB state */ \
273 vmovdqu x4, 4 * 32(mem_ab); \
274 vmovdqu x5, 5 * 32(mem_ab); \
275 vmovdqu x6, 6 * 32(mem_ab); \
276 vmovdqu x7, 7 * 32(mem_ab); \
277 vmovdqu x0, 0 * 32(mem_ab); \
278 vmovdqu x1, 1 * 32(mem_ab); \
279 vmovdqu x2, 2 * 32(mem_ab); \
280 vmovdqu x3, 3 * 32(mem_ab);
282 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
283 y6, y7, mem_ab, mem_cd, i) \
284 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
285 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
286 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
287 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
288 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
289 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
291 #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
292 y6, y7, mem_ab, mem_cd, i) \
293 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
294 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
295 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
296 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
297 two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
298 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
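/*
 * What one two_roundsm32 invocation computes per block, as a scalar sketch.
 * camellia_f() stands in for the Camellia F-function that the byte-sliced
 * code builds from the s-box layer and the P byte diffusion (not a symbol in
 * this file).  enc_rounds32 walks the subkey table upwards in steps of two
 * ((i) + 2, 4, 6 with dir = 1), dec_rounds32 walks it back down
 * ((i) + 7, 5, 3 with dir = -1):
 *
 *	#include <stdint.h>
 *
 *	extern uint64_t camellia_f(uint64_t half, uint64_t subkey);
 *
 *	static void two_rounds(uint64_t *ab, uint64_t *cd,
 *			       const uint64_t *k, int i, int dir)
 *	{
 *		*cd ^= camellia_f(*ab, k[i]);		// first roundsm32 call
 *		*ab ^= camellia_f(*cd, k[i + dir]);	// second call, roles swapped
 *	}
 */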
302 * v0..3: byte-sliced 32-bit integers
306 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
307 vpcmpgtb v0, zero, t0; \
311 vpcmpgtb v1, zero, t1; \
315 vpcmpgtb v2, zero, t2; \
321 vpcmpgtb v3, zero, t0; \
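/*
 * Scalar sketch of rol32_1_32 (illustrative only): each vN register holds
 * one byte lane of 32 packed 32-bit words.  Every byte is doubled (vpaddb)
 * and picks up, in its lowest bit, the MSB extracted from the neighbouring
 * lane (vpcmpgtb against zero, then vpabsb), with the last lane's carry
 * wrapping back into the first, so each 32-bit word is rotated left by one
 * bit:
 *
 *	#include <stdint.h>
 *
 *	static inline uint32_t rol32_1(uint32_t v)
 *	{
 *		return (v << 1) | (v >> 31);
 *	}
 *
 *	// byte-sliced equivalent, one byte per lane shown
 *	static void rol32_1_bytesliced(uint8_t *v0, uint8_t *v1,
 *				       uint8_t *v2, uint8_t *v3)
 *	{
 *		uint8_t c0 = *v0 >> 7, c1 = *v1 >> 7;
 *		uint8_t c2 = *v2 >> 7, c3 = *v3 >> 7;
 *
 *		*v0 = (*v0 << 1) | c3;
 *		*v1 = (*v1 << 1) | c0;
 *		*v2 = (*v2 << 1) | c1;
 *		*v3 = (*v3 << 1) | c2;
 *	}
 */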
331 * r: byte-sliced AB state in memory
332 * l: byte-sliced CD state in memory
334 * x0..x7: new byte-sliced CD state
336 #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
337 tt1, tt2, tt3, kll, klr, krl, krr) \
341 * lr ^= rol32(t0, 1); \
343 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
344 vpxor tt0, tt0, tt0; \
345 vpshufb tt0, t0, t3; \
346 vpsrldq $1, t0, t0; \
347 vpshufb tt0, t0, t2; \
348 vpsrldq $1, t0, t0; \
349 vpshufb tt0, t0, t1; \
350 vpsrldq $1, t0, t0; \
351 vpshufb tt0, t0, t0; \
358 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
361 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
362 vmovdqu l4, 4 * 32(l); \
364 vmovdqu l5, 5 * 32(l); \
366 vmovdqu l6, 6 * 32(l); \
368 vmovdqu l7, 7 * 32(l); \
376 vpshufb tt0, t0, t3; \
377 vpsrldq $1, t0, t0; \
378 vpshufb tt0, t0, t2; \
379 vpsrldq $1, t0, t0; \
380 vpshufb tt0, t0, t1; \
381 vpsrldq $1, t0, t0; \
382 vpshufb tt0, t0, t0; \
384 vpor 4 * 32(r), t0, t0; \
385 vpor 5 * 32(r), t1, t1; \
386 vpor 6 * 32(r), t2, t2; \
387 vpor 7 * 32(r), t3, t3; \
389 vpxor 0 * 32(r), t0, t0; \
390 vpxor 1 * 32(r), t1, t1; \
391 vpxor 2 * 32(r), t2, t2; \
392 vpxor 3 * 32(r), t3, t3; \
393 vmovdqu t0, 0 * 32(r); \
394 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
395 vmovdqu t1, 1 * 32(r); \
396 vmovdqu t2, 2 * 32(r); \
397 vmovdqu t3, 3 * 32(r); \
402 * rr ^= rol32(t2, 1); \
404 vpshufb tt0, t0, t3; \
405 vpsrldq $1, t0, t0; \
406 vpshufb tt0, t0, t2; \
407 vpsrldq $1, t0, t0; \
408 vpshufb tt0, t0, t1; \
409 vpsrldq $1, t0, t0; \
410 vpshufb tt0, t0, t0; \
412 vpand 0 * 32(r), t0, t0; \
413 vpand 1 * 32(r), t1, t1; \
414 vpand 2 * 32(r), t2, t2; \
415 vpand 3 * 32(r), t3, t3; \
417 rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
419 vpxor 4 * 32(r), t0, t0; \
420 vpxor 5 * 32(r), t1, t1; \
421 vpxor 6 * 32(r), t2, t2; \
422 vpxor 7 * 32(r), t3, t3; \
423 vmovdqu t0, 4 * 32(r); \
424 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
425 vmovdqu t1, 5 * 32(r); \
426 vmovdqu t2, 6 * 32(r); \
427 vmovdqu t3, 7 * 32(r); \
435 vpshufb tt0, t0, t3; \
436 vpsrldq $1, t0, t0; \
437 vpshufb tt0, t0, t2; \
438 vpsrldq $1, t0, t0; \
439 vpshufb tt0, t0, t1; \
440 vpsrldq $1, t0, t0; \
441 vpshufb tt0, t0, t0; \
449 vmovdqu l0, 0 * 32(l); \
451 vmovdqu l1, 1 * 32(l); \
453 vmovdqu l2, 2 * 32(l); \
455 vmovdqu l3, 3 * 32(l);
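/*
 * fls32 applies Camellia's FL function to the left half and FL^-1 to the
 * right half of all 32 blocks at once.  Scalar reference (illustrative only),
 * matching the inline "lr ^= rol32(t0, 1)" style comments above:
 *
 *	#include <stdint.h>
 *
 *	static inline uint32_t rol32(uint32_t v, unsigned int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	// FL on the left 64-bit half (ll || lr) with subkey kll || klr
 *	static void camellia_fl(uint32_t *ll, uint32_t *lr,
 *				uint32_t kll, uint32_t klr)
 *	{
 *		*lr ^= rol32(*ll & kll, 1);
 *		*ll ^= (*lr | klr);
 *	}
 *
 *	// FL^-1 on the right 64-bit half (rl || rr) with subkey krl || krr
 *	static void camellia_fl_inv(uint32_t *rl, uint32_t *rr,
 *				    uint32_t krl, uint32_t krr)
 *	{
 *		*rl ^= (*rr | krr);
 *		*rr ^= rol32(*rl & krl, 1);
 *	}
 */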
457 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
458 vpunpckhdq x1, x0, t2; \
459 vpunpckldq x1, x0, x0; \
461 vpunpckldq x3, x2, t1; \
462 vpunpckhdq x3, x2, x2; \
464 vpunpckhqdq t1, x0, x1; \
465 vpunpcklqdq t1, x0, x0; \
467 vpunpckhqdq x2, t2, x3; \
468 vpunpcklqdq x2, t2, x2;
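/*
 * transpose_4x4 above is a standard unpack-based 4x4 transpose of 32-bit
 * words; with ymm registers each 128-bit lane is transposed independently,
 * so one invocation handles two such matrices.  Scalar model (illustrative
 * only):
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4(uint32_t m[4][4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			for (int j = i + 1; j < 4; j++) {
 *				uint32_t t = m[i][j];
 *
 *				m[i][j] = m[j][i];
 *				m[j][i] = t;
 *			}
 *	}
 */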
470 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
471 a3, b3, c3, d3, st0, st1) \
474 transpose_4x4(a0, a1, a2, a3, d2, d3); \
475 transpose_4x4(b0, b1, b2, b3, d2, d3); \
481 transpose_4x4(c0, c1, c2, c3, a0, a1); \
482 transpose_4x4(d0, d1, d2, d3, a0, a1); \
484 vbroadcasti128 .Lshufb_16x16b, a0; \
486 vpshufb a0, a2, a2; \
487 vpshufb a0, a3, a3; \
488 vpshufb a0, b0, b0; \
489 vpshufb a0, b1, b1; \
490 vpshufb a0, b2, b2; \
491 vpshufb a0, b3, b3; \
492 vpshufb a0, a1, a1; \
493 vpshufb a0, c0, c0; \
494 vpshufb a0, c1, c1; \
495 vpshufb a0, c2, c2; \
496 vpshufb a0, c3, c3; \
497 vpshufb a0, d0, d0; \
498 vpshufb a0, d1, d1; \
499 vpshufb a0, d2, d2; \
500 vpshufb a0, d3, d3; \
503 vpshufb a0, d3, a0; \
506 transpose_4x4(a0, b0, c0, d0, d2, d3); \
507 transpose_4x4(a1, b1, c1, d1, d2, d3); \
513 transpose_4x4(a2, b2, c2, d2, b0, b1); \
514 transpose_4x4(a3, b3, c3, d3, b0, b1); \
517 /* does not adjust output bytes inside vectors */
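/*
 * Net effect of byteslice_16x16b_fast, as used by the inpack/outunpack
 * helpers below, on the data layout (a sketch, up to the exact ordering of
 * blocks inside each register): byte position j of all 32 blocks is gathered
 * into one 32-byte register, eight registers for the AB half of the state
 * and eight for the CD half:
 *
 *	#include <stdint.h>
 *
 *	static void byteslice(uint8_t out[16][32], const uint8_t in[32][16])
 *	{
 *		for (int block = 0; block < 32; block++)
 *			for (int byte = 0; byte < 16; byte++)
 *				out[byte][block] = in[block][byte];
 *	}
 */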
519 /* load blocks to registers and apply pre-whitening */
520 #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
522 vpbroadcastq key, x0; \
523 vpshufb .Lpack_bswap, x0, x0; \
525 vpxor 0 * 32(rio), x0, y7; \
526 vpxor 1 * 32(rio), x0, y6; \
527 vpxor 2 * 32(rio), x0, y5; \
528 vpxor 3 * 32(rio), x0, y4; \
529 vpxor 4 * 32(rio), x0, y3; \
530 vpxor 5 * 32(rio), x0, y2; \
531 vpxor 6 * 32(rio), x0, y1; \
532 vpxor 7 * 32(rio), x0, y0; \
533 vpxor 8 * 32(rio), x0, x7; \
534 vpxor 9 * 32(rio), x0, x6; \
535 vpxor 10 * 32(rio), x0, x5; \
536 vpxor 11 * 32(rio), x0, x4; \
537 vpxor 12 * 32(rio), x0, x3; \
538 vpxor 13 * 32(rio), x0, x2; \
539 vpxor 14 * 32(rio), x0, x1; \
540 vpxor 15 * 32(rio), x0, x0;
542 /* byteslice pre-whitened blocks and store to temporary memory */
543 #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
544 y6, y7, mem_ab, mem_cd) \
545 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
546 y4, y5, y6, y7, (mem_ab), (mem_cd)); \
548 vmovdqu x0, 0 * 32(mem_ab); \
549 vmovdqu x1, 1 * 32(mem_ab); \
550 vmovdqu x2, 2 * 32(mem_ab); \
551 vmovdqu x3, 3 * 32(mem_ab); \
552 vmovdqu x4, 4 * 32(mem_ab); \
553 vmovdqu x5, 5 * 32(mem_ab); \
554 vmovdqu x6, 6 * 32(mem_ab); \
555 vmovdqu x7, 7 * 32(mem_ab); \
556 vmovdqu y0, 0 * 32(mem_cd); \
557 vmovdqu y1, 1 * 32(mem_cd); \
558 vmovdqu y2, 2 * 32(mem_cd); \
559 vmovdqu y3, 3 * 32(mem_cd); \
560 vmovdqu y4, 4 * 32(mem_cd); \
561 vmovdqu y5, 5 * 32(mem_cd); \
562 vmovdqu y6, 6 * 32(mem_cd); \
563 vmovdqu y7, 7 * 32(mem_cd);
565 /* de-byteslice, apply post-whitening and store blocks */
566 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
567 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
568 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
569 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
571 vmovdqu x0, stack_tmp0; \
573 vpbroadcastq key, x0; \
574 vpshufb .Lpack_bswap, x0, x0; \
591 vpxor stack_tmp0, x0, x0;
593 #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
595 vmovdqu x0, 0 * 32(rio); \
596 vmovdqu x1, 1 * 32(rio); \
597 vmovdqu x2, 2 * 32(rio); \
598 vmovdqu x3, 3 * 32(rio); \
599 vmovdqu x4, 4 * 32(rio); \
600 vmovdqu x5, 5 * 32(rio); \
601 vmovdqu x6, 6 * 32(rio); \
602 vmovdqu x7, 7 * 32(rio); \
603 vmovdqu y0, 8 * 32(rio); \
604 vmovdqu y1, 9 * 32(rio); \
605 vmovdqu y2, 10 * 32(rio); \
606 vmovdqu y3, 11 * 32(rio); \
607 vmovdqu y4, 12 * 32(rio); \
608 vmovdqu y5, 13 * 32(rio); \
609 vmovdqu y6, 14 * 32(rio); \
610 vmovdqu y7, 15 * 32(rio);
615 #define SHUFB_BYTES(idx) \
616 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
619 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
620 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
623 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
624 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
626 /* For CTR-mode IV byteswap */
628 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
631 .Lxts_gf128mul_and_shl1_mask_0:
632 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
633 .Lxts_gf128mul_and_shl1_mask_1:
634 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
637 * pre-SubByte transform
639 * pre-lookup for sbox1, sbox2, sbox3:
640 * swap_bitendianness(
641 * isom_map_camellia_to_aes(
643 * swap_bitendianness(in)
648 * (note: '⊕ 0xc5' inside camellia_f())
651 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
652 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
654 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
655 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
658 * pre-SubByte transform
660 * pre-lookup for sbox4:
661 * swap_bitendianness(
662 * isom_map_camellia_to_aes(
664 * swap_bitendianness(in <<< 1)
669 * (note: '⊕ 0xc5' inside camellia_f())
672 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
673 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
675 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
676 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
679 * post-SubByte transform
681 * post-lookup for sbox1, sbox4:
682 * swap_bitendianness(
684 * isom_map_aes_to_camellia(
685 * swap_bitendianness(
686 * aes_inverse_affine_transform(in)
692 * (note: '⊕ 0x6e' inside camellia_h())
695 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
696 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
698 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
699 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
702 * post-SubByte transform
704 * post-lookup for sbox2:
705 * swap_bitendianness(
707 * isom_map_aes_to_camellia(
708 * swap_bitendianness(
709 * aes_inverse_affine_transform(in)
715 * (note: '⊕ 0x6e' inside camellia_h())
718 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
719 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
721 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
722 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
725 * post-SubByte transform
727 * post-lookup for sbox3:
728 * swap_bitendianness(
730 * isom_map_aes_to_camellia(
731 * swap_bitendianness(
732 * aes_inverse_affine_transform(in)
738 * (note: '⊕ 0x6e' inside camellia_h())
741 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
742 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
744 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
745 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
747 /* For isolating SubBytes from AESENCLAST, inverse shift row */
749 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
750 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
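/*
 * Why the table above works (sketch): with an all-zero round key AESENCLAST
 * computes ShiftRows(SubBytes(state)), and SubBytes acts on each byte
 * independently, so it commutes with any byte permutation:
 *
 *	aesenclast(pshufb(state, .Linv_shift_row), 0)
 *		== ShiftRows(SubBytes(InvShiftRows(state)))
 *		== SubBytes(state)
 *
 * This is how roundsm32 extracts a bare AES SubBytes from the AES-NI
 * instruction; t4 is zeroed with vpxor before the vaesenclast calls.
 */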
760 __camellia_enc_blk32:
763 * %rax: temporary storage, 512 bytes
764 * %ymm0..%ymm15: 32 plaintext blocks
766 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
767 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
770 leaq 8 * 32(%rax), %rcx;
772 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
773 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
776 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
777 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
778 %ymm15, %rax, %rcx, 0);
780 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
781 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
783 ((key_table + (8) * 8) + 0)(CTX),
784 ((key_table + (8) * 8) + 4)(CTX),
785 ((key_table + (8) * 8) + 8)(CTX),
786 ((key_table + (8) * 8) + 12)(CTX));
788 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
789 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
790 %ymm15, %rax, %rcx, 8);
792 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
793 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
795 ((key_table + (16) * 8) + 0)(CTX),
796 ((key_table + (16) * 8) + 4)(CTX),
797 ((key_table + (16) * 8) + 8)(CTX),
798 ((key_table + (16) * 8) + 12)(CTX));
800 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
801 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
802 %ymm15, %rax, %rcx, 16);
805 cmpl $16, key_length(CTX);
809 /* load CD for output */
810 vmovdqu 0 * 32(%rcx), %ymm8;
811 vmovdqu 1 * 32(%rcx), %ymm9;
812 vmovdqu 2 * 32(%rcx), %ymm10;
813 vmovdqu 3 * 32(%rcx), %ymm11;
814 vmovdqu 4 * 32(%rcx), %ymm12;
815 vmovdqu 5 * 32(%rcx), %ymm13;
816 vmovdqu 6 * 32(%rcx), %ymm14;
817 vmovdqu 7 * 32(%rcx), %ymm15;
819 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
820 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
821 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
829 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
830 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
832 ((key_table + (24) * 8) + 0)(CTX),
833 ((key_table + (24) * 8) + 4)(CTX),
834 ((key_table + (24) * 8) + 8)(CTX),
835 ((key_table + (24) * 8) + 12)(CTX));
837 enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
838 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
839 %ymm15, %rax, %rcx, 24);
842 ENDPROC(__camellia_enc_blk32)
845 __camellia_dec_blk32:
848 * %rax: temporary storage, 512 bytes
849 * %r8d: 24 for 16-byte key, 32 for larger
850 * %ymm0..%ymm15: 32 encrypted blocks
852 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
853 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
856 leaq 8 * 32(%rax), %rcx;
858 inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
859 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
866 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
867 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
868 %ymm15, %rax, %rcx, 16);
870 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
871 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
873 ((key_table + (16) * 8) + 8)(CTX),
874 ((key_table + (16) * 8) + 12)(CTX),
875 ((key_table + (16) * 8) + 0)(CTX),
876 ((key_table + (16) * 8) + 4)(CTX));
878 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
879 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
880 %ymm15, %rax, %rcx, 8);
882 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
883 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
885 ((key_table + (8) * 8) + 8)(CTX),
886 ((key_table + (8) * 8) + 12)(CTX),
887 ((key_table + (8) * 8) + 0)(CTX),
888 ((key_table + (8) * 8) + 4)(CTX));
890 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
891 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
892 %ymm15, %rax, %rcx, 0);
894 /* load CD for output */
895 vmovdqu 0 * 32(%rcx), %ymm8;
896 vmovdqu 1 * 32(%rcx), %ymm9;
897 vmovdqu 2 * 32(%rcx), %ymm10;
898 vmovdqu 3 * 32(%rcx), %ymm11;
899 vmovdqu 4 * 32(%rcx), %ymm12;
900 vmovdqu 5 * 32(%rcx), %ymm13;
901 vmovdqu 6 * 32(%rcx), %ymm14;
902 vmovdqu 7 * 32(%rcx), %ymm15;
904 outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
905 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
906 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
912 dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
913 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
914 %ymm15, %rax, %rcx, 24);
916 fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
917 %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
919 ((key_table + (24) * 8) + 8)(CTX),
920 ((key_table + (24) * 8) + 12)(CTX),
921 ((key_table + (24) * 8) + 0)(CTX),
922 ((key_table + (24) * 8) + 4)(CTX));
925 ENDPROC(__camellia_dec_blk32)
927 ENTRY(camellia_ecb_enc_32way)
930 * %rsi: dst (32 blocks)
931 * %rdx: src (32 blocks)
936 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
937 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
938 %ymm15, %rdx, (key_table)(CTX));
940 /* now dst can be used as temporary buffer (even in src == dst case) */
943 call __camellia_enc_blk32;
945 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
946 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
952 ENDPROC(camellia_ecb_enc_32way)
954 ENTRY(camellia_ecb_dec_32way)
957 * %rsi: dst (32 blocks)
958 * %rdx: src (32 blocks)
963 cmpl $16, key_length(CTX);
966 cmovel %eax, %r8d; /* max */
968 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
969 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
970 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
972 /* now dst can be used as temporary buffer (even in src == dst case) */
975 call __camellia_dec_blk32;
977 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
978 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
984 ENDPROC(camellia_ecb_dec_32way)
986 ENTRY(camellia_cbc_dec_32way)
989 * %rsi: dst (32 blocks)
990 * %rdx: src (32 blocks)
995 cmpl $16, key_length(CTX);
998 cmovel %eax, %r8d; /* max */
1000 inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1001 %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1002 %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1006 je .Lcbc_dec_use_stack;
1008 /* dst can be used as temporary storage, src is not overwritten. */
1010 jmp .Lcbc_dec_continue;
1012 .Lcbc_dec_use_stack:
1014 * dst still in-use (because dst == src), so use stack for temporary
1017 subq $(16 * 32), %rsp;
1021 call __camellia_dec_blk32;
1023 vmovdqu %ymm7, (%rax);
1024 vpxor %ymm7, %ymm7, %ymm7;
1025 vinserti128 $1, (%rdx), %ymm7, %ymm7;
1026 vpxor (%rax), %ymm7, %ymm7;
1028 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1029 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1030 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1031 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1032 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1033 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1034 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1035 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1036 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1037 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1038 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1039 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1040 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1041 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1042 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1043 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1044 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1050 ENDPROC(camellia_cbc_dec_32way)
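/*
 * Scalar model of the vpxor chain in camellia_cbc_dec_32way (illustrative
 * only): CBC decryption recovers P[i] = D(C[i]) ^ C[i-1].  Here dec[] stands
 * for the raw block decryptions of src[]; the very first block is only
 * block-decrypted, its IV XOR is left to the caller:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void cbc_dec_chain(uint8_t dst[][16], const uint8_t src[][16],
 *				  const uint8_t dec[][16], int nblocks)
 *	{
 *		memcpy(dst[0], dec[0], 16);	// caller XORs in the IV
 *		for (int i = 1; i < nblocks; i++)
 *			for (int j = 0; j < 16; j++)
 *				dst[i][j] = dec[i][j] ^ src[i - 1][j];
 *	}
 */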
1052 #define inc_le128(x, minus_one, tmp) \
1053 vpcmpeqq minus_one, x, tmp; \
1054 vpsubq minus_one, x, x; \
1055 vpslldq $8, tmp, tmp; \
1058 #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1059 vpcmpeqq minus_one, x, tmp1; \
1060 vpcmpeqq minus_two, x, tmp2; \
1061 vpsubq minus_two, x, x; \
1062 vpor tmp2, tmp1, tmp1; \
1063 vpslldq $8, tmp1, tmp1; \
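/*
 * The CTR counter is kept as a 128-bit little-endian integer split over two
 * 64-bit lanes.  vpcmpeqq against -1 (and -2 in the two-step variant)
 * detects an imminent wrap of the low lane; vpslldq moves that all-ones mask
 * into the high lane, where subtracting it adds the carry.  Scalar model
 * (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void inc_le128(struct le128 *x)
 *	{
 *		uint64_t carry = (x->lo == UINT64_MAX);	// vpcmpeqq vs. -1
 *
 *		x->lo += 1;				// vpsubq of -1
 *		x->hi += carry;				// mask moved up by vpslldq
 *	}
 *
 *	static void add2_le128(struct le128 *x)
 *	{
 *		uint64_t carry = (x->lo >= UINT64_MAX - 1);	// lo was -1 or -2
 *
 *		x->lo += 2;
 *		x->hi += carry;
 *	}
 */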
1066 ENTRY(camellia_ctr_32way)
1069 * %rsi: dst (32 blocks)
1070 * %rdx: src (32 blocks)
1071 * %rcx: iv (little endian, 128bit)
1080 /* dst can be used as temporary storage, src is not overwritten. */
1085 subq $(16 * 32), %rsp;
1089 vpcmpeqd %ymm15, %ymm15, %ymm15;
1090 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1091 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1093 /* load IV and byteswap */
1094 vmovdqu (%rcx), %xmm0;
1095 vmovdqa %xmm0, %xmm1;
1096 inc_le128(%xmm0, %xmm15, %xmm14);
1097 vbroadcasti128 .Lbswap128_mask, %ymm14;
1098 vinserti128 $1, %xmm0, %ymm1, %ymm0;
1099 vpshufb %ymm14, %ymm0, %ymm13;
1100 vmovdqu %ymm13, 15 * 32(%rax);
1103 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1104 vpshufb %ymm14, %ymm0, %ymm13;
1105 vmovdqu %ymm13, 14 * 32(%rax);
1106 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1107 vpshufb %ymm14, %ymm0, %ymm13;
1108 vmovdqu %ymm13, 13 * 32(%rax);
1109 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1110 vpshufb %ymm14, %ymm0, %ymm13;
1111 vmovdqu %ymm13, 12 * 32(%rax);
1112 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1113 vpshufb %ymm14, %ymm0, %ymm13;
1114 vmovdqu %ymm13, 11 * 32(%rax);
1115 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1116 vpshufb %ymm14, %ymm0, %ymm10;
1117 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1118 vpshufb %ymm14, %ymm0, %ymm9;
1119 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1120 vpshufb %ymm14, %ymm0, %ymm8;
1121 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1122 vpshufb %ymm14, %ymm0, %ymm7;
1123 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1124 vpshufb %ymm14, %ymm0, %ymm6;
1125 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1126 vpshufb %ymm14, %ymm0, %ymm5;
1127 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1128 vpshufb %ymm14, %ymm0, %ymm4;
1129 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1130 vpshufb %ymm14, %ymm0, %ymm3;
1131 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132 vpshufb %ymm14, %ymm0, %ymm2;
1133 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1134 vpshufb %ymm14, %ymm0, %ymm1;
1135 add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1136 vextracti128 $1, %ymm0, %xmm13;
1137 vpshufb %ymm14, %ymm0, %ymm0;
1138 inc_le128(%xmm13, %xmm15, %xmm14);
1139 vmovdqu %xmm13, (%rcx);
1142 vpbroadcastq (key_table)(CTX), %ymm15;
1143 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1144 vpxor %ymm0, %ymm15, %ymm0;
1145 vpxor %ymm1, %ymm15, %ymm1;
1146 vpxor %ymm2, %ymm15, %ymm2;
1147 vpxor %ymm3, %ymm15, %ymm3;
1148 vpxor %ymm4, %ymm15, %ymm4;
1149 vpxor %ymm5, %ymm15, %ymm5;
1150 vpxor %ymm6, %ymm15, %ymm6;
1151 vpxor %ymm7, %ymm15, %ymm7;
1152 vpxor %ymm8, %ymm15, %ymm8;
1153 vpxor %ymm9, %ymm15, %ymm9;
1154 vpxor %ymm10, %ymm15, %ymm10;
1155 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1156 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1157 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1158 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1159 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1161 call __camellia_enc_blk32;
1165 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1166 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1167 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1168 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1169 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1170 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1171 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1172 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1173 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1174 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1175 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1176 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1177 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1178 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1179 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1180 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1181 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1182 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1188 ENDPROC(camellia_ctr_32way)
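/*
 * Scalar model of the CTR path above (illustrative only): the counter is
 * advanced as a little-endian 128-bit integer, byte-reversed with
 * .Lbswap128_mask into the big-endian block form the cipher sees, encrypted
 * and XORed into the data.  'encrypt' stands in for one Camellia block
 * encryption:
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void ctr_crypt(uint8_t dst[][16], const uint8_t src[][16],
 *			      int nblocks, struct le128 *ctr,
 *			      void (*encrypt)(uint8_t out[16],
 *					      const uint8_t in[16]))
 *	{
 *		for (int i = 0; i < nblocks; i++) {
 *			uint8_t block[16], stream[16];
 *
 *			// .Lbswap128_mask: LE counter -> BE block bytes
 *			for (int j = 0; j < 8; j++) {
 *				block[j] = ctr->hi >> (56 - 8 * j);
 *				block[8 + j] = ctr->lo >> (56 - 8 * j);
 *			}
 *			encrypt(stream, block);
 *			for (int j = 0; j < 16; j++)
 *				dst[i][j] = src[i][j] ^ stream[j];
 *
 *			if (++ctr->lo == 0)	// inc_le128: carry on wrap
 *				ctr->hi++;
 *		}
 *	}
 */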
1190 #define gf128mul_x_ble(iv, mask, tmp) \
1191 vpsrad $31, iv, tmp; \
1192 vpaddq iv, iv, iv; \
1193 vpshufd $0x13, tmp, tmp; \
1194 vpand mask, tmp, tmp; \
1197 #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1198 vpsrad $31, iv, tmp0; \
1199 vpaddq iv, iv, tmp1; \
1200 vpsllq $2, iv, iv; \
1201 vpshufd $0x13, tmp0, tmp0; \
1202 vpsrad $31, tmp1, tmp1; \
1203 vpand mask2, tmp0, tmp0; \
1204 vpshufd $0x13, tmp1, tmp1; \
1205 vpxor tmp0, iv, iv; \
1206 vpand mask1, tmp1, tmp1; \
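/*
 * Scalar model of gf128mul_x_ble (illustrative only): the XTS tweak is a
 * 128-bit value stored little-endian; multiplying by x (alpha) shifts it
 * left by one bit and, if a bit falls off the top, reduces with the
 * polynomial x^128 + x^7 + x^2 + x + 1 (0x87).  gf128mul_x2_ble performs two
 * such doublings in one go, which is why its second mask carries the doubled
 * reduction constant:
 *
 *	#include <stdint.h>
 *
 *	struct le128 { uint64_t lo, hi; };
 *
 *	static void gf128mul_x_ble(struct le128 *t)
 *	{
 *		uint64_t carry = t->hi >> 63;	// bit 127 about to fall off
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry * 0x87);
 *	}
 */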
1210 camellia_xts_crypt_32way:
1213 * %rsi: dst (32 blocks)
1214 * %rdx: src (32 blocks)
1215 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1216 * %r8: index for input whitening key
1217 * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1222 subq $(16 * 32), %rsp;
1225 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1227 /* load IV and construct second IV */
1228 vmovdqu (%rcx), %xmm0;
1229 vmovdqa %xmm0, %xmm15;
1230 gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1231 vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1232 vinserti128 $1, %xmm0, %ymm15, %ymm0;
1233 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1234 vmovdqu %ymm15, 15 * 32(%rax);
1235 vmovdqu %ymm0, 0 * 32(%rsi);
1238 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1239 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1240 vmovdqu %ymm15, 14 * 32(%rax);
1241 vmovdqu %ymm0, 1 * 32(%rsi);
1243 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1244 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1245 vmovdqu %ymm15, 13 * 32(%rax);
1246 vmovdqu %ymm0, 2 * 32(%rsi);
1248 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1249 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1250 vmovdqu %ymm15, 12 * 32(%rax);
1251 vmovdqu %ymm0, 3 * 32(%rsi);
1253 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1254 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1255 vmovdqu %ymm0, 4 * 32(%rsi);
1257 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1258 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1259 vmovdqu %ymm0, 5 * 32(%rsi);
1261 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1262 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1263 vmovdqu %ymm0, 6 * 32(%rsi);
1265 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1266 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1267 vmovdqu %ymm0, 7 * 32(%rsi);
1269 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1270 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1271 vmovdqu %ymm0, 8 * 32(%rsi);
1273 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1274 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1275 vmovdqu %ymm0, 9 * 32(%rsi);
1277 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1278 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1279 vmovdqu %ymm0, 10 * 32(%rsi);
1281 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1282 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1283 vmovdqu %ymm0, 11 * 32(%rsi);
1285 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1286 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1287 vmovdqu %ymm0, 12 * 32(%rsi);
1289 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1290 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1291 vmovdqu %ymm0, 13 * 32(%rsi);
1293 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1294 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1295 vmovdqu %ymm0, 14 * 32(%rsi);
1297 gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1298 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1299 vmovdqu %ymm15, 0 * 32(%rax);
1300 vmovdqu %ymm0, 15 * 32(%rsi);
1302 vextracti128 $1, %ymm0, %xmm0;
1303 gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1304 vmovdqu %xmm0, (%rcx);
1307 vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1308 vpshufb .Lpack_bswap, %ymm15, %ymm15;
1309 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1310 vpxor %ymm1, %ymm15, %ymm1;
1311 vpxor %ymm2, %ymm15, %ymm2;
1312 vpxor %ymm3, %ymm15, %ymm3;
1313 vpxor %ymm4, %ymm15, %ymm4;
1314 vpxor %ymm5, %ymm15, %ymm5;
1315 vpxor %ymm6, %ymm15, %ymm6;
1316 vpxor %ymm7, %ymm15, %ymm7;
1317 vpxor %ymm8, %ymm15, %ymm8;
1318 vpxor %ymm9, %ymm15, %ymm9;
1319 vpxor %ymm10, %ymm15, %ymm10;
1320 vpxor %ymm11, %ymm15, %ymm11;
1321 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1322 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1323 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1324 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1328 addq $(16 * 32), %rsp;
1330 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1331 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1332 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1333 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1334 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1335 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1336 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1337 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1338 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1339 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1340 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1341 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1342 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1343 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1344 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1345 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1346 write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1347 %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1353 ENDPROC(camellia_xts_crypt_32way)
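/*
 * XTS composition implemented above, as a sketch: the caller hands in the
 * starting tweak through %rcx (in the XTS construction it originates as the
 * sector number encrypted under the second key, which happens outside this
 * file).  Each 16-byte block j is then processed as
 *
 *	C_j = E_K1(P_j ^ T_j) ^ T_j,	T_(j+1) = T_j * alpha in GF(2^128)
 *
 * with decryption identical apart from using D_K1.  The gf128mul_x[2]_ble
 * macros advance the tweak, the first vpxor pass masks the input, and the
 * final vpxor pass over the tweaks parked in dst re-applies them to the
 * cipher output.
 */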
1355 ENTRY(camellia_xts_enc_32way)
1358 * %rsi: dst (32 blocks)
1359 * %rdx: src (32 blocks)
1360 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1363 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1365 leaq __camellia_enc_blk32, %r9;
1367 jmp camellia_xts_crypt_32way;
1368 ENDPROC(camellia_xts_enc_32way)
1370 ENTRY(camellia_xts_dec_32way)
1373 * %rsi: dst (32 blocks)
1374 * %rdx: src (32 blocks)
1375 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1378 cmpl $16, key_length(CTX);
1381 cmovel %eax, %r8d; /* input whitening key, last for dec */
1383 leaq __camellia_dec_blk32, %r9;
1385 jmp camellia_xts_crypt_32way;
1386 ENDPROC(camellia_xts_dec_32way)