/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/**********************************************************************
  helper macros
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
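/*
 * A minimal C sketch (illustrative only, not part of the build) of what
 * filter_8bit() computes per byte: two 16-entry nibble lookups combined with
 * XOR.  The function and parameter names are invented here; lo_t/hi_t stand
 * for the 16-byte tables loaded with vbroadcasti128.
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_ref(uint8_t x, const uint8_t lo_t[16],
 *				       const uint8_t hi_t[16])
 *	{
 *		// low nibble indexes lo_t, high nibble indexes hi_t
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */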
/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row(%rip), t4; \
	vpbroadcastd .L0f0f0f0f(%rip), t7; \
	vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
	vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
	vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
	vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
	\
	/* AES inverse shift rows */ \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
	vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
	vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
	vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor 3 * 32(mem_cd), x7, x7;
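/*
 * A minimal C sketch (illustrative only, not part of the build) of the S-box
 * construction used above: each Camellia s-box is computed as an affine
 * prefilter, the AES S-box (done in hardware with AESENCLAST on an all-zero
 * round key), and an affine postfilter.  aes_sbox[] and the helper name are
 * assumptions of the sketch; the nibble tables are the .Lpre_tf_*/.Lpost_tf_*
 * constants below.
 *
 *	#include <stdint.h>
 *
 *	static uint8_t camellia_sbox_ref(uint8_t x, const uint8_t aes_sbox[256],
 *					 const uint8_t pre_lo[16],
 *					 const uint8_t pre_hi[16],
 *					 const uint8_t post_lo[16],
 *					 const uint8_t post_hi[16])
 *	{
 *		x = pre_lo[x & 0x0f] ^ pre_hi[x >> 4];	// prefilter (filter_8bit)
 *		x = aes_sbox[x];			// SubBytes (vaesenclast)
 *		return post_lo[x & 0x0f] ^ post_hi[x >> 4];	// postfilter
 *	}
 */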
/*
 * Size optimization... with inlined roundsm32 the binary would be over 5
 * times larger and only marginally faster.
 */
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
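/*
 * A minimal C sketch (illustrative only, not part of the build) of the subkey
 * ordering produced by the two macros above: six rounds per invocation,
 * walking the same subkey indices upwards for encryption and downwards for
 * decryption.  one_round() is a hypothetical per-round callback.
 *
 *	static void enc_rounds_ref(void (*one_round)(int subkey_idx), int i)
 *	{
 *		for (int k = 2; k <= 7; k++)	// i+2, i+3, ..., i+7
 *			one_round(i + k);
 *	}
 *
 *	static void dec_rounds_ref(void (*one_round)(int subkey_idx), int i)
 *	{
 *		for (int k = 7; k >= 2; k--)	// i+7, i+6, ..., i+2
 *			one_round(i + k);
 *	}
 */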
/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpcmpgtb v1, zero, t1; \
	vpcmpgtb v2, zero, t2; \
	vpcmpgtb v3, zero, t0; \
/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vmovdqu l5, 5 * 32(l); \
	vmovdqu l6, 6 * 32(l); \
	vmovdqu l7, 7 * 32(l); \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vmovdqu l0, 0 * 32(l); \
	vmovdqu l1, 1 * 32(l); \
	vmovdqu l2, 2 * 32(l); \
	vmovdqu l3, 3 * 32(l);
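/*
 * A minimal C sketch (illustrative only, not part of the build) of the
 * FL/FL^-1 layer that fls32 applies, per 64-bit half-block pair, in the same
 * operation order as the pseudo-code comments above.  Names are invented for
 * the sketch.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_1_ref(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 *
 *	static void camellia_fls_ref(uint32_t *ll, uint32_t *lr,
 *				     uint32_t *rl, uint32_t *rr,
 *				     uint32_t kll, uint32_t klr,
 *				     uint32_t krl, uint32_t krr)
 *	{
 *		*lr ^= rol32_1_ref(*ll & kll);	// FL on the left half
 *		*rl ^= (*rr | krr);		// FL^-1 on the right half
 *		*rr ^= rol32_1_ref(*rl & krl);	// FL^-1, second step
 *		*ll ^= (*lr | klr);		// FL, second step
 *	}
 */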
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
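/*
 * A minimal C sketch (illustrative only, not part of the build): per 128-bit
 * lane, the unpack sequence above is a plain 4x4 transpose of 32-bit
 * elements.  The array shape is an assumption of the sketch.
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4_ref(uint32_t m[4][4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			for (int j = i + 1; j < 4; j++) {
 *				uint32_t t = m[i][j];
 *				m[i][j] = m[j][i];
 *				m[j][i] = t;
 *			}
 *	}
 */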
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	\
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	\
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	\
	/* does not adjust output bytes inside vectors */
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;
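/*
 * A minimal C sketch (illustrative only, not part of the build) of the
 * load/pre-whitening pattern above: one prepared key vector is XORed into
 * each of the 16 32-byte input vectors (note the loads land in reversed
 * register order, y7 takes block data at offset 0).  Array shapes are
 * assumptions of the sketch.
 *
 *	#include <stdint.h>
 *
 *	static void inpack_ref(uint8_t dst[16][32], const uint8_t src[16][32],
 *			       const uint8_t key_vec[32])
 *	{
 *		for (int v = 0; v < 16; v++)
 *			for (int i = 0; i < 32; i++)
 *				dst[v][i] = src[v][i] ^ key_vec[i];
 *	}
 */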
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor stack_tmp0, x0, x0;
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
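/*
 * A minimal C sketch (illustrative only, not part of the build) of the
 * 16-byte shuffle pattern above: within each 16-byte lane it groups byte k of
 * every 32-bit word together, which is the per-lane step of the byte-slicing
 * transform.
 *
 *	#include <stdint.h>
 *
 *	static void shufb_16x16b_ref(uint8_t out[16], const uint8_t in[16])
 *	{
 *		for (int k = 0; k < 4; k++)		// byte position in dword
 *			for (int w = 0; w < 4; w++)	// dword index
 *				out[k * 4 + w] = in[w * 4 + k];
 *	}
 */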
.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16
/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
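/*
 * A minimal C sketch (illustrative only, not part of the build) of why
 * .Linv_shift_row works: AESENCLAST computes
 * AddRoundKey(SubBytes(ShiftRows(state)), key), so permuting the input with
 * the inverse ShiftRows pattern first and using an all-zero round key leaves
 * a pure per-byte AES SubBytes.  aes_sbox[] is assumed to be a plain 256-byte
 * AES S-box table.
 *
 *	#include <stdint.h>
 *
 *	static void isolated_subbytes_ref(uint8_t x[16],
 *					  const uint8_t aes_sbox[256])
 *	{
 *		static const uint8_t inv_shift_row[16] = {
 *			0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
 *			0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03,
 *		};
 *		uint8_t t[16];
 *
 *		for (int i = 0; i < 16; i++)	// vpshufb .Linv_shift_row
 *			t[i] = x[inv_shift_row[i]];
 *		for (int i = 0; i < 16; i++) {	// AESENCLAST with zero key
 *			int r = i % 4, c = i / 4;
 *			x[i] = aes_sbox[t[4 * ((c + r) % 4) + r]];
 *			// net effect: x[i] == aes_sbox[original x[i]]
 *		}
 *	}
 */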
/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	cmpl $16, key_length(CTX);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);
SYM_FUNC_END(__camellia_enc_blk32)
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16-byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));
SYM_FUNC_END(__camellia_dec_blk32)
SYM_FUNC_START(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);
SYM_FUNC_END(camellia_ecb_enc_32way)
SYM_FUNC_START(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */
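	/*
	 * A hedged reading of the cmove above as a C sketch (illustrative
	 * only, not kernel API): it selects the final key_table index, in
	 * 8-byte units, from the key size.
	 *
	 *	static int camellia_last_subkey_idx(int key_length_bytes)
	 *	{
	 *		return key_length_bytes == 16 ? 24 : 32;  // units of 8 bytes
	 *	}
	 */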
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);
SYM_FUNC_END(camellia_ecb_dec_32way)
SYM_FUNC_START(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */

	subq $(16 * 32), %rsp;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
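	/*
	 * A minimal C sketch (illustrative only, not part of the build) of
	 * the chaining done by the XOR sequence above: every decrypted block
	 * except the first is XORed with the previous ciphertext block; the
	 * first block's IV XOR is left to the caller, as in this routine.
	 * decrypt_block() is a hypothetical callback.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void cbc_dec_ref(uint8_t *dst, const uint8_t *src,
	 *				int nblocks,
	 *				void (*decrypt_block)(uint8_t out[16],
	 *						      const uint8_t in[16]))
	 *	{
	 *		for (int i = nblocks - 1; i > 0; i--) {
	 *			decrypt_block(&dst[i * 16], &src[i * 16]);
	 *			for (int j = 0; j < 16; j++)
	 *				dst[i * 16 + j] ^= src[(i - 1) * 16 + j];
	 *		}
	 *		decrypt_block(&dst[0], &src[0]);  // IV XOR done by caller
	 *	}
	 */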
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	addq $(16 * 32), %rsp;
SYM_FUNC_END(camellia_cbc_dec_32way)