 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * A version licensed under the 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz

#include <linux/linkage.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_length CAMELLIA_TABLE_BYTE_LEN
/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpshufb tmp0, lo_t, tmp0; \
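/*
 * filter_8bit() evaluates a byte transform of the form
 * f(x) = lo_t[x & 0x0f] ^ hi_t[x >> 4] on all 16 bytes of a vector at once:
 * one PSHUFB lookup per nibble, combined with a XOR.  Any GF(2)-affine byte
 * map splits this way, which is why the pre-/post-filters below can be
 * implemented like this.  Per-byte C sketch (illustration only, not code
 * from this file):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_ref(uint8_t x, const uint8_t lo_t[16],
 *				       const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */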
 *  x0..x7: byte-sliced AB state
 *  mem_cd: register pointer storing CD state
 *  key: index for key material
 *  x0..x7: new byte-sliced CD state
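/*
 * For reference, the scalar Camellia F-function (RFC 3713) whose output
 * roundsm16 XORs into the CD state, computed here for 16 blocks in parallel
 * in byte-sliced form with the s-boxes synthesized from AES SubBytes.
 * Sketch only; SBOX1..SBOX4 are assumed to be the 256-byte Camellia s-box
 * tables:
 *
 *	#include <stdint.h>
 *
 *	extern const uint8_t SBOX1[256], SBOX2[256], SBOX3[256], SBOX4[256];
 *
 *	static uint64_t camellia_f_ref(uint64_t in, uint64_t subkey)
 *	{
 *		uint64_t x = in ^ subkey;
 *		uint8_t t[8], y[8];
 *		uint64_t out = 0;
 *		int i;
 *
 *		for (i = 0; i < 8; i++)
 *			t[i] = (uint8_t)(x >> (56 - 8 * i));
 *
 *		// S-layer: s-box order 1,2,3,4,2,3,4,1 over the eight bytes
 *		t[0] = SBOX1[t[0]]; t[1] = SBOX2[t[1]];
 *		t[2] = SBOX3[t[2]]; t[3] = SBOX4[t[3]];
 *		t[4] = SBOX2[t[4]]; t[5] = SBOX3[t[5]];
 *		t[6] = SBOX4[t[6]]; t[7] = SBOX1[t[7]];
 *
 *		// P-layer: byte diffusion
 *		y[0] = t[0] ^ t[2] ^ t[3] ^ t[5] ^ t[6] ^ t[7];
 *		y[1] = t[0] ^ t[1] ^ t[3] ^ t[4] ^ t[6] ^ t[7];
 *		y[2] = t[0] ^ t[1] ^ t[2] ^ t[4] ^ t[5] ^ t[7];
 *		y[3] = t[1] ^ t[2] ^ t[3] ^ t[4] ^ t[5] ^ t[6];
 *		y[4] = t[0] ^ t[1] ^ t[5] ^ t[6] ^ t[7];
 *		y[5] = t[1] ^ t[2] ^ t[4] ^ t[6] ^ t[7];
 *		y[6] = t[2] ^ t[3] ^ t[4] ^ t[5] ^ t[7];
 *		y[7] = t[0] ^ t[3] ^ t[4] ^ t[5] ^ t[6];
 *
 *		for (i = 0; i < 8; i++)
 *			out = (out << 8) | y[i];
 *		return out;
 *	}
 */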
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
	 * S-function with AES subbytes \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	/* AES inverse shift rows */ \
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	/* prefilter sbox 4 */ \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	 * Add key material and result to CD (x becomes new CD) \
	vpxor 0 * 16(mem_cd), x4, x4; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	vpxor 7 * 16(mem_cd), x3, x3;
 * Size optimization... with inlined roundsm16, the binary would be over 5
 * times larger and only 0.5% faster (on Sandy Bridge).
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
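/*
 * Key-schedule walk: each enc_rounds16/dec_rounds16 invocation runs six
 * rounds, encryption consuming the 64-bit subkeys at indices (i)+2 .. (i)+7
 * in ascending order and decryption the same subkeys in descending order.
 * Sketch with a hypothetical scalar helper camellia_round() (illustration
 * only):
 *
 *	#include <stdint.h>
 *
 *	void camellia_round(uint64_t state[2], uint64_t subkey); // hypothetical
 *
 *	static void six_rounds(uint64_t state[2], const uint64_t *key_table,
 *			       int i, int encrypt)
 *	{
 *		int r;
 *
 *		for (r = 0; r < 6; r++)
 *			camellia_round(state, key_table[encrypt ? i + 2 + r
 *							       : i + 7 - r]);
 *	}
 */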
 *  v0..3: byte-sliced 32-bit integers

#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpcmpgtb v1, zero, t1; \
	vpcmpgtb v2, zero, t2; \
	vpcmpgtb v3, zero, t0; \
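/*
 * The vpcmpgtb against zero is a signed compare, so it produces 0xff in
 * every byte whose top bit is set; these masks supply the cross-byte carries
 * when the byte-sliced 32-bit words are rotated left by one bit.  Per
 * element the macro computes the ordinary rotate (sketch):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_by1(uint32_t x)
 *	{
 *		return (x << 1) | (x >> 31);
 *	}
 */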
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 *  x0..x7: new byte-sliced CD state
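/*
 * fls16 applies Camellia's FL function to the state at l and FL^-1 to the
 * state at r, for 16 blocks at once in byte-sliced form.  Scalar reference
 * of the two functions (RFC 3713), with kll/klr and krl/krr the 32-bit
 * halves of the respective subkeys:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	// FL on (ll || lr)
 *	static void camellia_fl(uint32_t *ll, uint32_t *lr,
 *				uint32_t kll, uint32_t klr)
 *	{
 *		*lr ^= rol32(*ll & kll, 1);
 *		*ll ^= (*lr | klr);
 *	}
 *
 *	// FL^-1 on (rl || rr)
 *	static void camellia_flinv(uint32_t *rl, uint32_t *rr,
 *				   uint32_t krl, uint32_t krr)
 *	{
 *		*rl ^= (*rr | krr);
 *		*rr ^= rol32(*rl & krl, 1);
 *	}
 */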
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	 * lr ^= rol32(t0, 1); \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vmovdqu l4, 4 * 16(l); \
	vmovdqu l5, 5 * 16(l); \
	vmovdqu l6, 6 * 16(l); \
	vmovdqu l7, 7 * 16(l); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	 * rr ^= rol32(t2, 1); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vmovdqu l0, 0 * 16(l); \
	vmovdqu l1, 1 * 16(l); \
	vmovdqu l2, 2 * 16(l); \
	vmovdqu l3, 3 * 16(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
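/*
 * transpose_4x4 is the usual 4x4 transpose of 32-bit words across four
 * 128-bit registers (punpck{l,h}dq followed by punpck{l,h}qdq).  Scalar
 * equivalent (sketch):
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4_ref(uint32_t m[4][4])
 *	{
 *		int i, j;
 *
 *		for (i = 0; i < 4; i++)
 *			for (j = i + 1; j < 4; j++) {
 *				uint32_t tmp = m[i][j];
 *
 *				m[i][j] = m[j][i];
 *				m[j][i] = tmp;
 *			}
 *	}
 */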
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	vmovdqu .Lshufb_16x16b, a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	/* does not adjust output bytes inside vectors */
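/*
 * byteslice_16x16b reorganizes 16 loaded blocks so that each output vector
 * holds one byte position from every block (conceptually a 16x16 byte
 * transpose built from 4x4 dword transposes plus the .Lshufb_16x16b
 * shuffle); as noted above, the byte order inside each output vector is
 * handled separately.  Conceptual sketch:
 *
 *	#include <stdint.h>
 *
 *	static void byteslice_ref(uint8_t out[16][16],
 *				  const uint8_t in[16][16])
 *	{
 *		int blk, pos;
 *
 *		for (blk = 0; blk < 16; blk++)		// block index
 *			for (pos = 0; pos < 16; pos++)	// byte position
 *				out[pos][blk] = in[blk][pos];
 *	}
 */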
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
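/*
 * At the specification level (RFC 3713) this is the pre-whitening step: the
 * plaintext is XORed with the kw1/kw2 whitening keys before the first round.
 * Here the whitening material loaded from key_table (byte-shuffled to match
 * the register layout) is XORed into all 16 blocks while each block still
 * occupies one register.  Per-block sketch:
 *
 *	#include <stdint.h>
 *
 *	static void prewhiten(uint8_t block[16], const uint8_t kw[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			block[i] ^= kw[i];
 *	}
 */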
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	vmovdqu x0, stack_tmp0; \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

/* For CTR-mode IV byteswap */
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       swap_bitendianness(in)
 *
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       swap_bitendianness(in <<< 1)
 *
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
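/*
 * Background for the pre-/post-filter split: all four Camellia s-boxes are
 * derived from SBOX1 by byte rotations, so sboxes 1-3 can share one
 * pre-filter while sbox4 needs the extra 'in <<< 1', and sboxes 2 and 3 need
 * their own post-filters for the output rotation.  Sketch, with SBOX1
 * assumed to be the 256-byte Camellia s-box table:
 *
 *	#include <stdint.h>
 *
 *	extern const uint8_t SBOX1[256];
 *
 *	static uint8_t rol8(uint8_t x, int n)
 *	{
 *		return (uint8_t)((x << n) | (x >> (8 - n)));
 *	}
 *
 *	static uint8_t sbox2(uint8_t x) { return rol8(SBOX1[x], 1); }
 *	static uint8_t sbox3(uint8_t x) { return rol8(SBOX1[x], 7); }
 *	static uint8_t sbox4(uint8_t x) { return SBOX1[rol8(x, 1)]; }
 */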
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 *
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 *
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 *
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
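/*
 * AESENCLAST computes SubBytes(ShiftRows(x)) ^ key, so feeding it data that
 * has already been permuted with the inverse shift row table above reduces
 * it to SubBytes plus a key XOR.  Intrinsics sketch of the same trick
 * (illustration only; assumes SSSE3 and AES-NI):
 *
 *	#include <immintrin.h>
 *
 *	static __m128i isolated_subbytes(__m128i x, __m128i inv_shift_row,
 *					 __m128i key)
 *	{
 *		x = _mm_shuffle_epi8(x, inv_shift_row); // cancels ShiftRows
 *		return _mm_aesenclast_si128(x, key);    // SubBytes(x) ^ key
 *	}
 */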
__camellia_enc_blk16:
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	cmpl $16, key_length(CTX);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);
ENDPROC(__camellia_enc_blk16)
__camellia_dec_blk16:
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16-byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));
ENDPROC(__camellia_dec_blk16)
ENTRY(camellia_ecb_enc_16way)
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ecb_enc_16way)

ENTRY(camellia_ecb_dec_16way)
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
	/* dst might still be in use (when dst == src), so use the stack for temporary storage */
	subq $(16 * 16), %rsp;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_cbc_dec_16way)
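/*
 * Scalar view of the CBC-decryption chaining done above: after the raw block
 * decryptions, ciphertext block n-1 is XORed into plaintext block n for
 * n = 1..15; block 0 is left for the caller to XOR with the IV.  Sketch:
 *
 *	#include <stdint.h>
 *
 *	static void cbc_dec_chain(uint8_t dst[16][16],
 *				  const uint8_t src[16][16])
 *	{
 *		int b, i;
 *
 *		// dst[] already holds the raw block decryptions here
 *		for (b = 15; b >= 1; b--)
 *			for (i = 0; i < 16; i++)
 *				dst[b][i] ^= src[b - 1][i];
 *	}
 */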
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
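/*
 * inc_le128 adds one to a 128-bit little-endian counter held in a vector
 * register: the low quadword is incremented and, when it wraps (detected by
 * the vpcmpeqq against -1), the carry is propagated into the high quadword.
 * Scalar equivalent:
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc(uint64_t ctr[2])	// ctr[0] = low, ctr[1] = high
 *	{
 *		if (++ctr[0] == 0)		// low half wrapped
 *			ctr[1]++;
 *	}
 */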
ENTRY(camellia_ctr_16way)
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)

	subq $(16 * 16), %rsp;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ctr_16way)