 * x86_64/AVX/AES-NI assembler implementation of Camellia
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_length CAMELLIA_TABLE_BYTE_LEN

/**********************************************************************
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpshufb tmp0, lo_t, tmp0; \
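
/*
 * filter_8bit() implements an affine 8-bit transform as two 16-entry
 * nibble lookups.  Rough per-byte C equivalent (sketch only; lo_t/hi_t
 * are the vpshufb nibble tables):
 *
 *	x = lo_t[x & 0x0f] ^ hi_t[x >> 4];
 */
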
 * x0..x7: byte-sliced AB state
 * mem_cd: register pointer storing CD state
 * key: index for key material
 * x0..x7: new byte-sliced CD state

#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
	 * S-function with AES subbytes \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	/* AES inverse shift rows */ \
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	/* prefilter sbox 4 */ \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	 * Add key material and result to CD (x becomes new CD) \
	vpxor 0 * 16(mem_cd), x4, x4; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	vpxor 7 * 16(mem_cd), x3, x3;
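
/*
 * Per-byte view of the S-function above, in rough C (sketch only; pre_tf,
 * post_tf and aes_sbox are illustrative names for the .Lpre_tf_xx and
 * .Lpost_tf_xx nibble-table pairs and the AES SubBytes table):
 *
 *	static u8 camellia_sbox(u8 x, const u8 *pre_tf, const u8 *post_tf)
 *	{
 *		return post_tf[aes_sbox[pre_tf[x]]];
 *	}
 *
 * vaesenclast with an all-zero round key performs the aes_sbox step on 16
 * bytes at once; the affine pre/post filters translate between Camellia's
 * GF(2^8) representation and the one used by AES.
 */
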
 * Size optimization: with roundsm16 inlined, the binary would be over 5
 * times larger and only about 0.5% faster (on Sandy Bridge).
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

 * x0..x7: byte-sliced AB state preloaded
 * mem_ab: byte-sliced AB state in memory
 * mem_cd: byte-sliced CD state in memory
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
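
/*
 * Plain (non-bytesliced) view of what two_roundsm16() does to each of the
 * 16 blocks, with ab/cd the two 64-bit block halves and F the Camellia
 * round function (sketch):
 *
 *	cd ^= F(ab, subkey[i]);
 *	ab ^= F(cd, subkey[i + dir]);
 *
 * enc_rounds16()/dec_rounds16() chain three such pairs, walking the subkey
 * table forwards (dir = 1) or backwards (dir = -1).
 */
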
 * v0..3: byte-sliced 32-bit integers

#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpcmpgtb v1, zero, t1; \
	vpcmpgtb v2, zero, t2; \
	vpcmpgtb v3, zero, t0; \
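
/*
 * Scalar equivalent of rol32_1_16(), per 32-bit lane (sketch):
 *
 *	v = (v << 1) | (v >> 31);
 *
 * done on bytesliced data: the signed compare against zero extracts each
 * byte's top bit so it can be carried into the adjacent byte position.
 */
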
 * r: byte-sliced AB state in memory
 * l: byte-sliced CD state in memory
 * x0..x7: new byte-sliced CD state

#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	 * lr ^= rol32(t0, 1); \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vmovdqu l4, 4 * 16(l); \
	vmovdqu l5, 5 * 16(l); \
	vmovdqu l6, 6 * 16(l); \
	vmovdqu l7, 7 * 16(l); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	 * rr ^= rol32(t2, 1); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vmovdqu l0, 0 * 16(l); \
	vmovdqu l1, 1 * 16(l); \
	vmovdqu l2, 2 * 16(l); \
	vmovdqu l3, 3 * 16(l);
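
/*
 * Per-block C view of the FL/FL⁻¹ layer implemented by fls16() for 16
 * blocks at once (sketch; ll/lr and rl/rr are the 32-bit quarters of the
 * two block halves):
 *
 *	FL (left half):    lr ^= rol32(ll & kll, 1);
 *	                   ll ^= (lr | klr);
 *	FL⁻¹ (right half): rl ^= (rr | krr);
 *	                   rr ^= rol32(rl & krl, 1);
 */
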
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	vmovdqu .Lshufb_16x16b, a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	/* does not adjust output bytes inside vectors */
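
/*
 * Layout produced by byteslice_16x16b(), in C terms (sketch): starting
 * from 16 independent 16-byte blocks, each register ends up holding one
 * byte position from every block,
 *
 *	for (blk = 0; blk < 16; blk++)
 *		for (b = 0; b < 16; b++)
 *			sliced[b][blk] = block[blk][b];
 *
 * so each vector instruction then works on the same byte position of all
 * 16 blocks in parallel.
 */
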
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
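
/*
 * inpack16_pre() in rough C, per block i (sketch; kw is the 128-bit
 * pattern built from the 64-bit subkey at the given key offset by the
 * .Lpack_bswap shuffle):
 *
 *	state[i] = load_16_bytes(rio + 16 * i) ^ kw;
 */
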
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	vmovdqu x0, stack_tmp0; \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

/* For CTR-mode IV byteswap */
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

 * pre-SubByte transform
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           swap_bitendianness(in)
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

 * pre-SubByte transform
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           swap_bitendianness(in <<< 1)
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

 * post-SubByte transform
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *       isom_map_aes_to_camellia(
 *           swap_bitendianness(
 *               aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

 * post-SubByte transform
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *       isom_map_aes_to_camellia(
 *           swap_bitendianness(
 *               aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

 * post-SubByte transform
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *       isom_map_aes_to_camellia(
 *           swap_bitendianness(
 *               aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
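
/*
 * Why this permutation isolates SubBytes (sketch): with an all-zero round
 * key,
 *
 *	aesenclast(y, 0) == SubBytes(ShiftRows(y))
 *
 * so feeding it y = inv_shift_rows(x) yields plain SubBytes(x), i.e. a
 * 16-byte parallel AES S-box lookup.
 */
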
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4

__camellia_enc_blk16:
	 * %rax: temporary storage, 256 bytes
	 * %xmm0..%xmm15: 16 plaintext blocks
	 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	cmpl $16, key_length(CTX);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);
ENDPROC(__camellia_enc_blk16)
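
/*
 * Per-block structure of __camellia_enc_blk16 in rough C (sketch;
 * six_rounds/fl_layer are illustrative names for the macros above):
 *
 *	six_rounds();  fl_layer();
 *	six_rounds();  fl_layer();
 *	six_rounds();
 *	if (key_length > 16) {		128-bit keys stop here: 18 rounds
 *		fl_layer();
 *		six_rounds();		192/256-bit keys run 24 rounds
 *	}
 *
 * followed by the output whitening and de-byteslicing in outunpack16().
 */
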
__camellia_dec_blk16:
	 * %rax: temporary storage, 256 bytes
	 * %r8d: 24 for 16 byte key, 32 for larger
	 * %xmm0..%xmm15: 16 encrypted blocks
	 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));
ENDPROC(__camellia_dec_blk16)

ENTRY(camellia_ecb_enc_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ecb_enc_16way)
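
/*
 * The C glue code is expected to declare these entry points roughly as
 * follows (sketch, not authoritative; check the glue source for the exact
 * types):
 *
 *	asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx,
 *					       u8 *dst, const u8 *src);
 *	asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx,
 *					       u8 *dst, const u8 *src);
 *
 * i.e. %rdi = ctx, %rsi = dst, %rdx = src per the x86_64 SysV calling
 * convention, matching the register comments in each routine.
 */
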
ENTRY(camellia_ecb_dec_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	 * dst might still be in-use (in case dst == src), so use stack for
	subq $(16 * 16), %rsp;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_cbc_dec_16way)
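
/*
 * CBC decryption as done above, per block i in rough C (sketch):
 *
 *	P[i] = D(C[i]) ^ C[i - 1];	for i = 1..15
 *	P[0] = D(C[0]);			the xor with the IV is presumably
 *					left to the C glue code
 *
 * which is why the first output block (%xmm7) is not xored with anything
 * here.
 */
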
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
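
/*
 * Scalar equivalent of inc_le128() (sketch); minus_one is expected to hold
 * { -1, 0 } so that only the low qword is incremented directly:
 *
 *	lo += 1;
 *	if (lo == 0)
 *		hi += 1;
 *
 * the wrap is detected by comparing the pre-increment value against
 * all-ones with vpcmpeqq and shifting that mask into the high qword.
 */
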
ENTRY(camellia_ctr_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (little endian, 128bit)

	subq $(16 * 16), %rsp;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ctr_16way)
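
/*
 * Net effect of camellia_ctr_16way(), per block i in rough C (sketch),
 * where ctr is the little-endian counter loaded from *iv and bswap128 is
 * the .Lbswap128_mask shuffle:
 *
 *	dst[i] = src[i] ^ E(bswap128(ctr + i));		i = 0..15
 *	*iv = ctr + 16;					for the next call
 */
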
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
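
/*
 * Byte-level C equivalent of gf128mul_x_ble() (sketch): multiply the XTS
 * tweak by x (alpha) in GF(2^128), little-endian block convention,
 *
 *	static void gf128mul_x_ble(u8 t[16])
 *	{
 *		int carry = t[15] & 0x80;
 *		int i;
 *
 *		for (i = 15; i > 0; i--)
 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
 *		t[0] <<= 1;
 *		if (carry)
 *			t[0] ^= 0x87;
 *	}
 *
 * The vector version above gets the same result from the sign-extracting
 * shift, the 0x13 dword shuffle and .Lxts_gf128mul_and_shl1_mask.
 */
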
camellia_xts_crypt_16way:
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 * %r8: index for input whitening key
	 * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16

	subq $(16 * 16), %rsp;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_xts_crypt_16way)
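
/*
 * Net effect of camellia_xts_crypt_16way(), per block i in rough C
 * (sketch), with T0 the tweak loaded from *iv and Crypt the routine
 * selected via %r9:
 *
 *	T[i]   = T0 * alpha^i;			(gf128mul_x_ble chain)
 *	dst[i] = Crypt(src[i] ^ T[i]) ^ T[i];
 *	*iv    = T0 * alpha^16;			for the next call
 *
 * The tweaks are parked in dst while the blocks are being processed, which
 * is why the final xor reads them back from %rsi.
 */
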
ENTRY(camellia_xts_enc_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))

	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)