 * x86_64/AVX/AES-NI assembler implementation of Camellia
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz

#include <linux/linkage.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_length CAMELLIA_TABLE_BYTE_LEN

/**********************************************************************
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpshufb tmp0, lo_t, tmp0; \

 * x0..x7: byte-sliced AB state
 * mem_cd: register pointer storing CD state
 * key: index for key material
 * x0..x7: new byte-sliced CD state

#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
	 * S-function with AES subbytes \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	/* AES inverse shift rows */ \
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	/* prefilter sbox 4 */ \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	 * Add key material and result to CD (x becomes new CD) \
	vpxor 0 * 16(mem_cd), x4, x4; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	vpxor 7 * 16(mem_cd), x3, x3;
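/*
 * Reference sketch (hedged, not part of the build): the prefilter,
 * vaesenclast-with-zero-key and postfilter sequence above byte-slices the
 * fact that each Camellia s-box can be written as the AES S-box wrapped in
 * byte-level affine filters. A minimal scalar model, where aes_sbox[256]
 * and the pre/post filter helpers are assumed/hypothetical stand-ins for
 * the .Lpre_tf_ and .Lpost_tf_ constant tables:
 *
 *	static uint8_t camellia_sbox_model(int n, uint8_t in)
 *	{
 *		// pre_filter_*, post_filter_* and aes_sbox[] are hypothetical helpers
 *		uint8_t x = (n == 4) ? pre_filter_s4(in) : pre_filter_s123(in);
 *		x = aes_sbox[x];			// SubBytes, done 16-way by AESENCLAST
 *		if (n == 1 || n == 4)
 *			return post_filter_s14(x);
 *		return (n == 2) ? post_filter_s2(x) : post_filter_s3(x);
 *	}
 */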
 * Size optimization... with inlined roundsm16 the binary would be over 5 times
 * larger and only about 0.5% faster (on Sandy Bridge).
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 * x0..x7: byte-sliced AB state preloaded
 * mem_ab: byte-sliced AB state in memory
 * mem_cd: byte-sliced CD state in memory
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
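/*
 * Reference sketch (hedged): enc_rounds16/dec_rounds16 run three double
 * rounds; each two_roundsm16 consumes two 64-bit subkeys from key_table at
 * the 8-byte indices below, ascending for encryption and descending for
 * decryption. Hypothetical helper, only to document the index order:
 *
 *	static void round_key_order(int i, int enc, int out[6])
 *	{
 *		for (int r = 0; r < 6; r++)
 *			out[r] = enc ? (i + 2 + r) : (i + 7 - r);
 *	}
 */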
 * v0..3: byte-sliced 32-bit integers

#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpcmpgtb v1, zero, t1; \
	vpcmpgtb v2, zero, t2; \
	vpcmpgtb v3, zero, t0; \

 * r: byte-sliced AB state in memory
 * l: byte-sliced CD state in memory
 * x0..x7: new byte-sliced CD state

#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	 * lr ^= rol32(t0, 1); \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vmovdqu l4, 4 * 16(l); \
	vmovdqu l5, 5 * 16(l); \
	vmovdqu l6, 6 * 16(l); \
	vmovdqu l7, 7 * 16(l); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	 * rr ^= rol32(t2, 1); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	vmovdqu l0, 0 * 16(l); \
	vmovdqu l1, 1 * 16(l); \
	vmovdqu l2, 2 * 16(l); \
	vmovdqu l3, 3 * 16(l);
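/*
 * Reference sketch (hedged): per 32-bit lane, fls16 computes the Camellia
 * FL layer on the left half and FL^-1 on the right half, exactly as the
 * pseudo-code comments above state; the vpshufb/vpsrldq ladders and
 * rol32_1_16 only byte-slice it across 16 blocks. Scalar model:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t v, int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	static void fl_layer_model(uint32_t *ll, uint32_t *lr,
 *				   uint32_t *rl, uint32_t *rr,
 *				   uint32_t kll, uint32_t klr,
 *				   uint32_t krl, uint32_t krr)
 *	{
 *		*lr ^= rol32(*ll & kll, 1);	// FL, first half
 *		*ll ^= (*lr | klr);		// FL, second half
 *		*rl ^= (*rr | krr);		// FL^-1, first half
 *		*rr ^= rol32(*rl & krl, 1);	// FL^-1, second half
 *	}
 */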
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
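/*
 * Reference sketch (hedged): transpose_4x4 is a plain 4x4 transpose of
 * 32-bit words spread over four vector registers, built from unpack
 * instructions. Scalar model:
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4_model(uint32_t m[4][4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			for (int j = i + 1; j < 4; j++) {
 *				uint32_t t = m[i][j];
 *				m[i][j] = m[j][i];
 *				m[j][i] = t;
 *			}
 *	}
 */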
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	vmovdqu .Lshufb_16x16b, a0; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vpshufb a0, d3, a0; \
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	/* does not adjust output bytes inside vectors */
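/*
 * Reference sketch (hedged): leaving aside the extra .Lshufb_16x16b byte
 * shuffle inside each vector (see the note above), byteslice_16x16b
 * reorders 16 loaded blocks so that row b holds byte b of every block.
 * Scalar model of that slicing:
 *
 *	#include <stdint.h>
 *
 *	static void byteslice_model(uint8_t out[16][16], const uint8_t in[16][16])
 *	{
 *		for (int blk = 0; blk < 16; blk++)
 *			for (int b = 0; b < 16; b++)
 *				out[b][blk] = in[blk][b];	// row b = byte b of all blocks
 *	}
 */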
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
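/*
 * Reference sketch (hedged): inpack16_pre broadcasts the 64-bit
 * pre-whitening subkey, rearranges it with .Lpack_bswap, and XORs the same
 * 128-bit value into all 16 input blocks. Per-block model:
 *
 *	#include <stdint.h>
 *
 *	static void prewhiten_model(uint8_t blk[16], const uint8_t kw[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			blk[i] ^= kw[i];	// kw = packed key_table[0] value
 *	}
 */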
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	vmovdqu x0, stack_tmp0; \
	vpshufb .Lpack_bswap, x0, x0; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

/* For CTR-mode IV byteswap */
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 * pre-SubByte transform
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       swap_bitendianness(in)
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
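/*
 * Reference sketch (hedged): each pre/post table here is a pair of
 * 16-entry lookups that filter_8bit applies with vpshufb, one to the low
 * nibble and one to the high nibble, XORing the results; this is enough
 * because the transforms are affine over GF(2). Scalar model:
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_model(uint8_t x, const uint8_t lo_t[16],
 *					 const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */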
 * pre-SubByte transform
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *     isom_map_camellia_to_aes(
 *       swap_bitendianness(in <<< 1)
 * (note: '⊕ 0xc5' inside camellia_f())
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
 * post-SubByte transform
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

 * post-SubByte transform
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

 * post-SubByte transform
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *     isom_map_aes_to_camellia(
 *       swap_bitendianness(
 *         aes_inverse_affine_transform(in)
 * (note: '⊕ 0x6e' inside camellia_h())
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
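/*
 * Reference sketch (hedged, using compiler intrinsics rather than this
 * file's macros): AESENCLAST performs ShiftRows, SubBytes and AddRoundKey.
 * Permuting the input with .Linv_shift_row first and using an all-zero
 * round key therefore leaves pure SubBytes, which is what the round macro
 * relies on:
 *
 *	#include <immintrin.h>
 *
 *	static __m128i subbytes_only(__m128i x, __m128i inv_shift_row)
 *	{
 *		x = _mm_shuffle_epi8(x, inv_shift_row);	// undo ShiftRows in advance
 *		return _mm_aesenclast_si128(x, _mm_setzero_si128());	// zero round key
 *	}
 */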
__camellia_enc_blk16:
	 * %rax: temporary storage, 256 bytes
	 * %xmm0..%xmm15: 16 plaintext blocks
	 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	cmpl $16, key_length(CTX);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);
ENDPROC(__camellia_enc_blk16)
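/*
 * Reference outline (hedged) of the schedule driven above: three groups of
 * six Feistel rounds for 128-bit keys, four groups for 192/256-bit keys,
 * with an FL/FL^-1 layer between groups, then post-whitening.
 *
 *	static int camellia_enc_blk16_outline(int key_bits)
 *	{
 *		int groups = (key_bits == 128) ? 3 : 4;		// 18 or 24 rounds
 *		for (int g = 0; g < groups; g++) {
 *			// enc_rounds16(..., 8 * g): six rounds, subkeys 8*g + 2 .. 8*g + 7
 *			// then fls16(...) at subkey index 8*g + 8, except after the last group
 *		}
 *		return 8 * groups;	// post-whitening subkey index: 24 or 32 (cf. %r8d)
 *	}
 */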
__camellia_dec_blk16:
	 * %rax: temporary storage, 256 bytes
	 * %r8d: 24 for 16-byte key, 32 for larger
	 * %xmm0..%xmm15: 16 encrypted blocks
	 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));
ENDPROC(__camellia_dec_blk16)
ENTRY(camellia_ecb_enc_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ecb_enc_16way)

ENTRY(camellia_ecb_dec_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	 * dst might still be in-use (in case dst == src), so use stack for
	subq $(16 * 16), %rsp;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_cbc_dec_16way)
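/*
 * Reference sketch (hedged): the inc_le128 macro below increments a 128-bit
 * little-endian counter held in an XMM register, using a compare to detect
 * the carry out of the low 64-bit lane. Scalar model:
 *
 *	#include <stdint.h>
 *
 *	static void inc_le128_model(uint64_t ctr[2])	// ctr[0] = low, ctr[1] = high
 *	{
 *		if (++ctr[0] == 0)	// low half wrapped: propagate the carry
 *			ctr[1]++;
 *	}
 */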
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \

ENTRY(camellia_ctr_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (little endian, 128bit)

	subq $(16 * 16), %rsp;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_ctr_16way)
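/*
 * Reference sketch (hedged): gf128mul_x_ble below multiplies the XTS tweak
 * by α in GF(2^128) on a little-endian block: shift the 128-bit value left
 * by one and fold the carry back in with 0x87, as encoded by
 * .Lxts_gf128mul_and_shl1_mask. Scalar model:
 *
 *	#include <stdint.h>
 *
 *	static void gf128mul_x_ble_model(uint64_t t[2])	// t[0] = low, t[1] = high
 *	{
 *		uint64_t carry = t[1] >> 63;
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);
 *	}
 */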
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \

camellia_xts_crypt_16way:
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 * %r8: index for input whitening key
	 * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16

	subq $(16 * 16), %rsp;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
ENDPROC(camellia_xts_crypt_16way)
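/*
 * Reference sketch (hedged): per block, the routine above implements the
 * usual XTS composition C = E_K(P ⊕ T) ⊕ T, with the tweak advanced by
 * gf128mul_x_ble between blocks; the first pass stashes P ⊕ T and parks the
 * tweaks in dst so they can be XORed back after the 16-way block cipher.
 *
 *	#include <stdint.h>
 *
 *	static void xts_block_model(uint8_t out[16], const uint8_t in[16],
 *				    const uint8_t tweak[16],
 *				    void (*crypt_blk)(uint8_t blk[16]))
 *	{
 *		uint8_t buf[16];
 *		for (int i = 0; i < 16; i++)
 *			buf[i] = in[i] ^ tweak[i];
 *		crypt_blk(buf);		// 16-way __camellia_enc_blk16/_dec_blk16 in the asm
 *		for (int i = 0; i < 16; i++)
 *			out[i] = buf[i] ^ tweak[i];
 *	}
 */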
ENTRY(camellia_xts_enc_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))

	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))

	cmpl $16, key_length(CTX);
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)