/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
#include <linux/linkage.h>

.file "cast5-avx-x86_64-asm_64.S"

/* structure of crypto context */
#define rr	((16*4)+16)

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
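
/*
 * Roughly what one lookup_32bit expansion computes on a single 32-bit
 * lane held in a 64-bit GPR, written as scalar C for the f1 flavour
 * (op1 = xor, op2 = sub, op3 = add).  This is only a sketch: s1..s4
 * stand for the four CAST5 8x32 S-boxes the macro indexes, and it
 * assumes (as the "16-bit rotation" comment in enc/dec_preload_rkr
 * below suggests) that each lane arrives pre-rotated by a half word,
 * so the most significant input bytes are read from the low half
 * first.
 *
 *	#include <stdint.h>
 *
 *	extern const uint32_t s1[256], s2[256], s3[256], s4[256];
 *
 *	static uint32_t lookup_32bit_f1(uint64_t v)
 *	{
 *		uint32_t d;
 *
 *		d  = s1[(v >> 8) & 0xff];	// movzbl ...bh -> s1
 *		d ^= s2[(v >> 0) & 0xff];	// movzbl ...bl -> s2
 *		v >>= 16;			// expose the next two bytes
 *		d -= s3[(v >> 8) & 0xff];
 *		d += s4[(v >> 0) & 0xff];
 *		return d;
 *	}
 */
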
#define dummy(d) /* do nothing */

#define shr_next(reg) \

#define F_head(a, x, gi1, gi2, op0) \
	vpslld	RKRF, x, RTMP; \

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	vpinsrq $1, RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);             \
	F_head(b2, RX, RGI3, RGI4, op0);             \
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);   \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
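
/*
 * F1_2/F2_2/F3_2 instantiate the same skeleton with the add/xor/sub
 * combinations of the three CAST5 round function types.  For
 * reference, the scalar definitions from RFC 2144 that they vectorize
 * (a sketch; rol32 is a local helper, s1..s4 the S-boxes as above):
 *
 *	#include <stdint.h>
 *
 *	extern const uint32_t s1[256], s2[256], s3[256], s4[256];
 *
 *	static inline uint32_t rol32(uint32_t x, unsigned int r)
 *	{
 *		r &= 31;
 *		return (x << r) | (r ? x >> (32 - r) : 0);
 *	}
 *
 *	static uint32_t f1(uint32_t d, uint32_t km, uint8_t kr)
 *	{
 *		uint32_t i = rol32(km + d, kr);
 *
 *		return ((s1[i >> 24] ^ s2[(i >> 16) & 0xff]) -
 *			s3[(i >> 8) & 0xff]) + s4[i & 0xff];
 *	}
 *
 *	static uint32_t f2(uint32_t d, uint32_t km, uint8_t kr)
 *	{
 *		uint32_t i = rol32(km ^ d, kr);
 *
 *		return ((s1[i >> 24] - s2[(i >> 16) & 0xff]) +
 *			s3[(i >> 8) & 0xff]) ^ s4[i & 0xff];
 *	}
 *
 *	static uint32_t f3(uint32_t d, uint32_t km, uint8_t kr)
 *	{
 *		uint32_t i = rol32(km - d, kr);
 *
 *		return ((s1[i >> 24] + s2[(i >> 16) & 0xff]) ^
 *			s3[(i >> 8) & 0xff]) - s4[i & 0xff];
 *	}
 */
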
#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);

#define round(l, r, n, f) \
	vbroadcastss	(km+(4*n))(CTX), RKM;        \
	vpand		R1ST, RKR, RKRF;             \
	vpsubq		RKRF, R32, RKRR;             \
	vpsrldq $1,	RKR, RKR;                    \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
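
/*
 * round() broadcasts the 32-bit masking key for round n into RKM,
 * peels the next rotation byte off RKR (R1ST and R32 presumably hold a
 * low-byte 0x1f mask and the constant 32, so RKRF ends up as the 5-bit
 * rotate count and RKRR as its 32-complement), shifts RKR down with
 * vpsrldq $1 for the following round, and applies the selected F to
 * both halves of the 16-block batch.  Issuing round(RL, RR, ...) and
 * round(RR, RL, ...) alternately is how the Feistel swap is expressed
 * without moving data.  The two shift counts emulate a variable
 * left-rotate, roughly:
 *
 *	#include <stdint.h>
 *
 *	// rol(x, kr) from two shifts, as F_head does with vpslld by
 *	// RKRF (= kr) and vpsrld by RKRR (= 32 - kr).  The SIMD right
 *	// shift by 32 simply yields 0 when kr == 0; plain C needs the
 *	// explicit test to avoid shifting by the full word width.
 *	static uint32_t rol_by_shifts(uint32_t x, unsigned int kr)
 *	{
 *		kr &= 31;
 *		return (x << kr) | (kr ? x >> (32 - kr) : 0);
 *	}
 */
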
#define enc_preload_rkr() \
	vbroadcastss	.L16_mask, RKR;                      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX), RKR, RKR;

#define dec_preload_rkr() \
	vbroadcastss	.L16_mask, RKR;                      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX), RKR, RKR;                   \
	vpshufb		.Lbswap128_mask, RKR, RKR;
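
/*
 * For the 5-bit CAST5 rotation counts, xoring with 16 is the same as
 * adding 16 mod 32, so the extra half-word rotation needed by
 * lookup_32bit's byte addressing can be folded into the key rotations
 * with a single vpxor.  Decryption additionally byte-reverses the 16
 * rotation bytes so that the vpsrldq $1 in round() hands them out from
 * round 15 downwards.  A quick check of the xor/add equivalence:
 *
 *	#include <assert.h>
 *
 *	int main(void)
 *	{
 *		for (unsigned int r = 0; r < 32; r++)
 *			assert(((r + 16) & 31) == (r ^ 16));
 *		return 0;
 *	}
 */
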
#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq	x1, x0, t0; \
	vpunpckhdq	x1, x0, t1; \
	vpunpcklqdq	t1, t0, x0; \
	vpunpckhqdq	t1, t0, x1;
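
/*
 * The unpack sequence regroups two registers of interleaved (L, R)
 * block halves into one register of left halves and one of right
 * halves: if x0 held blocks (L1,R1,L2,R2) and x1 held (L3,R3,L4,R4),
 * afterwards x0 carries (L1,L3,L2,L4) and x1 carries (R1,R3,R2,R4).
 * Applying the same transpose again restores the original layout,
 * which is why inpack_blocks and outunpack_blocks can share it.  A
 * scalar trace of the four unpacks (lane 0 first):
 *
 *	#include <stdint.h>
 *
 *	static void transpose_2x4_ref(uint32_t x[4], uint32_t y[4])
 *	{
 *		uint32_t t0[4] = { x[0], y[0], x[1], y[1] };	 // vpunpckldq
 *		uint32_t t1[4] = { x[2], y[2], x[3], y[3] };	 // vpunpckhdq
 *		uint32_t nx[4] = { t0[0], t0[1], t1[0], t1[1] }; // vpunpcklqdq
 *		uint32_t ny[4] = { t0[2], t0[3], t1[2], t1[3] }; // vpunpckhqdq
 *
 *		for (int i = 0; i < 4; i++) {
 *			x[i] = nx[i];
 *			y[i] = ny[i];
 *		}
 *	}
 */
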
#define inpack_blocks(x0, x1, t0, t1, rmask) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1;

.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
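
/*
 * The three shuffle masks above drive vpshufb: .Lbswap_mask byte-swaps
 * each 32-bit word (CAST5 works on big-endian words), .Lbswap128_mask
 * reverses all 16 bytes (used for the reversed rotation schedule and
 * the CTR counters), and .Lbswap_iv_mask byte-swaps the low 64-bit
 * half and replicates it into both halves.  Sketch of the vpshufb
 * semantics they rely on (all mask bytes here are below 0x80, so the
 * zeroing case never triggers):
 *
 *	#include <stdint.h>
 *
 *	static void pshufb_ref(uint8_t dst[16], const uint8_t src[16],
 *			       const uint8_t mask[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
 *	}
 */
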
	/* input:
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output:
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 */

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);

	movzbl rr(CTX), %eax;	/* rr != 0: short (<= 80-bit) key, only 12 rounds */
	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
ENDPROC(__cast5_enc_blk16)
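
/*
 * The schedule above cycles through f1, f2, f3 and runs either 16
 * rounds or, when rr(CTX) is nonzero (keys of 80 bits or less, per
 * RFC 2144), only 12, skipping rounds 12..15.  A hypothetical scalar
 * equivalent for one 64-bit block, with a context layout that merely
 * mirrors the km/kr/rr offsets used by this file and f1/f2/f3 as in
 * the sketch near F1_2/F2_2/F3_2:
 *
 *	#include <stdint.h>
 *
 *	uint32_t f1(uint32_t d, uint32_t km, uint8_t kr);
 *	uint32_t f2(uint32_t d, uint32_t km, uint8_t kr);
 *	uint32_t f3(uint32_t d, uint32_t km, uint8_t kr);
 *
 *	struct cast5_ctx_sketch {
 *		uint32_t km[16];
 *		uint8_t kr[16];
 *		uint8_t rr;		// nonzero: short key, 12 rounds
 *	};
 *
 *	static void encrypt_sketch(const struct cast5_ctx_sketch *c,
 *				   uint32_t *l, uint32_t *r)
 *	{
 *		int rounds = c->rr ? 12 : 16;
 *
 *		for (int n = 0; n < rounds; n++) {
 *			uint32_t f;
 *
 *			switch (n % 3) {	// rounds cycle f1, f2, f3
 *			case 0:  f = f1(*r, c->km[n], c->kr[n]); break;
 *			case 1:  f = f2(*r, c->km[n], c->kr[n]); break;
 *			default: f = f3(*r, c->km[n], c->kr[n]); break;
 *			}
 *			// l ^= f(r), then swap; the asm swaps by
 *			// alternating round(RL, RR, ...) with
 *			// round(RR, RL, ...).
 *			uint32_t t = *l ^ f;
 *			*l = *r;
 *			*r = t;
 *		}
 *		// Ciphertext is (R, L); the asm makes the same final
 *		// exchange by handing RR before RL to outunpack_blocks
 *		// and the stores.
 *		uint32_t t = *l;
 *		*l = *r;
 *		*r = t;
 *	}
 */
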
	/* input:
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output:
	 *	RL1: decrypted blocks 1 and 2
	 *	RR1: decrypted blocks 3 and 4
	 *	RL2: decrypted blocks 5 and 6
	 *	RR2: decrypted blocks 7 and 8
	 *	RL3: decrypted blocks 9 and 10
	 *	RR3: decrypted blocks 11 and 12
	 *	RL4: decrypted blocks 13 and 14
	 *	RR4: decrypted blocks 15 and 16
	 */

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	movzbl rr(CTX), %eax;	/* rr != 0: short key, skip rounds 15..12 */
	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	vpsrldq $4, RKR, RKR;
ENDPROC(__cast5_dec_blk16)
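
/*
 * Decryption is the same Feistel network with the subkeys consumed in
 * reverse order (15..0, or 11..0 for short keys; the vpsrldq $4 just
 * above discards the rotation bytes of the skipped rounds 15..12 on
 * that path, since dec_preload_rkr reversed RKR).  A scalar sketch,
 * reusing the hypothetical f1/f2/f3 prototypes:
 *
 *	#include <stdint.h>
 *
 *	uint32_t f1(uint32_t d, uint32_t km, uint8_t kr);
 *	uint32_t f2(uint32_t d, uint32_t km, uint8_t kr);
 *	uint32_t f3(uint32_t d, uint32_t km, uint8_t kr);
 *
 *	static void decrypt_sketch(const uint32_t km[16],
 *				   const uint8_t kr[16], int short_key,
 *				   uint32_t *l, uint32_t *r)
 *	{
 *		for (int n = short_key ? 11 : 15; n >= 0; n--) {
 *			uint32_t f = (n % 3 == 0) ? f1(*r, km[n], kr[n]) :
 *				     (n % 3 == 1) ? f2(*r, km[n], kr[n]) :
 *						    f3(*r, km[n], kr[n]);
 *			uint32_t t = *l ^ f;
 *
 *			*l = *r;
 *			*r = t;
 *		}
 *		// undo the final swap: the plaintext is (L, R)
 *		uint32_t t = *l;
 *		*l = *r;
 *		*r = t;
 *	}
 */
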
ENTRY(cast5_ecb_enc_16way)
	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);
ENDPROC(cast5_ecb_enc_16way)

ENTRY(cast5_ecb_dec_16way)
	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);
ENDPROC(cast5_ecb_dec_16way)

ENTRY(cast5_cbc_dec_16way)
	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

	vpshufd $0x4f, RX, RX;

	/* CBC chaining: xor each decrypted block (except the first,
	 * whose IV xor is left to the caller) with the preceding
	 * ciphertext block */
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);
ENDPROC(cast5_cbc_dec_16way)
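
/*
 * cast5_cbc_dec_16way decrypts sixteen blocks and applies the CBC
 * chaining in-register: every decrypted block is xored with the
 * ciphertext block that precedes it in the stream (each vpxor operand
 * at offset +8 covers two such preceding blocks), while the xor of the
 * very first block with the IV is left to the calling glue code, which
 * is why the routine takes no IV argument.  A scalar sketch of the
 * chaining, with a hypothetical one-block helper standing in for
 * __cast5_dec_blk16 plus the (un)packing; dst must not alias src:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void cast5_decrypt_block(const void *ctx, uint8_t dst[8],
 *				 const uint8_t src[8]);
 *
 *	static void cbc_dec_sketch(const void *ctx, uint8_t *dst,
 *				   const uint8_t *src, size_t nblocks,
 *				   const uint8_t iv[8])
 *	{
 *		for (size_t i = 0; i < nblocks; i++) {
 *			const uint8_t *prev = i ? src + (i - 1) * 8 : iv;
 *
 *			cast5_decrypt_block(ctx, dst + i * 8, src + i * 8);
 *			for (int j = 0; j < 8; j++)
 *				dst[i * 8 + j] ^= prev[j];
 *		}
 *	}
 */
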
ENTRY(cast5_ctr_16way)
	/* %rcx: iv (big endian, 64bit) */

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
	vmovdqa .Lbswap_iv_mask, R1ST;
	vmovdqa .Lbswap128_mask, RKM;

	/* load IV and byteswap */
	vpshufb R1ST, RX, RX;

	/* construct the counter blocks IV0..IV15, two per register */
	vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */
	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

	/* compute the next IV (IV16) for the caller */
	vpsubq RTMP, RX, RX;  /* le: IV16, IV14 */
	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */

	call __cast5_enc_blk16;

	/* dst = src ^ encrypted counter blocks */
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);
ENDPROC(cast5_ctr_16way)
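
/*
 * cast5_ctr_16way builds sixteen consecutive counter blocks from the
 * 64-bit big-endian IV by incrementing the little-endian form (the -1
 * and -2 constants prepared above add 1 or 2 per lane under vpsubq)
 * and byte-swapping back, encrypts them with __cast5_enc_blk16, xors
 * the keystream with the source, and hands IV+16 back through %rcx.
 * A scalar sketch of the mode, with a hypothetical one-block helper:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void cast5_encrypt_block(const void *ctx, uint8_t dst[8],
 *				 const uint8_t src[8]);
 *
 *	static void ctr_sketch(const void *ctx, uint8_t *dst,
 *			       const uint8_t *src, size_t nblocks,
 *			       uint64_t *iv_be)
 *	{
 *		uint64_t ctr = __builtin_bswap64(*iv_be);
 *
 *		for (size_t i = 0; i < nblocks; i++) {
 *			uint64_t c = __builtin_bswap64(ctr + i);
 *			uint8_t ks[8];
 *
 *			cast5_encrypt_block(ctx, ks, (const uint8_t *)&c);
 *			for (int j = 0; j < 8; j++)
 *				dst[i * 8 + j] = src[i * 8 + j] ^ ks[j];
 *		}
 *		*iv_be = __builtin_bswap64(ctr + nblocks);
 *	}
 */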