/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 */
#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"
/* structure of crypto context */
#define km 0
#define kr (16*4)
#define rr ((16*4)+16)

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
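/*
 * The km/kr/rr offsets above match a context with the 16 masking keys first,
 * then the 16 rotation keys, then the reduced-round flag.  As a sketch (field
 * and type names are illustrative; the authoritative definition is the
 * cast5_ctx structure used by the generic CAST5 code):
 *
 *	#include <stdint.h>
 *
 *	struct cast5_ctx_sketch {
 *		uint32_t Km[16];	// masking keys,        offset 0       (km)
 *		uint8_t  Kr[16];	// 5-bit rotation keys, offset 16*4    (kr)
 *		uint8_t  rr;		// reduced-round flag,  offset 16*4+16 (rr)
 *	};
 */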
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */
#define shr_next(reg) \
	shrq $16, reg;
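/*
 * Scalar model of one lookup_32bit() invocation (a sketch, not kernel code).
 * s1..s4 stand in for the 256-entry 32-bit CAST5 S-box tables.  The value in
 * 'src' is the rotated round input as held in the GPR; because the key
 * rotations get an extra 16 added to them (see enc_preload_rkr() below), the
 * byte pairs pulled out via bh/bl and one shrq $16 land on the S-boxes in the
 * standard s1(MSB)..s4(LSB) order.
 *
 *	#include <stdint.h>
 *
 *	extern const uint32_t s1[256], s2[256], s3[256], s4[256];
 *
 *	// operator pattern shown is the type-1 one (op1/op2/op3 = xor/sub/add);
 *	// the other round types permute the operators (see F1_2/F2_2/F3_2).
 *	static uint32_t lookup_32bit_type1(uint64_t src)
 *	{
 *		uint32_t dst;
 *
 *		dst  = s1[(src >> 8) & 0xff];	// src##bh
 *		dst ^= s2[src & 0xff];		// src##bl        (op1)
 *		src >>= 16;			// shrq $16, src
 *		dst -= s3[(src >> 8) & 0xff];	// src##bh again  (op2)
 *		dst += s4[src & 0xff];		// src##bl again  (op3)
 *		return dst;
 *	}
 */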
#define F_head(a, x, gi1, gi2, op0) \
	vpslld RKRF, x, RTMP; \
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	vpinsrq $1, RFS3, x, x;
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	/* Feistel combine: a ^= F(b) */ \
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)	/* type 1: ((s1 ^ s2) - s3) + s4 */
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)	/* type 2: ((s1 - s2) + s3) ^ s4 */
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)	/* type 3: ((s1 + s2) ^ s3) - s4 */

#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);
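/*
 * F1_2/F2_2/F3_2 correspond to the three CAST5 round-function types defined
 * in RFC 2144.  A scalar reference sketch (s1..s4 again stand in for the
 * 256-entry S-box tables; Km/Kr come from the key schedule):
 *
 *	#include <stdint.h>
 *
 *	extern const uint32_t s1[256], s2[256], s3[256], s4[256];
 *
 *	static uint32_t rol32_(uint32_t v, unsigned int r)
 *	{
 *		return (v << (r & 31)) | (v >> ((32 - r) & 31));
 *	}
 *
 *	static uint32_t cast5_f(int type, uint32_t d, uint32_t km, uint8_t kr)
 *	{
 *		uint32_t i;
 *
 *		switch (type) {
 *		case 1:				// F1_2: vpaddd, xorl/subl/addl
 *			i = rol32_(km + d, kr);
 *			return ((s1[i >> 24] ^ s2[(i >> 16) & 0xff]) -
 *				s3[(i >> 8) & 0xff]) + s4[i & 0xff];
 *		case 2:				// F2_2: vpxor, subl/addl/xorl
 *			i = rol32_(km ^ d, kr);
 *			return ((s1[i >> 24] - s2[(i >> 16) & 0xff]) +
 *				s3[(i >> 8) & 0xff]) ^ s4[i & 0xff];
 *		default:			// F3_2: vpsubd, addl/xorl/subl
 *			i = rol32_(km - d, kr);
 *			return ((s1[i >> 24] + s2[(i >> 16) & 0xff]) ^
 *				s3[(i >> 8) & 0xff]) - s4[i & 0xff];
 *		}
 *	}
 */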
#define round(l, r, n, f) \
	vbroadcastss (km+(4*n))(CTX), RKM;	/* masking key Km[n] in all lanes */ \
	vpand R1ST, RKR, RKRF;			/* current 5-bit rotation Kr (R1ST holds 0x1f) */ \
	vpsubq RKRF, R32, RKRR;			/* 32 - Kr, for the right-shift half of the rotate */ \
	vpsrldq $1, RKR, RKR;			/* advance RKR to the next round's rotation byte */ \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
#define enc_preload_rkr() \
	vbroadcastss .L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor kr(CTX), RKR, RKR;

#define dec_preload_rkr() \
	vbroadcastss .L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor kr(CTX), RKR, RKR; \
	/* reverse the rotation bytes: decryption takes rounds 15..0 */ \
	vpshufb .Lbswap128_mask, RKR, RKR;
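/*
 * Note on the "add 16 (mod 32)" comment above: the rotation amounts are 5-bit
 * values, so xor-ing each byte with 16 is the same as adding 16 modulo 32, and
 * rotating by Kr+16 is the same as rotating by Kr and then swapping the 16-bit
 * halves.  That half-swap is what lets lookup_32bit() pick the S-box indices
 * out of bh/bl with a single shrq $16 instead of shifting by 24/16/8.  A tiny
 * self-checking sketch:
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *
 *	static uint32_t rol32_(uint32_t v, unsigned int r)
 *	{
 *		return (v << (r & 31)) | (v >> ((32 - r) & 31));
 *	}
 *
 *	int main(void)
 *	{
 *		for (unsigned int r = 0; r < 32; r++) {
 *			// for 5-bit rotations, xor 16 == add 16 (mod 32)
 *			assert((r ^ 16) == ((r + 16) & 31));
 *			// rotl by r+16 == rotl by r, then swap 16-bit halves
 *			assert(rol32_(0x12345678, (r + 16) & 31) ==
 *			       rol32_(rol32_(0x12345678, r), 16));
 *		}
 *		return 0;
 *	}
 */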
#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t1; \
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1;
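/*
 * C intrinsics model of transpose_2x4() (a sketch).  With the per-32-bit
 * byteswap applied first, a register loaded with two consecutive blocks holds
 * { L(a), R(a), L(b), R(b) }; transposing two such registers gathers all four
 * left halves into x0 and all four right halves into x1 (block order a, c,
 * b, d), which is the layout the round macros operate on.
 *
 *	#include <immintrin.h>
 *
 *	static inline void transpose_2x4_model(__m128i *x0, __m128i *x1)
 *	{
 *		// t0 = { x0[0], x1[0], x0[1], x1[1] }
 *		// t1 = { x0[2], x1[2], x0[3], x1[3] }
 *		__m128i t0 = _mm_unpacklo_epi32(*x0, *x1);
 *		__m128i t1 = _mm_unpackhi_epi32(*x0, *x1);
 *
 *		*x0 = _mm_unpacklo_epi64(t0, t1);	// { x0[0], x1[0], x0[2], x1[2] }
 *		*x1 = _mm_unpackhi_epi64(t0, t1);	// { x0[1], x1[1], x0[3], x1[3] }
 *	}
 */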
#define inpack_blocks(x0, x1, t0, t1, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1;
.Lbswap_mask:	/* byte-swap each 32-bit word */
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:	/* byte-swap the full 128-bit lane */
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap_iv_mask:	/* byte-swap each 64-bit half */
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
	/* input:
	 *	%rdi: ctx
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output:
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	enc_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);

	movzbl rr(CTX), %eax;	/* rr != 0: short key, 12-round variant; rounds 12-15 are skipped */

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

ENDPROC(__cast5_enc_blk16)
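/*
 * The round()/subround() calls above follow the standard CAST5 schedule:
 * round n (0-based) uses masking key Km[n], rotation Kr[n] and function type
 * (n % 3) + 1, and short keys (<= 80 bits, rr set) stop after round 11
 * (RFC 2144).  A scalar sketch of one 64-bit block, reusing the cast5_f()
 * sketch above:
 *
 *	static void cast5_encrypt_sketch(const uint32_t Km[16],
 *					 const uint8_t Kr[16], int short_key,
 *					 uint32_t *l, uint32_t *r)
 *	{
 *		int rounds = short_key ? 12 : 16;
 *
 *		for (int n = 0; n < rounds; n++) {
 *			uint32_t t = *l ^ cast5_f(n % 3 + 1, *r, Km[n], Kr[n]);
 *
 *			*l = *r;	// Feistel swap
 *			*r = t;
 *		}
 *
 *		// the ciphertext block is stored as (R16, L16)
 *		uint32_t t = *l;
 *		*l = *r;
 *		*r = t;
 *	}
 */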
	/* input:
	 *	%rdi: ctx
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output:
	 *	RL1: decrypted blocks 1 and 2
	 *	RR1: decrypted blocks 3 and 4
	 *	RL2: decrypted blocks 5 and 6
	 *	RR2: decrypted blocks 7 and 8
	 *	RL3: decrypted blocks 9 and 10
	 *	RR3: decrypted blocks 11 and 12
	 *	RL4: decrypted blocks 13 and 14
	 *	RR4: decrypted blocks 15 and 16
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	dec_preload_rkr();

	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	movzbl rr(CTX), %eax;	/* rr != 0: short key, 12-round variant; rounds 12-15 are skipped */
	/* rounds are applied in reverse order, 15 down to 0 */
	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	/* reduced-round (12-round) path: drop the rotation bytes of rounds 12-15 from RKR */
	vpsrldq $4, RKR, RKR;

ENDPROC(__cast5_dec_blk16)
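/*
 * Decryption is the same Feistel network with the round keys consumed in
 * reverse order; in the 12-round variant the keys of rounds 12-15 are simply
 * never used, which is what the vpsrldq $4 adjustment above accounts for.
 * A scalar sketch mirroring cast5_encrypt_sketch():
 *
 *	static void cast5_decrypt_sketch(const uint32_t Km[16],
 *					 const uint8_t Kr[16], int short_key,
 *					 uint32_t *l, uint32_t *r)
 *	{
 *		int rounds = short_key ? 12 : 16;
 *
 *		for (int n = rounds - 1; n >= 0; n--) {
 *			uint32_t t = *l ^ cast5_f(n % 3 + 1, *r, Km[n], Kr[n]);
 *
 *			*l = *r;
 *			*r = t;
 *		}
 *
 *		uint32_t t = *l;	// undo the final (R, L) ordering
 *		*l = *r;
 *		*r = t;
 *	}
 */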
ENTRY(cast5_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (kept in %r11)
	 *	%rdx: src
	 */

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

ENDPROC(cast5_ecb_enc_16way)
ENTRY(cast5_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (kept in %r11)
	 *	%rdx: src
	 */

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

ENDPROC(cast5_ecb_dec_16way)
ENTRY(cast5_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (kept in %r11)
	 *	%rdx: src (kept in %r12)
	 */

	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

	vpshufd $0x4f, RX, RX;	/* line up the first ciphertext block for block 2's chaining xor */

	/* CBC chaining: xor each decrypted block with the preceding
	 * ciphertext block (offset +8 selects the previous 8-byte block) */
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

ENDPROC(cast5_cbc_dec_16way)
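/*
 * The xor chain above is plain CBC decryption, P[i] = D(C[i]) ^ C[i-1]; the
 * xor of the very first block with the IV is left to the caller.  A generic
 * sketch (cast5_decrypt_block() is a hypothetical stand-in for a single
 * 64-bit block decryption; dst and src must not overlap here):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void cast5_decrypt_block(const void *ctx, uint8_t dst[8],
 *				 const uint8_t src[8]);
 *
 *	static void cbc_decrypt_sketch(const void *ctx, uint8_t *dst,
 *				       const uint8_t *src, size_t nblocks,
 *				       const uint8_t iv[8])
 *	{
 *		for (size_t i = 0; i < nblocks; i++) {
 *			const uint8_t *prev = i ? src + (i - 1) * 8 : iv;
 *
 *			cast5_decrypt_block(ctx, dst + i * 8, src + i * 8);
 *			for (int j = 0; j < 8; j++)
 *				dst[i * 8 + j] ^= prev[j];
 *		}
 *	}
 */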
ENTRY(cast5_ctr_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (kept in %r11)
	 *	%rdx: src (kept in %r12)
	 *	%rcx: iv (big endian, 64bit)
	 */

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
	vmovdqa .Lbswap_iv_mask, R1ST;
	vmovdqa .Lbswap128_mask, RKM;

	/* load IV and byteswap */
	vpshufb R1ST, RX, RX;

	/* construct counter blocks: subtracting -1/-2 steps the counters */
	vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

	/* compute the next IV value (IV16) */
	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
	call __cast5_enc_blk16;

	/* dst = src ^ encrypted counter blocks */
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;
	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

ENDPROC(cast5_ctr_16way)
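/*
 * Overall this is standard CTR mode with a 64-bit big-endian counter block:
 * dst[i] = src[i] ^ E(iv + i), with the updated counter written back for the
 * next call.  A scalar sketch (cast5_encrypt_block() is a hypothetical
 * stand-in for a single 64-bit block encryption; *ctr is the counter in host
 * byte order, converted from/to the big-endian iv by the caller):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void cast5_encrypt_block(const void *ctx, uint8_t dst[8],
 *				 const uint8_t src[8]);
 *
 *	static void put_be64(uint8_t *p, uint64_t v)
 *	{
 *		for (int i = 7; i >= 0; i--, v >>= 8)
 *			p[i] = (uint8_t)v;
 *	}
 *
 *	static void ctr_crypt_sketch(const void *ctx, uint8_t *dst,
 *				     const uint8_t *src, size_t nblocks,
 *				     uint64_t *ctr)
 *	{
 *		uint8_t ctrblk[8], ks[8];
 *
 *		for (size_t i = 0; i < nblocks; i++) {
 *			put_be64(ctrblk, (*ctr)++);
 *			cast5_encrypt_block(ctx, ks, ctrblk);
 *			for (int j = 0; j < 8; j++)
 *				dst[i * 8 + j] = src[i * 8 + j] ^ ks[j];
 *		}
 *	}
 */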