/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/* structure of crypto context */

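/*
 * A minimal sketch of the assumed key-schedule layout behind the km/kr
 * offsets used below (mirroring the generic kernel CAST-256 context; the
 * struct name is given here only for illustration): twelve rounds, each
 * with four 32-bit masking keys and four 5-bit rotation keys.
 *
 *	struct cast6_ctx {
 *		u32 Km[12][4];	// masking keys, at offset km
 *		u8 Kr[12][4];	// rotation keys, at offset kr
 *	};
 */
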
/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

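/*
 * lookup_32bit() consumes one 32-bit lane held in the low half of a 64-bit
 * GPR: bh/bl supply two S-box indices, the shrq exposes the remaining byte
 * pair, and the four loads from the s1..s4 S-box tables are folded into
 * dst with the caller-chosen op1/op2/op3.  interleave_op lets the caller
 * start shifting the next lane (shr_next) while these loads are still in
 * flight.
 */
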
#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16, reg;

#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

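/*
 * For reference, a scalar sketch of the three CAST-256 round functions
 * (RFC 2612) that the operator choices above vectorize; op0 is the vector
 * key op, op1..op3 the scalar S-box combining ops.  The names f1/rol32/
 * s1..s4 in this sketch are illustrative only:
 *
 *	u32 f1(u32 d, u8 kr, u32 km)
 *	{
 *		u32 i = rol32(km + d, kr);
 *
 *		return ((s1[i >> 24] ^ s2[(i >> 16) & 0xff]) -
 *			s3[(i >> 8) & 0xff]) + s4[i & 0xff];
 *	}
 *
 *	// f2: i = rol32(km ^ d, kr);  ((s1 - s2) + s3) ^ s4
 *	// f3: i = rol32(km - d, kr);  ((s1 + s2) ^ s3) - s4
 */
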
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

#define get_round_keys(nn) \
	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR;

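/*
 * get_round_keys(nn) broadcasts the nn'th 32-bit masking key into all four
 * lanes of RKM, masks the lowest byte of RKR with .Lfirst_mask (0x1f) to
 * get this round's rotation count in RKRF, derives the complementary count
 * 32 - kr in RKRR for the vpsrld half of the rotate, and shifts RKR down a
 * byte so the next round's rotation key is in place.
 */
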
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

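/*
 * A scalar sketch (per RFC 2612) of what Q(i)/QBAR(i) do to the four
 * 32-bit block words, assuming f1/f2/f3 as sketched above; QBAR is simply
 * the forward quad-round run backwards:
 *
 *	// Q(i)
 *	c ^= f1(d, Kr[i][0], Km[i][0]);
 *	b ^= f2(c, Kr[i][1], Km[i][1]);
 *	a ^= f3(b, Kr[i][2], Km[i][2]);
 *	d ^= f1(a, Kr[i][3], Km[i][3]);
 *
 *	// QBAR(i)
 *	d ^= f1(a, Kr[i][3], Km[i][3]);
 *	a ^= f3(b, Kr[i][2], Km[i][2]);
 *	b ^= f2(c, Kr[i][1], Km[i][1]);
 *	c ^= f1(d, Kr[i][0], Km[i][0]);
 */
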
#define shuffle(mask) \
	vpshufb mask, RKR, RKR;

#define preload_rkr(n, do_mask, mask) \
	vbroadcastss .L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor (kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

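/*
 * preload_rkr(n, ...) loads the 16 rotation-key bytes of quad-rounds
 * 4n..4n+3 into RKR.  XORing with .L16_mask (16 in every byte) adds 16 to
 * each rotation mod 32; the extra half-word of rotation makes the bh/bl +
 * shrq byte order used by lookup_32bit() line up with the s1..s4 index
 * order of the reference algorithm.  do_mask() optionally reorders the
 * bytes so the following Q()/QBAR() rounds consume them in the right
 * sequence.
 */
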
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

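/*
 * CAST-256 works on big-endian 32-bit words, so inpack_blocks() and
 * outunpack_blocks() byte-swap each dword with .Lbswap_mask and use
 * transpose_4x4() to convert four 16-byte blocks between their in-memory
 * layout and the word-sliced form the round macros expect (x0 holding
 * word 0 of all four blocks, x1 word 1, and so on).
 */
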
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.type __cast6_enc_blk8,@function;

__cast6_enc_blk8:
	/* in:  %rdi: ctx (CTX); RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * out: RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

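/*
 * For reference, the structure the three preload_rkr() groups above drive
 * is the RFC 2612 encryption schedule, sketched here in scalar form using
 * the Q/QBAR macros defined earlier:
 *
 *	for (i = 0; i < 6; i++)
 *		Q(i);		// forward quad-rounds 0..5
 *	for (i = 6; i < 12; i++)
 *		QBAR(i);	// reverse quad-rounds 6..11
 *
 * Decryption below runs the same schedule with the round keys taken in
 * reverse order, which is why __cast6_dec_blk8 preloads the rkr groups
 * backwards with the .Lrkr_dec_* shuffles.
 */
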
.type __cast6_dec_blk8,@function;

__cast6_dec_blk8:
	/* in:  %rdi: ctx (CTX); RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * out: RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

.global cast6_ecb_enc_8way
.type cast6_ecb_enc_8way,@function;

cast6_ecb_enc_8way:

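	/*
	 * void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
	 *
	 * Assumed calling convention, matching the C glue code: %rdi = ctx
	 * (CTX), %rsi = dst, %rdx = src, with dst saved in %r11 before the
	 * store_8way() below.  The ECB decrypt variant that follows is wired
	 * identically around __cast6_dec_blk8.
	 */
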
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

.global cast6_ecb_dec_8way
.type cast6_ecb_dec_8way,@function;

cast6_ecb_dec_8way:

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

.global cast6_cbc_dec_8way
.type cast6_cbc_dec_8way,@function;

cast6_cbc_dec_8way:

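	/*
	 * void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
	 * (prototype assumed from the C glue code)
	 *
	 * Eight blocks are decrypted in parallel, then store_cbc_8way() from
	 * glue_helper-asm-avx.S XORs each result with the preceding ciphertext
	 * block from src (%r12) before storing to dst (%r11); chaining the
	 * first block with the IV is assumed to be handled by the C glue.
	 */
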
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

.global cast6_ctr_8way
.type cast6_ctr_8way,@function;

cast6_ctr_8way:
	/* input: %rdi: ctx, %rsi: dst, %rdx: src, %rcx: iv (little endian, 128bit) */

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

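	/*
	 * Assumed CTR flow, based on the glue_helper-asm-avx.S helpers used
	 * here: load_ctr_8way() byte-swaps the little-endian counter at (%rcx)
	 * with .Lbswap128_mask, expands it into eight consecutive big-endian
	 * counter blocks (updating the IV in memory), the blocks are encrypted
	 * by __cast6_enc_blk8, and store_ctr_8way() XORs that keystream with
	 * the src blocks (%r12) into dst (%r11).
	 */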