/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)

/* s-boxes */
#define s1	cast6_s1
#define s2	cast6_s2
#define s3	cast6_s3
#define s4	cast6_s4
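
/*
 * For reference: km and kr above are byte offsets into the key schedule
 * produced by the generic CAST6 code (struct cast6_ctx in
 * include/crypto/cast6.h), which lays the keys out roughly as
 *	u32 Km[12][4];	- four 32-bit masking keys per quad-round
 *	u8  Kr[12][4];	- four 5-bit rotation keys per quad-round
 * so km points at Km[0][0] and kr at Kr[0][0].
 */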

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);                   \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
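
/*
 * lookup_32bit performs the four byte-indexed S-box lookups of one CAST6
 * round on the low 32 bits of src (which holds two 32-bit I values, one per
 * block): bytes 1/0 index s1/s2, then after the shrq bytes 3/2 index s3/s4,
 * with op1..op3 folding the four table values into dst.  interleave_op lets
 * the caller slip in shr_next to expose the second block's 32-bit value
 * while the loads are in flight; dummy is used when no shift is needed.
 */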

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;

#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x; \
	vpslld	RKRF, x, RTMP; \
	vpsrld	RKRR, x, x; \
	vpor	RTMP, x, x; \
	\
	vmovq		x, gi1; \
	vpextrq $1,	x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32,	RFS2; \
	orq		RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32,	RFS1; \
	orq		RFS1, RFS3; \
	\
	vmovq		RFS2, x; \
	vpinsrq $1,	RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor		a1, RX, a1; \
	vpxor		a2, RTMP, a2;
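
/*
 * Rough data flow of the round function above: F_head does the key mixing
 * (op0 with the broadcast Km in RKM) and the <<< Kr rotate in SIMD, then
 * spills the two 64-bit halves of the result into general purpose registers;
 * F_tail performs the scalar S-box lookups on those halves and gathers the
 * four 32-bit results back into an xmm register.  F_2 evaluates the round
 * function for both 4-block groups (b1, b2) and xors the results into a1/a2.
 */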

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
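
/*
 * The three (op0, op1, op2, op3) combinations above match the CAST6
 * (RFC 2612) round functions:
 *	f1: I = ((Km + D) <<< Kr); f = ((s1[Ia] ^ s2[Ib]) - s3[Ic]) + s4[Id]
 *	f2: I = ((Km ^ D) <<< Kr); f = ((s1[Ia] - s2[Ib]) + s3[Ic]) ^ s4[Id]
 *	f3: I = ((Km - D) <<< Kr); f = ((s1[Ia] + s2[Ib]) ^ s3[Ic]) - s4[Id]
 * where Ia..Id are the bytes of I, most significant first.  qop(in, out, f)
 * applies Ff to the "in" words of both 4-block groups and xors the result
 * into the "out" words, i.e. out ^= Ff(in).
 */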

#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM; \
	vpand		R1ST, RKR, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	vpsrldq $1,	RKR, RKR;
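
/*
 * get_round_keys broadcasts the 32-bit masking key Km[nn] into RKM and
 * derives the rotation amounts from RKR, whose lowest byte holds the current
 * Kr value: RKRF keeps its low five bits (R1ST is loaded from .Lfirst_mask),
 * RKRR becomes 32 - Kr for the complementary right shift, and RKR is then
 * advanced by one byte to the next rotation key.
 */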

#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);
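
/*
 * Q(n) is one forward CAST6 quad-round over all eight blocks,
 *	C ^= f1(D); B ^= f2(C); A ^= f3(B); D ^= f1(A);
 * using key sets Km/Kr[4n..4n+3].  QBAR(n) applies the same four steps in
 * reverse order with the same keys and is therefore the inverse of Q(n).
 * Encryption runs Q(0)..Q(5) followed by QBAR(6)..QBAR(11); decryption
 * undoes that with Q(11)..Q(6) followed by QBAR(5)..QBAR(0).
 */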

#define shuffle(mask) \
	vpshufb		mask, RKR, RKR;

#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);
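
/*
 * preload_rkr pulls in the 16 rotation bytes (four quad-rounds worth) for
 * key group n.  Xoring each byte with 16 adds 16 (mod 32) to every rotation;
 * that extra rotate-by-16 is what lets lookup_32bit pick the S-box index
 * bytes with plain bh/bl accesses in s1..s4 order.  do_mask(mask) then
 * optionally reorders the bytes (see the .Lrkr_* tables below) to match the
 * order in which the Q/QBAR sequences of the two block functions consume
 * them.
 */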

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;
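
/*
 * transpose_4x4 is a standard 4x4 transpose of 32-bit words: on input each
 * of x0..x3 holds one whole 16-byte block, on output x0..x3 hold the A/B/C/D
 * words of four different blocks, so each round instruction works on four
 * blocks at once (eight across the two register sets).
 */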

#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;
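
/*
 * inpack_blocks byte-swaps every 32-bit word (CAST6 is defined on big-endian
 * words) and then transposes four blocks into the word-sliced layout used by
 * the rounds; outunpack_blocks is the exact reverse: transpose back to one
 * block per register, then restore the original byte order.
 */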

.data

.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
	.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0
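
/*
 * The .Lrkr_* tables are vpshufb masks applied to the 16 rotation bytes of a
 * key group: "Q" quarters keep the natural byte order while "QBAR" quarters
 * reverse it (QBAR consumes Kr[4n+3] first), and the _dec variants also
 * reverse the order of the four quad-rounds to match the reversed round
 * sequence in __cast6_dec_blk8.  .L16_mask, .L32_mask and .Lfirst_mask feed
 * preload_rkr and get_round_keys above.
 */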

.text

.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_enc_blk8)

.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_dec_blk8)
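
/*
 * The two helpers above take only the context pointer in %rdi and keep all
 * eight blocks in RA1..RD1/RA2..RD2; the exported entry points below wrap
 * them with the load/store helpers from glue_helper-asm-avx.S for the ECB,
 * CBC, CTR and XTS walks driven by the C glue code.
 */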

ENTRY(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(cast6_ecb_enc_8way)

ENTRY(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(cast6_ecb_dec_8way)

ENTRY(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	ret;
ENDPROC(cast6_cbc_dec_8way)
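
/*
 * store_cbc_8way (glue_helper-asm-avx.S) is expected to xor blocks 1..7 with
 * the preceding ciphertext blocks re-read from src (%r12) before storing to
 * dst (%r11); the xor of block 0 with the chaining IV is left to the C glue
 * code, which is why the source pointer has to survive the call in %r12.
 */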

ENTRY(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	ret;
ENDPROC(cast6_ctr_8way)
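
/*
 * For CTR, load_ctr_8way should build eight consecutive big-endian counter
 * blocks from the IV at (%rcx) (byte-swapping through .Lbswap128_mask) and
 * write the incremented counter back for the next chunk; the counters are
 * then run through the *encryption* path and store_ctr_8way xors the
 * resulting keystream with the source (%r12) into dst (%r11).
 */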

ENTRY(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(cast6_xts_enc_8way)
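
/*
 * For XTS, load_xts_8way derives the eight per-block tweaks t*α^i from the
 * incoming tweak at (%rcx) using .Lxts_gf128mul_and_shl1_mask (GF(2^128)
 * doubling with the 0x87 reduction polynomial), parks them in dst, and xors
 * them into the blocks; store_xts_8way applies the second tweak xor after
 * the cipher, as the inline comments note.
 */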

ENTRY(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(cast6_xts_dec_8way)