/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"
/* structure of crypto context */
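/*
 * struct cast6_ctx holds 12 quad-rounds x 4 masking keys (Km, 32 bits each)
 * followed by 12 x 4 rotation keys (Kr, one byte each); the km and kr
 * offsets used below index those two arrays within the context.
 */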
/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
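/*
 * Eight 16-byte blocks are processed per call, as two groups of four
 * (RA1..RD1 and RA2..RD2).  Within each group the blocks are transposed so
 * that each xmm register holds the same 32-bit word of four different
 * blocks, letting every vector instruction advance four blocks at once.
 */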
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,       RID1d;    \
	leaq		s1(%rip),        RID2;     \
	movl		(RID2, RID1, 4), dst ## d; \
	movzbl		src ## bl,       RID2d;    \
	leaq		s2(%rip),        RID1;     \
	op1		(RID1, RID2, 4), dst ## d; \
	shrq $16,	src;                       \
	movzbl		src ## bh,       RID1d;    \
	leaq		s3(%rip),        RID2;     \
	op2		(RID2, RID1, 4), dst ## d; \
	movzbl		src ## bl,       RID2d;    \
	interleave_op(il_reg);                     \
	leaq		s4(%rip),        RID1;     \
	op3		(RID1, RID2, 4), dst ## d;
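/*
 * lookup_32bit() does the four s-box lookups for one 32-bit word held in a
 * general-purpose register: s1/s2 are indexed by the low two bytes, the
 * register is shifted down 16 bits, and s3/s4 are indexed by the next two
 * bytes, with op1..op3 combining the entries into dst.  The interleave_op
 * hook (dummy or shr_next below) lets the caller slip in the shift that
 * exposes the second 32-bit word for the following call.
 */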
#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;
#define F_head(a, x, gi1, gi2, op0) \
	vpslld RKRF, x, RTMP; \

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	vpinsrq $1, RFS3, x, x;
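/*
 * F_head combines RKM into the input word (op0 = add, xor or sub), rotates
 * it left by the round's rotation amount (shift left by RKRF, shift right
 * by RKRR, or the halves together) and moves the result into the gi1/gi2
 * general-purpose registers.  F_tail then runs lookup_32bit on each of the
 * four 32-bit lanes and packs the four s-box results back into x.
 */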
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
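/*
 * F1_2/F2_2/F3_2 above select the three CAST-256 round function types
 * (RFC 2612), evaluated on two 4-block groups at once:
 *   f1: I = rotl32(Km + D, Kr); f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id]
 *   f2: I = rotl32(Km ^ D, Kr); f = ((S1[Ia] - S2[Ib]) + S3[Ic]) ^ S4[Id]
 *   f3: I = rotl32(Km - D, Kr); f = ((S1[Ia] + S2[Ib]) ^ S3[Ic]) - S4[Id]
 * op0 selects the Km combine; op1..op3 are the scalar s-box combines.
 */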
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
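/*
 * get_round_keys(nn) broadcasts masking key Km[nn] into RKM and splits the
 * next rotation-key byte from RKR into a left-shift count (RKRF, low five
 * bits) and the complementary right-shift count (RKRR = 32 - RKRF), then
 * shifts the consumed byte out of RKR.
 */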
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM; \
	vpand		R1ST, RKR, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	vpsrldq $1,	RKR, RKR;
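/*
 * Q(n) is the forward quad-round of CAST-256: with subkeys 4*n+0..4*n+3 it
 * updates C ^= f1(D), B ^= f2(C), A ^= f3(B), D ^= f1(A).  QBAR(n) is the
 * inverse quad-round, performing the same four steps in reverse order, with
 * the subkeys in reverse order.  Encryption runs Q(0)..Q(5) followed by
 * QBAR(6)..QBAR(11); decryption runs Q(11)..Q(6) followed by QBAR(5)..QBAR(0).
 */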
	get_round_keys(4*n+0); \
	get_round_keys(4*n+1); \
	get_round_keys(4*n+2); \
	get_round_keys(4*n+3); \

	get_round_keys(4*n+3); \
	get_round_keys(4*n+2); \
	get_round_keys(4*n+1); \
	get_round_keys(4*n+0); \

#define shuffle(mask) \
	vpshufb mask(%rip), RKR, RKR;
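/*
 * preload_rkr(n) loads the 16 rotation-key bytes covering quad-rounds
 * 4*n..4*n+3 into RKR (each XORed with 16, see the comment below) and, via
 * do_mask, byte-shuffles them so that get_round_keys()'s one-byte-at-a-time
 * consumption matches the Q/QBAR ordering the caller is about to use (the
 * .Lrkr_* masks in .rodata).
 */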
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip), RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq	x1, x0, t0; \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x3; \
	\
	vpunpcklqdq	t1, t0, x0; \
	vpunpckhqdq	t1, t0, x1; \
	vpunpcklqdq	x3, t2, x2; \
	vpunpckhqdq	x3, t2, x3;
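/*
 * inpack_blocks byte-swaps every 32-bit word to big-endian and transposes
 * the 4x4 matrix of words so that each register ends up holding the same
 * word (A, B, C or D) of four different blocks; outunpack_blocks is the
 * inverse, run after the last round.
 */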
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;
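/*
 * Constant tables: .Lbswap_mask byte-swaps each 32-bit word and
 * .Lbswap128_mask a whole 128-bit block; the .Lrkr_enc_* / .Lrkr_dec_*
 * masks reorder the 16 rotation-key bytes of a preload_rkr() group to
 * match the Q/QBAR sequence named in the label.
 */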
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/*
	 * input:  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output: RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
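	/*
	 * 12 quad-rounds: Q(0)..Q(5), then QBAR(6)..QBAR(11).  Rotation keys
	 * for four rounds are preloaded at a time; the middle group straddles
	 * the Q/QBAR switch, hence its Q_Q_QBAR_QBAR shuffle mask.
	 */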
	preload_rkr(0, dummy, none);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);

	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
SYM_FUNC_END(__cast6_enc_blk8)
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/*
	 * input:  RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output: RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;
	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
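	/*
	 * Inverse schedule: Q(11)..Q(6), then QBAR(5)..QBAR(0).  The key
	 * groups are walked backwards, so each .Lrkr_dec_* mask also reverses
	 * the quad-round order within its 16-byte group.
	 */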
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);

	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
SYM_FUNC_END(__cast6_dec_blk8)
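/*
 * The exported 8-way helpers below take ctx in %rdi, dst in %rsi and src in
 * %rdx.  dst is carried across the block-function call in %r11 because the
 * round code clobbers %rsi/%rdx; cast6_cbc_dec_8way additionally keeps the
 * original src in %r12 for the CBC chaining xor done by store_cbc_8way().
 */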
SYM_FUNC_START(cast6_ecb_enc_8way)
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_cbc_dec_8way)