/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"
/* structure of crypto context */
#define km	0
#define kr	(12*4*4)

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
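/*
 * lookup_32bit: feed the four bytes of a 32-bit value (kept in a general
 * purpose register so %bh/%bl addressing works) through the CAST s-boxes
 * s1..s4 and fold the four results together with the round-dependent ops
 * op1..op3.  interleave_op(il_reg) lets the caller advance to the next byte
 * pair (shr_next) or do nothing (dummy) between the two lookup pairs.
 */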
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,       RID1d;    \
	movzbl		src ## bl,       RID2d;    \
	shrq $16,	src;                       \
	movl		s1(, RID1, 4),   dst ## d; \
	op1		s2(, RID2, 4),   dst ## d; \
	movzbl		src ## bh,       RID1d;    \
	movzbl		src ## bl,       RID2d;    \
	interleave_op(il_reg);                     \
	op2		s3(, RID1, 4),   dst ## d; \
	op3		s4(, RID2, 4),   dst ## d;
#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;
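/*
 * F_head/F_tail implement the CAST-256 round function on two groups of four
 * blocks at once: F_head combines the data with the masking key RKM (op0 is
 * add, xor or sub depending on the round type), rotates by the per-round
 * rotation key (RKRF holds the rotate count, RKRR holds 32 minus it) and
 * moves the result into GPR pairs for the s-box lookups; F_tail performs the
 * lookups and reassembles the 128-bit result.
 */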
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;        \
	vpslld	RKRF,	x,    RTMP;     \
	vpsrld	RKRR,	x,    x;        \
	vpor	RTMP,	x,    x;        \
	\
	vmovq		x,    gi1;      \
	vpextrq $1,	x,    gi2;
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;
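/*
 * F_2 applies one round function to both 4-block groups (8 blocks total) and
 * xors the result into a1/a2.  F1_2/F2_2/F3_2 below select the three CAST-256
 * round function types, which differ only in which of add/xor/sub is used at
 * each combining step (f1/f2/f3 of RFC 2612).
 */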
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
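/*
 * get_round_keys: broadcast the 32-bit masking key km[nn] into RKM, mask the
 * current rotation byte with R1ST to get the rotate count in RKRF, compute
 * 32 minus that count into RKRR, and shift RKR down one byte so the next call
 * sees the next rotation key.
 */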
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;
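/*
 * Q(n) is one forward quad-round and QBAR(n) its inverse: each loads the four
 * masking/rotation key pairs of quad-round n and applies the f1/f2/f3 round
 * functions to the D->C, C->B, B->A and A->D words, QBAR in the reverse
 * order.  CAST-256 encryption runs six Q quad-rounds followed by six QBAR
 * quad-rounds.
 */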
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);
#define shuffle(mask) \
	vpshufb		mask,            RKR, RKR;
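/*
 * preload_rkr: load the sixteen rotation-key bytes used by the next four
 * quad-rounds into RKR (each biased by 16, see the comment in the macro) and
 * optionally reorder them with a .Lrkr_* shuffle mask so that QBAR rounds and
 * decryption consume them in the right order.
 */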
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);
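/*
 * transpose_4x4: 4x4 32-bit matrix transpose across four xmm registers, so
 * that after inpack_blocks each register holds the same word (A, B, C or D)
 * of four different blocks.
 */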
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;
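/*
 * inpack_blocks/outunpack_blocks: byte-swap each word to host order (CAST-256
 * is big-endian) and transpose between the one-block-per-register layout used
 * in memory and the one-word-per-register, four-blocks-wide layout used by
 * the round macros.
 */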
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask,	x0,	x0; \
	vpshufb rmask,	x1,	x1; \
	vpshufb rmask,	x2,	x2; \
	vpshufb rmask,	x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0,	x0; \
	vpshufb rmask,	x1,	x1; \
	vpshufb rmask,	x2,	x2; \
	vpshufb rmask,	x3,	x3;
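/*
 * The .Lrkr_* byte patterns below pre-shuffle the sixteen rotation-key bytes
 * loaded by preload_rkr so that they come out of RKR in the temporal order in
 * which get_round_keys is invoked: Q reads each quad-round's keys in
 * increasing order, QBAR in decreasing order, and decryption walks the
 * quad-rounds backwards.
 */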
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
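	/* 12 quad-rounds: the first six forward (Q), the last six reversed
	 * (QBAR); RKR is reloaded and reshuffled for every group of four
	 * quad-rounds. */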
	preload_rkr(0, dummy, none);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
SYM_FUNC_END(__cast6_enc_blk8)
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
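	/* decryption runs the same 12 quad-rounds with the key schedule walked
	 * backwards: six forward (Q) quad-rounds using the last six sets of
	 * subkeys, then six reversed (QBAR) quad-rounds using the first six. */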
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
SYM_FUNC_END(__cast6_dec_blk8)
SYM_FUNC_START(cast6_ecb_enc_8way)
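	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (held in %r11 across the __cast6_enc_blk8 call)
	 *	%rdx: src
	 */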
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_ecb_enc_8way)
SYM_FUNC_START(cast6_ecb_dec_8way)
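	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (held in %r11 across the __cast6_dec_blk8 call)
	 *	%rdx: src
	 */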
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_ecb_dec_8way)
SYM_FUNC_START(cast6_cbc_dec_8way)
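	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst (held in %r11)
	 *	%rdx: src (held in %r12 for the CBC xor chain)
	 */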
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_cbc_dec_8way)
SYM_FUNC_START(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_ctr_8way)
SYM_FUNC_START(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_xts_enc_8way)
SYM_FUNC_START(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
SYM_FUNC_END(cast6_xts_dec_8way)