/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"
/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
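
/*
 * lookup_32bit combines four CAST s-box lookups (s1..s4, indexed by the
 * bytes of src) into dst using the caller-supplied op1/op2/op3 operations;
 * interleave_op lets the caller shift il_reg down to the next 16 bits
 * between the two byte pairs.
 */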
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;
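
/*
 * F_head/F_tail together evaluate the CAST-256 round function on two sets of
 * four blocks: F_head applies the masking key (op0) and the key-dependent
 * rotate in AVX registers, then moves the halves into general-purpose
 * registers so F_tail can perform the s-box lookups and merge the results
 * back into x.
 */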
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;    \
	vpslld	RKRF,	x,    RTMP; \
	vpsrld	RKRR,	x,    x;    \
	vpor	RTMP,	x,    x;    \
	\
	vmovq		x,    gi1;  \
	vpextrq $1,	x,    gi2;
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);             \
	F_head(b2, RX, RGI3, RGI4, op0);             \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);   \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor		a1, RX,   a1;                \
	vpxor		a2, RTMP, a2;
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
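
/*
 * get_round_keys broadcasts the 32-bit masking key km[nn] into RKM and
 * derives the rotate counts from the current low byte of RKR: RKRF keeps its
 * low five bits, RKRR = 32 - RKRF, and RKR is shifted down one byte so the
 * next call sees the following rotation key.
 */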
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;
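
/*
 * Q(n) is one forward CAST-256 quad-round using round keys 4n..4n+3;
 * QBAR(n) is its inverse, applying the same steps in reverse order.
 */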
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);
#define shuffle(mask) \
	vpshufb		mask,	RKR, RKR;
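
/*
 * preload_rkr loads the 16 rotation-key bytes used by quad-rounds 4n..4n+3
 * into RKR and lets the caller reorder them (do_mask/mask) to match the
 * order in which the following Q/QBAR invocations consume them.
 */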
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;
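
/*
 * inpack_blocks byte-swaps each 32-bit word and transposes four 128-bit
 * blocks so that each xmm register holds the same word (A, B, C or D) from
 * all four blocks; outunpack_blocks reverses the transform after the rounds.
 */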
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask,	x0,	x0; \
	vpshufb rmask,	x1,	x1; \
	vpshufb rmask,	x2,	x2; \
	vpshufb rmask,	x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3;
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
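/*
 * Byte-shuffle masks for preload_rkr: they reorder the rotation-key bytes to
 * match the quad-round order and direction (Q vs QBAR) used by the
 * encryption and decryption flows below.
 */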
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text
.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
	ret;
ENDPROC(__cast6_enc_blk8)
.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
	ret;
ENDPROC(__cast6_dec_blk8)
ENTRY(cast6_ecb_enc_8way)
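	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */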
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ENDPROC(cast6_ecb_enc_8way)
ENTRY(cast6_ecb_dec_8way)
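	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */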
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ENDPROC(cast6_ecb_dec_8way)
ENTRY(cast6_cbc_dec_8way)
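	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */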
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ENDPROC(cast6_cbc_dec_8way)
ENTRY(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);
	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ENDPROC(cast6_ctr_8way)
ENTRY(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ENDPROC(cast6_xts_enc_8way)
ENTRY(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ENDPROC(cast6_xts_dec_8way)