2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 .file "twofish-avx-x86_64-asm_64.S"
27 /* structure of crypto context */
35 /**********************************************************************
37 **********************************************************************/
76 #define lookup_32bit(t0, t1, t2, t3, src, dst) \
77 movb src ## bl, RID1b; \
78 movb src ## bh, RID2b; \
79 movl t0(CTX, RID1, 4), dst ## d; \
80 xorl t1(CTX, RID2, 4), dst ## d; \
82 movb src ## bl, RID1b; \
83 movb src ## bh, RID2b; \
84 xorl t2(CTX, RID1, 4), dst ## d; \
85 xorl t3(CTX, RID2, 4), dst ## d;
87 #define G(a, x, t0, t1, t2, t3) \
92 lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
94 lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
98 lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
100 lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
105 vpinsrq $1, RGS3, x, x;
107 #define encround(a, b, c, d, x, y) \
108 G(a, x, s0, s1, s2, s3); \
109 G(b, y, s1, s2, s3, s0); \
116 vpslld $(32 - 1), c, c; \
119 vpsrld $(32 - 1), d, d; \
123 #define decround(a, b, c, d, x, y) \
124 G(a, x, s0, s1, s2, s3); \
125 G(b, y, s1, s2, s3, s0); \
131 vpslld $(32 - 1), d, d; \
134 vpsrld $(32 - 1), c, c; \
139 #define encrypt_round(n, a, b, c, d) \
140 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
141 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
142 encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
143 encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
145 #define decrypt_round(n, a, b, c, d) \
146 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
147 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
148 decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
149 decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
151 #define encrypt_cycle(n) \
152 encrypt_round((2*n), RA, RB, RC, RD); \
153 encrypt_round(((2*n) + 1), RC, RD, RA, RB);
155 #define decrypt_cycle(n) \
156 decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
157 decrypt_round((2*n), RA, RB, RC, RD);
160 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
161 vpunpckldq x1, x0, t0; \
162 vpunpckhdq x1, x0, t2; \
163 vpunpckldq x3, x2, t1; \
164 vpunpckhdq x3, x2, x3; \
166 vpunpcklqdq t1, t0, x0; \
167 vpunpckhqdq t1, t0, x1; \
168 vpunpcklqdq x3, t2, x2; \
169 vpunpckhqdq x3, t2, x3;
171 #define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
172 vpxor (0*4*4)(in), wkey, x0; \
173 vpxor (1*4*4)(in), wkey, x1; \
174 vpxor (2*4*4)(in), wkey, x2; \
175 vpxor (3*4*4)(in), wkey, x3; \
177 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
179 #define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
180 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
182 vpxor x0, wkey, x0; \
183 vmovdqu x0, (0*4*4)(out); \
184 vpxor x1, wkey, x1; \
185 vmovdqu x1, (1*4*4)(out); \
186 vpxor x2, wkey, x2; \
187 vmovdqu x2, (2*4*4)(out); \
188 vpxor x3, wkey, x3; \
189 vmovdqu x3, (3*4*4)(out);
191 #define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
192 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
194 vpxor x0, wkey, x0; \
195 vpxor (0*4*4)(out), x0, x0; \
196 vmovdqu x0, (0*4*4)(out); \
197 vpxor x1, wkey, x1; \
198 vpxor (1*4*4)(out), x1, x1; \
199 vmovdqu x1, (1*4*4)(out); \
200 vpxor x2, wkey, x2; \
201 vpxor (2*4*4)(out), x2, x2; \
202 vmovdqu x2, (2*4*4)(out); \
203 vpxor x3, wkey, x3; \
204 vpxor (3*4*4)(out), x3, x3; \
205 vmovdqu x3, (3*4*4)(out);
208 .global __twofish_enc_blk_8way
209 .type __twofish_enc_blk_8way,@function;
211 __twofish_enc_blk_8way:
216 * %rcx: bool, if true: xor output
224 leaq (4*4*4)(%rdx), %rax;
225 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
226 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
240 vmovdqu (w+4*4)(CTX), RK1;
245 leaq (4*4*4)(%rsi), %rax;
250 outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
251 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
256 outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
257 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
262 .global twofish_dec_blk_8way
263 .type twofish_dec_blk_8way,@function;
265 twofish_dec_blk_8way:
274 vmovdqu (w+4*4)(CTX), RK1;
276 leaq (4*4*4)(%rdx), %rax;
277 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
278 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
292 vmovdqu (w)(CTX), RK1;
296 leaq (4*4*4)(%rsi), %rax;
297 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
298 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);