/*
 * x86_64/AVX2 assembler optimized version of Twofish
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
13 #include <linux/linkage.h>
14 #include "glue_helper-asm-avx2.S"
16 .file "twofish-avx2-asm_64.S"
/* NOTE(review): the label for this table is outside this view; the byte
 * pattern 15..0 is a 128-bit byte-reversal shuffle mask — presumably
 * .Lbswap128_mask used by load_ctr_16way below; confirm upstream. */
28 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* XTS tweak-step constants: GF(2^128) multiply-by-alpha done as a
 * shift-left-by-1 with reduction by the 0x87 polynomial (mask_0), and
 * multiply-by-alpha^2 as shift-by-2 with reduction by 0x0e,1 (mask_1). */
29 .Lxts_gf128mul_and_shl1_mask_0:
30 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
31 .Lxts_gf128mul_and_shl1_mask_1:
32 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
36 /* structure of crypto context */
79 /* vpgatherdd mask and '-1' */
82 /* byte mask, (-1 >> 24) */
85 /**********************************************************************
87 **********************************************************************/
/*
 * init_round_constants() - build the two constants used by the
 * vpgatherdd-based s-box lookups:
 *   RNOT  = all-ones dwords (source for the gather mask),
 *   RBYTE = 0x000000ff per dword lane (byte mask, i.e. -1 >> 24).
 * Fix: the final line must not end in a '\' line-continuation — it was
 * splicing the following '#define g16' into this macro's body.
 * NOTE(review): intervening source lines appear truncated here; confirm
 * against upstream whether additional setup (key-pointer loads) belongs
 * in this macro.
 */
88 #define init_round_constants() \
89 vpcmpeqd RNOT, RNOT, RNOT; \
90 vpsrld $24, RNOT, RBYTE;
/*
 * g16(ab, rs0, rs1, rs2, rs3, xy) - Twofish G-function, 16 blocks wide
 * (two ymm register sets, name suffixes 0 and 1).
 *
 * Each dword lane of ab##0/ab##1 is split into its four bytes; each byte
 * indexes one of the four 256-entry dword tables rs0..rs3 (hence the *4
 * address scale), and the gathered values are XOR-combined into
 * xy##0/xy##1.  Clobbers RIDX and RT0.
 *
 * vpgatherdd consumes (zeroes) its mask register on completion, so the
 * all-ones mask in RNOT is rebuilt with vpcmpeqd after every gather.
 */
97 #define g16(ab, rs0, rs1, rs2, rs3, xy) \
	/* byte 0 (bits 0..7): xy = rs0[ab & 0xff] */ \
98 vpand RBYTE, ab ## 0, RIDX; \
99 vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
100 vpcmpeqd RNOT, RNOT, RNOT; \
102 vpand RBYTE, ab ## 1, RIDX; \
103 vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
104 vpcmpeqd RNOT, RNOT, RNOT; \
	/* byte 1 (bits 8..15): xy ^= rs1[(ab >> 8) & 0xff] */ \
106 vpsrld $8, ab ## 0, RIDX; \
107 vpand RBYTE, RIDX, RIDX; \
108 vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
109 vpcmpeqd RNOT, RNOT, RNOT; \
110 vpxor RT0, xy ## 0, xy ## 0; \
112 vpsrld $8, ab ## 1, RIDX; \
113 vpand RBYTE, RIDX, RIDX; \
114 vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
115 vpcmpeqd RNOT, RNOT, RNOT; \
116 vpxor RT0, xy ## 1, xy ## 1; \
	/* byte 2 (bits 16..23): xy ^= rs2[(ab >> 16) & 0xff] */ \
118 vpsrld $16, ab ## 0, RIDX; \
119 vpand RBYTE, RIDX, RIDX; \
120 vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
121 vpcmpeqd RNOT, RNOT, RNOT; \
122 vpxor RT0, xy ## 0, xy ## 0; \
124 vpsrld $16, ab ## 1, RIDX; \
125 vpand RBYTE, RIDX, RIDX; \
126 vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
127 vpcmpeqd RNOT, RNOT, RNOT; \
128 vpxor RT0, xy ## 1, xy ## 1; \
	/* byte 3 (bits 24..31): top byte, no masking needed after the shift */ \
130 vpsrld $24, ab ## 0, RIDX; \
131 vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
132 vpcmpeqd RNOT, RNOT, RNOT; \
133 vpxor RT0, xy ## 0, xy ## 0; \
135 vpsrld $24, ab ## 1, RIDX; \
136 vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
137 vpcmpeqd RNOT, RNOT, RNOT; \
138 vpxor RT0, xy ## 1, xy ## 1;
/* g1: G-function on the first input word, s-box table order RS0..RS3. */
140 #define g1_16(a, x) \
141 g16(a, RS0, RS1, RS2, RS3, x);
/* g2: G-function on the second input word; table order rotated by one
 * (RS1,RS2,RS3,RS0), matching the byte-rotated second word. */
143 #define g2_16(b, y) \
144 g16(b, RS1, RS2, RS3, RS0, y);
/*
 * encrypt_round_end16(a, b, c, d, nk) - tail shared by all encrypt-round
 * variants, 16 blocks wide (register sets 0 and 1, handled in sequence):
 *   RX += RY; RY += RX            (pseudo-Hadamard transform)
 *   RX += K[nk], RY += K[nk+4]    (subkeys broadcast from nk(RK,RROUND,8))
 *   d ^= RY
 *   c  = ror32(c ^ RX, 1)         (rotate open-coded as shift/shift/or)
 * Clobbers RT0.
 *
 * Fix: the final line previously ended with a '\' continuation, which
 * spliced the following '#define encrypt_round16' into this macro's
 * body; the sibling decrypt_round_end16 correctly ends without one.
 */
146 #define encrypt_round_end16(a, b, c, d, nk) \
147 vpaddd RY0, RX0, RX0; \
148 vpaddd RX0, RY0, RY0; \
149 vpbroadcastd nk(RK,RROUND,8), RT0; \
150 vpaddd RT0, RX0, RX0; \
151 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
152 vpaddd RT0, RY0, RY0; \
154 vpxor RY0, d ## 0, d ## 0; \
156 vpxor RX0, c ## 0, c ## 0; \
157 vpsrld $1, c ## 0, RT0; \
158 vpslld $31, c ## 0, c ## 0; \
159 vpor RT0, c ## 0, c ## 0; \
161 vpaddd RY1, RX1, RX1; \
162 vpaddd RX1, RY1, RY1; \
163 vpbroadcastd nk(RK,RROUND,8), RT0; \
164 vpaddd RT0, RX1, RX1; \
165 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
166 vpaddd RT0, RY1, RY1; \
168 vpxor RY1, d ## 1, d ## 1; \
170 vpxor RX1, c ## 1, c ## 1; \
171 vpsrld $1, c ## 1, RT0; \
172 vpslld $31, c ## 1, c ## 1; \
173 vpor RT0, c ## 1, c ## 1;
/*
 * encrypt_round16: rotate b##0/b##1 left by 1 (open-coded rol32), then
 * finish via encrypt_round_end16.  NOTE(review): lines between the
 * rotates and the tail call are elided in this view — presumably the
 * g1_16/g2_16 invocations; confirm against upstream.
 */
175 #define encrypt_round16(a, b, c, d, nk) \
178 vpslld $1, b ## 0, RT0; \
179 vpsrld $31, b ## 0, b ## 0; \
180 vpor RT0, b ## 0, b ## 0; \
182 vpslld $1, b ## 1, RT0; \
183 vpsrld $31, b ## 1, b ## 1; \
184 vpor RT0, b ## 1, b ## 1; \
188 encrypt_round_end16(a, b, c, d, nk);
/*
 * encrypt_round_first16: the first round additionally rotates d##0/d##1
 * left by 1 before expanding to a normal encrypt_round16.
 */
190 #define encrypt_round_first16(a, b, c, d, nk) \
191 vpslld $1, d ## 0, RT0; \
192 vpsrld $31, d ## 0, d ## 0; \
193 vpor RT0, d ## 0, d ## 0; \
195 vpslld $1, d ## 1, RT0; \
196 vpsrld $31, d ## 1, d ## 1; \
197 vpor RT0, d ## 1, d ## 1; \
199 encrypt_round16(a, b, c, d, nk);
/*
 * encrypt_round_last16: last round goes straight to the shared tail.
 * NOTE(review): intermediate lines are elided in this view — likely the
 * g-function calls without the b pre-rotate; confirm upstream.
 */
201 #define encrypt_round_last16(a, b, c, d, nk) \
206 encrypt_round_end16(a, b, c, d, nk);
/*
 * decrypt_round_end16(a, b, c, d, nk) - tail shared by all decrypt-round
 * variants; mirror of encrypt_round_end16:
 *   RX += RY; RY += RX            (pseudo-Hadamard transform)
 *   RX += K[nk], RY += K[nk+4]    (subkeys broadcast from nk(RK,RROUND,8))
 *   c ^= RX
 *   d  = ror32(d ^ RY, 1)         (rotate open-coded as shift/shift/or)
 * Clobbers RT0.
 */
208 #define decrypt_round_end16(a, b, c, d, nk) \
209 vpaddd RY0, RX0, RX0; \
210 vpaddd RX0, RY0, RY0; \
211 vpbroadcastd nk(RK,RROUND,8), RT0; \
212 vpaddd RT0, RX0, RX0; \
213 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
214 vpaddd RT0, RY0, RY0; \
216 vpxor RX0, c ## 0, c ## 0; \
218 vpxor RY0, d ## 0, d ## 0; \
219 vpsrld $1, d ## 0, RT0; \
220 vpslld $31, d ## 0, d ## 0; \
221 vpor RT0, d ## 0, d ## 0; \
223 vpaddd RY1, RX1, RX1; \
224 vpaddd RX1, RY1, RY1; \
225 vpbroadcastd nk(RK,RROUND,8), RT0; \
226 vpaddd RT0, RX1, RX1; \
227 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
228 vpaddd RT0, RY1, RY1; \
230 vpxor RX1, c ## 1, c ## 1; \
232 vpxor RY1, d ## 1, d ## 1; \
233 vpsrld $1, d ## 1, RT0; \
234 vpslld $31, d ## 1, d ## 1; \
235 vpor RT0, d ## 1, d ## 1;
/*
 * decrypt_round16: rotate a##0/a##1 left by 1 (open-coded rol32), then
 * finish via decrypt_round_end16.  NOTE(review): lines between the
 * rotates and the tail call are elided in this view — presumably the
 * g-function invocations; confirm against upstream.
 */
237 #define decrypt_round16(a, b, c, d, nk) \
240 vpslld $1, a ## 0, RT0; \
241 vpsrld $31, a ## 0, a ## 0; \
242 vpor RT0, a ## 0, a ## 0; \
244 vpslld $1, a ## 1, RT0; \
245 vpsrld $31, a ## 1, a ## 1; \
246 vpor RT0, a ## 1, a ## 1; \
250 decrypt_round_end16(a, b, c, d, nk);
/*
 * decrypt_round_first16: the first decrypt round additionally rotates
 * c##0/c##1 left by 1 before expanding to a normal decrypt_round16.
 * Fix: terminate the expansion with ';' for consistency with
 * encrypt_round_first16 (the trailing statement separator was missing).
 */
252 #define decrypt_round_first16(a, b, c, d, nk) \
253 vpslld $1, c ## 0, RT0; \
254 vpsrld $31, c ## 0, c ## 0; \
255 vpor RT0, c ## 0, c ## 0; \
257 vpslld $1, c ## 1, RT0; \
258 vpsrld $31, c ## 1, c ## 1; \
259 vpor RT0, c ## 1, c ## 1; \
261 decrypt_round16(a, b, c, d, nk);
/*
 * decrypt_round_last16: last decrypt round, straight to the shared tail.
 * NOTE(review): intermediate lines are elided in this view — likely the
 * g-function calls without the a pre-rotate; confirm upstream.
 */
263 #define decrypt_round_last16(a, b, c, d, nk) \
268 decrypt_round_end16(a, b, c, d, nk);
/*
 * One encryption cycle = two Feistel rounds.  The (a,b)<->(c,d) swap is
 * expressed by swapping macro arguments rather than moving registers.
 * NOTE(review): RROUND advancement between cycles is not visible in this
 * view (lines elided); confirm upstream.
 */
270 #define encrypt_cycle16() \
271 encrypt_round16(RA, RB, RC, RD, 0); \
272 encrypt_round16(RC, RD, RA, RB, 8);
/* first cycle: uses the extra pre-rotate variant for the opening round */
274 #define encrypt_cycle_first16() \
275 encrypt_round_first16(RA, RB, RC, RD, 0); \
276 encrypt_round16(RC, RD, RA, RB, 8);
/* last cycle: closing round skips the pre-rotate via the _last variant */
278 #define encrypt_cycle_last16() \
279 encrypt_round16(RA, RB, RC, RD, 0); \
280 encrypt_round_last16(RC, RD, RA, RB, 8);
/*
 * Decryption cycles: the two rounds of a cycle in reverse order relative
 * to encryption.  The 'n' parameter is unused in every visible body, and
 * the visible call sites below invoke these with empty parentheses —
 * presumably vestigial; confirm all call sites before removing it.
 */
282 #define decrypt_cycle16(n) \
283 decrypt_round16(RC, RD, RA, RB, 8); \
284 decrypt_round16(RA, RB, RC, RD, 0);
/* first decrypt cycle: opening round uses the extra c pre-rotate */
286 #define decrypt_cycle_first16(n) \
287 decrypt_round_first16(RC, RD, RA, RB, 8); \
288 decrypt_round16(RA, RB, RC, RD, 0);
/* last decrypt cycle: closing round via the _last variant */
290 #define decrypt_cycle_last16(n) \
291 decrypt_round16(RC, RD, RA, RB, 8); \
292 decrypt_round_last16(RA, RB, RC, RD, 0);
/*
 * transpose_4x4(x0, x1, x2, x3, t1, t2) - transpose a 4x4 matrix of
 * dwords held in x0..x3.  With ymm operands the vpunpck* instructions
 * work per 128-bit lane, so this transposes independently within each
 * lane.  t1 and t2 are clobbered as temporaries; the result is written
 * back into x0..x3.
 */
294 #define transpose_4x4(x0,x1,x2,x3,t1,t2) \
	/* interleave dwords: low/high halves of rows 0-1 and 2-3 */ \
295 vpunpckhdq x1, x0, t2; \
296 vpunpckldq x1, x0, x0; \
298 vpunpckldq x3, x2, t1; \
299 vpunpckhdq x3, x2, x2; \
	/* interleave qwords to finish the transpose */ \
301 vpunpckhqdq t1, x0, x1; \
302 vpunpcklqdq t1, x0, x0; \
304 vpunpckhqdq x2, t2, x3; \
305 vpunpcklqdq x2, t2, x2;
/*
 * read_blocks8/write_blocks8: convert between block order and sliced
 * order — both directions are the same 4x4 transpose.  'offs' is unused
 * in the body (kept for call-site symmetry in the 16-way wrappers
 * below).  RX0/RY0 are clobbered as transpose temporaries.
 */
307 #define read_blocks8(offs,a,b,c,d) \
308 transpose_4x4(a, b, c, d, RX0, RY0);
310 #define write_blocks8(offs,a,b,c,d) \
311 transpose_4x4(a, b, c, d, RX0, RY0);
/*
 * Input/output whitening, 8 blocks wide.  Each step broadcasts one
 * 32-bit whitening-key word from the key schedule at RW and (in the full
 * upstream source) XORs it into the corresponding block word: words
 * w[0..3] for the enc-input/dec-output side, w[4..7] for the
 * enc-output/dec-input side.
 *
 * NOTE(review): the vpxor lines between the broadcasts are missing from
 * this view — restore them from upstream.  Fix applied here: the final
 * line of each macro no longer ends in a '\' continuation, which was
 * splicing the next '#define' into the preceding macro's body.
 */
313 #define inpack_enc8(a,b,c,d) \
314 vpbroadcastd 4*0(RW), RT0; \
317 vpbroadcastd 4*1(RW), RT0; \
320 vpbroadcastd 4*2(RW), RT0; \
323 vpbroadcastd 4*3(RW), RT0;
326 #define outunpack_enc8(a,b,c,d) \
327 vpbroadcastd 4*4(RW), RX0; \
328 vpbroadcastd 4*5(RW), RY0; \
332 vpbroadcastd 4*6(RW), RT0; \
334 vpbroadcastd 4*7(RW), RT0;
340 #define inpack_dec8(a,b,c,d) \
341 vpbroadcastd 4*4(RW), RX0; \
342 vpbroadcastd 4*5(RW), RY0; \
346 vpbroadcastd 4*6(RW), RT0; \
348 vpbroadcastd 4*7(RW), RT0;
354 #define outunpack_dec8(a,b,c,d) \
355 vpbroadcastd 4*0(RW), RT0; \
358 vpbroadcastd 4*1(RW), RT0; \
361 vpbroadcastd 4*2(RW), RT0; \
364 vpbroadcastd 4*3(RW), RT0;
/*
 * 16-way wrappers: apply each 8-way operation to both register sets
 * (suffix 0 = blocks 0-7, suffix 1 = blocks 8-15).  The 0/8 arguments
 * are block offsets passed to the 8-way macros.
 * NOTE(review): xor_blocks8, used by xor_blocks16, is not defined within
 * this view — confirm it exists upstream.
 */
367 #define read_blocks16(a,b,c,d) \
368 read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
369 read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
371 #define write_blocks16(a,b,c,d) \
372 write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
373 write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
375 #define xor_blocks16(a,b,c,d) \
376 xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
377 xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
379 #define inpack_enc16(a,b,c,d) \
380 inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
381 inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
383 #define outunpack_enc16(a,b,c,d) \
384 outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
385 outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
387 #define inpack_dec16(a,b,c,d) \
388 inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
389 inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
391 #define outunpack_dec16(a,b,c,d) \
392 outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
393 outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
/*
 * __twofish_enc_blk16 - encrypt 16 blocks held in RA0..RD0/RA1..RD1
 * (internal register-based ABI; output replaces input in the same
 * registers).  The entry label and several encrypt_cycle16() iterations
 * are elided in this view — confirm upstream.
 */
399 * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
401 * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
403 init_round_constants();
405 read_blocks16(RA, RB, RC, RD);
406 inpack_enc16(RA, RB, RC, RD);
/* RROUND indexes the round-subkey pairs; encryption starts at 0 */
408 xorl RROUNDd, RROUNDd;
409 encrypt_cycle_first16();
/* intermediate encrypt_cycle16() iterations elided in this view */
420 encrypt_cycle_last16();
422 outunpack_enc16(RA, RB, RC, RD);
423 write_blocks16(RA, RB, RC, RD);
426 ENDPROC(__twofish_enc_blk16)
/*
 * __twofish_dec_blk16 - decrypt 16 blocks held in RA0..RD0/RA1..RD1
 * (internal register-based ABI; mirror of __twofish_enc_blk16).  The
 * entry label, the RROUND initialization, and the intermediate
 * decrypt_cycle16() iterations are elided in this view — presumably
 * RROUND starts at the last subkey pair and counts down; confirm
 * upstream.
 */
432 * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
434 * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
436 init_round_constants();
438 read_blocks16(RA, RB, RC, RD);
439 inpack_dec16(RA, RB, RC, RD);
442 decrypt_cycle_first16();
452 decrypt_cycle_last16();
454 outunpack_dec16(RA, RB, RC, RD);
455 write_blocks16(RA, RB, RC, RD);
458 ENDPROC(__twofish_dec_blk16)
/*
 * twofish_ecb_enc_16way - ECB-encrypt 16 blocks: load from src, run the
 * 16-way core, store to dst.  Register comments are elided in this
 * view; presumably %rdi = ctx, %rsi = dst, %rdx = src as in the other
 * entry points — confirm upstream.
 */
460 ENTRY(twofish_ecb_enc_16way)
470 load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
472 call __twofish_enc_blk16;
474 store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
480 ENDPROC(twofish_ecb_enc_16way)
/*
 * twofish_ecb_dec_16way - ECB-decrypt 16 blocks; mirror of
 * twofish_ecb_enc_16way using the decrypt core.  Presumably
 * %rdi = ctx, %rsi = dst, %rdx = src — confirm upstream.
 */
482 ENTRY(twofish_ecb_dec_16way)
492 load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
494 call __twofish_dec_blk16;
496 store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
502 ENDPROC(twofish_ecb_dec_16way)
/*
 * twofish_cbc_dec_16way - CBC-decrypt 16 blocks: decrypt, then let
 * store_cbc_16way XOR in the previous ciphertext blocks (read again from
 * src, hence %rdx passed twice).  NOTE(review): the tail of the
 * store_cbc_16way argument list is truncated in this view — restore the
 * remaining argument(s) and closing paren from upstream.
 */
504 ENTRY(twofish_cbc_dec_16way)
514 load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
516 call __twofish_dec_blk16;
518 store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
525 ENDPROC(twofish_cbc_dec_16way)
/*
 * twofish_ctr_16way - CTR mode: expand the 128-bit counter in (%rcx)
 * into 16 byte-swapped counter blocks, encrypt them, and XOR against
 * src into dst.  NOTE(review): the tail of the load_ctr_16way argument
 * list is truncated in this view — restore from upstream.
 */
527 ENTRY(twofish_ctr_16way)
530 * %rsi: dst (16 blocks)
531 * %rdx: src (16 blocks)
532 * %rcx: iv (little endian, 128bit)
538 load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
539 RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
542 call __twofish_enc_blk16;
544 store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
550 ENDPROC(twofish_ctr_16way)
/*
 * twofish_xts_crypt_16way - shared XTS body (local label, not exported):
 * load_xts_16way computes the per-block tweaks from the iv using the two
 * gf128mul masks, XORs them into the loaded blocks, then the core
 * routine supplied in %r8 is invoked (the indirect call is elided in
 * this view — confirm upstream), and store_xts_16way re-applies the
 * tweaks on the way out.
 */
553 twofish_xts_crypt_16way:
556 * %rsi: dst (16 blocks)
557 * %rdx: src (16 blocks)
558 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
559 * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
565 load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
566 RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
567 .Lxts_gf128mul_and_shl1_mask_0,
568 .Lxts_gf128mul_and_shl1_mask_1);
572 store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
578 ENDPROC(twofish_xts_crypt_16way)
/*
 * twofish_xts_enc_16way - XTS-encrypt trampoline: select the encrypt
 * core in %r8 and tail-jump into the shared XTS body (which returns
 * directly to our caller).
 */
580 ENTRY(twofish_xts_enc_16way)
583 * %rsi: dst (16 blocks)
584 * %rdx: src (16 blocks)
585 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
587 leaq __twofish_enc_blk16, %r8;
588 jmp twofish_xts_crypt_16way;
589 ENDPROC(twofish_xts_enc_16way)
/*
 * twofish_xts_dec_16way - XTS-decrypt trampoline: select the decrypt
 * core in %r8 and tail-jump into the shared XTS body.
 */
591 ENTRY(twofish_xts_dec_16way)
594 * %rsi: dst (16 blocks)
595 * %rdx: src (16 blocks)
596 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
598 leaq __twofish_dec_blk16, %r8;
599 jmp twofish_xts_crypt_16way;
600 ENDPROC(twofish_xts_dec_16way)