/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */
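
/*
 * Load 16 consecutive 16-byte blocks (512 bytes) from src into the eight
 * ymm registers x0..x7, two blocks per register.
 */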
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*32)(src), x0; \
	vmovdqu (1*32)(src), x1; \
	vmovdqu (2*32)(src), x2; \
	vmovdqu (3*32)(src), x3; \
	vmovdqu (4*32)(src), x4; \
	vmovdqu (5*32)(src), x5; \
	vmovdqu (6*32)(src), x6; \
	vmovdqu (7*32)(src), x7;
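
/* Store the 16 blocks held in x0..x7 to dst (512 bytes, unaligned). */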
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*32)(dst); \
	vmovdqu x1, (1*32)(dst); \
	vmovdqu x2, (2*32)(dst); \
	vmovdqu x3, (3*32)(dst); \
	vmovdqu x4, (4*32)(dst); \
	vmovdqu x5, (5*32)(dst); \
	vmovdqu x6, (6*32)(dst); \
	vmovdqu x7, (7*32)(dst);
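
/*
 * CBC output step for 16-block decryption: XOR each decrypted block with the
 * preceding ciphertext block from src, then store to dst. Block 0 is XORed
 * with zero here; the IV XOR for the first block is expected to be handled
 * by the caller.
 */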
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; \
	vpxor t0, x0, x0; \
	vpxor (0*32+16)(src), x1, x1; \
	vpxor (1*32+16)(src), x2, x2; \
	vpxor (2*32+16)(src), x3, x3; \
	vpxor (3*32+16)(src), x4, x4; \
	vpxor (4*32+16)(src), x5, x5; \
	vpxor (5*32+16)(src), x6, x6; \
	vpxor (6*32+16)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
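
/*
 * Increment a 128-bit little-endian counter in an xmm register by one.
 * minus_one must hold -1 in the low qword and 0 in the high qword; the
 * compare/shift/subtract sequence propagates the carry out of the low qword.
 */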
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
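
/*
 * Add two to the 128-bit little-endian counters held in both lanes of a ymm
 * register. minus_one and minus_two hold -1 and -2 in the low qword of each
 * 128-bit lane; carries out of the low qwords are propagated as in inc_le128.
 */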
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;
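
/*
 * Build 16 CTR-mode counter blocks from the little-endian counter at (iv):
 * each block is the incremented counter byte-swapped with the bswap mask,
 * placed two blocks per register in x0..x7. The counter for the next call
 * is written back to (iv).
 */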
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */ \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
	vpshufb t1, t2, x0; \
	\
	/* construct IVs */ \
	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
	vpshufb t1, t2, x1; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x7; \
	vextracti128 $1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);
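
/* XOR the keystream blocks in x0..x7 with the source data and store to dst. */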
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(src), x0, x0; \
	vpxor (1*32)(src), x1, x1; \
	vpxor (2*32)(src), x2, x2; \
	vpxor (3*32)(src), x3, x3; \
	vpxor (4*32)(src), x4, x4; \
	vpxor (5*32)(src), x5, x5; \
	vpxor (6*32)(src), x6, x6; \
	vpxor (7*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
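
/*
 * Multiply the 128-bit XTS tweak in iv by x in GF(2^128) (little-endian
 * block convention): shift left by one bit per qword, then use mask to
 * propagate the bit shifted out of the low qword and to fold the bit
 * shifted out of the high qword back in as the reduction polynomial.
 */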
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;
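
/*
 * Multiply both 128-bit XTS tweaks in a ymm register by x^2 in GF(2^128):
 * a two-bit left shift, with mask1/mask2 supplying the carry-propagation and
 * reduction constants for the two shifted-out bits.
 */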
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;
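
/*
 * XTS input step for 16 blocks: starting from the tweak at (iv), compute the
 * 16 consecutive tweaks, XOR them with the source blocks into x0..x7, and
 * stash the tweaks in the dst buffer so store_xts_16way can XOR them back in
 * after the cipher pass. The tweak for the next call is written back to (iv).
 * The two mask parameters are expected to hold the GF(2^128) shift/reduction
 * constants for the one-bit and two-bit multiply steps, defined by the
 * including cipher implementation.
 */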
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* load IV and construct second IV */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (1*32)(src), tiv, x1; \
	vmovdqu tiv, (1*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (2*32)(src), tiv, x2; \
	vmovdqu tiv, (2*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (3*32)(src), tiv, x3; \
	vmovdqu tiv, (3*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (4*32)(src), tiv, x4; \
	vmovdqu tiv, (4*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (5*32)(src), tiv, x5; \
	vmovdqu tiv, (5*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (6*32)(src), tiv, x6; \
	vmovdqu tiv, (6*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (7*32)(src), tiv, x7; \
	vmovdqu tiv, (7*32)(dst); \
	\
	vextracti128 $1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);
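
/*
 * XTS output step: XOR the cipher output in x0..x7 with the tweaks that
 * load_xts_16way stashed at dst, then store the final blocks to dst.
 */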
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(dst), x0, x0; \
	vpxor (1*32)(dst), x1, x1; \
	vpxor (2*32)(dst), x2, x2; \
	vpxor (3*32)(dst), x3, x3; \
	vpxor (4*32)(dst), x4, x4; \
	vpxor (5*32)(dst), x5, x5; \
	vpxor (6*32)(dst), x6, x6; \
	vpxor (7*32)(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);