drm/panthor: Don't add write fences to the shared BOs
[drm/drm-misc.git] / arch / x86 / crypto / blowfish-x86_64-asm_64.S
blobe88c8e4f013c5fb4cfadc93c2230c3f8c9c1cb91
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Blowfish Cipher Algorithm (x86_64)
4  *
5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
8 #include <linux/linkage.h>
10 .file "blowfish-x86_64-asm.S"
11 .text
13 /* structure of crypto context */
/*
 * Byte offsets into the context whose address the entry points receive in
 * %rdi. The arithmetic lays out 16 + 2 four-byte entries starting at p,
 * followed by four tables (s0..s3) of 256 four-byte entries each. The
 * tables are indexed with a scale of 4 and read with 32-bit loads; the p
 * entries are read 64 bits (two adjacent entries) at a time.
 * NOTE(review): this layout has to match the C-side context structure,
 * which is not visible in this file -- confirm before changing either side.
 */
14 #define p       0
15 #define s0      ((16 + 2) * 4)
16 #define s1      ((16 + 2 + (1 * 256)) * 4)
17 #define s2      ((16 + 2 + (2 * 256)) * 4)
18 #define s3      ((16 + 2 + (3 * 256)) * 4)
20 /* register macros */
/*
 * Register roles:
 *   CTX        context base pointer (%r12; the entry points save and
 *              restore the caller's value)
 *   RIO        pointer used for block loads and stores
 *   RX0..RX3   64-bit block state, one register per block; the d/bl/bh
 *              names are the 32-bit, low-byte and second-byte views
 *   RT0..RT3   scratch for table indices and round-function results
 *   RKEY       round key preloaded by the 4-way macros
 *
 * NOTE: RIO and RT1 are both %rsi. F() and F4() overwrite RT1, so RIO is
 * only valid until the first round and must be reloaded before storing.
 * %r10 is RKEY in the 4-way code only; the 1-way entry points use it as
 * plain scratch.
 */
21 #define CTX %r12
22 #define RIO %rsi
24 #define RX0 %rax
25 #define RX1 %rbx
26 #define RX2 %rcx
27 #define RX3 %rdx
29 #define RX0d %eax
30 #define RX1d %ebx
31 #define RX2d %ecx
32 #define RX3d %edx
34 #define RX0bl %al
35 #define RX1bl %bl
36 #define RX2bl %cl
37 #define RX3bl %dl
39 #define RX0bh %ah
40 #define RX1bh %bh
41 #define RX2bh %ch
42 #define RX3bh %dh
44 #define RT0 %rdi
45 #define RT1 %rsi
46 #define RT2 %r8
47 #define RT3 %r9
49 #define RT0d %edi
50 #define RT1d %esi
51 #define RT2d %r8d
52 #define RT3d %r9d
54 #define RKEY %r10
56 /***********************************************************************
57  * 1-way blowfish
58  ***********************************************************************/
/*
 * F(): one Feistel step on the single block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 (most significant first: a, b,
 * c, d) index s0..s3 and are combined as ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * using 32-bit arithmetic. The rotates (right 16, left 16, left 32) expose
 * the upper two bytes through the bl/bh views and leave the 32-bit halves
 * of RX0 swapped, so the final xorq folds the zero-extended result into
 * what was the high half on entry. Two invocations in a row return the
 * halves to their original positions.
 *
 * Clobbers RT0, RT1 (which is also RIO) and RT2.
 */
59 #define F() \
60         rorq $16,               RX0; \
61         movzbl RX0bh,           RT0d; \
62         movzbl RX0bl,           RT1d; \
63         rolq $16,               RX0; \
64         movl s0(CTX,RT0,4),     RT0d; \
65         addl s1(CTX,RT1,4),     RT0d; \
66         movzbl RX0bh,           RT1d; \
67         movzbl RX0bl,           RT2d; \
68         rolq $32,               RX0; \
69         xorl s2(CTX,RT1,4),     RT0d; \
70         addl s3(CTX,RT2,4),     RT0d; \
71         xorq RT0,               RX0;
/*
 * XOR the 64 bits at p + 4*n (the two adjacent four-byte entries n and
 * n + 1) into RX0: entry n goes into the low half, entry n + 1 into the
 * high half.
 */
73 #define add_roundkey_enc(n) \
74         xorq p+4*(n)(CTX),      RX0;
/*
 * Two encryption rounds on RX0: mix in entries n and n + 1 of p with a
 * single 64-bit XOR, then run F() twice. The second F() undoes the half
 * swap performed by the first.
 */
76 #define round_enc(n) \
77         add_roundkey_enc(n); \
78         \
79         F(); \
80         F();
/*
 * Decryption counterpart of add_roundkey_enc(): load entries n - 1 and n
 * of p as one 64-bit value, swap its 32-bit halves so that entry n lands
 * in the low half, and XOR it into RX0. Clobbers RT0.
 * n is substituted unparenthesised into "n-1", so pass a plain constant.
 */
82 #define add_roundkey_dec(n) \
83         movq p+4*(n-1)(CTX),    RT0; \
84         rorq $32,               RT0; \
85         xorq RT0,               RX0;
/*
 * Two decryption rounds on RX0: mix in entries n and n - 1 of p (see
 * add_roundkey_dec()), then run F() twice. The second F() undoes the half
 * swap performed by the first.
 *
 * The last line deliberately has no trailing backslash: a continuation
 * there would pull whatever line follows into this macro's body.
 */
87 #define round_dec(n) \
88         add_roundkey_dec(n); \
89         \
90         F(); \
91         F();
/*
 * Load the 8-byte block at (RIO) into RX0. rorq $32 followed by bswapq
 * byte-reverses each 32-bit word while keeping the words in place: the
 * first word in memory ends up in the low half of RX0, the second in the
 * high half.
 */
93 #define read_block() \
94         movq (RIO),             RX0; \
95         rorq $32,               RX0; \
96         bswapq                  RX0;
/*
 * Byte-reverse RX0 and store it at (RIO). Unlike read_block() there is no
 * 32-bit rotate, so the high half of RX0 is written first and the low
 * half second, each word byte-reversed.
 */
98 #define write_block() \
99         bswapq                  RX0; \
100         movq RX0,               (RIO);
/*
 * blowfish_enc_blk: encrypt the single 8-byte block at src into dst.
 *
 * The caller's %r12 is parked in %r11 rather than on the stack while %r12
 * serves as CTX. dst is parked in %r10 because %rsi doubles as RT1 and is
 * overwritten by F(); it is moved back into RIO just before the store.
 * Modifies %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
 */
102 SYM_FUNC_START(blowfish_enc_blk)
103         /* input:
104          *      %rdi: ctx
105          *      %rsi: dst
106          *      %rdx: src
107          */
108         movq %r12, %r11;
110         movq %rdi, CTX;
111         movq %rsi, %r10;
112         movq %rdx, RIO;
114         read_block();
            /* eight round pairs consume entries 0..15 of p ... */
116         round_enc(0);
117         round_enc(2);
118         round_enc(4);
119         round_enc(6);
120         round_enc(8);
121         round_enc(10);
122         round_enc(12);
123         round_enc(14);
            /* ... and the final XOR consumes entries 16 and 17 */
124         add_roundkey_enc(16);
126         movq %r11, %r12;
127         movq %r10, RIO;
129         write_block();
130         RET;
131 SYM_FUNC_END(blowfish_enc_blk)
/*
 * blowfish_dec_blk: decrypt the single 8-byte block at src into dst.
 *
 * Same register handling as the 1-way encrypt path: the caller's %r12 is
 * parked in %r11 while %r12 serves as CTX, and dst is parked in %r10
 * because F() overwrites %rsi (RT1/RIO). The entries of p are consumed in
 * descending order.
 * Modifies %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
 */
133 SYM_FUNC_START(blowfish_dec_blk)
134         /* input:
135          *      %rdi: ctx
136          *      %rsi: dst
137          *      %rdx: src
138          */
139         movq %r12, %r11;
141         movq %rdi, CTX;
142         movq %rsi, %r10;
143         movq %rdx, RIO;
145         read_block();
            /* eight round pairs consume entries 17..2 of p ... */
147         round_dec(17);
148         round_dec(15);
149         round_dec(13);
150         round_dec(11);
151         round_dec(9);
152         round_dec(7);
153         round_dec(5);
154         round_dec(3);
            /* ... and the final XOR consumes entries 1 and 0 */
155         add_roundkey_dec(1);
157         movq %r10, RIO;
158         write_block();
160         movq %r11, %r12;
162         RET;
163 SYM_FUNC_END(blowfish_dec_blk)
165 /**********************************************************************
166   4-way blowfish, four blocks parallel
167  **********************************************************************/
169 /* F() for 4-way. Slower when used alone/1-way, but faster when used
170  * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
171  */
/*
 * F4(x): the same round function as F(), applied to block register x.
 * x must be one of RX0..RX3 so that the x ## bl / x ## bh pastes resolve
 * to the byte-view macros. All four table indices are extracted first;
 * the two rorq $16 together swap the 32-bit halves of x, and the
 * zero-extended 32-bit result is then XORed into the new low half.
 * Clobbers RT0..RT3 (RT1 is also RIO).
 */
172 #define F4(x) \
173         movzbl x ## bh,         RT1d; \
174         movzbl x ## bl,         RT3d; \
175         rorq $16,               x; \
176         movzbl x ## bh,         RT0d; \
177         movzbl x ## bl,         RT2d; \
178         rorq $16,               x; \
179         movl s0(CTX,RT0,4),     RT0d; \
180         addl s1(CTX,RT2,4),     RT0d; \
181         xorl s2(CTX,RT1,4),     RT0d; \
182         addl s3(CTX,RT3,4),     RT0d; \
183         xorq RT0,               x;
/* XOR the 64-bit round key currently held in RKEY into all four blocks. */
185 #define add_preloaded_roundkey4() \
186         xorq RKEY,              RX0; \
187         xorq RKEY,              RX1; \
188         xorq RKEY,              RX2; \
189         xorq RKEY,              RX3;
/* Load entries n and n + 1 of p (64 bits at p + 4*n) into RKEY. */
191 #define preload_roundkey_enc(n) \
192         movq p+4*(n)(CTX),      RKEY;
/*
 * Apply the key already sitting in RKEY to all four blocks, then fetch the
 * key for the next round pair (n + 2). The key for round pair n itself
 * must have been preloaded beforehand.
 */
194 #define add_roundkey_enc4(n) \
195         add_preloaded_roundkey4(); \
196         preload_roundkey_enc(n + 2);
/*
 * Two encryption rounds on four blocks. The round function is issued for
 * RX0..RX3 in turn, twice over, so that the four independent blocks are
 * interleaved. Leaves the key for round pair n + 2 in RKEY.
 */
198 #define round_enc4(n) \
199         add_roundkey_enc4(n); \
200         \
201         F4(RX0); \
202         F4(RX1); \
203         F4(RX2); \
204         F4(RX3); \
205         \
206         F4(RX0); \
207         F4(RX1); \
208         F4(RX2); \
209         F4(RX3);
/*
 * Load entries n - 1 and n of p into RKEY and swap the 32-bit halves, so
 * that entry n sits in the low half and entry n - 1 in the high half.
 */
211 #define preload_roundkey_dec(n) \
212         movq p+4*((n)-1)(CTX),  RKEY; \
213         rorq $32,               RKEY;
/*
 * Apply the key already sitting in RKEY to all four blocks, then fetch the
 * key for the next (lower) round pair, n - 2. The key for round pair n
 * itself must have been preloaded beforehand.
 */
215 #define add_roundkey_dec4(n) \
216         add_preloaded_roundkey4(); \
217         preload_roundkey_dec(n - 2);
/*
 * Two decryption rounds on four blocks; mirrors round_enc4() but walks the
 * entries of p downwards. Leaves the key for round pair n - 2 in RKEY.
 */
219 #define round_dec4(n) \
220         add_roundkey_dec4(n); \
221         \
222         F4(RX0); \
223         F4(RX1); \
224         F4(RX2); \
225         F4(RX3); \
226         \
227         F4(RX0); \
228         F4(RX1); \
229         F4(RX2); \
230         F4(RX3);
/*
 * Load four consecutive 8-byte blocks from RIO + 0/8/16/24 into RX0..RX3.
 * Each load gets the same rorq $32 + bswapq treatment as read_block():
 * every 32-bit word is byte-reversed and stays in its half.
 */
232 #define read_block4() \
233         movq (RIO),             RX0; \
234         rorq $32,               RX0; \
235         bswapq                  RX0; \
236         \
237         movq 8(RIO),            RX1; \
238         rorq $32,               RX1; \
239         bswapq                  RX1; \
240         \
241         movq 16(RIO),           RX2; \
242         rorq $32,               RX2; \
243         bswapq                  RX2; \
244         \
245         movq 24(RIO),           RX3; \
246         rorq $32,               RX3; \
247         bswapq                  RX3;
/*
 * Byte-reverse RX0..RX3 and store them to RIO + 0/8/16/24. The registers
 * are modified in place by the bswapq.
 */
249 #define write_block4() \
250         bswapq                  RX0; \
251         movq RX0,               (RIO); \
252         \
253         bswapq                  RX1; \
254         movq RX1,               8(RIO); \
255         \
256         bswapq                  RX2; \
257         movq RX2,               16(RIO); \
258         \
259         bswapq                  RX3; \
260         movq RX3,               24(RIO);
/*
 * XOR the byte-reversed 8-byte values at RIO + 0/8/16 into RX1, RX2 and
 * RX3 respectively, i.e. each block is combined with the memory block one
 * position before it. RX0 is left untouched.
 * Clobbers RT0, RT2 and RT3; RT1 is skipped because it is RIO.
 */
262 #define xor_block4() \
263         movq (RIO),             RT0; \
264         bswapq                  RT0; \
265         xorq RT0,               RX1; \
266         \
267         movq 8(RIO),            RT2; \
268         bswapq                  RT2; \
269         xorq RT2,               RX2; \
270         \
271         movq 16(RIO),           RT3; \
272         bswapq                  RT3; \
273         xorq RT3,               RX3;
/*
 * blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks (32 bytes)
 * from src into dst, interleaving the rounds of all four.
 *
 * %r12 and %rbx are used as CTX and RX1 and are saved on the stack around
 * the body. dst is parked in %r11 (%r10 is RKEY here) because F4()
 * overwrites %rsi (RT1/RIO).
 */
275 SYM_FUNC_START(blowfish_enc_blk_4way)
276         /* input:
277          *      %rdi: ctx
278          *      %rsi: dst
279          *      %rdx: src
280          */
281         pushq %r12;
282         pushq %rbx;
284         movq %rdi, CTX
285         movq %rsi, %r11;
286         movq %rdx, RIO;
            /* each round_enc4() expects its key to be in RKEY already */
288         preload_roundkey_enc(0);
            /* loading RX3 overwrites %rdx, which held src */
290         read_block4();
292         round_enc4(0);
293         round_enc4(2);
294         round_enc4(4);
295         round_enc4(6);
296         round_enc4(8);
297         round_enc4(10);
298         round_enc4(12);
299         round_enc4(14);
            /* round_enc4(14) left the key for entries 16 and 17 in RKEY */
300         add_preloaded_roundkey4();
302         movq %r11, RIO;
303         write_block4();
305         popq %rbx;
306         popq %r12;
307         RET;
308 SYM_FUNC_END(blowfish_enc_blk_4way)
/*
 * __blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks (32
 * bytes) from src into dst. When the cbc argument is non-zero, output
 * blocks 1..3 are additionally XORed with source blocks 0..2 before being
 * stored.
 * NOTE(review): block 0 receives no XOR here; presumably the caller
 * applies the IV or preceding ciphertext block itself -- confirm against
 * the C glue code.
 *
 * %r12 and %rbx are used as CTX and RX1 and are saved on the stack. %rcx
 * (cbc) and %rdx (src) are pushed as well because those registers become
 * RX2 and RX3.
 */
310 SYM_FUNC_START(__blowfish_dec_blk_4way)
311         /* input:
312          *      %rdi: ctx
313          *      %rsi: dst
314          *      %rdx: src
315          *      %rcx: cbc (bool)
316          */
317         pushq %r12;
318         pushq %rbx;
319         pushq %rcx;
320         pushq %rdx;
322         movq %rdi, CTX;
323         movq %rsi, %r11;
324         movq %rdx, RIO;
            /* each round_dec4() expects its key to be in RKEY already */
326         preload_roundkey_dec(17);
327         read_block4();
329         round_dec4(17);
330         round_dec4(15);
331         round_dec4(13);
332         round_dec4(11);
333         round_dec4(9);
334         round_dec4(7);
335         round_dec4(5);
336         round_dec4(3);
            /* round_dec4(3) left the key for entries 1 and 0 in RKEY */
337         add_preloaded_roundkey4();
            /*
             * Pop the saved src into RIO and the saved cbc flag into %r12;
             * CTX is no longer needed, so %r12 is free to act as scratch.
             */
339         popq RIO;
340         popq %r12;
341         testq %r12, %r12;
342         jz .L_no_cbc_xor;
344         xor_block4();
346 .L_no_cbc_xor:
347         movq %r11, RIO;
348         write_block4();
350         popq %rbx;
351         popq %r12;
353         RET;
354 SYM_FUNC_END(__blowfish_dec_blk_4way)