WIP FPC-III support
[linux/fpc-iii.git] / arch / x86 / crypto / blowfish-x86_64-asm_64.S
blob4222ac6d65848b121f89396f3247f9c22eef35a6
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Blowfish Cipher Algorithm (x86_64)
4  *
5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
8 #include <linux/linkage.h>
10 .file "blowfish-x86_64-asm.S"
11 .text
/*
 * Byte offsets into the crypto context.
 *
 * p is a table of 16 + 2 32-bit words at the start of the context; it is
 * followed by the four lookup tables s0..s3 of 256 32-bit words each.  Each
 * offset is derived from the one before it.  The layout has to match the
 * context structure filled in by the C side (not visible in this file).
 */
#define p	0
#define s0	(p + (16 + 2) * 4)
#define s1	(s0 + 256 * 4)
#define s2	(s1 + 256 * 4)
#define s3	(s2 + 256 * 4)
/*
 * Register allocation.
 *
 * RIO and RT1 are both %rsi: the I/O pointer is only valid outside of the
 * round code, which uses RT1 as scratch.  The entry points therefore park the
 * destination pointer in another register and reload RIO after the rounds.
 */
#define CTX	%r12		/* base address of the crypto context */
#define RIO	%rsi		/* current source/destination pointer */

/*
 * Block registers, one per block processed in parallel.  For each one:
 * 64-bit name, 32-bit view (d), low byte (bl) and second byte (bh).
 */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* Scratch registers of the round function: 64-bit name and 32-bit view. */
#define RT0	%rdi
#define RT0d	%edi

#define RT1	%rsi
#define RT1d	%esi

#define RT2	%r8
#define RT2d	%r8d

#define RT3	%r9
#define RT3d	%r9d

/* Round key preloaded by the 4-way code. */
#define RKEY	%r10
56 /***********************************************************************
57  * 1-way blowfish
58  ***********************************************************************/
/*
 * F(): one Feistel step on the 64-bit block held in RX0.
 *
 * The four bytes a (most significant) .. d (least significant) of the low
 * 32 bits of RX0 are used as indices into s0..s3 and combined as
 *	((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * RX0 is rotated by 32 bits in total, so its two halves trade places, and the
 * 32-bit result is XORed into the new low half.
 *
 * Clobbers RT0, RT1 (%rsi, shared with RIO), RT2 and the flags.
 */
#define F() \
	rorq $16,		RX0;	/* bytes a, b -> bh, bl */ \
	movzbl RX0bl,		RT1d;	/* RT1 = b */ \
	movzbl RX0bh,		RT0d;	/* RT0 = a */ \
	rolq $16,		RX0;	/* bytes c, d back in bh, bl */ \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bl,		RT2d;	/* RT2 = d */ \
	movzbl RX0bh,		RT1d;	/* RT1 = c */ \
	rolq $32,		RX0;	/* swap the two halves */ \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
/*
 * XOR the 64-bit word at byte offset 4 * n of the p table into RX0, i.e. the
 * two consecutive 32-bit entries n (low half) and n + 1 (high half).
 */
#define add_roundkey_enc(n) \
	xorq (p + 4 * (n))(CTX),	RX0;

/* Two encryption rounds: one 64-bit key addition followed by two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); F();
/*
 * Load the 64-bit word holding p table entries n - 1 and n, swap its halves so
 * that entry n ends up in the low half, and XOR it into RX0.
 *
 * The macro argument is parenthesized, like in preload_roundkey_dec(), so an
 * expression can be passed safely.  Clobbers RT0 and the flags.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * Two decryption rounds: one 64-bit key addition followed by two F() steps.
 *
 * There is deliberately no line continuation after the last F(): a dangling
 * backslash would splice whatever line follows into this macro.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
/*
 * Load the 8-byte block at (RIO) into RX0, byte-swapped and with its two
 * 32-bit halves exchanged.  Swapping the bytes first and rotating afterwards
 * yields the same value as rotating first.
 */
#define read_block() \
	movq (RIO),		RX0; \
	bswapq			RX0; \
	rorq $32,		RX0;

/* Byte-swap RX0 and store it as the 8-byte block at (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Byte-swap RX0 and XOR it into the 8-byte block at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output
 *
 * The result is either stored to dst or XORed into the 8 bytes at dst,
 * depending on %cl.  %r12 (CTX) is callee-saved and is kept in %r11 for the
 * duration of the call; dst is kept in %r10 because the round code uses %rsi
 * as scratch.  Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
 */
SYM_FUNC_START(__blowfish_enc_blk)
	movq	%rsi, %r10		/* park dst */
	movq	%r12, %r11		/* park the caller's %r12 */
	movq	%rdx, RIO		/* RIO = src */
	movq	%rdi, CTX

	read_block()

	round_enc(0)
	round_enc(2)
	round_enc(4)
	round_enc(6)
	round_enc(8)
	round_enc(10)
	round_enc(12)
	round_enc(14)
	add_roundkey_enc(16)

	movq	%r10, RIO		/* RIO = dst */
	movq	%r11, %r12		/* CTX is no longer needed */
	testb	%cl, %cl
	jnz	.L__enc_xor

	write_block()
	ret
.L__enc_xor:
	xor_block()
	ret
SYM_FUNC_END(__blowfish_enc_blk)
/*
 * blowfish_dec_blk: decrypt one 8-byte block and store it to dst.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * %r12 (CTX) is callee-saved and is kept in %r11 for the duration of the
 * call; dst is kept in %r10 because the round code uses %rsi as scratch.
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
 */
SYM_FUNC_START(blowfish_dec_blk)
	movq	%rsi, %r10		/* park dst */
	movq	%r12, %r11		/* park the caller's %r12 */
	movq	%rdx, RIO		/* RIO = src */
	movq	%rdi, CTX

	read_block()

	round_dec(17)
	round_dec(15)
	round_dec(13)
	round_dec(11)
	round_dec(9)
	round_dec(7)
	round_dec(5)
	round_dec(3)
	add_roundkey_dec(1)

	movq	%r11, %r12		/* CTX is no longer needed */
	movq	%r10, RIO		/* RIO = dst */
	write_block()

	ret
SYM_FUNC_END(blowfish_dec_blk)
176 /**********************************************************************
177   4-way blowfish, four blocks parallel
178  **********************************************************************/
/*
 * F4(x): F() variant for the 4-way code, operating on block register x.
 *
 * It is slower than F() when used on its own (1-way), but faster when four
 * blocks are processed in parallel (tested on AMD Phenom II & Intel Xeon
 * E7330).
 *
 * All four index bytes are extracted first; x is rotated right by 16 bits
 * twice, so its halves end up exchanged, and the 32-bit result
 *	((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * is XORed into the new low half.
 *
 * Clobbers RT0, RT1 (%rsi, shared with RIO), RT2, RT3 and the flags.
 */
#define F4(x) \
	movzbl x ## bl,		RT3d;	/* RT3 = d (bits 0..7) */ \
	movzbl x ## bh,		RT1d;	/* RT1 = c (bits 8..15) */ \
	rorq $16,		x; \
	movzbl x ## bl,		RT2d;	/* RT2 = b (bits 16..23) */ \
	movzbl x ## bh,		RT0d;	/* RT0 = a (bits 24..31) */ \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
/* XOR the round key currently held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY, RX0; xorq RKEY, RX1; \
	xorq RKEY, RX2; xorq RKEY, RX3;
/*
 * Load the 64-bit word at byte offset 4 * n of the p table (entries n and
 * n + 1) into RKEY.
 */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key that is already in RKEY to all four blocks, then preload the
 * key for the next double round.  The argument is parenthesized before the
 * offset is added so that an expression can be passed safely.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/*
 * Two encryption rounds on four blocks: one key addition, then two passes of
 * F4() over RX0..RX3.  Expects the key for entry n to be preloaded in RKEY and
 * leaves the key for entry n + 2 in RKEY.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
/*
 * Load the 64-bit word holding p table entries n - 1 and n into RKEY and swap
 * its halves so that entry n ends up in the low half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * Apply the key that is already in RKEY to all four blocks, then preload the
 * key for the next double round.  The argument is parenthesized before the
 * offset is subtracted so that an expression can be passed safely.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/*
 * Two decryption rounds on four blocks: one key addition, then two passes of
 * F4() over RX0..RX3.  Expects the key for entry n to be preloaded in RKEY and
 * leaves the key for entry n - 2 in RKEY.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
/*
 * Load the four consecutive 8-byte blocks at (RIO) into RX0..RX3.  Each block
 * gets its two 32-bit halves exchanged and is then byte-swapped.  The work is
 * done in three stages: all loads, all rotates, all byte swaps.
 */
#define read_block4() \
	movq 0(RIO),		RX0; \
	movq 8(RIO),		RX1; \
	movq 16(RIO),		RX2; \
	movq 24(RIO),		RX3; \
	\
	rorq $32,		RX0; \
	rorq $32,		RX1; \
	rorq $32,		RX2; \
	rorq $32,		RX3; \
	\
	bswapq			RX0; \
	bswapq			RX1; \
	bswapq			RX2; \
	bswapq			RX3;
/*
 * Byte-swap RX0..RX3 and store them as four consecutive 8-byte blocks at
 * (RIO): all byte swaps first, then all stores.
 */
#define write_block4() \
	bswapq			RX0; \
	bswapq			RX1; \
	bswapq			RX2; \
	bswapq			RX3; \
	\
	movq RX0,		0(RIO); \
	movq RX1,		8(RIO); \
	movq RX2,		16(RIO); \
	movq RX3,		24(RIO);
/*
 * Byte-swap RX0..RX3 and XOR them into the four consecutive 8-byte blocks at
 * (RIO): all byte swaps first, then all memory updates.
 */
#define xor_block4() \
	bswapq			RX0; \
	bswapq			RX1; \
	bswapq			RX2; \
	bswapq			RX3; \
	\
	xorq RX0,		0(RIO); \
	xorq RX1,		8(RIO); \
	xorq RX2,		16(RIO); \
	xorq RX3,		24(RIO);
/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output
 *
 * The 32 bytes of output are either stored to dst or XORed into dst,
 * depending on the bool.  %r12 and %rbx are callee-saved and are preserved on
 * the stack.  The bool is pushed as well, because %rcx doubles as RX2; it is
 * popped into %r12 once CTX is no longer needed.  dst is kept in %r11 because
 * the round code uses %rsi as scratch.
 *
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r8..%r11 and the flags.
 */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	pushq	%r12
	pushq	%rbx
	pushq	%rcx			/* xor flag */

	movq	%rsi, %r11		/* park dst */
	movq	%rdx, RIO		/* RIO = src */
	movq	%rdi, CTX

	read_block4()
	preload_roundkey_enc(0)

	round_enc4(0)
	round_enc4(2)
	round_enc4(4)
	round_enc4(6)
	round_enc4(8)
	round_enc4(10)
	round_enc4(12)
	round_enc4(14)
	add_preloaded_roundkey4()

	movq	%r11, RIO		/* RIO = dst */
	popq	%r12			/* xor flag; CTX is dead from here on */

	testb	%r12b, %r12b
	jnz	.L__enc_xor4

	write_block4()

	popq	%rbx
	popq	%r12
	ret

.L__enc_xor4:
	xor_block4()

	popq	%rbx
	popq	%r12
	ret
SYM_FUNC_END(__blowfish_enc_blk_4way)
/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks and store the
 * 32 bytes of output to dst.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * %r12 and %rbx are callee-saved and are preserved on the stack.  dst is kept
 * in %r11 because the round code uses %rsi as scratch.
 *
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r8..%r11 and the flags.
 */
SYM_FUNC_START(blowfish_dec_blk_4way)
	pushq	%r12
	pushq	%rbx

	movq	%rsi, %r11		/* park dst */
	movq	%rdx, RIO		/* RIO = src */
	movq	%rdi, CTX

	read_block4()
	preload_roundkey_dec(17)

	round_dec4(17)
	round_dec4(15)
	round_dec4(13)
	round_dec4(11)
	round_dec4(9)
	round_dec4(7)
	round_dec4(5)
	round_dec4(3)
	add_preloaded_roundkey4()

	movq	%r11, RIO		/* RIO = dst */
	write_block4()

	popq	%rbx
	popq	%r12

	ret
SYM_FUNC_END(blowfish_dec_blk_4way)