unify {de,}mangle_poll(), get rid of kernel-side POLL...
[cris-mirror.git] / arch / powerpc / crypto / sha1-spe-asm.S
blobfcb6cf002889dcb9234a2e41eedeeb8bda858a92
1 /*
2  * Fast SHA-1 implementation for SPE instruction set (PPC)
3  *
4  * This code makes use of the SPE SIMD instruction set as defined in
5  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
6  * Implementation is based on optimization guide notes from
7  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
8  *
9  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
10  *
11  * This program is free software; you can redistribute it and/or modify it
12  * under the terms of the GNU General Public License as published by the Free
13  * Software Foundation; either version 2 of the License, or (at your option)
14  * any later version.
15  *
16  */
18 #include <asm/ppc_asm.h>
19 #include <asm/asm-offsets.h>
/*
 * Register assignment used by every macro and by the transform routine.
 *
 * r5 arrives holding a count that the routine moves into CTR; after that
 * it is reused as rKP. r14-r23 (rW0-rW7, rT0 and rK) are exactly the
 * registers that INITIALIZE saves and FINALIZE restores as 64 bit values.
 * Each rWn ends up holding a pair of 32 bit message words.
 */
21 #define rHP     r3      /* pointer to hash value                        */
22 #define rWP     r4      /* pointer to input                             */
23 #define rKP     r5      /* pointer to constants                         */
25 #define rW0     r14     /* 64 bit round words                           */
26 #define rW1     r15
27 #define rW2     r16
28 #define rW3     r17
29 #define rW4     r18
30 #define rW5     r19
31 #define rW6     r20
32 #define rW7     r21
34 #define rH0     r6      /* 32 bit hash values                           */
35 #define rH1     r7
36 #define rH2     r8
37 #define rH3     r9
38 #define rH4     r10
40 #define rT0     r22     /* 64 bit temporary                             */
41 #define rT1     r0      /* 32 bit temporaries                           */
42 #define rT2     r11
43 #define rT3     r12
45 #define rK      r23     /* 64 bit constant; r23 is saved by INITIALIZE  */
/*
 * LOAD_K<k>1: optional reload of the round constant register rK.
 *
 * The round macros paste their "k" argument into LOAD_K##k##1, so
 *   k = 0     selects LOAD_K01, which expands to nothing (rK is kept),
 *   k = 1..4  selects a load of the 32 bit word at offset 0, 4, 8 or 12
 *             from rKP.
 * evlwwsplat places the loaded word in both halves of rK, so the same
 * constant can be added to two message words with one evaddw.
 */
47 #define LOAD_K01
49 #define LOAD_K11 \
50         evlwwsplat      rK,0(rKP);
52 #define LOAD_K21 \
53         evlwwsplat      rK,4(rKP);
55 #define LOAD_K31 \
56         evlwwsplat      rK,8(rKP);
58 #define LOAD_K41 \
59         evlwwsplat      rK,12(rKP);
/*
 * INITIALIZE: prologue of the transform routine.
 *
 * Opens a 128 byte stack frame (stwu stores the old r1 at the new frame
 * base and updates r1 in one step) and saves r14-r23 with 64 bit SPE
 * stores, one 8 byte slot per register at offsets 8, 16, ... 80.
 * FINALIZE is the matching epilogue and relies on the same offsets.
 */
61 #define INITIALIZE \
62         stwu            r1,-128(r1);    /* create stack frame           */ \
63         evstdw          r14,8(r1);      /* We must save non volatile    */ \
64         evstdw          r15,16(r1);     /* registers. Take the chance   */ \
65         evstdw          r16,24(r1);     /* and save the SPE part too    */ \
66         evstdw          r17,32(r1);                                        \
67         evstdw          r18,40(r1);                                        \
68         evstdw          r19,48(r1);                                        \
69         evstdw          r20,56(r1);                                        \
70         evstdw          r21,64(r1);                                        \
71         evstdw          r22,72(r1);                                        \
72         evstdw          r23,80(r1);
/*
 * FINALIZE: epilogue of the transform routine.
 *
 * 1. Reloads r14-r23 (64 bit) from the slots written by INITIALIZE.
 * 2. Zeroes r0 and stores it over the 32 bit word at the start of each
 *    of those ten slots, so that half of every saved register image does
 *    not linger on the stack. The other word of each slot is left as is;
 *    the inline comment below states the assumption behind that.
 * 3. Releases the 128 byte frame.
 *
 * Clobbers r0.
 */
75 #define FINALIZE \
76         evldw           r14,8(r1);      /* restore SPE registers        */ \
77         evldw           r15,16(r1);                                        \
78         evldw           r16,24(r1);                                        \
79         evldw           r17,32(r1);                                        \
80         evldw           r18,40(r1);                                        \
81         evldw           r19,48(r1);                                        \
82         evldw           r20,56(r1);                                        \
83         evldw           r21,64(r1);                                        \
84         evldw           r22,72(r1);                                        \
85         evldw           r23,80(r1);                                        \
86         xor             r0,r0,r0;                                          \
87         stw             r0,8(r1);       /* Delete sensitive data        */ \
88         stw             r0,16(r1);      /* that we might have pushed    */ \
89         stw             r0,24(r1);      /* from other context that runs */ \
90         stw             r0,32(r1);      /* the same code. Assume that   */ \
91         stw             r0,40(r1);      /* the lower part of the GPRs   */ \
92         stw             r0,48(r1);      /* were already overwritten on  */ \
93         stw             r0,56(r1);      /* the way down to here         */ \
94         stw             r0,64(r1);                                         \
95         stw             r0,72(r1);                                         \
96         stw             r0,80(r1);                                         \
97         addi            r1,r1,128;      /* cleanup stack frame          */
/*
 * LOAD_DATA(reg, off): fetch one 32 bit message word into reg.
 * NEXT_BLOCK:          move rWP on to the next 64 byte input block.
 *
 * With __BIG_ENDIAN__ defined the word is read with a plain lwz from
 * off(rWP); rWP stays fixed during a block and NEXT_BLOCK adds 64.
 *
 * Otherwise the word is read byte-reversed with lwbrx from the address
 * in rWP, and rWP is advanced by 4 after every word. "off" is ignored in
 * this variant, and NEXT_BLOCK is empty because the sixteen loads of a
 * block have already moved rWP by 64.
 */
99 #ifdef __BIG_ENDIAN__
100 #define LOAD_DATA(reg, off) \
101         lwz             reg,off(rWP);   /* load data                    */
102 #define NEXT_BLOCK \
103         addi            rWP,rWP,64;     /* increment per block          */
104 #else
105 #define LOAD_DATA(reg, off) \
106         lwbrx           reg,0,rWP;      /* load data                    */ \
107         addi            rWP,rWP,4;      /* increment per word           */
108 #define NEXT_BLOCK                      /* nothing to do                */
109 #endif
/*
 * R_00_15(a, b, c, d, e, w0, w1, k, off): two interleaved SHA-1 rounds
 * whose message words are loaded straight from the input.
 *
 * a..e  working variables of the first round. The second round uses the
 *       same registers in the rotated roles A=e, B=a, C=b, D=c, E=d.
 *       The inline comments are prefixed "1:" / "2:" for the round they
 *       belong to and name roles (A..E), not macro arguments.
 * w0    receives the first loaded word.
 * w1    receives the second loaded word and, on exit, the result of
 *       evmergelo w1,w1,w0, i.e. both words packed into one register.
 * k     pasted into LOAD_K##k##1 (0 keeps rK, 1..4 loads a constant).
 * off   offset handed to LOAD_DATA for the first word; the second word
 *       uses off+4.
 *
 * F is (B and C) or (~B and D). The first round updates e and rotates b
 * in place; the second round updates d and rotates a in place.
 * Clobbers rT0, rT1 and rT2.
 */
111 #define R_00_15(a, b, c, d, e, w0, w1, k, off) \
112         LOAD_DATA(w0, off)              /* 1: W                         */ \
113         and             rT2,b,c;        /* 1: F' = B and C              */ \
114         LOAD_K##k##1                                                       \
115         andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
116         rotrwi          rT0,a,27;       /* 1: A' = A rotl 5             */ \
117         or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
118         add             e,e,rT0;        /* 1: E = E + A'                */ \
119         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
120         add             e,e,w0;         /* 1: E = E + W                 */ \
121         LOAD_DATA(w1, off+4)            /* 2: W                         */ \
122         add             e,e,rT2;        /* 1: E = E + F                 */ \
123         and             rT1,a,b;        /* 2: F' = B and C              */ \
124         add             e,e,rK;         /* 1: E = E + K                 */ \
125         andc            rT2,c,a;        /* 2: F" = ~B and D             */ \
126         add             d,d,rK;         /* 2: E = E + K                 */ \
127         or              rT2,rT2,rT1;    /* 2: F = F' or F"              */ \
128         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
129         add             d,d,w1;         /* 2: E = E + W                 */ \
130         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
131         add             d,d,rT0;        /* 2: E = E + A'                */ \
132         evmergelo       w1,w1,w0;       /*    mix W[0]/W[1]             */ \
133         add             d,d,rT2         /* 2: E = E + F                 */
/*
 * R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k): two interleaved rounds
 * with F = (B and C) or (~B and D), where the message words are computed
 * from earlier ones instead of being loaded.
 *
 * a..e   working variables; the second round uses them in the rotated
 *        roles A=e, B=a, C=b, D=c, E=d ("1:" / "2:" in the comments).
 * w0     enters as the word pair W[-16] and is overwritten with the new
 *        pair rotl1(W[-16] xor W[-3] xor W[-8] xor W[-14]).
 * w1     word pair W[-14] (read only).
 * w4     word pair W[-8] (read only).
 * w6,w7  the W[-3] pair straddles these two registers and is assembled
 *        into rT0 by evmergelohi (both read only).
 * k      pasted into LOAD_K##k##1. The reload sits after the evaddw that
 *        forms W + K, so a newly loaded constant only takes effect in
 *        later macro invocations.
 *
 * evaddw adds rK to both new words at once. The first round adds rT0
 * with a 32 bit add; evmergehi copies the other half of rT0 into rT1 so
 * that the second round can add it the same way.
 * Clobbers rT0, rT1 and rT2.
 */
135 #define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
136         and             rT2,b,c;        /* 1: F' = B and C              */ \
137         evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
138         andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
139         evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
140         or              rT1,rT1,rT2;    /* 1: F = F' or F"              */ \
141         evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
142         add             e,e,rT1;        /* 1: E = E + F                 */ \
143         evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
144         rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
145         evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
146         add             e,e,rT2;        /* 1: E = E + A'                */ \
147         evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
148         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
149         LOAD_K##k##1                                                       \
150         evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
151         add             e,e,rT0;        /* 1: E = E + WK                */ \
152         add             d,d,rT1;        /* 2: E = E + WK                */ \
153         and             rT2,a,b;        /* 2: F' = B and C              */ \
154         andc            rT1,c,a;        /* 2: F" = ~B and D             */ \
155         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
156         or              rT1,rT1,rT2;    /* 2: F = F' or F"              */ \
157         add             d,d,rT0;        /* 2: E = E + A'                */ \
158         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
159         add             d,d,rT1         /* 2: E = E + F                 */
/*
 * R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k): two interleaved rounds
 * with F = B xor C xor D. Also reused unchanged as R_60_79.
 *
 * a..e   working variables; the second round uses them in the rotated
 *        roles A=e, B=a, C=b, D=c, E=d ("1:" / "2:" in the comments).
 * w0     enters as the word pair W[-16] and is overwritten with the new
 *        pair rotl1(W[-16] xor W[-3] xor W[-8] xor W[-14]).
 * w1     word pair W[-14] (read only).
 * w4     word pair W[-8] (read only).
 * w6,w7  sources of the W[-3] pair, combined by evmergelohi (read only).
 * k      pasted into LOAD_K##k##1; the reload comes after W + K has been
 *        formed, so it only affects later macro invocations.
 *
 * rT1 is not written before evmergehi reads it here. Its old upper half
 * is carried into the result, but only the lower word, taken from rT0,
 * is consumed by the 32 bit add into d.
 * Clobbers rT0, rT1 and rT2.
 */
161 #define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
162         evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
163         xor             rT2,b,c;        /* 1: F' = B xor C              */ \
164         evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
165         xor             rT2,rT2,d;      /* 1: F = F' xor D              */ \
166         evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
167         add             e,e,rT2;        /* 1: E = E + F                 */ \
168         evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
169         rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
170         evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
171         add             e,e,rT2;        /* 1: E = E + A'                */ \
172         evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
173         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
174         LOAD_K##k##1                                                       \
175         evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
176         add             e,e,rT0;        /* 1: E = E + WK                */ \
177         xor             rT2,a,b;        /* 2: F' = B xor C              */ \
178         add             d,d,rT1;        /* 2: E = E + WK                */ \
179         xor             rT2,rT2,c;      /* 2: F = F' xor D              */ \
180         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
181         add             d,d,rT2;        /* 2: E = E + F                 */ \
182         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
183         add             d,d,rT0         /* 2: E = E + A'                */
/*
 * R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k): two interleaved rounds
 * with F = (B and C) or ((B or C) and D).
 *
 * a..e   working variables; the second round uses them in the rotated
 *        roles A=e, B=a, C=b, D=c, E=d ("1:" / "2:" in the comments).
 * w0     enters as the word pair W[-16] and is overwritten with the new
 *        pair rotl1(W[-16] xor W[-3] xor W[-8] xor W[-14]).
 * w1     word pair W[-14] (read only).
 * w4     word pair W[-8] (read only).
 * w6,w7  sources of the W[-3] pair, combined by evmergelohi (read only).
 * k      pasted into LOAD_K##k##1; the reload comes after W + K has been
 *        formed, so it only affects later macro invocations.
 *
 * In the second round rT0 is reused as scratch for F" and then for A'.
 * That is safe only because both halves of WK have been consumed by
 * then (evmergehi into rT1 and the add into e), so the order of those
 * instructions matters.
 * Clobbers rT0, rT1 and rT2.
 */
185 #define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
186         and             rT2,b,c;        /* 1: F' = B and C              */ \
187         evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
188         or              rT1,b,c;        /* 1: F" = B or C               */ \
189         evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
190         and             rT1,d,rT1;      /* 1: F" = F" and D             */ \
191         evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
192         or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
193         evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
194         add             e,e,rT2;        /* 1: E = E + F                 */ \
195         evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
196         rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
197         evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
198         add             e,e,rT2;        /* 1: E = E + A'                */ \
199         LOAD_K##k##1                                                       \
200         evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
201         rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
202         add             e,e,rT0;        /* 1: E = E + WK                */ \
203         and             rT2,a,b;        /* 2: F' = B and C              */ \
204         or              rT0,a,b;        /* 2: F" = B or C               */ \
205         add             d,d,rT1;        /* 2: E = E + WK                */ \
206         and             rT0,c,rT0;      /* 2: F" = F" and D             */ \
207         rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
208         or              rT2,rT2,rT0;    /* 2: F = F' or F"              */ \
209         rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
210         add             d,d,rT2;        /* 2: E = E + F                 */ \
211         add             d,d,rT0         /* 2: E = E + A'                */
/*
 * R_60_79: the last twenty rounds use the same F = B xor C xor D as
 * rounds 20-39, so this simply forwards every argument to R_20_39. The
 * different round constant comes from whatever is in rK at that point.
 */
213 #define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
214         R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
/*
 * ppc_spe_sha1_transform: run the SHA-1 compression function over a
 * sequence of 64 byte blocks.
 *
 * In:   r3 (rHP)  pointer to five consecutive 32 bit hash words; they are
 *                 read on entry and written back after every block
 *       r4 (rWP)  pointer to the input data, 64 bytes consumed per block
 *       r5        loop count, moved to CTR (one iteration per block)
 * Out:  nothing in registers; the hash words at rHP are updated in place
 * Uses: r0, r5-r12 and CTR are overwritten; r14-r23 are saved by
 *       INITIALIZE and restored by FINALIZE.
 *
 * NOTE(review): the loop is closed with bdnz, which decrements CTR before
 * testing it, and the body runs once before the first test. A count of 0
 * would therefore not mean "no blocks" - presumably callers never pass 0;
 * confirm against the C glue code.
 *
 * Each round macro performs two rounds and leaves the working variables
 * rotated by two positions, which is why the rH0-rH4 argument order
 * cycles through five patterns. After 40 invocations (80 rounds) the
 * registers are back in their original roles.
 */
216 _GLOBAL(ppc_spe_sha1_transform)
217         INITIALIZE
        /* load the hash state; r5 goes to CTR first so it can become rKP */
219         lwz             rH0,0(rHP)
220         lwz             rH1,4(rHP)
221         mtctr           r5
222         lwz             rH2,8(rHP)
223         lis             rKP,PPC_SPE_SHA1_K@h
224         lwz             rH3,12(rHP)
225         ori             rKP,rKP,PPC_SPE_SHA1_K@l
226         lwz             rH4,16(rHP)
        /* top of the per-block loop */
228 ppc_spe_sha1_main:
        /*
         * Rounds 0-15: k=1 on the first invocation loads the first
         * constant into rK. Word pairs end up in rW0-rW7; the last two
         * invocations use rT3 as the landing register for the first word.
         */
229         R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
230         R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
231         R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
232         R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
233         R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
234         R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
235         R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
236         R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
        /* rounds 16-19; k=2 loads the constant for rounds 20-39 */
238         R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
239         R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
        /* rounds 20-39; k=3 on the last one loads the next constant */
241         R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
242         R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
243         R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
244         R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
245         R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
246         R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
247         R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
248         R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
249         R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
250         R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
        /* rounds 40-59; k=4 on the last one loads the final constant */
252         R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
253         R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
254         R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
255         R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
256         R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
257         R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
258         R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
259         R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
260         R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
261         R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
        /*
         * Rounds 60-79. The previous hash words are fetched in between
         * the last invocations into rT3 and rW1-rW4; each of those loads
         * is placed after the last macro that still reads the register
         * as a message word, so the ordering below must be kept.
         */
263         R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
264         R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
265         R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
266         R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
267         R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
268         R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
269         R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
270         lwz             rT3,0(rHP)
271         R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
272         lwz             rW1,4(rHP)
273         R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
274         lwz             rW2,8(rHP)
275         R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
276         lwz             rW3,12(rHP)
277         NEXT_BLOCK
278         lwz             rW4,16(rHP)
        /* add the previous hash words and write the new state back */
280         add             rH0,rH0,rT3
281         stw             rH0,0(rHP)
282         add             rH1,rH1,rW1
283         stw             rH1,4(rHP)
284         add             rH2,rH2,rW2
285         stw             rH2,8(rHP)
286         add             rH3,rH3,rW3
287         stw             rH3,12(rHP)
288         add             rH4,rH4,rW4
289         stw             rH4,16(rHP)
        /* rH0-rH4 stay live as the starting state of the next block */
291         bdnz            ppc_spe_sha1_main
293         FINALIZE
294         blr
/*
 * SHA-1 round constants, one 32 bit word per group of twenty rounds, in
 * round order. LOAD_K11, LOAD_K21, LOAD_K31 and LOAD_K41 read them at
 * offsets 0, 4, 8 and 12 from rKP, which the transform routine points at
 * this table.
 *
 * NOTE(review): the table is only ever read in this file but lives in
 * the writable .data section; a read-only section may be a better home -
 * confirm against the platform's linker layout before changing it.
 */
296 .data
297 .align 4
298 PPC_SPE_SHA1_K:
299         .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6