unify {de,}mangle_poll(), get rid of kernel-side POLL...
[cris-mirror.git] / arch / powerpc / crypto / sha256-spe-asm.S
blob2d10e4c08f038f0bfc13fbf3555c357dfb3f5236
1 /*
2  * Fast SHA-256 implementation for SPE instruction set (PPC)
3  *
4  * This code makes use of the SPE SIMD instruction set as defined in
5  * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
6  * Implementation is based on optimization guide notes from
7  * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
8  *
9  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
10  *
11  * This program is free software; you can redistribute it and/or modify it
12  * under the terms of the GNU General Public License as published by the Free
13  * Software Foundation; either version 2 of the License, or (at your option)
14  * any later version.
15  *
16  */
18 #include <asm/ppc_asm.h>
19 #include <asm/asm-offsets.h>
/*
 * Register roles. rHP (r3) and rWP (r4) are used in place. rH0 is r5,
 * which the entry code first copies to CTR and only then overwrites with
 * the first hash word. r14-r25 (rW0-rW7, rT0, rT1, rKP, rT3) are saved by
 * INITIALIZE and restored by FINALIZE. rT2 is r0, which FINALIZE also
 * uses as its zero source.
 */
21 #define rHP     r3      /* pointer to hash values in memory             */
22 #define rKP     r24     /* pointer to round constants                   */
23 #define rWP     r4      /* pointer to input data                        */
25 #define rH0     r5      /* 8 32 bit hash values in 8 registers          */
26 #define rH1     r6
27 #define rH2     r7
28 #define rH3     r8
29 #define rH4     r9
30 #define rH5     r10
31 #define rH6     r11
32 #define rH7     r12
34 #define rW0     r14     /* 64 bit registers. 16 words in 8 registers    */
35 #define rW1     r15
36 #define rW2     r16
37 #define rW3     r17
38 #define rW4     r18
39 #define rW5     r19
40 #define rW6     r20
41 #define rW7     r21
43 #define rT0     r22     /* 64 bit temporaries                           */
44 #define rT1     r23
45 #define rT2     r0      /* 32 bit temporaries                           */
46 #define rT3     r25
/*
 * Loop-exit test pasted into R_CALC_W through its k argument:
 * N (CMP_KN_LOOP) expands to nothing, C (CMP_KC_LOOP) compares rT1 with 0.
 * At the paste point rT1 still holds the constant pair loaded by
 * "evldw rT1,off(rKP)", and the code uses C only in the last R_CALC_W of
 * a pass (off 56). The word the 32-bit compare sees is then the last
 * constant of the current 16-word group: 0x14292967 and 0x106aa070 are
 * positive, 0xc67178f2 is negative, so the "bt gt" that follows the pass
 * loops twice and falls through after the final group.
 * NOTE(review): assumes evldw places the word at off+4 in the low half
 * of rT1 - confirm against the SPE manual.
 */
48 #define CMP_KN_LOOP
49 #define CMP_KC_LOOP \
50         cmpwi           rT1,0;
/*
 * Prologue: open a 128-byte stack frame, store r14-r23 with evstdw
 * (64 bit each) at 8..80(r1) and r24/r25 with stw (32 bit each) at
 * 88/92(r1). The offsets must stay in step with FINALIZE.
 */
52 #define INITIALIZE \
53         stwu            r1,-128(r1);    /* create stack frame           */ \
54         evstdw          r14,8(r1);      /* We must save non volatile    */ \
55         evstdw          r15,16(r1);     /* registers. Take the chance   */ \
56         evstdw          r16,24(r1);     /* and save the SPE part too    */ \
57         evstdw          r17,32(r1);                                        \
58         evstdw          r18,40(r1);                                        \
59         evstdw          r19,48(r1);                                        \
60         evstdw          r20,56(r1);                                        \
61         evstdw          r21,64(r1);                                        \
62         evstdw          r22,72(r1);                                        \
63         evstdw          r23,80(r1);                                        \
64         stw             r24,88(r1);     /* save normal registers        */ \
65         stw             r25,92(r1);
/*
 * Epilogue: reload r14-r23 (evldw, 64 bit) and r24/r25 (lwz) from the
 * frame laid out by INITIALIZE, then store a zero word at 8, 16, ... 80(r1),
 * i.e. over the first word of each of the ten 8-byte save slots, and
 * finally release the 128-byte frame. Only one word per slot is wiped;
 * the inline note below gives the reasoning. r0 is left holding zero.
 */
68 #define FINALIZE \
69         evldw           r14,8(r1);      /* restore SPE registers        */ \
70         evldw           r15,16(r1);                                        \
71         evldw           r16,24(r1);                                        \
72         evldw           r17,32(r1);                                        \
73         evldw           r18,40(r1);                                        \
74         evldw           r19,48(r1);                                        \
75         evldw           r20,56(r1);                                        \
76         evldw           r21,64(r1);                                        \
77         evldw           r22,72(r1);                                        \
78         evldw           r23,80(r1);                                        \
79         lwz             r24,88(r1);     /* restore normal registers     */ \
80         lwz             r25,92(r1);                                        \
81         xor             r0,r0,r0;                                          \
82         stw             r0,8(r1);       /* Delete sensitive data        */ \
83         stw             r0,16(r1);      /* that we might have pushed    */ \
84         stw             r0,24(r1);      /* from other context that runs */ \
85         stw             r0,32(r1);      /* the same code. Assume that   */ \
86         stw             r0,40(r1);      /* the lower part of the GPRs   */ \
87         stw             r0,48(r1);      /* was already overwritten on   */ \
88         stw             r0,56(r1);      /* the way down to here         */ \
89         stw             r0,64(r1);                                         \
90         stw             r0,72(r1);                                         \
91         stw             r0,80(r1);                                         \
92         addi            r1,r1,128;      /* cleanup stack frame          */
/*
 * Input access. LOAD_DATA(reg, off) fetches one 32-bit message word into
 * reg; NEXT_BLOCK is invoked once per block by the main loop.
 *  - __BIG_ENDIAN__ defined: plain lwz from off(rWP); rWP stays fixed
 *    during a block and NEXT_BLOCK adds 64 to it.
 *  - otherwise: byte-reversed lwbrx from (rWP) followed by rWP += 4, so
 *    the off argument is ignored and NEXT_BLOCK expands to nothing.
 */
94 #ifdef __BIG_ENDIAN__
95 #define LOAD_DATA(reg, off) \
96         lwz             reg,off(rWP);   /* load data                    */
97 #define NEXT_BLOCK \
98         addi            rWP,rWP,64;     /* increment per block          */
99 #else
100 #define LOAD_DATA(reg, off) \
101         lwbrx           reg,0,rWP;      /* load data                    */ \
102         addi            rWP,rWP,4;      /* increment per word           */
103 #define NEXT_BLOCK                      /* nothing to do                */
104 #endif
/*
 * R_LOAD_W: two SHA-256 rounds on message words fetched with LOAD_DATA.
 *
 *   a..h  registers holding the working variables for the first round.
 *         The second round reuses them with every role moved on by one:
 *         h acts as a, a as b, b as c, c as d, d as e, e as f, f as g and
 *         g as h. The inline comments keep the textbook names (a, e, ...)
 *         in both rounds.
 *   w     schedule register. The first word is loaded, evmergelo copies
 *         it into the upper half, then the second word is loaded.
 *         NOTE(review): this relies on the 32-bit load leaving the upper
 *         half of w untouched - confirm against the SPE manual.
 *   off   byte offset of the first round constant at rKP (and of the
 *         first word at rWP when LOAD_DATA uses its offset); the second
 *         round uses off+4.
 *
 * Of a..h, round one writes h and d and round two writes g and c; the
 * others are only read. Clobbers rT0-rT3. Comments tagged "1:" and "2:"
 * belong to the first and second round, whose instructions are interleaved.
 */
106 #define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
107         LOAD_DATA(w, off)               /* 1: W                         */ \
108         rotrwi          rT0,e,6;        /* 1: S1 = e rotr 6             */ \
109         rotrwi          rT1,e,11;       /* 1: S1' = e rotr 11           */ \
110         rotrwi          rT2,e,25;       /* 1: S1" = e rotr 25           */ \
111         xor             rT0,rT0,rT1;    /* 1: S1 = S1 xor S1'           */ \
112         and             rT3,e,f;        /* 1: ch = e and f              */ \
113         xor             rT0,rT0,rT2;    /* 1: S1 = S1 xor S1"           */ \
114         andc            rT1,g,e;        /* 1: ch' = ~e and g            */ \
115         lwz             rT2,off(rKP);   /* 1: K                         */ \
116         xor             rT3,rT3,rT1;    /* 1: ch = ch xor ch'           */ \
117         add             h,h,rT0;        /* 1: temp1 = h + S1            */ \
118         add             rT3,rT3,w;      /* 1: temp1' = ch + w           */ \
119         rotrwi          rT0,a,2;        /* 1: S0 = a rotr 2             */ \
120         add             h,h,rT3;        /* 1: temp1 = temp1 + temp1'    */ \
121         rotrwi          rT1,a,13;       /* 1: S0' = a rotr 13           */ \
122         add             h,h,rT2;        /* 1: temp1 = temp1 + K         */ \
123         rotrwi          rT3,a,22;       /* 1: S0" = a rotr 22           */ \
124         xor             rT0,rT0,rT1;    /* 1: S0 = S0 xor S0'           */ \
125         add             d,d,h;          /* 1: d = d + temp1             */ \
126         xor             rT3,rT0,rT3;    /* 1: S0 = S0 xor S0"           */ \
127         evmergelo       w,w,w;          /*    shift W                   */ \
128         or              rT2,a,b;        /* 1: maj = a or b              */ \
129         and             rT1,a,b;        /* 1: maj' = a and b            */ \
130         and             rT2,rT2,c;      /* 1: maj = maj and c           */ \
131         LOAD_DATA(w, off+4)             /* 2: W                         */ \
132         or              rT2,rT1,rT2;    /* 1: maj = maj or maj'         */ \
133         rotrwi          rT0,d,6;        /* 2: S1 = e rotr 6             */ \
134         add             rT3,rT3,rT2;    /* 1: temp2 = S0 + maj          */ \
135         rotrwi          rT1,d,11;       /* 2: S1' = e rotr 11           */ \
136         add             h,h,rT3;        /* 1: h = temp1 + temp2         */ \
137         rotrwi          rT2,d,25;       /* 2: S1" = e rotr 25           */ \
138         xor             rT0,rT0,rT1;    /* 2: S1 = S1 xor S1'           */ \
139         and             rT3,d,e;        /* 2: ch = e and f              */ \
140         xor             rT0,rT0,rT2;    /* 2: S1 = S1 xor S1"           */ \
141         andc            rT1,f,d;        /* 2: ch' = ~e and g            */ \
142         lwz             rT2,off+4(rKP); /* 2: K                         */ \
143         xor             rT3,rT3,rT1;    /* 2: ch = ch xor ch'           */ \
144         add             g,g,rT0;        /* 2: temp1 = h + S1            */ \
145         add             rT3,rT3,w;      /* 2: temp1' = ch + w           */ \
146         rotrwi          rT0,h,2;        /* 2: S0 = a rotr 2             */ \
147         add             g,g,rT3;        /* 2: temp1 = temp1 + temp1'    */ \
148         rotrwi          rT1,h,13;       /* 2: S0' = a rotr 13           */ \
149         add             g,g,rT2;        /* 2: temp1 = temp1 + K         */ \
150         rotrwi          rT3,h,22;       /* 2: S0" = a rotr 22           */ \
151         xor             rT0,rT0,rT1;    /* 2: S0 = S0 xor S0'           */ \
152         or              rT2,h,a;        /* 2: maj = a or b              */ \
153         xor             rT3,rT0,rT3;    /* 2: S0 = S0 xor S0"           */ \
154         and             rT1,h,a;        /* 2: maj' = a and b            */ \
155         and             rT2,rT2,b;      /* 2: maj = maj and c           */ \
156         add             c,c,g;          /* 2: d = d + temp1             */ \
157         or              rT2,rT1,rT2;    /* 2: maj = maj or maj'         */ \
158         add             rT3,rT3,rT2;    /* 2: temp2 = S0 + maj          */ \
159         add             g,g,rT3         /* 2: h = temp1 + temp2         */
/*
 * R_CALC_W: extend the message schedule by two words and run the two
 * rounds that consume them.
 *
 *   a..h    registers holding the working variables for the first round;
 *           the second round uses them with every role moved on by one
 *           (h acts as a, d acts as e, g acts as h, ...). Round one
 *           writes h and d, round two writes g and c.
 *   w0      holds the w[-16] pair on entry and is overwritten with the
 *           two new schedule words.
 *   w0,w1   merged with evmergelohi to form the w[-15] pair.
 *   w4,w5   merged with evmergelohi to form the w[-7] pair.
 *   w7      the w[-2] pair.
 *   k       N or C, pasted into CMP_K<k>_LOOP; C inserts the compare on
 *           rT1 that the caller's "bt gt" loop branch consumes.
 *   off     byte offset at rKP of the 64-bit pair of round constants.
 *
 * Comments without a "1:" / "2:" tag describe the schedule computation,
 * done with ev* instructions on 64-bit registers. evrlwi rotates left, so
 * "rotr n" appears as a left rotate by 32-n: 25 gives rotr 7, a further
 * 21 gives rotr 18, 15 gives rotr 17 and 13 gives rotr 19.
 * Round one forms maj as (a and b) + ((a xor b) and c); the two terms
 * have no set bit in common, so the additions are equivalent to an OR.
 * After "evaddw rT1,rT1,w0" rT1 holds both w+k sums; evmergehi copies
 * the upper one into rT0 so each round can add its sum with a plain add.
 * Clobbers rT0-rT3.
 */
161 #define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
162         rotrwi          rT2,e,6;        /* 1: S1 = e rotr 6             */ \
163         evmergelohi     rT0,w0,w1;      /*    w[-15]                    */ \
164         rotrwi          rT3,e,11;       /* 1: S1' = e rotr 11           */ \
165         evsrwiu         rT1,rT0,3;      /*    s0 = w[-15] >> 3          */ \
166         xor             rT2,rT2,rT3;    /* 1: S1 = S1 xor S1'           */ \
167         evrlwi          rT0,rT0,25;     /*    s0' = w[-15] rotr 7       */ \
168         rotrwi          rT3,e,25;       /* 1: S1" = e rotr 25           */ \
169         evxor           rT1,rT1,rT0;    /*    s0 = s0 xor s0'           */ \
170         xor             rT2,rT2,rT3;    /* 1: S1 = S1 xor S1"           */ \
171         evrlwi          rT0,rT0,21;     /*    s0' = w[-15] rotr 18      */ \
172         add             h,h,rT2;        /* 1: temp1 = h + S1            */ \
173         evxor           rT0,rT0,rT1;    /*    s0 = s0 xor s0'           */ \
174         and             rT2,e,f;        /* 1: ch = e and f              */ \
175         evaddw          w0,w0,rT0;      /*    w = w[-16] + s0           */ \
176         andc            rT3,g,e;        /* 1: ch' = ~e and g            */ \
177         evsrwiu         rT0,w7,10;      /*    s1 = w[-2] >> 10          */ \
178         xor             rT2,rT2,rT3;    /* 1: ch = ch xor ch'           */ \
179         evrlwi          rT1,w7,15;      /*    s1' = w[-2] rotr 17       */ \
180         add             h,h,rT2;        /* 1: temp1 = temp1 + ch        */ \
181         evxor           rT0,rT0,rT1;    /*    s1 = s1 xor s1'           */ \
182         rotrwi          rT2,a,2;        /* 1: S0 = a rotr 2             */ \
183         evrlwi          rT1,w7,13;      /*    s1' = w[-2] rotr 19       */ \
184         rotrwi          rT3,a,13;       /* 1: S0' = a rotr 13           */ \
185         evxor           rT0,rT0,rT1;    /*    s1 = s1 xor s1'           */ \
186         xor             rT2,rT2,rT3;    /* 1: S0 = S0 xor S0'           */ \
187         evldw           rT1,off(rKP);   /*    k                         */ \
188         rotrwi          rT3,a,22;       /* 1: S0" = a rotr 22           */ \
189         evaddw          w0,w0,rT0;      /*    w = w + s1                */ \
190         xor             rT2,rT2,rT3;    /* 1: S0 = S0 xor S0"           */ \
191         evmergelohi     rT0,w4,w5;      /*    w[-7]                     */ \
192         and             rT3,a,b;        /* 1: maj = a and b             */ \
193         evaddw          w0,w0,rT0;      /*    w = w + w[-7]             */ \
194         CMP_K##k##_LOOP                                                    \
195         add             rT2,rT2,rT3;    /* 1: temp2 = S0 + maj          */ \
196         evaddw          rT1,rT1,w0;     /*    wk = w + k                */ \
197         xor             rT3,a,b;        /* 1: maj = a xor b             */ \
198         evmergehi       rT0,rT1,rT1;    /*    wk1/wk2                   */ \
199         and             rT3,rT3,c;      /* 1: maj = maj and c           */ \
200         add             h,h,rT0;        /* 1: temp1 = temp1 + wk        */ \
201         add             rT2,rT2,rT3;    /* 1: temp2 = temp2 + maj       */ \
202         add             g,g,rT1;        /* 2: temp1 = temp1 + wk        */ \
203         add             d,d,h;          /* 1: d = d + temp1             */ \
204         rotrwi          rT0,d,6;        /* 2: S1 = e rotr 6             */ \
205         add             h,h,rT2;        /* 1: h = temp1 + temp2         */ \
206         rotrwi          rT1,d,11;       /* 2: S1' = e rotr 11           */ \
207         rotrwi          rT2,d,25;       /* 2: S1" = e rotr 25           */ \
208         xor             rT0,rT0,rT1;    /* 2: S1 = S1 xor S1'           */ \
209         and             rT3,d,e;        /* 2: ch = e and f              */ \
210         xor             rT0,rT0,rT2;    /* 2: S1 = S1 xor S1"           */ \
211         andc            rT1,f,d;        /* 2: ch' = ~e and g            */ \
212         add             g,g,rT0;        /* 2: temp1 = h + S1            */ \
213         xor             rT3,rT3,rT1;    /* 2: ch = ch xor ch'           */ \
214         rotrwi          rT0,h,2;        /* 2: S0 = a rotr 2             */ \
215         add             g,g,rT3;        /* 2: temp1 = temp1 + ch        */ \
216         rotrwi          rT1,h,13;       /* 2: S0' = a rotr 13           */ \
217         rotrwi          rT3,h,22;       /* 2: S0" = a rotr 22           */ \
218         xor             rT0,rT0,rT1;    /* 2: S0 = S0 xor S0'           */ \
219         or              rT2,h,a;        /* 2: maj = a or b              */ \
220         and             rT1,h,a;        /* 2: maj' = a and b            */ \
221         and             rT2,rT2,b;      /* 2: maj = maj and c           */ \
222         xor             rT3,rT0,rT3;    /* 2: S0 = S0 xor S0"           */ \
223         or              rT2,rT1,rT2;    /* 2: maj = maj or maj'         */ \
224         add             c,c,g;          /* 2: d = d + temp1             */ \
225         add             rT3,rT3,rT2;    /* 2: temp2 = S0 + maj          */ \
226         add             g,g,rT3         /* 2: h = temp1 + temp2         */
/*
 * ppc_spe_sha256_transform
 *
 * In:   r3 (rHP)  pointer to the eight 32-bit hash words; loaded at entry
 *                 and rewritten after every block
 *       r4 (rWP)  pointer to the input; 64 bytes are consumed per block
 *       r5        number of 64-byte blocks, moved to CTR
 *
 * A block count of zero returns at once without building a frame or
 * touching memory. Otherwise cr0, CTR, r0 and r4-r12 are modified and
 * r14-r25 are saved/restored through INITIALIZE/FINALIZE.
 */
228 _GLOBAL(ppc_spe_sha256_transform)
            cmpwi           r5,0            /* bdnz tests CTR only after a  */
            beqlr                           /* block: 0 would wrap around   */
229         INITIALIZE
231         mtctr           r5              /* CTR = blocks; r5 becomes rH0 */
232         lwz             rH0,0(rHP)
233         lwz             rH1,4(rHP)
234         lwz             rH2,8(rHP)
235         lwz             rH3,12(rHP)
236         lwz             rH4,16(rHP)
237         lwz             rH5,20(rHP)
238         lwz             rH6,24(rHP)
239         lwz             rH7,28(rHP)
            /* Per block: point rKP back at the first round constant.       */
241 ppc_spe_sha256_main:
242         lis             rKP,PPC_SPE_SHA256_K@ha
243         addi            rKP,rKP,PPC_SPE_SHA256_K@l
            /*
             * Rounds 1-16 on the loaded message words. Each macro call does
             * two rounds, so the a..h arguments move on by two per call.
             */
245         R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
246         R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
247         R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
248         R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
249         R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
250         R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
251         R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
252         R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
            /*
             * Rounds 17-64, sixteen per pass. rKP moves on by 64 bytes per
             * pass; the last R_CALC_W (k = C) sets cr0 for the "bt gt".
             */
253 ppc_spe_sha256_16_rounds:
254         addi            rKP,rKP,64
255         R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
256                  rW0, rW1, rW4, rW5, rW7, N, 0)
257         R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
258                  rW1, rW2, rW5, rW6, rW0, N, 8)
259         R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
260                  rW2, rW3, rW6, rW7, rW1, N, 16)
261         R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
262                  rW3, rW4, rW7, rW0, rW2, N, 24)
263         R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
264                  rW4, rW5, rW0, rW1, rW3, N, 32)
265         R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
266                  rW5, rW6, rW1, rW2, rW4, N, 40)
267         R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
268                  rW6, rW7, rW2, rW3, rW5, N, 48)
269         R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
270                  rW7, rW0, rW3, rW4, rW6, C, 56)
271         bt              gt,ppc_spe_sha256_16_rounds
            /*
             * Reload the previous hash words into rW0-rW7 (the schedule is
             * no longer needed), add them to the working variables and
             * store the sums back as the new hash.
             */
273         lwz             rW0,0(rHP)
274         NEXT_BLOCK
275         lwz             rW1,4(rHP)
276         lwz             rW2,8(rHP)
277         lwz             rW3,12(rHP)
278         lwz             rW4,16(rHP)
279         lwz             rW5,20(rHP)
280         lwz             rW6,24(rHP)
281         lwz             rW7,28(rHP)
283         add             rH0,rH0,rW0
284         stw             rH0,0(rHP)
285         add             rH1,rH1,rW1
286         stw             rH1,4(rHP)
287         add             rH2,rH2,rW2
288         stw             rH2,8(rHP)
289         add             rH3,rH3,rW3
290         stw             rH3,12(rHP)
291         add             rH4,rH4,rW4
292         stw             rH4,16(rHP)
293         add             rH5,rH5,rW5
294         stw             rH5,20(rHP)
295         add             rH6,rH6,rW6
296         stw             rH6,24(rHP)
297         add             rH7,rH7,rW7
298         stw             rH7,28(rHP)
300         bdnz            ppc_spe_sha256_main     /* next block while --CTR != 0 */
302         FINALIZE
303         blr
/*
 * SHA-256 round constants K[0..63], four per line. The code only ever
 * reads this table (through rKP), so it is placed in .rodata rather than
 * writable .data. ".align 5" keeps the table on a 32-byte (2^5) boundary.
 */
305 .section .rodata
306 .align 5
307 PPC_SPE_SHA256_K:
308         .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
309         .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
310         .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
311         .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
312         .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
313         .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
314         .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
315         .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
316         .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
317         .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
318         .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
319         .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
320         .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
321         .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
322         .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
323         .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2