/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#define rHP r3 /* pointer to hash values in memory */
#define rKP r24 /* pointer to round constants */
#define rWP r4 /* pointer to input data */

#define rH0 r5 /* 8 32 bit hash values in 8 registers */
#define rH1 r6
#define rH2 r7
#define rH3 r8
#define rH4 r9
#define rH5 r10
#define rH6 r11
#define rH7 r12

#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
#define rW1 r15
#define rW2 r16
#define rW3 r17
#define rW4 r18
#define rW5 r19
#define rW6 r20
#define rW7 r21

#define rT0 r22 /* 64 bit temporaries */
#define rT1 r23
#define rT2 r0 /* 32 bit temporaries */
#define rT3 r25
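
/*
 * A note on register use: with SPE the GPRs are 64 bit wide, so each
 * rW register carries a pair of adjacent message words W[2i]/W[2i+1].
 * The scalar SHA-256 rounds operate on the low 32 bit halves, while the
 * ev* vector instructions update both schedule words per instruction.
 */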

#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi rT1,0;

#define INITIALIZE \
	stwu r1,-128(r1); /* create stack frame */ \
	evstdw r14,8(r1); /* We must save non volatile */ \
	evstdw r15,16(r1); /* registers. Take the chance */ \
	evstdw r16,24(r1); /* and save the SPE part too */ \
	evstdw r17,32(r1); \
	evstdw r18,40(r1); \
	evstdw r19,48(r1); \
	evstdw r20,56(r1); \
	evstdw r21,64(r1); \
	evstdw r22,72(r1); \
	evstdw r23,80(r1); \
	stw r24,88(r1); /* save normal registers */ \
	stw r25,92(r1);

#define FINALIZE \
	evldw r14,8(r1); /* restore SPE registers */ \
	evldw r15,16(r1); \
	evldw r16,24(r1); \
	evldw r17,32(r1); \
	evldw r18,40(r1); \
	evldw r19,48(r1); \
	evldw r20,56(r1); \
	evldw r21,64(r1); \
	evldw r22,72(r1); \
	evldw r23,80(r1); \
	lwz r24,88(r1); /* restore normal registers */ \
	lwz r25,92(r1); \
	xor r0,r0,r0; \
	stw r0,8(r1); /* Delete sensitive data */ \
	stw r0,16(r1); /* that we might have pushed */ \
	stw r0,24(r1); /* from other context that runs */ \
	stw r0,32(r1); /* the same code. Assume that */ \
	stw r0,40(r1); /* the lower part of the GPRs */ \
	stw r0,48(r1); /* was already overwritten on */ \
	stw r0,56(r1); /* the way down to here */ \
	stw r0,64(r1); \
	stw r0,72(r1); \
	stw r0,80(r1); \
	addi r1,r1,128; /* cleanup stack frame */
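
/*
 * Entry point contract (a sketch of how the C glue side is assumed to
 * call this routine; the prototype below is illustrative, not taken
 * verbatim from the glue code):
 *
 *	void ppc_spe_sha256_transform(u32 *hash, const u8 *data,
 *				      u32 nblocks);
 *
 * rHP/rWP point at the hash state and the input data, and the block
 * count feeds the CTR register for the outer per-block loop.
 */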

#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz reg,off(rWP); /* load data */
#define NEXT_BLOCK \
	addi rWP,rWP,64; /* increment per block */
#else
#define LOAD_DATA(reg, off) \
	lwbrx reg,0,rWP; /* load data */ \
	addi rWP,rWP,4; /* increment per word */
#define NEXT_BLOCK /* nothing to do */
#endif
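
/*
 * SHA-256 reads the message as big endian 32 bit words. Big endian
 * kernels can use a plain lwz and advance the data pointer once per
 * 64 byte block; little endian kernels byte swap during the load with
 * lwbrx and advance the pointer per word, so NEXT_BLOCK is a no-op.
 */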

#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off) /* 1: W */ \
	rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
	rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
	rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
	xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
	and rT3,e,f; /* 1: ch = e and f */ \
	xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
	andc rT1,g,e; /* 1: ch' = ~e and g */ \
	lwz rT2,off(rKP); /* 1: K */ \
	xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
	add h,h,rT0; /* 1: temp1 = h + S1 */ \
	add rT3,rT3,w; /* 1: temp1' = ch + w */ \
	rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
	add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
	rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
	add h,h,rT2; /* 1: temp1 = temp1 + K */ \
	rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
	xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
	add d,d,h; /* 1: d = d + temp1 */ \
	xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
	evmergelo w,w,w; /* shift W */ \
	or rT2,a,b; /* 1: maj = a or b */ \
	and rT1,a,b; /* 1: maj' = a and b */ \
	and rT2,rT2,c; /* 1: maj = maj and c */ \
	LOAD_DATA(w, off+4) /* 2: W */ \
	or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
	rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
	add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
	rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
	add h,h,rT3; /* 1: h = temp1 + temp2 */ \
	rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
	xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
	and rT3,d,e; /* 2: ch = e and f */ \
	xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
	andc rT1,f,d; /* 2: ch' = ~e and g */ \
	lwz rT2,off+4(rKP); /* 2: K */ \
	xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
	add g,g,rT0; /* 2: temp1 = h + S1 */ \
	add rT3,rT3,w; /* 2: temp1' = ch + w */ \
	rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
	add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
	rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
	add g,g,rT2; /* 2: temp1 = temp1 + K */ \
	rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
	xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
	or rT2,h,a; /* 2: maj = a or b */ \
	xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
	and rT1,h,a; /* 2: maj' = a and b */ \
	and rT2,rT2,b; /* 2: maj = maj and c */ \
	add c,c,g; /* 2: d = d + temp1 */ \
	or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
	add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
	add g,g,rT3 /* 2: h = temp1 + temp2 */
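
/*
 * For reference, each numbered round interleaved above computes the
 * standard FIPS 180-4 step (a C sketch, rotr = 32 bit rotate right):
 *
 *	S1    = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
 *	ch    = (e & f) ^ (~e & g);
 *	temp1 = h + S1 + ch + K[i] + W[i];
 *	S0    = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
 *	maj   = (a & b) ^ (a & c) ^ (b & c);
 *	d    += temp1;
 *	h     = temp1 + S0 + maj;
 *
 * The macro retires two such rounds, with the working variables shifted
 * one position for the second round instead of being moved around.
 */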

#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
	evmergelohi rT0,w0,w1; /* w[-15] */ \
	rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
	evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
	xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
	evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
	rotrwi rT3,e,25; /* 1: S1" = e rotr 25 */ \
	evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
	xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1" */ \
	evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
	add h,h,rT2; /* 1: temp1 = h + S1 */ \
	evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
	and rT2,e,f; /* 1: ch = e and f */ \
	evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
	andc rT3,g,e; /* 1: ch' = ~e and g */ \
	evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
	xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
	evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
	add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
	evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
	rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
	evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
	rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
	evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
	xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
	evldw rT1,off(rKP); /* k */ \
	rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
	evaddw w0,w0,rT0; /* w = w + s1 */ \
	xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0" */ \
	evmergelohi rT0,w4,w5; /* w[-7] */ \
	and rT3,a,b; /* 1: maj = a and b */ \
	evaddw w0,w0,rT0; /* w = w + w[-7] */ \
	CMP_K##k##_LOOP \
	add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
	evaddw rT1,rT1,w0; /* wk = w + k */ \
	xor rT3,a,b; /* 1: maj = a xor b */ \
	evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
	and rT3,rT3,c; /* 1: maj = maj and c */ \
	add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
	add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
	add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
	add d,d,h; /* 1: d = d + temp1 */ \
	rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
	add h,h,rT2; /* 1: h = temp1 + temp2 */ \
	rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
	rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
	xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
	and rT3,d,e; /* 2: ch = e and f */ \
	xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
	andc rT1,f,d; /* 2: ch' = ~e and g */ \
	add g,g,rT0; /* 2: temp1 = h + S1 */ \
	xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
	rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
	add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
	rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
	rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
	xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
	or rT2,h,a; /* 2: maj = a or b */ \
	and rT1,h,a; /* 2: maj' = a and b */ \
	and rT2,rT2,b; /* 2: maj = maj and c */ \
	xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
	or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
	add c,c,g; /* 2: d = d + temp1 */ \
	add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
	add g,g,rT3 /* 2: h = temp1 + temp2 */
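
/*
 * The ev* instructions interleaved above extend the message schedule
 * two words per macro (a C sketch of what one word update computes):
 *
 *	s0   = rotr(W[i-15],7) ^ rotr(W[i-15],18) ^ (W[i-15] >> 3);
 *	s1   = rotr(W[i-2],17) ^ rotr(W[i-2],19)  ^ (W[i-2] >> 10);
 *	W[i] = W[i-16] + s0 + W[i-7] + s1;
 *
 * evmergelohi builds the unaligned W[i-15]/W[i-7] pairs from adjacent
 * 64 bit registers, and since evrlwi rotates left, a rotate right by n
 * shows up as its 32-n left rotate complement.
 */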

_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr r5
	lwz rH0,0(rHP)
	lwz rH1,4(rHP)
	lwz rH2,8(rHP)
	lwz rH3,12(rHP)
	lwz rH4,16(rHP)
	lwz rH5,20(rHP)
	lwz rH6,24(rHP)
	lwz rH7,28(rHP)

ppc_spe_sha256_main:
	lis rKP,PPC_SPE_SHA256_K@ha
	addi rKP,rKP,PPC_SPE_SHA256_K@l
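
	/*
	 * Each R_LOAD_W/R_CALC_W call retires two rounds. Rather than
	 * shuffling the working variables a..h between registers, the
	 * register arguments rotate two positions per call: what served
	 * as g/h in one call becomes a/b in the next.
	 */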
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt gt,ppc_spe_sha256_16_rounds
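	/*
	 * No round counter is needed for the 48 rounds above:
	 * CMP_KC_LOOP (the k = C variant) compares the last round
	 * constant pair loaded by evldw against zero. The two
	 * intermediate 16 round groups end on positive words
	 * (0x14292967, 0x106aa070); only the final 0xc67178f2 is
	 * negative as a signed word, so "bt gt" falls through exactly
	 * after round 64.
	 */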
	lwz rT2,0(rHP)
	NEXT_BLOCK
	lwz rT3,4(rHP)
	add rH0,rH0,rT2
	add rH1,rH1,rT3
	lwz rT2,8(rHP)
	lwz rT3,12(rHP)
	add rH2,rH2,rT2
	add rH3,rH3,rT3
	lwz rT2,16(rHP)
	lwz rT3,20(rHP)
	add rH4,rH4,rT2
	add rH5,rH5,rT3
	lwz rT2,24(rHP)
	lwz rT3,28(rHP)
	add rH6,rH6,rT2
	add rH7,rH7,rT3
	stw rH0,0(rHP)
	stw rH1,4(rHP)
	stw rH2,8(rHP)
	stw rH3,12(rHP)
	stw rH4,16(rHP)
	stw rH5,20(rHP)
	stw rH6,24(rHP)
	stw rH7,28(rHP)
	bdnz ppc_spe_sha256_main

	FINALIZE
	blr

.data
.align 5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
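/*
 * The table above holds the 64 standard SHA-256 round constants from
 * FIPS 180-4 (the first 32 bits of the fractional parts of the cube
 * roots of the first 64 primes), laid out so that evldw can fetch one
 * pair per load.
 */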