2 * Fast SHA-1 implementation for SPE instruction set (PPC)
4 * This code makes use of the SPE SIMD instruction set as defined in
5 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
6 * Implementation is based on optimization guide notes from
7 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
18 #include <asm/ppc_asm.h>
19 #include <asm/asm-offsets.h>
/*
 * Symbolic names for the registers this file works with. The code
 * further down also uses rW1-rW7, rH1-rH4, rT2 and rT3; their
 * definitions are not among the lines listed here.
 */
21 #define rHP r3 /* pointer to hash value */
22 #define rWP r4 /* pointer to input */
23 #define rKP r5 /* pointer to constants */
25 #define rW0 r14 /* 64 bit round words */
34 #define rH0 r6 /* 32 bit hash values */
40 #define rT0 r22 /* 64 bit temporary */
41 #define rT1 r0 /* 32 bit temporaries */
/*
 * NOTE(review): r23 lies in the r14-r31 range that the PowerPC ABI treats
 * as non-volatile, so "volatile" in the comment below may be a slip -
 * confirm that r23 is saved and restored together with r14 and up.
 */
45 #define rK r23 /* 64 bit constant in volatile register */
59 evlwwsplat rK,12(rKP);
62 stwu r1,-128(r1); /* create stack frame */ \
63 evstdw r14,8(r1); /* We must save non volatile */ \
64 evstdw r15,16(r1); /* registers. Take the chance */ \
65 evstdw r16,24(r1); /* and save the SPE part too */ \
76 evldw r14,8(r1); /* restore SPE registers */ \
87 stw r0,8(r1); /* Delete sensitive data */ \
88 stw r0,16(r1); /* that we might have pushed */ \
89 stw r0,24(r1); /* from other context that runs */ \
90 stw r0,32(r1); /* the same code. Assume that */ \
91 stw r0,40(r1); /* the lower part of the GPRs */ \
92 stw r0,48(r1); /* were already overwritten on */ \
93 stw r0,56(r1); /* the way down to here */ \
97 addi r1,r1,128; /* cleanup stack frame */
/*
 * LOAD_DATA(reg, off), lwz variant: load the 32 bit word found at
 * displacement off from the input pointer rWP into reg. rWP itself is
 * left untouched by this variant.
 * NOTE(review): a second LOAD_DATA definition follows below; presumably a
 * preprocessor conditional that is not shown here selects between them.
 */
100 #define LOAD_DATA(reg, off) \
101 lwz reg,off(rWP); /* load data */
103 addi rWP,rWP,64; /* increment per block */
/*
 * LOAD_DATA(reg, off), lwbrx variant: load the word at the address held
 * in rWP with its bytes reversed, then advance rWP by 4. The off argument
 * is accepted but not used, because the pointer itself walks the input
 * one word per load.
 */
105 #define LOAD_DATA(reg, off) \
106 lwbrx reg,0,rWP; /* load data */ \
107 addi rWP,rWP,4; /* increment per word */
/* rWP was already advanced word by word above, so this expands to nothing */
108 #define NEXT_BLOCK /* nothing to do */
/*
 * R_00_15: two interleaved SHA-1 steps with F = (B and C) or (~B and D),
 * taking both message words directly from the input via LOAD_DATA.
 *
 *   a..e    registers holding the five working variables. Step 1 uses
 *           them in the roles (A,B,C,D,E) = (a,b,c,d,e) and accumulates
 *           into e; step 2 uses (A,B,C,D,E) = (e,a,b,c,d) and accumulates
 *           into d. b (step 1) and a (step 2) are rotated right by 2,
 *           i.e. left by 30, in place.
 *   w0, w1  receive the words loaded for step 1 and step 2. The final
 *           evmergelo packs the low words of w1 and w0 into w1, so w0 is
 *           only a staging register.
 *   k       callers pass 0 or 1; no line of the body as it stands here
 *           expands it. NOTE(review): confirm nothing that consumed k
 *           has been lost.
 *   off     displacement handed to LOAD_DATA for the first word; the
 *           second word uses off+4.
 *
 * rK is added to e and to d as the round constant. rT0, rT1 and rT2 are
 * used as scratch. In the per-line comments the "1:" and "2:" tags name
 * the step an instruction belongs to and the letters A..E are the roles
 * within that step, not the macro parameters.
 */
111 #define R_00_15(a, b, c, d, e, w0, w1, k, off) \
112 LOAD_DATA(w0, off) /* 1: W */ \
113 and rT2,b,c; /* 1: F' = B and C */ \
115 andc rT1,d,b; /* 1: F" = ~B and D */ \
116 rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
117 or rT2,rT2,rT1; /* 1: F = F' or F" */ \
118 add e,e,rT0; /* 1: E = E + A' */ \
119 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
120 add e,e,w0; /* 1: E = E + W */ \
121 LOAD_DATA(w1, off+4) /* 2: W */ \
122 add e,e,rT2; /* 1: E = E + F */ \
123 and rT1,a,b; /* 2: F' = B and C */ \
124 add e,e,rK; /* 1: E = E + K */ \
125 andc rT2,c,a; /* 2: F" = ~B and D */ \
126 add d,d,rK; /* 2: E = E + K */ \
127 or rT2,rT2,rT1; /* 2: F = F' or F" */ \
128 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
129 add d,d,w1; /* 2: E = E + W */ \
130 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
131 add d,d,rT0; /* 2: E = E + A' */ \
132 evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
133 add d,d,rT2 /* 2: E = E + F */
/*
 * R_16_19: two interleaved SHA-1 steps with F = (B and C) or (~B and D),
 * with the two message words derived from earlier ones in one 64 bit
 * SPE register.
 *
 *   a..e    working variables; step 1 uses the roles (a,b,c,d,e) and
 *           accumulates into e, step 2 uses (e,a,b,c,d) and accumulates
 *           into d. b and a are rotated right by 2 (left by 30) in place.
 *   w0      holds the W[-16] pair on entry and is overwritten with the
 *           newly computed pair.
 *   w1, w4  the W[-14] and W[-8] pairs.
 *   w6, w7  sources for the W[-3] pair, which evmergelohi assembles from
 *           the low word of w7 and the high word of w6.
 *   k       callers pass 0 or 2; no line of the body as it stands here
 *           expands it. NOTE(review): confirm nothing that consumed k
 *           has been lost.
 *
 * The schedule W = (W[-16] xor W[-3] xor W[-8] xor W[-14]) rotl 1 is done
 * on both 32 bit halves at once with evxor/evrlwi, and evaddw adds rK to
 * both halves. Step 1 then adds the low word of that sum (rT0), while
 * evmergehi moves its high word into the low word of rT1 for step 2.
 * rT0, rT1 and rT2 are used as scratch.
 */
135 #define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
136 and rT2,b,c; /* 1: F' = B and C */ \
137 evmergelohi rT0,w7,w6; /* W[-3] */ \
138 andc rT1,d,b; /* 1: F" = ~B and D */ \
139 evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
140 or rT1,rT1,rT2; /* 1: F = F' or F" */ \
141 evxor w0,w0,w4; /* W = W xor W[-8] */ \
142 add e,e,rT1; /* 1: E = E + F */ \
143 evxor w0,w0,w1; /* W = W xor W[-14] */ \
144 rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
145 evrlwi w0,w0,1; /* W = W rotl 1 */ \
146 add e,e,rT2; /* 1: E = E + A' */ \
147 evaddw rT0,w0,rK; /* WK = W + K */ \
148 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
150 evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
151 add e,e,rT0; /* 1: E = E + WK */ \
152 add d,d,rT1; /* 2: E = E + WK */ \
153 and rT2,a,b; /* 2: F' = B and C */ \
154 andc rT1,c,a; /* 2: F" = ~B and D */ \
155 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
156 or rT1,rT1,rT2; /* 2: F = F' or F" */ \
157 add d,d,rT0; /* 2: E = E + A' */ \
158 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
159 add d,d,rT1 /* 2: E = E + F */
/*
 * R_20_39: two interleaved SHA-1 steps with the parity function
 * F = B xor C xor D, with the two message words derived from earlier
 * ones in one 64 bit SPE register.
 *
 *   a..e    working variables; step 1 uses the roles (a,b,c,d,e) and
 *           accumulates into e, step 2 uses (e,a,b,c,d) and accumulates
 *           into d. b and a are rotated right by 2 (left by 30) in place.
 *   w0      holds the W[-16] pair on entry and is overwritten with the
 *           newly computed pair.
 *   w1, w4  the W[-14] and W[-8] pairs.
 *   w6, w7  sources for the W[-3] pair (low word of w7, high word of w6).
 *   k       callers pass 0 or 3; no line of the body as it stands here
 *           expands it. NOTE(review): confirm nothing that consumed k
 *           has been lost.
 *
 * evaddw forms W + rK for both halves in rT0. Step 1 adds the low word
 * of rT0; evmergehi copies the high word of rT0 into the low word of rT1,
 * which step 2 adds. rT0, rT1 and rT2 are used as scratch.
 */
161 #define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
162 evmergelohi rT0,w7,w6; /* W[-3] */ \
163 xor rT2,b,c; /* 1: F' = B xor C */ \
164 evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
165 xor rT2,rT2,d; /* 1: F = F' xor D */ \
166 evxor w0,w0,w4; /* W = W xor W[-8] */ \
167 add e,e,rT2; /* 1: E = E + F */ \
168 evxor w0,w0,w1; /* W = W xor W[-14] */ \
169 rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
170 evrlwi w0,w0,1; /* W = W rotl 1 */ \
171 add e,e,rT2; /* 1: E = E + A' */ \
172 evaddw rT0,w0,rK; /* WK = W + K */ \
173 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
175 evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
176 add e,e,rT0; /* 1: E = E + WK */ \
177 xor rT2,a,b; /* 2: F' = B xor C */ \
178 add d,d,rT1; /* 2: E = E + WK */ \
179 xor rT2,rT2,c; /* 2: F = F' xor D */ \
180 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
181 add d,d,rT2; /* 2: E = E + F */ \
182 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
183 add d,d,rT0 /* 2: E = E + A' */
/*
 * R_40_59: two interleaved SHA-1 steps with the majority function,
 * computed as F = (B and C) or ((B or C) and D), with the two message
 * words derived from earlier ones in one 64 bit SPE register.
 *
 *   a..e    working variables; step 1 uses the roles (a,b,c,d,e) and
 *           accumulates into e, step 2 uses (e,a,b,c,d) and accumulates
 *           into d. b and a are rotated right by 2 (left by 30) in place.
 *   w0      holds the W[-16] pair on entry and is overwritten with the
 *           newly computed pair.
 *   w1, w4  the W[-14] and W[-8] pairs.
 *   w6, w7  sources for the W[-3] pair (low word of w7, high word of w6).
 *   k       callers pass 0 or 4; no line of the body as it stands here
 *           expands it. NOTE(review): confirm nothing that consumed k
 *           has been lost.
 *
 * evaddw forms W + rK for both halves in rT0. Step 1 adds the low word
 * of rT0; evmergehi copies the high word of rT0 into the low word of rT1,
 * which step 2 adds. Once step 1 has consumed rT0 it is reused in step 2
 * for the (B or C) and D term and then for A rotl 5. rT0, rT1 and rT2
 * are used as scratch.
 */
185 #define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
186 and rT2,b,c; /* 1: F' = B and C */ \
187 evmergelohi rT0,w7,w6; /* W[-3] */ \
188 or rT1,b,c; /* 1: F" = B or C */ \
189 evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
190 and rT1,d,rT1; /* 1: F" = F" and D */ \
191 evxor w0,w0,w4; /* W = W xor W[-8] */ \
192 or rT2,rT2,rT1; /* 1: F = F' or F" */ \
193 evxor w0,w0,w1; /* W = W xor W[-14] */ \
194 add e,e,rT2; /* 1: E = E + F */ \
195 evrlwi w0,w0,1; /* W = W rotl 1 */ \
196 rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
197 evaddw rT0,w0,rK; /* WK = W + K */ \
198 add e,e,rT2; /* 1: E = E + A' */ \
200 evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
201 rotrwi b,b,2; /* 1: B = B rotl 30 */ \
202 add e,e,rT0; /* 1: E = E + WK */ \
203 and rT2,a,b; /* 2: F' = B and C */ \
204 or rT0,a,b; /* 2: F" = B or C */ \
205 add d,d,rT1; /* 2: E = E + WK */ \
206 and rT0,c,rT0; /* 2: F" = F" and D */ \
207 rotrwi a,a,2; /* 2: B = B rotl 30 */ \
208 or rT2,rT2,rT0; /* 2: F = F' or F" */ \
209 rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
210 add d,d,rT2; /* 2: E = E + F */ \
211 add d,d,rT0 /* 2: E = E + A' */
/*
 * R_60_79: expands to R_20_39 with the same arguments, so steps 60-79
 * run exactly the instruction sequence of steps 20-39 (F = B xor C xor D).
 */
213 #define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
214 R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
/*
 * ppc_spe_sha1_transform
 *
 * The lines shown here load the constant-table address and then expand
 * the 80 SHA-1 steps as 40 invocations of the two-step round macros:
 * 8 x R_00_15, 2 x R_16_19, 10 x R_20_39, 10 x R_40_59 and 10 x R_60_79.
 *
 * Each invocation passes rH0-rH4 in an order rotated by two positions
 * relative to the previous one, matching the way a round macro leaves
 * its results in its d and e arguments, so no register moves are needed
 * between invocations. The round-word registers rW0-rW7 are likewise
 * passed in rotating order.
 *
 * The last macro argument is 0 except on the first R_00_15 (1) and on
 * the final invocation of the 16-19, 20-39 and 40-59 groups (2, 3, 4).
 */
216 _GLOBAL(ppc_spe_sha1_transform)
/* rKP = address of PPC_SPE_SHA1_K: upper half via lis, lower half via ori */
223 lis rKP,PPC_SPE_SHA1_K@h
225 ori rKP,rKP,PPC_SPE_SHA1_K@l
/* steps 0-15: words come from the input, displacement grows by 8 per pair */
229 R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
230 R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
231 R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
232 R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
233 R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
234 R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
/* the last two pairs stage their first word in rT3 instead of an rW register */
235 R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
236 R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
/* steps 16-19: same F as above, words now come from the message schedule */
238 R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
239 R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
/* steps 20-39: F = B xor C xor D */
241 R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
242 R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
243 R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
244 R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
245 R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
246 R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
247 R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
248 R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
249 R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
250 R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
/* steps 40-59: F = (B and C) or ((B or C) and D) */
252 R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
253 R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
254 R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
255 R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
256 R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
257 R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
258 R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
259 R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
260 R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
261 R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
/* steps 60-79: R_60_79 expands to R_20_39, i.e. F = B xor C xor D again */
263 R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
264 R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
265 R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
266 R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
267 R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
268 R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
269 R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
271 R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
273 R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
275 R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
/*
 * Decrement CTR and branch back to ppc_spe_sha1_main while it is non-zero.
 * NOTE(review): the label and the CTR setup are not among the lines shown
 * here; presumably CTR holds the number of blocks still to process.
 */
291 bdnz ppc_spe_sha1_main
/*
 * The four SHA-1 round constants K as given in FIPS 180-4, in step order:
 * steps 0-19, 20-39, 40-59 and 60-79, at byte offsets 0, 4, 8 and 12.
 * NOTE(review): presumably this is the table addressed as PPC_SPE_SHA1_K;
 * its label is not among the lines shown here.
 */
299 .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6