// Copyright (c) 2010 Nils Schneider
// Distributed under the MIT/X11 software license, see the accompanying
// file license.txt or http://www.opensource.org/licenses/mit-license.php.

// 4-way 128-bit SSE2 SHA-256

#ifdef FOURWAYSSE2

#include <string.h>
#include <assert.h>

#include <emmintrin.h> // SSE2 integer intrinsics; also pulls in <xmmintrin.h>
#include <stdint.h>
#include <stdio.h>

// number of nonces hashed per DoubleBlockSHA256 call (NPAR/4 SSE2 passes, 4 lanes each)
#define NPAR 32
extern void DoubleBlockSHA256(const void* pin, void* pout, const void* pinit, unsigned int hash[9][NPAR], const void* init2);
static const unsigned int sha256_consts[] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /*  8 */
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
// These helpers apply &, ^, ~ and | directly to __m128i values, which
// relies on the GCC/Clang vector-operator extension.
static inline __m128i Ch(const __m128i b, const __m128i c, const __m128i d) {
    return (b & c) ^ (~b & d);
}

static inline __m128i Maj(const __m128i b, const __m128i c, const __m128i d) {
    return (b & c) ^ (b & d) ^ (c & d);
}

// SSE2 has no 32-bit rotate instruction, so rotate-right is composed from two shifts.
static inline __m128i ROTR(__m128i x, const int n) {
    return _mm_srli_epi32(x, n) | _mm_slli_epi32(x, 32 - n);
}

static inline __m128i SHR(__m128i x, const int n) {
    return _mm_srli_epi32(x, n);
}
/* SHA256 Functions */
#define BIGSIGMA0_256(x) (ROTR((x), 2) ^ ROTR((x), 13) ^ ROTR((x), 22))
#define BIGSIGMA1_256(x) (ROTR((x), 6) ^ ROTR((x), 11) ^ ROTR((x), 25))
#define SIGMA0_256(x)    (ROTR((x), 7) ^ ROTR((x), 18) ^ SHR((x), 3))
#define SIGMA1_256(x)    (ROTR((x), 17) ^ ROTR((x), 19) ^ SHR((x), 10))
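
/* These are the FIPS 180-2 Sigma/sigma functions, evaluated on four 32-bit
   words at once: the BIGSIGMA forms mix the state registers in each round,
   while the SIGMA forms drive the message-schedule expansion in rounds 16..63. */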
static inline unsigned int store32(const __m128i x, int i) {
    union { unsigned int ret[4]; __m128i x; } box;
    box.x = x;
    return box.ret[i];
}

static inline void store_epi32(const __m128i x, unsigned int *x0, unsigned int *x1, unsigned int *x2, unsigned int *x3) {
    union { unsigned int ret[4]; __m128i x; } box;
    box.x = x;
    *x0 = box.ret[3]; *x1 = box.ret[2]; *x2 = box.ret[1]; *x3 = box.ret[0];
}
#define add4(x0, x1, x2, x3) _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(x0, x1), x2), x3)
#define add5(x0, x1, x2, x3, x4) _mm_add_epi32(add4(x0, x1, x2, x3), x4)
#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
    T1 = add5(h, BIGSIGMA1_256(e), Ch(e, f, g), _mm_set1_epi32(sha256_consts[i]), w); \
    d = _mm_add_epi32(d, T1); \
    h = _mm_add_epi32(T1, _mm_add_epi32(BIGSIGMA0_256(a), Maj(a, b, c)));
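
/* One SHA-256 compression round across all four lanes:
     T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i];  d += T1;
     h  = T1 + Sigma0(a) + Maj(a,b,c)
   Successive invocations pass the registers already rotated (a..h, then h..g,
   and so on), so no per-round register shuffling is needed. */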
static inline void dumpreg(__m128i x, char *msg) {
    union { unsigned int ret[4]; __m128i x; } box;
    box.x = x;
    printf("%s %08x %08x %08x %08x\n", msg, box.ret[0], box.ret[1], box.ret[2], box.ret[3]);
}

#if 1
#define dumpstate(i) printf("%s: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \
    __func__, store32(w0, i), store32(a, i), store32(b, i), store32(c, i), store32(d, i), store32(e, i), store32(f, i), store32(g, i), store32(h, i));
#else
#define dumpstate(i)
#endif
// Align by increasing pointer, must have extra space at end of buffer
template <size_t nBytes, typename T>
T* alignup(T* p)
{
    union
    {
        T* ptr;
        size_t n;
    } u;
    u.ptr = p;
    u.n = (u.n + (nBytes-1)) & ~(nBytes-1);
    return u.ptr;
}
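
// Example: with nBytes = 16, a pointer value of 0x1004 is rounded up to
// 0x1010, since (0x1004 + 15) & ~15 == 0x1010; hence the requirement above
// that the buffer have spare space at its end.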
// Standard SHA-256 initial hash values (FIPS 180-2)
static const unsigned int pSHA256InitState[8] =
{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
unsigned int ScanHash_4WaySSE2(char* pmidstate, char* pdata, char* phash1, char* phash, unsigned int& nHashesDone)
{
    // nNonce aliases the nonce field inside the header's second 64-byte chunk
    unsigned int& nNonce = *(unsigned int*)(pdata + 12);
    for (;;)
    {
        nNonce += NPAR;
        unsigned int thashbuf[9][NPAR];
        unsigned int (&thash)[9][NPAR] = *alignup<16>(&thashbuf);
        DoubleBlockSHA256(pdata, phash1, pmidstate, thash, pSHA256InitState);

        for (int j = 0; j < NPAR; j++)
        {
            // quick test: only a hash whose last 32-bit word is zero can meet the target
            if (thash[7][j] == 0)
            {
                for (int i = 0; i < 32/4; i++)
                    ((unsigned int*)phash)[i] = thash[i][j];
                return nNonce + j;
            }
        }

        if ((nNonce & 0xffff) == 0)
        {
            nHashesDone = 0xffff+1;
            return -1; // no candidate in this slice; (unsigned int)-1 signals the caller
        }
    }
}
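
/* Computes the Bitcoin double SHA-256 for NPAR consecutive nonces, four per
   SSE2 pass: the second 64-byte chunk of the block header (pin) is compressed
   starting from the precomputed midstate (pre), and the resulting 32-byte
   digest, together with the fixed padding in pad, is hashed again from the
   fresh initial state (init). Row 8 of thash records each column's nonce. */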
void DoubleBlockSHA256(const void* pin, void* pad, const void *pre, unsigned int thash[9][NPAR], const void *init)
{
    unsigned int* In = (unsigned int*)pin;
    unsigned int* Pad = (unsigned int*)pad;
    unsigned int* hPre = (unsigned int*)pre;
    unsigned int* hInit = (unsigned int*)init;
    unsigned int k;

    /* vectors used in calculation */
    __m128i w0, w1, w2, w3, w4, w5, w6, w7;
    __m128i w8, w9, w10, w11, w12, w13, w14, w15;
    __m128i T1;
    __m128i a, b, c, d, e, f, g, h;
    __m128i nonce;

    /* nonce offset for vector */
    __m128i offset = _mm_set_epi32(0x00000003, 0x00000002, 0x00000001, 0x00000000);
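
    /* Each __m128i holds one state variable for four independent hashes;
       lane i of a register belongs to the hash whose nonce is In[3] + k + i. */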
    for (k = 0; k < NPAR; k += 4) {
        w0 = _mm_set1_epi32(In[0]);
        w1 = _mm_set1_epi32(In[1]);
        w2 = _mm_set1_epi32(In[2]);
        //w3 = _mm_set1_epi32(In[3]); // w3 carries the nonce and is patched in below
        w4 = _mm_set1_epi32(In[4]);
        w5 = _mm_set1_epi32(In[5]);
        w6 = _mm_set1_epi32(In[6]);
        w7 = _mm_set1_epi32(In[7]);
        w8 = _mm_set1_epi32(In[8]);
        w9 = _mm_set1_epi32(In[9]);
        w10 = _mm_set1_epi32(In[10]);
        w11 = _mm_set1_epi32(In[11]);
        w12 = _mm_set1_epi32(In[12]);
        w13 = _mm_set1_epi32(In[13]);
        w14 = _mm_set1_epi32(In[14]);
        w15 = _mm_set1_epi32(In[15]);

        /* patch the per-lane nonce into w3: lane i gets In[3] + k + i */
        nonce = _mm_set1_epi32(In[3]);
        nonce = _mm_add_epi32(nonce, offset);
        nonce = _mm_add_epi32(nonce, _mm_set1_epi32(k));
        w3 = nonce;

        /* start from the midstate: the chaining value after the header's
           first 64-byte chunk */
        a = _mm_set1_epi32(hPre[0]);
        b = _mm_set1_epi32(hPre[1]);
        c = _mm_set1_epi32(hPre[2]);
        d = _mm_set1_epi32(hPre[3]);
        e = _mm_set1_epi32(hPre[4]);
        f = _mm_set1_epi32(hPre[5]);
        g = _mm_set1_epi32(hPre[6]);
        h = _mm_set1_epi32(hPre[7]);

        SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
        SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
        SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
        SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
        SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
        SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
        SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
        SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
        SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
        SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
        SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
        SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
        SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
        SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
        SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
        SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
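
        /* rounds 16..63: message words are expanded on the fly,
             W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
           with w0..w15 used as a rolling 16-word window */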
        w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
        SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
        w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
        SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
        w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
        SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
        w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
        SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
        w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
        SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
        w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
        SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
        w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
        SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
        w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
        SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
        w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
        SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
        w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
        SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
        w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
        SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
        w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
        SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
        w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
        SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
        w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
        SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
        w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
        SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
        w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
        SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);

        w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
        SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
        w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
        SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
        w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
        SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
        w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
        SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
        w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
        SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
        w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
        SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
        w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
        SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
        w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
        SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
        w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
        SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
        w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
        SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
        w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
        SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
        w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
        SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
        w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
        SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
        w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
        SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
        w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
        SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
        w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
        SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);

        w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
        SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
        w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
        SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
        w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
        SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
        w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
        SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
        w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
        SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
        w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
        SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
        w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
        SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
        w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
        SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
        w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
        SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
        w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
        SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
        w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
        SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
        w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
        SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
        w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
        SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
        w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
        SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
        w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
        SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
        w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
        SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);

/* feed-forward of the first hash: add the midstate back in, and reuse the
   eight result words directly as w0..w7 of the second SHA-256 message */
#define store_load(x, i, dest) \
    T1 = _mm_set1_epi32((hPre)[i]); \
    dest = _mm_add_epi32(T1, x);

        store_load(a, 0, w0);
        store_load(b, 1, w1);
        store_load(c, 2, w2);
        store_load(d, 3, w3);
        store_load(e, 4, w4);
        store_load(f, 5, w5);
        store_load(g, 6, w6);
        store_load(h, 7, w7);

        /* w8..w15 hold the fixed SHA-256 padding for the 32-byte digest */
        w8 = _mm_set1_epi32(Pad[8]);
        w9 = _mm_set1_epi32(Pad[9]);
        w10 = _mm_set1_epi32(Pad[10]);
        w11 = _mm_set1_epi32(Pad[11]);
        w12 = _mm_set1_epi32(Pad[12]);
        w13 = _mm_set1_epi32(Pad[13]);
        w14 = _mm_set1_epi32(Pad[14]);
        w15 = _mm_set1_epi32(Pad[15]);

        /* second hash starts from the standard initial state */
        a = _mm_set1_epi32(hInit[0]);
        b = _mm_set1_epi32(hInit[1]);
        c = _mm_set1_epi32(hInit[2]);
        d = _mm_set1_epi32(hInit[3]);
        e = _mm_set1_epi32(hInit[4]);
        f = _mm_set1_epi32(hInit[5]);
        g = _mm_set1_epi32(hInit[6]);
        h = _mm_set1_epi32(hInit[7]);

        SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
        SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
        SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
        SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
        SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
        SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
        SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
        SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
        SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
        SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
        SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
        SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
        SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
        SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
        SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
        SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);

        w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
        SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
        w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
        SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
        w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
        SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
        w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
        SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
        w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
        SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
        w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
        SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
        w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
        SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
        w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
        SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
        w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
        SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
        w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
        SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
        w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
        SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
        w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
        SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
        w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
        SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
        w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
        SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
        w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
        SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
        w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
        SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);

        w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
        SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
        w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
        SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
        w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
        SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
        w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
        SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
        w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
        SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
        w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
        SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
        w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
        SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
        w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
        SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
        w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
        SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
        w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
        SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
        w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
        SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
        w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
        SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
        w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
        SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
        w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
        SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
        w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
        SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
        w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
        SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);

        w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
        SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
        w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
        SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
        w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
        SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
        w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
        SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
        w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
        SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
        w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
        SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
        w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
        SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
        w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
        SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
        w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
        SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
        w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
        SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
        w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
        SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
        w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
        SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
        w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
        SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
        w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
        SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
        w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
        SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
        w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
        SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);

/* store results directly in thash */
#define store_2(x, i) \
    w0 = _mm_set1_epi32(hInit[i]); \
    *(__m128i *)&(thash)[i][0+k] = _mm_add_epi32(w0, x);

        store_2(a, 0);
        store_2(b, 1);
        store_2(c, 2);
        store_2(d, 3);
        store_2(e, 4);
        store_2(f, 5);
        store_2(g, 6);
        store_2(h, 7);

        /* row 8 records the nonce that produced each column's hash */
        *(__m128i *)&(thash)[8][0+k] = nonce;
    }
}

#endif // FOURWAYSSE2