1 // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 // Licensed under the OpenSSL license (the "License"). You may not use
4 // this file except in compliance with the License. You can obtain a copy
5 // in the file LICENSE in the source distribution or at
6 // https://www.openssl.org/source/license.html
8 // ====================================================================
9 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
10 // project. The module is, however, dual licensed under OpenSSL and
11 // CRYPTOGAMS licenses depending on where you obtain it. For further
12 // details see http://www.openssl.org/~appro/cryptogams/.
14 // Permission to use under GPLv2 terms is granted.
15 // ====================================================================
17 // SHA256/512 for ARMv8.
19 // Performance in cycles per processed byte and improvement coefficient
20 // over code generated with "default" compiler:
22 // SHA256-hw SHA256(*) SHA512
23 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
24 // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
25 // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
26 // Denver 2.01 10.5 (+26%) 6.70 (+8%)
27 // X-Gene 20.0 (+100%) 12.8 (+300%(***))
28 // Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
30 // (*) Software SHA256 results are of lesser relevance, presented
31 // mostly for informational purposes.
32 // (**) The result is a trade-off: it's possible to improve it by
33 // 10% (or by 1 cycle per round), but at the cost of 20% loss
34 // on Cortex-A53 (or by 4 cycles per round).
35 // (***) Super-impressive coefficients over gcc-generated code are
36 // indication of some compiler "pathology", most notably code
37 // generated with -mgeneral-regs-only is significantly faster
38 // and the gap is only 40-90%.
42 // Originally it was reckoned that it makes no sense to implement NEON
43 // version of SHA256 for 64-bit processors. This is because performance
44 // improvement on most wide-spread Cortex-A5x processors was observed
45 // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
46 // observed that 32-bit NEON SHA256 performs significantly better than
47 // 64-bit scalar version on *some* of the more recent processors. As
48 // result 64-bit NEON version of SHA256 was added to provide best
49 // all-round performance. For example it executes ~30% faster on X-Gene
50 // and Mongoose. [For reference, NEON version of SHA512 is bound to
51 // deliver much less improvement, likely *negative* on Cortex-A5x.
52 // Which is why NEON support is limited to SHA256.]
55 # include "arm_arch.h"
60 .extern OPENSSL_armcap_P
61 .globl sha512_block_data_order
62 .type sha512_block_data_order,%function
// ---------------------------------------------------------------------
// sha512_block_data_order — scalar SHA-512 compression function.
// In:  x0 = hash context (eight 64-bit state words; grounded by the
//           "load context" ldp from [x0] and the stp stores back to
//           [x0,#2*8..#6*8] at the end),
//      x1 = input pointer (advanced with post-indexed ldp and the
//           "advance input pointer" add),
//      x2 = presumably the number of 128-byte input blocks — it is
//           scaled by lsl#7 (x128) to form the "end of input" — TODO
//           confirm against the caller.
// x30 doubles as the round-constant pointer (see the "*K++" loads and
// "sub x30,x30,#648 // rewind"), so the real return address must come
// back from the frame before returning (restored by the final ldp).
// NOTE(review): this listing appears decimated — the embedded original
// line numbers are non-contiguous, the #ifndef __AARCH64EB__ guards
// below have no visible body or #endif, and the loop labels/branches
// are absent. Comments annotate only what IS visible; do not treat this
// text as a complete or assemblable version of the routine.
// ---------------------------------------------------------------------
64 sha512_block_data_order:
// Prologue: allocate a 128-byte frame and save FP/LR. Saves of the
// callee-saved x19..x28 are not visible in this listing, but the ldp
// reloads from [x29,#16..#80] at the epilogue imply they were stored
// into this frame — presumably by elided stp lines; TODO confirm.
65 stp x29,x30,[sp,#-128]!
75 ldp x20,x21,[x0] // load context
78 add x2,x1,x2,lsl#7 // end of input
85 ldr x19,[x30],#8 // *K++
86 eor x28,x21,x22 // magic seed
// Round body, repeated with rotated register roles: the working
// variables a..h live in x20..x27 (see the "d+=h" targets cycling
// through them), x16/x17 hold the Sigma1(e) and Ch(e,f,g) temporaries,
// and x19/x28 alternate as the Maj accumulator via the
// "a^b, b^c in next round" trick noted in the original comments.
// The ror#18/ror#41 and ror#34/ror#39 pairs match the FIPS 180-4
// SHA-512 Sigma1/Sigma0 rotations; the third rotation of each Sigma is
// presumably folded into the eor ...,ror#23 / ror#5 forms — TODO
// confirm against the full generated file, since lines are elided here.
92 add x27,x27,x19 // h+=K[i]
96 add x27,x27,x3 // h+=X[i]
97 orr x17,x17,x19 // Ch(e,f,g)
98 eor x19,x20,x21 // a^b, b^c in next round
99 eor x16,x16,x6,ror#18 // Sigma1(e)
101 add x27,x27,x17 // h+=Ch(e,f,g)
102 eor x17,x20,x20,ror#5
103 add x27,x27,x16 // h+=Sigma1(e)
104 and x28,x28,x19 // (b^c)&=(a^b)
105 add x23,x23,x27 // d+=h
106 eor x28,x28,x21 // Maj(a,b,c)
107 eor x17,x6,x17,ror#34 // Sigma0(a)
108 add x27,x27,x28 // h+=Maj(a,b,c)
109 ldr x28,[x30],#8 // *K++, x19 in next round
110 //add x27,x27,x17 // h+=Sigma0(a)
// NOTE(review): this guard (and each one below) has no visible body or
// #endif — presumably it wrapped a little-endian byte-reversal of the
// just-loaded message word in the original; confirm before reuse.
111 #ifndef __AARCH64EB__
115 add x27,x27,x17 // h+=Sigma0(a)
117 add x26,x26,x28 // h+=K[i]
118 eor x7,x23,x23,ror#23
121 add x26,x26,x4 // h+=X[i]
122 orr x17,x17,x28 // Ch(e,f,g)
123 eor x28,x27,x20 // a^b, b^c in next round
124 eor x16,x16,x7,ror#18 // Sigma1(e)
126 add x26,x26,x17 // h+=Ch(e,f,g)
127 eor x17,x27,x27,ror#5
128 add x26,x26,x16 // h+=Sigma1(e)
129 and x19,x19,x28 // (b^c)&=(a^b)
130 add x22,x22,x26 // d+=h
131 eor x19,x19,x20 // Maj(a,b,c)
132 eor x17,x7,x17,ror#34 // Sigma0(a)
133 add x26,x26,x19 // h+=Maj(a,b,c)
134 ldr x19,[x30],#8 // *K++, x28 in next round
135 //add x26,x26,x17 // h+=Sigma0(a)
136 #ifndef __AARCH64EB__
139 add x26,x26,x17 // h+=Sigma0(a)
141 add x25,x25,x19 // h+=K[i]
142 eor x8,x22,x22,ror#23
145 add x25,x25,x5 // h+=X[i]
146 orr x17,x17,x19 // Ch(e,f,g)
147 eor x19,x26,x27 // a^b, b^c in next round
148 eor x16,x16,x8,ror#18 // Sigma1(e)
150 add x25,x25,x17 // h+=Ch(e,f,g)
151 eor x17,x26,x26,ror#5
152 add x25,x25,x16 // h+=Sigma1(e)
153 and x28,x28,x19 // (b^c)&=(a^b)
154 add x21,x21,x25 // d+=h
155 eor x28,x28,x27 // Maj(a,b,c)
156 eor x17,x8,x17,ror#34 // Sigma0(a)
157 add x25,x25,x28 // h+=Maj(a,b,c)
158 ldr x28,[x30],#8 // *K++, x19 in next round
159 //add x25,x25,x17 // h+=Sigma0(a)
160 #ifndef __AARCH64EB__
164 add x25,x25,x17 // h+=Sigma0(a)
166 add x24,x24,x28 // h+=K[i]
167 eor x9,x21,x21,ror#23
170 add x24,x24,x6 // h+=X[i]
171 orr x17,x17,x28 // Ch(e,f,g)
172 eor x28,x25,x26 // a^b, b^c in next round
173 eor x16,x16,x9,ror#18 // Sigma1(e)
175 add x24,x24,x17 // h+=Ch(e,f,g)
176 eor x17,x25,x25,ror#5
177 add x24,x24,x16 // h+=Sigma1(e)
178 and x19,x19,x28 // (b^c)&=(a^b)
179 add x20,x20,x24 // d+=h
180 eor x19,x19,x26 // Maj(a,b,c)
181 eor x17,x9,x17,ror#34 // Sigma0(a)
182 add x24,x24,x19 // h+=Maj(a,b,c)
183 ldr x19,[x30],#8 // *K++, x28 in next round
184 //add x24,x24,x17 // h+=Sigma0(a)
185 #ifndef __AARCH64EB__
188 add x24,x24,x17 // h+=Sigma0(a)
190 add x23,x23,x19 // h+=K[i]
191 eor x10,x20,x20,ror#23
194 add x23,x23,x7 // h+=X[i]
195 orr x17,x17,x19 // Ch(e,f,g)
196 eor x19,x24,x25 // a^b, b^c in next round
197 eor x16,x16,x10,ror#18 // Sigma1(e)
199 add x23,x23,x17 // h+=Ch(e,f,g)
200 eor x17,x24,x24,ror#5
201 add x23,x23,x16 // h+=Sigma1(e)
202 and x28,x28,x19 // (b^c)&=(a^b)
203 add x27,x27,x23 // d+=h
204 eor x28,x28,x25 // Maj(a,b,c)
205 eor x17,x10,x17,ror#34 // Sigma0(a)
206 add x23,x23,x28 // h+=Maj(a,b,c)
207 ldr x28,[x30],#8 // *K++, x19 in next round
208 //add x23,x23,x17 // h+=Sigma0(a)
209 #ifndef __AARCH64EB__
213 add x23,x23,x17 // h+=Sigma0(a)
215 add x22,x22,x28 // h+=K[i]
216 eor x11,x27,x27,ror#23
219 add x22,x22,x8 // h+=X[i]
220 orr x17,x17,x28 // Ch(e,f,g)
221 eor x28,x23,x24 // a^b, b^c in next round
222 eor x16,x16,x11,ror#18 // Sigma1(e)
224 add x22,x22,x17 // h+=Ch(e,f,g)
225 eor x17,x23,x23,ror#5
226 add x22,x22,x16 // h+=Sigma1(e)
227 and x19,x19,x28 // (b^c)&=(a^b)
228 add x26,x26,x22 // d+=h
229 eor x19,x19,x24 // Maj(a,b,c)
230 eor x17,x11,x17,ror#34 // Sigma0(a)
231 add x22,x22,x19 // h+=Maj(a,b,c)
232 ldr x19,[x30],#8 // *K++, x28 in next round
233 //add x22,x22,x17 // h+=Sigma0(a)
234 #ifndef __AARCH64EB__
237 add x22,x22,x17 // h+=Sigma0(a)
239 add x21,x21,x19 // h+=K[i]
240 eor x12,x26,x26,ror#23
243 add x21,x21,x9 // h+=X[i]
244 orr x17,x17,x19 // Ch(e,f,g)
245 eor x19,x22,x23 // a^b, b^c in next round
246 eor x16,x16,x12,ror#18 // Sigma1(e)
248 add x21,x21,x17 // h+=Ch(e,f,g)
249 eor x17,x22,x22,ror#5
250 add x21,x21,x16 // h+=Sigma1(e)
251 and x28,x28,x19 // (b^c)&=(a^b)
252 add x25,x25,x21 // d+=h
253 eor x28,x28,x23 // Maj(a,b,c)
254 eor x17,x12,x17,ror#34 // Sigma0(a)
255 add x21,x21,x28 // h+=Maj(a,b,c)
256 ldr x28,[x30],#8 // *K++, x19 in next round
257 //add x21,x21,x17 // h+=Sigma0(a)
258 #ifndef __AARCH64EB__
// Post-indexed ldp: fetch the next two 64-bit message words from the
// input stream (x1 advances by 16).
261 ldp x11,x12,[x1],#2*8
262 add x21,x21,x17 // h+=Sigma0(a)
264 add x20,x20,x28 // h+=K[i]
265 eor x13,x25,x25,ror#23
268 add x20,x20,x10 // h+=X[i]
269 orr x17,x17,x28 // Ch(e,f,g)
270 eor x28,x21,x22 // a^b, b^c in next round
271 eor x16,x16,x13,ror#18 // Sigma1(e)
273 add x20,x20,x17 // h+=Ch(e,f,g)
274 eor x17,x21,x21,ror#5
275 add x20,x20,x16 // h+=Sigma1(e)
276 and x19,x19,x28 // (b^c)&=(a^b)
277 add x24,x24,x20 // d+=h
278 eor x19,x19,x22 // Maj(a,b,c)
279 eor x17,x13,x17,ror#34 // Sigma0(a)
280 add x20,x20,x19 // h+=Maj(a,b,c)
281 ldr x19,[x30],#8 // *K++, x28 in next round
282 //add x20,x20,x17 // h+=Sigma0(a)
283 #ifndef __AARCH64EB__
286 add x20,x20,x17 // h+=Sigma0(a)
288 add x27,x27,x19 // h+=K[i]
289 eor x14,x24,x24,ror#23
292 add x27,x27,x11 // h+=X[i]
293 orr x17,x17,x19 // Ch(e,f,g)
294 eor x19,x20,x21 // a^b, b^c in next round
295 eor x16,x16,x14,ror#18 // Sigma1(e)
297 add x27,x27,x17 // h+=Ch(e,f,g)
298 eor x17,x20,x20,ror#5
299 add x27,x27,x16 // h+=Sigma1(e)
300 and x28,x28,x19 // (b^c)&=(a^b)
301 add x23,x23,x27 // d+=h
302 eor x28,x28,x21 // Maj(a,b,c)
303 eor x17,x14,x17,ror#34 // Sigma0(a)
304 add x27,x27,x28 // h+=Maj(a,b,c)
305 ldr x28,[x30],#8 // *K++, x19 in next round
306 //add x27,x27,x17 // h+=Sigma0(a)
307 #ifndef __AARCH64EB__
310 ldp x13,x14,[x1],#2*8
311 add x27,x27,x17 // h+=Sigma0(a)
313 add x26,x26,x28 // h+=K[i]
314 eor x15,x23,x23,ror#23
317 add x26,x26,x12 // h+=X[i]
318 orr x17,x17,x28 // Ch(e,f,g)
319 eor x28,x27,x20 // a^b, b^c in next round
320 eor x16,x16,x15,ror#18 // Sigma1(e)
322 add x26,x26,x17 // h+=Ch(e,f,g)
323 eor x17,x27,x27,ror#5
324 add x26,x26,x16 // h+=Sigma1(e)
325 and x19,x19,x28 // (b^c)&=(a^b)
326 add x22,x22,x26 // d+=h
327 eor x19,x19,x20 // Maj(a,b,c)
328 eor x17,x15,x17,ror#34 // Sigma0(a)
329 add x26,x26,x19 // h+=Maj(a,b,c)
330 ldr x19,[x30],#8 // *K++, x28 in next round
331 //add x26,x26,x17 // h+=Sigma0(a)
332 #ifndef __AARCH64EB__
335 add x26,x26,x17 // h+=Sigma0(a)
337 add x25,x25,x19 // h+=K[i]
// NOTE(review): from here x0..x2 (the argument registers) are recycled
// as message-schedule scratch — the context pointer must therefore be
// rematerialized before the final stores; the reload is not visible in
// this listing. Confirm against the full file.
338 eor x0,x22,x22,ror#23
341 add x25,x25,x13 // h+=X[i]
342 orr x17,x17,x19 // Ch(e,f,g)
343 eor x19,x26,x27 // a^b, b^c in next round
344 eor x16,x16,x0,ror#18 // Sigma1(e)
346 add x25,x25,x17 // h+=Ch(e,f,g)
347 eor x17,x26,x26,ror#5
348 add x25,x25,x16 // h+=Sigma1(e)
349 and x28,x28,x19 // (b^c)&=(a^b)
350 add x21,x21,x25 // d+=h
351 eor x28,x28,x27 // Maj(a,b,c)
352 eor x17,x0,x17,ror#34 // Sigma0(a)
353 add x25,x25,x28 // h+=Maj(a,b,c)
354 ldr x28,[x30],#8 // *K++, x19 in next round
355 //add x25,x25,x17 // h+=Sigma0(a)
356 #ifndef __AARCH64EB__
360 add x25,x25,x17 // h+=Sigma0(a)
363 add x24,x24,x28 // h+=K[i]
364 eor x6,x21,x21,ror#23
367 add x24,x24,x14 // h+=X[i]
368 orr x17,x17,x28 // Ch(e,f,g)
369 eor x28,x25,x26 // a^b, b^c in next round
370 eor x16,x16,x6,ror#18 // Sigma1(e)
372 add x24,x24,x17 // h+=Ch(e,f,g)
373 eor x17,x25,x25,ror#5
374 add x24,x24,x16 // h+=Sigma1(e)
375 and x19,x19,x28 // (b^c)&=(a^b)
376 add x20,x20,x24 // d+=h
377 eor x19,x19,x26 // Maj(a,b,c)
378 eor x17,x6,x17,ror#34 // Sigma0(a)
379 add x24,x24,x19 // h+=Maj(a,b,c)
380 ldr x19,[x30],#8 // *K++, x28 in next round
381 //add x24,x24,x17 // h+=Sigma0(a)
382 #ifndef __AARCH64EB__
385 add x24,x24,x17 // h+=Sigma0(a)
388 add x23,x23,x19 // h+=K[i]
389 eor x7,x20,x20,ror#23
392 add x23,x23,x15 // h+=X[i]
393 orr x17,x17,x19 // Ch(e,f,g)
394 eor x19,x24,x25 // a^b, b^c in next round
395 eor x16,x16,x7,ror#18 // Sigma1(e)
397 add x23,x23,x17 // h+=Ch(e,f,g)
398 eor x17,x24,x24,ror#5
399 add x23,x23,x16 // h+=Sigma1(e)
400 and x28,x28,x19 // (b^c)&=(a^b)
401 add x27,x27,x23 // d+=h
402 eor x28,x28,x25 // Maj(a,b,c)
403 eor x17,x7,x17,ror#34 // Sigma0(a)
404 add x23,x23,x28 // h+=Maj(a,b,c)
405 ldr x28,[x30],#8 // *K++, x19 in next round
406 //add x23,x23,x17 // h+=Sigma0(a)
407 #ifndef __AARCH64EB__
411 add x23,x23,x17 // h+=Sigma0(a)
414 add x22,x22,x28 // h+=K[i]
415 eor x8,x27,x27,ror#23
418 add x22,x22,x0 // h+=X[i]
419 orr x17,x17,x28 // Ch(e,f,g)
420 eor x28,x23,x24 // a^b, b^c in next round
421 eor x16,x16,x8,ror#18 // Sigma1(e)
423 add x22,x22,x17 // h+=Ch(e,f,g)
424 eor x17,x23,x23,ror#5
425 add x22,x22,x16 // h+=Sigma1(e)
426 and x19,x19,x28 // (b^c)&=(a^b)
427 add x26,x26,x22 // d+=h
428 eor x19,x19,x24 // Maj(a,b,c)
429 eor x17,x8,x17,ror#34 // Sigma0(a)
430 add x22,x22,x19 // h+=Maj(a,b,c)
431 ldr x19,[x30],#8 // *K++, x28 in next round
432 //add x22,x22,x17 // h+=Sigma0(a)
433 #ifndef __AARCH64EB__
437 add x22,x22,x17 // h+=Sigma0(a)
440 add x21,x21,x19 // h+=K[i]
441 eor x9,x26,x26,ror#23
444 add x21,x21,x1 // h+=X[i]
445 orr x17,x17,x19 // Ch(e,f,g)
446 eor x19,x22,x23 // a^b, b^c in next round
447 eor x16,x16,x9,ror#18 // Sigma1(e)
449 add x21,x21,x17 // h+=Ch(e,f,g)
450 eor x17,x22,x22,ror#5
451 add x21,x21,x16 // h+=Sigma1(e)
452 and x28,x28,x19 // (b^c)&=(a^b)
453 add x25,x25,x21 // d+=h
454 eor x28,x28,x23 // Maj(a,b,c)
455 eor x17,x9,x17,ror#34 // Sigma0(a)
456 add x21,x21,x28 // h+=Maj(a,b,c)
457 ldr x28,[x30],#8 // *K++, x19 in next round
458 //add x21,x21,x17 // h+=Sigma0(a)
459 #ifndef __AARCH64EB__
463 add x21,x21,x17 // h+=Sigma0(a)
466 add x20,x20,x28 // h+=K[i]
// From here the rounds interleave the message-schedule expansion:
// sigma0(X[i+1]) (the ...,lsr#7 forms) and sigma1(X[i+14]) (the
// ...,lsr#6 forms) are folded between round steps, per the original
// "sigma0"/"sigma1" comments, with the full Sigma1(e)/Sigma0(a)
// rotation chains (ror#18/ror#41, ror#34/ror#39) now written out.
472 add x20,x20,x2 // h+=X[i]
473 eor x16,x16,x25,ror#18
475 orr x17,x17,x28 // Ch(e,f,g)
476 eor x28,x21,x22 // a^b, b^c in next round
477 eor x16,x16,x25,ror#41 // Sigma1(e)
478 eor x10,x10,x21,ror#34
479 add x20,x20,x17 // h+=Ch(e,f,g)
480 and x19,x19,x28 // (b^c)&=(a^b)
482 eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
483 add x20,x20,x16 // h+=Sigma1(e)
484 eor x19,x19,x22 // Maj(a,b,c)
485 eor x17,x10,x21,ror#39 // Sigma0(a)
486 eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
488 add x24,x24,x20 // d+=h
489 add x20,x20,x19 // h+=Maj(a,b,c)
490 ldr x19,[x30],#8 // *K++, x28 in next round
492 add x20,x20,x17 // h+=Sigma0(a)
498 add x27,x27,x19 // h+=K[i]
504 add x27,x27,x3 // h+=X[i]
505 eor x16,x16,x24,ror#18
507 orr x17,x17,x19 // Ch(e,f,g)
508 eor x19,x20,x21 // a^b, b^c in next round
509 eor x16,x16,x24,ror#41 // Sigma1(e)
510 eor x11,x11,x20,ror#34
511 add x27,x27,x17 // h+=Ch(e,f,g)
512 and x28,x28,x19 // (b^c)&=(a^b)
514 eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
515 add x27,x27,x16 // h+=Sigma1(e)
516 eor x28,x28,x21 // Maj(a,b,c)
517 eor x17,x11,x20,ror#39 // Sigma0(a)
518 eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
520 add x23,x23,x27 // d+=h
521 add x27,x27,x28 // h+=Maj(a,b,c)
522 ldr x28,[x30],#8 // *K++, x19 in next round
524 add x27,x27,x17 // h+=Sigma0(a)
529 add x26,x26,x28 // h+=K[i]
535 add x26,x26,x4 // h+=X[i]
536 eor x16,x16,x23,ror#18
538 orr x17,x17,x28 // Ch(e,f,g)
539 eor x28,x27,x20 // a^b, b^c in next round
540 eor x16,x16,x23,ror#41 // Sigma1(e)
541 eor x12,x12,x27,ror#34
542 add x26,x26,x17 // h+=Ch(e,f,g)
543 and x19,x19,x28 // (b^c)&=(a^b)
544 eor x10,x10,x3,ror#61
545 eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
546 add x26,x26,x16 // h+=Sigma1(e)
547 eor x19,x19,x20 // Maj(a,b,c)
548 eor x17,x12,x27,ror#39 // Sigma0(a)
549 eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
551 add x22,x22,x26 // d+=h
552 add x26,x26,x19 // h+=Maj(a,b,c)
553 ldr x19,[x30],#8 // *K++, x28 in next round
555 add x26,x26,x17 // h+=Sigma0(a)
560 add x25,x25,x19 // h+=K[i]
566 add x25,x25,x5 // h+=X[i]
567 eor x16,x16,x22,ror#18
569 orr x17,x17,x19 // Ch(e,f,g)
570 eor x19,x26,x27 // a^b, b^c in next round
571 eor x16,x16,x22,ror#41 // Sigma1(e)
572 eor x13,x13,x26,ror#34
573 add x25,x25,x17 // h+=Ch(e,f,g)
574 and x28,x28,x19 // (b^c)&=(a^b)
575 eor x11,x11,x4,ror#61
576 eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
577 add x25,x25,x16 // h+=Sigma1(e)
578 eor x28,x28,x27 // Maj(a,b,c)
579 eor x17,x13,x26,ror#39 // Sigma0(a)
580 eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
582 add x21,x21,x25 // d+=h
583 add x25,x25,x28 // h+=Maj(a,b,c)
584 ldr x28,[x30],#8 // *K++, x19 in next round
586 add x25,x25,x17 // h+=Sigma0(a)
591 add x24,x24,x28 // h+=K[i]
597 add x24,x24,x6 // h+=X[i]
598 eor x16,x16,x21,ror#18
600 orr x17,x17,x28 // Ch(e,f,g)
601 eor x28,x25,x26 // a^b, b^c in next round
602 eor x16,x16,x21,ror#41 // Sigma1(e)
603 eor x14,x14,x25,ror#34
604 add x24,x24,x17 // h+=Ch(e,f,g)
605 and x19,x19,x28 // (b^c)&=(a^b)
606 eor x12,x12,x5,ror#61
607 eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
608 add x24,x24,x16 // h+=Sigma1(e)
609 eor x19,x19,x26 // Maj(a,b,c)
610 eor x17,x14,x25,ror#39 // Sigma0(a)
611 eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
613 add x20,x20,x24 // d+=h
614 add x24,x24,x19 // h+=Maj(a,b,c)
615 ldr x19,[x30],#8 // *K++, x28 in next round
617 add x24,x24,x17 // h+=Sigma0(a)
622 add x23,x23,x19 // h+=K[i]
628 add x23,x23,x7 // h+=X[i]
629 eor x16,x16,x20,ror#18
631 orr x17,x17,x19 // Ch(e,f,g)
632 eor x19,x24,x25 // a^b, b^c in next round
633 eor x16,x16,x20,ror#41 // Sigma1(e)
634 eor x15,x15,x24,ror#34
635 add x23,x23,x17 // h+=Ch(e,f,g)
636 and x28,x28,x19 // (b^c)&=(a^b)
637 eor x13,x13,x6,ror#61
638 eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
639 add x23,x23,x16 // h+=Sigma1(e)
640 eor x28,x28,x25 // Maj(a,b,c)
641 eor x17,x15,x24,ror#39 // Sigma0(a)
642 eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
644 add x27,x27,x23 // d+=h
645 add x23,x23,x28 // h+=Maj(a,b,c)
646 ldr x28,[x30],#8 // *K++, x19 in next round
648 add x23,x23,x17 // h+=Sigma0(a)
653 add x22,x22,x28 // h+=K[i]
659 add x22,x22,x8 // h+=X[i]
660 eor x16,x16,x27,ror#18
661 eor x15,x15,x10,ror#8
662 orr x17,x17,x28 // Ch(e,f,g)
663 eor x28,x23,x24 // a^b, b^c in next round
664 eor x16,x16,x27,ror#41 // Sigma1(e)
666 add x22,x22,x17 // h+=Ch(e,f,g)
667 and x19,x19,x28 // (b^c)&=(a^b)
668 eor x14,x14,x7,ror#61
669 eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
670 add x22,x22,x16 // h+=Sigma1(e)
671 eor x19,x19,x24 // Maj(a,b,c)
672 eor x17,x0,x23,ror#39 // Sigma0(a)
673 eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
675 add x26,x26,x22 // d+=h
676 add x22,x22,x19 // h+=Maj(a,b,c)
677 ldr x19,[x30],#8 // *K++, x28 in next round
679 add x22,x22,x17 // h+=Sigma0(a)
684 add x21,x21,x19 // h+=K[i]
690 add x21,x21,x9 // h+=X[i]
691 eor x16,x16,x26,ror#18
693 orr x17,x17,x19 // Ch(e,f,g)
694 eor x19,x22,x23 // a^b, b^c in next round
695 eor x16,x16,x26,ror#41 // Sigma1(e)
697 add x21,x21,x17 // h+=Ch(e,f,g)
698 and x28,x28,x19 // (b^c)&=(a^b)
699 eor x15,x15,x8,ror#61
700 eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
701 add x21,x21,x16 // h+=Sigma1(e)
702 eor x28,x28,x23 // Maj(a,b,c)
703 eor x17,x1,x22,ror#39 // Sigma0(a)
704 eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
706 add x25,x25,x21 // d+=h
707 add x21,x21,x28 // h+=Maj(a,b,c)
708 ldr x28,[x30],#8 // *K++, x19 in next round
710 add x21,x21,x17 // h+=Sigma0(a)
715 add x20,x20,x28 // h+=K[i]
721 add x20,x20,x10 // h+=X[i]
722 eor x16,x16,x25,ror#18
724 orr x17,x17,x28 // Ch(e,f,g)
725 eor x28,x21,x22 // a^b, b^c in next round
726 eor x16,x16,x25,ror#41 // Sigma1(e)
728 add x20,x20,x17 // h+=Ch(e,f,g)
729 and x19,x19,x28 // (b^c)&=(a^b)
731 eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
732 add x20,x20,x16 // h+=Sigma1(e)
733 eor x19,x19,x22 // Maj(a,b,c)
734 eor x17,x2,x21,ror#39 // Sigma0(a)
735 eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
737 add x24,x24,x20 // d+=h
738 add x20,x20,x19 // h+=Maj(a,b,c)
739 ldr x19,[x30],#8 // *K++, x28 in next round
741 add x20,x20,x17 // h+=Sigma0(a)
746 add x27,x27,x19 // h+=K[i]
752 add x27,x27,x11 // h+=X[i]
753 eor x16,x16,x24,ror#18
755 orr x17,x17,x19 // Ch(e,f,g)
756 eor x19,x20,x21 // a^b, b^c in next round
757 eor x16,x16,x24,ror#41 // Sigma1(e)
759 add x27,x27,x17 // h+=Ch(e,f,g)
760 and x28,x28,x19 // (b^c)&=(a^b)
762 eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
763 add x27,x27,x16 // h+=Sigma1(e)
764 eor x28,x28,x21 // Maj(a,b,c)
765 eor x17,x3,x20,ror#39 // Sigma0(a)
766 eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
768 add x23,x23,x27 // d+=h
769 add x27,x27,x28 // h+=Maj(a,b,c)
770 ldr x28,[x30],#8 // *K++, x19 in next round
772 add x27,x27,x17 // h+=Sigma0(a)
777 add x26,x26,x28 // h+=K[i]
783 add x26,x26,x12 // h+=X[i]
784 eor x16,x16,x23,ror#18
786 orr x17,x17,x28 // Ch(e,f,g)
787 eor x28,x27,x20 // a^b, b^c in next round
788 eor x16,x16,x23,ror#41 // Sigma1(e)
790 add x26,x26,x17 // h+=Ch(e,f,g)
791 and x19,x19,x28 // (b^c)&=(a^b)
793 eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
794 add x26,x26,x16 // h+=Sigma1(e)
795 eor x19,x19,x20 // Maj(a,b,c)
796 eor x17,x4,x27,ror#39 // Sigma0(a)
797 eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
799 add x22,x22,x26 // d+=h
800 add x26,x26,x19 // h+=Maj(a,b,c)
801 ldr x19,[x30],#8 // *K++, x28 in next round
803 add x26,x26,x17 // h+=Sigma0(a)
808 add x25,x25,x19 // h+=K[i]
814 add x25,x25,x13 // h+=X[i]
815 eor x16,x16,x22,ror#18
817 orr x17,x17,x19 // Ch(e,f,g)
818 eor x19,x26,x27 // a^b, b^c in next round
819 eor x16,x16,x22,ror#41 // Sigma1(e)
821 add x25,x25,x17 // h+=Ch(e,f,g)
822 and x28,x28,x19 // (b^c)&=(a^b)
824 eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
825 add x25,x25,x16 // h+=Sigma1(e)
826 eor x28,x28,x27 // Maj(a,b,c)
827 eor x17,x5,x26,ror#39 // Sigma0(a)
828 eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
830 add x21,x21,x25 // d+=h
831 add x25,x25,x28 // h+=Maj(a,b,c)
832 ldr x28,[x30],#8 // *K++, x19 in next round
834 add x25,x25,x17 // h+=Sigma0(a)
839 add x24,x24,x28 // h+=K[i]
845 add x24,x24,x14 // h+=X[i]
846 eor x16,x16,x21,ror#18
848 orr x17,x17,x28 // Ch(e,f,g)
849 eor x28,x25,x26 // a^b, b^c in next round
850 eor x16,x16,x21,ror#41 // Sigma1(e)
852 add x24,x24,x17 // h+=Ch(e,f,g)
853 and x19,x19,x28 // (b^c)&=(a^b)
855 eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
856 add x24,x24,x16 // h+=Sigma1(e)
857 eor x19,x19,x26 // Maj(a,b,c)
858 eor x17,x6,x25,ror#39 // Sigma0(a)
859 eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
861 add x20,x20,x24 // d+=h
862 add x24,x24,x19 // h+=Maj(a,b,c)
863 ldr x19,[x30],#8 // *K++, x28 in next round
865 add x24,x24,x17 // h+=Sigma0(a)
870 add x23,x23,x19 // h+=K[i]
876 add x23,x23,x15 // h+=X[i]
877 eor x16,x16,x20,ror#18
879 orr x17,x17,x19 // Ch(e,f,g)
880 eor x19,x24,x25 // a^b, b^c in next round
881 eor x16,x16,x20,ror#41 // Sigma1(e)
883 add x23,x23,x17 // h+=Ch(e,f,g)
884 and x28,x28,x19 // (b^c)&=(a^b)
886 eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
887 add x23,x23,x16 // h+=Sigma1(e)
888 eor x28,x28,x25 // Maj(a,b,c)
889 eor x17,x7,x24,ror#39 // Sigma0(a)
890 eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
892 add x27,x27,x23 // d+=h
893 add x23,x23,x28 // h+=Maj(a,b,c)
894 ldr x28,[x30],#8 // *K++, x19 in next round
896 add x23,x23,x17 // h+=Sigma0(a)
901 add x22,x22,x28 // h+=K[i]
907 add x22,x22,x0 // h+=X[i]
908 eor x16,x16,x27,ror#18
910 orr x17,x17,x28 // Ch(e,f,g)
911 eor x28,x23,x24 // a^b, b^c in next round
912 eor x16,x16,x27,ror#41 // Sigma1(e)
914 add x22,x22,x17 // h+=Ch(e,f,g)
915 and x19,x19,x28 // (b^c)&=(a^b)
917 eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
918 add x22,x22,x16 // h+=Sigma1(e)
919 eor x19,x19,x24 // Maj(a,b,c)
920 eor x17,x8,x23,ror#39 // Sigma0(a)
921 eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
923 add x26,x26,x22 // d+=h
924 add x22,x22,x19 // h+=Maj(a,b,c)
925 ldr x19,[x30],#8 // *K++, x28 in next round
927 add x22,x22,x17 // h+=Sigma0(a)
932 add x21,x21,x19 // h+=K[i]
938 add x21,x21,x1 // h+=X[i]
939 eor x16,x16,x26,ror#18
941 orr x17,x17,x19 // Ch(e,f,g)
942 eor x19,x22,x23 // a^b, b^c in next round
943 eor x16,x16,x26,ror#41 // Sigma1(e)
945 add x21,x21,x17 // h+=Ch(e,f,g)
946 and x28,x28,x19 // (b^c)&=(a^b)
948 eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
949 add x21,x21,x16 // h+=Sigma1(e)
950 eor x28,x28,x23 // Maj(a,b,c)
951 eor x17,x9,x22,ror#39 // Sigma0(a)
952 eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
954 add x25,x25,x21 // d+=h
955 add x21,x21,x28 // h+=Maj(a,b,c)
956 ldr x28,[x30],#8 // *K++, x19 in next round
958 add x21,x21,x17 // h+=Sigma0(a)
963 add x20,x20,x28 // h+=K[i]
969 add x20,x20,x2 // h+=X[i]
970 eor x16,x16,x25,ror#18
972 orr x17,x17,x28 // Ch(e,f,g)
973 eor x28,x21,x22 // a^b, b^c in next round
974 eor x16,x16,x25,ror#41 // Sigma1(e)
975 eor x10,x10,x21,ror#34
976 add x20,x20,x17 // h+=Ch(e,f,g)
977 and x19,x19,x28 // (b^c)&=(a^b)
979 eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
980 add x20,x20,x16 // h+=Sigma1(e)
981 eor x19,x19,x22 // Maj(a,b,c)
982 eor x17,x10,x21,ror#39 // Sigma0(a)
983 eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
985 add x24,x24,x20 // d+=h
986 add x20,x20,x19 // h+=Maj(a,b,c)
987 ldr x19,[x30],#8 // *K++, x28 in next round
989 add x20,x20,x17 // h+=Sigma0(a)
// End of block: rewind the constant pointer over the table just walked
// (648 = 81*8, i.e. the 80 K[] entries plus the zero terminator — TODO
// confirm, since the branch back to the loop head is not visible here),
// advance the input pointer, fold the updated working variables back
// into the context at [x0], then restore callee-saved registers.
995 sub x30,x30,#648 // rewind
999 add x1,x1,#14*8 // advance input pointer
1002 ldp x9,x10,[x0,#6*8]
1009 stp x22,x23,[x0,#2*8]
1013 stp x24,x25,[x0,#4*8]
1014 stp x26,x27,[x0,#6*8]
// Epilogue: reload x19..x28 from the frame (their saving stp lines are
// elided from this listing), then pop FP and the true return address.
1017 ldp x19,x20,[x29,#16]
1019 ldp x21,x22,[x29,#32]
1020 ldp x23,x24,[x29,#48]
1021 ldp x25,x26,[x29,#64]
1022 ldp x27,x28,[x29,#80]
1023 ldp x29,x30,[sp],#128
1025 .size sha512_block_data_order,.-sha512_block_data_order
1028 .type .LK512,%object
// SHA-512 round constants K[0..79] (FIPS 180-4, sec. 4.2.3): forty
// .quad lines carrying two constants each, followed by a zero entry
// marked "terminator" below. The round code walks this table with
// post-indexed "ldr ...,[x30],#8" loads and rewinds by 648 (= 81*8).
// NOTE(review): the ".LK512:" label line itself is not visible in this
// listing — only the .type/.size directives that reference it —
// presumably elided along with other lines; confirm against the full file.
1030 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
1031 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1032 .quad 0x3956c25bf348b538,0x59f111f1b605d019
1033 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
1034 .quad 0xd807aa98a3030242,0x12835b0145706fbe
1035 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1036 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
1037 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
1038 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
1039 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1040 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
1041 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1042 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
1043 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
1044 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
1045 .quad 0x06ca6351e003826f,0x142929670a0e6e70
1046 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
1047 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1048 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
1049 .quad 0x81c2c92e47edaee6,0x92722c851482353b
1050 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
1051 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
1052 .quad 0xd192e819d6ef5218,0xd69906245565a910
1053 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
1054 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
1055 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1056 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1057 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1058 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
1059 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
1060 .quad 0x90befffa23631e28,0xa4506cebde82bde9
1061 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
1062 .quad 0xca273eceea26619c,0xd186b8c721c0c207
1063 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1064 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
1065 .quad 0x113f9804bef90dae,0x1b710b35131c471b
1066 .quad 0x28db77f523047d84,0x32caab7b40c72493
1067 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1068 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1069 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
1070 .quad 0 // terminator
1071 .size .LK512,.-.LK512
// PC-relative offset to the runtime CPU-capability word, used to
// dispatch to the hardware SHA path elsewhere in the original module.
// NOTE(review): both a .long and a .quad form appear back-to-back here;
// in the generated original these are alternatives selected by a
// preprocessor guard (32- vs 64-bit pointer model) that is not visible
// in this listing — confirm before assembling, as emitting both would
// change the data layout.
1076 .long OPENSSL_armcap_P-.
1078 .quad OPENSSL_armcap_P-.
// Module identification string (kept byte-identical; it is emitted data).
1081 .asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// Common symbol: 4-byte capability word, 4-byte aligned.
1084 .comm OPENSSL_armcap_P,4,4