1 // SPDX-License-Identifier: GPL-2.0
3 // This code is taken from the OpenSSL project but the author (Andy Polyakov)
4 // has relicensed it under the GPLv2. Therefore this program is free software;
5 // you can redistribute it and/or modify it under the terms of the GNU General
6 // Public License version 2 as published by the Free Software Foundation.
8 // The original headers, including the original license headers, are
9 // included below for completeness.
11 // Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
13 // Licensed under the OpenSSL license (the "License"). You may not use
14 // this file except in compliance with the License. You can obtain a copy
15 // in the file LICENSE in the source distribution or at
16 // https://www.openssl.org/source/license.html
18 // ====================================================================
19 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
20 // project. The module is, however, dual licensed under OpenSSL and
21 // CRYPTOGAMS licenses depending on where you obtain it. For further
22 // details see http://www.openssl.org/~appro/cryptogams/.
23 // ====================================================================
25 // SHA256/512 for ARMv8.
27 // Performance in cycles per processed byte and improvement coefficient
28 // over code generated with "default" compiler:
30 // SHA256-hw SHA256(*) SHA512
31 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
32 // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
33 // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
34 // Denver 2.01 10.5 (+26%) 6.70 (+8%)
35 // X-Gene 20.0 (+100%) 12.8 (+300%(***))
36 // Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
38 // (*) Software SHA256 results are of lesser relevance, presented
39 // mostly for informational purposes.
40 // (**) The result is a trade-off: it's possible to improve it by
41 // 10% (or by 1 cycle per round), but at the cost of 20% loss
42 // on Cortex-A53 (or by 4 cycles per round).
43 // (***) Super-impressive coefficients over gcc-generated code are
44 // indication of some compiler "pathology", most notably code
45 // generated with -mgeneral-regs-only is significantly faster
46 // and the gap is only 40-90%.
50 // Originally it was reckoned that it makes no sense to implement NEON
51 // version of SHA256 for 64-bit processors. This is because performance
52 // improvement on most wide-spread Cortex-A5x processors was observed
53 // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
54 // observed that 32-bit NEON SHA256 performs significantly better than
55 // 64-bit scalar version on *some* of the more recent processors. As
56 // result 64-bit NEON version of SHA256 was added to provide best
57 // all-round performance. For example it executes ~30% faster on X-Gene
58 // and Mongoose. [For reference, NEON version of SHA512 is bound to
59 // deliver much less improvement, likely *negative* on Cortex-A5x.
60 // Which is why NEON support is limited to SHA256.]
63 # include "arm_arch.h"
68 .extern OPENSSL_armcap_P
69 .globl sha512_block_data_order
70 .type sha512_block_data_order,%function
72 sha512_block_data_order:
// ---------------------------------------------------------------------
// void sha512_block_data_order(u64 *state, const u8 *in, size_t num)
//   x0 = pointer to the hash state ("load context" below)
//   x1 = input pointer
//   x2 = block count, scaled below by lsl#7 (128-byte SHA-512 blocks)
//       to form the end-of-input pointer
// Working variables a..h live in x20..x27 (see the d+=h / h+=Maj(a,b,c)
// comments); x19 and x28 alternate as the Maj() "magic seed" helper
// between even/odd rounds; x30 walks the .LK512 constant table
// ("*K++") and is rewound by #648 after each 80-round pass.
//
// NOTE(review): this listing is elided — the embedded original line
// numbers skip values, the `#ifndef __AARCH64EB__` guards have no
// visible matching byte-swap/#endif lines, and loop labels are
// missing. Treat as a partial dump; do not assemble as-is.
// ---------------------------------------------------------------------
73 stp x29,x30,[sp,#-128]!
83 ldp x20,x21,[x0] // load context
86 add x2,x1,x2,lsl#7 // end of input
93 ldr x19,[x30],#8 // *K++
94 eor x28,x21,x22 // magic seed
// --- rounds 0..15: message words come straight from the input block;
// --- the __AARCH64EB__ guards bracket (elided) little-endian byte swaps
100 add x27,x27,x19 // h+=K[i]
101 eor x6,x24,x24,ror#23
104 add x27,x27,x3 // h+=X[i]
105 orr x17,x17,x19 // Ch(e,f,g)
106 eor x19,x20,x21 // a^b, b^c in next round
107 eor x16,x16,x6,ror#18 // Sigma1(e)
109 add x27,x27,x17 // h+=Ch(e,f,g)
110 eor x17,x20,x20,ror#5
111 add x27,x27,x16 // h+=Sigma1(e)
112 and x28,x28,x19 // (b^c)&=(a^b)
113 add x23,x23,x27 // d+=h
114 eor x28,x28,x21 // Maj(a,b,c)
115 eor x17,x6,x17,ror#34 // Sigma0(a)
116 add x27,x27,x28 // h+=Maj(a,b,c)
117 ldr x28,[x30],#8 // *K++, x19 in next round
118 //add x27,x27,x17 // h+=Sigma0(a)
119 #ifndef __AARCH64EB__
123 add x27,x27,x17 // h+=Sigma0(a)
125 add x26,x26,x28 // h+=K[i]
126 eor x7,x23,x23,ror#23
129 add x26,x26,x4 // h+=X[i]
130 orr x17,x17,x28 // Ch(e,f,g)
131 eor x28,x27,x20 // a^b, b^c in next round
132 eor x16,x16,x7,ror#18 // Sigma1(e)
134 add x26,x26,x17 // h+=Ch(e,f,g)
135 eor x17,x27,x27,ror#5
136 add x26,x26,x16 // h+=Sigma1(e)
137 and x19,x19,x28 // (b^c)&=(a^b)
138 add x22,x22,x26 // d+=h
139 eor x19,x19,x20 // Maj(a,b,c)
140 eor x17,x7,x17,ror#34 // Sigma0(a)
141 add x26,x26,x19 // h+=Maj(a,b,c)
142 ldr x19,[x30],#8 // *K++, x28 in next round
143 //add x26,x26,x17 // h+=Sigma0(a)
144 #ifndef __AARCH64EB__
147 add x26,x26,x17 // h+=Sigma0(a)
149 add x25,x25,x19 // h+=K[i]
150 eor x8,x22,x22,ror#23
153 add x25,x25,x5 // h+=X[i]
154 orr x17,x17,x19 // Ch(e,f,g)
155 eor x19,x26,x27 // a^b, b^c in next round
156 eor x16,x16,x8,ror#18 // Sigma1(e)
158 add x25,x25,x17 // h+=Ch(e,f,g)
159 eor x17,x26,x26,ror#5
160 add x25,x25,x16 // h+=Sigma1(e)
161 and x28,x28,x19 // (b^c)&=(a^b)
162 add x21,x21,x25 // d+=h
163 eor x28,x28,x27 // Maj(a,b,c)
164 eor x17,x8,x17,ror#34 // Sigma0(a)
165 add x25,x25,x28 // h+=Maj(a,b,c)
166 ldr x28,[x30],#8 // *K++, x19 in next round
167 //add x25,x25,x17 // h+=Sigma0(a)
168 #ifndef __AARCH64EB__
172 add x25,x25,x17 // h+=Sigma0(a)
174 add x24,x24,x28 // h+=K[i]
175 eor x9,x21,x21,ror#23
178 add x24,x24,x6 // h+=X[i]
179 orr x17,x17,x28 // Ch(e,f,g)
180 eor x28,x25,x26 // a^b, b^c in next round
181 eor x16,x16,x9,ror#18 // Sigma1(e)
183 add x24,x24,x17 // h+=Ch(e,f,g)
184 eor x17,x25,x25,ror#5
185 add x24,x24,x16 // h+=Sigma1(e)
186 and x19,x19,x28 // (b^c)&=(a^b)
187 add x20,x20,x24 // d+=h
188 eor x19,x19,x26 // Maj(a,b,c)
189 eor x17,x9,x17,ror#34 // Sigma0(a)
190 add x24,x24,x19 // h+=Maj(a,b,c)
191 ldr x19,[x30],#8 // *K++, x28 in next round
192 //add x24,x24,x17 // h+=Sigma0(a)
193 #ifndef __AARCH64EB__
196 add x24,x24,x17 // h+=Sigma0(a)
198 add x23,x23,x19 // h+=K[i]
199 eor x10,x20,x20,ror#23
202 add x23,x23,x7 // h+=X[i]
203 orr x17,x17,x19 // Ch(e,f,g)
204 eor x19,x24,x25 // a^b, b^c in next round
205 eor x16,x16,x10,ror#18 // Sigma1(e)
207 add x23,x23,x17 // h+=Ch(e,f,g)
208 eor x17,x24,x24,ror#5
209 add x23,x23,x16 // h+=Sigma1(e)
210 and x28,x28,x19 // (b^c)&=(a^b)
211 add x27,x27,x23 // d+=h
212 eor x28,x28,x25 // Maj(a,b,c)
213 eor x17,x10,x17,ror#34 // Sigma0(a)
214 add x23,x23,x28 // h+=Maj(a,b,c)
215 ldr x28,[x30],#8 // *K++, x19 in next round
216 //add x23,x23,x17 // h+=Sigma0(a)
217 #ifndef __AARCH64EB__
221 add x23,x23,x17 // h+=Sigma0(a)
223 add x22,x22,x28 // h+=K[i]
224 eor x11,x27,x27,ror#23
227 add x22,x22,x8 // h+=X[i]
228 orr x17,x17,x28 // Ch(e,f,g)
229 eor x28,x23,x24 // a^b, b^c in next round
230 eor x16,x16,x11,ror#18 // Sigma1(e)
232 add x22,x22,x17 // h+=Ch(e,f,g)
233 eor x17,x23,x23,ror#5
234 add x22,x22,x16 // h+=Sigma1(e)
235 and x19,x19,x28 // (b^c)&=(a^b)
236 add x26,x26,x22 // d+=h
237 eor x19,x19,x24 // Maj(a,b,c)
238 eor x17,x11,x17,ror#34 // Sigma0(a)
239 add x22,x22,x19 // h+=Maj(a,b,c)
240 ldr x19,[x30],#8 // *K++, x28 in next round
241 //add x22,x22,x17 // h+=Sigma0(a)
242 #ifndef __AARCH64EB__
245 add x22,x22,x17 // h+=Sigma0(a)
247 add x21,x21,x19 // h+=K[i]
248 eor x12,x26,x26,ror#23
251 add x21,x21,x9 // h+=X[i]
252 orr x17,x17,x19 // Ch(e,f,g)
253 eor x19,x22,x23 // a^b, b^c in next round
254 eor x16,x16,x12,ror#18 // Sigma1(e)
256 add x21,x21,x17 // h+=Ch(e,f,g)
257 eor x17,x22,x22,ror#5
258 add x21,x21,x16 // h+=Sigma1(e)
259 and x28,x28,x19 // (b^c)&=(a^b)
260 add x25,x25,x21 // d+=h
261 eor x28,x28,x23 // Maj(a,b,c)
262 eor x17,x12,x17,ror#34 // Sigma0(a)
263 add x21,x21,x28 // h+=Maj(a,b,c)
264 ldr x28,[x30],#8 // *K++, x19 in next round
265 //add x21,x21,x17 // h+=Sigma0(a)
266 #ifndef __AARCH64EB__
269 ldp x11,x12,[x1],#2*8
270 add x21,x21,x17 // h+=Sigma0(a)
272 add x20,x20,x28 // h+=K[i]
273 eor x13,x25,x25,ror#23
276 add x20,x20,x10 // h+=X[i]
277 orr x17,x17,x28 // Ch(e,f,g)
278 eor x28,x21,x22 // a^b, b^c in next round
279 eor x16,x16,x13,ror#18 // Sigma1(e)
281 add x20,x20,x17 // h+=Ch(e,f,g)
282 eor x17,x21,x21,ror#5
283 add x20,x20,x16 // h+=Sigma1(e)
284 and x19,x19,x28 // (b^c)&=(a^b)
285 add x24,x24,x20 // d+=h
286 eor x19,x19,x22 // Maj(a,b,c)
287 eor x17,x13,x17,ror#34 // Sigma0(a)
288 add x20,x20,x19 // h+=Maj(a,b,c)
289 ldr x19,[x30],#8 // *K++, x28 in next round
290 //add x20,x20,x17 // h+=Sigma0(a)
291 #ifndef __AARCH64EB__
294 add x20,x20,x17 // h+=Sigma0(a)
296 add x27,x27,x19 // h+=K[i]
297 eor x14,x24,x24,ror#23
300 add x27,x27,x11 // h+=X[i]
301 orr x17,x17,x19 // Ch(e,f,g)
302 eor x19,x20,x21 // a^b, b^c in next round
303 eor x16,x16,x14,ror#18 // Sigma1(e)
305 add x27,x27,x17 // h+=Ch(e,f,g)
306 eor x17,x20,x20,ror#5
307 add x27,x27,x16 // h+=Sigma1(e)
308 and x28,x28,x19 // (b^c)&=(a^b)
309 add x23,x23,x27 // d+=h
310 eor x28,x28,x21 // Maj(a,b,c)
311 eor x17,x14,x17,ror#34 // Sigma0(a)
312 add x27,x27,x28 // h+=Maj(a,b,c)
313 ldr x28,[x30],#8 // *K++, x19 in next round
314 //add x27,x27,x17 // h+=Sigma0(a)
315 #ifndef __AARCH64EB__
318 ldp x13,x14,[x1],#2*8
319 add x27,x27,x17 // h+=Sigma0(a)
321 add x26,x26,x28 // h+=K[i]
322 eor x15,x23,x23,ror#23
325 add x26,x26,x12 // h+=X[i]
326 orr x17,x17,x28 // Ch(e,f,g)
327 eor x28,x27,x20 // a^b, b^c in next round
328 eor x16,x16,x15,ror#18 // Sigma1(e)
330 add x26,x26,x17 // h+=Ch(e,f,g)
331 eor x17,x27,x27,ror#5
332 add x26,x26,x16 // h+=Sigma1(e)
333 and x19,x19,x28 // (b^c)&=(a^b)
334 add x22,x22,x26 // d+=h
335 eor x19,x19,x20 // Maj(a,b,c)
336 eor x17,x15,x17,ror#34 // Sigma0(a)
337 add x26,x26,x19 // h+=Maj(a,b,c)
338 ldr x19,[x30],#8 // *K++, x28 in next round
339 //add x26,x26,x17 // h+=Sigma0(a)
340 #ifndef __AARCH64EB__
343 add x26,x26,x17 // h+=Sigma0(a)
345 add x25,x25,x19 // h+=K[i]
346 eor x0,x22,x22,ror#23
349 add x25,x25,x13 // h+=X[i]
350 orr x17,x17,x19 // Ch(e,f,g)
351 eor x19,x26,x27 // a^b, b^c in next round
352 eor x16,x16,x0,ror#18 // Sigma1(e)
354 add x25,x25,x17 // h+=Ch(e,f,g)
355 eor x17,x26,x26,ror#5
356 add x25,x25,x16 // h+=Sigma1(e)
357 and x28,x28,x19 // (b^c)&=(a^b)
358 add x21,x21,x25 // d+=h
359 eor x28,x28,x27 // Maj(a,b,c)
360 eor x17,x0,x17,ror#34 // Sigma0(a)
361 add x25,x25,x28 // h+=Maj(a,b,c)
362 ldr x28,[x30],#8 // *K++, x19 in next round
363 //add x25,x25,x17 // h+=Sigma0(a)
364 #ifndef __AARCH64EB__
368 add x25,x25,x17 // h+=Sigma0(a)
371 add x24,x24,x28 // h+=K[i]
372 eor x6,x21,x21,ror#23
375 add x24,x24,x14 // h+=X[i]
376 orr x17,x17,x28 // Ch(e,f,g)
377 eor x28,x25,x26 // a^b, b^c in next round
378 eor x16,x16,x6,ror#18 // Sigma1(e)
380 add x24,x24,x17 // h+=Ch(e,f,g)
381 eor x17,x25,x25,ror#5
382 add x24,x24,x16 // h+=Sigma1(e)
383 and x19,x19,x28 // (b^c)&=(a^b)
384 add x20,x20,x24 // d+=h
385 eor x19,x19,x26 // Maj(a,b,c)
386 eor x17,x6,x17,ror#34 // Sigma0(a)
387 add x24,x24,x19 // h+=Maj(a,b,c)
388 ldr x19,[x30],#8 // *K++, x28 in next round
389 //add x24,x24,x17 // h+=Sigma0(a)
390 #ifndef __AARCH64EB__
393 add x24,x24,x17 // h+=Sigma0(a)
396 add x23,x23,x19 // h+=K[i]
397 eor x7,x20,x20,ror#23
400 add x23,x23,x15 // h+=X[i]
401 orr x17,x17,x19 // Ch(e,f,g)
402 eor x19,x24,x25 // a^b, b^c in next round
403 eor x16,x16,x7,ror#18 // Sigma1(e)
405 add x23,x23,x17 // h+=Ch(e,f,g)
406 eor x17,x24,x24,ror#5
407 add x23,x23,x16 // h+=Sigma1(e)
408 and x28,x28,x19 // (b^c)&=(a^b)
409 add x27,x27,x23 // d+=h
410 eor x28,x28,x25 // Maj(a,b,c)
411 eor x17,x7,x17,ror#34 // Sigma0(a)
412 add x23,x23,x28 // h+=Maj(a,b,c)
413 ldr x28,[x30],#8 // *K++, x19 in next round
414 //add x23,x23,x17 // h+=Sigma0(a)
415 #ifndef __AARCH64EB__
419 add x23,x23,x17 // h+=Sigma0(a)
422 add x22,x22,x28 // h+=K[i]
423 eor x8,x27,x27,ror#23
426 add x22,x22,x0 // h+=X[i]
427 orr x17,x17,x28 // Ch(e,f,g)
428 eor x28,x23,x24 // a^b, b^c in next round
429 eor x16,x16,x8,ror#18 // Sigma1(e)
431 add x22,x22,x17 // h+=Ch(e,f,g)
432 eor x17,x23,x23,ror#5
433 add x22,x22,x16 // h+=Sigma1(e)
434 and x19,x19,x28 // (b^c)&=(a^b)
435 add x26,x26,x22 // d+=h
436 eor x19,x19,x24 // Maj(a,b,c)
437 eor x17,x8,x17,ror#34 // Sigma0(a)
438 add x22,x22,x19 // h+=Maj(a,b,c)
439 ldr x19,[x30],#8 // *K++, x28 in next round
440 //add x22,x22,x17 // h+=Sigma0(a)
441 #ifndef __AARCH64EB__
445 add x22,x22,x17 // h+=Sigma0(a)
448 add x21,x21,x19 // h+=K[i]
449 eor x9,x26,x26,ror#23
452 add x21,x21,x1 // h+=X[i]
453 orr x17,x17,x19 // Ch(e,f,g)
454 eor x19,x22,x23 // a^b, b^c in next round
455 eor x16,x16,x9,ror#18 // Sigma1(e)
457 add x21,x21,x17 // h+=Ch(e,f,g)
458 eor x17,x22,x22,ror#5
459 add x21,x21,x16 // h+=Sigma1(e)
460 and x28,x28,x19 // (b^c)&=(a^b)
461 add x25,x25,x21 // d+=h
462 eor x28,x28,x23 // Maj(a,b,c)
463 eor x17,x9,x17,ror#34 // Sigma0(a)
464 add x21,x21,x28 // h+=Maj(a,b,c)
465 ldr x28,[x30],#8 // *K++, x19 in next round
466 //add x21,x21,x17 // h+=Sigma0(a)
467 #ifndef __AARCH64EB__
471 add x21,x21,x17 // h+=Sigma0(a)
// --- rounds 16..79: the message-schedule expansion (sigma0(X[i+1]),
// --- sigma1(X[i+14]) per the eol comments) is interleaved with the
// --- round computation from here on; X[] circulates through x0-x15
474 add x20,x20,x28 // h+=K[i]
480 add x20,x20,x2 // h+=X[i]
481 eor x16,x16,x25,ror#18
483 orr x17,x17,x28 // Ch(e,f,g)
484 eor x28,x21,x22 // a^b, b^c in next round
485 eor x16,x16,x25,ror#41 // Sigma1(e)
486 eor x10,x10,x21,ror#34
487 add x20,x20,x17 // h+=Ch(e,f,g)
488 and x19,x19,x28 // (b^c)&=(a^b)
490 eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
491 add x20,x20,x16 // h+=Sigma1(e)
492 eor x19,x19,x22 // Maj(a,b,c)
493 eor x17,x10,x21,ror#39 // Sigma0(a)
494 eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
496 add x24,x24,x20 // d+=h
497 add x20,x20,x19 // h+=Maj(a,b,c)
498 ldr x19,[x30],#8 // *K++, x28 in next round
500 add x20,x20,x17 // h+=Sigma0(a)
506 add x27,x27,x19 // h+=K[i]
512 add x27,x27,x3 // h+=X[i]
513 eor x16,x16,x24,ror#18
515 orr x17,x17,x19 // Ch(e,f,g)
516 eor x19,x20,x21 // a^b, b^c in next round
517 eor x16,x16,x24,ror#41 // Sigma1(e)
518 eor x11,x11,x20,ror#34
519 add x27,x27,x17 // h+=Ch(e,f,g)
520 and x28,x28,x19 // (b^c)&=(a^b)
522 eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
523 add x27,x27,x16 // h+=Sigma1(e)
524 eor x28,x28,x21 // Maj(a,b,c)
525 eor x17,x11,x20,ror#39 // Sigma0(a)
526 eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
528 add x23,x23,x27 // d+=h
529 add x27,x27,x28 // h+=Maj(a,b,c)
530 ldr x28,[x30],#8 // *K++, x19 in next round
532 add x27,x27,x17 // h+=Sigma0(a)
537 add x26,x26,x28 // h+=K[i]
543 add x26,x26,x4 // h+=X[i]
544 eor x16,x16,x23,ror#18
546 orr x17,x17,x28 // Ch(e,f,g)
547 eor x28,x27,x20 // a^b, b^c in next round
548 eor x16,x16,x23,ror#41 // Sigma1(e)
549 eor x12,x12,x27,ror#34
550 add x26,x26,x17 // h+=Ch(e,f,g)
551 and x19,x19,x28 // (b^c)&=(a^b)
552 eor x10,x10,x3,ror#61
553 eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
554 add x26,x26,x16 // h+=Sigma1(e)
555 eor x19,x19,x20 // Maj(a,b,c)
556 eor x17,x12,x27,ror#39 // Sigma0(a)
557 eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
559 add x22,x22,x26 // d+=h
560 add x26,x26,x19 // h+=Maj(a,b,c)
561 ldr x19,[x30],#8 // *K++, x28 in next round
563 add x26,x26,x17 // h+=Sigma0(a)
568 add x25,x25,x19 // h+=K[i]
574 add x25,x25,x5 // h+=X[i]
575 eor x16,x16,x22,ror#18
577 orr x17,x17,x19 // Ch(e,f,g)
578 eor x19,x26,x27 // a^b, b^c in next round
579 eor x16,x16,x22,ror#41 // Sigma1(e)
580 eor x13,x13,x26,ror#34
581 add x25,x25,x17 // h+=Ch(e,f,g)
582 and x28,x28,x19 // (b^c)&=(a^b)
583 eor x11,x11,x4,ror#61
584 eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
585 add x25,x25,x16 // h+=Sigma1(e)
586 eor x28,x28,x27 // Maj(a,b,c)
587 eor x17,x13,x26,ror#39 // Sigma0(a)
588 eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
590 add x21,x21,x25 // d+=h
591 add x25,x25,x28 // h+=Maj(a,b,c)
592 ldr x28,[x30],#8 // *K++, x19 in next round
594 add x25,x25,x17 // h+=Sigma0(a)
599 add x24,x24,x28 // h+=K[i]
605 add x24,x24,x6 // h+=X[i]
606 eor x16,x16,x21,ror#18
608 orr x17,x17,x28 // Ch(e,f,g)
609 eor x28,x25,x26 // a^b, b^c in next round
610 eor x16,x16,x21,ror#41 // Sigma1(e)
611 eor x14,x14,x25,ror#34
612 add x24,x24,x17 // h+=Ch(e,f,g)
613 and x19,x19,x28 // (b^c)&=(a^b)
614 eor x12,x12,x5,ror#61
615 eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
616 add x24,x24,x16 // h+=Sigma1(e)
617 eor x19,x19,x26 // Maj(a,b,c)
618 eor x17,x14,x25,ror#39 // Sigma0(a)
619 eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
621 add x20,x20,x24 // d+=h
622 add x24,x24,x19 // h+=Maj(a,b,c)
623 ldr x19,[x30],#8 // *K++, x28 in next round
625 add x24,x24,x17 // h+=Sigma0(a)
630 add x23,x23,x19 // h+=K[i]
636 add x23,x23,x7 // h+=X[i]
637 eor x16,x16,x20,ror#18
639 orr x17,x17,x19 // Ch(e,f,g)
640 eor x19,x24,x25 // a^b, b^c in next round
641 eor x16,x16,x20,ror#41 // Sigma1(e)
642 eor x15,x15,x24,ror#34
643 add x23,x23,x17 // h+=Ch(e,f,g)
644 and x28,x28,x19 // (b^c)&=(a^b)
645 eor x13,x13,x6,ror#61
646 eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
647 add x23,x23,x16 // h+=Sigma1(e)
648 eor x28,x28,x25 // Maj(a,b,c)
649 eor x17,x15,x24,ror#39 // Sigma0(a)
650 eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
652 add x27,x27,x23 // d+=h
653 add x23,x23,x28 // h+=Maj(a,b,c)
654 ldr x28,[x30],#8 // *K++, x19 in next round
656 add x23,x23,x17 // h+=Sigma0(a)
661 add x22,x22,x28 // h+=K[i]
667 add x22,x22,x8 // h+=X[i]
668 eor x16,x16,x27,ror#18
669 eor x15,x15,x10,ror#8
670 orr x17,x17,x28 // Ch(e,f,g)
671 eor x28,x23,x24 // a^b, b^c in next round
672 eor x16,x16,x27,ror#41 // Sigma1(e)
674 add x22,x22,x17 // h+=Ch(e,f,g)
675 and x19,x19,x28 // (b^c)&=(a^b)
676 eor x14,x14,x7,ror#61
677 eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
678 add x22,x22,x16 // h+=Sigma1(e)
679 eor x19,x19,x24 // Maj(a,b,c)
680 eor x17,x0,x23,ror#39 // Sigma0(a)
681 eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
683 add x26,x26,x22 // d+=h
684 add x22,x22,x19 // h+=Maj(a,b,c)
685 ldr x19,[x30],#8 // *K++, x28 in next round
687 add x22,x22,x17 // h+=Sigma0(a)
692 add x21,x21,x19 // h+=K[i]
698 add x21,x21,x9 // h+=X[i]
699 eor x16,x16,x26,ror#18
701 orr x17,x17,x19 // Ch(e,f,g)
702 eor x19,x22,x23 // a^b, b^c in next round
703 eor x16,x16,x26,ror#41 // Sigma1(e)
705 add x21,x21,x17 // h+=Ch(e,f,g)
706 and x28,x28,x19 // (b^c)&=(a^b)
707 eor x15,x15,x8,ror#61
708 eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
709 add x21,x21,x16 // h+=Sigma1(e)
710 eor x28,x28,x23 // Maj(a,b,c)
711 eor x17,x1,x22,ror#39 // Sigma0(a)
712 eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
714 add x25,x25,x21 // d+=h
715 add x21,x21,x28 // h+=Maj(a,b,c)
716 ldr x28,[x30],#8 // *K++, x19 in next round
718 add x21,x21,x17 // h+=Sigma0(a)
723 add x20,x20,x28 // h+=K[i]
729 add x20,x20,x10 // h+=X[i]
730 eor x16,x16,x25,ror#18
732 orr x17,x17,x28 // Ch(e,f,g)
733 eor x28,x21,x22 // a^b, b^c in next round
734 eor x16,x16,x25,ror#41 // Sigma1(e)
736 add x20,x20,x17 // h+=Ch(e,f,g)
737 and x19,x19,x28 // (b^c)&=(a^b)
739 eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
740 add x20,x20,x16 // h+=Sigma1(e)
741 eor x19,x19,x22 // Maj(a,b,c)
742 eor x17,x2,x21,ror#39 // Sigma0(a)
743 eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
745 add x24,x24,x20 // d+=h
746 add x20,x20,x19 // h+=Maj(a,b,c)
747 ldr x19,[x30],#8 // *K++, x28 in next round
749 add x20,x20,x17 // h+=Sigma0(a)
754 add x27,x27,x19 // h+=K[i]
760 add x27,x27,x11 // h+=X[i]
761 eor x16,x16,x24,ror#18
763 orr x17,x17,x19 // Ch(e,f,g)
764 eor x19,x20,x21 // a^b, b^c in next round
765 eor x16,x16,x24,ror#41 // Sigma1(e)
767 add x27,x27,x17 // h+=Ch(e,f,g)
768 and x28,x28,x19 // (b^c)&=(a^b)
770 eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
771 add x27,x27,x16 // h+=Sigma1(e)
772 eor x28,x28,x21 // Maj(a,b,c)
773 eor x17,x3,x20,ror#39 // Sigma0(a)
774 eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
776 add x23,x23,x27 // d+=h
777 add x27,x27,x28 // h+=Maj(a,b,c)
778 ldr x28,[x30],#8 // *K++, x19 in next round
780 add x27,x27,x17 // h+=Sigma0(a)
785 add x26,x26,x28 // h+=K[i]
791 add x26,x26,x12 // h+=X[i]
792 eor x16,x16,x23,ror#18
794 orr x17,x17,x28 // Ch(e,f,g)
795 eor x28,x27,x20 // a^b, b^c in next round
796 eor x16,x16,x23,ror#41 // Sigma1(e)
798 add x26,x26,x17 // h+=Ch(e,f,g)
799 and x19,x19,x28 // (b^c)&=(a^b)
801 eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
802 add x26,x26,x16 // h+=Sigma1(e)
803 eor x19,x19,x20 // Maj(a,b,c)
804 eor x17,x4,x27,ror#39 // Sigma0(a)
805 eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
807 add x22,x22,x26 // d+=h
808 add x26,x26,x19 // h+=Maj(a,b,c)
809 ldr x19,[x30],#8 // *K++, x28 in next round
811 add x26,x26,x17 // h+=Sigma0(a)
816 add x25,x25,x19 // h+=K[i]
822 add x25,x25,x13 // h+=X[i]
823 eor x16,x16,x22,ror#18
825 orr x17,x17,x19 // Ch(e,f,g)
826 eor x19,x26,x27 // a^b, b^c in next round
827 eor x16,x16,x22,ror#41 // Sigma1(e)
829 add x25,x25,x17 // h+=Ch(e,f,g)
830 and x28,x28,x19 // (b^c)&=(a^b)
832 eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
833 add x25,x25,x16 // h+=Sigma1(e)
834 eor x28,x28,x27 // Maj(a,b,c)
835 eor x17,x5,x26,ror#39 // Sigma0(a)
836 eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
838 add x21,x21,x25 // d+=h
839 add x25,x25,x28 // h+=Maj(a,b,c)
840 ldr x28,[x30],#8 // *K++, x19 in next round
842 add x25,x25,x17 // h+=Sigma0(a)
847 add x24,x24,x28 // h+=K[i]
853 add x24,x24,x14 // h+=X[i]
854 eor x16,x16,x21,ror#18
856 orr x17,x17,x28 // Ch(e,f,g)
857 eor x28,x25,x26 // a^b, b^c in next round
858 eor x16,x16,x21,ror#41 // Sigma1(e)
860 add x24,x24,x17 // h+=Ch(e,f,g)
861 and x19,x19,x28 // (b^c)&=(a^b)
863 eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
864 add x24,x24,x16 // h+=Sigma1(e)
865 eor x19,x19,x26 // Maj(a,b,c)
866 eor x17,x6,x25,ror#39 // Sigma0(a)
867 eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
869 add x20,x20,x24 // d+=h
870 add x24,x24,x19 // h+=Maj(a,b,c)
871 ldr x19,[x30],#8 // *K++, x28 in next round
873 add x24,x24,x17 // h+=Sigma0(a)
878 add x23,x23,x19 // h+=K[i]
884 add x23,x23,x15 // h+=X[i]
885 eor x16,x16,x20,ror#18
887 orr x17,x17,x19 // Ch(e,f,g)
888 eor x19,x24,x25 // a^b, b^c in next round
889 eor x16,x16,x20,ror#41 // Sigma1(e)
891 add x23,x23,x17 // h+=Ch(e,f,g)
892 and x28,x28,x19 // (b^c)&=(a^b)
894 eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
895 add x23,x23,x16 // h+=Sigma1(e)
896 eor x28,x28,x25 // Maj(a,b,c)
897 eor x17,x7,x24,ror#39 // Sigma0(a)
898 eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
900 add x27,x27,x23 // d+=h
901 add x23,x23,x28 // h+=Maj(a,b,c)
902 ldr x28,[x30],#8 // *K++, x19 in next round
904 add x23,x23,x17 // h+=Sigma0(a)
909 add x22,x22,x28 // h+=K[i]
915 add x22,x22,x0 // h+=X[i]
916 eor x16,x16,x27,ror#18
918 orr x17,x17,x28 // Ch(e,f,g)
919 eor x28,x23,x24 // a^b, b^c in next round
920 eor x16,x16,x27,ror#41 // Sigma1(e)
922 add x22,x22,x17 // h+=Ch(e,f,g)
923 and x19,x19,x28 // (b^c)&=(a^b)
925 eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
926 add x22,x22,x16 // h+=Sigma1(e)
927 eor x19,x19,x24 // Maj(a,b,c)
928 eor x17,x8,x23,ror#39 // Sigma0(a)
929 eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
931 add x26,x26,x22 // d+=h
932 add x22,x22,x19 // h+=Maj(a,b,c)
933 ldr x19,[x30],#8 // *K++, x28 in next round
935 add x22,x22,x17 // h+=Sigma0(a)
940 add x21,x21,x19 // h+=K[i]
946 add x21,x21,x1 // h+=X[i]
947 eor x16,x16,x26,ror#18
949 orr x17,x17,x19 // Ch(e,f,g)
950 eor x19,x22,x23 // a^b, b^c in next round
951 eor x16,x16,x26,ror#41 // Sigma1(e)
953 add x21,x21,x17 // h+=Ch(e,f,g)
954 and x28,x28,x19 // (b^c)&=(a^b)
956 eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
957 add x21,x21,x16 // h+=Sigma1(e)
958 eor x28,x28,x23 // Maj(a,b,c)
959 eor x17,x9,x22,ror#39 // Sigma0(a)
960 eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
962 add x25,x25,x21 // d+=h
963 add x21,x21,x28 // h+=Maj(a,b,c)
964 ldr x28,[x30],#8 // *K++, x19 in next round
966 add x21,x21,x17 // h+=Sigma0(a)
971 add x20,x20,x28 // h+=K[i]
977 add x20,x20,x2 // h+=X[i]
978 eor x16,x16,x25,ror#18
980 orr x17,x17,x28 // Ch(e,f,g)
981 eor x28,x21,x22 // a^b, b^c in next round
982 eor x16,x16,x25,ror#41 // Sigma1(e)
983 eor x10,x10,x21,ror#34
984 add x20,x20,x17 // h+=Ch(e,f,g)
985 and x19,x19,x28 // (b^c)&=(a^b)
987 eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
988 add x20,x20,x16 // h+=Sigma1(e)
989 eor x19,x19,x22 // Maj(a,b,c)
990 eor x17,x10,x21,ror#39 // Sigma0(a)
991 eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
993 add x24,x24,x20 // d+=h
994 add x20,x20,x19 // h+=Maj(a,b,c)
995 ldr x19,[x30],#8 // *K++, x28 in next round
997 add x20,x20,x17 // h+=Sigma0(a)
// --- end of the 80-round pass: step the K pointer back over the table,
// --- advance the input, and fold the working variables into the state
// --- at x0 (the store sequence is partially elided in this listing)
1003 sub x30,x30,#648 // rewind
1007 add x1,x1,#14*8 // advance input pointer
1010 ldp x9,x10,[x0,#6*8]
1017 stp x22,x23,[x0,#2*8]
1021 stp x24,x25,[x0,#4*8]
1022 stp x26,x27,[x0,#6*8]
// --- epilogue: restore callee-saved x19-x28 from the 128-byte frame
// --- and pop the frame record saved by the stp at entry
1025 ldp x19,x20,[x29,#16]
1027 ldp x21,x22,[x29,#32]
1028 ldp x23,x24,[x29,#48]
1029 ldp x25,x26,[x29,#64]
1030 ldp x27,x28,[x29,#80]
1031 ldp x29,x30,[sp],#128
1033 .size sha512_block_data_order,.-sha512_block_data_order
1036 .type .LK512,%object
// SHA-512 round constants K[0..79] (FIPS 180-4), two constants per
// .quad line, followed by a zero terminator.
// NOTE(review): the `.LK512:` label line itself appears to be elided
// from this listing — only the .type/.size bracketing is visible.
1038 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
1039 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1040 .quad 0x3956c25bf348b538,0x59f111f1b605d019
1041 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
1042 .quad 0xd807aa98a3030242,0x12835b0145706fbe
1043 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1044 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
1045 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
1046 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
1047 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1048 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
1049 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1050 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
1051 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
1052 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
1053 .quad 0x06ca6351e003826f,0x142929670a0e6e70
1054 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
1055 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1056 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
1057 .quad 0x81c2c92e47edaee6,0x92722c851482353b
1058 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
1059 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
1060 .quad 0xd192e819d6ef5218,0xd69906245565a910
1061 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
1062 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
1063 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1064 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1065 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1066 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
1067 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
1068 .quad 0x90befffa23631e28,0xa4506cebde82bde9
1069 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
1070 .quad 0xca273eceea26619c,0xd186b8c721c0c207
1071 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1072 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
1073 .quad 0x113f9804bef90dae,0x1b710b35131c471b
1074 .quad 0x28db77f523047d84,0x32caab7b40c72493
1075 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1076 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1077 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
1078 .quad 0 // terminator
1079 .size .LK512,.-.LK512
// NOTE(review): the labels and preprocessor guards that normally
// bracket these capability-word relocations appear elided from this
// listing; both a 32-bit and a 64-bit PC-relative form are present
// back-to-back — presumably selected by a missing #if — confirm
// against the unelided source.
1084 .long OPENSSL_armcap_P-.
1086 .quad OPENSSL_armcap_P-.
// Identification string embedded in the object file.
1089 .asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// Common symbol: 4-byte, 4-aligned capability word shared with C code.
1092 .comm OPENSSL_armcap_P,4,4