ZIL: Call brt_pending_add() replaying TX_CLONE_RANGE
[zfs.git] / module / icp / asm-aarch64 / sha2 / sha512-armv8.S
blob9c61eeee4d7bdac394a5f9d51925aceed4bdc373
1 /*
2  * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     https://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
17 /*
18  * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
19  * - modified assembly to fit into OpenZFS
20  */
22 #if defined(__aarch64__)
24 .text
/*
 * .LK512: SHA-512 round-constant table, K[0..79], emitted as 80 64-bit
 * words (two per .quad line) followed by a single zero word.
 *
 * The table sits in .text and is 64-byte aligned (.align takes a
 * power-of-two exponent on AArch64 gas, so 6 means 2^6 bytes).
 * zfs_sha512_block_armv7 takes its address with a PC-relative `adr`
 * and walks it one word at a time with post-indexed loads
 * (`ldr xN,[x30],#8`, annotated "*K++" in the round code).
 *
 * NOTE(review): the trailing zero word looks like an end-of-table
 * sentinel for the round loop; the code that tests it is not visible
 * in this part of the file. Confirm before changing the table length
 * or reordering entries.
 */
26 .align  6
27 .type   .LK512,%object	// local symbol, typed as a data object
28 .LK512:
29         .quad   0x428a2f98d728ae22,0x7137449123ef65cd
30         .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
31         .quad   0x3956c25bf348b538,0x59f111f1b605d019
32         .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
33         .quad   0xd807aa98a3030242,0x12835b0145706fbe
34         .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
35         .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
36         .quad   0x9bdc06a725c71235,0xc19bf174cf692694
37         .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
38         .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
39         .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
40         .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
41         .quad   0x983e5152ee66dfab,0xa831c66d2db43210
42         .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
43         .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
44         .quad   0x06ca6351e003826f,0x142929670a0e6e70
45         .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
46         .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
47         .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
48         .quad   0x81c2c92e47edaee6,0x92722c851482353b
49         .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
50         .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
51         .quad   0xd192e819d6ef5218,0xd69906245565a910
52         .quad   0xf40e35855771202a,0x106aa07032bbd1b8
53         .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
54         .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
55         .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
56         .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
57         .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
58         .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
59         .quad   0x90befffa23631e28,0xa4506cebde82bde9
60         .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
61         .quad   0xca273eceea26619c,0xd186b8c721c0c207
62         .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
63         .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
64         .quad   0x113f9804bef90dae,0x1b710b35131c471b
65         .quad   0x28db77f523047d84,0x32caab7b40c72493
66         .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
67         .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
68         .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
69         .quad   0       // terminator
70 .size   .LK512,.-.LK512	// symbol size = bytes emitted since the label
72 .globl  zfs_sha512_block_armv7
73 .type   zfs_sha512_block_armv7,%function
74 .align  6
75 zfs_sha512_block_armv7:
76         hint    #34                                     // bti c
77         stp     x29,x30,[sp,#-128]!
78         add     x29,sp,#0
80         stp     x19,x20,[sp,#16]
81         stp     x21,x22,[sp,#32]
82         stp     x23,x24,[sp,#48]
83         stp     x25,x26,[sp,#64]
84         stp     x27,x28,[sp,#80]
85         sub     sp,sp,#4*8
87         ldp     x20,x21,[x0]                            // load context
88         ldp     x22,x23,[x0,#2*8]
89         ldp     x24,x25,[x0,#4*8]
90         add     x2,x1,x2,lsl#7  // end of input
91         ldp     x26,x27,[x0,#6*8]
92         adr     x30,.LK512
93         stp     x0,x2,[x29,#96]
95 .Loop:
96         ldp     x3,x4,[x1],#2*8
97         ldr     x19,[x30],#8                    // *K++
98         eor     x28,x21,x22                             // magic seed
99         str     x1,[x29,#112]
100 #ifndef __AARCH64EB__
101         rev     x3,x3                   // 0
102 #endif
103         ror     x16,x24,#14
104         add     x27,x27,x19                     // h+=K[i]
105         eor     x6,x24,x24,ror#23
106         and     x17,x25,x24
107         bic     x19,x26,x24
108         add     x27,x27,x3                      // h+=X[i]
109         orr     x17,x17,x19                     // Ch(e,f,g)
110         eor     x19,x20,x21                     // a^b, b^c in next round
111         eor     x16,x16,x6,ror#18       // Sigma1(e)
112         ror     x6,x20,#28
113         add     x27,x27,x17                     // h+=Ch(e,f,g)
114         eor     x17,x20,x20,ror#5
115         add     x27,x27,x16                     // h+=Sigma1(e)
116         and     x28,x28,x19                     // (b^c)&=(a^b)
117         add     x23,x23,x27                     // d+=h
118         eor     x28,x28,x21                     // Maj(a,b,c)
119         eor     x17,x6,x17,ror#34       // Sigma0(a)
120         add     x27,x27,x28                     // h+=Maj(a,b,c)
121         ldr     x28,[x30],#8            // *K++, x19 in next round
122         //add   x27,x27,x17                     // h+=Sigma0(a)
123 #ifndef __AARCH64EB__
124         rev     x4,x4                   // 1
125 #endif
126         ldp     x5,x6,[x1],#2*8
127         add     x27,x27,x17                     // h+=Sigma0(a)
128         ror     x16,x23,#14
129         add     x26,x26,x28                     // h+=K[i]
130         eor     x7,x23,x23,ror#23
131         and     x17,x24,x23
132         bic     x28,x25,x23
133         add     x26,x26,x4                      // h+=X[i]
134         orr     x17,x17,x28                     // Ch(e,f,g)
135         eor     x28,x27,x20                     // a^b, b^c in next round
136         eor     x16,x16,x7,ror#18       // Sigma1(e)
137         ror     x7,x27,#28
138         add     x26,x26,x17                     // h+=Ch(e,f,g)
139         eor     x17,x27,x27,ror#5
140         add     x26,x26,x16                     // h+=Sigma1(e)
141         and     x19,x19,x28                     // (b^c)&=(a^b)
142         add     x22,x22,x26                     // d+=h
143         eor     x19,x19,x20                     // Maj(a,b,c)
144         eor     x17,x7,x17,ror#34       // Sigma0(a)
145         add     x26,x26,x19                     // h+=Maj(a,b,c)
146         ldr     x19,[x30],#8            // *K++, x28 in next round
147         //add   x26,x26,x17                     // h+=Sigma0(a)
148 #ifndef __AARCH64EB__
149         rev     x5,x5                   // 2
150 #endif
151         add     x26,x26,x17                     // h+=Sigma0(a)
152         ror     x16,x22,#14
153         add     x25,x25,x19                     // h+=K[i]
154         eor     x8,x22,x22,ror#23
155         and     x17,x23,x22
156         bic     x19,x24,x22
157         add     x25,x25,x5                      // h+=X[i]
158         orr     x17,x17,x19                     // Ch(e,f,g)
159         eor     x19,x26,x27                     // a^b, b^c in next round
160         eor     x16,x16,x8,ror#18       // Sigma1(e)
161         ror     x8,x26,#28
162         add     x25,x25,x17                     // h+=Ch(e,f,g)
163         eor     x17,x26,x26,ror#5
164         add     x25,x25,x16                     // h+=Sigma1(e)
165         and     x28,x28,x19                     // (b^c)&=(a^b)
166         add     x21,x21,x25                     // d+=h
167         eor     x28,x28,x27                     // Maj(a,b,c)
168         eor     x17,x8,x17,ror#34       // Sigma0(a)
169         add     x25,x25,x28                     // h+=Maj(a,b,c)
170         ldr     x28,[x30],#8            // *K++, x19 in next round
171         //add   x25,x25,x17                     // h+=Sigma0(a)
172 #ifndef __AARCH64EB__
173         rev     x6,x6                   // 3
174 #endif
175         ldp     x7,x8,[x1],#2*8
176         add     x25,x25,x17                     // h+=Sigma0(a)
177         ror     x16,x21,#14
178         add     x24,x24,x28                     // h+=K[i]
179         eor     x9,x21,x21,ror#23
180         and     x17,x22,x21
181         bic     x28,x23,x21
182         add     x24,x24,x6                      // h+=X[i]
183         orr     x17,x17,x28                     // Ch(e,f,g)
184         eor     x28,x25,x26                     // a^b, b^c in next round
185         eor     x16,x16,x9,ror#18       // Sigma1(e)
186         ror     x9,x25,#28
187         add     x24,x24,x17                     // h+=Ch(e,f,g)
188         eor     x17,x25,x25,ror#5
189         add     x24,x24,x16                     // h+=Sigma1(e)
190         and     x19,x19,x28                     // (b^c)&=(a^b)
191         add     x20,x20,x24                     // d+=h
192         eor     x19,x19,x26                     // Maj(a,b,c)
193         eor     x17,x9,x17,ror#34       // Sigma0(a)
194         add     x24,x24,x19                     // h+=Maj(a,b,c)
195         ldr     x19,[x30],#8            // *K++, x28 in next round
196         //add   x24,x24,x17                     // h+=Sigma0(a)
197 #ifndef __AARCH64EB__
198         rev     x7,x7                   // 4
199 #endif
200         add     x24,x24,x17                     // h+=Sigma0(a)
201         ror     x16,x20,#14
202         add     x23,x23,x19                     // h+=K[i]
203         eor     x10,x20,x20,ror#23
204         and     x17,x21,x20
205         bic     x19,x22,x20
206         add     x23,x23,x7                      // h+=X[i]
207         orr     x17,x17,x19                     // Ch(e,f,g)
208         eor     x19,x24,x25                     // a^b, b^c in next round
209         eor     x16,x16,x10,ror#18      // Sigma1(e)
210         ror     x10,x24,#28
211         add     x23,x23,x17                     // h+=Ch(e,f,g)
212         eor     x17,x24,x24,ror#5
213         add     x23,x23,x16                     // h+=Sigma1(e)
214         and     x28,x28,x19                     // (b^c)&=(a^b)
215         add     x27,x27,x23                     // d+=h
216         eor     x28,x28,x25                     // Maj(a,b,c)
217         eor     x17,x10,x17,ror#34      // Sigma0(a)
218         add     x23,x23,x28                     // h+=Maj(a,b,c)
219         ldr     x28,[x30],#8            // *K++, x19 in next round
220         //add   x23,x23,x17                     // h+=Sigma0(a)
221 #ifndef __AARCH64EB__
222         rev     x8,x8                   // 5
223 #endif
224         ldp     x9,x10,[x1],#2*8
225         add     x23,x23,x17                     // h+=Sigma0(a)
226         ror     x16,x27,#14
227         add     x22,x22,x28                     // h+=K[i]
228         eor     x11,x27,x27,ror#23
229         and     x17,x20,x27
230         bic     x28,x21,x27
231         add     x22,x22,x8                      // h+=X[i]
232         orr     x17,x17,x28                     // Ch(e,f,g)
233         eor     x28,x23,x24                     // a^b, b^c in next round
234         eor     x16,x16,x11,ror#18      // Sigma1(e)
235         ror     x11,x23,#28
236         add     x22,x22,x17                     // h+=Ch(e,f,g)
237         eor     x17,x23,x23,ror#5
238         add     x22,x22,x16                     // h+=Sigma1(e)
239         and     x19,x19,x28                     // (b^c)&=(a^b)
240         add     x26,x26,x22                     // d+=h
241         eor     x19,x19,x24                     // Maj(a,b,c)
242         eor     x17,x11,x17,ror#34      // Sigma0(a)
243         add     x22,x22,x19                     // h+=Maj(a,b,c)
244         ldr     x19,[x30],#8            // *K++, x28 in next round
245         //add   x22,x22,x17                     // h+=Sigma0(a)
246 #ifndef __AARCH64EB__
247         rev     x9,x9                   // 6
248 #endif
249         add     x22,x22,x17                     // h+=Sigma0(a)
250         ror     x16,x26,#14
251         add     x21,x21,x19                     // h+=K[i]
252         eor     x12,x26,x26,ror#23
253         and     x17,x27,x26
254         bic     x19,x20,x26
255         add     x21,x21,x9                      // h+=X[i]
256         orr     x17,x17,x19                     // Ch(e,f,g)
257         eor     x19,x22,x23                     // a^b, b^c in next round
258         eor     x16,x16,x12,ror#18      // Sigma1(e)
259         ror     x12,x22,#28
260         add     x21,x21,x17                     // h+=Ch(e,f,g)
261         eor     x17,x22,x22,ror#5
262         add     x21,x21,x16                     // h+=Sigma1(e)
263         and     x28,x28,x19                     // (b^c)&=(a^b)
264         add     x25,x25,x21                     // d+=h
265         eor     x28,x28,x23                     // Maj(a,b,c)
266         eor     x17,x12,x17,ror#34      // Sigma0(a)
267         add     x21,x21,x28                     // h+=Maj(a,b,c)
268         ldr     x28,[x30],#8            // *K++, x19 in next round
269         //add   x21,x21,x17                     // h+=Sigma0(a)
270 #ifndef __AARCH64EB__
271         rev     x10,x10                 // 7
272 #endif
273         ldp     x11,x12,[x1],#2*8
274         add     x21,x21,x17                     // h+=Sigma0(a)
275         ror     x16,x25,#14
276         add     x20,x20,x28                     // h+=K[i]
277         eor     x13,x25,x25,ror#23
278         and     x17,x26,x25
279         bic     x28,x27,x25
280         add     x20,x20,x10                     // h+=X[i]
281         orr     x17,x17,x28                     // Ch(e,f,g)
282         eor     x28,x21,x22                     // a^b, b^c in next round
283         eor     x16,x16,x13,ror#18      // Sigma1(e)
284         ror     x13,x21,#28
285         add     x20,x20,x17                     // h+=Ch(e,f,g)
286         eor     x17,x21,x21,ror#5
287         add     x20,x20,x16                     // h+=Sigma1(e)
288         and     x19,x19,x28                     // (b^c)&=(a^b)
289         add     x24,x24,x20                     // d+=h
290         eor     x19,x19,x22                     // Maj(a,b,c)
291         eor     x17,x13,x17,ror#34      // Sigma0(a)
292         add     x20,x20,x19                     // h+=Maj(a,b,c)
293         ldr     x19,[x30],#8            // *K++, x28 in next round
294         //add   x20,x20,x17                     // h+=Sigma0(a)
295 #ifndef __AARCH64EB__
296         rev     x11,x11                 // 8
297 #endif
298         add     x20,x20,x17                     // h+=Sigma0(a)
299         ror     x16,x24,#14
300         add     x27,x27,x19                     // h+=K[i]
301         eor     x14,x24,x24,ror#23
302         and     x17,x25,x24
303         bic     x19,x26,x24
304         add     x27,x27,x11                     // h+=X[i]
305         orr     x17,x17,x19                     // Ch(e,f,g)
306         eor     x19,x20,x21                     // a^b, b^c in next round
307         eor     x16,x16,x14,ror#18      // Sigma1(e)
308         ror     x14,x20,#28
309         add     x27,x27,x17                     // h+=Ch(e,f,g)
310         eor     x17,x20,x20,ror#5
311         add     x27,x27,x16                     // h+=Sigma1(e)
312         and     x28,x28,x19                     // (b^c)&=(a^b)
313         add     x23,x23,x27                     // d+=h
314         eor     x28,x28,x21                     // Maj(a,b,c)
315         eor     x17,x14,x17,ror#34      // Sigma0(a)
316         add     x27,x27,x28                     // h+=Maj(a,b,c)
317         ldr     x28,[x30],#8            // *K++, x19 in next round
318         //add   x27,x27,x17                     // h+=Sigma0(a)
319 #ifndef __AARCH64EB__
320         rev     x12,x12                 // 9
321 #endif
322         ldp     x13,x14,[x1],#2*8
323         add     x27,x27,x17                     // h+=Sigma0(a)
324         ror     x16,x23,#14
325         add     x26,x26,x28                     // h+=K[i]
326         eor     x15,x23,x23,ror#23
327         and     x17,x24,x23
328         bic     x28,x25,x23
329         add     x26,x26,x12                     // h+=X[i]
330         orr     x17,x17,x28                     // Ch(e,f,g)
331         eor     x28,x27,x20                     // a^b, b^c in next round
332         eor     x16,x16,x15,ror#18      // Sigma1(e)
333         ror     x15,x27,#28
334         add     x26,x26,x17                     // h+=Ch(e,f,g)
335         eor     x17,x27,x27,ror#5
336         add     x26,x26,x16                     // h+=Sigma1(e)
337         and     x19,x19,x28                     // (b^c)&=(a^b)
338         add     x22,x22,x26                     // d+=h
339         eor     x19,x19,x20                     // Maj(a,b,c)
340         eor     x17,x15,x17,ror#34      // Sigma0(a)
341         add     x26,x26,x19                     // h+=Maj(a,b,c)
342         ldr     x19,[x30],#8            // *K++, x28 in next round
343         //add   x26,x26,x17                     // h+=Sigma0(a)
344 #ifndef __AARCH64EB__
345         rev     x13,x13                 // 10
346 #endif
347         add     x26,x26,x17                     // h+=Sigma0(a)
348         ror     x16,x22,#14
349         add     x25,x25,x19                     // h+=K[i]
350         eor     x0,x22,x22,ror#23
351         and     x17,x23,x22
352         bic     x19,x24,x22
353         add     x25,x25,x13                     // h+=X[i]
354         orr     x17,x17,x19                     // Ch(e,f,g)
355         eor     x19,x26,x27                     // a^b, b^c in next round
356         eor     x16,x16,x0,ror#18       // Sigma1(e)
357         ror     x0,x26,#28
358         add     x25,x25,x17                     // h+=Ch(e,f,g)
359         eor     x17,x26,x26,ror#5
360         add     x25,x25,x16                     // h+=Sigma1(e)
361         and     x28,x28,x19                     // (b^c)&=(a^b)
362         add     x21,x21,x25                     // d+=h
363         eor     x28,x28,x27                     // Maj(a,b,c)
364         eor     x17,x0,x17,ror#34       // Sigma0(a)
365         add     x25,x25,x28                     // h+=Maj(a,b,c)
366         ldr     x28,[x30],#8            // *K++, x19 in next round
367         //add   x25,x25,x17                     // h+=Sigma0(a)
368 #ifndef __AARCH64EB__
369         rev     x14,x14                 // 11
370 #endif
371         ldp     x15,x0,[x1],#2*8
372         add     x25,x25,x17                     // h+=Sigma0(a)
373         str     x6,[sp,#24]
374         ror     x16,x21,#14
375         add     x24,x24,x28                     // h+=K[i]
376         eor     x6,x21,x21,ror#23
377         and     x17,x22,x21
378         bic     x28,x23,x21
379         add     x24,x24,x14                     // h+=X[i]
380         orr     x17,x17,x28                     // Ch(e,f,g)
381         eor     x28,x25,x26                     // a^b, b^c in next round
382         eor     x16,x16,x6,ror#18       // Sigma1(e)
383         ror     x6,x25,#28
384         add     x24,x24,x17                     // h+=Ch(e,f,g)
385         eor     x17,x25,x25,ror#5
386         add     x24,x24,x16                     // h+=Sigma1(e)
387         and     x19,x19,x28                     // (b^c)&=(a^b)
388         add     x20,x20,x24                     // d+=h
389         eor     x19,x19,x26                     // Maj(a,b,c)
390         eor     x17,x6,x17,ror#34       // Sigma0(a)
391         add     x24,x24,x19                     // h+=Maj(a,b,c)
392         ldr     x19,[x30],#8            // *K++, x28 in next round
393         //add   x24,x24,x17                     // h+=Sigma0(a)
394 #ifndef __AARCH64EB__
395         rev     x15,x15                 // 12
396 #endif
397         add     x24,x24,x17                     // h+=Sigma0(a)
398         str     x7,[sp,#0]
399         ror     x16,x20,#14
400         add     x23,x23,x19                     // h+=K[i]
401         eor     x7,x20,x20,ror#23
402         and     x17,x21,x20
403         bic     x19,x22,x20
404         add     x23,x23,x15                     // h+=X[i]
405         orr     x17,x17,x19                     // Ch(e,f,g)
406         eor     x19,x24,x25                     // a^b, b^c in next round
407         eor     x16,x16,x7,ror#18       // Sigma1(e)
408         ror     x7,x24,#28
409         add     x23,x23,x17                     // h+=Ch(e,f,g)
410         eor     x17,x24,x24,ror#5
411         add     x23,x23,x16                     // h+=Sigma1(e)
412         and     x28,x28,x19                     // (b^c)&=(a^b)
413         add     x27,x27,x23                     // d+=h
414         eor     x28,x28,x25                     // Maj(a,b,c)
415         eor     x17,x7,x17,ror#34       // Sigma0(a)
416         add     x23,x23,x28                     // h+=Maj(a,b,c)
417         ldr     x28,[x30],#8            // *K++, x19 in next round
418         //add   x23,x23,x17                     // h+=Sigma0(a)
419 #ifndef __AARCH64EB__
420         rev     x0,x0                   // 13
421 #endif
422         ldp     x1,x2,[x1]
423         add     x23,x23,x17                     // h+=Sigma0(a)
424         str     x8,[sp,#8]
425         ror     x16,x27,#14
426         add     x22,x22,x28                     // h+=K[i]
427         eor     x8,x27,x27,ror#23
428         and     x17,x20,x27
429         bic     x28,x21,x27
430         add     x22,x22,x0                      // h+=X[i]
431         orr     x17,x17,x28                     // Ch(e,f,g)
432         eor     x28,x23,x24                     // a^b, b^c in next round
433         eor     x16,x16,x8,ror#18       // Sigma1(e)
434         ror     x8,x23,#28
435         add     x22,x22,x17                     // h+=Ch(e,f,g)
436         eor     x17,x23,x23,ror#5
437         add     x22,x22,x16                     // h+=Sigma1(e)
438         and     x19,x19,x28                     // (b^c)&=(a^b)
439         add     x26,x26,x22                     // d+=h
440         eor     x19,x19,x24                     // Maj(a,b,c)
441         eor     x17,x8,x17,ror#34       // Sigma0(a)
442         add     x22,x22,x19                     // h+=Maj(a,b,c)
443         ldr     x19,[x30],#8            // *K++, x28 in next round
444         //add   x22,x22,x17                     // h+=Sigma0(a)
445 #ifndef __AARCH64EB__
446         rev     x1,x1                   // 14
447 #endif
448         ldr     x6,[sp,#24]
449         add     x22,x22,x17                     // h+=Sigma0(a)
450         str     x9,[sp,#16]
451         ror     x16,x26,#14
452         add     x21,x21,x19                     // h+=K[i]
453         eor     x9,x26,x26,ror#23
454         and     x17,x27,x26
455         bic     x19,x20,x26
456         add     x21,x21,x1                      // h+=X[i]
457         orr     x17,x17,x19                     // Ch(e,f,g)
458         eor     x19,x22,x23                     // a^b, b^c in next round
459         eor     x16,x16,x9,ror#18       // Sigma1(e)
460         ror     x9,x22,#28
461         add     x21,x21,x17                     // h+=Ch(e,f,g)
462         eor     x17,x22,x22,ror#5
463         add     x21,x21,x16                     // h+=Sigma1(e)
464         and     x28,x28,x19                     // (b^c)&=(a^b)
465         add     x25,x25,x21                     // d+=h
466         eor     x28,x28,x23                     // Maj(a,b,c)
467         eor     x17,x9,x17,ror#34       // Sigma0(a)
468         add     x21,x21,x28                     // h+=Maj(a,b,c)
469         ldr     x28,[x30],#8            // *K++, x19 in next round
470         //add   x21,x21,x17                     // h+=Sigma0(a)
471 #ifndef __AARCH64EB__
472         rev     x2,x2                   // 15
473 #endif
474         ldr     x7,[sp,#0]
475         add     x21,x21,x17                     // h+=Sigma0(a)
476         str     x10,[sp,#24]
477         ror     x16,x25,#14
478         add     x20,x20,x28                     // h+=K[i]
479         ror     x9,x4,#1
480         and     x17,x26,x25
481         ror     x8,x1,#19
482         bic     x28,x27,x25
483         ror     x10,x21,#28
484         add     x20,x20,x2                      // h+=X[i]
485         eor     x16,x16,x25,ror#18
486         eor     x9,x9,x4,ror#8
487         orr     x17,x17,x28                     // Ch(e,f,g)
488         eor     x28,x21,x22                     // a^b, b^c in next round
489         eor     x16,x16,x25,ror#41      // Sigma1(e)
490         eor     x10,x10,x21,ror#34
491         add     x20,x20,x17                     // h+=Ch(e,f,g)
492         and     x19,x19,x28                     // (b^c)&=(a^b)
493         eor     x8,x8,x1,ror#61
494         eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
495         add     x20,x20,x16                     // h+=Sigma1(e)
496         eor     x19,x19,x22                     // Maj(a,b,c)
497         eor     x17,x10,x21,ror#39      // Sigma0(a)
498         eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
499         add     x3,x3,x12
500         add     x24,x24,x20                     // d+=h
501         add     x20,x20,x19                     // h+=Maj(a,b,c)
502         ldr     x19,[x30],#8            // *K++, x28 in next round
503         add     x3,x3,x9
504         add     x20,x20,x17                     // h+=Sigma0(a)
505         add     x3,x3,x8
506 .Loop_16_xx:
507         ldr     x8,[sp,#8]
508         str     x11,[sp,#0]
509         ror     x16,x24,#14
510         add     x27,x27,x19                     // h+=K[i]
511         ror     x10,x5,#1
512         and     x17,x25,x24
513         ror     x9,x2,#19
514         bic     x19,x26,x24
515         ror     x11,x20,#28
516         add     x27,x27,x3                      // h+=X[i]
517         eor     x16,x16,x24,ror#18
518         eor     x10,x10,x5,ror#8
519         orr     x17,x17,x19                     // Ch(e,f,g)
520         eor     x19,x20,x21                     // a^b, b^c in next round
521         eor     x16,x16,x24,ror#41      // Sigma1(e)
522         eor     x11,x11,x20,ror#34
523         add     x27,x27,x17                     // h+=Ch(e,f,g)
524         and     x28,x28,x19                     // (b^c)&=(a^b)
525         eor     x9,x9,x2,ror#61
526         eor     x10,x10,x5,lsr#7        // sigma0(X[i+1])
527         add     x27,x27,x16                     // h+=Sigma1(e)
528         eor     x28,x28,x21                     // Maj(a,b,c)
529         eor     x17,x11,x20,ror#39      // Sigma0(a)
530         eor     x9,x9,x2,lsr#6  // sigma1(X[i+14])
531         add     x4,x4,x13
532         add     x23,x23,x27                     // d+=h
533         add     x27,x27,x28                     // h+=Maj(a,b,c)
534         ldr     x28,[x30],#8            // *K++, x19 in next round
535         add     x4,x4,x10
536         add     x27,x27,x17                     // h+=Sigma0(a)
537         add     x4,x4,x9
538         ldr     x9,[sp,#16]
539         str     x12,[sp,#8]
540         ror     x16,x23,#14
541         add     x26,x26,x28                     // h+=K[i]
542         ror     x11,x6,#1
543         and     x17,x24,x23
544         ror     x10,x3,#19
545         bic     x28,x25,x23
546         ror     x12,x27,#28
547         add     x26,x26,x4                      // h+=X[i]
548         eor     x16,x16,x23,ror#18
549         eor     x11,x11,x6,ror#8
550         orr     x17,x17,x28                     // Ch(e,f,g)
551         eor     x28,x27,x20                     // a^b, b^c in next round
552         eor     x16,x16,x23,ror#41      // Sigma1(e)
553         eor     x12,x12,x27,ror#34
554         add     x26,x26,x17                     // h+=Ch(e,f,g)
555         and     x19,x19,x28                     // (b^c)&=(a^b)
556         eor     x10,x10,x3,ror#61
557         eor     x11,x11,x6,lsr#7        // sigma0(X[i+1])
558         add     x26,x26,x16                     // h+=Sigma1(e)
559         eor     x19,x19,x20                     // Maj(a,b,c)
560         eor     x17,x12,x27,ror#39      // Sigma0(a)
561         eor     x10,x10,x3,lsr#6        // sigma1(X[i+14])
562         add     x5,x5,x14
563         add     x22,x22,x26                     // d+=h
564         add     x26,x26,x19                     // h+=Maj(a,b,c)
565         ldr     x19,[x30],#8            // *K++, x28 in next round
566         add     x5,x5,x11
567         add     x26,x26,x17                     // h+=Sigma0(a)
568         add     x5,x5,x10
569         ldr     x10,[sp,#24]
570         str     x13,[sp,#16]
571         ror     x16,x22,#14
572         add     x25,x25,x19                     // h+=K[i]
573         ror     x12,x7,#1
574         and     x17,x23,x22
575         ror     x11,x4,#19
576         bic     x19,x24,x22
577         ror     x13,x26,#28
578         add     x25,x25,x5                      // h+=X[i]
579         eor     x16,x16,x22,ror#18
580         eor     x12,x12,x7,ror#8
581         orr     x17,x17,x19                     // Ch(e,f,g)
582         eor     x19,x26,x27                     // a^b, b^c in next round
583         eor     x16,x16,x22,ror#41      // Sigma1(e)
584         eor     x13,x13,x26,ror#34
585         add     x25,x25,x17                     // h+=Ch(e,f,g)
586         and     x28,x28,x19                     // (b^c)&=(a^b)
587         eor     x11,x11,x4,ror#61
588         eor     x12,x12,x7,lsr#7        // sigma0(X[i+1])
589         add     x25,x25,x16                     // h+=Sigma1(e)
590         eor     x28,x28,x27                     // Maj(a,b,c)
591         eor     x17,x13,x26,ror#39      // Sigma0(a)
592         eor     x11,x11,x4,lsr#6        // sigma1(X[i+14])
593         add     x6,x6,x15
594         add     x21,x21,x25                     // d+=h
595         add     x25,x25,x28                     // h+=Maj(a,b,c)
596         ldr     x28,[x30],#8            // *K++, x19 in next round
597         add     x6,x6,x12
598         add     x25,x25,x17                     // h+=Sigma0(a)
599         add     x6,x6,x11
600         ldr     x11,[sp,#0]
601         str     x14,[sp,#24]
602         ror     x16,x21,#14
603         add     x24,x24,x28                     // h+=K[i]
604         ror     x13,x8,#1
605         and     x17,x22,x21
606         ror     x12,x5,#19
607         bic     x28,x23,x21
608         ror     x14,x25,#28
609         add     x24,x24,x6                      // h+=X[i]
610         eor     x16,x16,x21,ror#18
611         eor     x13,x13,x8,ror#8
612         orr     x17,x17,x28                     // Ch(e,f,g)
613         eor     x28,x25,x26                     // a^b, b^c in next round
614         eor     x16,x16,x21,ror#41      // Sigma1(e)
615         eor     x14,x14,x25,ror#34
616         add     x24,x24,x17                     // h+=Ch(e,f,g)
617         and     x19,x19,x28                     // (b^c)&=(a^b)
618         eor     x12,x12,x5,ror#61
619         eor     x13,x13,x8,lsr#7        // sigma0(X[i+1])
620         add     x24,x24,x16                     // h+=Sigma1(e)
621         eor     x19,x19,x26                     // Maj(a,b,c)
622         eor     x17,x14,x25,ror#39      // Sigma0(a)
623         eor     x12,x12,x5,lsr#6        // sigma1(X[i+14])
624         add     x7,x7,x0
625         add     x20,x20,x24                     // d+=h
626         add     x24,x24,x19                     // h+=Maj(a,b,c)
627         ldr     x19,[x30],#8            // *K++, x28 in next round
628         add     x7,x7,x13
629         add     x24,x24,x17                     // h+=Sigma0(a)
630         add     x7,x7,x12
631         ldr     x12,[sp,#8]
632         str     x15,[sp,#0]
633         ror     x16,x20,#14
634         add     x23,x23,x19                     // h+=K[i]
635         ror     x14,x9,#1
636         and     x17,x21,x20
637         ror     x13,x6,#19
638         bic     x19,x22,x20
639         ror     x15,x24,#28
640         add     x23,x23,x7                      // h+=X[i]
641         eor     x16,x16,x20,ror#18
642         eor     x14,x14,x9,ror#8
643         orr     x17,x17,x19                     // Ch(e,f,g)
644         eor     x19,x24,x25                     // a^b, b^c in next round
645         eor     x16,x16,x20,ror#41      // Sigma1(e)
646         eor     x15,x15,x24,ror#34
647         add     x23,x23,x17                     // h+=Ch(e,f,g)
648         and     x28,x28,x19                     // (b^c)&=(a^b)
649         eor     x13,x13,x6,ror#61
650         eor     x14,x14,x9,lsr#7        // sigma0(X[i+1])
651         add     x23,x23,x16                     // h+=Sigma1(e)
652         eor     x28,x28,x25                     // Maj(a,b,c)
653         eor     x17,x15,x24,ror#39      // Sigma0(a)
654         eor     x13,x13,x6,lsr#6        // sigma1(X[i+14])
655         add     x8,x8,x1
656         add     x27,x27,x23                     // d+=h
657         add     x23,x23,x28                     // h+=Maj(a,b,c)
658         ldr     x28,[x30],#8            // *K++, x19 in next round
659         add     x8,x8,x14
660         add     x23,x23,x17                     // h+=Sigma0(a)
661         add     x8,x8,x13
662         ldr     x13,[sp,#16]
663         str     x0,[sp,#8]
664         ror     x16,x27,#14
665         add     x22,x22,x28                     // h+=K[i]
666         ror     x15,x10,#1
667         and     x17,x20,x27
668         ror     x14,x7,#19
669         bic     x28,x21,x27
670         ror     x0,x23,#28
671         add     x22,x22,x8                      // h+=X[i]
672         eor     x16,x16,x27,ror#18
673         eor     x15,x15,x10,ror#8
674         orr     x17,x17,x28                     // Ch(e,f,g)
675         eor     x28,x23,x24                     // a^b, b^c in next round
676         eor     x16,x16,x27,ror#41      // Sigma1(e)
677         eor     x0,x0,x23,ror#34
678         add     x22,x22,x17                     // h+=Ch(e,f,g)
679         and     x19,x19,x28                     // (b^c)&=(a^b)
680         eor     x14,x14,x7,ror#61
681         eor     x15,x15,x10,lsr#7       // sigma0(X[i+1])
682         add     x22,x22,x16                     // h+=Sigma1(e)
683         eor     x19,x19,x24                     // Maj(a,b,c)
684         eor     x17,x0,x23,ror#39       // Sigma0(a)
685         eor     x14,x14,x7,lsr#6        // sigma1(X[i+14])
686         add     x9,x9,x2
687         add     x26,x26,x22                     // d+=h
688         add     x22,x22,x19                     // h+=Maj(a,b,c)
689         ldr     x19,[x30],#8            // *K++, x28 in next round
690         add     x9,x9,x15
691         add     x22,x22,x17                     // h+=Sigma0(a)
692         add     x9,x9,x14
693         ldr     x14,[sp,#24]
694         str     x1,[sp,#16]
695         ror     x16,x26,#14
696         add     x21,x21,x19                     // h+=K[i]
697         ror     x0,x11,#1
698         and     x17,x27,x26
699         ror     x15,x8,#19
700         bic     x19,x20,x26
701         ror     x1,x22,#28
702         add     x21,x21,x9                      // h+=X[i]
703         eor     x16,x16,x26,ror#18
704         eor     x0,x0,x11,ror#8
705         orr     x17,x17,x19                     // Ch(e,f,g)
706         eor     x19,x22,x23                     // a^b, b^c in next round
707         eor     x16,x16,x26,ror#41      // Sigma1(e)
708         eor     x1,x1,x22,ror#34
709         add     x21,x21,x17                     // h+=Ch(e,f,g)
710         and     x28,x28,x19                     // (b^c)&=(a^b)
711         eor     x15,x15,x8,ror#61
712         eor     x0,x0,x11,lsr#7 // sigma0(X[i+1])
713         add     x21,x21,x16                     // h+=Sigma1(e)
714         eor     x28,x28,x23                     // Maj(a,b,c)
715         eor     x17,x1,x22,ror#39       // Sigma0(a)
716         eor     x15,x15,x8,lsr#6        // sigma1(X[i+14])
717         add     x10,x10,x3
718         add     x25,x25,x21                     // d+=h
719         add     x21,x21,x28                     // h+=Maj(a,b,c)
720         ldr     x28,[x30],#8            // *K++, x19 in next round
721         add     x10,x10,x0
722         add     x21,x21,x17                     // h+=Sigma0(a)
723         add     x10,x10,x15
724         ldr     x15,[sp,#0]
725         str     x2,[sp,#24]
726         ror     x16,x25,#14
727         add     x20,x20,x28                     // h+=K[i]
728         ror     x1,x12,#1
729         and     x17,x26,x25
730         ror     x0,x9,#19
731         bic     x28,x27,x25
732         ror     x2,x21,#28
733         add     x20,x20,x10                     // h+=X[i]
734         eor     x16,x16,x25,ror#18
735         eor     x1,x1,x12,ror#8
736         orr     x17,x17,x28                     // Ch(e,f,g)
737         eor     x28,x21,x22                     // a^b, b^c in next round
738         eor     x16,x16,x25,ror#41      // Sigma1(e)
739         eor     x2,x2,x21,ror#34
740         add     x20,x20,x17                     // h+=Ch(e,f,g)
741         and     x19,x19,x28                     // (b^c)&=(a^b)
742         eor     x0,x0,x9,ror#61
743         eor     x1,x1,x12,lsr#7 // sigma0(X[i+1])
744         add     x20,x20,x16                     // h+=Sigma1(e)
745         eor     x19,x19,x22                     // Maj(a,b,c)
746         eor     x17,x2,x21,ror#39       // Sigma0(a)
747         eor     x0,x0,x9,lsr#6  // sigma1(X[i+14])
748         add     x11,x11,x4
749         add     x24,x24,x20                     // d+=h
750         add     x20,x20,x19                     // h+=Maj(a,b,c)
751         ldr     x19,[x30],#8            // *K++, x28 in next round
752         add     x11,x11,x1
753         add     x20,x20,x17                     // h+=Sigma0(a)
754         add     x11,x11,x0
755         ldr     x0,[sp,#8]
756         str     x3,[sp,#0]
757         ror     x16,x24,#14
758         add     x27,x27,x19                     // h+=K[i]
759         ror     x2,x13,#1
760         and     x17,x25,x24
761         ror     x1,x10,#19
762         bic     x19,x26,x24
763         ror     x3,x20,#28
764         add     x27,x27,x11                     // h+=X[i]
765         eor     x16,x16,x24,ror#18
766         eor     x2,x2,x13,ror#8
767         orr     x17,x17,x19                     // Ch(e,f,g)
768         eor     x19,x20,x21                     // a^b, b^c in next round
769         eor     x16,x16,x24,ror#41      // Sigma1(e)
770         eor     x3,x3,x20,ror#34
771         add     x27,x27,x17                     // h+=Ch(e,f,g)
772         and     x28,x28,x19                     // (b^c)&=(a^b)
773         eor     x1,x1,x10,ror#61
774         eor     x2,x2,x13,lsr#7 // sigma0(X[i+1])
775         add     x27,x27,x16                     // h+=Sigma1(e)
776         eor     x28,x28,x21                     // Maj(a,b,c)
777         eor     x17,x3,x20,ror#39       // Sigma0(a)
778         eor     x1,x1,x10,lsr#6 // sigma1(X[i+14])
779         add     x12,x12,x5
780         add     x23,x23,x27                     // d+=h
781         add     x27,x27,x28                     // h+=Maj(a,b,c)
782         ldr     x28,[x30],#8            // *K++, x19 in next round
783         add     x12,x12,x2
784         add     x27,x27,x17                     // h+=Sigma0(a)
785         add     x12,x12,x1
786         ldr     x1,[sp,#16]
787         str     x4,[sp,#8]
788         ror     x16,x23,#14
789         add     x26,x26,x28                     // h+=K[i]
790         ror     x3,x14,#1
791         and     x17,x24,x23
792         ror     x2,x11,#19
793         bic     x28,x25,x23
794         ror     x4,x27,#28
795         add     x26,x26,x12                     // h+=X[i]
796         eor     x16,x16,x23,ror#18
797         eor     x3,x3,x14,ror#8
798         orr     x17,x17,x28                     // Ch(e,f,g)
799         eor     x28,x27,x20                     // a^b, b^c in next round
800         eor     x16,x16,x23,ror#41      // Sigma1(e)
801         eor     x4,x4,x27,ror#34
802         add     x26,x26,x17                     // h+=Ch(e,f,g)
803         and     x19,x19,x28                     // (b^c)&=(a^b)
804         eor     x2,x2,x11,ror#61
805         eor     x3,x3,x14,lsr#7 // sigma0(X[i+1])
806         add     x26,x26,x16                     // h+=Sigma1(e)
807         eor     x19,x19,x20                     // Maj(a,b,c)
808         eor     x17,x4,x27,ror#39       // Sigma0(a)
809         eor     x2,x2,x11,lsr#6 // sigma1(X[i+14])
810         add     x13,x13,x6
811         add     x22,x22,x26                     // d+=h
812         add     x26,x26,x19                     // h+=Maj(a,b,c)
813         ldr     x19,[x30],#8            // *K++, x28 in next round
814         add     x13,x13,x3
815         add     x26,x26,x17                     // h+=Sigma0(a)
816         add     x13,x13,x2
817         ldr     x2,[sp,#24]
818         str     x5,[sp,#16]
819         ror     x16,x22,#14
820         add     x25,x25,x19                     // h+=K[i]
821         ror     x4,x15,#1
822         and     x17,x23,x22
823         ror     x3,x12,#19
824         bic     x19,x24,x22
825         ror     x5,x26,#28
826         add     x25,x25,x13                     // h+=X[i]
827         eor     x16,x16,x22,ror#18
828         eor     x4,x4,x15,ror#8
829         orr     x17,x17,x19                     // Ch(e,f,g)
830         eor     x19,x26,x27                     // a^b, b^c in next round
831         eor     x16,x16,x22,ror#41      // Sigma1(e)
832         eor     x5,x5,x26,ror#34
833         add     x25,x25,x17                     // h+=Ch(e,f,g)
834         and     x28,x28,x19                     // (b^c)&=(a^b)
835         eor     x3,x3,x12,ror#61
836         eor     x4,x4,x15,lsr#7 // sigma0(X[i+1])
837         add     x25,x25,x16                     // h+=Sigma1(e)
838         eor     x28,x28,x27                     // Maj(a,b,c)
839         eor     x17,x5,x26,ror#39       // Sigma0(a)
840         eor     x3,x3,x12,lsr#6 // sigma1(X[i+14])
841         add     x14,x14,x7
842         add     x21,x21,x25                     // d+=h
843         add     x25,x25,x28                     // h+=Maj(a,b,c)
844         ldr     x28,[x30],#8            // *K++, x19 in next round
845         add     x14,x14,x4
846         add     x25,x25,x17                     // h+=Sigma0(a)
847         add     x14,x14,x3
848         ldr     x3,[sp,#0]
849         str     x6,[sp,#24]
850         ror     x16,x21,#14
851         add     x24,x24,x28                     // h+=K[i]
852         ror     x5,x0,#1
853         and     x17,x22,x21
854         ror     x4,x13,#19
855         bic     x28,x23,x21
856         ror     x6,x25,#28
857         add     x24,x24,x14                     // h+=X[i]
858         eor     x16,x16,x21,ror#18
859         eor     x5,x5,x0,ror#8
860         orr     x17,x17,x28                     // Ch(e,f,g)
861         eor     x28,x25,x26                     // a^b, b^c in next round
862         eor     x16,x16,x21,ror#41      // Sigma1(e)
863         eor     x6,x6,x25,ror#34
864         add     x24,x24,x17                     // h+=Ch(e,f,g)
865         and     x19,x19,x28                     // (b^c)&=(a^b)
866         eor     x4,x4,x13,ror#61
867         eor     x5,x5,x0,lsr#7  // sigma0(X[i+1])
868         add     x24,x24,x16                     // h+=Sigma1(e)
869         eor     x19,x19,x26                     // Maj(a,b,c)
870         eor     x17,x6,x25,ror#39       // Sigma0(a)
871         eor     x4,x4,x13,lsr#6 // sigma1(X[i+14])
872         add     x15,x15,x8
873         add     x20,x20,x24                     // d+=h
874         add     x24,x24,x19                     // h+=Maj(a,b,c)
875         ldr     x19,[x30],#8            // *K++, x28 in next round
876         add     x15,x15,x5
877         add     x24,x24,x17                     // h+=Sigma0(a)
878         add     x15,x15,x4
879         ldr     x4,[sp,#8]
880         str     x7,[sp,#0]
881         ror     x16,x20,#14
882         add     x23,x23,x19                     // h+=K[i]
883         ror     x6,x1,#1
884         and     x17,x21,x20
885         ror     x5,x14,#19
886         bic     x19,x22,x20
887         ror     x7,x24,#28
888         add     x23,x23,x15                     // h+=X[i]
889         eor     x16,x16,x20,ror#18
890         eor     x6,x6,x1,ror#8
891         orr     x17,x17,x19                     // Ch(e,f,g)
892         eor     x19,x24,x25                     // a^b, b^c in next round
893         eor     x16,x16,x20,ror#41      // Sigma1(e)
894         eor     x7,x7,x24,ror#34
895         add     x23,x23,x17                     // h+=Ch(e,f,g)
896         and     x28,x28,x19                     // (b^c)&=(a^b)
897         eor     x5,x5,x14,ror#61
898         eor     x6,x6,x1,lsr#7  // sigma0(X[i+1])
899         add     x23,x23,x16                     // h+=Sigma1(e)
900         eor     x28,x28,x25                     // Maj(a,b,c)
901         eor     x17,x7,x24,ror#39       // Sigma0(a)
902         eor     x5,x5,x14,lsr#6 // sigma1(X[i+14])
903         add     x0,x0,x9
904         add     x27,x27,x23                     // d+=h
905         add     x23,x23,x28                     // h+=Maj(a,b,c)
906         ldr     x28,[x30],#8            // *K++, x19 in next round
907         add     x0,x0,x6
908         add     x23,x23,x17                     // h+=Sigma0(a)
909         add     x0,x0,x5
910         ldr     x5,[sp,#16]
911         str     x8,[sp,#8]
912         ror     x16,x27,#14
913         add     x22,x22,x28                     // h+=K[i]
914         ror     x7,x2,#1
915         and     x17,x20,x27
916         ror     x6,x15,#19
917         bic     x28,x21,x27
918         ror     x8,x23,#28
919         add     x22,x22,x0                      // h+=X[i]
920         eor     x16,x16,x27,ror#18
921         eor     x7,x7,x2,ror#8
922         orr     x17,x17,x28                     // Ch(e,f,g)
923         eor     x28,x23,x24                     // a^b, b^c in next round
924         eor     x16,x16,x27,ror#41      // Sigma1(e)
925         eor     x8,x8,x23,ror#34
926         add     x22,x22,x17                     // h+=Ch(e,f,g)
927         and     x19,x19,x28                     // (b^c)&=(a^b)
928         eor     x6,x6,x15,ror#61
929         eor     x7,x7,x2,lsr#7  // sigma0(X[i+1])
930         add     x22,x22,x16                     // h+=Sigma1(e)
931         eor     x19,x19,x24                     // Maj(a,b,c)
932         eor     x17,x8,x23,ror#39       // Sigma0(a)
933         eor     x6,x6,x15,lsr#6 // sigma1(X[i+14])
934         add     x1,x1,x10
935         add     x26,x26,x22                     // d+=h
936         add     x22,x22,x19                     // h+=Maj(a,b,c)
937         ldr     x19,[x30],#8            // *K++, x28 in next round
938         add     x1,x1,x7
939         add     x22,x22,x17                     // h+=Sigma0(a)
940         add     x1,x1,x6
941         ldr     x6,[sp,#24]
942         str     x9,[sp,#16]
943         ror     x16,x26,#14
944         add     x21,x21,x19                     // h+=K[i]
945         ror     x8,x3,#1
946         and     x17,x27,x26
947         ror     x7,x0,#19
948         bic     x19,x20,x26
949         ror     x9,x22,#28
950         add     x21,x21,x1                      // h+=X[i]
951         eor     x16,x16,x26,ror#18
952         eor     x8,x8,x3,ror#8
953         orr     x17,x17,x19                     // Ch(e,f,g)
954         eor     x19,x22,x23                     // a^b, b^c in next round
955         eor     x16,x16,x26,ror#41      // Sigma1(e)
956         eor     x9,x9,x22,ror#34
957         add     x21,x21,x17                     // h+=Ch(e,f,g)
958         and     x28,x28,x19                     // (b^c)&=(a^b)
959         eor     x7,x7,x0,ror#61
960         eor     x8,x8,x3,lsr#7  // sigma0(X[i+1])
961         add     x21,x21,x16                     // h+=Sigma1(e)
962         eor     x28,x28,x23                     // Maj(a,b,c)
963         eor     x17,x9,x22,ror#39       // Sigma0(a)
964         eor     x7,x7,x0,lsr#6  // sigma1(X[i+14])
965         add     x2,x2,x11
966         add     x25,x25,x21                     // d+=h
967         add     x21,x21,x28                     // h+=Maj(a,b,c)
968         ldr     x28,[x30],#8            // *K++, x19 in next round
969         add     x2,x2,x8
970         add     x21,x21,x17                     // h+=Sigma0(a)
971         add     x2,x2,x7
972         ldr     x7,[sp,#0]
973         str     x10,[sp,#24]
974         ror     x16,x25,#14
975         add     x20,x20,x28                     // h+=K[i]
976         ror     x9,x4,#1
977         and     x17,x26,x25
978         ror     x8,x1,#19
979         bic     x28,x27,x25
980         ror     x10,x21,#28
981         add     x20,x20,x2                      // h+=X[i]
982         eor     x16,x16,x25,ror#18
983         eor     x9,x9,x4,ror#8
984         orr     x17,x17,x28                     // Ch(e,f,g)
985         eor     x28,x21,x22                     // a^b, b^c in next round
986         eor     x16,x16,x25,ror#41      // Sigma1(e)
987         eor     x10,x10,x21,ror#34
988         add     x20,x20,x17                     // h+=Ch(e,f,g)
989         and     x19,x19,x28                     // (b^c)&=(a^b)
990         eor     x8,x8,x1,ror#61
991         eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
992         add     x20,x20,x16                     // h+=Sigma1(e)
993         eor     x19,x19,x22                     // Maj(a,b,c)
994         eor     x17,x10,x21,ror#39      // Sigma0(a)
995         eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
996         add     x3,x3,x12
997         add     x24,x24,x20                     // d+=h
998         add     x20,x20,x19                     // h+=Maj(a,b,c)
999         ldr     x19,[x30],#8            // *K++, x28 in next round
1000         add     x3,x3,x9
1001         add     x20,x20,x17                     // h+=Sigma0(a)
1002         add     x3,x3,x8
1003         cbnz    x19,.Loop_16_xx
1005         ldp     x0,x2,[x29,#96]
1006         ldr     x1,[x29,#112]
1007         sub     x30,x30,#648            // rewind
1009         ldp     x3,x4,[x0]
1010         ldp     x5,x6,[x0,#2*8]
1011         add     x1,x1,#14*8                     // advance input pointer
1012         ldp     x7,x8,[x0,#4*8]
1013         add     x20,x20,x3
1014         ldp     x9,x10,[x0,#6*8]
1015         add     x21,x21,x4
1016         add     x22,x22,x5
1017         add     x23,x23,x6
1018         stp     x20,x21,[x0]
1019         add     x24,x24,x7
1020         add     x25,x25,x8
1021         stp     x22,x23,[x0,#2*8]
1022         add     x26,x26,x9
1023         add     x27,x27,x10
1024         cmp     x1,x2
1025         stp     x24,x25,[x0,#4*8]
1026         stp     x26,x27,[x0,#6*8]
1027         b.ne    .Loop
1029         ldp     x19,x20,[x29,#16]
1030         add     sp,sp,#4*8
1031         ldp     x21,x22,[x29,#32]
1032         ldp     x23,x24,[x29,#48]
1033         ldp     x25,x26,[x29,#64]
1034         ldp     x27,x28,[x29,#80]
1035         ldp     x29,x30,[sp],#128
1036         ret
1037 .size   zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
1040 .globl  zfs_sha512_block_armv8
1041 .type   zfs_sha512_block_armv8,%function
1042 .align  6
1043 zfs_sha512_block_armv8:
1044         hint            #34                             // bti c
1045 .Lv8_entry:
1046         // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
1047         stp             x29,x30,[sp,#-16]!
1048         add             x29,sp,#0
1050         ld1             {v16.16b-v19.16b},[x1],#64      // load input
1051         ld1             {v20.16b-v23.16b},[x1],#64
1053         ld1             {v0.2d-v3.2d},[x0]              // load context
1054         adr             x3,.LK512
1056         rev64           v16.16b,v16.16b
1057         rev64           v17.16b,v17.16b
1058         rev64           v18.16b,v18.16b
1059         rev64           v19.16b,v19.16b
1060         rev64           v20.16b,v20.16b
1061         rev64           v21.16b,v21.16b
1062         rev64           v22.16b,v22.16b
1063         rev64           v23.16b,v23.16b
1064         b               .Loop_hw
1066 .align  4
1067 .Loop_hw:
1068         ld1             {v24.2d},[x3],#16
1069         subs            x2,x2,#1
1070         sub             x4,x1,#128
1071         orr             v26.16b,v0.16b,v0.16b                   // offload
1072         orr             v27.16b,v1.16b,v1.16b
1073         orr             v28.16b,v2.16b,v2.16b
1074         orr             v29.16b,v3.16b,v3.16b
1075         csel            x1,x1,x4,ne                     // conditional rewind
1076         add             v24.2d,v24.2d,v16.2d
1077         ld1             {v25.2d},[x3],#16
1078         ext             v24.16b,v24.16b,v24.16b,#8
1079         ext             v5.16b,v2.16b,v3.16b,#8
1080         ext             v6.16b,v1.16b,v2.16b,#8
1081         add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1082          .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
1083          ext            v7.16b,v20.16b,v21.16b,#8
1084         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1085          .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1086         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1087         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1088         add             v25.2d,v25.2d,v17.2d
1089         ld1             {v24.2d},[x3],#16
1090         ext             v25.16b,v25.16b,v25.16b,#8
1091         ext             v5.16b,v4.16b,v2.16b,#8
1092         ext             v6.16b,v0.16b,v4.16b,#8
1093         add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1094          .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
1095          ext            v7.16b,v21.16b,v22.16b,#8
1096         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1097          .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1098         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1099         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1100         add             v24.2d,v24.2d,v18.2d
1101         ld1             {v25.2d},[x3],#16
1102         ext             v24.16b,v24.16b,v24.16b,#8
1103         ext             v5.16b,v1.16b,v4.16b,#8
1104         ext             v6.16b,v3.16b,v1.16b,#8
1105         add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1106          .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
1107          ext            v7.16b,v22.16b,v23.16b,#8
1108         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1109          .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1110         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1111         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1112         add             v25.2d,v25.2d,v19.2d
1113         ld1             {v24.2d},[x3],#16
1114         ext             v25.16b,v25.16b,v25.16b,#8
1115         ext             v5.16b,v0.16b,v1.16b,#8
1116         ext             v6.16b,v2.16b,v0.16b,#8
1117         add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1118          .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
1119          ext            v7.16b,v23.16b,v16.16b,#8
1120         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1121          .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1122         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1123         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1124         add             v24.2d,v24.2d,v20.2d
1125         ld1             {v25.2d},[x3],#16
1126         ext             v24.16b,v24.16b,v24.16b,#8
1127         ext             v5.16b,v3.16b,v0.16b,#8
1128         ext             v6.16b,v4.16b,v3.16b,#8
1129         add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1130          .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
1131          ext            v7.16b,v16.16b,v17.16b,#8
1132         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1133          .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1134         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1135         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1136         add             v25.2d,v25.2d,v21.2d
1137         ld1             {v24.2d},[x3],#16
1138         ext             v25.16b,v25.16b,v25.16b,#8
1139         ext             v5.16b,v2.16b,v3.16b,#8
1140         ext             v6.16b,v1.16b,v2.16b,#8
1141         add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1142          .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
1143          ext            v7.16b,v17.16b,v18.16b,#8
1144         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1145          .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1146         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1147         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1148         add             v24.2d,v24.2d,v22.2d
1149         ld1             {v25.2d},[x3],#16
1150         ext             v24.16b,v24.16b,v24.16b,#8
1151         ext             v5.16b,v4.16b,v2.16b,#8
1152         ext             v6.16b,v0.16b,v4.16b,#8
1153         add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1154          .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
1155          ext            v7.16b,v18.16b,v19.16b,#8
1156         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1157          .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1158         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1159         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1160         add             v25.2d,v25.2d,v23.2d
1161         ld1             {v24.2d},[x3],#16
1162         ext             v25.16b,v25.16b,v25.16b,#8
1163         ext             v5.16b,v1.16b,v4.16b,#8
1164         ext             v6.16b,v3.16b,v1.16b,#8
1165         add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1166          .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
1167          ext            v7.16b,v19.16b,v20.16b,#8
1168         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1169          .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1170         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1171         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1172         add             v24.2d,v24.2d,v16.2d
1173         ld1             {v25.2d},[x3],#16
1174         ext             v24.16b,v24.16b,v24.16b,#8
1175         ext             v5.16b,v0.16b,v1.16b,#8
1176         ext             v6.16b,v2.16b,v0.16b,#8
1177         add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1178          .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
1179          ext            v7.16b,v20.16b,v21.16b,#8
1180         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1181          .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1182         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1183         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1184         add             v25.2d,v25.2d,v17.2d
1185         ld1             {v24.2d},[x3],#16
1186         ext             v25.16b,v25.16b,v25.16b,#8
1187         ext             v5.16b,v3.16b,v0.16b,#8
1188         ext             v6.16b,v4.16b,v3.16b,#8
1189         add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1190          .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
1191          ext            v7.16b,v21.16b,v22.16b,#8
1192         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1193          .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1194         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1195         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1196         add             v24.2d,v24.2d,v18.2d
1197         ld1             {v25.2d},[x3],#16
1198         ext             v24.16b,v24.16b,v24.16b,#8
1199         ext             v5.16b,v2.16b,v3.16b,#8
1200         ext             v6.16b,v1.16b,v2.16b,#8
1201         add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1202          .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
1203          ext            v7.16b,v22.16b,v23.16b,#8
1204         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1205          .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1206         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1207         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1208         add             v25.2d,v25.2d,v19.2d
1209         ld1             {v24.2d},[x3],#16
1210         ext             v25.16b,v25.16b,v25.16b,#8
1211         ext             v5.16b,v4.16b,v2.16b,#8
1212         ext             v6.16b,v0.16b,v4.16b,#8
1213         add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1214          .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
1215          ext            v7.16b,v23.16b,v16.16b,#8
1216         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1217          .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1218         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1219         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1220         add             v24.2d,v24.2d,v20.2d
1221         ld1             {v25.2d},[x3],#16
1222         ext             v24.16b,v24.16b,v24.16b,#8
1223         ext             v5.16b,v1.16b,v4.16b,#8
1224         ext             v6.16b,v3.16b,v1.16b,#8
1225         add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1226          .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
1227          ext            v7.16b,v16.16b,v17.16b,#8
1228         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1229          .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1230         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1231         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1232         add             v25.2d,v25.2d,v21.2d
1233         ld1             {v24.2d},[x3],#16
1234         ext             v25.16b,v25.16b,v25.16b,#8
1235         ext             v5.16b,v0.16b,v1.16b,#8
1236         ext             v6.16b,v2.16b,v0.16b,#8
1237         add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1238          .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
1239          ext            v7.16b,v17.16b,v18.16b,#8
1240         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1241          .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1242         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1243         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1244         add             v24.2d,v24.2d,v22.2d
1245         ld1             {v25.2d},[x3],#16
1246         ext             v24.16b,v24.16b,v24.16b,#8
1247         ext             v5.16b,v3.16b,v0.16b,#8
1248         ext             v6.16b,v4.16b,v3.16b,#8
1249         add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1250          .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
1251          ext            v7.16b,v18.16b,v19.16b,#8
1252         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1253          .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1254         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1255         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1256         add             v25.2d,v25.2d,v23.2d
1257         ld1             {v24.2d},[x3],#16
1258         ext             v25.16b,v25.16b,v25.16b,#8
1259         ext             v5.16b,v2.16b,v3.16b,#8
1260         ext             v6.16b,v1.16b,v2.16b,#8
1261         add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1262          .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
1263          ext            v7.16b,v19.16b,v20.16b,#8
1264         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1265          .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1266         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1267         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1268         add             v24.2d,v24.2d,v16.2d
1269         ld1             {v25.2d},[x3],#16
1270         ext             v24.16b,v24.16b,v24.16b,#8
1271         ext             v5.16b,v4.16b,v2.16b,#8
1272         ext             v6.16b,v0.16b,v4.16b,#8
1273         add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1274          .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
1275          ext            v7.16b,v20.16b,v21.16b,#8
1276         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1277          .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1278         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1279         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1280         add             v25.2d,v25.2d,v17.2d
1281         ld1             {v24.2d},[x3],#16
1282         ext             v25.16b,v25.16b,v25.16b,#8
1283         ext             v5.16b,v1.16b,v4.16b,#8
1284         ext             v6.16b,v3.16b,v1.16b,#8
1285         add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1286          .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
1287          ext            v7.16b,v21.16b,v22.16b,#8
1288         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1289          .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1290         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1291         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1292         add             v24.2d,v24.2d,v18.2d
1293         ld1             {v25.2d},[x3],#16
1294         ext             v24.16b,v24.16b,v24.16b,#8
1295         ext             v5.16b,v0.16b,v1.16b,#8
1296         ext             v6.16b,v2.16b,v0.16b,#8
1297         add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1298          .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
1299          ext            v7.16b,v22.16b,v23.16b,#8
1300         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1301          .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1302         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1303         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1304         add             v25.2d,v25.2d,v19.2d
1305         ld1             {v24.2d},[x3],#16
1306         ext             v25.16b,v25.16b,v25.16b,#8
1307         ext             v5.16b,v3.16b,v0.16b,#8
1308         ext             v6.16b,v4.16b,v3.16b,#8
1309         add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1310          .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
1311          ext            v7.16b,v23.16b,v16.16b,#8
1312         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1313          .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1314         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1315         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1316         add             v24.2d,v24.2d,v20.2d
1317         ld1             {v25.2d},[x3],#16
1318         ext             v24.16b,v24.16b,v24.16b,#8
1319         ext             v5.16b,v2.16b,v3.16b,#8
1320         ext             v6.16b,v1.16b,v2.16b,#8
1321         add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1322          .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
1323          ext            v7.16b,v16.16b,v17.16b,#8
1324         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1325          .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1326         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1327         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1328         add             v25.2d,v25.2d,v21.2d
1329         ld1             {v24.2d},[x3],#16
1330         ext             v25.16b,v25.16b,v25.16b,#8
1331         ext             v5.16b,v4.16b,v2.16b,#8
1332         ext             v6.16b,v0.16b,v4.16b,#8
1333         add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1334          .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
1335          ext            v7.16b,v17.16b,v18.16b,#8
1336         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1337          .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1338         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1339         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1340         add             v24.2d,v24.2d,v22.2d
1341         ld1             {v25.2d},[x3],#16
1342         ext             v24.16b,v24.16b,v24.16b,#8
1343         ext             v5.16b,v1.16b,v4.16b,#8
1344         ext             v6.16b,v3.16b,v1.16b,#8
1345         add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1346          .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
1347          ext            v7.16b,v18.16b,v19.16b,#8
1348         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1349          .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1350         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1351         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1352         add             v25.2d,v25.2d,v23.2d
1353         ld1             {v24.2d},[x3],#16
1354         ext             v25.16b,v25.16b,v25.16b,#8
1355         ext             v5.16b,v0.16b,v1.16b,#8
1356         ext             v6.16b,v2.16b,v0.16b,#8
1357         add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1358          .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
1359          ext            v7.16b,v19.16b,v20.16b,#8
1360         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1361          .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1362         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1363         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1364         add             v24.2d,v24.2d,v16.2d
1365         ld1             {v25.2d},[x3],#16
1366         ext             v24.16b,v24.16b,v24.16b,#8
1367         ext             v5.16b,v3.16b,v0.16b,#8
1368         ext             v6.16b,v4.16b,v3.16b,#8
1369         add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1370          .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
1371          ext            v7.16b,v20.16b,v21.16b,#8
1372         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1373          .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1374         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1375         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1376         add             v25.2d,v25.2d,v17.2d
1377         ld1             {v24.2d},[x3],#16
1378         ext             v25.16b,v25.16b,v25.16b,#8
1379         ext             v5.16b,v2.16b,v3.16b,#8
1380         ext             v6.16b,v1.16b,v2.16b,#8
1381         add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1382          .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
1383          ext            v7.16b,v21.16b,v22.16b,#8
1384         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1385          .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1386         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1387         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1388         add             v24.2d,v24.2d,v18.2d
1389         ld1             {v25.2d},[x3],#16
1390         ext             v24.16b,v24.16b,v24.16b,#8
1391         ext             v5.16b,v4.16b,v2.16b,#8
1392         ext             v6.16b,v0.16b,v4.16b,#8
1393         add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1394          .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
1395          ext            v7.16b,v22.16b,v23.16b,#8
1396         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1397          .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1398         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1399         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1400         add             v25.2d,v25.2d,v19.2d
1401         ld1             {v24.2d},[x3],#16
1402         ext             v25.16b,v25.16b,v25.16b,#8
1403         ext             v5.16b,v1.16b,v4.16b,#8
1404         ext             v6.16b,v3.16b,v1.16b,#8
1405         add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1406          .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
1407          ext            v7.16b,v23.16b,v16.16b,#8
1408         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1409          .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1410         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1411         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1412         add             v24.2d,v24.2d,v20.2d
1413         ld1             {v25.2d},[x3],#16
1414         ext             v24.16b,v24.16b,v24.16b,#8
1415         ext             v5.16b,v0.16b,v1.16b,#8
1416         ext             v6.16b,v2.16b,v0.16b,#8
1417         add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1418          .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
1419          ext            v7.16b,v16.16b,v17.16b,#8
1420         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1421          .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1422         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1423         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1424         add             v25.2d,v25.2d,v21.2d
1425         ld1             {v24.2d},[x3],#16
1426         ext             v25.16b,v25.16b,v25.16b,#8
1427         ext             v5.16b,v3.16b,v0.16b,#8
1428         ext             v6.16b,v4.16b,v3.16b,#8
1429         add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1430          .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
1431          ext            v7.16b,v17.16b,v18.16b,#8
1432         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1433          .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1434         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1435         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1436         add             v24.2d,v24.2d,v22.2d
1437         ld1             {v25.2d},[x3],#16
1438         ext             v24.16b,v24.16b,v24.16b,#8
1439         ext             v5.16b,v2.16b,v3.16b,#8
1440         ext             v6.16b,v1.16b,v2.16b,#8
1441         add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1442          .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
1443          ext            v7.16b,v18.16b,v19.16b,#8
1444         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1445          .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1446         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1447         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1448         add             v25.2d,v25.2d,v23.2d
1449         ld1             {v24.2d},[x3],#16
1450         ext             v25.16b,v25.16b,v25.16b,#8
1451         ext             v5.16b,v4.16b,v2.16b,#8
1452         ext             v6.16b,v0.16b,v4.16b,#8
1453         add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1454          .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
1455          ext            v7.16b,v19.16b,v20.16b,#8
1456         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1457          .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1458         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1459         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1460         ld1             {v25.2d},[x3],#16
1461         add             v24.2d,v24.2d,v16.2d
1462          ld1            {v16.16b},[x1],#16              // load next input
1463         ext             v24.16b,v24.16b,v24.16b,#8
1464         ext             v5.16b,v1.16b,v4.16b,#8
1465         ext             v6.16b,v3.16b,v1.16b,#8
1466         add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1467         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1468          rev64          v16.16b,v16.16b
1469         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1470         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1471         ld1             {v24.2d},[x3],#16
1472         add             v25.2d,v25.2d,v17.2d
1473          ld1            {v17.16b},[x1],#16              // load next input
1474         ext             v25.16b,v25.16b,v25.16b,#8
1475         ext             v5.16b,v0.16b,v1.16b,#8
1476         ext             v6.16b,v2.16b,v0.16b,#8
1477         add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1478         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1479          rev64          v17.16b,v17.16b
1480         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1481         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1482         ld1             {v25.2d},[x3],#16
1483         add             v24.2d,v24.2d,v18.2d
1484          ld1            {v18.16b},[x1],#16              // load next input
1485         ext             v24.16b,v24.16b,v24.16b,#8
1486         ext             v5.16b,v3.16b,v0.16b,#8
1487         ext             v6.16b,v4.16b,v3.16b,#8
1488         add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1489         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1490          rev64          v18.16b,v18.16b
1491         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1492         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1493         ld1             {v24.2d},[x3],#16
1494         add             v25.2d,v25.2d,v19.2d
1495          ld1            {v19.16b},[x1],#16              // load next input
1496         ext             v25.16b,v25.16b,v25.16b,#8
1497         ext             v5.16b,v2.16b,v3.16b,#8
1498         ext             v6.16b,v1.16b,v2.16b,#8
1499         add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1500         .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1501          rev64          v19.16b,v19.16b
1502         add             v4.2d,v1.2d,v3.2d               // "D + T1"
1503         .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1504         ld1             {v25.2d},[x3],#16
1505         add             v24.2d,v24.2d,v20.2d
1506          ld1            {v20.16b},[x1],#16              // load next input
1507         ext             v24.16b,v24.16b,v24.16b,#8
1508         ext             v5.16b,v4.16b,v2.16b,#8
1509         ext             v6.16b,v0.16b,v4.16b,#8
1510         add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1511         .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1512          rev64          v20.16b,v20.16b
1513         add             v1.2d,v0.2d,v2.2d               // "D + T1"
1514         .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1515         ld1             {v24.2d},[x3],#16
1516         add             v25.2d,v25.2d,v21.2d
1517          ld1            {v21.16b},[x1],#16              // load next input
1518         ext             v25.16b,v25.16b,v25.16b,#8
1519         ext             v5.16b,v1.16b,v4.16b,#8
1520         ext             v6.16b,v3.16b,v1.16b,#8
1521         add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1522         .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1523          rev64          v21.16b,v21.16b
1524         add             v0.2d,v3.2d,v4.2d               // "D + T1"
1525         .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1526         ld1             {v25.2d},[x3],#16
1527         add             v24.2d,v24.2d,v22.2d
1528          ld1            {v22.16b},[x1],#16              // load next input
1529         ext             v24.16b,v24.16b,v24.16b,#8
1530         ext             v5.16b,v0.16b,v1.16b,#8
1531         ext             v6.16b,v2.16b,v0.16b,#8
1532         add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1533         .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1534          rev64          v22.16b,v22.16b
1535         add             v3.2d,v2.2d,v1.2d               // "D + T1"
1536         .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1537         sub             x3,x3,#80*8     // rewind
1538         add             v25.2d,v25.2d,v23.2d
1539          ld1            {v23.16b},[x1],#16              // load next input
1540         ext             v25.16b,v25.16b,v25.16b,#8
1541         ext             v5.16b,v3.16b,v0.16b,#8
1542         ext             v6.16b,v4.16b,v3.16b,#8
1543         add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1544         .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1545          rev64          v23.16b,v23.16b
1546         add             v2.2d,v4.2d,v0.2d               // "D + T1"
1547         .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1548         add             v0.2d,v0.2d,v26.2d                      // accumulate
1549         add             v1.2d,v1.2d,v27.2d
1550         add             v2.2d,v2.2d,v28.2d
1551         add             v3.2d,v3.2d,v29.2d
1553         cbnz            x2,.Loop_hw
1555         st1             {v0.2d-v3.2d},[x0]              // store context
1557         ldr             x29,[sp],#16
1558         ret
1559 .size   zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
1560 #endif