Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / boringssl / linux-aarch64 / crypto / aes / aesv8-armx64.S
blob fa2abbccc04ef6572b570b838e16471e5eb83e8e
1 #if defined(__aarch64__)
2 #include <openssl/arm_arch.h>
4 #if __ARM_MAX_ARCH__>=7
5 .text
6 #if !defined(__clang__)
7 .arch   armv8-a+crypto
8 #endif
9 .align  5
10 .Lrcon:
11 .long   0x01,0x01,0x01,0x01
12 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
13 .long   0x1b,0x1b,0x1b,0x1b
15 .globl  aes_v8_set_encrypt_key
16 .type   aes_v8_set_encrypt_key,%function
17 .align  5
18 aes_v8_set_encrypt_key:
19 .Lenc_key:
20         stp     x29,x30,[sp,#-16]!
21         add     x29,sp,#0
22         mov     x3,#-1
23         cmp     x0,#0
24         b.eq    .Lenc_key_abort
25         cmp     x2,#0
26         b.eq    .Lenc_key_abort
27         mov     x3,#-2
28         cmp     w1,#128
29         b.lt    .Lenc_key_abort
30         cmp     w1,#256
31         b.gt    .Lenc_key_abort
32         tst     w1,#0x3f
33         b.ne    .Lenc_key_abort
35         adr     x3,.Lrcon
36         cmp     w1,#192
38         eor     v0.16b,v0.16b,v0.16b
39         ld1     {v3.16b},[x0],#16
40         mov     w1,#8           // reuse w1
41         ld1     {v1.4s,v2.4s},[x3],#32
43         b.lt    .Loop128
44         b.eq    .L192
45         b       .L256
47 .align  4
48 .Loop128:
49         tbl     v6.16b,{v3.16b},v2.16b
50         ext     v5.16b,v0.16b,v3.16b,#12
51         st1     {v3.4s},[x2],#16
52         aese    v6.16b,v0.16b
53         subs    w1,w1,#1
55         eor     v3.16b,v3.16b,v5.16b
56         ext     v5.16b,v0.16b,v5.16b,#12
57         eor     v3.16b,v3.16b,v5.16b
58         ext     v5.16b,v0.16b,v5.16b,#12
59         eor     v6.16b,v6.16b,v1.16b
60         eor     v3.16b,v3.16b,v5.16b
61         shl     v1.16b,v1.16b,#1
62         eor     v3.16b,v3.16b,v6.16b
63         b.ne    .Loop128
65         ld1     {v1.4s},[x3]
67         tbl     v6.16b,{v3.16b},v2.16b
68         ext     v5.16b,v0.16b,v3.16b,#12
69         st1     {v3.4s},[x2],#16
70         aese    v6.16b,v0.16b
72         eor     v3.16b,v3.16b,v5.16b
73         ext     v5.16b,v0.16b,v5.16b,#12
74         eor     v3.16b,v3.16b,v5.16b
75         ext     v5.16b,v0.16b,v5.16b,#12
76         eor     v6.16b,v6.16b,v1.16b
77         eor     v3.16b,v3.16b,v5.16b
78         shl     v1.16b,v1.16b,#1
79         eor     v3.16b,v3.16b,v6.16b
81         tbl     v6.16b,{v3.16b},v2.16b
82         ext     v5.16b,v0.16b,v3.16b,#12
83         st1     {v3.4s},[x2],#16
84         aese    v6.16b,v0.16b
86         eor     v3.16b,v3.16b,v5.16b
87         ext     v5.16b,v0.16b,v5.16b,#12
88         eor     v3.16b,v3.16b,v5.16b
89         ext     v5.16b,v0.16b,v5.16b,#12
90         eor     v6.16b,v6.16b,v1.16b
91         eor     v3.16b,v3.16b,v5.16b
92         eor     v3.16b,v3.16b,v6.16b
93         st1     {v3.4s},[x2]
94         add     x2,x2,#0x50
96         mov     w12,#10
97         b       .Ldone
99 .align  4
100 .L192:
101         ld1     {v4.8b},[x0],#8
102         movi    v6.16b,#8                       // borrow v6.16b
103         st1     {v3.4s},[x2],#16
104         sub     v2.16b,v2.16b,v6.16b    // adjust the mask
106 .Loop192:
107         tbl     v6.16b,{v4.16b},v2.16b
108         ext     v5.16b,v0.16b,v3.16b,#12
109         st1     {v4.8b},[x2],#8
110         aese    v6.16b,v0.16b
111         subs    w1,w1,#1
113         eor     v3.16b,v3.16b,v5.16b
114         ext     v5.16b,v0.16b,v5.16b,#12
115         eor     v3.16b,v3.16b,v5.16b
116         ext     v5.16b,v0.16b,v5.16b,#12
117         eor     v3.16b,v3.16b,v5.16b
119         dup     v5.4s,v3.s[3]
120         eor     v5.16b,v5.16b,v4.16b
121         eor     v6.16b,v6.16b,v1.16b
122         ext     v4.16b,v0.16b,v4.16b,#12
123         shl     v1.16b,v1.16b,#1
124         eor     v4.16b,v4.16b,v5.16b
125         eor     v3.16b,v3.16b,v6.16b
126         eor     v4.16b,v4.16b,v6.16b
127         st1     {v3.4s},[x2],#16
128         b.ne    .Loop192
130         mov     w12,#12
131         add     x2,x2,#0x20
132         b       .Ldone
134 .align  4
135 .L256:
136         ld1     {v4.16b},[x0]
137         mov     w1,#7
138         mov     w12,#14
139         st1     {v3.4s},[x2],#16
141 .Loop256:
142         tbl     v6.16b,{v4.16b},v2.16b
143         ext     v5.16b,v0.16b,v3.16b,#12
144         st1     {v4.4s},[x2],#16
145         aese    v6.16b,v0.16b
146         subs    w1,w1,#1
148         eor     v3.16b,v3.16b,v5.16b
149         ext     v5.16b,v0.16b,v5.16b,#12
150         eor     v3.16b,v3.16b,v5.16b
151         ext     v5.16b,v0.16b,v5.16b,#12
152         eor     v6.16b,v6.16b,v1.16b
153         eor     v3.16b,v3.16b,v5.16b
154         shl     v1.16b,v1.16b,#1
155         eor     v3.16b,v3.16b,v6.16b
156         st1     {v3.4s},[x2],#16
157         b.eq    .Ldone
159         dup     v6.4s,v3.s[3]           // just splat
160         ext     v5.16b,v0.16b,v4.16b,#12
161         aese    v6.16b,v0.16b
163         eor     v4.16b,v4.16b,v5.16b
164         ext     v5.16b,v0.16b,v5.16b,#12
165         eor     v4.16b,v4.16b,v5.16b
166         ext     v5.16b,v0.16b,v5.16b,#12
167         eor     v4.16b,v4.16b,v5.16b
169         eor     v4.16b,v4.16b,v6.16b
170         b       .Loop256
172 .Ldone:
173         str     w12,[x2]
174         mov     x3,#0
176 .Lenc_key_abort:
177         mov     x0,x3                   // return value
178         ldr     x29,[sp],#16
179         ret
180 .size   aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
182 .globl  aes_v8_set_decrypt_key
183 .type   aes_v8_set_decrypt_key,%function
184 .align  5
185 aes_v8_set_decrypt_key:
186         stp     x29,x30,[sp,#-16]!
187         add     x29,sp,#0
188         bl      .Lenc_key
190         cmp     x0,#0
191         b.ne    .Ldec_key_abort
193         sub     x2,x2,#240              // restore original x2
194         mov     x4,#-16
195         add     x0,x2,x12,lsl#4 // end of key schedule
197         ld1     {v0.4s},[x2]
198         ld1     {v1.4s},[x0]
199         st1     {v0.4s},[x0],x4
200         st1     {v1.4s},[x2],#16
202 .Loop_imc:
203         ld1     {v0.4s},[x2]
204         ld1     {v1.4s},[x0]
205         aesimc  v0.16b,v0.16b
206         aesimc  v1.16b,v1.16b
207         st1     {v0.4s},[x0],x4
208         st1     {v1.4s},[x2],#16
209         cmp     x0,x2
210         b.hi    .Loop_imc
212         ld1     {v0.4s},[x2]
213         aesimc  v0.16b,v0.16b
214         st1     {v0.4s},[x0]
216         eor     x0,x0,x0                // return value
217 .Ldec_key_abort:
218         ldp     x29,x30,[sp],#16
219         ret
220 .size   aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
221 .globl  aes_v8_encrypt
222 .type   aes_v8_encrypt,%function
223 .align  5
224 aes_v8_encrypt:
225         ldr     w3,[x2,#240]
226         ld1     {v0.4s},[x2],#16
227         ld1     {v2.16b},[x0]
228         sub     w3,w3,#2
229         ld1     {v1.4s},[x2],#16
231 .Loop_enc:
232         aese    v2.16b,v0.16b
233         aesmc   v2.16b,v2.16b
234         ld1     {v0.4s},[x2],#16
235         subs    w3,w3,#2
236         aese    v2.16b,v1.16b
237         aesmc   v2.16b,v2.16b
238         ld1     {v1.4s},[x2],#16
239         b.gt    .Loop_enc
241         aese    v2.16b,v0.16b
242         aesmc   v2.16b,v2.16b
243         ld1     {v0.4s},[x2]
244         aese    v2.16b,v1.16b
245         eor     v2.16b,v2.16b,v0.16b
247         st1     {v2.16b},[x1]
248         ret
249 .size   aes_v8_encrypt,.-aes_v8_encrypt
250 .globl  aes_v8_decrypt
251 .type   aes_v8_decrypt,%function
252 .align  5
253 aes_v8_decrypt:
254         ldr     w3,[x2,#240]
255         ld1     {v0.4s},[x2],#16
256         ld1     {v2.16b},[x0]
257         sub     w3,w3,#2
258         ld1     {v1.4s},[x2],#16
260 .Loop_dec:
261         aesd    v2.16b,v0.16b
262         aesimc  v2.16b,v2.16b
263         ld1     {v0.4s},[x2],#16
264         subs    w3,w3,#2
265         aesd    v2.16b,v1.16b
266         aesimc  v2.16b,v2.16b
267         ld1     {v1.4s},[x2],#16
268         b.gt    .Loop_dec
270         aesd    v2.16b,v0.16b
271         aesimc  v2.16b,v2.16b
272         ld1     {v0.4s},[x2]
273         aesd    v2.16b,v1.16b
274         eor     v2.16b,v2.16b,v0.16b
276         st1     {v2.16b},[x1]
277         ret
278 .size   aes_v8_decrypt,.-aes_v8_decrypt
279 .globl  aes_v8_cbc_encrypt
280 .type   aes_v8_cbc_encrypt,%function
281 .align  5
282 aes_v8_cbc_encrypt:
283         stp     x29,x30,[sp,#-16]!
284         add     x29,sp,#0
285         subs    x2,x2,#16
286         mov     x8,#16
287         b.lo    .Lcbc_abort
288         csel    x8,xzr,x8,eq
290         cmp     w5,#0                   // en- or decrypting?
291         ldr     w5,[x3,#240]
292         and     x2,x2,#-16
293         ld1     {v6.16b},[x4]
294         ld1     {v0.16b},[x0],x8
296         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
297         sub     w5,w5,#6
298         add     x7,x3,x5,lsl#4  // pointer to last 7 round keys
299         sub     w5,w5,#2
300         ld1     {v18.4s,v19.4s},[x7],#32
301         ld1     {v20.4s,v21.4s},[x7],#32
302         ld1     {v22.4s,v23.4s},[x7],#32
303         ld1     {v7.4s},[x7]
305         add     x7,x3,#32
306         mov     w6,w5
307         b.eq    .Lcbc_dec
309         cmp     w5,#2
310         eor     v0.16b,v0.16b,v6.16b
311         eor     v5.16b,v16.16b,v7.16b
312         b.eq    .Lcbc_enc128
314         ld1     {v2.4s,v3.4s},[x7]
315         add     x7,x3,#16
316         add     x6,x3,#16*4
317         add     x12,x3,#16*5
318         aese    v0.16b,v16.16b
319         aesmc   v0.16b,v0.16b
320         add     x14,x3,#16*6
321         add     x3,x3,#16*7
322         b       .Lenter_cbc_enc
324 .align  4
325 .Loop_cbc_enc:
326         aese    v0.16b,v16.16b
327         aesmc   v0.16b,v0.16b
328         st1     {v6.16b},[x1],#16
329 .Lenter_cbc_enc:
330         aese    v0.16b,v17.16b
331         aesmc   v0.16b,v0.16b
332         aese    v0.16b,v2.16b
333         aesmc   v0.16b,v0.16b
334         ld1     {v16.4s},[x6]
335         cmp     w5,#4
336         aese    v0.16b,v3.16b
337         aesmc   v0.16b,v0.16b
338         ld1     {v17.4s},[x12]
339         b.eq    .Lcbc_enc192
341         aese    v0.16b,v16.16b
342         aesmc   v0.16b,v0.16b
343         ld1     {v16.4s},[x14]
344         aese    v0.16b,v17.16b
345         aesmc   v0.16b,v0.16b
346         ld1     {v17.4s},[x3]
347         nop
349 .Lcbc_enc192:
350         aese    v0.16b,v16.16b
351         aesmc   v0.16b,v0.16b
352         subs    x2,x2,#16
353         aese    v0.16b,v17.16b
354         aesmc   v0.16b,v0.16b
355         csel    x8,xzr,x8,eq
356         aese    v0.16b,v18.16b
357         aesmc   v0.16b,v0.16b
358         aese    v0.16b,v19.16b
359         aesmc   v0.16b,v0.16b
360         ld1     {v16.16b},[x0],x8
361         aese    v0.16b,v20.16b
362         aesmc   v0.16b,v0.16b
363         eor     v16.16b,v16.16b,v5.16b
364         aese    v0.16b,v21.16b
365         aesmc   v0.16b,v0.16b
366         ld1     {v17.4s},[x7]           // re-pre-load rndkey[1]
367         aese    v0.16b,v22.16b
368         aesmc   v0.16b,v0.16b
369         aese    v0.16b,v23.16b
370         eor     v6.16b,v0.16b,v7.16b
371         b.hs    .Loop_cbc_enc
373         st1     {v6.16b},[x1],#16
374         b       .Lcbc_done
376 .align  5
377 .Lcbc_enc128:
378         ld1     {v2.4s,v3.4s},[x7]
379         aese    v0.16b,v16.16b
380         aesmc   v0.16b,v0.16b
381         b       .Lenter_cbc_enc128
382 .Loop_cbc_enc128:
383         aese    v0.16b,v16.16b
384         aesmc   v0.16b,v0.16b
385         st1     {v6.16b},[x1],#16
386 .Lenter_cbc_enc128:
387         aese    v0.16b,v17.16b
388         aesmc   v0.16b,v0.16b
389         subs    x2,x2,#16
390         aese    v0.16b,v2.16b
391         aesmc   v0.16b,v0.16b
392         csel    x8,xzr,x8,eq
393         aese    v0.16b,v3.16b
394         aesmc   v0.16b,v0.16b
395         aese    v0.16b,v18.16b
396         aesmc   v0.16b,v0.16b
397         aese    v0.16b,v19.16b
398         aesmc   v0.16b,v0.16b
399         ld1     {v16.16b},[x0],x8
400         aese    v0.16b,v20.16b
401         aesmc   v0.16b,v0.16b
402         aese    v0.16b,v21.16b
403         aesmc   v0.16b,v0.16b
404         aese    v0.16b,v22.16b
405         aesmc   v0.16b,v0.16b
406         eor     v16.16b,v16.16b,v5.16b
407         aese    v0.16b,v23.16b
408         eor     v6.16b,v0.16b,v7.16b
409         b.hs    .Loop_cbc_enc128
411         st1     {v6.16b},[x1],#16
412         b       .Lcbc_done
413 .align  5
414 .Lcbc_dec:
415         ld1     {v18.16b},[x0],#16
416         subs    x2,x2,#32               // bias
417         add     w6,w5,#2
418         orr     v3.16b,v0.16b,v0.16b
419         orr     v1.16b,v0.16b,v0.16b
420         orr     v19.16b,v18.16b,v18.16b
421         b.lo    .Lcbc_dec_tail
423         orr     v1.16b,v18.16b,v18.16b
424         ld1     {v18.16b},[x0],#16
425         orr     v2.16b,v0.16b,v0.16b
426         orr     v3.16b,v1.16b,v1.16b
427         orr     v19.16b,v18.16b,v18.16b
429 .Loop3x_cbc_dec:
430         aesd    v0.16b,v16.16b
431         aesimc  v0.16b,v0.16b
432         aesd    v1.16b,v16.16b
433         aesimc  v1.16b,v1.16b
434         aesd    v18.16b,v16.16b
435         aesimc  v18.16b,v18.16b
436         ld1     {v16.4s},[x7],#16
437         subs    w6,w6,#2
438         aesd    v0.16b,v17.16b
439         aesimc  v0.16b,v0.16b
440         aesd    v1.16b,v17.16b
441         aesimc  v1.16b,v1.16b
442         aesd    v18.16b,v17.16b
443         aesimc  v18.16b,v18.16b
444         ld1     {v17.4s},[x7],#16
445         b.gt    .Loop3x_cbc_dec
447         aesd    v0.16b,v16.16b
448         aesimc  v0.16b,v0.16b
449         aesd    v1.16b,v16.16b
450         aesimc  v1.16b,v1.16b
451         aesd    v18.16b,v16.16b
452         aesimc  v18.16b,v18.16b
453         eor     v4.16b,v6.16b,v7.16b
454         subs    x2,x2,#0x30
455         eor     v5.16b,v2.16b,v7.16b
456         csel    x6,x2,x6,lo                     // x6, w6, is zero at this point
457         aesd    v0.16b,v17.16b
458         aesimc  v0.16b,v0.16b
459         aesd    v1.16b,v17.16b
460         aesimc  v1.16b,v1.16b
461         aesd    v18.16b,v17.16b
462         aesimc  v18.16b,v18.16b
463         eor     v17.16b,v3.16b,v7.16b
464         add     x0,x0,x6                // x0 is adjusted in such way that
465                                         // at exit from the loop v1.16b-v18.16b
466                                         // are loaded with last "words"
467         orr     v6.16b,v19.16b,v19.16b
468         mov     x7,x3
469         aesd    v0.16b,v20.16b
470         aesimc  v0.16b,v0.16b
471         aesd    v1.16b,v20.16b
472         aesimc  v1.16b,v1.16b
473         aesd    v18.16b,v20.16b
474         aesimc  v18.16b,v18.16b
475         ld1     {v2.16b},[x0],#16
476         aesd    v0.16b,v21.16b
477         aesimc  v0.16b,v0.16b
478         aesd    v1.16b,v21.16b
479         aesimc  v1.16b,v1.16b
480         aesd    v18.16b,v21.16b
481         aesimc  v18.16b,v18.16b
482         ld1     {v3.16b},[x0],#16
483         aesd    v0.16b,v22.16b
484         aesimc  v0.16b,v0.16b
485         aesd    v1.16b,v22.16b
486         aesimc  v1.16b,v1.16b
487         aesd    v18.16b,v22.16b
488         aesimc  v18.16b,v18.16b
489         ld1     {v19.16b},[x0],#16
490         aesd    v0.16b,v23.16b
491         aesd    v1.16b,v23.16b
492         aesd    v18.16b,v23.16b
493         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
494         add     w6,w5,#2
495         eor     v4.16b,v4.16b,v0.16b
496         eor     v5.16b,v5.16b,v1.16b
497         eor     v18.16b,v18.16b,v17.16b
498         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
499         st1     {v4.16b},[x1],#16
500         orr     v0.16b,v2.16b,v2.16b
501         st1     {v5.16b},[x1],#16
502         orr     v1.16b,v3.16b,v3.16b
503         st1     {v18.16b},[x1],#16
504         orr     v18.16b,v19.16b,v19.16b
505         b.hs    .Loop3x_cbc_dec
507         cmn     x2,#0x30
508         b.eq    .Lcbc_done
509         nop
511 .Lcbc_dec_tail:
512         aesd    v1.16b,v16.16b
513         aesimc  v1.16b,v1.16b
514         aesd    v18.16b,v16.16b
515         aesimc  v18.16b,v18.16b
516         ld1     {v16.4s},[x7],#16
517         subs    w6,w6,#2
518         aesd    v1.16b,v17.16b
519         aesimc  v1.16b,v1.16b
520         aesd    v18.16b,v17.16b
521         aesimc  v18.16b,v18.16b
522         ld1     {v17.4s},[x7],#16
523         b.gt    .Lcbc_dec_tail
525         aesd    v1.16b,v16.16b
526         aesimc  v1.16b,v1.16b
527         aesd    v18.16b,v16.16b
528         aesimc  v18.16b,v18.16b
529         aesd    v1.16b,v17.16b
530         aesimc  v1.16b,v1.16b
531         aesd    v18.16b,v17.16b
532         aesimc  v18.16b,v18.16b
533         aesd    v1.16b,v20.16b
534         aesimc  v1.16b,v1.16b
535         aesd    v18.16b,v20.16b
536         aesimc  v18.16b,v18.16b
537         cmn     x2,#0x20
538         aesd    v1.16b,v21.16b
539         aesimc  v1.16b,v1.16b
540         aesd    v18.16b,v21.16b
541         aesimc  v18.16b,v18.16b
542         eor     v5.16b,v6.16b,v7.16b
543         aesd    v1.16b,v22.16b
544         aesimc  v1.16b,v1.16b
545         aesd    v18.16b,v22.16b
546         aesimc  v18.16b,v18.16b
547         eor     v17.16b,v3.16b,v7.16b
548         aesd    v1.16b,v23.16b
549         aesd    v18.16b,v23.16b
550         b.eq    .Lcbc_dec_one
551         eor     v5.16b,v5.16b,v1.16b
552         eor     v17.16b,v17.16b,v18.16b
553         orr     v6.16b,v19.16b,v19.16b
554         st1     {v5.16b},[x1],#16
555         st1     {v17.16b},[x1],#16
556         b       .Lcbc_done
558 .Lcbc_dec_one:
559         eor     v5.16b,v5.16b,v18.16b
560         orr     v6.16b,v19.16b,v19.16b
561         st1     {v5.16b},[x1],#16
563 .Lcbc_done:
564         st1     {v6.16b},[x4]
565 .Lcbc_abort:
566         ldr     x29,[sp],#16
567         ret
568 .size   aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
569 .globl  aes_v8_ctr32_encrypt_blocks
570 .type   aes_v8_ctr32_encrypt_blocks,%function
571 .align  5
572 aes_v8_ctr32_encrypt_blocks:
573         stp     x29,x30,[sp,#-16]!
574         add     x29,sp,#0
575         ldr     w5,[x3,#240]
577         ldr     w8, [x4, #12]
578         ld1     {v0.4s},[x4]
580         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
581         sub     w5,w5,#4
582         mov     x12,#16
583         cmp     x2,#2
584         add     x7,x3,x5,lsl#4  // pointer to last 5 round keys
585         sub     w5,w5,#2
586         ld1     {v20.4s,v21.4s},[x7],#32
587         ld1     {v22.4s,v23.4s},[x7],#32
588         ld1     {v7.4s},[x7]
589         add     x7,x3,#32
590         mov     w6,w5
591         csel    x12,xzr,x12,lo
592 #ifndef __ARMEB__
593         rev     w8, w8
594 #endif
595         orr     v1.16b,v0.16b,v0.16b
596         add     w10, w8, #1
597         orr     v18.16b,v0.16b,v0.16b
598         add     w8, w8, #2
599         orr     v6.16b,v0.16b,v0.16b
600         rev     w10, w10
601         mov     v1.s[3],w10
602         b.ls    .Lctr32_tail
603         rev     w12, w8
604         sub     x2,x2,#3                // bias
605         mov     v18.s[3],w12
606         b       .Loop3x_ctr32
608 .align  4
609 .Loop3x_ctr32:
610         aese    v0.16b,v16.16b
611         aesmc   v0.16b,v0.16b
612         aese    v1.16b,v16.16b
613         aesmc   v1.16b,v1.16b
614         aese    v18.16b,v16.16b
615         aesmc   v18.16b,v18.16b
616         ld1     {v16.4s},[x7],#16
617         subs    w6,w6,#2
618         aese    v0.16b,v17.16b
619         aesmc   v0.16b,v0.16b
620         aese    v1.16b,v17.16b
621         aesmc   v1.16b,v1.16b
622         aese    v18.16b,v17.16b
623         aesmc   v18.16b,v18.16b
624         ld1     {v17.4s},[x7],#16
625         b.gt    .Loop3x_ctr32
627         aese    v0.16b,v16.16b
628         aesmc   v4.16b,v0.16b
629         aese    v1.16b,v16.16b
630         aesmc   v5.16b,v1.16b
631         ld1     {v2.16b},[x0],#16
632         orr     v0.16b,v6.16b,v6.16b
633         aese    v18.16b,v16.16b
634         aesmc   v18.16b,v18.16b
635         ld1     {v3.16b},[x0],#16
636         orr     v1.16b,v6.16b,v6.16b
637         aese    v4.16b,v17.16b
638         aesmc   v4.16b,v4.16b
639         aese    v5.16b,v17.16b
640         aesmc   v5.16b,v5.16b
641         ld1     {v19.16b},[x0],#16
642         mov     x7,x3
643         aese    v18.16b,v17.16b
644         aesmc   v17.16b,v18.16b
645         orr     v18.16b,v6.16b,v6.16b
646         add     w9,w8,#1
647         aese    v4.16b,v20.16b
648         aesmc   v4.16b,v4.16b
649         aese    v5.16b,v20.16b
650         aesmc   v5.16b,v5.16b
651         eor     v2.16b,v2.16b,v7.16b
652         add     w10,w8,#2
653         aese    v17.16b,v20.16b
654         aesmc   v17.16b,v17.16b
655         eor     v3.16b,v3.16b,v7.16b
656         add     w8,w8,#3
657         aese    v4.16b,v21.16b
658         aesmc   v4.16b,v4.16b
659         aese    v5.16b,v21.16b
660         aesmc   v5.16b,v5.16b
661         eor     v19.16b,v19.16b,v7.16b
662         rev     w9,w9
663         aese    v17.16b,v21.16b
664         aesmc   v17.16b,v17.16b
665         mov     v0.s[3], w9
666         rev     w10,w10
667         aese    v4.16b,v22.16b
668         aesmc   v4.16b,v4.16b
669         aese    v5.16b,v22.16b
670         aesmc   v5.16b,v5.16b
671         mov     v1.s[3], w10
672         rev     w12,w8
673         aese    v17.16b,v22.16b
674         aesmc   v17.16b,v17.16b
675         mov     v18.s[3], w12
676         subs    x2,x2,#3
677         aese    v4.16b,v23.16b
678         aese    v5.16b,v23.16b
679         aese    v17.16b,v23.16b
681         eor     v2.16b,v2.16b,v4.16b
682         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
683         st1     {v2.16b},[x1],#16
684         eor     v3.16b,v3.16b,v5.16b
685         mov     w6,w5
686         st1     {v3.16b},[x1],#16
687         eor     v19.16b,v19.16b,v17.16b
688         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
689         st1     {v19.16b},[x1],#16
690         b.hs    .Loop3x_ctr32
692         adds    x2,x2,#3
693         b.eq    .Lctr32_done
694         cmp     x2,#1
695         mov     x12,#16
696         csel    x12,xzr,x12,eq
698 .Lctr32_tail:
699         aese    v0.16b,v16.16b
700         aesmc   v0.16b,v0.16b
701         aese    v1.16b,v16.16b
702         aesmc   v1.16b,v1.16b
703         ld1     {v16.4s},[x7],#16
704         subs    w6,w6,#2
705         aese    v0.16b,v17.16b
706         aesmc   v0.16b,v0.16b
707         aese    v1.16b,v17.16b
708         aesmc   v1.16b,v1.16b
709         ld1     {v17.4s},[x7],#16
710         b.gt    .Lctr32_tail
712         aese    v0.16b,v16.16b
713         aesmc   v0.16b,v0.16b
714         aese    v1.16b,v16.16b
715         aesmc   v1.16b,v1.16b
716         aese    v0.16b,v17.16b
717         aesmc   v0.16b,v0.16b
718         aese    v1.16b,v17.16b
719         aesmc   v1.16b,v1.16b
720         ld1     {v2.16b},[x0],x12
721         aese    v0.16b,v20.16b
722         aesmc   v0.16b,v0.16b
723         aese    v1.16b,v20.16b
724         aesmc   v1.16b,v1.16b
725         ld1     {v3.16b},[x0]
726         aese    v0.16b,v21.16b
727         aesmc   v0.16b,v0.16b
728         aese    v1.16b,v21.16b
729         aesmc   v1.16b,v1.16b
730         eor     v2.16b,v2.16b,v7.16b
731         aese    v0.16b,v22.16b
732         aesmc   v0.16b,v0.16b
733         aese    v1.16b,v22.16b
734         aesmc   v1.16b,v1.16b
735         eor     v3.16b,v3.16b,v7.16b
736         aese    v0.16b,v23.16b
737         aese    v1.16b,v23.16b
739         cmp     x2,#1
740         eor     v2.16b,v2.16b,v0.16b
741         eor     v3.16b,v3.16b,v1.16b
742         st1     {v2.16b},[x1],#16
743         b.eq    .Lctr32_done
744         st1     {v3.16b},[x1]
746 .Lctr32_done:
747         ldr     x29,[sp],#16
748         ret
749 .size   aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
750 #endif
751 #endif