dm writecache: fix incorrect flush sequence when doing SSD mode commit
[linux/fpc-iii.git] / arch / arm / crypto / aes-ce-core.S
blob4d1707388d94194b253741e4021fc22fd8c44506
1 /* SPDX-License-Identifier: GPL-2.0-only */
2 /*
3  * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
4  *
5  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
6  */
8 #include <linux/linkage.h>
9 #include <asm/assembler.h>
11         .text
           /*
            * Select ARMv8-A with the NEON crypto extension FPU so that the
            * aese/aesd/aesmc/aesimc mnemonics used in this file are accepted,
            * then align what follows to 2^3 = 8 bytes.
            */
12         .arch           armv8-a
13         .fpu            crypto-neon-fp-armv8
14         .align          3
16         .macro          enc_round, state, key
           /*
            * One middle round of AES encryption on register 'state' with round
            * key 'key': AESE (AddRoundKey, SubBytes, ShiftRows) followed by
            * AESMC (MixColumns). 'state' is updated in place.
            */
17         aese.8          \state, \key
18         aesmc.8         \state, \state
19         .endm
21         .macro          dec_round, state, key
           /*
            * One middle round of AES decryption on register 'state' with round
            * key 'key': AESD (AddRoundKey, InvShiftRows, InvSubBytes) followed
            * by AESIMC (InvMixColumns). 'state' is updated in place.
            */
22         aesd.8          \state, \key
23         aesimc.8        \state, \state
24         .endm
26         .macro          enc_dround, key1, key2
           /*
            * Double round: two consecutive encryption rounds on the single
            * block held in q0, using 'key1' and then 'key2'.
            */
27         enc_round       q0, \key1
28         enc_round       q0, \key2
29         .endm
31         .macro          dec_dround, key1, key2
           /*
            * Double round: two consecutive decryption rounds on the single
            * block held in q0, using 'key1' and then 'key2'.
            */
32         dec_round       q0, \key1
33         dec_round       q0, \key2
34         .endm
36         .macro          enc_fround, key1, key2, key3
           /*
            * Tail of the encryption of the block in q0, consuming the last three
            * round keys: one regular round with 'key1', then the final round,
            * which has no MixColumns step (AESE with 'key2' only), and finally
            * the closing AddRoundKey done as a plain XOR with 'key3'.
            */
37         enc_round       q0, \key1
38         aese.8          q0, \key2
39         veor            q0, q0, \key3
40         .endm
42         .macro          dec_fround, key1, key2, key3
           /*
            * Tail of the decryption of the block in q0, consuming the last three
            * round keys: one regular round with 'key1', then the final round,
            * which has no InvMixColumns step (AESD with 'key2' only), and
            * finally the closing AddRoundKey done as a plain XOR with 'key3'.
            * Callers may fold extra data into 'key3' to get an additional XOR
            * for free (ce_aes_cbc_decrypt does this with q14).
            */
43         dec_round       q0, \key1
44         aesd.8          q0, \key2
45         veor            q0, q0, \key3
46         .endm
48         .macro          enc_dround_4x, key1, key2
           /*
            * Four-block variant of enc_dround: two encryption rounds on q0-q3.
            * Each round key is applied to all four blocks before the next round
            * key is used, so the four blocks advance in lock step.
            */
49         enc_round       q0, \key1
50         enc_round       q1, \key1
51         enc_round       q2, \key1
52         enc_round       q3, \key1
53         enc_round       q0, \key2
54         enc_round       q1, \key2
55         enc_round       q2, \key2
56         enc_round       q3, \key2
57         .endm
59         .macro          dec_dround_4x, key1, key2
           /*
            * Four-block variant of dec_dround: two decryption rounds on q0-q3.
            * Each round key is applied to all four blocks before the next round
            * key is used, so the four blocks advance in lock step.
            */
60         dec_round       q0, \key1
61         dec_round       q1, \key1
62         dec_round       q2, \key1
63         dec_round       q3, \key1
64         dec_round       q0, \key2
65         dec_round       q1, \key2
66         dec_round       q2, \key2
67         dec_round       q3, \key2
68         .endm
70         .macro          enc_fround_4x, key1, key2, key3
           /*
            * Four-block variant of enc_fround: finishes the encryption of q0-q3
            * with the last three round keys. 'key1' drives a regular round,
            * 'key2' the final round without MixColumns, and 'key3' is XORed in
            * as the closing AddRoundKey.
            */
71         enc_round       q0, \key1
72         enc_round       q1, \key1
73         enc_round       q2, \key1
74         enc_round       q3, \key1
75         aese.8          q0, \key2
76         aese.8          q1, \key2
77         aese.8          q2, \key2
78         aese.8          q3, \key2
79         veor            q0, q0, \key3
80         veor            q1, q1, \key3
81         veor            q2, q2, \key3
82         veor            q3, q3, \key3
83         .endm
85         .macro          dec_fround_4x, key1, key2, key3
           /*
            * Four-block variant of dec_fround: finishes the decryption of q0-q3
            * with the last three round keys. 'key1' drives a regular round,
            * 'key2' the final round without InvMixColumns, and 'key3' is XORed
            * in as the closing AddRoundKey.
            */
86         dec_round       q0, \key1
87         dec_round       q1, \key1
88         dec_round       q2, \key1
89         dec_round       q3, \key1
90         aesd.8          q0, \key2
91         aesd.8          q1, \key2
92         aesd.8          q2, \key2
93         aesd.8          q3, \key2
94         veor            q0, q0, \key3
95         veor            q1, q1, \key3
96         veor            q2, q2, \key3
97         veor            q3, q3, \key3
98         .endm
100         .macro          do_block, dround, fround
            /*
             * Body shared by the four core transforms. 'dround' is the double
             * round macro and 'fround' the final round macro to expand.
             *
             * On entry: q8/q9 hold the first two round keys, q14 the last one,
             * ip points at the third round key and r3 holds the round count.
             * ip is post-incremented by 32 bytes per load and walks the key
             * schedule, using q10/q11 and q12/q13 alternately so that the next
             * pair of keys is loaded while the previous pair is being used.
             *
             * The flags set by the cmp on the first line are only consumed by
             * the blo/beq further down; nothing in between modifies them.
             *   r3 <  12 : AES-128, 4 double rounds + final rounds (label 0)
             *   r3 == 12 : AES-192, 5 double rounds + final rounds (label 1)
             *   r3 >  12 : 6 double rounds + final rounds (label 0)
             * Returns to the caller with bx lr.
             */
101         cmp             r3, #12                 @ which key size?
102         vld1.32         {q10-q11}, [ip]!
103         \dround         q8, q9
104         vld1.32         {q12-q13}, [ip]!
105         \dround         q10, q11
106         vld1.32         {q10-q11}, [ip]!
107         \dround         q12, q13
108         vld1.32         {q12-q13}, [ip]!
109         \dround         q10, q11
110         blo             0f                      @ AES-128: 10 rounds
111         vld1.32         {q10-q11}, [ip]!
112         \dround         q12, q13
113         beq             1f                      @ AES-192: 12 rounds
114         vld1.32         {q12-q13}, [ip]
115         \dround         q10, q11
116 0:      \fround         q12, q13, q14
117         bx              lr
119 1:      \fround         q10, q11, q14
120         bx              lr
121         .endm
123         /*
124          * Internal, non-AAPCS compliant functions that implement the core AES
125          * transforms. They preserve everything except q0-q3, q10-q13, ip and the flags
126          * Arguments:
127          *   q0        : first in/output block
128          *   q1        : second in/output block (_4x version only)
129          *   q2        : third in/output block (_4x version only)
130          *   q3        : fourth in/output block (_4x version only)
131          *   q8        : first round key
132          *   q9        : second round key
133          *   q14       : final round key
134          *   r2        : address of round key array
135          *   r3        : number of rounds
136          */
137         .align          6
            /*
             * Encrypt the single block in q0 in place. The routine is aligned to
             * 2^6 = 64 bytes. ip is set to rk + 32, i.e. the third round key,
             * because the first two are expected in q8/q9 already.
             *
             * .Laes_encrypt_tweak is a second entry point that skips the ip
             * setup; ce_aes_xts_init branches to it with ip pointing into the
             * second key schedule.
             */
138 aes_encrypt:
139         add             ip, r2, #32             @ 3rd round key
140 .Laes_encrypt_tweak:
141         do_block        enc_dround, enc_fround
142 ENDPROC(aes_encrypt)
144         .align          6
            /*
             * Decrypt the single block in q0 in place. ip is set to rk + 32, the
             * third round key; the first two are expected in q8/q9 and the last
             * one in q14.
             */
145 aes_decrypt:
146         add             ip, r2, #32             @ 3rd round key
147         do_block        dec_dround, dec_fround
148 ENDPROC(aes_decrypt)
150         .align          6
            /*
             * Encrypt the four blocks in q0-q3 in place, using the same key
             * register conventions as aes_encrypt.
             */
151 aes_encrypt_4x:
152         add             ip, r2, #32             @ 3rd round key
153         do_block        enc_dround_4x, enc_fround_4x
154 ENDPROC(aes_encrypt_4x)
156         .align          6
            /*
             * Decrypt the four blocks in q0-q3 in place, using the same key
             * register conventions as aes_decrypt.
             */
157 aes_decrypt_4x:
158         add             ip, r2, #32             @ 3rd round key
159         do_block        dec_dround_4x, dec_fround_4x
160 ENDPROC(aes_decrypt_4x)
162         .macro          prepare_key, rk, rounds
            /*
             * Preload the round keys that the core transforms expect in
             * registers: q8/q9 = first two round keys at 'rk', q14 = last round
             * key at rk + rounds * 16. Clobbers ip.
             */
163         add             ip, \rk, \rounds, lsl #4
164         vld1.32         {q8-q9}, [\rk]          @ load first 2 round keys
165         vld1.32         {q14}, [ip]             @ load last round key
166         .endm
168         /*
169          * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
170          *                    int blocks)
171          * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
172          *                    int blocks)
173          */
174 ENTRY(ce_aes_ecb_encrypt)
            /*
             * ECB encryption. r0 = out, r1 = in, r2 = round keys, r3 = rounds;
             * the fifth argument (blocks) is on the stack and is loaded into r4
             * from [sp, #8] once r4 and lr have been pushed.
             *
             * Blocks are processed four at a time while at least four remain,
             * then one at a time. r4 goes negative when fewer than four blocks
             * are left and is corrected by the adds at .Lecbenc1x.
             */
175         push            {r4, lr}
176         ldr             r4, [sp, #8]
177         prepare_key     r2, r3
178 .Lecbencloop4x:
179         subs            r4, r4, #4
180         bmi             .Lecbenc1x
181         vld1.8          {q0-q1}, [r1]!
182         vld1.8          {q2-q3}, [r1]!
183         bl              aes_encrypt_4x
184         vst1.8          {q0-q1}, [r0]!
185         vst1.8          {q2-q3}, [r0]!
186         b               .Lecbencloop4x
187 .Lecbenc1x:
188         adds            r4, r4, #4
189         beq             .Lecbencout
190 .Lecbencloop:
191         vld1.8          {q0}, [r1]!
192         bl              aes_encrypt
193         vst1.8          {q0}, [r0]!
194         subs            r4, r4, #1
195         bne             .Lecbencloop
196 .Lecbencout:
197         pop             {r4, pc}
198 ENDPROC(ce_aes_ecb_encrypt)
200 ENTRY(ce_aes_ecb_decrypt)
            /*
             * ECB decryption. r0 = out, r1 = in, r2 = round keys, r3 = rounds;
             * the fifth argument (blocks) is on the stack and is loaded into r4
             * from [sp, #8] once r4 and lr have been pushed.
             *
             * Blocks are processed four at a time while at least four remain,
             * then one at a time. r4 goes negative when fewer than four blocks
             * are left and is corrected by the adds at .Lecbdec1x.
             */
201         push            {r4, lr}
202         ldr             r4, [sp, #8]
203         prepare_key     r2, r3
204 .Lecbdecloop4x:
205         subs            r4, r4, #4
206         bmi             .Lecbdec1x
207         vld1.8          {q0-q1}, [r1]!
208         vld1.8          {q2-q3}, [r1]!
209         bl              aes_decrypt_4x
210         vst1.8          {q0-q1}, [r0]!
211         vst1.8          {q2-q3}, [r0]!
212         b               .Lecbdecloop4x
213 .Lecbdec1x:
214         adds            r4, r4, #4
215         beq             .Lecbdecout
216 .Lecbdecloop:
217         vld1.8          {q0}, [r1]!
218         bl              aes_decrypt
219         vst1.8          {q0}, [r0]!
220         subs            r4, r4, #1
221         bne             .Lecbdecloop
222 .Lecbdecout:
223         pop             {r4, pc}
224 ENDPROC(ce_aes_ecb_decrypt)
226         /*
227          * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
228          *                    int blocks, u8 iv[])
229          * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
230          *                    int blocks, u8 iv[])
231          */
232 ENTRY(ce_aes_cbc_encrypt)
            /*
             * CBC encryption. r0 = out, r1 = in, r2 = round keys, r3 = rounds;
             * after the 16-byte push, r4 = blocks and r5 = iv pointer are loaded
             * from [sp, #16] and [sp, #20]. r6 is saved but not used here.
             *
             * q0 carries the chaining value: it starts as the iv, is XORed with
             * each plaintext block, encrypted, stored, and reused as the
             * chaining value for the next block. CBC encryption is inherently
             * serial, so there is no 4x path. The final ciphertext block is
             * written back to the iv buffer.
             *
             * NOTE(review): the loop body runs before the count is tested, so
             * this presumably relies on the caller passing blocks >= 1.
             */
233         push            {r4-r6, lr}
234         ldrd            r4, r5, [sp, #16]
235         vld1.8          {q0}, [r5]
236         prepare_key     r2, r3
237 .Lcbcencloop:
238         vld1.8          {q1}, [r1]!             @ get next pt block
239         veor            q0, q0, q1              @ ..and xor with iv
240         bl              aes_encrypt
241         vst1.8          {q0}, [r0]!
242         subs            r4, r4, #1
243         bne             .Lcbcencloop
244         vst1.8          {q0}, [r5]
245         pop             {r4-r6, pc}
246 ENDPROC(ce_aes_cbc_encrypt)
248 ENTRY(ce_aes_cbc_decrypt)
            /*
             * CBC decryption. r0 = out, r1 = in, r2 = round keys, r3 = rounds,
             * r4 = blocks and r5 = iv pointer (both loaded from the stack).
             * q15 always holds the ciphertext block (initially the iv) that must
             * be XORed into the next decrypted block.
             *
             * 4x path: the four ciphertext blocks are copied to q4-q7 before
             * they are decrypted in place, because each one is needed again as
             * the chaining value of its successor; q7 becomes the new q15.
             *
             * 1x path: the original last round key is parked in q6 and q14 is
             * rewritten per block as (previous ciphertext XOR last round key).
             * The closing veor with q14 inside dec_fround then performs the
             * final AddRoundKey and the CBC XOR in a single instruction.
             */
249         push            {r4-r6, lr}
250         ldrd            r4, r5, [sp, #16]
251         vld1.8          {q15}, [r5]             @ keep iv in q15
252         prepare_key     r2, r3
253 .Lcbcdecloop4x:
254         subs            r4, r4, #4
255         bmi             .Lcbcdec1x
256         vld1.8          {q0-q1}, [r1]!
257         vld1.8          {q2-q3}, [r1]!
258         vmov            q4, q0
259         vmov            q5, q1
260         vmov            q6, q2
261         vmov            q7, q3
262         bl              aes_decrypt_4x
263         veor            q0, q0, q15
264         veor            q1, q1, q4
265         veor            q2, q2, q5
266         veor            q3, q3, q6
267         vmov            q15, q7
268         vst1.8          {q0-q1}, [r0]!
269         vst1.8          {q2-q3}, [r0]!
270         b               .Lcbcdecloop4x
271 .Lcbcdec1x:
272         adds            r4, r4, #4
273         beq             .Lcbcdecout
274         vmov            q6, q14                 @ preserve last round key
275 .Lcbcdecloop:
276         vld1.8          {q0}, [r1]!             @ get next ct block
277         veor            q14, q15, q6            @ combine prev ct with last key
278         vmov            q15, q0
279         bl              aes_decrypt
280         vst1.8          {q0}, [r0]!
281         subs            r4, r4, #1
282         bne             .Lcbcdecloop
283 .Lcbcdecout:
            /* write the last ciphertext block (held in q15) back as the next iv */
284         vst1.8          {q15}, [r5]             @ keep iv in q15
285         pop             {r4-r6, pc}
286 ENDPROC(ce_aes_cbc_decrypt)
289         /*
290          * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
291          *                        int rounds, int bytes, u8 const iv[])
292          * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
293          *                        int rounds, int bytes, u8 const iv[])
294          */
296 ENTRY(ce_aes_cbc_cts_encrypt)
            /*
             * CBC encryption of the last two blocks with ciphertext stealing.
             * r0 = out, r1 = in, r2 = round keys, r3 = rounds, r4 = bytes,
             * r5 = iv pointer. After the sub below, r4 = bytes - 16 = n, the
             * length of the final block.
             * NOTE(review): the table indexing presumably requires
             * 16 < bytes <= 32 - confirm against the caller.
             *
             * q5 = 16 bytes of the permute table starting at offset n, q6 = 16
             * bytes starting at offset 32 - n. The input is read with two
             * overlapping 16-byte loads: q0 = first block, q3 = the last 16
             * bytes of the input.
             */
297         push            {r4-r6, lr}
298         ldrd            r4, r5, [sp, #16]
300         movw            ip, :lower16:.Lcts_permute_table
301         movt            ip, :upper16:.Lcts_permute_table
302         sub             r4, r4, #16
303         add             lr, ip, #32
304         add             ip, ip, r4
305         sub             lr, lr, r4
306         vld1.8          {q5}, [ip]
307         vld1.8          {q6}, [lr]
309         add             ip, r1, r4
310         vld1.8          {q0}, [r1]                      @ overlapping loads
311         vld1.8          {q3}, [ip]
313         vld1.8          {q1}, [r5]                      @ get iv
314         prepare_key     r2, r3
316         veor            q0, q0, q1                      @ xor with iv
317         bl              aes_encrypt
            /*
             * q2 = first n bytes of the ciphertext just produced, moved to the
             *      top of the vector (this becomes the short final block);
             * q1 = the n bytes of the final plaintext block moved to the bottom
             *      of the vector and zero padded (vtbl yields 0 for the 0xff
             *      indices).
             */
319         vtbl.8          d4, {d0-d1}, d10
320         vtbl.8          d5, {d0-d1}, d11
321         vtbl.8          d2, {d6-d7}, d12
322         vtbl.8          d3, {d6-d7}, d13
324         veor            q0, q0, q1
325         bl              aes_encrypt
            /*
             * Store order matters: q2 is written at out + n first, then the
             * full block q0 at out overwrites the part of q2 that overlaps it.
             */
327         add             r4, r0, r4
328         vst1.8          {q2}, [r4]                      @ overlapping stores
329         vst1.8          {q0}, [r0]
331         pop             {r4-r6, pc}
332 ENDPROC(ce_aes_cbc_cts_encrypt)
334 ENTRY(ce_aes_cbc_cts_decrypt)
            /*
             * CBC decryption of the last two blocks with ciphertext stealing.
             * r0 = out, r1 = in, r2 = round keys, r3 = rounds, r4 = bytes,
             * r5 = iv pointer. After the sub below, r4 = bytes - 16 = n, the
             * length of the final block.
             * NOTE(review): the table indexing presumably requires
             * 16 < bytes <= 32 - confirm against the caller.
             *
             * q5/q6 are the permute vectors taken from the table at offsets n
             * and 32 - n. q0 = first 16 input bytes, q1 = last 16 input bytes
             * (overlapping loads), q3 = iv.
             */
335         push            {r4-r6, lr}
336         ldrd            r4, r5, [sp, #16]
338         movw            ip, :lower16:.Lcts_permute_table
339         movt            ip, :upper16:.Lcts_permute_table
340         sub             r4, r4, #16
341         add             lr, ip, #32
342         add             ip, ip, r4
343         sub             lr, lr, r4
344         vld1.8          {q5}, [ip]
345         vld1.8          {q6}, [lr]
347         add             ip, r1, r4
348         vld1.8          {q0}, [r1]                      @ overlapping loads
349         vld1.8          {q1}, [ip]
351         vld1.8          {q3}, [r5]                      @ get iv
352         prepare_key     r2, r3
354         bl              aes_decrypt
            /*
             * q2 = first n bytes of the decrypted block moved to the top of the
             *      vector, zero elsewhere (vtbl);
             * q0 = its first n bytes replaced by the final n input bytes, the
             *      remaining bytes kept as they are (vtbx leaves lanes with an
             *      out-of-range index untouched).
             */
356         vtbl.8          d4, {d0-d1}, d10
357         vtbl.8          d5, {d0-d1}, d11
358         vtbx.8          d0, {d2-d3}, d12
359         vtbx.8          d1, {d2-d3}, d13
361         veor            q1, q1, q2
362         bl              aes_decrypt
363         veor            q0, q0, q3                      @ xor with iv
            /*
             * Store order matters: q1 is written at out + n first, then the
             * full block q0 at out overwrites the part of q1 that overlaps it.
             */
365         add             r4, r0, r4
366         vst1.8          {q1}, [r4]                      @ overlapping stores
367         vst1.8          {q0}, [r0]
369         pop             {r4-r6, pc}
370 ENDPROC(ce_aes_cbc_cts_decrypt)
373         /*
374          * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
375          *                    int blocks, u8 ctr[])
376          */
377 ENTRY(ce_aes_ctr_encrypt)
            /*
             * CTR mode. r0 = out, r1 = in, r2 = round keys, r3 = rounds,
             * r4 = blocks and r5 = ctr pointer (both loaded from the stack).
             *
             *   q7 : the big-endian counter block
             *   r6 : the last 32-bit word of the counter in CPU byte order
             *
             * If adding 'blocks' to r6 would wrap the 32-bit word, every block
             * goes through the one-block loop, which propagates the carry into
             * the upper counter words (.Lctrcarry). Otherwise blocks are
             * processed four at a time first.
             *
             * In the one-block loop the key stream block is generated before
             * r4 is decremented. If r4 turns negative, no input is read and the
             * raw key stream is stored instead (.Lctrtailblock). That store
             * carries a :64 alignment hint, so 'out' must be 8-byte aligned in
             * that case.
             *
             * On return the counter buffer holds the next counter value.
             */
378         push            {r4-r6, lr}
379         ldrd            r4, r5, [sp, #16]
380         vld1.8          {q7}, [r5]              @ load ctr
381         prepare_key     r2, r3
382         vmov            r6, s31                 @ keep swabbed ctr in r6
383         rev             r6, r6
384         cmn             r6, r4                  @ 32 bit overflow?
385         bcs             .Lctrloop
386 .Lctrloop4x:
387         subs            r4, r4, #4
388         bmi             .Lctr1x
            /*
             * Build the four counter blocks ctr, ctr+1, ctr+2, ctr+3 in q0-q3.
             *
             * NOTE: Cortex-A57 (erratum 1742098) and Cortex-A72 (erratum
             * 1655431) can produce wrong results in AArch32 when an AES
             * instruction pair consumes a register whose most recent write
             * updated only a single 32-bit lane. Therefore the lanes of q0-q3
             * are never written individually: the counter word is bumped in q7
             * (which is not an AES operand) and q1-q3 receive full 128-bit
             * copies of it. q7 is set to ctr+4 after the call below.
             *
             * lr is free as a scratch register here: it was saved by the push
             * above and is overwritten by the bl anyway. None of these
             * instructions touch the flags.
             */
389         vmov            q0, q7                  @ block 0: ctr
390         add             ip, r6, #1
391         add             lr, r6, #2
392         rev             ip, ip
393         rev             lr, lr
394         vmov            s31, ip                 @ q7 = ctr + 1
395         vmov            q1, q7                  @ block 1
396         add             ip, r6, #3
397         vmov            s31, lr                 @ q7 = ctr + 2
398         rev             ip, ip
399         vmov            q2, q7                  @ block 2
400         vmov            s31, ip                 @ q7 = ctr + 3
401         add             r6, r6, #4
402         vmov            q3, q7                  @ block 3
403         vld1.8          {q4-q5}, [r1]!
404         vld1.8          {q6}, [r1]!
405         vld1.8          {q15}, [r1]!
406         bl              aes_encrypt_4x
407         veor            q0, q0, q4
408         veor            q1, q1, q5
409         veor            q2, q2, q6
410         veor            q3, q3, q15
411         rev             ip, r6
412         vst1.8          {q0-q1}, [r0]!
413         vst1.8          {q2-q3}, [r0]!
414         vmov            s31, ip                 @ q7 = ctr + 4
415         b               .Lctrloop4x
416 .Lctr1x:
417         adds            r4, r4, #4
418         beq             .Lctrout
419 .Lctrloop:
420         vmov            q0, q7                  @ full copy, see NOTE above
421         bl              aes_encrypt
423         adds            r6, r6, #1              @ increment BE ctr
424         rev             ip, r6
425         vmov            s31, ip
426         bcs             .Lctrcarry              @ flags still from the adds
428 .Lctrcarrydone:
429         subs            r4, r4, #1
430         bmi             .Lctrtailblock          @ blocks < 0 means tail block
431         vld1.8          {q3}, [r1]!
432         veor            q3, q0, q3
433         vst1.8          {q3}, [r0]!
434         bne             .Lctrloop               @ flags still from the subs
436 .Lctrout:
437         vst1.8          {q7}, [r5]              @ return next CTR value
438         pop             {r4-r6, pc}
440 .Lctrtailblock:
441         vst1.8          {q0}, [r0, :64]         @ return the key stream
442         b               .Lctrout
444 .Lctrcarry:
445         .irp            sreg, s30, s29, s28
446         vmov            ip, \sreg               @ load next word of ctr
447         rev             ip, ip                  @ ... to handle the carry
448         adds            ip, ip, #1
449         rev             ip, ip
450         vmov            \sreg, ip
451         bcc             .Lctrcarrydone
452         .endr
453         b               .Lctrcarrydone
454 ENDPROC(ce_aes_ctr_encrypt)
456         /*
457          * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
458          *                    int bytes, u8 iv[], u32 const rk2[], int first)
459          * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
460          *                    int bytes, u8 iv[], u32 const rk2[], int first)
461          */
463         .macro          next_tweak, out, in, const, tmp
            /*
             * out = in multiplied by x in GF(2^128), i.e. the next XTS tweak.
             * 'const' must hold { 1, 0x87 } as two 64-bit lanes (low, high),
             * which is what ce_aes_xts_init leaves in q15. 'tmp' is scratch.
             *
             * Each 64-bit half is doubled separately (vadd). The arithmetic
             * shift by 63 turns the top bit of each half into an all-ones or
             * all-zeroes mask, the vand keeps the matching constant, and the
             * vext swaps the two halves so that the bit shifted out of the low
             * half is carried into the high half as 1, while the bit shifted
             * out of the high half is folded into the low half as 0x87.
             */
464         vshr.s64        \tmp, \in, #63
465         vand            \tmp, \tmp, \const
466         vadd.u64        \out, \in, \in
467         vext.8          \tmp, \tmp, \tmp, #8
468         veor            \out, \out, \tmp
469         .endm
471 ce_aes_xts_init:
            /*
             * Prologue shared by ce_aes_xts_encrypt and ce_aes_xts_decrypt. It
             * is reached with bl right after the caller pushed {r4-r6, lr}, so
             * the stack offsets below assume exactly those 16 bytes on top of
             * the caller's stack arguments.
             *
             * Results: q15 = tweak mask { 1, 0x87 } for next_tweak, r4 = bytes,
             * r5 = iv pointer, q0 = iv.
             *
             * If 'first' is not 1, this returns with r6 = first and q0 = iv.
             * If 'first' is 1, r6 is replaced by the pointer to the second key
             * schedule, q8/q9/q14 are loaded from it and the iv in q0 is
             * encrypted by branching into aes_encrypt, which returns straight
             * to our caller. The caller must reload the first key afterwards.
             */
472         vmov.i32        d30, #0x87              @ compose tweak mask vector
473         vmovl.u32       q15, d30
474         vshr.u64        d30, d31, #7
476         ldrd            r4, r5, [sp, #16]       @ load args
477         ldr             r6, [sp, #28]
478         vld1.8          {q0}, [r5]              @ load iv
479         teq             r6, #1                  @ start of a block?
480         bxne            lr
482         @ Encrypt the IV in q0 with the second AES key. This should only
483         @ be done at the start of a block.
484         ldr             r6, [sp, #24]           @ load AES key 2
485         prepare_key     r6, r3
486         add             ip, r6, #32             @ 3rd round key of key 2
487         b               .Laes_encrypt_tweak     @ tail call
488 ENDPROC(ce_aes_xts_init)
490 ENTRY(ce_aes_xts_encrypt)
            /*
             * XTS encryption. r0 = out, r1 = in, r2 = first key schedule,
             * r3 = rounds; ce_aes_xts_init loads r4 = bytes, r5 = iv pointer,
             * q15 = tweak mask and leaves the starting tweak in q0.
             *
             *   q4        : tweak of the current block
             *   q5 - q7   : tweaks of blocks 2-4 in the 4x path
             *   r4        : bytes left (transiently negative, see below)
             *
             * After the init call r6 is nonzero if the tweak in q0 is to be used
             * as is, and zero if it must be advanced once before its first use.
             *
             * Input that is not a multiple of 16 bytes is finished with
             * ciphertext stealing (.Lxtsenccts). The tweak of the last block
             * processed is written back to the iv buffer on exit.
             */
491         push            {r4-r6, lr}
493         bl              ce_aes_xts_init         @ run shared prologue
494         prepare_key     r2, r3
495         vmov            q4, q0
497         teq             r6, #0                  @ start of a block?
498         bne             .Lxtsenc4x
500 .Lxtsencloop4x:
501         next_tweak      q4, q4, q15, q10
502 .Lxtsenc4x:
503         subs            r4, r4, #64
504         bmi             .Lxtsenc1x
505         vld1.8          {q0-q1}, [r1]!          @ get 4 pt blocks
506         vld1.8          {q2-q3}, [r1]!
507         next_tweak      q5, q4, q15, q10
508         veor            q0, q0, q4
509         next_tweak      q6, q5, q15, q10
510         veor            q1, q1, q5
511         next_tweak      q7, q6, q15, q10
512         veor            q2, q2, q6
513         veor            q3, q3, q7
514         bl              aes_encrypt_4x
515         veor            q0, q0, q4
516         veor            q1, q1, q5
517         veor            q2, q2, q6
518         veor            q3, q3, q7
519         vst1.8          {q0-q1}, [r0]!          @ write 4 ct blocks
520         vst1.8          {q2-q3}, [r0]!
521         vmov            q4, q7
522         teq             r4, #0
523         beq             .Lxtsencret
524         b               .Lxtsencloop4x
525 .Lxtsenc1x:
            /*
             * Fewer than 64 bytes remain. Undo the subtraction; if fewer than
             * 16 bytes are left after a 4x group, steal from the last block of
             * that group (.LxtsencctsNx).
             */
526         adds            r4, r4, #64
527         beq             .Lxtsencout
528         subs            r4, r4, #16
529         bmi             .LxtsencctsNx
530 .Lxtsencloop:
531         vld1.8          {q0}, [r1]!
532 .Lxtsencctsout:
533         veor            q0, q0, q4
534         bl              aes_encrypt
535         veor            q0, q0, q4
536         teq             r4, #0
537         beq             .Lxtsencout
538         subs            r4, r4, #16
539         next_tweak      q4, q4, q15, q6
            /* the flags of the subs above survive next_tweak (NEON only) */
540         bmi             .Lxtsenccts
541         vst1.8          {q0}, [r0]!
542         b               .Lxtsencloop
543 .Lxtsencout:
544         vst1.8          {q0}, [r0]
545 .Lxtsencret:
546         vst1.8          {q4}, [r5]
547         pop             {r4-r6, pc}
549 .LxtsencctsNx:
            /*
             * The block to steal from was already stored by the 4x path: take
             * it from q3 and step the output pointer back onto it.
             */
550         vmov            q0, q3
551         sub             r0, r0, #16
552 .Lxtsenccts:
            /*
             * Ciphertext stealing. Here r4 = n - 16, where n is the number of
             * bytes in the final partial block, and q0 is the ciphertext of the
             * last full block, which has not been stored at [r0] yet (or is
             * about to be overwritten there).
             */
553         movw            ip, :lower16:.Lcts_permute_table
554         movt            ip, :upper16:.Lcts_permute_table
556         add             r1, r1, r4              @ rewind input pointer
557         add             r4, r4, #16             @ # bytes in final block
558         add             lr, ip, #32
559         add             ip, ip, r4
560         sub             lr, lr, r4
561         add             r4, r0, r4              @ output address of final block
563         vld1.8          {q1}, [r1]              @ load final partial block
564         vld1.8          {q2}, [ip]
565         vld1.8          {q3}, [lr]
            /*
             * q2 = first n bytes of q0 moved to the top of the vector; stored
             *      at out + n this puts them into the final partial block.
             * q0 = its first n bytes replaced by the n trailing input bytes;
             *      this block is encrypted next and stored at [r0].
             */
567         vtbl.8          d4, {d0-d1}, d4
568         vtbl.8          d5, {d0-d1}, d5
569         vtbx.8          d0, {d2-d3}, d6
570         vtbx.8          d1, {d2-d3}, d7
572         vst1.8          {q2}, [r4]              @ overlapping stores
            /* r4 = 0 makes the code at .Lxtsencctsout finish after this block */
573         mov             r4, #0
574         b               .Lxtsencctsout
575 ENDPROC(ce_aes_xts_encrypt)
578 ENTRY(ce_aes_xts_decrypt)
            /*
             * XTS decryption. r0 = out, r1 = in, r2 = first key schedule,
             * r3 = rounds; ce_aes_xts_init loads r4 = bytes, r5 = iv pointer,
             * q15 = tweak mask and leaves the starting tweak in q0.
             *
             *   q4        : tweak of the current block
             *   q5 - q7   : tweaks of blocks 2-4 in the 4x path
             *   r4        : bytes left (transiently negative, see below)
             *
             * If the length is not a multiple of 16, one full block is held
             * back from the regular loops so that it can be handled together
             * with the partial tail in .Lxtsdeccts. There the two tweaks are
             * applied in swapped order: the held-back block is decrypted with
             * the following tweak (q5), and the recombined block with the
             * current one (q4).
             */
579         push            {r4-r6, lr}
581         bl              ce_aes_xts_init         @ run shared prologue
582         prepare_key     r2, r3
583         vmov            q4, q0
585         /* subtract 16 bytes if we are doing CTS */
586         tst             r4, #0xf
587         subne           r4, r4, #0x10
589         teq             r6, #0                  @ start of a block?
590         bne             .Lxtsdec4x
592 .Lxtsdecloop4x:
593         next_tweak      q4, q4, q15, q10
594 .Lxtsdec4x:
595         subs            r4, r4, #64
596         bmi             .Lxtsdec1x
597         vld1.8          {q0-q1}, [r1]!          @ get 4 ct blocks
598         vld1.8          {q2-q3}, [r1]!
599         next_tweak      q5, q4, q15, q10
600         veor            q0, q0, q4
601         next_tweak      q6, q5, q15, q10
602         veor            q1, q1, q5
603         next_tweak      q7, q6, q15, q10
604         veor            q2, q2, q6
605         veor            q3, q3, q7
606         bl              aes_decrypt_4x
607         veor            q0, q0, q4
608         veor            q1, q1, q5
609         veor            q2, q2, q6
610         veor            q3, q3, q7
611         vst1.8          {q0-q1}, [r0]!          @ write 4 pt blocks
612         vst1.8          {q2-q3}, [r0]!
613         vmov            q4, q7
614         teq             r4, #0
615         beq             .Lxtsdecout
616         b               .Lxtsdecloop4x
617 .Lxtsdec1x:
618         adds            r4, r4, #64
619         beq             .Lxtsdecout
620         subs            r4, r4, #16
621 .Lxtsdecloop:
            /*
             * The load below does not change the flags: the bmi still tests
             * the result of the preceding subs. A negative count means the
             * block just loaded is the one held back for ciphertext stealing.
             */
622         vld1.8          {q0}, [r1]!
623         bmi             .Lxtsdeccts
624 .Lxtsdecctsout:
625         veor            q0, q0, q4
626         bl              aes_decrypt
627         veor            q0, q0, q4
628         vst1.8          {q0}, [r0]!
629         teq             r4, #0
630         beq             .Lxtsdecout
631         subs            r4, r4, #16
632         next_tweak      q4, q4, q15, q6
633         b               .Lxtsdecloop
634 .Lxtsdecout:
635         vst1.8          {q4}, [r5]
636         pop             {r4-r6, pc}
638 .Lxtsdeccts:
            /*
             * Ciphertext stealing. Here r4 = n - 16, where n is the number of
             * bytes in the final partial block, and q0 is the last full
             * ciphertext block.
             */
639         movw            ip, :lower16:.Lcts_permute_table
640         movt            ip, :upper16:.Lcts_permute_table
642         add             r1, r1, r4              @ rewind input pointer
643         add             r4, r4, #16             @ # bytes in final block
644         add             lr, ip, #32
645         add             ip, ip, r4
646         sub             lr, lr, r4
647         add             r4, r0, r4              @ output address of final block
649         next_tweak      q5, q4, q15, q6
651         vld1.8          {q1}, [r1]              @ load final partial block
652         vld1.8          {q2}, [ip]
653         vld1.8          {q3}, [lr]
655         veor            q0, q0, q5
656         bl              aes_decrypt
657         veor            q0, q0, q5
            /*
             * q2 = first n bytes of the decrypted block moved to the top of the
             *      vector; stored at out + n this yields the final partial
             *      plaintext block.
             * q0 = its first n bytes replaced by the n trailing input bytes;
             *      this block is decrypted with q4 and stored at [r0].
             */
659         vtbl.8          d4, {d0-d1}, d4
660         vtbl.8          d5, {d0-d1}, d5
661         vtbx.8          d0, {d2-d3}, d6
662         vtbx.8          d1, {d2-d3}, d7
664         vst1.8          {q2}, [r4]              @ overlapping stores
            /* r4 = 0 makes the code at .Lxtsdecctsout finish after this block */
665         mov             r4, #0
666         b               .Lxtsdecctsout
667 ENDPROC(ce_aes_xts_decrypt)
669         /*
670          * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
671          *                             AES sbox substitution on each byte in
672          *                             'input'
673          */
674 ENTRY(ce_aes_sub)
            /*
             * r0 = input word; returns the substituted word in r0.
             *
             * The input is replicated into all four 32-bit lanes of q1 and used
             * as the round key of an AESE on an all-zero state, so the
             * AddRoundKey step simply yields the replicated input. Because all
             * four columns are identical, the ShiftRows step of AESE does not
             * change anything, and lane 0 holds the plain sbox output.
             */
675         vdup.32         q1, r0
676         veor            q0, q0, q0
677         aese.8          q0, q1
678         vmov            r0, s0
679         bx              lr
680 ENDPROC(ce_aes_sub)
682         /*
683          * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
684          *                                        operation on round key *src
685          */
686 ENTRY(ce_aes_invert)
            /*
             * r0 = dst, r1 = src. Loads the 16-byte round key at src, applies
             * AESIMC (InvMixColumns) to it and stores the result at dst.
             */
687         vld1.32         {q0}, [r1]
688         aesimc.8        q0, q0
689         vst1.32         {q0}, [r0]
690         bx              lr
691 ENDPROC(ce_aes_invert)
693         .section        ".rodata", "a"
694         .align          6
            /*
             * Index vectors for the ciphertext stealing code: 16 bytes of 0xff,
             * the identity permutation 0..15, and 16 more bytes of 0xff.
             * A 16-byte load at offset n (or 32 - n) gives a vtbl/vtbx index
             * vector that shifts a block up (or down) by 16 - n bytes. The 0xff
             * entries are out of range, so vtbl writes zero and vtbx leaves the
             * destination byte unchanged for them.
             */
695 .Lcts_permute_table:
696         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
697         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
698         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
699         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
700         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
701         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff