Merge tag 'xtensa-20180225' of git://github.com/jcmvbkbc/linux-xtensa
[cris-mirror.git] / arch / arm64 / crypto / aes-neon.S
blob1c7b45b7268e4677fe589830830b08732d6cf138
1 /*
2  * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
3  *
4  * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
11 #include <linux/linkage.h>
12 #include <asm/assembler.h>
/*
 * Symbol wrappers: paste a "neon_" prefix onto func and hand the result to
 * ENTRY() / ENDPROC(). Neither of those is defined in this file; presumably
 * they come from <linux/linkage.h> - confirm. The users of AES_ENTRY and
 * AES_ENDPROC are not visible here either (presumably the aes-modes.S body
 * included further down).
 */
14 #define AES_ENTRY(func)         ENTRY(neon_ ## func)
15 #define AES_ENDPROC(func)       ENDPROC(neon_ ## func)
17         /*
            * multiply by polynomial 'x' in GF(2^8)
            *
            * out, in, temp and const are complete vector operands, arrangement
            * suffix included (mix_columns passes .16b operands). const must
            * hold the reduction constant in every byte; the caller in this
            * file passes v12, which prepare fills with 0x1b.
            * temp is clobbered.
            */
18         .macro          mul_by_x, out, in, temp, const
19         sshr            \temp, \in, #7          /* per byte: 0xff if top bit set, else 0 */
20         shl             \out, \in, #1           /* times x; the top bit drops out */
21         and             \temp, \temp, \const    /* reduction term for the bytes that overflowed */
22         eor             \out, \out, \temp       /* fold the reduction back in */
23         .endm
25         /*
            * multiply by polynomial 'x^2' in GF(2^8)
            *
            * out, in, temp and const are complete vector operands, arrangement
            * suffix included (mix_columns passes .16b operands). const must
            * hold the reduction constant in every byte; the caller in this
            * file passes v12, which prepare fills with 0x1b.
            * temp is clobbered.
            */
26         .macro          mul_by_x2, out, in, temp, const
27         ushr            \temp, \in, #6          /* the two bits that the shift below pushes out */
28         shl             \out, \in, #2           /* times x^2, unreduced */
29         pmul            \temp, \temp, \const    /* polynomial multiply: reduction term for those bits */
30         eor             \out, \out, \temp       /* fold the reduction back in */
31         .endm
33         /*
            * preload the entire Sbox
            *
            * sbox:      label of the 256-byte lookup table to load
            * shiftrows: label of the 16-byte ShiftRows permutation to load
            * temp:      general purpose register; handed to ldr_l as its third
            *            operand and then used as the table address. It is left
            *            pointing 192 bytes past sbox (three post-increments
            *            of 64, none on the last load).
            *
            * ldr_l and adr_l are not defined in this file (presumably they
            * come from <asm/assembler.h>).
            *
            * Register state established here and relied on by the other
            * macros in this file:
            *   v12      0x1b in every byte (constant for the GF(2^8) multiplies)
            *   v13      ShiftRows permutation, used as a tbl index vector
            *   v14      .Lror32by8 permutation, used by the MixColumns macros
            *   v16-v31  the 256 table bytes, 64 per group of four registers
            */
34         .macro          prepare, sbox, shiftrows, temp
35         movi            v12.16b, #0x1b
36         ldr_l           q13, \shiftrows, \temp
37         ldr_l           q14, .Lror32by8, \temp
38         adr_l           \temp, \sbox
39         ld1             {v16.16b-v19.16b}, [\temp], #64 /* table bytes 0x00-0x3f */
40         ld1             {v20.16b-v23.16b}, [\temp], #64 /* table bytes 0x40-0x7f */
41         ld1             {v24.16b-v27.16b}, [\temp], #64 /* table bytes 0x80-0xbf */
42         ld1             {v28.16b-v31.16b}, [\temp]      /* table bytes 0xc0-0xff */
43         .endm
45         /*
            * do preload for encryption
            *
            * Invokes prepare with the forward Sbox and the forward ShiftRows
            * permutation. temp is passed through as the scratch general
            * purpose register. ignore0 and ignore1 are accepted but unused;
            * presumably they keep the argument list compatible with other
            * AES backends that share the same callers - TODO confirm.
            */
46         .macro          enc_prepare, ignore0, ignore1, temp
47         prepare         .LForward_Sbox, .LForward_ShiftRows, \temp
48         .endm
50         .macro          enc_switch_key, ignore0, ignore1, temp
51         /*
            * do nothing: this macro expands to no instructions and all three
            * arguments are ignored. Presumably it exists because the shared
            * callers invoke it for every AES backend - TODO confirm.
            */
52         .endm
54         /*
            * do preload for decryption
            *
            * Invokes prepare with the reverse Sbox and the reverse ShiftRows
            * permutation. temp is passed through as the scratch general
            * purpose register. ignore0 and ignore1 are accepted but unused;
            * presumably they keep the argument list compatible with other
            * AES backends that share the same callers - TODO confirm.
            */
55         .macro          dec_prepare, ignore0, ignore1, temp
56         prepare         .LReverse_Sbox, .LReverse_ShiftRows, \temp
57         .endm
59         /*
            * apply SubBytes transformation using the preloaded Sbox
            *
            * in: bare vector register name without arrangement suffix (the
            *     macro appends .16b); every byte is replaced by its table entry
            *
            * Expects v15 to hold 0x40 in every byte (do_block sets this up
            * right before invoking this macro) and v16-v31 to hold the table
            * loaded by prepare.
            *
            * A tbl/tbx over four registers only covers indices 0-63, so the
            * lookup is done in four steps, rebasing the index by 0x40 for each
            * next quarter of the table. tbl writes zero for an out-of-range
            * index and tbx leaves the destination byte untouched; since the
            * byte subtraction wraps around, exactly one of the four steps sees
            * an in-range index for any given byte.
            *
            * Clobbers v9-v11.
            */
60         .macro          sub_bytes, in
61         sub             v9.16b, \in\().16b, v15.16b             /* index - 0x40 */
62         tbl             \in\().16b, {v16.16b-v19.16b}, \in\().16b
63         sub             v10.16b, v9.16b, v15.16b                /* index - 0x80 */
64         tbx             \in\().16b, {v20.16b-v23.16b}, v9.16b
65         sub             v11.16b, v10.16b, v15.16b               /* index - 0xc0 */
66         tbx             \in\().16b, {v24.16b-v27.16b}, v10.16b
67         tbx             \in\().16b, {v28.16b-v31.16b}, v11.16b
68         .endm
70         /*
            * apply MixColumns transformation
            *
            * in:  bare vector register name without arrangement suffix; holds
            *      the state and is updated in place
            * enc: assemble-time flag. When it is 0 the additional
            *      pre-multiplication step for Inverse MixColumns is emitted
            *      ahead of the common code (decrypt_block passes 0,
            *      encrypt_block passes 1).
            *
            * Uses v12 (0x1b in every byte) and v14 (the .Lror32by8 permutation)
            * as loaded by prepare. Clobbers v8 and v9.
            */
71         .macro          mix_columns, in, enc
72         .if             \enc == 0
73         /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
74         mul_by_x2       v8.16b, \in\().16b, v9.16b, v12.16b
75         eor             \in\().16b, \in\().16b, v8.16b
76         rev32           v8.8h, v8.8h                    /* swap the halfwords of each 32-bit word */
77         eor             \in\().16b, \in\().16b, v8.16b
78         .endif
80         mul_by_x        v9.16b, \in\().16b, v8.16b, v12.16b
81         rev32           v8.8h, \in\().8h                /* swap the halfwords of each 32-bit word */
82         eor             v8.16b, v8.16b, v9.16b
83         eor             \in\().16b, \in\().16b, v8.16b
84         tbl             \in\().16b, {\in\().16b}, v14.16b       /* byte permutation from .Lror32by8 */
85         eor             \in\().16b, \in\().16b, v8.16b
86         .endm
88         .macro          do_block, enc, in, rounds, rk, rkp, i
           /*
            * Run a single AES state through all rounds.
            *
            * enc:    forwarded to mix_columns (encrypt_block passes 1,
            *         decrypt_block passes 0); the tables loaded beforehand by
            *         enc_prepare/dec_prepare select the actual direction
            * in:     bare vector register name holding the state; updated in
            *         place
            * rounds: round count, copied into i; rounds + 1 round keys of
            *         16 bytes each are read
            * rk:     address of the first round key; not written
            * rkp:    scratch register, walks over the remaining round keys
            * i:      scratch register, counts the rounds down to zero
            *
            * v15 alternates between the current round key and the 0x40
            * constant that sub_bytes requires. Clobbers v8-v11, v15, rkp, i
            * and the condition flags.
            */
89         ld1             {v15.4s}, [\rk]
90         add             \rkp, \rk, #16
91         mov             \i, \rounds
92 1111:   eor             \in\().16b, \in\().16b, v15.16b         /* ^round key */
93         movi            v15.16b, #0x40                          /* key consumed; v15 now feeds sub_bytes */
94         tbl             \in\().16b, {\in\().16b}, v13.16b       /* ShiftRows */
95         sub_bytes       \in
96         subs            \i, \i, #1                              /* sets Z after the last round */
97         ld1             {v15.4s}, [\rkp], #16                   /* next round key */
98         beq             2222f                                   /* the last round skips MixColumns */
99         mix_columns     \in, \enc
100         b               1111b
101 2222:   eor             \in\().16b, \in\().16b, v15.16b         /* ^round key */
102         .endm
104         .macro          encrypt_block, in, rounds, rk, rkp, i
            /*
             * Single-state variant for encryption: expands do_block with
             * enc = 1. All other operands are forwarded unchanged.
             */
105         do_block        1, \in, \rounds, \rk, \rkp, \i
106         .endm
108         .macro          decrypt_block, in, rounds, rk, rkp, i
            /*
             * Single-state variant for decryption: expands do_block with
             * enc = 0. All other operands are forwarded unchanged.
             */
109         do_block        0, \in, \rounds, \rk, \rkp, \i
110         .endm
112         /*
113          * Interleaved versions: functionally equivalent to the
114          * ones above, but applied to 2 or 4 AES states in parallel.
115          */
117         .macro          sub_bytes_2x, in0, in1
            /*
             * SubBytes on two states, with the two instruction streams
             * interleaved.
             *
             * in0, in1: bare vector register names without arrangement suffix;
             *           updated in place
             *
             * Expects v15 to hold 0x40 in every byte and v16-v31 to hold the
             * lookup table. Each four-register tbl/tbx covers 64 table bytes;
             * the index is rebased by 0x40 before each following quarter.
             * v8/v9 carry index - 0x40, v10/v11 carry index - 0x80, and v8/v9
             * are then reused for index - 0xc0.
             *
             * Clobbers v8-v11.
             */
118         sub             v8.16b, \in0\().16b, v15.16b
119         tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
120         sub             v9.16b, \in1\().16b, v15.16b
121         tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
122         sub             v10.16b, v8.16b, v15.16b
123         tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
124         sub             v11.16b, v9.16b, v15.16b
125         tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
126         sub             v8.16b, v10.16b, v15.16b
127         tbx             \in0\().16b, {v24.16b-v27.16b}, v10.16b
128         sub             v9.16b, v11.16b, v15.16b
129         tbx             \in1\().16b, {v24.16b-v27.16b}, v11.16b
130         tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
131         tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
132         .endm
134         .macro          sub_bytes_4x, in0, in1, in2, in3
            /*
             * SubBytes on four states, with the four instruction streams
             * interleaved.
             *
             * in0-in3: bare vector register names without arrangement suffix;
             *          updated in place
             *
             * Expects v15 to hold 0x40 in every byte and v16-v31 to hold the
             * lookup table. v8-v11 hold the rebased index of in0-in3
             * respectively; each one is decremented by 0x40 in place after it
             * has been used for one quarter of the table and before it is
             * used for the next.
             *
             * Clobbers v8-v11.
             */
135         sub             v8.16b, \in0\().16b, v15.16b
136         tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
137         sub             v9.16b, \in1\().16b, v15.16b
138         tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
139         sub             v10.16b, \in2\().16b, v15.16b
140         tbl             \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
141         sub             v11.16b, \in3\().16b, v15.16b
142         tbl             \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
143         tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
144         tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
145         sub             v8.16b, v8.16b, v15.16b
146         tbx             \in2\().16b, {v20.16b-v23.16b}, v10.16b
147         sub             v9.16b, v9.16b, v15.16b
148         tbx             \in3\().16b, {v20.16b-v23.16b}, v11.16b
149         sub             v10.16b, v10.16b, v15.16b
150         tbx             \in0\().16b, {v24.16b-v27.16b}, v8.16b
151         sub             v11.16b, v11.16b, v15.16b
152         tbx             \in1\().16b, {v24.16b-v27.16b}, v9.16b
153         sub             v8.16b, v8.16b, v15.16b
154         tbx             \in2\().16b, {v24.16b-v27.16b}, v10.16b
155         sub             v9.16b, v9.16b, v15.16b
156         tbx             \in3\().16b, {v24.16b-v27.16b}, v11.16b
157         sub             v10.16b, v10.16b, v15.16b
158         tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
159         sub             v11.16b, v11.16b, v15.16b
160         tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
161         tbx             \in2\().16b, {v28.16b-v31.16b}, v10.16b
162         tbx             \in3\().16b, {v28.16b-v31.16b}, v11.16b
163         .endm
165         .macro          mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
            /*
             * Multiply two vectors by 'x' in GF(2^8), byte by byte, with the
             * two instruction streams interleaved: out0 = in0 * x and
             * out1 = in1 * x.
             *
             * All operands are bare vector register names; the .16b suffix is
             * appended here. const must hold the reduction constant in every
             * byte (mix_columns_2x passes v12). tmp0 and tmp1 are clobbered.
             *
             * Per byte: sshr #7 produces 0xff where the top bit is set, the
             * and selects the reduction constant for those bytes, and the eor
             * folds it into the shifted value.
             */
166         sshr            \tmp0\().16b, \in0\().16b, #7
167         shl             \out0\().16b, \in0\().16b, #1
168         sshr            \tmp1\().16b, \in1\().16b, #7
169         and             \tmp0\().16b, \tmp0\().16b, \const\().16b
170         shl             \out1\().16b, \in1\().16b, #1
171         and             \tmp1\().16b, \tmp1\().16b, \const\().16b
172         eor             \out0\().16b, \out0\().16b, \tmp0\().16b
173         eor             \out1\().16b, \out1\().16b, \tmp1\().16b
174         .endm
176         .macro          mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
            /*
             * Multiply two vectors by 'x^2' in GF(2^8), byte by byte, with
             * the two instruction streams interleaved: out0 = in0 * x^2 and
             * out1 = in1 * x^2.
             *
             * All operands are bare vector register names; the .16b suffix is
             * appended here. const must hold the reduction constant in every
             * byte (mix_columns_2x passes v12). tmp0 and tmp1 are clobbered.
             *
             * Per byte: ushr #6 isolates the two bits that shl #2 pushes out,
             * pmul (polynomial multiply) turns them into the reduction term,
             * and the eor folds it into the shifted value.
             */
177         ushr            \tmp0\().16b, \in0\().16b, #6
178         shl             \out0\().16b, \in0\().16b, #2
179         ushr            \tmp1\().16b, \in1\().16b, #6
180         pmul            \tmp0\().16b, \tmp0\().16b, \const\().16b
181         shl             \out1\().16b, \in1\().16b, #2
182         pmul            \tmp1\().16b, \tmp1\().16b, \const\().16b
183         eor             \out0\().16b, \out0\().16b, \tmp0\().16b
184         eor             \out1\().16b, \out1\().16b, \tmp1\().16b
185         .endm
187         .macro          mix_columns_2x, in0, in1, enc
            /*
             * MixColumns on two states, with the two instruction streams
             * interleaved.
             *
             * in0, in1: bare vector register names without arrangement suffix;
             *           updated in place
             * enc:      assemble-time flag. When it is 0 the additional
             *           pre-multiplication step for Inverse MixColumns is
             *           emitted ahead of the common code.
             *
             * Uses v12 as the constant for the GF(2^8) multiplies and v14 as
             * the tbl permutation (prepare loads 0x1b and .Lror32by8 into
             * them). Clobbers v8-v11.
             */
188         .if             \enc == 0
189         /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
190         mul_by_x2_2x    v8, v9, \in0, \in1, v10, v11, v12
191         eor             \in0\().16b, \in0\().16b, v8.16b
192         rev32           v8.8h, v8.8h                    /* swap the halfwords of each 32-bit word */
193         eor             \in1\().16b, \in1\().16b, v9.16b
194         rev32           v9.8h, v9.8h
195         eor             \in0\().16b, \in0\().16b, v8.16b
196         eor             \in1\().16b, \in1\().16b, v9.16b
197         .endif
199         mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v12
200         rev32           v10.8h, \in0\().8h              /* swap the halfwords of each 32-bit word */
201         rev32           v11.8h, \in1\().8h
202         eor             v10.16b, v10.16b, v8.16b
203         eor             v11.16b, v11.16b, v9.16b
204         eor             \in0\().16b, \in0\().16b, v10.16b
205         eor             \in1\().16b, \in1\().16b, v11.16b
206         tbl             \in0\().16b, {\in0\().16b}, v14.16b     /* byte permutation held in v14 */
207         tbl             \in1\().16b, {\in1\().16b}, v14.16b
208         eor             \in0\().16b, \in0\().16b, v10.16b
209         eor             \in1\().16b, \in1\().16b, v11.16b
210         .endm
212         .macro          do_block_2x, enc, in0, in1, rounds, rk, rkp, i
            /*
             * Run two AES states through all rounds with the same round keys.
             *
             * enc:      forwarded to mix_columns_2x (encrypt_block2x passes 1,
             *           decrypt_block2x passes 0)
             * in0, in1: bare vector register names holding the states; updated
             *           in place
             * rounds:   round count, copied into i; rounds + 1 round keys of
             *           16 bytes each are read
             * rk:       address of the first round key; not written
             * rkp:      scratch register, walks over the remaining round keys
             * i:        scratch register, counts the rounds down to zero
             *
             * v15 alternates between the current round key and the 0x40
             * constant that sub_bytes_2x requires. Clobbers v8-v11, v15, rkp,
             * i and the condition flags.
             */
213         ld1             {v15.4s}, [\rk]
214         add             \rkp, \rk, #16
215         mov             \i, \rounds
216 1111:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
217         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
218         movi            v15.16b, #0x40                          /* key consumed; v15 now feeds sub_bytes_2x */
219         tbl             \in0\().16b, {\in0\().16b}, v13.16b     /* ShiftRows */
220         tbl             \in1\().16b, {\in1\().16b}, v13.16b     /* ShiftRows */
221         sub_bytes_2x    \in0, \in1
222         subs            \i, \i, #1                              /* sets Z after the last round */
223         ld1             {v15.4s}, [\rkp], #16                   /* next round key */
224         beq             2222f                                   /* the last round skips MixColumns */
225         mix_columns_2x  \in0, \in1, \enc
226         b               1111b
227 2222:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
228         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
229         .endm
231         .macro          do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
            /*
             * Run four AES states through all rounds with the same round keys.
             *
             * enc:     forwarded to mix_columns_2x (encrypt_block4x passes 1,
             *          decrypt_block4x passes 0)
             * in0-in3: bare vector register names holding the states; updated
             *          in place
             * rounds:  round count, copied into i; rounds + 1 round keys of
             *          16 bytes each are read
             * rk:      address of the first round key; not written
             * rkp:     scratch register, walks over the remaining round keys
             * i:       scratch register, counts the rounds down to zero
             *
             * MixColumns is done pairwise: mix_columns_2x is expanded once
             * for in0/in1 and once for in2/in3. v15 alternates between the
             * current round key and the 0x40 constant that sub_bytes_4x
             * requires. Clobbers v8-v11, v15, rkp, i and the condition flags.
             */
232         ld1             {v15.4s}, [\rk]
233         add             \rkp, \rk, #16
234         mov             \i, \rounds
235 1111:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
236         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
237         eor             \in2\().16b, \in2\().16b, v15.16b       /* ^round key */
238         eor             \in3\().16b, \in3\().16b, v15.16b       /* ^round key */
239         movi            v15.16b, #0x40                          /* key consumed; v15 now feeds sub_bytes_4x */
240         tbl             \in0\().16b, {\in0\().16b}, v13.16b     /* ShiftRows */
241         tbl             \in1\().16b, {\in1\().16b}, v13.16b     /* ShiftRows */
242         tbl             \in2\().16b, {\in2\().16b}, v13.16b     /* ShiftRows */
243         tbl             \in3\().16b, {\in3\().16b}, v13.16b     /* ShiftRows */
244         sub_bytes_4x    \in0, \in1, \in2, \in3
245         subs            \i, \i, #1                              /* sets Z after the last round */
246         ld1             {v15.4s}, [\rkp], #16                   /* next round key */
247         beq             2222f                                   /* the last round skips MixColumns */
248         mix_columns_2x  \in0, \in1, \enc
249         mix_columns_2x  \in2, \in3, \enc
250         b               1111b
251 2222:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
252         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
253         eor             \in2\().16b, \in2\().16b, v15.16b       /* ^round key */
254         eor             \in3\().16b, \in3\().16b, v15.16b       /* ^round key */
255         .endm
257         .macro          encrypt_block2x, in0, in1, rounds, rk, rkp, i
            /*
             * Two-state variant for encryption: expands do_block_2x with
             * enc = 1. All other operands are forwarded unchanged.
             */
258         do_block_2x     1, \in0, \in1, \rounds, \rk, \rkp, \i
259         .endm
261         .macro          decrypt_block2x, in0, in1, rounds, rk, rkp, i
            /*
             * Two-state variant for decryption: expands do_block_2x with
             * enc = 0. All other operands are forwarded unchanged.
             */
262         do_block_2x     0, \in0, \in1, \rounds, \rk, \rkp, \i
263         .endm
265         .macro          encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
            /*
             * Four-state variant for encryption: expands do_block_4x with
             * enc = 1. All other operands are forwarded unchanged.
             */
266         do_block_4x     1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
267         .endm
269         .macro          decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
            /*
             * Four-state variant for decryption: expands do_block_4x with
             * enc = 0. All other operands are forwarded unchanged.
             */
270         do_block_4x     0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
271         .endm
273 #include "aes-modes.S"
275         .section        ".rodata", "a"
276         .align          6
            /*
             * Forward substitution table: 256 bytes, where the byte at
             * offset n is the substitute for input value n. enc_prepare hands
             * this label to prepare, which loads all of it into v16-v31.
             *
             * The .align 6 above requests a 2^6 = 64-byte boundary (on this
             * target the operand of .align is a power-of-two exponent). The
             * tables that follow are laid out back to back after this one.
             */
277 .LForward_Sbox:
278         .byte           0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
279         .byte           0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
280         .byte           0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
281         .byte           0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
282         .byte           0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
283         .byte           0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
284         .byte           0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
285         .byte           0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
286         .byte           0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
287         .byte           0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
288         .byte           0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
289         .byte           0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
290         .byte           0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
291         .byte           0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
292         .byte           0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
293         .byte           0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
294         .byte           0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
295         .byte           0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
296         .byte           0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
297         .byte           0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
298         .byte           0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
299         .byte           0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
300         .byte           0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
301         .byte           0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
302         .byte           0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
303         .byte           0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
304         .byte           0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
305         .byte           0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
306         .byte           0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
307         .byte           0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
308         .byte           0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
309         .byte           0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
311 .LReverse_Sbox:
            /*
             * Inverse substitution table: 256 bytes, where the byte at
             * offset n is the inverse substitute for input value n, e.g.
             * entry 0x63 is 0x00, mirroring entry 0x00 = 0x63 of the
             * forward table. dec_prepare hands this label to prepare, which
             * loads all of it into v16-v31.
             */
312         .byte           0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
313         .byte           0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
314         .byte           0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
315         .byte           0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
316         .byte           0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
317         .byte           0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
318         .byte           0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
319         .byte           0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
320         .byte           0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
321         .byte           0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
322         .byte           0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
323         .byte           0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
324         .byte           0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
325         .byte           0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
326         .byte           0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
327         .byte           0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
328         .byte           0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
329         .byte           0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
330         .byte           0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
331         .byte           0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
332         .byte           0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
333         .byte           0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
334         .byte           0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
335         .byte           0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
336         .byte           0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
337         .byte           0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
338         .byte           0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
339         .byte           0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
340         .byte           0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
341         .byte           0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
342         .byte           0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
343         .byte           0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
345 .LForward_ShiftRows:
            /*
             * Byte permutations used as tbl index vectors. prepare loads each
             * one into a q register, so the least significant byte of the
             * literal (its rightmost two hex digits) is the source index for
             * byte 0 of the result, the next byte up is the index for byte 1,
             * and so on.
             *
             * ShiftRows for encryption (loaded into v13 by enc_prepare):
             * result bytes 0..15 take source bytes 0, 5, 10, 15, 4, 9, 14, 3,
             * 8, 13, 2, 7, 12, 1, 6, 11.
             */
346         .octa           0x0b06010c07020d08030e09040f0a0500
348 .LReverse_ShiftRows:
            /*
             * ShiftRows for decryption (loaded into v13 by dec_prepare):
             * result bytes 0..15 take source bytes 0, 13, 10, 7, 4, 1, 14, 11,
             * 8, 5, 2, 15, 12, 9, 6, 3.
             */
349         .octa           0x0306090c0f0205080b0e0104070a0d00
351 .Lror32by8:
            /*
             * Loaded into v14 by prepare. Within every group of four bytes
             * the result takes source bytes 1, 2, 3, 0, which rotates each
             * 32-bit word (least significant byte first) right by 8 bits.
             */
352         .octa           0x0c0f0e0d080b0a090407060500030201