arch/arm/crypto/speck-neon-core.S
// SPDX-License-Identifier: GPL-2.0
/*
 * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

        .text
        .fpu            neon

        // arguments
        ROUND_KEYS      .req    r0      // const {u64,u32} *round_keys
        NROUNDS         .req    r1      // int nrounds
        DST             .req    r2      // void *dst
        SRC             .req    r3      // const void *src
        NBYTES          .req    r4      // unsigned int nbytes
        TWEAK           .req    r5      // void *tweak

        // registers which hold the data being encrypted/decrypted
        X0              .req    q0
        X0_L            .req    d0
        X0_H            .req    d1
        Y0              .req    q1
        Y0_H            .req    d3
        X1              .req    q2
        X1_L            .req    d4
        X1_H            .req    d5
        Y1              .req    q3
        Y1_H            .req    d7
        X2              .req    q4
        X2_L            .req    d8
        X2_H            .req    d9
        Y2              .req    q5
        Y2_H            .req    d11
        X3              .req    q6
        X3_L            .req    d12
        X3_H            .req    d13
        Y3              .req    q7
        Y3_H            .req    d15

        // the round key, duplicated in all lanes
        ROUND_KEY       .req    q8
        ROUND_KEY_L     .req    d16
        ROUND_KEY_H     .req    d17

        // index vector for vtbl-based 8-bit rotates
        ROTATE_TABLE    .req    d18

        // multiplication table for updating XTS tweaks
        GF128MUL_TABLE  .req    d19
        GF64MUL_TABLE   .req    d19

        // current XTS tweak value(s)
        TWEAKV          .req    q10
        TWEAKV_L        .req    d20
        TWEAKV_H        .req    d21

        TMP0            .req    q12
        TMP0_L          .req    d24
        TMP0_H          .req    d25
        TMP1            .req    q13
        TMP2            .req    q14
        TMP3            .req    q15

        .align          4
.Lror64_8_table:
        .byte           1, 2, 3, 4, 5, 6, 7, 0
.Lror32_8_table:
        .byte           1, 2, 3, 0, 5, 6, 7, 4
.Lrol64_8_table:
        .byte           7, 0, 1, 2, 3, 4, 5, 6
.Lrol32_8_table:
        .byte           3, 0, 1, 2, 7, 4, 5, 6
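
/*
 * Explanatory note (not part of the original file): vtbl.8 computes
 * dest[i] = src[table[i]] for each byte i, so with the little-endian lane
 * layout, .Lror64_8_table (1, 2, ..., 7, 0) moves byte i+1 into byte i,
 * i.e. it rotates a 64-bit lane right by 8 bits.  .Lror32_8_table does the
 * same independently for each of the two 32-bit lanes in a d-register, and
 * the .Lrol*_8_table variants rotate left by 8 bits instead.
 */
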
.Lgf128mul_table:
        .byte           0, 0x87
        .fill           14
.Lgf64mul_table:
        .byte           0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
        .fill           12
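
/*
 * Explanatory note (not part of the original file): these tables map a
 * carry value, looked up byte-wise with vtbl, to its reduction modulo the
 * XTS tweak polynomial:
 *
 *	.Lgf128mul_table[c] = c * 0x87   // x^128 reduces to x^7+x^2+x+1 = 0x87; c is 0 or 1
 *	.Lgf64mul_table[c]  = c * 0x1b   // x^64 reduces to x^4+x^3+x+1 = 0x1b; c is 0..3
 *
 * where '*' is carry-less (GF(2) polynomial) multiplication.
 */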

/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 *
 * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
 * the vtbl approach is faster on some processors and the same speed on others.
 */
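
/*
 * For reference (explanatory sketch, not part of the original file), the
 * scalar form of the round computed below on each n-bit lane pair (x, y),
 * matching the step comments in the macro, is:
 *
 *	x = ror(x, 8);  x += y;  x ^= k;
 *	y = rol(y, 3);  y ^= x;
 */
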
.macro _speck_round_128bytes    n

        // x = ror(x, 8)
        vtbl.8          X0_L, {X0_L}, ROTATE_TABLE
        vtbl.8          X0_H, {X0_H}, ROTATE_TABLE
        vtbl.8          X1_L, {X1_L}, ROTATE_TABLE
        vtbl.8          X1_H, {X1_H}, ROTATE_TABLE
        vtbl.8          X2_L, {X2_L}, ROTATE_TABLE
        vtbl.8          X2_H, {X2_H}, ROTATE_TABLE
        vtbl.8          X3_L, {X3_L}, ROTATE_TABLE
        vtbl.8          X3_H, {X3_H}, ROTATE_TABLE

        // x += y
        vadd.u\n        X0, Y0
        vadd.u\n        X1, Y1
        vadd.u\n        X2, Y2
        vadd.u\n        X3, Y3

        // x ^= k
        veor            X0, ROUND_KEY
        veor            X1, ROUND_KEY
        veor            X2, ROUND_KEY
        veor            X3, ROUND_KEY

        // y = rol(y, 3)
        vshl.u\n        TMP0, Y0, #3
        vshl.u\n        TMP1, Y1, #3
        vshl.u\n        TMP2, Y2, #3
        vshl.u\n        TMP3, Y3, #3
        vsri.u\n        TMP0, Y0, #(\n - 3)
        vsri.u\n        TMP1, Y1, #(\n - 3)
        vsri.u\n        TMP2, Y2, #(\n - 3)
        vsri.u\n        TMP3, Y3, #(\n - 3)

        // y ^= x
        veor            Y0, TMP0, X0
        veor            Y1, TMP1, X1
        veor            Y2, TMP2, X2
        veor            Y3, TMP3, X3
.endm

/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes().
 */
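
/*
 * For reference (explanatory sketch, not part of the original file), the
 * scalar form of the inverse round computed below is:
 *
 *	y ^= x;  y = ror(y, 3);
 *	x ^= k;  x -= y;  x = rol(x, 8);
 */
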
.macro _speck_unround_128bytes  n

        // y ^= x
        veor            TMP0, Y0, X0
        veor            TMP1, Y1, X1
        veor            TMP2, Y2, X2
        veor            TMP3, Y3, X3

        // y = ror(y, 3)
        vshr.u\n        Y0, TMP0, #3
        vshr.u\n        Y1, TMP1, #3
        vshr.u\n        Y2, TMP2, #3
        vshr.u\n        Y3, TMP3, #3
        vsli.u\n        Y0, TMP0, #(\n - 3)
        vsli.u\n        Y1, TMP1, #(\n - 3)
        vsli.u\n        Y2, TMP2, #(\n - 3)
        vsli.u\n        Y3, TMP3, #(\n - 3)

        // x ^= k
        veor            X0, ROUND_KEY
        veor            X1, ROUND_KEY
        veor            X2, ROUND_KEY
        veor            X3, ROUND_KEY

        // x -= y
        vsub.u\n        X0, Y0
        vsub.u\n        X1, Y1
        vsub.u\n        X2, Y2
        vsub.u\n        X3, Y3

        // x = rol(x, 8)
        vtbl.8          X0_L, {X0_L}, ROTATE_TABLE
        vtbl.8          X0_H, {X0_H}, ROTATE_TABLE
        vtbl.8          X1_L, {X1_L}, ROTATE_TABLE
        vtbl.8          X1_H, {X1_H}, ROTATE_TABLE
        vtbl.8          X2_L, {X2_L}, ROTATE_TABLE
        vtbl.8          X2_H, {X2_H}, ROTATE_TABLE
        vtbl.8          X3_L, {X3_L}, ROTATE_TABLE
        vtbl.8          X3_H, {X3_H}, ROTATE_TABLE
.endm

.macro _xts128_precrypt_one     dst_reg, tweak_buf, tmp

        // Load the next source block
        vld1.8          {\dst_reg}, [SRC]!

        // Save the current tweak in the tweak buffer
        vst1.8          {TWEAKV}, [\tweak_buf:128]!

        // XOR the next source block with the current tweak
        veor            \dst_reg, TWEAKV

        /*
         * Calculate the next tweak by multiplying the current one by x,
         * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
         */
        vshr.u64        \tmp, TWEAKV, #63
        vshl.u64        TWEAKV, #1
        veor            TWEAKV_H, \tmp\()_L
        vtbl.8          \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
        veor            TWEAKV_L, \tmp\()_H
.endm
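
/*
 * Explanatory sketch (not part of the original file) of the tweak update
 * performed by the vshr/vshl/vtbl/veor sequence above, written as scalar C
 * on a 128-bit tweak held as two u64 halves (lo, hi):
 *
 *	carry = hi >> 63;
 *	hi    = (hi << 1) ^ (lo >> 63);
 *	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
 */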

.macro _xts64_precrypt_two      dst_reg, tweak_buf, tmp

        // Load the next two source blocks
        vld1.8          {\dst_reg}, [SRC]!

        // Save the current two tweaks in the tweak buffer
        vst1.8          {TWEAKV}, [\tweak_buf:128]!

        // XOR the next two source blocks with the current two tweaks
        veor            \dst_reg, TWEAKV

        /*
         * Calculate the next two tweaks by multiplying the current ones by x^2,
         * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
         */
        vshr.u64        \tmp, TWEAKV, #62
        vshl.u64        TWEAKV, #2
        vtbl.8          \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
        vtbl.8          \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
        veor            TWEAKV, \tmp
.endm
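
/*
 * Explanatory sketch (not part of the original file): each 64-bit lane holds
 * one Speck64-XTS tweak t, so the sequence above advances both tweaks by x^2
 * at once.  In scalar C, per lane:
 *
 *	carry = t >> 62;                         // top two bits, 0..3
 *	t     = (t << 2) ^ gf64mul_table[carry];
 */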

/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 * using Speck-XTS, specifically the variant with a block size of '2n' and round
 * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 * nonzero multiple of 128.
 */
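
/*
 * For reference (explanatory sketch, not part of the original file): per
 * 2n-bit block, XTS computes
 *
 *	C = E_K(P ^ T) ^ T	(with D_K instead of E_K when decrypting)
 *
 * where T is the current tweak, which is then multiplied by x in GF(2^(2n))
 * to produce the tweak for the next block.  The loop below processes 128
 * bytes (8 Speck128 blocks or 16 Speck64 blocks) per iteration.
 */
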
.macro _speck_xts_crypt n, decrypting
        push            {r4-r7}
        mov             r7, sp

        /*
         * The first four parameters were passed in registers r0-r3.  Load the
         * additional parameters, which were passed on the stack.
         */
        ldr             NBYTES, [sp, #16]
        ldr             TWEAK, [sp, #20]

        /*
         * If decrypting, modify the ROUND_KEYS parameter to point to the last
         * round key rather than the first, since for decryption the round keys
         * are used in reverse order.
         */
.if \decrypting
.if \n == 64
        add             ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
        sub             ROUND_KEYS, #8
.else
        add             ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
        sub             ROUND_KEYS, #4
.endif
.endif

        // Load the index vector for vtbl-based 8-bit rotates
.if \decrypting
        ldr             r12, =.Lrol\n\()_8_table
.else
        ldr             r12, =.Lror\n\()_8_table
.endif
        vld1.8          {ROTATE_TABLE}, [r12:64]

        // One-time XTS preparation

        /*
         * Allocate stack space to store 128 bytes worth of tweaks.  For
         * performance, this space is aligned to a 16-byte boundary so that we
         * can use the load/store instructions that declare 16-byte alignment.
         * For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
         */
        sub             r12, sp, #128
        bic             r12, #0xf
        mov             sp, r12

.if \n == 64
        // Load first tweak
        vld1.8          {TWEAKV}, [TWEAK]

        // Load GF(2^128) multiplication table
        ldr             r12, =.Lgf128mul_table
        vld1.8          {GF128MUL_TABLE}, [r12:64]
.else
        // Load first tweak
        vld1.8          {TWEAKV_L}, [TWEAK]

        // Load GF(2^64) multiplication table
        ldr             r12, =.Lgf64mul_table
        vld1.8          {GF64MUL_TABLE}, [r12:64]

        // Calculate second tweak, packing it together with the first
        vshr.u64        TMP0_L, TWEAKV_L, #63
        vtbl.u8         TMP0_L, {GF64MUL_TABLE}, TMP0_L
        vshl.u64        TWEAKV_H, TWEAKV_L, #1
        veor            TWEAKV_H, TMP0_L
.endif

.Lnext_128bytes_\@:

        /*
         * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
         * values, and save the tweaks on the stack for later.  Then
         * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
         * that the X[0-3] registers contain only the second halves of blocks,
         * and the Y[0-3] registers contain only the first halves of blocks.
         * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
         */
        mov             r12, sp
.if \n == 64
        _xts128_precrypt_one    X0, r12, TMP0
        _xts128_precrypt_one    Y0, r12, TMP0
        _xts128_precrypt_one    X1, r12, TMP0
        _xts128_precrypt_one    Y1, r12, TMP0
        _xts128_precrypt_one    X2, r12, TMP0
        _xts128_precrypt_one    Y2, r12, TMP0
        _xts128_precrypt_one    X3, r12, TMP0
        _xts128_precrypt_one    Y3, r12, TMP0
        vswp            X0_L, Y0_H
        vswp            X1_L, Y1_H
        vswp            X2_L, Y2_H
        vswp            X3_L, Y3_H
.else
        _xts64_precrypt_two     X0, r12, TMP0
        _xts64_precrypt_two     Y0, r12, TMP0
        _xts64_precrypt_two     X1, r12, TMP0
        _xts64_precrypt_two     Y1, r12, TMP0
        _xts64_precrypt_two     X2, r12, TMP0
        _xts64_precrypt_two     Y2, r12, TMP0
        _xts64_precrypt_two     X3, r12, TMP0
        _xts64_precrypt_two     Y3, r12, TMP0
        vuzp.32         Y0, X0
        vuzp.32         Y1, X1
        vuzp.32         Y2, X2
        vuzp.32         Y3, X3
.endif

        // Do the cipher rounds

        mov             r12, ROUND_KEYS
        mov             r6, NROUNDS

.Lnext_round_\@:
.if \decrypting
.if \n == 64
        vld1.64         ROUND_KEY_L, [r12]
        sub             r12, #8
        vmov            ROUND_KEY_H, ROUND_KEY_L
.else
        vld1.32         {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
        sub             r12, #4
.endif
        _speck_unround_128bytes \n
.else
.if \n == 64
        vld1.64         ROUND_KEY_L, [r12]!
        vmov            ROUND_KEY_H, ROUND_KEY_L
.else
        vld1.32         {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
.endif
        _speck_round_128bytes   \n
.endif
        subs            r6, r6, #1
        bne             .Lnext_round_\@

        // Re-interleave the 'x' and 'y' elements of each block
.if \n == 64
        vswp            X0_L, Y0_H
        vswp            X1_L, Y1_H
        vswp            X2_L, Y2_H
        vswp            X3_L, Y3_H
.else
        vzip.32         Y0, X0
        vzip.32         Y1, X1
        vzip.32         Y2, X2
        vzip.32         Y3, X3
.endif

        // XOR the encrypted/decrypted blocks with the tweaks we saved earlier
        mov             r12, sp
        vld1.8          {TMP0, TMP1}, [r12:128]!
        vld1.8          {TMP2, TMP3}, [r12:128]!
        veor            X0, TMP0
        veor            Y0, TMP1
        veor            X1, TMP2
        veor            Y1, TMP3
        vld1.8          {TMP0, TMP1}, [r12:128]!
        vld1.8          {TMP2, TMP3}, [r12:128]!
        veor            X2, TMP0
        veor            Y2, TMP1
        veor            X3, TMP2
        veor            Y3, TMP3

        // Store the ciphertext in the destination buffer
        vst1.8          {X0, Y0}, [DST]!
        vst1.8          {X1, Y1}, [DST]!
        vst1.8          {X2, Y2}, [DST]!
        vst1.8          {X3, Y3}, [DST]!

        // Continue if there are more 128-byte chunks remaining, else return
        subs            NBYTES, #128
        bne             .Lnext_128bytes_\@

        // Store the next tweak
.if \n == 64
        vst1.8          {TWEAKV}, [TWEAK]
.else
        vst1.8          {TWEAKV_L}, [TWEAK]
.endif

        mov             sp, r7
        pop             {r4-r7}
        bx              lr
.endm
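
/*
 * For reference (a sketch assembled from the argument comments at the top of
 * this file; the authoritative declarations live in the C glue code), the
 * entry points below correspond to C prototypes of the form:
 *
 *	void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
 *				       void *dst, const void *src,
 *				       unsigned int nbytes, void *tweak);
 *
 * with the decrypt and Speck64 variants taking the same arguments (the
 * Speck64 versions use 'const u32 *round_keys').
 */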

ENTRY(speck128_xts_encrypt_neon)
        _speck_xts_crypt        n=64, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
        _speck_xts_crypt        n=64, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
        _speck_xts_crypt        n=32, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
        _speck_xts_crypt        n=32, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)