// SPDX-License-Identifier: GPL-2.0
/*
 * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

        .text

        // arguments
        ROUND_KEYS      .req    x0      // const {u64,u32} *round_keys
        NROUNDS         .req    w1      // int nrounds
        NROUNDS_X       .req    x1
        DST             .req    x2      // void *dst
        SRC             .req    x3      // const void *src
        NBYTES          .req    w4      // unsigned int nbytes
        TWEAK           .req    x5      // void *tweak
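
        // The argument registers above correspond roughly to this C-level
        // prototype (a sketch for reference; the authoritative asmlinkage
        // declarations live in the C glue code and may differ slightly):
        //
        //      void speck128_xts_encrypt_neon(const u64 *round_keys,
        //                                     int nrounds, void *dst,
        //                                     const void *src,
        //                                     unsigned int nbytes,
        //                                     void *tweak);
        //
        // The decrypt and Speck64 entry points take the same arguments,
        // with Speck64 using 'const u32 *round_keys'.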

        // registers which hold the data being encrypted/decrypted
        // (underscores avoid a naming collision with ARM64 registers x0-x3)
        X_0             .req    v0
        Y_0             .req    v1
        X_1             .req    v2
        Y_1             .req    v3
        X_2             .req    v4
        Y_2             .req    v5
        X_3             .req    v6
        Y_3             .req    v7

        // the round key, duplicated in all lanes
        ROUND_KEY       .req    v8

        // index vector for tbl-based 8-bit rotates
        ROTATE_TABLE    .req    v9
        ROTATE_TABLE_Q  .req    q9

        // temporary registers
        TMP0            .req    v10
        TMP1            .req    v11
        TMP2            .req    v12
        TMP3            .req    v13

        // multiplication table for updating XTS tweaks
        GFMUL_TABLE     .req    v14
        GFMUL_TABLE_Q   .req    q14

        // next XTS tweak value(s)
        TWEAKV_NEXT     .req    v15

        // XTS tweaks for the blocks currently being encrypted/decrypted
        TWEAKV0         .req    v16
        TWEAKV1         .req    v17
        TWEAKV2         .req    v18
        TWEAKV3         .req    v19
        TWEAKV4         .req    v20
        TWEAKV5         .req    v21
        TWEAKV6         .req    v22
        TWEAKV7         .req    v23

        .align          4
.Lror64_8_table:
        .octa           0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
        .octa           0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
        .octa           0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
        .octa           0x0e0d0c0f0a09080b0605040702010003
.Lgf128mul_table:
        .octa           0x00000000000000870000000000000001
.Lgf64mul_table:
        .octa           0x0000000000000000000000002d361b00
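
/*
 * For reference, the .Lror/.Lrol tables above are index vectors for the 'tbl'
 * instruction, which computes dst[i] = src[index[i]] over the 16 bytes of a
 * vector.  Read as little-endian bytes, .Lror64_8_table is
 *
 *      { 1, 2, 3, 4, 5, 6, 7, 0,  9, 10, 11, 12, 13, 14, 15, 8 }
 *
 * so output byte i of each 64-bit lane comes from input byte (i + 1) % 8,
 * which is a rotate right by 8 bits of a little-endian 64-bit value.  The
 * rol and 32-bit variants follow the same idea with the wrap point changed.
 * The GFMUL tables hold the reduction constants used by _next_xts_tweak
 * below when multiplying XTS tweaks by x (or x^2).
 */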

/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
 */
.macro _speck_round_128bytes    n, lanes

        // x = ror(x, 8)
        tbl             X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
        tbl             X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
        tbl             X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
        tbl             X_3.16b, {X_3.16b}, ROTATE_TABLE.16b

        // x += y
        add             X_0.\lanes, X_0.\lanes, Y_0.\lanes
        add             X_1.\lanes, X_1.\lanes, Y_1.\lanes
        add             X_2.\lanes, X_2.\lanes, Y_2.\lanes
        add             X_3.\lanes, X_3.\lanes, Y_3.\lanes

        // x ^= k
        eor             X_0.16b, X_0.16b, ROUND_KEY.16b
        eor             X_1.16b, X_1.16b, ROUND_KEY.16b
        eor             X_2.16b, X_2.16b, ROUND_KEY.16b
        eor             X_3.16b, X_3.16b, ROUND_KEY.16b

        // y = rol(y, 3)
        shl             TMP0.\lanes, Y_0.\lanes, #3
        shl             TMP1.\lanes, Y_1.\lanes, #3
        shl             TMP2.\lanes, Y_2.\lanes, #3
        shl             TMP3.\lanes, Y_3.\lanes, #3
        sri             TMP0.\lanes, Y_0.\lanes, #(\n - 3)
        sri             TMP1.\lanes, Y_1.\lanes, #(\n - 3)
        sri             TMP2.\lanes, Y_2.\lanes, #(\n - 3)
        sri             TMP3.\lanes, Y_3.\lanes, #(\n - 3)

        // y ^= x
        eor             Y_0.16b, TMP0.16b, X_0.16b
        eor             Y_1.16b, TMP1.16b, X_1.16b
        eor             Y_2.16b, TMP2.16b, X_2.16b
        eor             Y_3.16b, TMP3.16b, X_3.16b
.endm
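
/*
 * For reference, the scalar form of one Speck encryption round on a single
 * (x, y) word pair is roughly the following C sketch (u64 and 64-bit rotates
 * for Speck128; Speck64 uses u32 and 32-bit rotates):
 *
 *      x = ror64(x, 8);
 *      x += y;
 *      x ^= k;
 *      y = rol64(y, 3);
 *      y ^= x;
 *
 * The macro above applies this to 8 or 16 blocks in parallel, and
 * _speck_unround_128bytes() below undoes the same steps in reverse order.
 */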

/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes().
 */
.macro _speck_unround_128bytes  n, lanes

        // y ^= x
        eor             TMP0.16b, Y_0.16b, X_0.16b
        eor             TMP1.16b, Y_1.16b, X_1.16b
        eor             TMP2.16b, Y_2.16b, X_2.16b
        eor             TMP3.16b, Y_3.16b, X_3.16b

        // y = ror(y, 3)
        ushr            Y_0.\lanes, TMP0.\lanes, #3
        ushr            Y_1.\lanes, TMP1.\lanes, #3
        ushr            Y_2.\lanes, TMP2.\lanes, #3
        ushr            Y_3.\lanes, TMP3.\lanes, #3
        sli             Y_0.\lanes, TMP0.\lanes, #(\n - 3)
        sli             Y_1.\lanes, TMP1.\lanes, #(\n - 3)
        sli             Y_2.\lanes, TMP2.\lanes, #(\n - 3)
        sli             Y_3.\lanes, TMP3.\lanes, #(\n - 3)

        // x ^= k
        eor             X_0.16b, X_0.16b, ROUND_KEY.16b
        eor             X_1.16b, X_1.16b, ROUND_KEY.16b
        eor             X_2.16b, X_2.16b, ROUND_KEY.16b
        eor             X_3.16b, X_3.16b, ROUND_KEY.16b

        // x -= y
        sub             X_0.\lanes, X_0.\lanes, Y_0.\lanes
        sub             X_1.\lanes, X_1.\lanes, Y_1.\lanes
        sub             X_2.\lanes, X_2.\lanes, Y_2.\lanes
        sub             X_3.\lanes, X_3.\lanes, Y_3.\lanes

        // x = rol(x, 8)
        tbl             X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
        tbl             X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
        tbl             X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
        tbl             X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm

.macro _next_xts_tweak  next, cur, tmp, n
.if \n == 64
        /*
         * Calculate the next tweak by multiplying the current one by x,
         * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
         */
        sshr            \tmp\().2d, \cur\().2d, #63
        and             \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
        shl             \next\().2d, \cur\().2d, #1
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \next\().16b, \next\().16b, \tmp\().16b
.else
        /*
         * Calculate the next two tweaks by multiplying the current ones by x^2,
         * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
         */
        ushr            \tmp\().2d, \cur\().2d, #62
        shl             \next\().2d, \cur\().2d, #2
        tbl             \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
        eor             \next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
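
/*
 * For reference, the \n == 64 path above is the usual XTS "multiply the tweak
 * by x" step.  Treating the 128-bit tweak as two little-endian 64-bit halves,
 * a C sketch of it looks like (names here are illustrative only):
 *
 *      carry   = tweak_hi >> 63;
 *      next_hi = (tweak_hi << 1) | (tweak_lo >> 63);
 *      next_lo = (tweak_lo << 1) ^ (carry ? 0x87 : 0);
 *
 * where 0x87 (from .Lgf128mul_table) is the low part of
 * p(x) = x^128 + x^7 + x^2 + x + 1.  The Speck64 path multiplies each 64-bit
 * tweak by x^2 in one go, so the two carried-out bits select one of the four
 * reduction constants {0x00, 0x1b, 0x36, 0x2d} stored in .Lgf64mul_table.
 */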

/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 * using Speck-XTS, specifically the variant with a block size of '2n' and round
 * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 * nonzero multiple of 128.
 */
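
/*
 * At a high level, each 128-byte chunk is handled as in this C sketch (the
 * helper names are illustrative, not real kernel functions):
 *
 *      for (i = 0; i < nblocks; i++) {
 *              tmp    = src[i] ^ tweak[i];
 *              tmp    = speck_encrypt(tmp);            // or speck_decrypt()
 *              dst[i] = tmp ^ tweak[i];
 *              tweak[i + 1] = xts_next_tweak(tweak[i]);
 *      }
 *
 * except that all blocks of the chunk go through the cipher rounds together,
 * and all of the chunk's tweaks are computed up front.
 */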
.macro _speck_xts_crypt n, lanes, decrypting

        /*
         * If decrypting, modify the ROUND_KEYS parameter to point to the last
         * round key rather than the first, since for decryption the round keys
         * are used in reverse order.
         */
.if \decrypting
        mov             NROUNDS, NROUNDS        /* zero the high 32 bits */
.if \n == 64
        add             ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
        sub             ROUND_KEYS, ROUND_KEYS, #8
.else
        add             ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
        sub             ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif
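
        // For example, with Speck128 and 8-byte round keys this leaves
        // ROUND_KEYS at round_keys + nrounds * 8 - 8, i.e. pointing at
        // round_keys[nrounds - 1]; the Speck64 case uses 4-byte keys.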

        // Load the index vector for tbl-based 8-bit rotates
.if \decrypting
        ldr             ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
        ldr             ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif

        // One-time XTS preparation
.if \n == 64
        // Load first tweak
        ld1             {TWEAKV0.16b}, [TWEAK]

        // Load GF(2^128) multiplication table
        ldr             GFMUL_TABLE_Q, .Lgf128mul_table
.else
        // Load first tweak
        ld1             {TWEAKV0.8b}, [TWEAK]

        // Load GF(2^64) multiplication table
        ldr             GFMUL_TABLE_Q, .Lgf64mul_table

        // Calculate second tweak, packing it together with the first
        ushr            TMP0.2d, TWEAKV0.2d, #63
        shl             TMP1.2d, TWEAKV0.2d, #1
        tbl             TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
        eor             TMP0.8b, TMP0.8b, TMP1.8b
        mov             TWEAKV0.d[1], TMP0.d[0]
.endif

.Lnext_128bytes_\@:

        // Calculate XTS tweaks for next 128 bytes
        _next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
        _next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
        _next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
        _next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
        _next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
        _next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
        _next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
        _next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n

        // Load the next source blocks into {X,Y}[0-3]
        ld1             {X_0.16b-Y_1.16b}, [SRC], #64
        ld1             {X_2.16b-Y_3.16b}, [SRC], #64

        // XOR the source blocks with their XTS tweaks
        eor             TMP0.16b, X_0.16b, TWEAKV0.16b
        eor             Y_0.16b,  Y_0.16b, TWEAKV1.16b
        eor             TMP1.16b, X_1.16b, TWEAKV2.16b
        eor             Y_1.16b,  Y_1.16b, TWEAKV3.16b
        eor             TMP2.16b, X_2.16b, TWEAKV4.16b
        eor             Y_2.16b,  Y_2.16b, TWEAKV5.16b
        eor             TMP3.16b, X_3.16b, TWEAKV6.16b
        eor             Y_3.16b,  Y_3.16b, TWEAKV7.16b

        /*
         * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
         * that the X[0-3] registers contain only the second halves of blocks,
         * and the Y[0-3] registers contain only the first halves of blocks.
         * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
         */
        uzp2            X_0.\lanes, TMP0.\lanes, Y_0.\lanes
        uzp1            Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
        uzp2            X_1.\lanes, TMP1.\lanes, Y_1.\lanes
        uzp1            Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
        uzp2            X_2.\lanes, TMP2.\lanes, Y_2.\lanes
        uzp1            Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
        uzp2            X_3.\lanes, TMP3.\lanes, Y_3.\lanes
        uzp1            Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
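
        // For example, with \lanes == 2d (Speck128), X_0 now holds the
        // (tweak-XORed) x halves of blocks 0 and 1, and Y_0 holds their
        // y halves, where each block is stored in memory as (y, x).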

        // Do the cipher rounds
        mov             x6, ROUND_KEYS
        mov             w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
        ld1r            {ROUND_KEY.\lanes}, [x6]
        sub             x6, x6, #( \n / 8 )
        _speck_unround_128bytes \n, \lanes
.else
        ld1r            {ROUND_KEY.\lanes}, [x6], #( \n / 8 )
        _speck_round_128bytes   \n, \lanes
.endif
        subs            w7, w7, #1
        bne             .Lnext_round_\@

        // Re-interleave the 'x' and 'y' elements of each block
        zip1            TMP0.\lanes, Y_0.\lanes, X_0.\lanes
        zip2            Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
        zip1            TMP1.\lanes, Y_1.\lanes, X_1.\lanes
        zip2            Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
        zip1            TMP2.\lanes, Y_2.\lanes, X_2.\lanes
        zip2            Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
        zip1            TMP3.\lanes, Y_3.\lanes, X_3.\lanes
        zip2            Y_3.\lanes,  Y_3.\lanes, X_3.\lanes

        // XOR the encrypted/decrypted blocks with the tweaks calculated earlier
        eor             X_0.16b, TMP0.16b, TWEAKV0.16b
        eor             Y_0.16b, Y_0.16b,  TWEAKV1.16b
        eor             X_1.16b, TMP1.16b, TWEAKV2.16b
        eor             Y_1.16b, Y_1.16b,  TWEAKV3.16b
        eor             X_2.16b, TMP2.16b, TWEAKV4.16b
        eor             Y_2.16b, Y_2.16b,  TWEAKV5.16b
        eor             X_3.16b, TMP3.16b, TWEAKV6.16b
        eor             Y_3.16b, Y_3.16b,  TWEAKV7.16b
        mov             TWEAKV0.16b, TWEAKV_NEXT.16b

        // Store the ciphertext in the destination buffer
        st1             {X_0.16b-Y_1.16b}, [DST], #64
        st1             {X_2.16b-Y_3.16b}, [DST], #64

        // Continue if there are more 128-byte chunks remaining
        subs            NBYTES, NBYTES, #128
        bne             .Lnext_128bytes_\@

        // Store the next tweak and return
.if \n == 64
        st1             {TWEAKV_NEXT.16b}, [TWEAK]
.else
        st1             {TWEAKV_NEXT.8b}, [TWEAK]
.endif
        ret
.endm

ENTRY(speck128_xts_encrypt_neon)
        _speck_xts_crypt        n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
        _speck_xts_crypt        n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
        _speck_xts_crypt        n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
        _speck_xts_crypt        n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)