/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
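
/*
 * sm4e is emitted through .inst so the file still assembles with
 * toolchains that predate the SM4 extension; the .irp/.set block above
 * maps the vN.4s register names onto the numeric fields. Informally,
 * restating the .inst line above (not an authoritative decode):
 *
 *   sm4e <Vd>.4s, <Vn>.4s  ==>  0xcec08400 | (n << 5) | d
 */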

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH   v21
#define RRCONST v22
#define RZERO   v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)                    \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                eor             T0.16b, T0.16b, T1.16b;         \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r1.16b, r1.16b, T0.16b;
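
/*
 * Schoolbook carry-less 128x128 -> 256 bit multiply. Writing the
 * 64-bit halves as m0 = a1:a0 and m1 = b1:b0, the four pmull/pmull2
 * products combine as
 *
 *   r1:r0 = a0*b0 ^ ((a0*b1 ^ a1*b0) << 64) ^ (a1*b1 << 128)
 *
 * where * is polynomial multiplication over GF(2). The two ext
 * instructions against RZERO split the middle term across the 128-bit
 * boundary between r0 and r1.
 */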

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,                 \
                        r2, r3, m2, m3, T2, T3,                 \
                        r4, r5, m4, m5, T4, T5,                 \
                        r6, r7, m6, m7, T6, T7)                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                ext             T2.16b, m3.16b, m3.16b, #8;     \
                ext             T4.16b, m5.16b, m5.16b, #8;     \
                ext             T6.16b, m7.16b, m7.16b, #8;     \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           r2.1q, m2.1d, m3.1d;            \
                pmull           r4.1q, m4.1d, m5.1d;            \
                pmull           r6.1q, m6.1d, m7.1d;            \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull           T3.1q, m2.1d, T2.1d;            \
                pmull           T5.1q, m4.1d, T4.1d;            \
                pmull           T7.1q, m6.1d, T6.1d;            \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          T2.1q, m2.2d, T2.2d;            \
                pmull2          T4.1q, m4.2d, T4.2d;            \
                pmull2          T6.1q, m6.2d, T6.2d;            \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                pmull2          r3.1q, m2.2d, m3.2d;            \
                pmull2          r5.1q, m4.2d, m5.2d;            \
                pmull2          r7.1q, m6.2d, m7.2d;            \
                eor             T0.16b, T0.16b, T1.16b;         \
                eor             T2.16b, T2.16b, T3.16b;         \
                eor             T4.16b, T4.16b, T5.16b;         \
                eor             T6.16b, T6.16b, T7.16b;         \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T3.16b, RZERO.16b, T2.16b, #8;  \
                ext             T5.16b, RZERO.16b, T4.16b, #8;  \
                ext             T7.16b, RZERO.16b, T6.16b, #8;  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                ext             T2.16b, T2.16b, RZERO.16b, #8;  \
                ext             T4.16b, T4.16b, RZERO.16b, #8;  \
                ext             T6.16b, T6.16b, RZERO.16b, #8;  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r2.16b, r2.16b, T3.16b;         \
                eor             r4.16b, r4.16b, T5.16b;         \
                eor             r6.16b, r6.16b, T7.16b;         \
                eor             r1.16b, r1.16b, T0.16b;         \
                eor             r3.16b, r3.16b, T2.16b;         \
                eor             r5.16b, r5.16b, T4.16b;         \
                eor             r7.16b, r7.16b, T6.16b;
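
/*
 * Four independent copies of PMUL_128x128 with their instructions
 * interleaved, so the long-latency pmull/pmull2 operations of one
 * multiplication overlap with the ext/eor fix-ups of the others.
 */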

/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)                    \
                pmull2          T0.1q, r1.2d, rconst.2d;        \
                ext             T1.16b, T0.16b, RZERO.16b, #8;  \
                ext             T0.16b, RZERO.16b, T0.16b, #8;  \
                eor             r1.16b, r1.16b, T1.16b;         \
                eor             r0.16b, r0.16b, T0.16b;         \
                pmull           T0.1q, r1.1d, rconst.1d;        \
                eor             a.16b, r0.16b, T0.16b;
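
/*
 * Reduce the 256-bit product r1:r0 modulo the GHASH field polynomial
 * x^128 + x^7 + x^2 + x + 1. Since every operand was bit-reflected
 * with rbit beforehand, the reflected reduction becomes two plain
 * folds by rconst = 0x87 (see .Lghash_rconst): fold the high half of
 * r1, then what remains of r1, down into the low 128 bits.
 */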

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)  \
        rev32                   b0.16b, b0.16b;                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
        sm4e                    b0.4s, v24.4s;                  \
                pmull           r0.1q, m0.1d, m1.1d;            \
        sm4e                    b0.4s, v25.4s;                  \
                pmull           T1.1q, m0.1d, T0.1d;            \
        sm4e                    b0.4s, v26.4s;                  \
                pmull2          T0.1q, m0.2d, T0.2d;            \
        sm4e                    b0.4s, v27.4s;                  \
                pmull2          r1.1q, m0.2d, m1.2d;            \
        sm4e                    b0.4s, v28.4s;                  \
                eor             T0.16b, T0.16b, T1.16b;         \
        sm4e                    b0.4s, v29.4s;                  \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
        sm4e                    b0.4s, v30.4s;                  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
        sm4e                    b0.4s, v31.4s;                  \
                eor             r0.16b, r0.16b, T1.16b;         \
        rev64                   b0.4s, b0.4s;                   \
                eor             r1.16b, r1.16b, T0.16b;         \
        ext                     b0.16b, b0.16b, b0.16b, #8;     \
        rev32                   b0.16b, b0.16b;
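
/*
 * One SM4 block encryption fused with one PMUL_128x128: the eight
 * sm4e rounds are interleaved with the multiply sequence so neither
 * the crypto nor the polynomial-multiply pipeline sits idle. The
 * trailing rev64/ext/rev32 is SM4's usual output word reversal.
 */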

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,                 \
                                    r0, r1, m0, m1, T0, T1,     \
                                    r2, r3, m2, m3, T2, T3,     \
                                    r4, r5, m4, m5, T4, T5)     \
        rev32                   b0.16b, b0.16b;                 \
        rev32                   b1.16b, b1.16b;                 \
        rev32                   b2.16b, b2.16b;                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                ext             T2.16b, m3.16b, m3.16b, #8;     \
                ext             T4.16b, m5.16b, m5.16b, #8;     \
        sm4e                    b0.4s, v24.4s;                  \
        sm4e                    b1.4s, v24.4s;                  \
        sm4e                    b2.4s, v24.4s;                  \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           r2.1q, m2.1d, m3.1d;            \
                pmull           r4.1q, m4.1d, m5.1d;            \
        sm4e                    b0.4s, v25.4s;                  \
        sm4e                    b1.4s, v25.4s;                  \
        sm4e                    b2.4s, v25.4s;                  \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull           T3.1q, m2.1d, T2.1d;            \
                pmull           T5.1q, m4.1d, T4.1d;            \
        sm4e                    b0.4s, v26.4s;                  \
        sm4e                    b1.4s, v26.4s;                  \
        sm4e                    b2.4s, v26.4s;                  \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          T2.1q, m2.2d, T2.2d;            \
                pmull2          T4.1q, m4.2d, T4.2d;            \
        sm4e                    b0.4s, v27.4s;                  \
        sm4e                    b1.4s, v27.4s;                  \
        sm4e                    b2.4s, v27.4s;                  \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                pmull2          r3.1q, m2.2d, m3.2d;            \
                pmull2          r5.1q, m4.2d, m5.2d;            \
        sm4e                    b0.4s, v28.4s;                  \
        sm4e                    b1.4s, v28.4s;                  \
        sm4e                    b2.4s, v28.4s;                  \
                eor             T0.16b, T0.16b, T1.16b;         \
                eor             T2.16b, T2.16b, T3.16b;         \
                eor             T4.16b, T4.16b, T5.16b;         \
        sm4e                    b0.4s, v29.4s;                  \
        sm4e                    b1.4s, v29.4s;                  \
        sm4e                    b2.4s, v29.4s;                  \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T3.16b, RZERO.16b, T2.16b, #8;  \
                ext             T5.16b, RZERO.16b, T4.16b, #8;  \
        sm4e                    b0.4s, v30.4s;                  \
        sm4e                    b1.4s, v30.4s;                  \
        sm4e                    b2.4s, v30.4s;                  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                ext             T2.16b, T2.16b, RZERO.16b, #8;  \
                ext             T4.16b, T4.16b, RZERO.16b, #8;  \
        sm4e                    b0.4s, v31.4s;                  \
        sm4e                    b1.4s, v31.4s;                  \
        sm4e                    b2.4s, v31.4s;                  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r2.16b, r2.16b, T3.16b;         \
                eor             r4.16b, r4.16b, T5.16b;         \
        rev64                   b0.4s, b0.4s;                   \
        rev64                   b1.4s, b1.4s;                   \
        rev64                   b2.4s, b2.4s;                   \
                eor             r1.16b, r1.16b, T0.16b;         \
                eor             r3.16b, r3.16b, T2.16b;         \
                eor             r5.16b, r5.16b, T4.16b;         \
        ext                     b0.16b, b0.16b, b0.16b, #8;     \
        ext                     b1.16b, b1.16b, b1.16b, #8;     \
        ext                     b2.16b, b2.16b, b2.16b, #8;     \
                eor             r0.16b, r0.16b, r2.16b;         \
                eor             r1.16b, r1.16b, r3.16b;         \
        rev32                   b0.16b, b0.16b;                 \
        rev32                   b1.16b, b1.16b;                 \
        rev32                   b2.16b, b2.16b;                 \
                eor             r0.16b, r0.16b, r4.16b;         \
                eor             r1.16b, r1.16b, r5.16b;
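
/*
 * Three fused SM4 encryptions plus three 128x128 multiplications; the
 * three partial products are also xor-accumulated into r1:r0 at the
 * end, so a single REDUCTION suffices afterwards.
 */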

#define inc32_le128(vctr)                                       \
                mov             vctr.d[1], x9;                  \
                add             w6, w9, #1;                     \
                mov             vctr.d[0], x8;                  \
                bfi             x9, x6, #0, #32;                \
                rev64           vctr.16b, vctr.16b;
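
/*
 * GCM's inc32: only the low 32 bits of the 128-bit big-endian counter
 * block are incremented (NIST SP 800-38D). x8:x9 hold the counter in
 * host byte order (rev-swapped at function entry), so the macro emits
 * the current value into vctr, bumps the low word in x9 via bfi, and
 * rev64 restores big-endian byte order within each lane. A minimal C
 * sketch of the same update (ctr_hi/ctr_lo are hypothetical names for
 * the x8/x9 halves, not part of this file):
 *
 *   uint64_t ctr_hi, ctr_lo;                    // host-order x8, x9
 *   uint32_t lo32 = (uint32_t)ctr_lo + 1;       // add w6, w9, #1
 *   ctr_lo = (ctr_lo & ~0xffffffffull) | lo32;  // bfi x9, x6, #0, #32
 */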

#define GTAG_HASH_LENGTHS(vctr0, vlen)                                  \
                ld1             {vlen.16b}, [x7];                       \
                /* construct CTR0 */                                    \
                /* the lower 32 bits of the initial IV are always be32(1) */ \
                mov             x6, #0x1;                               \
                bfi             x9, x6, #0, #32;                        \
                mov             vctr0.d[0], x8;                         \
                mov             vctr0.d[1], x9;                         \
                rbit            vlen.16b, vlen.16b;                     \
                rev64           vctr0.16b, vctr0.16b;                   \
                /* authtag = GCTR(CTR0, GHASH) */                       \
                eor             RHASH.16b, RHASH.16b, vlen.16b;         \
                SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
                                           RTMP0, RTMP1);               \
                REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);      \
                rbit            RHASH.16b, RHASH.16b;                   \
                eor             RHASH.16b, RHASH.16b, vctr0.16b;
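
/*
 * Final-tag step per NIST SP 800-38D: fold the encoded bit lengths of
 * AAD and ciphertext into GHASH, then compute
 *
 *   tag = E(K, CTR0) ^ GHASH(H, A, C)
 *
 * where CTR0 is the initial counter block (IV || be32(1)). The fused
 * SM4_CRYPT_PMUL_128x128_BLK computes E(K, CTR0) and the final GHASH
 * multiplication in one pass.
 */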

/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1     v0
#define RR3     v1
#define RR5     v2
#define RR7     v3

#define RR0     v4
#define RR2     v5
#define RR4     v6
#define RR6     v7

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11
#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RH1     v16
#define RH2     v17
#define RH3     v18
#define RH4     v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
        /* input:
         *   x0: round key array, CTX
         *   x1: ghash table
         */
        SM4_PREPARE(x0)

        adr_l           x2, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x2]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        /* H = E(K, 0^128) */
        rev32           v0.16b, RZERO.16b
        SM4_CRYPT_BLK_BE(v0)

        /* H ^ 1 */
        rbit            RH1.16b, v0.16b

        /* H ^ 2 */
        PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
        REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 3 */
        PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
        REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 4 */
        PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
        REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1             {RH1.16b-RH4.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
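
/*
 * The table stored above holds the bit-reflected powers H^1..H^4, in
 * that order. pmull_ghash_update and sm4_ce_pmull_gcm_enc load all
 * four entries for 4-way aggregation; sm4_ce_pmull_gcm_dec loads only
 * H^1..H^3 for its 3-way path.
 */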

.align 3
SYM_FUNC_START(pmull_ghash_update)
        /* input:
         *   x0: ghash table
         *   x1: ghash result
         *   x2: src
         *   w3: nblocks
         */
        ld1             {RH1.16b-RH4.16b}, [x0]

        ld1             {RHASH.16b}, [x1]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x4, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x4]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
        cmp             w3, #4
        blt             .Lghash_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rbit            v0.16b, v0.16b
        rbit            v1.16b, v1.16b
        rbit            v2.16b, v2.16b
        rbit            v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1)        * H^3 => rr2:rr3
         * (in2)        * H^2 => rr4:rr5
         * (in3)        * H^1 => rr6:rr7
         */
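        /*
         * Aggregated reduction: four GHASH steps
         * ((((Y^X1)*H ^ X2)*H ^ X3)*H ^ X4)*H are expanded to
         * (Y^X1)*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H, so one REDUCTION
         * covers all four blocks.
         */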
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor             RR0.16b, RR0.16b, RR2.16b
        eor             RR1.16b, RR1.16b, RR3.16b
        eor             RR0.16b, RR0.16b, RR4.16b
        eor             RR1.16b, RR1.16b, RR5.16b
        eor             RR0.16b, RR0.16b, RR6.16b
        eor             RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz             w3, .Lghash_end
        b               .Lghash_loop_4x

.Lghash_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbnz            w3, .Lghash_loop_1x

.Lghash_end:
        rbit            RHASH.16b, RHASH.16b
        st1             {RHASH.2d}, [x1]

        ret
SYM_FUNC_END(pmull_ghash_update)

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nbytes
         *   x5: ghash result
         *   x6: ghash table
         *   x7: lengths (only for last block)
         */
        SM4_PREPARE(x0)

        ldp             x8, x9, [x3]
        rev             x8, x8
        rev             x9, x9

        ld1             {RH1.16b-RH4.16b}, [x6]

        ld1             {RHASH.16b}, [x5]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x6, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x6]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        cbz             w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
        cmp             w4, #(4 * 16)
        blt             .Lgcm_enc_loop_1x

        sub             w4, w4, #(4 * 16)

        /* construct CTRs */
        inc32_le128(v0)                 /* +0 */
        inc32_le128(v1)                 /* +1 */
        inc32_le128(v2)                 /* +2 */
        inc32_le128(v3)                 /* +3 */

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        /* ghash update */

        rbit            v0.16b, v0.16b
        rbit            v1.16b, v1.16b
        rbit            v2.16b, v2.16b
        rbit            v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1)        * H^3 => rr2:rr3
         * (in2)        * H^2 => rr4:rr5
         * (in3)        * H^1 => rr6:rr7
         */
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor             RR0.16b, RR0.16b, RR2.16b
        eor             RR1.16b, RR1.16b, RR3.16b
        eor             RR0.16b, RR0.16b, RR4.16b
        eor             RR1.16b, RR1.16b, RR5.16b
        eor             RR0.16b, RR0.16b, RR6.16b
        eor             RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz             w4, .Lgcm_enc_hash_len
        b               .Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
        cmp             w4, #16
        blt             .Lgcm_enc_tail

        sub             w4, w4, #16

        /* construct CTRs */
        inc32_le128(v0)

        ld1             {RTMP0.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, RTMP0.16b
        st1             {v0.16b}, [x1], #16

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbz             w4, .Lgcm_enc_hash_len
        b               .Lgcm_enc_loop_1x

.Lgcm_enc_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l           x0, .Lcts_permute_table
        add             x0, x0, #32
        sub             x0, x0, w4, uxtw
        ld1             {v3.16b}, [x0]
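
        /*
         * x0 now points 32 - nbytes into .Lcts_permute_table, so v3
         * holds indices that select the top nbytes lanes of a vector
         * and 0xff (clear to zero) everywhere else; the tbl below uses
         * it to compact and zero-pad the partial block.
         */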

.Lgcm_enc_tail_loop:
        /* do encrypt */
        ldrb            w0, [x2], #1    /* load one input byte */
        umov            w6, v0.b[0]     /* take the low keystream byte */
        eor             w6, w6, w0      /* w6 = keystream ^ input */
        strb            w6, [x1], #1    /* store output byte */

        /* shift out the used byte */
        ext             v0.16b, v0.16b, v0.16b, #1
        /* stash the new ciphertext byte in the high bytes */
        ins             v0.b[15], w6

        subs            w4, w4, #1
        bne             .Lgcm_enc_tail_loop

        /* pad the last block with zeros */
        tbl             v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
        cbz             x7, .Lgcm_enc_end

        GTAG_HASH_LENGTHS(v1, v3)

        b               .Lgcm_enc_ret

.Lgcm_enc_end:
        /* store new CTR */
        rev             x8, x8
        rev             x9, x9
        stp             x8, x9, [x3]

        rbit            RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
        /* store new MAC */
        st1             {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
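
/*
 * GCM authenticates the ciphertext, so the encrypt path above hashes
 * the blocks it has just produced, while the decrypt path below hashes
 * its inputs. Hashing the inputs is what allows decryption to fuse the
 * SM4 rounds with the GHASH multiplications in a single macro.
 */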

#undef  RR1
#undef  RR3
#undef  RR5
#undef  RR7
#undef  RR0
#undef  RR2
#undef  RR4
#undef  RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef  RH1
#undef  RH2
#undef  RH3
#undef  RH4

/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1     v6
#define RR3     v7
#define RR5     v8

#define RR0     v9
#define RR2     v10
#define RR4     v11

#define RTMP0   v12
#define RTMP1   v13
#define RTMP2   v14
#define RTMP3   v15
#define RTMP4   v16
#define RTMP5   v17

#define RH1     v18
#define RH2     v19
#define RH3     v20

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nbytes
         *   x5: ghash result
         *   x6: ghash table
         *   x7: lengths (only for last block)
         */
        SM4_PREPARE(x0)

        ldp             x8, x9, [x3]
        rev             x8, x8
        rev             x9, x9

        ld1             {RH1.16b-RH3.16b}, [x6]

        ld1             {RHASH.16b}, [x5]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x6, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x6]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        cbz             w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
        cmp             w4, #(3 * 16)
        blt             .Lgcm_dec_loop_1x

        sub             w4, w4, #(3 * 16)

        ld1             {v3.16b-v5.16b}, [x2], #(3 * 16)

        /* construct CTRs */
        inc32_le128(v0)                 /* +0 */
        rbit            v6.16b, v3.16b
        inc32_le128(v1)                 /* +1 */
        rbit            v7.16b, v4.16b
        inc32_le128(v2)                 /* +2 */
        rbit            v8.16b, v5.16b

        eor             RHASH.16b, RHASH.16b, v6.16b

        /* decrypt & ghash update */
        SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
                                    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
                                    RR2, RR3, v7, RH2, RTMP2, RTMP3,
                                    RR4, RR5, v8, RH1, RTMP4, RTMP5)

        eor             v0.16b, v0.16b, v3.16b
        eor             v1.16b, v1.16b, v4.16b
        eor             v2.16b, v2.16b, v5.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        st1             {v0.16b-v2.16b}, [x1], #(3 * 16)

        cbz             w4, .Lgcm_dec_hash_len
        b               .Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
        cmp             w4, #16
        blt             .Lgcm_dec_tail

        sub             w4, w4, #16

        ld1             {v3.16b}, [x2], #16

        /* construct CTRs */
        inc32_le128(v0)
        rbit            v6.16b, v3.16b

        eor             RHASH.16b, RHASH.16b, v6.16b

        SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

        eor             v0.16b, v0.16b, v3.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1             {v0.16b}, [x1], #16

        cbz             w4, .Lgcm_dec_hash_len
        b               .Lgcm_dec_loop_1x

.Lgcm_dec_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table (see the note in the encrypt tail path) */
        adr_l           x0, .Lcts_permute_table
        add             x0, x0, #32
        sub             x0, x0, w4, uxtw
        ld1             {v3.16b}, [x0]

.Lgcm_dec_tail_loop:
        /* do decrypt */
        ldrb            w0, [x2], #1    /* load one ciphertext byte */
        umov            w6, v0.b[0]     /* take the low keystream byte */
        eor             w6, w6, w0      /* w6 = keystream ^ input */
        strb            w6, [x1], #1    /* store output byte */

        /* shift out the used byte */
        ext             v0.16b, v0.16b, v0.16b, #1
        /* stash the ciphertext (input) byte in the high bytes */
        ins             v0.b[15], w0

        subs            w4, w4, #1
        bne             .Lgcm_dec_tail_loop

        /* pad the last block with zeros */
        tbl             v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
        cbz             x7, .Lgcm_dec_end

        GTAG_HASH_LENGTHS(v1, v3)

        b               .Lgcm_dec_ret

.Lgcm_dec_end:
        /* store new CTR */
        rev             x8, x8
        rev             x9, x9
        stp             x8, x9, [x3]

        rbit            RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
        /* store new MAC */
        st1             {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
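
/*
 * The decrypt path runs 3 blocks per iteration rather than 4: v0-v2
 * are needed for the counter blocks and v3-v5 must keep the ciphertext
 * inputs alive until the final xor, which leaves too few scratch
 * registers for the 4-way aggregation used on the encrypt side.
 */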

        .section        ".rodata", "a"
        .align 4
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
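
/*
 * Sliding window for tbl-based tail handling: loading 16 bytes at
 * offset 32 - n yields n valid indices (selecting the top n lanes of
 * the source vector) followed by 0xff entries, which tbl turns into
 * zeroes.
 */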

.Lghash_rconst:
        .quad           0x87
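
/*
 * 0x87 encodes x^7 + x^2 + x + 1, the low-order terms of the GHASH
 * field polynomial x^128 + x^7 + x^2 + x + 1. Together with the rbit
 * bit reflection applied to every operand, this is the folding
 * constant consumed by REDUCTION.
 */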