/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"
.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* Used for both encryption and decryption */

/*
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)		\
		ext	T0.16b, m1.16b, m1.16b, #8;	\
		pmull	r0.1q, m0.1d, m1.1d;		\
		pmull	T1.1q, m0.1d, T0.1d;		\
		pmull2	T0.1q, m0.2d, T0.2d;		\
		pmull2	r1.1q, m0.2d, m1.2d;		\
		eor	T0.16b, T0.16b, T1.16b;		\
		ext	T1.16b, RZERO.16b, T0.16b, #8;	\
		ext	T0.16b, T0.16b, RZERO.16b, #8;	\
		eor	r0.16b, r0.16b, T1.16b;		\
		eor	r1.16b, r1.16b, T0.16b;
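/*
 * PMUL_128x128 is a schoolbook carry-less multiply on 64-bit halves.
 * With m0 = a1:a0 and m1 = b1:b0 it computes the 256-bit product
 *
 *   r1:r0 = (a1*b1 << 128) ^ ((a0*b1 ^ a1*b0) << 64) ^ (a0*b0)
 *
 * The ext on m1 swaps its halves so pmull/pmull2 can form the two
 * cross products, and the ext/eor against RZERO shifts the combined
 * middle term by 64 bits before folding it into r0:r1.
 */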
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,		\
			r2, r3, m2, m3, T2, T3,		\
			r4, r5, m4, m5, T4, T5,		\
			r6, r7, m6, m7, T6, T7)		\
		ext	T0.16b, m1.16b, m1.16b, #8;	\
		ext	T2.16b, m3.16b, m3.16b, #8;	\
		ext	T4.16b, m5.16b, m5.16b, #8;	\
		ext	T6.16b, m7.16b, m7.16b, #8;	\
		pmull	r0.1q, m0.1d, m1.1d;		\
		pmull	r2.1q, m2.1d, m3.1d;		\
		pmull	r4.1q, m4.1d, m5.1d;		\
		pmull	r6.1q, m6.1d, m7.1d;		\
		pmull	T1.1q, m0.1d, T0.1d;		\
		pmull	T3.1q, m2.1d, T2.1d;		\
		pmull	T5.1q, m4.1d, T4.1d;		\
		pmull	T7.1q, m6.1d, T6.1d;		\
		pmull2	T0.1q, m0.2d, T0.2d;		\
		pmull2	T2.1q, m2.2d, T2.2d;		\
		pmull2	T4.1q, m4.2d, T4.2d;		\
		pmull2	T6.1q, m6.2d, T6.2d;		\
		pmull2	r1.1q, m0.2d, m1.2d;		\
		pmull2	r3.1q, m2.2d, m3.2d;		\
		pmull2	r5.1q, m4.2d, m5.2d;		\
		pmull2	r7.1q, m6.2d, m7.2d;		\
		eor	T0.16b, T0.16b, T1.16b;		\
		eor	T2.16b, T2.16b, T3.16b;		\
		eor	T4.16b, T4.16b, T5.16b;		\
		eor	T6.16b, T6.16b, T7.16b;		\
		ext	T1.16b, RZERO.16b, T0.16b, #8;	\
		ext	T3.16b, RZERO.16b, T2.16b, #8;	\
		ext	T5.16b, RZERO.16b, T4.16b, #8;	\
		ext	T7.16b, RZERO.16b, T6.16b, #8;	\
		ext	T0.16b, T0.16b, RZERO.16b, #8;	\
		ext	T2.16b, T2.16b, RZERO.16b, #8;	\
		ext	T4.16b, T4.16b, RZERO.16b, #8;	\
		ext	T6.16b, T6.16b, RZERO.16b, #8;	\
		eor	r0.16b, r0.16b, T1.16b;		\
		eor	r2.16b, r2.16b, T3.16b;		\
		eor	r4.16b, r4.16b, T5.16b;		\
		eor	r6.16b, r6.16b, T7.16b;		\
		eor	r1.16b, r1.16b, T0.16b;		\
		eor	r3.16b, r3.16b, T2.16b;		\
		eor	r5.16b, r5.16b, T4.16b;		\
		eor	r7.16b, r7.16b, T6.16b;
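/*
 * PMUL_128x128_4x is four independent copies of PMUL_128x128 with the
 * instructions interleaved to keep the pmull pipeline busy.  The
 * callers below use it to multiply four GHASH input blocks by
 * H^4..H^1 in one pass and xor the four double-width products
 * together before doing a single REDUCTION.
 */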
/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)		\
		pmull2	T0.1q, r1.2d, rconst.2d;	\
		ext	T1.16b, T0.16b, RZERO.16b, #8;	\
		ext	T0.16b, RZERO.16b, T0.16b, #8;	\
		eor	r1.16b, r1.16b, T1.16b;		\
		eor	r0.16b, r0.16b, T0.16b;		\
		pmull	T0.1q, r1.1d, rconst.1d;	\
		eor	a.16b, r0.16b, T0.16b;
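/*
 * REDUCTION folds the 256-bit product r1:r0 back into a 128-bit value
 * 'a' modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1.  Because
 * the inputs are bit-reversed with rbit up front, the multiply runs in
 * natural bit order and x^128 reduces to the low coefficients held in
 * rconst, so two pmull-by-rconst folds (the top 64 bits of r1, then
 * the remaining 64 bits) are enough.
 */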
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
		rev32	b0.16b, b0.16b;			\
		ext	T0.16b, m1.16b, m1.16b, #8;	\
		sm4e	b0.4s, v24.4s;			\
		pmull	r0.1q, m0.1d, m1.1d;		\
		sm4e	b0.4s, v25.4s;			\
		pmull	T1.1q, m0.1d, T0.1d;		\
		sm4e	b0.4s, v26.4s;			\
		pmull2	T0.1q, m0.2d, T0.2d;		\
		sm4e	b0.4s, v27.4s;			\
		pmull2	r1.1q, m0.2d, m1.2d;		\
		sm4e	b0.4s, v28.4s;			\
		eor	T0.16b, T0.16b, T1.16b;		\
		sm4e	b0.4s, v29.4s;			\
		ext	T1.16b, RZERO.16b, T0.16b, #8;	\
		sm4e	b0.4s, v30.4s;			\
		ext	T0.16b, T0.16b, RZERO.16b, #8;	\
		sm4e	b0.4s, v31.4s;			\
		eor	r0.16b, r0.16b, T1.16b;		\
		rev64	b0.4s, b0.4s;			\
		eor	r1.16b, r1.16b, T0.16b;		\
		ext	b0.16b, b0.16b, b0.16b, #8;	\
		rev32	b0.16b, b0.16b;
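/*
 * SM4_CRYPT_PMUL_128x128_BLK runs one full SM4 encryption of b0 (eight
 * sm4e steps of four rounds each, round keys in v24-v31) interleaved
 * with one PMUL_128x128 of m0 by m1, so the cipher and the pmull
 * multiplier overlap.  The leading rev32 and the trailing
 * rev64/ext/rev32 convert b0 between memory order and the word order
 * sm4e expects, including SM4's final word swap.
 */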
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
		rev32	b0.16b, b0.16b;			\
		rev32	b1.16b, b1.16b;			\
		rev32	b2.16b, b2.16b;			\
		ext	T0.16b, m1.16b, m1.16b, #8;	\
		ext	T2.16b, m3.16b, m3.16b, #8;	\
		ext	T4.16b, m5.16b, m5.16b, #8;	\
		sm4e	b0.4s, v24.4s;			\
		sm4e	b1.4s, v24.4s;			\
		sm4e	b2.4s, v24.4s;			\
		pmull	r0.1q, m0.1d, m1.1d;		\
		pmull	r2.1q, m2.1d, m3.1d;		\
		pmull	r4.1q, m4.1d, m5.1d;		\
		sm4e	b0.4s, v25.4s;			\
		sm4e	b1.4s, v25.4s;			\
		sm4e	b2.4s, v25.4s;			\
		pmull	T1.1q, m0.1d, T0.1d;		\
		pmull	T3.1q, m2.1d, T2.1d;		\
		pmull	T5.1q, m4.1d, T4.1d;		\
		sm4e	b0.4s, v26.4s;			\
		sm4e	b1.4s, v26.4s;			\
		sm4e	b2.4s, v26.4s;			\
		pmull2	T0.1q, m0.2d, T0.2d;		\
		pmull2	T2.1q, m2.2d, T2.2d;		\
		pmull2	T4.1q, m4.2d, T4.2d;		\
		sm4e	b0.4s, v27.4s;			\
		sm4e	b1.4s, v27.4s;			\
		sm4e	b2.4s, v27.4s;			\
		pmull2	r1.1q, m0.2d, m1.2d;		\
		pmull2	r3.1q, m2.2d, m3.2d;		\
		pmull2	r5.1q, m4.2d, m5.2d;		\
		sm4e	b0.4s, v28.4s;			\
		sm4e	b1.4s, v28.4s;			\
		sm4e	b2.4s, v28.4s;			\
		eor	T0.16b, T0.16b, T1.16b;		\
		eor	T2.16b, T2.16b, T3.16b;		\
		eor	T4.16b, T4.16b, T5.16b;		\
		sm4e	b0.4s, v29.4s;			\
		sm4e	b1.4s, v29.4s;			\
		sm4e	b2.4s, v29.4s;			\
		ext	T1.16b, RZERO.16b, T0.16b, #8;	\
		ext	T3.16b, RZERO.16b, T2.16b, #8;	\
		ext	T5.16b, RZERO.16b, T4.16b, #8;	\
		sm4e	b0.4s, v30.4s;			\
		sm4e	b1.4s, v30.4s;			\
		sm4e	b2.4s, v30.4s;			\
		ext	T0.16b, T0.16b, RZERO.16b, #8;	\
		ext	T2.16b, T2.16b, RZERO.16b, #8;	\
		ext	T4.16b, T4.16b, RZERO.16b, #8;	\
		sm4e	b0.4s, v31.4s;			\
		sm4e	b1.4s, v31.4s;			\
		sm4e	b2.4s, v31.4s;			\
		eor	r0.16b, r0.16b, T1.16b;		\
		eor	r2.16b, r2.16b, T3.16b;		\
		eor	r4.16b, r4.16b, T5.16b;		\
		rev64	b0.4s, b0.4s;			\
		rev64	b1.4s, b1.4s;			\
		rev64	b2.4s, b2.4s;			\
		eor	r1.16b, r1.16b, T0.16b;		\
		eor	r3.16b, r3.16b, T2.16b;		\
		eor	r5.16b, r5.16b, T4.16b;		\
		ext	b0.16b, b0.16b, b0.16b, #8;	\
		ext	b1.16b, b1.16b, b1.16b, #8;	\
		ext	b2.16b, b2.16b, b2.16b, #8;	\
		eor	r0.16b, r0.16b, r2.16b;		\
		eor	r1.16b, r1.16b, r3.16b;		\
		rev32	b0.16b, b0.16b;			\
		rev32	b1.16b, b1.16b;			\
		rev32	b2.16b, b2.16b;			\
		eor	r0.16b, r0.16b, r4.16b;		\
		eor	r1.16b, r1.16b, r5.16b;
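/*
 * SM4_CRYPT_PMUL_128x128_BLK3 is the three-block variant used by the
 * decrypt path: three SM4 encryptions of the counters b0-b2 are
 * interleaved with three GHASH multiplies (m0*m1, m2*m3, m4*m5), and
 * the three double-width products are already xored together into
 * r0:r1 at the end, ready for a single REDUCTION.
 */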
#define inc32_le128(vctr)				\
		mov	vctr.d[0], x8;			\
		mov	vctr.d[1], x9;			\
		add	w6, w6, #1;			\
		bfi	x9, x6, #0, #32;		\
		rev64	vctr.16b, vctr.16b;
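/*
 * inc32_le128 emits the next CTR block: the IV lives in x8:x9 with the
 * 32-bit block counter tracked in w6.  Each call copies x8:x9 into
 * vctr, bumps w6 and writes it back into the low 32 bits of x9 with
 * bfi for the following block, and rev64 puts vctr into the big-endian
 * byte order the cipher consumes.
 */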
#define GTAG_HASH_LENGTHS(vctr0, vlen)			\
		ld1	{vlen.16b}, [x7];		\
		/* construct CTR0 */			\
		/* the lower 32 bits of the initial IV are always be32(1) */ \
		bfi	x9, x6, #0, #32;		\
		mov	vctr0.d[0], x8;			\
		mov	vctr0.d[1], x9;			\
		rbit	vlen.16b, vlen.16b;		\
		rev64	vctr0.16b, vctr0.16b;		\
		/* authtag = GCTR(CTR0, GHASH) */	\
		eor	RHASH.16b, RHASH.16b, vlen.16b;	\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
					   RTMP0, RTMP1); \
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
		rbit	RHASH.16b, RHASH.16b;		\
		eor	RHASH.16b, RHASH.16b, vctr0.16b;
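/*
 * GTAG_HASH_LENGTHS computes the authentication tag: it folds the
 * 128-bit lengths block at [x7] into the GHASH state, rebuilds CTR0
 * (the initial counter block) from x8:x9, encrypts CTR0 with SM4 while
 * performing the final GHASH multiply, and leaves
 * authtag = E(K, CTR0) ^ GHASH in RHASH with the bit order restored.
 */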
/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
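/*
 * sm4_ce_pmull_ghash_setup derives the hash key H = SM4(key, 0^128)
 * and precomputes the powers H^1..H^4 (H^2 = H*H, H^3 = H^2*H,
 * H^4 = H^2*H^2, each reduced and kept in the bit-reversed form the
 * pmull code works on), storing the four values at [x1] for the bulk
 * routines below.
 */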
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 * x0: round key array, CTX
	 * x1: ghash table
	 */
	adr_l	x2, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x2]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32	v0.16b, RZERO.16b

	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1	{RH1.16b-RH4.16b}, [x1]
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
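/*
 * pmull_ghash_update absorbs full 16-byte blocks into the GHASH state
 * at [x1], four per iteration using the aggregated form
 *
 *   GHASH = ((GHASH ^ in0)*H^4 ^ in1*H^3 ^ in2*H^2 ^ in3*H^1) mod g
 *
 * with one REDUCTION per group, then falls back to a one-block loop.
 * x0 points at the H^1..H^4 table from sm4_ce_pmull_ghash_setup, x2 at
 * the source data, and w3 carries the remaining block count.
 */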
SYM_FUNC_START(pmull_ghash_update)
	ld1	{RH1.16b-RH4.16b}, [x0]

	ld1	{RHASH.16b}, [x1]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x4, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x4]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	ld1	{v0.16b-v3.16b}, [x2], #64

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor	RR0.16b, RR0.16b, RR2.16b
	eor	RR1.16b, RR1.16b, RR3.16b
	eor	RR0.16b, RR0.16b, RR4.16b
	eor	RR1.16b, RR1.16b, RR5.16b
	eor	RR0.16b, RR0.16b, RR6.16b
	eor	RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	ld1	{v0.16b}, [x2], #16

	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz	w3, .Lghash_loop_1x

	rbit	RHASH.16b, RHASH.16b
SYM_FUNC_END(pmull_ghash_update)
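/*
 * sm4_ce_pmull_gcm_enc fuses SM4-CTR encryption with GHASH: the main
 * loop encrypts four counter blocks with SM4_CRYPT_BLK4, xors them
 * into the source to produce ciphertext, and feeds those four
 * ciphertext blocks through the same aggregated 4-block GHASH as
 * above.  One-block and sub-block tails follow (the partial block is
 * padded via .Lcts_permute_table before hashing), and when x7 is
 * non-zero GTAG_HASH_LENGTHS finishes the tag.
 */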
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: ctr (big endian, 128 bit)
	 * w4: nbytes
	 * x5: ghash result
	 * x6: ghash table
	 * x7: lengths (only for last block)
	 */
	ld1	{RH1.16b-RH4.16b}, [x6]

	ld1	{RHASH.16b}, [x5]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x6, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x6]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	cbz	w4, .Lgcm_enc_hash_len

	blt	.Lgcm_enc_loop_1x

	sub	w4, w4, #(4 * 16)

	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, RTMP0.16b
	eor	v1.16b, v1.16b, RTMP1.16b
	eor	v2.16b, v2.16b, RTMP2.16b
	eor	v3.16b, v3.16b, RTMP3.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor	RR0.16b, RR0.16b, RR2.16b
	eor	RR1.16b, RR1.16b, RR3.16b
	eor	RR0.16b, RR0.16b, RR4.16b
	eor	RR1.16b, RR1.16b, RR5.16b
	eor	RR0.16b, RR0.16b, RR6.16b
	eor	RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz	w4, .Lgcm_enc_hash_len

	ld1	{RTMP0.16b}, [x2], #16

	eor	v0.16b, v0.16b, RTMP0.16b
	st1	{v0.16b}, [x1], #16

	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz	w4, .Lgcm_enc_hash_len

	/* load permute table */
	adr_l	x0, .Lcts_permute_table

	ldrb	w0, [x2], #1		/* get 1 byte from input */
	umov	w6, v0.b[0]		/* get top crypted byte */
	eor	w6, w6, w0		/* w6 = CTR ^ input */
	strb	w6, [x1], #1		/* store out byte */

	/* shift right out one byte */
	ext	v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */

	bne	.Lgcm_enc_tail_loop

	/* padding last block with zeros */
	tbl	v0.16b, {v0.16b}, v3.16b

	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz	x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	rbit	RHASH.16b, RHASH.16b
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */
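/*
 * sm4_ce_pmull_gcm_dec mirrors the encrypt path, but since GHASH runs
 * over the ciphertext it can hash each input block while the matching
 * counter block is still being encrypted: the main loop processes
 * three blocks at a time with SM4_CRYPT_PMUL_128x128_BLK3 (ciphertext
 * multiplied by H^3..H^1), followed by one-block and sub-block tails
 * and the same GTAG_HASH_LENGTHS finish.
 */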
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: ctr (big endian, 128 bit)
	 * w4: nbytes
	 * x5: ghash result
	 * x6: ghash table
	 * x7: lengths (only for last block)
	 */
	ld1	{RH1.16b-RH3.16b}, [x6]

	ld1	{RHASH.16b}, [x5]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x6, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x6]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	cbz	w4, .Lgcm_dec_hash_len

	blt	.Lgcm_dec_loop_1x

	sub	w4, w4, #(3 * 16)

	ld1	{v3.16b-v5.16b}, [x2], #(3 * 16)

	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */

	eor	RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor	v0.16b, v0.16b, v3.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1	{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz	w4, .Lgcm_dec_hash_len

	ld1	{v3.16b}, [x2], #16

	eor	RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor	v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1	{v0.16b}, [x1], #16

	cbz	w4, .Lgcm_dec_hash_len

	/* load permute table */
	adr_l	x0, .Lcts_permute_table

	ldrb	w0, [x2], #1		/* get 1 byte from input */
	umov	w6, v0.b[0]		/* get top crypted byte */
	eor	w6, w6, w0		/* w6 = CTR ^ input */
	strb	w6, [x1], #1		/* store out byte */

	/* shift right out one byte */
	ext	v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */

	bne	.Lgcm_dec_tail_loop

	/* padding last block with zeros */
	tbl	v0.16b, {v0.16b}, v3.16b

	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz	x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	rbit	RHASH.16b, RHASH.16b
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
	.section ".rodata", "a"

	.align 4
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
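	/*
	 * GHASH reduction constant: x^7 + x^2 + x + 1, the low part of
	 * the field polynomial x^128 + x^7 + x^2 + x + 1, loaded and
	 * replicated into both lanes of RRCONST by the ld1r loads above
	 * and used by REDUCTION to fold the high 128 bits of each
	 * product.
	 */
	.align 4
.Lghash_rconst:
	.quad	0x87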