/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
                20, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
        .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
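
/*
 * The SM4E and SM4EKEY Crypto Extension instructions are emitted as
 * raw .inst words (SM4E = 0xcec08400 | Rn << 5 | Rd, SM4EKEY =
 * 0xce60c800 | Rm << 16 | Rn << 5 | Rd) so the file assembles even
 * with toolchains that lack the SM4 mnemonics.  The .Lv<n> symbols
 * set up by the .irp block above map register names to encoding
 * numbers for these macros.
 */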

/* Register macros */

#define RTMP0   v16
#define RTMP1   v17
#define RTMP2   v18
#define RTMP3   v19

#define RIV     v20
#define RMAC    v20
#define RMASK   v21
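
/*
 * RIV and RMAC deliberately share v20: the IV-chaining functions and
 * sm4_ce_mac_update are separate entry points that never run within
 * the same call, so one register can serve both roles.
 */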

.align 3
SYM_FUNC_START(sm4_ce_expand_key)
        /* input:
         *   x0: 128-bit key
         *   x1: rkey_enc
         *   x2: rkey_dec
         *   x3: fk array
         *   x4: ck array
         */
        ld1             {v0.16b}, [x0];
        rev32           v0.16b, v0.16b;
        ld1             {v1.16b}, [x3];
        /* load ck */
        ld1             {v24.16b-v27.16b}, [x4], #64;
        ld1             {v28.16b-v31.16b}, [x4];

        /* input ^ fk */
        eor             v0.16b, v0.16b, v1.16b;
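
        /*
         * Each SM4EKEY instruction consumes four CK constants and
         * derives the next four round keys from the previous four, so
         * the eight instructions below produce all 32 round keys of
         * the schedule.
         */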
        sm4ekey         v0.4s, v0.4s, v24.4s;
        sm4ekey         v1.4s, v0.4s, v25.4s;
        sm4ekey         v2.4s, v1.4s, v26.4s;
        sm4ekey         v3.4s, v2.4s, v27.4s;
        sm4ekey         v4.4s, v3.4s, v28.4s;
        sm4ekey         v5.4s, v4.4s, v29.4s;
        sm4ekey         v6.4s, v5.4s, v30.4s;
        sm4ekey         v7.4s, v6.4s, v31.4s;

        adr_l           x5, .Lbswap128_mask
        ld1             {v24.16b}, [x5]

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1];
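
        /*
         * The decryption key schedule is the encryption schedule in
         * reverse.  .Lbswap128_mask (loaded into v24 above) is a tbl
         * shuffle that reverses the order of the four 32-bit words in
         * a vector; applying it to v7..v0 and storing the results
         * back-to-front yields rkey_dec.
         */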
        tbl             v16.16b, {v7.16b}, v24.16b
        tbl             v17.16b, {v6.16b}, v24.16b
        tbl             v18.16b, {v5.16b}, v24.16b
        tbl             v19.16b, {v4.16b}, v24.16b
        tbl             v20.16b, {v3.16b}, v24.16b
        tbl             v21.16b, {v2.16b}, v24.16b
        tbl             v22.16b, {v1.16b}, v24.16b
        tbl             v23.16b, {v0.16b}, v24.16b

        st1             {v16.16b-v19.16b}, [x2], #64
        st1             {v20.16b-v23.16b}, [x2]

        ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         */
        SM4_PREPARE(x0)

        ld1             {v0.16b}, [x2];
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1];

        ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE(x0)
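
        /*
         * Bulk loop: handle 8 blocks per iteration, then fall through
         * to 4-block and single-block tails.  After the "sub" below,
         * tbnz on bit 31 checks whether the count went negative,
         * i.e. fewer than 8 blocks remain.
         */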
.Lcrypt_loop_blk:
        sub             w3, w3, #8;
        tbnz            w3, #31, .Lcrypt_tail8;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        ld1             {v4.16b-v7.16b}, [x2], #64;

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;
        b               .Lcrypt_loop_blk;

.Lcrypt_tail8:
        add             w3, w3, #8;
        cmp             w3, #4;
        blt             .Lcrypt_tail4;

        sub             w3, w3, #4;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        SM4_CRYPT_BLK4(v0, v1, v2, v3);
        st1             {v0.16b-v3.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;

.Lcrypt_tail4:
        sub             w3, w3, #1;

        ld1             {v0.16b}, [x2], #16;
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1], #16;

        cbnz            w3, .Lcrypt_tail4;

.Lcrypt_end:
        ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]
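
        /*
         * CBC encryption is inherently serial (C[i] = E(P[i] ^ C[i-1])),
         * so each block is encrypted on its own; the 4x loop only
         * batches the loads and stores.
         */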
.Lcbc_enc_loop_4x:
        cmp             w4, #4
        blt             .Lcbc_enc_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             v0.16b, v0.16b, RIV.16b
        SM4_CRYPT_BLK(v0)
        eor             v1.16b, v1.16b, v0.16b
        SM4_CRYPT_BLK(v1)
        eor             v2.16b, v2.16b, v1.16b
        SM4_CRYPT_BLK(v2)
        eor             v3.16b, v3.16b, v2.16b
        SM4_CRYPT_BLK(v3)

        st1             {v0.16b-v3.16b}, [x1], #64
        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_enc_end
        b               .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        st1             {RIV.16b}, [x1], #16

        cbnz            w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]
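
        /*
         * CBC decryption parallelizes: P[i] = D(C[i]) ^ C[i-1], so all
         * eight blocks are decrypted at once.  The rev32 below
         * byte-swaps the input up front so the _BE crypt macros can
         * skip their own swap, while v0-v7 keep the original
         * ciphertext for the chaining XOR afterwards.
         */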
.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b
        rev32           v12.16b, v4.16b
        rev32           v13.16b, v5.16b
        rev32           v14.16b, v6.16b
        rev32           v15.16b, v7.16b

        SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b
        eor             v12.16b, v12.16b, v3.16b
        eor             v13.16b, v13.16b, v4.16b
        eor             v14.16b, v14.16b, v5.16b
        eor             v15.16b, v15.16b, v6.16b

        st1             {v8.16b-v11.16b}, [x1], #64
        st1             {v12.16b-v15.16b}, [x1], #64

        mov             RIV.16b, v7.16b

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b

        SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b

        st1             {v8.16b-v11.16b}, [x1], #64

        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        rev32           v8.16b, v0.16b

        SM4_CRYPT_BLK_BE(v8)

        eor             v8.16b, v8.16b, RIV.16b
        st1             {v8.16b}, [x1], #16

        mov             RIV.16b, v0.16b

        cbnz            w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5
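        /* x5 = nbytes - 16: the length of the final partial block Pn */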

        ld1             {RIV.16b}, [x3]

        ld1             {v0.16b}, [x2]
        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]
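
        /*
         * v3 = { 0xff x (16-x5), 0 .. x5-1 }: tbl with v3 moves the
         * first x5 bytes of a vector to its tail and zeroes the rest.
         * v4 = { 16-x5 .. 15, 0xff x (16-x5) }: tbl with v4 moves the
         * last x5 bytes to the front and zero-pads; tbx instead leaves
         * the remaining destination bytes untouched.
         */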

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v0.16b, {RIV.16b}, v3.16b
        /* padding Pn with zeros */
        tbl             v1.16b, {v1.16b}, v4.16b

        eor             v1.16b, v1.16b, RIV.16b
        SM4_CRYPT_BLK(v1)

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v0.16b}, [x5]
        st1             {v1.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]
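
        /* v3/v4: the same CTS shuffle masks as in sm4_ce_cbc_cts_enc */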

        /* overlapping loads */
        ld1             {v0.16b}, [x2], x5
        ld1             {v1.16b}, [x2]

        SM4_CRYPT_BLK(v0)
        /* select the first Ln bytes of Xn to create Pn */
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        /* overwrite the first Ln bytes with Cn to create En-1 */
        tbx             v0.16b, {v1.16b}, v4.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, RIV.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8

.Lctr_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_4x

#define inc_le128(vctr)                                 \
                mov             vctr.d[1], x8;          \
                mov             vctr.d[0], x7;          \
                adds            x8, x8, #1;             \
                rev64           vctr.16b, vctr.16b;     \
                adc             x7, x7, xzr;
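
/*
 * inc_le128 materializes the current counter into vctr, then
 * post-increments it.  The counter lives in x7 (high 64 bits) and x8
 * (low 64 bits) as native integers so a plain adds/adc pair performs
 * the 128-bit increment; rev64 restores the big-endian byte order
 * that CTR mode requires within each lane.
 */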

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        ld1             {v8.16b-v11.16b}, [x2], #64
        ld1             {v12.16b-v15.16b}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_end
        b               .Lctr_loop_8x

.Lctr_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_loop_1x

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v8.16b-v11.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_end

.Lctr_loop_1x:
        sub             w4, w4, #1

        /* construct CTRs */
        inc_le128(v0)

        ld1             {v8.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        cbnz            w4, .Lctr_loop_1x

.Lctr_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_ce_ctr_enc)

#define tweak_next(vt, vin, RTMP)                                       \
                sshr            RTMP.2d, vin.2d, #63;                   \
                and             RTMP.16b, RTMP.16b, RMASK.16b;          \
                add             vt.2d, vin.2d, vin.2d;                  \
                ext             RTMP.16b, RTMP.16b, RTMP.16b, #8;       \
                eor             vt.16b, vt.16b, RTMP.16b;
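
/*
 * tweak_next computes vt = vin * x in GF(2^128) with the XTS modulus
 * x^128 + x^7 + x^2 + x + 1: both 64-bit lanes are shifted left by one
 * (add vt.2d, vin.2d, vin.2d), and the bits shifted out of each lane
 * (sshr #63, masked with RMASK = { 1, 0x87 }) are folded into the
 * opposite lane via ext #8: the low lane's carry moves into the high
 * lane, and the high lane's carry wraps around as 0x87.
 */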

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_enc_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5
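
        /*
         * Split nbytes into w4 full blocks and an x5-byte tail.  If a
         * tail exists (nbytes & 15), the csel above holds one full
         * block back from the bulk loops so it can take part in
         * ciphertext stealing.
         */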

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s
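        /* RMASK.2d = { 1, 0x87 }: the carry masks used by tweak_next */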

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_enc_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_enc_cts
        b               .Lxts_enc_loop_8x

.Lxts_enc_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_enc_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
        cbz             x5, .Lxts_enc_end

        /* cipher text stealing */

        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_enc_ret

.Lxts_enc_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_enc_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_dec_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5
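        /* w4/x5: same full-block/tail split as in sm4_ce_xts_enc */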

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_dec_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_dec_cts
        b               .Lxts_dec_loop_8x

.Lxts_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_dec_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
        cbz             x5, .Lxts_dec_end

        /* cipher text stealing */

        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_dec_ret

.Lxts_dec_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_dec_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
        /* input:
         *   x0: round key array, CTX
         *   x1: digest
         *   x2: src
         *   w3: nblocks
         *   w4: enc_before
         *   w5: enc_after
         */
        SM4_PREPARE(x0)

        ld1             {RMAC.16b}, [x1]

        cbz             w4, .Lmac_update

        SM4_CRYPT_BLK(RMAC)

.Lmac_update:
        cbz             w3, .Lmac_ret

        sub             w6, w3, #1
        cmp             w5, wzr
        csel            w3, w3, w6, ne
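
        /*
         * If enc_after is zero, hold the last block back: it is XORed
         * into the MAC at .Lmac_end below but left unencrypted, for
         * callers that finish the MAC computation themselves.
         */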

        cbz             w3, .Lmac_end

.Lmac_loop_4x:
        cmp             w3, #4
        blt             .Lmac_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v1.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v2.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v3.16b
        SM4_CRYPT_BLK(RMAC)

        cbz             w3, .Lmac_end
        b               .Lmac_loop_4x

.Lmac_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)

        cbnz            w3, .Lmac_loop_1x

.Lmac_end:
        cbnz            w5, .Lmac_ret

        ld1             {v0.16b}, [x2], #16
        eor             RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
        st1             {RMAC.16b}, [x1]
        ret
SYM_FUNC_END(sm4_ce_mac_update)

        .section        ".rodata", "a"
        .align 4
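/*
 * tbl shuffle mask that reverses the order of the four 32-bit words
 * in a vector; used to reverse the round-key order for decryption.
 */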
.Lbswap128_mask:
        .byte           0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
        .byte           0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
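
/*
 * Sliding window of 0xff and identity indices: reading 16 bytes at
 * offset n (or 32 - n) produces the tbl/tbx masks used for the
 * partial final block in the CBC-CTS and XTS paths.
 */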
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff