1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
5 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
7 * Copyright (C) 2022, Alibaba Group.
8 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
11 #include <linux/linkage.h>
12 #include <asm/assembler.h>
13 #include "sm4-ce-asm.h"
/*
 * Numeric register aliases (.L0 .. .L31) used to hand-encode the SM4
 * Crypto Extension instructions below, for assemblers that do not
 * understand the sm4e/sm4ekey mnemonics.
 * NOTE(review): the .irp body (the ".set .L\b, \b" line), the closing
 * .endr and the ".macro sm4e, vd, vn" opener are missing from this
 * chunk -- TODO confirm against the full file.
 */
17 .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
18 20, 24, 25, 26, 27, 28, 29, 30, 31
/* Presumably the SM4E (round function) encoding: base opcode OR'd with
 * the source (vn, bits 5..9) and destination (vd, bits 0..4) indices. */
23 .inst 0xcec08400 | (.L\vn << 5) | .L\vd
/* sm4ekey vd, vn, vm -- SM4EKEY (key-schedule step): derives four round
 * keys in vd from the previous four in vn and the CK constants in vm. */
26 .macro sm4ekey, vd, vn, vm
27 .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
/*
 * SM4 key expansion: derives the 32 round keys.
 * Visible register roles (several setup lines are missing from this
 * chunk -- the user-key load into v0, the EOR with the FK constants,
 * and the bswap-mask load; register-role names below are inferred,
 * TODO confirm): x1 = encryption round keys out, x2 = decryption round
 * keys out (reversed order), x4 = CK round-constant table in.
 */
43 SYM_FUNC_START(sm4_ce_expand_key)
/* Load all 32 CK constants as eight 128-bit vectors v24..v31. */
55 ld1 {v24.16b-v27.16b}, [x4], #64;
56 ld1 {v28.16b-v31.16b}, [x4];
/* NOTE(review): v1 here presumably holds the FK constants loaded by a
 * missing line above; this folds them into the user key in v0. */
59 eor v0.16b, v0.16b, v1.16b;
/* Each sm4ekey step produces 4 round keys from the previous 4, so
 * v0..v7 end up holding rk[0..31] for encryption. */
61 sm4ekey v0.4s, v0.4s, v24.4s;
62 sm4ekey v1.4s, v0.4s, v25.4s;
63 sm4ekey v2.4s, v1.4s, v26.4s;
64 sm4ekey v3.4s, v2.4s, v27.4s;
65 sm4ekey v4.4s, v3.4s, v28.4s;
66 sm4ekey v5.4s, v4.4s, v29.4s;
67 sm4ekey v6.4s, v5.4s, v30.4s;
68 sm4ekey v7.4s, v6.4s, v31.4s;
/* NOTE(review): the ld1 that loads the mask at x5 (presumably into
 * v24, reused below as the tbl permutation) is missing from this view. */
70 adr_l x5, .Lbswap128_mask
/* Store the encryption round keys. */
73 st1 {v0.16b-v3.16b}, [x1], #64;
74 st1 {v4.16b-v7.16b}, [x1];
/* Build the decryption key schedule: the round keys in reverse order,
 * each 128-bit vector byte-permuted through the mask in v24. */
76 tbl v16.16b, {v7.16b}, v24.16b
77 tbl v17.16b, {v6.16b}, v24.16b
78 tbl v18.16b, {v5.16b}, v24.16b
79 tbl v19.16b, {v4.16b}, v24.16b
80 tbl v20.16b, {v3.16b}, v24.16b
81 tbl v21.16b, {v2.16b}, v24.16b
82 tbl v22.16b, {v1.16b}, v24.16b
83 tbl v23.16b, {v0.16b}, v24.16b
/* Store the decryption round keys. */
85 st1 {v16.16b-v19.16b}, [x2], #64
86 st1 {v20.16b-v23.16b}, [x2]
89 SYM_FUNC_END(sm4_ce_expand_key)
/*
 * Encrypt/decrypt a single 16-byte block.
 * NOTE(review): the entire body (round-key load, block load, rounds,
 * store, ret) is missing from this chunk; only the shell is visible.
 */
92 SYM_FUNC_START(sm4_ce_crypt_block)
94 * x0: round key array, CTX
105 SYM_FUNC_END(sm4_ce_crypt_block)
/*
 * ECB-style multi-block encrypt/decrypt.
 * Visible roles: x1 = dst, x2 = src, w3 = remaining block count
 * (decremented by missing "sub" lines; the #31 tbnz tests for the
 * counter having gone negative, i.e. fewer than 8/4 blocks left).
 */
108 SYM_FUNC_START(sm4_ce_crypt)
110 * x0: round key array, CTX
/* Fewer than 8 blocks remain -> handle a smaller tail. */
119 tbnz w3, #31, .Lcrypt_tail8;
/* 8-block bulk path. */
121 ld1 {v0.16b-v3.16b}, [x2], #64;
122 ld1 {v4.16b-v7.16b}, [x2], #64;
124 SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
126 st1 {v0.16b-v3.16b}, [x1], #64;
127 st1 {v4.16b-v7.16b}, [x1], #64;
/* 4-block tail. */
139 ld1 {v0.16b-v3.16b}, [x2], #64;
140 SM4_CRYPT_BLK4(v0, v1, v2, v3);
141 st1 {v0.16b-v3.16b}, [x1], #64;
/* Single-block tail loop (SM4_CRYPT_BLK invocation missing from view). */
148 ld1 {v0.16b}, [x2], #16;
150 st1 {v0.16b}, [x1], #16;
152 cbnz w3, .Lcrypt_tail4;
156 SYM_FUNC_END(sm4_ce_crypt)
/*
 * CBC encryption.  Inherently serial: each ciphertext feeds the next
 * block's XOR, so even the 4x path chains the EORs (with the per-block
 * SM4_CRYPT_BLK calls between them missing from this view).
 * Visible roles: x1 = dst, x2 = src, w4 = block count, RIV = chaining
 * value (IV load/update lines not visible).
 */
159 SYM_FUNC_START(sm4_ce_cbc_enc)
161 * x0: round key array, CTX
164 * x3: iv (big endian, 128 bit)
173 blt .Lcbc_enc_loop_1x
/* 4-block path: load plaintext, then serially XOR-and-encrypt. */
177 ld1 {v0.16b-v3.16b}, [x2], #64
179 eor v0.16b, v0.16b, RIV.16b
181 eor v1.16b, v1.16b, v0.16b
183 eor v2.16b, v2.16b, v1.16b
185 eor v3.16b, v3.16b, v2.16b
188 st1 {v0.16b-v3.16b}, [x1], #64
191 cbz w4, .Lcbc_enc_end
/* 1-block loop: C_i = E(P_i ^ C_{i-1}); result kept in RIV for the
 * next iteration. */
197 ld1 {v0.16b}, [x2], #16
199 eor RIV.16b, RIV.16b, v0.16b
202 st1 {RIV.16b}, [x1], #16
204 cbnz w4, .Lcbc_enc_loop_1x
211 SYM_FUNC_END(sm4_ce_cbc_enc)
/*
 * CBC decryption.  Parallelizable: decrypt up to 8 blocks at once,
 * then XOR each result with the previous ciphertext block.  Ciphertext
 * inputs are kept in v0..v7 while the decryption runs on byte-reversed
 * copies in v8..v15 (the _BE crypt variant skips its own rev32).
 * Visible roles: x1 = dst, x2 = src, w4 = block count, RIV = previous
 * ciphertext / IV (load and end-of-loop update lines not visible).
 */
214 SYM_FUNC_START(sm4_ce_cbc_dec)
216 * x0: round key array, CTX
219 * x3: iv (big endian, 128 bit)
/* Fewer than 8 blocks -> 4x path. */
228 tbnz w4, #31, .Lcbc_dec_4x
230 ld1 {v0.16b-v3.16b}, [x2], #64
231 ld1 {v4.16b-v7.16b}, [x2], #64
/* Byte-reverse into the working registers (rev32 of v0/v1 missing
 * from this view). */
235 rev32 v10.16b, v2.16b
236 rev32 v11.16b, v3.16b
237 rev32 v12.16b, v4.16b
238 rev32 v13.16b, v5.16b
239 rev32 v14.16b, v6.16b
240 rev32 v15.16b, v7.16b
242 SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
/* P_i = D(C_i) ^ C_{i-1}: first block uses RIV, the rest use the
 * preserved ciphertexts v0..v6. */
244 eor v8.16b, v8.16b, RIV.16b
245 eor v9.16b, v9.16b, v0.16b
246 eor v10.16b, v10.16b, v1.16b
247 eor v11.16b, v11.16b, v2.16b
248 eor v12.16b, v12.16b, v3.16b
249 eor v13.16b, v13.16b, v4.16b
250 eor v14.16b, v14.16b, v5.16b
251 eor v15.16b, v15.16b, v6.16b
253 st1 {v8.16b-v11.16b}, [x1], #64
254 st1 {v12.16b-v15.16b}, [x1], #64
258 cbz w4, .Lcbc_dec_end
264 blt .Lcbc_dec_loop_1x
/* 4-block path, same scheme. */
268 ld1 {v0.16b-v3.16b}, [x2], #64
272 rev32 v10.16b, v2.16b
273 rev32 v11.16b, v3.16b
275 SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
277 eor v8.16b, v8.16b, RIV.16b
278 eor v9.16b, v9.16b, v0.16b
279 eor v10.16b, v10.16b, v1.16b
280 eor v11.16b, v11.16b, v2.16b
282 st1 {v8.16b-v11.16b}, [x1], #64
286 cbz w4, .Lcbc_dec_end
/* 1-block loop (decrypt step and RIV update not visible). */
291 ld1 {v0.16b}, [x2], #16
297 eor v8.16b, v8.16b, RIV.16b
298 st1 {v8.16b}, [x1], #16
302 cbnz w4, .Lcbc_dec_loop_1x
309 SYM_FUNC_END(sm4_ce_cbc_dec)
/*
 * CBC ciphertext-stealing, encrypt side: handles the final partial
 * block.  En-1 (the last full ciphertext) is in RIV; the short tail is
 * padded with zeros, XOR'd with En-1, and encrypted, while Cn is cut
 * from En-1 via a permute.  (Crypt calls and the permute-table loads
 * for v3/v4 are missing from this view.)
 */
312 SYM_FUNC_START(sm4_ce_cbc_cts_enc)
314 * x0: round key array, CTX
317 * x3: iv (big endian, 128 bit)
328 eor RIV.16b, RIV.16b, v0.16b
331 /* load permute table */
332 adr_l x6, .Lcts_permute_table
339 /* overlapping loads */
343 /* create Cn from En-1 */
344 tbl v0.16b, {RIV.16b}, v3.16b
345 /* padding Pn with zeros */
346 tbl v1.16b, {v1.16b}, v4.16b
348 eor v1.16b, v1.16b, RIV.16b
351 /* overlapping stores */
357 SYM_FUNC_END(sm4_ce_cbc_cts_enc)
/*
 * CBC ciphertext-stealing, decrypt side: decrypts En-1 into Xn (v0),
 * takes the first Ln bytes of Xn as Pn, rebuilds En-1 by overwriting
 * those bytes with Cn (v1), then decrypts it and XORs with the IV.
 * (The decrypt invocations themselves are missing from this view.)
 */
360 SYM_FUNC_START(sm4_ce_cbc_cts_dec)
362 * x0: round key array, CTX
365 * x3: iv (big endian, 128 bit)
375 /* load permute table */
376 adr_l x6, .Lcts_permute_table
383 /* overlapping loads */
/* x5 presumably holds Ln so this load lands the tail at the right
 * offset -- the computation of x5 is not visible here. */
384 ld1 {v0.16b}, [x2], x5
388 /* select the first Ln bytes of Xn to create Pn */
389 tbl v2.16b, {v0.16b}, v3.16b
390 eor v2.16b, v2.16b, v1.16b
392 /* overwrite the first Ln bytes with Cn to create En-1 */
393 tbx v0.16b, {v1.16b}, v4.16b
395 eor v0.16b, v0.16b, RIV.16b
397 /* overlapping stores */
403 SYM_FUNC_END(sm4_ce_cbc_cts_dec)
/*
 * CTR mode: encrypt successive counter values and XOR with the data.
 * Fully parallel (8x / 4x / 1x paths).  Visible roles: x1 = dst,
 * x2 = src, w4 = block count; the counter registers x7/x8 implied by
 * inc_le128 are set up by lines missing from this view.
 */
406 SYM_FUNC_START(sm4_ce_ctr_enc)
408 * x0: round key array, CTX
411 * x3: ctr (big endian, 128 bit)
422 tbnz w4, #31, .Lctr_4x
/* Materialize counter+i into a vector; the mov/adds/adc half of this
 * macro is missing from this view -- only the final big-endian
 * byte-swap line is visible. */
424 #define inc_le128(vctr) \
428 rev64 vctr.16b, vctr.16b; \
/* 8x path: build 8 consecutive counter blocks ... */
432 inc_le128(v0) /* +0 */
433 inc_le128(v1) /* +1 */
434 inc_le128(v2) /* +2 */
435 inc_le128(v3) /* +3 */
436 inc_le128(v4) /* +4 */
437 inc_le128(v5) /* +5 */
438 inc_le128(v6) /* +6 */
439 inc_le128(v7) /* +7 */
/* ... load 8 input blocks ... */
441 ld1 {v8.16b-v11.16b}, [x2], #64
442 ld1 {v12.16b-v15.16b}, [x2], #64
/* ... encrypt the counters and XOR to produce the output. */
444 SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
446 eor v0.16b, v0.16b, v8.16b
447 eor v1.16b, v1.16b, v9.16b
448 eor v2.16b, v2.16b, v10.16b
449 eor v3.16b, v3.16b, v11.16b
450 eor v4.16b, v4.16b, v12.16b
451 eor v5.16b, v5.16b, v13.16b
452 eor v6.16b, v6.16b, v14.16b
453 eor v7.16b, v7.16b, v15.16b
455 st1 {v0.16b-v3.16b}, [x1], #64
456 st1 {v4.16b-v7.16b}, [x1], #64
/* 4x path, same scheme with four counters. */
469 inc_le128(v0) /* +0 */
470 inc_le128(v1) /* +1 */
471 inc_le128(v2) /* +2 */
472 inc_le128(v3) /* +3 */
474 ld1 {v8.16b-v11.16b}, [x2], #64
476 SM4_CRYPT_BLK4(v0, v1, v2, v3)
478 eor v0.16b, v0.16b, v8.16b
479 eor v1.16b, v1.16b, v9.16b
480 eor v2.16b, v2.16b, v10.16b
481 eor v3.16b, v3.16b, v11.16b
483 st1 {v0.16b-v3.16b}, [x1], #64
/* 1-block loop (counter materialization and crypt call not visible). */
493 ld1 {v8.16b}, [x2], #16
497 eor v0.16b, v0.16b, v8.16b
498 st1 {v0.16b}, [x1], #16
500 cbnz w4, .Lctr_loop_1x
509 SYM_FUNC_END(sm4_ce_ctr_enc)
/*
 * tweak_next(vt, vin, RTMP): next XTS tweak, vt = vin * x in GF(2^128).
 * add doubles both 64-bit lanes; sshr #63 broadcasts each lane's sign
 * bit, which after AND with RMASK and the ext half-swap feeds the low
 * lane's carry into the high lane and the high lane's overflow back
 * into the low lane as the reduction term.  NOTE(review): correctness
 * depends on RMASK holding the XTS reduction constant (0x87 pattern),
 * set up by code outside this view -- confirm against the uzp1 lines.
 */
512 #define tweak_next(vt, vin, RTMP) \
513 sshr RTMP.2d, vin.2d, #63; \
514 and RTMP.16b, RTMP.16b, RMASK.16b; \
515 add vt.2d, vin.2d, vin.2d; \
516 ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
517 eor vt.16b, vt.16b, RTMP.16b;
/*
 * XTS encryption: per-block tweak XOR before and after the cipher,
 * tweaks chained via tweak_next, with ciphertext stealing for a final
 * partial block.  Visible roles: x1 = dst, x2 = src, w4 = block count,
 * x5 = optional key for encrypting the first tweak (NULL -> tweak
 * already encrypted), v8 = current tweak.  Tweak load and several
 * setup/crypt lines are missing from this view.
 */
520 SYM_FUNC_START(sm4_ce_xts_enc)
522 * x0: round key array, CTX
525 * x3: tweak (big endian, 128 bit)
527 * x5: round key array for IV
531 cbz x5, .Lxts_enc_nofirst
535 /* Generate first tweak */
/* Builds RMASK for tweak_next (companion mov/dup lines not visible). */
549 uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
551 cbz w4, .Lxts_enc_cts
555 tbnz w4, #31, .Lxts_enc_4x
/* 8x path: derive tweaks v9..v15 from v8. */
557 tweak_next( v9, v8, RTMP0)
558 tweak_next(v10, v9, RTMP1)
559 tweak_next(v11, v10, RTMP2)
560 tweak_next(v12, v11, RTMP3)
561 tweak_next(v13, v12, RTMP0)
562 tweak_next(v14, v13, RTMP1)
563 tweak_next(v15, v14, RTMP2)
/* C_i = T_i ^ E(P_i ^ T_i). */
565 ld1 {v0.16b-v3.16b}, [x2], #64
566 ld1 {v4.16b-v7.16b}, [x2], #64
567 eor v0.16b, v0.16b, v8.16b
568 eor v1.16b, v1.16b, v9.16b
569 eor v2.16b, v2.16b, v10.16b
570 eor v3.16b, v3.16b, v11.16b
571 eor v4.16b, v4.16b, v12.16b
572 eor v5.16b, v5.16b, v13.16b
573 eor v6.16b, v6.16b, v14.16b
574 eor v7.16b, v7.16b, v15.16b
576 SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
578 eor v0.16b, v0.16b, v8.16b
579 eor v1.16b, v1.16b, v9.16b
580 eor v2.16b, v2.16b, v10.16b
581 eor v3.16b, v3.16b, v11.16b
582 eor v4.16b, v4.16b, v12.16b
583 eor v5.16b, v5.16b, v13.16b
584 eor v6.16b, v6.16b, v14.16b
585 eor v7.16b, v7.16b, v15.16b
586 st1 {v0.16b-v3.16b}, [x1], #64
587 st1 {v4.16b-v7.16b}, [x1], #64
/* Carry the tweak forward for the next batch. */
589 tweak_next(v8, v15, RTMP3)
591 cbz w4, .Lxts_enc_cts
597 blt .Lxts_enc_loop_1x
/* 4x path, same scheme. */
601 tweak_next( v9, v8, RTMP0)
602 tweak_next(v10, v9, RTMP1)
603 tweak_next(v11, v10, RTMP2)
605 ld1 {v0.16b-v3.16b}, [x2], #64
606 eor v0.16b, v0.16b, v8.16b
607 eor v1.16b, v1.16b, v9.16b
608 eor v2.16b, v2.16b, v10.16b
609 eor v3.16b, v3.16b, v11.16b
611 SM4_CRYPT_BLK4(v0, v1, v2, v3)
613 eor v0.16b, v0.16b, v8.16b
614 eor v1.16b, v1.16b, v9.16b
615 eor v2.16b, v2.16b, v10.16b
616 eor v3.16b, v3.16b, v11.16b
617 st1 {v0.16b-v3.16b}, [x1], #64
619 tweak_next(v8, v11, RTMP3)
621 cbz w4, .Lxts_enc_cts
/* 1-block loop: tweak XOR, encrypt (call not visible), tweak XOR. */
626 ld1 {v0.16b}, [x2], #16
627 eor v0.16b, v0.16b, v8.16b
631 eor v0.16b, v0.16b, v8.16b
632 st1 {v0.16b}, [x1], #16
634 tweak_next(v8, v8, RTMP0)
636 cbnz w4, .Lxts_enc_loop_1x
/* x5 presumably reused as the tail length here; its computation is
 * not visible. */
639 cbz x5, .Lxts_enc_end
641 /* cipher text stealing */
643 tweak_next(v9, v8, RTMP0)
645 eor v0.16b, v0.16b, v8.16b
647 eor v0.16b, v0.16b, v8.16b
649 /* load permute table */
650 adr_l x6, .Lcts_permute_table
657 /* overlapping loads */
661 /* create Cn from En-1 */
662 tbl v2.16b, {v0.16b}, v3.16b
663 /* padding Pn with En-1 at the end */
664 tbx v0.16b, {v1.16b}, v4.16b
/* Final partial block is encrypted under the extra tweak v9. */
666 eor v0.16b, v0.16b, v9.16b
668 eor v0.16b, v0.16b, v9.16b
671 /* overlapping stores */
679 /* store new tweak */
684 SYM_FUNC_END(sm4_ce_xts_enc)
/*
 * XTS decryption: mirror of sm4_ce_xts_enc with decryption round keys;
 * same tweak chaining and ciphertext-stealing tail.  In the CTS tail
 * the tweak order is swapped relative to encrypt (v9 for the last full
 * block here, v8 for the partial) as XTS-CTS requires.  Visible roles:
 * x1 = dst, x2 = src, w4 = block count, x5 = optional first-tweak key,
 * v8 = current tweak.  Tweak load and crypt-call lines are missing
 * from this view.
 */
687 SYM_FUNC_START(sm4_ce_xts_dec)
689 * x0: round key array, CTX
692 * x3: tweak (big endian, 128 bit)
694 * x5: round key array for IV
698 cbz x5, .Lxts_dec_nofirst
702 /* Generate first tweak */
/* Builds RMASK for tweak_next (companion setup lines not visible). */
716 uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
718 cbz w4, .Lxts_dec_cts
722 tbnz w4, #31, .Lxts_dec_4x
/* 8x path: derive tweaks v9..v15 from v8. */
724 tweak_next( v9, v8, RTMP0)
725 tweak_next(v10, v9, RTMP1)
726 tweak_next(v11, v10, RTMP2)
727 tweak_next(v12, v11, RTMP3)
728 tweak_next(v13, v12, RTMP0)
729 tweak_next(v14, v13, RTMP1)
730 tweak_next(v15, v14, RTMP2)
/* P_i = T_i ^ D(C_i ^ T_i). */
732 ld1 {v0.16b-v3.16b}, [x2], #64
733 ld1 {v4.16b-v7.16b}, [x2], #64
734 eor v0.16b, v0.16b, v8.16b
735 eor v1.16b, v1.16b, v9.16b
736 eor v2.16b, v2.16b, v10.16b
737 eor v3.16b, v3.16b, v11.16b
738 eor v4.16b, v4.16b, v12.16b
739 eor v5.16b, v5.16b, v13.16b
740 eor v6.16b, v6.16b, v14.16b
741 eor v7.16b, v7.16b, v15.16b
743 SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
745 eor v0.16b, v0.16b, v8.16b
746 eor v1.16b, v1.16b, v9.16b
747 eor v2.16b, v2.16b, v10.16b
748 eor v3.16b, v3.16b, v11.16b
749 eor v4.16b, v4.16b, v12.16b
750 eor v5.16b, v5.16b, v13.16b
751 eor v6.16b, v6.16b, v14.16b
752 eor v7.16b, v7.16b, v15.16b
753 st1 {v0.16b-v3.16b}, [x1], #64
754 st1 {v4.16b-v7.16b}, [x1], #64
/* Carry the tweak forward for the next batch. */
756 tweak_next(v8, v15, RTMP3)
758 cbz w4, .Lxts_dec_cts
764 blt .Lxts_dec_loop_1x
/* 4x path, same scheme. */
768 tweak_next( v9, v8, RTMP0)
769 tweak_next(v10, v9, RTMP1)
770 tweak_next(v11, v10, RTMP2)
772 ld1 {v0.16b-v3.16b}, [x2], #64
773 eor v0.16b, v0.16b, v8.16b
774 eor v1.16b, v1.16b, v9.16b
775 eor v2.16b, v2.16b, v10.16b
776 eor v3.16b, v3.16b, v11.16b
778 SM4_CRYPT_BLK4(v0, v1, v2, v3)
780 eor v0.16b, v0.16b, v8.16b
781 eor v1.16b, v1.16b, v9.16b
782 eor v2.16b, v2.16b, v10.16b
783 eor v3.16b, v3.16b, v11.16b
784 st1 {v0.16b-v3.16b}, [x1], #64
786 tweak_next(v8, v11, RTMP3)
788 cbz w4, .Lxts_dec_cts
/* 1-block loop: tweak XOR, decrypt (call not visible), tweak XOR. */
793 ld1 {v0.16b}, [x2], #16
794 eor v0.16b, v0.16b, v8.16b
798 eor v0.16b, v0.16b, v8.16b
799 st1 {v0.16b}, [x1], #16
801 tweak_next(v8, v8, RTMP0)
803 cbnz w4, .Lxts_dec_loop_1x
/* x5 presumably reused as the tail length here; its computation is
 * not visible. */
806 cbz x5, .Lxts_dec_end
808 /* cipher text stealing */
810 tweak_next(v9, v8, RTMP0)
/* Last full block is decrypted under the NEXT tweak v9 ... */
812 eor v0.16b, v0.16b, v9.16b
814 eor v0.16b, v0.16b, v9.16b
816 /* load permute table */
817 adr_l x6, .Lcts_permute_table
824 /* overlapping loads */
828 /* create Cn from En-1 */
829 tbl v2.16b, {v0.16b}, v3.16b
830 /* padding Pn with En-1 at the end */
831 tbx v0.16b, {v1.16b}, v4.16b
/* ... while the rebuilt En-1 uses the current tweak v8. */
833 eor v0.16b, v0.16b, v8.16b
835 eor v0.16b, v0.16b, v8.16b
838 /* overlapping stores */
846 /* store new tweak */
851 SYM_FUNC_END(sm4_ce_xts_dec)
/*
 * CBC-MAC style update: XOR each message block into RMAC and encrypt
 * (the per-block SM4_CRYPT_BLK1 calls between the visible EORs are
 * missing from this view).  Visible roles: x2 = message, w3 = block
 * counter; the final short-tail section presumably handles padding.
 */
854 SYM_FUNC_START(sm4_ce_mac_update)
856 * x0: round key array, CTX
/* 4-block unrolled path: serial absorb of v0..v3 into RMAC. */
886 ld1 {v0.16b-v3.16b}, [x2], #64
888 eor RMAC.16b, RMAC.16b, v0.16b
890 eor RMAC.16b, RMAC.16b, v1.16b
892 eor RMAC.16b, RMAC.16b, v2.16b
894 eor RMAC.16b, RMAC.16b, v3.16b
/* 1-block loop. */
903 ld1 {v0.16b}, [x2], #16
905 eor RMAC.16b, RMAC.16b, v0.16b
908 cbnz w3, .Lmac_loop_1x
/* Tail: absorb the final (presumably padded) block. */
914 ld1 {v0.16b}, [x2], #16
915 eor RMAC.16b, RMAC.16b, v0.16b
920 SYM_FUNC_END(sm4_ce_mac_update)
923 .section ".rodata", "a"
/* Byte-swap / reorder mask used by sm4_ce_expand_key's tbl ops.
 * NOTE(review): its label (presumably .Lbswap128_mask) and alignment
 * directive are missing from this chunk. */
926 .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
927 .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
/* CTS permute table (presumably .Lcts_permute_table; label missing
 * from this chunk): 0xff entries select zero/out-of-range for tbl,
 * the 0x00..0x0f run selects identity bytes -- indexed at an offset
 * derived from the tail length by the CTS paths above. */
930 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
931 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
932 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
933 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
934 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
935 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff