2 * SSE2 implementation of MORUS-1280
4 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
5 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
12 #include <linux/linkage.h>
13 #include <asm/frame.h>
/* Packs four 2-bit lane selectors into one 8-bit immediate: i0 occupies bits 1:0, i3 bits 7:6. */
15 #define SHUFFLE_MASK(i0, i1, i2, i3) \
16 (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
/* Selector order (2, 3, 0, 1): used as a pshufd immediate it swaps the two 64-bit halves of an XMM register. */
18 #define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
/* Register allocation: each of the five state words STATE0..STATE4 lives in a LO/HI pair of XMM registers. */
20 #define STATE0_LO %xmm0
21 #define STATE0_HI %xmm1
22 #define STATE1_LO %xmm2
23 #define STATE1_HI %xmm3
24 #define STATE2_LO %xmm4
25 #define STATE2_HI %xmm5
26 #define STATE3_LO %xmm6
27 #define STATE3_HI %xmm7
28 #define STATE4_LO %xmm8
29 #define STATE4_HI %xmm9
/* NOTE(review): the section names here say "morus640" although this file implements MORUS-1280 - presumably carried over from the MORUS-640 code; confirm before renaming. */
39 .section .rodata.cst16.morus640_const, "aM", @progbits, 16
/* 32 constant bytes: the Fibonacci sequence reduced modulo 256 (0x00, 0x01, 0x01, 0x02, 0x03, ...). */
42 .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
43 .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
45 .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
46 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
48 .section .rodata.cst16.morus640_counter, "aM", @progbits, 16
/* 32 ascending byte values 0x00..0x1f, i.e. each byte holds its own index within the table. */
51 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
52 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
54 .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
55 .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
61 * HI_1 | HI_0 || LO_1 | LO_0
63 * HI_0 | HI_1 || LO_1 | LO_0
65 * HI_0 | LO_1 || LO_0 | HI_1
67 pshufd $MASK2, \hi, \hi
83 * HI_1 | HI_0 || LO_1 | LO_0
85 * HI_0 | HI_1 || LO_1 | LO_0
87 * LO_0 | HI_1 || HI_0 | LO_1
89 pshufd $MASK2, \hi, \hi
96 .macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
110 psrlq $(64 - \b), \s0_l
115 psrlq $(64 - \b), \s0_h
122 * __morus1280_update: internal ABI
124 * STATE[0-4] - input state
125 * MSG - message block
127 * STATE[0-4] - output state
133 STATE0_LO, STATE0_HI, \
134 STATE1_LO, STATE1_HI, \
135 STATE2_LO, STATE2_HI, \
136 STATE3_LO, STATE3_HI, \
137 STATE4_LO, STATE4_HI, \
139 pxor MSG_LO, STATE1_LO
140 pxor MSG_HI, STATE1_HI
142 STATE1_LO, STATE1_HI, \
143 STATE2_LO, STATE2_HI, \
144 STATE3_LO, STATE3_HI, \
145 STATE4_LO, STATE4_HI, \
146 STATE0_LO, STATE0_HI, \
148 pxor MSG_LO, STATE2_LO
149 pxor MSG_HI, STATE2_HI
151 STATE2_LO, STATE2_HI, \
152 STATE3_LO, STATE3_HI, \
153 STATE4_LO, STATE4_HI, \
154 STATE0_LO, STATE0_HI, \
155 STATE1_LO, STATE1_HI, \
157 pxor MSG_LO, STATE3_LO
158 pxor MSG_HI, STATE3_HI
160 STATE3_LO, STATE3_HI, \
161 STATE4_LO, STATE4_HI, \
162 STATE0_LO, STATE0_HI, \
163 STATE1_LO, STATE1_HI, \
164 STATE2_LO, STATE2_HI, \
166 pxor MSG_LO, STATE4_LO
167 pxor MSG_HI, STATE4_HI
169 STATE4_LO, STATE4_HI, \
170 STATE0_LO, STATE0_HI, \
171 STATE1_LO, STATE1_HI, \
172 STATE2_LO, STATE2_HI, \
173 STATE3_LO, STATE3_HI, \
176 ENDPROC(__morus1280_update)
179 * __morus1280_update_zero: internal ABI
181 * STATE[0-4] - input state
183 * STATE[0-4] - output state
/* State update without a message block: five groups of round arguments, each starting one state word later than the previous group, with no MSG_LO/MSG_HI xor between the groups. */
/* NOTE(review): the macro invocation lines and the per-round rotation constants are not visible in this excerpt. */
187 __morus1280_update_zero:
189 STATE0_LO, STATE0_HI, \
190 STATE1_LO, STATE1_HI, \
191 STATE2_LO, STATE2_HI, \
192 STATE3_LO, STATE3_HI, \
193 STATE4_LO, STATE4_HI, \
196 STATE1_LO, STATE1_HI, \
197 STATE2_LO, STATE2_HI, \
198 STATE3_LO, STATE3_HI, \
199 STATE4_LO, STATE4_HI, \
200 STATE0_LO, STATE0_HI, \
203 STATE2_LO, STATE2_HI, \
204 STATE3_LO, STATE3_HI, \
205 STATE4_LO, STATE4_HI, \
206 STATE0_LO, STATE0_HI, \
207 STATE1_LO, STATE1_HI, \
210 STATE3_LO, STATE3_HI, \
211 STATE4_LO, STATE4_HI, \
212 STATE0_LO, STATE0_HI, \
213 STATE1_LO, STATE1_HI, \
214 STATE2_LO, STATE2_HI, \
217 STATE4_LO, STATE4_HI, \
218 STATE0_LO, STATE0_HI, \
219 STATE1_LO, STATE1_HI, \
220 STATE2_LO, STATE2_HI, \
221 STATE3_LO, STATE3_HI, \
224 ENDPROC(__morus1280_update_zero)
227 * __load_partial: internal ABI
232 * MSG - message block
293 movdqa MSG_LO, MSG_HI
294 movdqu (%rsi), MSG_LO
298 ENDPROC(__load_partial)
301 * __store_partial: internal ABI
366 ENDPROC(__store_partial)
369 * void crypto_morus1280_sse2_init(void *state, const void *key,
/* Builds the initial 160-byte cipher state and writes it to the buffer at %rdi (state); the key is read from %rsi. */
372 ENTRY(crypto_morus1280_sse2_init)
/* STATE0 = 16 bytes read from %rdx in the low half, zero in the high half. */
/* NOTE(review): %rdx is the third argument, whose declaration is not visible in this excerpt - presumably the IV; confirm. */
376 pxor STATE0_HI, STATE0_HI
377 movdqu (%rdx), STATE0_LO
/* load the 32-byte key (unaligned) and copy it into STATE1; KEY_LO/KEY_HI stay live for the final xor: */
379 movdqu 0(%rsi), KEY_LO
380 movdqu 16(%rsi), KEY_HI
381 movdqa KEY_LO, STATE1_LO
382 movdqa KEY_HI, STATE1_HI
/* STATE2 = all ones (comparing a register with itself sets every bit): */
384 pcmpeqd STATE2_LO, STATE2_LO
385 pcmpeqd STATE2_HI, STATE2_HI
386 /* load all zeros: */
387 pxor STATE3_LO, STATE3_LO
388 pxor STATE3_HI, STATE3_HI
389 /* load the constant: */
390 movdqa .Lmorus640_const_0, STATE4_LO
391 movdqa .Lmorus640_const_1, STATE4_HI
393 /* update 16 times with zero: */
394 call __morus1280_update_zero
395 call __morus1280_update_zero
396 call __morus1280_update_zero
397 call __morus1280_update_zero
398 call __morus1280_update_zero
399 call __morus1280_update_zero
400 call __morus1280_update_zero
401 call __morus1280_update_zero
402 call __morus1280_update_zero
403 call __morus1280_update_zero
404 call __morus1280_update_zero
405 call __morus1280_update_zero
406 call __morus1280_update_zero
407 call __morus1280_update_zero
408 call __morus1280_update_zero
409 call __morus1280_update_zero
411 /* xor-in the key again after updates: */
412 pxor KEY_LO, STATE1_LO
413 pxor KEY_HI, STATE1_HI
415 /* store the state: */
/* ten 16-byte slots in the order STATE0_LO, STATE0_HI, ..., STATE4_HI; movdqu, so %rdi need not be aligned */
416 movdqu STATE0_LO, (0 * 16)(%rdi)
417 movdqu STATE0_HI, (1 * 16)(%rdi)
418 movdqu STATE1_LO, (2 * 16)(%rdi)
419 movdqu STATE1_HI, (3 * 16)(%rdi)
420 movdqu STATE2_LO, (4 * 16)(%rdi)
421 movdqu STATE2_HI, (5 * 16)(%rdi)
422 movdqu STATE3_LO, (6 * 16)(%rdi)
423 movdqu STATE3_HI, (7 * 16)(%rdi)
424 movdqu STATE4_LO, (8 * 16)(%rdi)
425 movdqu STATE4_HI, (9 * 16)(%rdi)
429 ENDPROC(crypto_morus1280_sse2_init)
432 * void crypto_morus1280_sse2_ad(void *state, const void *data,
433 * unsigned int length);
/* Absorbs data read from %rsi into the state stored at %rdi: load state, feed 32-byte blocks to __morus1280_update, store state. */
/* NOTE(review): the length checks, loop labels and branches are not visible in this excerpt. */
435 ENTRY(crypto_morus1280_sse2_ad)
441 /* load the state: */
442 movdqu (0 * 16)(%rdi), STATE0_LO
443 movdqu (1 * 16)(%rdi), STATE0_HI
444 movdqu (2 * 16)(%rdi), STATE1_LO
445 movdqu (3 * 16)(%rdi), STATE1_HI
446 movdqu (4 * 16)(%rdi), STATE2_LO
447 movdqu (5 * 16)(%rdi), STATE2_HI
448 movdqu (6 * 16)(%rdi), STATE3_LO
449 movdqu (7 * 16)(%rdi), STATE3_HI
450 movdqu (8 * 16)(%rdi), STATE4_LO
451 movdqu (9 * 16)(%rdi), STATE4_HI
/* aligned path: movdqa requires the address in %rsi to be 16-byte aligned */
459 movdqa 0(%rsi), MSG_LO
460 movdqa 16(%rsi), MSG_HI
461 call __morus1280_update
/* unaligned path: the same 32-byte block load, but with movdqu */
470 movdqu 0(%rsi), MSG_LO
471 movdqu 16(%rsi), MSG_HI
472 call __morus1280_update
479 /* store the state: */
480 movdqu STATE0_LO, (0 * 16)(%rdi)
481 movdqu STATE0_HI, (1 * 16)(%rdi)
482 movdqu STATE1_LO, (2 * 16)(%rdi)
483 movdqu STATE1_HI, (3 * 16)(%rdi)
484 movdqu STATE2_LO, (4 * 16)(%rdi)
485 movdqu STATE2_HI, (5 * 16)(%rdi)
486 movdqu STATE3_LO, (6 * 16)(%rdi)
487 movdqu STATE3_HI, (7 * 16)(%rdi)
488 movdqu STATE4_LO, (8 * 16)(%rdi)
489 movdqu STATE4_HI, (9 * 16)(%rdi)
494 ENDPROC(crypto_morus1280_sse2_ad)
497 * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
498 * unsigned int length);
/* Encrypts 32-byte blocks from %rsi (src) to %rdx (dst) using the state stored at %rdi, which is loaded on entry and written back at the end. */
/* NOTE(review): the length checks, loop branches and several T0/T1 instructions are not visible in this excerpt. */
500 ENTRY(crypto_morus1280_sse2_enc)
506 /* load the state: */
507 movdqu (0 * 16)(%rdi), STATE0_LO
508 movdqu (1 * 16)(%rdi), STATE0_HI
509 movdqu (2 * 16)(%rdi), STATE1_LO
510 movdqu (3 * 16)(%rdi), STATE1_HI
511 movdqu (4 * 16)(%rdi), STATE2_LO
512 movdqu (5 * 16)(%rdi), STATE2_HI
513 movdqu (6 * 16)(%rdi), STATE3_LO
514 movdqu (7 * 16)(%rdi), STATE3_HI
515 movdqu (8 * 16)(%rdi), STATE4_LO
516 movdqu (9 * 16)(%rdi), STATE4_HI
/* aligned path: movdqa on both the load from %rsi and the store to %rdx, so both addresses must be 16-byte aligned */
525 movdqa 0(%rsi), MSG_LO
526 movdqa 16(%rsi), MSG_HI
/* T1 = copy of STATE1, so the state registers themselves are not modified here */
527 movdqa STATE1_LO, T1_LO
528 movdqa STATE1_HI, T1_HI
/* T0 ^= STATE0 (T0 is initialised on lines not shown in this excerpt) */
534 pxor STATE0_LO, T0_LO
535 pxor STATE0_HI, T0_HI
/* T1 = STATE2 & STATE3 */
536 movdqa STATE2_LO, T1_LO
537 movdqa STATE2_HI, T1_HI
538 pand STATE3_LO, T1_LO
539 pand STATE3_HI, T1_HI
/* write the 32-byte result held in T0 to the destination */
542 movdqa T0_LO, 0(%rdx)
543 movdqa T0_HI, 16(%rdx)
/* absorb the block still held in MSG_LO/MSG_HI into the state */
545 call __morus1280_update
/* unaligned path: same sequence, with movdqu for the memory accesses */
555 movdqu 0(%rsi), MSG_LO
556 movdqu 16(%rsi), MSG_HI
557 movdqa STATE1_LO, T1_LO
558 movdqa STATE1_HI, T1_HI
564 pxor STATE0_LO, T0_LO
565 pxor STATE0_HI, T0_HI
566 movdqa STATE2_LO, T1_LO
567 movdqa STATE2_HI, T1_HI
568 pand STATE3_LO, T1_LO
569 pand STATE3_HI, T1_HI
572 movdqu T0_LO, 0(%rdx)
573 movdqu T0_HI, 16(%rdx)
575 call __morus1280_update
583 /* store the state: */
584 movdqu STATE0_LO, (0 * 16)(%rdi)
585 movdqu STATE0_HI, (1 * 16)(%rdi)
586 movdqu STATE1_LO, (2 * 16)(%rdi)
587 movdqu STATE1_HI, (3 * 16)(%rdi)
588 movdqu STATE2_LO, (4 * 16)(%rdi)
589 movdqu STATE2_HI, (5 * 16)(%rdi)
590 movdqu STATE3_LO, (6 * 16)(%rdi)
591 movdqu STATE3_HI, (7 * 16)(%rdi)
592 movdqu STATE4_LO, (8 * 16)(%rdi)
593 movdqu STATE4_HI, (9 * 16)(%rdi)
598 ENDPROC(crypto_morus1280_sse2_enc)
601 * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
602 * unsigned int length);
/* Tail variant of the encrypt routine: one T0/T1 sequence and one state update, with the state loaded from and stored back to %rdi. */
/* NOTE(review): the partial-block load and store calls are not visible in this excerpt. */
604 ENTRY(crypto_morus1280_sse2_enc_tail)
607 /* load the state: */
608 movdqu (0 * 16)(%rdi), STATE0_LO
609 movdqu (1 * 16)(%rdi), STATE0_HI
610 movdqu (2 * 16)(%rdi), STATE1_LO
611 movdqu (3 * 16)(%rdi), STATE1_HI
612 movdqu (4 * 16)(%rdi), STATE2_LO
613 movdqu (5 * 16)(%rdi), STATE2_HI
614 movdqu (6 * 16)(%rdi), STATE3_LO
615 movdqu (7 * 16)(%rdi), STATE3_HI
616 movdqu (8 * 16)(%rdi), STATE4_LO
617 movdqu (9 * 16)(%rdi), STATE4_HI
619 /* encrypt message: */
/* T1 = copy of STATE1 */
622 movdqa STATE1_LO, T1_LO
623 movdqa STATE1_HI, T1_HI
/* T0 ^= STATE0 (T0 is initialised on lines not shown in this excerpt) */
629 pxor STATE0_LO, T0_LO
630 pxor STATE0_HI, T0_HI
/* T1 = STATE2 & STATE3 */
631 movdqa STATE2_LO, T1_LO
632 movdqa STATE2_HI, T1_HI
633 pand STATE3_LO, T1_LO
634 pand STATE3_HI, T1_HI
/* absorb the block held in MSG_LO/MSG_HI into the state */
640 call __morus1280_update
642 /* store the state: */
643 movdqu STATE0_LO, (0 * 16)(%rdi)
644 movdqu STATE0_HI, (1 * 16)(%rdi)
645 movdqu STATE1_LO, (2 * 16)(%rdi)
646 movdqu STATE1_HI, (3 * 16)(%rdi)
647 movdqu STATE2_LO, (4 * 16)(%rdi)
648 movdqu STATE2_HI, (5 * 16)(%rdi)
649 movdqu STATE3_LO, (6 * 16)(%rdi)
650 movdqu STATE3_HI, (7 * 16)(%rdi)
651 movdqu STATE4_LO, (8 * 16)(%rdi)
652 movdqu STATE4_HI, (9 * 16)(%rdi)
656 ENDPROC(crypto_morus1280_sse2_enc_tail)
659 * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
660 * unsigned int length);
/* Decrypts 32-byte blocks from %rsi (src) to %rdx (dst) using the state stored at %rdi, which is loaded on entry and written back at the end. */
/* NOTE(review): the length checks, loop branches and some T1/MSG instructions are not visible in this excerpt. */
662 ENTRY(crypto_morus1280_sse2_dec)
668 /* load the state: */
669 movdqu (0 * 16)(%rdi), STATE0_LO
670 movdqu (1 * 16)(%rdi), STATE0_HI
671 movdqu (2 * 16)(%rdi), STATE1_LO
672 movdqu (3 * 16)(%rdi), STATE1_HI
673 movdqu (4 * 16)(%rdi), STATE2_LO
674 movdqu (5 * 16)(%rdi), STATE2_HI
675 movdqu (6 * 16)(%rdi), STATE3_LO
676 movdqu (7 * 16)(%rdi), STATE3_HI
677 movdqu (8 * 16)(%rdi), STATE4_LO
678 movdqu (9 * 16)(%rdi), STATE4_HI
/* aligned path: movdqa on both the load from %rsi and the store to %rdx, so both addresses must be 16-byte aligned */
687 movdqa 0(%rsi), MSG_LO
688 movdqa 16(%rsi), MSG_HI
/* unlike the encrypt routine, the xor is applied directly to MSG, so MSG holds the transformed block afterwards */
689 pxor STATE0_LO, MSG_LO
690 pxor STATE0_HI, MSG_HI
/* T1 = copy of STATE1 */
691 movdqa STATE1_LO, T1_LO
692 movdqa STATE1_HI, T1_HI
/* T1 = STATE2 & STATE3 */
696 movdqa STATE2_LO, T1_LO
697 movdqa STATE2_HI, T1_HI
698 pand STATE3_LO, T1_LO
699 pand STATE3_HI, T1_HI
/* write the 32-byte result to the destination, then absorb that same MSG value into the state */
702 movdqa MSG_LO, 0(%rdx)
703 movdqa MSG_HI, 16(%rdx)
705 call __morus1280_update
/* unaligned path: same sequence, with movdqu for the memory accesses */
715 movdqu 0(%rsi), MSG_LO
716 movdqu 16(%rsi), MSG_HI
717 pxor STATE0_LO, MSG_LO
718 pxor STATE0_HI, MSG_HI
719 movdqa STATE1_LO, T1_LO
720 movdqa STATE1_HI, T1_HI
724 movdqa STATE2_LO, T1_LO
725 movdqa STATE2_HI, T1_HI
726 pand STATE3_LO, T1_LO
727 pand STATE3_HI, T1_HI
730 movdqu MSG_LO, 0(%rdx)
731 movdqu MSG_HI, 16(%rdx)
733 call __morus1280_update
741 /* store the state: */
742 movdqu STATE0_LO, (0 * 16)(%rdi)
743 movdqu STATE0_HI, (1 * 16)(%rdi)
744 movdqu STATE1_LO, (2 * 16)(%rdi)
745 movdqu STATE1_HI, (3 * 16)(%rdi)
746 movdqu STATE2_LO, (4 * 16)(%rdi)
747 movdqu STATE2_HI, (5 * 16)(%rdi)
748 movdqu STATE3_LO, (6 * 16)(%rdi)
749 movdqu STATE3_HI, (7 * 16)(%rdi)
750 movdqu STATE4_LO, (8 * 16)(%rdi)
751 movdqu STATE4_HI, (9 * 16)(%rdi)
756 ENDPROC(crypto_morus1280_sse2_dec)
759 * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
760 * unsigned int length);
/* Tail variant of the decrypt routine: one xor/mask sequence and one state update, with the state loaded from and stored back to %rdi. */
/* NOTE(review): the partial-block load/store calls and the compare/mask instructions are not visible in this excerpt. */
762 ENTRY(crypto_morus1280_sse2_dec_tail)
765 /* load the state: */
766 movdqu (0 * 16)(%rdi), STATE0_LO
767 movdqu (1 * 16)(%rdi), STATE0_HI
768 movdqu (2 * 16)(%rdi), STATE1_LO
769 movdqu (3 * 16)(%rdi), STATE1_HI
770 movdqu (4 * 16)(%rdi), STATE2_LO
771 movdqu (5 * 16)(%rdi), STATE2_HI
772 movdqu (6 * 16)(%rdi), STATE3_LO
773 movdqu (7 * 16)(%rdi), STATE3_HI
774 movdqu (8 * 16)(%rdi), STATE4_LO
775 movdqu (9 * 16)(%rdi), STATE4_HI
777 /* decrypt message: */
780 pxor STATE0_LO, MSG_LO
781 pxor STATE0_HI, MSG_HI
/* T1 = copy of STATE1 */
782 movdqa STATE1_LO, T1_LO
783 movdqa STATE1_HI, T1_HI
/* T1 = STATE2 & STATE3 */
787 movdqa STATE2_LO, T1_LO
788 movdqa STATE2_HI, T1_HI
789 pand STATE3_LO, T1_LO
790 pand STATE3_HI, T1_HI
798 /* mask with byte count: */
/* four self-interleaves replicate the lowest byte of T0_LO into all 16 byte lanes */
800 punpcklbw T0_LO, T0_LO
801 punpcklbw T0_LO, T0_LO
802 punpcklbw T0_LO, T0_LO
803 punpcklbw T0_LO, T0_LO
/* T1 = byte indices 0..15 (LO) and 16..31 (HI) - presumably compared against the replicated count to build the mask; confirm against the full file */
805 movdqa .Lmorus640_counter_0, T1_LO
806 movdqa .Lmorus640_counter_1, T1_HI
/* absorb the block held in MSG_LO/MSG_HI into the state */
812 call __morus1280_update
814 /* store the state: */
815 movdqu STATE0_LO, (0 * 16)(%rdi)
816 movdqu STATE0_HI, (1 * 16)(%rdi)
817 movdqu STATE1_LO, (2 * 16)(%rdi)
818 movdqu STATE1_HI, (3 * 16)(%rdi)
819 movdqu STATE2_LO, (4 * 16)(%rdi)
820 movdqu STATE2_HI, (5 * 16)(%rdi)
821 movdqu STATE3_LO, (6 * 16)(%rdi)
822 movdqu STATE3_HI, (7 * 16)(%rdi)
823 movdqu STATE4_LO, (8 * 16)(%rdi)
824 movdqu STATE4_HI, (9 * 16)(%rdi)
828 ENDPROC(crypto_morus1280_sse2_dec_tail)
831 * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
832 * u64 assoclen, u64 cryptlen);
/* Finalisation: loads the state from %rdi, runs ten state updates, then updates the 32-byte buffer at %rsi (tag_xor) in place. */
/* NOTE(review): the visible code never stores the state back to %rdi; the construction of the length block and some T0/MSG instructions are not visible in this excerpt. */
834 ENTRY(crypto_morus1280_sse2_final)
837 /* load the state: */
838 movdqu (0 * 16)(%rdi), STATE0_LO
839 movdqu (1 * 16)(%rdi), STATE0_HI
840 movdqu (2 * 16)(%rdi), STATE1_LO
841 movdqu (3 * 16)(%rdi), STATE1_HI
842 movdqu (4 * 16)(%rdi), STATE2_LO
843 movdqu (5 * 16)(%rdi), STATE2_HI
844 movdqu (6 * 16)(%rdi), STATE3_LO
845 movdqu (7 * 16)(%rdi), STATE3_HI
846 movdqu (8 * 16)(%rdi), STATE4_LO
847 movdqu (9 * 16)(%rdi), STATE4_HI
849 /* xor state[0] into state[4]: */
850 pxor STATE0_LO, STATE4_LO
851 pxor STATE0_HI, STATE4_HI
853 /* prepare length block: */
858 psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
/* ten updates, each absorbing the same MSG_LO/MSG_HI block */
862 call __morus1280_update
863 call __morus1280_update
864 call __morus1280_update
865 call __morus1280_update
866 call __morus1280_update
867 call __morus1280_update
868 call __morus1280_update
869 call __morus1280_update
870 call __morus1280_update
871 call __morus1280_update
/* read the caller's 32-byte buffer and fold the final state into it */
874 movdqu 0(%rsi), MSG_LO
875 movdqu 16(%rsi), MSG_HI
877 pxor STATE0_LO, MSG_LO
878 pxor STATE0_HI, MSG_HI
/* T0 = copy of STATE1 */
879 movdqa STATE1_LO, T0_LO
880 movdqa STATE1_HI, T0_HI
/* T0 = STATE2 & STATE3 */
884 movdqa STATE2_LO, T0_LO
885 movdqa STATE2_HI, T0_HI
886 pand STATE3_LO, T0_LO
887 pand STATE3_HI, T0_HI
/* write the result back over the same 32 bytes at %rsi */
891 movdqu MSG_LO, 0(%rsi)
892 movdqu MSG_HI, 16(%rsi)
896 ENDPROC(crypto_morus1280_sse2_final)