1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 # Accelerated poly1305 implementation for ppc64le.
5 # Copyright 2023- IBM Corp. All rights reserved
7 #===================================================================================
8 # Written by Danny Tsen <dtsen@us.ibm.com>
10 # Poly1305 - this version mainly using vector/VSX/Scalar
12 # - Handle multiple 64 byte blocks.
16 # clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
22 # Improve performance by breaking down the polynomial to the sum of products with
23 # h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
25 # 07/22/21 - this revision based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0
26 # to 9 vectors for multiplications.
28 # setup r^4, r^3, r^2, r vectors
29 # vs [r^1, r^3, r^2, r^4]
40 # Each word in a vector consists of a member of a "r/s" in [a * r/s].
42 # r0, r4*5, r3*5, r2*5, r1*5;
43 # r1, r0, r4*5, r3*5, r2*5;
44 # r2, r1, r0, r4*5, r3*5;
45 # r3, r2, r1, r0, r4*5;
46 # r4, r3, r2, r1, r0 ;
49 # poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
55 #include <asm/ppc_asm.h>
56 #include <asm/asm-offsets.h>
57 #include <asm/asm-compat.h>
58 #include <linux/linkage.h>
64 .macro SAVE_GPR GPR OFFSET FRAME
65 std \GPR,\OFFSET(\FRAME)
68 .macro SAVE_VRS VRS OFFSET FRAME
73 .macro SAVE_VSX VSX OFFSET FRAME
75 stxvx \VSX, 16, \FRAME
78 .macro RESTORE_GPR GPR OFFSET FRAME
79 ld \GPR,\OFFSET(\FRAME)
82 .macro RESTORE_VRS VRS OFFSET FRAME
87 .macro RESTORE_VSX VSX OFFSET FRAME
153 RESTORE_VRS 21, 16, 9
154 RESTORE_VRS 22, 32, 9
155 RESTORE_VRS 23, 48, 9
156 RESTORE_VRS 24, 64, 9
157 RESTORE_VRS 25, 80, 9
158 RESTORE_VRS 26, 96, 9
159 RESTORE_VRS 27, 112, 9
160 RESTORE_VRS 28, 128, 9
161 RESTORE_VRS 29, 144, 9
162 RESTORE_VRS 30, 160, 9
163 RESTORE_VRS 31, 176, 9
165 RESTORE_VSX 14, 192, 9
166 RESTORE_VSX 15, 208, 9
167 RESTORE_VSX 16, 224, 9
168 RESTORE_VSX 17, 240, 9
169 RESTORE_VSX 18, 256, 9
170 RESTORE_VSX 19, 272, 9
171 RESTORE_VSX 20, 288, 9
172 RESTORE_VSX 21, 304, 9
173 RESTORE_VSX 22, 320, 9
174 RESTORE_VSX 23, 336, 9
175 RESTORE_VSX 24, 352, 9
176 RESTORE_VSX 25, 368, 9
177 RESTORE_VSX 26, 384, 9
178 RESTORE_VSX 27, 400, 9
179 RESTORE_VSX 28, 416, 9
180 RESTORE_VSX 29, 432, 9
181 RESTORE_VSX 30, 448, 9
182 RESTORE_VSX 31, 464, 9
184 RESTORE_GPR 14, 112, 1
185 RESTORE_GPR 15, 120, 1
186 RESTORE_GPR 16, 128, 1
187 RESTORE_GPR 17, 136, 1
188 RESTORE_GPR 18, 144, 1
189 RESTORE_GPR 19, 152, 1
190 RESTORE_GPR 20, 160, 1
191 RESTORE_GPR 21, 168, 1
192 RESTORE_GPR 22, 176, 1
193 RESTORE_GPR 23, 184, 1
194 RESTORE_GPR 24, 192, 1
195 RESTORE_GPR 25, 200, 1
196 RESTORE_GPR 26, 208, 1
197 RESTORE_GPR 27, 216, 1
198 RESTORE_GPR 28, 224, 1
199 RESTORE_GPR 29, 232, 1
200 RESTORE_GPR 30, 240, 1
201 RESTORE_GPR 31, 248, 1
209 # p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
210 # p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
211 # p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
212 # p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
213 # p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
215 # [r^2, r^3, r^1, r^4]
218 # multiply odd and even words
231 vaddudm 14, 14, 13 # x0
237 vaddudm 15, 15, 13 # x1
246 vaddudm 16, 16, 13 # x2
255 vaddudm 17, 17, 13 # x3
264 vaddudm 18, 18, 13 # x4
277 vaddudm 14, 14, 13 # x0
288 vaddudm 15, 15, 13 # x1
299 vaddudm 16, 16, 13 # x2
310 vaddudm 17, 17, 13 # x3
321 vaddudm 18, 18, 13 # x4
327 # setup r^4, r^3, r^2, r vectors
339 # r0, r4*5, r3*5, r2*5, r1*5;
340 # r1, r0, r4*5, r3*5, r2*5;
341 # r2, r1, r0, r4*5, r3*5;
342 # r3, r2, r1, r0, r4*5;
343 # r4, r3, r2, r1, r0 ;
345 .macro poly1305_setup_r
364 xxpermdi 58, 58, 36, 0x3 # r0
365 xxpermdi 59, 59, 37, 0x3 # r1
366 xxpermdi 60, 60, 38, 0x3 # r2
367 xxpermdi 61, 61, 39, 0x3 # r3
368 xxpermdi 62, 62, 40, 0x3 # r4
369 xxpermdi 36, 36, 36, 0x3
370 xxpermdi 37, 37, 37, 0x3
371 xxpermdi 38, 38, 38, 0x3
372 xxpermdi 39, 39, 39, 0x3
373 xxpermdi 40, 40, 40, 0x3
437 SYM_FUNC_START_LOCAL(do_mul)
440 # do reduction ( h %= p )
475 .macro do_poly1305_init
476 addis 10, 2, rmask@toc@ha
477 addi 10, 10, rmask@toc@l
484 addis 10, 2, cnum@toc@ha
485 addi 10, 10, cnum@toc@l
486 lvx 25, 0, 10 # v25 - mask
487 lvx 31, 14, 10 # v31 = 1a
488 lvx 19, 15, 10 # v19 = 1 << 24
489 lxv 24, 48(10) # vs24
490 lxv 25, 64(10) # vs25
493 # load key from r3 to vectors
504 insrdi 16, 10, 14, 38
506 extrdi 17, 10, 26, 24
512 # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
515 vmulouw 0, 27, 4 # v0 = rr0
516 vmulouw 1, 28, 4 # v1 = rr1
517 vmulouw 2, 29, 4 # v2 = rr2
518 vmulouw 3, 30, 4 # v3 = rr3
522 # poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
# ---------------------------------------------------------------------
# poly1305_p10le_4blocks(uint8_t *k, uint32_t mlen, uint8_t *m)
# Vector/VSX main loop: consumes the message 64 bytes (4 Poly1305
# blocks) at a time, keeping the accumulator as 26-bit limbs spread
# across vector registers.
# NOTE(review): this view of the function is elided in places; the
# comments below cover only the visible instructions.
# ---------------------------------------------------------------------
528 SYM_FUNC_START(poly1305_p10le_4blocks)
537 li 21, 0 # counter to message
541 # load previous H state
542 # break/convert r6 to 26 bits
552 insrdi 16, 10, 14, 38
554 extrdi 17, 10, 26, 24
# Split 64-bit halves into 26-bit limbs: successive shifts peel off
# each limb boundary (26, 52, 64+14 bits), yielding a0..a4.
571 vsrd 10, 14, 31 # >> 26
572 vsrd 11, 10, 31 # 12 bits left
580 vsrd 12, 15, 13 # >> 14
581 vsrd 13, 12, 31 # >> 26, a4
598 vsrd 10, 14, 31 # >> 26
599 vsrd 11, 10, 31 # 12 bits left
607 vsrd 12, 15, 13 # >> 14
608 vsrd 13, 12, 31 # >> 26, a4
611 # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
619 addi 5, 5, -64 # len -= 64
620 addi 21, 21, 64 # offset += 64
630 # h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
631 # Rewrite the polynomial sum of products as follows,
632 # h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
633 # h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r^4 + (h1 + m3) r^2, (h0 + m2) r^4 + (h2 + m4) r^2
635 # h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
636 # h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
640 # Multiply odd words and even words
# Same 26-bit limb split as above, applied to the running sums.
690 vsrd 21, 14, 31 # >> 26
691 vsrd 22, 21, 31 # 12 bits left
692 vsrd 10, 17, 31 # >> 26
693 vsrd 11, 10, 31 # 12 bits left
708 vsrd 23, 15, 13 # >> 14
709 vsrd 24, 23, 31 # >> 26, a4
711 vsrd 12, 18, 13 # >> 14
712 vsrd 13, 12, 31 # >> 26, a4
721 # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
729 addi 5, 5, -64 # len -= 64
730 addi 21, 21, 64 # offset += 64
745 # Multiply odd words and even words
# Merge doubleword halves of the limb vectors with xxpermdi;
# NOTE(review): exact lane layout depends on elided setup — verify
# against the full source before changing permute selectors.
750 xxpermdi 41, 31, 46, 0
751 xxpermdi 42, 31, 47, 0
753 xxpermdi 36, 31, 36, 3
755 xxpermdi 37, 31, 37, 3
756 xxpermdi 43, 31, 48, 0
758 xxpermdi 38, 31, 38, 3
759 xxpermdi 44, 31, 49, 0
761 xxpermdi 39, 31, 39, 3
762 xxpermdi 45, 31, 50, 0
764 xxpermdi 40, 31, 40, 3
801 # combine 26 bit limbs
802 # v4, v5, v6, v7 and v8 are 26 bit vectors
813 mfvsrld 16, 40 # save last 2 bytes
835 SYM_FUNC_END(poly1305_p10le_4blocks)
838 # =======================================================================
839 # The following functions implement 64 x 64 bits multiplication poly1305.
# ---------------------------------------------------------------------
# Poly1305_init_64 - key setup for the 64x64-bit multiply path.
# Loads the clamp mask (rmask) via the TOC, clamps the key halves
# r0/r1, derives s1 from r1, and stages (r0,s1), (r1,r0), s1, r0 in
# v0-v3 for Poly1305_mult.
# ---------------------------------------------------------------------
841 SYM_FUNC_START_LOCAL(Poly1305_init_64)
842 # mask 0x0FFFFFFC0FFFFFFC
843 # mask 0x0FFFFFFC0FFFFFFF
844 addis 10, 2, rmask@toc@ha # r10 = &rmask (TOC-relative address)
845 addi 10, 10, rmask@toc@l
853 and. 9, 9, 11 # clamp mask r0
854 and. 10, 10, 12 # clamp mask r1
857 add 19, 21, 10 # s1: r19 - (r1 >> 2) *5
# Stage the key words into vector registers (mtvsrdd packs two GPRs
# into one VSR): v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0.
861 mtvsrdd 32+0, 9, 19 # r0, s1
862 mtvsrdd 32+1, 10, 9 # r1, r0
863 mtvsrdd 32+2, 19, 25 # s1
864 mtvsrdd 32+3, 9, 25 # r0
867 SYM_FUNC_END(Poly1305_init_64)
870 # v6 = (h0, h1), v8 = h2
871 # v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
873 # Output: v7, v10, v11
# ---------------------------------------------------------------------
# Poly1305_mult - one 64x64-bit Poly1305 multiply of the accumulator
# (h0, h1, h2) by the key schedule (r0, r1, s1) using vmsumudm.
# In:  v6 = (h0, h1), v8 = h2, v0 = (r0, s1), v1 = (r1, r0),
#      v2 = s1, v3 = r0; v9 is the addend operand — presumably zero,
#      set up by the caller (TODO confirm in full source).
# Out: v7 = d0, v10 = d1, v11 = d2 (unreduced partial products).
# ---------------------------------------------------------------------
875 SYM_FUNC_START_LOCAL(Poly1305_mult)
877 # d0 = h0 * r0 + h1 * s1
878 vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
880 # d1 = h0 * r1 + h1 * r0 + h2 * s1
881 vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
882 vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
885 vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
887 SYM_FUNC_END(Poly1305_mult)
893 # Input: v7, v10, v11
894 # Output: r27, r28, r29
# ---------------------------------------------------------------------
# Carry_reduction - propagate carries through the unreduced product.
# In:  v7, v10, v11 (d0, d1, d2 from Poly1305_mult)
# Out: r27, r28, r29 = reduced h0, h1, h2
# The top limb is folded back with the (h2 & 3) * 5 step below
# (2^130 ≡ 5 mod 2^130-5), keeping only h2 & 3 in the top word.
# ---------------------------------------------------------------------
896 SYM_FUNC_START_LOCAL(Carry_reduction)
900 mfvsrd 20, 32+7 # h0.h
901 mfvsrd 21, 32+10 # h1.h
907 add 23, 23, 22 # (h2 & 3) * 5
910 andi. 29, 29, 0x3 # h2
912 SYM_FUNC_END(Carry_reduction)
915 # poly1305 multiplication
917 # d0 = h0 * r0 + h1 * s1
918 # d1 = h0 * r1 + h1 * r0 + h2 * s1
922 # unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
923 # - no highbit if final leftover block (highbit = 0)
# ---------------------------------------------------------------------
# poly1305_64s - scalar/VSX 64x64-bit Poly1305 block loop (prototype
# documented above). Bails out via Out_no_poly1305_64 when the length
# check fails (condition is set before this point — not visible here),
# loads the accumulator into vectors for Poly1305_mult, and on exit
# restores the callee-saved GPRs from the stack frame.
# ---------------------------------------------------------------------
925 SYM_FUNC_START(poly1305_64s)
927 ble Out_no_poly1305_64 # no full block to process - bail out
955 li 25, 0 # offset to inp and outp
984 mtvsrdd 32+6, 27, 28 # h0, h1
985 mtvsrdd 32+8, 29, 22 # h2
# Epilogue: restore non-volatile GPRs r14-r31 from the frame (r1)
# at offsets 112..248, mirroring the prologue saves.
999 RESTORE_GPR 14, 112, 1
1000 RESTORE_GPR 15, 120, 1
1001 RESTORE_GPR 16, 128, 1
1002 RESTORE_GPR 17, 136, 1
1003 RESTORE_GPR 18, 144, 1
1004 RESTORE_GPR 19, 152, 1
1005 RESTORE_GPR 20, 160, 1
1006 RESTORE_GPR 21, 168, 1
1007 RESTORE_GPR 22, 176, 1
1008 RESTORE_GPR 23, 184, 1
1009 RESTORE_GPR 24, 192, 1
1010 RESTORE_GPR 25, 200, 1
1011 RESTORE_GPR 26, 208, 1
1012 RESTORE_GPR 27, 216, 1
1013 RESTORE_GPR 28, 224, 1
1014 RESTORE_GPR 29, 232, 1
1015 RESTORE_GPR 30, 240, 1
1016 RESTORE_GPR 31, 248, 1
1027 SYM_FUNC_END(poly1305_64s)
1030 # Input: r3 = h, r4 = s, r5 = mac
# ---------------------------------------------------------------------
# poly1305_emit_64 - finalize and write the 16-byte Poly1305 tag.
# In: r3 = h (accumulator), r4 = s (final key half), r5 = mac (output)
# per the Input comment above; the shift below tests whether the
# accumulator overflowed past 2^130 for the final conditional subtract.
# ---------------------------------------------------------------------
1033 SYM_FUNC_START(poly1305_emit_64)
1046 srdi 9, 8, 2 # overflow?
1063 SYM_FUNC_END(poly1305_emit_64)
1065 SYM_DATA_START_LOCAL(RMASK)
1068 .byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
1070 .long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
1071 .long 0x1a, 0x00, 0x1a, 0x00
1072 .long 0x01000000, 0x01000000, 0x01000000, 0x01000000
1073 .long 0x00010203, 0x04050607, 0x10111213, 0x14151617
1074 .long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f