/* SPDX-License-Identifier: GPL-2.0-or-later */
# Accelerated chacha20 implementation for ppc64le.
# Copyright 2023- IBM Corp. All rights reserved
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
# size_t len, int nrounds);
# do rounds, 8 quarter rounds
# 1. a += b; d ^= a; d <<<= 16;
# 2. c += d; b ^= c; b <<<= 12;
# 3. a += b; d ^= a; d <<<= 8;
# 4. c += d; b ^= c; b <<<= 7
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
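#
# For reference, a plain C sketch of the quarter round and of one double round
# (column round followed by diagonal round) over a 16-word state.  qr(),
# rotl32() and double_round() are illustration-only helpers and are not part
# of this file; nrounds/2 such double rounds make up the block function.  In
# the vector code below, the byte-aligned rotates (16 and 8) are folded into
# the preceding xor by vpermxor with a precomputed byte-permute pattern.
#
#   typedef unsigned int u32;
#
#   static u32 rotl32(u32 x, int n)
#   {
#           return (x << n) | (x >> (32 - n));
#   }
#
#   static void qr(u32 x[16], int a, int b, int c, int d)
#   {
#           x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
#           x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
#           x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
#           x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
#   }
#
#   static void double_round(u32 x[16])
#   {
#           /* column round */
#           qr(x, 0, 4, 8, 12); qr(x, 1, 5, 9, 13);
#           qr(x, 2, 6, 10, 14); qr(x, 3, 7, 11, 15);
#           /* diagonal round */
#           qr(x, 0, 5, 10, 15); qr(x, 1, 6, 11, 12);
#           qr(x, 2, 7, 8, 13); qr(x, 3, 4, 9, 14);
#   }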
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>
.macro SAVE_GPR GPR OFFSET FRAME
std \GPR,\OFFSET(\FRAME)
.macro SAVE_VRS VRS OFFSET FRAME
.macro SAVE_VSX VSX OFFSET FRAME
stxvx \VSX, 16, \FRAME
.macro RESTORE_GPR GPR OFFSET FRAME
ld \GPR,\OFFSET(\FRAME)
.macro RESTORE_VRS VRS OFFSET FRAME
.macro RESTORE_VSX VSX OFFSET FRAME
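# The RESTORE_* invocations below form the function epilogue: they reload the
# callee-saved vector (VRS), VSX and general-purpose registers from the stack
# frame the prologue saved them to, before returning to the caller.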
RESTORE_VRS 21, 16, 9
RESTORE_VRS 22, 32, 9
RESTORE_VRS 23, 48, 9
RESTORE_VRS 24, 64, 9
RESTORE_VRS 25, 80, 9
RESTORE_VRS 26, 96, 9
RESTORE_VRS 27, 112, 9
RESTORE_VRS 28, 128, 9
RESTORE_VRS 29, 144, 9
RESTORE_VRS 30, 160, 9
RESTORE_VRS 31, 176, 9
RESTORE_VSX 14, 192, 9
RESTORE_VSX 15, 208, 9
RESTORE_VSX 16, 224, 9
RESTORE_VSX 17, 240, 9
RESTORE_VSX 18, 256, 9
RESTORE_VSX 19, 272, 9
RESTORE_VSX 20, 288, 9
RESTORE_VSX 21, 304, 9
RESTORE_VSX 22, 320, 9
RESTORE_VSX 23, 336, 9
RESTORE_VSX 24, 352, 9
RESTORE_VSX 25, 368, 9
RESTORE_VSX 26, 384, 9
RESTORE_VSX 27, 400, 9
RESTORE_VSX 28, 416, 9
RESTORE_VSX 29, 432, 9
RESTORE_VSX 30, 448, 9
RESTORE_VSX 31, 464, 9
RESTORE_GPR 14, 112, 1
RESTORE_GPR 15, 120, 1
RESTORE_GPR 16, 128, 1
RESTORE_GPR 17, 136, 1
RESTORE_GPR 18, 144, 1
RESTORE_GPR 19, 152, 1
RESTORE_GPR 20, 160, 1
RESTORE_GPR 21, 168, 1
RESTORE_GPR 22, 176, 1
RESTORE_GPR 23, 184, 1
RESTORE_GPR 24, 192, 1
RESTORE_GPR 25, 200, 1
RESTORE_GPR 26, 208, 1
RESTORE_GPR 27, 216, 1
RESTORE_GPR 28, 224, 1
RESTORE_GPR 29, 232, 1
RESTORE_GPR 30, 240, 1
RESTORE_GPR 31, 248, 1
# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
xxlor 0, 32+25, 32+25
vpermxor 12, 12, 0, 25
vpermxor 13, 13, 1, 25
vpermxor 14, 14, 2, 25
vpermxor 15, 15, 3, 25
vpermxor 28, 28, 16, 25
vpermxor 29, 29, 17, 25
vpermxor 30, 30, 18, 25
vpermxor 31, 31, 19, 25
xxlor 0, 32+25, 32+25
xxlor 0, 32+25, 32+25
vpermxor 12, 12, 0, 25
vpermxor 13, 13, 1, 25
vpermxor 14, 14, 2, 25
vpermxor 15, 15, 3, 25
vpermxor 28, 28, 16, 25
vpermxor 29, 29, 17, 25
vpermxor 30, 30, 18, 25
vpermxor 31, 31, 19, 25
xxlor 0, 32+28, 32+28
# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
xxlor 0, 32+25, 32+25
vpermxor 15, 15, 0, 25
vpermxor 12, 12, 1, 25
vpermxor 13, 13, 2, 25
vpermxor 14, 14, 3, 25
vpermxor 31, 31, 16, 25
vpermxor 28, 28, 17, 25
vpermxor 29, 29, 18, 25
vpermxor 30, 30, 19, 25
xxlor 0, 32+25, 32+25
xxlor 0, 32+25, 32+25
vpermxor 15, 15, 0, 25
vpermxor 12, 12, 1, 25
vpermxor 13, 13, 2, 25
vpermxor 14, 14, 3, 25
vpermxor 31, 31, 16, 25
vpermxor 28, 28, 17, 25
vpermxor 29, 29, 18, 25
vpermxor 30, 30, 19, 25
xxlor 0, 32+28, 32+28
# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
vpermxor 12, 12, 0, 20
vpermxor 13, 13, 1, 20
vpermxor 14, 14, 2, 20
vpermxor 15, 15, 3, 20
vpermxor 12, 12, 0, 22
vpermxor 13, 13, 1, 22
vpermxor 14, 14, 2, 22
vpermxor 15, 15, 3, 22
# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
vpermxor 15, 15, 0, 20
vpermxor 12, 12, 1, 20
vpermxor 13, 13, 2, 20
vpermxor 14, 14, 3, 20
vpermxor 15, 15, 0, 22
vpermxor 12, 12, 1, 22
vpermxor 13, 13, 2, 22
vpermxor 14, 14, 3, 22
.macro TP_4x a0 a1 a2 a3
xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
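#
# The TP_4x macro transposes a 4x4 matrix of 32-bit words spread across the
# four vector registers \a0-\a3, so each output register gathers the same word
# lane from all four inputs.  A plain C sketch of the equivalent operation on
# an array (illustration only, not part of this file):
#
#   static void tp_4x(u32 m[4][4])
#   {
#           u32 t[4][4];
#           int r, c;
#
#           for (r = 0; r < 4; r++)
#                   for (c = 0; c < 4; c++)
#                           t[c][r] = m[r][c];      /* transpose */
#           for (r = 0; r < 4; r++)
#                   for (c = 0; c < 4; c++)
#                           m[r][c] = t[r][c];
#   }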
# key stream = working state + state
vadduwm \S+0, \S+0, 16-\S
vadduwm \S+4, \S+4, 17-\S
vadduwm \S+8, \S+8, 18-\S
vadduwm \S+12, \S+12, 19-\S
vadduwm \S+1, \S+1, 16-\S
vadduwm \S+5, \S+5, 17-\S
vadduwm \S+9, \S+9, 18-\S
vadduwm \S+13, \S+13, 19-\S
vadduwm \S+2, \S+2, 16-\S
vadduwm \S+6, \S+6, 17-\S
vadduwm \S+10, \S+10, 18-\S
vadduwm \S+14, \S+14, 19-\S
vadduwm \S+3, \S+3, 16-\S
vadduwm \S+7, \S+7, 17-\S
vadduwm \S+11, \S+11, 18-\S
vadduwm \S+15, \S+15, 19-\S
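#
# As in the scalar ChaCha block function, the post-round working state is
# added word-by-word (mod 2^32) to the saved initial state to produce the key
# stream; here the addition is done on vector registers so that several blocks
# are handled at once.  Single-block C sketch (illustration only):
#
#   static void add_state(u32 ks[16], const u32 working[16], const u32 init[16])
#   {
#           int i;
#
#           for (i = 0; i < 16; i++)
#                   ks[i] = working[i] + init[i];
#   }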
xxlxor \S+32, \S+32, 0
xxlxor \S+36, \S+36, 1
xxlxor \S+40, \S+40, 2
xxlxor \S+44, \S+44, 3
xxlxor \S+33, \S+33, 4
xxlxor \S+37, \S+37, 5
xxlxor \S+41, \S+41, 6
xxlxor \S+45, \S+45, 7
xxlxor \S+34, \S+34, 8
xxlxor \S+38, \S+38, 9
xxlxor \S+42, \S+42, 10
xxlxor \S+46, \S+46, 11
xxlxor \S+35, \S+35, 12
xxlxor \S+39, \S+39, 13
xxlxor \S+43, \S+43, 14
xxlxor \S+47, \S+47, 15
stxvw4x \S+36, 17, 16
stxvw4x \S+40, 18, 16
stxvw4x \S+44, 19, 16
stxvw4x \S+33, 20, 16
stxvw4x \S+37, 21, 16
stxvw4x \S+41, 22, 16
stxvw4x \S+45, 23, 16
stxvw4x \S+34, 24, 16
stxvw4x \S+38, 25, 16
stxvw4x \S+42, 26, 16
stxvw4x \S+46, 27, 16
stxvw4x \S+35, 28, 16
stxvw4x \S+39, 29, 16
stxvw4x \S+43, 30, 16
stxvw4x \S+47, 31, 16
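#
# The key stream is then xor-ed with the source data and the result stored to
# the destination, i.e. the usual stream-cipher step dst = src ^ keystream.
# Byte-wise C sketch (illustration only):
#
#   static void xor_stream(unsigned char *dst, const unsigned char *src,
#                          const unsigned char *ks, unsigned long len)
#   {
#           unsigned long i;
#
#           for (i = 0; i < len; i++)
#                   dst[i] = src[i] ^ ks[i];
#   }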
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
SYM_FUNC_START(chacha_p10le_8x)
# r17 - r31 mainly for Write_256 macro.
li 14, 0 # offset to inp and outp
lxvw4x 48, 0, 3 # vr16, constants
lxvw4x 49, 17, 3 # vr17, key 1
lxvw4x 50, 18, 3 # vr18, key 2
lxvw4x 51, 19, 3 # vr19, counter, nonce
# create (0, 1, 2, 3) counters
vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
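#
# The four lxvw4x loads above read the 16-word ChaCha state from *state;
# typically state[0..3] hold the "expand 32-byte k" constants, state[4..11]
# the 256-bit key, state[12] the block counter and state[13..15] the nonce.
# Since eight 64-byte blocks are produced per iteration, each block's counter
# word gets a per-block offset: blocks 0-3 use counter + (0, 1, 2, 3) and
# blocks 4-7 use counter + (4, 5, 6, 7) (see the vadduwm that forms vr31
# below).  Hypothetical C view of the state argument (illustration only):
#
#   struct chacha_state_view {
#           u32 constant[4];        /* "expa" "nd 3" "2-by" "te k" */
#           u32 key[8];             /* 256-bit key */
#           u32 counter;            /* 32-bit block counter */
#           u32 nonce[3];           /* 96-bit nonce */
#   };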
addis 11, 2, permx@toc@ha
addi 11, 11, permx@toc@l
# save constants to vsx
xxlor 25, 32+26, 32+26
xxlor 24, 32+25, 32+25
vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
xxlor 30, 32+30, 32+30
xxlor 31, 32+31, 32+31
xxlor 20, 32+20, 32+20
xxlor 21, 32+21, 32+21
xxlor 22, 32+22, 32+22
xxlor 23, 32+23, 32+23
vadduwm 12, 12, 30 # increase counter
vadduwm 28, 28, 31 # increase counter
xxlor 0, 32+30, 32+30
addi 14, 14, 256 # offset +=256
addi 15, 15, -256 # len -=256
xxlor 5, 32+31, 32+31
TP_4x 16+0, 16+1, 16+2, 16+3
TP_4x 16+4, 16+5, 16+6, 16+7
TP_4x 16+8, 16+9, 16+10, 16+11
TP_4x 16+12, 16+13, 16+14, 16+15
addi 14, 14, 256 # offset +=256
addi 15, 15, -256 # len -=256
xxlor 30, 32+30, 32+30
xxlor 31, 32+31, 32+31
lxvw4x 48, 0, 3 # vr16, constants
lxvw4x 49, 17, 3 # vr17, key 1
lxvw4x 50, 18, 3 # vr18, key 2
lxvw4x 51, 19, 3 # vr19, counter, nonce
addis 11, 2, permx@toc@ha
addi 11, 11, permx@toc@l
vadduwm 12, 12, 30 # increase counter
addi 14, 14, 256 # offset += 256
addi 15, 15, -256 # len -= 256
# Update state counter
SYM_FUNC_END(chacha_p10le_8x)
SYM_DATA_START_LOCAL(PERMX)
.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc