/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
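
/*
 * Note on the macros above: lvsl plus vperm is the classic big-endian
 * recipe for merging two aligned 16B loads into one unaligned quantity,
 * with the shift control derived from the low bits of the source address.
 * On little-endian the same merge uses lvsr with the two data operands of
 * vperm swapped, so both layouts produce identical bytes in the
 * destination.
 */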
	EX_TABLE(100b,.Ldo_err1)
	EX_TABLE(200b,.Ldo_err2)
#ifdef CONFIG_ALTIVEC
	EX_TABLE(300b,.Ldo_err3)
	EX_TABLE(400b,.Ldo_err4)
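
/*
 * The EX_TABLE entries above are emitted by the err1-err4 assembler
 * macros: each macro tags the user-memory access that follows it with a
 * local label (100:, 200:, 300:, 400:) and registers a fixup so that a
 * fault at that access is redirected to the matching .Ldo_errN handler
 * below.
 */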
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base
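
/*
 * Fault recovery strategy: the handlers above restore any saved
 * non-volatiles, pop the stack frame, reload the original dst/src/len
 * stashed below the frame on entry, and restart the whole copy in
 * __copy_tofrom_user_base, which works out how many bytes could not be
 * copied and returns that count to the caller.
 */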
_GLOBAL(__copy_tofrom_user_power7)
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
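
/*
 * Scalar (non-VMX) copy.  The VMX path is only taken on CPUs with
 * CPU_FTR_ALTIVEC, and only for copies large enough to be worth the cost
 * of enabling the vector unit; SELFTEST_CASE lets the selftests force
 * either path.
 */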
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
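
	/*
	 * r14-r22 are saved so they can serve as extra scratch registers
	 * in the unrolled loop below; the link register is stashed in the
	 * standard LR save slot of the caller's frame (STACKFRAMESIZE+16
	 * from the new r1).
	 */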
	/* Now do cacheline (128B) sized loads and stores. */
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
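
	/*
	 * Each pass of the loop above moves one full 128B cacheline as
	 * sixteen 8-byte load/store pairs; r19-r21 carry the last three
	 * doublewords of the line (destination offsets 104, 112 and 120).
	 */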
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE
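
	/*
	 * The remaining tail (less than one cacheline) is finished below
	 * with progressively smaller power-of-two blocks, each copied only
	 * if the corresponding bit of the residual length is set, ending
	 * with word, halfword and byte accesses.
	 */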
	/* Up to 127B to go */

	/* Up to 63B to go */

	/* Up to 31B to go */
9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy
#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0
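
	/*
	 * enter_vmx_usercopy returns 0 if the vector unit cannot be used;
	 * that result is tested via cr1 below, after the prefetch has been
	 * kicked off, and on failure we fall back to the scalar copy.  The
	 * call may clobber the volatile argument registers, so dst/src/len
	 * are reloaded from the save area above.
	 */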
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
1:	lis	r0,0x0E00	/* depth=7 */
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	dcbt	0,r8,0b01010	/* all streams GO */
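
	/*
	 * The TH field selects what each touch describes: 0b01000 supplies
	 * the starting address of the stream (with the stream ID in the
	 * low address bits), while 0b01010 supplies stream control - the
	 * length in cachelines, the prefetch depth, the stream ID, and the
	 * GO bit that starts all defined streams at once.
	 */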
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
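
	/*
	 * "Relatively aligned" means src and dst differ by a multiple of
	 * 16: the low four bits of (src ^ dst) are zero, so once the
	 * destination has been brought to a 16B boundary the source is on
	 * one as well and plain lvx/stvx can be used.  Otherwise the
	 * LVS/VPERM merging loop handles the misaligned source.
	 */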
	/* Get the destination 16B aligned */

	/* Get the destination 128B aligned */

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
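
	/*
	 * Tail call: the frame is gone and LR still holds our caller's
	 * return address, so exit_vmx_usercopy returns straight to the
	 * original caller once it has released the vector unit, and the 0
	 * it returns becomes our bytes-not-copied result.
	 */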
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */

err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

	/* Get the destination 128B aligned */

	LVS(v16,0,r4)		/* Setup permute control vector */
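
	/*
	 * v16, generated from the low bits of the source address, drives
	 * the VPERM merges: each 16B store is assembled from the two
	 * consecutive 16B loads that straddle it, so the source can be
	 * read with aligned lvx even though it is not 16B aligned.  The
	 * load pointer runs 16B ahead of the data consumed, which is
	 * undone later ("Unwind the +16 load offset").
	 */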
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
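
	/*
	 * Each pass of this loop writes one 128B destination cacheline as
	 * 16B stvx stores; r10-r16 hold fixed multiples of 16 used as
	 * offsets within the line, and the stored vectors are the
	 * VPERM-merged results of the corresponding source loads.
	 */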
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	/* Up to 127B to go */

err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */

err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */