/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
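/*
 * LVS() builds a permute control vector from the low bits of an address and
 * VPERM() uses it to splice two aligned vector loads into the unaligned data
 * in between.  On little endian the vector element order is reversed, so
 * lvsr is used and the two vperm source operands are swapped; call sites can
 * then use LVS()/VPERM() without caring about endianness.
 */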
_GLOBAL(memcpy_power7)
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
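	/*
	 * The store above stashes the destination pointer in the (not yet
	 * allocated) stack frame, i.e. in the ABI-protected area below the
	 * stack pointer: memcpy() must return it in r3, so it is reloaded
	 * just before returning.
	 */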
test_feature = SELFTEST_CASE
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
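/*
 * The branch into the VMX copy path lives in a CPU feature section and is
 * only patched in when CPU_FTR_ALTIVEC is set; test_feature presumably lets
 * the user-space selftest build force the choice via SELFTEST_CASE above.
 */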
	/* Get the source 8B aligned */

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
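	/*
	 * The unrolled copy loop below uses the non-volatile GPRs r14-r22,
	 * so they are saved in the new frame first; r0 is assumed to hold
	 * the link register from an earlier mflr and goes into its usual
	 * ABI save slot.
	 */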
	/* Now do cacheline (128B) sized loads and stores. */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* Up to 127B to go */

	/* Up to 63B to go */

	/* Up to 31B to go */

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */

	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
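	/*
	 * "Reject" presumably refers to the load/store unit rejecting and
	 * re-issuing an access it cannot satisfy in one pass; for this
	 * short tail, word-sized operations are less likely to trigger
	 * that than doubleword ones.
	 */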
15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
#ifdef CONFIG_ALTIVEC
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_ops)
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
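	/*
	 * enter_vmx_ops() is a C call, so the source, destination and
	 * length were stashed in the frame and are reloaded here along
	 * with the saved LR in r0.  cr1 is assumed to reflect its return
	 * value: the beq below falls back to the plain copy when VMX
	 * cannot be used.
	 */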
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */

	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */

1:	lis	r0,0x0E00	/* depth=7 */

	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)
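	/*
	 * r6/r7 are assumed to carry the load-side stream address and
	 * parameters and r9/r10 the store-side equivalents (stream ID 1 in
	 * the low bit); DCBT_SETUP_STREAMS then emits the dcbt/dcbtst
	 * enhanced-touch sequence that programs both streams and starts
	 * them, using r8 as a scratch register.
	 */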
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
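	/*
	 * r6 is assumed to hold src XOR dest here: if their low four bits
	 * differ, aligning the destination to 16 bytes still leaves the
	 * source unaligned, so the slower vperm-based loop is needed.
	 */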
	/* Get the destination 16B aligned */

	/* Get the destination 128B aligned */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */
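	/*
	 * exit_vmx_ops() undoes enter_vmx_ops(); branching to it rather
	 * than calling it lets it return straight to memcpy_power7()'s
	 * caller, with r3 already holding the original destination.
	 */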
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */

	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

	/* Get the destination 128B aligned */

	LVS(v16,0,r4)		/* Setup permute control vector */
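	/*
	 * LVS() derives the permute control vector from the misalignment
	 * of the source address; the loop below loads aligned quadwords
	 * and uses VPERM() on each adjacent pair to reconstruct the
	 * unaligned source data before storing it.
	 */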
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
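	/*
	 * In the vector loop r4 is assumed to run 16 bytes ahead of the
	 * data actually consumed (the current quadword is kept in a
	 * register while the next one is fetched), so step it back before
	 * the scalar tail reads the remaining bytes.
	 */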
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */