/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
_GLOBAL(memcpy_power7)
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
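/*
 * The LVS/VPERM wrappers appear to exist so the vector copy loops below can
 * be written once for both byte orders: on big-endian builds LVS is lvsl and
 * VPERM passes its operands straight through, while on little-endian builds
 * the same byte selection is obtained with lvsr and the two source vectors
 * swapped. Conceptually, each output vector is
 *
 *	dst_vec[i] = concat(chunk_lo, chunk_hi)[(src & 0xf) + i],  i = 0..15
 *
 * where chunk_lo/chunk_hi are the two aligned 16B source chunks that
 * straddle it (names here are illustrative, not from this file).
 */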
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        /* Get the source 8B aligned */
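        /*
         * Roughly, the code under this label copies leading bytes until the
         * source pointer reaches an 8B boundary before dropping into the
         * wider loops; the real code keys byte/halfword/word loads off the
         * misalignment bits rather than looping. An illustrative C sketch of
         * that prologue (variable names are hypothetical, not from this file):
         *
         *	while (len && ((unsigned long)src & 0x7)) {
         *		*dst++ = *src++;		// one byte at a time
         *		len--;
         *	}
         */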
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)
        /* Now do cacheline (128B) sized loads and stores. */
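        /*
         * Illustrative C sketch (not part of this file) of the cacheline
         * loop: with the source now 8B aligned, each iteration moves one
         * 128B line as sixteen doublewords, and the non-volatile registers
         * saved above (r14-r22) presumably give the loop enough scratch
         * registers to keep loads running well ahead of stores.
         *
         *	uint64_t *s = (uint64_t *)src, *d = (uint64_t *)dst;
         *	for (size_t lines = len >> 7; lines; lines--) {
         *		for (int i = 0; i < 16; i++)	// 16 x 8B = 128B
         *			d[i] = s[i];
         *		s += 16;
         *		d += 16;
         *	}
         */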
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE
        /* Up to 127B to go */

        /* Up to 63B to go */

        /* Up to 31B to go */

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */

        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
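        /*
         * Illustrative C sketch (not part of this file) of the tail handling
         * above: the sub-128B remainder is peeled off in halving power-of-two
         * chunks (64/32/16), and the final sub-16B piece is finished with
         * word, halfword and byte operations; the in-line comment suggests
         * narrow accesses are less likely to suffer a load-store unit reject
         * here than wider ones.
         *
         *	size_t rem = len & 127;
         *	for (size_t chunk = 64; chunk >= 16; chunk >>= 1) {
         *		if (rem & chunk) {
         *			copy_block(dst, src, chunk);	// hypothetical helper
         *			src += chunk;
         *			dst += chunk;
         *		}
         *	}
         *	// then 8/4/2/1 byte scalar ops handle (rem & 15)
         */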
15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
#ifdef CONFIG_ALTIVEC

        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        stdu    r1,-STACKFRAMESIZE(r1)

        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */

1:      lis     r0,0x0E00       /* depth=7 */

        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */

        dcbtst  r0,r10,0b01010

        dcbt    r0,r8,0b01010   /* GO */
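        /*
         * The dcbt/dcbtst forms with a TH field of 0b01010 are the enhanced
         * ("data stream") touch variants: the register operand is interpreted
         * as a stream descriptor rather than a plain address, carrying the
         * unit count, depth and stream ID assembled above (r10 describes
         * store stream 1), and the final dcbt with the GO bit set in r8
         * starts the nominated streams. This is only a prefetch hint; the
         * hardware may ignore it, so the copy's correctness never depends on
         * these instructions.
         */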
        beq     cr1,.Lunwind_stack_nonvmx_copy
        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy
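        /*
         * Illustrative C sketch (not part of this file) of the test above:
         * the fast vector path requires source and destination to be mutually
         * 16B aligned, i.e. to share the same offset within a 16B vector,
         * otherwise every aligned store would need data from two loads.
         *
         *	if (((unsigned long)src ^ (unsigned long)dst) & 0xf)
         *		goto vmx_unaligned_copy;	// the slower permute loop
         */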
        /* Get the destination 16B aligned */

        /* Get the destination 128B aligned */
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       exit_vmx_copy           /* tail call optimise */
.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */

        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        /* Get the destination 128B aligned */
        LVS(vr16,0,r4)          /* Setup permute control vector */

        VPERM(vr8,vr0,vr1,vr16)

        VPERM(vr8,vr0,vr1,vr16)

        VPERM(vr9,vr1,vr0,vr16)

        VPERM(vr8,vr0,vr3,vr16)

        VPERM(vr9,vr3,vr2,vr16)

        VPERM(vr10,vr2,vr1,vr16)

        VPERM(vr11,vr1,vr0,vr16)
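        /*
         * Illustrative C sketch (not part of this file) of the permute scheme
         * used throughout this unaligned path: the source is read in aligned
         * 16B chunks, one chunk ahead of the data produced, and each aligned
         * 16B store is built by VPERM from the two chunks that straddle it,
         * using the control vector set up by LVS above. Names are
         * hypothetical.
         *
         *	vec prev = load_aligned(src);		// e.g. vr0
         *	for (; len >= 16; len -= 16) {
         *		vec next = load_aligned(src + 16);
         *		store_aligned(dst, permute(prev, next, control));
         *		prev = next;
         *		src += 16;
         *		dst += 16;
         *	}
         */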
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        VPERM(vr8,vr0,vr7,vr16)

        VPERM(vr9,vr7,vr6,vr16)

        VPERM(vr10,vr6,vr5,vr16)

        VPERM(vr11,vr5,vr4,vr16)

        VPERM(vr12,vr4,vr3,vr16)

        VPERM(vr13,vr3,vr2,vr16)

        VPERM(vr14,vr2,vr1,vr16)

        VPERM(vr15,vr1,vr0,vr16)
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        /* Up to 127B to go */

        VPERM(vr8,vr0,vr3,vr16)

        VPERM(vr9,vr3,vr2,vr16)

        VPERM(vr10,vr2,vr1,vr16)

        VPERM(vr11,vr1,vr0,vr16)

        VPERM(vr8,vr0,vr1,vr16)

        VPERM(vr9,vr1,vr0,vr16)

        VPERM(vr8,vr0,vr1,vr16)
        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
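        /*
         * Because the permute loop reads one 16B chunk ahead of the bytes it
         * has actually produced, r4 ends up biased 16 bytes past the data
         * consumed; the addi above removes that bias so the scalar tail below
         * reads the remaining (under 16) bytes from the right place.
         */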
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       exit_vmx_copy           /* tail call optimise */
#endif /* CONFIG_ALTIVEC */