2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16 * Copyright (C) IBM Corporation, 2011
18 * Author: Anton Blanchard <anton@au.ibm.com>
20 #include <asm/ppc_asm.h>
/*
 * Two alternative definitions of the LVS/VPERM helper macros. The
 * preprocessor conditional that selects between them is not visible in
 * this excerpt (presumably an endianness check -- confirm against the
 * full file).
 */
/* Variant 1: LVS maps to lvsl; VPERM passes VRA,VRB to vperm in caller order. */
23 #define LVS(VRT,RA,RB) lvsl VRT,RA,RB
24 #define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
/* Variant 2: LVS maps to lvsr; VPERM swaps VRA and VRB before calling vperm. */
26 #define LVS(VRT,RA,RB) lvsr VRT,RA,RB
27 #define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
/*
 * Four identical switches to the allocatable ("a") __ex_table section.
 * NOTE(review): the surrounding lines are not visible in this excerpt;
 * presumably each switch belongs to one of the err1..err4 helpers that
 * prefix the user-memory accesses further down -- confirm.
 */
32 .section __ex_table,"a"
40 .section __ex_table,"a"
49 .section __ex_table,"a"
57 .section __ex_table,"a"
/*
 * NOTE(review): the labels for these paths are not visible in this
 * excerpt; presumably they are the fault-recovery targets of the
 * errN-annotated accesses -- confirm against the full file.
 */
/* Reload r14-r16 from their stack-frame save slots. */
65 ld r16,STK_REG(R16)(r1)
66 ld r15,STK_REG(R15)(r1)
67 ld r14,STK_REG(R14)(r1)
/* Reload r0 from the slot at STACKFRAMESIZE+16(r1). */
70 ld r0,STACKFRAMESIZE+16(r1)
73 #endif /* CONFIG_ALTIVEC */
/* Reload r14-r22 from their stack-frame save slots. */
76 ld r22,STK_REG(R22)(r1)
77 ld r21,STK_REG(R21)(r1)
78 ld r20,STK_REG(R20)(r1)
79 ld r19,STK_REG(R19)(r1)
80 ld r18,STK_REG(R18)(r1)
81 ld r17,STK_REG(R17)(r1)
82 ld r16,STK_REG(R16)(r1)
83 ld r15,STK_REG(R15)(r1)
84 ld r14,STK_REG(R14)(r1)
/* Pop the stack frame, then hand over to __copy_tofrom_user_base. */
86 addi r1,r1,STACKFRAMESIZE
91 b __copy_tofrom_user_base
94 _GLOBAL(__copy_tofrom_user_power7)
116 /* Get the source 8B aligned */
/*
 * Allocate a STACKFRAMESIZE-byte stack frame and save r14-r22 in it so
 * they can be used as scratch registers and restored afterwards.
 */
144 stdu r1,-STACKFRAMESIZE(r1)
145 std r14,STK_REG(R14)(r1)
146 std r15,STK_REG(R15)(r1)
147 std r16,STK_REG(R16)(r1)
148 std r17,STK_REG(R17)(r1)
149 std r18,STK_REG(R18)(r1)
150 std r19,STK_REG(R19)(r1)
151 std r20,STK_REG(R20)(r1)
152 std r21,STK_REG(R21)(r1)
153 std r22,STK_REG(R22)(r1)
/*
 * r0 is stored 16 bytes above the new frame, i.e. in the previous frame.
 * NOTE(review): presumably r0 holds the link register here; the
 * instruction that loads it is not visible in this excerpt -- confirm.
 */
154 std r0,STACKFRAMESIZE+16(r1)
159 /* Now do cacheline (128B) sized loads and stores. */
/*
 * Last three doubleword stores of the 128B group: r19-r21 go to byte
 * offsets 104, 112 and 120 from the destination pointer in r3.
 */
192 err2; std r19,104(r3)
193 err2; std r20,112(r3)
194 err2; std r21,120(r3)
/* Restore r14-r22 from their stack-frame save slots. */
200 ld r14,STK_REG(R14)(r1)
201 ld r15,STK_REG(R15)(r1)
202 ld r16,STK_REG(R16)(r1)
203 ld r17,STK_REG(R17)(r1)
204 ld r18,STK_REG(R18)(r1)
205 ld r19,STK_REG(R19)(r1)
206 ld r20,STK_REG(R20)(r1)
207 ld r21,STK_REG(R21)(r1)
208 ld r22,STK_REG(R22)(r1)
/* Pop the stack frame. */
209 addi r1,r1,STACKFRAMESIZE
211 /* Up to 127B to go */
235 /* Up to 63B to go */
248 /* Up to 31B to go */
257 9: clrldi r5,r5,(64-4)
259 /* Up to 15B to go */
263 err1; lwz r0,0(r4) /* Less chance of a reject with word ops */
/*
 * Pop the stack frame. This label is the target of the
 * "beq cr1" taken after the prefetch setup in the VMX entry path.
 */
289 .Lunwind_stack_nonvmx_copy:
290 addi r1,r1,STACKFRAMESIZE
293 #ifdef CONFIG_ALTIVEC
/* Allocate a stack frame, then call enter_vmx_usercopy. */
297 stdu r1,-STACKFRAMESIZE(r1)
298 bl .enter_vmx_usercopy
/*
 * Reload r0 and r3-r5 from slots above the new frame.
 * NOTE(review): the stores that filled the +48/+56/+64 slots are not
 * visible in this excerpt; presumably they hold the destination (r3),
 * source (r4) and length (r5) arguments saved before the call -- confirm.
 */
300 ld r0,STACKFRAMESIZE+16(r1)
301 ld r3,STACKFRAMESIZE+48(r1)
302 ld r4,STACKFRAMESIZE+56(r1)
303 ld r5,STACKFRAMESIZE+64(r1)
307 * We prefetch both the source and destination using enhanced touch
308 * instructions. We use a stream ID of 0 for the load side and
309 * 1 for the store side.
/*
 * Build the operands consumed by the dcbt/dcbtst sequence below:
 * r6/r7 describe the load stream, r9/r10 the store stream, and r8
 * carries the GO bit that starts all streams.
 */
313 ori r9,r9,1 /* stream=1 */
315 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
319 1: lis r0,0x0E00 /* depth=7 */
322 ori r10,r7,1 /* stream=1 */
324 lis r8,0x8000 /* GO=1 */
329 /* setup read stream 0 */
330 dcbt r0,r6,0b01000 /* addr from */
331 dcbt r0,r7,0b01010 /* length and depth from */
332 /* setup write stream 1 */
333 dcbtst r0,r9,0b01000 /* addr to */
334 dcbtst r0,r10,0b01010 /* length and depth to */
336 dcbt r0,r8,0b01010 /* all streams GO */
/*
 * If cr1 has EQ set, pop the frame at .Lunwind_stack_nonvmx_copy.
 * NOTE(review): the compare that sets cr1 is not visible in this excerpt;
 * presumably it tests the result of enter_vmx_usercopy -- confirm.
 */
339 beq cr1,.Lunwind_stack_nonvmx_copy
342 * If source and destination are not relatively aligned we use a
343 * slower permute loop.
/*
 * Keep only the low 4 bits of r6 and record the result in cr0; a non-zero
 * value selects the permute-based .Lvmx_unaligned_copy path.
 * NOTE(review): the instruction that computes r6 is not visible here;
 * presumably it combines the source and destination addresses -- confirm.
 */
346 rldicl. r6,r6,0,(64-4)
347 bne .Lvmx_unaligned_copy
349 /* Get the destination 16B aligned */
380 /* Get the destination 128B aligned */
412 err3; stvx vr1,r3,r10
413 err3; stvx vr0,r3,r11
/*
 * Save r14-r16: the cacheline stvx stores below use them as index
 * registers, and they are reloaded once those stores are done.
 */
419 std r14,STK_REG(R14)(r1)
420 std r15,STK_REG(R15)(r1)
421 std r16,STK_REG(R16)(r1)
431 * Now do cacheline sized loads and stores. By this stage the
432 * cacheline stores are also cacheline aligned.
447 err4; stvx vr5,r3,r10 /* vr5..vr0 are stored at r3 + r10/r11/r12/r14/r15/r16 */
448 err4; stvx vr4,r3,r11
449 err4; stvx vr3,r3,r12
450 err4; stvx vr2,r3,r14
451 err4; stvx vr1,r3,r15
452 err4; stvx vr0,r3,r16
456 ld r14,STK_REG(R14)(r1) /* restore the index registers saved above */
457 ld r15,STK_REG(R15)(r1)
458 ld r16,STK_REG(R16)(r1)
460 /* Up to 127B to go */
473 err3; stvx vr1,r3,r10
474 err3; stvx vr0,r3,r11
491 /* Up to 15B to go */
492 11: clrldi r5,r5,(64-4) /* keep only the low 4 bits of the length in r5 */
516 15: addi r1,r1,STACKFRAMESIZE /* pop the stack frame before leaving */
517 b .exit_vmx_usercopy /* tail call optimise */
519 .Lvmx_unaligned_copy:
520 /* Get the destination 16B aligned */
544 err3; lwz r0,0(r4) /* Less chance of a reject with word ops */
553 /* Get the destination 128B aligned */
/*
 * vr16 receives the permute control computed from the source pointer in
 * r4 (the RA operand is 0). Every VPERM below uses vr16 to merge two
 * source vectors into one output vector.
 */
563 LVS(vr16,0,r4) /* Setup permute control vector */
569 VPERM(vr8,vr0,vr1,vr16) /* one-vector step: vr8 from vr0,vr1 */
577 VPERM(vr8,vr0,vr1,vr16) /* two-vector step: vr8, vr9 */
579 VPERM(vr9,vr1,vr0,vr16)
/*
 * Four-vector step: consecutive VPERMs share one input register
 * (vr3, then vr2, then vr1), chaining neighbouring source vectors.
 */
587 VPERM(vr8,vr0,vr3,vr16)
589 VPERM(vr9,vr3,vr2,vr16)
591 VPERM(vr10,vr2,vr1,vr16)
593 VPERM(vr11,vr1,vr0,vr16)
597 err3; stvx vr10,r3,r10 /* store vr10 at r3+r10 */
598 err3; stvx vr11,r3,r11 /* store vr11 at r3+r11 */
/*
 * Save r14-r16: the cacheline stvx stores below use them as index
 * registers, and they are reloaded once those stores are done.
 */
604 std r14,STK_REG(R14)(r1)
605 std r15,STK_REG(R15)(r1)
606 std r16,STK_REG(R16)(r1)
616 * Now do cacheline sized loads and stores. By this stage the
617 * cacheline stores are also cacheline aligned.
622 VPERM(vr8,vr0,vr7,vr16) /* eight permutes fill vr8-vr15, all controlled by vr16 */
624 VPERM(vr9,vr7,vr6,vr16) /* each one reuses an input of the previous permute */
626 VPERM(vr10,vr6,vr5,vr16)
628 VPERM(vr11,vr5,vr4,vr16)
630 VPERM(vr12,vr4,vr3,vr16)
632 VPERM(vr13,vr3,vr2,vr16)
634 VPERM(vr14,vr2,vr1,vr16)
636 VPERM(vr15,vr1,vr0,vr16) /* vr0 is an input to both the first and last permute */
640 err4; stvx vr10,r3,r10 /* vr10..vr15 are stored at r3 + r10/r11/r12/r14/r15/r16 */
641 err4; stvx vr11,r3,r11
642 err4; stvx vr12,r3,r12
643 err4; stvx vr13,r3,r14
644 err4; stvx vr14,r3,r15
645 err4; stvx vr15,r3,r16
649 ld r14,STK_REG(R14)(r1) /* restore the index registers saved above */
650 ld r15,STK_REG(R15)(r1)
651 ld r16,STK_REG(R16)(r1)
653 /* Up to 127B to go */
660 VPERM(vr8,vr0,vr3,vr16)
662 VPERM(vr9,vr3,vr2,vr16)
664 VPERM(vr10,vr2,vr1,vr16)
666 VPERM(vr11,vr1,vr0,vr16)
670 err3; stvx vr10,r3,r10
671 err3; stvx vr11,r3,r11
676 VPERM(vr8,vr0,vr1,vr16)
678 VPERM(vr9,vr1,vr0,vr16)
686 VPERM(vr8,vr0,vr1,vr16)
691 /* Up to 15B to go */
692 11: clrldi r5,r5,(64-4) /* keep only the low 4 bits of the length in r5 */
693 addi r4,r4,-16 /* Unwind the +16 load offset */
696 err3; lwz r0,0(r4) /* Less chance of a reject with word ops */
719 15: addi r1,r1,STACKFRAMESIZE /* pop the stack frame before leaving */
720 b .exit_vmx_usercopy /* tail call optimise */
721 #endif /* CONFIG_ALTIVEC */