/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE 256
#define STK_REG(i) (112 + ((i)-14)*8)
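
/*
 * STACKFRAMESIZE is the size of the local stack frame created with stdu
 * below.  STK_REG(rN) gives the slot used to save the non-volatile GPRs
 * r14-r22 within that frame, e.g. STK_REG(r14) is offset 112 and
 * STK_REG(r22) is offset 176.
 */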

	.section __ex_table,"a"

	.section __ex_table,"a"

	.section __ex_table,"a"

	.section __ex_table,"a"
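
/*
 * The __ex_table sections above are fragments of the err1-err4 fixup
 * macros: each user-space load or store below is tagged err1; ... err4;
 * so its address is entered in the exception table along with the
 * recovery code to run if that access faults.
 */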

	ld r16,STK_REG(r16)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r14,STK_REG(r14)(r1)

	ld r0,STACKFRAMESIZE+16(r1)

#endif /* CONFIG_ALTIVEC */

	ld r22,STK_REG(r22)(r1)
	ld r21,STK_REG(r21)(r1)
	ld r20,STK_REG(r20)(r1)
	ld r19,STK_REG(r19)(r1)
	ld r18,STK_REG(r18)(r1)
	ld r17,STK_REG(r17)(r1)
	ld r16,STK_REG(r16)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r14,STK_REG(r14)(r1)

	addi r1,r1,STACKFRAMESIZE

	b __copy_tofrom_user_base
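
/*
 * The restore sequences above are the fault recovery paths: they reload
 * the saved non-volatile registers, pop the stack frame and hand the
 * original arguments back to __copy_tofrom_user_base, which redoes the
 * copy with the generic routine and returns the number of bytes that
 * could not be copied.
 */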

_GLOBAL(__copy_tofrom_user_power7)
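	/*
	 * Entry: r3 = destination, r4 = source, r5 = byte count.  The
	 * arguments are also stashed on the stack at 48/56/64(r1) so the
	 * fault handlers above can recover them (they are reloaded via the
	 * STACKFRAMESIZE+48/56/64 slots once a frame has been pushed).
	 */
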
	/* Get the source 8B aligned */

	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(r14)(r1)
	std r15,STK_REG(r15)(r1)
	std r16,STK_REG(r16)(r1)
	std r17,STK_REG(r17)(r1)
	std r18,STK_REG(r18)(r1)
	std r19,STK_REG(r19)(r1)
	std r20,STK_REG(r20)(r1)
	std r21,STK_REG(r21)(r1)
	std r22,STK_REG(r22)(r1)
	std r0,STACKFRAMESIZE+16(r1)
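	/*
	 * r14-r22 are callee-saved, so they are spilled to the new stack
	 * frame before the unrolled copy loop below uses them as
	 * temporaries; they are restored once the loop finishes.
	 */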

	/* Now do cacheline (128B) sized loads and stores. */

err2;	std r19,104(r3)
err2;	std r20,112(r3)
err2;	std r21,120(r3)

	ld r14,STK_REG(r14)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r16,STK_REG(r16)(r1)
	ld r17,STK_REG(r17)(r1)
	ld r18,STK_REG(r18)(r1)
	ld r19,STK_REG(r19)(r1)
	ld r20,STK_REG(r20)(r1)
	ld r21,STK_REG(r21)(r1)
	ld r22,STK_REG(r22)(r1)
	addi r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */

	/* Up to 63B to go */

	/* Up to 31B to go */

9:	clrldi r5,r5,(64-4)
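	/* r5 now holds only its low four bits: the 0-15 bytes still to copy */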

	/* Up to 15B to go */

err1;	lwz r0,0(r4)	/* Less chance of a reject with word ops */

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE

#ifdef CONFIG_ALTIVEC

	stdu r1,-STACKFRAMESIZE(r1)

	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STACKFRAMESIZE+48(r1)
	ld r4,STACKFRAMESIZE+56(r1)
	ld r5,STACKFRAMESIZE+64(r1)

	beq .Lunwind_stack_nonvmx_copy
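	/*
	 * The beq above falls back to the GPR copy when the (elided) VMX
	 * setup call reports that the vector unit cannot be used here;
	 * otherwise the copy continues with VMX, using the arguments just
	 * reloaded from the stack.
	 */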

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */

	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy
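	/*
	 * The rldicl. tests the low four bits of r6, which record whether
	 * source and destination share the same offset within a 16-byte
	 * vector; if they do not, every load below has to be realigned
	 * with vperm.
	 */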

	/* Get the destination 16B aligned */

	/* Get the destination 128B aligned */

err3;	stvx vr1,r3,r10
err3;	stvx vr0,r3,r11

	std r14,STK_REG(r14)(r1)
	std r15,STK_REG(r15)(r1)
	std r16,STK_REG(r16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
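	/*
	 * Each iteration of the loop below issues eight 16-byte lvx loads
	 * and eight stvx stores, one full 128-byte cacheline per pass,
	 * using offset registers set up earlier.
	 */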

err4;	stvx vr5,r3,r10
err4;	stvx vr4,r3,r11
err4;	stvx vr3,r3,r12
err4;	stvx vr2,r3,r14
err4;	stvx vr1,r3,r15
err4;	stvx vr0,r3,r16

	ld r14,STK_REG(r14)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r16,STK_REG(r16)(r1)

	/* Up to 127B to go */

err3;	stvx vr1,r3,r10
err3;	stvx vr0,r3,r11

	/* Up to 15B to go */

11:	clrldi r5,r5,(64-4)

15:	addi r1,r1,STACKFRAMESIZE
	b .exit_vmx_copy	/* tail call optimise */
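	/*
	 * Both VMX exit paths branch, rather than call, to .exit_vmx_copy:
	 * it releases whatever state the matching VMX-entry helper set up,
	 * and its return then goes straight back to our original caller.
	 */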

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */

err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */

	/* Get the destination 128B aligned */

	lvsl vr16,0,r4	/* Setup permute control vector */
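	/*
	 * lvsl builds a permute mask from the low four bits of the source
	 * address.  Each vperm below feeds two consecutive aligned 16-byte
	 * loads through that mask, yielding the 16 unaligned source bytes
	 * that straddle them, which can then be stored with an aligned stvx.
	 */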

	vperm vr8,vr0,vr1,vr16

	vperm vr8,vr0,vr1,vr16
	vperm vr9,vr1,vr0,vr16

	vperm vr8,vr0,vr3,vr16
	vperm vr9,vr3,vr2,vr16
	vperm vr10,vr2,vr1,vr16
	vperm vr11,vr1,vr0,vr16

err3;	stvx vr10,r3,r10
err3;	stvx vr11,r3,r11

	std r14,STK_REG(r14)(r1)
	std r15,STK_REG(r15)(r1)
	std r16,STK_REG(r16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
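	/*
	 * The vperm chain below splices each pair of adjacent source
	 * vectors (vr0/vr7, vr7/vr6, ... vr1/vr0) into an aligned result
	 * vr8-vr15; the last vector of one iteration is reused as the
	 * first input of the next, so each source vector is loaded only
	 * once.
	 */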

	vperm vr8,vr0,vr7,vr16
	vperm vr9,vr7,vr6,vr16
	vperm vr10,vr6,vr5,vr16
	vperm vr11,vr5,vr4,vr16
	vperm vr12,vr4,vr3,vr16
	vperm vr13,vr3,vr2,vr16
	vperm vr14,vr2,vr1,vr16
	vperm vr15,vr1,vr0,vr16

err4;	stvx vr10,r3,r10
err4;	stvx vr11,r3,r11
err4;	stvx vr12,r3,r12
err4;	stvx vr13,r3,r14
err4;	stvx vr14,r3,r15
err4;	stvx vr15,r3,r16

	ld r14,STK_REG(r14)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r16,STK_REG(r16)(r1)

	/* Up to 127B to go */

	vperm vr8,vr0,vr3,vr16
	vperm vr9,vr3,vr2,vr16
	vperm vr10,vr2,vr1,vr16
	vperm vr11,vr1,vr0,vr16

err3;	stvx vr10,r3,r10
err3;	stvx vr11,r3,r11

	vperm vr8,vr0,vr1,vr16
	vperm vr9,vr1,vr0,vr16

	vperm vr8,vr0,vr1,vr16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16	/* Unwind the +16 load offset */

err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */

15:	addi r1,r1,STACKFRAMESIZE
	b .exit_vmx_copy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */