 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 * Copyright (C) IBM Corporation, 2011
 * Author: Anton Blanchard <anton@au.ibm.com>
#include <asm/ppc_asm.h>
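/*
 * Note: asm/ppc_asm.h is assumed to provide the _GLOBAL() macro and the
 * STACKFRAMESIZE and STK_REG() helpers used for the register save area
 * below; their exact definitions live in that header, not in this file.
 */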
	.section __ex_table,"a"
	.section __ex_table,"a"
	.section __ex_table,"a"
	.section __ex_table,"a"
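/*
 * The four __ex_table fragments above appear to belong to the
 * err1/err2/err3/err4 macro definitions used to annotate user accesses
 * later in the file. Each is assumed to register an exception table
 * entry whose fixup matches the amount of state live at the faulting
 * access: err1 before any non-volatiles are saved, err2 after r14-r22
 * are saved, and err3/err4 for the VMX paths.
 */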
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
	ld	r0,STACKFRAMESIZE+16(r1)
#endif /* CONFIG_ALTIVEC */
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
	addi	r1,r1,STACKFRAMESIZE
	b	__copy_tofrom_user_base
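/*
 * The branch above appears to be the simple (err1) fixup path: the
 * original dest/src/count arguments are reloaded from the stack and the
 * copy is redone by __copy_tofrom_user_base, the generic copy routine,
 * which handles the fault itself and computes the number of uncopied
 * bytes to return.
 */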
_GLOBAL(__copy_tofrom_user_power7)
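/*
 * POWER7-optimised user copy. Per the usual __copy_tofrom_user
 * convention: r3 = destination, r4 = source, r5 = byte count, and the
 * return value is the number of bytes that could not be copied
 * (0 on success).
 */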
	/* Get the source 8B aligned */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
	/* Now do cacheline (128B) sized loads and stores. */
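/*
 * Judging by the 104/112/120 byte store offsets below, each iteration of
 * this loop appears to move one full 128B cacheline as sixteen 8-byte
 * load/store pairs staged through the registers saved above, roughly
 * (illustrative sketch only, not the exact register allocation):
 *
 *	for (; cachelines; cachelines--, s += 16, d += 16)
 *		for (i = 0; i < 16; i++)
 *			d[i] = s[i];
 *
 * where s and d are u64 pointers.
 */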
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* Up to 127B to go */
	/* Up to 63B to go */
	/* Up to 31B to go */
9:	clrldi	r5,r5,(64-4)
	/* Up to 15B to go */
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
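/*
 * Tail handling: the remaining 0-15 bytes appear to be copied with
 * progressively smaller scalar accesses (word, halfword, byte) selected
 * from the low bits of the length left in r5. Word rather than
 * doubleword accesses are used here, presumably because misaligned word
 * operations are less likely to be rejected and replayed by the POWER7
 * load-store unit.
 */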
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
#ifdef CONFIG_ALTIVEC
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
 * We prefetch both the source and destination using enhanced touch
 * instructions. We use a stream ID of 0 for the load side and
 * 1 for the store side.
	ori	r9,r9,1		/* stream=1 */
	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
1:	lis	r0,0x0E00	/* depth=7 */
	ori	r10,r7,1	/* stream=1 */
	lis	r8,0x8000	/* GO=1 */
	dcbtst	r0,r10,0b01010
	dcbt	r0,r8,0b01010	/* GO */
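/*
 * The dcbt/dcbtst sequence above programs two hardware prefetch streams
 * via the enhanced-touch (TH field) forms of the instructions: with
 * TH=0b01000 the EA operand is understood to describe a stream's start
 * address and stream ID, while with TH=0b01010 it carries control
 * information such as the unit count (length in cachelines), prefetch
 * depth and the GO bit. The final dcbt with GO=1 starts the nominated
 * streams. (Summary of the encoding as understood from Power ISA Book
 * II; see the ISA for the exact bit layout.)
 */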
	beq	.Lunwind_stack_nonvmx_copy
 * If source and destination are not relatively aligned we use a
 * slower permute loop.
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
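/*
 * The rldicl./bne pair above tests the low 4 bits of the source and
 * destination offset difference (assumed to be held in r6): if src and
 * dst are not congruent modulo 16, aligned lvx/stvx cannot be used for
 * both sides, so the copy falls through to the slower vperm-based
 * realignment loop at .Lvmx_unaligned_copy.
 */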
	/* Get the destination 16B aligned */
	/* Get the destination 128B aligned */
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
 * Now do cacheline sized loads and stores. By this stage the
 * cacheline stores are also cacheline aligned.
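/*
 * From the err4 stores below, each iteration of the aligned VMX loop
 * appears to move a whole 128B cacheline as eight 16-byte vectors
 * (vr0-vr7), stored with stvx at successive 16-byte offsets held in the
 * r9-r16 index registers.
 */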
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	/* Up to 127B to go */
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
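/*
 * exit_vmx_usercopy is assumed to restore the previous FP/VMX state and
 * return 0 in r3, so branching to it as a tail call also yields this
 * routine's success return value (0 bytes left uncopied).
 */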
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	/* Get the destination 128B aligned */
	lvsl	vr16,0,r4	/* Setup permute control vector */
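/*
 * Classic AltiVec misaligned-load technique: lvsl builds a permute
 * control vector from the low bits of the source address, then the loop
 * below issues only aligned lvx loads and uses vperm on each pair of
 * adjacent vectors to reconstruct the misaligned data, roughly:
 *
 *	lvx	vr0,0,r4	  ; aligned block holding the first bytes
 *	lvx	vr1,r4,rN	  ; next aligned block (rN = 16, illustrative)
 *	vperm	vr8,vr0,vr1,vr16  ; splice out the 16 wanted bytes
 *
 * Register roles in this sketch are illustrative; the real loop follows.
 */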
	vperm	vr8,vr0,vr1,vr16
	vperm	vr8,vr0,vr1,vr16
	vperm	vr9,vr1,vr0,vr16
	vperm	vr8,vr0,vr3,vr16
	vperm	vr9,vr3,vr2,vr16
	vperm	vr10,vr2,vr1,vr16
	vperm	vr11,vr1,vr0,vr16
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
 * Now do cacheline sized loads and stores. By this stage the
 * cacheline stores are also cacheline aligned.
	vperm	vr8,vr0,vr7,vr16
	vperm	vr9,vr7,vr6,vr16
	vperm	vr10,vr6,vr5,vr16
	vperm	vr11,vr5,vr4,vr16
	vperm	vr12,vr4,vr3,vr16
	vperm	vr13,vr3,vr2,vr16
	vperm	vr14,vr2,vr1,vr16
	vperm	vr15,vr1,vr0,vr16
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	/* Up to 127B to go */
	vperm	vr8,vr0,vr3,vr16
	vperm	vr9,vr3,vr2,vr16
	vperm	vr10,vr2,vr1,vr16
	vperm	vr11,vr1,vr0,vr16
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	vperm	vr8,vr0,vr1,vr16
	vperm	vr9,vr1,vr0,vr16
	vperm	vr8,vr0,vr1,vr16
	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
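/*
 * The permute loop keeps the source pointer one 16B block ahead of the
 * data actually consumed (it always needs the "next" aligned vector), so
 * that bias is removed here before the scalar sub-16B tail code reuses r4.
 */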
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */