/*
 * arch/powerpc/lib/copyuser_power7.S
 * (from Linux 3.17-rc2, gitweb blob c46c876ac96af693445e91da8204ade16169ece6)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
20 #include <asm/ppc_asm.h>
22 #ifdef __BIG_ENDIAN__
23 #define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
24 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
25 #else
26 #define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
27 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
28 #endif
30         .macro err1
31 100:
32         .section __ex_table,"a"
33         .align 3
34         .llong 100b,.Ldo_err1
35         .previous
36         .endm
38         .macro err2
39 200:
40         .section __ex_table,"a"
41         .align 3
42         .llong 200b,.Ldo_err2
43         .previous
44         .endm
46 #ifdef CONFIG_ALTIVEC
47         .macro err3
48 300:
49         .section __ex_table,"a"
50         .align 3
51         .llong 300b,.Ldo_err3
52         .previous
53         .endm
55         .macro err4
56 400:
57         .section __ex_table,"a"
58         .align 3
59         .llong 400b,.Ldo_err4
60         .previous
61         .endm
64 .Ldo_err4:
65         ld      r16,STK_REG(R16)(r1)
66         ld      r15,STK_REG(R15)(r1)
67         ld      r14,STK_REG(R14)(r1)
68 .Ldo_err3:
69         bl      exit_vmx_usercopy
70         ld      r0,STACKFRAMESIZE+16(r1)
71         mtlr    r0
72         b       .Lexit
73 #endif /* CONFIG_ALTIVEC */
75 .Ldo_err2:
76         ld      r22,STK_REG(R22)(r1)
77         ld      r21,STK_REG(R21)(r1)
78         ld      r20,STK_REG(R20)(r1)
79         ld      r19,STK_REG(R19)(r1)
80         ld      r18,STK_REG(R18)(r1)
81         ld      r17,STK_REG(R17)(r1)
82         ld      r16,STK_REG(R16)(r1)
83         ld      r15,STK_REG(R15)(r1)
84         ld      r14,STK_REG(R14)(r1)
85 .Lexit:
86         addi    r1,r1,STACKFRAMESIZE
87 .Ldo_err1:
88         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
89         ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
90         ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
91         b       __copy_tofrom_user_base
94 _GLOBAL(__copy_tofrom_user_power7)
95 #ifdef CONFIG_ALTIVEC
96         cmpldi  r5,16
97         cmpldi  cr1,r5,4096
99         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
100         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
101         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
103         blt     .Lshort_copy
104         bgt     cr1,.Lvmx_copy
105 #else
106         cmpldi  r5,16
108         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
109         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
110         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
112         blt     .Lshort_copy
113 #endif
115 .Lnonvmx_copy:
116         /* Get the source 8B aligned */
117         neg     r6,r4
118         mtocrf  0x01,r6
119         clrldi  r6,r6,(64-3)
121         bf      cr7*4+3,1f
122 err1;   lbz     r0,0(r4)
123         addi    r4,r4,1
124 err1;   stb     r0,0(r3)
125         addi    r3,r3,1
127 1:      bf      cr7*4+2,2f
128 err1;   lhz     r0,0(r4)
129         addi    r4,r4,2
130 err1;   sth     r0,0(r3)
131         addi    r3,r3,2
133 2:      bf      cr7*4+1,3f
134 err1;   lwz     r0,0(r4)
135         addi    r4,r4,4
136 err1;   stw     r0,0(r3)
137         addi    r3,r3,4
139 3:      sub     r5,r5,r6
140         cmpldi  r5,128
141         blt     5f
143         mflr    r0
144         stdu    r1,-STACKFRAMESIZE(r1)
145         std     r14,STK_REG(R14)(r1)
146         std     r15,STK_REG(R15)(r1)
147         std     r16,STK_REG(R16)(r1)
148         std     r17,STK_REG(R17)(r1)
149         std     r18,STK_REG(R18)(r1)
150         std     r19,STK_REG(R19)(r1)
151         std     r20,STK_REG(R20)(r1)
152         std     r21,STK_REG(R21)(r1)
153         std     r22,STK_REG(R22)(r1)
154         std     r0,STACKFRAMESIZE+16(r1)
156         srdi    r6,r5,7
157         mtctr   r6
159         /* Now do cacheline (128B) sized loads and stores. */
160         .align  5
162 err2;   ld      r0,0(r4)
163 err2;   ld      r6,8(r4)
164 err2;   ld      r7,16(r4)
165 err2;   ld      r8,24(r4)
166 err2;   ld      r9,32(r4)
167 err2;   ld      r10,40(r4)
168 err2;   ld      r11,48(r4)
169 err2;   ld      r12,56(r4)
170 err2;   ld      r14,64(r4)
171 err2;   ld      r15,72(r4)
172 err2;   ld      r16,80(r4)
173 err2;   ld      r17,88(r4)
174 err2;   ld      r18,96(r4)
175 err2;   ld      r19,104(r4)
176 err2;   ld      r20,112(r4)
177 err2;   ld      r21,120(r4)
178         addi    r4,r4,128
179 err2;   std     r0,0(r3)
180 err2;   std     r6,8(r3)
181 err2;   std     r7,16(r3)
182 err2;   std     r8,24(r3)
183 err2;   std     r9,32(r3)
184 err2;   std     r10,40(r3)
185 err2;   std     r11,48(r3)
186 err2;   std     r12,56(r3)
187 err2;   std     r14,64(r3)
188 err2;   std     r15,72(r3)
189 err2;   std     r16,80(r3)
190 err2;   std     r17,88(r3)
191 err2;   std     r18,96(r3)
192 err2;   std     r19,104(r3)
193 err2;   std     r20,112(r3)
194 err2;   std     r21,120(r3)
195         addi    r3,r3,128
196         bdnz    4b
198         clrldi  r5,r5,(64-7)
200         ld      r14,STK_REG(R14)(r1)
201         ld      r15,STK_REG(R15)(r1)
202         ld      r16,STK_REG(R16)(r1)
203         ld      r17,STK_REG(R17)(r1)
204         ld      r18,STK_REG(R18)(r1)
205         ld      r19,STK_REG(R19)(r1)
206         ld      r20,STK_REG(R20)(r1)
207         ld      r21,STK_REG(R21)(r1)
208         ld      r22,STK_REG(R22)(r1)
209         addi    r1,r1,STACKFRAMESIZE
211         /* Up to 127B to go */
212 5:      srdi    r6,r5,4
213         mtocrf  0x01,r6
215 6:      bf      cr7*4+1,7f
216 err1;   ld      r0,0(r4)
217 err1;   ld      r6,8(r4)
218 err1;   ld      r7,16(r4)
219 err1;   ld      r8,24(r4)
220 err1;   ld      r9,32(r4)
221 err1;   ld      r10,40(r4)
222 err1;   ld      r11,48(r4)
223 err1;   ld      r12,56(r4)
224         addi    r4,r4,64
225 err1;   std     r0,0(r3)
226 err1;   std     r6,8(r3)
227 err1;   std     r7,16(r3)
228 err1;   std     r8,24(r3)
229 err1;   std     r9,32(r3)
230 err1;   std     r10,40(r3)
231 err1;   std     r11,48(r3)
232 err1;   std     r12,56(r3)
233         addi    r3,r3,64
235         /* Up to 63B to go */
236 7:      bf      cr7*4+2,8f
237 err1;   ld      r0,0(r4)
238 err1;   ld      r6,8(r4)
239 err1;   ld      r7,16(r4)
240 err1;   ld      r8,24(r4)
241         addi    r4,r4,32
242 err1;   std     r0,0(r3)
243 err1;   std     r6,8(r3)
244 err1;   std     r7,16(r3)
245 err1;   std     r8,24(r3)
246         addi    r3,r3,32
248         /* Up to 31B to go */
249 8:      bf      cr7*4+3,9f
250 err1;   ld      r0,0(r4)
251 err1;   ld      r6,8(r4)
252         addi    r4,r4,16
253 err1;   std     r0,0(r3)
254 err1;   std     r6,8(r3)
255         addi    r3,r3,16
257 9:      clrldi  r5,r5,(64-4)
259         /* Up to 15B to go */
260 .Lshort_copy:
261         mtocrf  0x01,r5
262         bf      cr7*4+0,12f
263 err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
264 err1;   lwz     r6,4(r4)
265         addi    r4,r4,8
266 err1;   stw     r0,0(r3)
267 err1;   stw     r6,4(r3)
268         addi    r3,r3,8
270 12:     bf      cr7*4+1,13f
271 err1;   lwz     r0,0(r4)
272         addi    r4,r4,4
273 err1;   stw     r0,0(r3)
274         addi    r3,r3,4
276 13:     bf      cr7*4+2,14f
277 err1;   lhz     r0,0(r4)
278         addi    r4,r4,2
279 err1;   sth     r0,0(r3)
280         addi    r3,r3,2
282 14:     bf      cr7*4+3,15f
283 err1;   lbz     r0,0(r4)
284 err1;   stb     r0,0(r3)
286 15:     li      r3,0
287         blr
289 .Lunwind_stack_nonvmx_copy:
290         addi    r1,r1,STACKFRAMESIZE
291         b       .Lnonvmx_copy
293 #ifdef CONFIG_ALTIVEC
294 .Lvmx_copy:
295         mflr    r0
296         std     r0,16(r1)
297         stdu    r1,-STACKFRAMESIZE(r1)
298         bl      enter_vmx_usercopy
299         cmpwi   cr1,r3,0
300         ld      r0,STACKFRAMESIZE+16(r1)
301         ld      r3,STK_REG(R31)(r1)
302         ld      r4,STK_REG(R30)(r1)
303         ld      r5,STK_REG(R29)(r1)
304         mtlr    r0
306         /*
307          * We prefetch both the source and destination using enhanced touch
308          * instructions. We use a stream ID of 0 for the load side and
309          * 1 for the store side.
310          */
311         clrrdi  r6,r4,7
312         clrrdi  r9,r3,7
313         ori     r9,r9,1         /* stream=1 */
315         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
316         cmpldi  r7,0x3FF
317         ble     1f
318         li      r7,0x3FF
319 1:      lis     r0,0x0E00       /* depth=7 */
320         sldi    r7,r7,7
321         or      r7,r7,r0
322         ori     r10,r7,1        /* stream=1 */
324         lis     r8,0x8000       /* GO=1 */
325         clrldi  r8,r8,32
327 .machine push
328 .machine "power4"
329         /* setup read stream 0 */
330         dcbt    r0,r6,0b01000   /* addr from */
331         dcbt    r0,r7,0b01010   /* length and depth from */
332         /* setup write stream 1 */
333         dcbtst  r0,r9,0b01000   /* addr to */
334         dcbtst  r0,r10,0b01010  /* length and depth to */
335         eieio
336         dcbt    r0,r8,0b01010   /* all streams GO */
337 .machine pop
339         beq     cr1,.Lunwind_stack_nonvmx_copy
341         /*
342          * If source and destination are not relatively aligned we use a
343          * slower permute loop.
344          */
345         xor     r6,r4,r3
346         rldicl. r6,r6,0,(64-4)
347         bne     .Lvmx_unaligned_copy
349         /* Get the destination 16B aligned */
350         neg     r6,r3
351         mtocrf  0x01,r6
352         clrldi  r6,r6,(64-4)
354         bf      cr7*4+3,1f
355 err3;   lbz     r0,0(r4)
356         addi    r4,r4,1
357 err3;   stb     r0,0(r3)
358         addi    r3,r3,1
360 1:      bf      cr7*4+2,2f
361 err3;   lhz     r0,0(r4)
362         addi    r4,r4,2
363 err3;   sth     r0,0(r3)
364         addi    r3,r3,2
366 2:      bf      cr7*4+1,3f
367 err3;   lwz     r0,0(r4)
368         addi    r4,r4,4
369 err3;   stw     r0,0(r3)
370         addi    r3,r3,4
372 3:      bf      cr7*4+0,4f
373 err3;   ld      r0,0(r4)
374         addi    r4,r4,8
375 err3;   std     r0,0(r3)
376         addi    r3,r3,8
378 4:      sub     r5,r5,r6
380         /* Get the desination 128B aligned */
381         neg     r6,r3
382         srdi    r7,r6,4
383         mtocrf  0x01,r7
384         clrldi  r6,r6,(64-7)
386         li      r9,16
387         li      r10,32
388         li      r11,48
390         bf      cr7*4+3,5f
391 err3;   lvx     vr1,r0,r4
392         addi    r4,r4,16
393 err3;   stvx    vr1,r0,r3
394         addi    r3,r3,16
396 5:      bf      cr7*4+2,6f
397 err3;   lvx     vr1,r0,r4
398 err3;   lvx     vr0,r4,r9
399         addi    r4,r4,32
400 err3;   stvx    vr1,r0,r3
401 err3;   stvx    vr0,r3,r9
402         addi    r3,r3,32
404 6:      bf      cr7*4+1,7f
405 err3;   lvx     vr3,r0,r4
406 err3;   lvx     vr2,r4,r9
407 err3;   lvx     vr1,r4,r10
408 err3;   lvx     vr0,r4,r11
409         addi    r4,r4,64
410 err3;   stvx    vr3,r0,r3
411 err3;   stvx    vr2,r3,r9
412 err3;   stvx    vr1,r3,r10
413 err3;   stvx    vr0,r3,r11
414         addi    r3,r3,64
416 7:      sub     r5,r5,r6
417         srdi    r6,r5,7
419         std     r14,STK_REG(R14)(r1)
420         std     r15,STK_REG(R15)(r1)
421         std     r16,STK_REG(R16)(r1)
423         li      r12,64
424         li      r14,80
425         li      r15,96
426         li      r16,112
428         mtctr   r6
430         /*
431          * Now do cacheline sized loads and stores. By this stage the
432          * cacheline stores are also cacheline aligned.
433          */
434         .align  5
436 err4;   lvx     vr7,r0,r4
437 err4;   lvx     vr6,r4,r9
438 err4;   lvx     vr5,r4,r10
439 err4;   lvx     vr4,r4,r11
440 err4;   lvx     vr3,r4,r12
441 err4;   lvx     vr2,r4,r14
442 err4;   lvx     vr1,r4,r15
443 err4;   lvx     vr0,r4,r16
444         addi    r4,r4,128
445 err4;   stvx    vr7,r0,r3
446 err4;   stvx    vr6,r3,r9
447 err4;   stvx    vr5,r3,r10
448 err4;   stvx    vr4,r3,r11
449 err4;   stvx    vr3,r3,r12
450 err4;   stvx    vr2,r3,r14
451 err4;   stvx    vr1,r3,r15
452 err4;   stvx    vr0,r3,r16
453         addi    r3,r3,128
454         bdnz    8b
456         ld      r14,STK_REG(R14)(r1)
457         ld      r15,STK_REG(R15)(r1)
458         ld      r16,STK_REG(R16)(r1)
460         /* Up to 127B to go */
461         clrldi  r5,r5,(64-7)
462         srdi    r6,r5,4
463         mtocrf  0x01,r6
465         bf      cr7*4+1,9f
466 err3;   lvx     vr3,r0,r4
467 err3;   lvx     vr2,r4,r9
468 err3;   lvx     vr1,r4,r10
469 err3;   lvx     vr0,r4,r11
470         addi    r4,r4,64
471 err3;   stvx    vr3,r0,r3
472 err3;   stvx    vr2,r3,r9
473 err3;   stvx    vr1,r3,r10
474 err3;   stvx    vr0,r3,r11
475         addi    r3,r3,64
477 9:      bf      cr7*4+2,10f
478 err3;   lvx     vr1,r0,r4
479 err3;   lvx     vr0,r4,r9
480         addi    r4,r4,32
481 err3;   stvx    vr1,r0,r3
482 err3;   stvx    vr0,r3,r9
483         addi    r3,r3,32
485 10:     bf      cr7*4+3,11f
486 err3;   lvx     vr1,r0,r4
487         addi    r4,r4,16
488 err3;   stvx    vr1,r0,r3
489         addi    r3,r3,16
491         /* Up to 15B to go */
492 11:     clrldi  r5,r5,(64-4)
493         mtocrf  0x01,r5
494         bf      cr7*4+0,12f
495 err3;   ld      r0,0(r4)
496         addi    r4,r4,8
497 err3;   std     r0,0(r3)
498         addi    r3,r3,8
500 12:     bf      cr7*4+1,13f
501 err3;   lwz     r0,0(r4)
502         addi    r4,r4,4
503 err3;   stw     r0,0(r3)
504         addi    r3,r3,4
506 13:     bf      cr7*4+2,14f
507 err3;   lhz     r0,0(r4)
508         addi    r4,r4,2
509 err3;   sth     r0,0(r3)
510         addi    r3,r3,2
512 14:     bf      cr7*4+3,15f
513 err3;   lbz     r0,0(r4)
514 err3;   stb     r0,0(r3)
516 15:     addi    r1,r1,STACKFRAMESIZE
517         b       exit_vmx_usercopy       /* tail call optimise */
519 .Lvmx_unaligned_copy:
520         /* Get the destination 16B aligned */
521         neg     r6,r3
522         mtocrf  0x01,r6
523         clrldi  r6,r6,(64-4)
525         bf      cr7*4+3,1f
526 err3;   lbz     r0,0(r4)
527         addi    r4,r4,1
528 err3;   stb     r0,0(r3)
529         addi    r3,r3,1
531 1:      bf      cr7*4+2,2f
532 err3;   lhz     r0,0(r4)
533         addi    r4,r4,2
534 err3;   sth     r0,0(r3)
535         addi    r3,r3,2
537 2:      bf      cr7*4+1,3f
538 err3;   lwz     r0,0(r4)
539         addi    r4,r4,4
540 err3;   stw     r0,0(r3)
541         addi    r3,r3,4
543 3:      bf      cr7*4+0,4f
544 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
545 err3;   lwz     r7,4(r4)
546         addi    r4,r4,8
547 err3;   stw     r0,0(r3)
548 err3;   stw     r7,4(r3)
549         addi    r3,r3,8
551 4:      sub     r5,r5,r6
553         /* Get the desination 128B aligned */
554         neg     r6,r3
555         srdi    r7,r6,4
556         mtocrf  0x01,r7
557         clrldi  r6,r6,(64-7)
559         li      r9,16
560         li      r10,32
561         li      r11,48
563         LVS(vr16,0,r4)          /* Setup permute control vector */
564 err3;   lvx     vr0,0,r4
565         addi    r4,r4,16
567         bf      cr7*4+3,5f
568 err3;   lvx     vr1,r0,r4
569         VPERM(vr8,vr0,vr1,vr16)
570         addi    r4,r4,16
571 err3;   stvx    vr8,r0,r3
572         addi    r3,r3,16
573         vor     vr0,vr1,vr1
575 5:      bf      cr7*4+2,6f
576 err3;   lvx     vr1,r0,r4
577         VPERM(vr8,vr0,vr1,vr16)
578 err3;   lvx     vr0,r4,r9
579         VPERM(vr9,vr1,vr0,vr16)
580         addi    r4,r4,32
581 err3;   stvx    vr8,r0,r3
582 err3;   stvx    vr9,r3,r9
583         addi    r3,r3,32
585 6:      bf      cr7*4+1,7f
586 err3;   lvx     vr3,r0,r4
587         VPERM(vr8,vr0,vr3,vr16)
588 err3;   lvx     vr2,r4,r9
589         VPERM(vr9,vr3,vr2,vr16)
590 err3;   lvx     vr1,r4,r10
591         VPERM(vr10,vr2,vr1,vr16)
592 err3;   lvx     vr0,r4,r11
593         VPERM(vr11,vr1,vr0,vr16)
594         addi    r4,r4,64
595 err3;   stvx    vr8,r0,r3
596 err3;   stvx    vr9,r3,r9
597 err3;   stvx    vr10,r3,r10
598 err3;   stvx    vr11,r3,r11
599         addi    r3,r3,64
601 7:      sub     r5,r5,r6
602         srdi    r6,r5,7
604         std     r14,STK_REG(R14)(r1)
605         std     r15,STK_REG(R15)(r1)
606         std     r16,STK_REG(R16)(r1)
608         li      r12,64
609         li      r14,80
610         li      r15,96
611         li      r16,112
613         mtctr   r6
615         /*
616          * Now do cacheline sized loads and stores. By this stage the
617          * cacheline stores are also cacheline aligned.
618          */
619         .align  5
621 err4;   lvx     vr7,r0,r4
622         VPERM(vr8,vr0,vr7,vr16)
623 err4;   lvx     vr6,r4,r9
624         VPERM(vr9,vr7,vr6,vr16)
625 err4;   lvx     vr5,r4,r10
626         VPERM(vr10,vr6,vr5,vr16)
627 err4;   lvx     vr4,r4,r11
628         VPERM(vr11,vr5,vr4,vr16)
629 err4;   lvx     vr3,r4,r12
630         VPERM(vr12,vr4,vr3,vr16)
631 err4;   lvx     vr2,r4,r14
632         VPERM(vr13,vr3,vr2,vr16)
633 err4;   lvx     vr1,r4,r15
634         VPERM(vr14,vr2,vr1,vr16)
635 err4;   lvx     vr0,r4,r16
636         VPERM(vr15,vr1,vr0,vr16)
637         addi    r4,r4,128
638 err4;   stvx    vr8,r0,r3
639 err4;   stvx    vr9,r3,r9
640 err4;   stvx    vr10,r3,r10
641 err4;   stvx    vr11,r3,r11
642 err4;   stvx    vr12,r3,r12
643 err4;   stvx    vr13,r3,r14
644 err4;   stvx    vr14,r3,r15
645 err4;   stvx    vr15,r3,r16
646         addi    r3,r3,128
647         bdnz    8b
649         ld      r14,STK_REG(R14)(r1)
650         ld      r15,STK_REG(R15)(r1)
651         ld      r16,STK_REG(R16)(r1)
653         /* Up to 127B to go */
654         clrldi  r5,r5,(64-7)
655         srdi    r6,r5,4
656         mtocrf  0x01,r6
658         bf      cr7*4+1,9f
659 err3;   lvx     vr3,r0,r4
660         VPERM(vr8,vr0,vr3,vr16)
661 err3;   lvx     vr2,r4,r9
662         VPERM(vr9,vr3,vr2,vr16)
663 err3;   lvx     vr1,r4,r10
664         VPERM(vr10,vr2,vr1,vr16)
665 err3;   lvx     vr0,r4,r11
666         VPERM(vr11,vr1,vr0,vr16)
667         addi    r4,r4,64
668 err3;   stvx    vr8,r0,r3
669 err3;   stvx    vr9,r3,r9
670 err3;   stvx    vr10,r3,r10
671 err3;   stvx    vr11,r3,r11
672         addi    r3,r3,64
674 9:      bf      cr7*4+2,10f
675 err3;   lvx     vr1,r0,r4
676         VPERM(vr8,vr0,vr1,vr16)
677 err3;   lvx     vr0,r4,r9
678         VPERM(vr9,vr1,vr0,vr16)
679         addi    r4,r4,32
680 err3;   stvx    vr8,r0,r3
681 err3;   stvx    vr9,r3,r9
682         addi    r3,r3,32
684 10:     bf      cr7*4+3,11f
685 err3;   lvx     vr1,r0,r4
686         VPERM(vr8,vr0,vr1,vr16)
687         addi    r4,r4,16
688 err3;   stvx    vr8,r0,r3
689         addi    r3,r3,16
691         /* Up to 15B to go */
692 11:     clrldi  r5,r5,(64-4)
693         addi    r4,r4,-16       /* Unwind the +16 load offset */
694         mtocrf  0x01,r5
695         bf      cr7*4+0,12f
696 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
697 err3;   lwz     r6,4(r4)
698         addi    r4,r4,8
699 err3;   stw     r0,0(r3)
700 err3;   stw     r6,4(r3)
701         addi    r3,r3,8
703 12:     bf      cr7*4+1,13f
704 err3;   lwz     r0,0(r4)
705         addi    r4,r4,4
706 err3;   stw     r0,0(r3)
707         addi    r3,r3,4
709 13:     bf      cr7*4+2,14f
710 err3;   lhz     r0,0(r4)
711         addi    r4,r4,2
712 err3;   sth     r0,0(r3)
713         addi    r3,r3,2
715 14:     bf      cr7*4+3,15f
716 err3;   lbz     r0,0(r4)
717 err3;   stb     r0,0(r3)
719 15:     addi    r1,r1,STACKFRAMESIZE
720         b       exit_vmx_usercopy       /* tail call optimise */
721 #endif /* CONFiG_ALTIVEC */