Merge branch 'drm-fixes' of git://people.freedesktop.org/~airlied/linux
[linux/fpc-iii.git] / arch / powerpc / lib / copyuser_power7.S
blobf9ede7c6606e54d36f4b54fddc5b3101085ee5ef
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software
14  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15  *
16  * Copyright (C) IBM Corporation, 2011
17  *
18  * Author: Anton Blanchard <anton@au.ibm.com>
19  */
20 #include <asm/ppc_asm.h>
/*
 * err1: tag the instruction that follows the macro invocation.
 *
 * Drops local label 100 at the current location and emits one entry into
 * the __ex_table section: the address of that label paired with .Ldo_err1.
 * .previous then switches back to the section that was active before.
 * NOTE(review): the entry is presumably looked up by the kernel's fault
 * fixup code so that a faulting access resumes at .Ldo_err1 - confirm
 * against the exception table consumer.
 */
22         .macro err1
23 100:
24         .section __ex_table,"a"
25         .align 3
26         .llong 100b,.Ldo_err1
27         .previous
28         .endm
/*
 * err2: same shape as err1, but the entry pairs the tagged instruction with
 * .Ldo_err2, the handler that reloads r14-r22 and pops the stack frame
 * before retrying.
 */
30         .macro err2
31 200:
32         .section __ex_table,"a"
33         .align 3
34         .llong 200b,.Ldo_err2
35         .previous
36         .endm
38 #ifdef CONFIG_ALTIVEC
/*
 * err3: exception table entry pairing the following instruction (local
 * label 300) with .Ldo_err3. Used by the VMX paths outside the main loop.
 */
39         .macro err3
40 300:
41         .section __ex_table,"a"
42         .align 3
43         .llong 300b,.Ldo_err3
44         .previous
45         .endm
/*
 * err4: exception table entry pairing the following instruction (local
 * label 400) with .Ldo_err4. Used inside the VMX cacheline loops, where
 * r14-r16 hold index offsets and their original values sit on the stack.
 */
47         .macro err4
48 400:
49         .section __ex_table,"a"
50         .align 3
51         .llong 400b,.Ldo_err4
52         .previous
53         .endm
/*
 * .Ldo_err4: reload r14-r16 from the stack frame, then fall through.
 * .Ldo_err3: call .exit_vmx_usercopy, reload the saved link register value
 * from STACKFRAMESIZE+16(r1) and continue at .Lexit, which pops the frame.
 */
56 .Ldo_err4:
57         ld      r16,STK_REG(R16)(r1)
58         ld      r15,STK_REG(R15)(r1)
59         ld      r14,STK_REG(R14)(r1)
60 .Ldo_err3:
61         bl      .exit_vmx_usercopy
62         ld      r0,STACKFRAMESIZE+16(r1)
63         mtlr    r0                      /* the bl above overwrote LR */
64         b       .Lexit
65 #endif /* CONFIG_ALTIVEC */
/*
 * Fault handlers for the integer copy paths. The three labels form a
 * fall-through chain:
 *
 *   .Ldo_err2  reload r14-r22 from the stack frame
 *   .Lexit     pop the STACKFRAMESIZE frame
 *   .Ldo_err1  reload r3, r4, r5 from 48/56/64(r1) - the slots written at
 *              function entry - and branch to __copy_tofrom_user_base
 *
 * The copy is therefore restarted with the original arguments by
 * __copy_tofrom_user_base, whose return value becomes the result.
 */
67 .Ldo_err2:
68         ld      r22,STK_REG(R22)(r1)
69         ld      r21,STK_REG(R21)(r1)
70         ld      r20,STK_REG(R20)(r1)
71         ld      r19,STK_REG(R19)(r1)
72         ld      r18,STK_REG(R18)(r1)
73         ld      r17,STK_REG(R17)(r1)
74         ld      r16,STK_REG(R16)(r1)
75         ld      r15,STK_REG(R15)(r1)
76         ld      r14,STK_REG(R14)(r1)
77 .Lexit:
78         addi    r1,r1,STACKFRAMESIZE
79 .Ldo_err1:
80         ld      r3,48(r1)
81         ld      r4,56(r1)
82         ld      r5,64(r1)
83         b       __copy_tofrom_user_base
/*
 * __copy_tofrom_user_power7
 *
 * In:   r3 = address stored to, r4 = address loaded from, r5 = byte count
 *       (as used by the load/store sequences below).
 * Out:  r3 = 0 when the copy runs to completion; on a fault the result is
 *       whatever __copy_tofrom_user_base returns for the original arguments.
 *
 * The three arguments are stored at 48/56/64(r1) before any copying so the
 * fault handlers can reload them.
 * NOTE(review): these offsets are above the current stack pointer, i.e.
 * presumably in the caller-provided parameter save area - confirm against
 * the ABI in use.
 *
 * Dispatch: fewer than 16 bytes goes to .Lshort_copy; with CONFIG_ALTIVEC,
 * more than 4096 bytes goes to .Lvmx_copy; everything else falls through to
 * .Lnonvmx_copy.
 */
86 _GLOBAL(__copy_tofrom_user_power7)
87 #ifdef CONFIG_ALTIVEC
88         cmpldi  r5,16                   /* cr0: short copy? */
89         cmpldi  cr1,r5,4096             /* cr1: large enough for VMX? */
91         std     r3,48(r1)
92         std     r4,56(r1)
93         std     r5,64(r1)
95         blt     .Lshort_copy
96         bgt     cr1,.Lvmx_copy
97 #else
98         cmpldi  r5,16
100         std     r3,48(r1)
101         std     r4,56(r1)
102         std     r5,64(r1)
104         blt     .Lshort_copy
105 #endif
/*
 * Integer (non-VMX) copy path.
 *
 * r3 = store address, r4 = load address, r5 = bytes remaining.
 *
 *  1. Copy 1/2/4 bytes as needed so that r4 becomes 8-byte aligned.
 *  2. If at least 128 bytes remain, build a stack frame, save r14-r22 and
 *     the link register, and copy 128 bytes per iteration with CTR as the
 *     loop counter. Loop accesses are tagged err2 because the non-volatile
 *     registers and the frame must be restored on a fault.
 *  3. Copy the remaining 64/32/16 byte chunks, then fall through to
 *     .Lshort_copy with r5 reduced to the final 0-15 bytes.
 */
107 .Lnonvmx_copy:
108         /* Get the source 8B aligned */
109         neg     r6,r4
110         mtocrf  0x01,r6                 /* low 4 bits of -src into cr7 */
111         clrldi  r6,r6,(64-3)            /* r6 = bytes needed to reach 8B alignment */
113         bf      cr7*4+3,1f
114 err1;   lbz     r0,0(r4)
115         addi    r4,r4,1
116 err1;   stb     r0,0(r3)
117         addi    r3,r3,1
119 1:      bf      cr7*4+2,2f
120 err1;   lhz     r0,0(r4)
121         addi    r4,r4,2
122 err1;   sth     r0,0(r3)
123         addi    r3,r3,2
125 2:      bf      cr7*4+1,3f
126 err1;   lwz     r0,0(r4)
127         addi    r4,r4,4
128 err1;   stw     r0,0(r3)
129         addi    r3,r3,4
131 3:      sub     r5,r5,r6                /* account for the alignment bytes */
132         cmpldi  r5,128
133         blt     5f                      /* less than one cacheline left */
135         mflr    r0
136         stdu    r1,-STACKFRAMESIZE(r1)
137         std     r14,STK_REG(R14)(r1)
138         std     r15,STK_REG(R15)(r1)
139         std     r16,STK_REG(R16)(r1)
140         std     r17,STK_REG(R17)(r1)
141         std     r18,STK_REG(R18)(r1)
142         std     r19,STK_REG(R19)(r1)
143         std     r20,STK_REG(R20)(r1)
144         std     r21,STK_REG(R21)(r1)
145         std     r22,STK_REG(R22)(r1)
146         std     r0,STACKFRAMESIZE+16(r1)
148         srdi    r6,r5,7                 /* number of 128B blocks */
149         mtctr   r6
151         /* Now do cacheline (128B) sized loads and stores. */
152         .align  5
            /* Loop head: target of the bdnz 4b at the bottom of the loop. */
153 4:
154 err2;   ld      r0,0(r4)
155 err2;   ld      r6,8(r4)
156 err2;   ld      r7,16(r4)
157 err2;   ld      r8,24(r4)
158 err2;   ld      r9,32(r4)
159 err2;   ld      r10,40(r4)
160 err2;   ld      r11,48(r4)
161 err2;   ld      r12,56(r4)
162 err2;   ld      r14,64(r4)
163 err2;   ld      r15,72(r4)
164 err2;   ld      r16,80(r4)
165 err2;   ld      r17,88(r4)
166 err2;   ld      r18,96(r4)
167 err2;   ld      r19,104(r4)
168 err2;   ld      r20,112(r4)
169 err2;   ld      r21,120(r4)
170         addi    r4,r4,128
171 err2;   std     r0,0(r3)
172 err2;   std     r6,8(r3)
173 err2;   std     r7,16(r3)
174 err2;   std     r8,24(r3)
175 err2;   std     r9,32(r3)
176 err2;   std     r10,40(r3)
177 err2;   std     r11,48(r3)
178 err2;   std     r12,56(r3)
179 err2;   std     r14,64(r3)
180 err2;   std     r15,72(r3)
181 err2;   std     r16,80(r3)
182 err2;   std     r17,88(r3)
183 err2;   std     r18,96(r3)
184 err2;   std     r19,104(r3)
185 err2;   std     r20,112(r3)
186 err2;   std     r21,120(r3)
187         addi    r3,r3,128
188         bdnz    4b
190         clrldi  r5,r5,(64-7)            /* r5 = bytes left after the 128B blocks */
192         ld      r14,STK_REG(R14)(r1)
193         ld      r15,STK_REG(R15)(r1)
194         ld      r16,STK_REG(R16)(r1)
195         ld      r17,STK_REG(R17)(r1)
196         ld      r18,STK_REG(R18)(r1)
197         ld      r19,STK_REG(R19)(r1)
198         ld      r20,STK_REG(R20)(r1)
199         ld      r21,STK_REG(R21)(r1)
200         ld      r22,STK_REG(R22)(r1)
201         addi    r1,r1,STACKFRAMESIZE
203         /* Up to 127B to go */
204 5:      srdi    r6,r5,4
205         mtocrf  0x01,r6                 /* cr7 bits select the 64/32/16 byte chunks */
207 6:      bf      cr7*4+1,7f
208 err1;   ld      r0,0(r4)
209 err1;   ld      r6,8(r4)
210 err1;   ld      r7,16(r4)
211 err1;   ld      r8,24(r4)
212 err1;   ld      r9,32(r4)
213 err1;   ld      r10,40(r4)
214 err1;   ld      r11,48(r4)
215 err1;   ld      r12,56(r4)
216         addi    r4,r4,64
217 err1;   std     r0,0(r3)
218 err1;   std     r6,8(r3)
219 err1;   std     r7,16(r3)
220 err1;   std     r8,24(r3)
221 err1;   std     r9,32(r3)
222 err1;   std     r10,40(r3)
223 err1;   std     r11,48(r3)
224 err1;   std     r12,56(r3)
225         addi    r3,r3,64
227         /* Up to 63B to go */
228 7:      bf      cr7*4+2,8f
229 err1;   ld      r0,0(r4)
230 err1;   ld      r6,8(r4)
231 err1;   ld      r7,16(r4)
232 err1;   ld      r8,24(r4)
233         addi    r4,r4,32
234 err1;   std     r0,0(r3)
235 err1;   std     r6,8(r3)
236 err1;   std     r7,16(r3)
237 err1;   std     r8,24(r3)
238         addi    r3,r3,32
240         /* Up to 31B to go */
241 8:      bf      cr7*4+3,9f
242 err1;   ld      r0,0(r4)
243 err1;   ld      r6,8(r4)
244         addi    r4,r4,16
245 err1;   std     r0,0(r3)
246 err1;   std     r6,8(r3)
247         addi    r3,r3,16
249 9:      clrldi  r5,r5,(64-4)            /* r5 = final 0-15 bytes */
251         /* Up to 15B to go */
            /*
             * Entered by fall-through from the integer path and directly
             * from function entry for lengths below 16. The low four bits
             * of r5 are placed in cr7 and each bit selects one of the
             * 8/4/2/1 byte copies below. Returns with r3 = 0.
             */
252 .Lshort_copy:
253         mtocrf  0x01,r5
254         bf      cr7*4+0,12f
255 err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
256 err1;   lwz     r6,4(r4)
257         addi    r4,r4,8
258 err1;   stw     r0,0(r3)
259 err1;   stw     r6,4(r3)
260         addi    r3,r3,8
262 12:     bf      cr7*4+1,13f
263 err1;   lwz     r0,0(r4)
264         addi    r4,r4,4
265 err1;   stw     r0,0(r3)
266         addi    r3,r3,4
268 13:     bf      cr7*4+2,14f
269 err1;   lhz     r0,0(r4)
270         addi    r4,r4,2
271 err1;   sth     r0,0(r3)
272         addi    r3,r3,2
274 14:     bf      cr7*4+3,15f
275 err1;   lbz     r0,0(r4)
276 err1;   stb     r0,0(r3)
278 15:     li      r3,0                    /* return value: 0 */
279         blr
            /*
             * Target of the beq in .Lvmx_copy: pops the frame that
             * .Lvmx_copy pushed and continues with the integer copy path.
             */
281 .Lunwind_stack_nonvmx_copy:
282         addi    r1,r1,STACKFRAMESIZE
283         b       .Lnonvmx_copy
285 #ifdef CONFIG_ALTIVEC
/*
 * VMX copy entry, taken from function entry when r5 > 4096.
 *
 * Saves the link register, pushes a stack frame and calls
 * .enter_vmx_usercopy. Its return value in r3 is compared against zero in
 * cr0; a zero result sends control to .Lunwind_stack_nonvmx_copy (integer
 * path) once the prefetch streams below have been started.
 *
 * cr0 must stay untouched between the cmpwi and the beq at the end of this
 * block, which is why the cacheline-count cap is compared in cr1.
 *
 * r3, r4, r5 are reloaded from the slots written at function entry; the
 * extra STACKFRAMESIZE accounts for the frame pushed here.
 */
286 .Lvmx_copy:
287         mflr    r0
288         std     r0,16(r1)
289         stdu    r1,-STACKFRAMESIZE(r1)
290         bl      .enter_vmx_usercopy
291         cmpwi   r3,0                    /* cr0 consumed by the beq below */
292         ld      r0,STACKFRAMESIZE+16(r1)
293         ld      r3,STACKFRAMESIZE+48(r1)
294         ld      r4,STACKFRAMESIZE+56(r1)
295         ld      r5,STACKFRAMESIZE+64(r1)
296         mtlr    r0
329         /*
330          * We prefetch both the source and destination using enhanced touch
331          * instructions. We use a stream ID of 0 for the load side and
332          * 1 for the store side.
333          */
334         clrrdi  r6,r4,7
335         clrrdi  r9,r3,7
336         ori     r9,r9,1         /* stream=1 */
338         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
339         cmpldi  cr1,r7,0x3FF
340         ble     cr1,1f
341         li      r7,0x3FF
342 1:      lis     r0,0x0E00       /* depth=7 */
343         sldi    r7,r7,7
344         or      r7,r7,r0
345         ori     r10,r7,1        /* stream=1 */
347         lis     r8,0x8000       /* GO=1 */
348         clrldi  r8,r8,32
350 .machine push
351 .machine "power4"
352         dcbt    r0,r6,0b01000
353         dcbt    r0,r7,0b01010
354         dcbtst  r0,r9,0b01000
355         dcbtst  r0,r10,0b01010
356         eieio
357         dcbt    r0,r8,0b01010   /* GO */
358 .machine pop
            /* Zero from .enter_vmx_usercopy: drop the frame, use the integer path. */
360         beq     .Lunwind_stack_nonvmx_copy
362         /*
363          * If source and destination are not relatively aligned we use a
364          * slower permute loop.
365          */
366         xor     r6,r4,r3
367         rldicl. r6,r6,0,(64-4)          /* low 4 address bits differ? */
368         bne     .Lvmx_unaligned_copy
            /*
             * Relatively aligned VMX copy: r3 = store address, r4 = load
             * address, r5 = bytes remaining. Accesses outside the cacheline
             * loop are tagged err3; accesses inside it are tagged err4
             * because r14-r16 are in use as index registers there.
             */
370         /* Get the destination 16B aligned */
371         neg     r6,r3
372         mtocrf  0x01,r6
373         clrldi  r6,r6,(64-4)            /* r6 = bytes needed to reach 16B alignment */
375         bf      cr7*4+3,1f
376 err3;   lbz     r0,0(r4)
377         addi    r4,r4,1
378 err3;   stb     r0,0(r3)
379         addi    r3,r3,1
381 1:      bf      cr7*4+2,2f
382 err3;   lhz     r0,0(r4)
383         addi    r4,r4,2
384 err3;   sth     r0,0(r3)
385         addi    r3,r3,2
387 2:      bf      cr7*4+1,3f
388 err3;   lwz     r0,0(r4)
389         addi    r4,r4,4
390 err3;   stw     r0,0(r3)
391         addi    r3,r3,4
393 3:      bf      cr7*4+0,4f
394 err3;   ld      r0,0(r4)
395         addi    r4,r4,8
396 err3;   std     r0,0(r3)
397         addi    r3,r3,8
399 4:      sub     r5,r5,r6
401         /* Get the destination 128B aligned */
402         neg     r6,r3
403         srdi    r7,r6,4
404         mtocrf  0x01,r7                 /* cr7 bits select the 16/32/64 byte steps */
405         clrldi  r6,r6,(64-7)            /* r6 = bytes needed to reach 128B alignment */
407         li      r9,16
408         li      r10,32
409         li      r11,48
411         bf      cr7*4+3,5f
412 err3;   lvx     vr1,r0,r4
413         addi    r4,r4,16
414 err3;   stvx    vr1,r0,r3
415         addi    r3,r3,16
417 5:      bf      cr7*4+2,6f
418 err3;   lvx     vr1,r0,r4
419 err3;   lvx     vr0,r4,r9
420         addi    r4,r4,32
421 err3;   stvx    vr1,r0,r3
422 err3;   stvx    vr0,r3,r9
423         addi    r3,r3,32
425 6:      bf      cr7*4+1,7f
426 err3;   lvx     vr3,r0,r4
427 err3;   lvx     vr2,r4,r9
428 err3;   lvx     vr1,r4,r10
429 err3;   lvx     vr0,r4,r11
430         addi    r4,r4,64
431 err3;   stvx    vr3,r0,r3
432 err3;   stvx    vr2,r3,r9
433 err3;   stvx    vr1,r3,r10
434 err3;   stvx    vr0,r3,r11
435         addi    r3,r3,64
437 7:      sub     r5,r5,r6
438         srdi    r6,r5,7                 /* number of 128B blocks */
440         std     r14,STK_REG(R14)(r1)
441         std     r15,STK_REG(R15)(r1)
442         std     r16,STK_REG(R16)(r1)
444         li      r12,64
445         li      r14,80
446         li      r15,96
447         li      r16,112
449         mtctr   r6
451         /*
452          * Now do cacheline sized loads and stores. By this stage the
453          * cacheline stores are also cacheline aligned.
454          */
455         .align  5
            /* Loop head: target of the bdnz 8b at the bottom of the loop. */
456 8:
457 err4;   lvx     vr7,r0,r4
458 err4;   lvx     vr6,r4,r9
459 err4;   lvx     vr5,r4,r10
460 err4;   lvx     vr4,r4,r11
461 err4;   lvx     vr3,r4,r12
462 err4;   lvx     vr2,r4,r14
463 err4;   lvx     vr1,r4,r15
464 err4;   lvx     vr0,r4,r16
465         addi    r4,r4,128
466 err4;   stvx    vr7,r0,r3
467 err4;   stvx    vr6,r3,r9
468 err4;   stvx    vr5,r3,r10
469 err4;   stvx    vr4,r3,r11
470 err4;   stvx    vr3,r3,r12
471 err4;   stvx    vr2,r3,r14
472 err4;   stvx    vr1,r3,r15
473 err4;   stvx    vr0,r3,r16
474         addi    r3,r3,128
475         bdnz    8b
477         ld      r14,STK_REG(R14)(r1)
478         ld      r15,STK_REG(R15)(r1)
479         ld      r16,STK_REG(R16)(r1)
481         /* Up to 127B to go */
482         clrldi  r5,r5,(64-7)
483         srdi    r6,r5,4
484         mtocrf  0x01,r6
486         bf      cr7*4+1,9f
487 err3;   lvx     vr3,r0,r4
488 err3;   lvx     vr2,r4,r9
489 err3;   lvx     vr1,r4,r10
490 err3;   lvx     vr0,r4,r11
491         addi    r4,r4,64
492 err3;   stvx    vr3,r0,r3
493 err3;   stvx    vr2,r3,r9
494 err3;   stvx    vr1,r3,r10
495 err3;   stvx    vr0,r3,r11
496         addi    r3,r3,64
498 9:      bf      cr7*4+2,10f
499 err3;   lvx     vr1,r0,r4
500 err3;   lvx     vr0,r4,r9
501         addi    r4,r4,32
502 err3;   stvx    vr1,r0,r3
503 err3;   stvx    vr0,r3,r9
504         addi    r3,r3,32
506 10:     bf      cr7*4+3,11f
507 err3;   lvx     vr1,r0,r4
508         addi    r4,r4,16
509 err3;   stvx    vr1,r0,r3
510         addi    r3,r3,16
512         /* Up to 15B to go */
513 11:     clrldi  r5,r5,(64-4)
514         mtocrf  0x01,r5
515         bf      cr7*4+0,12f
516 err3;   ld      r0,0(r4)
517         addi    r4,r4,8
518 err3;   std     r0,0(r3)
519         addi    r3,r3,8
521 12:     bf      cr7*4+1,13f
522 err3;   lwz     r0,0(r4)
523         addi    r4,r4,4
524 err3;   stw     r0,0(r3)
525         addi    r3,r3,4
527 13:     bf      cr7*4+2,14f
528 err3;   lhz     r0,0(r4)
529         addi    r4,r4,2
530 err3;   sth     r0,0(r3)
531         addi    r3,r3,2
533 14:     bf      cr7*4+3,15f
534 err3;   lbz     r0,0(r4)
535 err3;   stb     r0,0(r3)
            /* Pop the frame pushed in .Lvmx_copy; .exit_vmx_usercopy returns to our caller. */
537 15:     addi    r1,r1,STACKFRAMESIZE
538         b       .exit_vmx_usercopy      /* tail call optimise */
/*
 * VMX copy for source and destination whose low four address bits differ.
 *
 * r3 = store address, r4 = load address, r5 = bytes remaining.
 *
 * After the destination is 16B aligned, lvsl builds a permute control
 * vector in vr16 from the source address. vr0 always carries the most
 * recently loaded source quadword, and each stored quadword is produced by
 * vperm from that carried quadword and the next one loaded. r4 therefore
 * runs 16 bytes ahead of the data actually consumed; this is undone before
 * the final sub-16 byte tail.
 *
 * Accesses outside the cacheline loop are tagged err3; accesses inside it
 * are tagged err4 because r14-r16 are in use as index registers there.
 */
540 .Lvmx_unaligned_copy:
541         /* Get the destination 16B aligned */
542         neg     r6,r3
543         mtocrf  0x01,r6
544         clrldi  r6,r6,(64-4)            /* r6 = bytes needed to reach 16B alignment */
546         bf      cr7*4+3,1f
547 err3;   lbz     r0,0(r4)
548         addi    r4,r4,1
549 err3;   stb     r0,0(r3)
550         addi    r3,r3,1
552 1:      bf      cr7*4+2,2f
553 err3;   lhz     r0,0(r4)
554         addi    r4,r4,2
555 err3;   sth     r0,0(r3)
556         addi    r3,r3,2
558 2:      bf      cr7*4+1,3f
559 err3;   lwz     r0,0(r4)
560         addi    r4,r4,4
561 err3;   stw     r0,0(r3)
562         addi    r3,r3,4
564 3:      bf      cr7*4+0,4f
565 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
566 err3;   lwz     r7,4(r4)
567         addi    r4,r4,8
568 err3;   stw     r0,0(r3)
569 err3;   stw     r7,4(r3)
570         addi    r3,r3,8
572 4:      sub     r5,r5,r6
574         /* Get the destination 128B aligned */
575         neg     r6,r3
576         srdi    r7,r6,4
577         mtocrf  0x01,r7                 /* cr7 bits select the 16/32/64 byte steps */
578         clrldi  r6,r6,(64-7)            /* r6 = bytes needed to reach 128B alignment */
580         li      r9,16
581         li      r10,32
582         li      r11,48
584         lvsl    vr16,0,r4       /* Setup permute control vector */
585 err3;   lvx     vr0,0,r4
586         addi    r4,r4,16
588         bf      cr7*4+3,5f
589 err3;   lvx     vr1,r0,r4
590         vperm   vr8,vr0,vr1,vr16
591         addi    r4,r4,16
592 err3;   stvx    vr8,r0,r3
593         addi    r3,r3,16
594         vor     vr0,vr1,vr1             /* carry the last loaded quadword in vr0 */
596 5:      bf      cr7*4+2,6f
597 err3;   lvx     vr1,r0,r4
598         vperm   vr8,vr0,vr1,vr16
599 err3;   lvx     vr0,r4,r9
600         vperm   vr9,vr1,vr0,vr16
601         addi    r4,r4,32
602 err3;   stvx    vr8,r0,r3
603 err3;   stvx    vr9,r3,r9
604         addi    r3,r3,32
606 6:      bf      cr7*4+1,7f
607 err3;   lvx     vr3,r0,r4
608         vperm   vr8,vr0,vr3,vr16
609 err3;   lvx     vr2,r4,r9
610         vperm   vr9,vr3,vr2,vr16
611 err3;   lvx     vr1,r4,r10
612         vperm   vr10,vr2,vr1,vr16
613 err3;   lvx     vr0,r4,r11
614         vperm   vr11,vr1,vr0,vr16
615         addi    r4,r4,64
616 err3;   stvx    vr8,r0,r3
617 err3;   stvx    vr9,r3,r9
618 err3;   stvx    vr10,r3,r10
619 err3;   stvx    vr11,r3,r11
620         addi    r3,r3,64
622 7:      sub     r5,r5,r6
623         srdi    r6,r5,7                 /* number of 128B blocks */
625         std     r14,STK_REG(R14)(r1)
626         std     r15,STK_REG(R15)(r1)
627         std     r16,STK_REG(R16)(r1)
629         li      r12,64
630         li      r14,80
631         li      r15,96
632         li      r16,112
634         mtctr   r6
636         /*
637          * Now do cacheline sized loads and stores. By this stage the
638          * cacheline stores are also cacheline aligned.
639          */
640         .align  5
            /* Loop head: target of the bdnz 8b at the bottom of the loop. */
641 8:
642 err4;   lvx     vr7,r0,r4
643         vperm   vr8,vr0,vr7,vr16
644 err4;   lvx     vr6,r4,r9
645         vperm   vr9,vr7,vr6,vr16
646 err4;   lvx     vr5,r4,r10
647         vperm   vr10,vr6,vr5,vr16
648 err4;   lvx     vr4,r4,r11
649         vperm   vr11,vr5,vr4,vr16
650 err4;   lvx     vr3,r4,r12
651         vperm   vr12,vr4,vr3,vr16
652 err4;   lvx     vr2,r4,r14
653         vperm   vr13,vr3,vr2,vr16
654 err4;   lvx     vr1,r4,r15
655         vperm   vr14,vr2,vr1,vr16
656 err4;   lvx     vr0,r4,r16
657         vperm   vr15,vr1,vr0,vr16
658         addi    r4,r4,128
659 err4;   stvx    vr8,r0,r3
660 err4;   stvx    vr9,r3,r9
661 err4;   stvx    vr10,r3,r10
662 err4;   stvx    vr11,r3,r11
663 err4;   stvx    vr12,r3,r12
664 err4;   stvx    vr13,r3,r14
665 err4;   stvx    vr14,r3,r15
666 err4;   stvx    vr15,r3,r16
667         addi    r3,r3,128
668         bdnz    8b
670         ld      r14,STK_REG(R14)(r1)
671         ld      r15,STK_REG(R15)(r1)
672         ld      r16,STK_REG(R16)(r1)
674         /* Up to 127B to go */
675         clrldi  r5,r5,(64-7)
676         srdi    r6,r5,4
677         mtocrf  0x01,r6
679         bf      cr7*4+1,9f
680 err3;   lvx     vr3,r0,r4
681         vperm   vr8,vr0,vr3,vr16
682 err3;   lvx     vr2,r4,r9
683         vperm   vr9,vr3,vr2,vr16
684 err3;   lvx     vr1,r4,r10
685         vperm   vr10,vr2,vr1,vr16
686 err3;   lvx     vr0,r4,r11
687         vperm   vr11,vr1,vr0,vr16
688         addi    r4,r4,64
689 err3;   stvx    vr8,r0,r3
690 err3;   stvx    vr9,r3,r9
691 err3;   stvx    vr10,r3,r10
692 err3;   stvx    vr11,r3,r11
693         addi    r3,r3,64
695 9:      bf      cr7*4+2,10f
696 err3;   lvx     vr1,r0,r4
697         vperm   vr8,vr0,vr1,vr16
698 err3;   lvx     vr0,r4,r9
699         vperm   vr9,vr1,vr0,vr16
700         addi    r4,r4,32
701 err3;   stvx    vr8,r0,r3
702 err3;   stvx    vr9,r3,r9
703         addi    r3,r3,32
705 10:     bf      cr7*4+3,11f
706 err3;   lvx     vr1,r0,r4
707         vperm   vr8,vr0,vr1,vr16
708         addi    r4,r4,16
709 err3;   stvx    vr8,r0,r3
710         addi    r3,r3,16
712         /* Up to 15B to go */
713 11:     clrldi  r5,r5,(64-4)
714         addi    r4,r4,-16       /* Unwind the +16 load offset */
715         mtocrf  0x01,r5
716         bf      cr7*4+0,12f
717 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
718 err3;   lwz     r6,4(r4)
719         addi    r4,r4,8
720 err3;   stw     r0,0(r3)
721 err3;   stw     r6,4(r3)
722         addi    r3,r3,8
724 12:     bf      cr7*4+1,13f
725 err3;   lwz     r0,0(r4)
726         addi    r4,r4,4
727 err3;   stw     r0,0(r3)
728         addi    r3,r3,4
730 13:     bf      cr7*4+2,14f
731 err3;   lhz     r0,0(r4)
732         addi    r4,r4,2
733 err3;   sth     r0,0(r3)
734         addi    r3,r3,2
736 14:     bf      cr7*4+3,15f
737 err3;   lbz     r0,0(r4)
738 err3;   stb     r0,0(r3)
            /* Pop the frame pushed in .Lvmx_copy; .exit_vmx_usercopy returns to our caller. */
740 15:     addi    r1,r1,STACKFRAMESIZE
741         b       .exit_vmx_usercopy      /* tail call optimise */
742 #endif /* CONFIG_ALTIVEC */