Merge tag 'v3.3.7' into 3.3/master
[zen-stable.git] / arch / powerpc / lib / copyuser_power7.S
blob497db7b23bb1be8be3518c12a6e0e7307ddb2fe0
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software
14  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15  *
16  * Copyright (C) IBM Corporation, 2011
17  *
18  * Author: Anton Blanchard <anton@au.ibm.com>
19  */
20 #include <asm/ppc_asm.h>
22 #define STACKFRAMESIZE  256     /* size of the frame pushed by each stdu r1,-STACKFRAMESIZE(r1) below */
23 #define STK_REG(i)      (112 + ((i)-14)*8)      /* frame offset of the save slot for register i: 112 for 14, 8 more per register; used below for r14..r22 */
25         .macro err1
            /*
             * Written as "err1; <insn>": drops local label 100 on <insn> and
             * emits an __ex_table record pairing that address with .Ldo_err1.
             * Used where no stack frame is active.
             */
26 100:
27         .section __ex_table,"a"
28         .align 3
29         .llong 100b,.Ldo_err1
30         .previous
31         .endm
33         .macro err2
            /*
             * Written as "err2; <insn>": drops local label 200 on <insn> and
             * emits an __ex_table record pairing that address with .Ldo_err2.
             * Used inside the scalar 128B loop, where r14-r22 have been saved
             * to the stack frame.
             */
34 200:
35         .section __ex_table,"a"
36         .align 3
37         .llong 200b,.Ldo_err2
38         .previous
39         .endm
41 #ifdef CONFIG_ALTIVEC
42         .macro err3
            /*
             * Written as "err3; <insn>": drops local label 300 on <insn> and
             * emits an __ex_table record pairing that address with .Ldo_err3.
             * Used in the VMX paths outside the 128B loops.
             */
43 300:
44         .section __ex_table,"a"
45         .align 3
46         .llong 300b,.Ldo_err3
47         .previous
48         .endm
50         .macro err4
            /*
             * Written as "err4; <insn>": drops local label 400 on <insn> and
             * emits an __ex_table record pairing that address with .Ldo_err4.
             * Used inside the VMX 128B loops, where r14-r16 hold index
             * constants and their original values sit in the stack frame.
             */
51 400:
52         .section __ex_table,"a"
53         .align 3
54         .llong 400b,.Ldo_err4
55         .previous
56         .endm
59 .Ldo_err4:
            /*
             * Fixup target recorded by err4 (VMX 128B loops). Those loops
             * overwrite r14-r16 with index constants after saving them to the
             * frame, so reload the saved values and fall into .Ldo_err3.
             */
60         ld      r16,STK_REG(r16)(r1)
61         ld      r15,STK_REG(r15)(r1)
62         ld      r14,STK_REG(r14)(r1)
63 .Ldo_err3:
            /*
             * Fixup target recorded by err3. Calls .exit_vmx_copy (defined
             * outside this file), reloads LR from the slot written by
             * .Lvmx_copy before it pushed its frame, then continues at .Lexit.
             */
64         bl      .exit_vmx_copy
65         ld      r0,STACKFRAMESIZE+16(r1)
66         mtlr    r0
67         b       .Lexit
68 #endif /* CONFIG_ALTIVEC */
70 .Ldo_err2:
            /*
             * Fixup target recorded by err2 (scalar 128B loop): reload r14-r22
             * from the save slots filled before the loop was entered.
             */
71         ld      r22,STK_REG(r22)(r1)
72         ld      r21,STK_REG(r21)(r1)
73         ld      r20,STK_REG(r20)(r1)
74         ld      r19,STK_REG(r19)(r1)
75         ld      r18,STK_REG(r18)(r1)
76         ld      r17,STK_REG(r17)(r1)
77         ld      r16,STK_REG(r16)(r1)
78         ld      r15,STK_REG(r15)(r1)
79         ld      r14,STK_REG(r14)(r1)
80 .Lexit:
81         addi    r1,r1,STACKFRAMESIZE    /* pop the frame; r1 is the entry-time stack pointer again */
82 .Ldo_err1:
            /*
             * Fixup target recorded by err1, and common tail of every handler:
             * reload r3/r4/r5 from the slots written at function entry and
             * branch to __copy_tofrom_user_base (defined outside this file)
             * with those original arguments.
             */
83         ld      r3,48(r1)
84         ld      r4,56(r1)
85         ld      r5,64(r1)
86         b       __copy_tofrom_user_base
89 _GLOBAL(__copy_tofrom_user_power7)
            /*
             * Register use visible in this file: r3 = pointer stored through
             * (destination), r4 = pointer loaded from (source), r5 = byte count.
             * The three values are stashed at 48/56/64(r1) so that the
             * .Ldo_err* handlers can reload them.
             *
             * Dispatch: r5 < 16 goes to .Lshort_copy; with CONFIG_ALTIVEC,
             * r5 > 4096 goes to .Lvmx_copy; everything else falls through to
             * .Lnonvmx_copy. The non-VMX path finishes with r3 = 0.
             */
90 #ifdef CONFIG_ALTIVEC
91         cmpldi  r5,16                   /* cr0: compare count with 16 */
92         cmpldi  cr1,r5,4096             /* cr1: compare count with 4096 */
94         std     r3,48(r1)
95         std     r4,56(r1)
96         std     r5,64(r1)
98         blt     .Lshort_copy
99         bgt     cr1,.Lvmx_copy
100 #else
101         cmpldi  r5,16
103         std     r3,48(r1)
104         std     r4,56(r1)
105         std     r5,64(r1)
107         blt     .Lshort_copy
108 #endif
110 .Lnonvmx_copy:
111         /* Get the source 8B aligned */
112         neg     r6,r4                   /* low 3 bits of -r4 = bytes up to the next 8B boundary */
113         mtocrf  0x01,r6                 /* low 4 bits of r6 -> cr7, tested by the bf instructions below */
114         clrldi  r6,r6,(64-3)            /* r6 = that byte count (0-7); subtracted from r5 at 3: */
116         bf      cr7*4+3,1f              /* skip the 1-byte step unless bit value 1 is set */
117 err1;   lbz     r0,0(r4)
118         addi    r4,r4,1
119 err1;   stb     r0,0(r3)
120         addi    r3,r3,1
122 1:      bf      cr7*4+2,2f              /* skip the 2-byte step unless bit value 2 is set */
123 err1;   lhz     r0,0(r4)
124         addi    r4,r4,2
125 err1;   sth     r0,0(r3)
126         addi    r3,r3,2
128 2:      bf      cr7*4+1,3f              /* skip the 4-byte step unless bit value 4 is set */
129 err1;   lwz     r0,0(r4)
130         addi    r4,r4,4
131 err1;   stw     r0,0(r3)
132         addi    r3,r3,4
134 3:      sub     r5,r5,r6                /* account for the alignment bytes just copied */
135         cmpldi  r5,128
136         blt     5f                      /* less than one 128B chunk left: no frame, no loop */
            /*
             * The 128B loop uses r14-r21 as data registers, so push a frame
             * and save r14-r22 in it (r22 is saved and restored although the
             * loop itself only touches r14-r21). LR is written to the slot at
             * 16 above the entry-time stack pointer.
             */
138         mflr    r0
139         stdu    r1,-STACKFRAMESIZE(r1)
140         std     r14,STK_REG(r14)(r1)
141         std     r15,STK_REG(r15)(r1)
142         std     r16,STK_REG(r16)(r1)
143         std     r17,STK_REG(r17)(r1)
144         std     r18,STK_REG(r18)(r1)
145         std     r19,STK_REG(r19)(r1)
146         std     r20,STK_REG(r20)(r1)
147         std     r21,STK_REG(r21)(r1)
148         std     r22,STK_REG(r22)(r1)
149         std     r0,STACKFRAMESIZE+16(r1)
151         srdi    r6,r5,7                 /* r6 = number of whole 128B chunks */
152         mtctr   r6                      /* loop counter for the bdnz at the end of the loop */
154         /* Now do cacheline (128B) sized loads and stores. */
            /*
             * On entry CTR holds the number of whole 128B chunks. Each pass
             * loads sixteen doublewords from r4 into r0, r6-r12 and r14-r21,
             * advances r4, stores them through r3 and advances r3. Every
             * access is tagged err2, so a fault is redirected to .Ldo_err2,
             * which restores r14-r22 from the frame.
             *
             * Numeric label 4 is the loop head: "bdnz 4b" at the bottom binds
             * to the nearest preceding definition of 4, which must be this one.
             */
155         .align  5
156 4:
157 err2;   ld      r0,0(r4)
158 err2;   ld      r6,8(r4)
159 err2;   ld      r7,16(r4)
160 err2;   ld      r8,24(r4)
161 err2;   ld      r9,32(r4)
162 err2;   ld      r10,40(r4)
163 err2;   ld      r11,48(r4)
164 err2;   ld      r12,56(r4)
165 err2;   ld      r14,64(r4)
166 err2;   ld      r15,72(r4)
167 err2;   ld      r16,80(r4)
168 err2;   ld      r17,88(r4)
169 err2;   ld      r18,96(r4)
170 err2;   ld      r19,104(r4)
171 err2;   ld      r20,112(r4)
172 err2;   ld      r21,120(r4)
173         addi    r4,r4,128
174 err2;   std     r0,0(r3)
175 err2;   std     r6,8(r3)
176 err2;   std     r7,16(r3)
177 err2;   std     r8,24(r3)
178 err2;   std     r9,32(r3)
179 err2;   std     r10,40(r3)
180 err2;   std     r11,48(r3)
181 err2;   std     r12,56(r3)
182 err2;   std     r14,64(r3)
183 err2;   std     r15,72(r3)
184 err2;   std     r16,80(r3)
185 err2;   std     r17,88(r3)
186 err2;   std     r18,96(r3)
187 err2;   std     r19,104(r3)
188 err2;   std     r20,112(r3)
189 err2;   std     r21,120(r3)
190         addi    r3,r3,128
191         bdnz    4b                      /* decrement CTR, loop while it is non-zero */
193         clrldi  r5,r5,(64-7)            /* r5 = bytes left after the 128B loop (low 7 bits of the count) */
            /* The loop is done with r14-r22: restore them and pop the frame. */
195         ld      r14,STK_REG(r14)(r1)
196         ld      r15,STK_REG(r15)(r1)
197         ld      r16,STK_REG(r16)(r1)
198         ld      r17,STK_REG(r17)(r1)
199         ld      r18,STK_REG(r18)(r1)
200         ld      r19,STK_REG(r19)(r1)
201         ld      r20,STK_REG(r20)(r1)
202         ld      r21,STK_REG(r21)(r1)
203         ld      r22,STK_REG(r22)(r1)
204         addi    r1,r1,STACKFRAMESIZE
206         /* Up to 127B to go */
207 5:      srdi    r6,r5,4                 /* r6 = remaining count in 16B units */
208         mtocrf  0x01,r6                 /* its low bits select the 64B / 32B / 16B steps below */
210 6:      bf      cr7*4+1,7f              /* 64B step */
211 err1;   ld      r0,0(r4)
212 err1;   ld      r6,8(r4)
213 err1;   ld      r7,16(r4)
214 err1;   ld      r8,24(r4)
215 err1;   ld      r9,32(r4)
216 err1;   ld      r10,40(r4)
217 err1;   ld      r11,48(r4)
218 err1;   ld      r12,56(r4)
219         addi    r4,r4,64
220 err1;   std     r0,0(r3)
221 err1;   std     r6,8(r3)
222 err1;   std     r7,16(r3)
223 err1;   std     r8,24(r3)
224 err1;   std     r9,32(r3)
225 err1;   std     r10,40(r3)
226 err1;   std     r11,48(r3)
227 err1;   std     r12,56(r3)
228         addi    r3,r3,64
230         /* Up to 63B to go */
231 7:      bf      cr7*4+2,8f              /* 32B step */
232 err1;   ld      r0,0(r4)
233 err1;   ld      r6,8(r4)
234 err1;   ld      r7,16(r4)
235 err1;   ld      r8,24(r4)
236         addi    r4,r4,32
237 err1;   std     r0,0(r3)
238 err1;   std     r6,8(r3)
239 err1;   std     r7,16(r3)
240 err1;   std     r8,24(r3)
241         addi    r3,r3,32
243         /* Up to 31B to go */
244 8:      bf      cr7*4+3,9f              /* 16B step */
245 err1;   ld      r0,0(r4)
246 err1;   ld      r6,8(r4)
247         addi    r4,r4,16
248 err1;   std     r0,0(r3)
249 err1;   std     r6,8(r3)
250         addi    r3,r3,16
252 9:      clrldi  r5,r5,(64-4)            /* r5 = final 0-15 bytes, handled by .Lshort_copy */
254         /* Up to 15B to go */
255 .Lshort_copy:
            /*
             * The low 4 bits of r5 select an 8-, 4-, 2- and 1-byte step in
             * turn. Also entered directly from the function entry when the
             * whole count is below 16. Ends by returning 0 in r3.
             */
256         mtocrf  0x01,r5
257         bf      cr7*4+0,12f             /* 8-byte step, done as two word copies */
258 err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
259 err1;   lwz     r6,4(r4)
260         addi    r4,r4,8
261 err1;   stw     r0,0(r3)
262 err1;   stw     r6,4(r3)
263         addi    r3,r3,8
265 12:     bf      cr7*4+1,13f             /* 4-byte step */
266 err1;   lwz     r0,0(r4)
267         addi    r4,r4,4
268 err1;   stw     r0,0(r3)
269         addi    r3,r3,4
271 13:     bf      cr7*4+2,14f             /* 2-byte step */
272 err1;   lhz     r0,0(r4)
273         addi    r4,r4,2
274 err1;   sth     r0,0(r3)
275         addi    r3,r3,2
277 14:     bf      cr7*4+3,15f             /* 1-byte step; pointers are not advanced, nothing follows */
278 err1;   lbz     r0,0(r4)
279 err1;   stb     r0,0(r3)
281 15:     li      r3,0                    /* return value 0 */
282         blr
284 .Lunwind_stack_nonvmx_copy:
            /*
             * Reached from .Lvmx_copy when .enter_vmx_copy returned 0: drop the
             * frame .Lvmx_copy pushed and continue on the scalar path.
             */
285         addi    r1,r1,STACKFRAMESIZE
286         b       .Lnonvmx_copy
288 #ifdef CONFIG_ALTIVEC
289 .Lvmx_copy:
            /*
             * Entered from the function entry for counts above 4096. Saves LR,
             * pushes a frame and calls .enter_vmx_copy (defined outside this
             * file). LR and the original r3/r4/r5 are then reloaded from the
             * slots above the new frame; a zero result from the call diverts
             * to the scalar path.
             */
290         mflr    r0
291         std     r0,16(r1)
292         stdu    r1,-STACKFRAMESIZE(r1)
293         bl      .enter_vmx_copy
294         cmpwi   r3,0                    /* test the call's result before r3 is overwritten */
295         ld      r0,STACKFRAMESIZE+16(r1)
296         ld      r3,STACKFRAMESIZE+48(r1)
297         ld      r4,STACKFRAMESIZE+56(r1)
298         ld      r5,STACKFRAMESIZE+64(r1)
299         mtlr    r0
301         beq     .Lunwind_stack_nonvmx_copy
303         /*
304          * If source and destination are not relatively aligned we use a
305          * slower permute loop.
306          */
307         xor     r6,r4,r3
308         rldicl. r6,r6,0,(64-4)          /* low 4 bits of (src ^ dst); non-zero means the 16B offsets differ */
309         bne     .Lvmx_unaligned_copy
311         /* Get the destination 16B aligned */
312         neg     r6,r3
313         mtocrf  0x01,r6                 /* low 4 bits of -r3 select the 1/2/4/8-byte steps below */
314         clrldi  r6,r6,(64-4)            /* r6 = bytes up to the next 16B boundary (0-15) */
316         bf      cr7*4+3,1f
317 err3;   lbz     r0,0(r4)
318         addi    r4,r4,1
319 err3;   stb     r0,0(r3)
320         addi    r3,r3,1
322 1:      bf      cr7*4+2,2f
323 err3;   lhz     r0,0(r4)
324         addi    r4,r4,2
325 err3;   sth     r0,0(r3)
326         addi    r3,r3,2
328 2:      bf      cr7*4+1,3f
329 err3;   lwz     r0,0(r4)
330         addi    r4,r4,4
331 err3;   stw     r0,0(r3)
332         addi    r3,r3,4
334 3:      bf      cr7*4+0,4f
335 err3;   ld      r0,0(r4)
336         addi    r4,r4,8
337 err3;   std     r0,0(r3)
338         addi    r3,r3,8
340 4:      sub     r5,r5,r6                /* account for the alignment bytes just copied */
342         /* Get the destination 128B aligned */
343         neg     r6,r3
344         srdi    r7,r6,4
345         mtocrf  0x01,r7                 /* bits of (-r3 >> 4) select the 16B / 32B / 64B steps below */
346         clrldi  r6,r6,(64-7)            /* r6 = bytes up to the next 128B boundary; subtracted from r5 at 7: */
348         li      r9,16                   /* r9-r11: index offsets for lvx/stvx */
349         li      r10,32
350         li      r11,48
352         bf      cr7*4+3,5f              /* 16B step */
353 err3;   lvx     vr1,r0,r4
354         addi    r4,r4,16
355 err3;   stvx    vr1,r0,r3
356         addi    r3,r3,16
358 5:      bf      cr7*4+2,6f              /* 32B step */
359 err3;   lvx     vr1,r0,r4
360 err3;   lvx     vr0,r4,r9
361         addi    r4,r4,32
362 err3;   stvx    vr1,r0,r3
363 err3;   stvx    vr0,r3,r9
364         addi    r3,r3,32
366 6:      bf      cr7*4+1,7f              /* 64B step */
367 err3;   lvx     vr3,r0,r4
368 err3;   lvx     vr2,r4,r9
369 err3;   lvx     vr1,r4,r10
370 err3;   lvx     vr0,r4,r11
371         addi    r4,r4,64
372 err3;   stvx    vr3,r0,r3
373 err3;   stvx    vr2,r3,r9
374 err3;   stvx    vr1,r3,r10
375 err3;   stvx    vr0,r3,r11
376         addi    r3,r3,64
378 7:      sub     r5,r5,r6
379         srdi    r6,r5,7                 /* r6 = number of whole 128B chunks */
            /* r14-r16 become index registers for the loop; save them for .Ldo_err4 and the post-loop restore. */
381         std     r14,STK_REG(r14)(r1)
382         std     r15,STK_REG(r15)(r1)
383         std     r16,STK_REG(r16)(r1)
385         li      r12,64
386         li      r14,80
387         li      r15,96
388         li      r16,112
390         mtctr   r6
392         /*
393          * Now do cacheline sized loads and stores. By this stage the
394          * cacheline stores are also cacheline aligned.
395          */
            /*
             * On entry CTR holds the number of whole 128B chunks and
             * r9-r12, r14-r16 hold the offsets 16..112. Each pass loads eight
             * vectors from r4 and stores them through r3. Every access is
             * tagged err4, so a fault is redirected to .Ldo_err4, which reloads
             * r14-r16 from the frame.
             *
             * Numeric label 8 is the loop head: "bdnz 8b" at the bottom binds
             * to the nearest preceding definition of 8, which must be this one
             * and not an 8: belonging to earlier code.
             */
396         .align  5
397 8:
398 err4;   lvx     vr7,r0,r4
399 err4;   lvx     vr6,r4,r9
400 err4;   lvx     vr5,r4,r10
401 err4;   lvx     vr4,r4,r11
402 err4;   lvx     vr3,r4,r12
403 err4;   lvx     vr2,r4,r14
404 err4;   lvx     vr1,r4,r15
405 err4;   lvx     vr0,r4,r16
406         addi    r4,r4,128
407 err4;   stvx    vr7,r0,r3
408 err4;   stvx    vr6,r3,r9
409 err4;   stvx    vr5,r3,r10
410 err4;   stvx    vr4,r3,r11
411 err4;   stvx    vr3,r3,r12
412 err4;   stvx    vr2,r3,r14
413 err4;   stvx    vr1,r3,r15
414 err4;   stvx    vr0,r3,r16
415         addi    r3,r3,128
416         bdnz    8b                      /* decrement CTR, loop while it is non-zero */
418         ld      r14,STK_REG(r14)(r1)    /* loop finished: give r14-r16 their saved values back */
419         ld      r15,STK_REG(r15)(r1)
420         ld      r16,STK_REG(r16)(r1)
422         /* Up to 127B to go */
423         clrldi  r5,r5,(64-7)
424         srdi    r6,r5,4
425         mtocrf  0x01,r6                 /* remaining count in 16B units selects the 64B / 32B / 16B steps */
427         bf      cr7*4+1,9f              /* 64B step */
428 err3;   lvx     vr3,r0,r4
429 err3;   lvx     vr2,r4,r9
430 err3;   lvx     vr1,r4,r10
431 err3;   lvx     vr0,r4,r11
432         addi    r4,r4,64
433 err3;   stvx    vr3,r0,r3
434 err3;   stvx    vr2,r3,r9
435 err3;   stvx    vr1,r3,r10
436 err3;   stvx    vr0,r3,r11
437         addi    r3,r3,64
439 9:      bf      cr7*4+2,10f             /* 32B step */
440 err3;   lvx     vr1,r0,r4
441 err3;   lvx     vr0,r4,r9
442         addi    r4,r4,32
443 err3;   stvx    vr1,r0,r3
444 err3;   stvx    vr0,r3,r9
445         addi    r3,r3,32
447 10:     bf      cr7*4+3,11f             /* 16B step */
448 err3;   lvx     vr1,r0,r4
449         addi    r4,r4,16
450 err3;   stvx    vr1,r0,r3
451         addi    r3,r3,16
453         /* Up to 15B to go */
454 11:     clrldi  r5,r5,(64-4)
455         mtocrf  0x01,r5                 /* low 4 bits of the count select the 8/4/2/1-byte steps */
456         bf      cr7*4+0,12f
457 err3;   ld      r0,0(r4)
458         addi    r4,r4,8
459 err3;   std     r0,0(r3)
460         addi    r3,r3,8
462 12:     bf      cr7*4+1,13f
463 err3;   lwz     r0,0(r4)
464         addi    r4,r4,4
465 err3;   stw     r0,0(r3)
466         addi    r3,r3,4
468 13:     bf      cr7*4+2,14f
469 err3;   lhz     r0,0(r4)
470         addi    r4,r4,2
471 err3;   sth     r0,0(r3)
472         addi    r3,r3,2
474 14:     bf      cr7*4+3,15f
475 err3;   lbz     r0,0(r4)
476 err3;   stb     r0,0(r3)
            /* Pop the frame and leave through .exit_vmx_copy; LR already holds the caller's return address. */
478 15:     addi    r1,r1,STACKFRAMESIZE
479         b       .exit_vmx_copy          /* tail call optimise */
481 .Lvmx_unaligned_copy:
            /*
             * Taken when source and destination differ in their low 4 address
             * bits. The destination is aligned exactly as in the aligned path;
             * source data is then fetched with lvx and realigned with vperm
             * using the control vector in vr16.
             */
482         /* Get the destination 16B aligned */
483         neg     r6,r3
484         mtocrf  0x01,r6                 /* low 4 bits of -r3 select the 1/2/4/8-byte steps below */
485         clrldi  r6,r6,(64-4)            /* r6 = bytes up to the next 16B boundary (0-15) */
487         bf      cr7*4+3,1f
488 err3;   lbz     r0,0(r4)
489         addi    r4,r4,1
490 err3;   stb     r0,0(r3)
491         addi    r3,r3,1
493 1:      bf      cr7*4+2,2f
494 err3;   lhz     r0,0(r4)
495         addi    r4,r4,2
496 err3;   sth     r0,0(r3)
497         addi    r3,r3,2
499 2:      bf      cr7*4+1,3f
500 err3;   lwz     r0,0(r4)
501         addi    r4,r4,4
502 err3;   stw     r0,0(r3)
503         addi    r3,r3,4
505 3:      bf      cr7*4+0,4f
506 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
507 err3;   lwz     r7,4(r4)
508         addi    r4,r4,8
509 err3;   stw     r0,0(r3)
510 err3;   stw     r7,4(r3)
511         addi    r3,r3,8
513 4:      sub     r5,r5,r6                /* account for the alignment bytes just copied */
515         /* Get the destination 128B aligned */
516         neg     r6,r3
517         srdi    r7,r6,4
518         mtocrf  0x01,r7                 /* bits of (-r3 >> 4) select the 16B / 32B / 64B steps below */
519         clrldi  r6,r6,(64-7)            /* r6 = bytes up to the next 128B boundary; subtracted from r5 at 7: */
521         li      r9,16                   /* r9-r11: index offsets for lvx/stvx */
522         li      r10,32
523         li      r11,48
            /*
             * Prime the pipeline: vr0 holds the most recently loaded source
             * vector, and r4 is advanced past it, so from here until label 11
             * r4 runs 16 bytes ahead of the data actually consumed.
             */
525         lvsl    vr16,0,r4       /* Setup permute control vector */
526 err3;   lvx     vr0,0,r4
527         addi    r4,r4,16
529         bf      cr7*4+3,5f              /* 16B step */
530 err3;   lvx     vr1,r0,r4
531         vperm   vr8,vr0,vr1,vr16
532         addi    r4,r4,16
533 err3;   stvx    vr8,r0,r3
534         addi    r3,r3,16
535         vor     vr0,vr1,vr1             /* vr0 = vr1: carry the newest source vector forward */
537 5:      bf      cr7*4+2,6f              /* 32B step; its last load already lands in vr0 */
538 err3;   lvx     vr1,r0,r4
539         vperm   vr8,vr0,vr1,vr16
540 err3;   lvx     vr0,r4,r9
541         vperm   vr9,vr1,vr0,vr16
542         addi    r4,r4,32
543 err3;   stvx    vr8,r0,r3
544 err3;   stvx    vr9,r3,r9
545         addi    r3,r3,32
547 6:      bf      cr7*4+1,7f              /* 64B step; its last load already lands in vr0 */
548 err3;   lvx     vr3,r0,r4
549         vperm   vr8,vr0,vr3,vr16
550 err3;   lvx     vr2,r4,r9
551         vperm   vr9,vr3,vr2,vr16
552 err3;   lvx     vr1,r4,r10
553         vperm   vr10,vr2,vr1,vr16
554 err3;   lvx     vr0,r4,r11
555         vperm   vr11,vr1,vr0,vr16
556         addi    r4,r4,64
557 err3;   stvx    vr8,r0,r3
558 err3;   stvx    vr9,r3,r9
559 err3;   stvx    vr10,r3,r10
560 err3;   stvx    vr11,r3,r11
561         addi    r3,r3,64
563 7:      sub     r5,r5,r6
564         srdi    r6,r5,7                 /* r6 = number of whole 128B chunks */
            /* r14-r16 become index registers for the loop; save them for .Ldo_err4 and the post-loop restore. */
566         std     r14,STK_REG(r14)(r1)
567         std     r15,STK_REG(r15)(r1)
568         std     r16,STK_REG(r16)(r1)
570         li      r12,64
571         li      r14,80
572         li      r15,96
573         li      r16,112
575         mtctr   r6
577         /*
578          * Now do cacheline sized loads and stores. By this stage the
579          * cacheline stores are also cacheline aligned.
580          */
            /*
             * On entry CTR holds the number of whole 128B chunks, vr0 holds
             * the previously loaded source vector and vr16 the permute control
             * vector. Each pass loads eight vectors, pairs each with its
             * predecessor through vperm to form vr8-vr15, and stores those
             * through r3. The final load goes to vr0, ready for the next pass.
             * Every access is tagged err4, so a fault is redirected to
             * .Ldo_err4, which reloads r14-r16 from the frame.
             *
             * Numeric label 8 is the loop head: "bdnz 8b" at the bottom binds
             * to the nearest preceding definition of 8, which must be this one
             * and not an 8: belonging to earlier code.
             */
581         .align  5
582 8:
583 err4;   lvx     vr7,r0,r4
584         vperm   vr8,vr0,vr7,vr16
585 err4;   lvx     vr6,r4,r9
586         vperm   vr9,vr7,vr6,vr16
587 err4;   lvx     vr5,r4,r10
588         vperm   vr10,vr6,vr5,vr16
589 err4;   lvx     vr4,r4,r11
590         vperm   vr11,vr5,vr4,vr16
591 err4;   lvx     vr3,r4,r12
592         vperm   vr12,vr4,vr3,vr16
593 err4;   lvx     vr2,r4,r14
594         vperm   vr13,vr3,vr2,vr16
595 err4;   lvx     vr1,r4,r15
596         vperm   vr14,vr2,vr1,vr16
597 err4;   lvx     vr0,r4,r16
598         vperm   vr15,vr1,vr0,vr16
599         addi    r4,r4,128
600 err4;   stvx    vr8,r0,r3
601 err4;   stvx    vr9,r3,r9
602 err4;   stvx    vr10,r3,r10
603 err4;   stvx    vr11,r3,r11
604 err4;   stvx    vr12,r3,r12
605 err4;   stvx    vr13,r3,r14
606 err4;   stvx    vr14,r3,r15
607 err4;   stvx    vr15,r3,r16
608         addi    r3,r3,128
609         bdnz    8b                      /* decrement CTR, loop while it is non-zero */
611         ld      r14,STK_REG(r14)(r1)    /* loop finished: give r14-r16 their saved values back */
612         ld      r15,STK_REG(r15)(r1)
613         ld      r16,STK_REG(r16)(r1)
615         /* Up to 127B to go */
616         clrldi  r5,r5,(64-7)
617         srdi    r6,r5,4
618         mtocrf  0x01,r6                 /* remaining count in 16B units selects the 64B / 32B / 16B steps */
620         bf      cr7*4+1,9f              /* 64B step; vr0 carries the previous source vector in and the newest out */
621 err3;   lvx     vr3,r0,r4
622         vperm   vr8,vr0,vr3,vr16
623 err3;   lvx     vr2,r4,r9
624         vperm   vr9,vr3,vr2,vr16
625 err3;   lvx     vr1,r4,r10
626         vperm   vr10,vr2,vr1,vr16
627 err3;   lvx     vr0,r4,r11
628         vperm   vr11,vr1,vr0,vr16
629         addi    r4,r4,64
630 err3;   stvx    vr8,r0,r3
631 err3;   stvx    vr9,r3,r9
632 err3;   stvx    vr10,r3,r10
633 err3;   stvx    vr11,r3,r11
634         addi    r3,r3,64
636 9:      bf      cr7*4+2,10f             /* 32B step */
637 err3;   lvx     vr1,r0,r4
638         vperm   vr8,vr0,vr1,vr16
639 err3;   lvx     vr0,r4,r9
640         vperm   vr9,vr1,vr0,vr16
641         addi    r4,r4,32
642 err3;   stvx    vr8,r0,r3
643 err3;   stvx    vr9,r3,r9
644         addi    r3,r3,32
646 10:     bf      cr7*4+3,11f             /* 16B step; no vector step follows, so vr0 is not refreshed */
647 err3;   lvx     vr1,r0,r4
648         vperm   vr8,vr0,vr1,vr16
649         addi    r4,r4,16
650 err3;   stvx    vr8,r0,r3
651         addi    r3,r3,16
653         /* Up to 15B to go */
654 11:     clrldi  r5,r5,(64-4)
655         addi    r4,r4,-16       /* Unwind the +16 load offset */
656         mtocrf  0x01,r5                 /* low 4 bits of the count select the 8/4/2/1-byte steps */
657         bf      cr7*4+0,12f
658 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
659 err3;   lwz     r6,4(r4)
660         addi    r4,r4,8
661 err3;   stw     r0,0(r3)
662 err3;   stw     r6,4(r3)
663         addi    r3,r3,8
665 12:     bf      cr7*4+1,13f
666 err3;   lwz     r0,0(r4)
667         addi    r4,r4,4
668 err3;   stw     r0,0(r3)
669         addi    r3,r3,4
671 13:     bf      cr7*4+2,14f
672 err3;   lhz     r0,0(r4)
673         addi    r4,r4,2
674 err3;   sth     r0,0(r3)
675         addi    r3,r3,2
677 14:     bf      cr7*4+3,15f
678 err3;   lbz     r0,0(r4)
679 err3;   stb     r0,0(r3)
            /* Pop the frame and leave through .exit_vmx_copy; LR already holds the caller's return address. */
681 15:     addi    r1,r1,STACKFRAMESIZE
682         b       .exit_vmx_copy          /* tail call optimise */
683 #endif /* CONFIG_ALTIVEC */