Source listing: arch/powerpc/lib/copyuser_power7.S (Linux kernel, drm/drm-misc.git web view)
blob 8474c682a17849a29a3dfcb3af6544fc1e8909c6
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  *
4  * Copyright (C) IBM Corporation, 2011
5  *
6  * Author: Anton Blanchard <anton@au.ibm.com>
7  */
8 #include <asm/ppc_asm.h>
/* SELFTEST_CASE lets the user-space selftest build force the VMX or
 * non-VMX path; the kernel build defaults to 0 (runtime CPU-feature
 * dispatch decides, see BEGIN_FTR_SECTION below). */
10 #ifndef SELFTEST_CASE
11 /* 0 == don't use VMX, 1 == use VMX */
12 #define SELFTEST_CASE   0
13 #endif
/* Endian-dependent helpers for the unaligned-VMX permute path:
 * lvsl/lvsr generate the shift-permute control vector for the source
 * misalignment, and VPERM swaps its A/B inputs on little-endian so the
 * same copy loop works for both byte orders. */
15 #ifdef __BIG_ENDIAN__
16 #define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
17 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
18 #else
19 #define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
20 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
21 #endif
/* err1/err2 prefix a single user-memory access with an exception-table
 * entry: if that access faults, control transfers to the matching
 * .Ldo_errN fixup.  err1 is for accesses made before the stack frame is
 * pushed (nothing to restore); err2 is for accesses inside the 128B GPR
 * loop, where r14-r22 and the frame must be unwound first. */
23         .macro err1
24 100:
25         EX_TABLE(100b,.Ldo_err1)
26         .endm
28         .macro err2
29 200:
30         EX_TABLE(200b,.Ldo_err2)
31         .endm
33 #ifdef CONFIG_ALTIVEC
/* err3/err4: as err1/err2 but for the VMX paths.  err3 covers accesses
 * made while only the VMX context is live; err4 additionally requires
 * restoring r14-r16, which the VMX cacheline loops use as offsets. */
34         .macro err3
35 300:
36         EX_TABLE(300b,.Ldo_err3)
37         .endm
39         .macro err4
40 400:
41         EX_TABLE(400b,.Ldo_err4)
42         .endm
/* VMX fault fixups: restore clobbered non-volatiles (err4 only), drop
 * the VMX user-copy context, reload the saved LR, then fall into the
 * common .Lexit unwind below. */
45 .Ldo_err4:
46         ld      r16,STK_REG(R16)(r1)
47         ld      r15,STK_REG(R15)(r1)
48         ld      r14,STK_REG(R14)(r1)
49 .Ldo_err3:
50         bl      CFUNC(exit_vmx_usercopy)
51         ld      r0,STACKFRAMESIZE+16(r1)
52         mtlr    r0
53         b       .Lexit
54 #endif /* CONFIG_ALTIVEC */
/* .Ldo_err2: fault inside the non-VMX 128B loop — restore r14-r22,
 * then unwind the stack frame at .Lexit. */
56 .Ldo_err2:
57         ld      r22,STK_REG(R22)(r1)
58         ld      r21,STK_REG(R21)(r1)
59         ld      r20,STK_REG(R20)(r1)
60         ld      r19,STK_REG(R19)(r1)
61         ld      r18,STK_REG(R18)(r1)
62         ld      r17,STK_REG(R17)(r1)
63         ld      r16,STK_REG(R16)(r1)
64         ld      r15,STK_REG(R15)(r1)
65         ld      r14,STK_REG(R14)(r1)
66 .Lexit:
67         addi    r1,r1,STACKFRAMESIZE
/* .Ldo_err1: common tail for every fault.  Reload the ORIGINAL
 * dest/src/len (saved in the caller's frame at entry) and retry the
 * whole copy via the base routine, which computes the number of bytes
 * left uncopied for the caller. */
68 .Ldo_err1:
69         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
70         ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
71         ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
72         b       __copy_tofrom_user_base
/*
 * __copy_tofrom_user_power7(r3 = dest, r4 = src, r5 = nbytes)
 * POWER7-optimised user copy.  Returns 0 in r3 on full success (see
 * .Lshort_copy's exit); on a fault the EX_TABLE fixups re-run the copy
 * through __copy_tofrom_user_base, which reports bytes not copied.
 *
 * Entry saves the original arguments in the CALLER's frame (negative
 * offsets from r1) so the fault path can reload them, then dispatches:
 *   < 16 bytes            -> .Lshort_copy
 *   > 3328 bytes + VMX    -> .Lvmx_copy (worth the VMX enable cost)
 *   otherwise             -> .Lnonvmx_copy (GPR loop)
 */
75 _GLOBAL(__copy_tofrom_user_power7)
76         cmpldi  r5,16
77         cmpldi  cr1,r5,3328
79         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
80         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
81         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
83         blt     .Lshort_copy
85 #ifdef CONFIG_ALTIVEC
86 test_feature = SELFTEST_CASE
87 BEGIN_FTR_SECTION
88         bgt     cr1,.Lvmx_copy
89 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
90 #endif
/* Non-VMX path: align the SOURCE to 8 bytes with byte/half/word copies
 * (r6 = (-src) & 7; CR7 bits from mtocrf select which steps run), then
 * copy 128B cachelines with 16 GPRs at a time. */
92 .Lnonvmx_copy:
93         /* Get the source 8B aligned */
94         neg     r6,r4
95         mtocrf  0x01,r6
96         clrldi  r6,r6,(64-3)
98         bf      cr7*4+3,1f
99 err1;   lbz     r0,0(r4)
100         addi    r4,r4,1
101 err1;   stb     r0,0(r3)
102         addi    r3,r3,1
104 1:      bf      cr7*4+2,2f
105 err1;   lhz     r0,0(r4)
106         addi    r4,r4,2
107 err1;   sth     r0,0(r3)
108         addi    r3,r3,2
110 2:      bf      cr7*4+1,3f
111 err1;   lwz     r0,0(r4)
112         addi    r4,r4,4
113 err1;   stw     r0,0(r3)
114         addi    r3,r3,4
/* r5 -= alignment bytes consumed; < 128B left means skip straight to
 * the tail copies at 5:. */
116 3:      sub     r5,r5,r6
117         cmpldi  r5,128
118         blt     5f
/* Push a frame and save r14-r22 + LR: the cacheline loop needs 16
 * data registers.  Faults in the loop use err2 so these get restored. */
120         mflr    r0
121         stdu    r1,-STACKFRAMESIZE(r1)
122         std     r14,STK_REG(R14)(r1)
123         std     r15,STK_REG(R15)(r1)
124         std     r16,STK_REG(R16)(r1)
125         std     r17,STK_REG(R17)(r1)
126         std     r18,STK_REG(R18)(r1)
127         std     r19,STK_REG(R19)(r1)
128         std     r20,STK_REG(R20)(r1)
129         std     r21,STK_REG(R21)(r1)
130         std     r22,STK_REG(R22)(r1)
131         std     r0,STACKFRAMESIZE+16(r1)
/* CTR = r5 / 128 = number of full cachelines. */
133         srdi    r6,r5,7
134         mtctr   r6
136         /* Now do cacheline (128B) sized loads and stores. */
137         .align  5
/* NOTE(review): the loop label `4:` (original line 138) was lost in
 * this extraction — `bdnz 4b` below targets it; restore before use. */
139 err2;   ld      r0,0(r4)
140 err2;   ld      r6,8(r4)
141 err2;   ld      r7,16(r4)
142 err2;   ld      r8,24(r4)
143 err2;   ld      r9,32(r4)
144 err2;   ld      r10,40(r4)
145 err2;   ld      r11,48(r4)
146 err2;   ld      r12,56(r4)
147 err2;   ld      r14,64(r4)
148 err2;   ld      r15,72(r4)
149 err2;   ld      r16,80(r4)
150 err2;   ld      r17,88(r4)
151 err2;   ld      r18,96(r4)
152 err2;   ld      r19,104(r4)
153 err2;   ld      r20,112(r4)
154 err2;   ld      r21,120(r4)
155         addi    r4,r4,128
156 err2;   std     r0,0(r3)
157 err2;   std     r6,8(r3)
158 err2;   std     r7,16(r3)
159 err2;   std     r8,24(r3)
160 err2;   std     r9,32(r3)
161 err2;   std     r10,40(r3)
162 err2;   std     r11,48(r3)
163 err2;   std     r12,56(r3)
164 err2;   std     r14,64(r3)
165 err2;   std     r15,72(r3)
166 err2;   std     r16,80(r3)
167 err2;   std     r17,88(r3)
168 err2;   std     r18,96(r3)
169 err2;   std     r19,104(r3)
170 err2;   std     r20,112(r3)
171 err2;   std     r21,120(r3)
172         addi    r3,r3,128
173         bdnz    4b
/* r5 %= 128 (remaining tail); restore non-volatiles and pop the frame
 * before the frameless tail code. */
175         clrldi  r5,r5,(64-7)
177         ld      r14,STK_REG(R14)(r1)
178         ld      r15,STK_REG(R15)(r1)
179         ld      r16,STK_REG(R16)(r1)
180         ld      r17,STK_REG(R17)(r1)
181         ld      r18,STK_REG(R18)(r1)
182         ld      r19,STK_REG(R19)(r1)
183         ld      r20,STK_REG(R20)(r1)
184         ld      r21,STK_REG(R21)(r1)
185         ld      r22,STK_REG(R22)(r1)
186         addi    r1,r1,STACKFRAMESIZE
/* Tail: binary-decompose the remaining 0-127 bytes.  mtocrf 0x01 puts
 * bits of r5>>4 into CR7, so cr7 bit tests peel 64B, 32B, 16B chunks;
 * .Lshort_copy then handles the final 0-15 bytes the same way. */
188         /* Up to 127B to go */
189 5:      srdi    r6,r5,4
190         mtocrf  0x01,r6
192 6:      bf      cr7*4+1,7f
193 err1;   ld      r0,0(r4)
194 err1;   ld      r6,8(r4)
195 err1;   ld      r7,16(r4)
196 err1;   ld      r8,24(r4)
197 err1;   ld      r9,32(r4)
198 err1;   ld      r10,40(r4)
199 err1;   ld      r11,48(r4)
200 err1;   ld      r12,56(r4)
201         addi    r4,r4,64
202 err1;   std     r0,0(r3)
203 err1;   std     r6,8(r3)
204 err1;   std     r7,16(r3)
205 err1;   std     r8,24(r3)
206 err1;   std     r9,32(r3)
207 err1;   std     r10,40(r3)
208 err1;   std     r11,48(r3)
209 err1;   std     r12,56(r3)
210         addi    r3,r3,64
212         /* Up to 63B to go */
213 7:      bf      cr7*4+2,8f
214 err1;   ld      r0,0(r4)
215 err1;   ld      r6,8(r4)
216 err1;   ld      r7,16(r4)
217 err1;   ld      r8,24(r4)
218         addi    r4,r4,32
219 err1;   std     r0,0(r3)
220 err1;   std     r6,8(r3)
221 err1;   std     r7,16(r3)
222 err1;   std     r8,24(r3)
223         addi    r3,r3,32
225         /* Up to 31B to go */
226 8:      bf      cr7*4+3,9f
227 err1;   ld      r0,0(r4)
228 err1;   ld      r6,8(r4)
229         addi    r4,r4,16
230 err1;   std     r0,0(r3)
231 err1;   std     r6,8(r3)
232         addi    r3,r3,16
234 9:      clrldi  r5,r5,(64-4)
236         /* Up to 15B to go */
/* Also the direct entry for copies < 16 bytes.  CR7 now holds bits of
 * r5 itself: peel 8, 4, 2, then 1 byte.  Returns 0 = success. */
237 .Lshort_copy:
238         mtocrf  0x01,r5
239         bf      cr7*4+0,12f
240 err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
241 err1;   lwz     r6,4(r4)
242         addi    r4,r4,8
243 err1;   stw     r0,0(r3)
244 err1;   stw     r6,4(r3)
245         addi    r3,r3,8
247 12:     bf      cr7*4+1,13f
248 err1;   lwz     r0,0(r4)
249         addi    r4,r4,4
250 err1;   stw     r0,0(r3)
251         addi    r3,r3,4
253 13:     bf      cr7*4+2,14f
254 err1;   lhz     r0,0(r4)
255         addi    r4,r4,2
256 err1;   sth     r0,0(r3)
257         addi    r3,r3,2
259 14:     bf      cr7*4+3,15f
260 err1;   lbz     r0,0(r4)
261 err1;   stb     r0,0(r3)
263 15:     li      r3,0
264         blr
/* enter_vmx_usercopy declined (returned 0): pop the frame pushed in
 * .Lvmx_copy and fall back to the GPR copy loop. */
266 .Lunwind_stack_nonvmx_copy:
267         addi    r1,r1,STACKFRAMESIZE
268         b       .Lnonvmx_copy
/* VMX path for large (> 3328B) copies where src and dst share 16B
 * alignment.  Enables the VMX user-copy context, sets up hardware
 * prefetch streams, aligns the destination to 16B then 128B, and
 * streams full cachelines through v0-v7. */
270 .Lvmx_copy:
271 #ifdef CONFIG_ALTIVEC
272         mflr    r0
273         std     r0,16(r1)
274         stdu    r1,-STACKFRAMESIZE(r1)
275         bl      CFUNC(enter_vmx_usercopy)
/* cr1 remembers whether VMX was granted; args are reloaded from the
 * caller's frame since enter_vmx_usercopy clobbers volatiles. */
276         cmpwi   cr1,r3,0
277         ld      r0,STACKFRAMESIZE+16(r1)
278         ld      r3,STK_REG(R31)(r1)
279         ld      r4,STK_REG(R30)(r1)
280         ld      r5,STK_REG(R29)(r1)
281         mtlr    r0
283         /*
284          * We prefetch both the source and destination using enhanced touch
285          * instructions. We use a stream ID of 0 for the load side and
286          * 1 for the store side.
287          */
288         clrrdi  r6,r4,7
289         clrrdi  r9,r3,7
290         ori     r9,r9,1         /* stream=1 */
292         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
293         cmpldi  r7,0x3FF
294         ble     1f
295         li      r7,0x3FF
296 1:      lis     r0,0x0E00       /* depth=7 */
297         sldi    r7,r7,7
298         or      r7,r7,r0
299         ori     r10,r7,1        /* stream=1 */
301         DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)
303         beq     cr1,.Lunwind_stack_nonvmx_copy
305         /*
306          * If source and destination are not relatively aligned we use a
307          * slower permute loop.
308          */
309         xor     r6,r4,r3
310         rldicl. r6,r6,0,(64-4)
311         bne     .Lvmx_unaligned_copy
313         /* Get the destination 16B aligned */
314         neg     r6,r3
315         mtocrf  0x01,r6
316         clrldi  r6,r6,(64-4)
318         bf      cr7*4+3,1f
319 err3;   lbz     r0,0(r4)
320         addi    r4,r4,1
321 err3;   stb     r0,0(r3)
322         addi    r3,r3,1
324 1:      bf      cr7*4+2,2f
325 err3;   lhz     r0,0(r4)
326         addi    r4,r4,2
327 err3;   sth     r0,0(r3)
328         addi    r3,r3,2
330 2:      bf      cr7*4+1,3f
331 err3;   lwz     r0,0(r4)
332         addi    r4,r4,4
333 err3;   stw     r0,0(r3)
334         addi    r3,r3,4
336 3:      bf      cr7*4+0,4f
337 err3;   ld      r0,0(r4)
338         addi    r4,r4,8
339 err3;   std     r0,0(r3)
340         addi    r3,r3,8
342 4:      sub     r5,r5,r6
/* Now peel 16/32/64B vector chunks until dst is 128B aligned
 * (r6 = (-dst) & 127, CR7 bits from r6>>4 drive the steps). */
344         /* Get the destination 128B aligned */
345         neg     r6,r3
346         srdi    r7,r6,4
347         mtocrf  0x01,r7
348         clrldi  r6,r6,(64-7)
350         li      r9,16
351         li      r10,32
352         li      r11,48
354         bf      cr7*4+3,5f
355 err3;   lvx     v1,0,r4
356         addi    r4,r4,16
357 err3;   stvx    v1,0,r3
358         addi    r3,r3,16
360 5:      bf      cr7*4+2,6f
361 err3;   lvx     v1,0,r4
362 err3;   lvx     v0,r4,r9
363         addi    r4,r4,32
364 err3;   stvx    v1,0,r3
365 err3;   stvx    v0,r3,r9
366         addi    r3,r3,32
368 6:      bf      cr7*4+1,7f
369 err3;   lvx     v3,0,r4
370 err3;   lvx     v2,r4,r9
371 err3;   lvx     v1,r4,r10
372 err3;   lvx     v0,r4,r11
373         addi    r4,r4,64
374 err3;   stvx    v3,0,r3
375 err3;   stvx    v2,r3,r9
376 err3;   stvx    v1,r3,r10
377 err3;   stvx    v0,r3,r11
378         addi    r3,r3,64
380 7:      sub     r5,r5,r6
381         srdi    r6,r5,7
/* r14-r16 become extra index registers (offsets 80/96/112) for the
 * 128B loop; err4 fixups restore them on a fault. */
383         std     r14,STK_REG(R14)(r1)
384         std     r15,STK_REG(R15)(r1)
385         std     r16,STK_REG(R16)(r1)
387         li      r12,64
388         li      r14,80
389         li      r15,96
390         li      r16,112
392         mtctr   r6
394         /*
395          * Now do cacheline sized loads and stores. By this stage the
396          * cacheline stores are also cacheline aligned.
397          */
398         .align  5
/* NOTE(review): the loop label `8:` (original line 399) was lost in
 * this extraction — `bdnz 8b` below targets it; restore before use. */
400 err4;   lvx     v7,0,r4
401 err4;   lvx     v6,r4,r9
402 err4;   lvx     v5,r4,r10
403 err4;   lvx     v4,r4,r11
404 err4;   lvx     v3,r4,r12
405 err4;   lvx     v2,r4,r14
406 err4;   lvx     v1,r4,r15
407 err4;   lvx     v0,r4,r16
408         addi    r4,r4,128
409 err4;   stvx    v7,0,r3
410 err4;   stvx    v6,r3,r9
411 err4;   stvx    v5,r3,r10
412 err4;   stvx    v4,r3,r11
413 err4;   stvx    v3,r3,r12
414 err4;   stvx    v2,r3,r14
415 err4;   stvx    v1,r3,r15
416 err4;   stvx    v0,r3,r16
417         addi    r3,r3,128
418         bdnz    8b
420         ld      r14,STK_REG(R14)(r1)
421         ld      r15,STK_REG(R15)(r1)
422         ld      r16,STK_REG(R16)(r1)
/* Tail: 64/32/16B vector chunks, then GPR copies for the last 0-15B. */
424         /* Up to 127B to go */
425         clrldi  r5,r5,(64-7)
426         srdi    r6,r5,4
427         mtocrf  0x01,r6
429         bf      cr7*4+1,9f
430 err3;   lvx     v3,0,r4
431 err3;   lvx     v2,r4,r9
432 err3;   lvx     v1,r4,r10
433 err3;   lvx     v0,r4,r11
434         addi    r4,r4,64
435 err3;   stvx    v3,0,r3
436 err3;   stvx    v2,r3,r9
437 err3;   stvx    v1,r3,r10
438 err3;   stvx    v0,r3,r11
439         addi    r3,r3,64
441 9:      bf      cr7*4+2,10f
442 err3;   lvx     v1,0,r4
443 err3;   lvx     v0,r4,r9
444         addi    r4,r4,32
445 err3;   stvx    v1,0,r3
446 err3;   stvx    v0,r3,r9
447         addi    r3,r3,32
449 10:     bf      cr7*4+3,11f
450 err3;   lvx     v1,0,r4
451         addi    r4,r4,16
452 err3;   stvx    v1,0,r3
453         addi    r3,r3,16
455         /* Up to 15B to go */
456 11:     clrldi  r5,r5,(64-4)
457         mtocrf  0x01,r5
458         bf      cr7*4+0,12f
459 err3;   ld      r0,0(r4)
460         addi    r4,r4,8
461 err3;   std     r0,0(r3)
462         addi    r3,r3,8
464 12:     bf      cr7*4+1,13f
465 err3;   lwz     r0,0(r4)
466         addi    r4,r4,4
467 err3;   stw     r0,0(r3)
468         addi    r3,r3,4
470 13:     bf      cr7*4+2,14f
471 err3;   lhz     r0,0(r4)
472         addi    r4,r4,2
473 err3;   sth     r0,0(r3)
474         addi    r3,r3,2
476 14:     bf      cr7*4+3,15f
477 err3;   lbz     r0,0(r4)
478 err3;   stb     r0,0(r3)
/* Done: pop frame and tail-call exit_vmx_usercopy, which returns 0 —
 * that becomes this function's success return value. */
480 15:     addi    r1,r1,STACKFRAMESIZE
481         b       CFUNC(exit_vmx_usercopy)        /* tail call optimise */
/* VMX path for src/dst with DIFFERENT 16B alignment.  After aligning
 * the destination, every store is built from two overlapping source
 * vectors merged with VPERM using the control vector from LVS; the
 * previous load is carried in v0 across iterations, so r4 always runs
 * 16 bytes ahead (unwound at label 11 below). */
483 .Lvmx_unaligned_copy:
484         /* Get the destination 16B aligned */
485         neg     r6,r3
486         mtocrf  0x01,r6
487         clrldi  r6,r6,(64-4)
489         bf      cr7*4+3,1f
490 err3;   lbz     r0,0(r4)
491         addi    r4,r4,1
492 err3;   stb     r0,0(r3)
493         addi    r3,r3,1
495 1:      bf      cr7*4+2,2f
496 err3;   lhz     r0,0(r4)
497         addi    r4,r4,2
498 err3;   sth     r0,0(r3)
499         addi    r3,r3,2
501 2:      bf      cr7*4+1,3f
502 err3;   lwz     r0,0(r4)
503         addi    r4,r4,4
504 err3;   stw     r0,0(r3)
505         addi    r3,r3,4
507 3:      bf      cr7*4+0,4f
508 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
509 err3;   lwz     r7,4(r4)
510         addi    r4,r4,8
511 err3;   stw     r0,0(r3)
512 err3;   stw     r7,4(r3)
513         addi    r3,r3,8
515 4:      sub     r5,r5,r6
517         /* Get the destination 128B aligned */
518         neg     r6,r3
519         srdi    r7,r6,4
520         mtocrf  0x01,r7
521         clrldi  r6,r6,(64-7)
523         li      r9,16
524         li      r10,32
525         li      r11,48
/* Prime the pipeline: v16 = permute control for src misalignment,
 * v0 = first (aligned-down) source vector; r4 now leads by 16B. */
527         LVS(v16,0,r4)           /* Setup permute control vector */
528 err3;   lvx     v0,0,r4
529         addi    r4,r4,16
531         bf      cr7*4+3,5f
532 err3;   lvx     v1,0,r4
533         VPERM(v8,v0,v1,v16)
534         addi    r4,r4,16
535 err3;   stvx    v8,0,r3
536         addi    r3,r3,16
537         vor     v0,v1,v1
539 5:      bf      cr7*4+2,6f
540 err3;   lvx     v1,0,r4
541         VPERM(v8,v0,v1,v16)
542 err3;   lvx     v0,r4,r9
543         VPERM(v9,v1,v0,v16)
544         addi    r4,r4,32
545 err3;   stvx    v8,0,r3
546 err3;   stvx    v9,r3,r9
547         addi    r3,r3,32
549 6:      bf      cr7*4+1,7f
550 err3;   lvx     v3,0,r4
551         VPERM(v8,v0,v3,v16)
552 err3;   lvx     v2,r4,r9
553         VPERM(v9,v3,v2,v16)
554 err3;   lvx     v1,r4,r10
555         VPERM(v10,v2,v1,v16)
556 err3;   lvx     v0,r4,r11
557         VPERM(v11,v1,v0,v16)
558         addi    r4,r4,64
559 err3;   stvx    v8,0,r3
560 err3;   stvx    v9,r3,r9
561 err3;   stvx    v10,r3,r10
562 err3;   stvx    v11,r3,r11
563         addi    r3,r3,64
565 7:      sub     r5,r5,r6
566         srdi    r6,r5,7
/* r14-r16 as extra offset registers (80/96/112); restored by err4. */
568         std     r14,STK_REG(R14)(r1)
569         std     r15,STK_REG(R15)(r1)
570         std     r16,STK_REG(R16)(r1)
572         li      r12,64
573         li      r14,80
574         li      r15,96
575         li      r16,112
577         mtctr   r6
579         /*
580          * Now do cacheline sized loads and stores. By this stage the
581          * cacheline stores are also cacheline aligned.
582          */
583         .align  5
/* NOTE(review): the loop label `8:` (original line 584) was lost in
 * this extraction — `bdnz 8b` below targets it; restore before use. */
585 err4;   lvx     v7,0,r4
586         VPERM(v8,v0,v7,v16)
587 err4;   lvx     v6,r4,r9
588         VPERM(v9,v7,v6,v16)
589 err4;   lvx     v5,r4,r10
590         VPERM(v10,v6,v5,v16)
591 err4;   lvx     v4,r4,r11
592         VPERM(v11,v5,v4,v16)
593 err4;   lvx     v3,r4,r12
594         VPERM(v12,v4,v3,v16)
595 err4;   lvx     v2,r4,r14
596         VPERM(v13,v3,v2,v16)
597 err4;   lvx     v1,r4,r15
598         VPERM(v14,v2,v1,v16)
599 err4;   lvx     v0,r4,r16
600         VPERM(v15,v1,v0,v16)
601         addi    r4,r4,128
602 err4;   stvx    v8,0,r3
603 err4;   stvx    v9,r3,r9
604 err4;   stvx    v10,r3,r10
605 err4;   stvx    v11,r3,r11
606 err4;   stvx    v12,r3,r12
607 err4;   stvx    v13,r3,r14
608 err4;   stvx    v14,r3,r15
609 err4;   stvx    v15,r3,r16
610         addi    r3,r3,128
611         bdnz    8b
613         ld      r14,STK_REG(R14)(r1)
614         ld      r15,STK_REG(R15)(r1)
615         ld      r16,STK_REG(R16)(r1)
617         /* Up to 127B to go */
618         clrldi  r5,r5,(64-7)
619         srdi    r6,r5,4
620         mtocrf  0x01,r6
622         bf      cr7*4+1,9f
623 err3;   lvx     v3,0,r4
624         VPERM(v8,v0,v3,v16)
625 err3;   lvx     v2,r4,r9
626         VPERM(v9,v3,v2,v16)
627 err3;   lvx     v1,r4,r10
628         VPERM(v10,v2,v1,v16)
629 err3;   lvx     v0,r4,r11
630         VPERM(v11,v1,v0,v16)
631         addi    r4,r4,64
632 err3;   stvx    v8,0,r3
633 err3;   stvx    v9,r3,r9
634 err3;   stvx    v10,r3,r10
635 err3;   stvx    v11,r3,r11
636         addi    r3,r3,64
638 9:      bf      cr7*4+2,10f
639 err3;   lvx     v1,0,r4
640         VPERM(v8,v0,v1,v16)
641 err3;   lvx     v0,r4,r9
642         VPERM(v9,v1,v0,v16)
643         addi    r4,r4,32
644 err3;   stvx    v8,0,r3
645 err3;   stvx    v9,r3,r9
646         addi    r3,r3,32
648 10:     bf      cr7*4+3,11f
649 err3;   lvx     v1,0,r4
650         VPERM(v8,v0,v1,v16)
651         addi    r4,r4,16
652 err3;   stvx    v8,0,r3
653         addi    r3,r3,16
655         /* Up to 15B to go */
656 11:     clrldi  r5,r5,(64-4)
657         addi    r4,r4,-16       /* Unwind the +16 load offset */
658         mtocrf  0x01,r5
659         bf      cr7*4+0,12f
660 err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
661 err3;   lwz     r6,4(r4)
662         addi    r4,r4,8
663 err3;   stw     r0,0(r3)
664 err3;   stw     r6,4(r3)
665         addi    r3,r3,8
667 12:     bf      cr7*4+1,13f
668 err3;   lwz     r0,0(r4)
669         addi    r4,r4,4
670 err3;   stw     r0,0(r3)
671         addi    r3,r3,4
673 13:     bf      cr7*4+2,14f
674 err3;   lhz     r0,0(r4)
675         addi    r4,r4,2
676 err3;   sth     r0,0(r3)
677         addi    r3,r3,2
679 14:     bf      cr7*4+3,15f
680 err3;   lbz     r0,0(r4)
681 err3;   stb     r0,0(r3)
/* Done: pop frame; exit_vmx_usercopy returns 0 = success. */
683 15:     addi    r1,r1,STACKFRAMESIZE
684         b       CFUNC(exit_vmx_usercopy)        /* tail call optimise */
685 #endif /* CONFIG_ALTIVEC */