io_uring: ensure finish_wait() is always called in __io_uring_task_cancel()
[linux/fpc-iii.git] / arch / powerpc / lib / memcpy_power7.S
blob54f226333c9422f4f95f025ee00c18074cc81b55
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  *
4  * Copyright (C) IBM Corporation, 2012
5  *
6  * Author: Anton Blanchard <anton@au.ibm.com>
7  */
8 #include <asm/ppc_asm.h>
/*
 * Default for SELFTEST_CASE when the build has not already defined it.
 * The value is assigned to test_feature just before the CPU_FTR_ALTIVEC
 * feature section in memcpy_power7.
 */
10 #ifndef SELFTEST_CASE
11 /* 0 == don't use VMX, 1 == use VMX */
12 #define SELFTEST_CASE   0
13 #endif
/*
 * Endian-dependent wrappers used by the unaligned VMX copy path.
 *
 * LVS(VRT,RA,RB)         - builds the permute control vector: lvsl on
 *                          big-endian builds, lvsr on little-endian builds.
 * VPERM(VRT,VRA,VRB,VRC) - vperm; on little-endian builds the two source
 *                          vector operands (VRA, VRB) are swapped.
 */
15 #ifdef __BIG_ENDIAN__
16 #define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
17 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
18 #else
19 #define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
20 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
21 #endif
/*
 * memcpy_power7 entry point.
 *
 * Register use as seen in this file:
 *   r3 = destination pointer (stored through)
 *   r4 = source pointer (loaded from)
 *   r5 = number of bytes to copy
 *
 * The incoming r3 is stashed below the stack pointer so that each exit
 * path can reload the original destination pointer into r3.
 *
 * Dispatch:
 *   length <  16   -> .Lshort_copy
 *   length > 4096  -> .Lvmx_copy (only inside the CPU_FTR_ALTIVEC section)
 *   otherwise      -> falls through to .Lnonvmx_copy
 */
23 _GLOBAL(memcpy_power7)
24         cmpldi  r5,16                   /* cr0: length vs 16 */
25         cmpldi  cr1,r5,4096             /* cr1: length vs 4096 */
26         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* remember original dest */
27         blt     .Lshort_copy            /* fewer than 16 bytes */
29 #ifdef CONFIG_ALTIVEC
/*
 * NOTE(review): BEGIN_FTR_SECTION / END_FTR_SECTION_IFSET are defined
 * outside this file; presumably the branch below is only live when
 * CPU_FTR_ALTIVEC is set (or test_feature selects it) - confirm.
 */
30 test_feature = SELFTEST_CASE
31 BEGIN_FTR_SECTION
32         bgt     cr1, .Lvmx_copy         /* more than 4096 bytes */
33 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
34 #endif
/*
 * .Lnonvmx_copy: integer-register copy.
 *
 * Steps:
 *   1. Copy 1/2/4 bytes as needed until the source (r4) is 8B aligned.
 *   2. If at least 128 bytes remain, open a stack frame, save r14-r22 and
 *      copy whole 128B cachelines with CTR as the loop counter.
 *   3. Copy the remaining 64/32/16 byte pieces, then fall through to
 *      .Lshort_copy with r5 reduced to the last 0-15 bytes.
 *
 * cr7 is loaded with mtocrf from the low four bits of a count; within cr7,
 * bit +0 is the 8s place, +1 the 4s, +2 the 2s and +3 the 1s.
 */
36 .Lnonvmx_copy:
37         /* Get the source 8B aligned */
38         neg     r6,r4
39         mtocrf  0x01,r6                 /* cr7 = low 4 bits of -src */
40         clrldi  r6,r6,(64-3)            /* r6 = bytes needed to reach 8B alignment */
42         bf      cr7*4+3,1f
43         lbz     r0,0(r4)                /* 1 byte */
44         addi    r4,r4,1
45         stb     r0,0(r3)
46         addi    r3,r3,1
48 1:      bf      cr7*4+2,2f
49         lhz     r0,0(r4)                /* 2 bytes */
50         addi    r4,r4,2
51         sth     r0,0(r3)
52         addi    r3,r3,2
54 2:      bf      cr7*4+1,3f
55         lwz     r0,0(r4)                /* 4 bytes */
56         addi    r4,r4,4
57         stw     r0,0(r3)
58         addi    r3,r3,4
60 3:      sub     r5,r5,r6                /* account for the alignment bytes */
61         cmpldi  r5,128
62         blt     5f                      /* less than one cacheline left */
        /*
         * Open a stack frame and save the non-volatile registers used as
         * extra data registers by the loop. r22 is saved and restored as
         * well, although the loop below only uses r14-r21. LR is stored
         * into the caller's frame; it is not modified on this path.
         */
64         mflr    r0
65         stdu    r1,-STACKFRAMESIZE(r1)
66         std     r14,STK_REG(R14)(r1)
67         std     r15,STK_REG(R15)(r1)
68         std     r16,STK_REG(R16)(r1)
69         std     r17,STK_REG(R17)(r1)
70         std     r18,STK_REG(R18)(r1)
71         std     r19,STK_REG(R19)(r1)
72         std     r20,STK_REG(R20)(r1)
73         std     r21,STK_REG(R21)(r1)
74         std     r22,STK_REG(R22)(r1)
75         std     r0,STACKFRAMESIZE+16(r1)
77         srdi    r6,r5,7                 /* number of whole 128B chunks */
78         mtctr   r6
80         /* Now do cacheline (128B) sized loads and stores. */
81         .align  5
        /* Loop head: target of the "bdnz 4b" at the bottom of the loop. */
82 4:
83         ld      r0,0(r4)
84         ld      r6,8(r4)
85         ld      r7,16(r4)
86         ld      r8,24(r4)
87         ld      r9,32(r4)
88         ld      r10,40(r4)
89         ld      r11,48(r4)
90         ld      r12,56(r4)
91         ld      r14,64(r4)
92         ld      r15,72(r4)
93         ld      r16,80(r4)
94         ld      r17,88(r4)
95         ld      r18,96(r4)
96         ld      r19,104(r4)
97         ld      r20,112(r4)
98         ld      r21,120(r4)
99         addi    r4,r4,128
100         std     r0,0(r3)
101         std     r6,8(r3)
102         std     r7,16(r3)
103         std     r8,24(r3)
104         std     r9,32(r3)
105         std     r10,40(r3)
106         std     r11,48(r3)
107         std     r12,56(r3)
108         std     r14,64(r3)
109         std     r15,72(r3)
110         std     r16,80(r3)
111         std     r17,88(r3)
112         std     r18,96(r3)
113         std     r19,104(r3)
114         std     r20,112(r3)
115         std     r21,120(r3)
116         addi    r3,r3,128
117         bdnz    4b                      /* next 128B chunk while CTR != 0 */
119         clrldi  r5,r5,(64-7)            /* r5 = remaining length mod 128 */
        /* Restore the saved registers and pop the frame. */
121         ld      r14,STK_REG(R14)(r1)
122         ld      r15,STK_REG(R15)(r1)
123         ld      r16,STK_REG(R16)(r1)
124         ld      r17,STK_REG(R17)(r1)
125         ld      r18,STK_REG(R18)(r1)
126         ld      r19,STK_REG(R19)(r1)
127         ld      r20,STK_REG(R20)(r1)
128         ld      r21,STK_REG(R21)(r1)
129         ld      r22,STK_REG(R22)(r1)
130         addi    r1,r1,STACKFRAMESIZE
132         /* Up to 127B to go */
133 5:      srdi    r6,r5,4
134         mtocrf  0x01,r6                 /* cr7 = (length / 16): 64/32/16 byte flags */
136 6:      bf      cr7*4+1,7f
137         ld      r0,0(r4)                /* 64 bytes */
138         ld      r6,8(r4)
139         ld      r7,16(r4)
140         ld      r8,24(r4)
141         ld      r9,32(r4)
142         ld      r10,40(r4)
143         ld      r11,48(r4)
144         ld      r12,56(r4)
145         addi    r4,r4,64
146         std     r0,0(r3)
147         std     r6,8(r3)
148         std     r7,16(r3)
149         std     r8,24(r3)
150         std     r9,32(r3)
151         std     r10,40(r3)
152         std     r11,48(r3)
153         std     r12,56(r3)
154         addi    r3,r3,64
156         /* Up to 63B to go */
157 7:      bf      cr7*4+2,8f
158         ld      r0,0(r4)                /* 32 bytes */
159         ld      r6,8(r4)
160         ld      r7,16(r4)
161         ld      r8,24(r4)
162         addi    r4,r4,32
163         std     r0,0(r3)
164         std     r6,8(r3)
165         std     r7,16(r3)
166         std     r8,24(r3)
167         addi    r3,r3,32
169         /* Up to 31B to go */
170 8:      bf      cr7*4+3,9f
171         ld      r0,0(r4)                /* 16 bytes */
172         ld      r6,8(r4)
173         addi    r4,r4,16
174         std     r0,0(r3)
175         std     r6,8(r3)
176         addi    r3,r3,16
178 9:      clrldi  r5,r5,(64-4)            /* r5 = remaining length mod 16 */
180         /* Up to 15B to go */
/*
 * .Lshort_copy: copy the final 0-15 bytes and return.
 *
 * Reached directly from the entry point when the length is below 16, or
 * by falling through from the non-VMX tail with r5 already reduced to
 * the low four bits. The low four bits of r5 are moved into cr7 and each
 * bit selects an 8, 4, 2 or 1 byte copy. On exit r3 is reloaded with the
 * destination pointer that the entry point saved.
 */
181 .Lshort_copy:
182         mtocrf  0x01,r5                 /* cr7 = low 4 bits of the length */
183         bf      cr7*4+0,12f
184         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
185         lwz     r6,4(r4)
186         addi    r4,r4,8
187         stw     r0,0(r3)
188         stw     r6,4(r3)
189         addi    r3,r3,8
191 12:     bf      cr7*4+1,13f
192         lwz     r0,0(r4)                /* 4 bytes */
193         addi    r4,r4,4
194         stw     r0,0(r3)
195         addi    r3,r3,4
197 13:     bf      cr7*4+2,14f
198         lhz     r0,0(r4)                /* 2 bytes */
199         addi    r4,r4,2
200         sth     r0,0(r3)
201         addi    r3,r3,2
203 14:     bf      cr7*4+3,15f
204         lbz     r0,0(r4)                /* last byte; pointers need no update */
205         stb     r0,0(r3)
207 15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* r3 = original dest */
208         blr
/*
 * Fallback taken from .Lvmx_copy when enter_vmx_ops returned 0: pop the
 * stack frame that .Lvmx_copy created and redo the copy with the integer
 * path. r3/r4/r5 were reloaded by .Lvmx_copy before branching here.
 */
210 .Lunwind_stack_nonvmx_copy:
211         addi    r1,r1,STACKFRAMESIZE
212         b       .Lnonvmx_copy
/*
 * .Lvmx_copy: VMX (vector) copy, entered for lengths above 4096 bytes.
 *
 * Steps:
 *   1. Save r4, r5 and LR, open a stack frame and call enter_vmx_ops.
 *      Its result is kept in cr1; the arguments and LR are reloaded.
 *   2. Start prefetch streams for source and destination (issued before
 *      the enter_vmx_ops result is acted on).
 *   3. If enter_vmx_ops returned 0, unwind and use the non-VMX copy.
 *   4. If source and destination differ in their low four address bits,
 *      go to .Lvmx_unaligned_copy.
 *   5. Otherwise align the destination to 16B, then to 128B, copy whole
 *      128B cachelines with lvx/stvx, copy the tail, pop the frame and
 *      tail-call exit_vmx_ops with r3 = original destination.
 */
214 .Lvmx_copy:
215 #ifdef CONFIG_ALTIVEC
216         mflr    r0
217         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)     /* save source */
218         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)     /* save length */
219         std     r0,16(r1)                               /* save LR in caller's frame */
220         stdu    r1,-STACKFRAMESIZE(r1)
221         bl      enter_vmx_ops
222         cmpwi   cr1,r3,0                /* cr1: did enter_vmx_ops return 0? */
223         ld      r0,STACKFRAMESIZE+16(r1)
224         ld      r3,STK_REG(R31)(r1)     /* dest saved by the entry point */
225         ld      r4,STK_REG(R30)(r1)
226         ld      r5,STK_REG(R29)(r1)
227         mtlr    r0
229         /*
230          * We prefetch both the source and destination using enhanced touch
231          * instructions. We use a stream ID of 0 for the load side and
232          * 1 for the store side.
233          */
234         clrrdi  r6,r4,7                 /* source rounded down to 128B */
235         clrrdi  r9,r3,7                 /* dest rounded down to 128B */
236         ori     r9,r9,1         /* stream=1 */
238         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
239         cmpldi  r7,0x3FF
240         ble     1f
241         li      r7,0x3FF
242 1:      lis     r0,0x0E00       /* depth=7 */
243         sldi    r7,r7,7
244         or      r7,r7,r0                /* r7 = length/depth word, stream 0 */
245         ori     r10,r7,1        /* stream=1 */
247         lis     r8,0x8000       /* GO=1 */
248         clrldi  r8,r8,32                /* drop the sign-extended upper 32 bits */
250         dcbt    0,r6,0b01000            /* load stream: start address */
251         dcbt    0,r7,0b01010            /* load stream: length/depth */
252         dcbtst  0,r9,0b01000            /* store stream: start address */
253         dcbtst  0,r10,0b01010           /* store stream: length/depth */
254         eieio
255         dcbt    0,r8,0b01010    /* GO */
257         beq     cr1,.Lunwind_stack_nonvmx_copy  /* enter_vmx_ops returned 0 */
259         /*
260          * If source and destination are not relatively aligned we use a
261          * slower permute loop.
262          */
263         xor     r6,r4,r3
264         rldicl. r6,r6,0,(64-4)          /* low 4 bits of (src ^ dest), sets cr0 */
265         bne     .Lvmx_unaligned_copy
267         /* Get the destination 16B aligned */
268         neg     r6,r3
269         mtocrf  0x01,r6                 /* cr7 = low 4 bits of -dest */
270         clrldi  r6,r6,(64-4)            /* r6 = bytes needed to reach 16B alignment */
272         bf      cr7*4+3,1f
273         lbz     r0,0(r4)                /* 1 byte */
274         addi    r4,r4,1
275         stb     r0,0(r3)
276         addi    r3,r3,1
278 1:      bf      cr7*4+2,2f
279         lhz     r0,0(r4)                /* 2 bytes */
280         addi    r4,r4,2
281         sth     r0,0(r3)
282         addi    r3,r3,2
284 2:      bf      cr7*4+1,3f
285         lwz     r0,0(r4)                /* 4 bytes */
286         addi    r4,r4,4
287         stw     r0,0(r3)
288         addi    r3,r3,4
290 3:      bf      cr7*4+0,4f
291         ld      r0,0(r4)                /* 8 bytes */
292         addi    r4,r4,8
293         std     r0,0(r3)
294         addi    r3,r3,8
296 4:      sub     r5,r5,r6                /* account for the alignment bytes */
298         /* Get the destination 128B aligned */
299         neg     r6,r3
300         srdi    r7,r6,4
301         mtocrf  0x01,r7                 /* cr7 = 64/32/16 byte flags */
302         clrldi  r6,r6,(64-7)            /* r6 = bytes needed to reach 128B alignment */
        /* Index registers for the lvx/stvx forms below. */
304         li      r9,16
305         li      r10,32
306         li      r11,48
308         bf      cr7*4+3,5f
309         lvx     v1,0,r4                 /* 16 bytes */
310         addi    r4,r4,16
311         stvx    v1,0,r3
312         addi    r3,r3,16
314 5:      bf      cr7*4+2,6f
315         lvx     v1,0,r4                 /* 32 bytes */
316         lvx     v0,r4,r9
317         addi    r4,r4,32
318         stvx    v1,0,r3
319         stvx    v0,r3,r9
320         addi    r3,r3,32
322 6:      bf      cr7*4+1,7f
323         lvx     v3,0,r4                 /* 64 bytes */
324         lvx     v2,r4,r9
325         lvx     v1,r4,r10
326         lvx     v0,r4,r11
327         addi    r4,r4,64
328         stvx    v3,0,r3
329         stvx    v2,r3,r9
330         stvx    v1,r3,r10
331         stvx    v0,r3,r11
332         addi    r3,r3,64
334 7:      sub     r5,r5,r6                /* account for the alignment bytes */
335         srdi    r6,r5,7                 /* number of whole 128B cachelines */
        /* r14-r16 are used as further index registers, so save them first. */
337         std     r14,STK_REG(R14)(r1)
338         std     r15,STK_REG(R15)(r1)
339         std     r16,STK_REG(R16)(r1)
341         li      r12,64
342         li      r14,80
343         li      r15,96
344         li      r16,112
346         mtctr   r6
348         /*
349          * Now do cacheline sized loads and stores. By this stage the
350          * cacheline stores are also cacheline aligned.
351          */
352         .align  5
        /* Loop head: target of the "bdnz 8b" at the bottom of the loop. */
353 8:
354         lvx     v7,0,r4
355         lvx     v6,r4,r9
356         lvx     v5,r4,r10
357         lvx     v4,r4,r11
358         lvx     v3,r4,r12
359         lvx     v2,r4,r14
360         lvx     v1,r4,r15
361         lvx     v0,r4,r16
362         addi    r4,r4,128
363         stvx    v7,0,r3
364         stvx    v6,r3,r9
365         stvx    v5,r3,r10
366         stvx    v4,r3,r11
367         stvx    v3,r3,r12
368         stvx    v2,r3,r14
369         stvx    v1,r3,r15
370         stvx    v0,r3,r16
371         addi    r3,r3,128
372         bdnz    8b                      /* next cacheline while CTR != 0 */
374         ld      r14,STK_REG(R14)(r1)
375         ld      r15,STK_REG(R15)(r1)
376         ld      r16,STK_REG(R16)(r1)
378         /* Up to 127B to go */
379         clrldi  r5,r5,(64-7)            /* r5 = remaining length mod 128 */
380         srdi    r6,r5,4
381         mtocrf  0x01,r6                 /* cr7 = 64/32/16 byte flags */
383         bf      cr7*4+1,9f
384         lvx     v3,0,r4                 /* 64 bytes */
385         lvx     v2,r4,r9
386         lvx     v1,r4,r10
387         lvx     v0,r4,r11
388         addi    r4,r4,64
389         stvx    v3,0,r3
390         stvx    v2,r3,r9
391         stvx    v1,r3,r10
392         stvx    v0,r3,r11
393         addi    r3,r3,64
395 9:      bf      cr7*4+2,10f
396         lvx     v1,0,r4                 /* 32 bytes */
397         lvx     v0,r4,r9
398         addi    r4,r4,32
399         stvx    v1,0,r3
400         stvx    v0,r3,r9
401         addi    r3,r3,32
403 10:     bf      cr7*4+3,11f
404         lvx     v1,0,r4                 /* 16 bytes */
405         addi    r4,r4,16
406         stvx    v1,0,r3
407         addi    r3,r3,16
409         /* Up to 15B to go */
410 11:     clrldi  r5,r5,(64-4)            /* r5 = remaining length mod 16 */
411         mtocrf  0x01,r5
412         bf      cr7*4+0,12f
413         ld      r0,0(r4)                /* 8 bytes */
414         addi    r4,r4,8
415         std     r0,0(r3)
416         addi    r3,r3,8
418 12:     bf      cr7*4+1,13f
419         lwz     r0,0(r4)                /* 4 bytes */
420         addi    r4,r4,4
421         stw     r0,0(r3)
422         addi    r3,r3,4
424 13:     bf      cr7*4+2,14f
425         lhz     r0,0(r4)                /* 2 bytes */
426         addi    r4,r4,2
427         sth     r0,0(r3)
428         addi    r3,r3,2
430 14:     bf      cr7*4+3,15f
431         lbz     r0,0(r4)                /* last byte */
432         stb     r0,0(r3)
434 15:     addi    r1,r1,STACKFRAMESIZE    /* pop the frame opened above */
435         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* r3 = original dest */
436         b       exit_vmx_ops            /* tail call optimise */
/*
 * .Lvmx_unaligned_copy: VMX copy for the case where source and
 * destination differ in their low four address bits.
 *
 * Runs inside the stack frame opened by .Lvmx_copy. The destination is
 * aligned first to 16B and then to 128B, as in the aligned path. Source
 * data is fetched with lvx and realigned with VPERM under the control
 * vector v16 built by LVS from the source address.
 *
 * Invariant from the initial "lvx v0" onwards: v0 holds the most recently
 * loaded source quadword and r4 is 16 bytes ahead of the data consumed so
 * far. Each output quadword is VPERM(previous, next, v16). The +16 offset
 * is removed again at label 11 before the scalar tail.
 */
438 .Lvmx_unaligned_copy:
439         /* Get the destination 16B aligned */
440         neg     r6,r3
441         mtocrf  0x01,r6                 /* cr7 = low 4 bits of -dest */
442         clrldi  r6,r6,(64-4)            /* r6 = bytes needed to reach 16B alignment */
444         bf      cr7*4+3,1f
445         lbz     r0,0(r4)                /* 1 byte */
446         addi    r4,r4,1
447         stb     r0,0(r3)
448         addi    r3,r3,1
450 1:      bf      cr7*4+2,2f
451         lhz     r0,0(r4)                /* 2 bytes */
452         addi    r4,r4,2
453         sth     r0,0(r3)
454         addi    r3,r3,2
456 2:      bf      cr7*4+1,3f
457         lwz     r0,0(r4)                /* 4 bytes */
458         addi    r4,r4,4
459         stw     r0,0(r3)
460         addi    r3,r3,4
462 3:      bf      cr7*4+0,4f
463         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
464         lwz     r7,4(r4)
465         addi    r4,r4,8
466         stw     r0,0(r3)
467         stw     r7,4(r3)
468         addi    r3,r3,8
470 4:      sub     r5,r5,r6                /* account for the alignment bytes */
472         /* Get the destination 128B aligned */
473         neg     r6,r3
474         srdi    r7,r6,4
475         mtocrf  0x01,r7                 /* cr7 = 64/32/16 byte flags */
476         clrldi  r6,r6,(64-7)            /* r6 = bytes needed to reach 128B alignment */
        /* Index registers for the lvx/stvx forms below. */
478         li      r9,16
479         li      r10,32
480         li      r11,48
482         LVS(v16,0,r4)           /* Setup permute control vector */
483         lvx     v0,0,r4                 /* prime v0 with the first source quadword */
484         addi    r4,r4,16                /* r4 now runs 16 bytes ahead */
486         bf      cr7*4+3,5f
487         lvx     v1,0,r4                 /* 16 bytes */
488         VPERM(v8,v0,v1,v16)
489         addi    r4,r4,16
490         stvx    v8,0,r3
491         addi    r3,r3,16
492         vor     v0,v1,v1                /* v0 = v1: carry the last quadword over */
494 5:      bf      cr7*4+2,6f
495         lvx     v1,0,r4                 /* 32 bytes */
496         VPERM(v8,v0,v1,v16)
497         lvx     v0,r4,r9
498         VPERM(v9,v1,v0,v16)
499         addi    r4,r4,32
500         stvx    v8,0,r3
501         stvx    v9,r3,r9
502         addi    r3,r3,32
504 6:      bf      cr7*4+1,7f
505         lvx     v3,0,r4                 /* 64 bytes */
506         VPERM(v8,v0,v3,v16)
507         lvx     v2,r4,r9
508         VPERM(v9,v3,v2,v16)
509         lvx     v1,r4,r10
510         VPERM(v10,v2,v1,v16)
511         lvx     v0,r4,r11
512         VPERM(v11,v1,v0,v16)
513         addi    r4,r4,64
514         stvx    v8,0,r3
515         stvx    v9,r3,r9
516         stvx    v10,r3,r10
517         stvx    v11,r3,r11
518         addi    r3,r3,64
520 7:      sub     r5,r5,r6                /* account for the alignment bytes */
521         srdi    r6,r5,7                 /* number of whole 128B cachelines */
        /* r14-r16 are used as further index registers, so save them first. */
523         std     r14,STK_REG(R14)(r1)
524         std     r15,STK_REG(R15)(r1)
525         std     r16,STK_REG(R16)(r1)
527         li      r12,64
528         li      r14,80
529         li      r15,96
530         li      r16,112
532         mtctr   r6
534         /*
535          * Now do cacheline sized loads and stores. By this stage the
536          * cacheline stores are also cacheline aligned.
537          */
538         .align  5
        /* Loop head: target of the "bdnz 8b" at the bottom of the loop. */
539 8:
540         lvx     v7,0,r4
541         VPERM(v8,v0,v7,v16)
542         lvx     v6,r4,r9
543         VPERM(v9,v7,v6,v16)
544         lvx     v5,r4,r10
545         VPERM(v10,v6,v5,v16)
546         lvx     v4,r4,r11
547         VPERM(v11,v5,v4,v16)
548         lvx     v3,r4,r12
549         VPERM(v12,v4,v3,v16)
550         lvx     v2,r4,r14
551         VPERM(v13,v3,v2,v16)
552         lvx     v1,r4,r15
553         VPERM(v14,v2,v1,v16)
554         lvx     v0,r4,r16
555         VPERM(v15,v1,v0,v16)
556         addi    r4,r4,128
557         stvx    v8,0,r3
558         stvx    v9,r3,r9
559         stvx    v10,r3,r10
560         stvx    v11,r3,r11
561         stvx    v12,r3,r12
562         stvx    v13,r3,r14
563         stvx    v14,r3,r15
564         stvx    v15,r3,r16
565         addi    r3,r3,128
566         bdnz    8b                      /* next cacheline while CTR != 0 */
568         ld      r14,STK_REG(R14)(r1)
569         ld      r15,STK_REG(R15)(r1)
570         ld      r16,STK_REG(R16)(r1)
572         /* Up to 127B to go */
573         clrldi  r5,r5,(64-7)            /* r5 = remaining length mod 128 */
574         srdi    r6,r5,4
575         mtocrf  0x01,r6                 /* cr7 = 64/32/16 byte flags */
577         bf      cr7*4+1,9f
578         lvx     v3,0,r4                 /* 64 bytes */
579         VPERM(v8,v0,v3,v16)
580         lvx     v2,r4,r9
581         VPERM(v9,v3,v2,v16)
582         lvx     v1,r4,r10
583         VPERM(v10,v2,v1,v16)
584         lvx     v0,r4,r11
585         VPERM(v11,v1,v0,v16)
586         addi    r4,r4,64
587         stvx    v8,0,r3
588         stvx    v9,r3,r9
589         stvx    v10,r3,r10
590         stvx    v11,r3,r11
591         addi    r3,r3,64
593 9:      bf      cr7*4+2,10f
594         lvx     v1,0,r4                 /* 32 bytes */
595         VPERM(v8,v0,v1,v16)
596         lvx     v0,r4,r9
597         VPERM(v9,v1,v0,v16)
598         addi    r4,r4,32
599         stvx    v8,0,r3
600         stvx    v9,r3,r9
601         addi    r3,r3,32
603 10:     bf      cr7*4+3,11f
604         lvx     v1,0,r4                 /* 16 bytes */
605         VPERM(v8,v0,v1,v16)
606         addi    r4,r4,16
607         stvx    v8,0,r3
608         addi    r3,r3,16
610         /* Up to 15B to go */
611 11:     clrldi  r5,r5,(64-4)            /* r5 = remaining length mod 16 */
612         addi    r4,r4,-16       /* Unwind the +16 load offset */
613         mtocrf  0x01,r5
614         bf      cr7*4+0,12f
615         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
616         lwz     r6,4(r4)
617         addi    r4,r4,8
618         stw     r0,0(r3)
619         stw     r6,4(r3)
620         addi    r3,r3,8
622 12:     bf      cr7*4+1,13f
623         lwz     r0,0(r4)                /* 4 bytes */
624         addi    r4,r4,4
625         stw     r0,0(r3)
626         addi    r3,r3,4
628 13:     bf      cr7*4+2,14f
629         lhz     r0,0(r4)                /* 2 bytes */
630         addi    r4,r4,2
631         sth     r0,0(r3)
632         addi    r3,r3,2
634 14:     bf      cr7*4+3,15f
635         lbz     r0,0(r4)                /* last byte */
636         stb     r0,0(r3)
638 15:     addi    r1,r1,STACKFRAMESIZE    /* pop the frame opened by .Lvmx_copy */
639         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* r3 = original dest */
640         b       exit_vmx_ops            /* tail call optimise */
641 #endif /* CONFIG_ALTIVEC */