Linux 3.17-rc2
[linux/fpc-iii.git] / arch / powerpc / lib / memcpy_power7.S
blob2ff5c142f87ba061257f1f00fcc948bc4cb04780
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software
14  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15  *
16  * Copyright (C) IBM Corporation, 2012
17  *
18  * Author: Anton Blanchard <anton@au.ibm.com>
19  */
20 #include <asm/ppc_asm.h>
22 _GLOBAL(memcpy_power7)          /* In: r3 = dest, r4 = src, r5 = byte count */
24 #ifdef __BIG_ENDIAN__           /* LVS/VPERM: endian-dependent helpers used by the unaligned VMX path */
25 #define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
26 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
27 #else                           /* little endian: lvsr, and the two vperm sources are swapped */
28 #define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
29 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
30 #endif
32 #ifdef CONFIG_ALTIVEC
33         cmpldi  r5,16
34         cmpldi  cr1,r5,4096
36         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* stash dest below the stack pointer; reloaded into r3 on the way out */
38         blt     .Lshort_copy    /* fewer than 16 bytes */
39         bgt     cr1,.Lvmx_copy  /* more than 4096 bytes: try the VMX path */
40 #else
41         cmpldi  r5,16
43         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* stash dest below the stack pointer; reloaded into r3 on the way out */
45         blt     .Lshort_copy    /* fewer than 16 bytes */
46 #endif                          /* anything else falls through to .Lnonvmx_copy */
48 .Lnonvmx_copy:                  /* GPR-only copy; reached with r5 >= 16 */
49         /* Get the source 8B aligned: (-src) & 7 bytes, copied as 1 + 2 + 4 */
50         neg     r6,r4
51         mtocrf  0x01,r6         /* cr7 = low 4 bits of -src */
52         clrldi  r6,r6,(64-3)    /* r6 = number of alignment bytes */
54         bf      cr7*4+3,1f
55         lbz     r0,0(r4)
56         addi    r4,r4,1
57         stb     r0,0(r3)
58         addi    r3,r3,1
60 1:      bf      cr7*4+2,2f
61         lhz     r0,0(r4)
62         addi    r4,r4,2
63         sth     r0,0(r3)
64         addi    r3,r3,2
66 2:      bf      cr7*4+1,3f
67         lwz     r0,0(r4)
68         addi    r4,r4,4
69         stw     r0,0(r3)
70         addi    r3,r3,4
72 3:      sub     r5,r5,r6        /* r5 = bytes left after the alignment copies */
73         cmpldi  r5,128
74         blt     5f              /* less than one 128B block: skip the unrolled loop */
76         mflr    r0
77         stdu    r1,-STACKFRAMESIZE(r1)
78         std     r14,STK_REG(R14)(r1)
79         std     r15,STK_REG(R15)(r1)
80         std     r16,STK_REG(R16)(r1)
81         std     r17,STK_REG(R17)(r1)
82         std     r18,STK_REG(R18)(r1)
83         std     r19,STK_REG(R19)(r1)
84         std     r20,STK_REG(R20)(r1)
85         std     r21,STK_REG(R21)(r1)
86         std     r22,STK_REG(R22)(r1)    /* saved and restored, although the loop below only uses r14-r21 */
87         std     r0,STACKFRAMESIZE+16(r1)        /* LR stored at 16(old r1) */
89         srdi    r6,r5,7         /* r6 = number of whole 128B blocks */
90         mtctr   r6
92         /* Now do cacheline (128B) sized loads and stores. */
93         .align  5
94 4:
95         ld      r0,0(r4)
96         ld      r6,8(r4)
97         ld      r7,16(r4)
98         ld      r8,24(r4)
99         ld      r9,32(r4)
100         ld      r10,40(r4)
101         ld      r11,48(r4)
102         ld      r12,56(r4)
103         ld      r14,64(r4)
104         ld      r15,72(r4)
105         ld      r16,80(r4)
106         ld      r17,88(r4)
107         ld      r18,96(r4)
108         ld      r19,104(r4)
109         ld      r20,112(r4)
110         ld      r21,120(r4)
111         addi    r4,r4,128
112         std     r0,0(r3)
113         std     r6,8(r3)
114         std     r7,16(r3)
115         std     r8,24(r3)
116         std     r9,32(r3)
117         std     r10,40(r3)
118         std     r11,48(r3)
119         std     r12,56(r3)
120         std     r14,64(r3)
121         std     r15,72(r3)
122         std     r16,80(r3)
123         std     r17,88(r3)
124         std     r18,96(r3)
125         std     r19,104(r3)
126         std     r20,112(r3)
127         std     r21,120(r3)
128         addi    r3,r3,128
129         bdnz    4b              /* loop head is the 4: label directly after .align 5 */
131         clrldi  r5,r5,(64-7)    /* r5 = bytes left, now below 128 */
133         ld      r14,STK_REG(R14)(r1)
134         ld      r15,STK_REG(R15)(r1)
135         ld      r16,STK_REG(R16)(r1)
136         ld      r17,STK_REG(R17)(r1)
137         ld      r18,STK_REG(R18)(r1)
138         ld      r19,STK_REG(R19)(r1)
139         ld      r20,STK_REG(R20)(r1)
140         ld      r21,STK_REG(R21)(r1)
141         ld      r22,STK_REG(R22)(r1)
142         addi    r1,r1,STACKFRAMESIZE
144         /* Up to 127B to go: cr7 = r5 >> 4 selects the 64B, 32B and 16B steps */
145 5:      srdi    r6,r5,4
146         mtocrf  0x01,r6
148 6:      bf      cr7*4+1,7f
149         ld      r0,0(r4)
150         ld      r6,8(r4)
151         ld      r7,16(r4)
152         ld      r8,24(r4)
153         ld      r9,32(r4)
154         ld      r10,40(r4)
155         ld      r11,48(r4)
156         ld      r12,56(r4)
157         addi    r4,r4,64
158         std     r0,0(r3)
159         std     r6,8(r3)
160         std     r7,16(r3)
161         std     r8,24(r3)
162         std     r9,32(r3)
163         std     r10,40(r3)
164         std     r11,48(r3)
165         std     r12,56(r3)
166         addi    r3,r3,64
168         /* Up to 63B to go */
169 7:      bf      cr7*4+2,8f
170         ld      r0,0(r4)
171         ld      r6,8(r4)
172         ld      r7,16(r4)
173         ld      r8,24(r4)
174         addi    r4,r4,32
175         std     r0,0(r3)
176         std     r6,8(r3)
177         std     r7,16(r3)
178         std     r8,24(r3)
179         addi    r3,r3,32
181         /* Up to 31B to go */
182 8:      bf      cr7*4+3,9f
183         ld      r0,0(r4)
184         ld      r6,8(r4)
185         addi    r4,r4,16
186         std     r0,0(r3)
187         std     r6,8(r3)
188         addi    r3,r3,16
190 9:      clrldi  r5,r5,(64-4)    /* r5 = final 0-15 bytes; execution continues into the short-copy tail */
192         /* Up to 15B to go: the low 4 bits of r5 select the 8, 4, 2 and 1 byte copies */
193 .Lshort_copy:
194         mtocrf  0x01,r5         /* cr7 = low 4 bits of the remaining length */
195         bf      cr7*4+0,12f
196         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
197         lwz     r6,4(r4)
198         addi    r4,r4,8
199         stw     r0,0(r3)
200         stw     r6,4(r3)
201         addi    r3,r3,8
203 12:     bf      cr7*4+1,13f
204         lwz     r0,0(r4)
205         addi    r4,r4,4
206         stw     r0,0(r3)
207         addi    r3,r3,4
209 13:     bf      cr7*4+2,14f
210         lhz     r0,0(r4)
211         addi    r4,r4,2
212         sth     r0,0(r3)
213         addi    r3,r3,2
215 14:     bf      cr7*4+3,15f
216         lbz     r0,0(r4)
217         stb     r0,0(r3)
219 15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return value: the dest pointer stashed at entry */
220         blr
222 .Lunwind_stack_nonvmx_copy:     /* taken from .Lvmx_copy when enter_vmx_copy returned 0 */
223         addi    r1,r1,STACKFRAMESIZE    /* pop the frame .Lvmx_copy pushed with stdu */
224         b       .Lnonvmx_copy           /* do the copy with GPRs instead */
226 #ifdef CONFIG_ALTIVEC
227 .Lvmx_copy:                     /* entered for lengths above 4096 bytes */
228         mflr    r0
229         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)     /* keep src across the call */
230         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)     /* keep length across the call */
231         std     r0,16(r1)
232         stdu    r1,-STACKFRAMESIZE(r1)
233         bl      enter_vmx_copy
234         cmpwi   cr1,r3,0        /* cr1 records a zero return; tested after the prefetch setup */
235         ld      r0,STACKFRAMESIZE+16(r1)
236         ld      r3,STK_REG(R31)(r1)
237         ld      r4,STK_REG(R30)(r1)
238         ld      r5,STK_REG(R29)(r1)
239         mtlr    r0
241         /*
242          * We prefetch both the source and destination using enhanced touch
243          * instructions. We use a stream ID of 0 for the load side and
244          * 1 for the store side.
245          */
246         clrrdi  r6,r4,7
247         clrrdi  r9,r3,7
248         ori     r9,r9,1         /* stream=1 */
250         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
251         cmpldi  r7,0x3FF
252         ble     1f
253         li      r7,0x3FF
254 1:      lis     r0,0x0E00       /* depth=7 */
255         sldi    r7,r7,7
256         or      r7,r7,r0
257         ori     r10,r7,1        /* stream=1 */
259         lis     r8,0x8000       /* GO=1 */
260         clrldi  r8,r8,32
262 .machine push
263 .machine "power4"
264         dcbt    r0,r6,0b01000
265         dcbt    r0,r7,0b01010
266         dcbtst  r0,r9,0b01000
267         dcbtst  r0,r10,0b01010
268         eieio
269         dcbt    r0,r8,0b01010   /* GO */
270 .machine pop
272         beq     cr1,.Lunwind_stack_nonvmx_copy  /* enter_vmx_copy returned 0: fall back to the GPR copy */
274         /*
275          * If source and destination are not relatively aligned we use a
276          * slower permute loop.
277          */
278         xor     r6,r4,r3
279         rldicl. r6,r6,0,(64-4)  /* low 4 bits of src ^ dest */
280         bne     .Lvmx_unaligned_copy
282         /* Get the destination 16B aligned */
283         neg     r6,r3
284         mtocrf  0x01,r6
285         clrldi  r6,r6,(64-4)    /* r6 = (-dest) & 15 */
287         bf      cr7*4+3,1f
288         lbz     r0,0(r4)
289         addi    r4,r4,1
290         stb     r0,0(r3)
291         addi    r3,r3,1
293 1:      bf      cr7*4+2,2f
294         lhz     r0,0(r4)
295         addi    r4,r4,2
296         sth     r0,0(r3)
297         addi    r3,r3,2
299 2:      bf      cr7*4+1,3f
300         lwz     r0,0(r4)
301         addi    r4,r4,4
302         stw     r0,0(r3)
303         addi    r3,r3,4
305 3:      bf      cr7*4+0,4f
306         ld      r0,0(r4)
307         addi    r4,r4,8
308         std     r0,0(r3)
309         addi    r3,r3,8
311 4:      sub     r5,r5,r6
313         /* Get the destination 128B aligned */
314         neg     r6,r3
315         srdi    r7,r6,4
316         mtocrf  0x01,r7         /* cr7 = 16B units needed to reach a 128B boundary */
317         clrldi  r6,r6,(64-7)    /* r6 = (-dest) & 127 */
319         li      r9,16
320         li      r10,32
321         li      r11,48
323         bf      cr7*4+3,5f
324         lvx     vr1,r0,r4
325         addi    r4,r4,16
326         stvx    vr1,r0,r3
327         addi    r3,r3,16
329 5:      bf      cr7*4+2,6f
330         lvx     vr1,r0,r4
331         lvx     vr0,r4,r9
332         addi    r4,r4,32
333         stvx    vr1,r0,r3
334         stvx    vr0,r3,r9
335         addi    r3,r3,32
337 6:      bf      cr7*4+1,7f
338         lvx     vr3,r0,r4
339         lvx     vr2,r4,r9
340         lvx     vr1,r4,r10
341         lvx     vr0,r4,r11
342         addi    r4,r4,64
343         stvx    vr3,r0,r3
344         stvx    vr2,r3,r9
345         stvx    vr1,r3,r10
346         stvx    vr0,r3,r11
347         addi    r3,r3,64
349 7:      sub     r5,r5,r6
350         srdi    r6,r5,7         /* r6 = number of whole 128B blocks */
352         std     r14,STK_REG(R14)(r1)
353         std     r15,STK_REG(R15)(r1)
354         std     r16,STK_REG(R16)(r1)
356         li      r12,64
357         li      r14,80
358         li      r15,96
359         li      r16,112
361         mtctr   r6
363         /*
364          * Now do cacheline sized loads and stores. By this stage the
365          * cacheline stores are also cacheline aligned.
366          */
367         .align  5
368 8:
369         lvx     vr7,r0,r4
370         lvx     vr6,r4,r9
371         lvx     vr5,r4,r10
372         lvx     vr4,r4,r11
373         lvx     vr3,r4,r12
374         lvx     vr2,r4,r14
375         lvx     vr1,r4,r15
376         lvx     vr0,r4,r16
377         addi    r4,r4,128
378         stvx    vr7,r0,r3
379         stvx    vr6,r3,r9
380         stvx    vr5,r3,r10
381         stvx    vr4,r3,r11
382         stvx    vr3,r3,r12
383         stvx    vr2,r3,r14
384         stvx    vr1,r3,r15
385         stvx    vr0,r3,r16
386         addi    r3,r3,128
387         bdnz    8b              /* loop head is the 8: label directly after .align 5 */
389         ld      r14,STK_REG(R14)(r1)
390         ld      r15,STK_REG(R15)(r1)
391         ld      r16,STK_REG(R16)(r1)
393         /* Up to 127B to go */
394         clrldi  r5,r5,(64-7)
395         srdi    r6,r5,4
396         mtocrf  0x01,r6
398         bf      cr7*4+1,9f
399         lvx     vr3,r0,r4
400         lvx     vr2,r4,r9
401         lvx     vr1,r4,r10
402         lvx     vr0,r4,r11
403         addi    r4,r4,64
404         stvx    vr3,r0,r3
405         stvx    vr2,r3,r9
406         stvx    vr1,r3,r10
407         stvx    vr0,r3,r11
408         addi    r3,r3,64
410 9:      bf      cr7*4+2,10f
411         lvx     vr1,r0,r4
412         lvx     vr0,r4,r9
413         addi    r4,r4,32
414         stvx    vr1,r0,r3
415         stvx    vr0,r3,r9
416         addi    r3,r3,32
418 10:     bf      cr7*4+3,11f
419         lvx     vr1,r0,r4
420         addi    r4,r4,16
421         stvx    vr1,r0,r3
422         addi    r3,r3,16
424         /* Up to 15B to go */
425 11:     clrldi  r5,r5,(64-4)
426         mtocrf  0x01,r5
427         bf      cr7*4+0,12f
428         ld      r0,0(r4)
429         addi    r4,r4,8
430         std     r0,0(r3)
431         addi    r3,r3,8
433 12:     bf      cr7*4+1,13f
434         lwz     r0,0(r4)
435         addi    r4,r4,4
436         stw     r0,0(r3)
437         addi    r3,r3,4
439 13:     bf      cr7*4+2,14f
440         lhz     r0,0(r4)
441         addi    r4,r4,2
442         sth     r0,0(r3)
443         addi    r3,r3,2
445 14:     bf      cr7*4+3,15f
446         lbz     r0,0(r4)
447         stb     r0,0(r3)
449 15:     addi    r1,r1,STACKFRAMESIZE
450         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* r3 = dest pointer stashed at entry */
451         b       exit_vmx_copy           /* tail call optimise */
453 .Lvmx_unaligned_copy:           /* src and dest differ in their low 4 address bits */
454         /* Get the destination 16B aligned */
455         neg     r6,r3
456         mtocrf  0x01,r6
457         clrldi  r6,r6,(64-4)    /* r6 = (-dest) & 15 */
459         bf      cr7*4+3,1f
460         lbz     r0,0(r4)
461         addi    r4,r4,1
462         stb     r0,0(r3)
463         addi    r3,r3,1
465 1:      bf      cr7*4+2,2f
466         lhz     r0,0(r4)
467         addi    r4,r4,2
468         sth     r0,0(r3)
469         addi    r3,r3,2
471 2:      bf      cr7*4+1,3f
472         lwz     r0,0(r4)
473         addi    r4,r4,4
474         stw     r0,0(r3)
475         addi    r3,r3,4
477 3:      bf      cr7*4+0,4f
478         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
479         lwz     r7,4(r4)
480         addi    r4,r4,8
481         stw     r0,0(r3)
482         stw     r7,4(r3)
483         addi    r3,r3,8
485 4:      sub     r5,r5,r6
487         /* Get the destination 128B aligned */
488         neg     r6,r3
489         srdi    r7,r6,4
490         mtocrf  0x01,r7         /* cr7 = 16B units needed to reach a 128B boundary */
491         clrldi  r6,r6,(64-7)    /* r6 = (-dest) & 127 */
493         li      r9,16
494         li      r10,32
495         li      r11,48
497         LVS(vr16,0,r4)          /* Setup permute control vector */
498         lvx     vr0,0,r4        /* preload; vr0 carries the most recently loaded quadword between steps */
499         addi    r4,r4,16        /* r4 now runs 16 bytes ahead; undone before the byte tail */
501         bf      cr7*4+3,5f
502         lvx     vr1,r0,r4
503         VPERM(vr8,vr0,vr1,vr16)
504         addi    r4,r4,16
505         stvx    vr8,r0,r3
506         addi    r3,r3,16
507         vor     vr0,vr1,vr1     /* vr0 = vr1: keep the newest quadword in vr0 */
509 5:      bf      cr7*4+2,6f
510         lvx     vr1,r0,r4
511         VPERM(vr8,vr0,vr1,vr16)
512         lvx     vr0,r4,r9
513         VPERM(vr9,vr1,vr0,vr16)
514         addi    r4,r4,32
515         stvx    vr8,r0,r3
516         stvx    vr9,r3,r9
517         addi    r3,r3,32
519 6:      bf      cr7*4+1,7f
520         lvx     vr3,r0,r4
521         VPERM(vr8,vr0,vr3,vr16)
522         lvx     vr2,r4,r9
523         VPERM(vr9,vr3,vr2,vr16)
524         lvx     vr1,r4,r10
525         VPERM(vr10,vr2,vr1,vr16)
526         lvx     vr0,r4,r11
527         VPERM(vr11,vr1,vr0,vr16)
528         addi    r4,r4,64
529         stvx    vr8,r0,r3
530         stvx    vr9,r3,r9
531         stvx    vr10,r3,r10
532         stvx    vr11,r3,r11
533         addi    r3,r3,64
535 7:      sub     r5,r5,r6
536         srdi    r6,r5,7         /* r6 = number of whole 128B blocks */
538         std     r14,STK_REG(R14)(r1)
539         std     r15,STK_REG(R15)(r1)
540         std     r16,STK_REG(R16)(r1)
542         li      r12,64
543         li      r14,80
544         li      r15,96
545         li      r16,112
547         mtctr   r6
549         /*
550          * Now do cacheline sized loads and stores. By this stage the
551          * cacheline stores are also cacheline aligned.
552          */
553         .align  5
554 8:
555         lvx     vr7,r0,r4
556         VPERM(vr8,vr0,vr7,vr16)
557         lvx     vr6,r4,r9
558         VPERM(vr9,vr7,vr6,vr16)
559         lvx     vr5,r4,r10
560         VPERM(vr10,vr6,vr5,vr16)
561         lvx     vr4,r4,r11
562         VPERM(vr11,vr5,vr4,vr16)
563         lvx     vr3,r4,r12
564         VPERM(vr12,vr4,vr3,vr16)
565         lvx     vr2,r4,r14
566         VPERM(vr13,vr3,vr2,vr16)
567         lvx     vr1,r4,r15
568         VPERM(vr14,vr2,vr1,vr16)
569         lvx     vr0,r4,r16
570         VPERM(vr15,vr1,vr0,vr16)
571         addi    r4,r4,128
572         stvx    vr8,r0,r3
573         stvx    vr9,r3,r9
574         stvx    vr10,r3,r10
575         stvx    vr11,r3,r11
576         stvx    vr12,r3,r12
577         stvx    vr13,r3,r14
578         stvx    vr14,r3,r15
579         stvx    vr15,r3,r16
580         addi    r3,r3,128
581         bdnz    8b              /* loop head is the 8: label directly after .align 5 */
583         ld      r14,STK_REG(R14)(r1)
584         ld      r15,STK_REG(R15)(r1)
585         ld      r16,STK_REG(R16)(r1)
587         /* Up to 127B to go */
588         clrldi  r5,r5,(64-7)
589         srdi    r6,r5,4
590         mtocrf  0x01,r6
592         bf      cr7*4+1,9f
593         lvx     vr3,r0,r4
594         VPERM(vr8,vr0,vr3,vr16)
595         lvx     vr2,r4,r9
596         VPERM(vr9,vr3,vr2,vr16)
597         lvx     vr1,r4,r10
598         VPERM(vr10,vr2,vr1,vr16)
599         lvx     vr0,r4,r11
600         VPERM(vr11,vr1,vr0,vr16)
601         addi    r4,r4,64
602         stvx    vr8,r0,r3
603         stvx    vr9,r3,r9
604         stvx    vr10,r3,r10
605         stvx    vr11,r3,r11
606         addi    r3,r3,64
608 9:      bf      cr7*4+2,10f
609         lvx     vr1,r0,r4
610         VPERM(vr8,vr0,vr1,vr16)
611         lvx     vr0,r4,r9
612         VPERM(vr9,vr1,vr0,vr16)
613         addi    r4,r4,32
614         stvx    vr8,r0,r3
615         stvx    vr9,r3,r9
616         addi    r3,r3,32
618 10:     bf      cr7*4+3,11f
619         lvx     vr1,r0,r4
620         VPERM(vr8,vr0,vr1,vr16)
621         addi    r4,r4,16
622         stvx    vr8,r0,r3
623         addi    r3,r3,16
625         /* Up to 15B to go */
626 11:     clrldi  r5,r5,(64-4)
627         addi    r4,r4,-16       /* Unwind the +16 load offset */
628         mtocrf  0x01,r5
629         bf      cr7*4+0,12f
630         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
631         lwz     r6,4(r4)
632         addi    r4,r4,8
633         stw     r0,0(r3)
634         stw     r6,4(r3)
635         addi    r3,r3,8
637 12:     bf      cr7*4+1,13f
638         lwz     r0,0(r4)
639         addi    r4,r4,4
640         stw     r0,0(r3)
641         addi    r3,r3,4
643 13:     bf      cr7*4+2,14f
644         lhz     r0,0(r4)
645         addi    r4,r4,2
646         sth     r0,0(r3)
647         addi    r3,r3,2
649 14:     bf      cr7*4+3,15f
650         lbz     r0,0(r4)
651         stb     r0,0(r3)
653 15:     addi    r1,r1,STACKFRAMESIZE
654         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* r3 = dest pointer stashed at entry */
655         b       exit_vmx_copy           /* tail call optimise */
656 #endif /* CONFIG_ALTIVEC */