Linux 4.6-rc6
[linux/fpc-iii.git] / arch / powerpc / lib / memcpy_64.S
blob 32a06ec395d2108202762c6d164b3994ca6a7644
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
/*
 * memcpy: copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 *   In:   r3 = destination, r4 = source, r5 = byte count
 *   Out:  r3 = destination pointer as passed in (for the paths in this
 *         file; memcpy_power7 is defined elsewhere)
 *
 * The leading feature section is an alternative pair. Going by the macro
 * name, the first alternative stays in place while CPU_FTR_VMX_COPY is
 * clear and the branch to memcpy_power7 is used otherwise.
 * NOTE(review): confirm against the feature-fixup macro definitions.
 *
 * r1 is never adjusted in this routine: the big-endian path parks the
 * return value in a slot addressed at a negative offset from r1.
 */
12         .align  7
13 _GLOBAL_TOC(memcpy)
14 BEGIN_FTR_SECTION
15 #ifdef __LITTLE_ENDIAN__
16         cmpdi   cr7,r5,0                /* cr7.eq = (count == 0); consumed by the beqlr below */
17 #else
18         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
19 #endif
20 FTR_SECTION_ELSE
21 #ifndef SELFTEST
22         b       memcpy_power7
23 #endif
24 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
25 #ifdef __LITTLE_ENDIAN__
26         /* dumb little-endian memcpy that will get replaced at runtime */
/* Plain byte loop. r3 is never written, so it still holds the destination on return. */
27         addi r9,r3,-1                   /* r9 = dest - 1: working pointer for the update-form store */
28         addi r4,r4,-1                   /* src - 1: the update-form load pre-increments */
29         beqlr cr7                       /* zero-length copy: return immediately */
30         mtctr r5                        /* ctr = number of bytes */
31 1:      lbzu r10,1(r4)
32         stbu r10,1(r9)
33         bdnz 1b
34         blr
35 #else
/*
 * Big-endian setup. cr7 receives the low-order bits of the length; the
 * code below tests them as cr7*4+0 (8 bytes), +1 (4), +2 (2), +3 (1).
 */
36         PPC_MTOCRF(0x01,r5)
37         cmpldi  cr1,r5,16               /* cr1 = unsigned compare of length with 16 */
38         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
39         andi.   r6,r6,7                 /* also sets cr0: eq when dest is already 8-byte aligned */
40         dcbt    0,r4                    /* cache touch hint for the start of the source */
41         blt     cr1,.Lshort_copy        /* fewer than 16 bytes: word/halfword/byte copy only */
42 /* Below we want to nop out the bne if we're on a CPU that has the
43    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
44    cleared.
45    At the time of writing the only CPU that has this combination of bits
46    set is Power6. */
/* cr0 is still the result of the andi. above: ne means dest is not 8-byte aligned. */
47 BEGIN_FTR_SECTION
48         nop
49 FTR_SECTION_ELSE
50         bne     .Ldst_unaligned
51 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
52                     CPU_FTR_UNALIGNED_LD_STD)
/*
 * .Ldst_aligned: main doubleword copy.
 *
 * Reached by fall-through from the entry code and by the branch at the
 * end of .Ldst_unaligned.
 * On entry: r3 = dest, r4 = src, r5 = bytes left, cr1 = (r5 vs 16),
 * cr7 = low-order bits of r5.
 * Each loop iteration moves two doublewords; every load is issued before
 * the store of the value loaded just before it.
 */
53 .Ldst_aligned:
54         addi    r3,r3,-16               /* bias dest so the stores can use displacements 8 and 16 */
/*
 * Source alignment test. Going by the macro name, this pair of
 * instructions is only kept when CPU_FTR_UNALIGNED_LD_STD is clear.
 * r0 = src & 7 is consumed by .Lsrc_unaligned.
 */
55 BEGIN_FTR_SECTION
56         andi.   r0,r4,7
57         bne     .Lsrc_unaligned
58 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
59         srdi    r7,r5,4                 /* r7 = number of 16-byte chunks */
60         ld      r9,0(r4)                /* preload the first source doubleword */
61         addi    r4,r4,-8                /* bias src to match the 8/16 load displacements */
62         mtctr   r7
63         andi.   r5,r5,7                 /* r5 = trailing bytes; cr0.eq when there are none */
64         bf      cr7*4+0,2f              /* length bit 8 clear: even doubleword count, enter loop at 2 */
/* Odd doubleword count: the preloaded doubleword becomes the pending store in r8. */
65         addi    r3,r3,8
66         addi    r4,r4,8
67         mr      r8,r9
68         blt     cr1,3f                  /* fewer than 16 bytes left: only this one doubleword */
69 1:      ld      r9,8(r4)
70         std     r8,8(r3)
71 2:      ldu     r8,16(r4)
72         stdu    r9,16(r3)
73         bdnz    1b
74 3:      std     r8,8(r3)                /* store the last loaded doubleword */
75         beq     3f                      /* cr0 from the andi. above: no tail bytes, return */
76         addi    r3,r3,16                /* r3 = first destination byte not yet written */
/*
 * Tail of up to 7 bytes, chosen by the cr7 length bits (4, 2, 1).
 * r4 points at the last doubleword loaded, so the next source byte is at 8(r4).
 */
77 .Ldo_tail:
78         bf      cr7*4+1,1f
79         lwz     r9,8(r4)
80         addi    r4,r4,4
81         stw     r9,0(r3)
82         addi    r3,r3,4
83 1:      bf      cr7*4+2,2f
84         lhz     r9,8(r4)
85         addi    r4,r4,2
86         sth     r9,0(r3)
87         addi    r3,r3,2
88 2:      bf      cr7*4+3,3f
89         lbz     r9,8(r4)
90         stb     r9,0(r3)
91 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
92         blr
/*
 * .Lsrc_unaligned: copy when the source is not 8-byte aligned.
 *
 * Entered from .Ldst_aligned with r0 = src & 7 (non-zero), r3 = dest - 16,
 * r4 = src, r5 = length, cr7 = low-order bits of the length.
 *
 * The source is read as aligned doublewords starting at src rounded down.
 * Each destination doubleword is assembled from two neighbouring source
 * doublewords as (s[k] << r10) | (s[k+1] >> r11), where
 *   r10 = 8 * (src & 7)   left shift in bits
 *   r11 = 64 - r10        right shift in bits
 * r0 and r9 alternate as raw source doublewords; r6, r7, r8 and r12 hold
 * shifted halves and finished destination doublewords.
 *
 * cr0 is set once by the andi. below (eq when length & 7 == 0) and is
 * only consumed by the beq after the last doubleword store; nothing in
 * between writes cr0.
 */
94 .Lsrc_unaligned:
95         srdi    r6,r5,3                 /* r6 = number of whole doublewords */
96         addi    r5,r5,-16
97         subf    r4,r0,r4                /* round src down to an 8-byte boundary */
98         srdi    r7,r5,4                 /* loop count = (length - 16) / 16 */
99         sldi    r10,r0,3                /* r10 = misalignment in bits */
100         cmpdi   cr6,r6,3                /* cr6 = doubleword count vs 3, picks the short exits below */
101         andi.   r5,r5,7                 /* r5 = trailing bytes; cr0.eq when there are none */
102         mtctr   r7
103         subfic  r11,r10,64              /* r11 = 64 - r10 */
104         add     r5,r5,r0                /* r5 = trailing bytes + source misalignment (used in the tail) */
106         bt      cr7*4+0,0f              /* length bit 8 set: odd doubleword count */
108         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
109         ld      r0,8(r4)
110         sld     r6,r9,r10
111         ldu     r9,16(r4)
112         srd     r7,r0,r11
113         sld     r8,r0,r10
114         or      r7,r7,r6
115         blt     cr6,4f                  /* fewer than 3 doublewords: go straight to the final stores */
116         ld      r0,8(r4)
117         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
118         b       2f
120 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
121         ldu     r9,8(r4)
122         sld     r8,r0,r10
123         addi    r3,r3,-8
124         blt     cr6,5f                  /* fewer than 3 doublewords: a single store remains */
125         ld      r0,8(r4)
126         srd     r12,r9,r11
127         sld     r6,r9,r10
128         ldu     r9,16(r4)
129         or      r12,r8,r12
130         srd     r7,r0,r11
131         sld     r8,r0,r10
132         addi    r3,r3,16
133         beq     cr6,3f                  /* exactly 3 doublewords: skip the loop */
135         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
136 1:      or      r7,r7,r6
137         ld      r0,8(r4)
138         std     r12,8(r3)
139 2:      srd     r12,r9,r11
140         sld     r6,r9,r10
141         ldu     r9,16(r4)
142         or      r12,r8,r12
143         stdu    r7,16(r3)
144         srd     r7,r0,r11
145         sld     r8,r0,r10
146         bdnz    1b
148 3:      std     r12,8(r3)
149         or      r7,r7,r6
150 4:      std     r7,16(r3)
151 5:      srd     r12,r9,r11
152         or      r12,r8,r12
153         std     r12,24(r3)
/*
 * No trailing bytes: return. The numeric label 4f resolves to the next
 * "4:" further down the file, which is the return sequence that closes
 * .Lshort_copy (reload r3, blr).
 */
154         beq     4f
/*
 * Trailing bytes. r9 << r10 leaves the not-yet-stored bytes of the last
 * source doubleword at the top of r9. If trailing bytes plus source
 * misalignment (r5) exceed 8, the tail reaches into the next source
 * doubleword, which is then loaded and merged in.
 */
155         cmpwi   cr1,r5,8
156         addi    r3,r3,32                /* r3 = first destination byte not yet written */
157         sld     r9,r9,r10
158         ble     cr1,6f
159         ld      r0,8(r4)
160         srd     r7,r0,r11
161         or      r9,r7,r9
/*
 * Join point for the "ble cr1,6f" above. Without this label the branch
 * has no target and the file does not assemble.
 */
162 6:
/* Store 4, 2 and 1 bytes as selected by cr7; each rotate brings the next bytes into the low end of r9. */
163         bf      cr7*4+1,1f
164         rotldi  r9,r9,32
165         stw     r9,0(r3)
166         addi    r3,r3,4
167 1:      bf      cr7*4+2,2f
168         rotldi  r9,r9,16
169         sth     r9,0(r3)
170         addi    r3,r3,2
171 2:      bf      cr7*4+3,3f
172         rotldi  r9,r9,8
173         stb     r9,0(r3)
174 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
175         blr
/*
 * .Ldst_unaligned: copy the 1..7 leading bytes needed to bring the
 * destination up to an 8-byte boundary, then rejoin .Ldst_aligned.
 *
 * On entry: r6 = bytes to the boundary, r3 = dest, r4 = src, r5 = length.
 * r7 is the running byte offset applied to both pointers.
 * Before leaving, r5, cr1 and cr7 are recomputed for the remaining
 * length, because .Ldst_aligned tests all three.
 */
177 .Ldst_unaligned:
178         PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
179         subf    r5,r6,r5                /* r5 = length left after the alignment bytes */
180         li      r7,0
181         cmpldi  cr1,r5,16               /* refresh cr1 for the reduced length */
182         bf      cr7*4+3,1f              /* bit 1: one byte */
183         lbz     r0,0(r4)
184         stb     r0,0(r3)
185         addi    r7,r7,1
186 1:      bf      cr7*4+2,2f              /* bit 2: one halfword */
187         lhzx    r0,r7,r4
188         sthx    r0,r7,r3
189         addi    r7,r7,2
190 2:      bf      cr7*4+1,3f              /* bit 4: one word */
191         lwzx    r0,r7,r4
192         stwx    r0,r7,r3
193 3:      PPC_MTOCRF(0x01,r5)             /* cr7 = low-order bits of the remaining length */
194         add     r4,r6,r4                /* step src past the bytes just copied */
195         add     r3,r6,r3                /* step dest past them: now 8-byte aligned */
196         b       .Ldst_aligned
/*
 * .Lshort_copy: copies of fewer than 16 bytes.
 *
 * On entry: r3 = dest, r4 = src, cr7 = low-order bits of the length.
 * Each cr7 bit selects one fixed-size move: 8 bytes (as two words),
 * then 4, 2 and 1. No doubleword accesses are used here.
 *
 * The closing "4:" label is also the target of a "beq 4f" in
 * .Lsrc_unaligned earlier in the file, so it must keep its number and
 * remain the first "4:" after that branch.
 */
198 .Lshort_copy:
199         bf      cr7*4+0,1f              /* bit 8: eight bytes as two words */
200         lwz     r0,0(r4)
201         lwz     r9,4(r4)
202         addi    r4,r4,8
203         stw     r0,0(r3)
204         stw     r9,4(r3)
205         addi    r3,r3,8
206 1:      bf      cr7*4+1,2f              /* bit 4: one word */
207         lwz     r0,0(r4)
208         addi    r4,r4,4
209         stw     r0,0(r3)
210         addi    r3,r3,4
211 2:      bf      cr7*4+2,3f              /* bit 2: one halfword */
212         lhz     r0,0(r4)
213         addi    r4,r4,2
214         sth     r0,0(r3)
215         addi    r3,r3,2
216 3:      bf      cr7*4+3,4f              /* bit 1: one byte */
217         lbz     r0,0(r4)
218         stb     r0,0(r3)
219 4:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
220         blr
221 #endif