Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris...
[linux/fpc-iii.git] / arch / powerpc / lib / memcpy_64.S
blobd2bbbc8d7dc0b045aab751d79264a2f4f959fe61
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
/*
 * memcpy - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * In:   r3 = destination pointer
 *       r4 = source pointer
 *       r5 = number of bytes to copy
 * Out:  r3 = the destination pointer as passed in (reloaded from 48(r1))
 * Uses: r0, r6-r12, ctr, cr0, cr1, cr6, cr7, and the doubleword at 48(r1).
 *       NOTE(review): 48(r1) is presumably the caller-frame parameter
 *       save slot for r3 under the 64-bit ELF ABI - confirm.
 *
 * Conventions used throughout:
 *   - cr7 holds the low four bits of the (remaining) length, loaded with
 *     PPC_MTOCRF(0x01,...).  cr7*4+0 means "8 bytes", cr7*4+1 "4 bytes",
 *     cr7*4+2 "2 bytes" and cr7*4+3 "1 byte" still to be copied.
 *   - cr1 holds the unsigned comparison of the length against 16.
 *   - Numeric labels are reused; some forward references deliberately
 *     resolve to a label in a later section (e.g. the "beq 4f" in
 *     .Lsrc_unaligned lands on the "4:" return sequence at the end of
 *     .Lshort_copy).  Do not reorder the sections without checking them.
 */
        .align  7
_GLOBAL(memcpy)
/* When CPU_FTR_VMX_COPY is set the whole copy is handed to memcpy_power7. */
BEGIN_FTR_SECTION
        std     r3,48(r1)       /* save destination pointer for return value */
FTR_SECTION_ELSE
        b       memcpy_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
        PPC_MTOCRF(0x01,r5)     /* cr7 = low 4 bits of the length */
        cmpldi  cr1,r5,16       /* cr1 = length vs 16 (unsigned) */
        neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
        andi.   r6,r6,7         /* cr0.eq <=> destination already 8-byte aligned */
        dcbt    0,r4            /* cache-touch hint for the start of the source */
        blt     cr1,.Lshort_copy        /* fewer than 16 bytes */
/* Below we want to nop out the bne if we're on a CPU that has the
   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
   cleared.
   At the time of writing the only CPU that has this combination of bits
   set is Power6. */
BEGIN_FTR_SECTION
        nop
FTR_SECTION_ELSE
        bne     .Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                    CPU_FTR_UNALIGNED_LD_STD)

/*
 * Main doubleword loop.  r3 is biased by -16 and r4 by -8 so that the
 * update-form ldu/stdu in the loop can advance both pointers by 16.
 * Each iteration moves two doublewords; ctr = length / 16.
 */
.Ldst_aligned:
        addi    r3,r3,-16
/* CPUs without CPU_FTR_UNALIGNED_LD_STD take a separate path when the
   source is not 8-byte aligned; r0 = source misalignment in bytes. */
BEGIN_FTR_SECTION
        andi.   r0,r4,7
        bne     .Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        srdi    r7,r5,4         /* r7 = number of 16-byte chunks */
        ld      r9,0(r4)        /* preload the first source doubleword */
        addi    r4,r4,-8
        mtctr   r7
        andi.   r5,r5,7         /* r5 = sub-doubleword tail; cr0.eq <=> no tail */
        bf      cr7*4+0,2f      /* even number of doublewords: enter loop at 2: */
        /* Odd number of doublewords: undo half of the bias and start with
           the preloaded doubleword in r8 instead of r9. */
        addi    r3,r3,8
        addi    r4,r4,8
        mr      r8,r9
        blt     cr1,3f          /* under 16 bytes: only that one doubleword */
1:      ld      r9,8(r4)
        std     r8,8(r3)
2:      ldu     r8,16(r4)
        stdu    r9,16(r3)
        bdnz    1b
3:      std     r8,8(r3)        /* last doubleword still held in r8 */
        beq     3f              /* cr0 from the andi. above: no tail, return */
        addi    r3,r3,16        /* r3 -> first tail byte in the destination */

/*
 * Copy the remaining 4, 2 and 1 bytes as selected by cr7.  The loads use
 * displacement 8 because r4 trails the next unread source byte by 8.
 */
.Ldo_tail:
        bf      cr7*4+1,1f
        lwz     r9,8(r4)
        addi    r4,r4,4
        stw     r9,0(r3)
        addi    r3,r3,4
1:      bf      cr7*4+2,2f
        lhz     r9,8(r4)
        addi    r4,r4,2
        sth     r9,0(r3)
        addi    r3,r3,2
2:      bf      cr7*4+3,3f
        lbz     r9,8(r4)
        stb     r9,0(r3)
3:      ld      r3,48(r1)       /* return dest pointer */
        blr

/*
 * Destination is 8-byte aligned, source is not (r0 = source & 7).
 * The source pointer is rounded down to a doubleword boundary and every
 * destination doubleword is assembled from two neighbouring source
 * doublewords:  d[k] = (s[k] << r10) | (s[k+1] >> r11)
 * with r10 = 8 * r0 and r11 = 64 - r10.
 * NOTE(review): the shift directions, and the rotate-then-store tail at
 * the end, assume big-endian byte order - confirm before reusing.
 */
.Lsrc_unaligned:
        srdi    r6,r5,3         /* r6 = number of whole doublewords */
        addi    r5,r5,-16
        subf    r4,r0,r4        /* round the source down to 8 bytes */
        srdi    r7,r5,4         /* loop count = (length - 16) / 16 */
        sldi    r10,r0,3        /* left shift amount in bits */
        cmpdi   cr6,r6,3        /* cr6 = doubleword count vs 3 */
        andi.   r5,r5,7         /* cr0.eq <=> no sub-doubleword tail */
        mtctr   r7
        subfic  r11,r10,64      /* right shift amount in bits */
        add     r5,r5,r0        /* tail bytes + source misalignment (see 5:) */

        bt      cr7*4+0,0f      /* odd number of doublewords */

        ld      r9,0(r4)        # 3+2n loads, 2+2n stores
        ld      r0,8(r4)
        sld     r6,r9,r10
        ldu     r9,16(r4)
        srd     r7,r0,r11
        sld     r8,r0,r10
        or      r7,r7,r6
        blt     cr6,4f          /* fewer than 3 doublewords: skip the loop */
        ld      r0,8(r4)
        # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
        b       2f

0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
        ldu     r9,8(r4)
        sld     r8,r0,r10
        addi    r3,r3,-8
        blt     cr6,5f          /* fewer than 3 doublewords: one store left */
        ld      r0,8(r4)
        srd     r12,r9,r11
        sld     r6,r9,r10
        ldu     r9,16(r4)
        or      r12,r8,r12
        srd     r7,r0,r11
        sld     r8,r0,r10
        addi    r3,r3,16
        beq     cr6,3f          /* exactly 3 doublewords: skip the loop */

        # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:      or      r7,r7,r6
        ld      r0,8(r4)
        std     r12,8(r3)
2:      srd     r12,r9,r11
        sld     r6,r9,r10
        ldu     r9,16(r4)
        or      r12,r8,r12
        stdu    r7,16(r3)
        srd     r7,r0,r11
        sld     r8,r0,r10
        bdnz    1b

3:      std     r12,8(r3)
        or      r7,r7,r6
4:      std     r7,16(r3)
5:      srd     r12,r9,r11
        or      r12,r8,r12
        std     r12,24(r3)
        beq     4f              /* cr0 from the andi. above: no tail.  This 4f
                                   is the return sequence in .Lshort_copy. */
        cmpwi   cr1,r5,8        /* do the tail bytes extend past r9's doubleword? */
        addi    r3,r3,32        /* r3 -> first tail byte in the destination */
        sld     r9,r9,r10       /* unread bytes of r9 moved to the top */
        ble     cr1,6f          /* no: everything needed is already in r9 */
        ld      r0,8(r4)        /* yes: merge in one more source doubleword */
        srd     r7,r0,r11
        or      r9,r7,r9
        /* Store the tail from the top of r9 downwards: each rotate brings
           the next 4/2/1 bytes into the low end of the register. */
6:
        bf      cr7*4+1,1f
        rotldi  r9,r9,32
        stw     r9,0(r3)
        addi    r3,r3,4
1:      bf      cr7*4+2,2f
        rotldi  r9,r9,16
        sth     r9,0(r3)
        addi    r3,r3,2
2:      bf      cr7*4+3,3f
        rotldi  r9,r9,8
        stb     r9,0(r3)
3:      ld      r3,48(r1)       /* return dest pointer */
        blr

/*
 * Destination is not 8-byte aligned.  Copy the r6 (1..7) bytes needed to
 * reach the boundary as 1 + 2 + 4 byte pieces, with r7 as the running
 * offset, then re-derive cr1/cr7 from the remaining length and rejoin
 * the aligned path.
 */
.Ldst_unaligned:
        PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
        subf    r5,r6,r5        /* remaining length after the alignment bytes */
        li      r7,0
        cmpldi  cr1,r5,16       /* cr1 now describes the remaining length */
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        stb     r0,0(r3)
        addi    r7,r7,1
1:      bf      cr7*4+2,2f
        lhzx    r0,r7,r4
        sthx    r0,r7,r3
        addi    r7,r7,2
2:      bf      cr7*4+1,3f
        lwzx    r0,r7,r4
        stwx    r0,r7,r3
3:      PPC_MTOCRF(0x01,r5)     /* cr7 = low 4 bits of the remaining length */
        add     r4,r6,r4
        add     r3,r6,r3
        b       .Ldst_aligned

/*
 * Fewer than 16 bytes: copy 8 (as two words), 4, 2 and 1 bytes as
 * selected by cr7, using word and smaller accesses only.
 */
.Lshort_copy:
        bf      cr7*4+0,1f
        lwz     r0,0(r4)
        lwz     r9,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r9,4(r3)
        addi    r3,r3,8
1:      bf      cr7*4+1,2f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4
2:      bf      cr7*4+2,3f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2
3:      bf      cr7*4+3,4f
        lbz     r0,0(r4)
        stb     r0,0(r3)
4:      ld      r3,48(r1)       /* return dest pointer */
        blr