/* Copyright (c) 2013, Linaro Limited

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
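
/* Assumptions:
 *
 * ARMv8-A, AArch64.
 * Unaligned accesses are permitted (the tail and alignment code below
 * issues loads and stores at arbitrary alignment).
 */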
	.macro def_fn f p2align=0
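	/* The body of def_fn is a sketch of the usual Linaro cortex-strings
	   helper (assumed, not verbatim): it emits a global function symbol
	   \f in .text, aligned to 2^p2align bytes.  */
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm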
/* Parameters and result.  */
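/* The register aliases and function prologue below are assumptions: they
   follow the conventional cortex-strings assignments, with the
   memmove(dst, src, count) arguments arriving in x0-x2 per the AAPCS64 and
   x3-x14 used as scratch registers.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

def_fn memmove p2align=6
	/* Dispatch (assumed): take the downwards path when DST is below SRC;
	   otherwise fall through to the upwards path, or defer to memcpy
	   when the buffers cannot overlap at all.  */
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1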
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
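	/* Set-up for the upwards copy (assumed): bias SRC and DST past the
	   end of the buffers and take the large-move path for counts of 64
	   bytes or more.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64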
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  */
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
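	ret			/* Small move complete.  */

.Lmov_not_short_up:
	/* Large upwards move; label placement assumed from the
	   "b.ge .Lmov_not_short_up" branch above.  */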
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
	/* There may be less than 63 bytes to go now.  */
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
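	.p2align 6		/* Place the loop head on a 64-byte boundary, as the
				   comment above assumes.  */
.Lmov_body_large_up:		/* Target of the "b.ge .Lmov_body_large_up" above.  */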
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
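1:	/* Software-pipelined loop body (label assumed): store the 64 bytes
	   loaded on the previous iteration while loading the next 64.  */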
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
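	b.ge	1b		/* Loop back while at least 64 bytes remain (branch assumed).  */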
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
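
.Ldownwards:
	/* Downwards (DST below SRC) move, reached from the dispatch at the
	   top of the function; data is moved from the head forwards.  */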
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
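	/* Distance check (assumed form): fall back to memcpy when DST is at
	   least 16 bytes below SRC, since a forwards copy then cannot
	   overwrite data that has not yet been read.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1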
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
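	cmp	count, #64	/* Same size threshold as the upwards path (assumed).  */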
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  */
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
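	neg	tmp2, src	/* Assumed set-up: the low bits of -SRC give the
				   number of bytes needed to reach 16-byte alignment.  */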
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
	/* There may be less than 63 bytes to go now.  */
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
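	.p2align 6		/* Place the loop head on a 64-byte boundary, as the
				   comment above assumes.  */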
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
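1:	/* Software-pipelined loop body (label assumed): store the 64 bytes
	   loaded on the previous iteration while loading the next 64.  */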
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
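	b.ge	1b		/* Loop back while at least 64 bytes remain (branch assumed).  */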
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	dst, dst, #64 + 16

	.size	memmove, . - memmove