 * memcpy - copy memory area
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

    ARMv6 (ARMv7-a if using Neon)
#include "../asmdefs.h"
	/* This implementation requires ARM state.  */
#elif !defined (__SOFTFP__)
# define FRAME_SIZE 32
# define FRAME_SIZE 32
/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#define ALIGN(addr, align) addr:align
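/* ALIGN supplies the NEON address-alignment qualifier: for example,
   "vst1.8 {d0-d3}, [ALIGN (dst, 64)]!" below assembles as
   "vst1.8 {d0-d3}, [dst:64]!", or as "[dst,:64]!" with the old GAS
   spelling selected by BROKEN_ASM_NEON_ALIGN.  */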
#define PC_OFFSET 8 /* PC pipeline compensation.  */
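/* The computed jumps below rely on this value: in ARM state, the PC read by
   "add pc, pc, tmp1" is the address of that instruction plus 8, so each
   dispatch computes tmp1 = (table_size - PC_OFFSET + INSN_SIZE) - bytes,
   where INSN_SIZE is the 4-byte ARM instruction length.  Worked example for
   the 56-byte tables: bytes == 56 gives tmp1 = 52 - 56 = -4, landing on the
   instruction immediately after the add (the "14 words to go" entry), while
   bytes == 0 gives tmp1 = 52, which skips all seven 8-byte entries.  */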
/* Call parameters.  */
/* For bulk copies using GP registers.  */
#define A_l r2 /* Call-clobbered.  */
#define A_h r3 /* Call-clobbered.  */
/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */
#define prefetch_lines 5
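/* With 64-byte lines this gives a look-ahead distance of 5 * 64 = 320 bytes:
   the VFP long-copy loop below keeps its reads that far in front of the line
   currently being stored.  */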
	.macro cpy_line_vfp vreg, base
	vstr \vreg, [dst, #\base]
	vldr \vreg, [src, #\base]
	vstr d0, [dst, #\base + 8]
	vldr d0, [src, #\base + 8]
	vstr d1, [dst, #\base + 16]
	vldr d1, [src, #\base + 16]
	vstr d2, [dst, #\base + 24]
	vldr d2, [src, #\base + 24]
	vstr \vreg, [dst, #\base + 32]
	vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr d0, [dst, #\base + 40]
	vldr d0, [src, #\base + 40]
	vstr d1, [dst, #\base + 48]
	vldr d1, [src, #\base + 48]
	vstr d2, [dst, #\base + 56]
	vldr d2, [src, #\base + 56]
	.macro cpy_tail_vfp vreg, base
	vstr \vreg, [dst, #\base]
	vldr \vreg, [src, #\base]
	vstr d0, [dst, #\base + 8]
	vldr d0, [src, #\base + 8]
	vstr d1, [dst, #\base + 16]
	vldr d1, [src, #\base + 16]
	vstr d2, [dst, #\base + 24]
	vldr d2, [src, #\base + 24]
	vstr \vreg, [dst, #\base + 32]
	vstr d0, [dst, #\base + 40]
	vldr d0, [src, #\base + 40]
	vstr d1, [dst, #\base + 48]
	vldr d1, [src, #\base + 48]
	vstr d2, [dst, #\base + 56]
	vldr d2, [src, #\base + 56]
	mov dst, dstin	/* Preserve dstin, we need to return it.  */
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */
	and tmp1, count, #0x38
	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	vld1.8 {d0}, [src]!	/* 14 words to go.  */
	vld1.8 {d0}, [src]!	/* 12 words to go.  */
	vld1.8 {d0}, [src]!	/* 10 words to go.  */
	vld1.8 {d0}, [src]!	/* 8 words to go.  */
	vld1.8 {d0}, [src]!	/* 6 words to go.  */
	vld1.8 {d0}, [src]!	/* 4 words to go.  */
	vld1.8 {d0}, [src]!	/* 2 words to go.  */
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and tmp1, count, #0x3c
	rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add pc, pc, tmp1, lsl #1
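	/* Here each word still to be copied costs two 4-byte instructions
	   (an LDR and an STR), so the dispatch doubles the offset with
	   "lsl #1" and halves PC_OFFSET and INSN_SIZE in the constant to
	   compensate.  Because src and dst already point just past the
	   selected words, the entries below use fixed negative offsets and
	   no writeback.  */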
	ldr tmp1, [src, #-60]	/* 15 words to go.  */
	str tmp1, [dst, #-60]
	ldr tmp1, [src, #-56]	/* 14 words to go.  */
	str tmp1, [dst, #-56]
	ldr tmp1, [src, #-52]
	str tmp1, [dst, #-52]
	ldr tmp1, [src, #-48]	/* 12 words to go.  */
	str tmp1, [dst, #-48]
	ldr tmp1, [src, #-44]
	str tmp1, [dst, #-44]
	ldr tmp1, [src, #-40]	/* 10 words to go.  */
	str tmp1, [dst, #-40]
	ldr tmp1, [src, #-36]
	str tmp1, [dst, #-36]
	ldr tmp1, [src, #-32]	/* 8 words to go.  */
	str tmp1, [dst, #-32]
	ldr tmp1, [src, #-28]
	str tmp1, [dst, #-28]
	ldr tmp1, [src, #-24]	/* 6 words to go.  */
	str tmp1, [dst, #-24]
	ldr tmp1, [src, #-20]
	str tmp1, [dst, #-20]
	ldr tmp1, [src, #-16]	/* 4 words to go.  */
	str tmp1, [dst, #-16]
	ldr tmp1, [src, #-12]
	str tmp1, [dst, #-12]
	ldr tmp1, [src, #-8]	/* 2 words to go.  */
	lsls count, count, #31
	ldrhcs tmp1, [src], #2
	ldrbne src, [src]	/* Src is dead, use as a scratch.  */
	strhcs tmp1, [dst], #2
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str tmp2, [sp, #-FRAME_SIZE]!
	bne L(cpy_notaligned)
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src], #1
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst], #1
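	/* Worked example: if dst & 7 == 5, three bytes are needed to reach
	   64-bit alignment.  count is reduced by 3 (tmp2, lsr #29), the MI
	   word copy is skipped, and the CS halfword copy and NE byte copy
	   both execute.  */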
	subs tmp2, count, #64	/* Use tmp2 for count.  */
L(cpy_body_medium):	/* Count in tmp2.  */
L(tail63aligned):	/* Count in tmp2.  */
	and tmp1, tmp2, #0x38
	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	vldr d0, [src, #-56]	/* 14 words to go.  */
	vldr d0, [src, #-48]	/* 12 words to go.  */
	vldr d0, [src, #-40]	/* 10 words to go.  */
	vldr d0, [src, #-32]	/* 8 words to go.  */
	vldr d0, [src, #-24]	/* 6 words to go.  */
	vldr d0, [src, #-16]	/* 4 words to go.  */
	vldr d0, [src, #-8]	/* 2 words to go.  */
	ldrd A_l, A_h, [src, #8]
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #16]
	strd A_l, A_h, [dst, #16]
	ldrd A_l, A_h, [src, #24]
	strd A_l, A_h, [dst, #24]
	ldrd A_l, A_h, [src, #32]
	strd A_l, A_h, [dst, #32]
	ldrd A_l, A_h, [src, #40]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #48]
	strd A_l, A_h, [dst, #48]
	ldrd A_l, A_h, [src, #56]
	strd A_l, A_h, [dst, #56]
	ldrd A_l, A_h, [src, #64]!
	strd A_l, A_h, [dst, #64]!
	ldr tmp2, [sp], #FRAME_SIZE
L(tail63aligned):	/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */
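	/* E.g. count == 24 leaves tmp2 == 24 - 64 == -40, and -40 & 0x3f
	   == 24, so the masks below still select the right tail entry.  */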
	and tmp1, tmp2, #0x38
	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	ldrd A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd A_l, A_h, [dst, #-56]
	ldrd A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd A_l, A_h, [dst, #-48]
	ldrd A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd A_l, A_h, [dst, #-40]
	ldrd A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd A_l, A_h, [dst, #-32]
	ldrd A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd A_l, A_h, [dst, #-24]
	ldrd A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd A_l, A_h, [dst, #-16]
	ldrd A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd A_l, A_h, [dst, #-8]
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
	lsls tmp2, tmp2, #31	/* Count (tmp2) now dead.  */
	ldrhcs tmp1, [src], #2
	strhcs tmp1, [dst], #2
	ldr tmp2, [sp], #FRAME_SIZE
L(cpy_body_long):	/* Count in tmp2.  */
	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
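	/* Concretely, the look-ahead VLDR in cpy_line_vfp (at offset
	   "#\base + prefetch_lines * 64 - 32") pulls in data 320 bytes
	   beyond the line currently being copied; by the time the loop
	   reaches that line the value is already in a d-register and is
	   stored directly, so, unlike PLD, the early read is not
	   repeated.  */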
	subs tmp2, tmp2, #prefetch_lines * 64 * 2
	add dst, dst, #3 * 64
	add src, src, #3 * 64
	add dst, dst, #2 * 64
	add src, src, #2 * 64
	subs tmp2, tmp2, #prefetch_lines * 64
	add src, src, #3 * 64
	add dst, dst, #3 * 64
	vstr d0, [dst, #64 + 8]
	vldr d0, [src, #64 + 8]
	vstr d1, [dst, #64 + 16]
	vldr d1, [src, #64 + 16]
	vstr d2, [dst, #64 + 24]
	vldr d2, [src, #64 + 24]
	vstr d7, [dst, #64 + 32]
	vstr d0, [dst, #64 + 40]
	vstr d1, [dst, #64 + 48]
	vstr d2, [dst, #64 + 56]
	add tmp2, tmp2, #prefetch_lines * 64
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
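	/* Software-pipelined structure: the wind-up below spills the
	   callee-saved B/C/D register pairs into the FRAME_SIZE frame pushed
	   earlier and primes A-D with the first 32 bytes; in the steady
	   state each STRD writes data fetched by an earlier LDRD while the
	   next LDRD reads ahead; the wind-down stores the final 64 bytes and
	   reloads B/C/D from the frame.  */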
	/* Pre-bias src and dst.  */
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	ldrd D_l, D_h, [src, #32]!
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldrd D_l, D_h, [src, #64]!
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldrd D_l, D_h, [src, #32]
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd A_l, A_h, [dst, #40]
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	ldr tmp2, [sp], #FRAME_SIZE
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	ldrbne tmp1, [src], #1
	ldrhcs tmp2, [src], #2
	strbne tmp1, [dst], #1
	strhcs tmp2, [dst], #2
	subs count, count, #64
	ldrmi tmp2, [sp], #FRAME_SIZE
	bmi L(tail63unaligned)
	vld1.8 {d0-d3}, [src]!
	vld1.8 {d4-d7}, [src]!
	subs count, count, #64
	vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
	vld1.8 {d0-d3}, [src]!
	vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
	vld1.8 {d4-d7}, [src]!
	subs count, count, #64
	vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
	vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
	ands count, count, #0x3f
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
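	/* Same software-pipelined shape as the aligned loop above, but here
	   only dst has been brought to 64-bit alignment, so the stores can
	   still use STRD while the loads from the arbitrarily aligned src
	   are done with plain LDRs.  */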
	subs tmp2, count, #64	/* Use tmp2 for count.  */
	strd B_l, B_h, [sp, #8]
	strd C_l, C_h, [sp, #16]
	strd D_l, D_h, [sp, #24]
	pld [src, #(5 * 64) - (32 - 4)]
	strd A_l, A_h, [dst, #40]
	strd B_l, B_h, [dst, #48]
	strd C_l, C_h, [dst, #56]
	strd D_l, D_h, [dst, #64]!
	strd A_l, A_h, [dst, #8]
	strd B_l, B_h, [dst, #16]
	strd C_l, C_h, [dst, #24]
	strd D_l, D_h, [dst, #32]
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd A_l, A_h, [dst, #40]
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	ands count, tmp2, #0x3f
	ldr tmp2, [sp], #FRAME_SIZE
	bne L(tail63unaligned)