2 * Copyright (c) 2013 ARM Ltd
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the company may not be used to endorse or promote
14 * products derived from this software without specific prior written
17 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
32 If compiled with GCC, this file should be enclosed within following
34 if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
36 Prototype: void *memcpy (void *dst, const void *src, size_t count);
38 The job will be done in 5 steps.
39 Step 1: Align src/dest pointers; use the misaligned copy if both cannot be aligned
40 Step 2: Repeatedly copy big blocks of size __OPT_BIG_BLOCK_SIZE
41 Step 3: Repeatedly copy mid blocks of size __OPT_MID_BLOCK_SIZE
42 Step 4: Copy word by word
43 Step 5: Copy byte-to-byte
46 __OPT_BIG_BLOCK_SIZE: Size of big block in bytes. Default to 64.
47 __OPT_MID_BLOCK_SIZE: Size of mid block in bytes. Default to 16.
/* Block-size configuration.
   Both sizes may be pre-defined by the build; otherwise they default to
   4 * 16 (= 64) for the big block and 4 * 4 (= 16) for the mid block.
   The .irp offset lists below advance in steps of 4, i.e. one 32-bit word
   per unrolled iteration, so the sizes are byte counts. */
51 #ifndef __OPT_BIG_BLOCK_SIZE
52 #define __OPT_BIG_BLOCK_SIZE (4 * 16)
55 #ifndef __OPT_MID_BLOCK_SIZE
56 #define __OPT_MID_BLOCK_SIZE (4 * 4)
/* BEGIN_UNROLL_BIG_BLOCK opens a .irp over the word offsets inside one big
   block. Only 16, 32 and 64 are accepted; any other value stops the build
   with the #error below. */
59 #if __OPT_BIG_BLOCK_SIZE == 16
60 #define BEGIN_UNROLL_BIG_BLOCK \
62 #elif __OPT_BIG_BLOCK_SIZE == 32
63 #define BEGIN_UNROLL_BIG_BLOCK \
64 .irp offset, 0,4,8,12,16,20,24,28
65 #elif __OPT_BIG_BLOCK_SIZE == 64
66 #define BEGIN_UNROLL_BIG_BLOCK \
67 .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
69 #error "Illegal __OPT_BIG_BLOCK_SIZE"
/* BEGIN_UNROLL_MID_BLOCK is the mid-block counterpart. Only 8 and 16 are
   accepted; any other value stops the build with the #error below. */
72 #if __OPT_MID_BLOCK_SIZE == 8
73 #define BEGIN_UNROLL_MID_BLOCK \
75 #elif __OPT_MID_BLOCK_SIZE == 16
76 #define BEGIN_UNROLL_MID_BLOCK \
79 #error "Illegal __OPT_MID_BLOCK_SIZE"
/* END_UNROLL closes the repeat block opened by BEGIN_UNROLL_BIG_BLOCK or
   BEGIN_UNROLL_MID_BLOCK (.endr terminates a .irp). */
82 #define END_UNROLL .endr
/* memcpy - see the prototype in the file header:
     void *memcpy (void *dst, const void *src, size_t count);
   Register use in the copy loops below:
     r0 - store address (dst side); written with str, advanced per block
     r1 - load address (src side); read with ldr, advanced per block
     r2 - byte counter; reduced by the block size on every pass
     r3 - scratch word carried from each ldr to the following str
   Code guarded by __ARM_FEATURE_UNALIGNED is the variant that relies on
   LDR/STR handling unaligned addresses. */
91 .cfi_sections .debug_frame
93 .type memcpy, %function
98 #ifdef __ARM_FEATURE_UNALIGNED
99 /* In case of UNALIGNED access supported, ip is not used in
101 prologue push_ip=HAVE_PAC_LEAF
104 prologue 0 push_ip=HAVE_PAC_LEAF
105 #endif /* __ARM_FEATURE_UNALIGNED */
108 bne .Lmisaligned_copy
/* Take one big block off the counter before entering the big-block loop. */
111 subs r2, __OPT_BIG_BLOCK_SIZE
114 /* Kernel loop for big block copy */
117 BEGIN_UNROLL_BIG_BLOCK
118 #ifdef __ARM_ARCH_7EM__
122 #else /* __ARM_ARCH_7M__ */
/* One word per unrolled iteration: load from [r1 + offset] into r3, then
   store r3 to [r0 + offset]. */
123 ldr r3, [r1, \offset]
124 str r3, [r0, \offset]
/* Step both pointers by one big block, then take one more big block off
   the counter. */
126 adds r0, __OPT_BIG_BLOCK_SIZE
127 adds r1, __OPT_BIG_BLOCK_SIZE
129 subs r2, __OPT_BIG_BLOCK_SIZE
/* Net effect on r2: + __OPT_BIG_BLOCK_SIZE, - __OPT_MID_BLOCK_SIZE, done in
   one instruction. blo (carry clear after the adds) branches to the
   word-by-word copy. */
133 adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
134 blo .Lcopy_word_by_word
136 /* Kernel loop for mid-block copy */
139 BEGIN_UNROLL_MID_BLOCK
140 #ifdef __ARM_ARCH_7EM__
144 #else /* __ARM_ARCH_7M__ */
/* Same ldr/str word copy through r3 as in the big-block loop. */
145 ldr r3, [r1, \offset]
146 str r3, [r0, \offset]
/* Step both pointers by one mid block, then take one more mid block off
   the counter. */
148 adds r0, __OPT_MID_BLOCK_SIZE
149 adds r1, __OPT_MID_BLOCK_SIZE
151 subs r2, __OPT_MID_BLOCK_SIZE
/* Net effect on r2: + __OPT_MID_BLOCK_SIZE, - 4 (one word). blo (carry
   clear after the adds) branches to the less-than-4-bytes tail. */
155 adds r2, __OPT_MID_BLOCK_SIZE - 4
156 blo .Lcopy_less_than_4
158 /* Kernel loop for small block copy */
160 .Lcopy_word_by_word_loop:
164 bhs .Lcopy_word_by_word_loop
176 #ifdef __ARM_FEATURE_UNALIGNED
184 #endif /* __ARM_FEATURE_UNALIGNED */
188 #ifdef __ARM_FEATURE_UNALIGNED
190 epilogue push_ip=HAVE_PAC_LEAF
192 epilogue 0 push_ip=HAVE_PAC_LEAF
193 #endif /* __ARM_FEATURE_UNALIGNED */
198 #ifdef __ARM_FEATURE_UNALIGNED
199 /* Alias label Ldst_aligned to Lbig_block, so that control goes straight
200 to the aligned copy once the destination has been aligned. */
201 #define Ldst_aligned Lbig_block
203 /* Copy word by word using LDR when alignment can be done in hardware,
204 i.e., when LDR and STR support unaligned access.
NOTE(review): this comment used to read "SCTLR.A is set"; an alignment-check
bit being set normally means unaligned accesses trap - confirm the intended
condition. */
209 /* If src is aligned, just go to the big block loop. */
213 /* If len < 12, the misalignment adjustment has more overhead than a
214 plain byte-to-byte copy. Also, len must be >= 8 to guarantee that the
215 code afterwards works correctly. */
218 #endif /* __ARM_FEATURE_UNALIGNED */
220 /* Align dst only, without trying to align src. That is because
221 handling an aligned src with a misaligned dst needs more overhead than
222 the other way round. By doing this, the worst case is when the initial
223 src is aligned: up to 4 additional byte copies will be executed, which is
239 #ifdef __ARM_FEATURE_UNALIGNED
248 /* Now that dst is aligned */
250 /* If r1 is aligned now, it means r0/r1 had the same misalignment,
251 and they are both aligned now. Go to the aligned copy. */
255 /* dst is aligned, but src isn't. Misaligned copy. */
/* CFI bookkeeping: the CFA offset grows by 8 here and is restored by the
   matching -8 adjustment further down. */
258 .cfi_adjust_cfa_offset 8
263 /* Move r1 backward by the misaligned byte count to make r1 aligned.
264 Since r1 must be restored to the unaligned address after the loop,
265 keep the offset in ip and subtract it from r1 afterward. */
269 /* Pre-load one word */
273 beq .Lmisaligned_copy_2_2
275 beq .Lmisaligned_copy_3_1
/* mis_src_copy: copy loop for a misaligned source, parameterised by a bit
   count `shift`. The invocations below pass shift=24 (3_1 case) and
   shift=16 (2_2 case). The shift direction applied to r3 is selected by
   __ARM_BIG_ENDIAN. */
277 .macro mis_src_copy shift
279 #ifdef __ARM_BIG_ENDIAN
285 #ifdef __ARM_BIG_ENDIAN
286 lsrs r5, r3, 32-\shift
288 lsls r5, r3, 32-\shift
297 .Lmisaligned_copy_1_3:
299 b .Lsrc_misaligned_tail
301 .Lmisaligned_copy_3_1:
302 mis_src_copy shift=24
303 b .Lsrc_misaligned_tail
305 .Lmisaligned_copy_2_2:
306 /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */
307 mis_src_copy shift=16
309 .Lsrc_misaligned_tail:
/* Undo the +8 CFA adjustment made before the misaligned copy. */
315 .cfi_adjust_cfa_offset -8
317 #endif /* __ARM_FEATURE_UNALIGNED */
321 blo .Lcopy_less_than_4
336 #ifdef __ARM_FEATURE_UNALIGNED
338 epilogue push_ip=HAVE_PAC_LEAF
340 epilogue 0 push_ip=HAVE_PAC_LEAF
341 #endif /* __ARM_FEATURE_UNALIGNED */
345 .size memcpy, .-memcpy