* memcpy - copy memory area
*
* Copyright (c) 2012-2022, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*
* ARMv8-a, AArch64, unaligned accesses.
14 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See memcpy-stub.c */
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
64 add srcend, src, count
65 add dstend, dstin, count
71 /* Small copies: 0..32 bytes. */
75 ldp D_l, D_h, [srcend, -16]
77 stp D_l, D_h, [dstend, -16]
80 /* Copy 8-15 bytes. */
82 tbz count, 3, L(copy8)
92 tbz count, 2, L(copy4)
94 ldr B_lw, [srcend, -4]
96 str B_lw, [dstend, -4]
99 /* Copy 0..3 bytes using a branchless sequence. */
104 ldrb C_lw, [srcend, -1]
105 ldrb B_lw, [src, tmp1]
107 strb B_lw, [dstin, tmp1]
108 strb C_lw, [dstend, -1]
113 /* Medium copies: 33..128 bytes. */
116 ldp B_l, B_h, [src, 16]
117 ldp C_l, C_h, [srcend, -32]
118 ldp D_l, D_h, [srcend, -16]
121 stp A_l, A_h, [dstin]
122 stp B_l, B_h, [dstin, 16]
123 stp C_l, C_h, [dstend, -32]
124 stp D_l, D_h, [dstend, -16]
128 /* Copy 65..128 bytes. */
130 ldp E_l, E_h, [src, 32]
131 ldp F_l, F_h, [src, 48]
134 ldp G_l, G_h, [srcend, -64]
135 ldp H_l, H_h, [srcend, -48]
136 stp G_l, G_h, [dstend, -64]
137 stp H_l, H_h, [dstend, -48]
139 stp A_l, A_h, [dstin]
140 stp B_l, B_h, [dstin, 16]
141 stp E_l, E_h, [dstin, 32]
142 stp F_l, F_h, [dstin, 48]
143 stp C_l, C_h, [dstend, -32]
144 stp D_l, D_h, [dstend, -16]
148 /* Copy more than 128 bytes. */
150 /* Use backwards copy if there is an overlap. */
154 b.lo L(copy_long_backwards)
156 /* Copy 16 bytes and then align dst to 16-byte alignment. */
162 add count, count, tmp1 /* Count is now 16 too large. */
163 ldp A_l, A_h, [src, 16]
164 stp D_l, D_h, [dstin]
165 ldp B_l, B_h, [src, 32]
166 ldp C_l, C_h, [src, 48]
167 ldp D_l, D_h, [src, 64]!
168 subs count, count, 128 + 16 /* Test and readjust count. */
169 b.ls L(copy64_from_end)
172 stp A_l, A_h, [dst, 16]
173 ldp A_l, A_h, [src, 16]
174 stp B_l, B_h, [dst, 32]
175 ldp B_l, B_h, [src, 32]
176 stp C_l, C_h, [dst, 48]
177 ldp C_l, C_h, [src, 48]
178 stp D_l, D_h, [dst, 64]!
179 ldp D_l, D_h, [src, 64]!
180 subs count, count, 64
183 /* Write the last iteration and copy 64 bytes from the end. */
185 ldp E_l, E_h, [srcend, -64]
186 stp A_l, A_h, [dst, 16]
187 ldp A_l, A_h, [srcend, -48]
188 stp B_l, B_h, [dst, 32]
189 ldp B_l, B_h, [srcend, -32]
190 stp C_l, C_h, [dst, 48]
191 ldp C_l, C_h, [srcend, -16]
192 stp D_l, D_h, [dst, 64]
193 stp E_l, E_h, [dstend, -64]
194 stp A_l, A_h, [dstend, -48]
195 stp B_l, B_h, [dstend, -32]
196 stp C_l, C_h, [dstend, -16]
201 /* Large backwards copy for overlapping copies.
202 Copy 16 bytes and then align dst to 16-byte alignment. */
203 L(copy_long_backwards):
204 ldp D_l, D_h, [srcend, -16]
206 sub srcend, srcend, tmp1
207 sub count, count, tmp1
208 ldp A_l, A_h, [srcend, -16]
209 stp D_l, D_h, [dstend, -16]
210 ldp B_l, B_h, [srcend, -32]
211 ldp C_l, C_h, [srcend, -48]
212 ldp D_l, D_h, [srcend, -64]!
213 sub dstend, dstend, tmp1
214 subs count, count, 128
215 b.ls L(copy64_from_start)
218 stp A_l, A_h, [dstend, -16]
219 ldp A_l, A_h, [srcend, -16]
220 stp B_l, B_h, [dstend, -32]
221 ldp B_l, B_h, [srcend, -32]
222 stp C_l, C_h, [dstend, -48]
223 ldp C_l, C_h, [srcend, -48]
224 stp D_l, D_h, [dstend, -64]!
225 ldp D_l, D_h, [srcend, -64]!
226 subs count, count, 64
227 b.hi L(loop64_backwards)
229 /* Write the last iteration and copy 64 bytes from the start. */
230 L(copy64_from_start):
231 ldp G_l, G_h, [src, 48]
232 stp A_l, A_h, [dstend, -16]
233 ldp A_l, A_h, [src, 32]
234 stp B_l, B_h, [dstend, -32]
235 ldp B_l, B_h, [src, 16]
236 stp C_l, C_h, [dstend, -48]
238 stp D_l, D_h, [dstend, -64]
239 stp G_l, G_h, [dstin, 48]
240 stp A_l, A_h, [dstin, 32]
241 stp B_l, B_h, [dstin, 16]
242 stp C_l, C_h, [dstin]