/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */
#include "../asmdefs.h"

/* Register aliases (x0-x2 hold the memcpy arguments per the AAPCS64).  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */
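
/* Illustrative C model of the size/overlap dispatch implemented below (a
   sketch, not part of the build; the enum and function name are invented
   for this comment).  The overlap test relies on unsigned wrap-around:
   dst - src < count holds exactly when dst starts inside [src, src + count),
   the only case where a forward copy could clobber source data it still
   needs.

	#include <stddef.h>
	#include <stdint.h>

	enum copy_path { SMALL, MEDIUM, LARGE_FORWARD, LARGE_BACKWARD };

	static enum copy_path
	classify (const void *dst, const void *src, size_t count)
	{
	  if (count <= 32)
	    return SMALL;			// 0..32 bytes
	  if (count <= 128)
	    return MEDIUM;			// 33..128 bytes
	  if ((uintptr_t) dst - (uintptr_t) src < count)
	    return LARGE_BACKWARD;		// dst starts inside the source
	  return LARGE_FORWARD;
	}
*/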

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret
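
	/* C sketch of the 16..32 byte path above (a hypothetical helper for
	   illustration, assuming <string.h>): one 16-byte block from each end
	   covers every length from 16 to 32; for count < 32 the two blocks
	   overlap in the middle, which is harmless because both loads complete
	   before either store.

		#include <string.h>

		static void
		copy_16_to_32 (unsigned char *dst, const unsigned char *src,
			       size_t count)	// requires 16 <= count <= 32
		{
		  unsigned char head[16], tail[16];	// A_q and B_q above
		  memcpy (head, src, 16);		// ldr A_q, [src]
		  memcpy (tail, src + count - 16, 16);	// ldr B_q, [srcend, -16]
		  memcpy (dst, head, 16);		// str A_q, [dstin]
		  memcpy (dst + count - 16, tail, 16);	// str B_q, [dstend, -16]
		}
	*/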

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret
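
	/* The three byte accesses above handle every length without branches:
	   for count == 1 all three use byte 0; for count == 2 they touch bytes
	   0, 1 and 1; for count == 3 they touch bytes 0, 1 and 2 (tmp1 is
	   count / 2).  The redundant overlapping writes store the same value,
	   so they are harmless.  */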

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret
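
	/* 33..64 bytes: two 32-byte blocks, one from each end of the buffer,
	   loaded up front and then stored.  As in the small case, the blocks
	   overlap in the middle when count < 64 and the overlap is harmless.  */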

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret
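
	/* 65..128 bytes: A/B and C/D cover the first and last 32 bytes, E/F
	   cover bytes 32..63, and the G/H pair for bytes count-64..count-33 is
	   loaded and stored only when count > 96, since for 65..96 bytes the
	   other three blocks already cover the whole buffer.  */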

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)
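
	/* tmp1 = dstin - src is compared unsigned against count: it is below
	   count only when the destination starts inside the source buffer
	   (dstin == src included, which the backwards path handles with an
	   early return).  A destination below the source wraps around to a
	   large value and stays on the forward path.  */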

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)
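
	/* The loop is software pipelined: each iteration stores the 64 bytes
	   loaded on the previous iteration (or by the prologue above) while
	   loading the next 64, so loads stay one iteration ahead of stores.  */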

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret
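
	/* Rather than handling a variable-sized tail, the final 64 bytes are
	   always copied from the very end of the buffer.  These stores may
	   overlap bytes the loop has already written, but they rewrite them
	   with the same values, so the result is unchanged.  */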

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)
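
	/* Mirror image of the forward prologue: store the (unaligned) last 16
	   bytes first, round srcend down to 16 bytes, and preload 64 bytes
	   from the end so the loop below can store one iteration behind its
	   loads.  */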

L(loop64_backwards):
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	stp	C_q, D_q, [dstend, -64]
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	sub	dstend, dstend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)
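
	/* The backwards loop walks from the top of the buffers down.  Since
	   the overlapping case has the destination above the source, every
	   store lands above the source bytes that remain to be loaded, so
	   nothing is clobbered before it is read.  */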

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)