/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro. The original code can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dst is below src or the buffers do not overlap, tail-call memcpy;
 * otherwise copy backwards, starting from the end of the buffers.
 */
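/*
 * Parameters (AAPCS64):
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */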
SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	b.hs	__memcpy			/* No overlap. */
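	/*
	 * dst overlaps src from above: point both registers at the end of
	 * the buffers and copy backwards.
	 */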
	b.lo	.Ltail15	/* Small copy: accesses are probably unaligned. */
	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	sub	count, count, tmp2
	/*
	 * Copy the unaligned offset bytes first so that src becomes
	 * aligned. The cost of these extra instructions is acceptable,
	 * and the following accesses are then based on aligned addresses.
	 */
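	/* Bits 0-2 of tmp2 select a 1-, 2- or 4-byte copy; bit 3 an 8-byte copy. */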
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	tbz	tmp2, #3, .LSrcAligned
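	/* Bit 3 set: one more 8-byte copy leaves src 16-byte aligned. */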
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
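	/*
	 * tmp1 is 0, 16, 32 or 48; enter the ldp/stp ladder below at the
	 * matching point so that exactly that many bytes are copied.
	 */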
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
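	/*
	 * Tail: copy the remaining 0-15 bytes, 8/4/2/1 bytes at a time,
	 * driven by the low bits of count.
	 */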
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
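	/*
	 * 64 bytes or more: subtracting 128 both tests for the large-copy
	 * path (count >= 128) and pre-biases count for the main loop's
	 * subs/b.ge termination check.
	 */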
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
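	/* Up to 63 bytes may remain; they are handled by the tail code. */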
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
	/*
	 * Interleave the loads of the next 64-byte block with the stores of
	 * the previously loaded 64 bytes.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
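	/* Loop back while another full 64-byte block remains (count was biased by -128). */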
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
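	/* The last pre-loaded 64 bytes are now stored; hand any remaining 0-63 bytes to the tail. */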
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)