/* SPDX-License-Identifier: GPL-2.0-only */
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 * This code is based on glibc cortex strings work originally authored by Linaro
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy; otherwise copy in reverse order.
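 *
 * A rough C-level sketch of the strategy (illustration only; the assembly
 * below additionally aligns src and moves data in 16- and 64-byte blocks):
 *
 *	// assumes size_t and memcpy() as declared in <string.h>
 *	void *memmove(void *dest, const void *src, size_t n)
 *	{
 *		if ((unsigned long)dest - (unsigned long)src >= n)
 *			return memcpy(dest, src, n);	// dest below src, or no overlap
 *		// dest overlaps src from above: copy backwards, highest byte first
 *		while (n--)
 *			((char *)dest)[n] = ((const char *)src)[n];
 *		return dest;
 *	}
 *
 * For example, memmove(buf + 1, buf, n) has dest above an overlapping src and
 * must copy backwards (the code below), while memmove(buf, buf + 1, n) is safe
 * to copy forwards and is simply handed to memcpy.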
	b.hs	__memcpy		/* No overlap. */

	b.lo	.Ltail15		/* Probably unaligned accesses. */
	ands	tmp2, src, #15		/* Bytes to reach alignment. */
	sub	count, count, tmp2
 * Copy the unaligned part first so that src becomes 16-byte aligned. The
 * cost of these extra instructions is acceptable, and it also means that
 * the accesses which follow use aligned addresses.
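 *
 * For example, if src & 15 == 6 (binary 0110), the chain below copies
 * 2 bytes and then 4 bytes backwards, one step per set bit of that value,
 * after which src is 16-byte aligned; count has already been reduced by
 * those 6 bytes above.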
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!

	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!

	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!

	tbz	tmp2, #3, .LSrcAligned
 * Deal with small copies quickly by dropping straight into the
 * exit block.
 * Copy up to 48 bytes of data. At this point we only need the
 * bottom 6 bits of count to be accurate.
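 *
 * count & 0x30 isolates bits 4-5, i.e. 0, 16, 32 or 48 bytes; each ldp/stp
 * pair below moves 16 bytes, and whatever remains below 16 bytes is picked
 * up by the 8/4/2/1-byte copies further down.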
	ands	tmp1, count, #0x30
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!

	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!

	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
	subs	count, count, #128
	b.ge	.Lcpy_body_large
 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
 * to the tail.
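 *
 * For example, with count == 100 the subs above leaves count == -28: the
 * branch to .Lcpy_body_large is not taken, 64 bytes are copied here, and
 * the remaining 36 bytes fall through to the tail code.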
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
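
	/*
	 * The four ldp/stp pairs above move 64 bytes working downwards from
	 * the current pointers; only the final pair uses pre-indexed
	 * writeback, leaving src and dst 64 bytes lower for the tail copy.
	 */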
 * Critical loop. Start at a new cache line boundary. Assuming
 * 64 bytes per line this ensures the entire loop is in one line.
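 *
 * (.p2align L1_CACHE_SHIFT pads out to a 2^L1_CACHE_SHIFT-byte boundary, so
 * the block below starts at the beginning of a cache line; the pre-load plus
 * the loop body is only a few dozen bytes of code, well within 64 bytes.)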
	.p2align	L1_CACHE_SHIFT
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
 * Interleave the load of the next 64-byte block with the store of the
 * 64 bytes just loaded.
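 *
 * Each pass stores the block fetched on the previous pass while issuing the
 * loads for the next one, so every load has a full iteration to complete
 * before a store consumes its data.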
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
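
	/*
	 * Once the loop falls through, the 64 bytes loaded by its final
	 * iteration have not been stored yet; the four stores below drain
	 * that last block.
	 */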
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
EXPORT_SYMBOL(memmove)
EXPORT_SYMBOL(__memmove)