/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware).
 */
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	sub	count, count, tmp2
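	/*
	 * For example, if src is 7 bytes past a 16-byte boundary, tmp2 is 9:
	 * nine leading bytes are copied and count shrinks by nine, leaving
	 * src 16-byte aligned for the bulk copy.
	 */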
	/*
	 * Copy the leading data from src to dst in increasing address order.
	 * This way, the risk of overwriting source data is eliminated when
	 * the distance between src and dst is less than 16 bytes. The memory
	 * accesses here are aligned.
	 */
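	/*
	 * Each set bit of tmp2 selects a head copy of that power-of-two size
	 * (1, 2, 4 or 8 bytes), performed in increasing address order.
	 */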
	tbz	tmp2, #3, .LSrcAligned
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
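	/*
	 * tmp1 holds bits 4-5 of count, i.e. 0, 16, 32 or 48 bytes' worth of
	 * whole 16-byte blocks still to copy before the final 0-15 byte tail.
	 */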
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
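	/* At most 15 bytes of the tail now remain to be copied. */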
	/*
	 * Prefer to break one ldp/stp into several loads/stores so that
	 * memory is accessed in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) and winding
	 * src back to an aligned address, as the original cortex-strings
	 * memcpy does. If the original approach were kept here, memmove
	 * would have to satisfy the precondition that src is at least 16
	 * bytes above dst, otherwise some source data would be overwritten
	 * when memmove calls memcpy directly. To keep memmove simple and to
	 * decouple memcpy from memmove, that approach was dropped.
	 */
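	/*
	 * For example, a 15-byte tail is copied as 8 + 4 + 2 + 1 bytes, each
	 * chunk at a higher address than the previous one.
	 */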
	tbz	count, #0, .Lexitfunc
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16
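	/*
	 * 64 bytes copied; the remaining 0-63 bytes are left to the
	 * small-copy tail above.
	 */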
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
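	/*
	 * L1_CACHE_SHIFT is log2 of the L1 cache line size, so .p2align
	 * places the loop entry on a cache-line boundary.
	 */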
	/* Preload the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
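	/*
	 * count was pre-biased by 128 above: it remains non-negative, and
	 * the loop repeats, only while at least another full 64-byte block
	 * remains to be loaded beyond the 64 bytes already held in A-D.
	 */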
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16
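	/*
	 * Up to 63 trailing bytes may remain; they are handled by the
	 * small-copy tail.
	 */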