 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 */
	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15
	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	sub	count, count, tmp2
	/*
	 * Copy the leading memory data from src to dst in increasing
	 * address order. This way the risk of overwriting the source
	 * data is eliminated when the distance between src and dst is
	 * less than 16. The memory accesses here are aligned.
	 */
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
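	/*
	 * For example, if src & 15 == 8 then tmp2 == 8, only bit 3 is set,
	 * and the single 8-byte copy above brings src up to a 16-byte
	 * boundary before the aligned bulk copy starts.
	 */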
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
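	/*
	 * The three 16-byte copies above are fall-through entry points:
	 * count & 0x30 == 0x30 copies 48 bytes, 0x20 enters at 1: and
	 * copies 32, 0x10 enters at 2: and copies 16, and the remaining
	 * count & 15 bytes are handled at .Ltiny15.
	 */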
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than loading/storing 16
	 * bytes from (src-16) to (dst-16) and winding src back to an aligned
	 * address, as the original cortex memcpy does. If that scheme were
	 * kept here, memmove would have to guarantee that the src address is
	 * at least 16 bytes above the dst address, otherwise some source
	 * data would be overwritten when memmove calls memcpy directly. To
	 * keep memmove simple and to decouple memcpy from memmove, that
	 * scheme was dropped.
	 */
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
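	/*
	 * For example, a forward memmove with dst == src - 8 stays safe
	 * under this scheme: each block is loaded before the store that
	 * clobbers its low bytes, and every later load is from a strictly
	 * higher address, so no source byte is read after being overwritten.
	 */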
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16
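	/*
	 * For example, with 100 bytes left on entry here the four ldp1/stp1
	 * pairs above move 64 of them, and the remaining 36 (count & 0x3f)
	 * are finished by the tail code at .Ltail63.
	 */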
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
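	/*
	 * .p2align n pads to a 2^n-byte boundary, so with the 64-byte lines
	 * assumed above (L1_CACHE_SHIFT == 6) the loop below starts at a
	 * cache-line boundary.
	 */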
.Lcpy_body_large:
	/* Preload the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16
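	/*
	 * Each pass of the loop therefore stores the 64 bytes fetched on
	 * the previous pass while fetching the next 64, keeping loads and
	 * stores in flight together; the four stp1 above drain the last
	 * 64 bytes once count goes negative and the loop exits.
	 */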