/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy, otherwise copy in reverse order.
 */
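/*
 * Arguments follow the AAPCS: x0 = dest, x1 = src, x2 = count, and dest is
 * returned in x0. As a rough C-level sketch of the dispatch (a
 * simplification, not the exact entry sequence):
 *
 *	if (dest <= src || dest >= src + count)
 *		return memcpy(dest, src, count);	// forward copy is safe
 *	// Otherwise the buffers overlap with dest above src, so copy
 *	// backwards, working down from the end of both buffers.
 */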
	b.hs	__memcpy	/* No overlap. */

	b.lo	.Ltail15	/* Probably unaligned accesses. */

	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	sub	count, count, tmp2
	/*
	 * Copy enough bytes to make src aligned first. The cost of these
	 * extra instructions is acceptable, and it ensures that all of the
	 * accesses which follow operate on aligned addresses.
	 */
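	/*
	 * tmp2 = src & 15 is the number of bytes needed to reach a 16-byte
	 * boundary; each set bit selects one of the backward copies below
	 * (1, 2, 4 or 8 bytes), taken off the top end of the buffer.
	 */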
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!

	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!

	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!

	tbz	tmp2, #3, .LSrcAligned
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */

	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
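	/*
	 * count & 0x30 says how many 16-byte chunks (0-3) remain on top of
	 * the final 15 bytes; one ldp/stp pair below runs per chunk, still
	 * moving backwards through the buffer.
	 */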
	ands	tmp1, count, #0x30

	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
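	/*
	 * Tail: at most 15 bytes remain. Each set bit of count picks one of
	 * the copies below (8, 4, 2 and finally 1 byte), mirroring the
	 * alignment code above.
	 */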
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!

	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!

	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
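	/*
	 * 64 bytes have been moved; whatever is left (count & 0x3f bytes) is
	 * finished off by the tail-copy code above.
	 */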

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
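	/*
	 * .p2align aligns the location counter to a 2^L1_CACHE_SHIFT byte
	 * boundary, i.e. to the start of an L1 cache line.
	 */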
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
	/*
	 * Interleave the load of the next 64-byte block with the store of the
	 * 64 bytes loaded on the previous iteration.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
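	/*
	 * The loop repeats while count stays non-negative; on exit the 64
	 * bytes loaded last are still held in the A/B/C/D register pairs and
	 * are written out by the stores below.
	 */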
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!