/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dest <= src, or the buffers do not overlap, call memcpy;
 * otherwise copy in reverse order.
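 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */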
	b.hs	memcpy			/* No overlap. */
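	/*
	 * The destination starts inside the source buffer: copy backwards,
	 * starting from the end of both buffers, so that source bytes are
	 * read before they are overwritten.
	 */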
	b.lo	.Ltail15	/* Small copy: probably unaligned accesses. */
	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	sub	count, count, tmp2
	/*
	 * Copy enough bytes first to make src aligned; the cost of these
	 * extra instructions is acceptable, and it means that the accesses
	 * which follow all use aligned addresses.
	 */
	ldrb	tmp1w, [src, #-1]!	/* 1 byte */
	strb	tmp1w, [dst, #-1]!
	ldrh	tmp1w, [src, #-2]!	/* 2 bytes */
	strh	tmp1w, [dst, #-2]!
	ldr	tmp1w, [src, #-4]!	/* 4 bytes */
	str	tmp1w, [dst, #-4]!
	tbz	tmp2, #3, .LSrcAligned	/* Bit 3 clear: src is now 16-byte aligned. */
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
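	/*
	 * tmp1 is 0, 16, 32 or 48: the number of bytes in whole 16-byte
	 * chunks still to copy.  The code branches past the unneeded
	 * ldp/stp pairs below, then falls into the 0-15 byte tail.
	 */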
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
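	/*
	 * .Ltail15: copy the remaining 8/4/2/1 bytes, each selected by one
	 * of the low bits of count.
	 */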
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
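	/*
	 * More than 64 bytes remain.  If at least 128 are left, use the
	 * cache-line aligned main loop below; otherwise copy a single
	 * 64-byte block here and finish with the tail code.
	 */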
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
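	/*
	 * 64 bytes copied; any remainder in the bottom 6 bits of count is
	 * finished by the tail code above.
	 */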
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the last 64 bytes loaded.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
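	/*
	 * The loop repeats while at least 64 bytes remain; on exit the
	 * last 64 bytes loaded above still have to be stored.
	 */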
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
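	/* Any bytes left in the bottom 6 bits of count go back through the tail code. */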