/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */
#include "../asmdefs.h"
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path.  This option is not intended for production use.  */
/* Arguments and results.  */
/* Locals and temporaries.  */
#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
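	/* For example, with X = 0x4141414100414141 ('A' bytes and a NUL in
	   byte 3):
	     X - REP8_01      = 0x4040403fff404040
	     ~(X | REP8_7f)   = 0x8080808080808080
	     AND of the two   = 0x0000000080000000
	   so a 0x80 "syndrome" bit appears exactly in the byte that was
	   zero; bytes below the first NUL can never produce a false match
	   (bytes above it can, which the code accounts for when locating
	   the NUL).  */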
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
	   page size check for crossing this boundary on entry and if we
	   do not, then we can short-circuit much of the entry code.  We
	   expect early page-crossing strings to be rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of setting
	   up the correct page size is just not worth the extra gain from
	   a small reduction in the cases taking the slow path.  Note that
	   we only care about whether the first fetch, which may be
	   misaligned, crosses a page boundary - after that we move to aligned
	   fetches for the remainder of the string.  */
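	/* Concretely, with MIN_PAGE_SIZE = 4096 the first 16-byte fetch can
	   only straddle a page when srcin's offset within the page is above
	   4096 - 16 = 4080, which for random pointers happens with
	   probability of roughly 16/4096 ~= 0.4%.  */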
#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result.  This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate against this by preferring
	   conditional select operations over branches whenever this is
	   possible.  */
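	/* Roughly, the fast path behaves like: load 16 bytes, find the NUL
	   within them, then store the already-loaded bytes back with a
	   couple of stores whose width depends on the length, letting the
	   stores overlap instead of copying byte by byte.  */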
	and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov zeroones, #REP8_01
	and to_align, srcin, #15
	cmp tmp2, #(MIN_PAGE_SIZE - 16)
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
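	/* For example, with 4k pages: a srcin page offset of 0xff5 gives
	   tmp2 = 0xff5 > 0xff0 = MIN_PAGE_SIZE - 16, so the 16 bytes at
	   0xff5..0x1004 would cross into the next page and the slow entry
	   path is used; a 16-byte aligned srcin has tmp2 <= 0xff0 and so
	   always falls through to the fast path.  */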
	ldp data1, data2, [srcin]
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	sub tmp1, tmp2, zeroones
	orr tmp2, tmp2, #REP8_7f
	bics has_nul1, tmp1, tmp2
	sub tmp3, tmp4, zeroones
	orr tmp4, tmp4, #REP8_7f
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	bics has_nul1, tmp1, tmp2
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bics has_nul2, tmp3, tmp4
	/* The string is short (<=16 bytes).  We don't know exactly how
	   short though, yet.  Work out the exact length so that we can
	   quickly select the optimal copy strategy.  */
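	/* For example (little-endian): a NUL in byte 5 of a Dword yields the
	   syndrome 0x0000800000000000; after rev this becomes
	   0x0000000000800000, counting the leading zeros gives pos = 40
	   bits, and 40 >> 3 = 5 recovers the byte index of the NUL.  */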
	rev has_nul2, has_nul2
	add dst, dstin, pos, lsr #3 /* Bits to bytes. */
	lsr data2, data2, pos
	lsl data2, data2, pos
	rev has_nul1, has_nul1
	add dst, dstin, pos, lsr #3 /* Bits to bytes. */
	subs tmp2, pos, #24 /* Pos in bits. */
	lsr data2, data1, pos
	lsr data1, data1, #32
	lsr data2, data1, tmp2
	/* 4->7 bytes to copy.  */
	str data2w, [dst, #-3]
	/* 2->3 bytes to copy.  */
	lsr data1, data1, #48
	/* Fall-through, one byte (max) to go.  */
	/* Null-terminated string.  Last character must be zero!  */
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
	sub to_align, to_align, #16
	stp data1, data2, [dstin]
	sub src, srcin, to_align
	sub dst, dstin, to_align
	b L(entry_no_page_cross)
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
	stp data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp data1, data2, [src], #16
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	bics has_nul2, tmp3, tmp4
	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
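	/* The bics sets Z only when has_nul2 is zero (no NUL in the second
	   Dword); the ccmp then tests has_nul1 in that case and forces Z
	   clear otherwise, so a single conditional branch on "eq" is enough
	   to decide whether either Dword contained a NUL.  */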
	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
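	/* In other words, the final load/store pair is positioned to end
	   exactly at the trailing NUL: e.g. if the NUL is byte 5 of the last
	   16-byte chunk, src and dst advance by 6 and the 16 bytes ending at
	   the NUL are re-copied, overlapping bytes already written rather
	   than copying a variable-sized remainder.  */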
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
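	/* Concretely: the subtraction borrows out of the zero byte into the
	   next more significant byte, and if that byte is 0x01 it also ends
	   up with a 0x80 syndrome bit.  With little-endian data the spurious
	   bit corresponds to a byte after the first NUL and is harmless;
	   with big-endian data it corresponds to the character just before
	   the NUL, so the syndrome has to be recomputed from byte-swapped
	   data.  */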
	csel data1, data1, data2, ne
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	bic has_nul1, tmp1, tmp2
	csel has_nul1, has_nul1, has_nul2, ne
	rev has_nul1, has_nul1
	csel pos, pos, tmp1, ne
	add src, src, pos, lsr #3
	add dst, dst, pos, lsr #3
	ldp data1, data2, [src, #-32]
	stp data1, data2, [dst, #-16]
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
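	/* E.g. if srcin is 3 bytes past a 16-byte boundary, the three bytes
	   preceding srcin occupy one end of the first loaded Dword (the low
	   end for little-endian); OR-ing them with 0xff means the NUL test
	   can never match on them, while the bytes from srcin onwards are
	   left unchanged.  */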
	ldp data1, data2, [src]
	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
	lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
	lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
	orr data1, data1, tmp2
	orr data2a, data2, tmp2
	csinv data1, data1, xzr, lt
	csel data2, data2, data2a, lt
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	bics has_nul2, tmp3, tmp4
	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
	b.eq L(page_cross_ok)
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
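	/* In effect (little-endian case): the 128-bit value data2:data1 is
	   shifted right by 8 * to_align bits so that the byte at srcin ends
	   up in byte 0 of data1 again; the shift/orr sequences below build
	   the two 64-bit halves of that result and a csel selects between
	   them depending on whether to_align is below 8.  */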
	lsl tmp1, to_align, #3 /* Bytes->bits. */
	neg tmp2, to_align, lsl #3
	lsl data1a, data1, tmp1
	lsr tmp4, data2, tmp2
	lsl data2, data2, tmp1
	orr tmp4, tmp4, data1a
	csel data1, tmp4, data2, lt
	sub tmp1, tmp2, zeroones
	orr tmp2, tmp2, #REP8_7f
	sub tmp3, tmp4, zeroones
	orr tmp4, tmp4, #REP8_7f
	lsr data1a, data1, tmp1
	lsl tmp4, data2, tmp2
	lsr data2, data2, tmp1
	orr tmp4, tmp4, data1a
	csel data1, tmp4, data2, lt
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	cbnz has_nul1, L(fp_le8)
	bic has_nul2, tmp3, tmp4