1 /* Copyright (c) 2010-2011, Linaro Limited
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
11 * Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
15 * Neither the name of Linaro Limited nor the names of its
16 contributors may be used to endorse or promote products derived
17 from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 Written by Dave Gilbert <david.gilbert@linaro.org>
33 This memchr routine is optimised on a Cortex-A9 and should work on
34 all ARMv7 processors. It has a fast path for short sizes, and has
35 an optimised path for large data sets; the worst case is finding the
36 match early in a large data set. */
38 /* Copyright (c) 2015 ARM Ltd.
41 Redistribution and use in source and binary forms, with or without
42 modification, are permitted provided that the following conditions are met:
43 * Redistributions of source code must retain the above copyright
44 notice, this list of conditions and the following disclaimer.
45 * Redistributions in binary form must reproduce the above copyright
46 notice, this list of conditions and the following disclaimer in the
47 documentation and/or other materials provided with the distribution.
48 * Neither the name of the Linaro nor the
49 names of its contributors may be used to endorse or promote products
50 derived from this software without specific prior written permission.
52 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
53 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
54 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
55 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
56 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
57 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
58 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
60 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
61 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
64 @ 2011-02-07 david.gilbert@linaro.org
65 @ Extracted from local git a5b438d861
66 @ 2011-07-14 david.gilbert@linaro.org
67 @ Import endianness fix from local git ea786f1b
68 @ 2011-10-11 david.gilbert@linaro.org
69 @ Import from cortex-strings bzr rev 63
70 @ Flip to ldrd (as suggested by Greta Yorsh)
71 @ Make conditional on CPU type
74 @ This code requires armv6t2 or later. Uses Thumb2.
78 #include "arm-acle-compat.h"

@ NOTE(review): this extract is non-contiguous -- the embedded original line
@ numbers (78, 81, 82, ...) skip values, so the function label, the
@ .Lsmallstr/.Lloopintro label definitions, the small-loop compare, and a few
@ other instructions are not visible here.  Verify against the upstream
@ cortex-strings / newlib memchr.S before relying on this listing.

81 @ NOTE: This ifdef MUST match the one in memchr-stub.c
82 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
83 #if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'R'

@ Register aliases.  r0 and r1 are deliberately reused once the incoming
@ argument values are dead; the per-alias comments record the disjoint live
@ ranges that make the reuse safe.
97 #define result r0 /* Live range does not overlap with srcin */
99 /* Working registers */
100 #define src r1 /* Live range does not overlap with chrin */
102 #define synd r0 /* No overlap with srcin or result */

105 /* Working NEON registers */
@ vdata0/vdata1 are q-registers; the _0/_1 aliases name their d-register
@ halves so the vpadd narrowing steps below can work on 64-bit lanes.
108 #define vdata0_0 d2 /* Lower half of vdata0 */
109 #define vdata0_1 d3 /* Upper half of vdata0 */
111 #define vdata1_0 d4 /* Lower half of vhas_chr0 */
112 #define vdata1_1 d5 /* Upper half of vhas_chr0 */

123 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
124 * byte. Each bit is set if the relevant byte matched the requested character
125 * and cleared otherwise. Since the bits in the syndrome reflect exactly the
126 * order in which things occur in the original string, counting trailing zeros
127 * allows one to identify exactly which byte has matched.

135 .type memchr,%function
138 .cfi_sections .debug_frame

140 /* Use a simple loop if there are fewer than 8 bytes to search. */
143 and chrin, chrin, #0xff @ memchr takes an int; only the low byte matters
@ Byte-at-a-time search for short buffers (loop label not visible in extract).
146 subs cntin, cntin, #1
147 blo .Lnotfound /* Return not found if reached end. */
148 ldrb tmp, [srcin], #1 @ post-increment load of the next byte
150 bne .Lsmallstr /* Loop again if not found. */
151 /* Otherwise fixup address and return. */
152 sub result, result, #1 @ undo the post-increment so result points at the match

@ --- NEON path for larger searches ---
157 vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */
159 * Magic constant 0x8040201008040201 allows us to identify which lane
160 * matches the requested byte.
@ NOTE(review): the instructions that materialise the magic constant into the
@ core registers are not visible here; the two vmovs below build the 128-bit
@ per-lane weight mask from them.  Confirm the source registers upstream.
165 vmov vrepmask0, tmp, soff
166 vmov vrepmask1, tmp, soff
167 /* Work with aligned 32-byte chunks */
169 ands soff, srcin, #31 @ soff = misalignment within the 32-byte block
170 beq .Lloopintro /* Go straight to main loop if it's aligned. */

173 * Input string is not 32-byte aligned. We calculate the syndrome
174 * value for the aligned 32 bytes block containing the first bytes
175 * and mask the irrelevant part.
177 vld1.8 {vdata0, vdata1}, [src:256]! @ aligned 32-byte load (256-bit alignment hint)
179 adds cntin, cntin, tmp @ account for the pre-srcin bytes of this block
@ Syndrome computation: byte-compare, weight each matching lane by its mask
@ bit, then pairwise-add down to a single 32-bit word (one bit per byte).
180 vceq.i8 vdata0, vdata0, vrepchr
181 vceq.i8 vdata1, vdata1, vrepchr
182 vand vdata0, vdata0, vrepmask
183 vand vdata1, vdata1, vrepmask
184 vpadd.i8 vdata0_0, vdata0_0, vdata0_1
185 vpadd.i8 vdata1_0, vdata1_0, vdata1_1
186 vpadd.i8 vdata0_0, vdata0_0, vdata1_0
187 vpadd.i8 vdata0_0, vdata0_0, vdata0_0
188 vmov synd, vdata0_0[0] @ move the 32-bit syndrome into a core register
190 /* Clear the soff lower bits */
193 /* The first block can also be the last */
195 /* Have we found something already? */

@ Main-loop prologue.  d8/d9 (q4) are callee-saved under the AAPCS, hence the
@ CFI records for their stack save (the push itself is not visible here).
201 /* 264/265 correspond to d8/d9 for q4 */
202 .cfi_adjust_cfa_offset 16
203 .cfi_rel_offset 264, 0
204 .cfi_rel_offset 265, 8

@ Main loop: one aligned 32-byte chunk per iteration.
207 vld1.8 {vdata0, vdata1}, [src:256]!
208 subs cntin, cntin, #32 @ flags feed the out-of-data test below
209 vceq.i8 vdata0, vdata0, vrepchr
210 vceq.i8 vdata1, vdata1, vrepchr
211 /* If we're out of data we finish regardless of the result. */
213 /* Use a fast check for the termination condition. */
214 vorr vend, vdata0, vdata1 @ cheap "any byte matched?" reduction
215 vorr vend0, vend0, vend1
216 vmov synd, tmp, vend0
218 /* We're not out of data, loop if we haven't found the character. */
223 .cfi_adjust_cfa_offset -16

227 /* Termination condition found, let's calculate the syndrome value. */
@ Same narrowing sequence as the misaligned-head path above.
228 vand vdata0, vdata0, vrepmask
229 vand vdata1, vdata1, vrepmask
230 vpadd.i8 vdata0_0, vdata0_0, vdata0_1
231 vpadd.i8 vdata1_0, vdata1_0, vdata1_1
232 vpadd.i8 vdata0_0, vdata0_0, vdata1_0
233 vpadd.i8 vdata0_0, vdata0_0, vdata0_0
234 vmov synd, vdata0_0[0]

240 /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
@ NOTE(review): presumably cntin holds the negated overshoot at this point
@ (the negation is not visible in this extract); the shift pair discards
@ syndrome bits for bytes past the end of the buffer.
242 lsl synd, synd, cntin
243 lsrs synd, synd, cntin
245 moveq src, #0 /* If no match, set src to 0 so the retval is 0. */

249 /* Count the trailing zeros using bit reversing */
251 /* Compensate the last post-increment */
253 /* Count the leading zeros */
255 /* Compute the potential result and return */
256 add result, src, synd @ chunk base address + byte index of first match

261 /* Set result to NULL if not found and return */
266 .size memchr, . - memchr
268 #elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)

@ NOTE(review): as with the NEON section, this extract is non-contiguous --
@ the numeric label definitions (5:, 15:, 40:, 50:), the ldrd loads, the
@ push/pop instructions documented by the CFI directives, and several
@ branches are not visible.  Verify against the upstream cortex-strings
@ memchr.S before relying on this listing.

270 #if __ARM_ARCH_PROFILE == 'M'
272 /* keep config inherited from -march=. */
275 #endif /* __ARM_ARCH >= 8 */
278 #endif /* __ARM_ARCH_PROFILE == 'M' */

280 @ this lets us check a flag in a 00/ff byte easily in either endianness
@ Bit position of byte c's flag within a 32-bit word.  The two definitions
@ cover big-endian (first) and little-endian (second) layouts; the
@ #if/#else selecting between them is not visible in this extract.
282 #define CHARTSTMASK(c) 1<<(31-(c*8))
284 #define CHARTSTMASK(c) 1<<(c*8)

289 @ ---------------------------------------------------------------------------
294 .type memchr,%function
296 .cfi_sections .debug_frame

@ void *memchr(const void *s, int c, size_t n)
299 @ r0 = start of memory to scan
300 @ r1 = character to look for
@ r2 = number of bytes to scan (per the cmp/cbz uses of r2 below)
302 @ returns r0 = pointer to character or NULL if not found
304 and r1,r1,#0xff @ Don't trust the caller to pass a char
306 cmp r2,#16 @ If short don't bother with anything clever
309 tst r0, #7 @ If it's already aligned skip the next bit

312 @ Work up to an aligned point
@ Byte loop until r0 is 8-byte aligned (label 5 and the load not visible).
317 beq 50f @ If it matches exit found
319 cbz r2, 40f @ If we run off the end, exit not found
320 bne 5b @ If not aligned yet then do next byte

323 @ We are aligned, we know we have at least 8 bytes to work with
@ Prologue saves r4-r7 (16 bytes); the push itself is not visible here.
325 .cfi_adjust_cfa_offset 16
329 .cfi_rel_offset 7, 12
330 orr r1, r1, r1, lsl #8 @ expand the match word across all bytes
331 orr r1, r1, r1, lsl #16
332 bic r4, r2, #7 @ Number of double words to work with * 8
333 mvns r7, #0 @ all F's

@ Main loop body (label 15, not visible): XOR each loaded word pair against
@ the replicated char so matching bytes become 00, then use uadd8/sel to
@ convert those 00 bytes into ff flags.
339 eor r5,r5, r1 @ r5,r6 have 00's where bytes match the target
341 uadd8 r5, r5, r7 @ Par add 0xff - sets GE bits for bytes!=0
342 sel r5, r3, r7 @ bytes are 00 for none-00 bytes,
343 @ or ff for 00 bytes - NOTE INVERSION
344 uadd8 r6, r6, r7 @ Par add 0xff - sets GE bits for bytes!=0
345 sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes
346 @ or ff for 00 bytes - NOTE INVERSION
348 bne 15b @ (Flags from the subs above)
@ No hit in the doubleword loop: restore r4-r7 (pop not visible) and fall
@ through to the byte-at-a-time tail.
355 .cfi_adjust_cfa_offset -16
356 and r1,r1,#0xff @ r1 back to a single character
357 and r2,r2,#7 @ Leave the count remaining as the number
358 @ after the double words have been done
361 cbz r2, 40f @ 0 length or hit the end already then not found

363 21: @ Post aligned section, or just a short call
@ Byte loop for the remaining 0-7 bytes (the load is not visible here).
366 eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
368 bne 21b @ on r2 flags
@ Exit paths: labels 40 (not found) and 50 (found) presumably live here.
372 movs r0,#0 @ not found
378 subs r0,r0,#1 @ found

381 60: @ We're here because the fast path found a hit
382 @ now we have to track down exactly which word it was
383 @ r0 points to the start of the double word after the one tested
384 @ r5 has the 00/ff pattern for the first word, r6 has the chained value
385 @ This point is reached from cbnz midway through label 15 prior to
386 @ popping r4-r7 off the stack. .cfi_restore_state alone disregards
387 @ this, so we manually correct this.
388 .cfi_restore_state @ Standard post-prologue state
389 .cfi_adjust_cfa_offset 16
393 .cfi_rel_offset 7, 12
@ Pick the word containing the hit (the test setting the condition flags is
@ not visible in this extract).
396 moveq r5, r6 @ the end is in the 2nd word
397 subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
398 subne r0,r0,#7 @ or 2nd byte of 1st word

400 @ r0 currently points to the 2nd byte of the word containing the hit
@ Walk the 00/ff flag pattern byte by byte to locate the exact match.
401 tst r5, # CHARTSTMASK(0) @ 1st character
404 tst r5, # CHARTSTMASK(1) @ 2nd character
407 tsteq r5, # (3<<15) @ 2nd & 3rd character
408 @ If not the 3rd must be the last one
417 .cfi_adjust_cfa_offset -16

424 /* Defined in memchr-stub.c. */