/*
 * Copyright (c) 2012-2014 ARM Ltd
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
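
/* For reference only (not part of the build): a rough C sketch of the
   word-at-a-time test implemented by the UADD8/EOR/SEL sequences below.
   The helper name and types (from <stdint.h>) are illustrative, not from
   this file.  The result ("syndrome") for a pair of 32-bit chunks is
   non-zero exactly when the chunks differ or when the src1 chunk
   contains a NUL byte.  The expression (abyte + 0xff) >> 8 models the
   per-byte GE flag that UADD8 against 0xffffffff produces: set iff the
   byte of 'a' is non-zero.  The conditional models SEL, which keeps the
   XOR where the GE bit is set and forces 0xff where 'a' holds a NUL.

	static uint32_t
	syndrome32 (uint32_t a, uint32_t b)
	{
	  uint32_t s = 0;
	  int i;
	  for (i = 0; i < 4; i++)
	    {
	      uint32_t abyte = (a >> (8 * i)) & 0xff;
	      uint32_t bbyte = (b >> (8 * i)) & 0xff;
	      uint32_t ge = (abyte + 0xff) >> 8;
	      s |= (ge ? (abyte ^ bbyte) : 0xffu) << (8 * i);
	    }
	  return s;
	}  */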

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */
/* This version uses Thumb-2 code.  */
/* Parameters and result.  */
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */

/* Additional internal variables for 64-bit aligned data.  */
#define syndrome_a	tmp1
#define syndrome_b	tmp2
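
/* Throughout this file a "syndrome" word has a non-zero byte at every
   position where the two source words differ or where the src1 word
   contains a NUL byte; an all-zero syndrome means "equal so far, no
   terminator seen".  */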
/* Additional internal variables for 32-bit aligned data.  */

/* Macro to compute and return the result value for word-aligned
   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
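	/* The byte of interest is isolated by counting the leading zeros of
	   the syndrome and shifting both data words left by that amount, so
	   the return value is simply the difference of the two resulting
	   top bytes.  */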
	ldrd	r4, r5, [sp], #16
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words;
	   that's slower than this approach.  */
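	/* Little-endian variant: byte-reverse the syndrome so that counting
	   leading zeros finds the lowest-addressed marked byte, then shift
	   both data words right to bring that byte into bits 0-7 before
	   taking the difference.  */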
	ldrd	r6, r7, [sp, #8]
	and	result, \d1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1
	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm
	.cfi_sections .debug_frame
	prologue push_ip=HAVE_PAC_LEAF
#ifndef STRCMP_NO_PRECHECK
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 7, 12
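	/* r4-r7 are callee-saved under the AAPCS, so they are spilled to a
	   16-byte frame (r4/r5 at offset 0, r6/r7 at offset 8) before being
	   used as scratch registers; 16 bytes also keeps the stack 8-byte
	   aligned.  */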
	cbz	r2, .Lloop_aligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
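	/* The mask is an all-ones word shifted so that the bytes which come
	   from before the true start of the strings are forced to 0xff in
	   both data words: they compare equal and are non-zero, so they can
	   produce neither a difference nor a false NUL in the syndrome.  */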
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	orn	data2b, data2b, tmp1
	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
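	/* Check the first pair of words: the UADD8 against const_m1 (-1)
	   sets the GE flags per byte iff the corresponding byte of data1 is
	   non-zero; SEL then builds the syndrome from the per-byte XOR,
	   substituting 0xff wherever data1 holds a NUL.  A non-zero
	   syndrome therefore means "difference or end of string".  */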
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0.  */
	beq	.Lloop_aligned8

	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
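	/* Same masking trick as the 8-byte case above, applied to a single
	   word from each source.  */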
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	ldr	data1, [src1], #4

#ifdef STRCMP_NO_PRECHECK
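	/* Each step below extracts one byte of data1 with UXTB plus a ROR by
	   that byte's offset and compares it with the next byte loaded from
	   src2, exiting on a mismatch or on a NUL terminator.  */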
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#ifndef STRCMP_NO_PRECHECK
	epilogue push_ip=HAVE_PAC_LEAF
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z set iff src2[0] == 0.  */
	ldr	data2, [src2], #4
	bhi	.Loverlap1	/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2	/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
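	/* Each .LoverlapN section compares one aligned word of src1 against
	   a word of src2 reassembled from two adjacent aligned loads, using
	   S2LO/S2HI shifts to discard the bytes that belong to the previous
	   or next word.  */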
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1

	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	ldr	data1, [src1], #4

	S2LO	data2, data2, #8

	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrd	r4, r5, [sp], #16
	/* R6/7 not used in this sequence.  */
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1

	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	ldr	data1, [src1], #4

	S2LO	data2, data2, #16

	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1

	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	ldr	data1, [src1], #4

	S2LO	data2, data2, #24

	bne	.Lstrcmp_done_equal

	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
.Lstrcmp_done_equal:
	mov	result, #0
	ldrd	r4, r5, [sp], #16
	/* R6/7 not used in this sequence.  */
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
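	/* Compute a final syndrome for the last word pair; shifting both
	   words left by the syndrome's leading-zero count brings the first
	   differing (or NUL) byte to bits 24-31 of each word, and the
	   result is the difference of those two bytes.  */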
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	/* R6/7 not used in this sequence.  */
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF

	.size	strcmp, . - strcmp