2 * strncmp - compare two strings
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14 #include "../asmdefs.h"
/* Byte-replicated 64-bit constants: the same 8-bit value repeated in all eight bytes. */
/* REP8_01: 0x01 in every byte; loaded into zeroones and subtracted from a data word. */
16 #define REP8_01 0x0101010101010101
/* REP8_7f: 0x7f in every byte; ORed with a data word before the bic/bics of the NUL test. */
17 #define REP8_7f 0x7f7f7f7f7f7f7f7f
/* REP8_80: 0x80 in every byte, i.e. the top bit of each byte. */
18 #define REP8_80 0x8080808080808080
20 /* Parameters and result. */
26 /* Internal variables. */
47 nop /* Pad so that the loop below fits a cache line. */
/* __strncmp_aarch64 - entry point of the strncmp routine named in the file header. */
/* limit is a byte count (it is decremented once per byte in the byte-wise loop below). */
/* result is produced as a difference of two zero-extended bytes. */
/* NOTE(review): the register aliases and most branch-target labels used below are not visible in this listing - confirm against the complete file. */
49 ENTRY_ALIGN (__strncmp_aarch64, 0)
/* zeroones = 0x01 in every byte; it is subtracted from data words in the NUL test. */
52 mov zeroones, #REP8_01
/* NOTE(review): the instruction that sets count is not shown; the mutual-alignment comment further down suggests it is the byte offset from an alignment boundary. */
56 cbnz count, L(mutual_align)
57 /* Calculate the number of full and partial words -1. */
58 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
59 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
61 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
62 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
63 can be done in parallel across the entire word. */
64 /* Start of performance-critical section -- one 64B cache line. */
/* The subs sets the flags read by csinv below: pl means limit_wd has not gone negative. */
69 subs limit_wd, limit_wd, #1
70 sub tmp1, data1, zeroones
71 orr tmp2, data1, #REP8_7f
72 eor diff, data1, data2 /* Non-zero if differences found. */
/* endloop = diff while pl holds, otherwise all ones (inverted xzr), which forces an exit. */
73 csinv endloop, diff, xzr, pl /* Last Dword or differences. */
74 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
/* Net effect of bics + ccmp: Z is set only when has_nul == 0 and endloop == 0. */
75 ccmp endloop, #0, #0, eq
77 /* End of performance-critical section -- one 64B cache line. */
79 /* Not reached the limit, must have found the end or a diff. */
80 tbz limit_wd, #63, L(not_limit)
82 /* Limit % 8 == 0 => all bytes significant. */
86 lsl limit, limit, #3 /* Bytes -> bits. */
/* Clear every bit selected by mask in both words so those bits cannot compare unequal. */
/* NOTE(review): the instructions that build mask are not shown here. */
93 bic data1, data1, mask
94 bic data2, data2, mask
96 /* Make sure that the NUL byte is marked in the syndrome. */
97 orr has_nul, has_nul, mask
/* syndrome has a bit set wherever the two words differ or a NUL byte was flagged. */
100 orr syndrome, diff, has_nul
/* When __AARCH64EB__ is not defined, rev reverses the byte order of syndrome. */
102 #ifndef __AARCH64EB__
103 rev syndrome, syndrome
105 /* The MS-non-zero bit of the syndrome marks either the first bit
106 that is different, or the top bit of the first zero byte.
107 Shifting left now will bring the critical information into the
111 lsl data1, data1, pos
112 lsl data2, data2, pos
113 /* But we need to zero-extend (char is unsigned) the value and then
114 perform a signed 32-bit subtraction. */
115 lsr data1, data1, #56
116 sub result, data1, data2, lsr #56
119 /* For big-endian we cannot use the trick with the syndrome value
120 as carry-propagation can corrupt the upper bits if the trailing
121 bytes in the string contain 0x01. */
122 /* However, if there is no NUL byte in the dword, we can generate
123 the result directly. We can't just subtract the bytes as the
124 MSB might be significant. */
/* cneg negates result when the flags indicate unsigned lower (lo). */
/* NOTE(review): the instructions that set those flags and the initial result are not shown here. */
128 cneg result, result, lo
131 /* Re-compute the NUL-byte detection, using a byte-reversed value. */
/* Same (X - 1) & ~(X | 0x7f) test as in the main loop, applied to tmp3 (bic here, so flags are untouched). */
/* NOTE(review): the computation of tmp3 is not shown; presumably it is the byte-reversed data word. */
133 sub tmp1, tmp3, zeroones
134 orr tmp2, tmp3, #REP8_7f
135 bic has_nul, tmp1, tmp2
137 orr syndrome, diff, has_nul
139 /* The MS-non-zero bit of the syndrome marks either the first bit
140 that is different, or the top bit of the first zero byte.
141 Shifting left now will bring the critical information into the
143 lsl data1, data1, pos
144 lsl data2, data2, pos
145 /* But we need to zero-extend (char is unsigned) the value and then
146 perform a signed 32-bit subtraction. */
147 lsr data1, data1, #56
148 sub result, data1, data2, lsr #56
153 /* Sources are mutually aligned, but are not currently at an
154 alignment boundary. Round down the addresses and then mask off
155 the bytes that precede the start point.
156 We also need to adjust the limit calculations, but without
157 overflowing if the limit is near ULONG_MAX. */
/* Post-indexed loads: read 8 bytes, then advance the pointer by 8. */
160 ldr data1, [src1], #8
161 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
162 ldr data2, [src2], #8
164 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
/* NOTE(review): the initialisation of tmp2 and the conditional-compilation lines selecting one of the two shifts below are not shown. */
166 /* Big-endian. Early bytes are at MSB. */
167 lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
169 /* Little-endian. Early bytes are at LSB. */
170 lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* limit_wd = (limit - 1) / 8 + (((limit - 1) % 8) + count) / 8, built in pieces over the next lines. */
172 and tmp3, limit_wd, #7
173 lsr limit_wd, limit_wd, #3
174 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
175 add limit, limit, count
176 add tmp3, tmp3, count
/* OR the same mask into both words: the masked bits become equal in data1 and data2, so they cannot register as a difference. */
177 orr data1, data1, tmp2
178 orr data2, data2, tmp2
179 add limit_wd, limit_wd, tmp3, lsr #3
183 /* Don't bother with dwords for up to 16 bytes. */
/* b.hs is taken on an unsigned higher-or-same result; NOTE(review): the compare that sets the flags is not shown. */
186 b.hs L(try_misaligned_words)
189 /* Perhaps we can do better than this. */
190 ldrb data1w, [src1], #1
191 ldrb data2w, [src2], #1
192 subs limit, limit, #1
/* hi holds when limit was greater than 1 before the decrement, i.e. more bytes may still be compared. */
193 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
194 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
/* After both ccmp instructions Z is set only if bytes remain (hi), data1w is not NUL (cs) and data1w equals data2w. */
/* result = data1 - data2; presumably data1w and data2w are the 32-bit views of data1 and data2 - confirm against the register aliases. */
197 sub result, data1, data2
/* At L(try_misaligned_words): limit_wd = limit / 8, and a zero count branches straight to L(do_misaligned). */
/* Otherwise limit is reduced by count and limit_wd is recomputed before the byte loads that follow. */
199 /* Align the SRC1 to a dword by doing a bytewise compare and then do
201 L(try_misaligned_words):
202 lsr limit_wd, limit, #3
203 cbz count, L(do_misaligned)
207 sub limit, limit, count
208 lsr limit_wd, limit, #3
211 ldrb data1w, [src1], #1
212 ldrb data2w, [src2], #1
214 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
/* Keep looping at L(page_end_loop) while count is still above zero after the decrement. */
216 subs count, count, #1
217 b.hi L(page_end_loop)
/* In the code below, tmp2 = (src2 & 0xff8) ^ 0xff8 is zero exactly when bits 3 to 11 of src2 are all ones, */
/* i.e. src2 lies in the last 8 bytes of a 4096-byte aligned window; in that case the byte loop at L(page_end_loop) is used instead of the 8-byte loads. */
220 /* Prepare ourselves for the next page crossing. Unlike the aligned
221 loop, we fetch 1 less dword because we risk crossing bounds on
224 subs limit_wd, limit_wd, #1
227 and tmp2, src2, #0xff8
228 eor tmp2, tmp2, #0xff8
229 cbz tmp2, L(page_end_loop)
231 ldr data1, [src1], #8
232 ldr data2, [src2], #8
233 sub tmp1, data1, zeroones
234 orr tmp2, data1, #REP8_7f
235 eor diff, data1, data2 /* Non-zero if differences found. */
236 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
237 ccmp diff, #0, #0, eq
/* Z is now set only when no NUL was found and diff is zero. */
/* Go round again while limit_wd stays non-negative (pl) after the decrement. */
239 subs limit_wd, limit_wd, #1
240 b.pl L(loop_misaligned)
243 /* We found a difference or a NUL before the limit was reached. */
/* limit == 0: branch away and skip the final loads. */
245 cbz limit, L(not_limit)
246 /* Read the last word. */
/* Register-offset loads from src1 + limit and src2 + limit. */
249 ldr data1, [src1, limit]
250 ldr data2, [src2, limit]
251 sub tmp1, data1, zeroones
252 orr tmp2, data1, #REP8_7f
253 eor diff, data1, data2 /* Non-zero if differences found. */
254 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
255 ccmp diff, #0, #0, eq
/* Same NUL and difference test as in the loops above: Z set only if neither was found. */
262 END ( __strncmp_aarch64)