1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * Copyright (c) 2013-2022, Arm Limited.
5 * Adapted from the original at:
6 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
9 #include <linux/linkage.h>
10 #include <asm/assembler.h>
/* L(name) expands to .Lname; the .L prefix marks an assembler-local label. */
18 #define L(label) .L ## label
/* 0x01 replicated into each of the 8 bytes of a dword (the "X - 1" operand of the NUL test). */
20 #define REP8_01 0x0101010101010101
/* 0x7f replicated into each of the 8 bytes of a dword (the "X | 0x7f" operand of the NUL test). */
21 #define REP8_7f 0x7f7f7f7f7f7f7f7f
23 /* Parameters and result. */
29 /* Internal variables. */
/* Register alias: x15 receives the negated offset in the misaligned path. */
46 #define neg_offset x15
48 /* Define endian dependent shift operations.
49 On big-endian early bytes are at MSB and on little-endian LSB.
50 LS_FW means shifting towards early bytes.
51 LS_BK means shifting towards later bytes.
/*
 * __pi_strncmp: bounded string comparison, aliased to strncmp at the end
 * of this file.  The visible code has three shapes: an 8-byte-at-a-time
 * loop for mutually aligned sources, a byte loop for short limits, and a
 * misaligned loop that reads src2 16 bytes at a time (STEP_A/B/C).
 * The value left in result is the difference of the first pair of bytes
 * that decides the comparison (each zero-extended), or 0 where the limit
 * cuts the comparison off first.
 *
 * NOTE(review): the register aliases (src1, src2, limit, result, data1,
 * ...) and several labels and instructions referenced below are not
 * visible in this listing; the comments describe only what the visible
 * lines show.
 */
61 SYM_FUNC_START(__pi_strncmp)
64 mov zeroones, #REP8_01 /* zeroones = 0x01 in every byte, for the NUL test. */
68 cbnz count, L(mutual_align) /* count != 0: sources need the alignment fix-up first. */
70 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
71 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
72 can be done in parallel across the entire word. */
79 sub tmp1, data1, zeroones /* X - 1 in every byte lane. */
80 orr tmp2, data1, #REP8_7f /* X | 0x7f in every byte lane. */
81 eor diff, data1, data2 /* Non-zero if differences found. */
82 csinv endloop, diff, xzr, hi /* Last Dword or differences. */
83 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
84 ccmp endloop, #0, #0, eq /* If eq (no NUL): flags from endloop - 0; else NZCV = 0b0000. */
86 /* End of main loop */
90 orr syndrome, diff, has_nul /* Merge difference bits and NUL markers. */
91 add limit, limit, 8 /* Rewind limit to before last subs. */
93 /* Limit was reached. Check if the NUL byte or the difference
94 is before the limit. */
95 rev syndrome, syndrome
100 cmp limit, pos, lsr #3 /* limit vs pos / 8, i.e. pos converted from bits to bytes. */
101 lsl data2, data2, pos
102 /* But we need to zero-extend (char is unsigned) the value and then
103 perform a signed 32-bit subtraction. */
104 lsr data1, data1, #56 /* Keep only the top byte, zero-extended. */
105 sub result, data1, data2, lsr #56 /* result = top byte of data1 - top byte of data2. */
106 csel result, result, xzr, hi /* hi from the cmp above: keep result; otherwise return 0. */
109 /* Not reached the limit, must have found the end or a diff. */
110 tbz limit, #63, L(not_limit) /* Bit 63 of limit clear: no limit handling needed. */
112 cbz limit, L(not_limit) /* Same when limit is exactly zero. */
114 lsl limit, tmp1, #3 /* Bytes -> bits (x8); used as a shift amount below. */
116 lsr mask, mask, limit /* Shift mask right by that bit count. */
117 bic data1, data1, mask /* Clear the bits selected by mask in data1 ... */
118 bic data2, data2, mask /* ... and in data2. */
120 /* Make sure that the NUL byte is marked in the syndrome. */
121 orr has_nul, has_nul, mask
124 /* For big-endian we cannot use the trick with the syndrome value
125 as carry-propagation can corrupt the upper bits if the trailing
126 bytes in the string contain 0x01. */
127 /* However, if there is no NUL byte in the dword, we can generate
128 the result directly. We can't just subtract the bytes as the
129 MSB might be significant. */
133 cneg result, result, lo /* Negate result on lo; the flag-setting insn is not visible here. */
136 /* Re-compute the NUL-byte detection, using a byte-reversed value. */
138 sub tmp1, tmp3, zeroones /* X - 1 per byte, on tmp3. */
139 orr tmp2, tmp3, #REP8_7f /* X | 0x7f per byte, on tmp3. */
140 bic has_nul, tmp1, tmp2 /* (X - 1) & ~(X | 0x7f). */
142 orr syndrome, diff, has_nul /* Merge difference bits and NUL markers. */
144 /* The most-significant-non-zero bit of the syndrome marks either the
145 first bit that is different, or the top bit of the first zero byte.
146 Shifting left now will bring the critical information into the
149 lsl data1, data1, pos
150 lsl data2, data2, pos
151 /* But we need to zero-extend (char is unsigned) the value and then
152 perform a signed 32-bit subtraction. */
153 lsr data1, data1, #56 /* Keep only the top byte, zero-extended. */
154 sub result, data1, data2, lsr #56 /* result = top byte of data1 - top byte of data2. */
159 /* Sources are mutually aligned, but are not currently at an
160 alignment boundary. Round down the addresses and then mask off
161 the bytes that precede the start point.
162 We also need to adjust the limit calculations, but without
163 overflowing if the limit is near ULONG_MAX. */
166 ldr data1, [src1], #8 /* Load a dword and advance src1 by 8. */
167 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
168 ldr data2, [src2], #8 /* Load a dword and advance src2 by 8. */
170 LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
171 /* Adjust the limit and ensure it doesn't overflow. */
172 adds limit, limit, count /* Carry set if the addition wraps. */
173 csinv limit, limit, xzr, lo /* lo (no carry): keep the sum; else limit = all ones. */
174 orr data1, data1, tmp2 /* Force the bits selected by tmp2 to 1 in data1 ... */
175 orr data2, data2, tmp2 /* ... and identically in data2. */
179 /* Don't bother with dwords for up to 16 bytes. */
182 b.hs L(try_misaligned_words) /* hs: use the word-based misaligned path instead. */
185 /* Perhaps we can do better than this. */
186 ldrb data1w, [src1], #1 /* Next byte of src1, post-increment. */
187 ldrb data2w, [src2], #1 /* Next byte of src2, post-increment. */
188 subs limit, limit, #1 /* One byte consumed from the limit. */
189 ccmp data1w, #1, #0, hi /* If hi: flags from data1w - 1 (C set iff non-NUL); else NZCV = 0b0000. */
190 ccmp data1w, data2w, #0, cs /* If cs: flags from data1w - data2w; else NZCV = 0b0000. */
193 sub result, data1, data2
195 /* Align the SRC1 to a dword by doing a bytewise compare and then do
197 L(try_misaligned_words):
198 cbz count, L(src1_aligned)
202 sub limit, limit, count
205 ldrb data1w, [src1], #1
206 ldrb data2w, [src2], #1
208 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
210 subs count, count, #1 /* One alignment byte done. */
211 b.hi L(page_end_loop) /* Loop while the decremented count stays above zero. */
213 /* The following diagram explains the comparison of misaligned strings.
214 The bytes are shown in natural order. For little-endian, it is
215 reversed in the registers. The "x" bytes are before the string.
216 The "|" separates data that is loaded at one time.
217 src1 | a a a a a a a a | b b b c c c c c | . . .
218 src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
220 After shifting in each step, the data looks like this:
222 data1 a a a a a a a a b b b c c c c c b b b c c c c c
223 data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
225 The bytes with "0" are eliminated from the syndrome via mask.
227 Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
228 time from SRC2. The comparison happens in 3 steps. After each step
229 the loop can exit, or read from SRC1 or SRC2. */
231 /* Calculate offset from 8 byte alignment to string start in bits. No
232 need to mask offset since shifts are ignoring upper bits. */
236 neg neg_offset, offset /* neg_offset = -offset. */
237 ldr data1, [src1], #8 /* First dword of src1, post-increment by 8. */
238 ldp tmp1, tmp2, [src2], #16 /* 16 bytes of src2 into tmp1 and tmp2, post-increment by 16. */
239 LS_BK mask, mask, neg_offset /* Shift mask towards later bytes by neg_offset. */
240 and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
241 /* Skip the first compare if data in tmp1 is irrelevant. */
242 tbnz offset, 6, L(misaligned_mid_loop) /* Bit 6 of offset set (64 bits = 8 bytes): start at STEP_B. */
245 /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
246 LS_FW data2, tmp1, offset /* Shift tmp1 towards early bytes by offset. */
247 LS_BK tmp1, tmp2, neg_offset /* Shift tmp2 towards later bytes by neg_offset. */
248 subs limit, limit, #8 /* Account for 8 bytes; sets the flags read by the csinv below. */
249 orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
250 sub has_nul, data1, zeroones /* X - 1 per byte (first half of the NUL test). */
251 eor diff, data1, data2 /* Non-zero if differences found. */
252 orr tmp3, data1, #REP8_7f /* X | 0x7f per byte. */
253 csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
254 bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
255 orr tmp3, endloop, has_nul /* Non-zero on limit, difference or NUL. */
256 cbnz tmp3, L(full_check) /* Any of those: leave the loop. */
258 ldr data1, [src1], #8 /* Next dword of src1. */
259 L(misaligned_mid_loop):
260 /* STEP_B: Compare first part of data1 to second part of tmp2. */
261 LS_FW data2, tmp2, offset /* Shift tmp2 towards early bytes by offset. */
263 /* For big-endian we do a byte reverse to avoid carry-propagation
264 problem described above. This way we can reuse the has_nul in the
265 next step and also use syndrome value trick at the end. */
/* NOTE(review): two alternative definitions follow; the conditional
   directives selecting between them (presumably on endianness) are not
   visible in this listing. */
267 #define data1_fixed tmp3
269 #define data1_fixed data1
271 sub has_nul, data1_fixed, zeroones /* X - 1 per byte. */
272 orr tmp3, data1_fixed, #REP8_7f /* X | 0x7f per byte. */
273 eor diff, data2, data1 /* Non-zero if differences found. */
274 bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
278 cmp limit, neg_offset, lsr #3 /* limit vs neg_offset / 8 (bits -> bytes). */
279 orr syndrome, diff, has_nul /* Merge difference bits and NUL markers. */
280 bic syndrome, syndrome, mask /* Ignore later bytes. */
281 csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
282 cbnz tmp3, L(syndrome_check) /* Limit, difference or NUL: go to the syndrome check. */
284 /* STEP_C: Compare second part of data1 to first part of tmp1. */
285 ldp tmp1, tmp2, [src2], #16 /* Next 16 bytes of src2. */
287 LS_BK data2, tmp1, neg_offset /* Shift tmp1 towards later bytes by neg_offset. */
288 eor diff, data2, data1 /* Non-zero if differences found. */
289 orr syndrome, diff, has_nul /* Reuses has_nul; no recomputation is visible since STEP_B. */
290 and syndrome, syndrome, mask /* Ignore earlier bytes. */
291 csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
292 cbnz tmp3, L(syndrome_check) /* Limit, difference or NUL: go to the syndrome check. */
294 ldr data1, [src1], #8 /* Next dword of src1. */
301 cmp pos, limit, lsl #3 /* pos (bits) vs limit * 8. */
308 SYM_FUNC_END(__pi_strncmp)
/* NOTE(review): presumably publishes strncmp as a weak alias of __pi_strncmp; the macro is defined in an included header, not in this listing. */
309 SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
/* NOTE(review): presumably exports the strncmp symbol; the exact condition behind the NOKASAN variant is defined outside this listing - confirm there. */
310 EXPORT_SYMBOL_NOKASAN(strncmp)