1 /* Wide character substring search, using the Two-Way algorithm.
2 Copyright (C) 2008-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Written by Eric Blake <ebb9@byu.net>, 2008.
6 This file is free software: you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as
8 published by the Free Software Foundation; either version 2.1 of the
9 License, or (at your option) any later version.
11 This file is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
19 /* Before including this file, you need to include <config.h> and
20 <string.h>, and define:
21 UNIT The element type of the needle and haystack.
22 RETURN_TYPE A macro that expands to the return type.
23 AVAILABLE(h, h_l, j, n_l)
24 A macro that returns nonzero if there are
25 at least N_L characters left starting at H[J].
26 H is 'UNIT *', H_L, J, and N_L are 'size_t';
27 H_L is an lvalue. For NUL-terminated searches,
28 H_L can be modified each iteration to avoid
29 having to compute the end of H up front.
31 For case-insensitivity, you may optionally define:
32 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
33 characters of P1 and P2 are equal.
34 CANON_ELEMENT(c) A macro that canonicalizes an element right after
35 it has been fetched from one of the two strings.
36 The argument is a 'UNIT'; the result must be a
39 This file undefines the macros documented above, and defines
40 LONG_NEEDLE_THRESHOLD.
46 /* We use the Two-Way string matching algorithm (also known as
47 Chrochemore-Perrin), which guarantees linear complexity with
50 See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
54 # define MAX(a, b) ((a < b) ? (b) : (a))
58 # define CANON_ELEMENT(c) c
61 # define CMP_FUNC wmemcmp
64 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
65 Return the index of the first character in the right half, and set
66 *PERIOD to the global period of the right half.
68 The global period of a string is the smallest index (possibly its
69 length) at which all remaining bytes in the string are repetitions
70 of the prefix (the last repetition may be a subset of the prefix).
72 When NEEDLE is factored into two halves, a local period is the
73 length of the smallest word that shares a suffix with the left half
74 and shares a prefix with the right half. All factorizations of a
75 non-empty NEEDLE have a local period of at least 1 and no greater
78 A critical factorization has the property that the local period
79 equals the global period. All strings have at least one critical
80 factorization with the left half smaller than the global period.
81 And while some strings have more than one critical factorization,
82 it is provable that with an ordered alphabet, at least one of the
83 critical factorizations corresponds to a maximal suffix.
85 Given an ordered alphabet, a critical factorization can be computed
86 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
87 shorter of two ordered maximal suffixes. The ordered maximal
88 suffixes are determined by lexicographic comparison while tracking
91 critical_factorization (const UNIT
*needle
, size_t needle_len
,
94 /* Index of last character of left half, or SIZE_MAX. */
95 size_t max_suffix
, max_suffix_rev
;
96 size_t j
; /* Index into NEEDLE for current candidate suffix. */
97 size_t k
; /* Offset into current period. */
98 size_t p
; /* Intermediate period. */
99 UNIT a
, b
; /* Current comparison characters. */
101 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
102 out 0-length needles. */
106 return needle_len
- 1;
110 0 <= j < NEEDLE_LEN - 1
111 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
112 min(max_suffix, max_suffix_rev) < global period of NEEDLE
113 1 <= p <= global period of NEEDLE
114 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
118 /* Perform lexicographic search. */
119 max_suffix
= SIZE_MAX
;
122 while (j
+ k
< needle_len
)
124 a
= CANON_ELEMENT (needle
[j
+ k
]);
125 b
= CANON_ELEMENT (needle
[max_suffix
+ k
]);
128 /* Suffix is smaller, period is entire prefix so far. */
135 /* Advance through repetition of the current period. */
146 /* Suffix is larger, start over from current location. */
153 /* Perform reverse lexicographic search. */
154 max_suffix_rev
= SIZE_MAX
;
157 while (j
+ k
< needle_len
)
159 a
= CANON_ELEMENT (needle
[j
+ k
]);
160 b
= CANON_ELEMENT (needle
[max_suffix_rev
+ k
]);
163 /* Suffix is smaller, period is entire prefix so far. */
166 p
= j
- max_suffix_rev
;
170 /* Advance through repetition of the current period. */
181 /* Suffix is larger, start over from current location. */
182 max_suffix_rev
= j
++;
187 /* Choose the shorter suffix. Return the index of the first character
188 of the right half, rather than the last character of the left half.
190 For some examples, 'banana' has two critical factorizations, both
191 exposed by the two lexicographic extreme suffixes of 'anana' and
192 'nana', where both suffixes have a period of 2. On the other
193 hand, with 'aab' and 'bba', both strings have a single critical
194 factorization of the last character, with the suffix having a period
195 of 1. While the maximal lexicographic suffix of 'aab' is 'b',
196 the maximal lexicographic suffix of 'bba' is 'ba', which is not a
197 critical factorization. Conversely, the maximal reverse
198 lexicographic suffix of 'a' works for 'bba', but not 'ab' for
199 'aab'. The shorter suffix of the two will always be a critical
201 if (max_suffix_rev
+ 1 < max_suffix
+ 1)
202 return max_suffix
+ 1;
204 return max_suffix_rev
+ 1;
207 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
208 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
209 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
210 Performance is guaranteed to be linear, with an initialization cost
211 of 2 * NEEDLE_LEN comparisons.
213 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
214 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
215 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
216 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
217 static RETURN_TYPE _GL_ATTRIBUTE_PURE
218 two_way_short_needle (const UNIT
*haystack
, size_t haystack_len
,
219 const UNIT
*needle
, size_t needle_len
)
221 size_t i
; /* Index into current character of NEEDLE. */
222 size_t j
; /* Index into current window of HAYSTACK. */
223 size_t period
; /* The period of the right half of needle. */
224 size_t suffix
; /* The index of the right half of needle. */
226 /* Factor the needle into two halves, such that the left half is
227 smaller than the global period, and the right half is
228 periodic (with a period as large as NEEDLE_LEN - suffix). */
229 suffix
= critical_factorization (needle
, needle_len
, &period
);
231 /* Perform the search. Each iteration compares the right half
233 if (CMP_FUNC (needle
, needle
+ period
, suffix
) == 0)
235 /* Entire needle is periodic; a mismatch in the left half can
236 only advance by the period, so use memory to avoid rescanning
237 known occurrences of the period in the right half. */
240 while (AVAILABLE (haystack
, haystack_len
, j
, needle_len
))
242 /* Scan for matches in right half. */
243 i
= MAX (suffix
, memory
);
244 while (i
< needle_len
&& (CANON_ELEMENT (needle
[i
])
245 == CANON_ELEMENT (haystack
[i
+ j
])))
249 /* Scan for matches in left half. */
251 while (memory
< i
+ 1 && (CANON_ELEMENT (needle
[i
])
252 == CANON_ELEMENT (haystack
[i
+ j
])))
254 if (i
+ 1 < memory
+ 1)
255 return (RETURN_TYPE
) (haystack
+ j
);
256 /* No match, so remember how many repetitions of period
257 on the right half were scanned. */
259 memory
= needle_len
- period
;
270 /* The two halves of needle are distinct; no extra memory is
271 required, and any mismatch results in a maximal shift. */
272 period
= MAX (suffix
, needle_len
- suffix
) + 1;
274 while (AVAILABLE (haystack
, haystack_len
, j
, needle_len
))
276 /* Scan for matches in right half. */
278 while (i
< needle_len
&& (CANON_ELEMENT (needle
[i
])
279 == CANON_ELEMENT (haystack
[i
+ j
])))
283 /* Scan for matches in left half. */
285 while (i
!= SIZE_MAX
&& (CANON_ELEMENT (needle
[i
])
286 == CANON_ELEMENT (haystack
[i
+ j
])))
289 return (RETURN_TYPE
) (haystack
+ j
);