1 /* Searching in a string. -*- coding: utf-8 -*-
2 Copyright (C) 2005-2025 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2005.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
23 #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */
28 #if GNULIB_MCEL_PREFER
30 typedef mcel_t mbchar_t
;
31 static bool mb_equal (mcel_t a
, mcel_t b
) { return mcel_cmp (a
, b
) == 0; }
36 /* Knuth-Morris-Pratt algorithm. */
37 #define UNIT unsigned char
38 #define CANON_ELEMENT(c) c
41 /* Knuth-Morris-Pratt algorithm.
42 See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
43 Return a boolean indicating success:
44 Return true and set *RESULTP if the search was completed.
45 Return false if it was aborted because not enough memory was available. */
47 knuth_morris_pratt_multibyte (const char *haystack
, const char *needle
,
50 size_t m
= mbslen (needle
);
51 mbchar_t
*needle_mbchars
;
52 size_t extra_align
= (alignof (mbchar_t
) < alignof (size_t)
53 ? alignof (size_t) - alignof (mbchar_t
)
56 /* Allocate room for needle_mbchars and the table. */
57 void *memory
= nmalloca (m
+ !!extra_align
,
58 sizeof (mbchar_t
) + sizeof (size_t));
62 needle_mbchars
= memory
;
63 table_memory
= needle_mbchars
+ m
;
64 char *aligned
= table_memory
;
65 aligned
+= extra_align
;
66 aligned
-= (uintptr_t) aligned
% alignof (size_t);
67 size_t *table
= table_memory
= aligned
;
69 /* Fill needle_mbchars. */
70 #if GNULIB_MCEL_PREFER
71 for (size_t j
= 0; *needle
; needle
+= needle_mbchars
[j
++].len
)
72 needle_mbchars
[j
] = mcel_scanz (needle
);
79 for (mbui_init (iter
, needle
); mbui_avail (iter
); mbui_advance (iter
), j
++)
80 mb_copy (&needle_mbchars
[j
], &mbui_cur (iter
));
86 0 < table[i] <= i is defined such that
87 forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
88 and table[i] is as large as possible with this property.
92 needle[table[i]..i-1] = needle[0..i-1-table[i]].
94 rhaystack[0..i-1] == needle[0..i-1]
95 and exists h, i <= h < m: rhaystack[h] != needle[h]
97 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
98 table[0] remains uninitialized. */
102 /* i = 1: Nothing to verify for x = 0. */
106 for (i
= 2; i
< m
; i
++)
108 /* Here: j = i-1 - table[i-1].
109 The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
110 for x < table[i-1], by induction.
111 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
112 mbchar_t
*b
= &needle_mbchars
[i
- 1];
116 /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
117 is known to hold for x < i-1-j.
118 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
119 if (mb_equal (*b
, needle_mbchars
[j
]))
121 /* Set table[i] := i-1-j. */
125 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
126 for x = i-1-j, because
127 needle[i-1] != needle[j] = needle[i-1-x]. */
130 /* The inequality holds for all possible x. */
134 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
135 for i-1-j < x < i-1-j+table[j], because for these x:
137 = needle[x-(i-1-j)..j-1]
138 != needle[0..j-1-(x-(i-1-j))] (by definition of table[j])
140 hence needle[x..i-1] != needle[0..i-1-x].
142 needle[i-1-j+table[j]..i-2]
143 = needle[table[j]..j-1]
144 = needle[0..j-1-table[j]] (by definition of table[j]). */
147 /* Here: j = i - table[i]. */
151 /* Search, using the table to accelerate the processing. */
153 #if GNULIB_MCEL_PREFER
155 char const *rhaystack
= haystack
;
156 char const *phaystack
= haystack
;
159 /* Invariant: phaystack = rhaystack + j. */
167 mcel_t g
= mcel_scanz (phaystack
);
168 if (mcel_cmp (needle_mbchars
[j
], g
) == 0)
171 /* Exit loop successfully if the entire needle has been found. */
178 /* Found a mismatch at needle[0] already. */
179 rhaystack
+= mcel_scanz (rhaystack
).len
;
184 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
185 size_t count
= table
[j
];
187 for (; count
!= 0; count
--)
188 rhaystack
+= mcel_scanz (rhaystack
).len
;
191 *resultp
= rhaystack
;
194 mbui_iterator_t rhaystack
;
195 mbui_iterator_t phaystack
;
199 mbui_init (rhaystack
, haystack
);
200 mbui_init (phaystack
, haystack
);
201 /* Invariant: phaystack = rhaystack + j. */
202 while (mbui_avail (phaystack
))
203 if (mb_equal (needle_mbchars
[j
], mbui_cur (phaystack
)))
206 mbui_advance (phaystack
);
209 /* The entire needle has been found. */
210 *resultp
= mbui_cur_ptr (rhaystack
);
216 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
217 size_t count
= table
[j
];
219 for (; count
> 0; count
--)
221 if (!mbui_avail (rhaystack
))
223 mbui_advance (rhaystack
);
228 /* Found a mismatch at needle[0] already. */
229 if (!mbui_avail (rhaystack
))
231 mbui_advance (rhaystack
);
232 mbui_advance (phaystack
);
241 /* Find the first occurrence of the character string NEEDLE in the character
242 string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */
244 mbsstr (const char *haystack
, const char *needle
)
246 /* Be careful not to look at the entire extent of haystack or needle
247 until needed. This is useful because of these two cases:
248 - haystack may be very long, and a match of needle found early,
249 - needle may be very long, and not even a short initial segment of
250 needle may be found in haystack. */
253 #if GNULIB_MCEL_PREFER
255 return (char *) haystack
;
257 mcel_t ng
= mcel_scanz (needle
);
259 /* Minimizing the worst-case complexity:
260 Let n = mbslen(haystack), m = mbslen(needle).
261 The naïve algorithm is O(n*m) worst-case.
262 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
264 To achieve linear complexity and yet amortize the cost of the
265 memory allocation, we activate the Knuth-Morris-Pratt algorithm
266 only once the naïve algorithm has already run for some time; more
268 - the outer loop count is >= 10,
269 - the average number of comparisons per outer loop is >= 5,
270 - the total number of comparisons is >= m.
271 But we try it only once. If the memory allocation attempt failed,
272 we don't retry it. */
274 size_t outer_loop_count
= 0;
275 size_t comparison_count
= 0;
277 /* Last comparison count, and needle + last_ccount. */
278 size_t last_ccount
= 0;
279 char const *iter_needle_last_ccount
= needle
;
281 char const *iter_haystack
= haystack
;
283 for (mcel_t hg
; *iter_haystack
; iter_haystack
+= hg
.len
)
285 /* See whether it's advisable to use an asymptotically faster
288 && outer_loop_count
>= 10
289 && comparison_count
>= 5 * outer_loop_count
)
291 /* See if needle + comparison_count now reaches the end of
293 size_t count
= comparison_count
- last_ccount
;
295 count
> 0 && *iter_needle_last_ccount
;
297 iter_needle_last_ccount
298 += mcel_scanz (iter_needle_last_ccount
).len
;
299 last_ccount
= comparison_count
;
300 if (!*iter_needle_last_ccount
)
303 if (knuth_morris_pratt_multibyte (haystack
, needle
,
305 return (char *) result
;
312 hg
= mcel_scanz (iter_haystack
);
313 if (mcel_cmp (hg
, ng
) == 0)
314 /* The first character matches. */
316 char const *rhaystack
= iter_haystack
+ hg
.len
;
317 char const *rneedle
= needle
+ ng
.len
;
322 return (char *) iter_haystack
;
325 rhg
= mcel_scanz (rhaystack
); rhaystack
+= rhg
.len
;
326 rng
= mcel_scanz (rneedle
); rneedle
+= rng
.len
;
329 while (mcel_cmp (rhg
, rng
) == 0);
335 mbui_iterator_t iter_needle
;
337 mbui_init (iter_needle
, needle
);
338 if (mbui_avail (iter_needle
))
340 /* Minimizing the worst-case complexity:
341 Let n = mbslen(haystack), m = mbslen(needle).
342 The naïve algorithm is O(n*m) worst-case.
343 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
345 To achieve linear complexity and yet amortize the cost of the
346 memory allocation, we activate the Knuth-Morris-Pratt algorithm
347 only once the naïve algorithm has already run for some time; more
349 - the outer loop count is >= 10,
350 - the average number of comparisons per outer loop is >= 5,
351 - the total number of comparisons is >= m.
352 But we try it only once. If the memory allocation attempt failed,
353 we don't retry it. */
355 size_t outer_loop_count
= 0;
356 size_t comparison_count
= 0;
357 size_t last_ccount
= 0; /* last comparison count */
358 mbui_iterator_t iter_needle_last_ccount
; /* = needle + last_ccount */
360 mbui_iterator_t iter_haystack
;
362 mbui_init (iter_needle_last_ccount
, needle
);
363 mbui_init (iter_haystack
, haystack
);
364 for (;; mbui_advance (iter_haystack
))
366 if (!mbui_avail (iter_haystack
))
370 /* See whether it's advisable to use an asymptotically faster
373 && outer_loop_count
>= 10
374 && comparison_count
>= 5 * outer_loop_count
)
376 /* See if needle + comparison_count now reaches the end of
378 size_t count
= comparison_count
- last_ccount
;
380 count
> 0 && mbui_avail (iter_needle_last_ccount
);
382 mbui_advance (iter_needle_last_ccount
);
383 last_ccount
= comparison_count
;
384 if (!mbui_avail (iter_needle_last_ccount
))
386 /* Try the Knuth-Morris-Pratt algorithm. */
389 knuth_morris_pratt_multibyte (haystack
, needle
,
392 return (char *) result
;
399 if (mb_equal (mbui_cur (iter_haystack
), mbui_cur (iter_needle
)))
400 /* The first character matches. */
402 mbui_iterator_t rhaystack
;
403 mbui_iterator_t rneedle
;
405 memcpy (&rhaystack
, &iter_haystack
, sizeof (mbui_iterator_t
));
406 mbui_advance (rhaystack
);
408 mbui_init (rneedle
, needle
);
409 if (!mbui_avail (rneedle
))
411 mbui_advance (rneedle
);
413 for (;; mbui_advance (rhaystack
), mbui_advance (rneedle
))
415 if (!mbui_avail (rneedle
))
417 return (char *) mbui_cur_ptr (iter_haystack
);
418 if (!mbui_avail (rhaystack
))
422 if (!mb_equal (mbui_cur (rhaystack
), mbui_cur (rneedle
)))
423 /* Nothing in this round. */
430 return (char *) haystack
;
437 /* Minimizing the worst-case complexity:
438 Let n = strlen(haystack), m = strlen(needle).
439 The naïve algorithm is O(n*m) worst-case.
440 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
442 To achieve linear complexity and yet amortize the cost of the
443 memory allocation, we activate the Knuth-Morris-Pratt algorithm
444 only once the naïve algorithm has already run for some time; more
446 - the outer loop count is >= 10,
447 - the average number of comparisons per outer loop is >= 5,
448 - the total number of comparisons is >= m.
449 But we try it only once. If the memory allocation attempt failed,
450 we don't retry it. */
452 size_t outer_loop_count
= 0;
453 size_t comparison_count
= 0;
454 size_t last_ccount
= 0; /* last comparison count */
455 const char *needle_last_ccount
= needle
; /* = needle + last_ccount */
457 /* Speed up the following searches of needle by caching its first
463 if (*haystack
== '\0')
467 /* See whether it's advisable to use an asymptotically faster
470 && outer_loop_count
>= 10
471 && comparison_count
>= 5 * outer_loop_count
)
473 /* See if needle + comparison_count now reaches the end of
475 if (needle_last_ccount
!= NULL
)
477 needle_last_ccount
+=
478 strnlen (needle_last_ccount
,
479 comparison_count
- last_ccount
);
480 if (*needle_last_ccount
== '\0')
481 needle_last_ccount
= NULL
;
482 last_ccount
= comparison_count
;
484 if (needle_last_ccount
== NULL
)
486 /* Try the Knuth-Morris-Pratt algorithm. */
487 const unsigned char *result
;
489 knuth_morris_pratt ((const unsigned char *) haystack
,
490 (const unsigned char *) (needle
- 1),
494 return (char *) result
;
502 /* The first character matches. */
504 const char *rhaystack
= haystack
+ 1;
505 const char *rneedle
= needle
;
507 for (;; rhaystack
++, rneedle
++)
509 if (*rneedle
== '\0')
511 return (char *) haystack
;
512 if (*rhaystack
== '\0')
516 if (*rhaystack
!= *rneedle
)
517 /* Nothing in this round. */
524 return (char *) haystack
;