1 /* Case-insensitive searching in a string. -*- coding: utf-8 -*-
2 Copyright (C) 2005-2025 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2005.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
24 #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */
29 #if GNULIB_MCEL_PREFER
31 typedef mcel_t mbchar_t
;
32 static bool mb_equal (mcel_t a
, mcel_t b
) { return mcel_cmp (a
, b
) == 0; }
37 /* Knuth-Morris-Pratt algorithm. */
38 #define UNIT unsigned char
39 #define CANON_ELEMENT(c) tolower (c)
42 /* Knuth-Morris-Pratt algorithm.
43 See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
44 Return a boolean indicating success:
45 Return true and set *RESULTP if the search was completed.
46 Return false if it was aborted because not enough memory was available. */
48 knuth_morris_pratt_multibyte (const char *haystack
, const char *needle
,
51 size_t m
= mbslen (needle
);
52 mbchar_t
*needle_mbchars
;
53 size_t extra_align
= (alignof (mbchar_t
) < alignof (size_t)
54 ? alignof (size_t) - alignof (mbchar_t
)
57 /* Allocate room for needle_mbchars and the table. */
58 void *memory
= nmalloca (m
+ !!extra_align
,
59 sizeof (mbchar_t
) + sizeof (size_t));
63 needle_mbchars
= memory
;
64 table_memory
= needle_mbchars
+ m
;
65 char *aligned
= table_memory
;
66 aligned
+= extra_align
;
67 aligned
-= (uintptr_t) aligned
% alignof (size_t);
68 size_t *table
= table_memory
= aligned
;
70 /* Fill needle_mbchars. */
71 #if GNULIB_MCEL_PREFER
72 for (size_t j
= 0; *needle
; needle
+= needle_mbchars
[j
++].len
)
74 needle_mbchars
[j
] = mcel_scanz (needle
);
75 needle_mbchars
[j
].ch
= c32tolower (needle_mbchars
[j
].ch
);
83 for (mbui_init (iter
, needle
); mbui_avail (iter
); mbui_advance (iter
), j
++)
85 mb_copy (&needle_mbchars
[j
], &mbui_cur (iter
));
86 if (needle_mbchars
[j
].wc_valid
)
87 needle_mbchars
[j
].wc
= c32tolower (needle_mbchars
[j
].wc
);
94 0 < table[i] <= i is defined such that
95 forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
96 and table[i] is as large as possible with this property.
100 needle[table[i]..i-1] = needle[0..i-1-table[i]].
102 rhaystack[0..i-1] == needle[0..i-1]
103 and exists h, i <= h < m: rhaystack[h] != needle[h]
105 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
106 table[0] remains uninitialized. */
110 /* i = 1: Nothing to verify for x = 0. */
114 for (i
= 2; i
< m
; i
++)
116 /* Here: j = i-1 - table[i-1].
117 The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
118 for x < table[i-1], by induction.
119 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
120 mbchar_t
*b
= &needle_mbchars
[i
- 1];
124 /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
125 is known to hold for x < i-1-j.
126 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
127 if (mb_equal (*b
, needle_mbchars
[j
]))
129 /* Set table[i] := i-1-j. */
133 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
134 for x = i-1-j, because
135 needle[i-1] != needle[j] = needle[i-1-x]. */
138 /* The inequality holds for all possible x. */
142 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
143 for i-1-j < x < i-1-j+table[j], because for these x:
145 = needle[x-(i-1-j)..j-1]
146 != needle[0..j-1-(x-(i-1-j))] (by definition of table[j])
148 hence needle[x..i-1] != needle[0..i-1-x].
150 needle[i-1-j+table[j]..i-2]
151 = needle[table[j]..j-1]
152 = needle[0..j-1-table[j]] (by definition of table[j]). */
155 /* Here: j = i - table[i]. */
159 /* Search, using the table to accelerate the processing. */
161 #if GNULIB_MCEL_PREFER
163 char const *rhaystack
= haystack
;
164 char const *phaystack
= haystack
;
167 /* Invariant: phaystack = rhaystack + j. */
175 mcel_t g
= mcel_scanz (phaystack
);
176 g
.ch
= c32tolower (g
.ch
);
177 if (mcel_cmp (needle_mbchars
[j
], g
) == 0)
180 /* Exit loop successfully if the entire needle has been found. */
187 /* Found a mismatch at needle[0] already. */
188 rhaystack
+= mcel_scanz (rhaystack
).len
;
193 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
194 size_t count
= table
[j
];
196 for (; count
!= 0; count
--)
197 rhaystack
+= mcel_scanz (rhaystack
).len
;
200 *resultp
= rhaystack
;
203 mbui_iterator_t rhaystack
;
204 mbui_iterator_t phaystack
;
208 mbui_init (rhaystack
, haystack
);
209 mbui_init (phaystack
, haystack
);
210 /* Invariant: phaystack = rhaystack + j. */
211 while (mbui_avail (phaystack
))
215 mb_copy (&c
, &mbui_cur (phaystack
));
217 c
.wc
= c32tolower (c
.wc
);
218 if (mb_equal (needle_mbchars
[j
], c
))
221 mbui_advance (phaystack
);
224 /* The entire needle has been found. */
225 *resultp
= mbui_cur_ptr (rhaystack
);
231 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
232 size_t count
= table
[j
];
234 for (; count
> 0; count
--)
236 if (!mbui_avail (rhaystack
))
238 mbui_advance (rhaystack
);
243 /* Found a mismatch at needle[0] already. */
244 if (!mbui_avail (rhaystack
))
246 mbui_advance (rhaystack
);
247 mbui_advance (phaystack
);
257 /* Find the first occurrence of the character string NEEDLE in the character
258 string HAYSTACK, using case-insensitive comparison.
259 Note: This function may, in multibyte locales, return success even if
260 strlen (haystack) < strlen (needle) ! */
262 mbscasestr (const char *haystack
, const char *needle
)
264 /* Be careful not to look at the entire extent of haystack or needle
265 until needed. This is useful because of these two cases:
266 - haystack may be very long, and a match of needle found early,
267 - needle may be very long, and not even a short initial segment of
268 needle may be found in haystack. */
271 #if GNULIB_MCEL_PREFER
273 return (char *) haystack
;
275 mcel_t ng
= mcel_scanz (needle
);
276 ng
.ch
= c32tolower (ng
.ch
);
278 /* Minimizing the worst-case complexity:
279 Let n = mbslen(haystack), m = mbslen(needle).
280 The naïve algorithm is O(n*m) worst-case.
281 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
283 To achieve linear complexity and yet amortize the cost of the
284 memory allocation, we activate the Knuth-Morris-Pratt algorithm
285 only once the naïve algorithm has already run for some time; more
287 - the outer loop count is >= 10,
288 - the average number of comparisons per outer loop is >= 5,
289 - the total number of comparisons is >= m.
290 But we try it only once. If the memory allocation attempt failed,
291 we don't retry it. */
293 size_t outer_loop_count
= 0;
294 size_t comparison_count
= 0;
296 /* Last comparison count, and needle + last_ccount. */
297 size_t last_ccount
= 0;
298 char const *iter_needle_last_ccount
= needle
;
300 char const *iter_haystack
= haystack
;
302 for (mcel_t hg
; *iter_haystack
; iter_haystack
+= hg
.len
)
304 /* See whether it's advisable to use an asymptotically faster
307 && outer_loop_count
>= 10
308 && comparison_count
>= 5 * outer_loop_count
)
310 /* See if needle + comparison_count now reaches the end of
312 size_t count
= comparison_count
- last_ccount
;
314 count
> 0 && *iter_needle_last_ccount
;
316 iter_needle_last_ccount
317 += mcel_scanz (iter_needle_last_ccount
).len
;
318 last_ccount
= comparison_count
;
319 if (!*iter_needle_last_ccount
)
322 if (knuth_morris_pratt_multibyte (haystack
, needle
,
324 return (char *) result
;
331 hg
= mcel_scanz (iter_haystack
);
332 hg
.ch
= c32tolower (hg
.ch
);
333 if (mcel_cmp (hg
, ng
) == 0)
334 /* The first character matches. */
336 char const *rhaystack
= iter_haystack
+ hg
.len
;
337 char const *rneedle
= needle
+ ng
.len
;
342 return (char *) iter_haystack
;
345 rhg
= mcel_scanz (rhaystack
); rhaystack
+= rhg
.len
;
346 rng
= mcel_scanz (rneedle
); rneedle
+= rng
.len
;
349 while (mcel_tocmp (c32tolower
, rhg
, rng
) == 0);
355 mbui_iterator_t iter_needle
;
357 mbui_init (iter_needle
, needle
);
358 if (mbui_avail (iter_needle
))
360 /* Minimizing the worst-case complexity:
361 Let n = mbslen(haystack), m = mbslen(needle).
362 The naïve algorithm is O(n*m) worst-case.
363 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
365 To achieve linear complexity and yet amortize the cost of the
366 memory allocation, we activate the Knuth-Morris-Pratt algorithm
367 only once the naïve algorithm has already run for some time; more
369 - the outer loop count is >= 10,
370 - the average number of comparisons per outer loop is >= 5,
371 - the total number of comparisons is >= m.
372 But we try it only once. If the memory allocation attempt failed,
373 we don't retry it. */
375 size_t outer_loop_count
= 0;
376 size_t comparison_count
= 0;
377 size_t last_ccount
= 0; /* last comparison count */
378 mbui_iterator_t iter_needle_last_ccount
; /* = needle + last_ccount */
381 mbui_iterator_t iter_haystack
;
383 mbui_init (iter_needle_last_ccount
, needle
);
385 mb_copy (&b
, &mbui_cur (iter_needle
));
387 b
.wc
= c32tolower (b
.wc
);
389 mbui_init (iter_haystack
, haystack
);
390 for (;; mbui_advance (iter_haystack
))
394 if (!mbui_avail (iter_haystack
))
398 /* See whether it's advisable to use an asymptotically faster
401 && outer_loop_count
>= 10
402 && comparison_count
>= 5 * outer_loop_count
)
404 /* See if needle + comparison_count now reaches the end of
406 size_t count
= comparison_count
- last_ccount
;
408 count
> 0 && mbui_avail (iter_needle_last_ccount
);
410 mbui_advance (iter_needle_last_ccount
);
411 last_ccount
= comparison_count
;
412 if (!mbui_avail (iter_needle_last_ccount
))
414 /* Try the Knuth-Morris-Pratt algorithm. */
417 knuth_morris_pratt_multibyte (haystack
, needle
,
420 return (char *) result
;
427 mb_copy (&c
, &mbui_cur (iter_haystack
));
429 c
.wc
= c32tolower (c
.wc
);
431 /* The first character matches. */
433 mbui_iterator_t rhaystack
;
434 mbui_iterator_t rneedle
;
436 memcpy (&rhaystack
, &iter_haystack
, sizeof (mbui_iterator_t
));
437 mbui_advance (rhaystack
);
439 mbui_init (rneedle
, needle
);
440 if (!mbui_avail (rneedle
))
442 mbui_advance (rneedle
);
444 for (;; mbui_advance (rhaystack
), mbui_advance (rneedle
))
446 if (!mbui_avail (rneedle
))
448 return (char *) mbui_cur_ptr (iter_haystack
);
449 if (!mbui_avail (rhaystack
))
453 if (!mb_caseequal (mbui_cur (rhaystack
),
455 /* Nothing in this round. */
462 return (char *) haystack
;
469 /* Minimizing the worst-case complexity:
470 Let n = strlen(haystack), m = strlen(needle).
471 The naïve algorithm is O(n*m) worst-case.
472 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
474 To achieve linear complexity and yet amortize the cost of the
475 memory allocation, we activate the Knuth-Morris-Pratt algorithm
476 only once the naïve algorithm has already run for some time; more
478 - the outer loop count is >= 10,
479 - the average number of comparisons per outer loop is >= 5,
480 - the total number of comparisons is >= m.
481 But we try it only once. If the memory allocation attempt failed,
482 we don't retry it. */
484 size_t outer_loop_count
= 0;
485 size_t comparison_count
= 0;
486 size_t last_ccount
= 0; /* last comparison count */
487 const char *needle_last_ccount
= needle
; /* = needle + last_ccount */
489 /* Speed up the following searches of needle by caching its first
490 character and lowercase counterpart. */
491 unsigned char B
= *needle
;
492 unsigned char b
= tolower (B
);
497 if (*haystack
== '\0')
501 /* See whether it's advisable to use an asymptotically faster
504 && outer_loop_count
>= 10
505 && comparison_count
>= 5 * outer_loop_count
)
507 /* See if needle + comparison_count now reaches the end of
509 if (needle_last_ccount
!= NULL
)
511 needle_last_ccount
+=
512 strnlen (needle_last_ccount
,
513 comparison_count
- last_ccount
);
514 if (*needle_last_ccount
== '\0')
515 needle_last_ccount
= NULL
;
516 last_ccount
= comparison_count
;
518 if (needle_last_ccount
== NULL
)
520 /* Try the Knuth-Morris-Pratt algorithm. */
521 const unsigned char *result
;
523 knuth_morris_pratt ((const unsigned char *) haystack
,
524 (const unsigned char *) (needle
- 1),
528 return (char *) result
;
535 unsigned char H
= *haystack
;
536 if (H
== B
|| H
== b
|| tolower (H
) == b
)
537 /* The first character matches. */
539 const char *rhaystack
= haystack
+ 1;
540 const char *rneedle
= needle
;
542 for (;; rhaystack
++, rneedle
++)
544 if (*rneedle
== '\0')
546 return (char *) haystack
;
547 if (*rhaystack
== '\0')
551 if (! (*rhaystack
== *rneedle
552 || (tolower ((unsigned char) *rhaystack
)
553 == tolower ((unsigned char) *rneedle
))))
554 /* Nothing in this round. */
561 return (char *) haystack
;