Fix last ChangeLog entry.
[gnulib.git] / lib / mbscasestr.c
blob7a186e0f988e24d528326d7b2cd2edd1d7d3ad3c
1 /* Case-insensitive searching in a string. -*- coding: utf-8 -*-
2 Copyright (C) 2005-2025 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2005.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 #include <config.h>
20 /* Specification. */
21 #include <string.h>
23 #include <ctype.h>
24 #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */
25 #include <stdlib.h>
27 #include "malloca.h"
29 #if GNULIB_MCEL_PREFER
30 # include "mcel.h"
31 typedef mcel_t mbchar_t;
32 static bool mb_equal (mcel_t a, mcel_t b) { return mcel_cmp (a, b) == 0; }
33 #else
34 # include "mbuiter.h"
35 #endif
37 /* Knuth-Morris-Pratt algorithm. */
38 #define UNIT unsigned char
39 #define CANON_ELEMENT(c) tolower (c)
40 #include "str-kmp.h"
42 /* Knuth-Morris-Pratt algorithm.
43 See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
44 Return a boolean indicating success:
45 Return true and set *RESULTP if the search was completed.
46 Return false if it was aborted because not enough memory was available. */
47 static bool
48 knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
49 const char **resultp)
51 size_t m = mbslen (needle);
52 mbchar_t *needle_mbchars;
53 size_t extra_align = (alignof (mbchar_t) < alignof (size_t)
54 ? alignof (size_t) - alignof (mbchar_t)
55 : 0);
57 /* Allocate room for needle_mbchars and the table. */
58 void *memory = nmalloca (m + !!extra_align,
59 sizeof (mbchar_t) + sizeof (size_t));
60 void *table_memory;
61 if (memory == NULL)
62 return false;
63 needle_mbchars = memory;
64 table_memory = needle_mbchars + m;
65 char *aligned = table_memory;
66 aligned += extra_align;
67 aligned -= (uintptr_t) aligned % alignof (size_t);
68 size_t *table = table_memory = aligned;
70 /* Fill needle_mbchars. */
71 #if GNULIB_MCEL_PREFER
72 for (size_t j = 0; *needle; needle += needle_mbchars[j++].len)
74 needle_mbchars[j] = mcel_scanz (needle);
75 needle_mbchars[j].ch = c32tolower (needle_mbchars[j].ch);
77 #else
79 mbui_iterator_t iter;
80 size_t j;
82 j = 0;
83 for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
85 mb_copy (&needle_mbchars[j], &mbui_cur (iter));
86 if (needle_mbchars[j].wc_valid)
87 needle_mbchars[j].wc = c32tolower (needle_mbchars[j].wc);
90 #endif
92 /* Fill the table.
93 For 0 < i < m:
94 0 < table[i] <= i is defined such that
95 forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
96 and table[i] is as large as possible with this property.
97 This implies:
98 1) For 0 < i < m:
99 If table[i] < i,
100 needle[table[i]..i-1] = needle[0..i-1-table[i]].
101 2) For 0 < i < m:
102 rhaystack[0..i-1] == needle[0..i-1]
103 and exists h, i <= h < m: rhaystack[h] != needle[h]
104 implies
105 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
106 table[0] remains uninitialized. */
108 size_t i, j;
110 /* i = 1: Nothing to verify for x = 0. */
111 table[1] = 1;
112 j = 0;
114 for (i = 2; i < m; i++)
116 /* Here: j = i-1 - table[i-1].
117 The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
118 for x < table[i-1], by induction.
119 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
120 mbchar_t *b = &needle_mbchars[i - 1];
122 for (;;)
124 /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
125 is known to hold for x < i-1-j.
126 Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
127 if (mb_equal (*b, needle_mbchars[j]))
129 /* Set table[i] := i-1-j. */
130 table[i] = i - ++j;
131 break;
133 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
134 for x = i-1-j, because
135 needle[i-1] != needle[j] = needle[i-1-x]. */
136 if (j == 0)
138 /* The inequality holds for all possible x. */
139 table[i] = i;
140 break;
142 /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
143 for i-1-j < x < i-1-j+table[j], because for these x:
144 needle[x..i-2]
145 = needle[x-(i-1-j)..j-1]
146 != needle[0..j-1-(x-(i-1-j))] (by definition of table[j])
147 = needle[0..i-2-x],
148 hence needle[x..i-1] != needle[0..i-1-x].
149 Furthermore
150 needle[i-1-j+table[j]..i-2]
151 = needle[table[j]..j-1]
152 = needle[0..j-1-table[j]] (by definition of table[j]). */
153 j = j - table[j];
155 /* Here: j = i - table[i]. */
159 /* Search, using the table to accelerate the processing. */
161 #if GNULIB_MCEL_PREFER
162 size_t j;
163 char const *rhaystack = haystack;
164 char const *phaystack = haystack;
166 j = 0;
167 /* Invariant: phaystack = rhaystack + j. */
168 for (;;)
170 if (!*phaystack)
172 rhaystack = NULL;
173 break;
175 mcel_t g = mcel_scanz (phaystack);
176 g.ch = c32tolower (g.ch);
177 if (mcel_cmp (needle_mbchars[j], g) == 0)
179 j++;
180 /* Exit loop successfully if the entire needle has been found. */
181 if (j == m)
182 break;
183 phaystack += g.len;
185 else if (j == 0)
187 /* Found a mismatch at needle[0] already. */
188 rhaystack += mcel_scanz (rhaystack).len;
189 phaystack += g.len;
191 else
193 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
194 size_t count = table[j];
195 j -= count;
196 for (; count != 0; count--)
197 rhaystack += mcel_scanz (rhaystack).len;
200 *resultp = rhaystack;
201 #else
202 size_t j;
203 mbui_iterator_t rhaystack;
204 mbui_iterator_t phaystack;
206 *resultp = NULL;
207 j = 0;
208 mbui_init (rhaystack, haystack);
209 mbui_init (phaystack, haystack);
210 /* Invariant: phaystack = rhaystack + j. */
211 while (mbui_avail (phaystack))
213 mbchar_t c;
215 mb_copy (&c, &mbui_cur (phaystack));
216 if (c.wc_valid)
217 c.wc = c32tolower (c.wc);
218 if (mb_equal (needle_mbchars[j], c))
220 j++;
221 mbui_advance (phaystack);
222 if (j == m)
224 /* The entire needle has been found. */
225 *resultp = mbui_cur_ptr (rhaystack);
226 break;
229 else if (j > 0)
231 /* Found a match of needle[0..j-1], mismatch at needle[j]. */
232 size_t count = table[j];
233 j -= count;
234 for (; count > 0; count--)
236 if (!mbui_avail (rhaystack))
237 abort ();
238 mbui_advance (rhaystack);
241 else
243 /* Found a mismatch at needle[0] already. */
244 if (!mbui_avail (rhaystack))
245 abort ();
246 mbui_advance (rhaystack);
247 mbui_advance (phaystack);
250 #endif
253 freea (memory);
254 return true;
257 /* Find the first occurrence of the character string NEEDLE in the character
258 string HAYSTACK, using case-insensitive comparison.
259 Note: This function may, in multibyte locales, return success even if
260 strlen (haystack) < strlen (needle) ! */
261 char *
262 mbscasestr (const char *haystack, const char *needle)
264 /* Be careful not to look at the entire extent of haystack or needle
265 until needed. This is useful because of these two cases:
266 - haystack may be very long, and a match of needle found early,
267 - needle may be very long, and not even a short initial segment of
268 needle may be found in haystack. */
269 if (MB_CUR_MAX > 1)
271 #if GNULIB_MCEL_PREFER
272 if (!*needle)
273 return (char *) haystack;
275 mcel_t ng = mcel_scanz (needle);
276 ng.ch = c32tolower (ng.ch);
278 /* Minimizing the worst-case complexity:
279 Let n = mbslen(haystack), m = mbslen(needle).
280 The naïve algorithm is O(n*m) worst-case.
281 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
282 memory allocation.
283 To achieve linear complexity and yet amortize the cost of the
284 memory allocation, we activate the Knuth-Morris-Pratt algorithm
285 only once the naïve algorithm has already run for some time; more
286 precisely, when
287 - the outer loop count is >= 10,
288 - the average number of comparisons per outer loop is >= 5,
289 - the total number of comparisons is >= m.
290 But we try it only once. If the memory allocation attempt failed,
291 we don't retry it. */
292 bool try_kmp = true;
293 size_t outer_loop_count = 0;
294 size_t comparison_count = 0;
296 /* Last comparison count, and needle + last_ccount. */
297 size_t last_ccount = 0;
298 char const *iter_needle_last_ccount = needle;
300 char const *iter_haystack = haystack;
302 for (mcel_t hg; *iter_haystack; iter_haystack += hg.len)
304 /* See whether it's advisable to use an asymptotically faster
305 algorithm. */
306 if (try_kmp
307 && outer_loop_count >= 10
308 && comparison_count >= 5 * outer_loop_count)
310 /* See if needle + comparison_count now reaches the end of
311 needle. */
312 size_t count = comparison_count - last_ccount;
313 for (;
314 count > 0 && *iter_needle_last_ccount;
315 count--)
316 iter_needle_last_ccount
317 += mcel_scanz (iter_needle_last_ccount).len;
318 last_ccount = comparison_count;
319 if (!*iter_needle_last_ccount)
321 char const *result;
322 if (knuth_morris_pratt_multibyte (haystack, needle,
323 &result))
324 return (char *) result;
325 try_kmp = false;
329 outer_loop_count++;
330 comparison_count++;
331 hg = mcel_scanz (iter_haystack);
332 hg.ch = c32tolower (hg.ch);
333 if (mcel_cmp (hg, ng) == 0)
334 /* The first character matches. */
336 char const *rhaystack = iter_haystack + hg.len;
337 char const *rneedle = needle + ng.len;
338 mcel_t rhg, rng;
341 if (!*rneedle)
342 return (char *) iter_haystack;
343 if (!*rhaystack)
344 return NULL;
345 rhg = mcel_scanz (rhaystack); rhaystack += rhg.len;
346 rng = mcel_scanz (rneedle); rneedle += rng.len;
347 comparison_count++;
349 while (mcel_tocmp (c32tolower, rhg, rng) == 0);
353 return NULL;
354 #else
355 mbui_iterator_t iter_needle;
357 mbui_init (iter_needle, needle);
358 if (mbui_avail (iter_needle))
360 /* Minimizing the worst-case complexity:
361 Let n = mbslen(haystack), m = mbslen(needle).
362 The naïve algorithm is O(n*m) worst-case.
363 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
364 memory allocation.
365 To achieve linear complexity and yet amortize the cost of the
366 memory allocation, we activate the Knuth-Morris-Pratt algorithm
367 only once the naïve algorithm has already run for some time; more
368 precisely, when
369 - the outer loop count is >= 10,
370 - the average number of comparisons per outer loop is >= 5,
371 - the total number of comparisons is >= m.
372 But we try it only once. If the memory allocation attempt failed,
373 we don't retry it. */
374 bool try_kmp = true;
375 size_t outer_loop_count = 0;
376 size_t comparison_count = 0;
377 size_t last_ccount = 0; /* last comparison count */
378 mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
380 mbchar_t b;
381 mbui_iterator_t iter_haystack;
383 mbui_init (iter_needle_last_ccount, needle);
385 mb_copy (&b, &mbui_cur (iter_needle));
386 if (b.wc_valid)
387 b.wc = c32tolower (b.wc);
389 mbui_init (iter_haystack, haystack);
390 for (;; mbui_advance (iter_haystack))
392 mbchar_t c;
394 if (!mbui_avail (iter_haystack))
395 /* No match. */
396 return NULL;
398 /* See whether it's advisable to use an asymptotically faster
399 algorithm. */
400 if (try_kmp
401 && outer_loop_count >= 10
402 && comparison_count >= 5 * outer_loop_count)
404 /* See if needle + comparison_count now reaches the end of
405 needle. */
406 size_t count = comparison_count - last_ccount;
407 for (;
408 count > 0 && mbui_avail (iter_needle_last_ccount);
409 count--)
410 mbui_advance (iter_needle_last_ccount);
411 last_ccount = comparison_count;
412 if (!mbui_avail (iter_needle_last_ccount))
414 /* Try the Knuth-Morris-Pratt algorithm. */
415 const char *result;
416 bool success =
417 knuth_morris_pratt_multibyte (haystack, needle,
418 &result);
419 if (success)
420 return (char *) result;
421 try_kmp = false;
425 outer_loop_count++;
426 comparison_count++;
427 mb_copy (&c, &mbui_cur (iter_haystack));
428 if (c.wc_valid)
429 c.wc = c32tolower (c.wc);
430 if (mb_equal (c, b))
431 /* The first character matches. */
433 mbui_iterator_t rhaystack;
434 mbui_iterator_t rneedle;
436 memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
437 mbui_advance (rhaystack);
439 mbui_init (rneedle, needle);
440 if (!mbui_avail (rneedle))
441 abort ();
442 mbui_advance (rneedle);
444 for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
446 if (!mbui_avail (rneedle))
447 /* Found a match. */
448 return (char *) mbui_cur_ptr (iter_haystack);
449 if (!mbui_avail (rhaystack))
450 /* No match. */
451 return NULL;
452 comparison_count++;
453 if (!mb_caseequal (mbui_cur (rhaystack),
454 mbui_cur (rneedle)))
455 /* Nothing in this round. */
456 break;
461 else
462 return (char *) haystack;
463 #endif
465 else
467 if (*needle != '\0')
469 /* Minimizing the worst-case complexity:
470 Let n = strlen(haystack), m = strlen(needle).
471 The naïve algorithm is O(n*m) worst-case.
472 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
473 memory allocation.
474 To achieve linear complexity and yet amortize the cost of the
475 memory allocation, we activate the Knuth-Morris-Pratt algorithm
476 only once the naïve algorithm has already run for some time; more
477 precisely, when
478 - the outer loop count is >= 10,
479 - the average number of comparisons per outer loop is >= 5,
480 - the total number of comparisons is >= m.
481 But we try it only once. If the memory allocation attempt failed,
482 we don't retry it. */
483 bool try_kmp = true;
484 size_t outer_loop_count = 0;
485 size_t comparison_count = 0;
486 size_t last_ccount = 0; /* last comparison count */
487 const char *needle_last_ccount = needle; /* = needle + last_ccount */
489 /* Speed up the following searches of needle by caching its first
490 character and lowercase counterpart. */
491 unsigned char B = *needle;
492 unsigned char b = tolower (B);
494 needle++;
495 for (;; haystack++)
497 if (*haystack == '\0')
498 /* No match. */
499 return NULL;
501 /* See whether it's advisable to use an asymptotically faster
502 algorithm. */
503 if (try_kmp
504 && outer_loop_count >= 10
505 && comparison_count >= 5 * outer_loop_count)
507 /* See if needle + comparison_count now reaches the end of
508 needle. */
509 if (needle_last_ccount != NULL)
511 needle_last_ccount +=
512 strnlen (needle_last_ccount,
513 comparison_count - last_ccount);
514 if (*needle_last_ccount == '\0')
515 needle_last_ccount = NULL;
516 last_ccount = comparison_count;
518 if (needle_last_ccount == NULL)
520 /* Try the Knuth-Morris-Pratt algorithm. */
521 const unsigned char *result;
522 bool success =
523 knuth_morris_pratt ((const unsigned char *) haystack,
524 (const unsigned char *) (needle - 1),
525 strlen (needle - 1),
526 &result);
527 if (success)
528 return (char *) result;
529 try_kmp = false;
533 outer_loop_count++;
534 comparison_count++;
535 unsigned char H = *haystack;
536 if (H == B || H == b || tolower (H) == b)
537 /* The first character matches. */
539 const char *rhaystack = haystack + 1;
540 const char *rneedle = needle;
542 for (;; rhaystack++, rneedle++)
544 if (*rneedle == '\0')
545 /* Found a match. */
546 return (char *) haystack;
547 if (*rhaystack == '\0')
548 /* No match. */
549 return NULL;
550 comparison_count++;
551 if (! (*rhaystack == *rneedle
552 || (tolower ((unsigned char) *rhaystack)
553 == tolower ((unsigned char) *rneedle))))
554 /* Nothing in this round. */
555 break;
560 else
561 return (char *) haystack;