errno-h: document Haiku errors can’t be -1
[gnulib.git] / lib / mbiterf.h
blob99d8d11d84e3a31a9f0a08aac232ba129ade85bb
1 /* Iterating through multibyte strings, faster: macros for multi-byte encodings.
2 Copyright (C) 2001, 2005, 2007, 2009-2025 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>,
18 with insights from Paul Eggert. */
20 /* The macros in this file implement forward iteration through a
21 multi-byte string.
23 With these macros, an iteration loop that looks like
25 char *iter;
26 for (iter = buf; iter < buf + buflen; iter++)
28 do_something (*iter);
31 becomes
33 const char *buf_end = buf + buflen;
34 mbif_state_t state;
35 [const] char *iter;
36 for (mbif_init (state), iter = buf; mbif_avail (state, iter, buf_end); )
37 {
38 mbchar_t cur = mbif_next (state, iter, buf_end);
39 // Note: Here always mb_ptr (cur) == iter.
40 do_something (iter, mb_len (cur));
41 iter += mb_len (cur);
42 }
44 The benefit of these macros over plain use of mbrtowc or mbrtoc32 is:
45 - Handling of invalid multibyte sequences is possible without
46 making the code more complicated, while still preserving the
47 invalid multibyte sequences.
49 The benefit of these macros over those from mbiter.h is that it
50 produces faster code with today's optimizing compilers (because mbif_next
51 returns its result by value).
53 mbif_state_t
54 is a type usable for variable declarations.
56 mbif_init (state)
57 initializes the state.
59 mbif_avail (state, iter, endptr)
60 returns true if another loop round is needed.
62 mbif_next (state, iter, endptr)
63 returns the next multibyte character.
64 It assumes that the state is initialized and that iter < endptr.
66 Here are the function prototypes of the macros.
68 extern void mbif_init (mbif_state_t state);
69 extern bool mbif_avail (mbif_state_t state, const char *iter, const char *endptr);
70 extern mbchar_t mbif_next (mbif_state_t state, const char *iter, const char *endptr);
71 */
73 #ifndef _MBITERF_H
74 #define _MBITERF_H 1
76 /* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
77 _GL_ATTRIBUTE_ALWAYS_INLINE. */
78 #if !_GL_CONFIG_H_INCLUDED
79 #error "Please include config.h first."
80 #endif
82 #include <assert.h>
83 #include <stddef.h>
84 #include <string.h>
85 #include <uchar.h>
86 #include <wchar.h>
88 #include "mbchar.h"
90 _GL_INLINE_HEADER_BEGIN
91 #ifndef MBITERF_INLINE
92 # define MBITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE
93 #endif
95 #ifdef __cplusplus
96 extern "C" {
97 #endif
/* Iteration state carried between successive mbiterf_next invocations.  */
struct mbif_state
{
  #if !GNULIB_MBRTOC32_REGULAR
  bool in_shift;        /* true if next byte may not be interpreted as ASCII */
                        /* If GNULIB_MBRTOC32_REGULAR, it is always false,
                           so optimize it away.  */
  #endif
  mbstate_t state;      /* if in_shift: current shift state */
                        /* If GNULIB_MBRTOC32_REGULAR, it is in an initial state
                           before and after every mbiterf_next invocation.
                         */
};
113 MBITERF_INLINE mbchar_t
114 mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr)
116 #if !GNULIB_MBRTOC32_REGULAR
117 if (ps->in_shift)
118 goto with_shift;
119 #endif
120 /* Handle most ASCII characters quickly, without calling mbrtowc(). */
121 if (is_basic (*iter))
123 /* These characters are part of the POSIX portable character set.
124 For most of them, namely those in the ISO C basic character set,
125 ISO C 99 guarantees that their wide character code is identical to
126 their char code. For the few other ones, this is the case as well,
127 in all locale encodings that are in use. The 32-bit wide character
128 code is the same as well. */
129 return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter };
131 else
133 assert (mbsinit (&ps->state));
134 #if !GNULIB_MBRTOC32_REGULAR
135 ps->in_shift = true;
136 with_shift:;
137 #endif
138 size_t bytes;
139 char32_t wc;
140 bytes = mbrtoc32 (&wc, iter, endptr - iter, &ps->state);
141 if (bytes == (size_t) -1)
143 /* An invalid multibyte sequence was encountered. */
144 /* Allow the next invocation to continue from a sane state. */
145 #if !GNULIB_MBRTOC32_REGULAR
146 ps->in_shift = false;
147 #endif
148 mbszero (&ps->state);
149 return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
151 else if (bytes == (size_t) -2)
153 /* An incomplete multibyte character at the end. */
154 #if !GNULIB_MBRTOC32_REGULAR
155 ps->in_shift = false;
156 #endif
157 /* Whether to reset ps->state or not is not important; the string end
158 is reached anyway. */
159 return (mbchar_t) { .ptr = iter, .bytes = endptr - iter, .wc_valid = false };
161 else
163 if (bytes == 0)
165 /* A null wide character was encountered. */
166 bytes = 1;
167 assert (*iter == '\0');
168 assert (wc == 0);
170 #if !GNULIB_MBRTOC32_REGULAR
171 else if (bytes == (size_t) -3)
172 /* The previous multibyte sequence produced an additional 32-bit
173 wide character. */
174 bytes = 0;
175 #endif
177 /* When in an initial state, we can go back treating ASCII
178 characters more quickly. */
179 #if !GNULIB_MBRTOC32_REGULAR
180 if (mbsinit (&ps->state))
181 ps->in_shift = false;
182 #endif
183 return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc };
/* Iteration macros.  */
typedef struct mbif_state mbif_state_t;

#if GNULIB_MBRTOC32_REGULAR
/* Optimized variants: struct mbif_state has no in_shift member.  */
# define mbif_init(st) \
   (mbszero (&(st).state))
# define mbif_avail(st, iter, endptr) ((iter) < (endptr))
#else
/* Clear the shift flag and zero the conversion state.  */
# define mbif_init(st) \
   ((st).in_shift = false, mbszero (&(st).state))
/* Another round is needed while in_shift is set, even if ITER has
   reached ENDPTR.  */
# define mbif_avail(st, iter, endptr) ((st).in_shift || ((iter) < (endptr)))
#endif

/* Forward to the inline function, passing the state by address.  */
#define mbif_next(st, iter, endptr) \
  mbiterf_next (&(st), (iter), (endptr))
/* Close the extern "C" block opened near the top of this header.  */
#ifdef __cplusplus
}
#endif
212 _GL_INLINE_HEADER_END
214 #endif /* _MBITERF_H */