vfs: check userland buffers before reading them.
[haiku.git] / headers / private / interface / utf8_functions.h
blobf04e59ca22c2e13ef8cd5397bdf164969ada6019
1 /*
2 * Copyright 2004-2010, Haiku, Inc.
3 * Distributed under the terms of the MIT License.
4 */
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
9 #include <SupportDefs.h>
12 static inline bool
13 IsInsideGlyph(uchar ch)
15 return (ch & 0xc0) == 0x80;
19 static inline uint32
20 UTF8NextCharLenUnsafe(const char *text)
22 const char *ptr = text;
24 do {
25 ptr++;
26 } while (IsInsideGlyph(*ptr));
28 return ptr - text;
32 static inline uint32
33 UTF8NextCharLen(const char *text)
35 if (text == NULL || *text == 0)
36 return 0;
38 return UTF8NextCharLenUnsafe(text);
42 static inline uint32
43 UTF8NextCharLen(const char *bytes, size_t length)
45 if (bytes == NULL || length == 0 || bytes[0] == 0)
46 return 0;
48 if ((bytes[0] & 0x80) == 0) {
49 // A single ASCII char - or so...
50 return 1;
53 if (IsInsideGlyph(bytes[0])) {
54 // Not a proper multibyte start.
55 return 0;
58 // We already know that we have the upper two bits set due to the above
59 // two checks.
60 uint8 mask = 0x20;
61 size_t bytesExpected = 2;
62 while ((bytes[0] & mask) != 0) {
63 if (mask == 0x02) {
64 // Seven byte char - invalid.
65 return 0;
68 bytesExpected++;
69 mask >>= 1;
72 // There would need to be more bytes to satisfy the char.
73 if (bytesExpected > length)
74 return 0;
76 // We already know the first byte is fine, check the rest.
77 for (size_t i = 1; i < bytesExpected; i++) {
78 if (!IsInsideGlyph(bytes[i])) {
79 // The sequence is incomplete.
80 return 0;
84 // Puh, everything's fine.
85 return bytesExpected;
89 static inline uint32
90 UTF8PreviousCharLen(const char *text, const char *limit)
92 const char *ptr = text;
94 if (ptr == NULL || limit == NULL)
95 return 0;
97 do {
98 if (ptr == limit)
99 break;
100 ptr--;
101 } while (IsInsideGlyph(*ptr));
103 return text - ptr;
107 /*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
108 numChars characters are read. If numChars is a negative value it is ignored
109 and the string is read up to the terminating 0.
111 static inline uint32
112 UTF8CountBytes(const char *bytes, int32 numChars)
114 if (bytes == NULL)
115 return 0;
117 if (numChars < 0)
118 numChars = INT_MAX;
120 const char *base = bytes;
121 while (bytes[0] != '\0') {
122 if ((bytes[0] & 0xc0) != 0x80) {
123 if (--numChars < 0)
124 break;
126 bytes++;
129 return bytes - base;
133 /*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
134 numBytes bytes are read. If numBytes is a negative value it is ignored
135 and the string is read up to the terminating 0.
137 static inline uint32
138 UTF8CountChars(const char *bytes, int32 numBytes)
140 if (bytes == NULL)
141 return 0;
143 uint32 length = 0;
144 const char *last;
145 if (numBytes < 0)
146 last = (const char *)SIZE_MAX;
147 else
148 last = bytes + numBytes - 1;
150 while (bytes[0] && bytes <= last) {
151 if ((bytes++[0] & 0xc0) != 0x80)
152 length++;
155 return length;
159 /*! UTF8ToCharCode converts the input that includes potential multibyte chars
160 to UTF-32 char codes that can be used by FreeType. The string pointer is
161 then advanced to the next character in the string. In case the terminating
162 0 is reached, the string pointer is not advanced anymore and nulls are
163 returned. This makes it safe to overruns and enables streamed processing
164 of UTF8 strings.
166 static inline uint32
167 UTF8ToCharCode(const char **bytes)
169 #define UTF8_SUBSTITUTE_CHARACTER 0xfffd
171 uint32 result;
172 if (((*bytes)[0] & 0x80) == 0) {
173 // a single byte character
174 result = (*bytes)[0];
175 if (result != '\0') {
176 // do not advance beyond the terminating '\0'
177 (*bytes)++;
180 return result;
183 if (((*bytes)[0] & 0xc0) == 0x80) {
184 // not a proper multibyte start
185 (*bytes)++;
186 return UTF8_SUBSTITUTE_CHARACTER;
189 // start of a multibyte character
190 uint8 mask = 0x80;
191 result = (uint32)((*bytes)[0] & 0xff);
192 (*bytes)++;
194 while (result & mask) {
195 if (mask == 0x02) {
196 // seven byte char - invalid
197 return UTF8_SUBSTITUTE_CHARACTER;
200 result &= ~mask;
201 mask >>= 1;
204 while (((*bytes)[0] & 0xc0) == 0x80) {
205 result <<= 6;
206 result += (*bytes)[0] & 0x3f;
207 (*bytes)++;
209 mask <<= 1;
210 if (mask == 0x40)
211 return result;
214 if (mask == 0x40)
215 return result;
217 if ((*bytes)[0] == '\0') {
218 // string terminated within multibyte char
219 return 0x00;
222 // not enough bytes in multibyte char
223 return UTF8_SUBSTITUTE_CHARACTER;
225 #undef UTF8_SUBSTITUTE_CHARACTER
228 #endif // _UTF8_FUNCTIONS_H