2 * Copyright 2004-2010, Haiku, Inc.
3 * Distributed under the terms of the MIT License.
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
9 #include <SupportDefs.h>
// IsInsideGlyph: tests whether `ch` is a UTF-8 continuation byte, i.e. a byte
// whose top two bits are binary 10 (0x80..0xbf). Such a byte sits inside a
// multibyte character and never starts one.
// NOTE(review): the return-type line and the braces of this definition are
// absent from this copy of the file; restore them from upstream.
13 IsInsideGlyph(uchar ch
)
// Keep only the top two bits and compare them against the 10xxxxxx pattern.
15 return (ch
& 0xc0) == 0x80;
// UTF8NextCharLenUnsafe: scans from `text` with a cursor and keeps looping
// while the byte under the cursor is a continuation byte (IsInsideGlyph).
// No NULL or empty-string check is visible in this function.
// NOTE(review): the loop head, the loop body and the return statement are
// missing from this copy. Presumably the cursor is advanced once per iteration
// and its distance from `text` is returned as the byte length of the
// character; confirm against upstream.
20 UTF8NextCharLenUnsafe(const char *text
)
// The cursor starts at the first byte of the character.
22 const char *ptr
= text
;
// Tail of a do/while: repeat for as long as the cursor is on a continuation byte.
26 } while (IsInsideGlyph(*ptr
));
// UTF8NextCharLen (string overload): checked front end for
// UTF8NextCharLenUnsafe. A NULL pointer or an empty string is caught by the
// guard below; any other input is forwarded unchanged.
// NOTE(review): the statement controlled by the guard is missing from this
// copy; presumably it returns 0. Confirm against upstream.
33 UTF8NextCharLen(const char *text
)
// Reject NULL and the empty string before anything is dereferenced further.
35 if (text
== NULL
|| *text
== 0)
// Input is non-NULL and non-empty, so the unchecked scanner can be used.
38 return UTF8NextCharLenUnsafe(text
);
// UTF8NextCharLen (buffer overload): validates the character that starts at
// `bytes` while reading at most `length` bytes. Visible steps, in order:
//   1. reject NULL, a zero length, or a leading NUL byte;
//   2. treat a byte with the top bit clear as a one-byte character;
//   3. reject a continuation byte in the leading position;
//   4. derive the expected sequence length (starting at 2) from the bits of
//      the lead byte selected by `mask`;
//   5. require that many bytes to be available within `length`;
//   6. require every byte after the first to be a continuation byte.
// NOTE(review): the declaration and update of `mask`, the increment of
// bytesExpected and every return statement are missing from this copy.
// Presumably failures return 0 and success returns bytesExpected; confirm
// against upstream.
43 UTF8NextCharLen(const char *bytes
, size_t length
)
45 if (bytes
== NULL
|| length
== 0 || bytes
[0] == 0)
48 if ((bytes
[0] & 0x80) == 0) {
49 // Top bit clear: a single-byte (ASCII) character.
53 if (IsInsideGlyph(bytes
[0])) {
54 // A continuation byte cannot start a multibyte character.
58 // The two checks above already guarantee that the upper two bits are set.
61 size_t bytesExpected
= 2;
62 while ((bytes
[0] & mask
) != 0) {
64 // Seven byte char - invalid.
72 // The character needs more bytes than the buffer holds.
73 if (bytesExpected
> length
)
76 // The first byte is already known to be fine; check the rest.
77 for (size_t i
= 1; i
< bytesExpected
; i
++) {
78 if (!IsInsideGlyph(bytes
[i
])) {
79 // The sequence is incomplete.
84 // Phew, everything is fine.
// UTF8PreviousCharLen: takes a position `text` and a second pointer `limit`,
// guards against either being NULL, then loops with a cursor for as long as
// the byte under it is a continuation byte (IsInsideGlyph).
// NOTE(review): the statement controlled by the NULL guard, the loop head and
// body, and the return statement are missing from this copy. Presumably the
// cursor steps backwards from `text`, stops at `limit`, and the distance
// walked is returned; confirm against upstream.
90 UTF8PreviousCharLen(const char *text
, const char *limit
)
// The cursor starts at the given position.
92 const char *ptr
= text
;
// Neither pointer may be NULL (ptr still equals text at this point).
94 if (ptr
== NULL
|| limit
== NULL
)
// Tail of a do/while: repeat while the cursor is on a continuation byte.
101 } while (IsInsideGlyph(*ptr
));
107 /*! UTF8CountBytes gets the length (in bytes) of a UTF-8 string. Up to
numChars characters are read. If numChars is negative it is ignored
and the string is read up to the terminating 0. */
// UTF8CountBytes: walks `bytes` until the terminating NUL and, inside the
// loop, singles out every byte that is not a continuation byte (the first
// byte of a character).
// NOTE(review): the early checks, the handling of `numChars`, the pointer
// advance and the return statement are missing from this copy. Presumably the
// result is the distance between the final position and `base`; confirm
// against upstream.
112 UTF8CountBytes(const char *bytes
, int32 numChars
)
// Remember where the string started; `bytes` itself is used as the cursor.
120 const char *base
= bytes
;
121 while (bytes
[0] != '\0') {
// True for any byte that does not match 10xxxxxx, i.e. a character start.
122 if ((bytes
[0] & 0xc0) != 0x80) {
133 /*! UTF8CountChars gets the length (in characters) of a UTF-8 string. Up to
numBytes bytes are read. If numBytes is negative it is ignored
and the string is read up to the terminating 0. */
// UTF8CountChars: scans `bytes` up to the terminating NUL or up to the address
// held in `last`, whichever comes first, and singles out every byte that is
// not a continuation byte (the first byte of a character).
// NOTE(review): the declaration of `last`, the condition that selects between
// the two assignments below, the counter and the return statement are missing
// from this copy. Presumably the SIZE_MAX form is used when numBytes is
// negative and the function returns the number of character starts seen;
// confirm against upstream.
138 UTF8CountChars(const char *bytes
, int32 numBytes
)
// Unbounded case: the highest possible address, so `bytes <= last` always holds.
146 last
= (const char *)SIZE_MAX
;
// Bounded case: address of the last byte that may be examined.
148 last
= bytes
+ numBytes
- 1;
150 while (bytes
[0] && bytes
<= last
) {
// Read the current byte, then advance; the test is true for a character start.
151 if ((bytes
++[0] & 0xc0) != 0x80)
159 /*! UTF8ToCharCode converts input that may include multibyte chars
to UTF-32 char codes that can be used by FreeType. The string pointer is
then advanced to the next character in the string. In case the terminating
0 is reached, the string pointer is not advanced anymore and nulls are
returned. This makes it safe against overruns and enables streamed processing. */
// UTF8ToCharCode: decodes the character at `*bytes`. Visible branches:
//   - lead byte with the top bit clear: the result is that byte;
//   - continuation byte in the leading position: the substitute character
//     is returned;
//   - otherwise the lead byte is loaded into `result`, tested against `mask`,
//     and the low six bits of each following continuation byte are added.
// The substitute character is also returned for an over-long lead byte and
// when the sequence has too few bytes.
// NOTE(review): the declarations of `result` and `mask`, the shifts applied to
// `result`, every advance of `*bytes` and the final return are missing from
// this copy; confirm the details against upstream.
167 UTF8ToCharCode(const char **bytes
)
// 0xfffd is U+FFFD, the Unicode replacement character; the macro is only
// defined for the extent of this function (see the matching undef at the end).
169 #define UTF8_SUBSTITUTE_CHARACTER 0xfffd
172 if (((*bytes
)[0] & 0x80) == 0) {
173 // a single byte character
174 result
= (*bytes
)[0];
175 if (result
!= '\0') {
176 // do not advance beyond the terminating '\0'
183 if (((*bytes
)[0] & 0xc0) == 0x80) {
184 // not a proper multibyte start
186 return UTF8_SUBSTITUTE_CHARACTER
;
189 // start of a multibyte character
// The "& 0xff" keeps only the low eight bits, so a negative char value does
// not leak sign-extension bits into the uint32 result.
191 result
= (uint32
)((*bytes
)[0] & 0xff);
194 while (result
& mask
) {
196 // seven byte char - invalid
197 return UTF8_SUBSTITUTE_CHARACTER
;
// Consume continuation bytes (10xxxxxx) one at a time.
204 while (((*bytes
)[0] & 0xc0) == 0x80) {
// Each continuation byte contributes its low six payload bits.
206 result
+= (*bytes
)[0] & 0x3f;
217 if ((*bytes
)[0] == '\0') {
218 // string terminated within multibyte char
222 // not enough bytes in multibyte char
223 return UTF8_SUBSTITUTE_CHARACTER
;
225 #undef UTF8_SUBSTITUTE_CHARACTER
228 #endif // _UTF8_FUNCTIONS_H