1 /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2 /* cairo - a vector graphics library with display and print output
4 * The code in this file is derived from GLib's gutf8.c and
5 * ultimately from libunicode. It is relicensed under the
6 * dual LGPL/MPL with permission of the original authors.
8 * Copyright © 1999 Tom Tromey
9 * Copyright © 2005 Red Hat, Inc
11 * This library is free software; you can redistribute it and/or
12 * modify it either under the terms of the GNU Lesser General Public
13 * License version 2.1 as published by the Free Software Foundation
14 * (the "LGPL") or, at your option, under the terms of the Mozilla
15 * Public License Version 1.1 (the "MPL"). If you do not alter this
16 * notice, a recipient may use your version of this file under either
17 * the MPL or the LGPL.
19 * You should have received a copy of the LGPL along with this library
20 * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * You should have received a copy of the MPL along with this library
23 * in the file COPYING-MPL-1.1
25 * The contents of this file are subject to the Mozilla Public License
26 * Version 1.1 (the "License"); you may not use this file except in
27 * compliance with the License. You may obtain a copy of the License at
28 * http://www.mozilla.org/MPL/
30 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31 * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32 * the specific language governing rights and limitations.
34 * The Original Code is the cairo graphics library.
36 * The Initial Developer of the Original Code is Tom Tromey.
40 * Owen Taylor <otaylor@redhat.com>
45 #define UTF8_COMPUTE(Char, Mask, Len) \
51 else if ((Char & 0xe0) == 0xc0) \
56 else if ((Char & 0xf0) == 0xe0) \
61 else if ((Char & 0xf8) == 0xf0) \
66 else if ((Char & 0xfc) == 0xf8) \
71 else if ((Char & 0xfe) == 0xfc) \
/* Number of bytes (1 to 6) selected for the code point Char by the
 * thresholds 0x80, 0x800, 0x10000, 0x200000 and 0x4000000.
 * Char is expanded several times, so it must not have side effects.
 */
#define UTF8_LENGTH(Char)			\
    ((Char) < 0x80      ? 1 :			\
     (Char) < 0x800     ? 2 :			\
     (Char) < 0x10000   ? 3 :			\
     (Char) < 0x200000  ? 4 :			\
     (Char) < 0x4000000 ? 5 : 6)
86 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
87 (Result) = (Chars)[0] & (Mask); \
88 for ((Count) = 1; (Count) < (Len); ++(Count)) \
90 if (((Chars)[(Count)] & 0xc0) != 0x80) \
96 (Result) |= ((Chars)[(Count)] & 0x3f); \
/* Non-zero when Char passes all four checks: it is below 0x110000,
 * it is outside 0xD800-0xDFFF (the mask test compares every bit above
 * the low 11 against 0xD800), it is outside 0xFDD0-0xFDEF, and its low
 * 16 bits are neither 0xFFFE nor 0xFFFF.
 * Char is expanded several times, so it must not have side effects.
 */
#define UNICODE_VALID(Char)					\
    ((Char) < 0x110000 &&					\
     ((Char) & 0xFFFFF800) != 0xD800 &&				\
     !((Char) >= 0xFDD0 && (Char) <= 0xFDEF) &&			\
     ((Char) & 0xFFFE) != 0xFFFE)
/* Step table indexed by a byte value: the entry is how far
 * UTF8_NEXT_CHAR moves a pointer that currently points at that byte.
 * Entries for 0x00-0xbf and for 0xfe/0xff are 1; 0xc0-0xdf are 2,
 * 0xe0-0xef are 3, 0xf0-0xf7 are 4, 0xf8-0xfb are 5, 0xfc-0xfd are 6.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xa0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xb0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Yields p advanced by the table entry for the byte p points at.
 * p is expanded twice, so it must not have side effects.
 */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *) (p)])
118 /* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
119 * If @p does not point to a valid UTF-8 encoded character, results are
123 _utf8_get_char (const unsigned char *p
)
125 int i
, mask
= 0, len
;
127 unsigned char c
= (unsigned char) *p
;
129 UTF8_COMPUTE (c
, mask
, len
);
132 UTF8_GET (result
, p
, i
, mask
, len
);
137 /* Like _utf8_get_char, but takes a maximum length
138 * and returns (uint32_t)-2 on an incomplete trailing character
141 _utf8_get_char_extended (const unsigned char *p
,
145 uint32_t wc
= (unsigned char) *p
;
149 } else if (wc
< 0xc0) {
151 } else if (wc
< 0xe0) {
154 } else if (wc
< 0xf0) {
157 } else if (wc
< 0xf8) {
160 } else if (wc
< 0xfc) {
163 } else if (wc
< 0xfe) {
170 if (max_len
>= 0 && len
> max_len
) {
171 for (i
= 1; i
< max_len
; i
++) {
172 if ((((unsigned char *)p
)[i
] & 0xc0) != 0x80)
178 for (i
= 1; i
< len
; ++i
) {
179 uint32_t ch
= ((unsigned char *)p
)[i
];
181 if ((ch
& 0xc0) != 0x80) {
192 if (UTF8_LENGTH(wc
) != len
)
199 * _cairo_utf8_get_char_validated:
201 * @unicode: location to store one Unicode character
203 * Decodes the first character of a valid UTF-8 string, and returns
204 * the number of bytes consumed.
206 * Note that the string should be valid. Do not use this without
207 * validating the string first.
209 * Returns: the number of bytes forming the character returned.
212 _cairo_utf8_get_char_validated (const char *p
,
215 int i
, mask
= 0, len
;
217 unsigned char c
= (unsigned char) *p
;
219 UTF8_COMPUTE (c
, mask
, len
);
222 *unicode
= (uint32_t)-1;
225 UTF8_GET (result
, p
, i
, mask
, len
);
233 * _cairo_utf8_to_ucs4:
234 * @str: a UTF-8 string
235 * @len: length of @str in bytes, or -1 if it is nul-terminated.
236 * If @len is supplied and the string has an embedded nul
237 * byte, only the portion before the nul byte is converted.
238 * @result: location to store a pointer to a newly allocated UTF-32
239 * string (always native endian), or %NULL. Free with free(). A 0
240 * word will be written after the last character.
241 * @items_written: location to store number of 32-bit words
242 * written. (Not including the trailing 0)
244 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
245 * with 1 32-bit word per character. The string is validated to
246 * consist entirely of valid Unicode characters.
248 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
249 * successfully converted. %CAIRO_STATUS_INVALID_STRING if an
250 * invalid sequence was found.
253 _cairo_utf8_to_ucs4 (const char *str
,
258 uint32_t *str32
= NULL
;
260 const unsigned char *in
;
261 const unsigned char * const ustr
= (const unsigned char *) str
;
265 while ((len
< 0 || ustr
+ len
- in
> 0) && *in
)
267 uint32_t wc
= _utf8_get_char_extended (in
, ustr
+ len
- in
);
268 if (wc
& 0x80000000 || !UNICODE_VALID (wc
))
269 return _cairo_error (CAIRO_STATUS_INVALID_STRING
);
272 if (n_chars
== INT_MAX
)
273 return _cairo_error (CAIRO_STATUS_INVALID_STRING
);
275 in
= UTF8_NEXT_CHAR (in
);
279 str32
= _cairo_malloc_ab (n_chars
+ 1, sizeof (uint32_t));
281 return _cairo_error (CAIRO_STATUS_NO_MEMORY
);
284 for (i
=0; i
< n_chars
; i
++) {
285 str32
[i
] = _utf8_get_char (in
);
286 in
= UTF8_NEXT_CHAR (in
);
294 *items_written
= n_chars
;
296 return CAIRO_STATUS_SUCCESS
;
300 * _cairo_ucs4_to_utf8:
301 * @unicode: a UCS-4 character
302 * @utf8: buffer to write the UTF-8 string into, or %NULL. If not %NULL,
303 * it must have at least 4 bytes of space available.
305 * Converts a single UCS-4 character to its UTF-8 encoding.
307 * Return value: Number of bytes in the utf8 string or 0 if an invalid
311 _cairo_ucs4_to_utf8 (uint32_t unicode
,
317 if (unicode
< 0x80) {
321 } else if (unicode
< 0x800) {
323 } else if (unicode
< 0x10000) {
325 } else if (unicode
< 0x200000) {
336 *--p
= 0x80 | (unicode
& 0x3f);
339 *p
|= 0xf0 << (4 - bytes
);
344 #if CAIRO_HAS_UTF8_TO_UTF16
346 * _cairo_utf8_to_utf16:
347 * @str: a UTF-8 string
348 * @len: length of @str in bytes, or -1 if it is nul-terminated.
349 * If @len is supplied and the string has an embedded nul
350 * byte, only the portion before the nul byte is converted.
351 * @result: location to store a pointer to a newly allocated UTF-16
352 * string (always native endian). Free with free(). A 0
353 * word will be written after the last character.
354 * @items_written: location to store number of 16-bit words
355 * written. (Not including the trailing 0)
357 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
358 * where characters are represented either as a single 16-bit word, or
359 * as a pair of 16-bit "surrogates". The string is validated to
360 * consist entirely of valid Unicode characters.
362 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
363 * successfully converted. %CAIRO_STATUS_INVALID_STRING if an
364 * invalid sequence was found.
367 _cairo_utf8_to_utf16 (const char *str
,
372 uint16_t *str16
= NULL
;
374 const unsigned char *in
;
375 const unsigned char * const ustr
= (const unsigned char *) str
;
379 while ((len
< 0 || ustr
+ len
- in
> 0) && *in
) {
380 uint32_t wc
= _utf8_get_char_extended (in
, ustr
+ len
- in
);
381 if (wc
& 0x80000000 || !UNICODE_VALID (wc
))
382 return _cairo_error (CAIRO_STATUS_INVALID_STRING
);
389 if (n16
== INT_MAX
- 1 || n16
== INT_MAX
)
390 return _cairo_error (CAIRO_STATUS_INVALID_STRING
);
392 in
= UTF8_NEXT_CHAR (in
);
395 str16
= _cairo_malloc_ab (n16
+ 1, sizeof (uint16_t));
397 return _cairo_error (CAIRO_STATUS_NO_MEMORY
);
400 for (i
= 0; i
< n16
;) {
401 uint32_t wc
= _utf8_get_char (in
);
406 str16
[i
++] = (wc
- 0x10000) / 0x400 + 0xd800;
407 str16
[i
++] = (wc
- 0x10000) % 0x400 + 0xdc00;
410 in
= UTF8_NEXT_CHAR (in
);
417 *items_written
= n16
;
419 return CAIRO_STATUS_SUCCESS
;