2 * "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $"
4 * This is the utf.c file from fltk2 adapted for use in my fltk1.1 port
6 /* Copyright 2006-2011 by Bill Spitzak and others.
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * Please report all bugs and problems on the following page:
25 * http://www.fltk.org/str.php
28 /* Modified to obey rfc3629, which limits unicode to 0-0x10ffff */
30 #include <FL/fl_utf8.h>
34 /** \addtogroup fl_unicode
41 \defgroup fl_unichar Unicode Character Functions
42 Global Functions Handling Single Unicode Characters
46 Converts a Unicode character into a utf-8 sequence.
47 \param[in] uc Unicode character
48 \param[out] text utf-8 sequence will be written here; if this pointer is
49 \c NULL, only the length of the utf-8 sequence is calculated
50 \return length of the sequence in bytes
52 /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
57 \defgroup fl_utf8 Unicode String Functions
58 Global Functions Handling Unicode Text
62 Calculate the size of a utf-8 sequence for a Unicode character.
63 \param[in] uc Unicode character
64 \return length of the sequence in bytes
66 /* FL_EXPORT int fl_utf8_size(unsigned int uc); */
71 /*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is to zero
72 they are instead turned into the Unicode REPLACEMENT CHARACTER, of
74 If this is on fl_utf8decode() will correctly map most (perhaps all)
75 human-readable text that is in ISO-8859-1. This may allow you
76 to completely ignore character sets in your code because virtually
77 everything is either ISO-8859-1 or UTF-8.
79 #define ERRORS_TO_ISO8859_1 1
81 /*!Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
82 Unicode index for Microsoft's CP1252 character set. You should
83 also set ERRORS_TO_ISO8859_1. With this a huge amount of more
84 available text (such as all web pages) are correctly converted
87 #define ERRORS_TO_CP1252 1
89 /*!A number of Unicode code points are in fact illegal and should not
90 be produced by a UTF-8 converter. Turn this on will replace the
91 bytes in those encodings with errors. If you do this then converting
92 arbitrary 16-bit data to UTF-8 and then back is not an identity,
93 which will probably break a lot of software.
95 #define STRICT_RFC3629 0
98 /* Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
101 static unsigned short cp1252
[32] = {
102 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
103 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
104 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
105 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
109 /*! Decode a single UTF-8 encoded character starting at \e p. The
110 resulting Unicode value (in the range 0-0x10ffff) is returned,
111 and \e len is set to the number of bytes in the UTF-8 encoding
112 (adding \e len to \e p will point at the next character).
114 If \p p points at an illegal UTF-8 encoding, including one that
115 would go past \e end, or where a code is uses more bytes than
116 necessary, then *(unsigned char*)p is translated as though it is
117 in the Microsoft CP1252 character set and \e len is set to 1.
118 Treating errors this way allows this to decode almost any
119 ISO-8859-1 or CP1252 text that has been mistakenly placed where
120 UTF-8 is expected, and has proven very useful.
122 If you want errors to be converted to error characters (as the
123 standards recommend), adding a test to see if the length is
124 unexpectedly 1 will work:
127 if (*p & 0x80) { // what should be a multibyte encoding
128 code = fl_utf8decode(p,end,&len);
129 if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
130 } else { // handle the 1-byte utf8 encoding:
136 Direct testing for the 1-byte case (as shown above) will also
137 speed up the scanning of strings where the majority of characters
140 unsigned fl_utf8decode(const char* p
, const char* end
, int* len
)
142 unsigned char c
= *(unsigned char*)p
;
147 } else if (c
< 0xa0) {
149 return cp1252
[c
-0x80];
151 } else if (c
< 0xc2) {
154 if ( (end
&& p
+1 >= end
) || (p
[1]&0xc0) != 0x80) goto FAIL
;
158 ((p
[0] & 0x1f) << 6) +
160 } else if (c
== 0xe0) {
161 if (((unsigned char*)p
)[1] < 0xa0) goto FAIL
;
164 } else if (c
== 0xed) {
165 /* RFC 3629 says surrogate chars are illegal. */
166 if (((unsigned char*)p
)[1] >= 0xa0) goto FAIL
;
168 } else if (c
== 0xef) {
169 /* 0xfffe and 0xffff are also illegal characters */
170 if (((unsigned char*)p
)[1]==0xbf &&
171 ((unsigned char*)p
)[2]>=0xbe) goto FAIL
;
174 } else if (c
< 0xf0) {
176 if ( (end
&& p
+2 >= end
) || (p
[2]&0xc0) != 0x80) goto FAIL
;
179 ((p
[0] & 0x0f) << 12) +
180 ((p
[1] & 0x3f) << 6) +
182 } else if (c
== 0xf0) {
183 if (((unsigned char*)p
)[1] < 0x90) goto FAIL
;
185 } else if (c
< 0xf4) {
187 if ( (end
&& p
+3 >= end
) || (p
[2]&0xc0) != 0x80 || (p
[3]&0xc0) != 0x80) goto FAIL
;
190 /* RFC 3629 says all codes ending in fffe or ffff are illegal: */
191 if ((p
[1]&0xf)==0xf &&
192 ((unsigned char*)p
)[2] == 0xbf &&
193 ((unsigned char*)p
)[3] >= 0xbe) goto FAIL
;
196 ((p
[0] & 0x07) << 18) +
197 ((p
[1] & 0x3f) << 12) +
198 ((p
[2] & 0x3f) << 6) +
200 } else if (c
== 0xf4) {
201 if (((unsigned char*)p
)[1] > 0x8f) goto FAIL
; /* after 0x10ffff */
206 #if ERRORS_TO_ISO8859_1
209 return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
214 /*! Move \p p forward until it points to the start of a UTF-8
215 character. If it already points at the start of one then it
216 is returned unchanged. Any UTF-8 errors are treated as though each
217 byte of the error is an individual character.
219 \e start is the start of the string and is used to limit the
220 backwards search for the start of a utf8 character.
222 \e end is the end of the string and is assumed to be a break
223 between characters. It is assumed to be greater than p.
225 This function is for moving a pointer that was jumped to the
226 middle of a string, such as when doing a binary search for
227 a position. You should use either this or fl_utf8back() depending
228 on which direction your algorithim can handle the pointer
229 moving. Do not use this to scan strings, use fl_utf8decode()
232 const char* fl_utf8fwd(const char* p
, const char* start
, const char* end
)
236 /* if we are not pointing at a continuation character, we are done: */
237 if ((*p
&0xc0) != 0x80) return p
;
238 /* search backwards for a 0xc0 starting the character: */
239 for (a
= p
-1; ; --a
) {
240 if (a
< start
) return p
;
241 if (!(a
[0]&0x80)) return p
;
242 if ((a
[0]&0x40)) break;
244 fl_utf8decode(a
,end
,&len
);
250 /*! Move \p p backward until it points to the start of a UTF-8
251 character. If it already points at the start of one then it
252 is returned unchanged. Any UTF-8 errors are treated as though each
253 byte of the error is an individual character.
255 \e start is the start of the string and is used to limit the
256 backwards search for the start of a UTF-8 character.
258 \e end is the end of the string and is assumed to be a break
259 between characters. It is assumed to be greater than p.
261 If you wish to decrement a UTF-8 pointer, pass p-1 to this.
263 const char* fl_utf8back(const char* p
, const char* start
, const char* end
)
267 /* if we are not pointing at a continuation character, we are done: */
268 if ((*p
&0xc0) != 0x80) return p
;
269 /* search backwards for a 0xc0 starting the character: */
270 for (a
= p
-1; ; --a
) {
271 if (a
< start
) return p
;
272 if (!(a
[0]&0x80)) return p
;
273 if ((a
[0]&0x40)) break;
275 fl_utf8decode(a
,end
,&len
);
276 if (a
+len
> p
) return a
;
280 /*! Returns number of bytes that utf8encode() will use to encode the
282 int fl_utf8bytes(unsigned ucs
) {
283 if (ucs
< 0x000080U
) {
285 } else if (ucs
< 0x000800U
) {
287 } else if (ucs
< 0x010000U
) {
289 } else if (ucs
<= 0x10ffffU
) {
292 return 3; /* length of the illegal character encoding */
296 /*! Write the UTF-8 encoding of \e ucs into \e buf and return the
297 number of bytes written. Up to 4 bytes may be written. If you know
298 that \p ucs is less than 0x10000 then at most 3 bytes will be written.
299 If you wish to speed this up, remember that anything less than 0x80
300 is written as a single byte.
302 If ucs is greater than 0x10ffff this is an illegal character
303 according to RFC 3629. These are converted as though they are
304 0xFFFD (REPLACEMENT CHARACTER).
306 RFC 3629 also says many other values for \p ucs are illegal (in
307 the range 0xd800 to 0xdfff, or ending with 0xfffe or
308 0xffff). However I encode these as though they are legal, so that
309 utf8encode/fl_utf8decode will be the identity for all codes between 0
312 int fl_utf8encode(unsigned ucs
, char* buf
) {
313 if (ucs
< 0x000080U
) {
316 } else if (ucs
< 0x000800U
) {
317 buf
[0] = 0xc0 | (ucs
>> 6);
318 buf
[1] = 0x80 | (ucs
& 0x3F);
320 } else if (ucs
< 0x010000U
) {
321 buf
[0] = 0xe0 | (ucs
>> 12);
322 buf
[1] = 0x80 | ((ucs
>> 6) & 0x3F);
323 buf
[2] = 0x80 | (ucs
& 0x3F);
325 } else if (ucs
<= 0x0010ffffU
) {
326 buf
[0] = 0xf0 | (ucs
>> 18);
327 buf
[1] = 0x80 | ((ucs
>> 12) & 0x3F);
328 buf
[2] = 0x80 | ((ucs
>> 6) & 0x3F);
329 buf
[3] = 0x80 | (ucs
& 0x3F);
340 /*! Convert a single 32-bit Unicode codepoint into an array of 16-bit
341 characters. These are used by some system calls, especially on Windows.
343 \p ucs is the value to convert.
345 \p dst points at an array to write, and \p dstlen is the number of
346 locations in this array. At most \p dstlen words will be
347 written, and a 0 terminating word will be added if \p dstlen is
348 large enough. Thus this function will never overwrite the buffer
349 and will attempt return a zero-terminated string if space permits.
350 If \p dstlen is zero then \p dst can be set to NULL and no data
351 is written, but the length is returned.
353 The return value is the number of 16-bit words that \e would be written
354 to \p dst if it is large enough, not counting any terminating
357 If the return value is greater than \p dstlen it indicates truncation,
358 you should then allocate a new array of size return+1 and call this again.
360 Unicode characters in the range 0x10000 to 0x10ffff are converted to
361 "surrogate pairs" which take two words each (in UTF-16 encoding).
362 Typically, setting \p dstlen to 2 will ensure that any valid Unicode
363 value can be converted, and setting \p dstlen to 3 or more will allow
364 a NULL terminated sequence to be returned.
366 unsigned fl_ucs_to_Utf16(const unsigned ucs
, unsigned short *dst
, const unsigned dstlen
)
368 /* The rule for direct conversion from UCS to UTF16 is:
369 * - if UCS > 0x0010FFFF then UCS is invalid
370 * - if UCS >= 0xD800 && UCS <= 0xDFFF UCS is invalid
371 * - if UCS <= 0x0000FFFF then U16 = UCS, len = 1
373 * -- U16[0] = ((UCS - 0x00010000) >> 10) & 0x3FF + 0xD800
374 * -- U16[1] = (UCS & 0x3FF) + 0xDC00
377 unsigned count
; /* Count of converted UTF16 cells */
378 unsigned short u16
[4]; /* Alternate buffer if dst is not set */
379 unsigned short *out
; /* points to the active buffer */
380 /* Ensure we have a valid buffer to write to */
381 if((!dstlen
) || (!dst
)) {
386 /* Convert from UCS to UTF16 */
387 if((ucs
> 0x0010FFFF) || /* UCS is too large */
388 ((ucs
> 0xD7FF) && (ucs
< 0xE000))) { /* UCS in invalid range */
389 out
[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
391 } else if(ucs
< 0x00010000) {
392 out
[0] = (unsigned short)ucs
;
394 } else if(dstlen
< 2) { /* dst is too small for the result */
395 out
[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
398 out
[0] = (((ucs
- 0x00010000) >> 10) & 0x3FF) + 0xD800;
399 out
[1] = (ucs
& 0x3FF) + 0xDC00;
402 /* NULL terminate the output, if there is space */
403 if(count
< dstlen
) { out
[count
] = 0; }
405 } /* fl_ucs_to_Utf16 */
407 /*! Convert a UTF-8 sequence into an array of 16-bit characters. These
408 are used by some system calls, especially on Windows.
410 \p src points at the UTF-8, and \p srclen is the number of bytes to
413 \p dst points at an array to write, and \p dstlen is the number of
414 locations in this array. At most \p dstlen-1 words will be
415 written there, plus a 0 terminating word. Thus this function
416 will never overwrite the buffer and will always return a
417 zero-terminated string. If \p dstlen is zero then \p dst can be
418 null and no data is written, but the length is returned.
420 The return value is the number of 16-bit words that \e would be written
421 to \p dst if it were long enough, not counting the terminating
422 zero. If the return value is greater or equal to \p dstlen it
423 indicates truncation, you can then allocate a new array of size
424 return+1 and call this again.
426 Errors in the UTF-8 are converted as though each byte in the
427 erroneous string is in the Microsoft CP1252 encoding. This allows
428 ISO-8859-1 text mistakenly identified as UTF-8 to be printed
431 Unicode characters in the range 0x10000 to 0x10ffff are converted to
432 "surrogate pairs" which take two words each (this is called UTF-16
435 unsigned fl_utf8toUtf16(const char* src
, unsigned srclen
,
436 unsigned short* dst
, unsigned dstlen
)
439 const char* e
= src
+srclen
;
441 if (dstlen
) for (;;) {
442 if (p
>= e
) {dst
[count
] = 0; return count
;}
443 if (!(*p
& 0x80)) { /* ascii */
446 int len
; unsigned ucs
= fl_utf8decode(p
,e
,&len
);
451 /* make a surrogate pair: */
452 if (count
+2 >= dstlen
) {dst
[count
] = 0; count
+= 2; break;}
453 dst
[count
] = (((ucs
-0x10000u
)>>10)&0x3ff) | 0xd800;
454 dst
[++count
] = (ucs
&0x3ff) | 0xdc00;
457 if (++count
== dstlen
) {dst
[count
-1] = 0; break;}
459 /* we filled dst, measure the rest: */
461 if (!(*p
& 0x80)) p
++;
463 int len
; unsigned ucs
= fl_utf8decode(p
,e
,&len
);
465 if (ucs
>= 0x10000) ++count
;
474 Converts a UTF-8 string into a wide character string.
476 This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
477 on Windows where it is equivalent to fl_utf8toUtf16 and returns
480 \p src points at the UTF-8, and \p srclen is the number of bytes to
483 \p dst points at an array to write, and \p dstlen is the number of
484 locations in this array. At most \p dstlen-1 wchar_t will be
485 written there, plus a 0 terminating wchar_t.
487 The return value is the number of wchar_t that \e would be written
488 to \p dst if it were long enough, not counting the terminating
489 zero. If the return value is greater or equal to \p dstlen it
490 indicates truncation, you can then allocate a new array of size
491 return+1 and call this again.
493 Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
494 and most other systems. Where wchar_t is 16 bits, Unicode
495 characters in the range 0x10000 to 0x10ffff are converted to
496 "surrogate pairs" which take two words each (this is called UTF-16
497 encoding). If wchar_t is 32 bits this rather nasty problem is
500 Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
501 layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
503 unsigned fl_utf8towc(const char* src
, unsigned srclen
,
504 wchar_t* dst
, unsigned dstlen
)
506 #if defined(WIN32) || defined(__CYGWIN__)
507 return fl_utf8toUtf16(src
, srclen
, (unsigned short*)dst
, dstlen
);
510 const char* e
= src
+srclen
;
512 if (dstlen
) for (;;) {
517 if (!(*p
& 0x80)) { /* ascii */
520 int len
; unsigned ucs
= fl_utf8decode(p
,e
,&len
);
522 dst
[count
] = (wchar_t)ucs
;
524 if (++count
== dstlen
) {dst
[count
-1] = 0; break;}
526 /* we filled dst, measure the rest: */
528 if (!(*p
& 0x80)) p
++;
530 int len
; fl_utf8decode(p
,e
,&len
);
539 /*! Convert a UTF-8 sequence into an array of 1-byte characters.
541 If the UTF-8 decodes to a character greater than 0xff then it is
544 Errors in the UTF-8 are converted as individual bytes, same as
545 fl_utf8decode() does. This allows ISO-8859-1 text mistakenly identified
546 as UTF-8 to be printed correctly (and possibly CP1512 on Windows).
548 \p src points at the UTF-8, and \p srclen is the number of bytes to
551 Up to \p dstlen bytes are written to \p dst, including a null
552 terminator. The return value is the number of bytes that would be
553 written, not counting the null terminator. If greater or equal to
554 \p dstlen then if you malloc a new array of size n+1 you will have
555 the space needed for the entire string. If \p dstlen is zero then
556 nothing is written and this call just measures the storage space
559 unsigned fl_utf8toa(const char* src
, unsigned srclen
,
560 char* dst
, unsigned dstlen
)
563 const char* e
= src
+srclen
;
565 if (dstlen
) for (;;) {
567 if (p
>= e
) {dst
[count
] = 0; return count
;}
568 c
= *(unsigned char*)p
;
569 if (c
< 0xC2) { /* ascii or bad code */
573 int len
; unsigned ucs
= fl_utf8decode(p
,e
,&len
);
575 if (ucs
< 0x100) dst
[count
] = ucs
;
576 else dst
[count
] = '?';
578 if (++count
>= dstlen
) {dst
[count
-1] = 0; break;}
580 /* we filled dst, measure the rest: */
582 if (!(*p
& 0x80)) p
++;
585 fl_utf8decode(p
,e
,&len
);
593 /*! Turn "wide characters" as returned by some system calls
594 (especially on Windows) into UTF-8.
596 Up to \p dstlen bytes are written to \p dst, including a null
597 terminator. The return value is the number of bytes that would be
598 written, not counting the null terminator. If greater or equal to
599 \p dstlen then if you malloc a new array of size n+1 you will have
600 the space needed for the entire string. If \p dstlen is zero then
601 nothing is written and this call just measures the storage space
604 \p srclen is the number of words in \p src to convert. On Windows
605 this is not necessarily the number of characters, due to there
606 possibly being "surrogate pairs" in the UTF-16 encoding used.
607 On Unix wchar_t is 32 bits and each location is a character.
609 On Unix if a \p src word is greater than 0x10ffff then this is an
610 illegal character according to RFC 3629. These are converted as
611 though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
612 range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
613 illegal according to RFC 3629. However I encode these as though
614 they are legal, so that fl_utf8towc will return the original data.
616 On Windows "surrogate pairs" are converted to a single character
617 and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
618 pairs are converted as though they are individual characters.
620 unsigned fl_utf8fromwc(char* dst
, unsigned dstlen
,
621 const wchar_t* src
, unsigned srclen
) {
624 if (dstlen
) for (;;) {
626 if (i
>= srclen
) {dst
[count
] = 0; return count
;}
630 if (count
>= dstlen
) {dst
[count
-1] = 0; break;}
631 } else if (ucs
< 0x800U
) { /* 2 bytes */
632 if (count
+2 >= dstlen
) {dst
[count
] = 0; count
+= 2; break;}
633 dst
[count
++] = 0xc0 | (ucs
>> 6);
634 dst
[count
++] = 0x80 | (ucs
& 0x3F);
635 #if defined(WIN32) || defined(__CYGWIN__)
636 } else if (ucs
>= 0xd800 && ucs
<= 0xdbff && i
< srclen
&&
637 src
[i
] >= 0xdc00 && src
[i
] <= 0xdfff) {
639 unsigned ucs2
= src
[i
++];
640 ucs
= 0x10000U
+ ((ucs
&0x3ff)<<10) + (ucs2
&0x3ff);
641 /* all surrogate pairs turn into 4-byte utf8 */
643 } else if (ucs
>= 0x10000) {
644 if (ucs
> 0x10ffff) {
649 if (count
+4 >= dstlen
) {dst
[count
] = 0; count
+= 4; break;}
650 dst
[count
++] = 0xf0 | (ucs
>> 18);
651 dst
[count
++] = 0x80 | ((ucs
>> 12) & 0x3F);
652 dst
[count
++] = 0x80 | ((ucs
>> 6) & 0x3F);
653 dst
[count
++] = 0x80 | (ucs
& 0x3F);
655 #if !(defined(WIN32) || defined(__CYGWIN__))
658 /* all others are 3 bytes: */
659 if (count
+3 >= dstlen
) {dst
[count
] = 0; count
+= 3; break;}
660 dst
[count
++] = 0xe0 | (ucs
>> 12);
661 dst
[count
++] = 0x80 | ((ucs
>> 6) & 0x3F);
662 dst
[count
++] = 0x80 | (ucs
& 0x3F);
665 /* we filled dst, measure the rest: */
667 unsigned ucs
= src
[i
++];
670 } else if (ucs
< 0x800U
) { /* 2 bytes */
672 #if defined(WIN32) || defined(__CYGWIN__)
673 } else if (ucs
>= 0xd800 && ucs
<= 0xdbff && i
< srclen
-1 &&
674 src
[i
+1] >= 0xdc00 && src
[i
+1] <= 0xdfff) {
678 } else if (ucs
>= 0x10000 && ucs
<= 0x10ffff) {
688 /*! Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
690 It is possible this should convert Microsoft's CP1252 to UTF-8
691 instead. This would translate the codes in the range 0x80-0x9f
692 to different characters. Currently it does not do this.
694 Up to \p dstlen bytes are written to \p dst, including a null
695 terminator. The return value is the number of bytes that would be
696 written, not counting the null terminator. If greater or equal to
697 \p dstlen then if you malloc a new array of size n+1 you will have
698 the space needed for the entire string. If \p dstlen is zero then
699 nothing is written and this call just measures the storage space
702 \p srclen is the number of bytes in \p src to convert.
704 If the return value equals \p srclen then this indicates that
705 no conversion is necessary, as only ASCII characters are in the
708 unsigned fl_utf8froma(char* dst
, unsigned dstlen
,
709 const char* src
, unsigned srclen
) {
711 const char* e
= src
+srclen
;
713 if (dstlen
) for (;;) {
715 if (p
>= e
) {dst
[count
] = 0; return count
;}
716 ucs
= *(unsigned char*)p
++;
719 if (count
>= dstlen
) {dst
[count
-1] = 0; break;}
720 } else { /* 2 bytes (note that CP1252 translate could make 3 bytes!) */
721 if (count
+2 >= dstlen
) {dst
[count
] = 0; count
+= 2; break;}
722 dst
[count
++] = 0xc0 | (ucs
>> 6);
723 dst
[count
++] = 0x80 | (ucs
& 0x3F);
726 /* we filled dst, measure the rest: */
728 unsigned char ucs
= *(unsigned char*)p
++;
739 # include <windows.h>
742 /*! Return true if the "locale" seems to indicate that UTF-8 encoding
743 is used. If true the fl_utf8to_mb and fl_utf8from_mb don't do anything
746 <i>It is highly recommended that you change your system so this
747 does return true.</i> On Windows this is done by setting the
748 "codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
749 to a string containing the letters "utf" or "UTF" in it, or by
750 deleting all $LC* and $LANG environment variables. In the future
751 it is likely that all non-Asian Unix systems will return true,
752 due to the compatibility of UTF-8 with ISO-8859-1.
754 int fl_utf8locale(void) {
758 ret
= GetACP() == CP_UTF8
;
761 ret
= 1; /* assume UTF-8 if no locale */
762 if (((s
= getenv("LC_CTYPE")) && *s
) ||
763 ((s
= getenv("LC_ALL")) && *s
) ||
764 ((s
= getenv("LANG")) && *s
)) {
765 ret
= (strstr(s
,"utf") || strstr(s
,"UTF"));
772 /*! Convert the UTF-8 used by FLTK to the locale-specific encoding
773 used for filenames (and sometimes used for data in files).
774 Unfortunately due to stupid design you will have to do this as
775 needed for filenames. This is a bug on both Unix and Windows.
777 Up to \p dstlen bytes are written to \p dst, including a null
778 terminator. The return value is the number of bytes that would be
779 written, not counting the null terminator. If greater or equal to
780 \p dstlen then if you malloc a new array of size n+1 you will have
781 the space needed for the entire string. If \p dstlen is zero then
782 nothing is written and this call just measures the storage space
785 If fl_utf8locale() returns true then this does not change the data.
787 unsigned fl_utf8to_mb(const char* src
, unsigned srclen
,
788 char* dst
, unsigned dstlen
)
790 if (!fl_utf8locale()) {
794 unsigned length
= fl_utf8towc(src
, srclen
, buf
, 1024);
796 if (length
>= 1024) {
797 buf
= (wchar_t*)(malloc((length
+1)*sizeof(wchar_t)));
798 fl_utf8towc(src
, srclen
, buf
, length
+1);
801 /* apparently this does not null-terminate, even though msdn
802 * documentation claims it does:
805 WideCharToMultiByte(GetACP(), 0, buf
, length
, dst
, dstlen
, 0, 0);
808 /* if it overflows or measuring length, get the actual length: */
809 if (dstlen
==0 || ret
>= dstlen
-1)
811 WideCharToMultiByte(GetACP(), 0, buf
, length
, 0, 0, 0, 0);
812 if (buf
!= lbuf
) free((void*)buf
);
817 unsigned length
= fl_utf8towc(src
, srclen
, buf
, 1024);
819 if (length
>= 1024) {
820 buf
= (wchar_t*)(malloc((length
+1)*sizeof(wchar_t)));
821 fl_utf8towc(src
, srclen
, buf
, length
+1);
824 ret
= wcstombs(dst
, buf
, dstlen
);
825 if (ret
>= dstlen
-1) ret
= wcstombs(0,buf
,0);
827 ret
= wcstombs(0,buf
,0);
829 if (buf
!= lbuf
) free((void*)buf
);
830 if (ret
>= 0) return (unsigned)ret
;
831 /* on any errors we return the UTF-8 as raw text...*/
834 /* identity transform: */
835 if (srclen
< dstlen
) {
836 memcpy(dst
, src
, srclen
);
839 /* Buffer insufficent or buffer query */
844 /*! Convert a filename from the locale-specific multibyte encoding
845 used by Windows to UTF-8 as used by FLTK.
847 Up to \p dstlen bytes are written to \p dst, including a null
848 terminator. The return value is the number of bytes that would be
849 written, not counting the null terminator. If greater or equal to
850 \p dstlen then if you malloc a new array of size n+1 you will have
851 the space needed for the entire string. If \p dstlen is zero then
852 nothing is written and this call just measures the storage space
855 On Unix or on Windows when a UTF-8 locale is in effect, this
856 does not change the data.
857 You may also want to check if fl_utf8test() returns non-zero, so that
858 the filesystem can store filenames in UTF-8 encoding regardless of
861 unsigned fl_utf8from_mb(char* dst
, unsigned dstlen
,
862 const char* src
, unsigned srclen
)
864 if (!fl_utf8locale()) {
870 length
= MultiByteToWideChar(GetACP(), 0, src
, srclen
, buf
, 1024);
871 if ((length
== 0)&&(GetLastError()==ERROR_INSUFFICIENT_BUFFER
)) {
872 length
= MultiByteToWideChar(GetACP(), 0, src
, srclen
, 0, 0);
873 buf
= (wchar_t*)(malloc(length
*sizeof(wchar_t)));
874 MultiByteToWideChar(GetACP(), 0, src
, srclen
, buf
, length
);
876 ret
= fl_utf8fromwc(dst
, dstlen
, buf
, length
);
877 if (buf
!= lbuf
) free((void*)buf
);
884 length
= mbstowcs(buf
, src
, 1024);
885 if (length
>= 1024) {
886 length
= mbstowcs(0, src
, 0)+1;
887 buf
= (wchar_t*)(malloc(length
*sizeof(wchar_t)));
888 mbstowcs(buf
, src
, length
);
891 ret
= fl_utf8fromwc(dst
, dstlen
, buf
, length
);
892 if (buf
!= lbuf
) free((void*)buf
);
895 /* errors in conversion return the UTF-8 unchanged */
898 /* identity transform: */
899 if (srclen
< dstlen
) {
900 memcpy(dst
, src
, srclen
);
903 /* Buffer insufficent or buffer query */
908 /*! Examines the first \p srclen bytes in \p src and returns a verdict
909 on whether it is UTF-8 or not.
910 - Returns 0 if there is any illegal UTF-8 sequences, using the
911 same rules as fl_utf8decode(). Note that some UCS values considered
912 illegal by RFC 3629, such as 0xffff, are considered legal by this.
913 - Returns 1 if there are only single-byte characters (ie no bytes
914 have the high bit set). This is legal UTF-8, but also indicates
915 plain ASCII. It also returns 1 if \p srclen is zero.
916 - Returns 2 if there are only characters less than 0x800.
917 - Returns 3 if there are only characters less than 0x10000.
918 - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.
920 Because there are many illegal sequences in UTF-8, it is almost
921 impossible for a string in another encoding to be confused with
922 UTF-8. This is very useful for transitioning Unix to UTF-8
923 filenames, you can simply test each filename with this to decide
924 if it is UTF-8 or in the locale encoding. My hope is that if
925 this is done we will be able to cleanly transition to a locale-less
928 int fl_utf8test(const char* src
, unsigned srclen
) {
931 const char* e
= src
+srclen
;
934 int len
; fl_utf8decode(p
,e
,&len
);
935 if (len
< 2) return 0;
936 if (len
> ret
) ret
= len
;
945 /* forward declare mk_wcwidth() as static so the name is not visible.
947 static int mk_wcwidth(unsigned int ucs
);
949 /* include the c source directly so it's contents are only visible here
951 #include "xutf8/mk_wcwidth.c"
953 /** wrapper to adapt Markus Kuhn's implementation of wcwidth() for FLTK
954 \param [in] ucs Unicode character value
955 \returns width of character in columns
957 See http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for Markus Kuhn's
958 original implementation of wcwidth() and wcswidth()
959 (defined in IEEE Std 1002.1-2001) for Unicode.
961 \b WARNING: this function returns widths for "raw" Unicode characters.
962 It does not even try to map C1 control characters (0x80 to 0x9F) to
963 CP1252, and C0/C1 control characters and DEL will return -1.
964 You are advised to use fl_width(const char* src) instead.
966 int fl_wcwidth_(unsigned int ucs
) {
967 return mk_wcwidth(ucs
);
970 /** extended wrapper around fl_wcwidth_(unsigned int ucs) function.
971 \param[in] src pointer to start of UTF-8 byte sequence
972 \returns width of character in columns
974 Depending on build options, this function may map C1 control
975 characters (0x80 to 0x9f) to CP1252, and return the width of
976 that character instead. This is not the same behaviour as
977 fl_wcwidth_(unsigned int ucs) .
979 Note that other control characters and DEL will still return -1,
980 so if you want different behaviour, you need to test for those
981 characters before calling fl_wcwidth(), and handle them separately.
983 int fl_wcwidth(const char* src
) {
984 int len
= fl_utf8len(*src
);
986 unsigned int ucs
= fl_utf8decode(src
, src
+len
, &ret
);
987 int width
= fl_wcwidth_(ucs
);
994 * End of "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $".