/*
  Copyright (C) 2005 and later, Cockos Incorporated

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/
/* todo: handle overlongs?
 * todo: handle multi-byte (make WideStr support UTF-16)
 */

// WDL_WCHAR is the wide-character element type used by the wide-string
// conversion helpers in this file. It may be predefined by the includer;
// otherwise it is WCHAR on Windows (expected to come from the Windows
// headers included before this point) and wchar_t everywhere else.
#ifndef WDL_WCHAR
  #ifdef _WIN32
    #define WDL_WCHAR WCHAR
  #else
    #define WDL_WCHAR wchar_t
  #endif
#endif
// Decodes one UTF-8 sequence starting at rd.
// returns size in bytes (1..6), sets cOut to code point.
// if invalid UTF-8 (stray continuation byte, truncated sequence, or overlong
// encoding), returns 1 and sets cOut to first character (as unsigned char).
// 5 and 6 byte forms are consumed whole and reported as '_'.
// No check is made for surrogates or for values above 0x10FFFF.
// rd must be NUL-terminated: a NUL is never a continuation byte, so the
// look-ahead below never reads past the terminator.
// cOut may be NULL if you only want the size of the character
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
{
  const unsigned char *p = (const unsigned char *)rd;
  const unsigned char b0 = *p;
  unsigned char b1,b2,b3;

  // default result for every "return 1" below: the raw first byte
  if (cOut) *cOut = b0;
  if (b0 < 0x80)
  {
    return 1; // plain ASCII (including the terminating NUL)
  }
  if (b0 < 0xC0) return 1; // continuation byte cannot start a sequence

  if (((b1=p[1])&0xC0) != 0x80) return 1;

  if (b0 < 0xE0)
  {
    if (!(b0&0x1E)) return 1; // detect overlong
    if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
    return 2;
  }

  if (((b2=p[2])&0xC0) != 0x80) return 1;

  if (b0 < 0xF0)
  {
    if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong

    if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3f);
    return 3;
  }

  if (((b3=p[3])&0xC0) != 0x80) return 1;

  if (b0 < 0xF8)
  {
    if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong

    if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
    return 4;
  }

  // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  // skip them and return _
  if ((p[4]&0xC0) != 0x80) return 1;
  if (b0 < 0xFC)
  {
    if (cOut) *cOut = '_';
    return 5;
  }

  if ((p[5]&0xC0) != 0x80) return 1;
  if (cOut) *cOut = '_';
  return 6;
}
// makes a character: encodes code point c as UTF-8 into dest, which has room
// for dest_len bytes. returns length written (1..4). does NOT nul terminate.
// returns 0 if insufficient space (nothing is written),
// -1 if out of range value (negative, or too large for a 4-byte sequence)
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
{
  if (c < 0) return -1; // out of range character

  if (c < 0x80)
  {
    if (dest_len<1) return 0;
    dest[0]=(char)c;
    return 1;
  }
  if (c < 0x800)
  {
    if (dest_len < 2) return 0;

    // explicit casts: the byte values exceed CHAR_MAX where char is signed
    dest[0]=(char)(0xC0|(c>>6));
    dest[1]=(char)(0x80|(c&0x3F));
    return 2;
  }
  if (c < 0x10000)
  {
    if (dest_len < 3) return 0;

    dest[0]=(char)(0xE0|(c>>12));
    dest[1]=(char)(0x80|((c>>6)&0x3F));
    dest[2]=(char)(0x80|(c&0x3F));
    return 3;
  }
  if (c < 0x200000) // largest value whose top bits fit in the 0xF0 lead byte
  {
    if (dest_len < 4) return 0;
    dest[0]=(char)(0xF0|(c>>18));
    dest[1]=(char)(0x80|((c>>12)&0x3F));
    dest[2]=(char)(0x80|((c>>6)&0x3F));
    dest[3]=(char)(0x80|(c&0x3F));
    return 4;
  }

  return -1;
}
142 // invalid UTF-8 are now treated as ANSI characters for this function
143 static int WDL_STATICFUNC_UNUSED
WDL_MBtoWideStr(WDL_WCHAR
*dest
, const char *src
, int destlenbytes
)
145 WDL_WCHAR
*w
= dest
, *dest_endp
= dest
+(size_t)destlenbytes
/sizeof(WDL_WCHAR
)-1;
146 if (!dest
|| destlenbytes
< 1) return 0;
148 if (src
) for (; *src
&& w
< dest_endp
; )
150 int c
,sz
=wdl_utf8_parsechar(src
,&c
);
155 return (int)(w
-dest
);
159 // like wdl_utf8_makechar, except nul terminates and handles errors differently (returns _ and 1 on errors)
160 // negative values for character are treated as 0.
161 static int WDL_STATICFUNC_UNUSED
WDL_MakeUTFChar(char* dest
, int c
, int destlen
)
165 if (destlen
== 1) dest
[0]=0;
170 const int v
= wdl_utf8_makechar(c
>0?c
:0,dest
,destlen
-1);
171 if (v
< 1) // implies either insufficient space or out of range character
182 static int WDL_STATICFUNC_UNUSED
WDL_WideToMBStr(char *dest
, const WDL_WCHAR
*src
, int destlenbytes
)
184 char *p
= dest
, *dest_endp
= dest
+ destlenbytes
- 1;
185 if (!dest
|| destlenbytes
< 1) return 0;
187 if (src
) while (*src
&& p
< dest_endp
)
189 const int v
= wdl_utf8_makechar(*src
++,p
,(int)(dest_endp
-p
));
194 else if (v
== 0) break; // out of space
197 return (int)(p
-dest
);
200 // returns >0 if UTF-8, -1 if 8-bit chars occur that are not UTF-8, or 0 if ASCII
201 static int WDL_STATICFUNC_UNUSED
WDL_DetectUTF8(const char *str
)
209 const unsigned char c
= *(const unsigned char *)str
;
211 if (c
< 0xC2 || c
> 0xF7)
213 if (!c
) return hasUTF
;
214 if (c
>= 0x80) return -1;
219 const int l
= wdl_utf8_parsechar(str
,NULL
);
220 if (l
< 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
228 static int WDL_STATICFUNC_UNUSED
WDL_utf8_charpos_to_bytepos(const char *str
, int charpos
)
231 while (charpos
-- > 0 && str
[bpos
])
233 bpos
+= wdl_utf8_parsechar(str
+bpos
,NULL
);
237 static int WDL_STATICFUNC_UNUSED
WDL_utf8_bytepos_to_charpos(const char *str
, int bytepos
)
239 int bpos
= 0, cpos
=0;
240 while (bpos
< bytepos
&& str
[bpos
])
242 bpos
+= wdl_utf8_parsechar(str
+bpos
,NULL
);
// number of characters in the NUL-terminated UTF-8 string rd: counts
// characters up to the largest possible byte offset, i.e. to the end of rd
#define WDL_utf8_get_charlen(rd) WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff)