Merge pull request #110 from tesselode/fixes
[wdl/wdl-ol.git] / WDL / wdlutf8.h
blob41e02ae9dc6db93e6669b2b6d0b9f3a2e392e864
/*
  WDL - wdlutf8.h
  Copyright (C) 2005 and later, Cockos Incorporated

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

*/
23 #ifndef _WDLUTF8_H_
24 #define _WDLUTF8_H_
/* todo: handle overlongs?
 * todo: handle multi-byte (make WideStr support UTF-16)
 */
30 #include "wdltypes.h"
32 #ifndef WDL_WCHAR
33 #ifdef _WIN32
34 #define WDL_WCHAR WCHAR
35 #else
36 #define WDL_WCHAR wchar_t
37 #endif
38 #endif
// decodes the UTF-8 sequence that starts at rd.
// returns size in bytes (1..6), sets cOut to code point.
// if invalid UTF-8, returns 1 and sets cOut to first character (as unsigned char).
// well-formed 5 and 6 byte sequences are skipped as a unit, with cOut set to '_'.
// cOut may be NULL if you only want the size of the character.
// each following byte is only read after the previous one was verified to be a
// continuation byte, so a nul terminator is never read past.
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
{
  const unsigned char *p = (const unsigned char *)rd;
  const unsigned char b0 = *p;
  unsigned char b1,b2,b3;

  if (cOut) *cOut = b0;
  if (b0 < 0x80)
  {
    return 1; // plain ASCII
  }

  // 0x80..0xBF are continuation bytes and can never start a sequence;
  // 0xC0 and 0xC1 could only start an overlong 2-byte encoding.
  if (b0 < 0xC2) return 1;

  if (((b1=p[1])&0xC0) != 0x80) return 1;

  if (b0 < 0xE0)
  {
    if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
    return 2;
  }

  if (((b2=p[2])&0xC0) != 0x80) return 1;

  if (b0 < 0xF0)
  {
    if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong

    if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3f);
    return 3;
  }

  if (((b3=p[3])&0xC0) != 0x80) return 1;

  if (b0 < 0xF8)
  {
    if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong

    if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
    return 4;
  }

  // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  // skip them and return _
  if ((p[4]&0xC0) != 0x80) return 1;
  if (b0 < 0xFC)
  {
    if (cOut) *cOut = '_';
    return 5;
  }

  if ((p[5]&0xC0) != 0x80) return 1;
  if (cOut) *cOut = '_';
  return 6;
}
// encodes code point c as UTF-8 into dest, returns the number of bytes written (1..4).
// does NOT nul terminate.
// returns -1 if c is out of range (negative or >= 0x200000), checked before the space test.
// returns 0 (writing nothing) if dest_len is too small for the encoding.
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
{
  int nbytes, i;
  int lead;

  if (c < 0 || c >= 0x200000) return -1; // out of range character

  // pick the encoded length and the marker bits of the leading byte
  if (c < 0x80)         { nbytes = 1; lead = 0x00; }
  else if (c < 0x800)   { nbytes = 2; lead = 0xC0; }
  else if (c < 0x10000) { nbytes = 3; lead = 0xE0; }
  else                  { nbytes = 4; lead = 0xF0; }

  if (dest_len < nbytes) return 0;

  // continuation bytes carry 6 bits each, filled from the last byte backwards
  for (i = nbytes-1; i > 0; i--)
  {
    dest[i] = (char)(0x80|(c&0x3F));
    c >>= 6;
  }

  // whatever is left of c fits in the payload bits of the leading byte
  dest[0] = (char)(lead|c);
  return nbytes;
}
// converts the nul terminated UTF-8 string src to wide characters in dest.
// destlenbytes is the size of dest in BYTES; the output is always nul terminated.
// returns the number of wide characters written, not counting the terminator.
// returns 0 without touching dest if dest is NULL or if dest cannot hold even
// the terminator. a NULL src produces an empty string.
// invalid UTF-8 are now treated as ANSI characters for this function
// note: the code point is assigned straight to WDL_WCHAR, so values that do not
// fit in WDL_WCHAR are truncated (no surrogate pairs are generated).
static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
{
  WDL_WCHAR *w, *dest_endp;

  // validate before doing any pointer arithmetic; a buffer smaller than one
  // WDL_WCHAR has no room for the terminator written below
  if (!dest || destlenbytes < (int)sizeof(WDL_WCHAR)) return 0;

  w = dest;
  dest_endp = dest+(size_t)destlenbytes/sizeof(WDL_WCHAR)-1; // last slot is reserved for the terminator

  if (src) for (; *src && w < dest_endp; )
  {
    int c;
    const int sz = wdl_utf8_parsechar(src,&c);
    *w++ = (WDL_WCHAR)c;
    src += sz;
  }

  *w=0;
  return (int)(w-dest);
}
// like wdl_utf8_makechar, except nul terminates and handles errors differently (returns _ and 1 on errors)
// negative values for character are treated as 0.
// destlen is the full size of dest including room for the terminator; returns 0
// if destlen < 2 (dest is set to an empty string when destlen is exactly 1).
static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char* dest, int c, int destlen)
{
  int len;

  if (destlen < 2)
  {
    // no room for a character; terminate if there is at least one byte
    if (destlen == 1) *dest = 0;
    return 0;
  }

  if (c < 0) c = 0;

  // keep one byte back for the terminator
  len = wdl_utf8_makechar(c,dest,destlen-1);
  if (len < 1) // implies either insufficient space or out of range character
  {
    dest[0] = '_';
    len = 1;
  }

  dest[len] = 0;
  return len;
}
// converts the nul terminated wide string src to UTF-8 in dest.
// destlenbytes is the size of dest in bytes; the output is always nul terminated.
// returns the number of bytes written, not counting the terminator, or 0 if
// dest is NULL or destlenbytes < 1. a NULL src produces an empty string.
// characters wdl_utf8_makechar rejects as out of range are skipped; conversion
// stops at the first character that no longer fits.
static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
{
  char *wr, *limit;

  if (!dest || destlenbytes < 1) return 0;

  wr = dest;
  limit = dest + (destlenbytes - 1); // last byte is reserved for the terminator

  if (src)
  {
    for (; *src && wr < limit; src++)
    {
      const int n = wdl_utf8_makechar(*src,wr,(int)(limit-wr));
      if (n == 0) break; // out of space
      if (n > 0) wr += n;
    }
  }

  *wr = 0;
  return (int)(wr-dest);
}
// returns >0 if UTF-8, -1 if 8-bit chars occur that are not UTF-8, or 0 if ASCII
// a NULL str is treated like an empty string (returns 0).
static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
{
  const unsigned char *rd = (const unsigned char *)str;
  int found = 0;

  if (!rd) return 0;

  while (*rd)
  {
    const unsigned char c = *rd;

    if (c >= 0xC2 && c <= 0xF7)
    {
      // possible lead byte of a 2-4 byte sequence
      const int len = wdl_utf8_parsechar((const char *)rd,NULL);
      if (len < 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
      rd += len;
      found = 1;
    }
    else if (c < 0x80)
    {
      rd++; // ASCII
    }
    else
    {
      return -1; // 8-bit value that cannot begin an accepted sequence
    }
  }

  return found;
}
// converts a character index into a byte offset within the UTF-8 string str.
// advances one wdl_utf8_parsechar() unit per character and stops early at the
// nul terminator, so the result never exceeds the string length.
// returns 0 if str is NULL or charpos <= 0.
static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
{
  int bpos = 0;

  if (!str) return 0; // NULL is treated as an empty string, as WDL_DetectUTF8 does

  while (charpos-- > 0 && str[bpos])
  {
    bpos += wdl_utf8_parsechar(str+bpos,NULL);
  }
  return bpos;
}
// converts a byte offset into a character index within the UTF-8 string str.
// counts one character per wdl_utf8_parsechar() unit that starts before bytepos,
// stopping at the nul terminator; a bytepos that falls inside a multi-byte
// sequence therefore counts that sequence.
// returns 0 if str is NULL or bytepos <= 0.
static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
{
  int bpos = 0, cpos = 0;

  if (!str) return 0; // NULL is treated as an empty string, as WDL_DetectUTF8 does

  while (bpos < bytepos && str[bpos])
  {
    bpos += wdl_utf8_parsechar(str+bpos,NULL);
    cpos++;
  }
  return cpos;
}

// number of characters in the nul terminated string rd (counts up to the terminator)
#define WDL_utf8_get_charlen(rd) WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff)
250 #endif