langpackedit v0.13 -- from 8f9f0878
[wdl.git] / WDL / wdlutf8.h
blobaa17a6c9a4b7e907a1f3752d41d4b68c942e3f7e
1 /*
2 WDL - wdlutf8.h
3 Copyright (C) 2005 and later, Cockos Incorporated
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
23 #ifndef _WDLUTF8_H_
24 #define _WDLUTF8_H_
/* todo: handle overlongs?
 * todo: handle multi-byte (make WideStr support UTF-16)
 */
30 #include "wdltypes.h"
// WDL_WCHAR is the wide character unit used by the conversion helpers in this header.
// It may be predefined by the includer; otherwise it is WCHAR on Windows and wchar_t elsewhere.
#if !defined(WDL_WCHAR)
  #if defined(_WIN32)
    #define WDL_WCHAR WCHAR
  #else
    // wchar_t is often 4 bytes on macOS/linux! beware dragons!
    #define WDL_WCHAR wchar_t
  #endif
#endif
// Decodes the UTF-8 sequence starting at rd.
// returns the number of bytes consumed (1..6), sets cOut to the code point.
// if invalid UTF-8 (stray continuation byte, overlong form, or a missing/invalid
// trailing byte), returns 1 and sets cOut to the first character (as unsigned char).
// 5 and 6 byte forms are skipped as a unit: their length is returned and cOut is set to '_'.
// cOut may be NULL if you only want the size of the character.
// Each trailing byte is validated before the next one is read, so a NUL-terminated
// string is never read past its terminator.
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
{
  const unsigned char *p = (const unsigned char *)rd;
  const unsigned char b0 = *p;
  unsigned char b1,b2,b3;

  if (cOut) *cOut = b0;

  if (b0 < 0x80)
  {
    return 1; // plain ASCII
  }

  // 0x80..0xBF are continuation bytes and can never begin a sequence,
  // 0xC0 and 0xC1 could only begin an overlong encoding of an ASCII value
  if (b0 < 0xC2) return 1;

  if (((b1=p[1])&0xC0) != 0x80) return 1;

  if (b0 < 0xE0)
  {
    if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
    return 2;
  }

  if (((b2=p[2])&0xC0) != 0x80) return 1;

  if (b0 < 0xF0)
  {
    if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong (value would fit in 2 bytes)

    if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3F);
    return 3;
  }

  if (((b3=p[3])&0xC0) != 0x80) return 1;

  if (b0 < 0xF8)
  {
    if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong (value would fit in 3 bytes)

    if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
    return 4;
  }

  // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  // skip them and return _
  if ((p[4]&0xC0) != 0x80) return 1;
  if (b0 < 0xFC)
  {
    if (cOut) *cOut = '_';
    return 5;
  }

  if ((p[5]&0xC0) != 0x80) return 1;
  if (cOut) *cOut = '_';
  return 6;
}
// encodes code point c as UTF-8 into dest, returns the number of bytes written (1..4).
// does NOT nul terminate.
// returns 0 if dest_len is too small for the encoding (nothing is written),
// -1 if c is negative or does not fit in 4 bytes (c >= 0x200000)
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
{
  int nbytes, idx;
  int lead;

  if (c < 0 || c >= 0x200000) return -1; // out of range character

  // pick the sequence length and the marker bits of the leading byte
  if (c < 0x80)         { nbytes = 1; lead = 0x00; }
  else if (c < 0x800)   { nbytes = 2; lead = 0xC0; }
  else if (c < 0x10000) { nbytes = 3; lead = 0xE0; }
  else                  { nbytes = 4; lead = 0xF0; }

  if (dest_len < nbytes) return 0;

  // trailing bytes carry 6 bits each, least significant group last
  for (idx = nbytes - 1; idx > 0; idx--)
  {
    dest[idx] = (char)(0x80 | (c & 0x3F));
    c >>= 6;
  }
  // whatever remains fits in the payload bits of the leading byte
  dest[0] = (char)(lead | c);

  return nbytes;
}
143 // invalid UTF-8 are now treated as ANSI characters for this function
144 static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
146 WDL_WCHAR *w = dest, *dest_endp = dest+(size_t)destlenbytes/sizeof(WDL_WCHAR)-1;
147 if (!dest || destlenbytes < 1) return 0;
149 if (src) for (; *src && w < dest_endp; )
151 int c,sz=wdl_utf8_parsechar(src,&c);
152 *w++ = c;
153 src+=sz;
155 *w=0;
156 return (int)(w-dest);
160 // like wdl_utf8_makechar, except nul terminates and handles errors differently (returns _ and 1 on errors)
161 // negative values for character are treated as 0.
162 static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char* dest, int c, int destlen)
164 if (destlen < 2)
166 if (destlen == 1) dest[0]=0;
167 return 0;
169 else
171 const int v = wdl_utf8_makechar(c>0?c:0,dest,destlen-1);
172 if (v < 1) // implies either insufficient space or out of range character
174 dest[0]='_';
175 dest[1]=0;
176 return 1;
178 dest[v]=0;
179 return v;
183 static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
185 char *p = dest, *dest_endp = dest + destlenbytes - 1;
186 if (!dest || destlenbytes < 1) return 0;
188 if (src) while (*src && p < dest_endp)
190 const int v = wdl_utf8_makechar(*src++,p,(int)(dest_endp-p));
191 if (v > 0)
193 p += v;
195 else if (v == 0) break; // out of space
197 *p=0;
198 return (int)(p-dest);
201 // returns >0 if UTF-8, -1 if 8-bit chars occur that are not UTF-8, or 0 if ASCII
202 static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
204 int hasUTF=0;
206 if (!str) return 0;
208 for (;;)
210 const unsigned char c = *(const unsigned char *)str;
212 if (c < 0xC2 || c > 0xF7)
214 if (!c) return hasUTF;
215 if (c >= 0x80) return -1;
216 str++;
218 else
220 const int l = wdl_utf8_parsechar(str,NULL);
221 if (l < 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
222 str+=l;
223 hasUTF=1;
229 static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
231 int bpos = 0;
232 while (charpos-- > 0 && str[bpos])
234 bpos += wdl_utf8_parsechar(str+bpos,NULL);
236 return bpos;
238 static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
240 int bpos = 0, cpos=0;
241 while (bpos < bytepos && str[bpos])
243 bpos += wdl_utf8_parsechar(str+bpos,NULL);
244 cpos++;
246 return cpos;
// number of characters (not bytes) in the NUL-terminated string rd
#define WDL_utf8_get_charlen(rd) (WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff))
// changes the case of the single character at p in place.
// upper must be 1 (make uppercase) or -1 (make lowercase).
// handles ASCII letters and the two-byte UTF-8 range u+0c0 to u+17f; the encoded
// length never changes. anything else (including a lead byte that is not followed
// by a continuation byte) is left untouched.
static void WDL_STATICFUNC_UNUSED wdl_utf8_set_char_case(char *p, int upper) // upper 1 or -1 only
{
  const unsigned char c1 = (unsigned char)*p;
  WDL_ASSERT(upper == 1 || upper == -1);
  if (c1 >= 'a' && c1 <= 'z')
  {
    if (upper>0) *p += 'A'-'a';
  }
  else if (c1 >= 'A' && c1 <= 'Z')
  {
    if (upper<0) *p -= 'A'-'a';
  }
  else if (c1 >= 0x80)
  {
    // low 6 bits of the second byte, valid continuation bytes give 0..0x3f
    const unsigned char cc = (unsigned char)p[1] - 0x80;

    // second byte is not a continuation byte (possibly the string terminator):
    // not a character we know, and it must not be modified
    if (cc > 0x3f) return;

    switch (c1)
    {
      case 0xc3: // u+0c0 to u+0ff as 0..0x3f
        if (cc == 0x1f)
        {
          // u+0df (sharp s) is lowercase and has no uppercase form in this range
        }
        else if (cc == 0x3f)
        {
          // u+0ff pairs with u+178 rather than with u+0df
          if (upper>0)
          {
            p[0] = (char)0xc5;
            p[1] = (char)0xb8;
          }
        }
        else if ((cc&~0x20) != 0x17) // all values except 0xd7 and 0xf7
        {
          // bit 0x20 set is lowercase
          if (upper>0) p[1] &= ~0x20;
          else p[1] |= 0x20;
        }
      break;
      case 0xc4: // u+100 to u+13f
        if (cc <= 0x37)
        {
          // u+100 to u+137 low bit is lowercase
          if (upper>0) p[1] &= ~1;
          else p[1] |= 1;
        }
        // u+138 is not cased
        else if (cc >= 0x39 && cc < 0x3f)
        {
          // u+139 to u+13e, odd is uppercase
          if ((cc & 1) != (upper>0)) p[1] -= upper;
        }
        else if (cc == 0x3f && upper<0) // u+13f convert to u+140
        {
          p[0]++;
          p[1] -= 0x3f;
        }
      break;
      case 0xc5: // u+140 to u+17f
        // u+149 and u+17f are not cased
        if (cc == 0 && upper>0) // u+140 -> u+13f
        {
          p[0]--;
          p[1] |= 0x3f;
        }
        else if (cc >= 0xa && cc <= 0x37) // u+14a to u+177 low bit is lowercase
        {
          if (upper>0) p[1] &= ~1;
          else p[1] |= 1;
        }
        else if (cc == 0x38) // u+178 -> u+0ff
        {
          if (upper<0)
          {
            p[0] = (char)0xc3;
            p[1] = (char)0xbf;
          }
        }
        else if ((cc > 0 && cc <= 8) || (cc >= 0x39 && cc <= 0x3e))
        {
          // u+141 to u+148 and u+179 to u+17e have odd=uppercase
          if ((cc & 1) != (upper>0)) p[1] -= upper;
        }
      break;
      default:
      break;
    }
  }
}
317 #endif