WDL/wdlutf8.h

   1 /*
   2 WDL - wdlutf8.h
   3 Copyright (C) 2005 and later, Cockos Incorporated
   4
   5 This software is provided 'as-is', without any express or implied
   6 warranty.  In no event will the authors be held liable for any damages
   7 arising from the use of this software.
   8
   9 Permission is granted to anyone to use this software for any purpose,
  10 including commercial applications, and to alter it and redistribute it
  11 freely, subject to the following restrictions:
  12
  13 1. The origin of this software must not be misrepresented; you must not
  14 claim that you wrote the original software. If you use this software
  15 in a product, an acknowledgment in the product documentation would be
  16 appreciated but is not required.
  17 2. Altered source versions must be plainly marked as such, and must not be
  18 misrepresented as being the original software.
  19 3. This notice may not be removed or altered from any source distribution.
  20
  21 */
  22
  23 #ifndef _WDLUTF8_H_
  24 #define _WDLUTF8_H_
  25
  26 /* todo: handle overlongs?
  27  * todo: handle multi-byte (make WideStr support UTF-16)
  28  */
  29
  30 #include "wdltypes.h"
  31
// WDL_WCHAR is the wide character type used by the wide-string conversion
// helpers in this header. It is only defined here if the including code has
// not already defined it.
#ifndef WDL_WCHAR
  #ifdef _WIN32
    #define WDL_WCHAR WCHAR
  #else
    // this is often 4 bytes on macOS/linux! beware dragons!
    #define WDL_WCHAR wchar_t
  #endif
#endif
  40
  41
  42 // returns size, sets cOut to code point.
  43 // if invalid UTF-8, sets cOut to first character (as unsigned char).
  44 // cOut may be NULL if you only want the size of the character
  45 static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
  46 {
  47   const unsigned char *p = (const unsigned char *)rd;
  48   const unsigned char b0 = *p;
  49   unsigned char b1,b2,b3;
  50
  51   if (cOut) *cOut = b0;
  52   if (b0 < 0x80)
  53   {
  54     return 1;
  55   }
  56   if (((b1=p[1])&0xC0) != 0x80) return 1;
  57
  58   if (b0 < 0xE0)
  59   {
  60     if (!(b0&0x1E)) return 1; // detect overlong
  61     if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
  62     return 2;
  63   }
  64
  65   if (((b2=p[2])&0xC0) != 0x80) return 1;
  66
  67   if (b0 < 0xF0)
  68   {
  69     if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong
  70
  71     if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3f);
  72     return 3;
  73   }
  74
  75   if (((b3=p[3])&0xC0) != 0x80) return 1;
  76
  77   if (b0 < 0xF8)
  78   {
  79     if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong
  80
  81     if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
  82     return 4;
  83   }
  84
  85   // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  86   // skip them and return _
  87   if ((p[4]&0xC0) != 0x80) return 1;
  88   if (b0 < 0xFC)
  89   {
  90     if (cOut) *cOut = '_';
  91     return 5;
  92   }
  93
  94   if ((p[5]&0xC0) != 0x80) return 1;
  95   if (cOut) *cOut = '_';
  96   return 6;
  97 }
  98
  99
 100 // makes a character, returns length. does NOT nul terminate.
 101 // returns 0 if insufficient space, -1 if out of range value
 102 static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
 103 {
 104   if (c < 0) return -1; // out of range character
 105
 106   if (c < 0x80)
 107   {
 108     if (dest_len<1) return 0;
 109     dest[0]=(char)c;
 110     return 1;
 111   }
 112   if (c < 0x800)
 113   {
 114     if (dest_len < 2) return 0;
 115
 116     dest[0]=0xC0|(c>>6);
 117     dest[1]=0x80|(c&0x3F);
 118     return 2;
 119   }
 120   if (c < 0x10000)
 121   {
 122     if (dest_len < 3) return 0;
 123
 124     dest[0]=0xE0|(c>>12);
 125     dest[1]=0x80|((c>>6)&0x3F);
 126     dest[2]=0x80|(c&0x3F);
 127     return 3;
 128   }
 129   if (c < 0x200000)
 130   {
 131     if (dest_len < 4) return 0;
 132     dest[0]=0xF0|(c>>18);
 133     dest[1]=0x80|((c>>12)&0x3F);
 134     dest[2]=0x80|((c>>6)&0x3F);
 135     dest[3]=0x80|(c&0x3F);
 136     return 4;
 137   }
 138
 139   return -1;
 140 }
 141
 142
 143 // invalid UTF-8 are now treated as ANSI characters for this function
 144 static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
 145 {
 146   WDL_WCHAR *w = dest, *dest_endp = dest+(size_t)destlenbytes/sizeof(WDL_WCHAR)-1;
 147   if (!dest || destlenbytes < 1) return 0;
 148
 149   if (src) for (; *src && w < dest_endp; )
 150   {
 151     int c,sz=wdl_utf8_parsechar(src,&c);
 152     *w++ = c;
 153     src+=sz;
 154   }
 155   *w=0;
 156   return (int)(w-dest);
 157 }
 158
 159
 160 // like wdl_utf8_makechar, except nul terminates and handles errors differently (returns _ and 1 on errors)
 161 // negative values for character are treated as 0.
 162 static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char* dest, int c, int destlen)
 163 {
 164   if (destlen < 2)
 165   {
 166     if (destlen == 1) dest[0]=0;
 167     return 0;
 168   }
 169   else
 170   {
 171     const int v = wdl_utf8_makechar(c>0?c:0,dest,destlen-1);
 172     if (v < 1) // implies either insufficient space or out of range character
 173     {
 174       dest[0]='_';
 175       dest[1]=0;
 176       return 1;
 177     }
 178     dest[v]=0;
 179     return v;
 180   }
 181 }
 182
 183 static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
 184 {
 185   char *p = dest, *dest_endp = dest + destlenbytes - 1;
 186   if (!dest || destlenbytes < 1) return 0;
 187
 188   if (src) while (*src && p < dest_endp)
 189   {
 190     const int v = wdl_utf8_makechar(*src++,p,(int)(dest_endp-p));
 191     if (v > 0)
 192     {
 193       p += v;
 194     }
 195     else if (v == 0) break; // out of space
 196   }
 197   *p=0;
 198   return (int)(p-dest);
 199 }
 200
 201 // returns >0 if UTF-8, -1 if 8-bit chars occur that are not UTF-8, or 0 if ASCII
 202 static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
 203 {
 204   int hasUTF=0;
 205
 206   if (!str) return 0;
 207
 208   for (;;)
 209   {
 210     const unsigned char c = *(const unsigned char *)str;
 211
 212     if (c < 0xC2 || c > 0xF7)
 213     {
 214       if (!c) return hasUTF;
 215       if (c >= 0x80) return -1;
 216       str++;
 217     }
 218     else
 219     {
 220       const int l = wdl_utf8_parsechar(str,NULL);
 221       if (l < 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
 222       str+=l;
 223       hasUTF=1;
 224     }
 225   }
 226 }
 227
 228
 229 static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
 230 {
 231   int bpos = 0;
 232   while (charpos-- > 0 && str[bpos])
 233   {
 234     bpos += wdl_utf8_parsechar(str+bpos,NULL);
 235   }
 236   return bpos;
 237 }
 238 static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
 239 {
 240   int bpos = 0, cpos=0;
 241   while (bpos < bytepos && str[bpos])
 242   {
 243     bpos += wdl_utf8_parsechar(str+bpos,NULL);
 244     cpos++;
 245   }
 246   return cpos;
 247 }
 248
// number of characters in the nul-terminated UTF-8 string rd: counts
// characters up to the terminator by passing the largest positive int as the
// byte limit
#define WDL_utf8_get_charlen(rd) WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff)
 250
 251 static void WDL_STATICFUNC_UNUSED wdl_utf8_set_char_case(char *p, int upper) // upper 1 or -1 only
 252 {
 253   const unsigned char c1 = (unsigned char)*p;
 254   WDL_ASSERT(upper == 1 || upper == -1);
 255   if (c1 >= 'a' && c1 <= 'z')
 256   {
 257     if (upper>0) *p += 'A'-'a';
 258   }
 259   else if (c1 >= 'A' && c1 <= 'Z')
 260   {
 261     if (upper<0) *p -= 'A'-'a';
 262   }
 263   else if (c1 >= 0x80)
 264   {
 265     const unsigned char cc = (unsigned char)p[1] - 0x80;
 266     switch (c1)
 267     {
 268       case 0xc3: // u+0c0 to u+0ff as 0..0x3f
 269         if ((cc&~0x20) != 0x17) // all values except 0xc7 and 0xf7
 270         {
 271           if (upper>0) p[1] &= ~0x20;
 272           else p[1] |= 0x20;
 273         }
 274       break;
 275       case 0xc4: // u+100 to u+13f
 276         if (cc <= 0x37)
 277         {
 278           // u+100 to u+137 low bit is lowercase
 279           if (upper>0) p[1] &= ~1;
 280           else p[1] |= 1;
 281         }
 282         // u+138 is not cased
 283         else if (cc >= 0x39 && cc < 0x3f)
 284         {
 285           // u+139 to u+13e, odd is uppercase
 286           if ((cc & 1) != (upper>0)) p[1] -= upper;
 287         }
 288         else if (cc == 0x3f && upper<0) // u+139 convert to u+140
 289         {
 290           p[0]++;
 291           p[1] -= 0x3f;
 292         }
 293       break;
 294       case 0xc5: // u+140 to u+17f
 295         // u+149 and u+178 and u+17f are not cased
 296         if (cc == 0 && upper>0) // u+140 -> u+13f
 297         {
 298           p[0]--;
 299           p[1] |= 0x3f;
 300         }
 301         else if (cc >= 0xa && cc <= 0x37) // u+14a to u+177 low bit is lowercase
 302         {
 303           if (upper>0) p[1] &= ~1;
 304           else p[1] |= 1;
 305         }
 306         else if ((cc > 0 && cc <= 8) || (cc >= 0x39 && cc <= 0x3e))
 307         {
 308           // u+141 to u+148 and u+179 to u+17e have odd=uppercase
 309           if ((cc & 1) != (upper>0)) p[1] -= upper;
 310         }
 311       break;
 312     }
 313   }
 314 }
 315
 316
 317 #endif