src/st-unicode.c

   1
   2 #include "st-unicode.h"
   3 #include "st-utils.h"
   4 #include <string.h>
   5
   6 st_unichar
   7 st_utf8_get_unichar (const char *p)
   8 {
   9     st_unichar ch;
  10
  11     if (p == NULL)
  12         return 0x00;
  13
  14     if ((p[0] & 0x80) == 0x00) {
  15         ch = p[0];
  16     } else if ((p[0] & 0xe0) == 0xc0) {
  17         ch = ((p[0] & 0x1f) << 6) | (p[1] & 0x3f);
  18     } else if ((p[0] & 0xf0) == 0xe0) {
  19         ch = ((p[0] & 0xf) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
  20     } else if ((p[0] & 0xf8) == 0xf0) {
  21         ch = ((p[0] & 0x7) << 18) | ((p[1] & 0x3f) << 12) | ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
  22     } else
  23         ch = 0x00; /* undefined */
  24
  25     return ch;
  26 }
  27
  28 /*
  29  * Copyright (C) 2008 Colin Percival
  30  */
  31 #if 0
  32 #define ONEMASK ((size_t)(-1) / 0xFF)
  33 size_t
  34 st_utf8_strlen(const char * _s)
  35 {
  36     const char * s;
  37     size_t count = 0;
  38     size_t u;
  39     unsigned char b;
  40
  41     /* Handle any initial misaligned bytes. */
  42     for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
  43         b = *s;
  44
  45         /* Exit if we hit a zero byte. */
  46         if (b == '\0')
  47             goto done;
  48
  49         /* Is this byte NOT the first byte of a character? */
  50         count += (b >> 7) & ((~b) >> 6);
  51     }
  52
  53     /* Handle complete blocks. */
  54     for (; ; s += sizeof(size_t)) {
  55         /* Prefetch 256 bytes ahead. */
  56         __builtin_prefetch(&s[256], 0, 0);
  57
  58         /* Grab 4 or 8 bytes of UTF-8 data. */
  59         u = *(size_t *)(s);
  60
  61         /* Exit the loop if there are any zero bytes. */
  62         if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
  63             break;
  64
  65         /* Count bytes which are NOT the first byte of a character. */
  66         u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
  67         count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
  68     }
  69
  70     /* Take care of any left-over bytes. */
  71     for (; ; s++) {
  72         b = *s;
  73
  74         /* Exit if we hit a zero byte. */
  75         if (b == '\0')
  76             break;
  77
  78         /* Is this byte NOT the first byte of a character? */
  79         count += (b >> 7) & ((~b) >> 6);
  80     }
  81
  82 done:
  83     return ((s - _s) - count);
  84 }
  85 #endif
  86
  87 /* Derived from FontConfig
  88  * Copyright (C) 2006 Keith Packard
  89  */
  90 int
  91 st_unichar_to_utf8 (st_unichar ch, char *outbuf)
  92 {
  93     int bits;
  94     char *d = outbuf;
  95
  96     if      (ch <       0x80) {  *d++ =  ch;                         bits = -6; }
  97     else if (ch <      0x800) {  *d++ = ((ch >>  6) & 0x1f) | 0xc0;  bits =  0; }
  98     else if (ch <    0x10000) {  *d++ = ((ch >> 12) & 0x0f) | 0xe0;  bits =  6; }
  99     else if (ch <   0x200000) {  *d++ = ((ch >> 18) & 0x07) | 0xf0;  bits = 12; }
 100     else if (ch <  0x4000000) {  *d++ = ((ch >> 24) & 0x03) | 0xf8;  bits = 18; }
 101     else if (ch < 0x80000000) {  *d++ = ((ch >> 30) & 0x01) | 0xfC;  bits = 24; }
 102     else return 0;
 103
 104     for (; bits >= 0; bits -= 6) {
 105         *d++= ((ch >> bits) & 0x3F) | 0x80;
 106     }
 107     return d - outbuf;
 108 }
 109
 110 /**
 111  * st_utf8_validate
 112  * @utf: Pointer to putative UTF-8 encoded string.
 113  *
 114  * Checks @utf for being valid UTF-8. @utf is assumed to be
 115  * null-terminated. This function is not super-strict, as it will
 116  * allow longer UTF-8 sequences than necessary. Note that Java is
 117  * capable of producing these sequences if provoked. Also note, this
 118  * routine checks for the 4-byte maximum size, but does not check for
 119  * 0x10ffff maximum value.
 120  *
 121  * Return value: true if @utf is valid.
 122  **/
 123 /* Derived from eglib, libxml2
 124  * Copyright (C) 2006 Novell, Inc.
 125  * Copyright (C) 1998-2003 Daniel Veillard
 126  */
 127 bool
 128 st_utf8_validate (const char *string, ssize_t max_len)
 129 {
 130     int ix;
 131
 132     if (max_len == -1)
 133         max_len = strlen (string);
 134
 135     /*
 136      * input is a string of 1, 2, 3 or 4 bytes.  The valid strings
 137      * are as follows (in "bit format"):
 138      *    0xxxxxxx                                      valid 1-byte
 139      *    110xxxxx 10xxxxxx                             valid 2-byte
 140      *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 141      *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 142      */
 143     for (ix = 0; ix < max_len;) {      /* string is 0-terminated */
 144         st_uchar c;
 145
 146         c = string[ix];
 147         if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 10 */
 148             ix++;
 149         } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
 150             if (((ix+1) >= max_len) || (string[ix+1] & 0xc0 ) != 0x80)
 151                 return false;
 152             ix += 2;
 153         } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
 154             if (((ix + 2) >= max_len) ||
 155                 ((string[ix+1] & 0xc0) != 0x80) ||
 156                 ((string[ix+2] & 0xc0) != 0x80))
 157                 return false;
 158             ix += 3;
 159         } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
 160             if (((ix + 3) >= max_len) ||
 161                 ((string[ix+1] & 0xc0) != 0x80) ||
 162                 ((string[ix+2] & 0xc0) != 0x80) ||
 163                 ((string[ix+3] & 0xc0) != 0x80))
 164                 return false;
 165             ix += 4;
 166         } else {/* unknown encoding */
 167             return false;
 168         }
 169     }
 170
 171     return true;
 172 }
 173
 174 /**
 175  * st_utf8_strlen:
 176  * @utf:  a sequence of UTF-8 encoded bytes
 177  *
 178  * compute the length of an UTF8 string, it doesn't do a full UTF8
 179  * checking of the content of the string.
 180  *
 181  * Returns the number of characters in the string or -1 in case of error
 182  */
 183 /* Derived from libxml2
 184  * Copyright (C) 1998-2003 Daniel Veillard
 185  */
 186 int
 187 st_utf8_strlen (const char *string)
 188 {
 189     int ret = 0;
 190
 191     if (string == NULL)
 192         return(-1);
 193
 194     while (*string != 0) {
 195         if (string[0] & 0x80) {
 196             if ((string[1] & 0xc0) != 0x80)
 197                 return(-1);
 198             if ((string[0] & 0xe0) == 0xe0) {
 199                 if ((string[2] & 0xc0) != 0x80)
 200                     return(-1);
 201                 if ((string[0] & 0xf0) == 0xf0) {
 202                     if ((string[0] & 0xf8) != 0xf0 || (string[3] & 0xc0) != 0x80)
 203                         return(-1);
 204                     string += 4;
 205                 } else {
 206                     string += 3;
 207                 }
 208             } else {
 209                 string += 2;
 210             }
 211         } else {
 212             string++;
 213         }
 214         ret++;
 215     }
 216     return(ret);
 217 }
 218
 219 const char *
 220 st_utf8_offset_to_pointer (const char *string, st_uint offset)
 221 {
 222     const char *p = string;
 223
 224     for (st_uint i = 0; i < offset; i++)
 225         p = st_utf8_next_char (p);
 226
 227     return p;
 228 }
 229
 230 st_unichar *
 231 st_utf8_to_ucs4 (const char *string)
 232 {
 233     const st_uchar *p = string;
 234     st_unichar *buffer, c;
 235     st_uint     index = 0;
 236
 237     if (string == NULL)
 238         return NULL;
 239
 240     buffer = st_malloc (sizeof (st_unichar) * (st_utf8_strlen (string) + 1));
 241
 242     while (p[0]) {
 243         if ((p[0] & 0x80) == 0x00) {
 244             c = p[0];
 245             p += 1;
 246         } else if ((p[0] & 0xe0) == 0xc0) {
 247             c = ((p[0] & 0x1f) << 6) | (p[1] & 0x3f);
 248             p += 2;
 249         } else if ((p[0] & 0xf0) == 0xe0) {
 250             c = ((p[0] & 0xf) << 12) | ((p[1] & 0x3f) << 6)  | (p[2] & 0x3f);
 251             p += 3;
 252         } else if ((p[0] & 0xf8) == 0xf0) {
 253             c = ((p[0] & 0x7) << 18) | ((p[1] & 0x3f) << 12) | ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
 254             p += 4;
 255         } else
 256             break;
 257
 258         buffer[index++] = c;
 259     }
 260
 261     buffer[index] = 0;
 262     return buffer;
 263 }
 264
 265 #if 0
 266
 267 char *
 268 st_ucs4_to_utf8 (const st_unichar *string)
 269 {
 270
 271     int bits;
 272     char *d = outbuf;
 273
 274     if      (ch <       0x80) {  *d++ =  ch;                         bits = -6; }
 275     else if (ch <      0x800) {  *d++ = ((ch >>  6) & 0x1f) | 0xc0;  bits =  0; }
 276     else if (ch <    0x10000) {  *d++ = ((ch >> 12) & 0x0f) | 0xe0;  bits =  6; }
 277     else if (ch <   0x200000) {  *d++ = ((ch >> 18) & 0x07) | 0xf0;  bits = 12; }
 278     else if (ch <  0x4000000) {  *d++ = ((ch >> 24) & 0x03) | 0xf8;  bits = 18; }
 279     else if (ch < 0x80000000) {  *d++ = ((ch >> 30) & 0x01) | 0xfC;  bits = 24; }
 280     else return 0;
 281
 282     for (; bits >= 0; bits -= 6) {
 283         *d++= ((ch >> bits) & 0x3F) | 0x80;
 284     }
 285     return d - outbuf;
 286 }
 287
 288 #endif