nl-utf8.c

   1 /* nl-utf8.c --- functions for UTF-8 unicode support
   2
   3     Copyright (C) 2008 Lutz Mueller
   4
   5     This program is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU General Public License as published by
   7     the Free Software Foundation, either version 3 of the License, or
   8     (at your option) any later version.
   9
  10     This program is distributed in the hope that it will be useful,
  11     but WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU General Public License for more details.
  14
  15     You should have received a copy of the GNU General Public License
  16     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 //
  19 // portions are copied from pcre.c by: Philip Hazel <ph10@cam.ac.uk>
  20 // and Copyright (c) 1997-2003 University of Cambridge
  21 //
  22 */
  23
  24
  25 #include "newlisp.h"
  26 #include <wchar.h>
  27 #include <wctype.h>
  28 #include "protos.h"
  29
  30
  31 /*************************************************
  32 *    Macros and tables for character handling    *
  33 *        by Philip Hazel <ph10@cam.ac.uk>        *
  34 *************************************************/
  35
  36 /* These are the breakpoints for different numbers of bytes in a UTF-8
  37 character. */
  38
  39 static const int utf8_table1[] =
  40   { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
  41
  42 /* These are the indicator bits and the mask for the data bits to set in the
  43 first byte of a character, indexed by the number of additional bytes. */
  44
  45 static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
  46 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
  47
  48 /* Table of the number of extra characters, indexed by the first character
  49 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
  50 0x3d. */
  51
  52 static const char utf8_table4[] = {
  53   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  54   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  55   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  56   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
  57
  58 /* Get the next UTF-8 character, advancing the pointer. This is called when we
  59 know we are in UTF-8 mode. */
  60
  61 #define GETCHARINC(c, eptr) \
  62   c = (unsigned char)*eptr++; \
  63   if ((c & 0xc0) == 0xc0) \
  64     { \
  65     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
  66     int gcss = 6*gcaa; \
  67     c = (c & utf8_table3[gcaa]) << gcss; \
  68     while (gcaa-- > 0) \
  69       { \
  70       gcss -= 6; \
  71       c |= (*eptr++ & 0x3f) << gcss; \
  72       } \
  73     }
  74
  75 /* This function takes an integer value in the range 0 - 0x7fffffff
  76 and encodes it as a UTF-8 character in 0 to 6 bytes.
  77
  78 Arguments:
  79   cvalue     the character value
  80   buffer     pointer to buffer for result - at least 6 bytes long
  81
  82 Returns:     number of characters placed in the buffer
  83 */
  84
  85 int wchar_utf8(int cvalue, char *buffer)
  86 {
  87 register int i, j;
  88 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
  89   if (cvalue <= utf8_table1[i]) break;
  90 buffer += i;
  91 for (j = i; j > 0; j--)
  92  {
  93  *buffer-- = 0x80 | (cvalue & 0x3f);
  94  cvalue >>= 6;
  95  }
  96 *buffer = utf8_table2[i] | cvalue;
  97 return i + 1;
  98 }
  99
 100
/* ---------------------- UTF-8 utility functions -------------------------- */
 102
/* get utf8 string from unicode wide character
 *
 * int wchar_utf8(int wchar, char * utf8str)
 *
 * the string is not null-terminated, to allow contiguous filling
 * of longer strings
 * returns number of bytes placed in utf8str
*/
 111
 112
 113 /* get a unicode wide character from the utf8 string
 114  * return advanced utf8 string pointer
 115 */
 116
 117 char * utf8_wchar(char * utf8str, int * chr)
 118 {
 119 GETCHARINC(*chr, utf8str)
 120
 121 return(utf8str);
 122 }
 123
 124 /* return the number of characters encoded in utf8 string
 125  * without counting the zero terminator
 126 */
 127
 128 size_t utf8_wlen(char * utf8str)
 129 {
 130 int gcaa;
 131 int c;
 132 size_t count = 0;
 133
 134 while((c = *utf8str++) != 0)
 135         {
 136         count++;
 137         if ((c & 0xc0) == 0xc0)
 138                 {
 139         gcaa = utf8_table4[c & 0x3f];
 140                 utf8str += gcaa;
 141                 }
 142         }
 143
 144 return(count);
 145 }
 146
 147
 148 /* return the length of the first utf8 character
 149 */
 150
 151 int utf8_1st_len(char * utf8str)
 152 {
 153 int c;
 154
 155 if((c = *utf8str) != 0)
 156         {
 157         if((c & 0xc0) == 0xc0)
 158                 return(utf8_table4[c & 0x3f] + 1);
 159         else return(1);
 160         }
 161
 162 return(0);
 163 }
 164
 165
 166 /* convert utf8 string to vector of maxwc wide characters
 167  * unicode vector is zero terminated
 168  * return number of unicode characters (excluding zero int)
 169 */
 170
 171 int utf8_wstr(int * unicode, char * utf8str, int maxwc)
 172 {
 173 int wchar;
 174 int count = 0;
 175
 176 while(maxwc-- && *utf8str != 0)
 177         {
 178         count++;
 179         GETCHARINC(wchar, utf8str);
 180 /*      utf8str = utf8_wchar(utf8str, &wchar); */
 181         *(unicode++) = wchar;
 182         }
 183 *unicode = 0;
 184
 185 return(count);
 186 }
 187
 188 /* convert zero terminated unicode vector into utf8 string
 189  * return number of bytes stored in utr8 string excluding terminator
 190  * don't use more then maxstr bytes (excluding  zero terminator)
 191 */
 192
 193 int wstr_utf8(char * utf8str, int * unicode, int maxstr)
 194 {
 195 int len, size = 0;
 196
 197 while(*unicode != 0 && size < maxstr)
 198         {
 199         len = wchar_utf8(*unicode, utf8str);
 200         utf8str += len;
 201         size += len;
 202         unicode++;
 203         }
 204
 205 *utf8str = 0;
 206
 207 return(size);
 208 }
 209
 210 /* -------------------------------------- newLISP API -----------------------------------*/
 211
 212 CELL * p_unicode(CELL * params)
 213 {
 214 char * utf8str;
 215 size_t size;
 216 int * unicode;
 217 CELL * cell;
 218
 219 getStringSize(params, &utf8str, &size, TRUE);
 220 unicode = allocMemory((size + 1) * sizeof(int));
 221
 222 size = utf8_wstr(unicode, utf8str, size);
 223 unicode = reallocMemory(unicode, (size + 1) * sizeof(int) + 1);
 224
 225 cell = getCell(CELL_STRING);
 226 cell->contents = (UINT)unicode;
 227 cell->aux = (size + 1) * sizeof(int) + 1;
 228
 229 return(cell);
 230 }
 231
 232
 233 CELL * p_utf8(CELL * params)
 234 {
 235 int * unicode;
 236 size_t size;
 237 char * utf8str;
 238 CELL *cell;
 239
 240 getStringSize(params, (void *)&unicode, &size, TRUE);
 241 utf8str = allocMemory(size * UTF8_MAX_BYTES + 1);
 242
 243 size = wstr_utf8(utf8str, unicode, size);
 244 utf8str = reallocMemory(utf8str, size + 1);
 245
 246 cell = getCell(CELL_STRING);
 247 cell->contents = (UINT)utf8str;
 248 cell->aux = size + 1;
 249
 250 return(cell);
 251 }
 252
 253
 254 CELL * p_utf8len(CELL * params)
 255 {
 256 char * str;
 257
 258 getString(params, &str);
 259
 260 return(stuffInteger(utf8_wlen(str)));
 261 }
 262
 263 /* eof */