usr/src/lib/libldap5/sources/ldap/common/ldaputf8.c

   1 #pragma ident   "%Z%%M% %I%     %E% SMI"
   2
   3 /*
   4  * The contents of this file are subject to the Netscape Public
   5  * License Version 1.1 (the "License"); you may not use this file
   6  * except in compliance with the License. You may obtain a copy of
   7  * the License at http://www.mozilla.org/NPL/
   8  *
   9  * Software distributed under the License is distributed on an "AS
  10  * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
  11  * implied. See the License for the specific language governing
  12  * rights and limitations under the License.
  13  *
  14  * The Original Code is Mozilla Communicator client code, released
  15  * March 31, 1998.
  16  *
  17  * The Initial Developer of the Original Code is Netscape
  18  * Communications Corporation. Portions created by Netscape are
  19  * Copyright (C) 1998-1999 Netscape Communications Corporation. All
  20  * Rights Reserved.
  21  *
  22  * Contributor(s):
  23  */
  24
/* utf8.c - misc. utf8 "string" functions. */
  26 #include "ldap-int.h"
  27
  28 static char UTF8len[64]
  29 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  30    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  31    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  32    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
  33
  34 int
  35 LDAP_CALL
  36 ldap_utf8len (const char* s)
  37      /* Return the number of char's in the character at *s. */
  38 {
  39     return ldap_utf8next((char*)s) - s;
  40 }
  41
  42 char*
  43 LDAP_CALL
  44 ldap_utf8next (char* s)
  45      /* Return a pointer to the character immediately following *s.
  46         Handle any valid UTF-8 character, including '\0' and ASCII.
  47         Try to handle a misaligned pointer or a malformed character.
  48      */
  49 {
  50     register unsigned char* next = (unsigned char*)s;
  51     switch (UTF8len [(*next >> 2) & 0x3F]) {
  52       case 0: /* erroneous: s points to the middle of a character. */
  53       case 6: if ((*++next & 0xC0) != 0x80) break;
  54       case 5: if ((*++next & 0xC0) != 0x80) break;
  55       case 4: if ((*++next & 0xC0) != 0x80) break;
  56       case 3: if ((*++next & 0xC0) != 0x80) break;
  57       case 2: if ((*++next & 0xC0) != 0x80) break;
  58       case 1: ++next;
  59     }
  60     return (char*) next;
  61 }
  62
  63 char*
  64 LDAP_CALL
  65 ldap_utf8prev (char* s)
  66      /* Return a pointer to the character immediately preceding *s.
  67         Handle any valid UTF-8 character, including '\0' and ASCII.
  68         Try to handle a misaligned pointer or a malformed character.
  69      */
  70 {
  71     register unsigned char* prev = (unsigned char*)s;
  72     unsigned char* limit = prev - 6;
  73     while (((*--prev & 0xC0) == 0x80) && (prev != limit)) {
  74         ;
  75     }
  76     return (char*) prev;
  77 }
  78
  79 int
  80 LDAP_CALL
  81 ldap_utf8copy (char* dst, const char* src)
  82      /* Copy a character from src to dst; return the number of char's copied.
  83         Handle any valid UTF-8 character, including '\0' and ASCII.
  84         Try to handle a misaligned pointer or a malformed character.
  85      */
  86 {
  87     register const unsigned char* s = (const unsigned char*)src;
  88     switch (UTF8len [(*s >> 2) & 0x3F]) {
  89       case 0: /* erroneous: s points to the middle of a character. */
  90       case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  91       case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  92       case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  93       case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  94       case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
  95       case 1: *dst   = *s++;
  96     }
  97     return s - (const unsigned char*)src;
  98 }
  99
 100 size_t
 101 LDAP_CALL
 102 ldap_utf8characters (const char* src)
 103      /* Return the number of UTF-8 characters in the 0-terminated array s. */
 104 {
 105     register char* s = (char*)src;
 106     size_t n;
 107     for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
 108     return n;
 109 }
 110
 111 unsigned long LDAP_CALL
 112 ldap_utf8getcc( const char** src )
 113 {
 114     register unsigned long c;
 115     register const unsigned char* s = (const unsigned char*)*src;
 116     switch (UTF8len [(*s >> 2) & 0x3F]) {
 117       case 0: /* erroneous: s points to the middle of a character. */
 118               c = (*s++) & 0x3F; goto more5;
 119       case 1: c = (*s++); break;
 120       case 2: c = (*s++) & 0x1F; goto more1;
 121       case 3: c = (*s++) & 0x0F; goto more2;
 122       case 4: c = (*s++) & 0x07; goto more3;
 123       case 5: c = (*s++) & 0x03; goto more4;
 124       case 6: c = (*s++) & 0x01; goto more5;
 125       more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
 126       more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
 127       more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
 128       more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
 129       more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
 130         break;
 131     }
 132     *src = (const char*)s;
 133     return c;
 134 }
 135
 136 char*
 137 LDAP_CALL
 138 ldap_utf8strtok_r( char* sp, const char* brk, char** next)
 139 {
 140     const char *bp;
 141     unsigned long sc, bc;
 142     char *tok;
 143
 144     if (sp == NULL && (sp = *next) == NULL)
 145       return NULL;
 146
 147     /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
 148   cont:
 149     sc = LDAP_UTF8GETC(sp);
 150     for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
 151         if (sc == bc)
 152           goto cont;
 153     }
 154
 155     if (sc == 0) { /* no non-delimiter characters */
 156         *next = NULL;
 157         return NULL;
 158     }
 159     tok = LDAP_UTF8PREV(sp);
 160
 161     /* Scan token; roughly, sp += strcspn(sp, brk)
 162      * Note that brk must be 0-terminated; we stop if we see that, too.
 163      */
 164     while (1) {
 165         sc = LDAP_UTF8GETC(sp);
 166         bp = brk;
 167         do {
 168             if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
 169                 if (sc == 0) {
 170                     *next = NULL;
 171                 } else {
 172                     *next = sp;
 173                     *(LDAP_UTF8PREV(sp)) = 0;
 174                 }
 175                 return tok;
 176             }
 177         } while (bc != 0);
 178     }
 179     /* NOTREACHED */
 180 }
 181
 182 int
 183 LDAP_CALL
 184 ldap_utf8isalnum( char* s )
 185 {
 186     register unsigned char c = *(unsigned char*)s;
 187     if (0x80 & c) return 0;
 188     if (c >= 'A' && c <= 'Z') return 1;
 189     if (c >= 'a' && c <= 'z') return 1;
 190     if (c >= '0' && c <= '9') return 1;
 191     return 0;
 192 }
 193
 194 int
 195 LDAP_CALL
 196 ldap_utf8isalpha( char* s )
 197 {
 198     register unsigned char c = *(unsigned char*)s;
 199     if (0x80 & c) return 0;
 200     if (c >= 'A' && c <= 'Z') return 1;
 201     if (c >= 'a' && c <= 'z') return 1;
 202     return 0;
 203 }
 204
 205 int
 206 LDAP_CALL
 207 ldap_utf8isdigit( char* s )
 208 {
 209     register unsigned char c = *(unsigned char*)s;
 210     if (0x80 & c) return 0;
 211     if (c >= '0' && c <= '9') return 1;
 212     return 0;
 213 }
 214
 215 int
 216 LDAP_CALL
 217 ldap_utf8isxdigit( char* s )
 218 {
 219     register unsigned char c = *(unsigned char*)s;
 220     if (0x80 & c) return 0;
 221     if (c >= '0' && c <= '9') return 1;
 222     if (c >= 'A' && c <= 'F') return 1;
 223     if (c >= 'a' && c <= 'f') return 1;
 224     return 0;
 225 }
 226
 227 int
 228 LDAP_CALL
 229 ldap_utf8isspace( char* s )
 230 {
 231     register unsigned char *c = (unsigned char*)s;
 232     int len = ldap_utf8len(s);
 233
 234     if (len == 0) {
 235         return 0;
 236     } else if (len == 1) {
 237         switch (*c) {
 238             case 0x09:
 239             case 0x0A:
 240             case 0x0B:
 241             case 0x0C:
 242             case 0x0D:
 243             case 0x20:
 244                 return 1;
 245             default:
 246                 return 0;
 247         }
 248     } else if (len == 2) {
 249         if (*c == 0xc2) {
 250                 return *(c+1) == 0x80;
 251         }
 252     } else if (len == 3) {
 253         if (*c == 0xE2) {
 254             c++;
 255             if (*c == 0x80) {
 256                 c++;
 257                 return (*c>=0x80 && *c<=0x8a);
 258             }
 259         } else if (*c == 0xE3) {
 260             return (*(c+1)==0x80) && (*(c+2)==0x80);
 261         } else if (*c==0xEF) {
 262             return (*(c+1)==0xBB) && (*(c+2)==0xBF);
 263         }
 264         return 0;
 265     }
 266
 267     /* should never reach here */
 268     return 0;
 269 }