1 #pragma ident "%Z%%M% %I% %E% SMI"
4 * The contents of this file are subject to the Netscape Public
5 * License Version 1.1 (the "License"); you may not use this file
6 * except in compliance with the License. You may obtain a copy of
7 * the License at http://www.mozilla.org/NPL/
9 * Software distributed under the License is distributed on an "AS
10 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11 * implied. See the License for the specific language governing
12 * rights and limitations under the License.
14 * The Original Code is Mozilla Communicator client code, released
17 * The Initial Developer of the Original Code is Netscape
18 * Communications Corporation. Portions created by Netscape are
19 * Copyright (C) 1998-1999 Netscape Communications Corporation. All
/* utf8.c - misc. UTF-8 "string" functions. */
/*
 * Sequence-length lookup table, indexed by the top six bits of a byte,
 * i.e. UTF8len[(byte >> 2) & 0x3F]:
 *   entries  0..31 (bytes 0x00-0x7F) -> 1  single-byte (ASCII) character
 *   entries 32..47 (bytes 0x80-0xBF) -> 0  continuation byte; cannot start
 *                                          a character
 *   entries 48..63 (bytes 0xC0-0xFF) -> 2..6  lead byte of a multi-byte
 *                                          sequence of that length
 * The table is only ever read, so it is declared const.
 */
static const char UTF8len[64] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6
};
36 ldap_utf8len (const char* s
)
37 /* Return the number of char's in the character at *s. */
39 return ldap_utf8next((char*)s
) - s
;
44 ldap_utf8next (char* s
)
45 /* Return a pointer to the character immediately following *s.
46 Handle any valid UTF-8 character, including '\0' and ASCII.
47 Try to handle a misaligned pointer or a malformed character.
50 register unsigned char* next
= (unsigned char*)s
;
51 switch (UTF8len
[(*next
>> 2) & 0x3F]) {
52 case 0: /* erroneous: s points to the middle of a character. */
53 case 6: if ((*++next
& 0xC0) != 0x80) break;
54 case 5: if ((*++next
& 0xC0) != 0x80) break;
55 case 4: if ((*++next
& 0xC0) != 0x80) break;
56 case 3: if ((*++next
& 0xC0) != 0x80) break;
57 case 2: if ((*++next
& 0xC0) != 0x80) break;
65 ldap_utf8prev (char* s
)
66 /* Return a pointer to the character immediately preceding *s.
67 Handle any valid UTF-8 character, including '\0' and ASCII.
68 Try to handle a misaligned pointer or a malformed character.
71 register unsigned char* prev
= (unsigned char*)s
;
72 unsigned char* limit
= prev
- 6;
73 while (((*--prev
& 0xC0) == 0x80) && (prev
!= limit
)) {
81 ldap_utf8copy (char* dst
, const char* src
)
82 /* Copy a character from src to dst; return the number of char's copied.
83 Handle any valid UTF-8 character, including '\0' and ASCII.
84 Try to handle a misaligned pointer or a malformed character.
87 register const unsigned char* s
= (const unsigned char*)src
;
88 switch (UTF8len
[(*s
>> 2) & 0x3F]) {
89 case 0: /* erroneous: s points to the middle of a character. */
90 case 6: *dst
++ = *s
++; if ((*s
& 0xC0) != 0x80) break;
91 case 5: *dst
++ = *s
++; if ((*s
& 0xC0) != 0x80) break;
92 case 4: *dst
++ = *s
++; if ((*s
& 0xC0) != 0x80) break;
93 case 3: *dst
++ = *s
++; if ((*s
& 0xC0) != 0x80) break;
94 case 2: *dst
++ = *s
++; if ((*s
& 0xC0) != 0x80) break;
97 return s
- (const unsigned char*)src
;
102 ldap_utf8characters (const char* src
)
103 /* Return the number of UTF-8 characters in the 0-terminated array s. */
105 register char* s
= (char*)src
;
107 for (n
= 0; *s
; LDAP_UTF8INC(s
)) ++n
;
111 unsigned long LDAP_CALL
112 ldap_utf8getcc( const char** src
)
114 register unsigned long c
;
115 register const unsigned char* s
= (const unsigned char*)*src
;
116 switch (UTF8len
[(*s
>> 2) & 0x3F]) {
117 case 0: /* erroneous: s points to the middle of a character. */
118 c
= (*s
++) & 0x3F; goto more5
;
119 case 1: c
= (*s
++); break;
120 case 2: c
= (*s
++) & 0x1F; goto more1
;
121 case 3: c
= (*s
++) & 0x0F; goto more2
;
122 case 4: c
= (*s
++) & 0x07; goto more3
;
123 case 5: c
= (*s
++) & 0x03; goto more4
;
124 case 6: c
= (*s
++) & 0x01; goto more5
;
125 more5
: if ((*s
& 0xC0) != 0x80) break; c
= (c
<< 6) | ((*s
++) & 0x3F);
126 more4
: if ((*s
& 0xC0) != 0x80) break; c
= (c
<< 6) | ((*s
++) & 0x3F);
127 more3
: if ((*s
& 0xC0) != 0x80) break; c
= (c
<< 6) | ((*s
++) & 0x3F);
128 more2
: if ((*s
& 0xC0) != 0x80) break; c
= (c
<< 6) | ((*s
++) & 0x3F);
129 more1
: if ((*s
& 0xC0) != 0x80) break; c
= (c
<< 6) | ((*s
++) & 0x3F);
132 *src
= (const char*)s
;
/*
 * ldap_utf8strtok_r: tokenizer over UTF-8 text.  Parameters: sp, the string
 * to scan; brk, a 0-terminated set of delimiter characters; next, a
 * caller-supplied pointer used to carry the scan position between calls.
 *
 * What the visible lines establish:
 *   - when sp is NULL the scan position is taken from *next;
 *   - characters are fetched from sp with LDAP_UTF8GETC and from brk with
 *     LDAP_UTF8GETCC (presumably decode-and-advance macros -- verify in the
 *     header), so delimiters are compared as whole characters, not bytes;
 *   - leading delimiters are skipped first; reaching the terminating 0
 *     there means the input holds no token;
 *   - the token start is recorded as LDAP_UTF8PREV(sp), i.e. the character
 *     before the position reached after the fetch;
 *   - the scan then looks for a character equal to one from brk, and a
 *     0 byte is stored at LDAP_UTF8PREV(sp).
 *
 * NOTE(review): this listing has lost lines (the embedded numbers jump,
 * e.g. 150 to 155 and 168 to 173, and nothing follows 173).  The opening
 * brace, the declarations of bp and tok, the loop bodies, every return
 * statement and the closing brace are not visible, and the comment opened
 * at listing line 161 has lost its terminator.  Restore them from the
 * pristine file; return values and updates of *next are deliberately not
 * documented here because they cannot be seen.
 */
138 ldap_utf8strtok_r( char* sp
, const char* brk
, char** next
)
141 unsigned long sc
, bc
;
144 if (sp
== NULL
&& (sp
= *next
) == NULL
)
147 /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
149 sc
= LDAP_UTF8GETC(sp
);
150 for (bp
= brk
; (bc
= LDAP_UTF8GETCC(bp
)) != 0;) {
155 if (sc
== 0) { /* no non-delimiter characters */
159 tok
= LDAP_UTF8PREV(sp
);
161 /* Scan token; roughly, sp += strcspn(sp, brk)
162 * Note that brk must be 0-terminated; we stop if we see that, too.
165 sc
= LDAP_UTF8GETC(sp
);
168 if ((bc
= LDAP_UTF8GETCC(bp
)) == sc
) {
173 *(LDAP_UTF8PREV(sp
)) = 0;
184 ldap_utf8isalnum( char* s
)
186 register unsigned char c
= *(unsigned char*)s
;
187 if (0x80 & c
) return 0;
188 if (c
>= 'A' && c
<= 'Z') return 1;
189 if (c
>= 'a' && c
<= 'z') return 1;
190 if (c
>= '0' && c
<= '9') return 1;
196 ldap_utf8isalpha( char* s
)
198 register unsigned char c
= *(unsigned char*)s
;
199 if (0x80 & c
) return 0;
200 if (c
>= 'A' && c
<= 'Z') return 1;
201 if (c
>= 'a' && c
<= 'z') return 1;
207 ldap_utf8isdigit( char* s
)
209 register unsigned char c
= *(unsigned char*)s
;
210 if (0x80 & c
) return 0;
211 if (c
>= '0' && c
<= '9') return 1;
217 ldap_utf8isxdigit( char* s
)
219 register unsigned char c
= *(unsigned char*)s
;
220 if (0x80 & c
) return 0;
221 if (c
>= '0' && c
<= '9') return 1;
222 if (c
>= 'A' && c
<= 'F') return 1;
223 if (c
>= 'a' && c
<= 'f') return 1;
/*
 * ldap_utf8isspace: classify the UTF-8 character at *s as white space.
 * The byte length of the character is obtained from ldap_utf8len(s) and
 * the test is chosen by that length.
 *
 * What the visible lines establish:
 *   - length 2: the result is whether the second byte is 0x80 (the check
 *     on the first byte is not visible);
 *   - length 3: one path returns whether a byte lies in 0x80..0x8A (the
 *     guards selecting that path are not visible); first byte 0xE3 matches
 *     only E3 80 80 (which decodes to U+3000); first byte 0xEF matches only
 *     EF BB BF (which decodes to U+FEFF).
 *
 * NOTE(review): this listing has lost lines (the embedded numbers jump,
 * e.g. 232 to 236 and 236 to 248).  The return type, the opening brace, the
 * first branch of the if chain, the whole single-byte (length 1) test,
 * several guards and everything after the final comment (including the
 * closing brace) are not visible.  Restore them from the pristine file
 * before building.
 */
229 ldap_utf8isspace( char* s
)
231 register unsigned char *c
= (unsigned char*)s
;
232 int len
= ldap_utf8len(s
);
236 } else if (len
== 1) {
248 } else if (len
== 2) {
250 return *(c
+1) == 0x80;
252 } else if (len
== 3) {
257 return (*c
>=0x80 && *c
<=0x8a);
259 } else if (*c
== 0xE3) {
260 return (*(c
+1)==0x80) && (*(c
+2)==0x80);
261 } else if (*c
==0xEF) {
262 return (*(c
+1)==0xBB) && (*(c
+2)==0xBF);
267 /* should never reach here */