1 /* utf-8.c -- Basic UTF-8 routines */
2 /* $OpenLDAP: pkg/ldap/libraries/libldap/utf-8.c,v 1.36.2.3 2008/02/11 23:26:41 kurt Exp $ */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 * Copyright 1998-2008 The OpenLDAP Foundation.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
16 /* Basic UTF-8 routines
18 * These routines are "dumb". Though they understand UTF-8,
19 * they don't grok Unicode. That is, they can push bits,
20 * but don't have a clue what the bits represent. That's
21 * good enough for use with the LDAP Client SDK.
23 * These routines are not optimized.
30 #include <ac/stdlib.h>
32 #include <ac/socket.h>
33 #include <ac/string.h>
36 #include "ldap_utf8.h"
39 #include "ldap_defaults.h"
42 * return the number of bytes required to hold the
43 * NULL-terminated UTF-8 string NOT INCLUDING the
46 ber_len_t
ldap_utf8_bytes( const char * p
)
50 for( bytes
=0; p
[bytes
]; bytes
++ ) {
57 ber_len_t
ldap_utf8_chars( const char * p
)
59 /* could be optimized and could check for invalid sequences */
62 for( ; *p
; LDAP_UTF8_INCR(p
) ) {
/*
 * Distance, in bytes, from p to the position that LDAP_UTF8_NEXT()
 * reports for p.
 */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
/*
 * Returns length indicated by first byte.
 *
 * The table holds one entry per octet with the high bit set and is
 * indexed by (octet ^ 0x80), so entry 0 describes octet 0x80 and
 * entry 127 describes octet 0xff.  An entry of 0 means the octet
 * does not indicate a length as a first byte.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 - 0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
88 int ldap_utf8_charlen( const char * p
)
93 return ldap_utf8_lentab
[*(const unsigned char *)p
^ 0x80];
97 * Make sure the UTF-8 char used the shortest possible encoding
98 * returns charlen if valid, 0 if not.
100 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
101 * The table is slightly modified from that of the RFC.
103 * UCS-4 range (hex) UTF-8 sequence (binary)
104 * 0000 0000-0000 007F 0.......
105 * 0000 0080-0000 07FF 110++++. 10......
106 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
107 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
108 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
109 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
111 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
112 * at least one of the '+' bits must be set, otherwise the character
113 * should have been encoded in fewer octets. Note that in the two-octet
114 * case, only the first octet needs to be validated, and this is done
115 * in the ldap_utf8_lentab[] above.
118 /* mask of required bits in second octet */
121 c ldap_utf8_mintab
[] = {
122 (c
)0x20, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80,
123 (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80,
124 (c
)0x30, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x80,
125 (c
)0x38, (c
)0x80, (c
)0x80, (c
)0x80, (c
)0x3c, (c
)0x80, (c
)0x00, (c
)0x00 };
128 int ldap_utf8_charlen2( const char * p
)
130 int i
= LDAP_UTF8_CHARLEN( p
);
133 if ( !( ldap_utf8_mintab
[*p
& 0x1f] & p
[1] ) )
139 /* conv UTF-8 to UCS-4, useful for comparisons */
140 ldap_ucs4_t
ldap_x_utf8_to_ucs4( const char * p
)
142 const unsigned char *c
= (const unsigned char *) p
;
145 static unsigned char mask
[] = {
146 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
148 len
= LDAP_UTF8_CHARLEN2(p
, len
);
150 if( len
== 0 ) return LDAP_UCS4_INVALID
;
152 ch
= c
[0] & mask
[len
];
154 for(i
=1; i
< len
; i
++) {
155 if ((c
[i
] & 0xc0) != 0x80) {
156 return LDAP_UCS4_INVALID
;
166 /* conv UCS-4 to UTF-8, not used */
167 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c
, char *buf
)
170 unsigned char* p
= (unsigned char *) buf
;
172 /* not a valid Unicode character */
173 if ( c
< 0 ) return 0;
175 /* Just return length, don't convert */
177 if( c
< 0x80 ) return 1;
178 else if( c
< 0x800 ) return 2;
179 else if( c
< 0x10000 ) return 3;
180 else if( c
< 0x200000 ) return 4;
181 else if( c
< 0x4000000 ) return 5;
188 } else if( c
< 0x800 ) {
189 p
[len
++] = 0xc0 | ( c
>> 6 );
190 p
[len
++] = 0x80 | ( c
& 0x3f );
192 } else if( c
< 0x10000 ) {
193 p
[len
++] = 0xe0 | ( c
>> 12 );
194 p
[len
++] = 0x80 | ( (c
>> 6) & 0x3f );
195 p
[len
++] = 0x80 | ( c
& 0x3f );
197 } else if( c
< 0x200000 ) {
198 p
[len
++] = 0xf0 | ( c
>> 18 );
199 p
[len
++] = 0x80 | ( (c
>> 12) & 0x3f );
200 p
[len
++] = 0x80 | ( (c
>> 6) & 0x3f );
201 p
[len
++] = 0x80 | ( c
& 0x3f );
203 } else if( c
< 0x4000000 ) {
204 p
[len
++] = 0xf8 | ( c
>> 24 );
205 p
[len
++] = 0x80 | ( (c
>> 18) & 0x3f );
206 p
[len
++] = 0x80 | ( (c
>> 12) & 0x3f );
207 p
[len
++] = 0x80 | ( (c
>> 6) & 0x3f );
208 p
[len
++] = 0x80 | ( c
& 0x3f );
210 } else /* if( c < 0x80000000 ) */ {
211 p
[len
++] = 0xfc | ( c
>> 30 );
212 p
[len
++] = 0x80 | ( (c
>> 24) & 0x3f );
213 p
[len
++] = 0x80 | ( (c
>> 18) & 0x3f );
214 p
[len
++] = 0x80 | ( (c
>> 12) & 0x3f );
215 p
[len
++] = 0x80 | ( (c
>> 6) & 0x3f );
216 p
[len
++] = 0x80 | ( c
& 0x3f );
/*
 * Number of octets needed to encode the UCS-4 value c as UTF-8:
 * 0 when c is negative, otherwise 1 to 6 depending on magnitude.
 *
 * Both the argument and the whole expansion are parenthesized so the
 * macro can be given an arbitrary expression and can be embedded in a
 * larger expression without precedence surprises.  The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
226 /* Convert a string to UTF-8 format. The input string is expected to
227 * have characters of 1, 2, or 4 octets (in network byte order)
228 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229 * types respectively. (Here T61STRING just means that there is one
230 * octet per character and characters may use the high bit of the octet.
231 * The characters are assumed to use ISO mappings, no provision is made
232 * for converting from T.61 coding rules to Unicode.)
236 ldap_ucs_to_utf8s( struct berval
*ucs
, int csize
, struct berval
*utf8s
)
238 unsigned char *in
, *end
;
243 utf8s
->bv_val
= NULL
;
246 in
= (unsigned char *)ucs
->bv_val
;
248 /* Make sure we stop at an even multiple of csize */
249 end
= in
+ ( ucs
->bv_len
& ~(csize
-1) );
263 i
= LDAP_UCS_UTF8LEN(u
);
265 return LDAP_INVALID_SYNTAX
;
269 utf8s
->bv_val
= LDAP_MALLOC( l
+1 );
270 if (utf8s
->bv_val
== NULL
)
271 return LDAP_NO_MEMORY
;
275 for (in
= (unsigned char *)ucs
->bv_val
; in
< end
; ) {
287 ptr
+= ldap_x_ucs4_to_utf8(u
, ptr
);
294 * Advance to the next UTF-8 character
296 * Ignores length of multibyte character, instead rely on
297 * continuation markers to find start of next character.
298 * This allows for "resyncing" when invalid characters
299 * are provided, as long as the start of the next character
300 * appears within the 6 bytes examined.
302 char* ldap_utf8_next( const char * p
)
305 const unsigned char *u
= (const unsigned char *) p
;
307 if( LDAP_UTF8_ISASCII(u
) ) {
308 return (char *) &p
[1];
311 for( i
=1; i
<6; i
++ ) {
312 if ( ( u
[i
] & 0xc0 ) != 0x80 ) {
313 return (char *) &p
[i
];
317 return (char *) &p
[i
];
321 * Step back to the previous UTF-8 character
323 * Ignores length of multibyte character, instead rely on
324 * continuation markers to find start of the previous character.
325 * This allows for "resyncing" when invalid characters
326 * are provided, as long as the start of the previous character
327 * appears within the 6 bytes examined.
329 char* ldap_utf8_prev( const char * p
)
332 const unsigned char *u
= (const unsigned char *) p
;
334 for( i
=-1; i
>-6 ; i
-- ) {
335 if ( ( u
[i
] & 0xc0 ) != 0x80 ) {
336 return (char *) &p
[i
];
340 return (char *) &p
[i
];
344 * Copy one UTF-8 character from src to dst returning
345 * number of bytes copied.
347 * Ignores length of multibyte character, instead rely on
348 * continuation markers to find start of next character.
349 * This allows for "resyncing" when invalid characters
350 * are provided, as long as the start of the next character
351 * appears within the 6 bytes examined.
353 int ldap_utf8_copy( char* dst
, const char *src
)
356 const unsigned char *u
= (const unsigned char *) src
;
360 if( LDAP_UTF8_ISASCII(u
) ) {
364 for( i
=1; i
<6; i
++ ) {
365 if ( ( u
[i
] & 0xc0 ) != 0x80 ) {
374 #ifndef UTF8_ALPHA_CTYPE
376 * UTF-8 ctype routines
377 * Only deals with characters < 0x80 (ie: US-ASCII)
/* Apply LDAP_ASCII() to the byte that p points at. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
/*
 * Classify the byte at p with LDAP_DIGIT().
 * A byte rejected by LDAP_ASCII() always yields 0.
 */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_DIGIT( octet );
	}

	return 0;
}
395 int ldap_utf8_isxdigit( const char * p
)
397 unsigned c
= * (const unsigned char *) p
;
399 if(!LDAP_ASCII(c
)) return 0;
404 int ldap_utf8_isspace( const char * p
)
406 unsigned c
= * (const unsigned char *) p
;
408 if(!LDAP_ASCII(c
)) return 0;
424 * These are not needed by the C SDK and are
425 * not "good enough" for general use.
/*
 * Classify the byte at p with LDAP_ALPHA().
 * A byte rejected by LDAP_ASCII() always yields 0.
 */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALPHA(octet);
	}

	return 0;
}
/*
 * Classify the byte at p with LDAP_ALNUM().
 * A byte rejected by LDAP_ASCII() always yields 0.
 */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALNUM(octet);
	}

	return 0;
}
/*
 * Classify the byte at p with LDAP_LOWER().
 * A byte rejected by LDAP_ASCII() always yields 0.
 */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_LOWER(octet);
	}

	return 0;
}
/*
 * Classify the byte at p with LDAP_UPPER().
 * A byte rejected by LDAP_ASCII() always yields 0.
 */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_UPPER(octet);
	}

	return 0;
}
466 * UTF-8 string routines
470 char * (ldap_utf8_strchr
)( const char *str
, const char *chr
)
472 for( ; *str
!= '\0'; LDAP_UTF8_INCR(str
) ) {
473 if( ldap_x_utf8_to_ucs4( str
) == ldap_x_utf8_to_ucs4( chr
) ) {
481 /* like strcspn() but returns number of bytes, not characters */
482 ber_len_t (ldap_utf8_strcspn
)( const char *str
, const char *set
)
487 for( cstr
= str
; *cstr
!= '\0'; LDAP_UTF8_INCR(cstr
) ) {
488 for( cset
= set
; *cset
!= '\0'; LDAP_UTF8_INCR(cset
) ) {
489 if( ldap_x_utf8_to_ucs4( cstr
) == ldap_x_utf8_to_ucs4( cset
) ) {
498 /* like strspn() but returns number of bytes, not characters */
499 ber_len_t (ldap_utf8_strspn
)( const char *str
, const char *set
)
504 for( cstr
= str
; *cstr
!= '\0'; LDAP_UTF8_INCR(cstr
) ) {
505 for( cset
= set
; ; LDAP_UTF8_INCR(cset
) ) {
506 if( *cset
== '\0' ) {
510 if( ldap_x_utf8_to_ucs4( cstr
) == ldap_x_utf8_to_ucs4( cset
) ) {
519 /* like strpbrk(), replaces strchr() as well */
520 char *(ldap_utf8_strpbrk
)( const char *str
, const char *set
)
522 for( ; *str
!= '\0'; LDAP_UTF8_INCR(str
) ) {
525 for( cset
= set
; *cset
!= '\0'; LDAP_UTF8_INCR(cset
) ) {
526 if( ldap_x_utf8_to_ucs4( str
) == ldap_x_utf8_to_ucs4( cset
) ) {
535 /* like strtok_r(), not strtok() */
536 char *(ldap_utf8_strtok
)(char *str
, const char *sep
, char **last
)
541 if( last
== NULL
) return NULL
;
543 begin
= str
? str
: *last
;
545 begin
+= ldap_utf8_strspn( begin
, sep
);
547 if( *begin
== '\0' ) {
552 end
= &begin
[ ldap_utf8_strcspn( begin
, sep
) ];
555 char *next
= LDAP_UTF8_NEXT( end
);