No empty .Rs/.Re
[netbsd-mini2440.git] / external / bsd / openldap / dist / libraries / libldap / utf-8.c
blob650c9cc792fc7fe4ead9a78475140d1a43ec1b20
1 /* utf-8.c -- Basic UTF-8 routines */
2 /* $OpenLDAP: pkg/ldap/libraries/libldap/utf-8.c,v 1.36.2.3 2008/02/11 23:26:41 kurt Exp $ */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 * Copyright 1998-2008 The OpenLDAP Foundation.
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
10 * Public License.
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
16 /* Basic UTF-8 routines
18 * These routines are "dumb". Though they understand UTF-8,
19 * they don't grok Unicode. That is, they can push bits,
20 * but don't have a clue what the bits represent. That's
21 * good enough for use with the LDAP Client SDK.
23 * These routines are not optimized.
26 #include "portable.h"
28 #include <stdio.h>
30 #include <ac/stdlib.h>
32 #include <ac/socket.h>
33 #include <ac/string.h>
34 #include <ac/time.h>
36 #include "ldap_utf8.h"
38 #include "ldap-int.h"
39 #include "ldap_defaults.h"
42 * return the number of bytes required to hold the
43 * NULL-terminated UTF-8 string NOT INCLUDING the
44 * termination.
46 ber_len_t ldap_utf8_bytes( const char * p )
48 ber_len_t bytes;
50 for( bytes=0; p[bytes]; bytes++ ) {
51 /* EMPTY */ ;
54 return bytes;
57 ber_len_t ldap_utf8_chars( const char * p )
59 /* could be optimized and could check for invalid sequences */
60 ber_len_t chars=0;
62 for( ; *p ; LDAP_UTF8_INCR(p) ) {
63 chars++;
66 return chars;
/* Return the byte offset from p to the start of the next character. */
int ldap_utf8_offset( const char * p )
{
	const char *next = LDAP_UTF8_NEXT(p);

	return next - p;
}
/*
 * Sequence length indicated by the first octet.
 *
 * ldap_utf8_lentab[] is indexed by (first octet - 0x80) and therefore
 * only covers octets with the high bit set.  An entry of 0 marks an
 * octet that cannot start a sequence: continuation octets (0x80-0xBF),
 * the two-octet leads 0xC0/0xC1 (always over-long), and 0xFE/0xFF.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xDF */
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 - 0xEF */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0 - 0xFF */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Return the length in octets of the character starting at p as
 * announced by its first octet: 1 for any 7-bit octet (including
 * NUL), 2-6 for a valid lead octet, 0 for an octet that cannot
 * start a character.  Only the first octet is examined.
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *) p;

	if ( lead < 0x80 ) {
		return 1;
	}

	/* high bit is set, so (lead - 0x80) is in 0..127 */
	return ldap_utf8_lentab[lead - 0x80];
}
97 * Make sure the UTF-8 char used the shortest possible encoding
98 * returns charlen if valid, 0 if not.
100 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
101 * The table is slightly modified from that of the RFC.
103 * UCS-4 range (hex) UTF-8 sequence (binary)
104 * 0000 0000-0000 007F 0.......
105 * 0000 0080-0000 07FF 110++++. 10......
106 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
107 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
108 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
109 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
111 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
112 * at least one of the '+' bits must be set, otherwise the character
113 * should have been encoded in fewer octets. Note that in the two-octet
114 * case, only the first octet needs to be validated, and this is done
115 * in the ldap_utf8_lentab[] above.
/*
 * Mask of required bits in the second octet, indexed by the low
 * five bits of the first octet.  For the smallest lead octet of each
 * sequence length the second octet must carry at least one of the
 * masked bits; otherwise the character was encoded in more octets
 * than necessary.  0x80 is set in every continuation octet.
 */
const char ldap_utf8_mintab[] = {
	/* 0xE0 - 0xEF */
	(char)0x20, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0xF0 - 0xF7 */
	(char)0x30, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0xF8 - 0xFF */
	(char)0x38, (char)0x80, (char)0x80, (char)0x80,
	(char)0x3c, (char)0x80, (char)0x00, (char)0x00 };
128 int ldap_utf8_charlen2( const char * p )
130 int i = LDAP_UTF8_CHARLEN( p );
132 if ( i > 2 ) {
133 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
134 i = 0;
136 return i;
139 /* conv UTF-8 to UCS-4, useful for comparisons */
140 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
142 const unsigned char *c = (const unsigned char *) p;
143 ldap_ucs4_t ch;
144 int len, i;
145 static unsigned char mask[] = {
146 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
148 len = LDAP_UTF8_CHARLEN2(p, len);
150 if( len == 0 ) return LDAP_UCS4_INVALID;
152 ch = c[0] & mask[len];
154 for(i=1; i < len; i++) {
155 if ((c[i] & 0xc0) != 0x80) {
156 return LDAP_UCS4_INVALID;
159 ch <<= 6;
160 ch |= c[i] & 0x3f;
163 return ch;
166 /* conv UCS-4 to UTF-8, not used */
167 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
169 int len=0;
170 unsigned char* p = (unsigned char *) buf;
172 /* not a valid Unicode character */
173 if ( c < 0 ) return 0;
175 /* Just return length, don't convert */
176 if(buf == NULL) {
177 if( c < 0x80 ) return 1;
178 else if( c < 0x800 ) return 2;
179 else if( c < 0x10000 ) return 3;
180 else if( c < 0x200000 ) return 4;
181 else if( c < 0x4000000 ) return 5;
182 else return 6;
185 if( c < 0x80 ) {
186 p[len++] = c;
188 } else if( c < 0x800 ) {
189 p[len++] = 0xc0 | ( c >> 6 );
190 p[len++] = 0x80 | ( c & 0x3f );
192 } else if( c < 0x10000 ) {
193 p[len++] = 0xe0 | ( c >> 12 );
194 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195 p[len++] = 0x80 | ( c & 0x3f );
197 } else if( c < 0x200000 ) {
198 p[len++] = 0xf0 | ( c >> 18 );
199 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
200 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
201 p[len++] = 0x80 | ( c & 0x3f );
203 } else if( c < 0x4000000 ) {
204 p[len++] = 0xf8 | ( c >> 24 );
205 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
206 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
207 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
208 p[len++] = 0x80 | ( c & 0x3f );
210 } else /* if( c < 0x80000000 ) */ {
211 p[len++] = 0xfc | ( c >> 30 );
212 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
213 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
214 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
215 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
216 p[len++] = 0x80 | ( c & 0x3f );
219 return len;
/*
 * Number of octets needed to encode UCS-4 value c in UTF-8:
 * 0 for a negative value, otherwise 1-6.
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * NOTE: c is evaluated more than once; do not pass an expression with
 * side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
226 /* Convert a string to UTF-8 format. The input string is expected to
227 * have characters of 1, 2, or 4 octets (in network byte order)
228 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229 * types respectively. (Here T61STRING just means that there is one
230 * octet per character and characters may use the high bit of the octet.
231 * The characters are assumed to use ISO mappings, no provision is made
232 * for converting from T.61 coding rules to Unicode.)
236 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
238 unsigned char *in, *end;
239 char *ptr;
240 ldap_ucs4_t u;
241 int i, l = 0;
243 utf8s->bv_val = NULL;
244 utf8s->bv_len = 0;
246 in = (unsigned char *)ucs->bv_val;
248 /* Make sure we stop at an even multiple of csize */
249 end = in + ( ucs->bv_len & ~(csize-1) );
251 for (; in < end; ) {
252 u = *in++;
253 if (csize > 1) {
254 u <<= 8;
255 u |= *in++;
257 if (csize > 2) {
258 u <<= 8;
259 u |= *in++;
260 u <<= 8;
261 u |= *in++;
263 i = LDAP_UCS_UTF8LEN(u);
264 if (i == 0)
265 return LDAP_INVALID_SYNTAX;
266 l += i;
269 utf8s->bv_val = LDAP_MALLOC( l+1 );
270 if (utf8s->bv_val == NULL)
271 return LDAP_NO_MEMORY;
272 utf8s->bv_len = l;
274 ptr = utf8s->bv_val;
275 for (in = (unsigned char *)ucs->bv_val; in < end; ) {
276 u = *in++;
277 if (csize > 1) {
278 u <<= 8;
279 u |= *in++;
281 if (csize > 2) {
282 u <<= 8;
283 u |= *in++;
284 u <<= 8;
285 u |= *in++;
287 ptr += ldap_x_ucs4_to_utf8(u, ptr);
289 *ptr = '\0';
290 return LDAP_SUCCESS;
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length announced by the first octet and instead
 * relies on continuation markers to find the start of the next
 * character.  This allows for "resyncing" when invalid characters
 * are provided, as long as the start of the next character
 * appears within the 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int n = 1;

	if( !LDAP_UTF8_ISASCII(u) ) {
		/* skip at most five continuation octets (10xxxxxx) */
		while ( n < 6 && ( u[n] & 0xc0 ) == 0x80 ) {
			n++;
		}
	}

	return (char *) &p[n];
}
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies
 * on continuation markers to find the start of the previous
 * character.  This allows for "resyncing" when invalid characters
 * are provided, as long as a non-continuation octet appears within
 * the bytes examined.  At least one and at most six bytes before p
 * are stepped over; the caller must ensure they are readable.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int back = -1;

	/* walk backwards over continuation octets (10xxxxxx) */
	while ( back > -6 && ( u[back] & 0xc0 ) == 0x80 ) {
		back--;
	}

	return (char *) &p[back];
}
/*
 * Copy one UTF-8 character from src to dst, returning the
 * number of bytes copied.  No terminating NUL is written.
 *
 * Ignores the length announced by the first octet and instead
 * relies on continuation markers to find the end of the character.
 * This allows for "resyncing" when invalid characters are provided,
 * as long as the start of the next character appears within the
 * 6 bytes examined.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *u = (const unsigned char *) src;
	int n = 1;

	/* the first octet is always copied */
	dst[0] = src[0];

	if( !LDAP_UTF8_ISASCII(u) ) {
		/* copy up to five trailing continuation octets */
		while ( n < 6 && ( u[n] & 0xc0 ) == 0x80 ) {
			dst[n] = src[n];
			n++;
		}
	}

	return n;
}
374 #ifndef UTF8_ALPHA_CTYPE
376 * UTF-8 ctype routines
377 * Only deals with characters < 0x80 (ie: US-ASCII)
/* Result of LDAP_ASCII() applied to the first octet at p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
/* 0 unless the first octet at p is ASCII; otherwise LDAP_DIGIT() of it. */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_DIGIT( octet ) : 0;
}
/* 0 unless the first octet at p is ASCII; otherwise LDAP_HEX() of it. */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_HEX(octet) : 0;
}
/*
 * Return 1 if the first octet at p is an ASCII white-space character
 * (space, \t, \n, \r, \v or \f), 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( !LDAP_ASCII(octet) ) return 0;

	return octet == ' '  || octet == '\t' || octet == '\n'
		|| octet == '\r' || octet == '\v' || octet == '\f';
}
424 * These are not needed by the C SDK and are
425 * not "good enough" for general use.
/* 0 unless the first octet at p is ASCII; otherwise LDAP_ALPHA() of it. */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_ALPHA(octet) : 0;
}
/* 0 unless the first octet at p is ASCII; otherwise LDAP_ALNUM() of it. */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_ALNUM(octet) : 0;
}
/* 0 unless the first octet at p is ASCII; otherwise LDAP_LOWER() of it. */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_LOWER(octet) : 0;
}
/* 0 unless the first octet at p is ASCII; otherwise LDAP_UPPER() of it. */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_UPPER(octet) : 0;
}
462 #endif
466 * UTF-8 string routines
469 /* like strchr() */
470 char * (ldap_utf8_strchr)( const char *str, const char *chr )
472 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
473 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
474 return (char *) str;
478 return NULL;
481 /* like strcspn() but returns number of bytes, not characters */
482 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
484 const char *cstr;
485 const char *cset;
487 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
488 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
489 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
490 return cstr - str;
495 return cstr - str;
498 /* like strspn() but returns number of bytes, not characters */
499 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
501 const char *cstr;
502 const char *cset;
504 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
505 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
506 if( *cset == '\0' ) {
507 return cstr - str;
510 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
511 break;
516 return cstr - str;
519 /* like strpbrk(), replaces strchr() as well */
520 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
522 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
523 const char *cset;
525 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
526 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
527 return (char *) str;
532 return NULL;
535 /* like strtok_r(), not strtok() */
536 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
538 char *begin;
539 char *end;
541 if( last == NULL ) return NULL;
543 begin = str ? str : *last;
545 begin += ldap_utf8_strspn( begin, sep );
547 if( *begin == '\0' ) {
548 *last = NULL;
549 return NULL;
552 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
554 if( *end != '\0' ) {
555 char *next = LDAP_UTF8_NEXT( end );
556 *end = '\0';
557 end = next;
560 *last = end;
561 return begin;