1 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucstr.c,v 1.37.2.4 2008/04/14 19:12:11 quanah Exp $ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 * Copyright 1998-2008 The OpenLDAP Foundation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
20 #include <ac/string.h>
21 #include <ac/stdlib.h>
25 #include <ldap_utf8.h>
26 #include <ldap_pvt_uc.h>
/* Map the standard allocator names onto the ber_mem*_x routines.
 * Every expansion passes `ctx` as the trailing argument, so an
 * identifier named ctx must be in scope at each point in this file
 * where malloc(), realloc() or free() is written. */
28 #define malloc(x) ber_memalloc_x(x,ctx)
29 #define realloc(x,y) ber_memrealloc_x(x,y,ctx)
30 #define free(x) ber_memfree_x(x,ctx)
33 const ldap_unicode_t
*u1
,
34 const ldap_unicode_t
*u2
,
37 for(; 0 < n
; ++u1
, ++u2
, --n
) {
39 return *u1
< *u2
? -1 : +1;
49 const ldap_unicode_t
*u1
,
50 const ldap_unicode_t
*u2
,
53 for(; 0 < n
; ++u1
, ++u2
, --n
) {
54 ldap_unicode_t uu1
= uctolower( *u1
);
55 ldap_unicode_t uu2
= uctolower( *u2
);
58 return uu1
< uu2
? -1 : +1;
67 ldap_unicode_t
* ucstrnchr(
68 const ldap_unicode_t
*u
,
72 for(; 0 < n
; ++u
, --n
) {
74 return (ldap_unicode_t
*) u
;
81 ldap_unicode_t
* ucstrncasechr(
82 const ldap_unicode_t
*u
,
87 for(; 0 < n
; ++u
, --n
) {
88 if( uctolower( *u
) == c
) {
89 return (ldap_unicode_t
*) u
;
100 for(; 0 < n
; ++u
, --n
) {
101 *u
= uctoupper( *u
);
105 struct berval
* UTF8bvnormalize(
107 struct berval
*newbv
,
111 int i
, j
, len
, clen
, outpos
, ucsoutlen
, outsize
, last
;
112 char *out
, *outtmp
, *s
;
113 ac_uint4
*ucs
, *p
, *ucsout
;
115 static unsigned char mask
[] = {
116 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
118 unsigned casefold
= flags
& LDAP_UTF8_CASEFOLD
;
119 unsigned approx
= flags
& LDAP_UTF8_APPROX
;
129 return ber_dupbv_x( newbv
, bv
, ctx
);
133 newbv
= ber_memalloc_x( sizeof(struct berval
), ctx
);
134 if ( !newbv
) return NULL
;
137 /* Should first check to see if string is already in proper
138 * normalized form. This is almost as time consuming as
139 * the normalization though.
142 /* finish off everything up to character before first non-ascii */
143 if ( LDAP_UTF8_ISASCII( s
) ) {
146 out
= (char *) ber_memalloc_x( outsize
, ctx
);
152 for ( i
= 1; (i
< len
) && LDAP_UTF8_ISASCII(s
+ i
); i
++ ) {
153 out
[outpos
++] = TOLOWER( s
[i
-1] );
156 out
[outpos
++] = TOLOWER( s
[len
-1] );
159 newbv
->bv_len
= outpos
;
163 for ( i
= 1; (i
< len
) && LDAP_UTF8_ISASCII(s
+ i
); i
++ ) {
168 return ber_str2bv_x( s
, len
, 1, newbv
, ctx
);
172 out
= (char *) ber_memalloc_x( outsize
, ctx
);
177 memcpy(out
, s
, outpos
);
181 out
= (char *) ber_memalloc_x( outsize
, ctx
);
189 p
= ucs
= ber_memalloc_x( len
* sizeof(*ucs
), ctx
);
191 ber_memfree_x(out
, ctx
);
195 /* convert character before first non-ascii to ucs-4 */
197 *p
= casefold
? TOLOWER( s
[i
-1] ) : s
[i
-1];
201 /* s[i] is now first non-ascii character */
203 /* s[i] is non-ascii */
204 /* convert everything up to next ascii to ucs-4 */
206 clen
= LDAP_UTF8_CHARLEN2( s
+ i
, clen
);
208 ber_memfree_x( ucs
, ctx
);
209 ber_memfree_x( out
, ctx
);
216 *p
= s
[i
] & mask
[clen
];
218 for( j
= 1; j
< clen
; j
++ ) {
219 if ( (s
[i
] & 0xc0) != 0x80 ) {
220 ber_memfree_x( ucs
, ctx
);
221 ber_memfree_x( out
, ctx
);
229 *p
= uctolower( *p
);
233 /* normalize ucs of length p - ucs */
234 uccompatdecomp( ucs
, p
- ucs
, &ucsout
, &ucsoutlen
, ctx
);
236 for ( j
= 0; j
< ucsoutlen
; j
++ ) {
237 if ( ucsout
[j
] < 0x80 ) {
238 out
[outpos
++] = ucsout
[j
];
242 ucsoutlen
= uccanoncomp( ucsout
, ucsoutlen
);
243 /* convert ucs to utf-8 and store in out */
244 for ( j
= 0; j
< ucsoutlen
; j
++ ) {
245 /* allocate more space if not enough room for
246 6 bytes and terminator */
247 if ( outsize
- outpos
< 7 ) {
248 outsize
= ucsoutlen
- j
+ outpos
+ 6;
249 outtmp
= (char *) ber_memrealloc_x( out
, outsize
, ctx
);
250 if ( outtmp
== NULL
) {
251 ber_memfree_x( ucsout
, ctx
);
252 ber_memfree_x( ucs
, ctx
);
253 ber_memfree_x( out
, ctx
);
258 outpos
+= ldap_x_ucs4_to_utf8( ucsout
[j
], &out
[outpos
] );
262 ber_memfree_x( ucsout
, ctx
);
271 /* Allocate more space in out if necessary */
272 if (len
- i
>= outsize
- outpos
) {
273 outsize
+= 1 + ((len
- i
) - (outsize
- outpos
));
274 outtmp
= (char *) ber_memrealloc_x(out
, outsize
, ctx
);
275 if (outtmp
== NULL
) {
276 ber_memfree_x( ucs
, ctx
);
277 ber_memfree_x( out
, ctx
);
284 /* finish off everything up to char before next non-ascii */
285 for ( i
++; (i
< len
) && LDAP_UTF8_ISASCII(s
+ i
); i
++ ) {
286 out
[outpos
++] = casefold
? TOLOWER( s
[i
-1] ) : s
[i
-1];
289 out
[outpos
++] = casefold
? TOLOWER( s
[len
-1] ) : s
[len
-1];
293 /* convert character before next non-ascii to ucs-4 */
294 *ucs
= casefold
? TOLOWER( s
[i
-1] ) : s
[i
-1];
298 ber_memfree_x( ucs
, ctx
);
301 newbv
->bv_len
= outpos
;
305 /* compare UTF8-strings, optionally ignore casing */
306 /* slow, should be optimized */
313 int i
, l1
, l2
, len
, ulen
, res
= 0;
314 char *s1
, *s2
, *done
;
315 ac_uint4
*ucs
, *ucsout1
, *ucsout2
;
317 unsigned casefold
= flags
& LDAP_UTF8_CASEFOLD
;
318 unsigned norm1
= flags
& LDAP_UTF8_ARG1NFC
;
319 unsigned norm2
= flags
& LDAP_UTF8_ARG2NFC
;
322 return bv2
== NULL
? 0 : -1;
324 } else if (bv2
== NULL
) {
331 len
= (l1
< l2
) ? l1
: l2
;
333 return l1
== 0 ? (l2
== 0 ? 0 : -1) : 1;
340 while ( (s1
< done
) && LDAP_UTF8_ISASCII(s1
) && LDAP_UTF8_ISASCII(s2
) ) {
342 char c1
= TOLOWER(*s1
);
343 char c2
= TOLOWER(*s2
);
351 /* done unless next character in s1 or s2 is non-ascii */
353 if (!LDAP_UTF8_ISASCII(s1
) || !LDAP_UTF8_ISASCII(s2
)) {
356 } else if (((len
< l1
) && !LDAP_UTF8_ISASCII(s1
)) ||
357 ((len
< l2
) && !LDAP_UTF8_ISASCII(s2
)))
365 /* We have encountered non-ascii or strings equal up to len */
367 /* set i to number of iterations */
369 /* passed through loop at least once? */
371 if (!res
&& (s1
== done
) &&
372 ((len
== l1
) || LDAP_UTF8_ISASCII(s1
)) &&
373 ((len
== l2
) || LDAP_UTF8_ISASCII(s2
))) {
374 /* all ascii and equal up to len */
378 /* rewind one char, and do normalized compare from there */
385 /* Should first check to see if strings are already in
386 * proper normalized form.
388 ucs
= malloc( ( ( norm1
|| l1
> l2
) ? l1
: l2
) * sizeof(*ucs
) );
390 return l1
> l2
? 1 : -1; /* what to do??? */
394 * XXYYZ: we convert to ucs4 even though -llunicode
395 * expects ucs2 in an ac_uint4
398 /* convert and normalize 1st string */
399 for ( i
= 0, ulen
= 0; i
< l1
; i
+= len
, ulen
++ ) {
400 ucs
[ulen
] = ldap_x_utf8_to_ucs4( s1
+ i
);
401 if ( ucs
[ulen
] == LDAP_UCS4_INVALID
) {
403 return -1; /* what to do??? */
405 len
= LDAP_UTF8_CHARLEN( s1
+ i
);
411 ucs
= malloc( l2
* sizeof(*ucs
) );
414 return l1
> l2
? 1 : -1; /* what to do??? */
417 uccompatdecomp( ucs
, ulen
, &ucsout1
, &l1
, ctx
);
418 l1
= uccanoncomp( ucsout1
, l1
);
421 /* convert and normalize 2nd string */
422 for ( i
= 0, ulen
= 0; i
< l2
; i
+= len
, ulen
++ ) {
423 ucs
[ulen
] = ldap_x_utf8_to_ucs4( s2
+ i
);
424 if ( ucs
[ulen
] == LDAP_UCS4_INVALID
) {
427 return 1; /* what to do??? */
429 len
= LDAP_UTF8_CHARLEN( s2
+ i
);
436 uccompatdecomp( ucs
, ulen
, &ucsout2
, &l2
, ctx
);
437 l2
= uccanoncomp( ucsout2
, l2
);
442 ? ucstrncasecmp( ucsout1
, ucsout2
, l1
< l2
? l1
: l2
)
443 : ucstrncmp( ucsout1
, ucsout2
, l1
< l2
? l1
: l2
);
453 return l1
> l2
? 1 : -1;