changelog update
[sqlcipher.git] / ext / icu / icu.c
blobe745ab0253865e55d3c2d6eadab75e7c4bde4729
/*
** 2007 May 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
**
** This file implements an integration between the ICU library
** ("International Components for Unicode", an open-source library
** for handling unicode data) and SQLite. The integration uses
** ICU to provide the following to SQLite:
**
**   * An implementation of the SQL regexp() function (and hence REGEXP
**     operator) using the ICU uregex_XX() APIs.
**
**   * Implementations of the SQL scalar upper() and lower() functions
**     for case mapping.
**
**   * Integration of ICU and SQLite collation sequences.
**
**   * An implementation of the LIKE operator that uses ICU to
**     provide case-independent matching.
*/
31 #if !defined(SQLITE_CORE) \
32 || defined(SQLITE_ENABLE_ICU) \
33 || defined(SQLITE_ENABLE_ICU_COLLATIONS)
35 /* Include ICU headers */
36 #include <unicode/utypes.h>
37 #include <unicode/uregex.h>
38 #include <unicode/ustring.h>
39 #include <unicode/ucol.h>
41 #include <assert.h>
43 #ifndef SQLITE_CORE
44 #include "sqlite3ext.h"
45 SQLITE_EXTENSION_INIT1
46 #else
47 #include "sqlite3.h"
48 #endif
51 ** This function is called when an ICU function called from within
52 ** the implementation of an SQL scalar function returns an error.
54 ** The scalar function context passed as the first argument is
55 ** loaded with an error message based on the following two args.
57 static void icuFunctionError(
58 sqlite3_context *pCtx, /* SQLite scalar function context */
59 const char *zName, /* Name of ICU function that failed */
60 UErrorCode e /* Error code returned by ICU function */
62 char zBuf[128];
63 sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
64 zBuf[127] = '\0';
65 sqlite3_result_error(pCtx, zBuf, -1);
68 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
/*
** Maximum length (in bytes) of the pattern in a LIKE or GLOB
** operator.  A build may override the default by defining the macro
** before this point.
*/
#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
#endif
/*
** Version of sqlite3_free() that is always a function, never a macro.
** Its address is passed as the destructor argument of
** sqlite3_result_text16() elsewhere in this file.
*/
static void xFree(void *p){
  sqlite3_free(p);
}
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character. It is copied here from SQLite source
** code file utf8.c.
**
** The table is indexed by (first byte - 0xc0) and yields the payload
** bits carried by that lead byte.
*/
static const unsigned char icuUtf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Read one UTF-8 character from zIn into c and advance zIn past it.
** zIn must be a modifiable pointer to unsigned bytes and c an unsigned
** integer lvalue.  Every byte of the form 10xxxxxx that follows a lead
** byte >= 0xc0 is folded into c; the sequence length is not validated.
**
** Both arguments are evaluated more than once, so they must be free of
** side effects.  The body is wrapped in do{...}while(0) so the macro
** behaves as a single statement, even under an unbraced if/else.
*/
#define SQLITE_ICU_READ_UTF8(zIn, c)                       \
  do{                                                      \
    (c) = *((zIn)++);                                      \
    if( (c)>=0xc0 ){                                       \
      (c) = icuUtf8Trans1[(c)-0xc0];                       \
      while( (*(zIn) & 0xc0)==0x80 ){                      \
        (c) = ((c)<<6) + (0x3f & *((zIn)++));              \
      }                                                    \
    }                                                      \
  }while( 0 )

/*
** Advance zIn past one UTF-8 character without decoding it.  The
** caller must not invoke this at the terminating NUL (checked by
** assert()).  Wrapped in do{...}while(0) for the same reason as
** SQLITE_ICU_READ_UTF8.
*/
#define SQLITE_ICU_SKIP_UTF8(zIn)                          \
  do{                                                      \
    assert( *(zIn) );                                      \
    if( *((zIn)++)>=0xc0 ){                                \
      while( (*(zIn) & 0xc0)==0x80 ){ (zIn)++; }           \
    }                                                      \
  }while( 0 )
118 ** Compare two UTF-8 strings for equality where the first string is
119 ** a "LIKE" expression. Return true (1) if they are the same and
120 ** false (0) if they are different.
122 static int icuLikeCompare(
123 const uint8_t *zPattern, /* LIKE pattern */
124 const uint8_t *zString, /* The UTF-8 string to compare against */
125 const UChar32 uEsc /* The escape character */
127 static const uint32_t MATCH_ONE = (uint32_t)'_';
128 static const uint32_t MATCH_ALL = (uint32_t)'%';
130 int prevEscape = 0; /* True if the previous character was uEsc */
132 while( 1 ){
134 /* Read (and consume) the next character from the input pattern. */
135 uint32_t uPattern;
136 SQLITE_ICU_READ_UTF8(zPattern, uPattern);
137 if( uPattern==0 ) break;
139 /* There are now 4 possibilities:
141 ** 1. uPattern is an unescaped match-all character "%",
142 ** 2. uPattern is an unescaped match-one character "_",
143 ** 3. uPattern is an unescaped escape character, or
144 ** 4. uPattern is to be handled as an ordinary character
146 if( uPattern==MATCH_ALL && !prevEscape && uPattern!=(uint32_t)uEsc ){
147 /* Case 1. */
148 uint8_t c;
150 /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
151 ** MATCH_ALL. For each MATCH_ONE, skip one character in the
152 ** test string.
154 while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){
155 if( c==MATCH_ONE ){
156 if( *zString==0 ) return 0;
157 SQLITE_ICU_SKIP_UTF8(zString);
159 zPattern++;
162 if( *zPattern==0 ) return 1;
164 while( *zString ){
165 if( icuLikeCompare(zPattern, zString, uEsc) ){
166 return 1;
168 SQLITE_ICU_SKIP_UTF8(zString);
170 return 0;
172 }else if( uPattern==MATCH_ONE && !prevEscape && uPattern!=(uint32_t)uEsc ){
173 /* Case 2. */
174 if( *zString==0 ) return 0;
175 SQLITE_ICU_SKIP_UTF8(zString);
177 }else if( uPattern==(uint32_t)uEsc && !prevEscape ){
178 /* Case 3. */
179 prevEscape = 1;
181 }else{
182 /* Case 4. */
183 uint32_t uString;
184 SQLITE_ICU_READ_UTF8(zString, uString);
185 uString = (uint32_t)u_foldCase((UChar32)uString, U_FOLD_CASE_DEFAULT);
186 uPattern = (uint32_t)u_foldCase((UChar32)uPattern, U_FOLD_CASE_DEFAULT);
187 if( uString!=uPattern ){
188 return 0;
190 prevEscape = 0;
194 return *zString==0;
198 ** Implementation of the like() SQL function. This function implements
199 ** the build-in LIKE operator. The first argument to the function is the
200 ** pattern and the second argument is the string. So, the SQL statements:
202 ** A LIKE B
204 ** is implemented as like(B, A). If there is an escape character E,
206 ** A LIKE B ESCAPE E
208 ** is mapped to like(B, A, E).
210 static void icuLikeFunc(
211 sqlite3_context *context,
212 int argc,
213 sqlite3_value **argv
215 const unsigned char *zA = sqlite3_value_text(argv[0]);
216 const unsigned char *zB = sqlite3_value_text(argv[1]);
217 UChar32 uEsc = 0;
219 /* Limit the length of the LIKE or GLOB pattern to avoid problems
220 ** of deep recursion and N*N behavior in patternCompare().
222 if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
223 sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
224 return;
228 if( argc==3 ){
229 /* The escape character string must consist of a single UTF-8 character.
230 ** Otherwise, return an error.
232 int nE= sqlite3_value_bytes(argv[2]);
233 const unsigned char *zE = sqlite3_value_text(argv[2]);
234 int i = 0;
235 if( zE==0 ) return;
236 U8_NEXT(zE, i, nE, uEsc);
237 if( i!=nE){
238 sqlite3_result_error(context,
239 "ESCAPE expression must be a single character", -1);
240 return;
244 if( zA && zB ){
245 sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
250 ** Function to delete compiled regexp objects. Registered as
251 ** a destructor function with sqlite3_set_auxdata().
253 static void icuRegexpDelete(void *p){
254 URegularExpression *pExpr = (URegularExpression *)p;
255 uregex_close(pExpr);
259 ** Implementation of SQLite REGEXP operator. This scalar function takes
260 ** two arguments. The first is a regular expression pattern to compile
261 ** the second is a string to match against that pattern. If either
262 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
263 ** is 1 if the string matches the pattern, or 0 otherwise.
265 ** SQLite maps the regexp() function to the regexp() operator such
266 ** that the following two are equivalent:
268 ** zString REGEXP zPattern
269 ** regexp(zPattern, zString)
271 ** Uses the following ICU regexp APIs:
273 ** uregex_open()
274 ** uregex_matches()
275 ** uregex_close()
277 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
278 UErrorCode status = U_ZERO_ERROR;
279 URegularExpression *pExpr;
280 UBool res;
281 const UChar *zString = sqlite3_value_text16(apArg[1]);
283 (void)nArg; /* Unused parameter */
285 /* If the left hand side of the regexp operator is NULL,
286 ** then the result is also NULL.
288 if( !zString ){
289 return;
292 pExpr = sqlite3_get_auxdata(p, 0);
293 if( !pExpr ){
294 const UChar *zPattern = sqlite3_value_text16(apArg[0]);
295 if( !zPattern ){
296 return;
298 pExpr = uregex_open(zPattern, -1, 0, 0, &status);
300 if( U_SUCCESS(status) ){
301 sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
302 pExpr = sqlite3_get_auxdata(p, 0);
304 if( !pExpr ){
305 icuFunctionError(p, "uregex_open", status);
306 return;
310 /* Configure the text that the regular expression operates on. */
311 uregex_setText(pExpr, zString, -1, &status);
312 if( !U_SUCCESS(status) ){
313 icuFunctionError(p, "uregex_setText", status);
314 return;
317 /* Attempt the match */
318 res = uregex_matches(pExpr, 0, &status);
319 if( !U_SUCCESS(status) ){
320 icuFunctionError(p, "uregex_matches", status);
321 return;
324 /* Set the text that the regular expression operates on to a NULL
325 ** pointer. This is not really necessary, but it is tidier than
326 ** leaving the regular expression object configured with an invalid
327 ** pointer after this function returns.
329 uregex_setText(pExpr, 0, 0, &status);
331 /* Return 1 or 0. */
332 sqlite3_result_int(p, res ? 1 : 0);
336 ** Implementations of scalar functions for case mapping - upper() and
337 ** lower(). Function upper() converts its input to upper-case (ABC).
338 ** Function lower() converts to lower-case (abc).
340 ** ICU provides two types of case mapping, "general" case mapping and
341 ** "language specific". Refer to ICU documentation for the differences
342 ** between the two.
344 ** To utilise "general" case mapping, the upper() or lower() scalar
345 ** functions are invoked with one argument:
347 ** upper('ABC') -> 'abc'
348 ** lower('abc') -> 'ABC'
350 ** To access ICU "language specific" case mapping, upper() or lower()
351 ** should be invoked with two arguments. The second argument is the name
352 ** of the locale to use. Passing an empty string ("") or SQL NULL value
353 ** as the second argument is the same as invoking the 1 argument version
354 ** of upper() or lower().
356 ** lower('I', 'en_us') -> 'i'
357 ** lower('I', 'tr_tr') -> '\u131' (small dotless i)
359 ** http://www.icu-project.org/userguide/posix.html#case_mappings
361 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
362 const UChar *zInput; /* Pointer to input string */
363 UChar *zOutput = 0; /* Pointer to output buffer */
364 int nInput; /* Size of utf-16 input string in bytes */
365 int nOut; /* Size of output buffer in bytes */
366 int cnt;
367 int bToUpper; /* True for toupper(), false for tolower() */
368 UErrorCode status;
369 const char *zLocale = 0;
371 assert(nArg==1 || nArg==2);
372 bToUpper = (sqlite3_user_data(p)!=0);
373 if( nArg==2 ){
374 zLocale = (const char *)sqlite3_value_text(apArg[1]);
377 zInput = sqlite3_value_text16(apArg[0]);
378 if( !zInput ){
379 return;
381 nOut = nInput = sqlite3_value_bytes16(apArg[0]);
382 if( nOut==0 ){
383 sqlite3_result_text16(p, "", 0, SQLITE_STATIC);
384 return;
387 for(cnt=0; cnt<2; cnt++){
388 UChar *zNew = sqlite3_realloc(zOutput, nOut);
389 if( zNew==0 ){
390 sqlite3_free(zOutput);
391 sqlite3_result_error_nomem(p);
392 return;
394 zOutput = zNew;
395 status = U_ZERO_ERROR;
396 if( bToUpper ){
397 nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
398 }else{
399 nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
402 if( U_SUCCESS(status) ){
403 sqlite3_result_text16(p, zOutput, nOut, xFree);
404 }else if( status==U_BUFFER_OVERFLOW_ERROR ){
405 assert( cnt==0 );
406 continue;
407 }else{
408 icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status);
410 return;
412 assert( 0 ); /* Unreachable */
415 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */
418 ** Collation sequence destructor function. The pCtx argument points to
419 ** a UCollator structure previously allocated using ucol_open().
421 static void icuCollationDel(void *pCtx){
422 UCollator *p = (UCollator *)pCtx;
423 ucol_close(p);
427 ** Collation sequence comparison function. The pCtx argument points to
428 ** a UCollator structure previously allocated using ucol_open().
430 static int icuCollationColl(
431 void *pCtx,
432 int nLeft,
433 const void *zLeft,
434 int nRight,
435 const void *zRight
437 UCollationResult res;
438 UCollator *p = (UCollator *)pCtx;
439 res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
440 switch( res ){
441 case UCOL_LESS: return -1;
442 case UCOL_GREATER: return +1;
443 case UCOL_EQUAL: return 0;
445 assert(!"Unexpected return value from ucol_strcoll()");
446 return 0;
450 ** Implementation of the scalar function icu_load_collation().
452 ** This scalar function is used to add ICU collation based collation
453 ** types to an SQLite database connection. It is intended to be called
454 ** as follows:
456 ** SELECT icu_load_collation(<locale>, <collation-name>);
458 ** Where <locale> is a string containing an ICU locale identifier (i.e.
459 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
460 ** collation sequence to create.
462 static void icuLoadCollation(
463 sqlite3_context *p,
464 int nArg,
465 sqlite3_value **apArg
467 sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
468 UErrorCode status = U_ZERO_ERROR;
469 const char *zLocale; /* Locale identifier - (eg. "jp_JP") */
470 const char *zName; /* SQL Collation sequence name (eg. "japanese") */
471 UCollator *pUCollator; /* ICU library collation object */
472 int rc; /* Return code from sqlite3_create_collation_x() */
474 assert(nArg==2);
475 (void)nArg; /* Unused parameter */
476 zLocale = (const char *)sqlite3_value_text(apArg[0]);
477 zName = (const char *)sqlite3_value_text(apArg[1]);
479 if( !zLocale || !zName ){
480 return;
483 pUCollator = ucol_open(zLocale, &status);
484 if( !U_SUCCESS(status) ){
485 icuFunctionError(p, "ucol_open", status);
486 return;
488 assert(p);
490 rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
491 icuCollationColl, icuCollationDel
493 if( rc!=SQLITE_OK ){
494 ucol_close(pUCollator);
495 sqlite3_result_error(p, "Error registering collation function", -1);
500 ** Register the ICU extension functions with database db.
502 int sqlite3IcuInit(sqlite3 *db){
503 # define SQLITEICU_EXTRAFLAGS (SQLITE_DETERMINISTIC|SQLITE_INNOCUOUS)
504 static const struct IcuScalar {
505 const char *zName; /* Function name */
506 unsigned char nArg; /* Number of arguments */
507 unsigned int enc; /* Optimal text encoding */
508 unsigned char iContext; /* sqlite3_user_data() context */
509 void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
510 } scalars[] = {
511 {"icu_load_collation",2,SQLITE_UTF8|SQLITE_DIRECTONLY,1, icuLoadCollation},
512 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
513 {"regexp", 2, SQLITE_ANY|SQLITEICU_EXTRAFLAGS, 0, icuRegexpFunc},
514 {"lower", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16},
515 {"lower", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16},
516 {"upper", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16},
517 {"upper", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16},
518 {"lower", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16},
519 {"lower", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16},
520 {"upper", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16},
521 {"upper", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16},
522 {"like", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc},
523 {"like", 3, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc},
524 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */
526 int rc = SQLITE_OK;
527 int i;
529 for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
530 const struct IcuScalar *p = &scalars[i];
531 rc = sqlite3_create_function(
532 db, p->zName, p->nArg, p->enc,
533 p->iContext ? (void*)db : (void*)0,
534 p->xFunc, 0, 0
538 return rc;
541 #if !SQLITE_CORE
542 #ifdef _WIN32
543 __declspec(dllexport)
544 #endif
545 int sqlite3_icu_init(
546 sqlite3 *db,
547 char **pzErrMsg,
548 const sqlite3_api_routines *pApi
550 SQLITE_EXTENSION_INIT2(pApi)
551 return sqlite3IcuInit(db);
553 #endif
555 #endif