4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
12 ** This file implements a tokenizer for fts2 based on the ICU library.
14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18 #ifdef SQLITE_ENABLE_ICU
22 #include "fts2_tokenizer.h"
24 #include <unicode/ubrk.h>
25 #include <unicode/ucol.h>
26 #include <unicode/ustring.h>
27 #include <unicode/utf16.h>
/* Short typedef names for the tokenizer and cursor structures used below. */
29 typedef struct IcuTokenizer IcuTokenizer
;
30 typedef struct IcuCursor IcuCursor
;
/*
** NOTE(review): the struct header for this member is not visible in this
** excerpt; presumably it belongs to struct IcuTokenizer, since icuCreate()
** and icuDestroy() cast between sqlite3_tokenizer* and IcuTokenizer*.
*/
33 sqlite3_tokenizer base
; /* Generic tokenizer object that the casts above rely on */
/*
** Visible members of the tokenization cursor. Other members referenced
** elsewhere in this file (zBuffer, nBuffer, iToken) are declared on lines
** not visible in this excerpt.
*/
38 sqlite3_tokenizer_cursor base
; /* Generic cursor object; icuOpen() hands the cursor out through this type */
40 UBreakIterator
*pIter
; /* ICU break-iterator object */
41 int nChar
; /* Number of UChar elements in aChar[] (used as its length bound) */
42 UChar
*aChar
; /* Copy of input using utf-16 encoding */
43 int *aOffset
; /* aOffset[i] is the byte offset in the utf-8 input recorded for aChar[i] */
52 ** Create a new tokenizer instance.
/*
** Parameters and visible statements of the tokenizer constructor.
** NOTE(review): the function header, local declarations, any guards on
** argc or on the allocation result, and the return statement are not
** visible in this excerpt.
*/
55 int argc
, /* Number of entries in argv[] */
56 const char * const *argv
, /* Tokenizer creation arguments */
57 sqlite3_tokenizer
**ppTokenizer
/* OUT: Created tokenizer */
/* n is the size of argv[0] including its nul terminator. */
63 n
= strlen(argv
[0])+1;
/* One allocation holds the IcuTokenizer followed by n bytes for the locale. */
65 p
= (IcuTokenizer
*)sqlite3_malloc(sizeof(IcuTokenizer
)+n
);
/* Only the struct itself is zeroed; the trailing n bytes are filled below. */
69 memset(p
, 0, sizeof(IcuTokenizer
));
/* zLocale points at the bytes immediately after the struct. */
72 p
->zLocale
= (char *)&p
[1];
/* Copy argv[0], terminator included, into that trailing space. */
73 memcpy(p
->zLocale
, argv
[0], n
);
/* Hand the new object back to the caller as a generic sqlite3_tokenizer. */
76 *ppTokenizer
= (sqlite3_tokenizer
*)p
;
82 ** Destroy a tokenizer
/*
** pTokenizer: tokenizer object to destroy.
** NOTE(review): the statements that release p and return a result code are
** not visible in this excerpt.
*/
84 static int icuDestroy(sqlite3_tokenizer
*pTokenizer
){
/* View the generic tokenizer pointer as the IcuTokenizer it was created as. */
85 IcuTokenizer
*p
= (IcuTokenizer
*)pTokenizer
;
91 ** Prepare to begin tokenizing a particular string. The input
92 ** string to be tokenized is zInput[0..nInput-1]. A cursor
93 ** used to incrementally tokenize this string is returned in *ppCursor.
/*
** Parameters and visible statements of the cursor-open routine.
** NOTE(review): the function header, the declarations of pCsr, nChar, iInput,
** iOut, c and isError, the loop and error handling around the transcoding
** statements, and the return statements are not visible in this excerpt.
*/
97 sqlite3_tokenizer
*pTokenizer
, /* The tokenizer */
98 const char *zInput
, /* Input string */
99 int nInput
, /* Length of zInput in bytes */
100 sqlite3_tokenizer_cursor
**ppCursor
/* OUT: Tokenization cursor */
/* View the generic tokenizer as an IcuTokenizer to reach its zLocale. */
102 IcuTokenizer
*p
= (IcuTokenizer
*)pTokenizer
;
/* Case-folding option passed to u_foldCase() below. */
105 const int32_t opt
= U_FOLD_CASE_DEFAULT
;
106 UErrorCode status
= U_ZERO_ERROR
;
/* Measure zInput as a nul-terminated string.
** NOTE(review): the condition guarding this assignment is not visible here. */
116 nInput
= strlen(zInput
);
/*
** One allocation holds the cursor, then the aChar[] array, then the
** aOffset[] array. The UChar count is rounded up to a multiple of 4 with
** (nChar+3)&~3, presumably so that aOffset[] starts suitably aligned for int.
*/
119 pCsr
= (IcuCursor
*)sqlite3_malloc(
120 sizeof(IcuCursor
) + /* IcuCursor */
121 ((nChar
+3)&~3) * sizeof(UChar
) + /* IcuCursor.aChar[] */
122 (nChar
+1) * sizeof(int) /* IcuCursor.aOffset[] */
/* Zero only the cursor struct, then carve the two arrays out of the tail. */
127 memset(pCsr
, 0, sizeof(IcuCursor
));
128 pCsr
->aChar
= (UChar
*)&pCsr
[1];
/* aOffset[] begins right after the rounded-up aChar[] region. */
129 pCsr
->aOffset
= (int *)&pCsr
->aChar
[(nChar
+3)&~3];
/* Record the utf-8 byte offset for output slot iOut, then decode one
** code point from zInput into c (U8_NEXT advances iInput). */
131 pCsr
->aOffset
[iOut
] = iInput
;
132 U8_NEXT(zInput
, iInput
, nInput
, c
);
/* Case-fold the code point and append it to aChar[] as utf-16
** (U16_APPEND advances iOut and reports failure through isError). */
135 c
= u_foldCase(c
, opt
);
136 U16_APPEND(pCsr
->aChar
, iOut
, nChar
, c
, isError
);
/* Record the byte offset for the next output slot and decode the next
** code point. */
141 pCsr
->aOffset
[iOut
] = iInput
;
144 U8_NEXT(zInput
, iInput
, nInput
, c
);
/* Open a word-break iterator for the tokenizer's locale over the iOut
** utf-16 units written to aChar[]. */
150 pCsr
->pIter
= ubrk_open(UBRK_WORD
, p
->zLocale
, pCsr
->aChar
, iOut
, &status
);
/* NOTE(review): the body of this failure branch is not visible here. */
151 if( !U_SUCCESS(status
) ){
/* Position the iterator at the first boundary and hand the cursor back. */
157 ubrk_first(pCsr
->pIter
);
158 *ppCursor
= (sqlite3_tokenizer_cursor
*)pCsr
;
163 ** Close a tokenization cursor previously opened by a call to icuOpen().
/*
** pCursor: cursor to close.
** NOTE(review): the statements that release the cursor allocation itself
** and return a result code are not visible in this excerpt.
*/
165 static int icuClose(sqlite3_tokenizer_cursor
*pCursor
){
/* View the generic cursor pointer as the IcuCursor it was opened as. */
166 IcuCursor
*pCsr
= (IcuCursor
*)pCursor
;
/* Release the ICU break iterator opened by ubrk_open() in icuOpen(). */
167 ubrk_close(pCsr
->pIter
);
/* Release the token buffer grown by sqlite3_realloc() while tokenizing. */
168 sqlite3_free(pCsr
->zBuffer
);
174 ** Extract the next token from a tokenization cursor.
/*
** Parameters and visible statements of the next-token routine.
** NOTE(review): the function header, the declarations of iStart, iEnd,
** iWhite, c and nByte, the bodies of several branches, the name of the
** conversion call whose arguments appear below, the assignment to *pnBytes
** and the return statements are not visible in this excerpt.
*/
177 sqlite3_tokenizer_cursor
*pCursor
, /* Cursor returned by icuOpen */
178 const char **ppToken
, /* OUT: *ppToken is the token text */
179 int *pnBytes
, /* OUT: Number of bytes in token */
180 int *piStartOffset
, /* OUT: Starting offset of token */
181 int *piEndOffset
, /* OUT: Ending offset of token */
182 int *piPosition
/* OUT: Position integer of token */
184 IcuCursor
*pCsr
= (IcuCursor
*)pCursor
;
/* Keep advancing the break iterator until [iStart,iEnd) is non-empty. */
190 while( iStart
==iEnd
){
193 iStart
= ubrk_current(pCsr
->pIter
);
194 iEnd
= ubrk_next(pCsr
->pIter
);
/* UBRK_DONE means the iterator has no further boundaries. */
195 if( iEnd
==UBRK_DONE
){
/* Examine the characters at the front of the candidate range. */
199 while( iStart
<iEnd
){
/* aChar[] holds utf-16 code units (UChar), so the code point must be
** decoded with U16_NEXT. U8_NEXT would truncate each unit to 8 bits and
** attempt utf-8 decoding, misreading non-ASCII text and advancing iWhite
** by the wrong amount. */
201 U16_NEXT(pCsr
->aChar
, iWhite
, pCsr
->nChar
, c
);
208 assert(iStart
<=iEnd
);
212 UErrorCode status
= U_ZERO_ERROR
;
/* Grow the token buffer through a temporary so that zBuffer is only
** overwritten with the pointer returned by sqlite3_realloc(). */
214 char *zNew
= sqlite3_realloc(pCsr
->zBuffer
, nByte
);
218 pCsr
->zBuffer
= zNew
;
219 pCsr
->nBuffer
= nByte
;
/* Arguments of the conversion call: destination buffer and capacity, the
** byte count it reports back, and the utf-16 range aChar[iStart..iEnd). */
223 pCsr
->zBuffer
, pCsr
->nBuffer
, &nByte
, /* Output vars */
224 &pCsr
->aChar
[iStart
], iEnd
-iStart
, /* Input vars */
225 &status
/* Output success/failure */
/* Repeat while the reported byte count exceeds the buffer size. */
227 } while( nByte
>pCsr
->nBuffer
);
/* Report the token text and its utf-8 byte offsets, looked up from the
** utf-16 indices through aOffset[]. */
229 *ppToken
= pCsr
->zBuffer
;
231 *piStartOffset
= pCsr
->aOffset
[iStart
];
232 *piEndOffset
= pCsr
->aOffset
[iEnd
];
/* Report the current token counter, then increment it. */
233 *piPosition
= pCsr
->iToken
++;
239 ** The set of routines that implement the ICU tokenizer
/*
** Method table for the ICU tokenizer; its address is published by
** sqlite3Fts2IcuTokenizerModule().
** NOTE(review): some initializer entries and the closing brace are on lines
** not visible in this excerpt.
*/
241 static const sqlite3_tokenizer_module icuTokenizerModule
= {
243 icuCreate
, /* xCreate */
244 icuDestroy
, /* xDestroy */
246 icuClose
, /* xClose */
251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
/*
** ppModule: OUT parameter that receives the address of the file-scope
** icuTokenizerModule method table. Nothing is allocated here.
*/
253 void sqlite3Fts2IcuTokenizerModule(
254 sqlite3_tokenizer_module
const**ppModule
/* Publish the static method table to the caller. */
256 *ppModule
= &icuTokenizerModule
;
259 #endif /* defined(SQLITE_ENABLE_ICU) */
260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */