/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/
9 #if !defined(__APPLE__)
18 #include "tokenizer.h"
/* Duplicate a string; the caller must free() the returned string.
** (We don't use strdup() since it's not part of the standard C library and
** may not be available everywhere.)
**
** Returns NULL if s is NULL or if the allocation fails.
**
** TODO(shess) Copied from fulltext.c, consider moving such helpers into
** a shared util.c.
*/
static char *string_dup(const char *s){
  size_t nByte;
  char *str;

  if( s==NULL ) return NULL;
  nByte = strlen(s) + 1;           /* include the terminating nul */
  str = malloc(nByte);
  if( str!=NULL ){
    memcpy(str, s, nByte);         /* length already known; copies the nul too */
  }
  return str;
}
31 typedef struct simple_tokenizer
{
32 sqlite3_tokenizer base
;
33 const char *zDelim
; /* token delimiters */
36 typedef struct simple_tokenizer_cursor
{
37 sqlite3_tokenizer_cursor base
;
38 const char *pInput
; /* input we are tokenizing */
39 int nBytes
; /* size of the input */
40 const char *pCurrent
; /* current position in pInput */
41 int iToken
; /* index of next token to be returned */
42 char *zToken
; /* storage for current token */
43 int nTokenBytes
; /* actual size of current token */
44 int nTokenAllocated
; /* space allocated to zToken buffer */
45 } simple_tokenizer_cursor
;
47 static sqlite3_tokenizer_module simpleTokenizerModule
;/* forward declaration */
49 static int simpleCreate(
50 int argc
, const char **argv
,
51 sqlite3_tokenizer
**ppTokenizer
55 t
= (simple_tokenizer
*) malloc(sizeof(simple_tokenizer
));
56 /* TODO(shess) Delimiters need to remain the same from run to run,
57 ** else we need to reindex. One solution would be a meta-table to
58 ** track such information in the database, then we'd only want this
59 ** information on the initial create.
62 t
->zDelim
= string_dup(argv
[1]);
64 /* Build a string excluding alphanumeric ASCII characters */
65 char zDelim
[0x80]; /* nul-terminated, so nul not a member */
67 for(i
=1, j
=0; i
<0x80; i
++){
73 assert( j
<=sizeof(zDelim
) );
74 t
->zDelim
= string_dup(zDelim
);
77 *ppTokenizer
= &t
->base
;
81 static int simpleDestroy(sqlite3_tokenizer
*pTokenizer
){
82 simple_tokenizer
*t
= (simple_tokenizer
*) pTokenizer
;
84 free((void *) t
->zDelim
);
90 static int simpleOpen(
91 sqlite3_tokenizer
*pTokenizer
,
92 const char *pInput
, int nBytes
,
93 sqlite3_tokenizer_cursor
**ppCursor
95 simple_tokenizer_cursor
*c
;
97 c
= (simple_tokenizer_cursor
*) malloc(sizeof(simple_tokenizer_cursor
));
99 c
->nBytes
= nBytes
<0 ? (int) strlen(pInput
) : nBytes
;
100 c
->pCurrent
= c
->pInput
; /* start tokenizing at the beginning */
102 c
->zToken
= NULL
; /* no space allocated, yet. */
104 c
->nTokenAllocated
= 0;
106 *ppCursor
= &c
->base
;
110 static int simpleClose(sqlite3_tokenizer_cursor
*pCursor
){
111 simple_tokenizer_cursor
*c
= (simple_tokenizer_cursor
*) pCursor
;
113 if( NULL
!=c
->zToken
){
121 static int simpleNext(
122 sqlite3_tokenizer_cursor
*pCursor
,
123 const char **ppToken
, int *pnBytes
,
124 int *piStartOffset
, int *piEndOffset
, int *piPosition
126 simple_tokenizer_cursor
*c
= (simple_tokenizer_cursor
*) pCursor
;
127 simple_tokenizer
*t
= (simple_tokenizer
*) pCursor
->pTokenizer
;
130 while( c
->pCurrent
-c
->pInput
<c
->nBytes
){
131 int n
= (int) strcspn(c
->pCurrent
, t
->zDelim
);
133 if( n
+1>c
->nTokenAllocated
){
134 c
->zToken
= realloc(c
->zToken
, n
+1);
136 for(ii
=0; ii
<n
; ii
++){
137 /* TODO(shess) This needs expansion to handle UTF-8
138 ** case-insensitivity.
140 char ch
= c
->pCurrent
[ii
];
141 c
->zToken
[ii
] = (unsigned char)ch
<0x80 ? tolower(ch
) : ch
;
144 *ppToken
= c
->zToken
;
146 *piStartOffset
= (int) (c
->pCurrent
-c
->pInput
);
147 *piEndOffset
= *piStartOffset
+n
;
148 *piPosition
= c
->iToken
++;
149 c
->pCurrent
+= n
+ 1;
153 c
->pCurrent
+= n
+ 1;
154 /* TODO(shess) could strspn() to skip delimiters en masse. Needs
155 ** to happen in two places, though, which is annoying.
161 static sqlite3_tokenizer_module simpleTokenizerModule
= {
170 void get_simple_tokenizer_module(
171 sqlite3_tokenizer_module
**ppModule
173 *ppModule
= &simpleTokenizerModule
;