delay a few things on startup, such as setting the visibility mode, which ensures...
[personal-kdebase.git] / runtime / nepomuk / services / storage / clucenetokenizer.cpp
bloba05d7e72227e5c7a5ecc17e8eba19e6cf8457744
1 /*
2 * Modified version of StandardTokenizer.cpp for Nepomuk mostly to optimize for filename indexing
3 * Copyright (C) 2008 Sebastian Trueg <trueg@kde.org>
5 * Based on StandardTokenizer.cpp from the CLucene package.
6 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
25 * Modified version for Nepomuk mostly to optimize for filename indexing
26 * Changes:
27 * - underscore is treated as space, ie. tokens are always divided at underscores
28 * - try to always divide tokens at dots.
29 * FIXME: we should still allow the acronym detection
30 * FIXME: is it possible to extract both? for example email addresses. Here it would
31 * be nice to have both the full address as a token and the separated portions.
34 #include <CLucene/StdHeader.h>
35 #include "clucenetokenizer.h"
37 CL_NS_USE(analysis)
38 CL_NS_USE(util)
40 namespace Nepomuk {
41 const TCHAR* tokenImageArray[] = {
42 _T("<EOF>"),
43 _T("<UNKNOWN>"),
44 _T("<ALPHANUM>"),
45 _T("<APOSTROPHE>"),
46 _T("<ACRONYM>"),
47 _T("<COMPANY>"),
48 _T("<EMAIL>"),
49 _T("<HOST>"),
50 _T("<NUM>"),
51 _T("<CJK>")
53 const TCHAR** tokenImage = tokenImageArray;
/* Shorthand predicates and helpers for the tokenizer functions below. Most of
** them refer to variables by name (ch, rd, rdPos, str, specialCharPos), so a
** macro can only be expanded where those names are in scope. They exist for
** readability, not just convenience. */
#define EOS (ch == -1 || rd->Eos())
#define SPACE (_istspace((TCHAR)ch) != 0)
#define ALPHA (_istalpha((TCHAR)ch) != 0)
#define ALNUM (_istalnum(ch) != 0)
#define DIGIT (_istdigit(ch) != 0)
#define UNDERSCORE (ch == '_')

/* True when ch lies in one of the code point ranges treated as CJK. */
#define _CJK ( (ch>=0x3040 && ch<=0x318f) || \
               (ch>=0x3300 && ch<=0x337f) || \
               (ch>=0x3400 && ch<=0x3d2d) || \
               (ch>=0x4e00 && ch<=0x9fff) || \
               (ch>=0xf900 && ch<=0xfaff) || \
               (ch>=0xac00 && ch<=0xd7af) ) //korean

#define DASH (ch == '-')
#define NEGATIVE_SIGN_ DASH
//#define POSITIVE_SIGN_ (ch == '+')
//#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_)

#define DOT (ch == '.')
#define DECIMAL DOT

/* Reads characters into ch and appends them to str for as long as
** conditionHolds is true for the character just read. Stops (without
** appending) on -1, on the first character for which the condition is false,
** or once str has reached LUCENE_MAX_WORD_LEN. The character that stopped the
** loop is left in ch and has already been consumed from the stream. */
//freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
#define _CONSUME_AS_LONG_AS(conditionHolds) while (true) { ch = readChar(); if (ch==-1 || (!(conditionHolds) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}

/* Consume a run of alphabetic characters. */
#define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)

/* Consume a run of digits. */
#define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)

/* Consume a run of alphanumeric characters. Only ALNUM characters extend the
** span; in this modified tokenizer no other character is accepted. */
#define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM)

/*
** Consume CJK characters
*/
#define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK)

/* It is considered that "nothing of value" has been read if:
**  a) The "read head" hasn't moved since specialCharPos was established.
**  or
**  b) The "read head" has moved by one character, but that character was
**     either whitespace or not among the characters found in the body of
**     a token (deliberately doesn't include the likes of '@'/'&'). */
#define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))

/* Last character currently held by StringBuffer sb (sb must not be empty). */
#define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
#define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
/* To discard the last character in a StringBuffer, we decrement the buffer's
** length indicator and move the terminator back by one character. */
#define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')

//#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}

/* Does StringBuffer sb contain any of the characters in string ofThese? */
#define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
121 namespace Nepomuk {
123 CLuceneTokenizer::CLuceneTokenizer(Reader* reader):
124 rd(_CLNEW FastCharStream(reader)),
125 /* rdPos is zero-based. It starts at -1, and will advance to the first
126 ** position when readChar() is first called. */
127 rdPos(-1),
128 tokenStart(-1)
132 CLuceneTokenizer::~CLuceneTokenizer() {
133 _CLDELETE(rd);
136 int CLuceneTokenizer::readChar() {
137 /* Increment by 1 because we're speaking in terms of characters, not
138 ** necessarily bytes: */
139 rdPos++;
140 return rd->GetNext();
143 void CLuceneTokenizer::unReadChar() {
144 rd->UnGet();
145 rdPos--;
148 inline bool CLuceneTokenizer::setToken(CL_NS( analysis )::Token* t, StringBuffer* sb, TokenTypes tokenCode) {
149 t->setStartOffset(tokenStart);
150 t->setEndOffset(tokenStart+sb->length());
151 t->setType(tokenImage[tokenCode]);
152 sb->getBuffer(); //null terminates the buffer
153 t->resetTermTextLen();
154 return true;
157 bool CLuceneTokenizer::next(Token* t) {
158 int ch=0;
159 while (!EOS) {
160 ch = readChar();
162 if ( ch == 0 || ch == -1 ){
163 continue;
164 } else if (SPACE || UNDERSCORE) {
165 continue;
166 } else if (ALPHA) {
167 tokenStart = rdPos;
168 return ReadAlphaNum(ch,t);
169 } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
170 tokenStart = rdPos;
171 /* ReadNumber returns NULL if it fails to extract a valid number; in
172 ** that case, we just continue. */
173 if (ReadNumber(NULL, ch,t))
174 return true;
175 } else if ( _CJK ){
176 if ( ReadCJK(ch,t) )
177 return true;
180 return false;
183 bool CLuceneTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
184 /* previousNumber is only non-NULL if this function already read a complete
185 ** number in a previous recursion, yet has been asked to read additional
186 ** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is
187 ** a complete number, but this function will recurse to read the "1.3",
188 ** generating a single HOST token "192.168.1.3". */
189 t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
190 StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
191 TokenTypes tokenType;
192 bool decExhausted;
193 if (previousNumber != NULL) {
194 str.prepend(previousNumber);
195 tokenType = HOST;
196 decExhausted = false;
197 } else {
198 tokenType = NUM;
199 decExhausted = (prev == '.');
201 if ( str.len >= LUCENE_MAX_WORD_LEN ){
202 //if a number is too long, i would say there is no point
203 //storing it, because its going to be the wrong number anyway?
204 //what do people think?
205 return false;
207 str.appendChar(prev);
209 const bool signExhausted = (prev == '-');
210 int ch = prev;
212 CONSUME_DIGITS;
214 if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
215 && (
216 (signExhausted && !DECIMAL)
217 || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
221 /* We have either:
222 ** a) a negative sign that's not followed by either digit(s) or a decimal
223 ** b) a decimal that's not followed by digit(s)
224 ** so this is not a valid number. */
225 if (!EOS) {
226 /* Unread the character that stopped CONSUME_DIGITS: */
227 unReadChar();
229 return false;
232 /* We just read a group of digits. Is it followed by a decimal symbol,
233 ** implying that there might be another group of digits available? */
234 if (!EOS) {
235 if (DECIMAL) {
236 if ( str.len >= LUCENE_MAX_WORD_LEN )
237 return false; //read above for rationale
238 str.appendChar(ch);
239 } else {
240 unReadChar();
241 goto SUCCESSFULLY_EXTRACTED_NUMBER;
244 CONSUME_DIGITS;
245 if (!DIGIT && !DECIMAL) {
246 unReadChar();
247 } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
248 /* We just read the fractional digit group, but it's also followed by
249 ** a decimal symbol and at least one more digit, so this must be a
250 ** HOST rather than a real number. */
251 return ReadNumber(str.getBuffer(), '.',t);
255 SUCCESSFULLY_EXTRACTED_NUMBER:
256 TCHAR rightmost = RIGHTMOST(str);
257 /* Don't including a trailing decimal point. */
258 if (rightmost == '.') {
259 SHAVE_RIGHTMOST(str);
260 unReadChar();
261 rightmost = RIGHTMOST(str);
263 /* If all we have left is a negative sign, it's not a valid number. */
264 if (rightmost == '-') {
265 CND_PRECONDITION (str.len == 1, "Number is invalid");
266 return false;
269 return setToken(t,&str,tokenType);
272 bool CLuceneTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
273 t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
274 StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
275 if ( str.len < LUCENE_MAX_WORD_LEN ){
276 str.appendChar(prev);
277 int ch = prev;
279 CONSUME_WORD;
280 if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
281 switch(ch) { /* What follows the first alphanum segment? */
282 #if 0
283 case '.':
284 str.appendChar('.');
285 return ReadDotted(&str, UNKNOWN,t);
286 #endif
287 case '\'':
288 str.appendChar('\'');
289 return ReadApostrophe(&str,t);
290 case '@':
291 str.appendChar('@');
292 return ReadAt(&str,t);
293 case '&':
294 str.appendChar('&');
295 return ReadCompany(&str,t);
296 /* default: fall through to end of this function. */
300 return setToken(t,&str,ALPHANUM);
303 bool CLuceneTokenizer::ReadCJK(const TCHAR prev, Token* t) {
304 t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
305 StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
306 if ( str.len < LUCENE_MAX_WORD_LEN ){
307 str.appendChar(prev);
308 int ch = prev;
310 CONSUME_CJK;
312 return setToken(t,&str,CJK);
316 bool CLuceneTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, CL_NS(analysis)::Token* t) {
317 const int32_t specialCharPos = rdPos;
318 StringBuffer& str=*_str;
320 /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
321 ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
322 ** that begins with a dot or a dash, it's far more common in source text
323 ** for a pattern like "abc.--def" to be intended as two tokens. */
324 int ch = rd->Peek();
325 if (!(DOT || DASH)) {
326 bool prevWasDot;
327 bool prevWasDash;
328 if (str.len == 0) {
329 prevWasDot = false;
330 prevWasDash = false;
331 } else {
332 prevWasDot = RIGHTMOST(str) == '.';
333 prevWasDash = RIGHTMOST(str) == '-';
335 while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
336 ch = readChar();
337 const bool dot = ch == '.';
338 const bool dash = ch == '-';
340 if (!(ALNUM || UNDERSCORE || dot || dash)) {
341 break;
343 /* Multiple dots or dashes in succession end the token.
344 ** Consider the following inputs:
345 ** "Visit windowsupdate.microsoft.com--update today!"
346 ** "In the U.S.A.--yes, even there!" */
347 if ((dot || dash) && (prevWasDot || prevWasDash)) {
348 /* We're not going to append the character we just read, in any case.
349 ** As to the character before it (which is currently RIGHTMOST(str)):
350 ** Unless RIGHTMOST(str) is a dot, in which we need to save it so the
351 ** acronym-versus-host detection can work, we want to get rid of it. */
352 if (!prevWasDot) {
353 SHAVE_RIGHTMOST(str);
355 break;
358 str.appendChar(ch);
360 prevWasDot = dot;
361 prevWasDash = dash;
365 /* There's a potential StringBuffer.append call in the code above, which
366 ** could cause str to reallocate its internal buffer. We must wait to
367 ** obtain the optimization-oriented strBuf pointer until after the initial
368 ** potentially realloc-triggering operations on str.
369 ** Because there can be other such ops much later in this function, strBuf
370 ** is guarded within a block to prevent its use during or after the calls
371 ** that would potentially invalidate it. */
372 { /* Begin block-guard of strBuf */
373 TCHAR* strBuf = str.getBuffer();
375 bool rightmostIsDot = RIGHTMOST_IS(str, '.');
376 if (CONSUMED_NOTHING_OF_VALUE) {
377 /* No more alphanums available for this token; shave trailing dot, if any. */
378 if (rightmostIsDot) {
379 SHAVE_RIGHTMOST(str);
381 /* If there are no dots remaining, this is a generic ALPHANUM. */
382 if (_tcschr(strBuf, '.') == NULL) {
383 forcedType = ALPHANUM;
386 /* Check the token to see if it's an acronym. An acronym must have a
387 ** letter in every even slot and a dot in every odd slot, including the
388 ** last slot (for example, "U.S.A."). */
389 } else if (rightmostIsDot) {
390 bool isAcronym = true;
391 const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */
393 for (int32_t i = 0; i < upperCheckLimit; i++) {
394 const bool even = (i % 2 == 0);
395 ch = strBuf[i];
396 if ( (even && !ALPHA) || (!even && !DOT) ) {
397 isAcronym = false;
398 break;
401 if (isAcronym) {
402 forcedType = ACRONYM;
403 } else {
404 /* If it's not an acronym, we don't want the trailing dot. */
405 SHAVE_RIGHTMOST(str);
406 /* If there are no dots remaining, this is a generic ALPHANUM. */
407 if (_tcschr(strBuf, '.') == NULL) {
408 forcedType = ALPHANUM;
412 } /* End block-guard of strBuf */
414 if (!EOS) {
415 if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
416 str.appendChar('@');
417 return ReadAt(&str,t);
418 } else {
419 unReadChar();
423 return setToken(t,&str,UNKNOWN
424 ? forcedType : HOST);
427 bool CLuceneTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
428 StringBuffer& str=*_str;
430 TokenTypes tokenType = APOSTROPHE;
431 const int32_t specialCharPos = rdPos;
432 int ch=0;
434 CONSUME_ALPHAS;
435 if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
436 /* After the apostrophe, no more alphanums were available within this
437 ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
438 SHAVE_RIGHTMOST(str);
439 tokenType = ALPHANUM;
441 if (!EOS) {
442 unReadChar();
445 return setToken(t,&str,tokenType);
448 bool CLuceneTokenizer::ReadAt(StringBuffer* str, Token* t) {
449 ReadDotted(str, EMAIL,t);
450 /* JLucene grammar indicates dots/digits not allowed in company name: */
451 if (!CONTAINS_ANY((*str), ".0123456789")) {
452 setToken(t,str,COMPANY);
454 return true;
457 bool CLuceneTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
458 StringBuffer& str = *_str;
459 const int32_t specialCharPos = rdPos;
460 int ch=0;
462 CONSUME_WORD;
463 if (CONSUMED_NOTHING_OF_VALUE) {
464 /* After the ampersand, no more alphanums were available within this
465 ** token; shave trailing ampersand and revert to ALPHANUM. */
466 CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
467 SHAVE_RIGHTMOST(str);
470 return setToken(t,&str,ALPHANUM);
472 if (!EOS) {
473 unReadChar();
476 return setToken(t,&str,COMPANY);