2 * Modified version of StandardTokenizer.cpp for Nepomuk mostly to optimize for filename indexing
3 * Copyright (C) 2008 Sebastian Trueg <trueg@kde.org>
5 * Based on StandardTokenizer.cpp from the CLucene package.
6 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
25 * Modified version for Nepomuk mostly to optimize for filename indexing
27 * - underscore is treated as space, i.e. tokens are always divided at underscores
28 * - try to always divide tokens at dots.
29 * FIXME: we should still allow the acronym detection
30 * FIXME: is it possible to extract both? for example email addresses. Here it would
31 * be nice to have both the full address as a token and the separated portions.
34 #include <CLucene/StdHeader.h>
35 #include "clucenetokenizer.h"
41 const TCHAR
* tokenImageArray
[] = {
53 const TCHAR
** tokenImage
= tokenImageArray
;
57 /* A bunch of shortcut macros, many of which make assumptions about variable
58 ** names. These macros enhance readability, not just convenience!
** Each macro below expands to an expression that reads a variable named
** `ch` from the scope where it is used; EOS also reads a variable `rd`. */
/* End of stream: ch holds the -1 sentinel, or rd->Eos() reports true. */
59 #define EOS (ch==-1 || rd->Eos())
/* Character-class tests on ch, normalised to a bool by the `!= 0`.
** NOTE(review): SPACE and ALPHA cast ch to TCHAR while ALNUM and DIGIT pass
** it through uncast -- confirm the difference is intentional. */
60 #define SPACE (_istspace((TCHAR)ch) != 0)
61 #define ALPHA (_istalpha((TCHAR)ch) != 0)
62 #define ALNUM (_istalnum(ch) != 0)
63 #define DIGIT (_istdigit(ch) != 0)
/* ch is a literal underscore. */
64 #define UNDERSCORE (ch == '_')
/* True when ch falls inside any of the six inclusive code-point ranges
** listed below (the last one is labelled as Korean by the original author).
** Reads the variable `ch` from the scope where the macro is used.
** NOTE(review): a name starting with an underscore followed by an uppercase
** letter is reserved to the implementation in C++; renaming would have to be
** coordinated with every use site. */
66 #define _CJK ( (ch>=0x3040 && ch<=0x318f) || \
67 (ch>=0x3300 && ch<=0x337f) || \
68 (ch>=0x3400 && ch<=0x3d2d) || \
69 (ch>=0x4e00 && ch<=0x9fff) || \
70 (ch>=0xf900 && ch<=0xfaff) || \
71 (ch>=0xac00 && ch<=0xd7af) ) //korean
/* Single-character tests on the variable `ch` of the enclosing scope. */
/* ch is a hyphen/minus character. */
74 #define DASH (ch == '-')
/* A negative sign is the same character as DASH. */
75 #define NEGATIVE_SIGN_ DASH
/* Positive-sign handling is disabled: the two defines below are commented out. */
76 //#define POSITIVE_SIGN_ (ch == '+')
77 //#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_)
/* ch is a full stop. */
79 #define DOT (ch == '.')
83 //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
/* Reads characters with readChar() into `ch` and appends each one to `str`
** for as long as the given condition HOLDS for it. Despite the parameter name
** `conditionFails`, the loop breaks when the condition evaluates to false.
** The loop also stops when ch is -1 or when str.len has reached
** LUCENE_MAX_WORD_LEN. The character that ended the loop stays in `ch` and is
** not appended. Requires `ch`, `str` and readChar() in the expansion scope. */
84 #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}
/* Append the following run of ALPHA characters to str. */
86 #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)
/* Append the following run of DIGIT characters to str. */
88 #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)
90 /* Append the following run of ALNUM characters to str. CONSUME_WORD takes
** no argument, so there is no "otherMatches" condition to pass: a character
** that's not an ALNUM always breaks the span. */
93 #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM)
96 ** Consume CJK characters
/* Append the following run of characters matching _CJK to str. */
98 #define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK)
101 /* It is considered that "nothing of value" has been read if:
102 ** a) The "read head" hasn't moved since specialCharPos was established.
104 ** b) The "read head" has moved by one character, but that character was
105 ** either whitespace or not among the characters found in the body of
106 ** a token (deliberately doesn't include the likes of '@'/'&').
** The body-of-token characters tested here are ALNUM, DOT, DASH and
** UNDERSCORE. The macro reads `rdPos`, `specialCharPos` and `ch` from the
** scope where it is expanded. */
107 #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))
/* Last character of StringBuffer sb. Indexes the buffer at sb.len-1, so sb
** must not be empty. The argument is expanded twice. */
109 #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
/* True when the last character of sb equals c (same non-empty precondition). */
110 #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
111 /* To discard the last character in a StringBuffer, we decrement the buffer's
112 ** length indicator and move the terminator back by one character.
** sb.len is decremented before indexing, so sb must not be empty. */
113 #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')
/* Disabled helper, kept for reference: would shave every trailing character
** for which charMatchesCondition holds. */
115 //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}
117 /* Does StringBuffer sb contain any of the characters in string ofThese?
** Compares the result of _tcscspn against sb.len: the two differ when one
** of the characters was found before the end of the buffer (this relies on
** _tcscspn having strcspn semantics -- TODO confirm for the TCHAR mapping in
** use). ofThese is wrapped in _T(), so it is presumably meant to be a
** string literal. */
118 #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
123 CLuceneTokenizer::CLuceneTokenizer(Reader
* reader
):
124 rd(_CLNEW
FastCharStream(reader
)),
125 /* rdPos is zero-based. It starts at -1, and will advance to the first
126 ** position when readChar() is first called. */
132 CLuceneTokenizer::~CLuceneTokenizer() {
136 int CLuceneTokenizer::readChar() {
137 /* Increment by 1 because we're speaking in terms of characters, not
138 ** necessarily bytes: */
140 return rd
->GetNext();
143 void CLuceneTokenizer::unReadChar() {
148 inline bool CLuceneTokenizer::setToken(CL_NS( analysis
)::Token
* t
, StringBuffer
* sb
, TokenTypes tokenCode
) {
149 t
->setStartOffset(tokenStart
);
150 t
->setEndOffset(tokenStart
+sb
->length());
151 t
->setType(tokenImage
[tokenCode
]);
152 sb
->getBuffer(); //null terminates the buffer
153 t
->resetTermTextLen();
157 bool CLuceneTokenizer::next(Token
* t
) {
162 if ( ch
== 0 || ch
== -1 ){
164 } else if (SPACE
|| UNDERSCORE
) {
168 return ReadAlphaNum(ch
,t
);
169 } else if (DIGIT
|| NEGATIVE_SIGN_
|| DECIMAL
) {
171 /* ReadNumber returns false if it fails to extract a valid number; in
172 ** that case, we just continue. */
173 if (ReadNumber(NULL
, ch
,t
))
183 bool CLuceneTokenizer::ReadNumber(const TCHAR
* previousNumber
, const TCHAR prev
,Token
* t
) {
184 /* previousNumber is only non-NULL if this function already read a complete
185 ** number in a previous recursion, yet has been asked to read additional
186 ** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is
187 ** a complete number, but this function will recurse to read the "1.3",
188 ** generating a single HOST token "192.168.1.3". */
189 t
->growBuffer(LUCENE_MAX_WORD_LEN
+1);//make sure token can hold the next word
190 StringBuffer
str(t
->_termText
,t
->bufferLength(),true); //use stringbuffer to read data onto the termText
191 TokenTypes tokenType
;
193 if (previousNumber
!= NULL
) {
194 str
.prepend(previousNumber
);
196 decExhausted
= false;
199 decExhausted
= (prev
== '.');
201 if ( str
.len
>= LUCENE_MAX_WORD_LEN
){
202 //if a number is too long, I would say there is no point
203 //storing it, because it's going to be the wrong number anyway?
204 //what do people think?
207 str
.appendChar(prev
);
209 const bool signExhausted
= (prev
== '-');
214 if (str
.len
< 2 /* CONSUME_DIGITS didn't find any digits. */
216 (signExhausted
&& !DECIMAL
)
217 || (decExhausted
/* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
222 ** a) a negative sign that's not followed by either digit(s) or a decimal
223 ** b) a decimal that's not followed by digit(s)
224 ** so this is not a valid number. */
226 /* Unread the character that stopped CONSUME_DIGITS: */
232 /* We just read a group of digits. Is it followed by a decimal symbol,
233 ** implying that there might be another group of digits available? */
236 if ( str
.len
>= LUCENE_MAX_WORD_LEN
)
237 return false; //read above for rationale
241 goto SUCCESSFULLY_EXTRACTED_NUMBER
;
245 if (!DIGIT
&& !DECIMAL
) {
247 } else if (!EOS
&& DECIMAL
&& _istdigit(rd
->Peek())) {
248 /* We just read the fractional digit group, but it's also followed by
249 ** a decimal symbol and at least one more digit, so this must be a
250 ** HOST rather than a real number. */
251 return ReadNumber(str
.getBuffer(), '.',t
);
255 SUCCESSFULLY_EXTRACTED_NUMBER
:
256 TCHAR rightmost
= RIGHTMOST(str
);
257 /* Don't include a trailing decimal point. */
258 if (rightmost
== '.') {
259 SHAVE_RIGHTMOST(str
);
261 rightmost
= RIGHTMOST(str
);
263 /* If all we have left is a negative sign, it's not a valid number. */
264 if (rightmost
== '-') {
265 CND_PRECONDITION (str
.len
== 1, "Number is invalid");
269 return setToken(t
,&str
,tokenType
);
272 bool CLuceneTokenizer::ReadAlphaNum(const TCHAR prev
, Token
* t
) {
273 t
->growBuffer(LUCENE_MAX_WORD_LEN
+1);//make sure token can hold the next word
274 StringBuffer
str(t
->_termText
,t
->bufferLength(),true); //use stringbuffer to read data onto the termText
275 if ( str
.len
< LUCENE_MAX_WORD_LEN
){
276 str
.appendChar(prev
);
280 if (!EOS
&& str
.len
< LUCENE_MAX_WORD_LEN
-1 ) { //still have space for 1 more character?
281 switch(ch
) { /* What follows the first alphanum segment? */
285 return ReadDotted(&str
, UNKNOWN
,t
);
288 str
.appendChar('\'');
289 return ReadApostrophe(&str
,t
);
292 return ReadAt(&str
,t
);
295 return ReadCompany(&str
,t
);
296 /* default: fall through to end of this function. */
300 return setToken(t
,&str
,ALPHANUM
);
303 bool CLuceneTokenizer::ReadCJK(const TCHAR prev
, Token
* t
) {
304 t
->growBuffer(LUCENE_MAX_WORD_LEN
+1);//make sure token can hold the next word
305 StringBuffer
str(t
->_termText
,t
->bufferLength(),true); //use stringbuffer to read data onto the termText
306 if ( str
.len
< LUCENE_MAX_WORD_LEN
){
307 str
.appendChar(prev
);
312 return setToken(t
,&str
,CJK
);
316 bool CLuceneTokenizer::ReadDotted(StringBuffer
* _str
, TokenTypes forcedType
, CL_NS(analysis
)::Token
* t
) {
317 const int32_t specialCharPos
= rdPos
;
318 StringBuffer
& str
=*_str
;
320 /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
321 ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
322 ** that begins with a dot or a dash, it's far more common in source text
323 ** for a pattern like "abc.--def" to be intended as two tokens. */
325 if (!(DOT
|| DASH
)) {
332 prevWasDot
= RIGHTMOST(str
) == '.';
333 prevWasDash
= RIGHTMOST(str
) == '-';
335 while (!EOS
&& str
.len
< LUCENE_MAX_WORD_LEN
-1 ) {
337 const bool dot
= ch
== '.';
338 const bool dash
= ch
== '-';
340 if (!(ALNUM
|| UNDERSCORE
|| dot
|| dash
)) {
343 /* Multiple dots or dashes in succession end the token.
344 ** Consider the following inputs:
345 ** "Visit windowsupdate.microsoft.com--update today!"
346 ** "In the U.S.A.--yes, even there!" */
347 if ((dot
|| dash
) && (prevWasDot
|| prevWasDash
)) {
348 /* We're not going to append the character we just read, in any case.
349 ** As to the character before it (which is currently RIGHTMOST(str)):
350 ** Unless RIGHTMOST(str) is a dot, in which case we need to save it so the
351 ** acronym-versus-host detection can work, we want to get rid of it. */
353 SHAVE_RIGHTMOST(str
);
365 /* There's a potential StringBuffer.append call in the code above, which
366 ** could cause str to reallocate its internal buffer. We must wait to
367 ** obtain the optimization-oriented strBuf pointer until after the initial
368 ** potentially realloc-triggering operations on str.
369 ** Because there can be other such ops much later in this function, strBuf
370 ** is guarded within a block to prevent its use during or after the calls
371 ** that would potentially invalidate it. */
372 { /* Begin block-guard of strBuf */
373 TCHAR
* strBuf
= str
.getBuffer();
375 bool rightmostIsDot
= RIGHTMOST_IS(str
, '.');
376 if (CONSUMED_NOTHING_OF_VALUE
) {
377 /* No more alphanums available for this token; shave trailing dot, if any. */
378 if (rightmostIsDot
) {
379 SHAVE_RIGHTMOST(str
);
381 /* If there are no dots remaining, this is a generic ALPHANUM. */
382 if (_tcschr(strBuf
, '.') == NULL
) {
383 forcedType
= ALPHANUM
;
386 /* Check the token to see if it's an acronym. An acronym must have a
387 ** letter in every even slot and a dot in every odd slot, including the
388 ** last slot (for example, "U.S.A."). */
389 } else if (rightmostIsDot
) {
390 bool isAcronym
= true;
391 const int32_t upperCheckLimit
= str
.len
- 1; /* -1 b/c we already checked the last slot. */
393 for (int32_t i
= 0; i
< upperCheckLimit
; i
++) {
394 const bool even
= (i
% 2 == 0);
396 if ( (even
&& !ALPHA
) || (!even
&& !DOT
) ) {
402 forcedType
= ACRONYM
;
404 /* If it's not an acronym, we don't want the trailing dot. */
405 SHAVE_RIGHTMOST(str
);
406 /* If there are no dots remaining, this is a generic ALPHANUM. */
407 if (_tcschr(strBuf
, '.') == NULL
) {
408 forcedType
= ALPHANUM
;
412 } /* End block-guard of strBuf */
415 if (ch
== '@' && str
.len
< LUCENE_MAX_WORD_LEN
-1) {
417 return ReadAt(&str
,t
);
423 return setToken(t
,&str
,UNKNOWN
424 ? forcedType
: HOST
);
427 bool CLuceneTokenizer::ReadApostrophe(StringBuffer
* _str
, Token
* t
) {
428 StringBuffer
& str
=*_str
;
430 TokenTypes tokenType
= APOSTROPHE
;
431 const int32_t specialCharPos
= rdPos
;
435 if (RIGHTMOST_IS(str
, '\'') || CONSUMED_NOTHING_OF_VALUE
) {
436 /* After the apostrophe, no more alphanums were available within this
437 ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
438 SHAVE_RIGHTMOST(str
);
439 tokenType
= ALPHANUM
;
445 return setToken(t
,&str
,tokenType
);
448 bool CLuceneTokenizer::ReadAt(StringBuffer
* str
, Token
* t
) {
449 ReadDotted(str
, EMAIL
,t
);
450 /* JLucene grammar indicates dots/digits not allowed in company name: */
451 if (!CONTAINS_ANY((*str
), ".0123456789")) {
452 setToken(t
,str
,COMPANY
);
457 bool CLuceneTokenizer::ReadCompany(StringBuffer
* _str
, Token
* t
) {
458 StringBuffer
& str
= *_str
;
459 const int32_t specialCharPos
= rdPos
;
463 if (CONSUMED_NOTHING_OF_VALUE
) {
464 /* After the ampersand, no more alphanums were available within this
465 ** token; shave trailing ampersand and revert to ALPHANUM. */
466 CND_PRECONDITION(RIGHTMOST_IS(str
, '&'),"ReadCompany failed");
467 SHAVE_RIGHTMOST(str
);
470 return setToken(t
,&str
,ALPHANUM
);
476 return setToken(t
,&str
,COMPANY
);