runtime/nepomuk/services/storage/clucenetokenizer.cpp

   1 /*
   2  * Modified version of StandardTokenizer.cpp for Nepomuk mostly to optimize for filename indexing
   3  * Copyright (C) 2008 Sebastian Trueg <trueg@kde.org>
   4  *
   5  * Based on StandardTokenizer.cpp from the CLucene package.
   6  * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Library General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Library General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Library General Public License
  19  * along with this library; see the file COPYING.LIB.  If not, write to
  20  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  21  * Boston, MA 02110-1301, USA.
  22  */
  23
  24 /*
  25  * Modified version for Nepomuk mostly to optimize for filename indexing
  26  * Changes:
  27  * - underscore is treated as space, ie. tokens are always divided at underscores
  28  * - try to always divide tokens at dots.
  29  *   FIXME: we should still allow the acronym detection
  30  *   FIXME: is it possible to extract both? for example email addresses. Here it would
  31  *          be nice to have both the full address as a token and the separated portions.
  32  */
  33
  34 #include <CLucene/StdHeader.h>
  35 #include "clucenetokenizer.h"
  36
  37 CL_NS_USE(analysis)
  38 CL_NS_USE(util)
  39
  40 namespace Nepomuk {
  41     const TCHAR* tokenImageArray[] = {
  42         _T("<EOF>"),
  43         _T("<UNKNOWN>"),
  44         _T("<ALPHANUM>"),
  45         _T("<APOSTROPHE>"),
  46         _T("<ACRONYM>"),
  47         _T("<COMPANY>"),
  48         _T("<EMAIL>"),
  49         _T("<HOST>"),
  50         _T("<NUM>"),
  51         _T("<CJK>")
  52     };
  53     const TCHAR** tokenImage = tokenImageArray;
  54 }
  55
  56
/* A bunch of shortcut macros, many of which make assumptions about variable
** names.  These macros enhance readability, not just convenience!
**
** Names the macros expect to find in scope where they are expanded:
**   ch             - the character most recently read (an int; EOS treats -1
**                    as "end of input")
**   rd             - the tokenizer's character stream
**   str            - the StringBuffer the current token is accumulated in
**   rdPos          - the tokenizer's current read position
**   specialCharPos - a snapshot of rdPos taken by the calling function
**                    (only CONSUMED_NOTHING_OF_VALUE needs it) */
#define EOS           (ch==-1 || rd->Eos())
#define SPACE         (_istspace((TCHAR)ch) != 0)
#define ALPHA         (_istalpha((TCHAR)ch) != 0)
#define ALNUM         (_istalnum(ch) != 0)
#define DIGIT         (_istdigit(ch) != 0)
#define UNDERSCORE    (ch == '_')

/* True if ch lies in one of the code point ranges this tokenizer treats as
** CJK text. */
#define _CJK                    (  (ch>=0x3040 && ch<=0x318f) ||            \
                                                   (ch>=0x3300 && ch<=0x337f) ||            \
                                                   (ch>=0x3400 && ch<=0x3d2d) ||            \
                                                   (ch>=0x4e00 && ch<=0x9fff) ||            \
                                                   (ch>=0xf900 && ch<=0xfaff) ||            \
                                                   (ch>=0xac00 && ch<=0xd7af) ) //Korean


#define DASH          (ch == '-')
#define NEGATIVE_SIGN_ DASH
//#define POSITIVE_SIGN_ (ch == '+')
//#define SIGN          (NEGATIVE_SIGN_ || POSITIVE_SIGN_)

#define DOT             (ch == '.')
#define DECIMAL         DOT


/* Reads characters and appends them to str for as long as the given
** condition holds.  Despite its name, the "conditionFails" argument is the
** condition under which consumption CONTINUES.  The loop stops at the first
** character for which the condition is false, at end of input (ch == -1), or
** once str has reached LUCENE_MAX_WORD_LEN.  The character that stopped the
** loop has already been read (rdPos was advanced by readChar()) but is NOT
** appended; it is left in ch, and the caller decides whether to unread it.
**
** FreeBSD seems to have a problem with defines over multiple lines, so this
** has to be one long line. */
#define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}

/* Consume a run of alphabetic characters. */
#define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)

/* Consume a run of digits. */
#define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)

/* Consume a run of alphanumeric characters.  The macro takes no extra
** condition argument: only ALNUM characters extend the span, so a word ends
** at the first character that is not alphanumeric (including '_' and '.'). */
#define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM)

/*
** Consume CJK characters
*/
#define CONSUME_CJK                   _CONSUME_AS_LONG_AS(_CJK)


/* It is considered that "nothing of value" has been read if:
** a) The "read head" hasn't moved since specialCharPos was established.
** or
** b) The "read head" has moved by one character, but that character was
**    either whitespace or not among the characters found in the body of
**    a token (deliberately doesn't include the likes of '@'/'&'). */
#define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))

/* The last character stored in StringBuffer sb.  Only meaningful when
** sb.len > 0; with an empty buffer this indexes position -1. */
#define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
#define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
/* To discard the last character in a StringBuffer, we decrement the buffer's
** length indicator and move the terminator back by one character. */
#define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')

//#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}

/* Does StringBuffer sb contain any of the characters in string ofThese?
** ofThese must be a narrow string literal, because it is wrapped in _T().
** _tcscspn yields the length of the leading run that contains none of the
** characters, so a result different from sb.len means one was found. */
#define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
 119
 120
 121 namespace Nepomuk {
 122
 123     CLuceneTokenizer::CLuceneTokenizer(Reader* reader):
 124         rd(_CLNEW FastCharStream(reader)),
 125         /* rdPos is zero-based.  It starts at -1, and will advance to the first
 126         ** position when readChar() is first called. */
 127         rdPos(-1),
 128         tokenStart(-1)
 129     {
 130     }
 131
 132     CLuceneTokenizer::~CLuceneTokenizer() {
 133         _CLDELETE(rd);
 134     }
 135
 136     int CLuceneTokenizer::readChar() {
 137         /* Increment by 1 because we're speaking in terms of characters, not
 138         ** necessarily bytes: */
 139         rdPos++;
 140         return rd->GetNext();
 141     }
 142
 143     void CLuceneTokenizer::unReadChar() {
 144         rd->UnGet();
 145         rdPos--;
 146     }
 147
 148     inline bool CLuceneTokenizer::setToken(CL_NS( analysis )::Token* t, StringBuffer* sb, TokenTypes tokenCode) {
 149         t->setStartOffset(tokenStart);
 150         t->setEndOffset(tokenStart+sb->length());
 151         t->setType(tokenImage[tokenCode]);
 152         sb->getBuffer(); //null terminates the buffer
 153         t->resetTermTextLen();
 154         return true;
 155     }
 156
 157     bool CLuceneTokenizer::next(Token* t) {
 158         int ch=0;
 159         while (!EOS) {
 160             ch = readChar();
 161
 162             if ( ch == 0 || ch == -1 ){
 163                 continue;
 164             } else if (SPACE || UNDERSCORE) {
 165                 continue;
 166             } else if (ALPHA) {
 167                 tokenStart = rdPos;
 168                 return ReadAlphaNum(ch,t);
 169             } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
 170                 tokenStart = rdPos;
 171                 /* ReadNumber returns NULL if it fails to extract a valid number; in
 172                 ** that case, we just continue. */
 173                 if (ReadNumber(NULL, ch,t))
 174                     return true;
 175             } else if ( _CJK ){
 176                 if ( ReadCJK(ch,t) )
 177                     return true;
 178             }
 179         }
 180         return false;
 181     }
 182
    /* Tries to read a number (type NUM) or a dotted numeric sequence such as
    ** an IP address (type HOST) into t.
    **
    ** previousNumber - NULL on the initial call; on a recursive call, the text
    **                  already read, which is prepended to the new buffer
    ** prev           - the character that triggered the call: a digit, '-' or
    **                  '.' from next(), or '.' on a recursive call
    ** t              - token to fill in
    **
    ** Returns true if a token was produced.  Returns false if the text is not
    ** a valid number (a lone '-', or a '.' not followed by digits) or if the
    ** number would exceed LUCENE_MAX_WORD_LEN. */
    bool CLuceneTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
        /* previousNumber is only non-NULL if this function already read a complete
        ** number in a previous recursion, yet has been asked to read additional
        ** numeric segments.  For example, in the HOST "192.168.1.3", "192.168" is
        ** a complete number, but this function will recurse to read the "1.3",
        ** generating a single HOST token "192.168.1.3". */
        t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
        StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
        TokenTypes tokenType;
        /* decExhausted: the decimal point has already been consumed (the number
        ** started with '.'), so at least one digit has to follow. */
        bool decExhausted;
        if (previousNumber != NULL) {
            str.prepend(previousNumber);
            tokenType = HOST;
            decExhausted = false;
        } else {
            tokenType = NUM;
            decExhausted = (prev == '.');
        }
        if (  str.len >= LUCENE_MAX_WORD_LEN ){
            //if a number is too long, i would say there is no point
            //storing it, because its going to be the wrong number anyway?
            //what do people think?
            return false;
        }
        str.appendChar(prev);

        /* signExhausted: the number started with '-', so no further sign may
        ** appear and a digit or a decimal point has to follow. */
        const bool signExhausted = (prev == '-');
        int ch = prev;

        /* Read the first group of digits.  Afterwards ch holds the character that
        ** ended the group; it has been read but not appended. */
        CONSUME_DIGITS;

        if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
            && (
                (signExhausted && !DECIMAL)
                || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
                )
            )
        {
            /* We have either:
            **   a) a negative sign that's not followed by either digit(s) or a decimal
            **   b) a decimal that's not followed by digit(s)
            ** so this is not a valid number. */
            if (!EOS) {
                /* Unread the character that stopped CONSUME_DIGITS: */
                unReadChar();
            }
            return false;
        }

        /* We just read a group of digits.  Is it followed by a decimal symbol,
        ** implying that there might be another group of digits available? */
        if (!EOS) {
            if (DECIMAL) {
                if (  str.len >= LUCENE_MAX_WORD_LEN )
                    return false; //read above for rationale
                str.appendChar(ch);
            } else {
                /* The digit group was ended by something other than a decimal
                ** point: put that character back and finish the number. */
                unReadChar();
                goto SUCCESSFULLY_EXTRACTED_NUMBER;
            }

            /* Read the group of digits after the decimal point. */
            CONSUME_DIGITS;
            if (!DIGIT && !DECIMAL) {
                unReadChar();
            } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
                /* We just read the fractional digit group, but it's also followed by
                ** a decimal symbol and at least one more digit, so this must be a
                ** HOST rather than a real number. */
                /* NOTE(review): str was constructed over t->_termText, and the
                ** recursive call constructs another StringBuffer over the same
                ** token, so previousNumber presumably aliases the buffer it is
                ** prepended to -- verify that StringBuffer handles this. */
                return ReadNumber(str.getBuffer(), '.',t);
            }
        }

    SUCCESSFULLY_EXTRACTED_NUMBER:
        TCHAR rightmost = RIGHTMOST(str);
        /* Don't include a trailing decimal point. */
        if (rightmost == '.') {
            SHAVE_RIGHTMOST(str);
            unReadChar();
            rightmost = RIGHTMOST(str);
        }
        /* If all we have left is a negative sign, it's not a valid number. */
        if (rightmost == '-') {
            CND_PRECONDITION (str.len == 1, "Number is invalid");
            return false;
        }

        return setToken(t,&str,tokenType);
    }
 271
 272     bool CLuceneTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
 273         t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
 274         StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
 275         if (  str.len < LUCENE_MAX_WORD_LEN ){
 276             str.appendChar(prev);
 277             int ch = prev;
 278
 279             CONSUME_WORD;
 280             if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
 281                 switch(ch) { /* What follows the first alphanum segment? */
 282 #if 0
 283                 case '.':
 284                                         str.appendChar('.');
 285                                         return ReadDotted(&str, UNKNOWN,t);
 286 #endif
 287                                 case '\'':
 288                                         str.appendChar('\'');
 289                                         return ReadApostrophe(&str,t);
 290                                 case '@':
 291                                         str.appendChar('@');
 292                                         return ReadAt(&str,t);
 293                                 case '&':
 294                                         str.appendChar('&');
 295                                         return ReadCompany(&str,t);
 296                     /* default: fall through to end of this function. */
 297                 }
 298             }
 299         }
 300         return setToken(t,&str,ALPHANUM);
 301     }
 302
 303     bool CLuceneTokenizer::ReadCJK(const TCHAR prev, Token* t) {
 304         t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
 305         StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
 306         if ( str.len < LUCENE_MAX_WORD_LEN ){
 307             str.appendChar(prev);
 308             int ch = prev;
 309
 310             CONSUME_CJK;
 311         }
 312         return setToken(t,&str,CJK);
 313     }
 314
 315
 316     bool CLuceneTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, CL_NS(analysis)::Token* t) {
 317         const int32_t specialCharPos = rdPos;
 318         StringBuffer& str=*_str;
 319
 320         /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
 321         ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
 322         ** that begins with a dot or a dash, it's far more common in source text
 323         ** for a pattern like "abc.--def" to be intended as two tokens. */
 324         int ch = rd->Peek();
 325         if (!(DOT || DASH)) {
 326             bool prevWasDot;
 327             bool prevWasDash;
 328             if (str.len == 0) {
 329                 prevWasDot = false;
 330                 prevWasDash = false;
 331             } else {
 332                 prevWasDot = RIGHTMOST(str) == '.';
 333                 prevWasDash = RIGHTMOST(str) == '-';
 334             }
 335             while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
 336                 ch = readChar();
 337                 const bool dot = ch == '.';
 338                 const bool dash = ch == '-';
 339
 340                 if (!(ALNUM || UNDERSCORE || dot || dash)) {
 341                     break;
 342                 }
 343                 /* Multiple dots or dashes in succession end the token.
 344                 ** Consider the following inputs:
 345                 **   "Visit windowsupdate.microsoft.com--update today!"
 346                 **   "In the U.S.A.--yes, even there!"                 */
 347                 if ((dot || dash) && (prevWasDot || prevWasDash)) {
 348                     /* We're not going to append the character we just read, in any case.
 349                     ** As to the character before it (which is currently RIGHTMOST(str)):
 350                     ** Unless RIGHTMOST(str) is a dot, in which we need to save it so the
 351                     ** acronym-versus-host detection can work, we want to get rid of it. */
 352                     if (!prevWasDot) {
 353                         SHAVE_RIGHTMOST(str);
 354                     }
 355                     break;
 356                 }
 357
 358                 str.appendChar(ch);
 359
 360                 prevWasDot = dot;
 361                 prevWasDash = dash;
 362             }
 363         }
 364
 365         /* There's a potential StringBuffer.append call in the code above, which
 366         ** could cause str to reallocate its internal buffer.  We must wait to
 367         ** obtain the optimization-oriented strBuf pointer until after the initial
 368         ** potentially realloc-triggering operations on str.
 369         ** Because there can be other such ops much later in this function, strBuf
 370         ** is guarded within a block to prevent its use during or after the calls
 371         ** that would potentially invalidate it. */
 372         { /* Begin block-guard of strBuf */
 373             TCHAR* strBuf = str.getBuffer();
 374
 375             bool rightmostIsDot = RIGHTMOST_IS(str, '.');
 376             if (CONSUMED_NOTHING_OF_VALUE) {
 377                 /* No more alphanums available for this token; shave trailing dot, if any. */
 378                 if (rightmostIsDot) {
 379                     SHAVE_RIGHTMOST(str);
 380                 }
 381                 /* If there are no dots remaining, this is a generic ALPHANUM. */
 382                 if (_tcschr(strBuf, '.') == NULL) {
 383                     forcedType = ALPHANUM;
 384                 }
 385
 386                 /* Check the token to see if it's an acronym.  An acronym must have a
 387                 ** letter in every even slot and a dot in every odd slot, including the
 388                 ** last slot (for example, "U.S.A."). */
 389             } else if (rightmostIsDot) {
 390                 bool isAcronym = true;
 391                 const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */
 392
 393                 for (int32_t i = 0; i < upperCheckLimit; i++) {
 394                     const bool even = (i % 2 == 0);
 395                     ch = strBuf[i];
 396                     if ( (even && !ALPHA) || (!even && !DOT) ) {
 397                         isAcronym = false;
 398                         break;
 399                     }
 400                 }
 401                 if (isAcronym) {
 402                     forcedType = ACRONYM;
 403                 } else {
 404                     /* If it's not an acronym, we don't want the trailing dot. */
 405                     SHAVE_RIGHTMOST(str);
 406                     /* If there are no dots remaining, this is a generic ALPHANUM. */
 407                     if (_tcschr(strBuf, '.') == NULL) {
 408                         forcedType = ALPHANUM;
 409                     }
 410                 }
 411             }
 412         } /* End block-guard of strBuf */
 413
 414         if (!EOS) {
 415             if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
 416                 str.appendChar('@');
 417                 return ReadAt(&str,t);
 418             } else {
 419                 unReadChar();
 420             }
 421         }
 422
 423         return setToken(t,&str,UNKNOWN
 424                         ? forcedType : HOST);
 425     }
 426
 427     bool CLuceneTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
 428         StringBuffer& str=*_str;
 429
 430         TokenTypes tokenType = APOSTROPHE;
 431         const int32_t specialCharPos = rdPos;
 432         int ch=0;
 433
 434         CONSUME_ALPHAS;
 435         if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
 436             /* After the apostrophe, no more alphanums were available within this
 437             ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
 438             SHAVE_RIGHTMOST(str);
 439             tokenType = ALPHANUM;
 440         }
 441         if (!EOS) {
 442             unReadChar();
 443         }
 444
 445         return setToken(t,&str,tokenType);
 446     }
 447
 448     bool CLuceneTokenizer::ReadAt(StringBuffer* str, Token* t) {
 449         ReadDotted(str, EMAIL,t);
 450         /* JLucene grammar indicates dots/digits not allowed in company name: */
 451         if (!CONTAINS_ANY((*str), ".0123456789")) {
 452             setToken(t,str,COMPANY);
 453         }
 454         return true;
 455     }
 456
 457     bool CLuceneTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
 458         StringBuffer& str = *_str;
 459         const int32_t specialCharPos = rdPos;
 460         int ch=0;
 461
 462         CONSUME_WORD;
 463         if (CONSUMED_NOTHING_OF_VALUE) {
 464             /* After the ampersand, no more alphanums were available within this
 465             ** token; shave trailing ampersand and revert to ALPHANUM. */
 466             CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
 467             SHAVE_RIGHTMOST(str);
 468
 469
 470             return setToken(t,&str,ALPHANUM);
 471         }
 472         if (!EOS) {
 473             unReadChar();
 474         }
 475
 476         return setToken(t,&str,COMPANY);
 477     }
 478 }