2 * Modified version of StandardTokenizer.h for Nepomuk mostly to optimize for filename indexing
3 * Copyright (C) 2008 Sebastian Trueg <trueg@kde.org>
5 * Based on StandardTokenizer.h from the CLucene package.
6 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
24 #ifndef _NEPOMUK_CLUCENE_TOKENIZER_H_
25 #define _NEPOMUK_CLUCENE_TOKENIZER_H_
27 #include <CLucene/clucene-config.h>
28 #include <CLucene/analysis/AnalysisHeader.h>
29 #include <CLucene/analysis/Analyzers.h>
30 #include <CLucene/util/StringBuffer.h>
31 #include <CLucene/util/FastCharStream.h>
32 #include <CLucene/util/Reader.h>
34 #include "clucenetokenizerconstants.h"
38 /** A grammar-based tokenizer constructed with JavaCC.
40 * <p> This should be a good tokenizer for most European-language documents:
43 * <li>Splits words at punctuation characters, removing punctuation. However, a
44 * dot that's not followed by whitespace is considered part of a token.
45 * <li>Splits words at hyphens, unless there's a number in the token, in which case
46 * the whole token is interpreted as a product number and is not split.
47 * <li>Recognizes email addresses and internet hostnames as one token.
50 * <p>Many applications have specific tokenizer needs. If this tokenizer does
51 * not suit your application, please consider copying this source code
52 * directory to your project and maintaining your own grammar-based tokenizer.
54 class CLuceneTokenizer
: public CL_NS(analysis
)::Tokenizer
// Pointer to a FastCharStream.
// NOTE(review): presumably the character source consumed by the Read*
// helpers and built from the Reader given to the constructor; ownership and
// lifetime are not visible in this header -- confirm in the implementation.
57 CL_NS(util
)::FastCharStream
* rd
;
59 // Constructs a tokenizer for the given Reader.
// `reader` is the input to tokenize.
// NOTE(review): whether the tokenizer takes ownership of `reader` is not
// visible in this header -- confirm against the implementation.
60 CLuceneTokenizer(CL_NS(util
)::Reader
* reader
);
64 /** Retrieves the next token in the stream.
65 * The token's type is set to an element of
66 * CLuceneTokenizerConstants::tokenImage.
* @param token caller-supplied Token; presumably filled in with the data of
*        the next token (TODO confirm in the implementation).
* @return false at end-of-stream; presumably true when a token was produced. */
67 bool next(CL_NS(analysis
)::Token
* token
);
69 // Reads a number such as "1" or "1234.567", or an IP address such as "192.168.1.2".
// NOTE(review): the roles of `previousNumber` and `prev` are not documented
// here -- presumably the already-consumed numeric text and the last character
// read; `t` is presumably the Token that receives the result. Confirm in the
// implementation. The meaning of the bool result is likewise not shown here.
70 bool ReadNumber(const TCHAR
* previousNumber
, const TCHAR prev
, CL_NS(analysis
)::Token
* t
);
// NOTE(review): undocumented in this header. Judging by the name and the
// sibling Read* helpers, this presumably reads an alphanumeric token that
// starts with the character `prev` and stores it in `t` -- confirm in the
// implementation, including what the bool result signals.
72 bool ReadAlphaNum(const TCHAR prev
, CL_NS(analysis
)::Token
* t
);
74 // Reads a word that contains an apostrophe.
// NOTE(review): `str` is presumably the buffer holding the text consumed so
// far and `t` the Token that receives the result -- confirm in the
// implementation, including what the bool result signals.
75 bool ReadApostrophe(CL_NS(util
)::StringBuffer
* str
, CL_NS(analysis
)::Token
* t
);
77 // Reads something of the form "something@..."; it may be a COMPANY name or an EMAIL address.
// NOTE(review): `str` is presumably the buffer holding the text read before
// the '@' and `t` the Token that receives the result -- confirm in the
// implementation, including what the bool result signals.
78 bool ReadAt(CL_NS(util
)::StringBuffer
* str
, CL_NS(analysis
)::Token
* t
);
80 // Reads a COMPANY name such as AT&T.
// NOTE(review): `str` is presumably the buffer holding the text consumed so
// far and `t` the Token that receives the result -- confirm in the
// implementation, including what the bool result signals.
81 bool ReadCompany(CL_NS(util
)::StringBuffer
* str
, CL_NS(analysis
)::Token
* t
);
83 // Reads CJK characters.
// NOTE(review): `prev` is presumably the first (already consumed) character
// of the run and `t` the Token that receives the result -- confirm in the
// implementation, including what the bool result signals.
84 bool ReadCJK(const TCHAR prev
, CL_NS(analysis
)::Token
* t
);
90 // Advance by one character, incrementing rdPos and returning the character.
92 // Retreat by one character, decrementing rdPos.
95 // createToken centralizes token creation for auditing purposes.
96 //Token* createToken(CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
// setToken takes the same StringBuffer and TokenTypes arguments as the
// commented-out createToken above, plus a caller-supplied Token `t`, and
// returns bool instead of a Token pointer.
// NOTE(review): presumably it stores the text of `sb` and the type
// `tokenCode` into `t` -- confirm in the implementation, including what the
// bool result signals.
97 inline bool setToken(CL_NS(analysis
)::Token
* t
, CL_NS(util
)::StringBuffer
* sb
, TokenTypes tokenCode
);
// NOTE(review): undocumented in this header. Judging by the name and the
// sibling Read* helpers, this presumably reads a token containing dots
// (e.g. a hostname or dotted number), with `str` holding the text consumed
// so far, `forcedType` a token type to impose on the result, and `t` the
// Token that receives it -- confirm in the implementation, including what
// the bool result signals.
99 bool ReadDotted(CL_NS(util
)::StringBuffer
* str
, TokenTypes forcedType
, CL_NS(analysis
)::Token
* t
);