2 * @brief Tokenise CJK text as n-grams
4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
5 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
6 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
7 * Copyright (c) 2011,2019 Olly Betts
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to
11 * deal in the Software without restriction, including without limitation the
12 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
13 * sell copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28 #ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H
29 #define XAPIAN_INCLUDED_CJK_TOKENIZER_H
32 # error config.h must be included first in each C++ source file
35 #include "xapian/unicode.h"
41 /** Should we use the CJK n-gram code?
43 * The first time this is called it reads the environment variable
44 * XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value.
45 * That result is cached, so subsequent calls return the same value.
47 bool is_cjk_enabled();
/// Classify a single Unicode codepoint.
///
/// NOTE(review): the definition is not visible from this header; presumably
/// this returns true when @a codepoint is a CJK character - confirm which
/// ranges are covered against the implementation.
49 bool codepoint_is_cjk(unsigned codepoint
);
/// Process CJK text reached through a UTF-8 iterator.
///
/// @a it is taken by non-const reference, so the caller's iterator may be
/// modified by the call.
///
/// NOTE(review): the definition is not visible from this header; presumably
/// this moves @a it past a run of CJK characters - confirm against the
/// implementation.
51 void get_cjk(Xapian::Utf8Iterator
& it
);
55 /// Iterator returning unigrams and bigrams.
///
/// The current token is read with the dereference operator and the iterator
/// is moved on with the prefix increment operator (defined out of line).
/// Two iterators compare equal only when both hold an empty current_token.
56 class CJKTokenIterator
{
    /// UTF-8 iterator wrapped by this object.  It is initialised by the
    /// constructors and exposed read-only through get_utf8iterator().
57 Xapian::Utf8Iterator it
;
59 /** Offset to penultimate Unicode character in current_token.
61 * If current_token has one Unicode character, this is 0.
65 std::string current_token
;
67 /// Call to set current_token at the start.
    /// Construct from a string: the member iterator is initialised from @a s.
    /// NOTE(review): the constructor body is not visible in this view.
71 explicit CJKTokenIterator(const std::string
& s
) : it(s
) {
    /// Construct from an existing UTF-8 iterator, which is copied into the
    /// member iterator.
    /// NOTE(review): the constructor body is not visible in this view.
75 explicit CJKTokenIterator(const Xapian::Utf8Iterator
& it_
) : it(it_
) {
    /// Default constructor.  current_token is left default-constructed (an
    /// empty string), so the equality operator reports this object as equal
    /// to any other iterator whose current_token is also empty.
79 CJKTokenIterator() { }
    /// Access the current token by const reference.
    /// NOTE(review): the body is not visible in this view; presumably it
    /// returns current_token - confirm.
81 const std::string
& operator*() const {
    /// Prefix increment; declared here and defined elsewhere.
85 CJKTokenIterator
& operator++();
87 /// Is this a unigram?
    // True exactly when offset is 0, which the comment on the offset member
    // describes as the case where current_token has one Unicode character.
88 bool unigram() const { return offset
== 0; }
    /// Read-only access to the wrapped UTF-8 iterator.
90 const Xapian::Utf8Iterator
& get_utf8iterator() const { return it
; }
    /// Equality: true only when both this object and @a other have an empty
    /// current_token.  Two iterators that both hold non-empty tokens never
    /// compare equal, even if they are at the same position.
92 bool operator==(const CJKTokenIterator
& other
) const {
93 // We only really care about comparisons where one or other is an end
    // iterator, so only the emptiness of the two tokens is compared.
95 return current_token
.empty() && other
.current_token
.empty();
    /// Inequality: the exact negation of the equality operator.
98 bool operator!=(const CJKTokenIterator
& other
) const {
99 return !(*this == other
);
103 #endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H