Fix integer type used by ESet
[xapian.git] / xapian-core / include / xapian / unicode.h
blob8251529861eccc6558505edc1b689ca1a5818ece
1 /** @file
2 * @brief Unicode and UTF-8 related classes and functions.
3 */
4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2019 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_UNICODE_H
22 #define XAPIAN_INCLUDED_UNICODE_H
24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead.
26 #endif
28 #include <xapian/attributes.h>
29 #include <xapian/visibility.h>
31 #include <string>
33 namespace Xapian {
35 /** An iterator which returns Unicode character values from a UTF-8 encoded
36 * string.
38 class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
39 const unsigned char* p;
40 const unsigned char* end;
41 mutable unsigned seqlen;
43 bool XAPIAN_NOTHROW(calculate_sequence_length() const);
45 unsigned get_char() const;
47 Utf8Iterator(const unsigned char* p_,
48 const unsigned char* end_,
49 unsigned seqlen_)
50 : p(p_), end(end_), seqlen(seqlen_) { }
52 public:
53 /** Return the raw const char* pointer for the current position. */
54 const char* raw() const {
55 return reinterpret_cast<const char*>(p ? p : end);
58 /** Return the number of bytes left in the iterator's buffer. */
59 size_t left() const { return p ? end - p : 0; }
61 /** Assign a new string to the iterator.
63 * The iterator will forget the string it was iterating through, and
64 * return characters from the start of the new string when next called.
65 * The string is not copied into the iterator, so it must remain valid
66 * while the iteration is in progress.
68 * @param p_ A pointer to the start of the string to read.
70 * @param len The length of the string to read.
72 void assign(const char* p_, size_t len) {
73 if (len) {
74 p = reinterpret_cast<const unsigned char*>(p_);
75 end = p + len;
76 seqlen = 0;
77 } else {
78 p = NULL;
82 /** Assign a new string to the iterator.
84 * The iterator will forget the string it was iterating through, and
85 * return characters from the start of the new string when next called.
86 * The string is not copied into the iterator, so it must remain valid
87 * while the iteration is in progress.
89 * @param s The string to read. Must not be modified while the iteration
90 * is in progress.
92 void assign(const std::string& s) { assign(s.data(), s.size()); }
94 /** Create an iterator given a pointer to a null terminated string.
96 * The iterator will return characters from the start of the string when
97 * next called. The string is not copied into the iterator, so it must
98 * remain valid while the iteration is in progress.
100 * @param p_ A pointer to the start of the null terminated string to read.
102 explicit Utf8Iterator(const char* p_);
104 /** Create an iterator given a pointer and a length.
106 * The iterator will return characters from the start of the string when
107 * next called. The string is not copied into the iterator, so it must
108 * remain valid while the iteration is in progress.
110 * @param p_ A pointer to the start of the string to read.
112 * @param len The length of the string to read.
114 Utf8Iterator(const char* p_, size_t len) { assign(p_, len); }
116 /** Create an iterator given a string.
118 * The iterator will return characters from the start of the string when
119 * next called. The string is not copied into the iterator, so it must
120 * remain valid while the iteration is in progress.
122 * @param s The string to read. Must not be modified while the iteration
123 * is in progress.
125 Utf8Iterator(const std::string& s) { assign(s.data(), s.size()); }
127 /** Create an iterator which is at the end of its iteration.
129 * This can be compared to another iterator to check if the other iterator
130 * has reached its end.
132 XAPIAN_NOTHROW(Utf8Iterator())
133 : p(NULL), end(0), seqlen(0) { }
135 /** Get the current Unicode character value pointed to by the iterator.
137 * If an invalid UTF-8 sequence is encountered, then the byte values
138 * comprising it are returned until valid UTF-8 or the end of the input is
139 * reached.
141 * Returns unsigned(-1) if the iterator has reached the end of its buffer.
143 unsigned XAPIAN_NOTHROW(operator*() const) XAPIAN_PURE_FUNCTION;
145 /** @private @internal Get the current Unicode character
146 * value pointed to by the iterator.
148 * If an invalid UTF-8 sequence is encountered, then the byte values
149 * comprising it are returned with the top bit set (so the caller can
150 * differentiate these from the same values arising from valid UTF-8)
151 * until valid UTF-8 or the end of the input is reached.
153 * Returns unsigned(-1) if the iterator has reached the end of its buffer.
155 unsigned XAPIAN_NOTHROW(strict_deref() const) XAPIAN_PURE_FUNCTION;
157 /** Move forward to the next Unicode character.
159 * @return An iterator pointing to the position before the move.
161 Utf8Iterator operator++(int) {
162 // If we've not calculated seqlen yet, do so.
163 if (seqlen == 0) calculate_sequence_length();
164 const unsigned char* old_p = p;
165 unsigned old_seqlen = seqlen;
166 p += seqlen;
167 if (p == end) p = NULL;
168 seqlen = 0;
169 return Utf8Iterator(old_p, end, old_seqlen);
172 /** Move forward to the next Unicode character.
174 * @return A reference to this object.
176 Utf8Iterator& operator++() {
177 if (seqlen == 0) calculate_sequence_length();
178 p += seqlen;
179 if (p == end) p = NULL;
180 seqlen = 0;
181 return *this;
184 /** Test two Utf8Iterators for equality.
186 * @param other The Utf8Iterator to compare this one with.
187 * @return true iff the iterators point to the same position.
189 bool XAPIAN_NOTHROW(operator==(const Utf8Iterator& other) const) {
190 return p == other.p;
193 /** Test two Utf8Iterators for inequality.
195 * @param other The Utf8Iterator to compare this one with.
196 * @return true iff the iterators do not point to the same position.
198 bool XAPIAN_NOTHROW(operator!=(const Utf8Iterator& other) const) {
199 return p != other.p;
202 /// We implement the semantics of an STL input_iterator.
203 //@{
204 typedef std::input_iterator_tag iterator_category;
205 typedef unsigned value_type;
206 typedef size_t difference_type;
207 typedef const unsigned* pointer;
208 typedef const unsigned& reference;
209 //@}
212 /// Functions associated with handling Unicode characters.
213 namespace Unicode {
/** Each Unicode character is in exactly one of these categories.
 *
 *  The Unicode standard calls this the "General Category", and uses a
 *  "Major, minor" convention to derive a two letter code.
 *
 *  The numeric values are spelled out because they are relied upon
 *  elsewhere in this header: they are extracted from the low five bits of
 *  a character's info word and used as shift counts in category bit masks.
 */
typedef enum {
    UNASSIGNED = 0,			/**< Other, not assigned (Cn) */
    UPPERCASE_LETTER = 1,		/**< Letter, uppercase (Lu) */
    LOWERCASE_LETTER = 2,		/**< Letter, lowercase (Ll) */
    TITLECASE_LETTER = 3,		/**< Letter, titlecase (Lt) */
    MODIFIER_LETTER = 4,		/**< Letter, modifier (Lm) */
    OTHER_LETTER = 5,			/**< Letter, other (Lo) */
    NON_SPACING_MARK = 6,		/**< Mark, nonspacing (Mn) */
    ENCLOSING_MARK = 7,			/**< Mark, enclosing (Me) */
    COMBINING_SPACING_MARK = 8,		/**< Mark, spacing combining (Mc) */
    DECIMAL_DIGIT_NUMBER = 9,		/**< Number, decimal digit (Nd) */
    LETTER_NUMBER = 10,			/**< Number, letter (Nl) */
    OTHER_NUMBER = 11,			/**< Number, other (No) */
    SPACE_SEPARATOR = 12,		/**< Separator, space (Zs) */
    LINE_SEPARATOR = 13,		/**< Separator, line (Zl) */
    PARAGRAPH_SEPARATOR = 14,		/**< Separator, paragraph (Zp) */
    CONTROL = 15,			/**< Other, control (Cc) */
    FORMAT = 16,			/**< Other, format (Cf) */
    PRIVATE_USE = 17,			/**< Other, private use (Co) */
    SURROGATE = 18,			/**< Other, surrogate (Cs) */
    CONNECTOR_PUNCTUATION = 19,		/**< Punctuation, connector (Pc) */
    DASH_PUNCTUATION = 20,		/**< Punctuation, dash (Pd) */
    OPEN_PUNCTUATION = 21,		/**< Punctuation, open (Ps) */
    CLOSE_PUNCTUATION = 22,		/**< Punctuation, close (Pe) */
    INITIAL_QUOTE_PUNCTUATION = 23,	/**< Punctuation, initial quote (Pi) */
    FINAL_QUOTE_PUNCTUATION = 24,	/**< Punctuation, final quote (Pf) */
    OTHER_PUNCTUATION = 25,		/**< Punctuation, other (Po) */
    MATH_SYMBOL = 26,			/**< Symbol, math (Sm) */
    CURRENCY_SYMBOL = 27,		/**< Symbol, currency (Sc) */
    MODIFIER_SYMBOL = 28,		/**< Symbol, modifier (Sk) */
    OTHER_SYMBOL = 29			/**< Symbol, other (So) */
} category;
253 namespace Internal {
/** @private @internal Extract the information about a character from the
 *  Unicode character tables.
 *
 *  Characters outside of the Unicode range (i.e. ch >= 0x110000) are
 *  treated as UNASSIGNED with no case variants.
 *
 *  The returned value packs several fields into one int; use
 *  get_category() (low five bits), get_case_type() (bits 5 to 7) and
 *  get_delta() (bits 8 upwards) to decode it.
 *
 *  @param ch	The Unicode character value to look up.
 *
 *  @return	The packed info word for @a ch.
 */
XAPIAN_VISIBILITY_DEFAULT
int XAPIAN_NOTHROW(get_character_info(unsigned ch)) XAPIAN_CONST_FUNCTION;
/** @private @internal Extract how to convert the case of a Unicode
 *  character from its info.
 *
 *  @param info	Packed info word for a character.
 *
 *  @return	Bits 5 to 7 of @a info, as a value in the range 0 to 7.
 */
inline int get_case_type(int info) {
    // Isolate the three case-type bits first, then move them down.
    const int case_bits = info & 0xe0;
    return case_bits >> 5;
}
268 /** @private @internal Extract the category of a Unicode character from its
269 * info.
271 inline category get_category(int info) {
272 return static_cast<category>(info & 0x1f);
/** @private @internal Extract the delta to use for case conversion of a
 *  character from its info.
 *
 *  @param info	Packed info word for a character.
 *
 *  @return	@a info shifted right by 8 bits with sign extension, so a
 *		negative @a info gives a negative delta.
 */
inline int get_delta(int info) {
    /* Whether right shifting a signed int sign-extends is implementation
     * defined, though sign extension is what most compilers do in practice.
     *
     * Not every compiler recognises the portable sign-extension idiom used
     * in the fallback below (e.g. GCC < 7 doesn't), so probe the
     * implementation-defined behaviour with a constant condition which
     * should be optimised away.
     *
     * This is deliberately a conditional expression rather than an `if`,
     * as the `if` form triggers various compiler warnings.
     */
    return ((-1 >> 1) == -1)
        // Right shift sign-extends, so a plain shift does the job.
        ? (info >> 8)
        // Right shift shifts in zeros: for negative values apply
        // bitwise-not both before and after the shift.
        : ((info < 0) ? ~(~info >> 8) : (info >> 8));
}
/** Convert a single non-ASCII Unicode character to UTF-8.
 *
 *  This is intended mainly as a helper method for to_utf8().
 *
 *  @param ch	The character (which must be >= 128; to_utf8() forwards
 *		every such value here) to write to @a buf.
 *  @param buf	The buffer to write the character to - it must have
 *		space for (at least) 4 bytes.
 *
 *  @return	The length of the resultant UTF-8 character in bytes.
 */
XAPIAN_VISIBILITY_DEFAULT
unsigned nonascii_to_utf8(unsigned ch, char* buf);
313 /** Convert a single Unicode character to UTF-8.
315 * @param ch The character to write to @a buf.
316 * @param buf The buffer to write the character to - it must have
317 * space for (at least) 4 bytes.
319 * @return The length of the resultant UTF-8 character in bytes.
321 inline unsigned to_utf8(unsigned ch, char* buf) {
322 if (ch < 128) {
323 *buf = static_cast<unsigned char>(ch);
324 return 1;
326 return Xapian::Unicode::nonascii_to_utf8(ch, buf);
329 /** Append the UTF-8 representation of a single Unicode character to a
330 * std::string.
332 inline void append_utf8(std::string& s, unsigned ch) {
333 char buf[4];
334 s.append(buf, to_utf8(ch, buf));
337 /// Return the category which a given Unicode character falls into.
338 inline category get_category(unsigned ch) {
339 return Internal::get_category(Internal::get_character_info(ch));
342 /// Test if a given Unicode character is "word character".
343 inline bool is_wordchar(unsigned ch) {
344 const unsigned int WORDCHAR_MASK =
345 (1 << Xapian::Unicode::UPPERCASE_LETTER) |
346 (1 << Xapian::Unicode::LOWERCASE_LETTER) |
347 (1 << Xapian::Unicode::TITLECASE_LETTER) |
348 (1 << Xapian::Unicode::MODIFIER_LETTER) |
349 (1 << Xapian::Unicode::OTHER_LETTER) |
350 (1 << Xapian::Unicode::NON_SPACING_MARK) |
351 (1 << Xapian::Unicode::ENCLOSING_MARK) |
352 (1 << Xapian::Unicode::COMBINING_SPACING_MARK) |
353 (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
354 (1 << Xapian::Unicode::LETTER_NUMBER) |
355 (1 << Xapian::Unicode::OTHER_NUMBER) |
356 (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
357 return ((WORDCHAR_MASK >> get_category(ch)) & 1);
360 /// Test if a given Unicode character is a whitespace character.
361 inline bool is_whitespace(unsigned ch) {
362 const unsigned int WHITESPACE_MASK =
363 (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
364 (1 << Xapian::Unicode::SPACE_SEPARATOR) |
365 (1 << Xapian::Unicode::LINE_SEPARATOR) |
366 (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
367 return ((WHITESPACE_MASK >> get_category(ch)) & 1);
370 /// Test if a given Unicode character is a currency symbol.
371 inline bool is_currency(unsigned ch) {
372 return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
375 /// Convert a Unicode character to lowercase.
376 inline unsigned tolower(unsigned ch) {
377 int info = Xapian::Unicode::Internal::get_character_info(ch);
378 if (!(Internal::get_case_type(info) & 2))
379 return ch;
380 return ch + Internal::get_delta(info);
383 /// Convert a Unicode character to uppercase.
384 inline unsigned toupper(unsigned ch) {
385 int info = Xapian::Unicode::Internal::get_character_info(ch);
386 if (!(Internal::get_case_type(info) & 4))
387 return ch;
388 return ch - Internal::get_delta(info);
391 /// Convert a UTF-8 std::string to lowercase.
392 inline std::string
393 tolower(const std::string& term)
395 std::string result;
396 result.reserve(term.size());
397 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
398 append_utf8(result, tolower(*i));
400 return result;
403 /// Convert a UTF-8 std::string to uppercase.
404 inline std::string
405 toupper(const std::string& term)
407 std::string result;
408 result.reserve(term.size());
409 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
410 append_utf8(result, toupper(*i));
412 return result;
419 #endif // XAPIAN_INCLUDED_UNICODE_H