xapian-core/include/xapian/unicode.h

   1 /** @file
   2  * @brief Unicode and UTF-8 related classes and functions.
   3  */
   4 /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2019 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #ifndef XAPIAN_INCLUDED_UNICODE_H
  22 #define XAPIAN_INCLUDED_UNICODE_H
  23
  24 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
  25 # error Never use <xapian/unicode.h> directly; include <xapian.h> instead.
  26 #endif
  27
  28 #include <xapian/attributes.h>
  29 #include <xapian/visibility.h>
  30
  31 #include <string>
  32
  33 namespace Xapian {
  34
  35 /** An iterator which returns Unicode character values from a UTF-8 encoded
  36  *  string.
  37  */
  38 class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
  39     const unsigned char* p;
  40     const unsigned char* end;
  41     mutable unsigned seqlen;
  42
  43     bool XAPIAN_NOTHROW(calculate_sequence_length() const);
  44
  45     unsigned get_char() const;
  46
  47     Utf8Iterator(const unsigned char* p_,
  48                  const unsigned char* end_,
  49                  unsigned seqlen_)
  50         : p(p_), end(end_), seqlen(seqlen_) { }
  51
  52   public:
  53     /** Return the raw const char* pointer for the current position. */
  54     const char* raw() const {
  55         return reinterpret_cast<const char*>(p ? p : end);
  56     }
  57
  58     /** Return the number of bytes left in the iterator's buffer. */
  59     size_t left() const { return p ? end - p : 0; }
  60
  61     /** Assign a new string to the iterator.
  62      *
  63      *  The iterator will forget the string it was iterating through, and
  64      *  return characters from the start of the new string when next called.
  65      *  The string is not copied into the iterator, so it must remain valid
  66      *  while the iteration is in progress.
  67      *
  68      *  @param p_ A pointer to the start of the string to read.
  69      *
  70      *  @param len The length of the string to read.
  71      */
  72     void assign(const char* p_, size_t len) {
  73         if (len) {
  74             p = reinterpret_cast<const unsigned char*>(p_);
  75             end = p + len;
  76             seqlen = 0;
  77         } else {
  78             p = NULL;
  79         }
  80     }
  81
  82     /** Assign a new string to the iterator.
  83      *
  84      *  The iterator will forget the string it was iterating through, and
  85      *  return characters from the start of the new string when next called.
  86      *  The string is not copied into the iterator, so it must remain valid
  87      *  while the iteration is in progress.
  88      *
  89      *  @param s The string to read.  Must not be modified while the iteration
  90      *           is in progress.
  91      */
  92     void assign(const std::string& s) { assign(s.data(), s.size()); }
  93
  94     /** Create an iterator given a pointer to a null terminated string.
  95      *
  96      *  The iterator will return characters from the start of the string when
  97      *  next called.  The string is not copied into the iterator, so it must
  98      *  remain valid while the iteration is in progress.
  99      *
 100      *  @param p_ A pointer to the start of the null terminated string to read.
 101      */
 102     explicit Utf8Iterator(const char* p_);
 103
 104     /** Create an iterator given a pointer and a length.
 105      *
 106      *  The iterator will return characters from the start of the string when
 107      *  next called.  The string is not copied into the iterator, so it must
 108      *  remain valid while the iteration is in progress.
 109      *
 110      *  @param p_ A pointer to the start of the string to read.
 111      *
 112      *  @param len The length of the string to read.
 113      */
 114     Utf8Iterator(const char* p_, size_t len) { assign(p_, len); }
 115
 116     /** Create an iterator given a string.
 117      *
 118      *  The iterator will return characters from the start of the string when
 119      *  next called.  The string is not copied into the iterator, so it must
 120      *  remain valid while the iteration is in progress.
 121      *
 122      *  @param s The string to read.  Must not be modified while the iteration
 123      *           is in progress.
 124      */
 125     Utf8Iterator(const std::string& s) { assign(s.data(), s.size()); }
 126
 127     /** Create an iterator which is at the end of its iteration.
 128      *
 129      *  This can be compared to another iterator to check if the other iterator
 130      *  has reached its end.
 131      */
 132     XAPIAN_NOTHROW(Utf8Iterator())
 133         : p(NULL), end(0), seqlen(0) { }
 134
 135     /** Get the current Unicode character value pointed to by the iterator.
 136      *
 137      *  If an invalid UTF-8 sequence is encountered, then the byte values
 138      *  comprising it are returned until valid UTF-8 or the end of the input is
 139      *  reached.
 140      *
 141      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
 142      */
 143     unsigned XAPIAN_NOTHROW(operator*() const) XAPIAN_PURE_FUNCTION;
 144
 145     /** @private @internal Get the current Unicode character
 146      *  value pointed to by the iterator.
 147      *
 148      *  If an invalid UTF-8 sequence is encountered, then the byte values
 149      *  comprising it are returned with the top bit set (so the caller can
 150      *  differentiate these from the same values arising from valid UTF-8)
 151      *  until valid UTF-8 or the end of the input is reached.
 152      *
 153      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
 154      */
 155     unsigned XAPIAN_NOTHROW(strict_deref() const) XAPIAN_PURE_FUNCTION;
 156
 157     /** Move forward to the next Unicode character.
 158      *
 159      *  @return An iterator pointing to the position before the move.
 160      */
 161     Utf8Iterator operator++(int) {
 162         // If we've not calculated seqlen yet, do so.
 163         if (seqlen == 0) calculate_sequence_length();
 164         const unsigned char* old_p = p;
 165         unsigned old_seqlen = seqlen;
 166         p += seqlen;
 167         if (p == end) p = NULL;
 168         seqlen = 0;
 169         return Utf8Iterator(old_p, end, old_seqlen);
 170     }
 171
 172     /** Move forward to the next Unicode character.
 173      *
 174      *  @return A reference to this object.
 175      */
 176     Utf8Iterator& operator++() {
 177         if (seqlen == 0) calculate_sequence_length();
 178         p += seqlen;
 179         if (p == end) p = NULL;
 180         seqlen = 0;
 181         return *this;
 182     }
 183
 184     /** Test two Utf8Iterators for equality.
 185      *
 186      *  @param other    The Utf8Iterator to compare this one with.
 187      *  @return true iff the iterators point to the same position.
 188      */
 189     bool XAPIAN_NOTHROW(operator==(const Utf8Iterator& other) const) {
 190         return p == other.p;
 191     }
 192
 193     /** Test two Utf8Iterators for inequality.
 194      *
 195      *  @param other    The Utf8Iterator to compare this one with.
 196      *  @return true iff the iterators do not point to the same position.
 197      */
 198     bool XAPIAN_NOTHROW(operator!=(const Utf8Iterator& other) const) {
 199         return p != other.p;
 200     }
 201
 202     /// We implement the semantics of an STL input_iterator.
 203     //@{
 204     typedef std::input_iterator_tag iterator_category;
 205     typedef unsigned value_type;
 206     typedef size_t difference_type;
 207     typedef const unsigned* pointer;
 208     typedef const unsigned& reference;
 209     //@}
 210 };
 211
 212 /// Functions associated with handling Unicode characters.
 213 namespace Unicode {
 214
 215 /** Each Unicode character is in exactly one of these categories.
 216  *
 217  * The Unicode standard calls this the "General Category", and uses a
 218  * "Major, minor" convention to derive a two letter code.
 219  */
 220 typedef enum {
 221     UNASSIGNED,                         /**< Other, not assigned (Cn) */
 222     UPPERCASE_LETTER,                   /**< Letter, uppercase (Lu) */
 223     LOWERCASE_LETTER,                   /**< Letter, lowercase (Ll) */
 224     TITLECASE_LETTER,                   /**< Letter, titlecase (Lt) */
 225     MODIFIER_LETTER,                    /**< Letter, modifier (Lm) */
 226     OTHER_LETTER,                       /**< Letter, other (Lo) */
 227     NON_SPACING_MARK,                   /**< Mark, nonspacing (Mn) */
 228     ENCLOSING_MARK,                     /**< Mark, enclosing (Me) */
 229     COMBINING_SPACING_MARK,             /**< Mark, spacing combining (Mc) */
 230     DECIMAL_DIGIT_NUMBER,               /**< Number, decimal digit (Nd) */
 231     LETTER_NUMBER,                      /**< Number, letter (Nl) */
 232     OTHER_NUMBER,                       /**< Number, other (No) */
 233     SPACE_SEPARATOR,                    /**< Separator, space (Zs) */
 234     LINE_SEPARATOR,                     /**< Separator, line (Zl) */
 235     PARAGRAPH_SEPARATOR,                /**< Separator, paragraph (Zp) */
 236     CONTROL,                            /**< Other, control (Cc) */
 237     FORMAT,                             /**< Other, format (Cf) */
 238     PRIVATE_USE,                        /**< Other, private use (Co) */
 239     SURROGATE,                          /**< Other, surrogate (Cs) */
 240     CONNECTOR_PUNCTUATION,              /**< Punctuation, connector (Pc) */
 241     DASH_PUNCTUATION,                   /**< Punctuation, dash (Pd) */
 242     OPEN_PUNCTUATION,                   /**< Punctuation, open (Ps) */
 243     CLOSE_PUNCTUATION,                  /**< Punctuation, close (Pe) */
 244     INITIAL_QUOTE_PUNCTUATION,          /**< Punctuation, initial quote (Pi) */
 245     FINAL_QUOTE_PUNCTUATION,            /**< Punctuation, final quote (Pf) */
 246     OTHER_PUNCTUATION,                  /**< Punctuation, other (Po) */
 247     MATH_SYMBOL,                        /**< Symbol, math (Sm) */
 248     CURRENCY_SYMBOL,                    /**< Symbol, currency (Sc) */
 249     MODIFIER_SYMBOL,                    /**< Symbol, modified (Sk) */
 250     OTHER_SYMBOL                        /**< Symbol, other (So) */
 251 } category;
 252
 253 namespace Internal {
  254     /** @private @internal Extract the information about a character from the
  255      *  Unicode character tables.
  256      *
  257      *  Characters outside of the Unicode range (i.e. ch >= 0x110000) are
  258      *  treated as UNASSIGNED with no case variants.
           *
           *  Only the declaration appears in this header.  The returned value is
           *  a packed integer which the inline helpers in this namespace pick
           *  apart: get_category() reads the low 5 bits, get_case_type() reads
           *  bits 5-7, and get_delta() reads everything from bit 8 upwards.
           *
           *  @param ch  The Unicode character value to look up.
           *
           *  @return    The packed character info for @a ch.
  259      */
  260     XAPIAN_VISIBILITY_DEFAULT
  261     int XAPIAN_NOTHROW(get_character_info(unsigned ch)) XAPIAN_CONST_FUNCTION;
 262
 263     /** @private @internal Extract how to convert the case of a Unicode
 264      *  character from its info.
 265      */
 266     inline int get_case_type(int info) { return ((info & 0xe0) >> 5); }
 267
 268     /** @private @internal Extract the category of a Unicode character from its
 269      *  info.
 270      */
 271     inline category get_category(int info) {
 272         return static_cast<category>(info & 0x1f);
 273     }
 274
 275     /** @private @internal Extract the delta to use for case conversion of a
 276      *  character from its info.
 277      */
 278     inline int get_delta(int info) {
 279         /* It's implementation defined if sign extension happens when right
 280          * shifting a signed int, although in practice sign extension is what
 281          * most compilers implement.
 282          *
 283          * Some compilers are smart enough to spot common idioms for sign
 284          * extension, but not all (e.g. GCC < 7 doesn't spot the one used
 285          * below), so check what the implementation-defined behaviour is with
 286          * a constant conditional which should get optimised away.
 287          *
 288          * We use the ternary operator here to avoid various compiler
 289          * warnings which writing this as an `if` results in.
 290          */
 291         return ((-1 >> 1) == -1 ?
 292                 // Right shift sign-extends.
 293                 info >> 8 :
 294                 // Right shift shifts in zeros so bitwise-not before and after
 295                 // the shift for negative values.
 296                 (info >= 0) ? (info >> 8) : (~(~info >> 8)));
 297     }
 298 }
 299
  300 /** Convert a single non-ASCII Unicode character to UTF-8.
  301  *
  302  *  This is intended mainly as a helper method for to_utf8().
       *  to_utf8() writes ch < 128 itself and forwards every other value here.
       *  Only the declaration appears in this header.
  303  *
  304  *  @param ch   The character (which must be >= 128) to write to @a buf.
  305  *  @param buf  The buffer to write the character to - it must have
  306  *              space for (at least) 4 bytes.
  307  *
  308  *  @return     The length of the resultant UTF-8 character in bytes.
  309  */
  310 XAPIAN_VISIBILITY_DEFAULT
  311 unsigned nonascii_to_utf8(unsigned ch, char* buf);
 312
 313 /** Convert a single Unicode character to UTF-8.
 314  *
 315  *  @param ch   The character to write to @a buf.
 316  *  @param buf  The buffer to write the character to - it must have
 317  *              space for (at least) 4 bytes.
 318  *
 319  *  @return     The length of the resultant UTF-8 character in bytes.
 320  */
 321 inline unsigned to_utf8(unsigned ch, char* buf) {
 322     if (ch < 128) {
 323         *buf = static_cast<unsigned char>(ch);
 324         return 1;
 325     }
 326     return Xapian::Unicode::nonascii_to_utf8(ch, buf);
 327 }
 328
 329 /** Append the UTF-8 representation of a single Unicode character to a
 330  *  std::string.
 331  */
 332 inline void append_utf8(std::string& s, unsigned ch) {
 333     char buf[4];
 334     s.append(buf, to_utf8(ch, buf));
 335 }
 336
 337 /// Return the category which a given Unicode character falls into.
 338 inline category get_category(unsigned ch) {
 339     return Internal::get_category(Internal::get_character_info(ch));
 340 }
 341
 342 /// Test if a given Unicode character is "word character".
 343 inline bool is_wordchar(unsigned ch) {
 344     const unsigned int WORDCHAR_MASK =
 345             (1 << Xapian::Unicode::UPPERCASE_LETTER) |
 346             (1 << Xapian::Unicode::LOWERCASE_LETTER) |
 347             (1 << Xapian::Unicode::TITLECASE_LETTER) |
 348             (1 << Xapian::Unicode::MODIFIER_LETTER) |
 349             (1 << Xapian::Unicode::OTHER_LETTER) |
 350             (1 << Xapian::Unicode::NON_SPACING_MARK) |
 351             (1 << Xapian::Unicode::ENCLOSING_MARK) |
 352             (1 << Xapian::Unicode::COMBINING_SPACING_MARK) |
 353             (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
 354             (1 << Xapian::Unicode::LETTER_NUMBER) |
 355             (1 << Xapian::Unicode::OTHER_NUMBER) |
 356             (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
 357     return ((WORDCHAR_MASK >> get_category(ch)) & 1);
 358 }
 359
 360 /// Test if a given Unicode character is a whitespace character.
 361 inline bool is_whitespace(unsigned ch) {
 362     const unsigned int WHITESPACE_MASK =
 363             (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
 364             (1 << Xapian::Unicode::SPACE_SEPARATOR) |
 365             (1 << Xapian::Unicode::LINE_SEPARATOR) |
 366             (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
 367     return ((WHITESPACE_MASK >> get_category(ch)) & 1);
 368 }
 369
 370 /// Test if a given Unicode character is a currency symbol.
 371 inline bool is_currency(unsigned ch) {
 372     return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
 373 }
 374
 375 /// Convert a Unicode character to lowercase.
 376 inline unsigned tolower(unsigned ch) {
 377     int info = Xapian::Unicode::Internal::get_character_info(ch);
 378     if (!(Internal::get_case_type(info) & 2))
 379         return ch;
 380     return ch + Internal::get_delta(info);
 381 }
 382
 383 /// Convert a Unicode character to uppercase.
 384 inline unsigned toupper(unsigned ch) {
 385     int info = Xapian::Unicode::Internal::get_character_info(ch);
 386     if (!(Internal::get_case_type(info) & 4))
 387         return ch;
 388     return ch - Internal::get_delta(info);
 389 }
 390
 391 /// Convert a UTF-8 std::string to lowercase.
 392 inline std::string
 393 tolower(const std::string& term)
 394 {
 395     std::string result;
 396     result.reserve(term.size());
 397     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
 398         append_utf8(result, tolower(*i));
 399     }
 400     return result;
 401 }
 402
 403 /// Convert a UTF-8 std::string to uppercase.
 404 inline std::string
 405 toupper(const std::string& term)
 406 {
 407     std::string result;
 408     result.reserve(term.size());
 409     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
 410         append_utf8(result, toupper(*i));
 411     }
 412     return result;
 413 }
 414
 415 }
 416
 417 }
 418
 419 #endif // XAPIAN_INCLUDED_UNICODE_H