include/llvm/Support/UnicodeCharRanges.h

   1 //===--- UnicodeCharRanges.h - Types and functions for character ranges ---===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 #ifndef LLVM_SUPPORT_UNICODECHARRANGES_H
   9 #define LLVM_SUPPORT_UNICODECHARRANGES_H
  10
  11 #include "llvm/ADT/ArrayRef.h"
  12 #include "llvm/ADT/SmallPtrSet.h"
  13 #include "llvm/Support/Compiler.h"
  14 #include "llvm/Support/Debug.h"
  15 #include "llvm/Support/Mutex.h"
  16 #include "llvm/Support/MutexGuard.h"
  17 #include "llvm/Support/raw_ostream.h"
  18 #include <algorithm>
  19
  20 #define DEBUG_TYPE "unicode"
  21
  22 namespace llvm {
  23 namespace sys {
  24
  25 /// Represents a closed range of Unicode code points [Lower, Upper].
  26 struct UnicodeCharRange {
  27   uint32_t Lower;
  28   uint32_t Upper;
  29 };
  30
  31 inline bool operator<(uint32_t Value, UnicodeCharRange Range) {
  32   return Value < Range.Lower;
  33 }
  34 inline bool operator<(UnicodeCharRange Range, uint32_t Value) {
  35   return Range.Upper < Value;
  36 }
  37
  38 /// Holds a reference to an ordered array of UnicodeCharRange and allows
  39 /// to quickly check if a code point is contained in the set represented by this
  40 /// array.
  41 class UnicodeCharSet {
  42 public:
  43   typedef ArrayRef<UnicodeCharRange> CharRanges;
  44
  45   /// Constructs a UnicodeCharSet instance from an array of
  46   /// UnicodeCharRanges.
  47   ///
  48   /// Array pointed by \p Ranges should have the lifetime at least as long as
  49   /// the UnicodeCharSet instance, and should not change. Array is validated by
  50   /// the constructor, so it makes sense to create as few UnicodeCharSet
  51   /// instances per each array of ranges, as possible.
  52 #ifdef NDEBUG
  53
  54   // FIXME: This could use constexpr + static_assert. This way we
  55   // may get rid of NDEBUG in this header. Unfortunately there are some
  56   // problems to get this working with MSVC 2013. Change this when
  57   // the support for MSVC 2013 is dropped.
  58   constexpr UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {}
  59 #else
  60   UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {
  61     assert(rangesAreValid());
  62   }
  63 #endif
  64
  65   /// Returns true if the character set contains the Unicode code point
  66   /// \p C.
  67   bool contains(uint32_t C) const {
  68     return std::binary_search(Ranges.begin(), Ranges.end(), C);
  69   }
  70
  71 private:
  72   /// Returns true if each of the ranges is a proper closed range
  73   /// [min, max], and if the ranges themselves are ordered and non-overlapping.
  74   bool rangesAreValid() const {
  75     uint32_t Prev = 0;
  76     for (CharRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
  77          I != E; ++I) {
  78       if (I != Ranges.begin() && Prev >= I->Lower) {
  79         LLVM_DEBUG(dbgs() << "Upper bound 0x");
  80         LLVM_DEBUG(dbgs().write_hex(Prev));
  81         LLVM_DEBUG(dbgs() << " should be less than succeeding lower bound 0x");
  82         LLVM_DEBUG(dbgs().write_hex(I->Lower) << "\n");
  83         return false;
  84       }
  85       if (I->Upper < I->Lower) {
  86         LLVM_DEBUG(dbgs() << "Upper bound 0x");
  87         LLVM_DEBUG(dbgs().write_hex(I->Lower));
  88         LLVM_DEBUG(dbgs() << " should not be less than lower bound 0x");
  89         LLVM_DEBUG(dbgs().write_hex(I->Upper) << "\n");
  90         return false;
  91       }
  92       Prev = I->Upper;
  93     }
  94
  95     return true;
  96   }
  97
  98   const CharRanges Ranges;
  99 };
 100
 101 } // namespace sys
 102 } // namespace llvm
 103
 104 #undef DEBUG_TYPE // "unicode"
 105
 106 #endif // LLVM_SUPPORT_UNICODECHARRANGES_H