llvm/lib/Support/UnicodeNameToCodepoint.cpp

   1 //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
   2 //-*- C++ -*-===//
   3 //
   4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 // See https://llvm.org/LICENSE.txt for license information.
   6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements functions to map the name or alias of a unicode
  11 // character to its codepoint.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "llvm/ADT/STLExtras.h"
  16 #include "llvm/ADT/StringExtras.h"
  17 #include "llvm/ADT/StringRef.h"
  18 #include "llvm/Support/Unicode.h"
  19
  20 namespace llvm {
  21 namespace sys {
  22 namespace unicode {
  23
  24 extern const char *UnicodeNameToCodepointDict;
  25 extern const uint8_t *UnicodeNameToCodepointIndex;
  26 extern const std::size_t UnicodeNameToCodepointIndexSize;
  27 extern const std::size_t UnicodeNameToCodepointLargestNameSize;
  28
  29 using BufferType = SmallString<64>;
  30
  31 struct Node {
  32   bool IsRoot = false;
  33   char32_t Value = 0xFFFFFFFF;
  34   uint32_t ChildrenOffset = 0;
  35   bool HasSibling = false;
  36   uint32_t Size = 0;
  37   StringRef Name;
  38   const Node *Parent = nullptr;
  39
  40   constexpr bool isValid() const {
  41     return !Name.empty() || Value == 0xFFFFFFFF;
  42   }
  43   constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
  44
  45   std::string fullName() const {
  46     std::string S;
  47     // Reserve enough space for most unicode code points.
  48     // The chosen value represent the 99th percentile of name size as of
  49     // Unicode 15.0.
  50     S.reserve(46);
  51     const Node *N = this;
  52     while (N) {
  53       std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
  54       N = N->Parent;
  55     }
  56     std::reverse(S.begin(), S.end());
  57     return S;
  58   }
  59 };
  60
  61 static Node createRoot() {
  62   Node N;
  63   N.IsRoot = true;
  64   N.ChildrenOffset = 1;
  65   N.Size = 1;
  66   return N;
  67 }
  68
  69 static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
  70   if (Offset == 0)
  71     return createRoot();
  72
  73   uint32_t Origin = Offset;
  74   Node N;
  75   N.Parent = Parent;
  76   uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
  77   if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
  78     return N;
  79
  80   bool LongName = NameInfo & 0x40;
  81   bool HasValue = NameInfo & 0x80;
  82   std::size_t Size = NameInfo & ~0xC0;
  83   if (LongName) {
  84     uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
  85     NameOffset |= UnicodeNameToCodepointIndex[Offset++];
  86     N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
  87   } else {
  88     N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
  89   }
  90   if (HasValue) {
  91     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
  92     uint8_t M = UnicodeNameToCodepointIndex[Offset++];
  93     uint8_t L = UnicodeNameToCodepointIndex[Offset++];
  94     N.Value = ((H << 16) | (M << 8) | L) >> 3;
  95
  96     bool HasChildren = L & 0x02;
  97     N.HasSibling = L & 0x01;
  98
  99     if (HasChildren) {
 100       N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
 101       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
 102       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
 103     }
 104   } else {
 105     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
 106     N.HasSibling = H & 0x80;
 107     bool HasChildren = H & 0x40;
 108     H &= uint8_t(~0xC0);
 109     if (HasChildren) {
 110       N.ChildrenOffset = (H << 16);
 111       N.ChildrenOffset |=
 112           (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
 113       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
 114     }
 115   }
 116   N.Size = Offset - Origin;
 117   return N;
 118 }
 119
 120 static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
 121                        std::size_t &Consummed, char &PreviousCharInName,
 122                        bool IsPrefix = false) {
 123
 124   Consummed = 0;
 125   if (Strict) {
 126     if (!Name.starts_with(Needle))
 127       return false;
 128     Consummed = Needle.size();
 129     return true;
 130   }
 131   if (Needle.empty())
 132     return true;
 133
 134   auto NamePos = Name.begin();
 135   auto NeedlePos = Needle.begin();
 136
 137   char PreviousCharInNameOrigin = PreviousCharInName;
 138   char PreviousCharInNeedle = *Needle.begin();
 139   auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
 140                          bool IsPrefix = false) {
 141     while (It != End) {
 142       const auto Next = std::next(It);
 143       // Ignore spaces, underscore, medial hyphens
 144       // The generator ensures a needle never ends (or starts) by a medial
 145       // hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
 146       bool Ignore =
 147           *It == ' ' || *It == '_' ||
 148           (*It == '-' && isAlnum(PreviousChar) &&
 149            ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
 150       PreviousChar = *It;
 151       if (!Ignore)
 152         break;
 153       ++It;
 154     }
 155     return It;
 156   };
 157
 158   while (true) {
 159     NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
 160     NeedlePos =
 161         IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
 162     if (NeedlePos == Needle.end())
 163       break;
 164     if (NamePos == Name.end())
 165       break;
 166     if (toUpper(*NeedlePos) != toUpper(*NamePos))
 167       break;
 168     NeedlePos++;
 169     NamePos++;
 170   }
 171   Consummed = std::distance(Name.begin(), NamePos);
 172   if (NeedlePos != Needle.end()) {
 173     PreviousCharInName = PreviousCharInNameOrigin;
 174   }
 175   return NeedlePos == Needle.end();
 176 }
 177
 178 static std::tuple<Node, bool, uint32_t>
 179 compareNode(uint32_t Offset, StringRef Name, bool Strict,
 180             char PreviousCharInName, BufferType &Buffer,
 181             const Node *Parent = nullptr) {
 182   Node N = readNode(Offset, Parent);
 183   std::size_t Consummed = 0;
 184   bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
 185                                               PreviousCharInName);
 186   if (!DoesStartWith)
 187     return std::make_tuple(N, false, 0);
 188
 189   if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
 190     return std::make_tuple(N, true, N.Value);
 191
 192   if (N.hasChildren()) {
 193     uint32_t ChildOffset = N.ChildrenOffset;
 194     for (;;) {
 195       Node C;
 196       bool Matches;
 197       uint32_t Value;
 198       std::tie(C, Matches, Value) =
 199           compareNode(ChildOffset, Name.substr(Consummed), Strict,
 200                       PreviousCharInName, Buffer, &N);
 201       if (Matches) {
 202         std::reverse_copy(C.Name.begin(), C.Name.end(),
 203                           std::back_inserter(Buffer));
 204         return std::make_tuple(N, true, Value);
 205       }
 206       ChildOffset += C.Size;
 207       if (!C.HasSibling)
 208         break;
 209     }
 210   }
 211   return std::make_tuple(N, false, 0);
 212 }
 213
 214 static std::tuple<Node, bool, uint32_t>
 215 compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
 216   return compareNode(Offset, Name, Strict, 0, Buffer);
 217 }
 218
 219 // clang-format off
 220 constexpr const char *const HangulSyllables[][3] = {
 221     { "G",  "A",   ""   },
 222     { "GG", "AE",  "G"  },
 223     { "N",  "YA",  "GG" },
 224     { "D",  "YAE", "GS" },
 225     { "DD", "EO",  "N", },
 226     { "R",  "E",   "NJ" },
 227     { "M",  "YEO", "NH" },
 228     { "B",  "YE",  "D"  },
 229     { "BB", "O",   "L"  },
 230     { "S",  "WA",  "LG" },
 231     { "SS", "WAE", "LM" },
 232     { "",   "OE",  "LB" },
 233     { "J",  "YO",  "LS" },
 234     { "JJ", "U",   "LT" },
 235     { "C",  "WEO", "LP" },
 236     { "K",  "WE",  "LH" },
 237     { "T",  "WI",  "M"  },
 238     { "P",  "YU",  "B"  },
 239     { "H",  "EU",  "BS" },
 240     { 0,    "YI",  "S"  },
 241     { 0,    "I",   "SS" },
 242     { 0,    0,     "NG" },
 243     { 0,    0,     "J"  },
 244     { 0,    0,     "C"  },
 245     { 0,    0,     "K"  },
 246     { 0,    0,     "T"  },
 247     { 0,    0,     "P"  },
 248     { 0,    0,     "H"  }
 249     };
 250 // clang-format on
 251
 252 // Unicode 15.0
 253 // 3.12 Conjoining Jamo Behavior Common constants
 254 constexpr const char32_t SBase = 0xAC00;
 255 constexpr const uint32_t LCount = 19;
 256 constexpr const uint32_t VCount = 21;
 257 constexpr const uint32_t TCount = 28;
 258
 259 static std::size_t findSyllable(StringRef Name, bool Strict,
 260                                 char &PreviousInName, int &Pos, int Column) {
 261   assert(Column == 0 || Column == 1 || Column == 2);
 262   static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
 263   int Len = -1;
 264   int Prev = PreviousInName;
 265   for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
 266     StringRef Syllable(HangulSyllables[I][Column]);
 267     if (int(Syllable.size()) <= Len)
 268       continue;
 269     std::size_t Consummed = 0;
 270     char PreviousInNameCopy = PreviousInName;
 271     bool DoesStartWith =
 272         startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
 273     if (!DoesStartWith)
 274       continue;
 275     Len = Consummed;
 276     Pos = I;
 277     Prev = PreviousInNameCopy;
 278   }
 279   if (Len == -1)
 280     return 0;
 281   PreviousInName = Prev;
 282   return size_t(Len);
 283 }
 284
 285 static std::optional<char32_t>
 286 nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
 287   Buffer.clear();
 288   // Hangul Syllable Decomposition
 289   std::size_t Consummed = 0;
 290   char NameStart = 0;
 291   bool DoesStartWith =
 292       startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
 293   if (!DoesStartWith)
 294     return std::nullopt;
 295   Name = Name.substr(Consummed);
 296   int L = -1, V = -1, T = -1;
 297   Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
 298   Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
 299   Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
 300   if (L != -1 && V != -1 && T != -1 && Name.empty()) {
 301     if (!Strict) {
 302       Buffer.append("HANGUL SYLLABLE ");
 303       if (L != -1)
 304         Buffer.append(HangulSyllables[L][0]);
 305       if (V != -1)
 306         Buffer.append(HangulSyllables[V][1]);
 307       if (T != -1)
 308         Buffer.append(HangulSyllables[T][2]);
 309     }
 310     return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
 311            std::uint32_t(T);
 312   }
 313   // Otherwise, it's an illegal syllable name.
 314   return std::nullopt;
 315 }
 316
 317 struct GeneratedNamesData {
 318   StringRef Prefix;
 319   uint32_t Start;
 320   uint32_t End;
 321 };
 322
 323 // Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings
 324 static const GeneratedNamesData GeneratedNamesDataTable[] = {
 325     {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
 326     {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
 327     {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
 328     {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
 329     {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
 330     {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
 331     {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
 332     {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
 333     {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
 334     {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
 335     {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
 336     {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
 337     {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
 338     {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
 339     {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
 340     {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
 341     {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
 342 };
 343
 344 static std::optional<char32_t>
 345 nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
 346   for (auto &&Item : GeneratedNamesDataTable) {
 347     Buffer.clear();
 348     std::size_t Consummed = 0;
 349     char NameStart = 0;
 350     bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
 351                                     NameStart, /*IsPrefix=*/true);
 352     if (!DoesStartWith)
 353       continue;
 354     auto Number = Name.substr(Consummed);
 355     unsigned long long V = 0;
 356     // Be consistent about mandating upper casing.
 357     if (Strict &&
 358         llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
 359       return {};
 360     if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
 361       continue;
 362     if (!Strict) {
 363       Buffer.append(Item.Prefix);
 364       Buffer.append(utohexstr(V, true));
 365     }
 366     return V;
 367   }
 368   return std::nullopt;
 369 }
 370
 371 static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
 372                                                BufferType &Buffer) {
 373   if (Name.empty())
 374     return std::nullopt;
 375
 376   std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
 377   if (!Res)
 378     Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
 379   if (Res)
 380     return *Res;
 381
 382   Buffer.clear();
 383   Node Node;
 384   bool Matches;
 385   uint32_t Value;
 386   std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
 387   if (Matches) {
 388     std::reverse(Buffer.begin(), Buffer.end());
 389     // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
 390     // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
 391     if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) {
 392       Buffer = "HANGUL JUNGSEONG O-E";
 393       Value = 0x1180;
 394     }
 395     return Value;
 396   }
 397   return std::nullopt;
 398 }
 399
 400 std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
 401
 402   BufferType Buffer;
 403   auto Opt = nameToCodepoint(Name, true, Buffer);
 404   return Opt;
 405 }
 406
 407 std::optional<LooseMatchingResult>
 408 nameToCodepointLooseMatching(StringRef Name) {
 409   BufferType Buffer;
 410   auto Opt = nameToCodepoint(Name, false, Buffer);
 411   if (!Opt)
 412     return std::nullopt;
 413   return LooseMatchingResult{*Opt, Buffer};
 414 }
 415
 416 // Find the unicode character whose editing distance to Pattern
 417 // is shortest, using the Wagner–Fischer algorithm.
 418 llvm::SmallVector<MatchForCodepointName>
 419 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
 420   // We maintain a fixed size vector of matches,
 421   // sorted by distance
 422   // The worst match (with the biggest distance) are discarded when new elements
 423   // are added.
 424   std::size_t LargestEditDistance = 0;
 425   llvm::SmallVector<MatchForCodepointName> Matches;
 426   Matches.reserve(MaxMatchesCount + 1);
 427
 428   auto Insert = [&](const Node &Node, uint32_t Distance,
 429                     char32_t Value) -> bool {
 430     if (Distance > LargestEditDistance) {
 431       if (Matches.size() == MaxMatchesCount)
 432         return false;
 433       LargestEditDistance = Distance;
 434     }
 435     // To avoid allocations, the creation of the name is delayed
 436     // as much as possible.
 437     std::string Name;
 438     auto GetName = [&] {
 439       if (Name.empty())
 440         Name = Node.fullName();
 441       return Name;
 442     };
 443
 444     auto It = llvm::lower_bound(
 445         Matches, Distance,
 446         [&](const MatchForCodepointName &a, std::size_t Distance) {
 447           if (Distance == a.Distance)
 448             return a.Name < GetName();
 449           return a.Distance < Distance;
 450         });
 451     if (It == Matches.end() && Matches.size() == MaxMatchesCount)
 452       return false;
 453
 454     MatchForCodepointName M{GetName(), Distance, Value};
 455     Matches.insert(It, std::move(M));
 456     if (Matches.size() > MaxMatchesCount)
 457       Matches.pop_back();
 458     return true;
 459   };
 460
 461   // We ignore case, space, hyphens, etc,
 462   // in both the search pattern and the prospective names.
 463   auto Normalize = [](StringRef Name) {
 464     std::string Out;
 465     Out.reserve(Name.size());
 466     for (char C : Name) {
 467       if (isAlnum(C))
 468         Out.push_back(toUpper(C));
 469     }
 470     return Out;
 471   };
 472   std::string NormalizedName = Normalize(Pattern);
 473
 474   // Allocate a matrix big enough for longest names.
 475   const std::size_t Columns =
 476       std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
 477       1;
 478
 479   LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
 480       UnicodeNameToCodepointLargestNameSize + 1;
 481
 482   std::vector<char> Distances(
 483       Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
 484
 485   auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
 486     assert(Column < Columns);
 487     assert(Row < Rows);
 488     return Distances[Row * Columns + Column];
 489   };
 490
 491   for (std::size_t I = 0; I < Columns; I++)
 492     Get(I, 0) = I;
 493
 494   // Visit the childrens,
 495   // Filling (and overriding) the matrix for the name fragment of each node
 496   // iteratively. CompleteName is used to collect the actual name of potential
 497   // match, respecting case and spacing.
 498   auto VisitNode = [&](const Node &N, std::size_t Row,
 499                        auto &VisitNode) -> void {
 500     std::size_t J = 0;
 501     for (; J < N.Name.size(); J++) {
 502       if (!isAlnum(N.Name[J]))
 503         continue;
 504
 505       Get(0, Row) = Row;
 506
 507       for (std::size_t I = 1; I < Columns; I++) {
 508         const int Delete = Get(I - 1, Row) + 1;
 509         const int Insert = Get(I, Row - 1) + 1;
 510
 511         const int Replace =
 512             Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
 513
 514         Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
 515       }
 516
 517       Row++;
 518     }
 519
 520     unsigned Cost = Get(Columns - 1, Row - 1);
 521     if (N.Value != 0xFFFFFFFF) {
 522       Insert(N, Cost, N.Value);
 523     }
 524
 525     if (N.hasChildren()) {
 526       auto ChildOffset = N.ChildrenOffset;
 527       for (;;) {
 528         Node C = readNode(ChildOffset, &N);
 529         ChildOffset += C.Size;
 530         if (!C.isValid())
 531           break;
 532         VisitNode(C, Row, VisitNode);
 533         if (!C.HasSibling)
 534           break;
 535       }
 536     }
 537   };
 538
 539   Node Root = createRoot();
 540   VisitNode(Root, 1, VisitNode);
 541   return Matches;
 542 }
 543
 544 } // namespace unicode
 545
 546 } // namespace sys
 547 } // namespace llvm