llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp

   1 //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
  10 // using UnicodeData.txt and NameAliases.txt available at
  11 // https://unicode.org/Public/15.1.0/ucd/
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "llvm/ADT/STLExtras.h"
  15 #include "llvm/ADT/StringExtras.h"
  16 #include "llvm/ADT/StringRef.h"
  17 #include <algorithm>
  18 #include <array>
  19 #include <deque>
  20 #include <fstream>
  21 #include <memory>
  22 #include <optional>
  23 #include <set>
  24 #include <string>
  25 #include <unordered_map>
  26 #include <utility>
  27 #include <vector>
  28
  29 static const llvm::StringRef Letters =
  30     " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
  31
  32 // Collect names UnicodeData.txt and AliasNames.txt
  33 // There may be multiple names per code points.
  34 static std::unordered_multimap<char32_t, std::string>
  35 loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
  36   std::unordered_multimap<char32_t, std::string> CollectedCharacters;
  37   auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
  38     std::ifstream InputFile(File);
  39     for (std::string Line; getline(InputFile, Line);) {
  40       if (Line.empty() || !isxdigit(Line[0]))
  41         continue;
  42       auto FirstSemiPos = Line.find(';');
  43       if (FirstSemiPos == std::string::npos)
  44         continue;
  45       auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
  46       if (SecondSemiPos == std::string::npos)
  47         continue;
  48       unsigned long long CodePoint;
  49       if (llvm::getAsUnsignedInteger(
  50               llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
  51         continue;
  52       }
  53
  54       std::string Name =
  55           Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
  56
  57       if (!Name.empty() && Name[0] == '<') {
  58         // Ignore ranges of characters, as their name is either absent or
  59         // generated.
  60         continue;
  61       }
  62
  63       // Some aliases are ignored for compatibility with C++
  64       if (IsAliasFile) {
  65         std::string Kind = Line.substr(SecondSemiPos + 1);
  66         if (Kind != "control" && Kind != "correction" && Kind != "alternate")
  67           continue;
  68       }
  69
  70       auto InsertUnique = [&](char32_t CP, std::string Name) {
  71         auto It = CollectedCharacters.find(CP);
  72         while (It != std::end(CollectedCharacters) && It->first == CP) {
  73           if (It->second == Name)
  74             return;
  75           ++It;
  76         }
  77         CollectedCharacters.insert({CP, std::move(Name)});
  78       };
  79       InsertUnique(CodePoint, std::move(Name));
  80     }
  81   };
  82
  83   FromFile(NamesFile);
  84   FromFile(AliasesFile, true);
  85   return CollectedCharacters;
  86 }
  87
  88 class Trie {
  89   struct Node;
  90
  91 public:
  92   // When inserting named codepoint
  93   // We create a node per character in the name.
  94   // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
  95   // Once all  characters are inserted, the tree is compacted
  96   void insert(llvm::StringRef Name, char32_t Codepoint) {
  97     Node *N = Root.get();
  98     bool IsBeforeMedial = false;
  99     for (auto ChIt = Name.begin(); ChIt != Name.end();
 100          ChIt += (IsBeforeMedial ? 3 : 1)) {
 101       char Ch = *ChIt;
 102       assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
 103
 104       std::string Label(1, Ch);
 105
 106       // We need to ensure a node never ends or starts by
 107       // a medial hyphen as this would break the
 108       // loose matching algorithm.
 109       IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() &&
 110                        *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
 111                        llvm::isAlnum(*(ChIt + 2));
 112       if (IsBeforeMedial)
 113         Label.assign(ChIt, ChIt + 3);
 114
 115       auto It = llvm::find_if(N->Children,
 116                               [&](const auto &C) { return C->Name == Label; });
 117       if (It == N->Children.end()) {
 118         It = N->Children.insert(It, std::make_unique<Node>(Label, N));
 119       }
 120       N = It->get();
 121     }
 122     N->Value = Codepoint;
 123   }
 124
 125   void compact() { compact(Root.get()); }
 126
 127   // This creates 2 arrays of bytes from the tree:
 128   // A serialized dictionary of node labels,
 129   // And the nodes themselves.
 130   // The name of each label is found by indexing into the dictionary.
 131   // The longest names are inserted first into the dictionary,
 132   // in the hope it will contain shorter labels as substring,
 133   // thereby reducing duplication.
 134   // We could theorically be more clever by trying to minimizing the size
 135   // of the dictionary.
 136   std::pair<std::string, std::vector<uint8_t>> serialize() {
 137     std::set<std::string> Names = this->getNameFragments();
 138     std::vector<std::string> Sorted(Names.begin(), Names.end());
 139     llvm::sort(Sorted, [](const auto &a, const auto &b) {
 140       return a.size() > b.size();
 141     });
 142     std::string Dict(Letters.begin(), Letters.end());
 143     Dict.reserve(50000);
 144     for (const std::string &Name : Sorted) {
 145       if (Name.size() <= 1)
 146         continue;
 147       if (Dict.find(Name) != std::string::npos)
 148         continue;
 149       Dict += Name;
 150     }
 151
 152     if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
 153       fprintf(stderr, "Dictionary too big  to be serialized");
 154       exit(1);
 155     }
 156
 157     auto Bytes = dumpIndex(Dict);
 158     return {Dict, Bytes};
 159   }
 160
 161   std::set<std::string> getNameFragments() {
 162     std::set<std::string> Keys;
 163     collectKeys(Root.get(), Keys);
 164     return Keys;
 165   }
 166
 167   // Maps a valid char in an Unicode character name
 168   // To a 6 bits index.
 169   static uint8_t letter(char C) {
 170     auto Pos = Letters.find(C);
 171     assert(Pos != std::string::npos &&
 172            "Invalid letter in Unicode character name");
 173     return Pos;
 174   }
 175
 176   // clang-format off
 177   // +================+============+======================+=============+========+===+==============+===============+
 178   // | 0          | 1             | 2-7 (6)              | 8-23        | 24-44  |    | 46           | 47            |
 179   // +================+============+======================+=============+========+===+==============+===============+
 180   // | Has Value |  Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Sibling  | Has Children  |
 181   // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
 182   // clang-format on
 183
 184   std::vector<uint8_t> dumpIndex(const std::string &Dict) {
 185     struct ChildrenOffset {
 186       Node *FirstChild;
 187       std::size_t Offset;
 188       bool HasValue;
 189     };
 190
 191     // Keep track of the start of each node
 192     // position in the serialized data.
 193     std::unordered_map<Node *, int32_t> Offsets;
 194
 195     // Keep track of where to write the index
 196     // of the first children
 197     std::vector<ChildrenOffset> ChildrenOffsets;
 198     std::unordered_map<Node *, bool> SiblingTracker;
 199     std::deque<Node *> AllNodes;
 200     std::vector<uint8_t> Bytes;
 201     Bytes.reserve(250'000);
 202     // This leading byte is used by the reading code to detect the root node.
 203     Bytes.push_back(0);
 204
 205     auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
 206       for (std::size_t Index = 0; Index < Children.size(); Index++) {
 207         const std::unique_ptr<Node> &Child = Children[Index];
 208         AllNodes.push_back(Child.get());
 209         if (Index != Children.size() - 1)
 210           SiblingTracker[Child.get()] = true;
 211       }
 212     };
 213     CollectChildren(Root->Children);
 214
 215     while (!AllNodes.empty()) {
 216       const std::size_t Offset = Bytes.size();
 217       Node *const N = AllNodes.front();
 218       AllNodes.pop_front();
 219
 220       assert(!N->Name.empty());
 221       Offsets[N] = Offset;
 222
 223       uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
 224       // Single letter node are indexed in 6 bits
 225       if (N->Name.size() == 1) {
 226         FirstByte |= letter(N->Name[0]);
 227         Bytes.push_back(FirstByte);
 228       } else {
 229         // Otherwise we use a 16 bits index
 230         FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
 231         Bytes.push_back(FirstByte);
 232         auto PosInDict = Dict.find(N->Name);
 233         assert(PosInDict != std::string::npos);
 234         uint8_t Low = PosInDict;
 235         uint8_t High = ((PosInDict >> 8) & 0xFF);
 236         Bytes.push_back(High);
 237         Bytes.push_back(Low);
 238       }
 239
 240       const bool HasSibling = SiblingTracker.count(N) != 0;
 241       const bool HasChildren = N->Children.size() != 0;
 242
 243       if (!!N->Value) {
 244         uint32_t Value = (*(N->Value) << 3);
 245         uint8_t H = ((Value >> 16) & 0xFF);
 246         uint8_t M = ((Value >> 8) & 0xFF);
 247         uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
 248                     uint8_t(HasChildren ? 0x02 : 0);
 249
 250         Bytes.push_back(H);
 251         Bytes.push_back(M);
 252         Bytes.push_back(L);
 253
 254         if (HasChildren) {
 255           ChildrenOffsets.push_back(
 256               ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
 257           // index of the first children
 258           Bytes.push_back(0x00);
 259           Bytes.push_back(0x00);
 260           Bytes.push_back(0x00);
 261         }
 262       } else {
 263         // When there is no value (that's most intermediate nodes)
 264         // Dispense of the 3 values bytes, and only store
 265         // 1 byte to track whether the node has sibling and children
 266         // + 2 bytes for the index of the first children if necessary.
 267         // That index also uses bytes 0-6 of the previous byte.
 268         uint8_t Byte =
 269             uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
 270         Bytes.push_back(Byte);
 271         if (HasChildren) {
 272           ChildrenOffsets.emplace_back(
 273               ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
 274           Bytes.push_back(0x00);
 275           Bytes.push_back(0x00);
 276         }
 277       }
 278       CollectChildren(N->Children);
 279     }
 280
 281     // Once all the nodes are in the inndex
 282     // Fill the bytes we left to indicate the position
 283     // of the children
 284     for (const ChildrenOffset &Parent : ChildrenOffsets) {
 285       const auto It = Offsets.find(Parent.FirstChild);
 286       assert(It != Offsets.end());
 287       std::size_t Pos = It->second;
 288       if (Parent.HasValue) {
 289         Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
 290       } else {
 291         Bytes[Parent.Offset] =
 292             Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
 293       }
 294       Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
 295       Bytes[Parent.Offset + 2] = Pos & 0xFF;
 296     }
 297
 298     // Add some padding so that the deserialization code
 299     // doesn't try to read past the enf of the array.
 300     Bytes.push_back(0);
 301     Bytes.push_back(0);
 302     Bytes.push_back(0);
 303     Bytes.push_back(0);
 304     Bytes.push_back(0);
 305     Bytes.push_back(0);
 306
 307     return Bytes;
 308   }
 309
 310 private:
 311   void collectKeys(Node *N, std::set<std::string> &Keys) {
 312     Keys.insert(N->Name);
 313     for (const std::unique_ptr<Node> &Child : N->Children) {
 314       collectKeys(Child.get(), Keys);
 315     }
 316   }
 317
 318   // Merge sequences of 1-character nodes
 319   // This greatly reduce the total number of nodes,
 320   // and therefore the size of the index.
 321   // When the tree gets serialized, we only have 5 bytes to store the
 322   // size of a name. Overlong names (>32 characters) are therefore
 323   // kep into separate nodes
 324   void compact(Node *N) {
 325     for (auto &&Child : N->Children) {
 326       compact(Child.get());
 327     }
 328     if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
 329         (N->Parent->Name.size() + N->Name.size() <= 32)) {
 330       N->Parent->Value = N->Value;
 331       N->Parent->Name += N->Name;
 332       N->Parent->Children = std::move(N->Children);
 333       for (std::unique_ptr<Node> &c : N->Parent->Children) {
 334         c->Parent = N->Parent;
 335       }
 336     }
 337   }
 338   struct Node {
 339     Node(std::string Name, Node *Parent = nullptr)
 340         : Name(Name), Parent(Parent) {}
 341
 342     std::vector<std::unique_ptr<Node>> Children;
 343     std::string Name;
 344     Node *Parent = nullptr;
 345     std::optional<char32_t> Value;
 346   };
 347
 348   std::unique_ptr<Node> Root = std::make_unique<Node>("");
 349 };
 350
 351 extern const char *UnicodeLicense;
 352
 353 int main(int argc, char **argv) {
 354   printf("Unicode name -> codepoint mapping generator\n"
 355          "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
 356          argv[0]);
 357   printf("NameAliases.txt can be found at "
 358          "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
 359          "UnicodeData.txt can be found at "
 360          "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
 361
 362   if (argc != 4)
 363     return EXIT_FAILURE;
 364
 365   FILE *Out = fopen(argv[3], "w");
 366   if (!Out) {
 367     printf("Error creating output file.\n");
 368     return EXIT_FAILURE;
 369   }
 370
 371   Trie T;
 372   uint32_t NameCount = 0;
 373   std::size_t LongestName = 0;
 374   auto Entries = loadDataFiles(argv[1], argv[2]);
 375   for (const std::pair<const char32_t, std::string> &Entry : Entries) {
 376     char32_t Codepoint = Entry.first;
 377     const std::string &Name = Entry.second;
 378     // Ignore names which are not valid.
 379     if (Name.empty() ||
 380         !llvm::all_of(Name, [](char C) { return Letters.contains(C); })) {
 381       continue;
 382     }
 383     printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
 384     T.insert(Name, Codepoint);
 385     LongestName =
 386         std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
 387     NameCount++;
 388   }
 389   T.compact();
 390
 391   std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
 392   const std::string &Dict = Data.first;
 393   const std::vector<uint8_t> &Tree = Data.second;
 394
 395   fprintf(Out, R"(
 396 //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
 397 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 398 // See https://llvm.org/LICENSE.txt for license information.
 399 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 400 //
 401 //===----------------------------------------------------------------------===//
 402 //
 403 // This file implements mapping the name of a unicode code point to its value.
 404 //
 405 // This file was generated using %s.
 406 // Do not edit manually.
 407 //
 408 //===----------------------------------------------------------------------===//
 409 %s
 410
 411
 412
 413 #include "llvm/Support/Compiler.h"
 414 #include <cstddef>
 415 #include <cstdint>
 416 )",
 417           argv[0], UnicodeLicense);
 418
 419   fprintf(Out,
 420           "namespace llvm { namespace sys { namespace unicode { \n"
 421           "extern const char *UnicodeNameToCodepointDict;\n"
 422           "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
 423           "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
 424           "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
 425
 426   fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
 427           Dict.c_str());
 428
 429   fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
 430           Tree.size() + 1);
 431
 432   for (auto Byte : Tree) {
 433     fprintf(Out, "0x%02x,", Byte);
 434   }
 435
 436   fprintf(Out, "0};");
 437   fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
 438                "UnicodeNameToCodepointIndex_; \n");
 439   fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
 440           Tree.size() + 1);
 441   fprintf(Out,
 442           "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
 443           LongestName);
 444   fprintf(Out, "\n}}}\n");
 445   fclose(Out);
 446   printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
 447          argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
 448 }
 449
 450 const char *UnicodeLicense = R"(
 451 /*
 452 UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 453
 454 See Terms of Use <https://www.unicode.org/copyright.html>
 455 for definitions of Unicode Inc.’s Data Files and Software.
 456
 457 NOTICE TO USER: Carefully read the following legal agreement.
 458 BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
 459 DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
 460 YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
 461 TERMS AND CONDITIONS OF THIS AGREEMENT.
 462 IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
 463 THE DATA FILES OR SOFTWARE.
 464
 465 COPYRIGHT AND PERMISSION NOTICE
 466
 467 Copyright © 1991-2022 Unicode, Inc. All rights reserved.
 468 Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
 469
 470 Permission is hereby granted, free of charge, to any person obtaining
 471 a copy of the Unicode data files and any associated documentation
 472 (the "Data Files") or Unicode software and any associated documentation
 473 (the "Software") to deal in the Data Files or Software
 474 without restriction, including without limitation the rights to use,
 475 copy, modify, merge, publish, distribute, and/or sell copies of
 476 the Data Files or Software, and to permit persons to whom the Data Files
 477 or Software are furnished to do so, provided that either
 478 (a) this copyright and permission notice appear with all copies
 479 of the Data Files or Software, or
 480 (b) this copyright and permission notice appear in associated
 481 Documentation.
 482
 483 THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 484 ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 485 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 486 NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 487 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 488 NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 489 DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 490 DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 491 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 492 PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 493
 494 Except as contained in this notice, the name of a copyright holder
 495 shall not be used in advertising or otherwise to promote the sale,
 496 use or other dealings in these Data Files or Software without prior
 497 written authorization of the copyright holder.
 498 */
 499 )";