1 //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10 // using UnicodeData.txt and NameAliases.txt available at
11 // https://unicode.org/Public/15.1.0/ucd/
12 //===----------------------------------------------------------------------===//
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include "llvm/ADT/StringRef.h"
25 #include <unordered_map>
29 static const llvm::StringRef Letters
=
30 " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
32 // Collect names UnicodeData.txt and AliasNames.txt
33 // There may be multiple names per code points.
34 static std::unordered_multimap
<char32_t
, std::string
>
35 loadDataFiles(const std::string
&NamesFile
, const std::string
&AliasesFile
) {
36 std::unordered_multimap
<char32_t
, std::string
> CollectedCharacters
;
37 auto FromFile
= [&](const std::string
&File
, bool IsAliasFile
= false) {
38 std::ifstream
InputFile(File
);
39 for (std::string Line
; getline(InputFile
, Line
);) {
40 if (Line
.empty() || !isxdigit(Line
[0]))
42 auto FirstSemiPos
= Line
.find(';');
43 if (FirstSemiPos
== std::string::npos
)
45 auto SecondSemiPos
= Line
.find(';', FirstSemiPos
+ 1);
46 if (SecondSemiPos
== std::string::npos
)
48 unsigned long long CodePoint
;
49 if (llvm::getAsUnsignedInteger(
50 llvm::StringRef(Line
.c_str(), FirstSemiPos
), 16, CodePoint
)) {
55 Line
.substr(FirstSemiPos
+ 1, SecondSemiPos
- FirstSemiPos
- 1);
57 if (!Name
.empty() && Name
[0] == '<') {
58 // Ignore ranges of characters, as their name is either absent or
63 // Some aliases are ignored for compatibility with C++
65 std::string Kind
= Line
.substr(SecondSemiPos
+ 1);
66 if (Kind
!= "control" && Kind
!= "correction" && Kind
!= "alternate")
70 auto InsertUnique
= [&](char32_t CP
, std::string Name
) {
71 auto It
= CollectedCharacters
.find(CP
);
72 while (It
!= std::end(CollectedCharacters
) && It
->first
== CP
) {
73 if (It
->second
== Name
)
77 CollectedCharacters
.insert({CP
, std::move(Name
)});
79 InsertUnique(CodePoint
, std::move(Name
));
84 FromFile(AliasesFile
, true);
85 return CollectedCharacters
;
92 // When inserting named codepoint
93 // We create a node per character in the name.
94 // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95 // Once all characters are inserted, the tree is compacted
96 void insert(llvm::StringRef Name
, char32_t Codepoint
) {
98 bool IsBeforeMedial
= false;
99 for (auto ChIt
= Name
.begin(); ChIt
!= Name
.end();
100 ChIt
+= (IsBeforeMedial
? 3 : 1)) {
102 assert(Letters
.contains(Ch
) && "Unexpected symbol in Unicode name");
104 std::string
Label(1, Ch
);
106 // We need to ensure a node never ends or starts by
107 // a medial hyphen as this would break the
108 // loose matching algorithm.
109 IsBeforeMedial
= llvm::isAlnum(Ch
) && ChIt
+ 1 != Name
.end() &&
110 *(ChIt
+ 1) == '-' && ChIt
+ 2 != Name
.end() &&
111 llvm::isAlnum(*(ChIt
+ 2));
113 Label
.assign(ChIt
, ChIt
+ 3);
115 auto It
= llvm::find_if(N
->Children
,
116 [&](const auto &C
) { return C
->Name
== Label
; });
117 if (It
== N
->Children
.end()) {
118 It
= N
->Children
.insert(It
, std::make_unique
<Node
>(Label
, N
));
122 N
->Value
= Codepoint
;
125 void compact() { compact(Root
.get()); }
127 // This creates 2 arrays of bytes from the tree:
128 // A serialized dictionary of node labels,
129 // And the nodes themselves.
130 // The name of each label is found by indexing into the dictionary.
131 // The longest names are inserted first into the dictionary,
132 // in the hope it will contain shorter labels as substring,
133 // thereby reducing duplication.
134 // We could theorically be more clever by trying to minimizing the size
135 // of the dictionary.
136 std::pair
<std::string
, std::vector
<uint8_t>> serialize() {
137 std::set
<std::string
> Names
= this->getNameFragments();
138 std::vector
<std::string
> Sorted(Names
.begin(), Names
.end());
139 llvm::sort(Sorted
, [](const auto &a
, const auto &b
) {
140 return a
.size() > b
.size();
142 std::string
Dict(Letters
.begin(), Letters
.end());
144 for (const std::string
&Name
: Sorted
) {
145 if (Name
.size() <= 1)
147 if (Dict
.find(Name
) != std::string::npos
)
152 if (Dict
.size() >= std::numeric_limits
<uint16_t>::max()) {
153 fprintf(stderr
, "Dictionary too big to be serialized");
157 auto Bytes
= dumpIndex(Dict
);
158 return {Dict
, Bytes
};
161 std::set
<std::string
> getNameFragments() {
162 std::set
<std::string
> Keys
;
163 collectKeys(Root
.get(), Keys
);
167 // Maps a valid char in an Unicode character name
168 // To a 6 bits index.
169 static uint8_t letter(char C
) {
170 auto Pos
= Letters
.find(C
);
171 assert(Pos
!= std::string::npos
&&
172 "Invalid letter in Unicode character name");
177 // +================+============+======================+=============+========+===+==============+===============+
178 // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
179 // +================+============+======================+=============+========+===+==============+===============+
180 // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
181 // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
184 std::vector
<uint8_t> dumpIndex(const std::string
&Dict
) {
185 struct ChildrenOffset
{
191 // Keep track of the start of each node
192 // position in the serialized data.
193 std::unordered_map
<Node
*, int32_t> Offsets
;
195 // Keep track of where to write the index
196 // of the first children
197 std::vector
<ChildrenOffset
> ChildrenOffsets
;
198 std::unordered_map
<Node
*, bool> SiblingTracker
;
199 std::deque
<Node
*> AllNodes
;
200 std::vector
<uint8_t> Bytes
;
201 Bytes
.reserve(250'000);
202 // This leading byte is used by the reading code to detect the root node.
205 auto CollectChildren
= [&SiblingTracker
, &AllNodes
](const auto &Children
) {
206 for (std::size_t Index
= 0; Index
< Children
.size(); Index
++) {
207 const std::unique_ptr
<Node
> &Child
= Children
[Index
];
208 AllNodes
.push_back(Child
.get());
209 if (Index
!= Children
.size() - 1)
210 SiblingTracker
[Child
.get()] = true;
213 CollectChildren(Root
->Children
);
215 while (!AllNodes
.empty()) {
216 const std::size_t Offset
= Bytes
.size();
217 Node
*const N
= AllNodes
.front();
218 AllNodes
.pop_front();
220 assert(!N
->Name
.empty());
223 uint8_t FirstByte
= (!!N
->Value
) ? 0x80 : 0;
224 // Single letter node are indexed in 6 bits
225 if (N
->Name
.size() == 1) {
226 FirstByte
|= letter(N
->Name
[0]);
227 Bytes
.push_back(FirstByte
);
229 // Otherwise we use a 16 bits index
230 FirstByte
= FirstByte
| uint8_t(N
->Name
.size()) | 0x40;
231 Bytes
.push_back(FirstByte
);
232 auto PosInDict
= Dict
.find(N
->Name
);
233 assert(PosInDict
!= std::string::npos
);
234 uint8_t Low
= PosInDict
;
235 uint8_t High
= ((PosInDict
>> 8) & 0xFF);
236 Bytes
.push_back(High
);
237 Bytes
.push_back(Low
);
240 const bool HasSibling
= SiblingTracker
.count(N
) != 0;
241 const bool HasChildren
= N
->Children
.size() != 0;
244 uint32_t Value
= (*(N
->Value
) << 3);
245 uint8_t H
= ((Value
>> 16) & 0xFF);
246 uint8_t M
= ((Value
>> 8) & 0xFF);
247 uint8_t L
= (Value
& 0xFF) | uint8_t(HasSibling
? 0x01 : 0) |
248 uint8_t(HasChildren
? 0x02 : 0);
255 ChildrenOffsets
.push_back(
256 ChildrenOffset
{N
->Children
[0].get(), Bytes
.size(), true});
257 // index of the first children
258 Bytes
.push_back(0x00);
259 Bytes
.push_back(0x00);
260 Bytes
.push_back(0x00);
263 // When there is no value (that's most intermediate nodes)
264 // Dispense of the 3 values bytes, and only store
265 // 1 byte to track whether the node has sibling and children
266 // + 2 bytes for the index of the first children if necessary.
267 // That index also uses bytes 0-6 of the previous byte.
269 uint8_t(HasSibling
? 0x80 : 0) | uint8_t(HasChildren
? 0x40 : 0);
270 Bytes
.push_back(Byte
);
272 ChildrenOffsets
.emplace_back(
273 ChildrenOffset
{N
->Children
[0].get(), Bytes
.size() - 1, false});
274 Bytes
.push_back(0x00);
275 Bytes
.push_back(0x00);
278 CollectChildren(N
->Children
);
281 // Once all the nodes are in the inndex
282 // Fill the bytes we left to indicate the position
284 for (const ChildrenOffset
&Parent
: ChildrenOffsets
) {
285 const auto It
= Offsets
.find(Parent
.FirstChild
);
286 assert(It
!= Offsets
.end());
287 std::size_t Pos
= It
->second
;
288 if (Parent
.HasValue
) {
289 Bytes
[Parent
.Offset
] = ((Pos
>> 16) & 0xFF);
291 Bytes
[Parent
.Offset
] =
292 Bytes
[Parent
.Offset
] | uint8_t((Pos
>> 16) & 0xFF);
294 Bytes
[Parent
.Offset
+ 1] = ((Pos
>> 8) & 0xFF);
295 Bytes
[Parent
.Offset
+ 2] = Pos
& 0xFF;
298 // Add some padding so that the deserialization code
299 // doesn't try to read past the enf of the array.
311 void collectKeys(Node
*N
, std::set
<std::string
> &Keys
) {
312 Keys
.insert(N
->Name
);
313 for (const std::unique_ptr
<Node
> &Child
: N
->Children
) {
314 collectKeys(Child
.get(), Keys
);
318 // Merge sequences of 1-character nodes
319 // This greatly reduce the total number of nodes,
320 // and therefore the size of the index.
321 // When the tree gets serialized, we only have 5 bytes to store the
322 // size of a name. Overlong names (>32 characters) are therefore
323 // kep into separate nodes
324 void compact(Node
*N
) {
325 for (auto &&Child
: N
->Children
) {
326 compact(Child
.get());
328 if (N
->Parent
&& N
->Parent
->Children
.size() == 1 && !N
->Parent
->Value
&&
329 (N
->Parent
->Name
.size() + N
->Name
.size() <= 32)) {
330 N
->Parent
->Value
= N
->Value
;
331 N
->Parent
->Name
+= N
->Name
;
332 N
->Parent
->Children
= std::move(N
->Children
);
333 for (std::unique_ptr
<Node
> &c
: N
->Parent
->Children
) {
334 c
->Parent
= N
->Parent
;
339 Node(std::string Name
, Node
*Parent
= nullptr)
340 : Name(Name
), Parent(Parent
) {}
342 std::vector
<std::unique_ptr
<Node
>> Children
;
344 Node
*Parent
= nullptr;
345 std::optional
<char32_t
> Value
;
348 std::unique_ptr
<Node
> Root
= std::make_unique
<Node
>("");
351 extern const char *UnicodeLicense
;
353 int main(int argc
, char **argv
) {
354 printf("Unicode name -> codepoint mapping generator\n"
355 "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
357 printf("NameAliases.txt can be found at "
358 "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
359 "UnicodeData.txt can be found at "
360 "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
365 FILE *Out
= fopen(argv
[3], "w");
367 printf("Error creating output file.\n");
372 uint32_t NameCount
= 0;
373 std::size_t LongestName
= 0;
374 auto Entries
= loadDataFiles(argv
[1], argv
[2]);
375 for (const std::pair
<const char32_t
, std::string
> &Entry
: Entries
) {
376 char32_t Codepoint
= Entry
.first
;
377 const std::string
&Name
= Entry
.second
;
378 // Ignore names which are not valid.
380 !llvm::all_of(Name
, [](char C
) { return Letters
.contains(C
); })) {
383 printf("%06x: %s\n", static_cast<unsigned int>(Codepoint
), Name
.c_str());
384 T
.insert(Name
, Codepoint
);
386 std::max(LongestName
, std::size_t(llvm::count_if(Name
, llvm::isAlnum
)));
391 std::pair
<std::string
, std::vector
<uint8_t>> Data
= T
.serialize();
392 const std::string
&Dict
= Data
.first
;
393 const std::vector
<uint8_t> &Tree
= Data
.second
;
396 //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
397 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
398 // See https://llvm.org/LICENSE.txt for license information.
399 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
401 //===----------------------------------------------------------------------===//
403 // This file implements mapping the name of a unicode code point to its value.
405 // This file was generated using %s.
406 // Do not edit manually.
408 //===----------------------------------------------------------------------===//
413 #include "llvm
/Support
/Compiler
.h
"
417 argv
[0], UnicodeLicense
);
420 "namespace llvm { namespace sys { namespace unicode { \n"
421 "extern const char *UnicodeNameToCodepointDict;\n"
422 "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
423 "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
424 "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
426 fprintf(Out
, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
429 fprintf(Out
, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
432 for (auto Byte
: Tree
) {
433 fprintf(Out
, "0x%02x,", Byte
);
437 fprintf(Out
, "const uint8_t* UnicodeNameToCodepointIndex = "
438 "UnicodeNameToCodepointIndex_; \n");
439 fprintf(Out
, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
442 "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
444 fprintf(Out
, "\n}}}\n");
446 printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
447 argv
[3], NameCount
, Tree
.size() / 1024.0, Dict
.size() / 1024.0);
450 const char *UnicodeLicense
= R
"(
452 UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
454 See Terms of Use <https://www.unicode.org/copyright.html>
455 for definitions of Unicode Inc.’s Data Files and Software.
457 NOTICE TO USER: Carefully read the following legal agreement.
458 BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
459 DATA FILES ("DATA FILES
"), AND/OR SOFTWARE ("SOFTWARE
"),
460 YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
461 TERMS AND CONDITIONS OF THIS AGREEMENT.
462 IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
463 THE DATA FILES OR SOFTWARE.
465 COPYRIGHT AND PERMISSION NOTICE
467 Copyright © 1991-2022 Unicode, Inc. All rights reserved.
468 Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
470 Permission is hereby granted, free of charge, to any person obtaining
471 a copy of the Unicode data files and any associated documentation
472 (the "Data Files
") or Unicode software and any associated documentation
473 (the "Software
") to deal in the Data Files or Software
474 without restriction, including without limitation the rights to use,
475 copy, modify, merge, publish, distribute, and/or sell copies of
476 the Data Files or Software, and to permit persons to whom the Data Files
477 or Software are furnished to do so, provided that either
478 (a) this copyright and permission notice appear with all copies
479 of the Data Files or Software, or
480 (b) this copyright and permission notice appear in associated
483 THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS
", WITHOUT WARRANTY OF
484 ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
485 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
486 NONINFRINGEMENT OF THIRD PARTY RIGHTS.
487 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
488 NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
489 DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
490 DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
491 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
492 PERFORMANCE OF THE DATA FILES OR SOFTWARE.
494 Except as contained in this notice, the name of a copyright holder
495 shall not be used in advertising or otherwise to promote the sale,
496 use or other dealings in these Data Files or Software without prior
497 written authorization of the copyright holder.