1 //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 //===----------------------------------------------------------------------===//
10 // This file implements functions to map the name or alias of a unicode
11 // character to its codepoint.
13 //===----------------------------------------------------------------------===//
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Support/Unicode.h"
24 extern const char *UnicodeNameToCodepointDict
;
25 extern const uint8_t *UnicodeNameToCodepointIndex
;
26 extern const std::size_t UnicodeNameToCodepointIndexSize
;
27 extern const std::size_t UnicodeNameToCodepointLargestNameSize
;
29 using BufferType
= SmallString
<64>;
33 char32_t Value
= 0xFFFFFFFF;
34 uint32_t ChildrenOffset
= 0;
35 bool HasSibling
= false;
38 const Node
*Parent
= nullptr;
40 constexpr bool isValid() const {
41 return !Name
.empty() || Value
== 0xFFFFFFFF;
43 constexpr bool hasChildren() const { return ChildrenOffset
!= 0 || IsRoot
; }
45 std::string
fullName() const {
47 // Reserve enough space for most unicode code points.
48 // The chosen value represent the 99th percentile of name size as of
53 std::reverse_copy(N
->Name
.begin(), N
->Name
.end(), std::back_inserter(S
));
56 std::reverse(S
.begin(), S
.end());
61 static Node
createRoot() {
69 static Node
readNode(uint32_t Offset
, const Node
*Parent
= nullptr) {
73 uint32_t Origin
= Offset
;
76 uint8_t NameInfo
= UnicodeNameToCodepointIndex
[Offset
++];
77 if (Offset
+ 6 >= UnicodeNameToCodepointIndexSize
)
80 bool LongName
= NameInfo
& 0x40;
81 bool HasValue
= NameInfo
& 0x80;
82 std::size_t Size
= NameInfo
& ~0xC0;
84 uint32_t NameOffset
= (UnicodeNameToCodepointIndex
[Offset
++] << 8);
85 NameOffset
|= UnicodeNameToCodepointIndex
[Offset
++];
86 N
.Name
= StringRef(UnicodeNameToCodepointDict
+ NameOffset
, Size
);
88 N
.Name
= StringRef(UnicodeNameToCodepointDict
+ Size
, 1);
91 uint8_t H
= UnicodeNameToCodepointIndex
[Offset
++];
92 uint8_t M
= UnicodeNameToCodepointIndex
[Offset
++];
93 uint8_t L
= UnicodeNameToCodepointIndex
[Offset
++];
94 N
.Value
= ((H
<< 16) | (M
<< 8) | L
) >> 3;
96 bool HasChildren
= L
& 0x02;
97 N
.HasSibling
= L
& 0x01;
100 N
.ChildrenOffset
= UnicodeNameToCodepointIndex
[Offset
++] << 16;
101 N
.ChildrenOffset
|= UnicodeNameToCodepointIndex
[Offset
++] << 8;
102 N
.ChildrenOffset
|= UnicodeNameToCodepointIndex
[Offset
++];
105 uint8_t H
= UnicodeNameToCodepointIndex
[Offset
++];
106 N
.HasSibling
= H
& 0x80;
107 bool HasChildren
= H
& 0x40;
110 N
.ChildrenOffset
= (H
<< 16);
112 (uint32_t(UnicodeNameToCodepointIndex
[Offset
++]) << 8);
113 N
.ChildrenOffset
|= UnicodeNameToCodepointIndex
[Offset
++];
116 N
.Size
= Offset
- Origin
;
120 static bool startsWith(StringRef Name
, StringRef Needle
, bool Strict
,
121 std::size_t &Consummed
, char &PreviousCharInName
,
122 bool IsPrefix
= false) {
126 if (!Name
.startswith(Needle
))
128 Consummed
= Needle
.size();
134 auto NamePos
= Name
.begin();
135 auto NeedlePos
= Needle
.begin();
137 char PreviousCharInNameOrigin
= PreviousCharInName
;
138 char PreviousCharInNeedle
= *Needle
.begin();
139 auto IgnoreSpaces
= [](auto It
, auto End
, char &PreviousChar
,
140 bool IsPrefix
= false) {
142 const auto Next
= std::next(It
);
143 // Ignore spaces, underscore, medial hyphens
144 // The generator ensures a needle never ends (or starts) by a medial
145 // hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
147 *It
== ' ' || *It
== '_' ||
148 (*It
== '-' && isAlnum(PreviousChar
) &&
149 ((Next
!= End
&& isAlnum(*Next
)) || (Next
== End
&& IsPrefix
)));
159 NamePos
= IgnoreSpaces(NamePos
, Name
.end(), PreviousCharInName
);
161 IgnoreSpaces(NeedlePos
, Needle
.end(), PreviousCharInNeedle
, IsPrefix
);
162 if (NeedlePos
== Needle
.end())
164 if (NamePos
== Name
.end())
166 if (toUpper(*NeedlePos
) != toUpper(*NamePos
))
171 Consummed
= std::distance(Name
.begin(), NamePos
);
172 if (NeedlePos
!= Needle
.end()) {
173 PreviousCharInName
= PreviousCharInNameOrigin
;
175 return NeedlePos
== Needle
.end();
178 static std::tuple
<Node
, bool, uint32_t>
179 compareNode(uint32_t Offset
, StringRef Name
, bool Strict
,
180 char PreviousCharInName
, BufferType
&Buffer
,
181 const Node
*Parent
= nullptr) {
182 Node N
= readNode(Offset
, Parent
);
183 std::size_t Consummed
= 0;
184 bool DoesStartWith
= N
.IsRoot
|| startsWith(Name
, N
.Name
, Strict
, Consummed
,
187 return std::make_tuple(N
, false, 0);
189 if (Name
.size() - Consummed
== 0 && N
.Value
!= 0xFFFFFFFF)
190 return std::make_tuple(N
, true, N
.Value
);
192 if (N
.hasChildren()) {
193 uint32_t ChildOffset
= N
.ChildrenOffset
;
198 std::tie(C
, Matches
, Value
) =
199 compareNode(ChildOffset
, Name
.substr(Consummed
), Strict
,
200 PreviousCharInName
, Buffer
, &N
);
202 std::reverse_copy(C
.Name
.begin(), C
.Name
.end(),
203 std::back_inserter(Buffer
));
204 return std::make_tuple(N
, true, Value
);
206 ChildOffset
+= C
.Size
;
211 return std::make_tuple(N
, false, 0);
214 static std::tuple
<Node
, bool, uint32_t>
215 compareNode(uint32_t Offset
, StringRef Name
, bool Strict
, BufferType
&Buffer
) {
216 return compareNode(Offset
, Name
, Strict
, 0, Buffer
);
220 constexpr const char *const HangulSyllables
[][3] = {
224 { "D", "YAE", "GS" },
225 { "DD", "EO", "N", },
227 { "M", "YEO", "NH" },
231 { "SS", "WAE", "LM" },
235 { "C", "WEO", "LP" },
253 // 3.12 Conjoining Jamo Behavior Common constants
254 constexpr const char32_t SBase
= 0xAC00;
255 constexpr const uint32_t LCount
= 19;
256 constexpr const uint32_t VCount
= 21;
257 constexpr const uint32_t TCount
= 28;
259 static std::size_t findSyllable(StringRef Name
, bool Strict
,
260 char &PreviousInName
, int &Pos
, int Column
) {
261 assert(Column
== 0 || Column
== 1 || Column
== 2);
262 static std::size_t CountPerColumn
[] = {LCount
, VCount
, TCount
};
264 int Prev
= PreviousInName
;
265 for (std::size_t I
= 0; I
< CountPerColumn
[Column
]; I
++) {
266 StringRef
Syllable(HangulSyllables
[I
][Column
]);
267 if (int(Syllable
.size()) <= Len
)
269 std::size_t Consummed
= 0;
270 char PreviousInNameCopy
= PreviousInName
;
272 startsWith(Name
, Syllable
, Strict
, Consummed
, PreviousInNameCopy
);
277 Prev
= PreviousInNameCopy
;
281 PreviousInName
= Prev
;
285 static std::optional
<char32_t
>
286 nameToHangulCodePoint(StringRef Name
, bool Strict
, BufferType
&Buffer
) {
288 // Hangul Syllable Decomposition
289 std::size_t Consummed
= 0;
292 startsWith(Name
, "HANGUL SYLLABLE ", Strict
, Consummed
, NameStart
);
295 Name
= Name
.substr(Consummed
);
296 int L
= -1, V
= -1, T
= -1;
297 Name
= Name
.substr(findSyllable(Name
, Strict
, NameStart
, L
, 0));
298 Name
= Name
.substr(findSyllable(Name
, Strict
, NameStart
, V
, 1));
299 Name
= Name
.substr(findSyllable(Name
, Strict
, NameStart
, T
, 2));
300 if (L
!= -1 && V
!= -1 && T
!= -1 && Name
.empty()) {
302 Buffer
.append("HANGUL SYLLABLE ");
304 Buffer
.append(HangulSyllables
[L
][0]);
306 Buffer
.append(HangulSyllables
[V
][1]);
308 Buffer
.append(HangulSyllables
[T
][2]);
310 return SBase
+ (std::uint32_t(L
) * VCount
+ std::uint32_t(V
)) * TCount
+
313 // Otherwise, it's an illegal syllable name.
317 struct GeneratedNamesData
{
323 // Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings
324 static const GeneratedNamesData GeneratedNamesDataTable
[] = {
325 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
326 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
327 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
328 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
329 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
330 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
331 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
332 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
333 {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
334 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
335 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
336 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
337 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
338 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
339 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
340 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
343 static std::optional
<char32_t
>
344 nameToGeneratedCodePoint(StringRef Name
, bool Strict
, BufferType
&Buffer
) {
345 for (auto &&Item
: GeneratedNamesDataTable
) {
347 std::size_t Consummed
= 0;
349 bool DoesStartWith
= startsWith(Name
, Item
.Prefix
, Strict
, Consummed
,
350 NameStart
, /*IsPrefix=*/true);
353 auto Number
= Name
.substr(Consummed
);
354 unsigned long long V
= 0;
355 // Be consistent about mandating upper casing.
357 llvm::any_of(Number
, [](char C
) { return C
>= 'a' && C
<= 'f'; }))
359 if (getAsUnsignedInteger(Number
, 16, V
) || V
< Item
.Start
|| V
> Item
.End
)
362 Buffer
.append(Item
.Prefix
);
363 Buffer
.append(utohexstr(V
, true));
370 static std::optional
<char32_t
> nameToCodepoint(StringRef Name
, bool Strict
,
371 BufferType
&Buffer
) {
375 std::optional
<char32_t
> Res
= nameToHangulCodePoint(Name
, Strict
, Buffer
);
377 Res
= nameToGeneratedCodePoint(Name
, Strict
, Buffer
);
385 std::tie(Node
, Matches
, Value
) = compareNode(0, Name
, Strict
, Buffer
);
387 std::reverse(Buffer
.begin(), Buffer
.end());
388 // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
389 // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
390 if (!Strict
&& Value
== 0x116c &&
391 Name
.find_insensitive("O-E") != StringRef::npos
) {
392 Buffer
= "HANGUL JUNGSEONG O-E";
400 std::optional
<char32_t
> nameToCodepointStrict(StringRef Name
) {
403 auto Opt
= nameToCodepoint(Name
, true, Buffer
);
407 std::optional
<LooseMatchingResult
>
408 nameToCodepointLooseMatching(StringRef Name
) {
410 auto Opt
= nameToCodepoint(Name
, false, Buffer
);
413 return LooseMatchingResult
{*Opt
, Buffer
};
416 // Find the unicode character whose editing distance to Pattern
417 // is shortest, using the Wagner–Fischer algorithm.
418 llvm::SmallVector
<MatchForCodepointName
>
419 nearestMatchesForCodepointName(StringRef Pattern
, std::size_t MaxMatchesCount
) {
420 // We maintain a fixed size vector of matches,
421 // sorted by distance
422 // The worst match (with the biggest distance) are discarded when new elements
424 std::size_t LargestEditDistance
= 0;
425 llvm::SmallVector
<MatchForCodepointName
> Matches
;
426 Matches
.reserve(MaxMatchesCount
+ 1);
428 auto Insert
= [&](const Node
&Node
, uint32_t Distance
,
429 char32_t Value
) -> bool {
430 if (Distance
> LargestEditDistance
) {
431 if (Matches
.size() == MaxMatchesCount
)
433 LargestEditDistance
= Distance
;
435 // To avoid allocations, the creation of the name is delayed
436 // as much as possible.
440 Name
= Node
.fullName();
444 auto It
= llvm::lower_bound(
446 [&](const MatchForCodepointName
&a
, std::size_t Distance
) {
447 if (Distance
== a
.Distance
)
448 return a
.Name
< GetName();
449 return a
.Distance
< Distance
;
451 if (It
== Matches
.end() && Matches
.size() == MaxMatchesCount
)
454 MatchForCodepointName M
{GetName(), Distance
, Value
};
455 Matches
.insert(It
, std::move(M
));
456 if (Matches
.size() > MaxMatchesCount
)
461 // We ignore case, space, hyphens, etc,
462 // in both the search pattern and the prospective names.
463 auto Normalize
= [](StringRef Name
) {
465 Out
.reserve(Name
.size());
466 for (char C
: Name
) {
468 Out
.push_back(toUpper(C
));
472 std::string NormalizedName
= Normalize(Pattern
);
474 // Allocate a matrix big enough for longest names.
475 const std::size_t Columns
=
476 std::min(NormalizedName
.size(), UnicodeNameToCodepointLargestNameSize
) +
479 LLVM_ATTRIBUTE_UNUSED
static std::size_t Rows
=
480 UnicodeNameToCodepointLargestNameSize
+ 1;
482 std::vector
<char> Distances(
483 Columns
* (UnicodeNameToCodepointLargestNameSize
+ 1), 0);
485 auto Get
= [&Distances
, Columns
](size_t Column
, std::size_t Row
) -> char & {
486 assert(Column
< Columns
);
488 return Distances
[Row
* Columns
+ Column
];
491 for (std::size_t I
= 0; I
< Columns
; I
++)
494 // Visit the childrens,
495 // Filling (and overriding) the matrix for the name fragment of each node
496 // iteratively. CompleteName is used to collect the actual name of potential
497 // match, respecting case and spacing.
498 auto VisitNode
= [&](const Node
&N
, std::size_t Row
,
499 auto &VisitNode
) -> void {
501 for (; J
< N
.Name
.size(); J
++) {
502 if (!isAlnum(N
.Name
[J
]))
507 for (std::size_t I
= 1; I
< Columns
; I
++) {
508 const int Delete
= Get(I
- 1, Row
) + 1;
509 const int Insert
= Get(I
, Row
- 1) + 1;
512 Get(I
- 1, Row
- 1) + (NormalizedName
[I
- 1] != N
.Name
[J
] ? 1 : 0);
514 Get(I
, Row
) = std::min(Insert
, std::min(Delete
, Replace
));
520 unsigned Cost
= Get(Columns
- 1, Row
- 1);
521 if (N
.Value
!= 0xFFFFFFFF) {
522 Insert(N
, Cost
, N
.Value
);
525 if (N
.hasChildren()) {
526 auto ChildOffset
= N
.ChildrenOffset
;
528 Node C
= readNode(ChildOffset
, &N
);
529 ChildOffset
+= C
.Size
;
532 VisitNode(C
, Row
, VisitNode
);
539 Node Root
= createRoot();
540 VisitNode(Root
, 1, VisitNode
);
544 } // namespace unicode