1 //===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/ADT/STLExtras.h"
10 #include "llvm/ADT/StringExtras.h"
11 #include "llvm/Support/ConvertUTF.h"
12 #include "llvm/Support/MemoryBuffer.h"
13 #include "llvm/Support/raw_ostream.h"
17 int main(int argc
, char *argv
[]) {
18 auto ErrorOrBuffer
= MemoryBuffer::getFile(argv
[1], true);
21 std::unique_ptr
<MemoryBuffer
> Buffer
= std::move(ErrorOrBuffer
.get());
22 StringRef Content
= Buffer
->getBuffer();
23 Content
= Content
.drop_until([](char c
) { return c
== '#'; });
24 SmallVector
<StringRef
> Lines
;
25 SplitString(Content
, Lines
, "\r\n");
27 std::vector
<std::pair
<llvm::UTF32
, SmallVector
<llvm::UTF32
>>> Entries
;
28 SmallVector
<StringRef
> Values
;
29 for (StringRef Line
: Lines
) {
30 if (Line
.starts_with("#"))
34 Line
.split(Values
, ';');
35 if (Values
.size() < 2) {
36 errs() << "Failed to parse: " << Line
<< "\n";
40 llvm::StringRef From
= Values
[0].trim();
41 llvm::UTF32 CodePoint
= 0;
42 From
.getAsInteger(16, CodePoint
);
44 SmallVector
<llvm::UTF32
> To
;
45 SmallVector
<StringRef
> ToN
;
46 Values
[1].split(ToN
, ' ', -1, false);
47 for (StringRef To_
: ToN
) {
48 llvm::UTF32 ToCodePoint
= 0;
49 To_
.trim().getAsInteger(16, ToCodePoint
);
50 To
.push_back(ToCodePoint
);
55 Entries
.emplace_back(CodePoint
, To
);
59 unsigned LargestValue
=
60 std::max_element(Entries
.begin(), Entries
.end(),
61 [](const auto &Entry0
, const auto &Entry1
) {
62 return Entry0
.second
.size() < Entry1
.second
.size();
67 llvm::raw_fd_ostream
os(argv
[2], ec
);
69 // FIXME: If memory consumption and/or lookup time becomes a constraint, it
70 // may be worth using a more elaborate data structure.
71 os
<< "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
73 "ConfusableEntries[] = {\n";
74 for (const auto &Values
: Entries
) {
78 for (auto CP
: Values
.second
)