1 //===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang-pseudo/Bracket.h"
10 #include "clang-pseudo/DirectiveTree.h"
11 #include "clang-pseudo/Forest.h"
12 #include "clang-pseudo/GLR.h"
13 #include "clang-pseudo/Language.h"
14 #include "clang-pseudo/Token.h"
15 #include "clang-pseudo/cli/CLI.h"
16 #include "clang-pseudo/grammar/Grammar.h"
17 #include "clang-pseudo/grammar/LRGraph.h"
18 #include "clang-pseudo/grammar/LRTable.h"
19 #include "clang/Basic/LangOptions.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/STLFunctionalExtras.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/FormatVariadic.h"
24 #include "llvm/Support/MemoryBuffer.h"
25 #include "llvm/Support/Signals.h"
27 using clang::pseudo::ForestNode
;
28 using clang::pseudo::Token
;
29 using clang::pseudo::TokenStream
;
// Command-line flags accepted by this tool. Each flag object is constructed
// from its flag name and a help description; main() consults them after
// calling llvm::cl::ParseCommandLineOptions.
// NOTE(review): `opt`, `desc` and `init` are used unqualified; presumably they
// are the llvm::cl names brought in by using-declarations that are not visible
// in this view - confirm.

// Flags that dump the grammar and the tables derived from it.
34 static opt
<bool> PrintGrammar("print-grammar", desc("Print the grammar"));
35 static opt
<bool> PrintGraph("print-graph",
36 desc("Print the LR graph for the grammar"));
37 static opt
<bool> PrintTable("print-table",
38 desc("Print the LR table for the grammar"));
// Path of the input file. main() only lexes and parses when this flag
// occurs on the command line (it checks getNumOccurrences()).
39 static opt
<std::string
> Source("source", desc("Source file"));
// Flags that print the token stream and directive structure of the input.
40 static opt
<bool> PrintSource("print-source", desc("Print token stream"));
41 static opt
<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
// NOTE(review): the declaration specifiers of the next two flags are not
// visible in this view; main() tests both in boolean conditions.
43 PrintDirectiveTree("print-directive-tree",
44 desc("Print directive structure of source code"));
46 StripDirectives("strip-directives",
47 desc("Strip directives and select conditional sections"));
// Flags that control what is reported about the parse result.
48 static opt
<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
49 static opt
<bool> PrintForest("print-forest", desc("Print parse forest"));
// Passed as the `Abbreviated` argument of dumpRecursive() in main().
// NOTE(review): the end of this declaration is not visible in this view.
50 static opt
<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
// Output path for the HTML rendering; main() writes it only when the flag
// occurs on the command line.
52 static opt
<std::string
> HTMLForest("html-forest",
53 desc("output file for HTML forest"));
// Name of the nonterminal to start parsing from; the initial value is
// "translation-unit".
54 static opt
<std::string
> StartSymbol("start-symbol",
55 desc("Specify the start symbol to parse"),
56 init("translation-unit"));
// Reads the file at Path with llvm::MemoryBuffer::getFile and returns the
// buffer contents as a std::string.
// If getFile reports an error, a diagnostic naming the path and the error
// message is written to llvm::errs().
// NOTE(review): the statements that follow the diagnostic are not visible in
// this view; the function name suggests the process terminates there - confirm.
58 static std::string
readOrDie(llvm::StringRef Path
) {
59 llvm::ErrorOr
<std::unique_ptr
<llvm::MemoryBuffer
>> Text
=
60 llvm::MemoryBuffer::getFile(Path
);
// A non-zero error_code converts to true, so this branch is the failure path.
61 if (std::error_code EC
= Text
.getError()) {
62 llvm::errs() << "Error: can't read file '" << Path
63 << "': " << EC
.message() << "\n";
// Copy the text out of the buffer into an owning std::string.
66 return Text
.get()->getBuffer().str();
// Forward declaration only; no body is provided in this file.
// main() calls it with the --html-forest output stream, the language's
// grammar, the root node returned by the parse, and the parsed token stream.
71 // Defined in HTMLForest.cpp
72 void writeHTMLForest(llvm::raw_ostream
&OS
, const Grammar
&,
73 const ForestNode
&Root
, const TokenStream
&);
// Per-symbol entries as (symbol, count) pairs. The constructor fills this from
// its local map and leaves it sorted; main() prints one line per entry.
// NOTE(review): the enclosing type's header and its other members (main()
// also reads a `Total` member) are not visible in this view.
78 std::vector
<std::pair
<SymbolID
, unsigned>> BySymbol
;
// Builds the statistics by walking Root.descendants().
//   Root:   node whose descendants are visited.
//   Filter: predicate over forest nodes supplied by the caller.
// NOTE(review): the loop body is not visible in this view; presumably it
// counts, per symbol, the nodes accepted by Filter - confirm.
80 NodeStats(const ForestNode
&Root
,
81 llvm::function_ref
<bool(const ForestNode
&)> Filter
) {
// Accumulates per-symbol values before they are copied into BySymbol.
82 llvm::DenseMap
<SymbolID
, unsigned> Map
;
83 for (const ForestNode
&N
: Root
.descendants())
// Copy the map entries into the vector so that they can be sorted.
88 BySymbol
= {Map
.begin(), Map
.end()};
89 // Sort by count descending, then symbol ascending.
// The operands are swapped across the two ties: R.second is compared against
// L.second (larger counts first) while L.first is compared against R.first
// (smaller symbols first when counts are equal).
90 llvm::sort(BySymbol
, [](const auto &L
, const auto &R
) {
91 return std::tie(R
.second
, L
.first
) < std::tie(L
.second
, R
.first
);
// Tool entry point. Parses the command-line flags, optionally lexes and
// preprocesses the --source file, dumps grammar/table information, runs the
// GLR parser over the prepared token stream and prints the requested reports.
// NOTE(review): several lines of this function (guarding conditions, closing
// braces and the end of the function) are not visible in this view; comments
// below describe only the statements that are shown.
100 int main(int argc
, char *argv
[]) {
101 llvm::cl::ParseCommandLineOptions(argc
, argv
, "");
102 llvm::sys::PrintStackTraceOnErrorSignal(argv
[0]);
104 clang::LangOptions LangOpts
= clang::pseudo::genericLangOpts();
// SourceText owns the file contents that are passed to lex() below.
105 std::string SourceText
;
// These streams are only assigned inside the --source branch below.
106 llvm::Optional
<clang::pseudo::TokenStream
> RawStream
;
// NOTE(review): PreprocessedStream is not referenced again in the visible
// lines; it may be unused - confirm.
107 llvm::Optional
<TokenStream
> PreprocessedStream
;
108 llvm::Optional
<clang::pseudo::TokenStream
> ParseableStream
;
// Input is processed only when --source occurs on the command line.
109 if (Source
.getNumOccurrences()) {
110 SourceText
= readOrDie(Source
);
111 RawStream
= clang::pseudo::lex(SourceText
, LangOpts
);
// Stream points at whichever token stream the later steps should consume;
// it starts at the raw lexer output.
112 TokenStream
*Stream
= RawStream
.getPointer();
// Build the directive tree from the raw stream, then let
// chooseConditionalBranches() process it together with that stream.
114 auto DirectiveStructure
= clang::pseudo::DirectiveTree::parse(*RawStream
);
115 clang::pseudo::chooseConditionalBranches(DirectiveStructure
, *RawStream
);
// Holds the stripped stream so that Stream can point at it.
117 llvm::Optional
<TokenStream
> Preprocessed
;
// With --strip-directives, later steps use the stream returned by
// stripDirectives() instead of the raw one.
118 if (StripDirectives
) {
119 Preprocessed
= DirectiveStructure
.stripDirectives(*Stream
);
120 Stream
= Preprocessed
.getPointer();
// NOTE(review): the conditions guarding the next two prints are not visible
// in this view; presumably --print-source and --print-tokens - confirm.
124 Stream
->print(llvm::outs());
126 llvm::outs() << *Stream
;
127 if (PrintDirectiveTree
)
128 llvm::outs() << DirectiveStructure
;
// Prepare the stream handed to the parser: cook() the tokens, strip the
// comments, then run pairBrackets() on the result.
130 ParseableStream
= clang::pseudo::stripComments(cook(*Stream
, LangOpts
));
131 pairBrackets(*ParseableStream
);
// Lang provides the grammar (Lang.G) and the LR table (Lang.Table) used below.
134 const auto &Lang
= clang::pseudo::getLanguageFromFlags();
// NOTE(review): the conditions guarding the following dumps are not visible
// in this view; presumably the --print-grammar/--print-graph/--print-table/
// --print-statistics flags - confirm.
136 llvm::outs() << Lang
.G
.dump();
138 llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang
.G
).dumpForTests(
142 llvm::outs() << Lang
.Table
.dumpForTests(Lang
.G
);
144 llvm::outs() << Lang
.Table
.dumpStatistics();
// Parse only when a parseable stream was produced above.
146 if (ParseableStream
) {
// Arena and GSS are handed to glrParse() through ParseParams and queried
// for the --print-statistics report afterwards.
147 clang::pseudo::ForestArena Arena
;
148 clang::pseudo::GSS GSS
;
// Look up the --start-symbol name among the grammar's nonterminals.
149 llvm::Optional
<clang::pseudo::SymbolID
> StartSymID
=
150 Lang
.G
.findNonterminal(StartSymbol
);
// NOTE(review): the condition guarding this diagnostic is not visible in this
// view. The message says "doesn't exit"; "doesn't exist" was probably
// intended - confirm before changing the emitted text.
152 llvm::errs() << llvm::formatv(
153 "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol
);
// NOTE(review): the left-hand side and the remaining arguments of this call
// are not visible in this view; later lines use its result as `Root`.
157 glrParse(clang::pseudo::ParseParams
{*ParseableStream
, Arena
, GSS
},
// NOTE(review): the guard for this dump is not visible; presumably
// --print-forest - confirm.
160 llvm::outs() << Root
.dumpRecursive(Lang
.G
, /*Abbreviated=*/ForestAbbrev
);
// Write the HTML rendering only when --html-forest occurs on the command line.
162 if (HTMLForest
.getNumOccurrences()) {
// NOTE(review): the declaration of EC and the check that guards the error
// message are not visible in this view.
164 llvm::raw_fd_ostream
HTMLOut(HTMLForest
, EC
);
166 llvm::errs() << "Couldn't write " << HTMLForest
<< ": " << EC
.message()
170 clang::pseudo::writeHTMLForest(HTMLOut
, Lang
.G
, Root
, *ParseableStream
);
// Statistics report: memory and node counts first, then per-kind breakdowns.
173 if (PrintStatistics
) {
174 llvm::outs() << "Forest bytes: " << Arena
.bytes()
175 << " nodes: " << Arena
.nodeCount() << "\n";
176 llvm::outs() << "GSS bytes: " << GSS
.bytes()
177 << " nodes: " << GSS
.nodesCreated() << "\n";
// One NodeStats pass per node kind of interest; P.first is the label that is
// printed and P.second is the kind the filter matches against.
179 for (auto &P
: {std::make_pair("Ambiguous", ForestNode::Ambiguous
),
180 std::make_pair("Opaque", ForestNode::Opaque
)}) {
181 clang::pseudo::NodeStats
Stats(
182 Root
, [&](const auto &N
) { return N
.kind() == P
.second
; });
183 llvm::outs() << "\n" << Stats
.Total
<< " " << P
.first
<< " nodes:\n";
// One line per BySymbol entry: the entry's count, then the symbol's name.
184 for (const auto &S
: Stats
.BySymbol
)
185 llvm::outs() << llvm::formatv(" {0,3} {1}\n", S
.second
,
186 Lang
.G
.symbolName(S
.first
));
189 // Metrics for how imprecise parsing was.
190 // These are rough but aim to be:
191 // - linear: if we eliminate half the errors the metric should halve
192 // - length-independent
193 unsigned UnparsedTokens
= 0; // Tokens covered by Opaque. (not unique)
194 unsigned Misparses
= 0; // Sum of alternatives-1
// Nodes already handed to DFS; a child or alternative is only descended into
// when its insertion here succeeds.
195 llvm::DenseSet
<const ForestNode
*> Visited
;
// Recursive walk that accumulates the two metrics above. `End` is the token
// index supplied by the caller as the end of N; for an Opaque node the span
// End - startTokenIndex() is added to UnparsedTokens. The lambda receives
// itself as its last parameter so that it can recurse.
196 auto DFS
= [&](const ForestNode
&N
, Token::Index End
, auto &DFS
) -> void {
197 if (N
.kind() == ForestNode::Opaque
) {
198 UnparsedTokens
+= End
- N
.startTokenIndex();
199 } else if (N
.kind() == ForestNode::Ambiguous
) {
// Every alternative beyond the first is counted as one misparse.
200 Misparses
+= N
.alternatives().size() - 1;
201 for (const auto *C
: N
.alternatives())
// NOTE(review): the statement executed for a newly inserted alternative is
// not visible in this view; presumably a recursive DFS call - confirm.
202 if (Visited
.insert(C
).second
)
204 } else if (N
.kind() == ForestNode::Sequence
) {
205 for (unsigned I
= 0, E
= N
.children().size(); I
< E
; ++I
)
206 if (Visited
.insert(N
.children()[I
]).second
)
// A child that is not the last one ends where its next sibling starts.
// NOTE(review): the value used for the last child is on a line that is not
// visible in this view.
207 DFS(*N
.children()[I
],
208 I
+ 1 == N
.children().size()
210 : N
.children()[I
+ 1]->startTokenIndex(),
// Both metrics are normalised by the number of tokens in the parsed stream.
// NOTE(review): Len is not checked for zero in the visible lines before it is
// used as a divisor - confirm that an empty token stream cannot reach here.
214 unsigned Len
= ParseableStream
->tokens().size();
216 llvm::outs() << "\n";
217 llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n",
218 double(Misparses
) / Len
);
219 llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
220 100.0 * UnparsedTokens
/ Len
);