1 //===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang-pseudo/Bracket.h"
10 #include "clang-pseudo/DirectiveTree.h"
11 #include "clang-pseudo/Disambiguate.h"
12 #include "clang-pseudo/Forest.h"
13 #include "clang-pseudo/GLR.h"
14 #include "clang-pseudo/Language.h"
15 #include "clang-pseudo/Token.h"
16 #include "clang-pseudo/cli/CLI.h"
17 #include "clang-pseudo/grammar/Grammar.h"
18 #include "clang-pseudo/grammar/LRGraph.h"
19 #include "clang-pseudo/grammar/LRTable.h"
20 #include "clang/Basic/LangOptions.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/ADT/STLFunctionalExtras.h"
23 #include "llvm/Support/CommandLine.h"
24 #include "llvm/Support/FormatVariadic.h"
25 #include "llvm/Support/MemoryBuffer.h"
26 #include "llvm/Support/Signals.h"
29 using clang::pseudo::ForestNode
;
30 using clang::pseudo::Token
;
31 using clang::pseudo::TokenStream
;
// Command-line flags for the tool. Each flag is declared with its option name
// and the help text passed to desc().
//
// Grammar inspection: dump the grammar, its LR graph, or its LR table.
36 static opt
<bool> PrintGrammar("print-grammar", desc("Print the grammar"));
37 static opt
<bool> PrintGraph("print-graph",
38 desc("Print the LR graph for the grammar"));
39 static opt
<bool> PrintTable("print-table",
40 desc("Print the LR table for the grammar"));
// Input file. main() only reads and lexes a file when this flag was given on
// the command line (it checks getNumOccurrences()).
41 static opt
<std::string
> Source("source", desc("Source file"));
// Token-level and directive-level views of the source file.
42 static opt
<bool> PrintSource("print-source", desc("Print token stream"));
43 static opt
<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
45 PrintDirectiveTree("print-directive-tree",
46 desc("Print directive structure of source code"));
48 StripDirectives("strip-directives",
49 desc("Strip directives and select conditional sections"));
// Parser output: disambiguation, statistics, and forest printing.
50 static opt
<bool> Disambiguate("disambiguate",
51 desc("Choose best tree from parse forest"));
52 static opt
<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
53 static opt
<bool> PrintForest("print-forest", desc("Print parse forest"));
54 static opt
<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
// Path for the HTML forest. main() writes it only when this flag was given on
// the command line (it checks getNumOccurrences()).
56 static opt
<std::string
> HTMLForest("html-forest",
57 desc("output file for HTML forest"));
// Name of the nonterminal to parse from; main() looks it up in the grammar
// with findNonterminal(). Defaults to "translation-unit".
58 static opt
<std::string
> StartSymbol("start-symbol",
59 desc("Specify the start symbol to parse"),
60 init("translation-unit"));
// Returns the contents of the file at Path as a std::string.
// If the file cannot be read, writes "Error: can't read file '<Path>': "
// followed by the error_code's message to llvm::errs().
// NOTE(review): going by the name, the failure path is expected to terminate
// the process rather than fall through to the return -- confirm.
62 static std::string
readOrDie(llvm::StringRef Path
) {
63 llvm::ErrorOr
<std::unique_ptr
<llvm::MemoryBuffer
>> Text
=
64 llvm::MemoryBuffer::getFile(Path
);
// getFile() yields an ErrorOr; a non-zero error_code means the read failed.
65 if (std::error_code EC
= Text
.getError()) {
66 llvm::errs() << "Error: can't read file '" << Path
67 << "': " << EC
.message() << "\n";
// Copy the buffer's bytes into an owned string; the MemoryBuffer itself is
// not handed back to the caller.
70 return Text
.get()->getBuffer().str();
75 // Defined in HTMLForest.cpp
// Forward declaration only. main() calls this with the HTML output stream,
// the language's grammar, the forest root, and the Disambiguation it computed.
76 void writeHTMLForest(llvm::raw_ostream
&OS
, const Grammar
&,
77 const ForestNode
&Root
, const Disambiguation
&,
// Per-symbol counts as (symbol, count) pairs, in the order produced by the
// sort at the end of the constructor.
83 std::vector
<std::pair
<SymbolID
, unsigned>> BySymbol
;
// Walks every descendant of Root and fills BySymbol with per-symbol counts.
// Filter is a predicate over forest nodes; main() passes one that matches a
// single node kind.
// NOTE(review): presumably only nodes accepted by Filter are counted --
// confirm against the loop body.
85 NodeStats(const ForestNode
&Root
,
86 llvm::function_ref
<bool(const ForestNode
&)> Filter
) {
// Accumulate into a map first, then flatten it into a vector so the entries
// can be sorted.
87 llvm::DenseMap
<SymbolID
, unsigned> Map
;
88 for (const ForestNode
&N
: Root
.descendants())
93 BySymbol
= {Map
.begin(), Map
.end()};
94 // Sort by count descending, then symbol ascending.
95 llvm::sort(BySymbol
, [](const auto &L
, const auto &R
) {
// L and R are swapped for .second only: that reverses the count comparison
// (descending) while the symbol comparison stays ascending.
96 return std::tie(R
.second
, L
.first
) < std::tie(L
.second
, R
.first
);
102 } // namespace pseudo
105 int main(int argc
, char *argv
[]) {
106 llvm::cl::ParseCommandLineOptions(argc
, argv
, "");
107 llvm::sys::PrintStackTraceOnErrorSignal(argv
[0]);
109 clang::LangOptions LangOpts
= clang::pseudo::genericLangOpts();
110 std::string SourceText
;
111 std::optional
<clang::pseudo::TokenStream
> RawStream
;
112 std::optional
<TokenStream
> PreprocessedStream
;
113 std::optional
<clang::pseudo::TokenStream
> ParseableStream
;
114 if (Source
.getNumOccurrences()) {
115 SourceText
= readOrDie(Source
);
116 RawStream
= clang::pseudo::lex(SourceText
, LangOpts
);
117 TokenStream
*Stream
= &*RawStream
;
119 auto DirectiveStructure
= clang::pseudo::DirectiveTree::parse(*RawStream
);
120 clang::pseudo::chooseConditionalBranches(DirectiveStructure
, *RawStream
);
122 std::optional
<TokenStream
> Preprocessed
;
123 if (StripDirectives
) {
124 Preprocessed
= DirectiveStructure
.stripDirectives(*Stream
);
125 Stream
= &*Preprocessed
;
129 Stream
->print(llvm::outs());
131 llvm::outs() << *Stream
;
132 if (PrintDirectiveTree
)
133 llvm::outs() << DirectiveStructure
;
135 ParseableStream
= clang::pseudo::stripComments(cook(*Stream
, LangOpts
));
136 pairBrackets(*ParseableStream
);
139 const auto &Lang
= clang::pseudo::getLanguageFromFlags();
141 llvm::outs() << Lang
.G
.dump();
143 llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang
.G
).dumpForTests(
147 llvm::outs() << Lang
.Table
.dumpForTests(Lang
.G
);
149 llvm::outs() << Lang
.Table
.dumpStatistics();
151 if (ParseableStream
) {
152 clang::pseudo::ForestArena Arena
;
153 clang::pseudo::GSS GSS
;
154 std::optional
<clang::pseudo::SymbolID
> StartSymID
=
155 Lang
.G
.findNonterminal(StartSymbol
);
157 llvm::errs() << llvm::formatv(
158 "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol
);
162 glrParse(clang::pseudo::ParseParams
{*ParseableStream
, Arena
, GSS
},
164 // If we're disambiguating, we'll print at the end instead.
165 if (PrintForest
&& !Disambiguate
)
166 llvm::outs() << Root
.dumpRecursive(Lang
.G
, /*Abbreviated=*/ForestAbbrev
);
167 clang::pseudo::Disambiguation Disambig
;
169 Disambig
= clang::pseudo::disambiguate(&Root
, {});
171 if (HTMLForest
.getNumOccurrences()) {
173 llvm::raw_fd_ostream
HTMLOut(HTMLForest
, EC
);
175 llvm::errs() << "Couldn't write " << HTMLForest
<< ": " << EC
.message()
179 clang::pseudo::writeHTMLForest(HTMLOut
, Lang
.G
, Root
, Disambig
,
183 if (PrintStatistics
) {
184 llvm::outs() << "Forest bytes: " << Arena
.bytes()
185 << " nodes: " << Arena
.nodeCount() << "\n";
186 llvm::outs() << "GSS bytes: " << GSS
.bytes()
187 << " nodes: " << GSS
.nodesCreated() << "\n";
189 for (auto &P
: {std::make_pair("Ambiguous", ForestNode::Ambiguous
),
190 std::make_pair("Opaque", ForestNode::Opaque
)}) {
191 clang::pseudo::NodeStats
Stats(
192 Root
, [&](const auto &N
) { return N
.kind() == P
.second
; });
193 llvm::outs() << "\n" << Stats
.Total
<< " " << P
.first
<< " nodes:\n";
194 for (const auto &S
: Stats
.BySymbol
)
195 llvm::outs() << llvm::formatv(" {0,3} {1}\n", S
.second
,
196 Lang
.G
.symbolName(S
.first
));
199 // Metrics for how imprecise parsing was.
200 // These are rough but aim to be:
201 // - linear: if we eliminate half the errors the metric should halve
202 // - length-independent
203 unsigned UnparsedTokens
= 0; // Tokens covered by Opaque. (not unique)
204 unsigned Misparses
= 0; // Sum of alternatives-1
205 llvm::DenseSet
<const ForestNode
*> Visited
;
206 auto DFS
= [&](const ForestNode
&N
, Token::Index End
, auto &DFS
) -> void {
207 if (N
.kind() == ForestNode::Opaque
) {
208 UnparsedTokens
+= End
- N
.startTokenIndex();
209 } else if (N
.kind() == ForestNode::Ambiguous
) {
210 Misparses
+= N
.alternatives().size() - 1;
211 for (const auto *C
: N
.alternatives())
212 if (Visited
.insert(C
).second
)
214 } else if (N
.kind() == ForestNode::Sequence
) {
215 for (unsigned I
= 0, E
= N
.children().size(); I
< E
; ++I
)
216 if (Visited
.insert(N
.children()[I
]).second
)
217 DFS(*N
.children()[I
],
218 I
+ 1 == N
.children().size()
220 : N
.children()[I
+ 1]->startTokenIndex(),
224 unsigned Len
= ParseableStream
->tokens().size();
226 llvm::outs() << "\n";
227 llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n",
228 double(Misparses
) / Len
);
229 llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
230 100.0 * UnparsedTokens
/ Len
);
233 if (Disambiguate
&& PrintForest
) {
234 ForestNode
*DisambigRoot
= &Root
;
235 removeAmbiguities(DisambigRoot
, Disambig
);
236 llvm::outs() << "Disambiguated tree:\n";
237 llvm::outs() << DisambigRoot
->dumpRecursive(Lang
.G
,
238 /*Abbreviated=*/ForestAbbrev
);