[docs] Add LICENSE.txt to the root of the mono-repo
[llvm-project.git] / clang-tools-extra / pseudo / tool / ClangPseudo.cpp
blob294098a3a5c14a7f35cae7ba273eb26cac4009de
1 //===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "clang-pseudo/Bracket.h"
10 #include "clang-pseudo/DirectiveTree.h"
11 #include "clang-pseudo/Forest.h"
12 #include "clang-pseudo/GLR.h"
13 #include "clang-pseudo/Language.h"
14 #include "clang-pseudo/Token.h"
15 #include "clang-pseudo/cli/CLI.h"
16 #include "clang-pseudo/grammar/Grammar.h"
17 #include "clang-pseudo/grammar/LRGraph.h"
18 #include "clang-pseudo/grammar/LRTable.h"
19 #include "clang/Basic/LangOptions.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/STLFunctionalExtras.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/FormatVariadic.h"
24 #include "llvm/Support/MemoryBuffer.h"
25 #include "llvm/Support/Signals.h"
27 using clang::pseudo::ForestNode;
28 using clang::pseudo::Token;
29 using clang::pseudo::TokenStream;
30 using llvm::cl::desc;
31 using llvm::cl::init;
32 using llvm::cl::opt;
34 static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar"));
35 static opt<bool> PrintGraph("print-graph",
36 desc("Print the LR graph for the grammar"));
37 static opt<bool> PrintTable("print-table",
38 desc("Print the LR table for the grammar"));
39 static opt<std::string> Source("source", desc("Source file"));
40 static opt<bool> PrintSource("print-source", desc("Print token stream"));
41 static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
42 static opt<bool>
43 PrintDirectiveTree("print-directive-tree",
44 desc("Print directive structure of source code"));
45 static opt<bool>
46 StripDirectives("strip-directives",
47 desc("Strip directives and select conditional sections"));
48 static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
49 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
50 static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
51 init(true));
52 static opt<std::string> HTMLForest("html-forest",
53 desc("output file for HTML forest"));
54 static opt<std::string> StartSymbol("start-symbol",
55 desc("Specify the start symbol to parse"),
56 init("translation-unit"));
58 static std::string readOrDie(llvm::StringRef Path) {
59 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
60 llvm::MemoryBuffer::getFile(Path);
61 if (std::error_code EC = Text.getError()) {
62 llvm::errs() << "Error: can't read file '" << Path
63 << "': " << EC.message() << "\n";
64 ::exit(1);
66 return Text.get()->getBuffer().str();
69 namespace clang {
70 namespace pseudo {
71 // Defined in HTMLForest.cpp
72 void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
73 const ForestNode &Root, const TokenStream &);
74 namespace {
76 struct NodeStats {
77 unsigned Total = 0;
78 std::vector<std::pair<SymbolID, unsigned>> BySymbol;
80 NodeStats(const ForestNode &Root,
81 llvm::function_ref<bool(const ForestNode &)> Filter) {
82 llvm::DenseMap<SymbolID, unsigned> Map;
83 for (const ForestNode &N : Root.descendants())
84 if (Filter(N)) {
85 ++Total;
86 ++Map[N.symbol()];
88 BySymbol = {Map.begin(), Map.end()};
89 // Sort by count descending, then symbol ascending.
90 llvm::sort(BySymbol, [](const auto &L, const auto &R) {
91 return std::tie(R.second, L.first) < std::tie(L.second, R.first);
92 });
96 } // namespace
97 } // namespace pseudo
98 } // namespace clang
100 int main(int argc, char *argv[]) {
101 llvm::cl::ParseCommandLineOptions(argc, argv, "");
102 llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
104 clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
105 std::string SourceText;
106 llvm::Optional<clang::pseudo::TokenStream> RawStream;
107 llvm::Optional<TokenStream> PreprocessedStream;
108 llvm::Optional<clang::pseudo::TokenStream> ParseableStream;
109 if (Source.getNumOccurrences()) {
110 SourceText = readOrDie(Source);
111 RawStream = clang::pseudo::lex(SourceText, LangOpts);
112 TokenStream *Stream = RawStream.getPointer();
114 auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream);
115 clang::pseudo::chooseConditionalBranches(DirectiveStructure, *RawStream);
117 llvm::Optional<TokenStream> Preprocessed;
118 if (StripDirectives) {
119 Preprocessed = DirectiveStructure.stripDirectives(*Stream);
120 Stream = Preprocessed.getPointer();
123 if (PrintSource)
124 Stream->print(llvm::outs());
125 if (PrintTokens)
126 llvm::outs() << *Stream;
127 if (PrintDirectiveTree)
128 llvm::outs() << DirectiveStructure;
130 ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
131 pairBrackets(*ParseableStream);
134 const auto &Lang = clang::pseudo::getLanguageFromFlags();
135 if (PrintGrammar)
136 llvm::outs() << Lang.G.dump();
137 if (PrintGraph)
138 llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests(
139 Lang.G);
141 if (PrintTable)
142 llvm::outs() << Lang.Table.dumpForTests(Lang.G);
143 if (PrintStatistics)
144 llvm::outs() << Lang.Table.dumpStatistics();
146 if (ParseableStream) {
147 clang::pseudo::ForestArena Arena;
148 clang::pseudo::GSS GSS;
149 llvm::Optional<clang::pseudo::SymbolID> StartSymID =
150 Lang.G.findNonterminal(StartSymbol);
151 if (!StartSymID) {
152 llvm::errs() << llvm::formatv(
153 "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol);
154 return 2;
156 auto &Root =
157 glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
158 *StartSymID, Lang);
159 if (PrintForest)
160 llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
162 if (HTMLForest.getNumOccurrences()) {
163 std::error_code EC;
164 llvm::raw_fd_ostream HTMLOut(HTMLForest, EC);
165 if (EC) {
166 llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message()
167 << "\n";
168 return 2;
170 clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, *ParseableStream);
173 if (PrintStatistics) {
174 llvm::outs() << "Forest bytes: " << Arena.bytes()
175 << " nodes: " << Arena.nodeCount() << "\n";
176 llvm::outs() << "GSS bytes: " << GSS.bytes()
177 << " nodes: " << GSS.nodesCreated() << "\n";
179 for (auto &P : {std::make_pair("Ambiguous", ForestNode::Ambiguous),
180 std::make_pair("Opaque", ForestNode::Opaque)}) {
181 clang::pseudo::NodeStats Stats(
182 Root, [&](const auto &N) { return N.kind() == P.second; });
183 llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n";
184 for (const auto &S : Stats.BySymbol)
185 llvm::outs() << llvm::formatv(" {0,3} {1}\n", S.second,
186 Lang.G.symbolName(S.first));
189 // Metrics for how imprecise parsing was.
190 // These are rough but aim to be:
191 // - linear: if we eliminate half the errors the metric should halve
192 // - length-independent
193 unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique)
194 unsigned Misparses = 0; // Sum of alternatives-1
195 llvm::DenseSet<const ForestNode *> Visited;
196 auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void {
197 if (N.kind() == ForestNode::Opaque) {
198 UnparsedTokens += End - N.startTokenIndex();
199 } else if (N.kind() == ForestNode::Ambiguous) {
200 Misparses += N.alternatives().size() - 1;
201 for (const auto *C : N.alternatives())
202 if (Visited.insert(C).second)
203 DFS(*C, End, DFS);
204 } else if (N.kind() == ForestNode::Sequence) {
205 for (unsigned I = 0, E = N.children().size(); I < E; ++I)
206 if (Visited.insert(N.children()[I]).second)
207 DFS(*N.children()[I],
208 I + 1 == N.children().size()
209 ? End
210 : N.children()[I + 1]->startTokenIndex(),
211 DFS);
214 unsigned Len = ParseableStream->tokens().size();
215 DFS(Root, Len, DFS);
216 llvm::outs() << "\n";
217 llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n",
218 double(Misparses) / Len);
219 llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
220 100.0 * UnparsedTokens / Len);
224 return 0;