1 //===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang-pseudo/GLR.h"
10 #include "clang-pseudo/Bracket.h"
11 #include "clang-pseudo/Language.h"
12 #include "clang-pseudo/Token.h"
13 #include "clang-pseudo/grammar/Grammar.h"
14 #include "clang/Basic/LangOptions.h"
15 #include "clang/Basic/TokenKinds.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/Support/FormatVariadic.h"
18 #include "gmock/gmock.h"
19 #include "gtest/gtest.h"
25 llvm::raw_ostream
&operator<<(llvm::raw_ostream
&OS
,
26 const std::vector
<const GSS::Node
*> &Heads
) {
27 for (const auto *Head
: Heads
)
34 using StateID
= LRTable::StateID
;
36 using testing::ElementsAre
;
37 using testing::IsEmpty
;
38 using testing::UnorderedElementsAre
;
40 MATCHER_P(state
, StateID
, "") { return arg
->State
== StateID
; }
41 MATCHER_P(parsedSymbol
, FNode
, "") { return arg
->Payload
== FNode
; }
42 MATCHER_P(parsedSymbolID
, SID
, "") { return arg
->Payload
->symbol() == SID
; }
43 MATCHER_P(start
, Start
, "") { return arg
->Payload
->startTokenIndex() == Start
; }
45 testing::Matcher
<const GSS::Node
*>
46 parents(llvm::ArrayRef
<const GSS::Node
*> Parents
) {
47 return testing::Property(&GSS::Node::parents
,
48 testing::UnorderedElementsAreArray(Parents
));
51 Token::Index
recoverBraces(Token::Index Begin
, const TokenStream
&Code
) {
53 const Token
&Left
= Code
.tokens()[Begin
- 1];
54 EXPECT_EQ(Left
.Kind
, tok::l_brace
);
55 if (const auto* Right
= Left
.pair()) {
56 EXPECT_EQ(Right
->Kind
, tok::r_brace
);
57 return Code
.index(*Right
);
59 return Token::Invalid
;
62 class GLRTest
: public ::testing::Test
{
64 void build(llvm::StringRef GrammarBNF
) {
65 std::vector
<std::string
> Diags
;
66 TestLang
.G
= Grammar::parseBNF(GrammarBNF
, Diags
);
69 TokenStream
emptyTokenStream() {
75 void buildGrammar(std::vector
<std::string
> Nonterminals
,
76 std::vector
<std::string
> Rules
) {
77 Nonterminals
.push_back("_");
78 llvm::sort(Nonterminals
);
79 Nonterminals
.erase(std::unique(Nonterminals
.begin(), Nonterminals
.end()),
81 std::string FakeTestBNF
;
82 for (const auto &NT
: Nonterminals
)
83 FakeTestBNF
+= llvm::formatv("{0} := {1}\n", "_", NT
);
84 FakeTestBNF
+= llvm::join(Rules
, "\n");
88 SymbolID
id(llvm::StringRef Name
) const {
89 for (unsigned I
= 0; I
< NumTerminals
; ++I
)
90 if (TestLang
.G
.table().Terminals
[I
] == Name
)
91 return tokenSymbol(static_cast<tok::TokenKind
>(I
));
92 for (SymbolID ID
= 0; ID
< TestLang
.G
.table().Nonterminals
.size(); ++ID
)
93 if (TestLang
.G
.table().Nonterminals
[ID
].Name
== Name
)
95 ADD_FAILURE() << "No such symbol found: " << Name
;
98 ExtensionID
extensionID(llvm::StringRef AttrValueName
) const {
99 for (ExtensionID EID
= 0; EID
< TestLang
.G
.table().AttributeValues
.size();
101 if (TestLang
.G
.table().AttributeValues
[EID
] == AttrValueName
)
103 ADD_FAILURE() << "No such attribute value found: " << AttrValueName
;
107 RuleID
ruleFor(llvm::StringRef NonterminalName
) const {
109 TestLang
.G
.table().Nonterminals
[id(NonterminalName
)].RuleRange
;
110 if (RuleRange
.End
- RuleRange
.Start
== 1)
111 return TestLang
.G
.table()
112 .Nonterminals
[id(NonterminalName
)]
114 ADD_FAILURE() << "Expected a single rule for " << NonterminalName
115 << ", but it has " << RuleRange
.End
- RuleRange
.Start
126 TEST_F(GLRTest
, ShiftMergingHeads
) {
127 // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
128 // 2 have shift actions to reach state 4, and the head 3 has a shift action to
133 // After the shift action, the GSS (with new heads 4, 5) is:
138 GSStack
.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
139 auto *GSSNode1
= GSStack
.addNode(/*State=*/1, /*ForestNode=*/nullptr,
140 /*Parents=*/{GSSNode0
});
141 auto *GSSNode2
= GSStack
.addNode(/*State=*/2, /*ForestNode=*/nullptr,
142 /*Parents=*/{GSSNode0
});
143 auto *GSSNode3
= GSStack
.addNode(/*State=*/3, /*ForestNode=*/nullptr,
144 /*Parents=*/{GSSNode0
});
146 buildGrammar({}, {}); // Create a fake empty grammar.
147 LRTable::Builder
B(TestLang
.G
);
148 B
.Transition
[{StateID
{1}, tokenSymbol(tok::semi
)}] = StateID
{4};
149 B
.Transition
[{StateID
{2}, tokenSymbol(tok::semi
)}] = StateID
{4};
150 B
.Transition
[{StateID
{3}, tokenSymbol(tok::semi
)}] = StateID
{5};
151 TestLang
.Table
= std::move(B
).build();
153 ForestNode
&SemiTerminal
= Arena
.createTerminal(tok::semi
, 0);
154 std::vector
<const GSS::Node
*> NewHeads
;
155 glrShift({GSSNode1
, GSSNode2
, GSSNode3
}, SemiTerminal
,
156 {emptyTokenStream(), Arena
, GSStack
}, TestLang
, NewHeads
);
158 EXPECT_THAT(NewHeads
,
159 UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal
),
160 parents({GSSNode1
, GSSNode2
})),
161 AllOf(state(5), parsedSymbol(&SemiTerminal
),
162 parents({GSSNode3
}))))
166 TEST_F(GLRTest
, ReduceConflictsSplitting
) {
167 // Before (splitting due to R/R conflict):
169 // After reducing 1 by `class-name := IDENTIFIER` and
170 // `enum-name := IDENTIFIER`:
171 // 0--2(class-name) // 2 is goto(0, class-name)
172 // └--3(enum-name) // 3 is goto(0, enum-name)
173 buildGrammar({"class-name", "enum-name"},
174 {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
175 LRTable::Builder
B(TestLang
.G
);
176 B
.Transition
[{StateID
{0}, id("class-name")}] = StateID
{2};
177 B
.Transition
[{StateID
{0}, id("enum-name")}] = StateID
{3};
178 B
.Reduce
[StateID
{1}].insert(ruleFor("class-name"));
179 B
.Reduce
[StateID
{1}].insert(ruleFor("enum-name"));
180 TestLang
.Table
= std::move(B
).build();
182 const auto *GSSNode0
=
183 GSStack
.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
184 const auto *GSSNode1
=
185 GSStack
.addNode(1, &Arena
.createTerminal(tok::identifier
, 0), {GSSNode0
});
187 std::vector
<const GSS::Node
*> Heads
= {GSSNode1
};
188 glrReduce(Heads
, tokenSymbol(tok::eof
),
189 {emptyTokenStream(), Arena
, GSStack
}, TestLang
);
190 EXPECT_THAT(Heads
, UnorderedElementsAre(
192 AllOf(state(2), parsedSymbolID(id("class-name")),
193 parents({GSSNode0
})),
194 AllOf(state(3), parsedSymbolID(id("enum-name")),
195 parents({GSSNode0
}))))
199 TEST_F(GLRTest
, ReduceSplittingDueToMultipleBases
) {
200 // Before (splitting due to multiple bases):
201 // 2(class-name)--4(*)
203 // After reducing 4 by `ptr-operator := *`:
204 // 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
205 // 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
206 buildGrammar({"ptr-operator", "class-name", "enum-name"},
207 {"ptr-operator := *"});
209 auto *ClassNameNode
= &Arena
.createOpaque(id("class-name"), /*TokenIndex=*/0);
210 auto *EnumNameNode
= &Arena
.createOpaque(id("enum-name"), /*TokenIndex=*/0);
212 const auto *GSSNode2
=
213 GSStack
.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode
, /*Parents=*/{});
214 const auto *GSSNode3
=
215 GSStack
.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode
, /*Parents=*/{});
216 const auto *GSSNode4
= GSStack
.addNode(
217 /*State=*/4, &Arena
.createTerminal(tok::star
, /*TokenIndex=*/1),
218 /*Parents=*/{GSSNode2
, GSSNode3
});
220 LRTable::Builder
B(TestLang
.G
);
221 B
.Transition
[{StateID
{2}, id("ptr-operator")}] = StateID
{5};
222 B
.Transition
[{StateID
{3}, id("ptr-operator")}] = StateID
{6};
223 B
.Reduce
[StateID
{4}].insert(ruleFor("ptr-operator"));
224 TestLang
.Table
= std::move(B
).build();
226 std::vector
<const GSS::Node
*> Heads
= {GSSNode4
};
227 glrReduce(Heads
, tokenSymbol(tok::eof
), {emptyTokenStream(), Arena
, GSStack
},
230 EXPECT_THAT(Heads
, UnorderedElementsAre(
232 AllOf(state(5), parsedSymbolID(id("ptr-operator")),
233 parents({GSSNode2
})),
234 AllOf(state(6), parsedSymbolID(id("ptr-operator")),
235 parents({GSSNode3
}))))
237 // Verify that the payload of the two new heads is shared, only a single
238 // ptr-operator node is created in the forest.
239 EXPECT_EQ(Heads
[1]->Payload
, Heads
[2]->Payload
);
242 TEST_F(GLRTest
, ReduceJoiningWithMultipleBases
) {
243 // Before (joining due to same goto state, multiple bases):
244 // 0--1(cv-qualifier)--3(class-name)
245 // └--2(cv-qualifier)--4(enum-name)
246 // After reducing 3 by `type-name := class-name` and
247 // 4 by `type-name := enum-name`:
248 // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
249 // └--2(cv-qualifier)--┘ // goto(2, type-name)
250 buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
251 {"type-name := class-name", "type-name := enum-name"});
253 auto *CVQualifierNode
=
254 &Arena
.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
255 auto *ClassNameNode
= &Arena
.createOpaque(id("class-name"), /*TokenIndex=*/1);
256 auto *EnumNameNode
= &Arena
.createOpaque(id("enum-name"), /*TokenIndex=*/1);
258 const auto *GSSNode0
=
259 GSStack
.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
260 const auto *GSSNode1
= GSStack
.addNode(
261 /*State=*/1, /*ForestNode=*/CVQualifierNode
, /*Parents=*/{GSSNode0
});
262 const auto *GSSNode2
= GSStack
.addNode(
263 /*State=*/2, /*ForestNode=*/CVQualifierNode
, /*Parents=*/{GSSNode0
});
264 const auto *GSSNode3
= GSStack
.addNode(
265 /*State=*/3, /*ForestNode=*/ClassNameNode
,
266 /*Parents=*/{GSSNode1
});
267 const auto *GSSNode4
=
268 GSStack
.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode
,
269 /*Parents=*/{GSSNode2
});
271 // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
272 LRTable::Builder
B(TestLang
.G
);
273 B
.Transition
[{StateID
{1}, id("type-name")}] = StateID
{5};
274 B
.Transition
[{StateID
{2}, id("type-name")}] = StateID
{5};
275 B
.Reduce
[StateID
{3}].insert(/* type-name := class-name */ RuleID
{0});
276 B
.Reduce
[StateID
{4}].insert(/* type-name := enum-name */ RuleID
{1});
277 TestLang
.Table
= std::move(B
).build();
279 std::vector
<const GSS::Node
*> Heads
= {GSSNode3
, GSSNode4
};
280 glrReduce(Heads
, tokenSymbol(tok::eof
), {emptyTokenStream(), Arena
, GSStack
},
283 // Verify that the stack heads are joint at state 5 after reduces.
284 EXPECT_THAT(Heads
, UnorderedElementsAre(GSSNode3
, GSSNode4
,
286 parsedSymbolID(id("type-name")),
287 parents({GSSNode1
, GSSNode2
}))))
289 // Verify that we create an ambiguous ForestNode of two parses of `type-name`.
290 EXPECT_EQ(Heads
.back()->Payload
->dumpRecursive(TestLang
.G
),
291 "[ 1, end) type-name := <ambiguous>\n"
292 "[ 1, end) ├─type-name := class-name\n"
293 "[ 1, end) │ └─class-name := <opaque>\n"
294 "[ 1, end) └─type-name := enum-name\n"
295 "[ 1, end) └─enum-name := <opaque>\n");
298 TEST_F(GLRTest
, ReduceJoiningWithSameBase
) {
299 // Before (joining due to same goto state, the same base):
300 // 0--1(class-name)--3(*)
301 // └--2(enum-name)--4(*)
302 // After reducing 3 by `pointer := class-name *` and
303 // 2 by `pointer := enum-name *`:
304 // 0--5(pointer) // 5 is goto(0, pointer)
305 buildGrammar({"pointer", "class-name", "enum-name"},
306 {"pointer := class-name *", "pointer := enum-name *"});
308 auto *ClassNameNode
= &Arena
.createOpaque(id("class-name"), /*TokenIndex=*/0);
309 auto *EnumNameNode
= &Arena
.createOpaque(id("enum-name"), /*TokenIndex=*/0);
310 auto *StartTerminal
= &Arena
.createTerminal(tok::star
, /*TokenIndex=*/1);
312 const auto *GSSNode0
=
313 GSStack
.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
314 const auto *GSSNode1
=
315 GSStack
.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode
,
316 /*Parents=*/{GSSNode0
});
317 const auto *GSSNode2
=
318 GSStack
.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode
,
319 /*Parents=*/{GSSNode0
});
320 const auto *GSSNode3
=
321 GSStack
.addNode(/*State=*/3, /*ForestNode=*/StartTerminal
,
322 /*Parents=*/{GSSNode1
});
323 const auto *GSSNode4
=
324 GSStack
.addNode(/*State=*/4, /*ForestNode=*/StartTerminal
,
325 /*Parents=*/{GSSNode2
});
327 // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
328 LRTable::Builder
B(TestLang
.G
);
329 B
.Transition
[{StateID
{0}, id("pointer")}] = StateID
{5};
330 B
.Reduce
[StateID
{3}].insert(/* pointer := class-name */ RuleID
{0});
331 B
.Reduce
[StateID
{4}].insert(/* pointer := enum-name */ RuleID
{1});
332 TestLang
.Table
= std::move(B
).build();
334 std::vector
<const GSS::Node
*> Heads
= {GSSNode3
, GSSNode4
};
335 glrReduce(Heads
, tokenSymbol(tok::eof
),
336 {emptyTokenStream(), Arena
, GSStack
}, TestLang
);
339 Heads
, UnorderedElementsAre(GSSNode3
, GSSNode4
,
340 AllOf(state(5), parsedSymbolID(id("pointer")),
341 parents({GSSNode0
}))))
343 EXPECT_EQ(Heads
.back()->Payload
->dumpRecursive(TestLang
.G
),
344 "[ 0, end) pointer := <ambiguous>\n"
345 "[ 0, end) ├─pointer := class-name *\n"
346 "[ 0, 1) │ ├─class-name := <opaque>\n"
347 "[ 1, end) │ └─* := tok[1]\n"
348 "[ 0, end) └─pointer := enum-name *\n"
349 "[ 0, 1) ├─enum-name := <opaque>\n"
350 "[ 1, end) └─* := tok[1]\n");
353 TEST_F(GLRTest
, ReduceLookahead
) {
354 // A term can be followed by +, but not by -.
355 buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"});
356 LRTable::Builder
B(TestLang
.G
);
357 B
.Transition
[{StateID
{0}, id("term")}] = StateID
{2};
358 B
.Reduce
[StateID
{1}].insert(RuleID
{0});
359 TestLang
.Table
= std::move(B
).build();
361 auto *Identifier
= &Arena
.createTerminal(tok::identifier
, /*Start=*/0);
364 GSStack
.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
365 const auto *GSSNode1
=
366 GSStack
.addNode(/*State=*/1, /*ForestNode=*/Identifier
, {Root
});
368 // When the lookahead is +, reduce is performed.
369 std::vector
<const GSS::Node
*> Heads
= {GSSNode1
};
370 glrReduce(Heads
, tokenSymbol(tok::plus
), {emptyTokenStream(), Arena
, GSStack
},
373 ElementsAre(GSSNode1
, AllOf(state(2), parsedSymbolID(id("term")),
376 // When the lookahead is -, reduce is not performed.
378 glrReduce(Heads
, tokenSymbol(tok::minus
),
379 {emptyTokenStream(), Arena
, GSStack
}, TestLang
);
380 EXPECT_THAT(Heads
, ElementsAre(GSSNode1
));
383 TEST_F(GLRTest
, Recover
) {
384 // Recovery while parsing "word" inside braces.
387 // After recovering a `word` at state 1:
388 // 0--3(word) // 3 is goto(1, word)
389 buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"});
390 LRTable::Builder
B(TestLang
.G
);
391 B
.Transition
[{StateID
{1}, id("word")}] = StateID
{3};
392 B
.Recoveries
.push_back({StateID
{1}, {extensionID("Braces"), id("word")}});
393 TestLang
.Table
= std::move(B
).build();
394 TestLang
.RecoveryStrategies
.try_emplace(extensionID("Braces"), recoverBraces
);
396 auto *LBrace
= &Arena
.createTerminal(tok::l_brace
, 0);
397 auto *Question1
= &Arena
.createTerminal(tok::question
, 1);
398 const auto *Root
= GSStack
.addNode(0, nullptr, {});
399 const auto *OpenedBraces
= GSStack
.addNode(1, LBrace
, {Root
});
400 const auto *AfterQuestion1
= GSStack
.addNode(2, Question1
, {OpenedBraces
});
402 // Need a token stream with paired braces so the strategy works.
403 clang::LangOptions LOptions
;
404 TokenStream Tokens
= cook(lex("{ ? ? ? }", LOptions
), LOptions
);
405 pairBrackets(Tokens
);
406 std::vector
<const GSS::Node
*> NewHeads
;
408 unsigned TokenIndex
= 2;
409 glrRecover({AfterQuestion1
}, TokenIndex
, {Tokens
, Arena
, GSStack
}, TestLang
,
411 EXPECT_EQ(TokenIndex
, 4u) << "should skip ahead to matching brace";
412 EXPECT_THAT(NewHeads
, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
413 parents({OpenedBraces
}), start(1u))));
414 EXPECT_EQ(NewHeads
.front()->Payload
->kind(), ForestNode::Opaque
);
416 // Test recovery failure: omit closing brace so strategy fails
417 TokenStream NoRBrace
= cook(lex("{ ? ? ? ?", LOptions
), LOptions
);
418 pairBrackets(NoRBrace
);
421 glrRecover({AfterQuestion1
}, TokenIndex
, {NoRBrace
, Arena
, GSStack
}, TestLang
,
423 EXPECT_EQ(TokenIndex
, 2u) << "should not advance on failure";
424 EXPECT_THAT(NewHeads
, IsEmpty());
427 TEST_F(GLRTest
, RecoverRightmost
) {
428 // In a nested block structure, we recover at the innermost possible block.
430 // 0--1({)--1({)--1({)
431 // After recovering a `block` at inside the second braces:
432 // 0--1({)--2(body) // 2 is goto(1, body)
433 buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"});
434 LRTable::Builder
B(TestLang
.G
);
435 B
.Transition
[{StateID
{1}, id("body")}] = StateID
{2};
436 B
.Recoveries
.push_back({StateID
{1}, {extensionID("Braces"), id("body")}});
437 TestLang
.Table
= std::move(B
).build();
438 TestLang
.RecoveryStrategies
.try_emplace(extensionID("Braces"), recoverBraces
);
440 clang::LangOptions LOptions
;
441 // Innermost brace is unmatched, to test fallback to next brace.
442 TokenStream Tokens
= cook(lex("{ { { ? } }", LOptions
), LOptions
);
443 Tokens
.tokens()[0].Pair
= 5;
444 Tokens
.tokens()[1].Pair
= 4;
445 Tokens
.tokens()[4].Pair
= 1;
446 Tokens
.tokens()[5].Pair
= 0;
448 auto *Brace1
= &Arena
.createTerminal(tok::l_brace
, 0);
449 auto *Brace2
= &Arena
.createTerminal(tok::l_brace
, 1);
450 auto *Brace3
= &Arena
.createTerminal(tok::l_brace
, 2);
451 const auto *Root
= GSStack
.addNode(0, nullptr, {});
452 const auto *In1
= GSStack
.addNode(1, Brace1
, {Root
});
453 const auto *In2
= GSStack
.addNode(1, Brace2
, {In1
});
454 const auto *In3
= GSStack
.addNode(1, Brace3
, {In2
});
456 unsigned TokenIndex
= 3;
457 std::vector
<const GSS::Node
*> NewHeads
;
458 glrRecover({In3
}, TokenIndex
, {Tokens
, Arena
, GSStack
}, TestLang
, NewHeads
);
459 EXPECT_EQ(TokenIndex
, 5u);
460 EXPECT_THAT(NewHeads
, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
461 parents({In2
}), start(2u))));
464 TEST_F(GLRTest
, RecoverAlternatives
) {
465 // Recovery inside braces with multiple equally good options
468 // After recovering either `word` or `number` inside the braces:
469 // 0--1({)--2(word) // 2 is goto(1, word)
470 // └--3(number) // 3 is goto(1, number)
471 buildGrammar({"number", "word", "top"},
473 "top := { number [recover=Braces] }",
474 "top := { word [recover=Braces] }",
476 LRTable::Builder
B(TestLang
.G
);
477 B
.Transition
[{StateID
{1}, id("number")}] = StateID
{2};
478 B
.Transition
[{StateID
{1}, id("word")}] = StateID
{3};
479 B
.Recoveries
.push_back({StateID
{1}, {extensionID("Braces"), id("number")}});
480 B
.Recoveries
.push_back({StateID
{1}, {extensionID("Braces"), id("word")}});
481 TestLang
.RecoveryStrategies
.try_emplace(extensionID("Braces"), recoverBraces
);
482 TestLang
.Table
= std::move(B
).build();
483 auto *LBrace
= &Arena
.createTerminal(tok::l_brace
, 0);
484 const auto *Root
= GSStack
.addNode(0, nullptr, {});
485 const auto *OpenedBraces
= GSStack
.addNode(1, LBrace
, {Root
});
487 clang::LangOptions LOptions
;
488 TokenStream Tokens
= cook(lex("{ ? }", LOptions
), LOptions
);
489 pairBrackets(Tokens
);
490 std::vector
<const GSS::Node
*> NewHeads
;
491 unsigned TokenIndex
= 1;
493 glrRecover({OpenedBraces
}, TokenIndex
, {Tokens
, Arena
, GSStack
}, TestLang
,
495 EXPECT_EQ(TokenIndex
, 2u);
496 EXPECT_THAT(NewHeads
,
497 UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
498 parents({OpenedBraces
}), start(1u)),
499 AllOf(state(3), parsedSymbolID(id("word")),
500 parents({OpenedBraces
}), start(1u))));
503 TEST_F(GLRTest
, PerfectForestNodeSharing
) {
504 // Run the GLR on a simple grammar and test that we build exactly one forest
505 // node per (SymbolID, token range).
507 // This is a grmammar where the original parsing-stack-based forest node
508 // sharing approach will fail. In its LR0 graph, it has two states containing
509 // item `expr := • IDENTIFIER`, and both have different goto states on the
510 // nonterminal `expr`.
516 test := left-paren expr
520 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
521 clang::LangOptions LOptions
;
522 const TokenStream
&Tokens
= cook(lex("{ abc", LOptions
), LOptions
);
524 const ForestNode
&Parsed
=
525 glrParse({Tokens
, Arena
, GSStack
}, id("test"), TestLang
);
526 // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
527 // in the forest, see the `#1` and `=#1` in the dump string.
528 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
529 "[ 0, end) test := <ambiguous>\n"
530 "[ 0, end) ├─test := { expr\n"
531 "[ 0, 1) │ ├─{ := tok[0]\n"
532 "[ 1, end) │ └─expr := IDENTIFIER #1\n"
533 "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
534 "[ 0, end) ├─test := { IDENTIFIER\n"
535 "[ 0, 1) │ ├─{ := tok[0]\n"
536 "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
537 "[ 0, end) └─test := left-paren expr\n"
538 "[ 0, 1) ├─left-paren := {\n"
539 "[ 0, 1) │ └─{ := tok[0]\n"
540 "[ 1, end) └─expr =#1\n");
543 TEST_F(GLRTest
, GLRReduceOrder
) {
544 // Given the following grammar, and the input `IDENTIFIER`, reductions should
545 // be performed in the following order:
546 // 1. foo := IDENTIFIER
547 // 2. { test := IDENTIFIER, test := foo }
548 // foo should be reduced first, so that in step 2 we have completed reduces
549 // for test, and form an ambiguous forest node.
557 clang::LangOptions LOptions
;
558 const TokenStream
&Tokens
= cook(lex("IDENTIFIER", LOptions
), LOptions
);
559 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
561 const ForestNode
&Parsed
=
562 glrParse({Tokens
, Arena
, GSStack
}, id("test"), TestLang
);
563 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
564 "[ 0, end) test := <ambiguous>\n"
565 "[ 0, end) ├─test := IDENTIFIER\n"
566 "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
567 "[ 0, end) └─test := foo\n"
568 "[ 0, end) └─foo := IDENTIFIER\n"
569 "[ 0, end) └─IDENTIFIER := tok[0]\n");
572 TEST_F(GLRTest
, RecoveryEndToEnd
) {
573 // Simple example of brace-based recovery showing:
574 // - recovered region includes tokens both ahead of and behind the cursor
575 // - multiple possible recovery rules
576 // - recovery from outer scopes is rejected
580 block := { block [recover=Braces] }
581 block := { numbers [recover=Braces] }
582 numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
584 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
585 TestLang
.RecoveryStrategies
.try_emplace(extensionID("Braces"), recoverBraces
);
586 clang::LangOptions LOptions
;
587 TokenStream Tokens
= cook(lex("{ { 42 ? } }", LOptions
), LOptions
);
588 pairBrackets(Tokens
);
590 const ForestNode
&Parsed
=
591 glrParse({Tokens
, Arena
, GSStack
}, id("block"), TestLang
);
592 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
593 "[ 0, end) block := { block [recover=Braces] }\n"
594 "[ 0, 1) ├─{ := tok[0]\n"
595 "[ 1, 5) ├─block := <ambiguous>\n"
596 "[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
597 "[ 1, 2) │ │ ├─{ := tok[1]\n"
598 "[ 2, 4) │ │ ├─block := <opaque>\n"
599 "[ 4, 5) │ │ └─} := tok[4]\n"
600 "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
601 "[ 1, 2) │ ├─{ := tok[1]\n"
602 "[ 2, 4) │ ├─numbers := <opaque>\n"
603 "[ 4, 5) │ └─} := tok[4]\n"
604 "[ 5, end) └─} := tok[5]\n");
607 TEST_F(GLRTest
, RecoverTerminal
) {
611 stmt := IDENTIFIER ; [recover=Skip]
613 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
614 TestLang
.RecoveryStrategies
.try_emplace(
616 [](Token::Index Start
, const TokenStream
&) { return Start
; });
617 clang::LangOptions LOptions
;
618 TokenStream Tokens
= cook(lex("foo", LOptions
), LOptions
);
620 const ForestNode
&Parsed
=
621 glrParse({Tokens
, Arena
, GSStack
}, id("stmt"), TestLang
);
622 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
623 "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
624 "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
625 "[ 1, end) └─; := <opaque>\n");
628 TEST_F(GLRTest
, RecoverUnrestrictedReduce
) {
629 // Here, ! is not in any rule and therefore not in the follow set of `word`.
630 // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
636 sentence := word word [recover=AcceptAnyTokenInstead]
639 clang::LangOptions LOptions
;
640 const TokenStream
&Tokens
= cook(lex("id !", LOptions
), LOptions
);
641 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
642 TestLang
.RecoveryStrategies
.try_emplace(
643 extensionID("AcceptAnyTokenInstead"),
644 [](Token::Index Start
, const TokenStream
&Stream
) { return Start
+ 1; });
646 const ForestNode
&Parsed
=
647 glrParse({Tokens
, Arena
, GSStack
}, id("sentence"), TestLang
);
648 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
649 "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
650 "[ 0, 1) ├─word := IDENTIFIER\n"
651 "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
652 "[ 1, end) └─word := <opaque>\n");
655 TEST_F(GLRTest
, RecoveryFromStartOfInput
) {
657 _ := start [recover=Fallback] EOF
661 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
662 bool fallback_recovered
= false;
663 auto fallback
= [&](Token::Index Start
, const TokenStream
& Code
) {
664 fallback_recovered
= true;
665 return Code
.tokens().size();
667 TestLang
.RecoveryStrategies
.try_emplace(
668 extensionID("Fallback"),
670 clang::LangOptions LOptions
;
671 TokenStream Tokens
= cook(lex("?", LOptions
), LOptions
);
673 const ForestNode
&Parsed
=
674 glrParse({Tokens
, Arena
, GSStack
}, id("start"), TestLang
);
675 EXPECT_TRUE(fallback_recovered
);
676 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
677 "[ 0, end) start := <opaque>\n");
680 TEST_F(GLRTest
, RepeatedRecovery
) {
681 // We require multiple steps of recovery at eof and then a reduction in order
682 // to successfully parse.
685 # FIXME: this forces EOF to be in follow(signature).
686 # Remove it once we use unconstrained reduction for recovery.
689 function := signature body [recover=Skip]
690 signature := IDENTIFIER params [recover=Skip]
694 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
695 TestLang
.RecoveryStrategies
.try_emplace(
697 [](Token::Index Start
, const TokenStream
&) { return Start
; });
698 clang::LangOptions LOptions
;
699 TokenStream Tokens
= cook(lex("main", LOptions
), LOptions
);
701 const ForestNode
&Parsed
=
702 glrParse({Tokens
, Arena
, GSStack
}, id("function"), TestLang
);
703 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
704 "[ 0, end) function := signature body [recover=Skip]\n"
705 "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
706 "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
707 "[ 1, 1) │ └─params := <opaque>\n"
708 "[ 1, end) └─body := <opaque>\n");
711 TEST_F(GLRTest
, NoExplicitAccept
) {
715 test := IDENTIFIER test
718 clang::LangOptions LOptions
;
719 // Given the following input, and the grammar above, we perform two reductions
720 // of the nonterminal `test` when the next token is `eof`, verify that the
721 // parser stops at the right state.
722 const TokenStream
&Tokens
= cook(lex("id id", LOptions
), LOptions
);
723 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
725 const ForestNode
&Parsed
=
726 glrParse({Tokens
, Arena
, GSStack
}, id("test"), TestLang
);
727 EXPECT_EQ(Parsed
.dumpRecursive(TestLang
.G
),
728 "[ 0, end) test := IDENTIFIER test\n"
729 "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
730 "[ 1, end) └─test := IDENTIFIER\n"
731 "[ 1, end) └─IDENTIFIER := tok[1]\n");
734 TEST_F(GLRTest
, GuardExtension
) {
738 start := IDENTIFIER [guard]
740 TestLang
.Guards
.try_emplace(
741 ruleFor("start"), [&](const GuardParams
&P
) {
742 assert(P
.RHS
.size() == 1 &&
743 P
.RHS
.front()->symbol() ==
744 tokenSymbol(clang::tok::identifier
));
745 return P
.Tokens
.tokens()[P
.RHS
.front()->startTokenIndex()]
748 clang::LangOptions LOptions
;
749 TestLang
.Table
= LRTable::buildSLR(TestLang
.G
);
751 std::string Input
= "test";
752 const TokenStream
&Succeeded
= cook(lex(Input
, LOptions
), LOptions
);
753 EXPECT_EQ(glrParse({Succeeded
, Arena
, GSStack
}, id("start"), TestLang
)
754 .dumpRecursive(TestLang
.G
),
755 "[ 0, end) start := IDENTIFIER [guard]\n"
756 "[ 0, end) └─IDENTIFIER := tok[0]\n");
759 const TokenStream
&Failed
= cook(lex(Input
, LOptions
), LOptions
);
760 EXPECT_EQ(glrParse({Failed
, Arena
, GSStack
}, id("start"), TestLang
)
761 .dumpRecursive(TestLang
.G
),
762 "[ 0, end) start := <opaque>\n");
772 auto *Root
= GSStack
.addNode(0, nullptr, {});
773 auto *A
= GSStack
.addNode(0, nullptr, {Root
});
774 auto *B
= GSStack
.addNode(0, nullptr, {Root
});
775 auto *C
= GSStack
.addNode(0, nullptr, {Root
});
776 auto *D
= GSStack
.addNode(0, nullptr, {Root
});
777 auto *AB
= GSStack
.addNode(0, nullptr, {A
, B
});
779 EXPECT_EQ(1u, GSStack
.gc({AB
, C
})) << "D is destroyed";
780 EXPECT_EQ(0u, GSStack
.gc({AB
, C
})) << "D is already gone";
781 auto *E
= GSStack
.addNode(0, nullptr, {Root
});
782 EXPECT_EQ(D
, E
) << "Storage of GCed node D is reused for E";
783 EXPECT_EQ(3u, GSStack
.gc({A
, E
})) << "Destroys B, AB, C";
784 EXPECT_EQ(1u, GSStack
.gc({E
})) << "Destroys A";
788 } // namespace pseudo