1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements a YAML parser.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/Support/YAMLParser.h"
14 #include "llvm/ADT/AllocatorList.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/StringExtras.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Support/Compiler.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/MemoryBuffer.h"
25 #include "llvm/Support/SMLoc.h"
26 #include "llvm/Support/SourceMgr.h"
27 #include "llvm/Support/Unicode.h"
28 #include "llvm/Support/raw_ostream.h"
35 #include <system_error>
41 enum UnicodeEncodingForm
{
42 UEF_UTF32_LE
, ///< UTF-32 Little Endian
43 UEF_UTF32_BE
, ///< UTF-32 Big Endian
44 UEF_UTF16_LE
, ///< UTF-16 Little Endian
45 UEF_UTF16_BE
, ///< UTF-16 Big Endian
46 UEF_UTF8
, ///< UTF-8 or ascii.
47 UEF_Unknown
///< Not a valid Unicode encoding.
50 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
51 /// it exists. Length is in {0, 2, 3, 4}.
52 using EncodingInfo
= std::pair
<UnicodeEncodingForm
, unsigned>;
54 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
55 /// encoding form of \a Input.
57 /// @param Input A string of length 0 or more.
58 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
59 /// and how long the byte order mark is if one exists.
60 static EncodingInfo
getUnicodeEncoding(StringRef Input
) {
62 return std::make_pair(UEF_Unknown
, 0);
64 switch (uint8_t(Input
[0])) {
66 if (Input
.size() >= 4) {
68 && uint8_t(Input
[2]) == 0xFE
69 && uint8_t(Input
[3]) == 0xFF)
70 return std::make_pair(UEF_UTF32_BE
, 4);
71 if (Input
[1] == 0 && Input
[2] == 0 && Input
[3] != 0)
72 return std::make_pair(UEF_UTF32_BE
, 0);
75 if (Input
.size() >= 2 && Input
[1] != 0)
76 return std::make_pair(UEF_UTF16_BE
, 0);
77 return std::make_pair(UEF_Unknown
, 0);
79 if ( Input
.size() >= 4
80 && uint8_t(Input
[1]) == 0xFE
83 return std::make_pair(UEF_UTF32_LE
, 4);
85 if (Input
.size() >= 2 && uint8_t(Input
[1]) == 0xFE)
86 return std::make_pair(UEF_UTF16_LE
, 2);
87 return std::make_pair(UEF_Unknown
, 0);
89 if (Input
.size() >= 2 && uint8_t(Input
[1]) == 0xFF)
90 return std::make_pair(UEF_UTF16_BE
, 2);
91 return std::make_pair(UEF_Unknown
, 0);
93 if ( Input
.size() >= 3
94 && uint8_t(Input
[1]) == 0xBB
95 && uint8_t(Input
[2]) == 0xBF)
96 return std::make_pair(UEF_UTF8
, 3);
97 return std::make_pair(UEF_Unknown
, 0);
100 // It could still be utf-32 or utf-16.
101 if (Input
.size() >= 4 && Input
[1] == 0 && Input
[2] == 0 && Input
[3] == 0)
102 return std::make_pair(UEF_UTF32_LE
, 0);
104 if (Input
.size() >= 2 && Input
[1] == 0)
105 return std::make_pair(UEF_UTF16_LE
, 0);
107 return std::make_pair(UEF_UTF8
, 0);
110 /// Pin the vtables to this file.
111 void Node::anchor() {}
112 void NullNode::anchor() {}
113 void ScalarNode::anchor() {}
114 void BlockScalarNode::anchor() {}
115 void KeyValueNode::anchor() {}
116 void MappingNode::anchor() {}
117 void SequenceNode::anchor() {}
118 void AliasNode::anchor() {}
123 /// Token - A single YAML token.
126 TK_Error
, // Uninitialized token.
135 TK_BlockSequenceStart
,
136 TK_BlockMappingStart
,
138 TK_FlowSequenceStart
,
151 /// A string of length 0 or more whose begin() points to the logical location
152 /// of the token in the input.
155 /// The value of a block scalar node.
161 } // end namespace yaml
162 } // end namespace llvm
164 using TokenQueueT
= BumpPtrList
<Token
>;
168 /// This struct is used to track simple keys.
170 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
171 /// which could legally be the start of a simple key. When peekNext is called,
172 /// if the Token to be returned is referenced by a SimpleKey, we continue
173 /// tokenizing until that potential simple key has either been found not to be
174 /// a simple key (we moved on to the next line or went further than 1024 chars),
175 /// or we run into a Value, at which point we insert a Key token (and possibly
176 /// others) before the SimpleKey's Tok.
178 TokenQueueT::iterator Tok
;
181 unsigned FlowLevel
= 0;
182 bool IsRequired
= false;
184 bool operator ==(const SimpleKey
&Other
) {
185 return Tok
== Other
.Tok
;
189 } // end anonymous namespace
191 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit
192 /// subsequence and the subsequence's length in code units (uint8_t).
193 /// A length of 0 represents an error.
194 using UTF8Decoded
= std::pair
<uint32_t, unsigned>;
196 static UTF8Decoded
decodeUTF8(StringRef Range
) {
197 StringRef::iterator Position
= Range
.begin();
198 StringRef::iterator End
= Range
.end();
199 // 1 byte: [0x00, 0x7f]
200 // Bit pattern: 0xxxxxxx
201 if (Position
< End
&& (*Position
& 0x80) == 0) {
202 return std::make_pair(*Position
, 1);
204 // 2 bytes: [0x80, 0x7ff]
205 // Bit pattern: 110xxxxx 10xxxxxx
206 if (Position
+ 1 < End
&& ((*Position
& 0xE0) == 0xC0) &&
207 ((*(Position
+ 1) & 0xC0) == 0x80)) {
208 uint32_t codepoint
= ((*Position
& 0x1F) << 6) |
209 (*(Position
+ 1) & 0x3F);
210 if (codepoint
>= 0x80)
211 return std::make_pair(codepoint
, 2);
213 // 3 bytes: [0x8000, 0xffff]
214 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
215 if (Position
+ 2 < End
&& ((*Position
& 0xF0) == 0xE0) &&
216 ((*(Position
+ 1) & 0xC0) == 0x80) &&
217 ((*(Position
+ 2) & 0xC0) == 0x80)) {
218 uint32_t codepoint
= ((*Position
& 0x0F) << 12) |
219 ((*(Position
+ 1) & 0x3F) << 6) |
220 (*(Position
+ 2) & 0x3F);
221 // Codepoints between 0xD800 and 0xDFFF are invalid, as
222 // they are high / low surrogate halves used by UTF-16.
223 if (codepoint
>= 0x800 &&
224 (codepoint
< 0xD800 || codepoint
> 0xDFFF))
225 return std::make_pair(codepoint
, 3);
227 // 4 bytes: [0x10000, 0x10FFFF]
228 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
229 if (Position
+ 3 < End
&& ((*Position
& 0xF8) == 0xF0) &&
230 ((*(Position
+ 1) & 0xC0) == 0x80) &&
231 ((*(Position
+ 2) & 0xC0) == 0x80) &&
232 ((*(Position
+ 3) & 0xC0) == 0x80)) {
233 uint32_t codepoint
= ((*Position
& 0x07) << 18) |
234 ((*(Position
+ 1) & 0x3F) << 12) |
235 ((*(Position
+ 2) & 0x3F) << 6) |
236 (*(Position
+ 3) & 0x3F);
237 if (codepoint
>= 0x10000 && codepoint
<= 0x10FFFF)
238 return std::make_pair(codepoint
, 4);
240 return std::make_pair(0, 0);
246 /// Scans YAML tokens from a MemoryBuffer.
249 Scanner(StringRef Input
, SourceMgr
&SM
, bool ShowColors
= true,
250 std::error_code
*EC
= nullptr);
251 Scanner(MemoryBufferRef Buffer
, SourceMgr
&SM_
, bool ShowColors
= true,
252 std::error_code
*EC
= nullptr);
254 /// Parse the next token and return it without popping it.
257 /// Parse the next token and pop it from the queue.
260 void printError(SMLoc Loc
, SourceMgr::DiagKind Kind
, const Twine
&Message
,
261 ArrayRef
<SMRange
> Ranges
= std::nullopt
) {
262 SM
.PrintMessage(Loc
, Kind
, Message
, Ranges
, /* FixIts= */ std::nullopt
,
266 void setError(const Twine
&Message
, StringRef::iterator Position
) {
270 // propagate the error if possible
272 *EC
= make_error_code(std::errc::invalid_argument
);
274 // Don't print out more errors after the first one we encounter. The rest
275 // are just the result of the first, and have no meaning.
277 printError(SMLoc::getFromPointer(Position
), SourceMgr::DK_Error
, Message
);
281 /// Returns true if an error occurred while parsing.
287 void init(MemoryBufferRef Buffer
);
289 StringRef
currentInput() {
290 return StringRef(Current
, End
- Current
);
293 /// Decode a UTF-8 minimal well-formed code unit subsequence starting
296 /// If the UTF-8 code units starting at Position do not form a well-formed
297 /// code unit subsequence, then the Unicode scalar value is 0, and the length
299 UTF8Decoded
decodeUTF8(StringRef::iterator Position
) {
300 return ::decodeUTF8(StringRef(Position
, End
- Position
));
303 // The following functions are based on the grammar rules in the YAML spec. The
304 // style of the function names is meant to closely match how they are written
305 // in the spec. The number within the [] is the number of the grammar rule in
308 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
311 // A production starting and ending with a special character.
313 // A production matching a single line break.
315 // A production starting and ending with a non-break character.
317 // A production starting and ending with a white space character.
319 // A production starting and ending with a non-space character.
321 // A production matching complete line(s).
323 /// Skip a single nb-char[27] starting at Position.
325 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
326 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
328 /// @returns The code unit after the nb-char, or Position if it's not an
330 StringRef::iterator
skip_nb_char(StringRef::iterator Position
);
332 /// Skip a single b-break[28] starting at Position.
334 /// A b-break is 0xD 0xA | 0xD | 0xA
336 /// @returns The code unit after the b-break, or Position if it's not a
338 StringRef::iterator
skip_b_break(StringRef::iterator Position
);
340 /// Skip a single s-space[31] starting at Position.
342 /// An s-space is 0x20
344 /// @returns The code unit after the s-space, or Position if it's not a
346 StringRef::iterator
skip_s_space(StringRef::iterator Position
);
348 /// Skip a single s-white[33] starting at Position.
350 /// A s-white is 0x20 | 0x9
352 /// @returns The code unit after the s-white, or Position if it's not a
354 StringRef::iterator
skip_s_white(StringRef::iterator Position
);
356 /// Skip a single ns-char[34] starting at Position.
358 /// A ns-char is nb-char - s-white
360 /// @returns The code unit after the ns-char, or Position if it's not a
362 StringRef::iterator
skip_ns_char(StringRef::iterator Position
);
364 using SkipWhileFunc
= StringRef::iterator (Scanner::*)(StringRef::iterator
);
366 /// Skip minimal well-formed code unit subsequences until Func
367 /// returns its input.
369 /// @returns The code unit after the last minimal well-formed code unit
370 /// subsequence that Func accepted.
371 StringRef::iterator
skip_while( SkipWhileFunc Func
372 , StringRef::iterator Position
);
374 /// Skip minimal well-formed code unit subsequences until Func returns its
376 void advanceWhile(SkipWhileFunc Func
);
378 /// Scan ns-uri-char[39]s starting at Cur.
380 /// This updates Cur and Column while scanning.
381 void scan_ns_uri_char();
383 /// Consume a minimal well-formed code unit subsequence starting at
384 /// \a Cur. Return false if it is not the same Unicode scalar value as
385 /// \a Expected. This updates \a Column.
386 bool consume(uint32_t Expected
);
388 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
389 void skip(uint32_t Distance
);
391 /// Return true if the minimal well-formed code unit subsequence at
392 /// Pos is whitespace or a new line
393 bool isBlankOrBreak(StringRef::iterator Position
);
395 /// Return true if the minimal well-formed code unit subsequence at
396 /// Pos is considered a "safe" character for plain scalars.
397 bool isPlainSafeNonBlank(StringRef::iterator Position
);
399 /// Return true if the line is a line break, false otherwise.
400 bool isLineEmpty(StringRef Line
);
402 /// Consume a single b-break[28] if it's present at the current position.
404 /// Return false if the code unit at the current position isn't a line break.
405 bool consumeLineBreakIfPresent();
407 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
408 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
412 /// Remove simple keys that can no longer be valid simple keys.
414 /// Invalid simple keys are not on the current line or are further than 1024
416 void removeStaleSimpleKeyCandidates();
418 /// Remove all simple keys on FlowLevel \a Level.
419 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level
);
421 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
422 /// tokens if needed.
423 bool unrollIndent(int ToColumn
);
425 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
427 bool rollIndent( int ToColumn
428 , Token::TokenKind Kind
429 , TokenQueueT::iterator InsertPoint
);
431 /// Skip a single-line comment when the comment starts at the current
432 /// position of the scanner.
435 /// Skip whitespace and comments until the start of the next token.
436 void scanToNextToken();
438 /// Must be the first token generated.
439 bool scanStreamStart();
441 /// Generate tokens needed to close out the stream.
442 bool scanStreamEnd();
444 /// Scan a %BLAH directive.
445 bool scanDirective();
447 /// Scan a ... or ---.
448 bool scanDocumentIndicator(bool IsStart
);
450 /// Scan a [ or { and generate the proper flow collection start token.
451 bool scanFlowCollectionStart(bool IsSequence
);
453 /// Scan a ] or } and generate the proper flow collection end token.
454 bool scanFlowCollectionEnd(bool IsSequence
);
456 /// Scan the , that separates entries in a flow collection.
457 bool scanFlowEntry();
459 /// Scan the - that starts block sequence entries.
460 bool scanBlockEntry();
462 /// Scan an explicit ? indicating a key.
465 /// Scan an explicit : indicating a value.
468 /// Scan a quoted scalar.
469 bool scanFlowScalar(bool IsDoubleQuoted
);
471 /// Scan an unquoted scalar.
472 bool scanPlainScalar();
474 /// Scan an Alias or Anchor starting with * or &.
475 bool scanAliasOrAnchor(bool IsAlias
);
477 /// Scan a block scalar starting with | or >.
478 bool scanBlockScalar(bool IsLiteral
);
480 /// Scan a block scalar style indicator and header.
482 /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that
483 /// YAML does not consider the style indicator to be a part of the header.
485 /// Return false if an error occurred.
486 bool scanBlockScalarIndicators(char &StyleIndicator
, char &ChompingIndicator
,
487 unsigned &IndentIndicator
, bool &IsDone
);
489 /// Scan a style indicator in a block scalar header.
490 char scanBlockStyleIndicator();
492 /// Scan a chomping indicator in a block scalar header.
493 char scanBlockChompingIndicator();
495 /// Scan an indentation indicator in a block scalar header.
496 unsigned scanBlockIndentationIndicator();
498 /// Scan a block scalar header.
500 /// Return false if an error occurred.
501 bool scanBlockScalarHeader(char &ChompingIndicator
, unsigned &IndentIndicator
,
504 /// Look for the indentation level of a block scalar.
506 /// Return false if an error occurred.
507 bool findBlockScalarIndent(unsigned &BlockIndent
, unsigned BlockExitIndent
,
508 unsigned &LineBreaks
, bool &IsDone
);
510 /// Scan the indentation of a text line in a block scalar.
512 /// Return false if an error occurred.
513 bool scanBlockScalarIndent(unsigned BlockIndent
, unsigned BlockExitIndent
,
516 /// Scan a tag of the form !stuff.
519 /// Dispatch to the next scanning function based on \a *Cur.
520 bool fetchMoreTokens();
522 /// The SourceMgr used for diagnostics and buffer management.
525 /// The original input.
526 MemoryBufferRef InputBuffer
;
528 /// The current position of the scanner.
529 StringRef::iterator Current
;
531 /// The end of the input (one past the last character).
532 StringRef::iterator End
;
534 /// Current YAML indentation level in spaces.
537 /// Current column number in Unicode code points.
540 /// Current line number.
543 /// How deep we are in flow style containers. 0 Means at block level.
546 /// Are we at the start of the stream?
547 bool IsStartOfStream
;
549 /// Can the next token be the start of a simple key?
550 bool IsSimpleKeyAllowed
;
552 /// Can the next token be a value indicator even if it does not have a
554 bool IsAdjacentValueAllowedInFlow
;
556 /// True if an error has occurred.
559 /// Should colors be used when printing out the diagnostic messages?
562 /// Queue of tokens. This is required to queue up tokens while looking
563 /// for the end of a simple key. And for cases where a single character
564 /// can produce multiple tokens (e.g. BlockEnd).
565 TokenQueueT TokenQueue
;
567 /// Indentation levels.
568 SmallVector
<int, 4> Indents
;
570 /// Potential simple keys.
571 SmallVector
<SimpleKey
, 4> SimpleKeys
;
576 } // end namespace yaml
577 } // end namespace llvm
579 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
580 static void encodeUTF8( uint32_t UnicodeScalarValue
581 , SmallVectorImpl
<char> &Result
) {
582 if (UnicodeScalarValue
<= 0x7F) {
583 Result
.push_back(UnicodeScalarValue
& 0x7F);
584 } else if (UnicodeScalarValue
<= 0x7FF) {
585 uint8_t FirstByte
= 0xC0 | ((UnicodeScalarValue
& 0x7C0) >> 6);
586 uint8_t SecondByte
= 0x80 | (UnicodeScalarValue
& 0x3F);
587 Result
.push_back(FirstByte
);
588 Result
.push_back(SecondByte
);
589 } else if (UnicodeScalarValue
<= 0xFFFF) {
590 uint8_t FirstByte
= 0xE0 | ((UnicodeScalarValue
& 0xF000) >> 12);
591 uint8_t SecondByte
= 0x80 | ((UnicodeScalarValue
& 0xFC0) >> 6);
592 uint8_t ThirdByte
= 0x80 | (UnicodeScalarValue
& 0x3F);
593 Result
.push_back(FirstByte
);
594 Result
.push_back(SecondByte
);
595 Result
.push_back(ThirdByte
);
596 } else if (UnicodeScalarValue
<= 0x10FFFF) {
597 uint8_t FirstByte
= 0xF0 | ((UnicodeScalarValue
& 0x1F0000) >> 18);
598 uint8_t SecondByte
= 0x80 | ((UnicodeScalarValue
& 0x3F000) >> 12);
599 uint8_t ThirdByte
= 0x80 | ((UnicodeScalarValue
& 0xFC0) >> 6);
600 uint8_t FourthByte
= 0x80 | (UnicodeScalarValue
& 0x3F);
601 Result
.push_back(FirstByte
);
602 Result
.push_back(SecondByte
);
603 Result
.push_back(ThirdByte
);
604 Result
.push_back(FourthByte
);
608 bool yaml::dumpTokens(StringRef Input
, raw_ostream
&OS
) {
610 Scanner
scanner(Input
, SM
);
612 Token T
= scanner
.getNext();
614 case Token::TK_StreamStart
:
615 OS
<< "Stream-Start: ";
617 case Token::TK_StreamEnd
:
618 OS
<< "Stream-End: ";
620 case Token::TK_VersionDirective
:
621 OS
<< "Version-Directive: ";
623 case Token::TK_TagDirective
:
624 OS
<< "Tag-Directive: ";
626 case Token::TK_DocumentStart
:
627 OS
<< "Document-Start: ";
629 case Token::TK_DocumentEnd
:
630 OS
<< "Document-End: ";
632 case Token::TK_BlockEntry
:
633 OS
<< "Block-Entry: ";
635 case Token::TK_BlockEnd
:
638 case Token::TK_BlockSequenceStart
:
639 OS
<< "Block-Sequence-Start: ";
641 case Token::TK_BlockMappingStart
:
642 OS
<< "Block-Mapping-Start: ";
644 case Token::TK_FlowEntry
:
645 OS
<< "Flow-Entry: ";
647 case Token::TK_FlowSequenceStart
:
648 OS
<< "Flow-Sequence-Start: ";
650 case Token::TK_FlowSequenceEnd
:
651 OS
<< "Flow-Sequence-End: ";
653 case Token::TK_FlowMappingStart
:
654 OS
<< "Flow-Mapping-Start: ";
656 case Token::TK_FlowMappingEnd
:
657 OS
<< "Flow-Mapping-End: ";
662 case Token::TK_Value
:
665 case Token::TK_Scalar
:
668 case Token::TK_BlockScalar
:
669 OS
<< "Block Scalar: ";
671 case Token::TK_Alias
:
674 case Token::TK_Anchor
:
680 case Token::TK_Error
:
683 OS
<< T
.Range
<< "\n";
684 if (T
.Kind
== Token::TK_StreamEnd
)
686 else if (T
.Kind
== Token::TK_Error
)
692 bool yaml::scanTokens(StringRef Input
) {
694 Scanner
scanner(Input
, SM
);
696 Token T
= scanner
.getNext();
697 if (T
.Kind
== Token::TK_StreamEnd
)
699 else if (T
.Kind
== Token::TK_Error
)
705 std::string
yaml::escape(StringRef Input
, bool EscapePrintable
) {
706 std::string EscapedInput
;
707 for (StringRef::iterator i
= Input
.begin(), e
= Input
.end(); i
!= e
; ++i
) {
709 EscapedInput
+= "\\\\";
711 EscapedInput
+= "\\\"";
713 EscapedInput
+= "\\0";
715 EscapedInput
+= "\\a";
717 EscapedInput
+= "\\b";
719 EscapedInput
+= "\\t";
721 EscapedInput
+= "\\n";
723 EscapedInput
+= "\\v";
725 EscapedInput
+= "\\f";
727 EscapedInput
+= "\\r";
729 EscapedInput
+= "\\e";
730 else if ((unsigned char)*i
< 0x20) { // Control characters not handled above.
731 std::string HexStr
= utohexstr(*i
);
732 EscapedInput
+= "\\x" + std::string(2 - HexStr
.size(), '0') + HexStr
;
733 } else if (*i
& 0x80) { // UTF-8 multiple code unit subsequence.
734 UTF8Decoded UnicodeScalarValue
735 = decodeUTF8(StringRef(i
, Input
.end() - i
));
736 if (UnicodeScalarValue
.second
== 0) {
737 // Found invalid char.
739 encodeUTF8(0xFFFD, Val
);
740 llvm::append_range(EscapedInput
, Val
);
741 // FIXME: Error reporting.
744 if (UnicodeScalarValue
.first
== 0x85)
745 EscapedInput
+= "\\N";
746 else if (UnicodeScalarValue
.first
== 0xA0)
747 EscapedInput
+= "\\_";
748 else if (UnicodeScalarValue
.first
== 0x2028)
749 EscapedInput
+= "\\L";
750 else if (UnicodeScalarValue
.first
== 0x2029)
751 EscapedInput
+= "\\P";
752 else if (!EscapePrintable
&&
753 sys::unicode::isPrintable(UnicodeScalarValue
.first
))
754 EscapedInput
+= StringRef(i
, UnicodeScalarValue
.second
);
756 std::string HexStr
= utohexstr(UnicodeScalarValue
.first
);
757 if (HexStr
.size() <= 2)
758 EscapedInput
+= "\\x" + std::string(2 - HexStr
.size(), '0') + HexStr
;
759 else if (HexStr
.size() <= 4)
760 EscapedInput
+= "\\u" + std::string(4 - HexStr
.size(), '0') + HexStr
;
761 else if (HexStr
.size() <= 8)
762 EscapedInput
+= "\\U" + std::string(8 - HexStr
.size(), '0') + HexStr
;
764 i
+= UnicodeScalarValue
.second
- 1;
766 EscapedInput
.push_back(*i
);
771 std::optional
<bool> yaml::parseBool(StringRef S
) {
787 if (S
[1] == 'N') // ON
791 if (S
[1] == 'n') //[Oo]n
795 if (S
[1] == 'O') // NO
799 if (S
[1] == 'o') //[Nn]o
808 if (S
.drop_front() == "FF") // OFF
812 if (S
.drop_front() == "ff") //[Oo]ff
816 if (S
.drop_front() == "ES") // YES
820 if (S
.drop_front() == "es") //[Yy]es
829 if (S
.drop_front() == "RUE") // TRUE
833 if (S
.drop_front() == "rue") //[Tt]rue
842 if (S
.drop_front() == "ALSE") // FALSE
846 if (S
.drop_front() == "alse") //[Ff]alse
857 Scanner::Scanner(StringRef Input
, SourceMgr
&sm
, bool ShowColors
,
859 : SM(sm
), ShowColors(ShowColors
), EC(EC
) {
860 init(MemoryBufferRef(Input
, "YAML"));
863 Scanner::Scanner(MemoryBufferRef Buffer
, SourceMgr
&SM_
, bool ShowColors
,
865 : SM(SM_
), ShowColors(ShowColors
), EC(EC
) {
869 void Scanner::init(MemoryBufferRef Buffer
) {
870 InputBuffer
= Buffer
;
871 Current
= InputBuffer
.getBufferStart();
872 End
= InputBuffer
.getBufferEnd();
877 IsStartOfStream
= true;
878 IsSimpleKeyAllowed
= true;
879 IsAdjacentValueAllowedInFlow
= false;
881 std::unique_ptr
<MemoryBuffer
> InputBufferOwner
=
882 MemoryBuffer::getMemBuffer(Buffer
, /*RequiresNullTerminator=*/false);
883 SM
.AddNewSourceBuffer(std::move(InputBufferOwner
), SMLoc());
886 Token
&Scanner::peekNext() {
887 // If the current token is a possible simple key, keep parsing until we
889 bool NeedMore
= false;
891 if (TokenQueue
.empty() || NeedMore
) {
892 if (!fetchMoreTokens()) {
895 TokenQueue
.push_back(Token());
896 return TokenQueue
.front();
899 assert(!TokenQueue
.empty() &&
900 "fetchMoreTokens lied about getting tokens!");
902 removeStaleSimpleKeyCandidates();
904 SK
.Tok
= TokenQueue
.begin();
905 if (!is_contained(SimpleKeys
, SK
))
910 return TokenQueue
.front();
913 Token
Scanner::getNext() {
914 Token Ret
= peekNext();
915 // TokenQueue can be empty if there was an error getting the next token.
916 if (!TokenQueue
.empty())
917 TokenQueue
.pop_front();
919 // There cannot be any referenced Tokens if the TokenQueue is empty. So do a
920 // quick deallocation of them all.
921 if (TokenQueue
.empty())
922 TokenQueue
.resetAlloc();
927 StringRef::iterator
Scanner::skip_nb_char(StringRef::iterator Position
) {
930 // Check 7 bit c-printable - b-char.
931 if ( *Position
== 0x09
932 || (*Position
>= 0x20 && *Position
<= 0x7E))
935 // Check for valid UTF-8.
936 if (uint8_t(*Position
) & 0x80) {
937 UTF8Decoded u8d
= decodeUTF8(Position
);
939 && u8d
.first
!= 0xFEFF
940 && ( u8d
.first
== 0x85
941 || ( u8d
.first
>= 0xA0
942 && u8d
.first
<= 0xD7FF)
943 || ( u8d
.first
>= 0xE000
944 && u8d
.first
<= 0xFFFD)
945 || ( u8d
.first
>= 0x10000
946 && u8d
.first
<= 0x10FFFF)))
947 return Position
+ u8d
.second
;
952 StringRef::iterator
Scanner::skip_b_break(StringRef::iterator Position
) {
955 if (*Position
== 0x0D) {
956 if (Position
+ 1 != End
&& *(Position
+ 1) == 0x0A)
961 if (*Position
== 0x0A)
966 StringRef::iterator
Scanner::skip_s_space(StringRef::iterator Position
) {
969 if (*Position
== ' ')
974 StringRef::iterator
Scanner::skip_s_white(StringRef::iterator Position
) {
977 if (*Position
== ' ' || *Position
== '\t')
982 StringRef::iterator
Scanner::skip_ns_char(StringRef::iterator Position
) {
985 if (*Position
== ' ' || *Position
== '\t')
987 return skip_nb_char(Position
);
990 StringRef::iterator
Scanner::skip_while( SkipWhileFunc Func
991 , StringRef::iterator Position
) {
993 StringRef::iterator i
= (this->*Func
)(Position
);
1001 void Scanner::advanceWhile(SkipWhileFunc Func
) {
1002 auto Final
= skip_while(Func
, Current
);
1003 Column
+= Final
- Current
;
/// Return true if \a C is an ns-hex-digit: one of 0-9, A-F or a-f.
///
/// Used to validate the two characters that follow '%' in a URI escape.
/// The ranges are compared explicitly (rather than via a general
/// alphanumeric test) so that letters beyond 'f'/'F' are not mistaken for
/// hexadecimal digits, and so that the result does not depend on the locale.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9') || (C >= 'a' && C <= 'f') ||
         (C >= 'A' && C <= 'F');
}
/// Return true if \a C is an ns-word-char: an ASCII decimal digit, an ASCII
/// letter, or '-'.
///
/// Decimal digits are part of this character class, so they are accepted
/// here alongside letters; otherwise URI scanning would stop at the first
/// digit it meets. The ranges are compared explicitly so the result does not
/// depend on the locale.
static bool is_ns_word_char(const char C) {
  return C == '-' || (C >= '0' && C <= '9') || (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
1011 void Scanner::scan_ns_uri_char() {
1015 if (( *Current
== '%'
1016 && Current
+ 2 < End
1017 && is_ns_hex_digit(*(Current
+ 1))
1018 && is_ns_hex_digit(*(Current
+ 2)))
1019 || is_ns_word_char(*Current
)
1020 || StringRef(Current
, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
1021 != StringRef::npos
) {
1029 bool Scanner::consume(uint32_t Expected
) {
1030 if (Expected
>= 0x80) {
1031 setError("Cannot consume non-ascii characters", Current
);
1036 if (uint8_t(*Current
) >= 0x80) {
1037 setError("Cannot consume non-ascii characters", Current
);
1040 if (uint8_t(*Current
) == Expected
) {
1048 void Scanner::skip(uint32_t Distance
) {
1049 Current
+= Distance
;
1051 assert(Current
<= End
&& "Skipped past the end");
1054 bool Scanner::isBlankOrBreak(StringRef::iterator Position
) {
1055 if (Position
== End
)
1057 return *Position
== ' ' || *Position
== '\t' || *Position
== '\r' ||
1061 bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position
) {
1062 if (Position
== End
|| isBlankOrBreak(Position
))
1065 StringRef(Position
, 1).find_first_of(",[]{}") != StringRef::npos
)
1070 bool Scanner::isLineEmpty(StringRef Line
) {
1071 for (const auto *Position
= Line
.begin(); Position
!= Line
.end(); ++Position
)
1072 if (!isBlankOrBreak(Position
))
1077 bool Scanner::consumeLineBreakIfPresent() {
1078 auto Next
= skip_b_break(Current
);
1079 if (Next
== Current
)
1087 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
1089 , bool IsRequired
) {
1090 if (IsSimpleKeyAllowed
) {
1094 SK
.Column
= AtColumn
;
1095 SK
.IsRequired
= IsRequired
;
1096 SK
.FlowLevel
= FlowLevel
;
1097 SimpleKeys
.push_back(SK
);
1101 void Scanner::removeStaleSimpleKeyCandidates() {
1102 for (SmallVectorImpl
<SimpleKey
>::iterator i
= SimpleKeys
.begin();
1103 i
!= SimpleKeys
.end();) {
1104 if (i
->Line
!= Line
|| i
->Column
+ 1024 < Column
) {
1106 setError( "Could not find expected : for simple key"
1107 , i
->Tok
->Range
.begin());
1108 i
= SimpleKeys
.erase(i
);
1114 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level
) {
1115 if (!SimpleKeys
.empty() && (SimpleKeys
.end() - 1)->FlowLevel
== Level
)
1116 SimpleKeys
.pop_back();
1119 bool Scanner::unrollIndent(int ToColumn
) {
1121 // Indentation is ignored in flow.
1125 while (Indent
> ToColumn
) {
1126 T
.Kind
= Token::TK_BlockEnd
;
1127 T
.Range
= StringRef(Current
, 1);
1128 TokenQueue
.push_back(T
);
1129 Indent
= Indents
.pop_back_val();
1135 bool Scanner::rollIndent( int ToColumn
1136 , Token::TokenKind Kind
1137 , TokenQueueT::iterator InsertPoint
) {
1140 if (Indent
< ToColumn
) {
1141 Indents
.push_back(Indent
);
1146 T
.Range
= StringRef(Current
, 0);
1147 TokenQueue
.insert(InsertPoint
, T
);
1152 void Scanner::skipComment() {
1153 if (Current
== End
|| *Current
!= '#')
1156 // This may skip more than one byte, thus Column is only incremented
1158 StringRef::iterator I
= skip_nb_char(Current
);
1166 void Scanner::scanToNextToken() {
1168 while (Current
!= End
&& (*Current
== ' ' || *Current
== '\t')) {
1175 StringRef::iterator i
= skip_b_break(Current
);
1181 // New lines may start a simple key.
1183 IsSimpleKeyAllowed
= true;
1187 bool Scanner::scanStreamStart() {
1188 IsStartOfStream
= false;
1190 EncodingInfo EI
= getUnicodeEncoding(currentInput());
1193 T
.Kind
= Token::TK_StreamStart
;
1194 T
.Range
= StringRef(Current
, EI
.second
);
1195 TokenQueue
.push_back(T
);
1196 Current
+= EI
.second
;
1200 bool Scanner::scanStreamEnd() {
1201 // Force an ending new line if one isn't present.
1209 IsSimpleKeyAllowed
= false;
1210 IsAdjacentValueAllowedInFlow
= false;
1213 T
.Kind
= Token::TK_StreamEnd
;
1214 T
.Range
= StringRef(Current
, 0);
1215 TokenQueue
.push_back(T
);
1219 bool Scanner::scanDirective() {
1220 // Reset the indentation level.
1223 IsSimpleKeyAllowed
= false;
1224 IsAdjacentValueAllowedInFlow
= false;
1226 StringRef::iterator Start
= Current
;
1228 StringRef::iterator NameStart
= Current
;
1229 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1230 StringRef
Name(NameStart
, Current
- NameStart
);
1231 Current
= skip_while(&Scanner::skip_s_white
, Current
);
1234 if (Name
== "YAML") {
1235 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1236 T
.Kind
= Token::TK_VersionDirective
;
1237 T
.Range
= StringRef(Start
, Current
- Start
);
1238 TokenQueue
.push_back(T
);
1240 } else if(Name
== "TAG") {
1241 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1242 Current
= skip_while(&Scanner::skip_s_white
, Current
);
1243 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1244 T
.Kind
= Token::TK_TagDirective
;
1245 T
.Range
= StringRef(Start
, Current
- Start
);
1246 TokenQueue
.push_back(T
);
1252 bool Scanner::scanDocumentIndicator(bool IsStart
) {
1255 IsSimpleKeyAllowed
= false;
1256 IsAdjacentValueAllowedInFlow
= false;
1259 T
.Kind
= IsStart
? Token::TK_DocumentStart
: Token::TK_DocumentEnd
;
1260 T
.Range
= StringRef(Current
, 3);
1262 TokenQueue
.push_back(T
);
1266 bool Scanner::scanFlowCollectionStart(bool IsSequence
) {
1268 T
.Kind
= IsSequence
? Token::TK_FlowSequenceStart
1269 : Token::TK_FlowMappingStart
;
1270 T
.Range
= StringRef(Current
, 1);
1272 TokenQueue
.push_back(T
);
1274 // [ and { may begin a simple key.
1275 saveSimpleKeyCandidate(--TokenQueue
.end(), Column
- 1, false);
1277 // And may also be followed by a simple key.
1278 IsSimpleKeyAllowed
= true;
1279 // Adjacent values are allowed in flows only after JSON-style keys.
1280 IsAdjacentValueAllowedInFlow
= false;
1285 bool Scanner::scanFlowCollectionEnd(bool IsSequence
) {
1286 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1287 IsSimpleKeyAllowed
= false;
1288 IsAdjacentValueAllowedInFlow
= true;
1290 T
.Kind
= IsSequence
? Token::TK_FlowSequenceEnd
1291 : Token::TK_FlowMappingEnd
;
1292 T
.Range
= StringRef(Current
, 1);
1294 TokenQueue
.push_back(T
);
1300 bool Scanner::scanFlowEntry() {
1301 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1302 IsSimpleKeyAllowed
= true;
1303 IsAdjacentValueAllowedInFlow
= false;
1305 T
.Kind
= Token::TK_FlowEntry
;
1306 T
.Range
= StringRef(Current
, 1);
1308 TokenQueue
.push_back(T
);
1312 bool Scanner::scanBlockEntry() {
1313 rollIndent(Column
, Token::TK_BlockSequenceStart
, TokenQueue
.end());
1314 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1315 IsSimpleKeyAllowed
= true;
1316 IsAdjacentValueAllowedInFlow
= false;
1318 T
.Kind
= Token::TK_BlockEntry
;
1319 T
.Range
= StringRef(Current
, 1);
1321 TokenQueue
.push_back(T
);
1325 bool Scanner::scanKey() {
1327 rollIndent(Column
, Token::TK_BlockMappingStart
, TokenQueue
.end());
1329 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1330 IsSimpleKeyAllowed
= !FlowLevel
;
1331 IsAdjacentValueAllowedInFlow
= false;
1334 T
.Kind
= Token::TK_Key
;
1335 T
.Range
= StringRef(Current
, 1);
1337 TokenQueue
.push_back(T
);
1341 bool Scanner::scanValue() {
1342 // If the previous token could have been a simple key, insert the key token
1343 // into the token queue.
1344 if (!SimpleKeys
.empty()) {
1345 SimpleKey SK
= SimpleKeys
.pop_back_val();
1347 T
.Kind
= Token::TK_Key
;
1348 T
.Range
= SK
.Tok
->Range
;
1349 TokenQueueT::iterator i
, e
;
1350 for (i
= TokenQueue
.begin(), e
= TokenQueue
.end(); i
!= e
; ++i
) {
1358 i
= TokenQueue
.insert(i
, T
);
1360 // We may also need to add a Block-Mapping-Start token.
1361 rollIndent(SK
.Column
, Token::TK_BlockMappingStart
, i
);
1363 IsSimpleKeyAllowed
= false;
1366 rollIndent(Column
, Token::TK_BlockMappingStart
, TokenQueue
.end());
1367 IsSimpleKeyAllowed
= !FlowLevel
;
1369 IsAdjacentValueAllowedInFlow
= false;
1372 T
.Kind
= Token::TK_Value
;
1373 T
.Range
= StringRef(Current
, 1);
1375 TokenQueue
.push_back(T
);
1379 // Forbidding inlining improves performance by roughly 20%.
1380 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1381 LLVM_ATTRIBUTE_NOINLINE
static bool
1382 wasEscaped(StringRef::iterator First
, StringRef::iterator Position
);
1384 // Returns whether a character at 'Position' was escaped with a leading '\'.
1385 // 'First' specifies the position of the first character in the string.
1386 static bool wasEscaped(StringRef::iterator First
,
1387 StringRef::iterator Position
) {
1388 assert(Position
- 1 >= First
);
1389 StringRef::iterator I
= Position
- 1;
1390 // We calculate the number of consecutive '\'s before the current position
1391 // by iterating backwards through our string.
1392 while (I
>= First
&& *I
== '\\') --I
;
1393 // (Position - 1 - I) now contains the number of '\'s before the current
1394 // position. If it is odd, the character at 'Position' was escaped.
1395 return (Position
- 1 - I
) % 2 == 1;
/// Scan a quoted flow scalar and queue a TK_Scalar token for it.
///
/// \param IsDoubleQuoted true for a "..." scalar, false for a '...' scalar
///        (see the dispatch in fetchMoreTokens).
///
/// Reaching End before the closing quote reports "Expected quote at end of
/// scalar". The token Range runs from the starting position to Current after
/// the ending quote has been skipped. The token is registered as a simple key
/// candidate at its starting column, and an adjacent value is allowed in flow
/// context afterwards.
1398 bool Scanner::scanFlowScalar(bool IsDoubleQuoted
) {
1399 StringRef::iterator Start
= Current
;
1400 unsigned ColStart
= Column
;
1401 if (IsDoubleQuoted
) {
// Double-quoted: advance to the next '"', repeating while that quote is
// preceded by a backslash that wasEscaped() reports as escaping it.
1404 while (Current
!= End
&& *Current
!= '"')
1406 // Repeat until the previous character was not a '\' or was an escaped
1408 } while ( Current
!= End
1409 && *(Current
- 1) == '\\'
1410 && wasEscaped(Start
+ 1, Current
));
// Single-quoted: a doubled quote ('') is the only escape form.
1413 while (Current
!= End
) {
1414 // Skip a ' followed by another '.
1415 if (Current
+ 1 < End
&& *Current
== '\'' && *(Current
+ 1) == '\'') {
1418 } else if (*Current
== '\'')
1420 StringRef::iterator i
= skip_nb_char(Current
);
1422 i
= skip_b_break(Current
);
1437 if (Current
== End
) {
1438 setError("Expected quote at end of scalar", Current
);
1442 skip(1); // Skip ending quote.
1444 T
.Kind
= Token::TK_Scalar
;
1445 T
.Range
= StringRef(Start
, Current
- Start
);
1446 TokenQueue
.push_back(T
);
1448 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1450 IsSimpleKeyAllowed
= false;
1451 IsAdjacentValueAllowedInFlow
= true;
/// Scan an unquoted (plain) scalar and queue a TK_Scalar token for it.
///
/// Content characters are consumed while they are plain-safe non-blanks; a ':'
/// is content only when the character after it is itself a plain-safe
/// non-blank. A tab found in the indentation of a continuation line is
/// reported as "Found invalid tab character in indentation", and a scalar
/// that consumed nothing is reported as "Got empty plain scalar". The token is
/// registered as a simple key candidate at its starting column.
1456 bool Scanner::scanPlainScalar() {
1457 StringRef::iterator Start
= Current
;
1458 unsigned ColStart
= Column
;
1459 unsigned LeadingBlanks
= 0;
1460 assert(Indent
>= -1 && "Indent must be >= -1 !");
// Minimum column for continuation lines: one past the current Indent
// (Indent == -1 yields 0).
1461 unsigned indent
= static_cast<unsigned>(Indent
+ 1);
1462 while (Current
!= End
) {
1463 if (*Current
== '#')
1466 while (Current
!= End
&&
1467 ((*Current
!= ':' && isPlainSafeNonBlank(Current
)) ||
1468 (*Current
== ':' && isPlainSafeNonBlank(Current
+ 1)))) {
1469 StringRef::iterator i
= skip_nb_char(Current
);
1476 // Are we at the end?
1477 if (!isBlankOrBreak(Current
))
// Look ahead over blanks and breaks with a scratch iterator.
1481 StringRef::iterator Tmp
= Current
;
1482 while (isBlankOrBreak(Tmp
)) {
1483 StringRef::iterator i
= skip_s_white(Tmp
);
1485 if (LeadingBlanks
&& (Column
< indent
) && *Tmp
== '\t') {
1486 setError("Found invalid tab character in indentation", Tmp
);
1492 i
= skip_b_break(Tmp
);
1501 if (!FlowLevel
&& Column
< indent
)
1506 if (Start
== Current
) {
1507 setError("Got empty plain scalar", Start
);
1511 T
.Kind
= Token::TK_Scalar
;
1512 T
.Range
= StringRef(Start
, Current
- Start
);
1513 TokenQueue
.push_back(T
);
1515 // Plain scalars can be simple keys.
1516 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1518 IsSimpleKeyAllowed
= false;
1519 IsAdjacentValueAllowedInFlow
= false;
/// Scan an alias ('*name') or an anchor ('&name') and queue the token.
///
/// \param IsAlias true to produce TK_Alias, false to produce TK_Anchor.
///
/// The name is scanned with skip_ns_char and also ends at flow indicators such
/// as '[', ']', '{' and '}'. When Current ends exactly one past Start, "Got
/// empty alias or anchor" is reported. The token Range starts at Start, and
/// the token is registered as a simple key candidate.
1524 bool Scanner::scanAliasOrAnchor(bool IsAlias
) {
1525 StringRef::iterator Start
= Current
;
1526 unsigned ColStart
= Column
;
1528 while (Current
!= End
) {
1529 if ( *Current
== '[' || *Current
== ']'
1530 || *Current
== '{' || *Current
== '}'
1534 StringRef::iterator i
= skip_ns_char(Current
);
1541 if (Start
+ 1 == Current
) {
1542 setError("Got empty alias or anchor", Start
);
1547 T
.Kind
= IsAlias
? Token::TK_Alias
: Token::TK_Anchor
;
1548 T
.Range
= StringRef(Start
, Current
- Start
);
1549 TokenQueue
.push_back(T
);
1551 // Alias and anchors can be simple keys.
1552 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1554 IsSimpleKeyAllowed
= false;
1555 IsAdjacentValueAllowedInFlow
= false;
/// Scan the indicators that introduce a block scalar.
///
/// \param StyleIndicator receives the result of scanBlockStyleIndicator().
/// \param ChompingIndicator filled in by scanBlockScalarHeader().
/// \param IndentIndicator filled in by scanBlockScalarHeader().
///
/// The style indicator is read first; the rest of the header is delegated to
/// scanBlockScalarHeader, whose failure is checked here.
1560 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator
,
1561 char &ChompingIndicator
,
1562 unsigned &IndentIndicator
,
1564 StyleIndicator
= scanBlockStyleIndicator();
1565 if (!scanBlockScalarHeader(ChompingIndicator
, IndentIndicator
, IsDone
))
/// Read the block scalar style indicator at Current.
///
/// Indicator starts out as ' ' and is set to the character at Current when
/// that character is '>' or '|'.
1570 char Scanner::scanBlockStyleIndicator() {
1571 char Indicator
= ' ';
1572 if (Current
!= End
&& (*Current
== '>' || *Current
== '|')) {
1573 Indicator
= *Current
;
/// Read the block scalar chomping indicator at Current.
///
/// Indicator starts out as ' ' and is set to the character at Current when
/// that character is '+' or '-'.
1579 char Scanner::scanBlockChompingIndicator() {
1580 char Indicator
= ' ';
1581 if (Current
!= End
&& (*Current
== '+' || *Current
== '-')) {
1582 Indicator
= *Current
;
1588 /// Get the number of line breaks after chomping.
1590 /// Return the number of trailing line breaks to emit, depending on
1591 /// \p ChompingIndicator.
///
/// \param ChompingIndicator '-' strips all trailing breaks, '+' keeps them,
///        and any other value clips them.
/// \param LineBreaks number of trailing line breaks that were seen.
/// \param Str scalar text collected so far; when clipping, an empty \p Str
///        yields 0 and a non-empty one yields 1.
1592 static unsigned getChompedLineBreaks(char ChompingIndicator
,
1593 unsigned LineBreaks
, StringRef Str
) {
1594 if (ChompingIndicator
== '-') // Strip all line breaks.
1596 if (ChompingIndicator
== '+') // Keep all line breaks.
1598 // Clip trailing lines.
1599 return Str
.empty() ? 0 : 1;
/// Read the explicit indentation indicator of a block scalar header.
///
/// Indent starts out as 0 and is set to the numeric value of the character at
/// Current when that character is a digit from '1' to '9'.
1602 unsigned Scanner::scanBlockIndentationIndicator() {
1603 unsigned Indent
= 0;
1604 if (Current
!= End
&& (*Current
>= '1' && *Current
<= '9')) {
1605 Indent
= unsigned(*Current
- '0');
/// Scan the part of a block scalar header that follows the style indicator.
///
/// \param ChompingIndicator receives '+', '-' or ' ' (none given).
/// \param IndentIndicator receives the explicit indentation, or 0 if absent.
/// \param IsDone NOTE(review): presumably set when the input ends right after
///        the header -- confirm.
///
/// The chomping indicator may appear before or after the indentation
/// indicator, so it is looked for a second time if the first attempt found
/// nothing. Trailing white space is skipped. At end of input an empty
/// TK_BlockScalar token is queued; otherwise a line break must follow, or
/// "Expected a line break after block scalar header" is reported.
1611 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator
,
1612 unsigned &IndentIndicator
, bool &IsDone
) {
1613 auto Start
= Current
;
1615 ChompingIndicator
= scanBlockChompingIndicator();
1616 IndentIndicator
= scanBlockIndentationIndicator();
1617 // Check for the chomping indicator once again.
1618 if (ChompingIndicator
== ' ')
1619 ChompingIndicator
= scanBlockChompingIndicator();
1620 Current
= skip_while(&Scanner::skip_s_white
, Current
);
1623 if (Current
== End
) { // EOF, we have an empty scalar.
1625 T
.Kind
= Token::TK_BlockScalar
;
1626 T
.Range
= StringRef(Start
, Current
- Start
);
1627 TokenQueue
.push_back(T
);
1632 if (!consumeLineBreakIfPresent()) {
1633 setError("Expected a line break after block scalar header", Current
);
/// Determine the indentation of a block scalar that has no explicit
/// indentation indicator.
///
/// \param BlockIndent receives the Column of the first non-empty line.
/// \param BlockExitIndent a non-empty line at or below this column ends the
///        block scalar.
/// \param LineBreaks NOTE(review): presumably counts the empty lines skipped
///        while searching -- confirm.
/// \param IsDone NOTE(review): presumably set when the scalar ends before any
///        content is found -- confirm.
///
/// While searching, the longest line made up only of spaces is remembered. If
/// it is longer than the discovered indentation, an error is reported at that
/// line.
1639 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent
,
1640 unsigned BlockExitIndent
,
1641 unsigned &LineBreaks
, bool &IsDone
) {
1642 unsigned MaxAllSpaceLineCharacters
= 0;
1643 StringRef::iterator LongestAllSpaceLine
;
1646 advanceWhile(&Scanner::skip_s_space
);
1647 if (skip_nb_char(Current
) != Current
) {
1648 // This line isn't empty, so try and find the indentation.
1649 if (Column
<= BlockExitIndent
) { // End of the block literal.
1653 // We found the block's indentation.
1654 BlockIndent
= Column
;
1655 if (MaxAllSpaceLineCharacters
> BlockIndent
) {
1657 "Leading all-spaces line must be smaller than the block indent",
1658 LongestAllSpaceLine
);
1663 if (skip_b_break(Current
) != Current
&&
1664 Column
> MaxAllSpaceLineCharacters
) {
1665 // Record the longest all-space line in case it's longer than the
1666 // discovered block indent.
1667 MaxAllSpaceLineCharacters
= Column
;
1668 LongestAllSpaceLine
= Current
;
1672 if (Current
== End
) {
1677 if (!consumeLineBreakIfPresent()) {
/// Consume the indentation of one line inside a block scalar.
///
/// \param BlockIndent indentation column of the block scalar's content.
/// \param BlockExitIndent a line at or below this column ends the scalar.
/// \param IsDone NOTE(review): presumably set when the scalar ends on this
///        line -- confirm.
///
/// Spaces are skipped until Column reaches \p BlockIndent. A line that stays
/// left of \p BlockIndent is accepted only when it starts with '#' (a trailing
/// comment); otherwise "A text line is less indented than the block scalar" is
/// reported.
/// \returns true for a normal text line.
1686 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent
,
1687 unsigned BlockExitIndent
, bool &IsDone
) {
1688 // Skip the indentation.
1689 while (Column
< BlockIndent
) {
1690 auto I
= skip_s_space(Current
);
// Tests whether skip_nb_char makes no progress at Current.
1697 if (skip_nb_char(Current
) == Current
)
1700 if (Column
<= BlockExitIndent
) { // End of the block literal.
1705 if (Column
< BlockIndent
) {
1706 if (Current
!= End
&& *Current
== '#') { // Trailing comment.
1710 setError("A text line is less indented than the block scalar", Current
);
1713 return true; // A normal text line.
/// Scan a literal ('|') or folded ('>') block scalar and queue a
/// TK_BlockScalar token.
///
/// \param IsLiteral passed by the caller; the folding decision in the visible
///        code is taken from the scanned style indicator (IsFolded).
///
/// The header is scanned first. When it carries no explicit indentation
/// (BlockIndent == 0), the indentation is discovered with
/// findBlockScalarIndent. The body is then collected line by line into Str,
/// folding or preserving line breaks according to the style. Trailing breaks
/// are appended according to the chomping indicator via getChompedLineBreaks.
/// The token's Range covers the raw text after the header and its Value holds
/// the processed text.
1716 bool Scanner::scanBlockScalar(bool IsLiteral
) {
1717 assert(*Current
== '|' || *Current
== '>');
1718 char StyleIndicator
;
1719 char ChompingIndicator
;
1720 unsigned BlockIndent
;
1721 bool IsDone
= false;
1722 if (!scanBlockScalarIndicators(StyleIndicator
, ChompingIndicator
, BlockIndent
,
1727 bool IsFolded
= StyleIndicator
== '>';
1729 const auto *Start
= Current
;
// A negative Indent is treated as column 0 for the exit test.
1730 unsigned BlockExitIndent
= Indent
< 0 ? 0 : (unsigned)Indent
;
1731 unsigned LineBreaks
= 0;
1732 if (BlockIndent
== 0) {
1733 if (!findBlockScalarIndent(BlockIndent
, BlockExitIndent
, LineBreaks
,
1738 // Scan the block's scalars body.
1739 SmallString
<256> Str
;
1741 if (!scanBlockScalarIndent(BlockIndent
, BlockExitIndent
, IsDone
))
1746 // Parse the current line.
1747 auto LineStart
= Current
;
1748 advanceWhile(&Scanner::skip_nb_char
);
1749 if (LineStart
!= Current
) {
1750 if (LineBreaks
&& IsFolded
&& !Scanner::isLineEmpty(Str
)) {
1751 // The folded style "folds" any single line break between content into a
1752 // single space, except when that content is "empty" (only contains
1753 // whitespace) in which case the line break is left as-is.
1754 if (LineBreaks
== 1) {
1755 Str
.append(LineBreaks
,
1756 isLineEmpty(StringRef(LineStart
, Current
- LineStart
))
1760 // If we saw a single line break, we are completely replacing it and so
1761 // want `LineBreaks == 0`. Otherwise this decrement accounts for the
1762 // fact that the first line break is "trimmed", only being used to
1763 // signal a sequence of line breaks which should not be folded.
1766 Str
.append(LineBreaks
, '\n');
1767 Str
.append(StringRef(LineStart
, Current
- LineStart
));
1775 if (!consumeLineBreakIfPresent())
1780 if (Current
== End
&& !LineBreaks
)
1781 // Ensure that there is at least one line break before the end of file.
1783 Str
.append(getChompedLineBreaks(ChompingIndicator
, LineBreaks
, Str
), '\n');
1785 // New lines may start a simple key.
1787 IsSimpleKeyAllowed
= true;
1788 IsAdjacentValueAllowedInFlow
= false;
1791 T
.Kind
= Token::TK_BlockScalar
;
1792 T
.Range
= StringRef(Start
, Current
- Start
);
1793 T
.Value
= std::string(Str
);
1794 TokenQueue
.push_back(T
);
/// Scan a tag and queue a TK_Tag token for it.
///
/// Three shapes are distinguished at Current: an empty tag (end of input, a
/// blank or a break follows), a tag opening with '<', and anything else, which
/// is consumed with skip_ns_char without being parsed as a shorthand tag (see
/// the FIXME). The token Range starts at Start, and the token is registered as
/// a simple key candidate.
1798 bool Scanner::scanTag() {
1799 StringRef::iterator Start
= Current
;
1800 unsigned ColStart
= Column
;
1802 if (Current
== End
|| isBlankOrBreak(Current
)); // An empty tag.
1803 else if (*Current
== '<') {
1809 // FIXME: Actually parse the c-ns-shorthand-tag rule.
1810 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1814 T
.Kind
= Token::TK_Tag
;
1815 T
.Range
= StringRef(Start
, Current
- Start
);
1816 TokenQueue
.push_back(T
);
1818 // Tags can be simple keys.
1819 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1821 IsSimpleKeyAllowed
= false;
1822 IsAdjacentValueAllowedInFlow
= false;
/// Scan the next token(s) by dispatching on the character at Current.
///
/// The stream start is emitted first and the stream end is emitted when the
/// input is exhausted. Otherwise stale simple key candidates are removed, the
/// indentation is unrolled to the current Column, and the first matching rule
/// below selects the specialised scan routine. A character matching no rule is
/// reported as "Unrecognized character while tokenizing.".
1827 bool Scanner::fetchMoreTokens() {
1828 if (IsStartOfStream
)
1829 return scanStreamStart();
1834 return scanStreamEnd();
1836 removeStaleSimpleKeyCandidates();
1838 unrollIndent(Column
);
// Directives and document markers are only recognised at column 0.
1840 if (Column
== 0 && *Current
== '%')
1841 return scanDirective();
1843 if (Column
== 0 && Current
+ 4 <= End
1845 && *(Current
+ 1) == '-'
1846 && *(Current
+ 2) == '-'
1847 && (Current
+ 3 == End
|| isBlankOrBreak(Current
+ 3)))
1848 return scanDocumentIndicator(true);
1850 if (Column
== 0 && Current
+ 4 <= End
1852 && *(Current
+ 1) == '.'
1853 && *(Current
+ 2) == '.'
1854 && (Current
+ 3 == End
|| isBlankOrBreak(Current
+ 3)))
1855 return scanDocumentIndicator(false);
// Flow collection delimiters and separators.
1857 if (*Current
== '[')
1858 return scanFlowCollectionStart(true);
1860 if (*Current
== '{')
1861 return scanFlowCollectionStart(false);
1863 if (*Current
== ']')
1864 return scanFlowCollectionEnd(true);
1866 if (*Current
== '}')
1867 return scanFlowCollectionEnd(false);
1869 if (*Current
== ',')
1870 return scanFlowEntry();
// '-' and '?' are indicators only when followed by a blank, a break or the
// end of input.
1872 if (*Current
== '-' && (isBlankOrBreak(Current
+ 1) || Current
+ 1 == End
))
1873 return scanBlockEntry();
1875 if (*Current
== '?' && (Current
+ 1 == End
|| isBlankOrBreak(Current
+ 1)))
// ':' is an indicator when the next character is not a plain-safe non-blank,
// or when an adjacent value is allowed in flow context.
1878 if (*Current
== ':' &&
1879 (!isPlainSafeNonBlank(Current
+ 1) || IsAdjacentValueAllowedInFlow
))
1882 if (*Current
== '*')
1883 return scanAliasOrAnchor(true);
1885 if (*Current
== '&')
1886 return scanAliasOrAnchor(false);
1888 if (*Current
== '!')
// Block scalars are only recognised outside flow context.
1891 if (*Current
== '|' && !FlowLevel
)
1892 return scanBlockScalar(true);
1894 if (*Current
== '>' && !FlowLevel
)
1895 return scanBlockScalar(false);
1897 if (*Current
== '\'')
1898 return scanFlowScalar(false);
1900 if (*Current
== '"')
1901 return scanFlowScalar(true);
1903 // Get a plain scalar.
1904 StringRef
FirstChar(Current
, 1);
// A plain scalar may start with any non-blank character outside the indicator
// set, or with '?', ':' or '-' when a plain-safe non-blank follows.
1905 if ((!isBlankOrBreak(Current
) &&
1906 FirstChar
.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos
) ||
1907 (FirstChar
.find_first_of("?:-") != StringRef::npos
&&
1908 isPlainSafeNonBlank(Current
+ 1)))
1909 return scanPlainScalar();
1911 setError("Unrecognized character while tokenizing.", Current
);
/// Construct a stream over the text in \p Input.
///
/// All four arguments are forwarded unchanged to a newly allocated Scanner,
/// which is stored in the scanner member.
/// NOTE(review): \p EC presumably receives the scanner's error code -- confirm
/// against Scanner.
1915 Stream::Stream(StringRef Input
, SourceMgr
&SM
, bool ShowColors
,
1916 std::error_code
*EC
)
1917 : scanner(new Scanner(Input
, SM
, ShowColors
, EC
)) {}
/// Construct a stream over the contents of \p InputBuffer.
///
/// All four arguments are forwarded unchanged to a newly allocated Scanner,
/// which is stored in the scanner member.
1919 Stream::Stream(MemoryBufferRef InputBuffer
, SourceMgr
&SM
, bool ShowColors
,
1920 std::error_code
*EC
)
1921 : scanner(new Scanner(InputBuffer
, SM
, ShowColors
, EC
)) {}
// Defaulted destructor, defined out of line in this file.
// NOTE(review): presumably kept here because Scanner is only complete in this
// translation unit -- confirm.
1923 Stream::~Stream() = default;
1925 bool Stream::failed() { return scanner
->failed(); }
1927 void Stream::printError(Node
*N
, const Twine
&Msg
, SourceMgr::DiagKind Kind
) {
1928 printError(N
? N
->getSourceRange() : SMRange(), Msg
, Kind
);
1931 void Stream::printError(const SMRange
&Range
, const Twine
&Msg
,
1932 SourceMgr::DiagKind Kind
) {
1933 scanner
->printError(Range
.Start
, Kind
, Msg
, Range
);
/// Start iterating over the documents of the stream.
///
/// A stream can be iterated only once; the fatal error below enforces that.
/// NOTE(review): presumably triggered when CurrentDoc is already set --
/// confirm against the guard.
/// A fresh Document bound to this stream is installed in CurrentDoc and an
/// iterator referring to it is returned.
1936 document_iterator
Stream::begin() {
1938 report_fatal_error("Can only iterate over the stream once");
1940 // Skip Stream-Start.
1943 CurrentDoc
.reset(new Document(*this));
1944 return document_iterator(CurrentDoc
);
1947 document_iterator
Stream::end() {
1948 return document_iterator();
/// Walk over every document of the stream via begin()/end().
/// NOTE(review): presumably each document is skipped in the loop body --
/// confirm.
1951 void Stream::skip() {
1952 for (Document
&Doc
: *this)
/// Construct a node of kind \p Type belonging to document \p D.
///
/// \param Type stored as TypeID.
/// \param D reference to the owning document pointer, stored as Doc.
/// \param A stored as Anchor.
///
/// Tag is initialised from the remaining parameter T. The node's SourceRange
/// starts out empty, anchored at the beginning of the next token's Range.
1956 Node::Node(unsigned int Type
, std::unique_ptr
<Document
> &D
, StringRef A
,
1958 : Doc(D
), TypeID(Type
), Anchor(A
), Tag(T
) {
1959 SMLoc Start
= SMLoc::getFromPointer(peekNext().Range
.begin());
1960 SourceRange
= SMRange(Start
, Start
);
/// Resolve the node's tag to its full form.
///
/// For a raw tag that is neither empty nor the bare "!":
///  - a tag whose only '!' is the leading one is expanded with the prefix
///    registered for "!" in the document's tag map;
///  - a tag starting with "!!" is expanded with the prefix registered for "!!";
///  - otherwise the handle up to and including the last '!' is looked up in
///    the tag map. A handle found there supplies the prefix; the visible
///    "Unknown tag handle" error is presumably the not-found case (confirm).
///    The text after the last '!' is appended.
/// Nodes with no usable raw tag receive a default based on their kind, such as
/// "tag:yaml.org,2002:str" for block scalars.
1963 std::string
Node::getVerbatimTag() const {
1964 StringRef Raw
= getRawTag();
1965 if (!Raw
.empty() && Raw
!= "!") {
// find_last_of('!') == 0 means the leading '!' is the only one in the tag.
1967 if (Raw
.find_last_of('!') == 0) {
1968 Ret
= std::string(Doc
->getTagMap().find("!")->second
);
1969 Ret
+= Raw
.substr(1);
1971 } else if (Raw
.starts_with("!!")) {
1972 Ret
= std::string(Doc
->getTagMap().find("!!")->second
);
1973 Ret
+= Raw
.substr(2);
1976 StringRef TagHandle
= Raw
.substr(0, Raw
.find_last_of('!') + 1);
1977 std::map
<StringRef
, StringRef
>::const_iterator It
=
1978 Doc
->getTagMap().find(TagHandle
);
1979 if (It
!= Doc
->getTagMap().end())
1980 Ret
= std::string(It
->second
);
1983 T
.Kind
= Token::TK_Tag
;
1984 T
.Range
= TagHandle
;
1985 setError(Twine("Unknown tag handle ") + TagHandle
, T
);
1987 Ret
+= Raw
.substr(Raw
.find_last_of('!') + 1);
// No usable raw tag: fall back to a default chosen by node kind.
1992 switch (getType()) {
1994 return "tag:yaml.org,2002:null";
1996 case NK_BlockScalar
:
1997 // TODO: Tag resolution.
1998 return "tag:yaml.org,2002:str";
2000 return "tag:yaml.org,2002:map";
2002 return "tag:yaml.org,2002:seq";
2008 Token
&Node::peekNext() {
2009 return Doc
->peekNext();
2012 Token
Node::getNext() {
2013 return Doc
->getNext();
2016 Node
*Node::parseBlockNode() {
2017 return Doc
->parseBlockNode();
2020 BumpPtrAllocator
&Node::getAllocator() {
2021 return Doc
->NodeAllocator
;
2024 void Node::setError(const Twine
&Msg
, Token
&Tok
) const {
2025 Doc
->setError(Msg
, Tok
);
2028 bool Node::failed() const {
2029 return Doc
->failed();
2032 StringRef
ScalarNode::getValue(SmallVectorImpl
<char> &Storage
) const {
2033 if (Value
[0] == '"')
2034 return getDoubleQuotedValue(Value
, Storage
);
2035 if (Value
[0] == '\'')
2036 return getSingleQuotedValue(Value
, Storage
);
2037 return getPlainValue(Value
, Storage
);
2040 /// parseScalarValue - A common parsing routine for all flow scalar styles.
2041 /// It handles line break characters by itself, adds regular content characters
2042 /// to the result, and forwards escaped sequences to the provided routine for
2043 /// the style-specific processing.
2045 /// \param UnquotedValue - An input value without quotation marks.
2046 /// \param Storage - A storage for the result if the input value is multiline or
2047 /// contains escaped characters.
2048 /// \param LookupChars - A set of special characters to search in the input
2049 /// string. Should include line break characters and the escape character
2050 /// specific for the processing scalar style, if any.
2051 /// \param UnescapeCallback - This is called when the escape character is found
2053 /// \returns - The unfolded and unescaped value.
///
/// If no character from \p LookupChars occurs in the input, the input is
/// returned as-is and \p Storage is not used for the result.
2055 parseScalarValue(StringRef UnquotedValue
, SmallVectorImpl
<char> &Storage
,
2056 StringRef LookupChars
,
2057 std::function
<StringRef(StringRef
, SmallVectorImpl
<char> &)>
2059 size_t I
= UnquotedValue
.find_first_of(LookupChars
);
2060 if (I
== StringRef::npos
)
2061 return UnquotedValue
;
2064 Storage
.reserve(UnquotedValue
.size());
// Remembers how the previous line break was emitted (' ' or '\n'); '\0' means
// the last thing appended was not a line break.
2065 char LastNewLineAddedAs
= '\0';
2066 for (; I
!= StringRef::npos
; I
= UnquotedValue
.find_first_of(LookupChars
)) {
// Not a line break, so it is the style's escape character: copy the text
// before it and let the callback consume the escape sequence.
2067 if (UnquotedValue
[I
] != '\r' && UnquotedValue
[I
] != '\n') {
2068 llvm::append_range(Storage
, UnquotedValue
.take_front(I
));
2069 UnquotedValue
= UnescapeCallback(UnquotedValue
.drop_front(I
), Storage
);
2070 LastNewLineAddedAs
= '\0';
// Line break: copy the text before it without trailing blanks, then fold the
// break into a single space.
2073 if (size_t LastNonSWhite
= UnquotedValue
.find_last_not_of(" \t", I
);
2074 LastNonSWhite
!= StringRef::npos
) {
2075 llvm::append_range(Storage
, UnquotedValue
.take_front(LastNonSWhite
+ 1));
2076 Storage
.push_back(' ');
2077 LastNewLineAddedAs
= ' ';
2079 // Note: we can't just check if the last character in Storage is ' ',
2080 // '\n', or something else; that would give a wrong result for double
2081 // quoted values containing an escaped space character before a new-line
2083 switch (LastNewLineAddedAs
) {
2085 assert(!Storage
.empty() && Storage
.back() == ' ');
2086 Storage
.back() = '\n';
2087 LastNewLineAddedAs
= '\n';
2090 assert(!Storage
.empty() && Storage
.back() == '\n');
2091 Storage
.push_back('\n');
2094 Storage
.push_back(' ');
2095 LastNewLineAddedAs
= ' ';
2099 // Handle Windows-style EOL
2100 if (UnquotedValue
.substr(I
, 2) == "\r\n")
// Continue after the break, dropping the next line's leading blanks.
2102 UnquotedValue
= UnquotedValue
.drop_front(I
+ 1).ltrim(" \t");
2104 llvm::append_range(Storage
, UnquotedValue
);
2105 return StringRef(Storage
.begin(), Storage
.size());
/// Decode a double-quoted scalar.
///
/// \param RawValue the scalar including its surrounding '"' characters
///        (asserted).
/// \param Storage receives the decoded text when the input needs unescaping
///        or unfolding.
///
/// The quotes are removed and the remainder is passed to parseScalarValue with
/// '\\', '\r' and '\n' as lookup characters. UnescapeFunc is called at each
/// backslash; it appends the decoded character(s) to Storage and returns the
/// input that is left after the escape sequence. A lone trailing backslash or
/// an unknown escape character reports "Unrecognized escape code".
2109 ScalarNode::getDoubleQuotedValue(StringRef RawValue
,
2110 SmallVectorImpl
<char> &Storage
) const {
2111 assert(RawValue
.size() >= 2 && RawValue
.front() == '"' &&
2112 RawValue
.back() == '"');
2113 StringRef UnquotedValue
= RawValue
.substr(1, RawValue
.size() - 2);
2115 auto UnescapeFunc
= [this](StringRef UnquotedValue
,
2116 SmallVectorImpl
<char> &Storage
) {
2117 assert(UnquotedValue
.take_front(1) == "\\");
2118 if (UnquotedValue
.size() == 1) {
2120 T
.Range
= UnquotedValue
;
2121 setError("Unrecognized escape code", T
);
// Drop the backslash; the switch below dispatches on the escape character.
2125 UnquotedValue
= UnquotedValue
.drop_front(1);
2126 switch (UnquotedValue
[0]) {
2129 T
.Range
= UnquotedValue
.take_front(1);
2130 setError("Unrecognized escape code", T
);
2135 // Shrink the Windows-style EOL.
2136 if (UnquotedValue
.size() >= 2 && UnquotedValue
[1] == '\n')
2137 UnquotedValue
= UnquotedValue
.drop_front(1);
2140 return UnquotedValue
.drop_front(1).ltrim(" \t");
// Single-character escapes: each appends one fixed code point.
2142 Storage
.push_back(0x00);
2145 Storage
.push_back(0x07);
2148 Storage
.push_back(0x08);
2152 Storage
.push_back(0x09);
2155 Storage
.push_back(0x0A);
2158 Storage
.push_back(0x0B);
2161 Storage
.push_back(0x0C);
2164 Storage
.push_back(0x0D);
2167 Storage
.push_back(0x1B);
2170 Storage
.push_back(0x20);
2173 Storage
.push_back(0x22);
2176 Storage
.push_back(0x2F);
2179 Storage
.push_back(0x5C);
2182 encodeUTF8(0x85, Storage
);
2185 encodeUTF8(0xA0, Storage
);
2188 encodeUTF8(0x2028, Storage
);
2191 encodeUTF8(0x2029, Storage
);
// Numeric escape with 2 hex digits; unparsable digits yield U+FFFD.
2194 if (UnquotedValue
.size() < 3)
2195 // TODO: Report error.
2197 unsigned int UnicodeScalarValue
;
2198 if (UnquotedValue
.substr(1, 2).getAsInteger(16, UnicodeScalarValue
))
2199 // TODO: Report error.
2200 UnicodeScalarValue
= 0xFFFD;
2201 encodeUTF8(UnicodeScalarValue
, Storage
);
2202 return UnquotedValue
.drop_front(3);
// Numeric escape with 4 hex digits.
2205 if (UnquotedValue
.size() < 5)
2206 // TODO: Report error.
2208 unsigned int UnicodeScalarValue
;
2209 if (UnquotedValue
.substr(1, 4).getAsInteger(16, UnicodeScalarValue
))
2210 // TODO: Report error.
2211 UnicodeScalarValue
= 0xFFFD;
2212 encodeUTF8(UnicodeScalarValue
, Storage
);
2213 return UnquotedValue
.drop_front(5);
// Numeric escape with 8 hex digits.
2216 if (UnquotedValue
.size() < 9)
2217 // TODO: Report error.
2219 unsigned int UnicodeScalarValue
;
2220 if (UnquotedValue
.substr(1, 8).getAsInteger(16, UnicodeScalarValue
))
2221 // TODO: Report error.
2222 UnicodeScalarValue
= 0xFFFD;
2223 encodeUTF8(UnicodeScalarValue
, Storage
);
2224 return UnquotedValue
.drop_front(9);
2227 return UnquotedValue
.drop_front(1);
2230 return parseScalarValue(UnquotedValue
, Storage
, "\\\r\n", UnescapeFunc
);
2233 StringRef
ScalarNode::getSingleQuotedValue(StringRef RawValue
,
2234 SmallVectorImpl
<char> &Storage
) {
2235 assert(RawValue
.size() >= 2 && RawValue
.front() == '\'' &&
2236 RawValue
.back() == '\'');
2237 StringRef UnquotedValue
= RawValue
.substr(1, RawValue
.size() - 2);
2239 auto UnescapeFunc
= [](StringRef UnquotedValue
,
2240 SmallVectorImpl
<char> &Storage
) {
2241 assert(UnquotedValue
.take_front(2) == "''");
2242 Storage
.push_back('\'');
2243 return UnquotedValue
.drop_front(2);
2246 return parseScalarValue(UnquotedValue
, Storage
, "'\r\n", UnescapeFunc
);
2249 StringRef
ScalarNode::getPlainValue(StringRef RawValue
,
2250 SmallVectorImpl
<char> &Storage
) {
2251 // Trim trailing whitespace ('b-char' and 's-white').
2252 // NOTE: Alternatively we could change the scanner to not include whitespace
2253 // here in the first place.
2254 RawValue
= RawValue
.rtrim("\r\n \t");
2255 return parseScalarValue(RawValue
, Storage
, "\r\n", nullptr);
/// Return the key node of this key/value pair; the result is stored in Key.
///
/// A NullNode is created when the next token is a block end, a value
/// indicator or an error (implicit null key), or when such a token directly
/// follows a consumed TK_Key (explicit null key). Otherwise the key is parsed
/// with parseBlockNode().
2258 Node
*KeyValueNode::getKey() {
2261 // Handle implicit null keys.
2263 Token
&t
= peekNext();
2264 if ( t
.Kind
== Token::TK_BlockEnd
2265 || t
.Kind
== Token::TK_Value
2266 || t
.Kind
== Token::TK_Error
) {
2267 return Key
= new (getAllocator()) NullNode(Doc
);
2269 if (t
.Kind
== Token::TK_Key
)
2270 getNext(); // skip TK_Key.
2273 // Handle explicit null keys.
2274 Token
&t
= peekNext();
2275 if (t
.Kind
== Token::TK_BlockEnd
|| t
.Kind
== Token::TK_Value
) {
2276 return Key
= new (getAllocator()) NullNode(Doc
);
2279 // We've got a normal key.
2280 return Key
= parseBlockNode();
/// Return the value node of this key/value pair; the result is stored in
/// Value.
///
/// The key is obtained through getKey() first; a null key is reported as
/// "Null key in Key Value." and yields a NullNode value. A NullNode is also
/// produced for implicit null values (the next token is a block end, flow
/// mapping end, key, flow entry or error), and for explicit null values (a
/// block end or key directly follows the consumed TK_Value). A token other
/// than TK_Value at that point is reported as "Unexpected token in Key
/// Value.". Otherwise the value is parsed with parseBlockNode().
2283 Node
*KeyValueNode::getValue() {
2287 if (Node
* Key
= getKey())
2290 setError("Null key in Key Value.", peekNext());
2291 return Value
= new (getAllocator()) NullNode(Doc
);
2295 return Value
= new (getAllocator()) NullNode(Doc
);
2297 // Handle implicit null values.
2299 Token
&t
= peekNext();
2300 if ( t
.Kind
== Token::TK_BlockEnd
2301 || t
.Kind
== Token::TK_FlowMappingEnd
2302 || t
.Kind
== Token::TK_Key
2303 || t
.Kind
== Token::TK_FlowEntry
2304 || t
.Kind
== Token::TK_Error
) {
2305 return Value
= new (getAllocator()) NullNode(Doc
);
2308 if (t
.Kind
!= Token::TK_Value
) {
2309 setError("Unexpected token in Key Value.", t
);
2310 return Value
= new (getAllocator()) NullNode(Doc
);
2312 getNext(); // skip TK_Value.
2315 // Handle explicit null values.
2316 Token
&t
= peekNext();
2317 if (t
.Kind
== Token::TK_BlockEnd
|| t
.Kind
== Token::TK_Key
) {
2318 return Value
= new (getAllocator()) NullNode(Doc
);
2321 // We got a normal value.
2322 return Value
= parseBlockNode();
/// Advance the mapping to its next key/value entry.
///
/// The current entry is skipped first. A TK_Key or TK_Scalar token starts a
/// new KeyValueNode. Otherwise the handling depends on the mapping Type:
/// block mappings end at a block end, and the flow handling consumes flow
/// entries and ends at a flow mapping end. Any other token is reported as
/// unexpected. CurrentEntry is nullptr once iteration has ended.
2325 void MappingNode::increment() {
2328 CurrentEntry
= nullptr;
2332 CurrentEntry
->skip();
2333 if (Type
== MT_Inline
) {
2335 CurrentEntry
= nullptr;
2339 Token T
= peekNext();
2340 if (T
.Kind
== Token::TK_Key
|| T
.Kind
== Token::TK_Scalar
) {
2341 // KeyValueNode eats the TK_Key. That way it can detect null keys.
2342 CurrentEntry
= new (getAllocator()) KeyValueNode(Doc
);
2343 } else if (Type
== MT_Block
) {
2345 case Token::TK_BlockEnd
:
2348 CurrentEntry
= nullptr;
2351 setError("Unexpected token. Expected Key or Block End", T
);
2353 case Token::TK_Error
:
2355 CurrentEntry
= nullptr;
2359 case Token::TK_FlowEntry
:
2360 // Eat the flow entry and recurse.
2363 case Token::TK_FlowMappingEnd
:
2366 case Token::TK_Error
:
2367 // Set this to end iterator.
2369 CurrentEntry
= nullptr;
2372 setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2376 CurrentEntry
= nullptr;
/// Advance the sequence to its next entry.
///
/// The current entry is skipped first; what follows depends on SeqType:
///  - ST_Block: a TK_BlockEntry introduces the next entry and a TK_BlockEnd
///    ends the sequence; other tokens are reported as unexpected.
///  - ST_Indentless: a TK_BlockEntry introduces the next entry.
///  - ST_Flow: a TK_FlowEntry is consumed and remembered in
///    WasPreviousTokenFlowEntry; a TK_FlowSequenceEnd ends the sequence; a
///    stream end or document marker reports "Could not find closing ]!"; any
///    other token is parsed as an entry, but only if a flow entry preceded it
///    ("Expected , between entries!" otherwise).
/// CurrentEntry is nullptr once iteration has ended or an error occurred.
2381 void SequenceNode::increment() {
2384 CurrentEntry
= nullptr;
2388 CurrentEntry
->skip();
2389 Token T
= peekNext();
2390 if (SeqType
== ST_Block
) {
2392 case Token::TK_BlockEntry
:
2394 CurrentEntry
= parseBlockNode();
2395 if (!CurrentEntry
) { // An error occurred.
2397 CurrentEntry
= nullptr;
2400 case Token::TK_BlockEnd
:
2403 CurrentEntry
= nullptr;
2406 setError( "Unexpected token. Expected Block Entry or Block End."
2409 case Token::TK_Error
:
2411 CurrentEntry
= nullptr;
2413 } else if (SeqType
== ST_Indentless
) {
2415 case Token::TK_BlockEntry
:
2417 CurrentEntry
= parseBlockNode();
2418 if (!CurrentEntry
) { // An error occurred.
2420 CurrentEntry
= nullptr;
2424 case Token::TK_Error
:
2426 CurrentEntry
= nullptr;
2428 } else if (SeqType
== ST_Flow
) {
2430 case Token::TK_FlowEntry
:
2431 // Eat the flow entry and recurse.
2433 WasPreviousTokenFlowEntry
= true;
2435 case Token::TK_FlowSequenceEnd
:
2438 case Token::TK_Error
:
2439 // Set this to end iterator.
2441 CurrentEntry
= nullptr;
2443 case Token::TK_StreamEnd
:
2444 case Token::TK_DocumentEnd
:
2445 case Token::TK_DocumentStart
:
2446 setError("Could not find closing ]!", T
);
2447 // Set this to end iterator.
2449 CurrentEntry
= nullptr;
2452 if (!WasPreviousTokenFlowEntry
) {
2453 setError("Expected , between entries!", T
);
2455 CurrentEntry
= nullptr;
2458 // Otherwise it must be a flow entry.
2459 CurrentEntry
= parseBlockNode();
2460 if (!CurrentEntry
) {
2463 WasPreviousTokenFlowEntry
= false;
/// Construct a document reading from stream \p S; Root starts out null.
///
/// The tag map is seeded with default handles; "!!" maps to
/// "tag:yaml.org,2002:". Directives are parsed next; if any were found a
/// document start token is expected to follow. The next token is then peeked
/// to check for a document start.
/// NOTE(review): presumably the document start token is consumed when present
/// -- confirm.
2469 Document::Document(Stream
&S
) : stream(S
), Root(nullptr) {
2470 // Tag maps starts with two default mappings.
2472 TagMap
["!!"] = "tag:yaml.org,2002:";
2474 if (parseDirectives())
2475 expectToken(Token::TK_DocumentStart
);
2476 Token
&T
= peekNext();
2477 if (T
.Kind
== Token::TK_DocumentStart
)
/// Finish parsing the current document and look at what follows it.
///
/// Checks, in order: whether the scanner has failed, whether a root node is
/// available (calling getRoot() if Root is not yet set), and whether the next
/// token is a stream end or a document end.
/// NOTE(review): the return value for each case is not documented here --
/// confirm against the individual branches.
2481 bool Document::skip() {
2482 if (stream
.scanner
->failed())
2484 if (!Root
&& !getRoot())
2487 Token
&T
= peekNext();
2488 if (T
.Kind
== Token::TK_StreamEnd
)
2490 if (T
.Kind
== Token::TK_DocumentEnd
) {
2497 Token
&Document::peekNext() {
2498 return stream
.scanner
->peekNext();
2501 Token
Document::getNext() {
2502 return stream
.scanner
->getNext();
2505 void Document::setError(const Twine
&Message
, Token
&Location
) const {
2506 stream
.scanner
->setError(Message
, Location
.Range
.begin());
2509 bool Document::failed() const {
2510 return stream
.scanner
->failed();
/// Parse one node starting at the next token and return it, allocated from
/// NodeAllocator.
///
/// Node properties are collected first: at most one anchor and at most one
/// tag may precede a node, and a duplicate of either is reported as an error.
/// An alias token yields an AliasNode named by the token text after its first
/// character. The token that follows the properties selects the node class:
/// sequences (indentless, block or flow), mappings (block, flow or inline),
/// scalars, block scalars, or a NullNode where a document/stream boundary is
/// reached. Nodes built from AnchorInfo receive its text with the first
/// character removed.
2513 Node
*Document::parseBlockNode() {
2514 Token T
= peekNext();
2515 // Handle properties.
2520 case Token::TK_Alias
:
2522 return new (NodeAllocator
) AliasNode(stream
.CurrentDoc
, T
.Range
.substr(1));
2523 case Token::TK_Anchor
:
2524 if (AnchorInfo
.Kind
== Token::TK_Anchor
) {
2525 setError("Already encountered an anchor for this node!", T
);
2528 AnchorInfo
= getNext(); // Consume TK_Anchor.
2530 goto parse_property
;
2532 if (TagInfo
.Kind
== Token::TK_Tag
) {
2533 setError("Already encountered a tag for this node!", T
);
2536 TagInfo
= getNext(); // Consume TK_Tag.
2538 goto parse_property
;
2544 case Token::TK_BlockEntry
:
2545 // We got an unindented BlockEntry sequence. This is not terminated with
2547 // Don't eat the TK_BlockEntry, SequenceNode needs it.
2548 return new (NodeAllocator
) SequenceNode( stream
.CurrentDoc
2549 , AnchorInfo
.Range
.substr(1)
2551 , SequenceNode::ST_Indentless
);
2552 case Token::TK_BlockSequenceStart
:
2554 return new (NodeAllocator
)
2555 SequenceNode( stream
.CurrentDoc
2556 , AnchorInfo
.Range
.substr(1)
2558 , SequenceNode::ST_Block
);
2559 case Token::TK_BlockMappingStart
:
2561 return new (NodeAllocator
)
2562 MappingNode( stream
.CurrentDoc
2563 , AnchorInfo
.Range
.substr(1)
2565 , MappingNode::MT_Block
);
2566 case Token::TK_FlowSequenceStart
:
2568 return new (NodeAllocator
)
2569 SequenceNode( stream
.CurrentDoc
2570 , AnchorInfo
.Range
.substr(1)
2572 , SequenceNode::ST_Flow
);
2573 case Token::TK_FlowMappingStart
:
2575 return new (NodeAllocator
)
2576 MappingNode( stream
.CurrentDoc
2577 , AnchorInfo
.Range
.substr(1)
2579 , MappingNode::MT_Flow
);
2580 case Token::TK_Scalar
:
2582 return new (NodeAllocator
)
2583 ScalarNode( stream
.CurrentDoc
2584 , AnchorInfo
.Range
.substr(1)
2587 case Token::TK_BlockScalar
: {
// Copy the token's Value, including its terminating NUL, into NodeAllocator,
// then drop the NUL from the view handed to the node.
2589 StringRef
NullTerminatedStr(T
.Value
.c_str(), T
.Value
.length() + 1);
2590 StringRef StrCopy
= NullTerminatedStr
.copy(NodeAllocator
).drop_back();
2591 return new (NodeAllocator
)
2592 BlockScalarNode(stream
.CurrentDoc
, AnchorInfo
.Range
.substr(1),
2593 TagInfo
.Range
, StrCopy
, T
.Range
);
2596 // Don't eat the TK_Key, KeyValueNode expects it.
2597 return new (NodeAllocator
)
2598 MappingNode( stream
.CurrentDoc
2599 , AnchorInfo
.Range
.substr(1)
2601 , MappingNode::MT_Inline
);
2602 case Token::TK_DocumentStart
:
2603 case Token::TK_DocumentEnd
:
2604 case Token::TK_StreamEnd
:
2606 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2608 return new (NodeAllocator
) NullNode(stream
.CurrentDoc
);
2609 case Token::TK_FlowMappingEnd
:
2610 case Token::TK_FlowSequenceEnd
:
2611 case Token::TK_FlowEntry
: {
// Inside a mapping or sequence root these tokens stand for an omitted (null)
// node; anywhere else they are unexpected.
2612 if (Root
&& (isa
<MappingNode
>(Root
) || isa
<SequenceNode
>(Root
)))
2613 return new (NodeAllocator
) NullNode(stream
.CurrentDoc
);
2615 setError("Unexpected token", T
);
2618 case Token::TK_Error
:
2621 llvm_unreachable("Control flow shouldn't reach here.");
/// Consume the directives at the head of the document.
///
/// A TK_TagDirective is handled by parseTAGDirective() and a
/// TK_VersionDirective by parseYAMLDirective().
/// NOTE(review): the result presumably reports whether any directive was seen
/// (isDirective) -- confirm.
2625 bool Document::parseDirectives() {
2626 bool isDirective
= false;
2628 Token T
= peekNext();
2629 if (T
.Kind
== Token::TK_TagDirective
) {
2630 parseTAGDirective();
2632 } else if (T
.Kind
== Token::TK_VersionDirective
) {
2633 parseYAMLDirective();
2641 void Document::parseYAMLDirective() {
2642 getNext(); // Eat %YAML <version>
2645 void Document::parseTAGDirective() {
2646 Token Tag
= getNext(); // %TAG <handle> <prefix>
2647 StringRef T
= Tag
.Range
;
2649 T
= T
.substr(T
.find_first_of(" \t")).ltrim(" \t");
2650 std::size_t HandleEnd
= T
.find_first_of(" \t");
2651 StringRef TagHandle
= T
.substr(0, HandleEnd
);
2652 StringRef TagPrefix
= T
.substr(HandleEnd
).ltrim(" \t");
2653 TagMap
[TagHandle
] = TagPrefix
;
2656 bool Document::expectToken(int TK
) {
2657 Token T
= getNext();
2659 setError("Unexpected token", T
);