1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements a YAML parser.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/Support/YAMLParser.h"
14 #include "llvm/ADT/AllocatorList.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallString.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/StringExtras.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Support/Compiler.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/MemoryBuffer.h"
25 #include "llvm/Support/SMLoc.h"
26 #include "llvm/Support/SourceMgr.h"
27 #include "llvm/Support/Unicode.h"
28 #include "llvm/Support/raw_ostream.h"
35 #include <system_error>
41 enum UnicodeEncodingForm
{
42 UEF_UTF32_LE
, ///< UTF-32 Little Endian
43 UEF_UTF32_BE
, ///< UTF-32 Big Endian
44 UEF_UTF16_LE
, ///< UTF-16 Little Endian
45 UEF_UTF16_BE
, ///< UTF-16 Big Endian
46 UEF_UTF8
, ///< UTF-8 or ascii.
47 UEF_Unknown
///< Not a valid Unicode encoding.
50 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
51 /// it exists. Length is in {0, 2, 3, 4}.
52 using EncodingInfo
= std::pair
<UnicodeEncodingForm
, unsigned>;
54 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
55 /// encoding form of \a Input.
57 /// @param Input A string of length 0 or more.
58 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
59 /// and how long the byte order mark is if one exists.
60 static EncodingInfo
getUnicodeEncoding(StringRef Input
) {
62 return std::make_pair(UEF_Unknown
, 0);
64 switch (uint8_t(Input
[0])) {
66 if (Input
.size() >= 4) {
68 && uint8_t(Input
[2]) == 0xFE
69 && uint8_t(Input
[3]) == 0xFF)
70 return std::make_pair(UEF_UTF32_BE
, 4);
71 if (Input
[1] == 0 && Input
[2] == 0 && Input
[3] != 0)
72 return std::make_pair(UEF_UTF32_BE
, 0);
75 if (Input
.size() >= 2 && Input
[1] != 0)
76 return std::make_pair(UEF_UTF16_BE
, 0);
77 return std::make_pair(UEF_Unknown
, 0);
79 if ( Input
.size() >= 4
80 && uint8_t(Input
[1]) == 0xFE
83 return std::make_pair(UEF_UTF32_LE
, 4);
85 if (Input
.size() >= 2 && uint8_t(Input
[1]) == 0xFE)
86 return std::make_pair(UEF_UTF16_LE
, 2);
87 return std::make_pair(UEF_Unknown
, 0);
89 if (Input
.size() >= 2 && uint8_t(Input
[1]) == 0xFF)
90 return std::make_pair(UEF_UTF16_BE
, 2);
91 return std::make_pair(UEF_Unknown
, 0);
93 if ( Input
.size() >= 3
94 && uint8_t(Input
[1]) == 0xBB
95 && uint8_t(Input
[2]) == 0xBF)
96 return std::make_pair(UEF_UTF8
, 3);
97 return std::make_pair(UEF_Unknown
, 0);
100 // It could still be utf-32 or utf-16.
101 if (Input
.size() >= 4 && Input
[1] == 0 && Input
[2] == 0 && Input
[3] == 0)
102 return std::make_pair(UEF_UTF32_LE
, 0);
104 if (Input
.size() >= 2 && Input
[1] == 0)
105 return std::make_pair(UEF_UTF16_LE
, 0);
107 return std::make_pair(UEF_UTF8
, 0);
110 /// Pin the vtables to this file.
111 void Node::anchor() {}
112 void NullNode::anchor() {}
113 void ScalarNode::anchor() {}
114 void BlockScalarNode::anchor() {}
115 void KeyValueNode::anchor() {}
116 void MappingNode::anchor() {}
117 void SequenceNode::anchor() {}
118 void AliasNode::anchor() {}
123 /// Token - A single YAML token.
126 TK_Error
, // Uninitialized token.
135 TK_BlockSequenceStart
,
136 TK_BlockMappingStart
,
138 TK_FlowSequenceStart
,
151 /// A string of length 0 or more whose begin() points to the logical location
152 /// of the token in the input.
155 /// The value of a block scalar node.
161 } // end namespace yaml
162 } // end namespace llvm
164 using TokenQueueT
= BumpPtrList
<Token
>;
168 /// This struct is used to track simple keys.
170 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
171 /// which could legally be the start of a simple key. When peekNext is called,
172 /// if the Token to be returned is referenced by a SimpleKey, we continue
173 /// tokenizing until that potential simple key has either been found not to be
174 /// a simple key (we moved on to the next line or went further than 1024 chars),
175 /// or we run into a Value, at which point we insert a Key token (and possibly
176 /// others) before the SimpleKey's Tok.
178 TokenQueueT::iterator Tok
;
181 unsigned FlowLevel
= 0;
182 bool IsRequired
= false;
184 bool operator ==(const SimpleKey
&Other
) {
185 return Tok
== Other
.Tok
;
189 } // end anonymous namespace
191 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit
192 /// subsequence and the subsequence's length in code units (uint8_t).
193 /// A length of 0 represents an error.
194 using UTF8Decoded
= std::pair
<uint32_t, unsigned>;
196 static UTF8Decoded
decodeUTF8(StringRef Range
) {
197 StringRef::iterator Position
= Range
.begin();
198 StringRef::iterator End
= Range
.end();
199 // 1 byte: [0x00, 0x7f]
200 // Bit pattern: 0xxxxxxx
201 if (Position
< End
&& (*Position
& 0x80) == 0) {
202 return std::make_pair(*Position
, 1);
204 // 2 bytes: [0x80, 0x7ff]
205 // Bit pattern: 110xxxxx 10xxxxxx
206 if (Position
+ 1 < End
&& ((*Position
& 0xE0) == 0xC0) &&
207 ((*(Position
+ 1) & 0xC0) == 0x80)) {
208 uint32_t codepoint
= ((*Position
& 0x1F) << 6) |
209 (*(Position
+ 1) & 0x3F);
210 if (codepoint
>= 0x80)
211 return std::make_pair(codepoint
, 2);
213 // 3 bytes: [0x8000, 0xffff]
214 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
215 if (Position
+ 2 < End
&& ((*Position
& 0xF0) == 0xE0) &&
216 ((*(Position
+ 1) & 0xC0) == 0x80) &&
217 ((*(Position
+ 2) & 0xC0) == 0x80)) {
218 uint32_t codepoint
= ((*Position
& 0x0F) << 12) |
219 ((*(Position
+ 1) & 0x3F) << 6) |
220 (*(Position
+ 2) & 0x3F);
221 // Codepoints between 0xD800 and 0xDFFF are invalid, as
222 // they are high / low surrogate halves used by UTF-16.
223 if (codepoint
>= 0x800 &&
224 (codepoint
< 0xD800 || codepoint
> 0xDFFF))
225 return std::make_pair(codepoint
, 3);
227 // 4 bytes: [0x10000, 0x10FFFF]
228 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
229 if (Position
+ 3 < End
&& ((*Position
& 0xF8) == 0xF0) &&
230 ((*(Position
+ 1) & 0xC0) == 0x80) &&
231 ((*(Position
+ 2) & 0xC0) == 0x80) &&
232 ((*(Position
+ 3) & 0xC0) == 0x80)) {
233 uint32_t codepoint
= ((*Position
& 0x07) << 18) |
234 ((*(Position
+ 1) & 0x3F) << 12) |
235 ((*(Position
+ 2) & 0x3F) << 6) |
236 (*(Position
+ 3) & 0x3F);
237 if (codepoint
>= 0x10000 && codepoint
<= 0x10FFFF)
238 return std::make_pair(codepoint
, 4);
240 return std::make_pair(0, 0);
246 /// Scans YAML tokens from a MemoryBuffer.
249 Scanner(StringRef Input
, SourceMgr
&SM
, bool ShowColors
= true,
250 std::error_code
*EC
= nullptr);
251 Scanner(MemoryBufferRef Buffer
, SourceMgr
&SM_
, bool ShowColors
= true,
252 std::error_code
*EC
= nullptr);
254 /// Parse the next token and return it without popping it.
257 /// Parse the next token and pop it from the queue.
260 void printError(SMLoc Loc
, SourceMgr::DiagKind Kind
, const Twine
&Message
,
261 ArrayRef
<SMRange
> Ranges
= {}) {
262 SM
.PrintMessage(Loc
, Kind
, Message
, Ranges
, /* FixIts= */ {}, ShowColors
);
265 void setError(const Twine
&Message
, StringRef::iterator Position
) {
269 // propagate the error if possible
271 *EC
= make_error_code(std::errc::invalid_argument
);
273 // Don't print out more errors after the first one we encounter. The rest
274 // are just the result of the first, and have no meaning.
276 printError(SMLoc::getFromPointer(Position
), SourceMgr::DK_Error
, Message
);
280 /// Returns true if an error occurred while parsing.
286 void init(MemoryBufferRef Buffer
);
288 StringRef
currentInput() {
289 return StringRef(Current
, End
- Current
);
292 /// Decode a UTF-8 minimal well-formed code unit subsequence starting
295 /// If the UTF-8 code units starting at Position do not form a well-formed
296 /// code unit subsequence, then the Unicode scalar value is 0, and the length
298 UTF8Decoded
decodeUTF8(StringRef::iterator Position
) {
299 return ::decodeUTF8(StringRef(Position
, End
- Position
));
302 // The following functions are based on the grammar rules in the YAML spec. The
303 // style of the function names is meant to closely match how they are written
304 // in the spec. The number within the [] is the number of the grammar rule in
307 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
310 // A production starting and ending with a special character.
312 // A production matching a single line break.
314 // A production starting and ending with a non-break character.
316 // A production starting and ending with a white space character.
318 // A production starting and ending with a non-space character.
320 // A production matching complete line(s).
322 /// Skip a single nb-char[27] starting at Position.
324 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
325 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
327 /// @returns The code unit after the nb-char, or Position if it's not an
329 StringRef::iterator
skip_nb_char(StringRef::iterator Position
);
331 /// Skip a single b-break[28] starting at Position.
333 /// A b-break is 0xD 0xA | 0xD | 0xA
335 /// @returns The code unit after the b-break, or Position if it's not a
337 StringRef::iterator
skip_b_break(StringRef::iterator Position
);
339 /// Skip a single s-space[31] starting at Position.
341 /// An s-space is 0x20
343 /// @returns The code unit after the s-space, or Position if it's not a
345 StringRef::iterator
skip_s_space(StringRef::iterator Position
);
347 /// Skip a single s-white[33] starting at Position.
349 /// A s-white is 0x20 | 0x9
351 /// @returns The code unit after the s-white, or Position if it's not a
353 StringRef::iterator
skip_s_white(StringRef::iterator Position
);
355 /// Skip a single ns-char[34] starting at Position.
357 /// A ns-char is nb-char - s-white
359 /// @returns The code unit after the ns-char, or Position if it's not a
361 StringRef::iterator
skip_ns_char(StringRef::iterator Position
);
363 using SkipWhileFunc
= StringRef::iterator (Scanner::*)(StringRef::iterator
);
365 /// Skip minimal well-formed code unit subsequences until Func
366 /// returns its input.
368 /// @returns The code unit after the last minimal well-formed code unit
369 /// subsequence that Func accepted.
370 StringRef::iterator
skip_while( SkipWhileFunc Func
371 , StringRef::iterator Position
);
373 /// Skip minimal well-formed code unit subsequences until Func returns its
375 void advanceWhile(SkipWhileFunc Func
);
377 /// Scan ns-uri-char[39]s starting at Cur.
379 /// This updates Cur and Column while scanning.
380 void scan_ns_uri_char();
382 /// Consume a minimal well-formed code unit subsequence starting at
383 /// \a Cur. Return false if it is not the same Unicode scalar value as
384 /// \a Expected. This updates \a Column.
385 bool consume(uint32_t Expected
);
387 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
388 void skip(uint32_t Distance
);
390 /// Return true if the minimal well-formed code unit subsequence at
391 /// Pos is whitespace or a new line
392 bool isBlankOrBreak(StringRef::iterator Position
);
394 /// Return true if the minimal well-formed code unit subsequence at
395 /// Pos is considered a "safe" character for plain scalars.
396 bool isPlainSafeNonBlank(StringRef::iterator Position
);
398 /// Return true if the line is a line break, false otherwise.
399 bool isLineEmpty(StringRef Line
);
401 /// Consume a single b-break[28] if it's present at the current position.
403 /// Return false if the code unit at the current position isn't a line break.
404 bool consumeLineBreakIfPresent();
406 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
407 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
411 /// Remove simple keys that can no longer be valid simple keys.
413 /// Invalid simple keys are not on the current line or are further than 1024
415 void removeStaleSimpleKeyCandidates();
417 /// Remove all simple keys on FlowLevel \a Level.
418 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level
);
420 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
421 /// tokens if needed.
422 bool unrollIndent(int ToColumn
);
424 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
426 bool rollIndent( int ToColumn
427 , Token::TokenKind Kind
428 , TokenQueueT::iterator InsertPoint
);
430 /// Skip a single-line comment when the comment starts at the current
431 /// position of the scanner.
434 /// Skip whitespace and comments until the start of the next token.
435 void scanToNextToken();
437 /// Must be the first token generated.
438 bool scanStreamStart();
440 /// Generate tokens needed to close out the stream.
441 bool scanStreamEnd();
443 /// Scan a %BLAH directive.
444 bool scanDirective();
446 /// Scan a ... or ---.
447 bool scanDocumentIndicator(bool IsStart
);
449 /// Scan a [ or { and generate the proper flow collection start token.
450 bool scanFlowCollectionStart(bool IsSequence
);
452 /// Scan a ] or } and generate the proper flow collection end token.
453 bool scanFlowCollectionEnd(bool IsSequence
);
455 /// Scan the , that separates entries in a flow collection.
456 bool scanFlowEntry();
458 /// Scan the - that starts block sequence entries.
459 bool scanBlockEntry();
461 /// Scan an explicit ? indicating a key.
464 /// Scan an explicit : indicating a value.
467 /// Scan a quoted scalar.
468 bool scanFlowScalar(bool IsDoubleQuoted
);
470 /// Scan an unquoted scalar.
471 bool scanPlainScalar();
473 /// Scan an Alias or Anchor starting with * or &.
474 bool scanAliasOrAnchor(bool IsAlias
);
476 /// Scan a block scalar starting with | or >.
477 bool scanBlockScalar(bool IsLiteral
);
479 /// Scan a block scalar style indicator and header.
481 /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that
482 /// YAML does not consider the style indicator to be a part of the header.
484 /// Return false if an error occurred.
485 bool scanBlockScalarIndicators(char &StyleIndicator
, char &ChompingIndicator
,
486 unsigned &IndentIndicator
, bool &IsDone
);
488 /// Scan a style indicator in a block scalar header.
489 char scanBlockStyleIndicator();
491 /// Scan a chomping indicator in a block scalar header.
492 char scanBlockChompingIndicator();
494 /// Scan an indentation indicator in a block scalar header.
495 unsigned scanBlockIndentationIndicator();
497 /// Scan a block scalar header.
499 /// Return false if an error occurred.
500 bool scanBlockScalarHeader(char &ChompingIndicator
, unsigned &IndentIndicator
,
503 /// Look for the indentation level of a block scalar.
505 /// Return false if an error occurred.
506 bool findBlockScalarIndent(unsigned &BlockIndent
, unsigned BlockExitIndent
,
507 unsigned &LineBreaks
, bool &IsDone
);
509 /// Scan the indentation of a text line in a block scalar.
511 /// Return false if an error occurred.
512 bool scanBlockScalarIndent(unsigned BlockIndent
, unsigned BlockExitIndent
,
515 /// Scan a tag of the form !stuff.
518 /// Dispatch to the next scanning function based on \a *Cur.
519 bool fetchMoreTokens();
521 /// The SourceMgr used for diagnostics and buffer management.
524 /// The original input.
525 MemoryBufferRef InputBuffer
;
527 /// The current position of the scanner.
528 StringRef::iterator Current
;
530 /// The end of the input (one past the last character).
531 StringRef::iterator End
;
533 /// Current YAML indentation level in spaces.
536 /// Current column number in Unicode code points.
539 /// Current line number.
542 /// How deep we are in flow style containers. 0 Means at block level.
545 /// Are we at the start of the stream?
546 bool IsStartOfStream
;
548 /// Can the next token be the start of a simple key?
549 bool IsSimpleKeyAllowed
;
551 /// Can the next token be a value indicator even if it does not have a
553 bool IsAdjacentValueAllowedInFlow
;
555 /// True if an error has occurred.
558 /// Should colors be used when printing out the diagnostic messages?
561 /// Queue of tokens. This is required to queue up tokens while looking
562 /// for the end of a simple key. And for cases where a single character
563 /// can produce multiple tokens (e.g. BlockEnd).
564 TokenQueueT TokenQueue
;
566 /// Indentation levels.
567 SmallVector
<int, 4> Indents
;
569 /// Potential simple keys.
570 SmallVector
<SimpleKey
, 4> SimpleKeys
;
575 } // end namespace yaml
576 } // end namespace llvm
578 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
579 static void encodeUTF8( uint32_t UnicodeScalarValue
580 , SmallVectorImpl
<char> &Result
) {
581 if (UnicodeScalarValue
<= 0x7F) {
582 Result
.push_back(UnicodeScalarValue
& 0x7F);
583 } else if (UnicodeScalarValue
<= 0x7FF) {
584 uint8_t FirstByte
= 0xC0 | ((UnicodeScalarValue
& 0x7C0) >> 6);
585 uint8_t SecondByte
= 0x80 | (UnicodeScalarValue
& 0x3F);
586 Result
.push_back(FirstByte
);
587 Result
.push_back(SecondByte
);
588 } else if (UnicodeScalarValue
<= 0xFFFF) {
589 uint8_t FirstByte
= 0xE0 | ((UnicodeScalarValue
& 0xF000) >> 12);
590 uint8_t SecondByte
= 0x80 | ((UnicodeScalarValue
& 0xFC0) >> 6);
591 uint8_t ThirdByte
= 0x80 | (UnicodeScalarValue
& 0x3F);
592 Result
.push_back(FirstByte
);
593 Result
.push_back(SecondByte
);
594 Result
.push_back(ThirdByte
);
595 } else if (UnicodeScalarValue
<= 0x10FFFF) {
596 uint8_t FirstByte
= 0xF0 | ((UnicodeScalarValue
& 0x1F0000) >> 18);
597 uint8_t SecondByte
= 0x80 | ((UnicodeScalarValue
& 0x3F000) >> 12);
598 uint8_t ThirdByte
= 0x80 | ((UnicodeScalarValue
& 0xFC0) >> 6);
599 uint8_t FourthByte
= 0x80 | (UnicodeScalarValue
& 0x3F);
600 Result
.push_back(FirstByte
);
601 Result
.push_back(SecondByte
);
602 Result
.push_back(ThirdByte
);
603 Result
.push_back(FourthByte
);
607 bool yaml::dumpTokens(StringRef Input
, raw_ostream
&OS
) {
609 Scanner
scanner(Input
, SM
);
611 Token T
= scanner
.getNext();
613 case Token::TK_StreamStart
:
614 OS
<< "Stream-Start: ";
616 case Token::TK_StreamEnd
:
617 OS
<< "Stream-End: ";
619 case Token::TK_VersionDirective
:
620 OS
<< "Version-Directive: ";
622 case Token::TK_TagDirective
:
623 OS
<< "Tag-Directive: ";
625 case Token::TK_DocumentStart
:
626 OS
<< "Document-Start: ";
628 case Token::TK_DocumentEnd
:
629 OS
<< "Document-End: ";
631 case Token::TK_BlockEntry
:
632 OS
<< "Block-Entry: ";
634 case Token::TK_BlockEnd
:
637 case Token::TK_BlockSequenceStart
:
638 OS
<< "Block-Sequence-Start: ";
640 case Token::TK_BlockMappingStart
:
641 OS
<< "Block-Mapping-Start: ";
643 case Token::TK_FlowEntry
:
644 OS
<< "Flow-Entry: ";
646 case Token::TK_FlowSequenceStart
:
647 OS
<< "Flow-Sequence-Start: ";
649 case Token::TK_FlowSequenceEnd
:
650 OS
<< "Flow-Sequence-End: ";
652 case Token::TK_FlowMappingStart
:
653 OS
<< "Flow-Mapping-Start: ";
655 case Token::TK_FlowMappingEnd
:
656 OS
<< "Flow-Mapping-End: ";
661 case Token::TK_Value
:
664 case Token::TK_Scalar
:
667 case Token::TK_BlockScalar
:
668 OS
<< "Block Scalar: ";
670 case Token::TK_Alias
:
673 case Token::TK_Anchor
:
679 case Token::TK_Error
:
682 OS
<< T
.Range
<< "\n";
683 if (T
.Kind
== Token::TK_StreamEnd
)
685 else if (T
.Kind
== Token::TK_Error
)
691 bool yaml::scanTokens(StringRef Input
) {
693 Scanner
scanner(Input
, SM
);
695 Token T
= scanner
.getNext();
696 if (T
.Kind
== Token::TK_StreamEnd
)
698 else if (T
.Kind
== Token::TK_Error
)
704 std::string
yaml::escape(StringRef Input
, bool EscapePrintable
) {
705 std::string EscapedInput
;
706 for (StringRef::iterator i
= Input
.begin(), e
= Input
.end(); i
!= e
; ++i
) {
708 EscapedInput
+= "\\\\";
710 EscapedInput
+= "\\\"";
712 EscapedInput
+= "\\0";
714 EscapedInput
+= "\\a";
716 EscapedInput
+= "\\b";
718 EscapedInput
+= "\\t";
720 EscapedInput
+= "\\n";
722 EscapedInput
+= "\\v";
724 EscapedInput
+= "\\f";
726 EscapedInput
+= "\\r";
728 EscapedInput
+= "\\e";
729 else if ((unsigned char)*i
< 0x20) { // Control characters not handled above.
730 std::string HexStr
= utohexstr(*i
);
731 EscapedInput
+= "\\x" + std::string(2 - HexStr
.size(), '0') + HexStr
;
732 } else if (*i
& 0x80) { // UTF-8 multiple code unit subsequence.
733 UTF8Decoded UnicodeScalarValue
734 = decodeUTF8(StringRef(i
, Input
.end() - i
));
735 if (UnicodeScalarValue
.second
== 0) {
736 // Found invalid char.
738 encodeUTF8(0xFFFD, Val
);
739 llvm::append_range(EscapedInput
, Val
);
740 // FIXME: Error reporting.
743 if (UnicodeScalarValue
.first
== 0x85)
744 EscapedInput
+= "\\N";
745 else if (UnicodeScalarValue
.first
== 0xA0)
746 EscapedInput
+= "\\_";
747 else if (UnicodeScalarValue
.first
== 0x2028)
748 EscapedInput
+= "\\L";
749 else if (UnicodeScalarValue
.first
== 0x2029)
750 EscapedInput
+= "\\P";
751 else if (!EscapePrintable
&&
752 sys::unicode::isPrintable(UnicodeScalarValue
.first
))
753 EscapedInput
+= StringRef(i
, UnicodeScalarValue
.second
);
755 std::string HexStr
= utohexstr(UnicodeScalarValue
.first
);
756 if (HexStr
.size() <= 2)
757 EscapedInput
+= "\\x" + std::string(2 - HexStr
.size(), '0') + HexStr
;
758 else if (HexStr
.size() <= 4)
759 EscapedInput
+= "\\u" + std::string(4 - HexStr
.size(), '0') + HexStr
;
760 else if (HexStr
.size() <= 8)
761 EscapedInput
+= "\\U" + std::string(8 - HexStr
.size(), '0') + HexStr
;
763 i
+= UnicodeScalarValue
.second
- 1;
765 EscapedInput
.push_back(*i
);
770 std::optional
<bool> yaml::parseBool(StringRef S
) {
786 if (S
[1] == 'N') // ON
790 if (S
[1] == 'n') //[Oo]n
794 if (S
[1] == 'O') // NO
798 if (S
[1] == 'o') //[Nn]o
807 if (S
.drop_front() == "FF") // OFF
811 if (S
.drop_front() == "ff") //[Oo]ff
815 if (S
.drop_front() == "ES") // YES
819 if (S
.drop_front() == "es") //[Yy]es
828 if (S
.drop_front() == "RUE") // TRUE
832 if (S
.drop_front() == "rue") //[Tt]rue
841 if (S
.drop_front() == "ALSE") // FALSE
845 if (S
.drop_front() == "alse") //[Ff]alse
856 Scanner::Scanner(StringRef Input
, SourceMgr
&sm
, bool ShowColors
,
858 : SM(sm
), ShowColors(ShowColors
), EC(EC
) {
859 init(MemoryBufferRef(Input
, "YAML"));
862 Scanner::Scanner(MemoryBufferRef Buffer
, SourceMgr
&SM_
, bool ShowColors
,
864 : SM(SM_
), ShowColors(ShowColors
), EC(EC
) {
868 void Scanner::init(MemoryBufferRef Buffer
) {
869 InputBuffer
= Buffer
;
870 Current
= InputBuffer
.getBufferStart();
871 End
= InputBuffer
.getBufferEnd();
876 IsStartOfStream
= true;
877 IsSimpleKeyAllowed
= true;
878 IsAdjacentValueAllowedInFlow
= false;
880 std::unique_ptr
<MemoryBuffer
> InputBufferOwner
=
881 MemoryBuffer::getMemBuffer(Buffer
, /*RequiresNullTerminator=*/false);
882 SM
.AddNewSourceBuffer(std::move(InputBufferOwner
), SMLoc());
885 Token
&Scanner::peekNext() {
886 // If the current token is a possible simple key, keep parsing until we
888 bool NeedMore
= false;
890 if (TokenQueue
.empty() || NeedMore
) {
891 if (!fetchMoreTokens()) {
894 TokenQueue
.push_back(Token());
895 return TokenQueue
.front();
898 assert(!TokenQueue
.empty() &&
899 "fetchMoreTokens lied about getting tokens!");
901 removeStaleSimpleKeyCandidates();
903 SK
.Tok
= TokenQueue
.begin();
904 if (!is_contained(SimpleKeys
, SK
))
909 return TokenQueue
.front();
912 Token
Scanner::getNext() {
913 Token Ret
= peekNext();
914 // TokenQueue can be empty if there was an error getting the next token.
915 if (!TokenQueue
.empty())
916 TokenQueue
.pop_front();
918 // There cannot be any referenced Token's if the TokenQueue is empty. So do a
919 // quick deallocation of them all.
920 if (TokenQueue
.empty())
921 TokenQueue
.resetAlloc();
926 StringRef::iterator
Scanner::skip_nb_char(StringRef::iterator Position
) {
929 // Check 7 bit c-printable - b-char.
930 if ( *Position
== 0x09
931 || (*Position
>= 0x20 && *Position
<= 0x7E))
934 // Check for valid UTF-8.
935 if (uint8_t(*Position
) & 0x80) {
936 UTF8Decoded u8d
= decodeUTF8(Position
);
938 && u8d
.first
!= 0xFEFF
939 && ( u8d
.first
== 0x85
940 || ( u8d
.first
>= 0xA0
941 && u8d
.first
<= 0xD7FF)
942 || ( u8d
.first
>= 0xE000
943 && u8d
.first
<= 0xFFFD)
944 || ( u8d
.first
>= 0x10000
945 && u8d
.first
<= 0x10FFFF)))
946 return Position
+ u8d
.second
;
951 StringRef::iterator
Scanner::skip_b_break(StringRef::iterator Position
) {
954 if (*Position
== 0x0D) {
955 if (Position
+ 1 != End
&& *(Position
+ 1) == 0x0A)
960 if (*Position
== 0x0A)
965 StringRef::iterator
Scanner::skip_s_space(StringRef::iterator Position
) {
968 if (*Position
== ' ')
973 StringRef::iterator
Scanner::skip_s_white(StringRef::iterator Position
) {
976 if (*Position
== ' ' || *Position
== '\t')
981 StringRef::iterator
Scanner::skip_ns_char(StringRef::iterator Position
) {
984 if (*Position
== ' ' || *Position
== '\t')
986 return skip_nb_char(Position
);
989 StringRef::iterator
Scanner::skip_while( SkipWhileFunc Func
990 , StringRef::iterator Position
) {
992 StringRef::iterator i
= (this->*Func
)(Position
);
1000 void Scanner::advanceWhile(SkipWhileFunc Func
) {
1001 auto Final
= skip_while(Func
, Current
);
1002 Column
+= Final
- Current
;
/// Return true if \a C is an ns-hex-digit[36]: a decimal digit or one of the
/// letters A-F / a-f. Other letters are not hexadecimal digits and must not
/// be accepted as part of a %XX escape.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9') || (C >= 'a' && C <= 'f') ||
         (C >= 'A' && C <= 'F');
}
/// Return true if \a C is an ns-word-char[38]: a decimal digit, an ASCII
/// letter, or '-'.
static bool is_ns_word_char(const char C) {
  return C == '-' || (C >= '0' && C <= '9') || (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
1010 void Scanner::scan_ns_uri_char() {
1014 if (( *Current
== '%'
1015 && Current
+ 2 < End
1016 && is_ns_hex_digit(*(Current
+ 1))
1017 && is_ns_hex_digit(*(Current
+ 2)))
1018 || is_ns_word_char(*Current
)
1019 || StringRef(Current
, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
1020 != StringRef::npos
) {
1028 bool Scanner::consume(uint32_t Expected
) {
1029 if (Expected
>= 0x80) {
1030 setError("Cannot consume non-ascii characters", Current
);
1035 if (uint8_t(*Current
) >= 0x80) {
1036 setError("Cannot consume non-ascii characters", Current
);
1039 if (uint8_t(*Current
) == Expected
) {
1047 void Scanner::skip(uint32_t Distance
) {
1048 Current
+= Distance
;
1050 assert(Current
<= End
&& "Skipped past the end");
1053 bool Scanner::isBlankOrBreak(StringRef::iterator Position
) {
1054 if (Position
== End
)
1056 return *Position
== ' ' || *Position
== '\t' || *Position
== '\r' ||
1060 bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position
) {
1061 if (Position
== End
|| isBlankOrBreak(Position
))
1064 StringRef(Position
, 1).find_first_of(",[]{}") != StringRef::npos
)
1069 bool Scanner::isLineEmpty(StringRef Line
) {
1070 for (const auto *Position
= Line
.begin(); Position
!= Line
.end(); ++Position
)
1071 if (!isBlankOrBreak(Position
))
1076 bool Scanner::consumeLineBreakIfPresent() {
1077 auto Next
= skip_b_break(Current
);
1078 if (Next
== Current
)
1086 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
1088 , bool IsRequired
) {
1089 if (IsSimpleKeyAllowed
) {
1093 SK
.Column
= AtColumn
;
1094 SK
.IsRequired
= IsRequired
;
1095 SK
.FlowLevel
= FlowLevel
;
1096 SimpleKeys
.push_back(SK
);
1100 void Scanner::removeStaleSimpleKeyCandidates() {
1101 for (SmallVectorImpl
<SimpleKey
>::iterator i
= SimpleKeys
.begin();
1102 i
!= SimpleKeys
.end();) {
1103 if (i
->Line
!= Line
|| i
->Column
+ 1024 < Column
) {
1105 setError( "Could not find expected : for simple key"
1106 , i
->Tok
->Range
.begin());
1107 i
= SimpleKeys
.erase(i
);
1113 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level
) {
1114 if (!SimpleKeys
.empty() && (SimpleKeys
.end() - 1)->FlowLevel
== Level
)
1115 SimpleKeys
.pop_back();
1118 bool Scanner::unrollIndent(int ToColumn
) {
1120 // Indentation is ignored in flow.
1124 while (Indent
> ToColumn
) {
1125 T
.Kind
= Token::TK_BlockEnd
;
1126 T
.Range
= StringRef(Current
, 1);
1127 TokenQueue
.push_back(T
);
1128 Indent
= Indents
.pop_back_val();
1134 bool Scanner::rollIndent( int ToColumn
1135 , Token::TokenKind Kind
1136 , TokenQueueT::iterator InsertPoint
) {
1139 if (Indent
< ToColumn
) {
1140 Indents
.push_back(Indent
);
1145 T
.Range
= StringRef(Current
, 0);
1146 TokenQueue
.insert(InsertPoint
, T
);
1151 void Scanner::skipComment() {
1152 if (Current
== End
|| *Current
!= '#')
1155 // This may skip more than one byte, thus Column is only incremented
1157 StringRef::iterator I
= skip_nb_char(Current
);
1165 void Scanner::scanToNextToken() {
1167 while (Current
!= End
&& (*Current
== ' ' || *Current
== '\t')) {
1174 StringRef::iterator i
= skip_b_break(Current
);
1180 // New lines may start a simple key.
1182 IsSimpleKeyAllowed
= true;
1186 bool Scanner::scanStreamStart() {
1187 IsStartOfStream
= false;
1189 EncodingInfo EI
= getUnicodeEncoding(currentInput());
1192 T
.Kind
= Token::TK_StreamStart
;
1193 T
.Range
= StringRef(Current
, EI
.second
);
1194 TokenQueue
.push_back(T
);
1195 Current
+= EI
.second
;
1199 bool Scanner::scanStreamEnd() {
1200 // Force an ending new line if one isn't present.
1208 IsSimpleKeyAllowed
= false;
1209 IsAdjacentValueAllowedInFlow
= false;
1212 T
.Kind
= Token::TK_StreamEnd
;
1213 T
.Range
= StringRef(Current
, 0);
1214 TokenQueue
.push_back(T
);
1218 bool Scanner::scanDirective() {
1219 // Reset the indentation level.
1222 IsSimpleKeyAllowed
= false;
1223 IsAdjacentValueAllowedInFlow
= false;
1225 StringRef::iterator Start
= Current
;
1227 StringRef::iterator NameStart
= Current
;
1228 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1229 StringRef
Name(NameStart
, Current
- NameStart
);
1230 Current
= skip_while(&Scanner::skip_s_white
, Current
);
1233 if (Name
== "YAML") {
1234 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1235 T
.Kind
= Token::TK_VersionDirective
;
1236 T
.Range
= StringRef(Start
, Current
- Start
);
1237 TokenQueue
.push_back(T
);
1239 } else if(Name
== "TAG") {
1240 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1241 Current
= skip_while(&Scanner::skip_s_white
, Current
);
1242 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1243 T
.Kind
= Token::TK_TagDirective
;
1244 T
.Range
= StringRef(Start
, Current
- Start
);
1245 TokenQueue
.push_back(T
);
1251 bool Scanner::scanDocumentIndicator(bool IsStart
) {
1254 IsSimpleKeyAllowed
= false;
1255 IsAdjacentValueAllowedInFlow
= false;
1258 T
.Kind
= IsStart
? Token::TK_DocumentStart
: Token::TK_DocumentEnd
;
1259 T
.Range
= StringRef(Current
, 3);
1261 TokenQueue
.push_back(T
);
1265 bool Scanner::scanFlowCollectionStart(bool IsSequence
) {
1267 T
.Kind
= IsSequence
? Token::TK_FlowSequenceStart
1268 : Token::TK_FlowMappingStart
;
1269 T
.Range
= StringRef(Current
, 1);
1271 TokenQueue
.push_back(T
);
1273 // [ and { may begin a simple key.
1274 saveSimpleKeyCandidate(--TokenQueue
.end(), Column
- 1, false);
1276 // And may also be followed by a simple key.
1277 IsSimpleKeyAllowed
= true;
1278 // Adjacent values are allowed in flows only after JSON-style keys.
1279 IsAdjacentValueAllowedInFlow
= false;
1284 bool Scanner::scanFlowCollectionEnd(bool IsSequence
) {
1285 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1286 IsSimpleKeyAllowed
= false;
1287 IsAdjacentValueAllowedInFlow
= true;
1289 T
.Kind
= IsSequence
? Token::TK_FlowSequenceEnd
1290 : Token::TK_FlowMappingEnd
;
1291 T
.Range
= StringRef(Current
, 1);
1293 TokenQueue
.push_back(T
);
1299 bool Scanner::scanFlowEntry() {
1300 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1301 IsSimpleKeyAllowed
= true;
1302 IsAdjacentValueAllowedInFlow
= false;
1304 T
.Kind
= Token::TK_FlowEntry
;
1305 T
.Range
= StringRef(Current
, 1);
1307 TokenQueue
.push_back(T
);
1311 bool Scanner::scanBlockEntry() {
1312 rollIndent(Column
, Token::TK_BlockSequenceStart
, TokenQueue
.end());
1313 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1314 IsSimpleKeyAllowed
= true;
1315 IsAdjacentValueAllowedInFlow
= false;
1317 T
.Kind
= Token::TK_BlockEntry
;
1318 T
.Range
= StringRef(Current
, 1);
1320 TokenQueue
.push_back(T
);
1324 bool Scanner::scanKey() {
1326 rollIndent(Column
, Token::TK_BlockMappingStart
, TokenQueue
.end());
1328 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel
);
1329 IsSimpleKeyAllowed
= !FlowLevel
;
1330 IsAdjacentValueAllowedInFlow
= false;
1333 T
.Kind
= Token::TK_Key
;
1334 T
.Range
= StringRef(Current
, 1);
1336 TokenQueue
.push_back(T
);
1340 bool Scanner::scanValue() {
1341 // If the previous token could have been a simple key, insert the key token
1342 // into the token queue.
1343 if (!SimpleKeys
.empty()) {
1344 SimpleKey SK
= SimpleKeys
.pop_back_val();
1346 T
.Kind
= Token::TK_Key
;
1347 T
.Range
= SK
.Tok
->Range
;
1348 TokenQueueT::iterator i
, e
;
1349 for (i
= TokenQueue
.begin(), e
= TokenQueue
.end(); i
!= e
; ++i
) {
1357 i
= TokenQueue
.insert(i
, T
);
1359 // We may also need to add a Block-Mapping-Start token.
1360 rollIndent(SK
.Column
, Token::TK_BlockMappingStart
, i
);
1362 IsSimpleKeyAllowed
= false;
1365 rollIndent(Column
, Token::TK_BlockMappingStart
, TokenQueue
.end());
1366 IsSimpleKeyAllowed
= !FlowLevel
;
1368 IsAdjacentValueAllowedInFlow
= false;
1371 T
.Kind
= Token::TK_Value
;
1372 T
.Range
= StringRef(Current
, 1);
1374 TokenQueue
.push_back(T
);
1378 // Forbidding inlining improves performance by roughly 20%.
1379 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1380 LLVM_ATTRIBUTE_NOINLINE
static bool
1381 wasEscaped(StringRef::iterator First
, StringRef::iterator Position
);
1383 // Returns whether a character at 'Position' was escaped with a leading '\'.
1384 // 'First' specifies the position of the first character in the string.
1385 static bool wasEscaped(StringRef::iterator First
,
1386 StringRef::iterator Position
) {
1387 assert(Position
- 1 >= First
);
1388 StringRef::iterator I
= Position
- 1;
1389 // We calculate the number of consecutive '\'s before the current position
1390 // by iterating backwards through our string.
1391 while (I
>= First
&& *I
== '\\') --I
;
1392 // (Position - 1 - I) now contains the number of '\'s before the current
1393 // position. If it is odd, the character at 'Position' was escaped.
1394 return (Position
- 1 - I
) % 2 == 1;
1397 bool Scanner::scanFlowScalar(bool IsDoubleQuoted
) {
1398 StringRef::iterator Start
= Current
;
1399 unsigned ColStart
= Column
;
1400 if (IsDoubleQuoted
) {
1403 while (Current
!= End
&& *Current
!= '"')
1405 // Repeat until the previous character was not a '\' or was an escaped
1407 } while ( Current
!= End
1408 && *(Current
- 1) == '\\'
1409 && wasEscaped(Start
+ 1, Current
));
1412 while (Current
!= End
) {
1413 // Skip a ' followed by another '.
1414 if (Current
+ 1 < End
&& *Current
== '\'' && *(Current
+ 1) == '\'') {
1417 } else if (*Current
== '\'')
1419 StringRef::iterator i
= skip_nb_char(Current
);
1421 i
= skip_b_break(Current
);
1436 if (Current
== End
) {
1437 setError("Expected quote at end of scalar", Current
);
1441 skip(1); // Skip ending quote.
1443 T
.Kind
= Token::TK_Scalar
;
1444 T
.Range
= StringRef(Start
, Current
- Start
);
1445 TokenQueue
.push_back(T
);
1447 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1449 IsSimpleKeyAllowed
= false;
1450 IsAdjacentValueAllowedInFlow
= true;
1455 bool Scanner::scanPlainScalar() {
1456 StringRef::iterator Start
= Current
;
1457 unsigned ColStart
= Column
;
1458 unsigned LeadingBlanks
= 0;
1459 assert(Indent
>= -1 && "Indent must be >= -1 !");
1460 unsigned indent
= static_cast<unsigned>(Indent
+ 1);
1461 while (Current
!= End
) {
1462 if (*Current
== '#')
1465 while (Current
!= End
&&
1466 ((*Current
!= ':' && isPlainSafeNonBlank(Current
)) ||
1467 (*Current
== ':' && isPlainSafeNonBlank(Current
+ 1)))) {
1468 StringRef::iterator i
= skip_nb_char(Current
);
1475 // Are we at the end?
1476 if (!isBlankOrBreak(Current
))
1480 StringRef::iterator Tmp
= Current
;
1481 while (isBlankOrBreak(Tmp
)) {
1482 StringRef::iterator i
= skip_s_white(Tmp
);
1484 if (LeadingBlanks
&& (Column
< indent
) && *Tmp
== '\t') {
1485 setError("Found invalid tab character in indentation", Tmp
);
1491 i
= skip_b_break(Tmp
);
1500 if (!FlowLevel
&& Column
< indent
)
1505 if (Start
== Current
) {
1506 setError("Got empty plain scalar", Start
);
1510 T
.Kind
= Token::TK_Scalar
;
1511 T
.Range
= StringRef(Start
, Current
- Start
);
1512 TokenQueue
.push_back(T
);
1514 // Plain scalars can be simple keys.
1515 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1517 IsSimpleKeyAllowed
= false;
1518 IsAdjacentValueAllowedInFlow
= false;
1523 bool Scanner::scanAliasOrAnchor(bool IsAlias
) {
1524 StringRef::iterator Start
= Current
;
1525 unsigned ColStart
= Column
;
1527 while (Current
!= End
) {
1528 if ( *Current
== '[' || *Current
== ']'
1529 || *Current
== '{' || *Current
== '}'
1533 StringRef::iterator i
= skip_ns_char(Current
);
1540 if (Start
+ 1 == Current
) {
1541 setError("Got empty alias or anchor", Start
);
1546 T
.Kind
= IsAlias
? Token::TK_Alias
: Token::TK_Anchor
;
1547 T
.Range
= StringRef(Start
, Current
- Start
);
1548 TokenQueue
.push_back(T
);
1550 // Alias and anchors can be simple keys.
1551 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1553 IsSimpleKeyAllowed
= false;
1554 IsAdjacentValueAllowedInFlow
= false;
1559 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator
,
1560 char &ChompingIndicator
,
1561 unsigned &IndentIndicator
,
1563 StyleIndicator
= scanBlockStyleIndicator();
1564 if (!scanBlockScalarHeader(ChompingIndicator
, IndentIndicator
, IsDone
))
1569 char Scanner::scanBlockStyleIndicator() {
1570 char Indicator
= ' ';
1571 if (Current
!= End
&& (*Current
== '>' || *Current
== '|')) {
1572 Indicator
= *Current
;
1578 char Scanner::scanBlockChompingIndicator() {
1579 char Indicator
= ' ';
1580 if (Current
!= End
&& (*Current
== '+' || *Current
== '-')) {
1581 Indicator
= *Current
;
1587 /// Get the number of line breaks after chomping.
1589 /// Return the number of trailing line breaks to emit, depending on
1590 /// \p ChompingIndicator.
1591 static unsigned getChompedLineBreaks(char ChompingIndicator
,
1592 unsigned LineBreaks
, StringRef Str
) {
1593 if (ChompingIndicator
== '-') // Strip all line breaks.
1595 if (ChompingIndicator
== '+') // Keep all line breaks.
1597 // Clip trailing lines.
1598 return Str
.empty() ? 0 : 1;
1601 unsigned Scanner::scanBlockIndentationIndicator() {
1602 unsigned Indent
= 0;
1603 if (Current
!= End
&& (*Current
>= '1' && *Current
<= '9')) {
1604 Indent
= unsigned(*Current
- '0');
1610 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator
,
1611 unsigned &IndentIndicator
, bool &IsDone
) {
1612 auto Start
= Current
;
1614 ChompingIndicator
= scanBlockChompingIndicator();
1615 IndentIndicator
= scanBlockIndentationIndicator();
1616 // Check for the chomping indicator once again.
1617 if (ChompingIndicator
== ' ')
1618 ChompingIndicator
= scanBlockChompingIndicator();
1619 Current
= skip_while(&Scanner::skip_s_white
, Current
);
1622 if (Current
== End
) { // EOF, we have an empty scalar.
1624 T
.Kind
= Token::TK_BlockScalar
;
1625 T
.Range
= StringRef(Start
, Current
- Start
);
1626 TokenQueue
.push_back(T
);
1631 if (!consumeLineBreakIfPresent()) {
1632 setError("Expected a line break after block scalar header", Current
);
1638 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent
,
1639 unsigned BlockExitIndent
,
1640 unsigned &LineBreaks
, bool &IsDone
) {
1641 unsigned MaxAllSpaceLineCharacters
= 0;
1642 StringRef::iterator LongestAllSpaceLine
;
1645 advanceWhile(&Scanner::skip_s_space
);
1646 if (skip_nb_char(Current
) != Current
) {
1647 // This line isn't empty, so try and find the indentation.
1648 if (Column
<= BlockExitIndent
) { // End of the block literal.
1652 // We found the block's indentation.
1653 BlockIndent
= Column
;
1654 if (MaxAllSpaceLineCharacters
> BlockIndent
) {
1656 "Leading all-spaces line must be smaller than the block indent",
1657 LongestAllSpaceLine
);
1662 if (skip_b_break(Current
) != Current
&&
1663 Column
> MaxAllSpaceLineCharacters
) {
1664 // Record the longest all-space line in case it's longer than the
1665 // discovered block indent.
1666 MaxAllSpaceLineCharacters
= Column
;
1667 LongestAllSpaceLine
= Current
;
1671 if (Current
== End
) {
1676 if (!consumeLineBreakIfPresent()) {
1685 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent
,
1686 unsigned BlockExitIndent
, bool &IsDone
) {
1687 // Skip the indentation.
1688 while (Column
< BlockIndent
) {
1689 auto I
= skip_s_space(Current
);
1696 if (skip_nb_char(Current
) == Current
)
1699 if (Column
<= BlockExitIndent
) { // End of the block literal.
1704 if (Column
< BlockIndent
) {
1705 if (Current
!= End
&& *Current
== '#') { // Trailing comment.
1709 setError("A text line is less indented than the block scalar", Current
);
1712 return true; // A normal text line.
1715 bool Scanner::scanBlockScalar(bool IsLiteral
) {
1716 assert(*Current
== '|' || *Current
== '>');
1717 char StyleIndicator
;
1718 char ChompingIndicator
;
1719 unsigned BlockIndent
;
1720 bool IsDone
= false;
1721 if (!scanBlockScalarIndicators(StyleIndicator
, ChompingIndicator
, BlockIndent
,
1726 bool IsFolded
= StyleIndicator
== '>';
1728 const auto *Start
= Current
;
1729 unsigned BlockExitIndent
= Indent
< 0 ? 0 : (unsigned)Indent
;
1730 unsigned LineBreaks
= 0;
1731 if (BlockIndent
== 0) {
1732 if (!findBlockScalarIndent(BlockIndent
, BlockExitIndent
, LineBreaks
,
1737 // Scan the block's scalars body.
1738 SmallString
<256> Str
;
1740 if (!scanBlockScalarIndent(BlockIndent
, BlockExitIndent
, IsDone
))
1745 // Parse the current line.
1746 auto LineStart
= Current
;
1747 advanceWhile(&Scanner::skip_nb_char
);
1748 if (LineStart
!= Current
) {
1749 if (LineBreaks
&& IsFolded
&& !Scanner::isLineEmpty(Str
)) {
1750 // The folded style "folds" any single line break between content into a
1751 // single space, except when that content is "empty" (only contains
1752 // whitespace) in which case the line break is left as-is.
1753 if (LineBreaks
== 1) {
1754 Str
.append(LineBreaks
,
1755 isLineEmpty(StringRef(LineStart
, Current
- LineStart
))
1759 // If we saw a single line break, we are completely replacing it and so
1760 // want `LineBreaks == 0`. Otherwise this decrement accounts for the
1761 // fact that the first line break is "trimmed", only being used to
1762 // signal a sequence of line breaks which should not be folded.
1765 Str
.append(LineBreaks
, '\n');
1766 Str
.append(StringRef(LineStart
, Current
- LineStart
));
1774 if (!consumeLineBreakIfPresent())
1779 if (Current
== End
&& !LineBreaks
)
1780 // Ensure that there is at least one line break before the end of file.
1782 Str
.append(getChompedLineBreaks(ChompingIndicator
, LineBreaks
, Str
), '\n');
1784 // New lines may start a simple key.
1786 IsSimpleKeyAllowed
= true;
1787 IsAdjacentValueAllowedInFlow
= false;
1790 T
.Kind
= Token::TK_BlockScalar
;
1791 T
.Range
= StringRef(Start
, Current
- Start
);
1792 T
.Value
= std::string(Str
);
1793 TokenQueue
.push_back(T
);
1797 bool Scanner::scanTag() {
1798 StringRef::iterator Start
= Current
;
1799 unsigned ColStart
= Column
;
1801 if (Current
== End
|| isBlankOrBreak(Current
)); // An empty tag.
1802 else if (*Current
== '<') {
1808 // FIXME: Actually parse the c-ns-shorthand-tag rule.
1809 Current
= skip_while(&Scanner::skip_ns_char
, Current
);
1813 T
.Kind
= Token::TK_Tag
;
1814 T
.Range
= StringRef(Start
, Current
- Start
);
1815 TokenQueue
.push_back(T
);
1817 // Tags can be simple keys.
1818 saveSimpleKeyCandidate(--TokenQueue
.end(), ColStart
, false);
1820 IsSimpleKeyAllowed
= false;
1821 IsAdjacentValueAllowedInFlow
= false;
1826 bool Scanner::fetchMoreTokens() {
1827 if (IsStartOfStream
)
1828 return scanStreamStart();
1833 return scanStreamEnd();
1835 removeStaleSimpleKeyCandidates();
1837 unrollIndent(Column
);
1839 if (Column
== 0 && *Current
== '%')
1840 return scanDirective();
1842 if (Column
== 0 && Current
+ 4 <= End
1844 && *(Current
+ 1) == '-'
1845 && *(Current
+ 2) == '-'
1846 && (Current
+ 3 == End
|| isBlankOrBreak(Current
+ 3)))
1847 return scanDocumentIndicator(true);
1849 if (Column
== 0 && Current
+ 4 <= End
1851 && *(Current
+ 1) == '.'
1852 && *(Current
+ 2) == '.'
1853 && (Current
+ 3 == End
|| isBlankOrBreak(Current
+ 3)))
1854 return scanDocumentIndicator(false);
1856 if (*Current
== '[')
1857 return scanFlowCollectionStart(true);
1859 if (*Current
== '{')
1860 return scanFlowCollectionStart(false);
1862 if (*Current
== ']')
1863 return scanFlowCollectionEnd(true);
1865 if (*Current
== '}')
1866 return scanFlowCollectionEnd(false);
1868 if (*Current
== ',')
1869 return scanFlowEntry();
1871 if (*Current
== '-' && (isBlankOrBreak(Current
+ 1) || Current
+ 1 == End
))
1872 return scanBlockEntry();
1874 if (*Current
== '?' && (Current
+ 1 == End
|| isBlankOrBreak(Current
+ 1)))
1877 if (*Current
== ':' &&
1878 (!isPlainSafeNonBlank(Current
+ 1) || IsAdjacentValueAllowedInFlow
))
1881 if (*Current
== '*')
1882 return scanAliasOrAnchor(true);
1884 if (*Current
== '&')
1885 return scanAliasOrAnchor(false);
1887 if (*Current
== '!')
1890 if (*Current
== '|' && !FlowLevel
)
1891 return scanBlockScalar(true);
1893 if (*Current
== '>' && !FlowLevel
)
1894 return scanBlockScalar(false);
1896 if (*Current
== '\'')
1897 return scanFlowScalar(false);
1899 if (*Current
== '"')
1900 return scanFlowScalar(true);
1902 // Get a plain scalar.
1903 StringRef
FirstChar(Current
, 1);
1904 if ((!isBlankOrBreak(Current
) &&
1905 FirstChar
.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos
) ||
1906 (FirstChar
.find_first_of("?:-") != StringRef::npos
&&
1907 isPlainSafeNonBlank(Current
+ 1)))
1908 return scanPlainScalar();
1910 setError("Unrecognized character while tokenizing.", Current
);
1914 Stream::Stream(StringRef Input
, SourceMgr
&SM
, bool ShowColors
,
1915 std::error_code
*EC
)
1916 : scanner(new Scanner(Input
, SM
, ShowColors
, EC
)) {}
1918 Stream::Stream(MemoryBufferRef InputBuffer
, SourceMgr
&SM
, bool ShowColors
,
1919 std::error_code
*EC
)
1920 : scanner(new Scanner(InputBuffer
, SM
, ShowColors
, EC
)) {}
// Defaulted destructor, defined out of line in this file.
1922 Stream::~Stream() = default;
1924 bool Stream::failed() { return scanner
->failed(); }
1926 void Stream::printError(Node
*N
, const Twine
&Msg
, SourceMgr::DiagKind Kind
) {
1927 printError(N
? N
->getSourceRange() : SMRange(), Msg
, Kind
);
1930 void Stream::printError(const SMRange
&Range
, const Twine
&Msg
,
1931 SourceMgr::DiagKind Kind
) {
1932 scanner
->printError(Range
.Start
, Kind
, Msg
, Range
);
1935 document_iterator
Stream::begin() {
1937 report_fatal_error("Can only iterate over the stream once");
1939 // Skip Stream-Start.
1942 CurrentDoc
.reset(new Document(*this));
1943 return document_iterator(CurrentDoc
);
1946 document_iterator
Stream::end() {
1947 return document_iterator();
1950 void Stream::skip() {
1951 for (Document
&Doc
: *this)
1955 Node::Node(unsigned int Type
, std::unique_ptr
<Document
> &D
, StringRef A
,
1957 : Doc(D
), TypeID(Type
), Anchor(A
), Tag(T
) {
1958 SMLoc Start
= SMLoc::getFromPointer(peekNext().Range
.begin());
1959 SourceRange
= SMRange(Start
, Start
);
1962 std::string
Node::getVerbatimTag() const {
1963 StringRef Raw
= getRawTag();
1964 if (!Raw
.empty() && Raw
!= "!") {
1966 if (Raw
.find_last_of('!') == 0) {
1967 Ret
= std::string(Doc
->getTagMap().find("!")->second
);
1968 Ret
+= Raw
.substr(1);
1970 } else if (Raw
.starts_with("!!")) {
1971 Ret
= std::string(Doc
->getTagMap().find("!!")->second
);
1972 Ret
+= Raw
.substr(2);
1975 StringRef TagHandle
= Raw
.substr(0, Raw
.find_last_of('!') + 1);
1976 std::map
<StringRef
, StringRef
>::const_iterator It
=
1977 Doc
->getTagMap().find(TagHandle
);
1978 if (It
!= Doc
->getTagMap().end())
1979 Ret
= std::string(It
->second
);
1982 T
.Kind
= Token::TK_Tag
;
1983 T
.Range
= TagHandle
;
1984 setError(Twine("Unknown tag handle ") + TagHandle
, T
);
1986 Ret
+= Raw
.substr(Raw
.find_last_of('!') + 1);
1991 switch (getType()) {
1993 return "tag:yaml.org,2002:null";
1995 case NK_BlockScalar
:
1996 // TODO: Tag resolution.
1997 return "tag:yaml.org,2002:str";
1999 return "tag:yaml.org,2002:map";
2001 return "tag:yaml.org,2002:seq";
2007 Token
&Node::peekNext() {
2008 return Doc
->peekNext();
2011 Token
Node::getNext() {
2012 return Doc
->getNext();
2015 Node
*Node::parseBlockNode() {
2016 return Doc
->parseBlockNode();
2019 BumpPtrAllocator
&Node::getAllocator() {
2020 return Doc
->NodeAllocator
;
2023 void Node::setError(const Twine
&Msg
, Token
&Tok
) const {
2024 Doc
->setError(Msg
, Tok
);
2027 bool Node::failed() const {
2028 return Doc
->failed();
2031 StringRef
ScalarNode::getValue(SmallVectorImpl
<char> &Storage
) const {
2032 if (Value
[0] == '"')
2033 return getDoubleQuotedValue(Value
, Storage
);
2034 if (Value
[0] == '\'')
2035 return getSingleQuotedValue(Value
, Storage
);
2036 return getPlainValue(Value
, Storage
);
2039 /// parseScalarValue - A common parsing routine for all flow scalar styles.
2040 /// It handles line break characters by itself, adds regular content characters
2041 /// to the result, and forwards escaped sequences to the provided routine for
2042 /// the style-specific processing.
2044 /// \param UnquotedValue - An input value without quotation marks.
2045 /// \param Storage - A storage for the result if the input value is multiline or
2046 /// contains escaped characters.
2047 /// \param LookupChars - A set of special characters to search in the input
2048 /// string. Should include line break characters and the escape character
2049 /// specific for the processing scalar style, if any.
2050 /// \param UnescapeCallback - This is called when the escape character is found
2052 /// \returns - The unfolded and unescaped value.
2054 parseScalarValue(StringRef UnquotedValue
, SmallVectorImpl
<char> &Storage
,
2055 StringRef LookupChars
,
2056 std::function
<StringRef(StringRef
, SmallVectorImpl
<char> &)>
2058 size_t I
= UnquotedValue
.find_first_of(LookupChars
);
2059 if (I
== StringRef::npos
)
2060 return UnquotedValue
;
2063 Storage
.reserve(UnquotedValue
.size());
2064 char LastNewLineAddedAs
= '\0';
2065 for (; I
!= StringRef::npos
; I
= UnquotedValue
.find_first_of(LookupChars
)) {
2066 if (UnquotedValue
[I
] != '\r' && UnquotedValue
[I
] != '\n') {
2067 llvm::append_range(Storage
, UnquotedValue
.take_front(I
));
2068 UnquotedValue
= UnescapeCallback(UnquotedValue
.drop_front(I
), Storage
);
2069 LastNewLineAddedAs
= '\0';
2072 if (size_t LastNonSWhite
= UnquotedValue
.find_last_not_of(" \t", I
);
2073 LastNonSWhite
!= StringRef::npos
) {
2074 llvm::append_range(Storage
, UnquotedValue
.take_front(LastNonSWhite
+ 1));
2075 Storage
.push_back(' ');
2076 LastNewLineAddedAs
= ' ';
2078 // Note: we can't just check if the last character in Storage is ' ',
2079 // '\n', or something else; that would give a wrong result for double
2080 // quoted values containing an escaped space character before a new-line
2082 switch (LastNewLineAddedAs
) {
2084 assert(!Storage
.empty() && Storage
.back() == ' ');
2085 Storage
.back() = '\n';
2086 LastNewLineAddedAs
= '\n';
2089 assert(!Storage
.empty() && Storage
.back() == '\n');
2090 Storage
.push_back('\n');
2093 Storage
.push_back(' ');
2094 LastNewLineAddedAs
= ' ';
2098 // Handle Windows-style EOL
2099 if (UnquotedValue
.substr(I
, 2) == "\r\n")
2101 UnquotedValue
= UnquotedValue
.drop_front(I
+ 1).ltrim(" \t");
2103 llvm::append_range(Storage
, UnquotedValue
);
2104 return StringRef(Storage
.begin(), Storage
.size());
2108 ScalarNode::getDoubleQuotedValue(StringRef RawValue
,
2109 SmallVectorImpl
<char> &Storage
) const {
2110 assert(RawValue
.size() >= 2 && RawValue
.front() == '"' &&
2111 RawValue
.back() == '"');
2112 StringRef UnquotedValue
= RawValue
.substr(1, RawValue
.size() - 2);
2114 auto UnescapeFunc
= [this](StringRef UnquotedValue
,
2115 SmallVectorImpl
<char> &Storage
) {
2116 assert(UnquotedValue
.take_front(1) == "\\");
2117 if (UnquotedValue
.size() == 1) {
2119 T
.Range
= UnquotedValue
;
2120 setError("Unrecognized escape code", T
);
2124 UnquotedValue
= UnquotedValue
.drop_front(1);
2125 switch (UnquotedValue
[0]) {
2128 T
.Range
= UnquotedValue
.take_front(1);
2129 setError("Unrecognized escape code", T
);
2134 // Shrink the Windows-style EOL.
2135 if (UnquotedValue
.size() >= 2 && UnquotedValue
[1] == '\n')
2136 UnquotedValue
= UnquotedValue
.drop_front(1);
2139 return UnquotedValue
.drop_front(1).ltrim(" \t");
2141 Storage
.push_back(0x00);
2144 Storage
.push_back(0x07);
2147 Storage
.push_back(0x08);
2151 Storage
.push_back(0x09);
2154 Storage
.push_back(0x0A);
2157 Storage
.push_back(0x0B);
2160 Storage
.push_back(0x0C);
2163 Storage
.push_back(0x0D);
2166 Storage
.push_back(0x1B);
2169 Storage
.push_back(0x20);
2172 Storage
.push_back(0x22);
2175 Storage
.push_back(0x2F);
2178 Storage
.push_back(0x5C);
2181 encodeUTF8(0x85, Storage
);
2184 encodeUTF8(0xA0, Storage
);
2187 encodeUTF8(0x2028, Storage
);
2190 encodeUTF8(0x2029, Storage
);
2193 if (UnquotedValue
.size() < 3)
2194 // TODO: Report error.
2196 unsigned int UnicodeScalarValue
;
2197 if (UnquotedValue
.substr(1, 2).getAsInteger(16, UnicodeScalarValue
))
2198 // TODO: Report error.
2199 UnicodeScalarValue
= 0xFFFD;
2200 encodeUTF8(UnicodeScalarValue
, Storage
);
2201 return UnquotedValue
.drop_front(3);
2204 if (UnquotedValue
.size() < 5)
2205 // TODO: Report error.
2207 unsigned int UnicodeScalarValue
;
2208 if (UnquotedValue
.substr(1, 4).getAsInteger(16, UnicodeScalarValue
))
2209 // TODO: Report error.
2210 UnicodeScalarValue
= 0xFFFD;
2211 encodeUTF8(UnicodeScalarValue
, Storage
);
2212 return UnquotedValue
.drop_front(5);
2215 if (UnquotedValue
.size() < 9)
2216 // TODO: Report error.
2218 unsigned int UnicodeScalarValue
;
2219 if (UnquotedValue
.substr(1, 8).getAsInteger(16, UnicodeScalarValue
))
2220 // TODO: Report error.
2221 UnicodeScalarValue
= 0xFFFD;
2222 encodeUTF8(UnicodeScalarValue
, Storage
);
2223 return UnquotedValue
.drop_front(9);
2226 return UnquotedValue
.drop_front(1);
2229 return parseScalarValue(UnquotedValue
, Storage
, "\\\r\n", UnescapeFunc
);
2232 StringRef
ScalarNode::getSingleQuotedValue(StringRef RawValue
,
2233 SmallVectorImpl
<char> &Storage
) {
2234 assert(RawValue
.size() >= 2 && RawValue
.front() == '\'' &&
2235 RawValue
.back() == '\'');
2236 StringRef UnquotedValue
= RawValue
.substr(1, RawValue
.size() - 2);
2238 auto UnescapeFunc
= [](StringRef UnquotedValue
,
2239 SmallVectorImpl
<char> &Storage
) {
2240 assert(UnquotedValue
.take_front(2) == "''");
2241 Storage
.push_back('\'');
2242 return UnquotedValue
.drop_front(2);
2245 return parseScalarValue(UnquotedValue
, Storage
, "'\r\n", UnescapeFunc
);
2248 StringRef
ScalarNode::getPlainValue(StringRef RawValue
,
2249 SmallVectorImpl
<char> &Storage
) {
2250 // Trim trailing whitespace ('b-char' and 's-white').
2251 // NOTE: Alternatively we could change the scanner to not include whitespace
2252 // here in the first place.
2253 RawValue
= RawValue
.rtrim("\r\n \t");
2254 return parseScalarValue(RawValue
, Storage
, "\r\n", nullptr);
2257 Node
*KeyValueNode::getKey() {
2260 // Handle implicit null keys.
2262 Token
&t
= peekNext();
2263 if ( t
.Kind
== Token::TK_BlockEnd
2264 || t
.Kind
== Token::TK_Value
2265 || t
.Kind
== Token::TK_Error
) {
2266 return Key
= new (getAllocator()) NullNode(Doc
);
2268 if (t
.Kind
== Token::TK_Key
)
2269 getNext(); // skip TK_Key.
2272 // Handle explicit null keys.
2273 Token
&t
= peekNext();
2274 if (t
.Kind
== Token::TK_BlockEnd
|| t
.Kind
== Token::TK_Value
) {
2275 return Key
= new (getAllocator()) NullNode(Doc
);
2278 // We've got a normal key.
2279 return Key
= parseBlockNode();
2282 Node
*KeyValueNode::getValue() {
2286 if (Node
* Key
= getKey())
2289 setError("Null key in Key Value.", peekNext());
2290 return Value
= new (getAllocator()) NullNode(Doc
);
2294 return Value
= new (getAllocator()) NullNode(Doc
);
2296 // Handle implicit null values.
2298 Token
&t
= peekNext();
2299 if ( t
.Kind
== Token::TK_BlockEnd
2300 || t
.Kind
== Token::TK_FlowMappingEnd
2301 || t
.Kind
== Token::TK_Key
2302 || t
.Kind
== Token::TK_FlowEntry
2303 || t
.Kind
== Token::TK_Error
) {
2304 return Value
= new (getAllocator()) NullNode(Doc
);
2307 if (t
.Kind
!= Token::TK_Value
) {
2308 setError("Unexpected token in Key Value.", t
);
2309 return Value
= new (getAllocator()) NullNode(Doc
);
2311 getNext(); // skip TK_Value.
2314 // Handle explicit null values.
2315 Token
&t
= peekNext();
2316 if (t
.Kind
== Token::TK_BlockEnd
|| t
.Kind
== Token::TK_Key
) {
2317 return Value
= new (getAllocator()) NullNode(Doc
);
2320 // We got a normal value.
2321 return Value
= parseBlockNode();
2324 void MappingNode::increment() {
2327 CurrentEntry
= nullptr;
2331 CurrentEntry
->skip();
2332 if (Type
== MT_Inline
) {
2334 CurrentEntry
= nullptr;
2338 Token T
= peekNext();
2339 if (T
.Kind
== Token::TK_Key
|| T
.Kind
== Token::TK_Scalar
) {
2340 // KeyValueNode eats the TK_Key. That way it can detect null keys.
2341 CurrentEntry
= new (getAllocator()) KeyValueNode(Doc
);
2342 } else if (Type
== MT_Block
) {
2344 case Token::TK_BlockEnd
:
2347 CurrentEntry
= nullptr;
2350 setError("Unexpected token. Expected Key or Block End", T
);
2352 case Token::TK_Error
:
2354 CurrentEntry
= nullptr;
2358 case Token::TK_FlowEntry
:
2359 // Eat the flow entry and recurse.
2362 case Token::TK_FlowMappingEnd
:
2365 case Token::TK_Error
:
2366 // Set this to end iterator.
2368 CurrentEntry
= nullptr;
2371 setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2375 CurrentEntry
= nullptr;
2380 void SequenceNode::increment() {
2383 CurrentEntry
= nullptr;
2387 CurrentEntry
->skip();
2388 Token T
= peekNext();
2389 if (SeqType
== ST_Block
) {
2391 case Token::TK_BlockEntry
:
2393 CurrentEntry
= parseBlockNode();
2394 if (!CurrentEntry
) { // An error occurred.
2396 CurrentEntry
= nullptr;
2399 case Token::TK_BlockEnd
:
2402 CurrentEntry
= nullptr;
2405 setError( "Unexpected token. Expected Block Entry or Block End."
2408 case Token::TK_Error
:
2410 CurrentEntry
= nullptr;
2412 } else if (SeqType
== ST_Indentless
) {
2414 case Token::TK_BlockEntry
:
2416 CurrentEntry
= parseBlockNode();
2417 if (!CurrentEntry
) { // An error occurred.
2419 CurrentEntry
= nullptr;
2423 case Token::TK_Error
:
2425 CurrentEntry
= nullptr;
2427 } else if (SeqType
== ST_Flow
) {
2429 case Token::TK_FlowEntry
:
2430 // Eat the flow entry and recurse.
2432 WasPreviousTokenFlowEntry
= true;
2434 case Token::TK_FlowSequenceEnd
:
2437 case Token::TK_Error
:
2438 // Set this to end iterator.
2440 CurrentEntry
= nullptr;
2442 case Token::TK_StreamEnd
:
2443 case Token::TK_DocumentEnd
:
2444 case Token::TK_DocumentStart
:
2445 setError("Could not find closing ]!", T
);
2446 // Set this to end iterator.
2448 CurrentEntry
= nullptr;
2451 if (!WasPreviousTokenFlowEntry
) {
2452 setError("Expected , between entries!", T
);
2454 CurrentEntry
= nullptr;
2457 // Otherwise it must be a flow entry.
2458 CurrentEntry
= parseBlockNode();
2459 if (!CurrentEntry
) {
2462 WasPreviousTokenFlowEntry
= false;
2468 Document::Document(Stream
&S
) : stream(S
), Root(nullptr) {
2469 // Tag maps starts with two default mappings.
2471 TagMap
["!!"] = "tag:yaml.org,2002:";
2473 if (parseDirectives())
2474 expectToken(Token::TK_DocumentStart
);
2475 Token
&T
= peekNext();
2476 if (T
.Kind
== Token::TK_DocumentStart
)
2480 bool Document::skip() {
2481 if (stream
.scanner
->failed())
2483 if (!Root
&& !getRoot())
2486 Token
&T
= peekNext();
2487 if (T
.Kind
== Token::TK_StreamEnd
)
2489 if (T
.Kind
== Token::TK_DocumentEnd
) {
2496 Token
&Document::peekNext() {
2497 return stream
.scanner
->peekNext();
2500 Token
Document::getNext() {
2501 return stream
.scanner
->getNext();
2504 void Document::setError(const Twine
&Message
, Token
&Location
) const {
2505 stream
.scanner
->setError(Message
, Location
.Range
.begin());
2508 bool Document::failed() const {
2509 return stream
.scanner
->failed();
2512 Node
*Document::parseBlockNode() {
2513 Token T
= peekNext();
2514 // Handle properties.
2519 case Token::TK_Alias
:
2521 return new (NodeAllocator
) AliasNode(stream
.CurrentDoc
, T
.Range
.substr(1));
2522 case Token::TK_Anchor
:
2523 if (AnchorInfo
.Kind
== Token::TK_Anchor
) {
2524 setError("Already encountered an anchor for this node!", T
);
2527 AnchorInfo
= getNext(); // Consume TK_Anchor.
2529 goto parse_property
;
2531 if (TagInfo
.Kind
== Token::TK_Tag
) {
2532 setError("Already encountered a tag for this node!", T
);
2535 TagInfo
= getNext(); // Consume TK_Tag.
2537 goto parse_property
;
2543 case Token::TK_BlockEntry
:
2544 // We got an unindented BlockEntry sequence. This is not terminated with
2546 // Don't eat the TK_BlockEntry, SequenceNode needs it.
2547 return new (NodeAllocator
) SequenceNode( stream
.CurrentDoc
2548 , AnchorInfo
.Range
.substr(1)
2550 , SequenceNode::ST_Indentless
);
2551 case Token::TK_BlockSequenceStart
:
2553 return new (NodeAllocator
)
2554 SequenceNode( stream
.CurrentDoc
2555 , AnchorInfo
.Range
.substr(1)
2557 , SequenceNode::ST_Block
);
2558 case Token::TK_BlockMappingStart
:
2560 return new (NodeAllocator
)
2561 MappingNode( stream
.CurrentDoc
2562 , AnchorInfo
.Range
.substr(1)
2564 , MappingNode::MT_Block
);
2565 case Token::TK_FlowSequenceStart
:
2567 return new (NodeAllocator
)
2568 SequenceNode( stream
.CurrentDoc
2569 , AnchorInfo
.Range
.substr(1)
2571 , SequenceNode::ST_Flow
);
2572 case Token::TK_FlowMappingStart
:
2574 return new (NodeAllocator
)
2575 MappingNode( stream
.CurrentDoc
2576 , AnchorInfo
.Range
.substr(1)
2578 , MappingNode::MT_Flow
);
2579 case Token::TK_Scalar
:
2581 return new (NodeAllocator
)
2582 ScalarNode( stream
.CurrentDoc
2583 , AnchorInfo
.Range
.substr(1)
2586 case Token::TK_BlockScalar
: {
2588 StringRef
NullTerminatedStr(T
.Value
.c_str(), T
.Value
.length() + 1);
2589 StringRef StrCopy
= NullTerminatedStr
.copy(NodeAllocator
).drop_back();
2590 return new (NodeAllocator
)
2591 BlockScalarNode(stream
.CurrentDoc
, AnchorInfo
.Range
.substr(1),
2592 TagInfo
.Range
, StrCopy
, T
.Range
);
2595 // Don't eat the TK_Key, KeyValueNode expects it.
2596 return new (NodeAllocator
)
2597 MappingNode( stream
.CurrentDoc
2598 , AnchorInfo
.Range
.substr(1)
2600 , MappingNode::MT_Inline
);
2601 case Token::TK_DocumentStart
:
2602 case Token::TK_DocumentEnd
:
2603 case Token::TK_StreamEnd
:
2605 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2607 return new (NodeAllocator
) NullNode(stream
.CurrentDoc
);
2608 case Token::TK_FlowMappingEnd
:
2609 case Token::TK_FlowSequenceEnd
:
2610 case Token::TK_FlowEntry
: {
2611 if (Root
&& (isa
<MappingNode
>(Root
) || isa
<SequenceNode
>(Root
)))
2612 return new (NodeAllocator
) NullNode(stream
.CurrentDoc
);
2614 setError("Unexpected token", T
);
2617 case Token::TK_Error
:
2620 llvm_unreachable("Control flow shouldn't reach here.");
2624 bool Document::parseDirectives() {
2625 bool isDirective
= false;
2627 Token T
= peekNext();
2628 if (T
.Kind
== Token::TK_TagDirective
) {
2629 parseTAGDirective();
2631 } else if (T
.Kind
== Token::TK_VersionDirective
) {
2632 parseYAMLDirective();
2640 void Document::parseYAMLDirective() {
2641 getNext(); // Eat %YAML <version>
2644 void Document::parseTAGDirective() {
2645 Token Tag
= getNext(); // %TAG <handle> <prefix>
2646 StringRef T
= Tag
.Range
;
2648 T
= T
.substr(T
.find_first_of(" \t")).ltrim(" \t");
2649 std::size_t HandleEnd
= T
.find_first_of(" \t");
2650 StringRef TagHandle
= T
.substr(0, HandleEnd
);
2651 StringRef TagPrefix
= T
.substr(HandleEnd
).ltrim(" \t");
2652 TagMap
[TagHandle
] = TagPrefix
;
2655 bool Document::expectToken(int TK
) {
2656 Token T
= getNext();
2658 setError("Unexpected token", T
);