3 # YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
4 # RESERVED-DIRECTIVE(name)
5 # DOCUMENT-START, DOCUMENT-END
6 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
7 # FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
9 # ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
11 __all__
= ['Scanner', 'ScannerError']
13 from error
import YAMLError
16 class ScannerError(YAMLError
):
18 # ScannerError: while reading a quoted string
19 # in '...', line 5, column 10:
22 # got unknown quote character '?'
23 # in '...', line 5, column 15:
26 def __init__(self
, context
=None, context_marker
=None,
27 problem
=None, problem_marker
=None, description
=None):
28 self
.context
= context
29 self
.context_marker
= context_marker
30 self
.problem
= problem
31 self
.problem_marker
= problem_marker
32 self
.description
= description
36 for (place
, marker
) in [(self
.context
, self
.context_marker
),
37 (self
.problem
, self
.problem_marker
)]:
40 if marker
is not None:
41 lines
.append(str(marker
))
42 if self
.description
is not None:
43 lines
.append(self
.description
)
44 return '\n'.join(lines
)
47 def __init__(self
, token_number
, required
, index
, line
, column
, marker
):
48 self
.token_number
= token_number
49 self
.required
= required
58 def __init__(self
, reader
):
59 """Initialize the scanner."""
60 # The input stream. The Reader class does the dirty work of checking for
61 # BOM and converting the input data to Unicode. It also adds NUL to
64 # Reader supports the following methods
65 # self.reader.peek(k=1) # peek the next k characters
66 # self.reader.forward(k=1) # read the next k characters and move the
70 # Had we reached the end of the stream?
73 # The number of unclosed '{' and '['. `flow_level == 0` means block
77 # List of processed tokens that are not yet emitted.
80 # Number of tokens that were emitted through the `get_token` method.
83 # The current indentation level.
86 # Past indentation levels.
89 # Variables related to simple keys treatment.
91 # A simple key is a key that is not denoted by the '?' indicator.
92 # Example of simple keys:
94 # block simple key: value
96 # : { flow simple key: value }
97 # We emit the KEY token before all keys, so when we find a potential
98 # simple key, we try to locate the corresponding ':' indicator.
99 # Simple keys should be limited to a single line and 1024 characters.
101 # Can a simple key start at the current position? A simple key may
103 # - at the beginning of the line, not counting indentation spaces
104 # (in block context),
105 # - after '{', '[', ',' (in the flow context),
106 # - after '?', ':', '-' (in the block context).
107 # In the block context, this flag also signifies whether a block collection
108 # may start at the current position.
109 self
.allow_simple_key
= True
111 # Keep track of possible simple keys. This is a dictionary. The key
112 # is `flow_level`; there can be no more than one possible simple key
113 # for each level. The value is a SimpleKey record:
114 # (token_number, required, index, line, column, marker)
115 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
116 # '[', or '{' tokens.
117 self
.possible_simple_keys
= {}
119 # Two public methods.
121 def peek_token(self
):
122 """Get the current token."""
123 while self
.need_more_tokens():
124 self
.fetch_more_tokens()
126 return self
.tokens
[0]
129 "Get the current token and remove it from the list of pending tokens."""
130 while self.need_more_tokens():
131 self.fetch_more_tokens()
133 self.tokens_taken += 1
134 return self.tokens.pop(0)
138 def need_more_tokens(self):
143 # The current token may be a potential simple key, so we
144 # need to look further.
145 self.stale_possible_simple_keys()
146 if self.next_possible_simple_key() == self.tokens_taken:
149 def fetch_more_tokens(self):
151 # Eat whitespaces and comments until we reach the next token.
152 self.scan_to_next_token()
154 # Remove obsolete possible simple keys.
155 self.stale_possible_simple_keys()
157 # Compare the current indentation and column. It may add some tokens
158 # and decrease the current indentation level.
159 self.unwind_indent(self.reader.column)
161 # Peek the next character.
162 ch = self.reader.peek()
164 # Is it the end of reader?
166 return self.fetch_end()
169 if ch == u'%' and self.check_directive():
170 return self.fetch_directive()
172 # Is it the document start?
173 if ch == u'-' and self.check_document_start():
174 return self.fetch_document_start()
176 # Is it the document end?
177 if ch == u'.' and self.check_document_end():
178 return self.fetch_document_end()
180 # Note: the order of the following checks is NOT significant.
182 # Is it the flow sequence start indicator?
184 return self.fetch_flow_sequence_start()
186 # Is it the flow mapping start indicator?
188 return self.fetch_flow_mapping_start()
190 # Is it the flow sequence end indicator?
192 return self.fetch_flow_sequence_end()
194 # Is it the flow mapping end indicator?
196 return self.fetch_flow_mapping_end()
198 # Is it the entry indicator?
199 if ch in u'-,' and self.check_entry():
200 return self.fetch_entry()
202 # Is it the key indicator?
203 if ch == u'?' and self.check_key():
204 return self.fetch_key()
206 # Is it the value indicator?
207 if ch == u':' and self.check_value():
208 return self.fetch_value()
212 return self.fetch_alias()
216 return self.fetch_anchor()
220 return self.fetch_tag()
222 # Is it a literal scalar?
223 if ch == u'|' and not self.flow_level:
224 return self.fetch_literal()
226 # Is it a folded scalar?
227 if ch == u'>' and not self.flow_level:
228 return self.fetch_folded()
230 # Is it a single quoted scalar?
232 return self.fetch_single()
234 # Is it a double quoted scalar?
236 return self.fetch_double()
238 # It must be a plain scalar then.
239 if self.check_plain():
240 return self.fetch_plain()
242 # No? It's an error. Let's produce a nice error message.
245 # Simple keys treatment.
def next_possible_simple_key(self):
    """Return the token number of the nearest possible simple key.

    Scans every record saved in ``self.possible_simple_keys`` and returns
    the smallest ``token_number`` found, or ``None`` when nothing is saved.
    """
    candidates = self.possible_simple_keys
    if not candidates:
        return None
    # Every flow level holds at most one record; take the earliest token.
    return min([candidates[level].token_number for level in candidates])
262 def stale_possible_simple_keys(self):
263 # Remove entries that are no longer possible simple keys. According to
264 # the YAML specification, simple keys
265 # - should be limited to a single line,
266 # - should be no longer than 1024 characters.
267 # Disabling this procedure will allow simple keys of any length and
268 # height (may cause problems if indentation is broken though).
269 for level in self.possible_simple_keys.keys():
270 key = self.possible_simple_keys[level]
271 if key.line != self.reader.line \
272 or self.reader.index-key.index > 1024:
274 raise ScannerError("while scanning a simple key
", key.marker,
275 "could
not found expected
':'", self.reader.get_marker())
276 del self.possible_simple_keys[level]
def save_possible_simple_key(self):
    """Remember the current position as the start of a possible simple key.

    Called before fetching ALIAS, ANCHOR, TAG, SCALAR(flow), '[' and '{',
    since the next token may start a simple key. When simple keys are
    allowed here, any record already saved for the current flow level is
    replaced by a new ``SimpleKey`` describing the current reader position.
    """
    # A simple key is required at this position only in block context,
    # when the token sits exactly at the current indentation level.
    required = (not self.flow_level) and self.indent == self.reader.column

    # A required simple key is the first token in its line, so simple keys
    # must be allowed whenever one is required.
    assert not required or self.allow_simple_key

    if not self.allow_simple_key:
        return

    # Save the number and position of the token that may start the key.
    self.remove_possible_simple_key()
    reader = self.reader
    candidate = SimpleKey(self.tokens_taken + len(self.tokens), required,
            reader.index, reader.line, reader.column, reader.get_marker())
    self.possible_simple_keys[self.flow_level] = candidate
303 def remove_possible_simple_key(self):
304 # Remove the saved possible key position at the current flow level.
305 if self.flow_level in self.possible_simple_keys:
306 key = self.possible_simple_keys[self.flow_level]
308 # I don't think it's possible, but I could be wrong.
309 assert not key.required
311 # raise ScannerError("while scanning a simple key
", key.marker,
312 # "could
not found expected
':'", self.reader.get_marker())
314 # Indentation functions.
def unwind_indent(self, column):
    """Pop indentation levels until the current one is not past `column`.

    In flow context a token placed to the left of the current indentation
    raises ScannerError. In block context one BLOCK-END token is appended
    to ``self.tokens`` for every indentation level that is popped.
    """
    # In flow context, tokens should respect indentation. The check is
    # the strict `>` rather than `>=`, so a token placed exactly at the
    # current indentation level is still accepted.
    if self.flow_level:
        if self.indent > column:
            raise ScannerError(None, None,
                    "invalid intendation or unclosed '[' or '{'",
                    self.reader.get_marker())

    # In block context, issue a BLOCK-END token per closed level.
    while column < self.indent:
        here = self.reader.get_marker()
        self.indent = self.indents.pop()
        self.tokens.append(BlockEndToken(here, here))
335 def add_indent(self, column):
336 # Check if we need to increase indentation.
337 if self.indent < column:
338 self.indents.append(self.indent)
347 # Set the current indentation to -1.
348 self.unwind_indent(-1)
350 # Reset everything (not really needed).
351 self.allow_simple_key = False
352 self.possible_simple_keys = {}
355 marker = self.reader.get_marker()
358 self.tokens.append(StreamEndToken(marker, marker))
360 # The reader is ended.
def fetch_directive(self):
    """Fetch a DIRECTIVE token and queue it in ``self.tokens``."""
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Drop any pending simple key; none may start right after this token.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add DIRECTIVE.
    directive = self.scan_directive()
    self.tokens.append(directive)
def fetch_document_start(self):
    """Fetch a DOCUMENT-START token via `fetch_document_indicator`."""
    self.fetch_document_indicator(DocumentStartToken)
def fetch_document_end(self):
    """Fetch a DOCUMENT-END token via `fetch_document_indicator`."""
    self.fetch_document_indicator(DocumentEndToken)
def fetch_document_indicator(self, TokenClass):
    """Consume a three-character document indicator and queue its token.

    `TokenClass` is called with the start and end markers of the indicator
    and the result is appended to ``self.tokens``.
    """
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset simple keys.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Add DOCUMENT-START or DOCUMENT-END; the indicator is 3 characters.
    reader = self.reader
    begin = reader.get_marker()
    reader.forward(3)
    self.tokens.append(TokenClass(begin, reader.get_marker()))
def fetch_flow_sequence_start(self):
    """Fetch a FLOW-SEQUENCE-START token via `fetch_flow_collection_start`."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)
def fetch_flow_mapping_start(self):
    """Fetch a FLOW-MAPPING-START token via `fetch_flow_collection_start`."""
    self.fetch_flow_collection_start(FlowMappingStartToken)
403 def fetch_flow_collection_start(self, TokenClass):
405 # '[' and '{' may start a simple key.
406 self.save_possible_simple_key()
408 # Increase the flow level.
411 # Simple keys are allowed after '[' and '{'.
412 self.allow_simple_key = True
414 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
415 start_marker = self.reader.get_marker()
416 self.reader.forward()
417 end_marker = self.reader.get_marker()
418 self.tokens.append(TokenClass(start_marker, end_marker))
def fetch_flow_sequence_end(self):
    """Fetch a FLOW-SEQUENCE-END token via `fetch_flow_collection_end`."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)
def fetch_flow_mapping_end(self):
    """Fetch a FLOW-MAPPING-END token via `fetch_flow_collection_end`."""
    self.fetch_flow_collection_end(FlowMappingEndToken)
426 def fetch_flow_collection_end(self, TokenClass):
428 # Reset possible simple key on the current level.
429 self.remove_possible_simple_key()
431 # Decrease the flow level.
434 # No simple keys after ']' or '}'.
435 self.allow_simple_key = False
437 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
438 start_marker = self.reader.get_marker()
439 self.reader.forward()
440 end_marker = self.reader.get_marker()
441 self.tokens.append(TokenClass(start_marker, end_marker))
def fetch_entry(self):
    """Fetch an ENTRY token ('-' in block context, ',' in flow context).

    In block context this raises ScannerError when a new entry is not
    allowed at this position, and queues BLOCK-SEQUENCE-START first when
    `add_indent` reports that the indentation was increased.
    """
    # Block context needs additional checks.
    in_block = not self.flow_level

    # Are we allowed to start a new entry?
    if in_block and not self.allow_simple_key:
        raise ScannerError(None, None,
                "sequence entries are not allowed here",
                self.reader.get_marker())

    # We may need to add BLOCK-SEQUENCE-START.
    if in_block and self.add_indent(self.reader.column):
        here = self.reader.get_marker()
        self.tokens.append(BlockSequenceStartToken(here, here))

    # Simple keys are allowed after '-' and ','.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add ENTRY; the indicator is a single character.
    begin = self.reader.get_marker()
    self.reader.forward()
    finish = self.reader.get_marker()
    self.tokens.append(EntryToken(begin, finish))
473 # Block context needs additional checks.
474 if not self.flow_level:
476 # Are we allowed to start a key (not necessarily a simple one)?
477 if not self.allow_simple_key:
478 raise ScannerError(None, None,
479 "mapping keys are
not allowed here
",
480 self.reader.get_marker())
482 # We may need to add BLOCK-MAPPING-START.
483 if self.add_indent(self.reader.column):
484 marker = self.reader.get_marker()
485 self.tokens.append(BlockMappingStartToken(marker, marker))
487 # Simple keys are allowed after '?' in the block context.
488 self.allow_simple_key = not self.flow_level
490 # Reset possible simple key on the current level.
491 self.remove_possible_simple_key()
494 start_marker = self.reader.get_marker()
495 self.reader.forward()
496 end_marker = self.reader.get_marker()
497 self.tokens.append(KeyToken(start_marker, end_marker))
499 def fetch_value(self):
501 # Do we determine a simple key?
502 if self.flow_level in self.possible_simple_keys:
505 key = self.possible_simple_keys[self.flow_level]
506 del self.possible_simple_keys[self.flow_level]
507 self.tokens.insert(key.token_number-self.tokens_taken,
508 KeyToken(key.marker, key.marker))
510 # If this key starts a new block mapping, we need to add
511 # BLOCK-MAPPING-START.
512 if not self.flow_level:
513 if self.add_indent(key.column):
514 self.tokens.insert(key.token_number-self.tokens_taken,
515 BlockMappingStartToken(key.marker, key.marker))
517 # There cannot be two simple keys one after another.
518 self.allow_simple_key = False
520 # It must be a part of a complex key.
523 # Block context needs additional checks.
524 # (Do we really need them? They will be caught by the parser
526 if not self.flow_level:
528 # We are allowed to start a complex value if and only if
529 # we can start a simple key.
530 if not self.allow_simple_key:
531 raise ScannerError(None, None,
532 "mapping values are
not allowed here
",
533 self.reader.get_marker())
535 # Simple keys are allowed after ':' in the block context.
536 self.allow_simple_key = not self.flow_level
538 # Reset possible simple key on the current level.
539 self.remove_possible_simple_key()
542 start_marker = self.reader.get_marker()
543 self.reader.forward()
544 end_marker = self.reader.get_marker()
545 self.tokens.append(ValueToken(start_marker, end_marker))
def fetch_alias(self):
    """Fetch an ALIAS token and queue it in ``self.tokens``."""
    # ALIAS could be a simple key, so record the position first.
    self.save_possible_simple_key()

    # No simple keys after ALIAS.
    self.allow_simple_key = False

    # Scan and add ALIAS.
    alias = self.scan_anchor(AliasToken)
    self.tokens.append(alias)
def fetch_anchor(self):
    """Fetch an ANCHOR token and queue it in ``self.tokens``."""
    # ANCHOR could start a simple key, so record the position first.
    self.save_possible_simple_key()

    # No simple keys after ANCHOR.
    self.allow_simple_key = False

    # Scan and add ANCHOR.
    anchor = self.scan_anchor(AnchorToken)
    self.tokens.append(anchor)
571 # TAG could start a simple key.
572 self.save_possible_simple_key()
574 # No simple keys after TAG.
575 self.allow_simple_key = False
578 self.tokens.append(self.scan_tag())
def fetch_literal(self):
    """Fetch a literal ('|') block scalar: `fetch_block_scalar`, not folded."""
    self.fetch_block_scalar(folded=False)
def fetch_folded(self):
    """Fetch a folded ('>') block scalar: `fetch_block_scalar`, folded."""
    self.fetch_block_scalar(folded=True)
def fetch_block_scalar(self, folded):
    """Fetch a block scalar token and queue it in ``self.tokens``.

    `folded` is passed through unchanged to `scan_block_scalar`.
    """
    # A simple key may follow a block scalar.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Scan and add SCALAR.
    scalar = self.scan_block_scalar(folded)
    self.tokens.append(scalar)
def fetch_single(self):
    """Fetch a single-quoted scalar: `fetch_flow_scalar` with double off."""
    self.fetch_flow_scalar(double=False)
def fetch_double(self):
    """Fetch a double-quoted scalar: `fetch_flow_scalar` with double on."""
    self.fetch_flow_scalar(double=True)
def fetch_flow_scalar(self, double):
    """Fetch a quoted (flow) scalar token and queue it in ``self.tokens``.

    `double` is passed through unchanged to `scan_flow_scalar`.
    """
    # A flow scalar could be a simple key, so record the position first.
    self.save_possible_simple_key()

    # No simple keys after flow scalars.
    self.allow_simple_key = False

    # Scan and add SCALAR.
    scalar = self.scan_flow_scalar(double)
    self.tokens.append(scalar)
def fetch_plain(self):
    """Fetch a plain scalar token and queue it in ``self.tokens``."""
    # A plain scalar could be a simple key, so record the position first.
    self.save_possible_simple_key()

    # No simple keys after plain scalars. The flag is cleared before the
    # scan because `scan_plain` may change `allow_simple_key` itself.
    self.allow_simple_key = False

    # Scan and add SCALAR.
    scalar = self.scan_plain()
    self.tokens.append(scalar)
629 def check_directive(self):
631 # DIRECTIVE: ^ '%' ...
632 # The '%' indicator is already checked.
633 if self.reader.column == 0:
636 def check_document_start(self):
638 # DOCUMENT-START: ^ '---' (' '|'\n')
639 if self.reader.column == 0:
640 prefix = self.reader.peek(4)
641 if prefix[:3] == u'---' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
644 def check_document_end(self):
646 # DOCUMENT-END: ^ '...' (' '|'\n')
647 if self.reader.column == 0:
648 prefix = self.reader.peek(4)
649 if prefix[:3] == u'...' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
652 def check_entry(self):
654 # ENTRY(flow context): ','
656 return self.reader.peek() == u','
658 # ENTRY(block context): '-' (' '|'\n')
660 prefix = self.reader.peek(2)
661 return prefix[0] == u'-' and prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
665 # KEY(flow context): '?'
669 # KEY(block context): '?' (' '|'\n')
671 prefix = self.reader.peek(2)
672 return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
674 def check_value(self):
676 # VALUE(flow context): ':'
680 # VALUE(block context): ':' (' '|'\n')
682 prefix = self.reader.peek(2)
683 return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
685 def check_plain(self):
690 def scan_to_next_token(self):
691 # We ignore spaces, line breaks and comments.
692 # If we find a line break in the block context, we set the flag
693 # `allow_simple_key` on.
696 while self.reader.peek() == u' ':
697 self.reader.forward()
698 if self.reader.peek() == u'#':
699 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
700 self.reader.forward()
701 if self.scan_line_break():
702 if not self.flow_level:
703 self.allow_simple_key = True
707 def scan_directive(self):
708 marker = self.reader.get_marker()
709 if self.reader.peek(5) == u'%YAML ':
710 token = YAMLDirectiveToken(1, 1, marker, marker)
711 elif self.reader.peek(4) == u'%TAG ':
712 token = TagDirectiveToken(marker, marker)
714 token = ReservedDirectiveToken('', marker, marker)
715 while self.reader.peek() not in u'\0\r\n':
716 self.reader.forward()
717 self.reader.forward()
def scan_anchor(self, TokenClass):
    """Skip over an anchor or alias and return a token spanning it.

    Characters are consumed until NUL, space, tab, CR, LF, ',' or ':' is
    peeked. The token is built as ``TokenClass('', start, end)``: the
    value is always the empty string here, only the markers are recorded.
    """
    reader = self.reader
    begin = reader.get_marker()
    while True:
        if reader.peek() in u'\0 \t\r\n,:':
            break
        reader.forward()
    return TokenClass('', begin, reader.get_marker())
728 start_marker = self.reader.get_marker()
729 while self.reader.peek() not in u'\0 \t\r\n':
730 self.reader.forward()
731 end_marker = self.reader.get_marker()
732 return TagToken('', start_marker, end_marker)
734 def scan_block_scalar(self, folded):
735 start_marker = self.reader.get_marker()
736 indent = self.indent+1
740 while self.reader.peek() and self.reader.peek() and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
741 self.reader.forward()
742 if self.reader.peek() != u'\0':
743 self.reader.forward()
745 while count < indent and self.reader.peek() == u' ':
746 self.reader.forward()
748 if count < indent and self.reader.peek() not in u'#\r\n\x85\u2028\u2029':
750 return ScalarToken('', False, start_marker, start_marker)
752 def scan_flow_scalar(self, double):
753 marker = self.reader.get_marker()
754 quote = self.reader.peek()
755 self.reader.forward()
756 while self.reader.peek() != quote:
757 if double and self.reader.peek() == u'\\':
758 self.reader.forward(2)
759 elif not double and self.reader.peek(3)[1:] == u'\'\'':
760 self.reader.forward(3)
762 self.reader.forward(1)
763 self.reader.forward(1)
764 return ScalarToken('', False, marker, marker)
766 def scan_plain(self):
767 indent = self.indent+1
771 marker = self.reader.get_marker()
773 while self.reader.peek() == u' ':
774 self.reader.forward()
776 while self.reader.peek() not in u'\0\r\n?:,[]{}#' \
777 or (not space and self.reader.peek() == '#') \
778 or (not self.flow_level and self.reader.peek() in '?,[]{}') \
779 or (not self.flow_level and self.reader.peek() == ':' and self.reader.peek(2)[1] not in u' \0\r\n'):
780 space = self.reader.peek() not in u' \t'
781 self.reader.forward()
782 self.allow_simple_key = False
783 if self.reader.peek() not in u'\r\n':
785 while self.reader.peek() in u'\r\n':
786 self.reader.forward()
787 if not self.flow_level:
788 self.allow_simple_key = True
790 while self.reader.peek() == u' ' and count < indent:
791 self.reader.forward()
796 return ScalarToken('', True, marker, marker)
798 def scan_line_break(self):
804 # '\u2028' : '\u2028'
807 ch = self.reader.peek()
808 if ch in u'\r\n\x85':
809 if self.reader.peek(2) == u'\r\n':
812 self.reader.forward()
814 elif ch in u'\u2028\u2029':
815 self.reader.forward()
def invalid_token(self):
    """Report an unrecognised token by calling `self.fail`."""
    message = "invalid token"
    self.fail(message)
824 # psyco.bind(Scanner)