# YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
# RESERVED-DIRECTIVE(name)
# DOCUMENT-START, DOCUMENT-END
# BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
# FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
# ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
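#
# An illustrative example: for the document
#     - foo: bar
# the scanner produces, roughly,
#     BLOCK-SEQUENCE-START, BLOCK-ENTRY, BLOCK-MAPPING-START,
#     KEY, SCALAR('foo', plain), VALUE, SCALAR('bar', plain),
#     BLOCK-END, BLOCK-END
# followed by the stream end token.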
__all__ = ['Scanner', 'ScannerError']

from error import YAMLError
from tokens import *
class ScannerError(YAMLError):
    # ScannerError: while reading a quoted string
    #     in '...', line 5, column 10:
    # got unknown quote character '?'
    #     in '...', line 5, column 15:

    def __init__(self, context=None, context_marker=None,
            problem=None, problem_marker=None):
        self.context = context
        self.context_marker = context_marker
        self.problem = problem
        self.problem_marker = problem_marker

    def __str__(self):
        lines = []
        for (place, marker) in [(self.context, self.context_marker),
                                (self.problem, self.problem_marker)]:
            if place is not None:
                lines.append(place)
            if marker is not None:
                lines.append(str(marker))
        return '\n'.join(lines)
class SimpleKey:
    # See the simple keys treatment below.

    def __init__(self, token_number, required, index, line, column, marker):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.marker = marker
class Scanner:

    def __init__(self, reader):
        """Initialize the scanner."""
        # The input stream. The Reader class does the dirty work of checking
        # for BOM and converting the input data to Unicode. It also adds NUL
        # to the end of the stream.
        #
        # Reader supports the following methods
        #   self.reader.peek(i=0)       # peek the next i-th character
        #   self.reader.prefix(l=1)     # peek the next l characters
        #   self.reader.forward(l=1)    # read the next l characters
        #                               # and move the pointer
        self.reader = reader

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Examples of simple keys:
        #   block simple key: value
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies whether a block
        # collection may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, marker)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
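        # For example, while scanning `foo: bar` the plain scalar `foo` is
        # first recorded here as a possible simple key; only when the
        # following ':' is reached does fetch_value() insert the KEY token
        # (and, if needed, BLOCK-MAPPING-START) retroactively at the recorded
        # token_number.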
    def check(self, *choices):
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek(self):
        # Return the next token, but do not delete it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]
    def get(self):
        # Return the next token and remove it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)

    def __iter__(self):
        # Iterate over the remaining tokens.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        while self.tokens:
            self.tokens_taken += 1
            yield self.tokens.pop(0)
            while self.need_more_tokens():
                self.fetch_more_tokens()
    def need_more_tokens(self):
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
    def fetch_more_tokens(self):

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch in u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.reader.get_marker())
    # Simple keys treatment.

    def next_possible_simple_key(self):
        # Return the number of the nearest possible simple key. Actually we
        # don't need to loop through the whole dictionary. We may replace it
        # with the following code:
        #   if not self.possible_simple_keys:
        #       return None
        #   return self.possible_simple_keys[
        #           min(self.possible_simple_keys.keys())].token_number
        min_token_number = None
        for level in self.possible_simple_keys:
            key = self.possible_simple_keys[level]
            if min_token_number is None or key.token_number < min_token_number:
                min_token_number = key.token_number
        return min_token_number
    def stale_possible_simple_keys(self):
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in self.possible_simple_keys.keys():
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line    \
                    or self.reader.index-key.index > 1024:
                if key.required:
                    raise ScannerError("while scanning a simple key", key.marker,
                            "could not find expected ':'", self.reader.get_marker())
                del self.possible_simple_keys[level]
    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.reader.column

        # A simple key is required only if it is the first token in the current
        # line. Therefore it is always allowed.
        assert self.allow_simple_key or not required

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            index = self.reader.index
            line = self.reader.line
            column = self.reader.column
            marker = self.reader.get_marker()
            key = SimpleKey(token_number, required,
                    index, line, column, marker)
            self.possible_simple_keys[self.flow_level] = key
    def remove_possible_simple_key(self):
        # Remove the saved possible key position at the current flow level.
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]

            # I don't think it's possible, but I could be wrong.
            assert not key.required
            #if key.required:
            #    raise ScannerError("while scanning a simple key", key.marker,
            #            "could not find expected ':'", self.reader.get_marker())
            del self.possible_simple_keys[self.flow_level]
    # Indentation functions.

    def unwind_indent(self, column):

        # In flow context, tokens should respect indentation.
        # Actually the condition should be `self.indent >= column` according to
        # the spec. But this condition will prohibit intuitively correct
        # constructions such as
        # key : {
        # }
        if self.flow_level and self.indent > column:
            raise ScannerError(None, None,
                    "invalid indentation or unclosed '[' or '{'",
                    self.reader.get_marker())

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            marker = self.reader.get_marker()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(marker, marker))
    def add_indent(self, column):
        # Check if we need to increase indentation.
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False
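    # An illustrative example: while scanning
    #   foo:
    #     bar: baz
    # add_indent(0) and then add_indent(2) each push the previous indentation
    # level and return True, so a BLOCK-MAPPING-START token is emitted for
    # both mappings; the matching unwind_indent() call at the end pops the
    # levels and emits the corresponding BLOCK-END tokens.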
    def fetch_stream_end(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Add STREAM-END.
        marker = self.reader.get_marker()
        self.tokens.append(StreamEndToken(marker, marker))

        # The reader is ended.
        self.done = True
    def fetch_directive(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
    def fetch_document_start(self):
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_marker = self.reader.get_marker()
        self.reader.forward(3)
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_sequence_start(self):
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_sequence_end(self):
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_entry(self):

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(FlowEntryToken(start_marker, end_marker))
    def fetch_block_entry(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockSequenceStartToken(marker, marker))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(BlockEntryToken(start_marker, end_marker))
    def fetch_key(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockMappingStartToken(marker, marker))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(KeyToken(start_marker, end_marker))
    def fetch_value(self):

        # Did we find a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.marker, key.marker))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.marker, key.marker))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_marker())

        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add VALUE.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(ValueToken(start_marker, end_marker))
    def fetch_alias(self):

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self):

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
    def fetch_tag(self):

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
    def fetch_literal(self):
        self.fetch_block_scalar(folded=False)

    def fetch_folded(self):
        self.fetch_block_scalar(folded=True)

    def fetch_block_scalar(self, folded):

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(folded))

    def fetch_single(self):
        self.fetch_flow_scalar(double=False)

    def fetch_double(self):
        self.fetch_flow_scalar(double=True)

    def fetch_flow_scalar(self, double):

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(double))

    def fetch_plain(self):

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
    def check_directive(self):

        # DIRECTIVE: ^ '%' ...
        # The '%' indicator is already checked.
        if self.reader.column == 0:
            return True

    def check_document_start(self):

        # DOCUMENT-START: ^ '---' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == u'---'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_document_end(self):

        # DOCUMENT-END: ^ '...' (' '|'\n')
        if self.reader.column == 0:
            prefix = self.reader.peek(4)
            if self.reader.prefix(3) == u'...'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_block_entry(self):

        # BLOCK-ENTRY: '-' (' '|'\n')
        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_key(self):

        # KEY(flow context): '?'
        if self.flow_level:
            return True

        # KEY(block context): '?' (' '|'\n')
        else:
            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_value(self):

        # VALUE(flow context): ':'
        if self.flow_level:
            return True

        # VALUE(block context): ':' (' '|'\n')
        else:
            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_plain(self):

        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.reader.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in u'?:')))
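    # Illustrative examples for check_plain(): in the block context `-foo`,
    # `:bar` and `?baz` may start plain scalars because the indicator is
    # followed by a non-space character, while `- foo` starts a block entry
    # instead.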
    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            while self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        # See the specification for details.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        name = self.scan_directive_name(start_marker)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        else:
            end_marker = self.reader.get_marker()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_marker)
        return DirectiveToken(name, value, start_marker, end_marker)
    def scan_directive_name(self, start_marker):
        # See the specification for details.
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_marker,
                    "expected directive name, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        return value
    def scan_yaml_directive_value(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        major = self.scan_yaml_directive_number(start_marker)
        if self.reader.peek() != '.':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit or '.', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        self.reader.forward()
        minor = self.scan_yaml_directive_number(start_marker)
        if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit or ' ', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        return (major, minor)
    def scan_yaml_directive_number(self, start_marker):
        # See the specification for details.
        ch = self.reader.peek()
        if not (u'0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        length = 0
        while u'0' <= self.reader.peek(length) <= u'9':
            length += 1
        value = int(self.reader.prefix(length))
        self.reader.forward(length)
        return value
    def scan_tag_directive_value(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        handle = self.scan_tag_directive_handle(start_marker)
        while self.reader.peek() == u' ':
            self.reader.forward()
        prefix = self.scan_tag_directive_prefix(start_marker)
        return (handle, prefix)
    def scan_tag_directive_handle(self, start_marker):
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_marker)
        if self.reader.peek() != u' ':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        return value
    def scan_tag_directive_prefix(self, start_marker):
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return value
    def scan_directive_ignored_line(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()
    def scan_anchor(self, TokenClass):
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_marker = self.reader.get_marker()
        indicator = self.reader.peek()
        if indicator == u'*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected anchor name, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        end_marker = self.reader.get_marker()
        return TokenClass(value, start_marker, end_marker)
    def scan_tag(self):
        # See the specification for details.
        start_marker = self.reader.get_marker()
        ch = self.reader.peek(1)
        if ch == u'<':
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_marker)
            if self.reader.peek() != u'>':
                raise ScannerError("while parsing a tag", start_marker,
                        "expected '>', but got %r" % self.reader.peek().encode('utf-8'),
                        self.reader.get_marker())
            self.reader.forward()
        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
            handle = None
            suffix = u'!'
            self.reader.forward()
        else:
            length = 1
            use_handle = False
            while ch not in u'\0 \r\n\x85\u2028\u2029':
                if ch == u'!':
                    use_handle = True
                    break
                length += 1
                ch = self.reader.peek(length)
            handle = None
            if use_handle:
                handle = self.scan_tag_handle('tag', start_marker)
            else:
                handle = u'!'
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = (handle, suffix)
        end_marker = self.reader.get_marker()
        return TagToken(value, start_marker, end_marker)
    def scan_block_scalar(self, folded):
        # See the specification for details.

        chunks = []
        start_marker = self.reader.get_marker()

        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_marker)
        self.scan_block_scalar_ignored_line(start_marker)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if folded and line_break == u'\n'   \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
    def scan_block_scalar_indicators(self, start_marker):
        # See the specification for details.
        chomping = None
        increment = None
        ch = self.reader.peek()
        if ch in u'+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.reader.forward()
            ch = self.reader.peek()
            if ch in u'0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_marker,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.reader.get_marker())
                self.reader.forward()
        elif ch in u'0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_marker,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.reader.get_marker())
            self.reader.forward()
            ch = self.reader.peek()
            if ch in u'+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_marker,
                    "expected chomping or indentation indicators, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        return chomping, increment
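    # An illustrative example of the header indicators scanned above: for
    # '|2+' this returns (chomping=True, increment=2); for '>-' it returns
    # (chomping=False, increment=None); for a bare '|' it returns
    # (None, None), i.e. clip chomping with auto-detected indentation.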
    def scan_block_scalar_ignored_line(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_marker,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()
    def scan_block_scalar_indentation(self):
        # See the specification for details.
        chunks = []
        max_indent = 0
        end_marker = self.reader.get_marker()
        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
            if self.reader.peek() != u' ':
                chunks.append(self.scan_line_break())
                end_marker = self.reader.get_marker()
            else:
                self.reader.forward()
                if self.reader.column > max_indent:
                    max_indent = self.reader.column
        return chunks, max_indent, end_marker
    def scan_block_scalar_breaks(self, indent):
        # See the specification for details.
        chunks = []
        end_marker = self.reader.get_marker()
        while self.reader.column < indent and self.reader.peek() == u' ':
            self.reader.forward()
        while self.reader.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
            end_marker = self.reader.get_marker()
            while self.reader.column < indent and self.reader.peek() == u' ':
                self.reader.forward()
        return chunks, end_marker
    def scan_flow_scalar(self, double):
        # See the specification for details.
        chunks = []
        start_marker = self.reader.get_marker()
        indent = self.indent+1
        if indent == 0:
            indent = 1
        quote = self.reader.peek()
        self.reader.forward()
        chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
        while self.reader.peek() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
        self.reader.forward()
        end_marker = self.reader.get_marker()
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)

    ESCAPE_REPLACEMENTS = {
    def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        while True:
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_marker,
                                    "expected escape sequence of %d hexadecimal numbers, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
            else:
                return chunks
    def scan_flow_scalar_spaces(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        length = 0
        while self.reader.peek(length) in u' \t':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch == u'\0':
            raise ScannerError("while scanning a quoted scalar", start_marker,
                    "found unexpected end of stream", self.reader.get_marker())
        elif ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        else:
            chunks.append(whitespaces)
        return chunks
    def scan_flow_scalar_breaks(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        while True:
            while self.reader.column < indent and self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.column < indent  \
                    and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                s = 's'
                if indent == 1:
                    s = ''
                raise ScannerError("while scanning a quoted scalar", start_marker,
                        "expected %d space%s indentation, but found %r"
                        % (indent, s, self.reader.peek().encode('utf-8')),
                        self.reader.get_marker())
            while self.reader.peek() in u' \t':
                self.reader.forward()
            if self.reader.peek() in u'\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks
    def scan_plain(self):
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        chunks = []
        start_marker = self.reader.get_marker()
        end_marker = start_marker
        indent = self.indent+1
        if indent == 0:
            indent = 1
        spaces = []
        while True:
            length = 0
            if self.reader.peek() == u'#':
                break
            while True:
                ch = self.reader.peek(length)
                if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
                        or (not self.flow_level and ch == u':' and
                            self.reader.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in u',:?[]{}'):
                    break
                length += 1
            if not length:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            end_marker = self.reader.get_marker()
            spaces = self.scan_plain_spaces(indent)
            if not spaces or self.reader.peek() == u'#' \
                    or self.reader.column < indent:
                break
        return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
    def scan_plain_spaces(self, indent):
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.reader.peek(length) in u' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            self.allow_simple_key = True
            breaks = []
            while self.reader.peek() in u' \r\n\x85\u2028\u2029':
                if self.reader.peek() == ' ':
                    self.reader.forward()
                else:
                    breaks.append(self.scan_line_break())
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
    def scan_tag_handle(self, name, start_marker):
        # See the specification for details.
        # For some strange reason, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        if self.reader.peek() != u'!':
            raise ScannerError("while scanning a %s" % name, start_marker,
                    "expected '!', but found %r" % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        length = 1
        ch = self.reader.peek(length)
        if ch != u' ':
            while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                    or ch in u'-_':
                length += 1
                ch = self.reader.peek(length)
            if ch != u'!':
                self.reader.forward(length)
                raise ScannerError("while scanning a %s" % name, start_marker,
                        "expected '!', but found %r" % ch.encode('utf-8'),
                        self.reader.get_marker())
            length += 1
        value = self.reader.prefix(length)
        self.reader.forward(length)
        return value
    def scan_tag_uri(self, name, start_marker):
        # See the specification for details.
        # Note: we do not check if URI is well-formed.
        chunks = []
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
            if ch == u'%':
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_marker))
            else:
                length += 1
            ch = self.reader.peek(length)
        if length:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
        if not chunks:
            raise ScannerError("while parsing a %s" % name, start_marker,
                    "expected URI, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return u''.join(chunks)
    def scan_uri_escapes(self, name, start_marker):
        # See the specification for details.
        bytes = []
        marker = self.reader.get_marker()
        while self.reader.peek() == u'%':
            self.reader.forward()
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_marker,
                            "expected URI escape sequence of 2 hexadecimal numbers, but found %r" %
                                (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
        return value
    def scan_line_break(self):
        # Transforms:
        #   '\r\n'      :   '\n'
        #   '\r'        :   '\n'
        #   '\n'        :   '\n'
        #   '\x85'      :   '\n'
        #   '\u2028'    :   '\u2028'
        #   '\u2029'    :   '\u2029'
        #   default     :   ''
        ch = self.reader.peek()
        if ch in u'\r\n\x85':
            if self.reader.prefix(2) == u'\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return u'\n'
        elif ch in u'\u2028\u2029':
            self.reader.forward()
            return ch
        return u''
#try:
#    import psyco
#    psyco.bind(Scanner)
#except ImportError:
#    pass
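# A minimal usage sketch (illustrative only; it assumes the companion Reader
# class from the sibling `reader` module accepts the input data directly, and
# that StreamEndToken comes from `tokens`):
#
#   from reader import Reader
#
#   scanner = Scanner(Reader(u"- foo: bar\n"))
#   while not scanner.check(StreamEndToken):
#       print scanner.get()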