2 # Scanner produces tokens of the following types:
5 # DIRECTIVE(name, value)
22 # SCALAR(value, plain, style)
24 # Read comments in the Scanner code for more details.
27 __all__
= ['Scanner', 'ScannerError']
29 from error
import MarkedYAMLError
class ScannerError(MarkedYAMLError):
    """Error raised by the Scanner; carries context/problem marks."""
    pass
class SimpleKey(object):
    """Record of a potential simple key.

    See the simple keys treatment notes in Scanner below.  Stores the
    token number of the candidate KEY token, whether the key is required
    at this position, and the stream position (index, line, column, mark)
    where it starts.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.mark = mark
46 class Scanner(object):
49 """Initialize the scanner."""
50 # It is assumed that Scanner and Reader will have a common descendant.
51 # Reader do the dirty work of checking for BOM and converting the
52 # input data to Unicode. It also adds NUL to the end.
54 # Reader supports the following methods
55 # self.peek(i=0) # peek the next i-th character
56 # self.prefix(l=1) # peek the next l characters
57 # self.forward(l=1) # read the next l characters and move the pointer.
59 # Had we reached the end of the stream?
62 # The number of unclosed '{' and '['. `flow_level == 0` means block
66 # List of processed tokens that are not yet emitted.
69 # Add the STREAM-START token.
70 self
.fetch_stream_start()
72 # Number of tokens that were emitted through the `get_token` method.
75 # The current indentation level.
78 # Past indentation levels.
81 # Variables related to simple keys treatment.
83 # A simple key is a key that is not denoted by the '?' indicator.
84 # Example of simple keys:
86 # block simple key: value
88 # : { flow simple key: value }
89 # We emit the KEY token before all keys, so when we find a potential
90 # simple key, we try to locate the corresponding ':' indicator.
91 # Simple keys should be limited to a single line and 1024 characters.
93 # Can a simple key start at the current position? A simple key may
95 # - at the beginning of the line, not counting indentation spaces
97 # - after '{', '[', ',' (in the flow context),
98 # - after '?', ':', '-' (in the block context).
99 # In the block context, this flag also signifies if a block collection
100 # may start at the current position.
101 self
.allow_simple_key
= True
103 # Keep track of possible simple keys. This is a dictionary. The key
104 # is `flow_level`; there can be no more that one possible simple key
105 # for each level. The value is a SimpleKey record:
106 # (token_number, required, index, line, column, mark)
107 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
108 # '[', or '{' tokens.
109 self
.possible_simple_keys
= {}
113 def check_token(self
, *choices
):
114 # Check if the next token is one of the given types.
115 while self
.need_more_tokens():
116 self
.fetch_more_tokens()
120 for choice
in choices
:
121 if isinstance(self
.tokens
[0], choice
):
125 def peek_token(self
):
126 # Return the next token, but do not delete if from the queue.
127 while self
.need_more_tokens():
128 self
.fetch_more_tokens()
130 return self
.tokens
[0]
133 # Return the next token.
134 while self
.need_more_tokens():
135 self
.fetch_more_tokens()
137 self
.tokens_taken
+= 1
138 return self
.tokens
.pop(0)
142 def need_more_tokens(self
):
147 # The current token may be a potential simple key, so we
148 # need to look further.
149 self
.stale_possible_simple_keys()
150 if self
.next_possible_simple_key() == self
.tokens_taken
:
153 def fetch_more_tokens(self
):
155 # Eat whitespaces and comments until we reach the next token.
156 self
.scan_to_next_token()
158 # Remove obsolete possible simple keys.
159 self
.stale_possible_simple_keys()
161 # Compare the current indentation and column. It may add some tokens
162 # and decrease the current indentation level.
163 self
.unwind_indent(self
.column
)
165 # Peek the next character.
168 # Is it the end of stream?
170 return self
.fetch_stream_end()
173 if ch
== u
'%' and self
.check_directive():
174 return self
.fetch_directive()
176 # Is it the document start?
177 if ch
== u
'-' and self
.check_document_start():
178 return self
.fetch_document_start()
180 # Is it the document end?
181 if ch
== u
'.' and self
.check_document_end():
182 return self
.fetch_document_end()
184 # TODO: support for BOM within a stream.
186 # return self.fetch_bom() <-- issue BOMToken
188 # Note: the order of the following checks is NOT significant.
190 # Is it the flow sequence start indicator?
192 return self
.fetch_flow_sequence_start()
194 # Is it the flow mapping start indicator?
196 return self
.fetch_flow_mapping_start()
198 # Is it the flow sequence end indicator?
200 return self
.fetch_flow_sequence_end()
202 # Is it the flow mapping end indicator?
204 return self
.fetch_flow_mapping_end()
206 # Is it the flow entry indicator?
208 return self
.fetch_flow_entry()
210 # Is it the block entry indicator?
211 if ch
== u
'-' and self
.check_block_entry():
212 return self
.fetch_block_entry()
214 # Is it the key indicator?
215 if ch
== u
'?' and self
.check_key():
216 return self
.fetch_key()
218 # Is it the value indicator?
219 if ch
== u
':' and self
.check_value():
220 return self
.fetch_value()
224 return self
.fetch_alias()
228 return self
.fetch_anchor()
232 return self
.fetch_tag()
234 # Is it a literal scalar?
235 if ch
== u
'|' and not self
.flow_level
:
236 return self
.fetch_literal()
238 # Is it a folded scalar?
239 if ch
== u
'>' and not self
.flow_level
:
240 return self
.fetch_folded()
242 # Is it a single quoted scalar?
244 return self
.fetch_single()
246 # Is it a double quoted scalar?
248 return self
.fetch_double()
250 # It must be a plain scalar then.
251 if self
.check_plain():
252 return self
.fetch_plain()
254 # No? It's an error. Let's produce a nice error message.
255 raise ScannerError("while scanning for the next token", None,
256 "found character %r that cannot start any token"
257 % ch
.encode('utf-8'), self
.get_mark())
259 # Simple keys treatment.
261 def next_possible_simple_key(self
):
262 # Return the number of the nearest possible simple key. Actually we
263 # don't need to loop through the whole dictionary. We may replace it
264 # with the following code:
265 # if not self.possible_simple_keys:
267 # return self.possible_simple_keys[
268 # min(self.possible_simple_keys.keys())].token_number
269 min_token_number
= None
270 for level
in self
.possible_simple_keys
:
271 key
= self
.possible_simple_keys
[level
]
272 if min_token_number
is None or key
.token_number
< min_token_number
:
273 min_token_number
= key
.token_number
274 return min_token_number
276 def stale_possible_simple_keys(self
):
277 # Remove entries that are no longer possible simple keys. According to
278 # the YAML specification, simple keys
279 # - should be limited to a single line,
280 # - should be no longer than 1024 characters.
281 # Disabling this procedure will allow simple keys of any length and
282 # height (may cause problems if indentation is broken though).
283 for level
in self
.possible_simple_keys
.keys():
284 key
= self
.possible_simple_keys
[level
]
285 if key
.line
!= self
.line \
286 or self
.index
-key
.index
> 1024:
288 raise ScannerError("while scanning a simple key", key
.mark
,
289 "could not found expected ':'", self
.get_mark())
290 del self
.possible_simple_keys
[level
]
292 def save_possible_simple_key(self
):
293 # The next token may start a simple key. We check if it's possible
294 # and save its position. This function is called for
295 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
297 # Check if a simple key is required at the current position.
298 required
= not self
.flow_level
and self
.indent
== self
.column
300 # A simple key is required only if it is the first token in the current
301 # line. Therefore it is always allowed.
302 assert self
.allow_simple_key
or not required
304 # The next token might be a simple key. Let's save it's number and
306 if self
.allow_simple_key
:
307 self
.remove_possible_simple_key()
308 token_number
= self
.tokens_taken
+len(self
.tokens
)
309 key
= SimpleKey(token_number
, required
,
310 self
.index
, self
.line
, self
.column
, self
.get_mark())
311 self
.possible_simple_keys
[self
.flow_level
] = key
313 def remove_possible_simple_key(self
):
314 # Remove the saved possible key position at the current flow level.
315 if self
.flow_level
in self
.possible_simple_keys
:
316 key
= self
.possible_simple_keys
[self
.flow_level
]
319 raise ScannerError("while scanning a simple key", key
.mark
,
320 "could not found expected ':'", self
.get_mark())
322 del self
.possible_simple_keys
[self
.flow_level
]
324 # Indentation functions.
326 def unwind_indent(self
, column
):
328 ## In flow context, tokens should respect indentation.
329 ## Actually the condition should be `self.indent >= column` according to
330 ## the spec. But this condition will prohibit intuitively correct
331 ## constructions such as
334 #if self.flow_level and self.indent > column:
335 # raise ScannerError(None, None,
336 # "invalid intendation or unclosed '[' or '{'",
339 # In the flow context, indentation is ignored. We make the scanner less
340 # restrictive then specification requires.
344 # In block context, we may need to issue the BLOCK-END tokens.
345 while self
.indent
> column
:
346 mark
= self
.get_mark()
347 self
.indent
= self
.indents
.pop()
348 self
.tokens
.append(BlockEndToken(mark
, mark
))
350 def add_indent(self
, column
):
351 # Check if we need to increase indentation.
352 if self
.indent
< column
:
353 self
.indents
.append(self
.indent
)
360 def fetch_stream_start(self
):
361 # We always add STREAM-START as the first token and STREAM-END as the
365 mark
= self
.get_mark()
368 self
.tokens
.append(StreamStartToken(mark
, mark
,
369 encoding
=self
.encoding
))
372 def fetch_stream_end(self
):
374 # Set the current intendation to -1.
375 self
.unwind_indent(-1)
377 # Reset everything (not really needed).
378 self
.allow_simple_key
= False
379 self
.possible_simple_keys
= {}
382 mark
= self
.get_mark()
385 self
.tokens
.append(StreamEndToken(mark
, mark
))
387 # The steam is finished.
390 def fetch_directive(self
):
392 # Set the current intendation to -1.
393 self
.unwind_indent(-1)
396 self
.remove_possible_simple_key()
397 self
.allow_simple_key
= False
399 # Scan and add DIRECTIVE.
400 self
.tokens
.append(self
.scan_directive())
402 def fetch_document_start(self
):
403 self
.fetch_document_indicator(DocumentStartToken
)
405 def fetch_document_end(self
):
406 self
.fetch_document_indicator(DocumentEndToken
)
408 def fetch_document_indicator(self
, TokenClass
):
410 # Set the current intendation to -1.
411 self
.unwind_indent(-1)
413 # Reset simple keys. Note that there could not be a block collection
415 self
.remove_possible_simple_key()
416 self
.allow_simple_key
= False
418 # Add DOCUMENT-START or DOCUMENT-END.
419 start_mark
= self
.get_mark()
421 end_mark
= self
.get_mark()
422 self
.tokens
.append(TokenClass(start_mark
, end_mark
))
424 def fetch_flow_sequence_start(self
):
425 self
.fetch_flow_collection_start(FlowSequenceStartToken
)
427 def fetch_flow_mapping_start(self
):
428 self
.fetch_flow_collection_start(FlowMappingStartToken
)
430 def fetch_flow_collection_start(self
, TokenClass
):
432 # '[' and '{' may start a simple key.
433 self
.save_possible_simple_key()
435 # Increase the flow level.
438 # Simple keys are allowed after '[' and '{'.
439 self
.allow_simple_key
= True
441 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
442 start_mark
= self
.get_mark()
444 end_mark
= self
.get_mark()
445 self
.tokens
.append(TokenClass(start_mark
, end_mark
))
447 def fetch_flow_sequence_end(self
):
448 self
.fetch_flow_collection_end(FlowSequenceEndToken
)
450 def fetch_flow_mapping_end(self
):
451 self
.fetch_flow_collection_end(FlowMappingEndToken
)
453 def fetch_flow_collection_end(self
, TokenClass
):
455 # Reset possible simple key on the current level.
456 self
.remove_possible_simple_key()
458 # Decrease the flow level.
461 # No simple keys after ']' or '}'.
462 self
.allow_simple_key
= False
464 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
465 start_mark
= self
.get_mark()
467 end_mark
= self
.get_mark()
468 self
.tokens
.append(TokenClass(start_mark
, end_mark
))
470 def fetch_flow_entry(self
):
472 # Simple keys are allowed after ','.
473 self
.allow_simple_key
= True
475 # Reset possible simple key on the current level.
476 self
.remove_possible_simple_key()
479 start_mark
= self
.get_mark()
481 end_mark
= self
.get_mark()
482 self
.tokens
.append(FlowEntryToken(start_mark
, end_mark
))
484 def fetch_block_entry(self
):
486 # Block context needs additional checks.
487 if not self
.flow_level
:
489 # Are we allowed to start a new entry?
490 if not self
.allow_simple_key
:
491 raise ScannerError(None, None,
492 "sequence entries are not allowed here",
495 # We may need to add BLOCK-SEQUENCE-START.
496 if self
.add_indent(self
.column
):
497 mark
= self
.get_mark()
498 self
.tokens
.append(BlockSequenceStartToken(mark
, mark
))
500 # It's an error for the block entry to occur in the flow context,
501 # but we let the parser detect this.
505 # Simple keys are allowed after '-'.
506 self
.allow_simple_key
= True
508 # Reset possible simple key on the current level.
509 self
.remove_possible_simple_key()
512 start_mark
= self
.get_mark()
514 end_mark
= self
.get_mark()
515 self
.tokens
.append(BlockEntryToken(start_mark
, end_mark
))
519 # Block context needs additional checks.
520 if not self
.flow_level
:
522 # Are we allowed to start a key (not nessesary a simple)?
523 if not self
.allow_simple_key
:
524 raise ScannerError(None, None,
525 "mapping keys are not allowed here",
528 # We may need to add BLOCK-MAPPING-START.
529 if self
.add_indent(self
.column
):
530 mark
= self
.get_mark()
531 self
.tokens
.append(BlockMappingStartToken(mark
, mark
))
533 # Simple keys are allowed after '?' in the block context.
534 self
.allow_simple_key
= not self
.flow_level
536 # Reset possible simple key on the current level.
537 self
.remove_possible_simple_key()
540 start_mark
= self
.get_mark()
542 end_mark
= self
.get_mark()
543 self
.tokens
.append(KeyToken(start_mark
, end_mark
))
545 def fetch_value(self
):
547 # Do we determine a simple key?
548 if self
.flow_level
in self
.possible_simple_keys
:
551 key
= self
.possible_simple_keys
[self
.flow_level
]
552 del self
.possible_simple_keys
[self
.flow_level
]
553 self
.tokens
.insert(key
.token_number
-self
.tokens_taken
,
554 KeyToken(key
.mark
, key
.mark
))
556 # If this key starts a new block mapping, we need to add
557 # BLOCK-MAPPING-START.
558 if not self
.flow_level
:
559 if self
.add_indent(key
.column
):
560 self
.tokens
.insert(key
.token_number
-self
.tokens_taken
,
561 BlockMappingStartToken(key
.mark
, key
.mark
))
563 # There cannot be two simple keys one after another.
564 self
.allow_simple_key
= False
566 # It must be a part of a complex key.
569 # Block context needs additional checks.
570 # (Do we really need them? They will be catched by the parser
572 if not self
.flow_level
:
574 # We are allowed to start a complex value if and only if
575 # we can start a simple key.
576 if not self
.allow_simple_key
:
577 raise ScannerError(None, None,
578 "mapping values are not allowed here",
581 # If this value starts a new block mapping, we need to add
582 # BLOCK-MAPPING-START. It will be detected as an error later by
584 if not self
.flow_level
:
585 if self
.add_indent(self
.column
):
586 mark
= self
.get_mark()
587 self
.tokens
.append(BlockMappingStartToken(mark
, mark
))
589 # Simple keys are allowed after ':' in the block context.
590 self
.allow_simple_key
= not self
.flow_level
592 # Reset possible simple key on the current level.
593 self
.remove_possible_simple_key()
596 start_mark
= self
.get_mark()
598 end_mark
= self
.get_mark()
599 self
.tokens
.append(ValueToken(start_mark
, end_mark
))
601 def fetch_alias(self
):
603 # ALIAS could be a simple key.
604 self
.save_possible_simple_key()
606 # No simple keys after ALIAS.
607 self
.allow_simple_key
= False
609 # Scan and add ALIAS.
610 self
.tokens
.append(self
.scan_anchor(AliasToken
))
612 def fetch_anchor(self
):
614 # ANCHOR could start a simple key.
615 self
.save_possible_simple_key()
617 # No simple keys after ANCHOR.
618 self
.allow_simple_key
= False
620 # Scan and add ANCHOR.
621 self
.tokens
.append(self
.scan_anchor(AnchorToken
))
625 # TAG could start a simple key.
626 self
.save_possible_simple_key()
628 # No simple keys after TAG.
629 self
.allow_simple_key
= False
632 self
.tokens
.append(self
.scan_tag())
634 def fetch_literal(self
):
635 self
.fetch_block_scalar(style
='|')
637 def fetch_folded(self
):
638 self
.fetch_block_scalar(style
='>')
640 def fetch_block_scalar(self
, style
):
642 # A simple key may follow a block scalar.
643 self
.allow_simple_key
= True
645 # Reset possible simple key on the current level.
646 self
.remove_possible_simple_key()
648 # Scan and add SCALAR.
649 self
.tokens
.append(self
.scan_block_scalar(style
))
651 def fetch_single(self
):
652 self
.fetch_flow_scalar(style
='\'')
654 def fetch_double(self
):
655 self
.fetch_flow_scalar(style
='"')
657 def fetch_flow_scalar(self
, style
):
659 # A flow scalar could be a simple key.
660 self
.save_possible_simple_key()
662 # No simple keys after flow scalars.
663 self
.allow_simple_key
= False
665 # Scan and add SCALAR.
666 self
.tokens
.append(self
.scan_flow_scalar(style
))
668 def fetch_plain(self
):
670 # A plain scalar could be a simple key.
671 self
.save_possible_simple_key()
673 # No simple keys after plain scalars. But note that `scan_plain` will
674 # change this flag if the scan is finished at the beginning of the
676 self
.allow_simple_key
= False
678 # Scan and add SCALAR. May change `allow_simple_key`.
679 self
.tokens
.append(self
.scan_plain())
683 def check_directive(self
):
685 # DIRECTIVE: ^ '%' ...
686 # The '%' indicator is already checked.
690 def check_document_start(self
):
692 # DOCUMENT-START: ^ '---' (' '|'\n')
694 if self
.prefix(3) == u
'---' \
695 and self
.peek(3) in u
'\0 \t\r\n\x85\u2028\u2029':
698 def check_document_end(self
):
700 # DOCUMENT-END: ^ '...' (' '|'\n')
702 if self
.prefix(3) == u
'...' \
703 and self
.peek(3) in u
'\0 \t\r\n\x85\u2028\u2029':
706 def check_block_entry(self
):
708 # BLOCK-ENTRY: '-' (' '|'\n')
709 return self
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
713 # KEY(flow context): '?'
717 # KEY(block context): '?' (' '|'\n')
719 return self
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
721 def check_value(self
):
723 # VALUE(flow context): ':'
727 # VALUE(block context): ':' (' '|'\n')
729 return self
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
731 def check_plain(self
):
733 # A plain scalar may start with any non-space character except:
734 # '-', '?', ':', ',', '[', ']', '{', '}',
735 # '#', '&', '*', '!', '|', '>', '\'', '\"',
738 # It may also start with
740 # if it is followed by a non-space character.
742 # Note that we limit the last rule to the block context (except the
743 # '-' character) because we want the flow context to be space
746 return ch
not in u
'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
747 or (self
.peek(1) not in u
'\0 \t\r\n\x85\u2028\u2029'
748 and (ch
== u
'-' or (not self
.flow_level
and ch
in u
'?:')))
752 def scan_to_next_token(self
):
753 # We ignore spaces, line breaks and comments.
754 # If we find a line break in the block context, we set the flag
755 # `allow_simple_key` on.
756 # The byte order mark is stripped if it's the first character in the
757 # stream. We do not yet support BOM inside the stream as the
758 # specification requires. Any such mark will be considered as a part
761 # TODO: We need to make tab handling rules more sane. A good rule is
762 # Tabs cannot precede tokens
763 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
764 # KEY(block), VALUE(block), BLOCK-ENTRY
765 # So the checking code is
767 # self.allow_simple_keys = False
768 # We also need to add the check for `allow_simple_keys == True` to
769 # `unwind_indent` before issuing BLOCK-END.
770 # Scanners for block, flow, and plain scalars need to be modified.
772 if self
.index
== 0 and self
.peek() == u
'\uFEFF':
776 while self
.peek() == u
' ':
778 if self
.peek() == u
'#':
779 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
781 if self
.scan_line_break():
782 if not self
.flow_level
:
783 self
.allow_simple_key
= True
787 def scan_directive(self
):
788 # See the specification for details.
789 start_mark
= self
.get_mark()
791 name
= self
.scan_directive_name(start_mark
)
794 value
= self
.scan_yaml_directive_value(start_mark
)
795 end_mark
= self
.get_mark()
797 value
= self
.scan_tag_directive_value(start_mark
)
798 end_mark
= self
.get_mark()
800 end_mark
= self
.get_mark()
801 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
803 self
.scan_directive_ignored_line(start_mark
)
804 return DirectiveToken(name
, value
, start_mark
, end_mark
)
806 def scan_directive_name(self
, start_mark
):
807 # See the specification for details.
809 ch
= self
.peek(length
)
810 while u
'0' <= ch
<= u
'9' or u
'A' <= ch
<= 'Z' or u
'a' <= ch
<= 'z' \
813 ch
= self
.peek(length
)
815 raise ScannerError("while scanning a directive", start_mark
,
816 "expected alphabetic or numeric character, but found %r"
817 % ch
.encode('utf-8'), self
.get_mark())
818 value
= self
.prefix(length
)
821 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
822 raise ScannerError("while scanning a directive", start_mark
,
823 "expected alphabetic or numeric character, but found %r"
824 % ch
.encode('utf-8'), self
.get_mark())
827 def scan_yaml_directive_value(self
, start_mark
):
828 # See the specification for details.
829 while self
.peek() == u
' ':
831 major
= self
.scan_yaml_directive_number(start_mark
)
832 if self
.peek() != '.':
833 raise ScannerError("while scanning a directive", start_mark
,
834 "expected a digit or '.', but found %r"
835 % self
.peek().encode('utf-8'),
838 minor
= self
.scan_yaml_directive_number(start_mark
)
839 if self
.peek() not in u
'\0 \r\n\x85\u2028\u2029':
840 raise ScannerError("while scanning a directive", start_mark
,
841 "expected a digit or ' ', but found %r"
842 % self
.peek().encode('utf-8'),
844 return (major
, minor
)
846 def scan_yaml_directive_number(self
, start_mark
):
847 # See the specification for details.
849 if not (u
'0' <= ch
<= '9'):
850 raise ScannerError("while scanning a directive", start_mark
,
851 "expected a digit, but found %r" % ch
.encode('utf-8'),
854 while u
'0' <= self
.peek(length
) <= u
'9':
856 value
= int(self
.prefix(length
))
860 def scan_tag_directive_value(self
, start_mark
):
861 # See the specification for details.
862 while self
.peek() == u
' ':
864 handle
= self
.scan_tag_directive_handle(start_mark
)
865 while self
.peek() == u
' ':
867 prefix
= self
.scan_tag_directive_prefix(start_mark
)
868 return (handle
, prefix
)
870 def scan_tag_directive_handle(self
, start_mark
):
871 # See the specification for details.
872 value
= self
.scan_tag_handle('directive', start_mark
)
875 raise ScannerError("while scanning a directive", start_mark
,
876 "expected ' ', but found %r" % ch
.encode('utf-8'),
880 def scan_tag_directive_prefix(self
, start_mark
):
881 # See the specification for details.
882 value
= self
.scan_tag_uri('directive', start_mark
)
884 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
885 raise ScannerError("while scanning a directive", start_mark
,
886 "expected ' ', but found %r" % ch
.encode('utf-8'),
890 def scan_directive_ignored_line(self
, start_mark
):
891 # See the specification for details.
892 while self
.peek() == u
' ':
894 if self
.peek() == u
'#':
895 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
898 if ch
not in u
'\0\r\n\x85\u2028\u2029':
899 raise ScannerError("while scanning a directive", start_mark
,
900 "expected a comment or a line break, but found %r"
901 % ch
.encode('utf-8'), self
.get_mark())
902 self
.scan_line_break()
904 def scan_anchor(self
, TokenClass
):
905 # The specification does not restrict characters for anchors and
906 # aliases. This may lead to problems, for instance, the document:
908 # can be interpteted in two ways, as
911 # [ *alias , "value" ]
912 # Therefore we restrict aliases to numbers and ASCII letters.
913 start_mark
= self
.get_mark()
914 indicator
= self
.peek()
921 ch
= self
.peek(length
)
922 while u
'0' <= ch
<= u
'9' or u
'A' <= ch
<= 'Z' or u
'a' <= ch
<= 'z' \
925 ch
= self
.peek(length
)
927 raise ScannerError("while scanning an %s" % name
, start_mark
,
928 "expected alphabetic or numeric character, but found %r"
929 % ch
.encode('utf-8'), self
.get_mark())
930 value
= self
.prefix(length
)
933 if ch
not in u
'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
934 raise ScannerError("while scanning an %s" % name
, start_mark
,
935 "expected alphabetic or numeric character, but found %r"
936 % ch
.encode('utf-8'), self
.get_mark())
937 end_mark
= self
.get_mark()
938 return TokenClass(value
, start_mark
, end_mark
)
941 # See the specification for details.
942 start_mark
= self
.get_mark()
947 suffix
= self
.scan_tag_uri('tag', start_mark
)
948 if self
.peek() != u
'>':
949 raise ScannerError("while parsing a tag", start_mark
,
950 "expected '>', but found %r" % self
.peek().encode('utf-8'),
953 elif ch
in u
'\0 \t\r\n\x85\u2028\u2029':
960 while ch
not in u
'\0 \r\n\x85\u2028\u2029':
965 ch
= self
.peek(length
)
968 handle
= self
.scan_tag_handle('tag', start_mark
)
972 suffix
= self
.scan_tag_uri('tag', start_mark
)
974 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
975 raise ScannerError("while scanning a tag", start_mark
,
976 "expected ' ', but found %r" % ch
.encode('utf-8'),
978 value
= (handle
, suffix
)
979 end_mark
= self
.get_mark()
980 return TagToken(value
, start_mark
, end_mark
)
982 def scan_block_scalar(self
, style
):
983 # See the specification for details.
991 start_mark
= self
.get_mark()
995 chomping
, increment
= self
.scan_block_scalar_indicators(start_mark
)
996 self
.scan_block_scalar_ignored_line(start_mark
)
998 # Determine the indentation level and go to the first non-empty line.
999 min_indent
= self
.indent
+1
1002 if increment
is None:
1003 breaks
, max_indent
, end_mark
= self
.scan_block_scalar_indentation()
1004 indent
= max(min_indent
, max_indent
)
1006 indent
= min_indent
+increment
-1
1007 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1010 # Scan the inner part of the block scalar.
1011 while self
.column
== indent
and self
.peek() != u
'\0':
1012 chunks
.extend(breaks
)
1013 leading_non_space
= self
.peek() not in u
' \t'
1015 while self
.peek(length
) not in u
'\0\r\n\x85\u2028\u2029':
1017 chunks
.append(self
.prefix(length
))
1018 self
.forward(length
)
1019 line_break
= self
.scan_line_break()
1020 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1021 if self
.column
== indent
and self
.peek() != u
'\0':
1023 # Unfortunately, folding rules are ambiguous.
1025 # This is the folding according to the specification:
1027 if folded
and line_break
== u
'\n' \
1028 and leading_non_space
and self
.peek() not in u
' \t':
1032 chunks
.append(line_break
)
1034 # This is Clark Evans's interpretation (also in the spec
1037 #if folded and line_break == u'\n':
1039 # if self.peek() not in ' \t':
1040 # chunks.append(u' ')
1042 # chunks.append(line_break)
1044 # chunks.append(line_break)
1049 if chomping
is not False:
1050 chunks
.append(line_break
)
1051 if chomping
is True:
1052 chunks
.extend(breaks
)
1055 return ScalarToken(u
''.join(chunks
), False, start_mark
, end_mark
,
1058 def scan_block_scalar_indicators(self
, start_mark
):
1059 # See the specification for details.
1070 if ch
in u
'0123456789':
1073 raise ScannerError("while scanning a block scalar", start_mark
,
1074 "expected indentation indicator in the range 1-9, but found 0",
1077 elif ch
in u
'0123456789':
1080 raise ScannerError("while scanning a block scalar", start_mark
,
1081 "expected indentation indicator in the range 1-9, but found 0",
1092 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
1093 raise ScannerError("while scanning a block scalar", start_mark
,
1094 "expected chomping or indentation indicators, but found %r"
1095 % ch
.encode('utf-8'), self
.get_mark())
1096 return chomping
, increment
1098 def scan_block_scalar_ignored_line(self
, start_mark
):
1099 # See the specification for details.
1100 while self
.peek() == u
' ':
1102 if self
.peek() == u
'#':
1103 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
1106 if ch
not in u
'\0\r\n\x85\u2028\u2029':
1107 raise ScannerError("while scanning a block scalar", start_mark
,
1108 "expected a comment or a line break, but found %r"
1109 % ch
.encode('utf-8'), self
.get_mark())
1110 self
.scan_line_break()
1112 def scan_block_scalar_indentation(self
):
1113 # See the specification for details.
1116 end_mark
= self
.get_mark()
1117 while self
.peek() in u
' \r\n\x85\u2028\u2029':
1118 if self
.peek() != u
' ':
1119 chunks
.append(self
.scan_line_break())
1120 end_mark
= self
.get_mark()
1123 if self
.column
> max_indent
:
1124 max_indent
= self
.column
1125 return chunks
, max_indent
, end_mark
1127 def scan_block_scalar_breaks(self
, indent
):
1128 # See the specification for details.
1130 end_mark
= self
.get_mark()
1131 while self
.column
< indent
and self
.peek() == u
' ':
1133 while self
.peek() in u
'\r\n\x85\u2028\u2029':
1134 chunks
.append(self
.scan_line_break())
1135 end_mark
= self
.get_mark()
1136 while self
.column
< indent
and self
.peek() == u
' ':
1138 return chunks
, end_mark
1140 def scan_flow_scalar(self
, style
):
1141 # See the specification for details.
1142 # Note that we loose indentation rules for quoted scalars. Quoted
1143 # scalars don't need to adhere indentation because " and ' clearly
1144 # mark the beginning and the end of them. Therefore we are less
1145 # restrictive then the specification requires. We only need to check
1146 # that document separators are not included in scalars.
1152 start_mark
= self
.get_mark()
1155 chunks
.extend(self
.scan_flow_scalar_non_spaces(double
, start_mark
))
1156 while self
.peek() != quote
:
1157 chunks
.extend(self
.scan_flow_scalar_spaces(double
, start_mark
))
1158 chunks
.extend(self
.scan_flow_scalar_non_spaces(double
, start_mark
))
1160 end_mark
= self
.get_mark()
1161 return ScalarToken(u
''.join(chunks
), False, start_mark
, end_mark
,
1164 ESCAPE_REPLACEMENTS
= {
1190 def scan_flow_scalar_non_spaces(self
, double
, start_mark
):
1191 # See the specification for details.
1195 while self
.peek(length
) not in u
'\'\"\\\0 \t\r\n\x85\u2028\u2029':
1198 chunks
.append(self
.prefix(length
))
1199 self
.forward(length
)
1201 if not double
and ch
== u
'\'' and self
.peek(1) == u
'\'':
1202 chunks
.append(u
'\'')
1204 elif (double
and ch
== u
'\'') or (not double
and ch
in u
'\"\\'):
1207 elif double
and ch
== u
'\\':
1210 if ch
in self
.ESCAPE_REPLACEMENTS
:
1211 chunks
.append(self
.ESCAPE_REPLACEMENTS
[ch
])
1213 elif ch
in self
.ESCAPE_CODES
:
1214 length
= self
.ESCAPE_CODES
[ch
]
1216 for k
in range(length
):
1217 if self
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1218 raise ScannerError("while scanning a double-quoted scalar", start_mark
,
1219 "expected escape sequence of %d hexdecimal numbers, but found %r" %
1220 (length
, self
.peek(k
).encode('utf-8')), self
.get_mark())
1221 code
= int(self
.prefix(length
), 16)
1222 chunks
.append(unichr(code
))
1223 self
.forward(length
)
1224 elif ch
in u
'\r\n\x85\u2028\u2029':
1225 self
.scan_line_break()
1226 chunks
.extend(self
.scan_flow_scalar_breaks(double
, start_mark
))
1228 raise ScannerError("while scanning a double-quoted scalar", start_mark
,
1229 "found unknown escape character %r" % ch
.encode('utf-8'), self
.get_mark())
def scan_flow_scalar_spaces(self, double, start_mark):
    # Scan a run of spaces/tabs inside a quoted scalar and apply the
    # flow folding rules: a single break folds to a space, while
    # additional breaks are kept.
    # See the specification for details.
    chunks = []
    length = 0
    while self.peek(length) in u' \t':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch == u'\0':
        # The quote was never closed.
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.get_mark())
    elif ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        breaks = self.scan_flow_scalar_breaks(double, start_mark)
        if line_break != u'\n':
            # Non-ASCII breaks are preserved literally.
            chunks.append(line_break)
        elif not breaks:
            # A lone '\n' folds into a single space.
            chunks.append(u' ')
        chunks.extend(breaks)
    else:
        # No break followed: the whitespace is significant.
        chunks.append(whitespaces)
    return chunks
def scan_flow_scalar_breaks(self, double, start_mark):
    # Collect consecutive line breaks (skipping leading blanks on each
    # line) inside a quoted scalar.
    # See the specification for details.
    chunks = []
    while True:
        # Instead of checking indentation, we check for document
        # separators: '---' or '...' at the start of a line would mean
        # the quoted scalar was never closed.
        prefix = self.prefix(3)
        if (prefix == u'---' or prefix == u'...')   \
                and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.get_mark())
        # Leading blanks on a continuation line are discarded.
        while self.peek() in u' \t':
            self.forward()
        if self.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
        else:
            return chunks
def scan_plain(self):
    # Scan a plain (unquoted) scalar and return it as a ScalarToken.
    # See the specification for details.
    # We add an additional restriction for the flow context:
    #   plain scalars in the flow context cannot contain ',', ':' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    chunks = []
    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent+1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    spaces = []
    while True:
        length = 0
        if self.peek() == u'#':
            # A comment terminates the scalar.
            break
        while True:
            ch = self.peek(length)
            if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
                    or (not self.flow_level and ch == u':' and
                        self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                    or (self.flow_level and ch in u',:?[]{}'):
                break
            length += 1
        # It's not clear what we should do with ':' in the flow context.
        if (self.flow_level and ch == u':'
                and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
            self.forward(length)
            raise ScannerError("while scanning a plain scalar", start_mark,
                "found unexpected ':'", self.get_mark(),
                "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
        if length == 0:
            break
        # Once part of a plain scalar is consumed, a simple key can no
        # longer start at this position.
        self.allow_simple_key = False
        chunks.extend(spaces)
        chunks.append(self.prefix(length))
        self.forward(length)
        end_mark = self.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        if not spaces or self.peek() == u'#' \
                or (not self.flow_level and self.column < indent):
            break
    return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
def scan_plain_spaces(self, indent, start_mark):
    # Scan the blanks and line breaks that may separate two fragments
    # of a plain scalar; returns the folded whitespace chunks, or an
    # empty/None result if the scalar cannot continue.
    # See the specification for details.
    # The specification is really confusing about tabs in plain scalars.
    # We just forbid them completely. Do not use tabs in YAML!
    chunks = []
    length = 0
    while self.peek(length) in u' ':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        # A line break means a simple key may start on the next line.
        self.allow_simple_key = True
        prefix = self.prefix(3)
        if (prefix == u'---' or prefix == u'...')   \
                and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            # A document separator ends the scalar.
            return
        breaks = []
        while self.peek() in u' \r\n\x85\u2028\u2029':
            if self.peek() == ' ':
                self.forward()
            else:
                breaks.append(self.scan_line_break())
                prefix = self.prefix(3)
                if (prefix == u'---' or prefix == u'...')   \
                        and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                    return
        if line_break != u'\n':
            chunks.append(line_break)
        elif not breaks:
            # A single '\n' folds to one space.
            chunks.append(u' ')
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks
def scan_tag_handle(self, name, start_mark):
    # Scan a tag handle: '!', '!!' or '!word!'.  `name` is only used
    # in error messages ("tag" or "directive").
    # See the specification for details.
    # For some strange reasons, the specification does not allow '_' in
    # tag handles. I have allowed it anyway.
    ch = self.peek()
    if ch != u'!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % ch.encode('utf-8'),
                self.get_mark())
    length = 1
    ch = self.peek(length)
    if ch != u' ':
        # Consume the word characters of a named handle; it must be
        # terminated by a second '!'.
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if ch != u'!':
            self.forward(length)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        length += 1
    value = self.prefix(length)
    self.forward(length)
    return value
def scan_tag_uri(self, name, start_mark):
    # Scan a tag URI, expanding %XX escapes via scan_uri_escapes.
    # See the specification for details.
    # Note: we do not check if URI is well-formed.
    chunks = []
    length = 0
    ch = self.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
            or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
        if ch == u'%':
            # Flush what we have, then decode the escape run.
            chunks.append(self.prefix(length))
            self.forward(length)
            length = 0
            chunks.append(self.scan_uri_escapes(name, start_mark))
        else:
            length += 1
        ch = self.peek(length)
    if length:
        chunks.append(self.prefix(length))
        self.forward(length)
        length = 0
    if not chunks:
        # Not a single URI character was found.
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch.encode('utf-8'),
                self.get_mark())
    return u''.join(chunks)
1411 def scan_uri_escapes(self
, name
, start_mark
):
1412 # See the specification for details.
1414 mark
= self
.get_mark()
1415 while self
.peek() == u
'%':
1418 if self
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1419 raise ScannerError("while scanning a %s" % name
, start_mark
,
1420 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
1421 (self
.peek(k
).encode('utf-8')), self
.get_mark())
1422 bytes
.append(chr(int(self
.prefix(2), 16)))
1425 value
= unicode(''.join(bytes
), 'utf-8')
1426 except UnicodeDecodeError, exc
:
1427 raise ScannerError("while scanning a %s" % name
, start_mark
, str(exc
), mark
)
def scan_line_break(self):
    # Consume one line break and return its normalized form.
    # Transforms:
    #   '\r\n'      :   '\n'
    #   '\r'        :   '\n'
    #   '\n'        :   '\n'
    #   '\x85'      :   '\n'
    #   '\u2028'    :   '\u2028'
    #   '\u2029     :   '\u2029'
    #   default     :   ''
    ch = self.peek()
    if ch in u'\r\n\x85':
        # ASCII-ish breaks all normalize to '\n'; '\r\n' counts as one.
        if self.prefix(2) == u'\r\n':
            self.forward(2)
        else:
            self.forward()
        return u'\n'
    elif ch in u'\u2028\u2029':
        # Unicode line/paragraph separators are kept as-is.
        self.forward()
        return ch
    return u''
1453 # psyco.bind(Scanner)
1454 #except ImportError: