2 # Scanner produces tokens of the following types:
3 # DIRECTIVE(name, value)
21 # SCALAR(value, plain)
23 # Read comments in the Scanner code for more details.
26 __all__
= ['Scanner', 'ScannerError']
28 from error
import MarkedYAMLError
31 class ScannerError(MarkedYAMLError
):
35 # See below simple keys treatment.
37 def __init__(self
, token_number
, required
, index
, line
, column
, marker
):
38 self
.token_number
= token_number
39 self
.required
= required
48 def __init__(self
, reader
):
49 """Initialize the scanner."""
50 # The input stream. The Reader class does the dirty work of checking for
51 # BOM and converting the input data to Unicode. It also adds NUL to
54 # Reader supports the following methods
55 # self.reader.peek(i=0) # peek the next i-th character
56 # self.reader.prefix(l=1) # peek the next l characters
57 # self.reader.forward(l=1) # read the next l characters
58 # and move the pointer
61 # Had we reached the end of the stream?
64 # The number of unclosed '{' and '['. `flow_level == 0` means block
68 # List of processed tokens that are not yet emitted.
71 # Number of tokens that were emitted through the `get_token` method.
74 # The current indentation level.
77 # Past indentation levels.
80 # Variables related to simple keys treatment.
82 # A simple key is a key that is not denoted by the '?' indicator.
83 # Example of simple keys:
85 # block simple key: value
87 # : { flow simple key: value }
88 # We emit the KEY token before all keys, so when we find a potential
89 # simple key, we try to locate the corresponding ':' indicator.
90 # Simple keys should be limited to a single line and 1024 characters.
92 # Can a simple key start at the current position? A simple key may
94 # - at the beginning of the line, not counting indentation spaces
96 # - after '{', '[', ',' (in the flow context),
97 # - after '?', ':', '-' (in the block context).
98 # In the block context, this flag also signifies whether a block collection
99 # may start at the current position.
100 self
.allow_simple_key
= True
102 # Keep track of possible simple keys. This is a dictionary. The key
103 # is `flow_level`; there can be no more than one possible simple key
104 # for each level. The value is a SimpleKey record:
105 # (token_number, required, index, line, column, marker)
106 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
107 # '[', or '{' tokens.
108 self
.possible_simple_keys
= {}
112 def check(self
, *choices
):
113 # Check if the next token is one of the given types.
114 while self
.need_more_tokens():
115 self
.fetch_more_tokens()
117 for choice
in choices
:
118 if isinstance(self
.tokens
[0], choice
):
123 # Return the next token, but do not delete it from the queue.
124 while self
.need_more_tokens():
125 self
.fetch_more_tokens()
127 return self
.tokens
[0]
130 # Return the next token.
131 while self
.need_more_tokens():
132 self
.fetch_more_tokens()
134 self
.tokens_taken
+= 1
135 return self
.tokens
.pop(0)
139 while self
.need_more_tokens():
140 self
.fetch_more_tokens()
142 self
.tokens_taken
+= 1
143 yield self
.tokens
.pop(0)
144 while self
.need_more_tokens():
145 self
.fetch_more_tokens()
149 def need_more_tokens(self
):
154 # The current token may be a potential simple key, so we
155 # need to look further.
156 self
.stale_possible_simple_keys()
157 if self
.next_possible_simple_key() == self
.tokens_taken
:
160 def fetch_more_tokens(self
):
162 # Eat whitespaces and comments until we reach the next token.
163 self
.scan_to_next_token()
165 # Remove obsolete possible simple keys.
166 self
.stale_possible_simple_keys()
168 # Compare the current indentation and column. It may add some tokens
169 # and decrease the current indentation level.
170 self
.unwind_indent(self
.reader
.column
)
172 # Peek the next character.
173 ch
= self
.reader
.peek()
175 # Is it the end of stream?
177 return self
.fetch_stream_end()
180 if ch
== u
'%' and self
.check_directive():
181 return self
.fetch_directive()
183 # Is it the document start?
184 if ch
== u
'-' and self
.check_document_start():
185 return self
.fetch_document_start()
187 # Is it the document end?
188 if ch
== u
'.' and self
.check_document_end():
189 return self
.fetch_document_end()
191 # TODO: support for BOM within a stream.
193 # return self.fetch_bom() <-- issue BOMToken
195 # Note: the order of the following checks is NOT significant.
197 # Is it the flow sequence start indicator?
199 return self
.fetch_flow_sequence_start()
201 # Is it the flow mapping start indicator?
203 return self
.fetch_flow_mapping_start()
205 # Is it the flow sequence end indicator?
207 return self
.fetch_flow_sequence_end()
209 # Is it the flow mapping end indicator?
211 return self
.fetch_flow_mapping_end()
213 # Is it the flow entry indicator?
215 return self
.fetch_flow_entry()
217 # Is it the block entry indicator?
218 if ch
in u
'-' and self
.check_block_entry():
219 return self
.fetch_block_entry()
221 # Is it the key indicator?
222 if ch
== u
'?' and self
.check_key():
223 return self
.fetch_key()
225 # Is it the value indicator?
226 if ch
== u
':' and self
.check_value():
227 return self
.fetch_value()
231 return self
.fetch_alias()
235 return self
.fetch_anchor()
239 return self
.fetch_tag()
241 # Is it a literal scalar?
242 if ch
== u
'|' and not self
.flow_level
:
243 return self
.fetch_literal()
245 # Is it a folded scalar?
246 if ch
== u
'>' and not self
.flow_level
:
247 return self
.fetch_folded()
249 # Is it a single quoted scalar?
251 return self
.fetch_single()
253 # Is it a double quoted scalar?
255 return self
.fetch_double()
257 # It must be a plain scalar then.
258 if self
.check_plain():
259 return self
.fetch_plain()
261 # No? It's an error. Let's produce a nice error message.
262 raise ScannerError("while scanning for the next token", None,
263 "found character %r that cannot start any token"
264 % ch
.encode('utf-8'), self
.reader
.get_marker())
266 # Simple keys treatment.
def next_possible_simple_key(self):
    """Return the token number of the nearest possible simple key.

    Every record stored in ``self.possible_simple_keys`` is inspected and
    the smallest ``token_number`` is returned.  ``None`` is returned when
    no possible simple key is currently recorded.
    """
    numbers = [candidate.token_number
            for candidate in self.possible_simple_keys.values()]
    if not numbers:
        return None
    return min(numbers)
283 def stale_possible_simple_keys(self
):
284 # Remove entries that are no longer possible simple keys. According to
285 # the YAML specification, simple keys
286 # - should be limited to a single line,
287 # - should be no longer than 1024 characters.
288 # Disabling this procedure will allow simple keys of any length and
289 # height (may cause problems if indentation is broken though).
290 for level
in self
.possible_simple_keys
.keys():
291 key
= self
.possible_simple_keys
[level
]
292 if key
.line
!= self
.reader
.line \
293 or self
.reader
.index
-key
.index
> 1024:
295 raise ScannerError("while scanning a simple key", key
.marker
,
296 "could not found expected ':'", self
.reader
.get_marker())
297 del self
.possible_simple_keys
[level
]
def save_possible_simple_key(self):
    """Remember the current position as a possible simple key start.

    Called for tokens that may begin a simple key: ALIAS, ANCHOR, TAG,
    SCALAR(flow), '[' and '{'.  When simple keys are allowed here, any
    candidate already stored for the current flow level is dropped first
    and a new `SimpleKey` record is stored under ``self.flow_level``.
    """
    reader = self.reader

    # Is a simple key required at the current position?
    required = not self.flow_level and self.indent == reader.column

    # A simple key is required only if it is the first token in the current
    # line, hence a required key must always be an allowed one.
    assert self.allow_simple_key or not required

    if not self.allow_simple_key:
        return

    # The next token might be a simple key: store its number and position.
    self.remove_possible_simple_key()
    self.possible_simple_keys[self.flow_level] = SimpleKey(
            self.tokens_taken+len(self.tokens), required,
            reader.index, reader.line, reader.column, reader.get_marker())
324 def remove_possible_simple_key(self
):
325 # Remove the saved possible key position at the current flow level.
326 if self
.flow_level
in self
.possible_simple_keys
:
327 key
= self
.possible_simple_keys
[self
.flow_level
]
329 # I don't think it's possible, but I could be wrong.
330 assert not key
.required
332 # raise ScannerError("while scanning a simple key", key.marker,
333 # "could not found expected ':'", self.reader.get_marker())
335 # Indentation functions.
def unwind_indent(self, column):
    """Close block collections that are indented deeper than `column`.

    In the flow context a column to the left of the current indentation
    raises `ScannerError`.  Otherwise one BLOCK-END token is queued for
    every indentation level popped until ``self.indent`` no longer
    exceeds `column`.
    """
    # In flow context, tokens should respect indentation.  According to the
    # spec the test should really be `self.indent >= column`, but that
    # stricter condition would prohibit intuitively correct constructions.
    if self.flow_level and column < self.indent:
        raise ScannerError(None, None,
                "invalid intendation or unclosed '[' or '{'",
                self.reader.get_marker())

    # In block context, we may need to issue the BLOCK-END tokens.
    while column < self.indent:
        here = self.reader.get_marker()
        self.indent = self.indents.pop()
        self.tokens.append(BlockEndToken(here, here))
356 def add_indent(self
, column
):
357 # Check if we need to increase indentation.
358 if self
.indent
< column
:
359 self
.indents
.append(self
.indent
)
366 def fetch_stream_end(self
):
368 # Set the current indentation to -1.
369 self
.unwind_indent(-1)
371 # Reset everything (not really needed).
372 self
.allow_simple_key
= False
373 self
.possible_simple_keys
= {}
376 marker
= self
.reader
.get_marker()
379 self
.tokens
.append(StreamEndToken(marker
, marker
))
381 # The reader is ended.
def fetch_directive(self):
    """Scan a directive and queue the resulting DIRECTIVE token."""
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset simple keys: none may be pending or start after a directive.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Scan and add DIRECTIVE.
    directive = self.scan_directive()
    self.tokens.append(directive)
def fetch_document_start(self):
    """Queue a DOCUMENT-START token via `fetch_document_indicator`."""
    token_class = DocumentStartToken
    self.fetch_document_indicator(token_class)
def fetch_document_end(self):
    """Queue a DOCUMENT-END token via `fetch_document_indicator`."""
    token_class = DocumentEndToken
    self.fetch_document_indicator(token_class)
def fetch_document_indicator(self, TokenClass):
    """Consume a three-character document indicator and queue its token.

    `TokenClass` is called with the start and end markers of the
    indicator; the resulting DOCUMENT-START or DOCUMENT-END token is
    appended to ``self.tokens``.
    """
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset simple keys; none may start right after the indicator.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Add DOCUMENT-START or DOCUMENT-END.
    reader = self.reader
    begin = reader.get_marker()
    reader.forward(3)
    finish = reader.get_marker()
    self.tokens.append(TokenClass(begin, finish))
def fetch_flow_sequence_start(self):
    """Queue FLOW-SEQUENCE-START via `fetch_flow_collection_start`."""
    token_class = FlowSequenceStartToken
    self.fetch_flow_collection_start(token_class)
def fetch_flow_mapping_start(self):
    """Queue FLOW-MAPPING-START via `fetch_flow_collection_start`."""
    token_class = FlowMappingStartToken
    self.fetch_flow_collection_start(token_class)
424 def fetch_flow_collection_start(self
, TokenClass
):
426 # '[' and '{' may start a simple key.
427 self
.save_possible_simple_key()
429 # Increase the flow level.
432 # Simple keys are allowed after '[' and '{'.
433 self
.allow_simple_key
= True
435 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
436 start_marker
= self
.reader
.get_marker()
437 self
.reader
.forward()
438 end_marker
= self
.reader
.get_marker()
439 self
.tokens
.append(TokenClass(start_marker
, end_marker
))
def fetch_flow_sequence_end(self):
    """Queue FLOW-SEQUENCE-END via `fetch_flow_collection_end`."""
    token_class = FlowSequenceEndToken
    self.fetch_flow_collection_end(token_class)
def fetch_flow_mapping_end(self):
    """Queue FLOW-MAPPING-END via `fetch_flow_collection_end`."""
    token_class = FlowMappingEndToken
    self.fetch_flow_collection_end(token_class)
447 def fetch_flow_collection_end(self
, TokenClass
):
449 # Reset possible simple key on the current level.
450 self
.remove_possible_simple_key()
452 # Decrease the flow level.
455 # No simple keys after ']' or '}'.
456 self
.allow_simple_key
= False
458 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
459 start_marker
= self
.reader
.get_marker()
460 self
.reader
.forward()
461 end_marker
= self
.reader
.get_marker()
462 self
.tokens
.append(TokenClass(start_marker
, end_marker
))
def fetch_flow_entry(self):
    """Consume the flow entry indicator and queue a FLOW-ENTRY token."""
    # Simple keys are allowed after ','.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Step over the one-character indicator and add FLOW-ENTRY.
    reader = self.reader
    begin = reader.get_marker()
    reader.forward()
    finish = reader.get_marker()
    self.tokens.append(FlowEntryToken(begin, finish))
478 def fetch_block_entry(self
):
480 # Block context needs additional checks.
481 if not self
.flow_level
:
483 # Are we allowed to start a new entry?
484 if not self
.allow_simple_key
:
485 raise ScannerError(None, None,
486 "sequence entries are not allowed here",
487 self
.reader
.get_marker())
489 # We may need to add BLOCK-SEQUENCE-START.
490 if self
.add_indent(self
.reader
.column
):
491 marker
= self
.reader
.get_marker()
492 self
.tokens
.append(BlockSequenceStartToken(marker
, marker
))
494 # It's an error for the block entry to occur in the flow context,
495 # but we let the parser detect this.
499 # Simple keys are allowed after '-'.
500 self
.allow_simple_key
= True
502 # Reset possible simple key on the current level.
503 self
.remove_possible_simple_key()
506 start_marker
= self
.reader
.get_marker()
507 self
.reader
.forward()
508 end_marker
= self
.reader
.get_marker()
509 self
.tokens
.append(BlockEntryToken(start_marker
, end_marker
))
513 # Block context needs additional checks.
514 if not self
.flow_level
:
516 # Are we allowed to start a key (not necessarily a simple one)?
517 if not self
.allow_simple_key
:
518 raise ScannerError(None, None,
519 "mapping keys are not allowed here",
520 self
.reader
.get_marker())
522 # We may need to add BLOCK-MAPPING-START.
523 if self
.add_indent(self
.reader
.column
):
524 marker
= self
.reader
.get_marker()
525 self
.tokens
.append(BlockMappingStartToken(marker
, marker
))
527 # Simple keys are allowed after '?' in the block context.
528 self
.allow_simple_key
= not self
.flow_level
530 # Reset possible simple key on the current level.
531 self
.remove_possible_simple_key()
534 start_marker
= self
.reader
.get_marker()
535 self
.reader
.forward()
536 end_marker
= self
.reader
.get_marker()
537 self
.tokens
.append(KeyToken(start_marker
, end_marker
))
539 def fetch_value(self
):
541 # Do we determine a simple key?
542 if self
.flow_level
in self
.possible_simple_keys
:
545 key
= self
.possible_simple_keys
[self
.flow_level
]
546 del self
.possible_simple_keys
[self
.flow_level
]
547 self
.tokens
.insert(key
.token_number
-self
.tokens_taken
,
548 KeyToken(key
.marker
, key
.marker
))
550 # If this key starts a new block mapping, we need to add
551 # BLOCK-MAPPING-START.
552 if not self
.flow_level
:
553 if self
.add_indent(key
.column
):
554 self
.tokens
.insert(key
.token_number
-self
.tokens_taken
,
555 BlockMappingStartToken(key
.marker
, key
.marker
))
557 # There cannot be two simple keys one after another.
558 self
.allow_simple_key
= False
560 # It must be a part of a complex key.
563 # Block context needs additional checks.
564 # (Do we really need them? They will be caught by the parser
566 if not self
.flow_level
:
568 # We are allowed to start a complex value if and only if
569 # we can start a simple key.
570 if not self
.allow_simple_key
:
571 raise ScannerError(None, None,
572 "mapping values are not allowed here",
573 self
.reader
.get_marker())
575 # Simple keys are allowed after ':' in the block context.
576 self
.allow_simple_key
= not self
.flow_level
578 # Reset possible simple key on the current level.
579 self
.remove_possible_simple_key()
582 start_marker
= self
.reader
.get_marker()
583 self
.reader
.forward()
584 end_marker
= self
.reader
.get_marker()
585 self
.tokens
.append(ValueToken(start_marker
, end_marker
))
def fetch_alias(self):
    """Scan an alias and queue the resulting ALIAS token."""
    # ALIAS could be a simple key, so remember where it starts.
    self.save_possible_simple_key()

    # No simple keys after ALIAS.
    self.allow_simple_key = False

    # Scan and add ALIAS.
    alias = self.scan_anchor(AliasToken)
    self.tokens.append(alias)
def fetch_anchor(self):
    """Scan an anchor and queue the resulting ANCHOR token."""
    # ANCHOR could start a simple key, so remember where it starts.
    self.save_possible_simple_key()

    # No simple keys after ANCHOR.
    self.allow_simple_key = False

    # Scan and add ANCHOR.
    anchor = self.scan_anchor(AnchorToken)
    self.tokens.append(anchor)
611 # TAG could start a simple key.
612 self
.save_possible_simple_key()
614 # No simple keys after TAG.
615 self
.allow_simple_key
= False
618 self
.tokens
.append(self
.scan_tag())
def fetch_literal(self):
    """Fetch a literal block scalar (a non-folded block scalar)."""
    is_folded = False
    self.fetch_block_scalar(folded=is_folded)
def fetch_folded(self):
    """Fetch a folded block scalar."""
    is_folded = True
    self.fetch_block_scalar(folded=is_folded)
def fetch_block_scalar(self, folded):
    """Scan a block scalar and queue the resulting SCALAR token.

    `folded` is passed through unchanged to `scan_block_scalar`.
    """
    # A simple key may follow a block scalar.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Scan and add SCALAR.
    scalar = self.scan_block_scalar(folded)
    self.tokens.append(scalar)
def fetch_single(self):
    """Fetch a single quoted scalar (a flow scalar with ``double=False``)."""
    is_double = False
    self.fetch_flow_scalar(double=is_double)
def fetch_double(self):
    """Fetch a double quoted scalar (a flow scalar with ``double=True``)."""
    is_double = True
    self.fetch_flow_scalar(double=is_double)
def fetch_flow_scalar(self, double):
    """Scan a quoted (flow) scalar and queue the resulting SCALAR token.

    `double` is passed through unchanged to `scan_flow_scalar`.
    """
    # A flow scalar could be a simple key, so remember where it starts.
    self.save_possible_simple_key()

    # No simple keys after flow scalars.
    self.allow_simple_key = False

    # Scan and add SCALAR.
    scalar = self.scan_flow_scalar(double)
    self.tokens.append(scalar)
def fetch_plain(self):
    """Scan a plain scalar and queue the resulting SCALAR token."""
    # A plain scalar could be a simple key, so remember where it starts.
    self.save_possible_simple_key()

    # No simple keys after plain scalars.  The flag is cleared *before*
    # scanning because `scan_plain` may itself change it again.
    self.allow_simple_key = False

    # Scan and add SCALAR. May change `allow_simple_key`.
    scalar = self.scan_plain()
    self.tokens.append(scalar)
669 def check_directive(self
):
671 # DIRECTIVE: ^ '%' ...
672 # The '%' indicator is already checked.
673 if self
.reader
.column
== 0:
676 def check_document_start(self
):
678 # DOCUMENT-START: ^ '---' (' '|'\n')
679 if self
.reader
.column
== 0:
680 if self
.reader
.prefix(3) == u
'---' \
681 and self
.reader
.peek(3) in u
'\0 \t\r\n\x85\u2028\u2029':
684 def check_document_end(self
):
686 # DOCUMENT-END: ^ '...' (' '|'\n')
687 if self
.reader
.column
== 0:
688 prefix
= self
.reader
.peek(4)
689 if self
.reader
.prefix(3) == u
'...' \
690 and self
.reader
.peek(3) in u
'\0 \t\r\n\x85\u2028\u2029':
def check_block_entry(self):
    """Tell whether the character after the current one ends a '-' indicator.

    Returns True when ``self.reader.peek(1)`` is NUL, a space, a tab or a
    line break character.
    """
    # BLOCK-ENTRY: '-' (' '|'\n')
    following = self.reader.peek(1)
    return following in u'\0 \t\r\n\x85\u2028\u2029'
700 # KEY(flow context): '?'
704 # KEY(block context): '?' (' '|'\n')
706 return self
.reader
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
708 def check_value(self
):
710 # VALUE(flow context): ':'
714 # VALUE(block context): ':' (' '|'\n')
716 return self
.reader
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
def check_plain(self):
    """Tell whether a plain scalar may start at the current position.

    A plain scalar may start with any character that is not whitespace, a
    line break, NUL or one of the indicator characters listed in the
    first test below.  It may also start with '-' (in any context) or
    with '?' or ':' (in the block context only) when the next character
    is not whitespace, a line break or NUL.  The last rule is limited to
    the block context, except for '-', to keep the flow context
    independent of spacing.
    """
    ch = self.reader.peek()
    if ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`':
        return True
    # An indicator-like character: it only counts as plain text when
    # something non-blank follows it immediately.
    if self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029':
        return False
    return ch == '-' or (not self.flow_level and ch in u'?:')
739 def scan_to_next_token(self
):
740 # We ignore spaces, line breaks and comments.
741 # If we find a line break in the block context, we set the flag
742 # `allow_simple_key` on.
743 # The byte order mark is stripped if it's the first character in the
744 # stream. We do not yet support BOM inside the stream as the
745 # specification requires. Any such mark will be considered as a part
748 # TODO: We need to make tab handling rules more sane. A good rule is
749 # Tabs cannot precede tokens
750 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
751 # KEY(block), VALUE(block), BLOCK-ENTRY
752 # So the checking code is
754 # self.allow_simple_keys = False
755 # We also need to add the check for `allow_simple_keys == True` to
756 # `unwind_indent` before issuing BLOCK-END.
757 # Scanners for block, flow, and plain scalars need to be modified.
759 if self
.reader
.index
== 0 and self
.reader
.peek() == u
'\uFEFF':
760 self
.reader
.forward()
763 while self
.reader
.peek() == u
' ':
764 self
.reader
.forward()
765 if self
.reader
.peek() == u
'#':
766 while self
.reader
.peek() not in u
'\0\r\n\x85\u2028\u2029':
767 self
.reader
.forward()
768 if self
.scan_line_break():
769 if not self
.flow_level
:
770 self
.allow_simple_key
= True
774 def scan_directive(self
):
775 # See the specification for details.
776 start_marker
= self
.reader
.get_marker()
777 self
.reader
.forward()
778 name
= self
.scan_directive_name(start_marker
)
781 value
= self
.scan_yaml_directive_value(start_marker
)
782 end_marker
= self
.reader
.get_marker()
784 value
= self
.scan_tag_directive_value(start_marker
)
785 end_marker
= self
.reader
.get_marker()
787 end_marker
= self
.reader
.get_marker()
788 while self
.reader
.peek() not in u
'\0\r\n\x85\u2028\u2029':
789 self
.reader
.forward()
790 self
.scan_directive_ignored_line(start_marker
)
791 return DirectiveToken(name
, value
, start_marker
, end_marker
)
793 def scan_directive_name(self
, start_marker
):
794 # See the specification for details.
796 ch
= self
.reader
.peek(length
)
797 while u
'0' <= ch
<= u
'9' or u
'A' <= ch
<= 'Z' or u
'a' <= ch
<= 'z' \
800 ch
= self
.reader
.peek(length
)
802 raise ScannerError("while scanning a directive", start_marker
,
803 "expected alphabetic or numeric character, but found %r"
804 % ch
.encode('utf-8'), self
.reader
.get_marker())
805 value
= self
.reader
.prefix(length
)
806 self
.reader
.forward(length
)
807 ch
= self
.reader
.peek()
808 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
809 raise ScannerError("while scanning a directive", start_marker
,
810 "expected alphabetic or numeric character, but found %r"
811 % ch
.encode('utf-8'), self
.reader
.get_marker())
def scan_yaml_directive_value(self, start_marker):
    """Scan the ``major.minor`` version value of a directive.

    Leading spaces are skipped, then two numbers separated by '.' are
    read with `scan_yaml_directive_number`.  Returns ``(major, minor)``.
    Raises `ScannerError` when the separator is not '.' or when the
    minor number is not followed by a space, a line break or NUL.
    """
    # See the specification for details.
    reader = self.reader
    while reader.peek() == u' ':
        reader.forward()

    major = self.scan_yaml_directive_number(start_marker)
    if reader.peek() != '.':
        raise ScannerError("while scanning a directive", start_marker,
                "expected a digit or '.', but found %r"
                % reader.peek().encode('utf-8'),
                reader.get_marker())
    reader.forward()

    minor = self.scan_yaml_directive_number(start_marker)
    if reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_marker,
                "expected a digit or ' ', but found %r"
                % reader.peek().encode('utf-8'),
                reader.get_marker())

    version = (major, minor)
    return version
833 def scan_yaml_directive_number(self
, start_marker
):
834 # See the specification for details.
835 ch
= self
.reader
.peek()
836 if not (u
'0' <= ch
<= '9'):
837 raise ScannerError("while scanning a directive", start_marker
,
838 "expected a digit, but found %r" % ch
.encode('utf-8'),
839 self
.reader
.get_marker())
841 while u
'0' <= self
.reader
.peek(length
) <= u
'9':
843 value
= int(self
.reader
.prefix(length
))
844 self
.reader
.forward(length
)
def scan_tag_directive_value(self, start_marker):
    """Scan the handle and prefix of a tag directive.

    Spaces before the handle and between the handle and the prefix are
    skipped.  Returns the pair ``(handle, prefix)``.
    """
    # See the specification for details.
    reader = self.reader

    def skip_spaces():
        while reader.peek() == u' ':
            reader.forward()

    skip_spaces()
    handle = self.scan_tag_directive_handle(start_marker)
    skip_spaces()
    prefix = self.scan_tag_directive_prefix(start_marker)
    return (handle, prefix)
857 def scan_tag_directive_handle(self
, start_marker
):
858 # See the specification for details.
859 value
= self
.scan_tag_handle('directive', start_marker
)
860 ch
= self
.reader
.peek()
862 raise ScannerError("while scanning a directive", start_marker
,
863 "expected ' ', but found %r" % ch
.encode('utf-8'),
864 self
.reader
.get_marker())
867 def scan_tag_directive_prefix(self
, start_marker
):
868 # See the specification for details.
869 value
= self
.scan_tag_uri('directive', start_marker
)
870 ch
= self
.reader
.peek()
871 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
872 raise ScannerError("while scanning a directive", start_marker
,
873 "expected ' ', but found %r" % ch
.encode('utf-8'),
874 self
.reader
.get_marker())
def scan_directive_ignored_line(self, start_marker):
    """Skip the rest of a directive line.

    Spaces and an optional '#' comment are consumed; anything else
    before the line break (or NUL) raises `ScannerError`.  The line break
    itself is consumed with `scan_line_break`.
    """
    # See the specification for details.
    reader = self.reader
    line_end = u'\0\r\n\x85\u2028\u2029'

    while reader.peek() == u' ':
        reader.forward()
    if reader.peek() == u'#':
        # A comment runs up to, but not including, the end of the line.
        while reader.peek() not in line_end:
            reader.forward()

    ch = reader.peek()
    if ch not in line_end:
        raise ScannerError("while scanning a directive", start_marker,
                "expected a comment or a line break, but found %r"
                % ch.encode('utf-8'), reader.get_marker())
    self.scan_line_break()
891 def scan_anchor(self
, TokenClass
):
892 # The specification does not restrict characters for anchors and
893 # aliases. This may lead to problems, for instance, the document:
895 # can be interpreted in two ways, as
898 # [ *alias , "value" ]
899 # Therefore we restrict aliases to numbers and ASCII letters.
900 start_marker
= self
.reader
.get_marker()
901 indicator
= self
.reader
.peek()
906 self
.reader
.forward()
908 ch
= self
.reader
.peek(length
)
909 while u
'0' <= ch
<= u
'9' or u
'A' <= ch
<= 'Z' or u
'a' <= ch
<= 'z' \
912 ch
= self
.reader
.peek(length
)
914 raise ScannerError("while scanning an %s" % name
, start_marker
,
915 "expected alphabetic or numeric character, but found %r"
916 % ch
.encode('utf-8'), self
.reader
.get_marker())
917 value
= self
.reader
.prefix(length
)
918 self
.reader
.forward(length
)
919 ch
= self
.reader
.peek()
920 if ch
not in u
'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
921 raise ScannerError("while scanning an %s" % name
, start_marker
,
922 "expected alphabetic or numeric character, but found %r"
923 % ch
.encode('utf-8'), self
.reader
.get_marker())
924 end_marker
= self
.reader
.get_marker()
925 return TokenClass(value
, start_marker
, end_marker
)
928 # See the specification for details.
929 start_marker
= self
.reader
.get_marker()
930 ch
= self
.reader
.peek(1)
933 self
.reader
.forward(2)
934 suffix
= self
.scan_tag_uri('tag', start_marker
)
935 if self
.reader
.peek() != u
'>':
936 raise ScannerError("while parsing a tag", start_marker
,
937 "expected '>', but found %r" % self
.reader
.peek().encode('utf-8'),
938 self
.reader
.get_marker())
939 self
.reader
.forward()
940 elif ch
in u
'\0 \t\r\n\x85\u2028\u2029':
943 self
.reader
.forward()
947 while ch
not in u
'\0 \r\n\x85\u2028\u2029':
952 ch
= self
.reader
.peek(length
)
955 handle
= self
.scan_tag_handle('tag', start_marker
)
958 self
.reader
.forward()
959 suffix
= self
.scan_tag_uri('tag', start_marker
)
960 ch
= self
.reader
.peek()
961 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
962 raise ScannerError("while scanning a tag", start_marker
,
963 "expected ' ', but found %r" % ch
.encode('utf-8'),
964 self
.reader
.get_marker())
965 value
= (handle
, suffix
)
966 end_marker
= self
.reader
.get_marker()
967 return TagToken(value
, start_marker
, end_marker
)
969 def scan_block_scalar(self
, folded
):
970 # See the specification for details.
973 start_marker
= self
.reader
.get_marker()
976 self
.reader
.forward()
977 chomping
, increment
= self
.scan_block_scalar_indicators(start_marker
)
978 self
.scan_block_scalar_ignored_line(start_marker
)
980 # Determine the indentation level and go to the first non-empty line.
981 min_indent
= self
.indent
+1
984 if increment
is None:
985 breaks
, max_indent
, end_marker
= self
.scan_block_scalar_indentation()
986 indent
= max(min_indent
, max_indent
)
988 indent
= min_indent
+increment
-1
989 breaks
, end_marker
= self
.scan_block_scalar_breaks(indent
)
992 # Scan the inner part of the block scalar.
993 while self
.reader
.column
== indent
and self
.reader
.peek() != u
'\0':
994 chunks
.extend(breaks
)
995 leading_non_space
= self
.reader
.peek() not in u
' \t'
997 while self
.reader
.peek(length
) not in u
'\0\r\n\x85\u2028\u2029':
999 chunks
.append(self
.reader
.prefix(length
))
1000 self
.reader
.forward(length
)
1001 line_break
= self
.scan_line_break()
1002 breaks
, end_marker
= self
.scan_block_scalar_breaks(indent
)
1003 if self
.reader
.column
== indent
and self
.reader
.peek() != u
'\0':
1004 # Unfortunately, folding rules are ambiguous.
1006 # This is the folding according to the specification:
1008 if folded
and line_break
== u
'\n' \
1009 and leading_non_space
and self
.reader
.peek() not in u
' \t':
1013 chunks
.append(line_break
)
1015 # This is Clark Evans's interpretation (also in the spec
1018 #if folded and line_break == u'\n':
1020 # if self.reader.peek() not in ' \t':
1021 # chunks.append(u' ')
1023 # chunks.append(line_break)
1025 # chunks.append(line_break)
1030 if chomping
is not False:
1031 chunks
.append(line_break
)
1032 if chomping
is True:
1033 chunks
.extend(breaks
)
1036 return ScalarToken(u
''.join(chunks
), False, start_marker
, end_marker
)
1038 def scan_block_scalar_indicators(self
, start_marker
):
1039 # See the specification for details.
1042 ch
= self
.reader
.peek()
1048 self
.reader
.forward()
1049 ch
= self
.reader
.peek()
1050 if ch
in u
'0123456789':
1053 raise ScannerError("while scanning a block scalar", start_marker
,
1054 "expected indentation indicator in the range 1-9, but found 0",
1055 self
.reader
.get_marker())
1056 self
.reader
.forward()
1057 elif ch
in u
'0123456789':
1060 raise ScannerError("while scanning a block scalar", start_marker
,
1061 "expected indentation indicator in the range 1-9, but found 0",
1062 self
.reader
.get_marker())
1063 self
.reader
.forward()
1064 ch
= self
.reader
.peek()
1070 self
.reader
.forward()
1071 ch
= self
.reader
.peek()
1072 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
1073 raise ScannerError("while scanning a block scalar", start_marker
,
1074 "expected chomping or indentation indicators, but found %r"
1075 % ch
.encode('utf-8'), self
.reader
.get_marker())
1076 return chomping
, increment
def scan_block_scalar_ignored_line(self, start_marker):
    """Skip the rest of a block scalar header line.

    Spaces and an optional '#' comment are consumed; anything else
    before the line break (or NUL) raises `ScannerError`.  The line break
    itself is consumed with `scan_line_break`.
    """
    # See the specification for details.
    reader = self.reader
    line_end = u'\0\r\n\x85\u2028\u2029'

    while reader.peek() == u' ':
        reader.forward()
    if reader.peek() == u'#':
        # A comment runs up to, but not including, the end of the line.
        while reader.peek() not in line_end:
            reader.forward()

    ch = reader.peek()
    if ch not in line_end:
        raise ScannerError("while scanning a block scalar", start_marker,
                "expected a comment or a line break, but found %r"
                % ch.encode('utf-8'), reader.get_marker())
    self.scan_line_break()
1092 def scan_block_scalar_indentation(self
):
1093 # See the specification for details.
1096 end_marker
= self
.reader
.get_marker()
1097 while self
.reader
.peek() in u
' \r\n\x85\u2028\u2029':
1098 if self
.reader
.peek() != u
' ':
1099 chunks
.append(self
.scan_line_break())
1100 end_marker
= self
.reader
.get_marker()
1102 self
.reader
.forward()
1103 if self
.reader
.column
> max_indent
:
1104 max_indent
= self
.reader
.column
1105 return chunks
, max_indent
, end_marker
def scan_block_scalar_breaks(self, indent):
    """Collect consecutive line breaks inside a block scalar.

    Before each potential break, spaces are consumed only while the reader
    column is below `indent`.  Returns `(breaks, end_marker)`, where
    `end_marker` is the reader marker taken after the last scanned break
    (or at entry if no break was found).
    """
    # See the specification for details.
    reader = self.reader
    breaks = []
    end_marker = reader.get_marker()
    while True:
        # Eat indentation, never going past the `indent` column.
        while reader.column < indent and reader.peek() == u' ':
            reader.forward()
        if reader.peek() not in u'\r\n\x85\u2028\u2029':
            return breaks, end_marker
        breaks.append(self.scan_line_break())
        end_marker = reader.get_marker()
def scan_flow_scalar(self, double):
    """Scan a quoted scalar and return it as a non-plain ScalarToken.

    `double` is passed through to the helper scanners and selects the
    double-quoted rules over the single-quoted ones.  The opening quote is
    whatever character the reader currently points at; scanning alternates
    between non-space and space runs until that same character is seen
    again, and the closing quote is consumed.
    """
    # See the specification for details.
    reader = self.reader
    start_marker = reader.get_marker()
    # Continuation lines need at least one column of indentation.
    indent = (self.indent + 1) or 1
    quote = reader.peek()
    reader.forward()
    pieces = list(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
    while reader.peek() != quote:
        pieces += self.scan_flow_scalar_spaces(double, indent, start_marker)
        pieces += self.scan_flow_scalar_non_spaces(double, indent, start_marker)
    reader.forward()
    end_marker = reader.get_marker()
    return ScalarToken(u''.join(pieces), False, start_marker, end_marker)
1137 ESCAPE_REPLACEMENTS
= {
def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
    """Scan the non-whitespace part of a quoted scalar.

    Returns a list of unicode chunks.  Scanning stops (without consuming)
    at whitespace, a line break, NUL, or a quote character that is not
    literal content in the current quoting style.

    Single-quoted (`double` false): '' yields one quote; '"' and a
    backslash are literal.  Double-quoted (`double` true): "'" is literal
    and a backslash starts an escape looked up in `ESCAPE_REPLACEMENTS`
    or `ESCAPE_CODES`, or an escaped line break.  Malformed escapes raise
    ScannerError.
    """
    # See the specification for details.
    reader = self.reader
    pieces = []
    while True:
        # Take the longest run of ordinary characters in one go.
        run = 0
        while reader.peek(run) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
            run += 1
        if run:
            pieces.append(reader.prefix(run))
            reader.forward(run)
        ch = reader.peek()

        if not double:
            if ch == u'\'' and reader.peek(1) == u'\'':
                pieces.append(u'\'')
                reader.forward(2)
            elif ch in u'\"\\':
                pieces.append(ch)
                reader.forward()
            else:
                return pieces
            continue

        if ch == u'\'':
            pieces.append(ch)
            reader.forward()
            continue
        if ch != u'\\':
            return pieces

        # Backslash escape inside a double-quoted scalar.
        reader.forward()
        ch = reader.peek()
        if ch in self.ESCAPE_REPLACEMENTS:
            pieces.append(self.ESCAPE_REPLACEMENTS[ch])
            reader.forward()
        elif ch in self.ESCAPE_CODES:
            digits = self.ESCAPE_CODES[ch]
            reader.forward()
            for k in range(digits):
                if reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
                            "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                (digits, reader.peek(k).encode('utf-8')), reader.get_marker())
            pieces.append(unichr(int(reader.prefix(digits), 16)))
            reader.forward(digits)
        elif ch in u'\r\n\x85\u2028\u2029':
            # Escaped line break: drop it, keep any following breaks.
            self.scan_line_break()
            pieces.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
        else:
            raise ScannerError("while scanning a double-quoted scalar", start_marker,
                    "found unknown escape character %r" % ch.encode('utf-8'), reader.get_marker())
def scan_flow_scalar_spaces(self, double, indent, start_marker):
    """Scan a run of spaces/tabs (and any line folding) in a quoted scalar.

    Returns a list of unicode chunks.  If the run is not followed by a
    line break, the run itself is returned.  Otherwise the run is dropped
    and the breaks are folded: a first break other than u'\\n' is kept, a
    lone u'\\n' becomes a single space, and further breaks are appended.
    Reaching NUL raises ScannerError.
    """
    # See the specification for details.
    reader = self.reader
    width = 0
    while reader.peek(width) in u' \t':
        width += 1
    blanks = reader.prefix(width)
    reader.forward(width)
    ch = reader.peek()
    if ch == u'\0':
        raise ScannerError("while scanning a quoted scalar", start_marker,
                "found unexpected end of stream", reader.get_marker())
    if ch not in u'\r\n\x85\u2028\u2029':
        return [blanks]
    first_break = self.scan_line_break()
    more_breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
    if first_break != u'\n':
        folded = [first_break]
    elif not more_breaks:
        folded = [u' ']
    else:
        folded = []
    folded.extend(more_breaks)
    return folded
def scan_flow_scalar_breaks(self, double, indent, start_marker):
    """Collect consecutive line breaks inside a quoted scalar.

    For each line: consume spaces up to the `indent` column, raise
    ScannerError if content starts before that column, skip any remaining
    spaces/tabs, and record the line break if one follows.  Returns the
    list of scanned breaks once a line does not end immediately.
    """
    # See the specification for details.
    reader = self.reader
    breaks = []
    while True:
        while reader.column < indent and reader.peek() == u' ':
            reader.forward()
        if reader.column < indent \
                and reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            # Pluralise "space" in the message unless exactly one is needed.
            if indent == 1:
                s = ''
            else:
                s = 's'
            raise ScannerError("while scanning a quoted scalar", start_marker,
                    "expected %d space%s indentation, but found %r"
                    % (indent, s, reader.peek().encode('utf-8')),
                    reader.get_marker())
        while reader.peek() in u' \t':
            reader.forward()
        if reader.peek() not in u'\r\n\x85\u2028\u2029':
            return breaks
        breaks.append(self.scan_line_break())
def scan_plain(self):
    """Scan a plain (unquoted) scalar and return it as a plain ScalarToken.

    The scalar is built from alternating text chunks and whitespace runs
    (the latter produced by `scan_plain_spaces`).  A text chunk ends at
    NUL, a space, a tab or a line break; in the block context also at a
    ':' that is followed by one of those characters; in the flow context
    also at any of ',:?[]{}'.  Scanning stops when a chunk would start
    with '#', when a chunk is empty, when no whitespace follows a chunk,
    when the whitespace is followed by '#', or when the reader column
    drops below the required indentation.

    Side effect: `allow_simple_key` is cleared as soon as a text chunk is
    consumed.

    Fix: the look-ahead after ':' previously tested against '\\x28' ('(')
    instead of '\\x85' (NEL), so "a:(b" was split at the colon while a
    colon followed by NEL was not recognised as a separator.
    """
    # See the specification for details.
    # We add an additional restriction for the flow context:
    #   plain scalars in the flow context cannot contain ':' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # One shared terminator set, so the two checks below cannot diverge.
    blank_or_break = u'\0 \t\r\n\x85\u2028\u2029'
    chunks = []
    start_marker = self.reader.get_marker()
    end_marker = start_marker
    indent = self.indent+1
    if indent == 0:
        indent = 1
    spaces = []
    while True:
        length = 0
        if self.reader.peek() == u'#':
            break
        while True:
            ch = self.reader.peek(length)
            if ch in blank_or_break \
                    or (not self.flow_level and ch == u':' and
                            self.reader.peek(length+1) in blank_or_break) \
                    or (self.flow_level and ch in u',:?[]{}'):
                break
            length += 1
        if length == 0:
            break
        self.allow_simple_key = False
        # Whitespace seen before this chunk belongs to the scalar only
        # because more text follows it.
        chunks.extend(spaces)
        chunks.append(self.reader.prefix(length))
        self.reader.forward(length)
        end_marker = self.reader.get_marker()
        spaces = self.scan_plain_spaces(indent)
        if not spaces or self.reader.peek() == u'#' \
                or self.reader.column < indent:
            break
    return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
def scan_plain_spaces(self, indent):
    """Scan the whitespace that may separate two parts of a plain scalar.

    Returns a list of unicode chunks.  A run of spaces not followed by a
    line break is returned as is (an empty run gives an empty list).  If
    a line break follows, `allow_simple_key` is set, subsequent spaces and
    breaks are consumed, and the breaks are folded: a first break other
    than u'\\n' is kept, a lone u'\\n' becomes a single space, and further
    breaks are appended.  `indent` is not used here.
    """
    # See the specification for details.
    # The specification is really confusing about tabs in plain scalars.
    # We just forbid them completely. Do not use tabs in YAML!
    reader = self.reader
    width = 0
    while reader.peek(width) in u' ':
        width += 1
    blanks = reader.prefix(width)
    reader.forward(width)
    if reader.peek() not in u'\r\n\x85\u2028\u2029':
        if blanks:
            return [blanks]
        return []
    first_break = self.scan_line_break()
    self.allow_simple_key = True
    more_breaks = []
    while True:
        ch = reader.peek()
        if ch == ' ':
            reader.forward()
        elif ch in u'\r\n\x85\u2028\u2029':
            more_breaks.append(self.scan_line_break())
        else:
            break
    if first_break != u'\n':
        folded = [first_break]
    elif not more_breaks:
        folded = [u' ']
    else:
        folded = []
    folded.extend(more_breaks)
    return folded
def scan_tag_handle(self, name, start_marker):
    """Scan a tag handle: '!', '!!' or '!word!'.

    The reader must point at '!'.  If the next character is a space the
    handle is the single '!'.  Otherwise a run of ASCII letters, digits,
    '-' and '_' is skipped and a closing '!' is required.  Returns the
    handle text; raises ScannerError (labelled with `name`) when either
    '!' is missing.
    """
    # See the specification for details.
    # For some strange reasons, the specification does not allow '_' in
    # tag handles. I have allowed it anyway.
    reader = self.reader
    ch = reader.peek()
    if ch != u'!':
        raise ScannerError("while scanning a %s" % name, start_marker,
                "expected '!', but found %r" % ch.encode('utf-8'),
                reader.get_marker())
    end = 1
    ch = reader.peek(end)
    if ch != u' ':
        while (u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'
                or ch in u'-_'):
            end += 1
            ch = reader.peek(end)
        if ch != u'!':
            # Advance first so the error marker points at the bad character.
            reader.forward(end)
            raise ScannerError("while scanning a %s" % name, start_marker,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    reader.get_marker())
        end += 1
    handle = reader.prefix(end)
    reader.forward(end)
    return handle
def scan_tag_uri(self, name, start_marker):
    """Scan a tag URI and return it as a unicode string.

    Consumes ASCII letters, digits and the characters
    -;/?:@&=+$,_.!~*'()[]% .  Each '%' hands control to
    `scan_uri_escapes`, whose decoded result is spliced into the URI.
    Raises ScannerError (labelled with `name`) if nothing was collected.
    """
    # See the specification for details.
    # Note: we do not check if URI is well-formed.
    reader = self.reader
    parts = []
    run = 0
    ch = reader.peek(run)
    while (u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'
            or ch in u'-;/?:@&=+$,_.!~*\'()[]%'):
        if ch == u'%':
            # Flush the literal text gathered so far, then decode escapes.
            parts.append(reader.prefix(run))
            reader.forward(run)
            run = 0
            parts.append(self.scan_uri_escapes(name, start_marker))
        else:
            run += 1
        ch = reader.peek(run)
    if run:
        parts.append(reader.prefix(run))
        reader.forward(run)
    if not parts:
        raise ScannerError("while parsing a %s" % name, start_marker,
                "expected URI, but found %r" % ch.encode('utf-8'),
                reader.get_marker())
    return u''.join(parts)
1370 def scan_uri_escapes(self
, name
, start_marker
):
1371 # See the specification for details.
1373 marker
= self
.reader
.get_marker()
1374 while self
.reader
.peek() == u
'%':
1375 self
.reader
.forward()
1377 if self
.reader
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1378 raise ScannerError("while scanning a %s" % name
, start_marker
,
1379 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
1380 (self
.reader
.peek(k
).encode('utf-8')), self
.reader
.get_marker())
1381 bytes
.append(chr(int(self
.reader
.prefix(2), 16)))
1382 self
.reader
.forward(2)
1384 value
= unicode(''.join(bytes
), 'utf-8')
1385 except UnicodeDecodeError, exc
:
1386 raise ScannerError("while scanning a %s" % name
, start_marker
, str(exc
), marker
)
def scan_line_break(self):
    """Consume one line break at the reader position and normalise it.

    Returns u'\\n' for '\\r\\n', '\\r', '\\n' and '\\x85'; returns the
    character itself for '\\u2028' and '\\u2029'; returns u'' (consuming
    nothing) when the reader is not at a line break.
    """
    # Transforms:
    #   '\r\n'      :   '\n'
    #   '\r'        :   '\n'
    #   '\n'        :   '\n'
    #   '\x85'      :   '\n'
    #   '\u2028'    :   '\u2028'
    #   '\u2029'    :   '\u2029'
    #   default     :   ''
    reader = self.reader
    ch = reader.peek()
    if ch in u'\r\n\x85':
        # CR LF counts as a single break.
        width = 1
        if reader.prefix(2) == u'\r\n':
            width = 2
        reader.forward(width)
        return u'\n'
    if ch in u'\u2028\u2029':
        reader.forward()
        return ch
    return u''
1412 # psyco.bind(Scanner)
1413 #except ImportError: