2 # Scanner produces tokens of the following types:
5 # DIRECTIVE(name, value)
22 # SCALAR(value, plain)
24 # Read comments in the Scanner code for more details.
27 __all__
= ['Scanner', 'ScannerError']
29 from error
import MarkedYAMLError
class ScannerError(MarkedYAMLError):
    """Raised when the scanner meets a character or construct it cannot
    tokenize.  Carries context/problem marks via MarkedYAMLError.

    Restored: the class body was missing from the damaged source; the
    docstring serves as the (otherwise empty) body.
    """
36 # See below simple keys treatment.
def __init__(self, token_number, required, index, line, column, mark):
    """Record where a potential simple key starts.

    token_number -- position of the future KEY token in the token stream
    required -- True when a ':' must follow (key opens a block line)
    index, line, column, mark -- reader position where the key begins
    """
    self.token_number = token_number
    self.required = required
    # Restored: the four positional attributes below were accepted as
    # parameters but never stored in the damaged source.
    self.index = index
    self.line = line
    self.column = column
    self.mark = mark
def __init__(self, reader):
    """Initialize the scanner."""
    # The input stream. The Reader class does the dirty work of checking
    # for BOM and converting the input data to Unicode. It also adds NUL
    # to the end.
    # Reader supports the following methods
    #   self.reader.peek(i=0)    # peek the next i-th character
    #   self.reader.prefix(l=1)  # peek the next l characters
    #   self.reader.forward(l=1) # read the next l characters
    #                            # and move the pointer
    self.reader = reader

    # Had we reached the end of the stream?
    self.done = False

    # The number of unclosed '{' and '['. `flow_level == 0` means block
    # context.
    self.flow_level = 0

    # List of processed tokens that are not yet emitted.
    self.tokens = []

    # Add the STREAM-START token.
    self.fetch_stream_start()

    # Number of tokens that were emitted through the `get_token` method.
    self.tokens_taken = 0

    # The current indentation level.
    self.indent = -1

    # Past indentation levels.
    self.indents = []

    # Variables related to simple keys treatment.
    # A simple key is a key that is not denoted by the '?' indicator.
    # Example of simple keys:
    #   block simple key: value
    #   : { flow simple key: value }
    # We emit the KEY token before all keys, so when we find a potential
    # simple key, we try to locate the corresponding ':' indicator.
    # Simple keys should be limited to a single line and 1024 characters.
    # Can a simple key start at the current position? A simple key may
    # start:
    # - at the beginning of the line, not counting indentation spaces,
    # - after '{', '[', ',' (in the flow context),
    # - after '?', ':', '-' (in the block context).
    # In the block context, this flag also signifies if a block collection
    # may start at the current position.
    self.allow_simple_key = True

    # Keep track of possible simple keys. This is a dictionary. The key
    # is `flow_level`; there can be no more that one possible simple key
    # for each level. The value is a SimpleKey record:
    #   (token_number, required, index, line, column, mark)
    # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
    # '[', or '{' tokens.
    self.possible_simple_keys = {}
116 def check(self
, *choices
):
117 # Check if the next token is one of the given types.
118 while self
.need_more_tokens():
119 self
.fetch_more_tokens()
121 for choice
in choices
:
122 if isinstance(self
.tokens
[0], choice
):
127 # Return the next token, but do not delete if from the queue.
128 while self
.need_more_tokens():
129 self
.fetch_more_tokens()
131 return self
.tokens
[0]
134 # Return the next token.
135 while self
.need_more_tokens():
136 self
.fetch_more_tokens()
138 self
.tokens_taken
+= 1
139 return self
.tokens
.pop(0)
143 while self
.need_more_tokens():
144 self
.fetch_more_tokens()
146 self
.tokens_taken
+= 1
147 yield self
.tokens
.pop(0)
148 while self
.need_more_tokens():
149 self
.fetch_more_tokens()
153 def need_more_tokens(self
):
158 # The current token may be a potential simple key, so we
159 # need to look further.
160 self
.stale_possible_simple_keys()
161 if self
.next_possible_simple_key() == self
.tokens_taken
:
164 def fetch_more_tokens(self
):
166 # Eat whitespaces and comments until we reach the next token.
167 self
.scan_to_next_token()
169 # Remove obsolete possible simple keys.
170 self
.stale_possible_simple_keys()
172 # Compare the current indentation and column. It may add some tokens
173 # and decrease the current indentation level.
174 self
.unwind_indent(self
.reader
.column
)
176 # Peek the next character.
177 ch
= self
.reader
.peek()
179 # Is it the end of stream?
181 return self
.fetch_stream_end()
184 if ch
== u
'%' and self
.check_directive():
185 return self
.fetch_directive()
187 # Is it the document start?
188 if ch
== u
'-' and self
.check_document_start():
189 return self
.fetch_document_start()
191 # Is it the document end?
192 if ch
== u
'.' and self
.check_document_end():
193 return self
.fetch_document_end()
195 # TODO: support for BOM within a stream.
197 # return self.fetch_bom() <-- issue BOMToken
199 # Note: the order of the following checks is NOT significant.
201 # Is it the flow sequence start indicator?
203 return self
.fetch_flow_sequence_start()
205 # Is it the flow mapping start indicator?
207 return self
.fetch_flow_mapping_start()
209 # Is it the flow sequence end indicator?
211 return self
.fetch_flow_sequence_end()
213 # Is it the flow mapping end indicator?
215 return self
.fetch_flow_mapping_end()
217 # Is it the flow entry indicator?
219 return self
.fetch_flow_entry()
221 # Is it the block entry indicator?
222 if ch
in u
'-' and self
.check_block_entry():
223 return self
.fetch_block_entry()
225 # Is it the key indicator?
226 if ch
== u
'?' and self
.check_key():
227 return self
.fetch_key()
229 # Is it the value indicator?
230 if ch
== u
':' and self
.check_value():
231 return self
.fetch_value()
235 return self
.fetch_alias()
239 return self
.fetch_anchor()
243 return self
.fetch_tag()
245 # Is it a literal scalar?
246 if ch
== u
'|' and not self
.flow_level
:
247 return self
.fetch_literal()
249 # Is it a folded scalar?
250 if ch
== u
'>' and not self
.flow_level
:
251 return self
.fetch_folded()
253 # Is it a single quoted scalar?
255 return self
.fetch_single()
257 # Is it a double quoted scalar?
259 return self
.fetch_double()
261 # It must be a plain scalar then.
262 if self
.check_plain():
263 return self
.fetch_plain()
265 # No? It's an error. Let's produce a nice error message.
266 raise ScannerError("while scanning for the next token", None,
267 "found character %r that cannot start any token"
268 % ch
.encode('utf-8'), self
.reader
.get_mark())
270 # Simple keys treatment.
def next_possible_simple_key(self):
    """Return the token number of the nearest pending simple-key
    candidate, or None when no candidate is saved on any flow level.
    """
    # The dictionary keeps at most one candidate per flow level, so a
    # plain scan over its values is cheap; no need for anything fancier.
    numbers = [key.token_number
               for key in self.possible_simple_keys.values()]
    if not numbers:
        return None
    return min(numbers)
def stale_possible_simple_keys(self):
    """Drop saved simple-key candidates that can no longer be keys.

    According to the YAML specification, simple keys
    - should be limited to a single line,
    - should be no longer than 1024 characters.
    A stale candidate that was *required* is a syntax error; an optional
    one is silently discarded.
    """
    # Iterate over a copy of the keys so entries can be deleted safely
    # while looping (also correct on Python 3 dict views).
    for level in list(self.possible_simple_keys):
        key = self.possible_simple_keys[level]
        if key.line != self.reader.line  \
                or self.reader.index-key.index > 1024:
            # Restored guard: only a required key may raise here; the
            # damaged source raised unconditionally.
            if key.required:
                raise ScannerError("while scanning a simple key", key.mark,
                        "could not found expected ':'", self.reader.get_mark())
            del self.possible_simple_keys[level]
def save_possible_simple_key(self):
    """Remember that a simple key may start at the current position.

    Called before ALIAS, ANCHOR, TAG, SCALAR(flow), '[' and '{' tokens.
    """
    # A key is *required* only when it opens a line in block context.
    required = not self.flow_level and self.indent == self.reader.column

    # A simple key is required only if it is the first token in the
    # current line; therefore it is always allowed there.
    assert self.allow_simple_key or not required

    if self.allow_simple_key:
        # At most one candidate may exist per flow level.
        self.remove_possible_simple_key()
        token_number = self.tokens_taken + len(self.tokens)
        self.possible_simple_keys[self.flow_level] = SimpleKey(
                token_number, required, self.reader.index,
                self.reader.line, self.reader.column,
                self.reader.get_mark())
def remove_possible_simple_key(self):
    """Discard the saved simple-key candidate at the current flow level."""
    if self.flow_level in self.possible_simple_keys:
        key = self.possible_simple_keys[self.flow_level]

        # I don't think it's possible, but I could be wrong.
        assert not key.required
        #if key.required:
        #    raise ScannerError("while scanning a simple key", key.mark,
        #            "could not found expected ':'", self.reader.get_mark())

        # Restored: actually drop the entry — the statement that gives
        # this method its name was missing from the damaged source.
        del self.possible_simple_keys[self.flow_level]
339 # Indentation functions.
def unwind_indent(self, column):
    """Pop indentation levels deeper than `column`, emitting a BLOCK-END
    token for each popped level.  No-op in flow context.
    """
    ## In flow context, tokens should respect indentation.
    ## Actually the condition should be `self.indent >= column` according to
    ## the spec. But this condition will prohibit intuitively correct
    ## constructions such as
    ##   key : {
    ##   }
    #if self.flow_level and self.indent > column:
    #    raise ScannerError(None, None,
    #            "invalid intendation or unclosed '[' or '{'",
    #            self.reader.get_mark())

    # In the flow context, indentation is ignored. We make the scanner
    # less restrictive than the specification requires.
    # (Restored: this early return was missing under its own comment.)
    if self.flow_level:
        return

    # In block context, we may need to issue the BLOCK-END tokens.
    while self.indent > column:
        mark = self.reader.get_mark()
        self.indent = self.indents.pop()
        self.tokens.append(BlockEndToken(mark, mark))
def add_indent(self, column):
    """Push the current indent and move to `column` if it is deeper.

    Returns True when a new indentation level was opened (the caller
    then emits a BLOCK-*-START token), False otherwise.
    """
    # Check if we need to increase indentation.
    if self.indent < column:
        self.indents.append(self.indent)
        # Restored: the assignment and both returns were missing.
        self.indent = column
        return True
    return False
def fetch_stream_start(self):
    """Queue the STREAM-START token; always the first token emitted."""
    # We always add STREAM-START as the first token and STREAM-END as the
    # last token.
    # Read the current position for both marks of the token.
    mark = self.reader.get_mark()
    # Add STREAM-START; it carries the encoding detected by the reader.
    self.tokens.append(StreamStartToken(mark, mark,
        encoding=self.reader.encoding))
def fetch_stream_end(self):
    """Queue the STREAM-END token and mark the scanner as finished."""
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset everything (not really needed).
    self.allow_simple_key = False
    self.possible_simple_keys = {}

    # Read the token position.
    mark = self.reader.get_mark()

    # Add STREAM-END.
    self.tokens.append(StreamEndToken(mark, mark))

    # The reader is ended.
    # (Restored: the flag assignment was missing under this comment.)
    self.done = True
def fetch_directive(self):
    """Fetch a DIRECTIVE token for a '%' line."""
    # Set the current indentation to -1.
    self.unwind_indent(-1)
    # A directive can never be (part of) a simple key.
    self.remove_possible_simple_key()
    self.allow_simple_key = False
    # Scan and add DIRECTIVE.
    self.tokens.append(self.scan_directive())
def fetch_document_start(self):
    """Fetch the DOCUMENT-START indicator ('---' at column 0)."""
    self.fetch_document_indicator(DocumentStartToken)
def fetch_document_end(self):
    """Fetch the DOCUMENT-END indicator ('...' at column 0)."""
    self.fetch_document_indicator(DocumentEndToken)
def fetch_document_indicator(self, TokenClass):
    """Shared implementation for '---' and '...' document indicators."""
    # Set the current indentation to -1.
    self.unwind_indent(-1)
    # Reset simple keys. Note that there could not be a block collection
    # after an indicator at column 0.
    self.remove_possible_simple_key()
    self.allow_simple_key = False
    # Add DOCUMENT-START or DOCUMENT-END; the indicator is 3 chars wide.
    start_mark = self.reader.get_mark()
    self.reader.forward(3)
    end_mark = self.reader.get_mark()
    self.tokens.append(TokenClass(start_mark, end_mark))
def fetch_flow_sequence_start(self):
    """Fetch FLOW-SEQUENCE-START for the '[' indicator."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)
def fetch_flow_mapping_start(self):
    """Fetch FLOW-MAPPING-START for the '{' indicator."""
    self.fetch_flow_collection_start(FlowMappingStartToken)
def fetch_flow_collection_start(self, TokenClass):
    """Shared implementation for '[' and '{' tokens."""
    # '[' and '{' may start a simple key.
    self.save_possible_simple_key()

    # Increase the flow level.
    # (Restored: the increment itself was missing under this comment.)
    self.flow_level = self.flow_level+1

    # Simple keys are allowed after '[' and '{'.
    self.allow_simple_key = True

    # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(TokenClass(start_mark, end_mark))
def fetch_flow_sequence_end(self):
    """Fetch FLOW-SEQUENCE-END for the ']' indicator."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)
def fetch_flow_mapping_end(self):
    """Fetch FLOW-MAPPING-END for the '}' indicator."""
    self.fetch_flow_collection_end(FlowMappingEndToken)
def fetch_flow_collection_end(self, TokenClass):
    """Shared implementation for ']' and '}' tokens."""
    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Decrease the flow level.
    # (Restored: the decrement itself was missing under this comment.)
    self.flow_level = self.flow_level-1

    # No simple keys after ']' or '}'.
    self.allow_simple_key = False

    # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(TokenClass(start_mark, end_mark))
def fetch_flow_entry(self):
    """Fetch a FLOW-ENTRY token for ',' inside a flow collection."""
    # Simple keys are allowed after ','.
    self.allow_simple_key = True
    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()
    # Add FLOW-ENTRY.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(FlowEntryToken(start_mark, end_mark))
def fetch_block_entry(self):
    """Fetch a BLOCK-ENTRY token for the '-' indicator."""
    # Block context needs additional checks.
    if not self.flow_level:
        # Are we allowed to start a new entry?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "sequence entries are not allowed here",
                    self.reader.get_mark())
        # We may need to add BLOCK-SEQUENCE-START.
        if self.add_indent(self.reader.column):
            mark = self.reader.get_mark()
            self.tokens.append(BlockSequenceStartToken(mark, mark))
    # It's an error for the block entry to occur in the flow context,
    # but we let the parser detect this.
    # Simple keys are allowed after '-'.
    self.allow_simple_key = True
    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()
    # Add BLOCK-ENTRY.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(BlockEntryToken(start_mark, end_mark))
def fetch_key(self):
    """Fetch a KEY token for the '?' indicator.

    Restored: the `def` header line was missing from the damaged source;
    the body below matches the surviving statements.
    """
    # Block context needs additional checks.
    if not self.flow_level:
        # Are we allowed to start a key (not necessarily a simple one)?
        if not self.allow_simple_key:
            raise ScannerError(None, None,
                    "mapping keys are not allowed here",
                    self.reader.get_mark())
        # We may need to add BLOCK-MAPPING-START.
        if self.add_indent(self.reader.column):
            mark = self.reader.get_mark()
            self.tokens.append(BlockMappingStartToken(mark, mark))
    # Simple keys are allowed after '?' in the block context.
    self.allow_simple_key = not self.flow_level
    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()
    # Add KEY.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    end_mark = self.reader.get_mark()
    self.tokens.append(KeyToken(start_mark, end_mark))
def fetch_value(self):
    """Fetch a VALUE token for the ':' indicator, materializing a pending
    simple KEY token first when one was saved for this flow level.
    """
    # Do we determine a simple key?
    if self.flow_level in self.possible_simple_keys:
        # Add KEY at the position saved for the candidate.
        key = self.possible_simple_keys[self.flow_level]
        del self.possible_simple_keys[self.flow_level]
        self.tokens.insert(key.token_number-self.tokens_taken,
                KeyToken(key.mark, key.mark))
        # If this key starts a new block mapping, we need to add
        # BLOCK-MAPPING-START.
        if not self.flow_level:
            if self.add_indent(key.column):
                self.tokens.insert(key.token_number-self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark))
        # There cannot be two simple keys one after another.
        self.allow_simple_key = False
    # It must be a part of a complex key.
    # (Restored: the `else:` joining the two halves was missing.)
    else:
        # Block context needs additional checks.
        # (Do we really need them? They will be caught by the parser
        # anyway.)
        if not self.flow_level:
            # We are allowed to start a complex value if and only if
            # we can start a simple key.
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping values are not allowed here",
                        self.reader.get_mark())
        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
def fetch_alias(self):
    """Fetch an ALIAS token ('*name')."""
    # ALIAS could be a simple key.
    self.save_possible_simple_key()
    # No simple keys after ALIAS.
    self.allow_simple_key = False
    # Scan and add ALIAS.
    self.tokens.append(self.scan_anchor(AliasToken))
def fetch_anchor(self):
    """Fetch an ANCHOR token ('&name')."""
    # ANCHOR could start a simple key.
    self.save_possible_simple_key()
    # No simple keys after ANCHOR.
    self.allow_simple_key = False
    # Scan and add ANCHOR.
    self.tokens.append(self.scan_anchor(AnchorToken))
def fetch_tag(self):
    """Fetch a TAG token ('!' indicator).

    Restored: the `def` header line was missing from the damaged source.
    """
    # TAG could start a simple key.
    self.save_possible_simple_key()
    # No simple keys after TAG.
    self.allow_simple_key = False
    # Scan and add TAG.
    self.tokens.append(self.scan_tag())
def fetch_literal(self):
    """Fetch a literal block scalar ('|')."""
    self.fetch_block_scalar(style='|')
def fetch_folded(self):
    """Fetch a folded block scalar ('>')."""
    self.fetch_block_scalar(style='>')
def fetch_block_scalar(self, style):
    """Shared implementation for literal ('|') and folded ('>') scalars."""
    # A simple key may follow a block scalar.
    self.allow_simple_key = True
    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()
    # Scan and add SCALAR.
    self.tokens.append(self.scan_block_scalar(style))
def fetch_single(self):
    """Fetch a single-quoted flow scalar."""
    self.fetch_flow_scalar(style='\'')
def fetch_double(self):
    """Fetch a double-quoted flow scalar."""
    self.fetch_flow_scalar(style='"')
def fetch_flow_scalar(self, style):
    """Shared implementation for single- and double-quoted scalars."""
    # A flow scalar could be a simple key.
    self.save_possible_simple_key()
    # No simple keys after flow scalars.
    self.allow_simple_key = False
    # Scan and add SCALAR.
    self.tokens.append(self.scan_flow_scalar(style))
def fetch_plain(self):
    """Fetch a plain (unquoted) scalar."""
    # A plain scalar could be a simple key.
    self.save_possible_simple_key()
    # No simple keys after plain scalars. But note that `scan_plain` will
    # change this flag if the scan is finished at the beginning of the
    # line.
    self.allow_simple_key = False
    # Scan and add SCALAR. May change `allow_simple_key`.
    self.tokens.append(self.scan_plain())
def check_directive(self):
    """Return True if '%' at the cursor starts a directive.

    Restored: the return statements were missing from the damaged source.
    """
    # DIRECTIVE: ^ '%' ...
    # The '%' indicator is already checked; only the column matters.
    return self.reader.column == 0
def check_document_start(self):
    """Return True if '---' at column 0, followed by space, break or
    end of stream, starts a document.

    Restored: the return statements were missing from the damaged source.
    """
    # DOCUMENT-START: ^ '---' (' '|'\n')
    if self.reader.column == 0:
        if self.reader.prefix(3) == u'---'  \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            return True
    return False
def check_document_end(self):
    """Return True if '...' at column 0, followed by space, break or
    end of stream, ends a document.

    Restored: the return statements were missing; the unused
    `prefix = self.reader.peek(4)` dead assignment was dropped.
    """
    # DOCUMENT-END: ^ '...' (' '|'\n')
    if self.reader.column == 0:
        if self.reader.prefix(3) == u'...'  \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            return True
    return False
def check_block_entry(self):
    """True when the '-' under the cursor is a block entry indicator,
    i.e. it is followed by whitespace, a break, or end of stream.
    """
    # BLOCK-ENTRY: '-' (' '|'\n')
    following = self.reader.peek(1)
    return following in u'\0 \t\r\n\x85\u2028\u2029'
def check_key(self):
    """Return True if '?' at the cursor starts a KEY token.

    Restored: the `def` header and flow-context branch were missing.
    """
    # KEY(flow context):    '?'
    if self.flow_level:
        return True
    # KEY(block context):   '?' (' '|'\n')
    else:
        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_value(self):
    """Return True if ':' at the cursor starts a VALUE token.

    Restored: the flow-context branch was missing from the damaged source.
    """
    # VALUE(flow context):  ':'
    if self.flow_level:
        return True
    # VALUE(block context): ':' (' '|'\n')
    else:
        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
def check_plain(self):
    """Return True if the character at the cursor may start a plain
    (unquoted) scalar.
    """
    # A plain scalar may start with any non-space character except:
    #   '-', '?', ':', ',', '[', ']', '{', '}',
    #   '#', '&', '*', '!', '|', '>', '\'', '\"',
    #   '%', '@', '`'.
    # It may also start with
    #   '-', '?', ':'
    # if it is followed by a non-space character.
    # Note that we limit the last rule to the block context (except the
    # '-' character) because we want the flow context to be space
    # independent.
    ch = self.reader.peek()
    return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
            or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                    and (ch == u'-' or (not self.flow_level and ch in u'?:')))
def scan_to_next_token(self):
    """Skip spaces, comments and line breaks up to the next token."""
    # We ignore spaces, line breaks and comments.
    # If we find a line break in the block context, we set the flag
    # `allow_simple_key` on.
    # The byte order mark is stripped if it's the first character in the
    # stream. We do not yet support BOM inside the stream as the
    # specification requires. Any such mark will be considered as a part
    # of the document.
    #
    # TODO: We need to make tab handling rules more sane. A good rule is
    #   Tabs cannot precede tokens
    #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
    #   KEY(block), VALUE(block), BLOCK-ENTRY
    # So the checking code is
    #   if <TAB>:
    #       self.allow_simple_keys = False
    # We also need to add the check for `allow_simple_keys == True` to
    # `unwind_indent` before issuing BLOCK-END.
    # Scanners for block, flow, and plain scalars need to be modified.
    if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
        self.reader.forward()
    # Restored: the `found` loop skeleton was missing from the damaged
    # source; without it the whitespace/comment skipping never repeats
    # across line breaks.
    found = False
    while not found:
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        if self.scan_line_break():
            if not self.flow_level:
                self.allow_simple_key = True
        else:
            found = True
def scan_directive(self):
    """Scan a '%...' directive line and return a DirectiveToken.

    Restored: the YAML/TAG/unknown dispatch (`if`/`elif`/`else`) around
    the surviving value-scanning calls was missing.
    """
    # See the specification for details.
    start_mark = self.reader.get_mark()
    self.reader.forward()
    name = self.scan_directive_name(start_mark)
    value = None
    if name == u'YAML':
        value = self.scan_yaml_directive_value(start_mark)
        end_mark = self.reader.get_mark()
    elif name == u'TAG':
        value = self.scan_tag_directive_value(start_mark)
        end_mark = self.reader.get_mark()
    else:
        # Unknown directive: skip the rest of the line.
        end_mark = self.reader.get_mark()
        while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            self.reader.forward()
    self.scan_directive_ignored_line(start_mark)
    return DirectiveToken(name, value, start_mark, end_mark)
def scan_directive_name(self, start_mark):
    """Scan a directive name (ASCII alphanumerics plus '-' and '_').

    Restored: `length` initialization/increment and the final
    `return value` were missing from the damaged source.
    """
    # See the specification for details.
    length = 0
    ch = self.reader.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
            or ch in u'-_':
        length += 1
        ch = self.reader.peek(length)
    if not length:
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    value = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    return value
def scan_yaml_directive_value(self, start_mark):
    """Scan the 'major.minor' version pair of a %YAML directive and
    return it as a (major, minor) tuple of ints.
    """
    # See the specification for details.
    while self.reader.peek() == u' ':
        self.reader.forward()
    major = self.scan_yaml_directive_number(start_mark)
    # The version must be written as <digits>'.'<digits>.
    if self.reader.peek() != '.':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or '.', but found %r"
                % self.reader.peek().encode('utf-8'),
                self.reader.get_mark())
    self.reader.forward()
    minor = self.scan_yaml_directive_number(start_mark)
    # The value must be terminated by whitespace, a break, or EOF.
    if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit or ' ', but found %r"
                % self.reader.peek().encode('utf-8'),
                self.reader.get_mark())
    return (major, minor)
def scan_yaml_directive_number(self, start_mark):
    """Scan one integer component of a %YAML version number.

    Restored: the `length` counting loop and `return value` were missing.
    """
    # See the specification for details.
    ch = self.reader.peek()
    if not (u'0' <= ch <= '9'):
        raise ScannerError("while scanning a directive", start_mark,
                "expected a digit, but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    length = 0
    while u'0' <= self.reader.peek(length) <= u'9':
        length += 1
    value = int(self.reader.prefix(length))
    self.reader.forward(length)
    return value
def scan_tag_directive_value(self, start_mark):
    """Scan the handle/prefix pair of a %TAG directive and return it
    as a (handle, prefix) tuple.
    """
    # See the specification for details.
    while self.reader.peek() == u' ':
        self.reader.forward()
    handle = self.scan_tag_directive_handle(start_mark)
    while self.reader.peek() == u' ':
        self.reader.forward()
    prefix = self.scan_tag_directive_prefix(start_mark)
    return (handle, prefix)
def scan_tag_directive_handle(self, start_mark):
    """Scan the handle part of a %TAG directive.

    Restored: the `if ch != u' ':` guard around the raise and the final
    `return value` were missing from the damaged source.
    """
    # See the specification for details.
    value = self.scan_tag_handle('directive', start_mark)
    ch = self.reader.peek()
    if ch != u' ':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    return value
def scan_tag_directive_prefix(self, start_mark):
    """Scan the prefix part of a %TAG directive.

    Restored: the final `return value` was missing from the damaged source.
    """
    # See the specification for details.
    value = self.scan_tag_uri('directive', start_mark)
    ch = self.reader.peek()
    if ch not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected ' ', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    return value
def scan_directive_ignored_line(self, start_mark):
    """Consume trailing spaces, an optional comment, and the line break
    that terminate a directive line.
    """
    # See the specification for details.
    while self.reader.peek() == u' ':
        self.reader.forward()
    if self.reader.peek() == u'#':
        while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            self.reader.forward()
    ch = self.reader.peek()
    if ch not in u'\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a directive", start_mark,
                "expected a comment or a line break, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    self.scan_line_break()
def scan_anchor(self, TokenClass):
    """Scan an anchor ('&name') or alias ('*name') and return a token
    of type `TokenClass` carrying the name.

    Restored: the alias/anchor name selection and the `length` counting
    loop were missing around the surviving raise/return statements.
    """
    # The specification does not restrict characters for anchors and
    # aliases. This may lead to problems, for instance, the document:
    #   [ *alias, value ]
    # can be interpreted in two ways, as
    #   [ "value" ]
    # and
    #   [ *alias , "value" ]
    # Therefore we restrict aliases to numbers and ASCII letters.
    start_mark = self.reader.get_mark()
    indicator = self.reader.peek()
    if indicator == u'*':
        name = 'alias'
    else:
        name = 'anchor'
    self.reader.forward()
    length = 0
    ch = self.reader.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
            or ch in u'-_':
        length += 1
        ch = self.reader.peek(length)
    if not length:
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    value = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    end_mark = self.reader.get_mark()
    return TokenClass(value, start_mark, end_mark)
949 # See the specification for details.
950 start_mark
= self
.reader
.get_mark()
951 ch
= self
.reader
.peek(1)
954 self
.reader
.forward(2)
955 suffix
= self
.scan_tag_uri('tag', start_mark
)
956 if self
.reader
.peek() != u
'>':
957 raise ScannerError("while parsing a tag", start_mark
,
958 "expected '>', but found %r" % self
.reader
.peek().encode('utf-8'),
959 self
.reader
.get_mark())
960 self
.reader
.forward()
961 elif ch
in u
'\0 \t\r\n\x85\u2028\u2029':
964 self
.reader
.forward()
968 while ch
not in u
'\0 \r\n\x85\u2028\u2029':
973 ch
= self
.reader
.peek(length
)
976 handle
= self
.scan_tag_handle('tag', start_mark
)
979 self
.reader
.forward()
980 suffix
= self
.scan_tag_uri('tag', start_mark
)
981 ch
= self
.reader
.peek()
982 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
983 raise ScannerError("while scanning a tag", start_mark
,
984 "expected ' ', but found %r" % ch
.encode('utf-8'),
985 self
.reader
.get_mark())
986 value
= (handle
, suffix
)
987 end_mark
= self
.reader
.get_mark()
988 return TagToken(value
, start_mark
, end_mark
)
990 def scan_block_scalar(self
, style
):
991 # See the specification for details.
999 start_mark
= self
.reader
.get_mark()
1002 self
.reader
.forward()
1003 chomping
, increment
= self
.scan_block_scalar_indicators(start_mark
)
1004 self
.scan_block_scalar_ignored_line(start_mark
)
1006 # Determine the indentation level and go to the first non-empty line.
1007 min_indent
= self
.indent
+1
1010 if increment
is None:
1011 breaks
, max_indent
, end_mark
= self
.scan_block_scalar_indentation()
1012 indent
= max(min_indent
, max_indent
)
1014 indent
= min_indent
+increment
-1
1015 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1018 # Scan the inner part of the block scalar.
1019 while self
.reader
.column
== indent
and self
.reader
.peek() != u
'\0':
1020 chunks
.extend(breaks
)
1021 leading_non_space
= self
.reader
.peek() not in u
' \t'
1023 while self
.reader
.peek(length
) not in u
'\0\r\n\x85\u2028\u2029':
1025 chunks
.append(self
.reader
.prefix(length
))
1026 self
.reader
.forward(length
)
1027 line_break
= self
.scan_line_break()
1028 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1029 if self
.reader
.column
== indent
and self
.reader
.peek() != u
'\0':
1031 # Unfortunately, folding rules are ambiguous.
1033 # This is the folding according to the specification:
1035 if folded
and line_break
== u
'\n' \
1036 and leading_non_space
and self
.reader
.peek() not in u
' \t':
1040 chunks
.append(line_break
)
1042 # This is Clark Evans's interpretation (also in the spec
1045 #if folded and line_break == u'\n':
1047 # if self.reader.peek() not in ' \t':
1048 # chunks.append(u' ')
1050 # chunks.append(line_break)
1052 # chunks.append(line_break)
1057 if chomping
is not False:
1058 chunks
.append(line_break
)
1059 if chomping
is True:
1060 chunks
.extend(breaks
)
1063 return ScalarToken(u
''.join(chunks
), False, start_mark
, end_mark
,
1066 def scan_block_scalar_indicators(self
, start_mark
):
1067 # See the specification for details.
1070 ch
= self
.reader
.peek()
1076 self
.reader
.forward()
1077 ch
= self
.reader
.peek()
1078 if ch
in u
'0123456789':
1081 raise ScannerError("while scanning a block scalar", start_mark
,
1082 "expected indentation indicator in the range 1-9, but found 0",
1083 self
.reader
.get_mark())
1084 self
.reader
.forward()
1085 elif ch
in u
'0123456789':
1088 raise ScannerError("while scanning a block scalar", start_mark
,
1089 "expected indentation indicator in the range 1-9, but found 0",
1090 self
.reader
.get_mark())
1091 self
.reader
.forward()
1092 ch
= self
.reader
.peek()
1098 self
.reader
.forward()
1099 ch
= self
.reader
.peek()
1100 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
1101 raise ScannerError("while scanning a block scalar", start_mark
,
1102 "expected chomping or indentation indicators, but found %r"
1103 % ch
.encode('utf-8'), self
.reader
.get_mark())
1104 return chomping
, increment
def scan_block_scalar_ignored_line(self, start_mark):
    """Consume trailing spaces, an optional comment, and the line break
    after the block scalar header ('|'/'>' plus indicators).
    """
    # See the specification for details.
    while self.reader.peek() == u' ':
        self.reader.forward()
    if self.reader.peek() == u'#':
        while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
            self.reader.forward()
    ch = self.reader.peek()
    if ch not in u'\0\r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected a comment or a line break, but found %r"
                % ch.encode('utf-8'), self.reader.get_mark())
    self.scan_line_break()
def scan_block_scalar_indentation(self):
    """Skip leading blank lines of a block scalar and return
    (collected line breaks, deepest indentation seen, end mark).

    Restored: `chunks`/`max_indent` initialization and the `else:`
    branch were missing from the damaged source.
    """
    # See the specification for details.
    chunks = []
    max_indent = 0
    end_mark = self.reader.get_mark()
    while self.reader.peek() in u' \r\n\x85\u2028\u2029':
        if self.reader.peek() != u' ':
            chunks.append(self.scan_line_break())
            end_mark = self.reader.get_mark()
        else:
            self.reader.forward()
            if self.reader.column > max_indent:
                max_indent = self.reader.column
    return chunks, max_indent, end_mark
def scan_block_scalar_breaks(self, indent):
    """Consume indentation (up to `indent`) and blank lines inside a
    block scalar; return (collected line breaks, end mark).

    Restored: the `chunks = []` initialization was missing.
    """
    # See the specification for details.
    chunks = []
    end_mark = self.reader.get_mark()
    while self.reader.column < indent and self.reader.peek() == u' ':
        self.reader.forward()
    while self.reader.peek() in u'\r\n\x85\u2028\u2029':
        chunks.append(self.scan_line_break())
        end_mark = self.reader.get_mark()
        while self.reader.column < indent and self.reader.peek() == u' ':
            self.reader.forward()
    return chunks, end_mark
def scan_flow_scalar(self, style):
    """Scan a single- or double-quoted scalar; `style` is the opening
    quote character.  Returns a ScalarToken with plain=False.
    """
    # See the specification for details.
    # Note that we loose indentation rules for quoted scalars. Quoted
    # scalars don't need to adhere indentation because " and ' clearly
    # mark the beginning and the end of them. Therefore we are less
    # restrictive then the specification requires. We only need to check
    # that document separators are not included in scalars.
    # Fix: `double` and `chunks` were referenced without being
    # initialized, and the ScalarToken call was missing the `style`
    # argument.
    if style == '"':
        double = True
    else:
        double = False
    chunks = []
    start_mark = self.reader.get_mark()
    quote = self.reader.peek()
    self.reader.forward()
    chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    while self.reader.peek() != quote:
        chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
    self.reader.forward()
    end_mark = self.reader.get_mark()
    return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
            style)
# Map from the character following '\' in a double-quoted scalar to the
# character it denotes (YAML single-character escapes).  Restored: the
# dict body and closing brace were missing.  The companion ESCAPE_CODES
# table (referenced by scan_flow_scalar_non_spaces) maps the numeric
# escape introducers to the number of hex digits that follow.
ESCAPE_REPLACEMENTS = {
    u'0':   u'\0',
    u'a':   u'\x07',
    u'b':   u'\x08',
    u't':   u'\x09',
    u'\t':  u'\x09',
    u'n':   u'\x0A',
    u'v':   u'\x0B',
    u'f':   u'\x0C',
    u'r':   u'\x0D',
    u'e':   u'\x1B',
    u' ':   u'\x20',
    u'\"':  u'\"',
    u'\\':  u'\\',
    u'N':   u'\x85',
    u'_':   u'\xA0',
    u'L':   u'\u2028',
    u'P':   u'\u2029',
}

ESCAPE_CODES = {
    u'x':   2,
    u'u':   4,
    u'U':   8,
}
def scan_flow_scalar_non_spaces(self, double, start_mark):
    """Scan the non-blank part of a quoted scalar, handling '' quoting
    in single-quoted scalars and backslash escapes in double-quoted
    ones.  Returns the list of text chunks scanned; control returns to
    the caller when the next character is a closing quote, whitespace,
    a line break, or the end of the stream.
    """
    # See the specification for details.
    # Fix: restore the missing `chunks = []`, `while True:` loop,
    # `length` scaffolding, `chunks.append(ch)` and `return chunks`.
    chunks = []
    while True:
        # Take the longest run of ordinary characters in one step.
        length = 0
        while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
        ch = self.reader.peek()
        if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
            # '' inside a single-quoted scalar denotes a literal quote.
            chunks.append(u'\'')
            self.reader.forward(2)
        elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
            # A quote or backslash that is not special in this style.
            chunks.append(ch)
            self.reader.forward()
        elif double and ch == u'\\':
            self.reader.forward()
            ch = self.reader.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                # Single-character escape, e.g. \n, \t.
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.reader.forward()
            elif ch in self.ESCAPE_CODES:
                # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                length = self.ESCAPE_CODES[ch]
                self.reader.forward()
                for k in range(length):
                    if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                    (length, self.reader.peek(k).encode('utf-8')), self.reader.get_mark())
                code = int(self.reader.prefix(length), 16)
                chunks.append(unichr(code))
                self.reader.forward(length)
            elif ch in u'\r\n\x85\u2028\u2029':
                # Escaped line break: fold the following breaks away.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_mark())
        else:
            return chunks
def scan_flow_scalar_spaces(self, double, start_mark):
    """Scan the blanks and line breaks between two non-blank parts of a
    quoted scalar, applying line folding.  Returns the list of chunks
    to insert between the parts.
    """
    # See the specification for details.
    # Fix: restore the missing initializations, the end-of-stream check
    # header, the plain-whitespace branch and `return chunks`.
    chunks = []
    length = 0
    while self.reader.peek(length) in u' \t':
        length += 1
    whitespaces = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch == u'\0':
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.reader.get_mark())
    elif ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        breaks = self.scan_flow_scalar_breaks(double, start_mark)
        # Line folding: a single '\n' becomes a space; other breaks kept.
        if line_break != u'\n':
            chunks.append(line_break)
        elif not breaks:
            chunks.append(u' ')
        chunks.extend(breaks)
    else:
        chunks.append(whitespaces)
    return chunks
def scan_flow_scalar_breaks(self, double, start_mark):
    """Scan consecutive line breaks inside a quoted scalar, rejecting
    document separators.  Returns the list of scanned line breaks.
    """
    # See the specification for details.
    # Fix: restore the missing `chunks = []`, `while True:` loop and
    # terminating `return chunks`.
    chunks = []
    while True:
        # Instead of checking indentation, we check for document
        # separators.
        prefix = self.reader.prefix(3)
        if (prefix == u'---' or prefix == u'...') \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.reader.get_mark())
        while self.reader.peek() in u' \t':
            self.reader.forward()
        if self.reader.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
        else:
            return chunks
def scan_plain(self):
    """Scan a plain (unquoted) scalar and return a ScalarToken with
    plain=True.
    """
    # See the specification for details.
    # We add an additional restriction for the flow context:
    #   plain scalars in the flow context cannot contain ',', ':' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    # Fix: the break-character set after ':' contained '\x28' ('(')
    # instead of the NEL character '\x85' used by every other break set
    # in this scanner; also restore the missing loop scaffolding.
    chunks = []
    start_mark = self.reader.get_mark()
    end_mark = start_mark
    indent = self.indent+1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    spaces = []
    while True:
        length = 0
        if self.reader.peek() == u'#':
            # A comment terminates the scalar.
            break
        while True:
            ch = self.reader.peek(length)
            if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                    or (not self.flow_level and ch == u':' and
                        self.reader.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                    or (self.flow_level and ch in u',:?[]{}'):
                break
            length += 1
        if length == 0:
            break
        self.allow_simple_key = False
        chunks.extend(spaces)
        chunks.append(self.reader.prefix(length))
        self.reader.forward(length)
        end_mark = self.reader.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        if not spaces or self.reader.peek() == u'#' \
                or (not self.flow_level and self.reader.column < indent):
            break
    return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
def scan_plain_spaces(self, indent, start_mark):
    """Scan spaces and line breaks between two parts of a plain scalar.
    Returns the list of chunks to insert between the parts, or None if
    a document separator terminates the scalar.
    """
    # See the specification for details.
    # The specification is really confusing about tabs in plain scalars.
    # We just forbid them completely. Do not use tabs in YAML!
    # Fix: restore the missing initializations, the `return` statements
    # on document separators and the folding branch structure.
    chunks = []
    length = 0
    while self.reader.peek(length) in u' ':
        length += 1
    whitespaces = self.reader.prefix(length)
    self.reader.forward(length)
    ch = self.reader.peek()
    if ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        self.allow_simple_key = True
        prefix = self.reader.prefix(3)
        if (prefix == u'---' or prefix == u'...') \
                and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            return
        breaks = []
        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
            if self.reader.peek() == ' ':
                self.reader.forward()
            else:
                breaks.append(self.scan_line_break())
                prefix = self.reader.prefix(3)
                if (prefix == u'---' or prefix == u'...') \
                        and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                    return
        # Line folding: a single '\n' becomes a space; other breaks kept.
        if line_break != u'\n':
            chunks.append(line_break)
        elif not breaks:
            chunks.append(u' ')
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks
def scan_tag_handle(self, name, start_mark):
    """Scan a tag handle ('!', '!!' or '!word!') and return its text."""
    # See the specification for details.
    # For some strange reasons, the specification does not allow '_' in
    # tag handles. I have allowed it anyway.
    # Fix: restore the missing '!' guard, `length` initialization and
    # increments, and the `return value` statement.
    ch = self.reader.peek()
    if ch != u'!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    length = 1
    ch = self.reader.peek(length)
    if ch != u' ':
        # A named handle: the word must be terminated by another '!'.
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if ch != u'!':
            self.reader.forward(length)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.reader.get_mark())
        length += 1
    value = self.reader.prefix(length)
    self.reader.forward(length)
    return value
def scan_tag_uri(self, name, start_mark):
    """Scan a tag URI, decoding %XX escapes, and return it as unicode."""
    # See the specification for details.
    # Note: we do not check if URI is well-formed.
    # Fix: restore the missing `chunks`/`length` initialization, the
    # '%'-escape branch and the `if length:`/`if not chunks:` headers.
    chunks = []
    length = 0
    ch = self.reader.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
            or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
        if ch == u'%':
            # Flush the plain run, then decode the escape sequence.
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
            chunks.append(self.scan_uri_escapes(name, start_mark))
        else:
            length += 1
        ch = self.reader.peek(length)
    if length:
        chunks.append(self.reader.prefix(length))
        self.reader.forward(length)
        length = 0
    if not chunks:
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch.encode('utf-8'),
                self.reader.get_mark())
    return u''.join(chunks)
1412 def scan_uri_escapes(self
, name
, start_mark
):
1413 # See the specification for details.
1415 mark
= self
.reader
.get_mark()
1416 while self
.reader
.peek() == u
'%':
1417 self
.reader
.forward()
1419 if self
.reader
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1420 raise ScannerError("while scanning a %s" % name
, start_mark
,
1421 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
1422 (self
.reader
.peek(k
).encode('utf-8')), self
.reader
.get_mark())
1423 bytes
.append(chr(int(self
.reader
.prefix(2), 16)))
1424 self
.reader
.forward(2)
1426 value
= unicode(''.join(bytes
), 'utf-8')
1427 except UnicodeDecodeError, exc
:
1428 raise ScannerError("while scanning a %s" % name
, start_mark
, str(exc
), mark
)
def scan_line_break(self):
    """Consume one line break and return its normalized value.

    Transforms (as per the YAML specification):
        CR LF, CR, LF, NEL  ->  LF
        LS, PS              ->  themselves
        anything else       ->  '' (nothing consumed)
    """
    # Fix: restore the three missing `return` statements.
    ch = self.reader.peek()
    if ch in u'\r\n\x85':
        if self.reader.prefix(2) == u'\r\n':
            self.reader.forward(2)
        else:
            self.reader.forward()
        return u'\n'
    elif ch in u'\u2028\u2029':
        self.reader.forward()
        return ch
    return u''
1454 # psyco.bind(Scanner)
1455 #except ImportError: