Fix a few bugs.
[pyyaml/python3.git] / lib / yaml / scanner.py
blob09e3bd6e69873a5e8edfaa5d5ad70aa1781bc409
2 # Scanner produces tokens of the following types:
3 # DIRECTIVE(name, value)
4 # DOCUMENT-START
5 # DOCUMENT-END
6 # STREAM-END
7 # BLOCK-SEQUENCE-START
8 # BLOCK-MAPPING-START
9 # BLOCK-END
10 # FLOW-SEQUENCE-START
11 # FLOW-MAPPING-START
12 # FLOW-SEQUENCE-END
13 # FLOW-MAPPING-END
14 # BLOCK-ENTRY
15 # FLOW-ENTRY
16 # KEY
17 # VALUE
18 # ALIAS(value)
19 # ANCHOR(value)
20 # TAG(value)
21 # SCALAR(value, plain)
23 # Read comments in the Scanner code for more details.
26 __all__ = ['Scanner', 'ScannerError']
28 from error import MarkedYAMLError
29 from tokens import *
class ScannerError(MarkedYAMLError):
    """Raised when the input stream cannot be tokenized as YAML."""
    pass
class SimpleKey:
    # A record of a position where a simple (unmarked) key may start.
    # See the simple keys treatment in Scanner below.

    def __init__(self, token_number, required, index, line, column, marker):
        # Overall number of the token that would start the key.
        self.token_number = token_number
        # True when a key MUST occur here (first token on a block line
        # at the current indentation level).
        self.required = required
        # Reader position (character index, line, column) of the candidate.
        self.index = index
        self.line = line
        self.column = column
        # Saved reader marker, kept for error reporting.
        self.marker = marker
45 class Scanner:
48 def __init__(self, reader):
49 """Initialize the scanner."""
50 # The input stream. The Reader class do the dirty work of checking for
51 # BOM and converting the input data to Unicode. It also adds NUL to
52 # the end.
54 # Reader supports the following methods
55 # self.reader.peek(i=0) # peek the next i-th character
56 # self.reader.prefix(l=1) # peek the next l characters
57 # self.reader.forward(l=1) # read the next l characters
58 # and move the pointer
59 self.reader = reader
61 # Had we reached the end of the stream?
62 self.done = False
64 # The number of unclosed '{' and '['. `flow_level == 0` means block
65 # context.
66 self.flow_level = 0
68 # List of processed tokens that are not yet emitted.
69 self.tokens = []
71 # Number of tokens that were emitted through the `get_token` method.
72 self.tokens_taken = 0
74 # The current indentation level.
75 self.indent = -1
77 # Past indentation levels.
78 self.indents = []
80 # Variables related to simple keys treatment.
82 # A simple key is a key that is not denoted by the '?' indicator.
83 # Example of simple keys:
84 # ---
85 # block simple key: value
86 # ? not a simple key:
87 # : { flow simple key: value }
88 # We emit the KEY token before all keys, so when we find a potential
89 # simple key, we try to locate the corresponding ':' indicator.
90 # Simple keys should be limited to a single line and 1024 characters.
92 # Can a simple key start at the current position? A simple key may
93 # start:
94 # - at the beginning of the line, not counting indentation spaces
95 # (in block context),
96 # - after '{', '[', ',' (in the flow context),
97 # - after '?', ':', '-' (in the block context).
98 # In the block context, this flag also signify if a block collection
99 # may start at the current position.
100 self.allow_simple_key = True
102 # Keep track of possible simple keys. This is a dictionary. The key
103 # is `flow_level`; there can be no more that one possible simple key
104 # for each level. The value is a SimpleKey record:
105 # (token_number, required, index, line, column, marker)
106 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
107 # '[', or '{' tokens.
108 self.possible_simple_keys = {}
110 # Public methods.
112 def check(self, *choices):
113 # Check if the next token is one of the given types.
114 while self.need_more_tokens():
115 self.fetch_more_tokens()
116 if self.tokens:
117 for choice in choices:
118 if isinstance(self.tokens[0], choice):
119 return True
120 return False
122 def peek(self):
123 # Return the next token, but do not delete if from the queue.
124 while self.need_more_tokens():
125 self.fetch_more_tokens()
126 if self.tokens:
127 return self.tokens[0]
129 def get(self):
130 # Return the next token.
131 while self.need_more_tokens():
132 self.fetch_more_tokens()
133 if self.tokens:
134 self.tokens_taken += 1
135 return self.tokens.pop(0)
    def __iter__(self):
        """Yield all remaining tokens, scanning lazily.

        The queue is refilled after every yield so that simple-key
        bookkeeping stays consistent between tokens.
        """
        # Iterator protocol.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        while self.tokens:
            self.tokens_taken += 1
            yield self.tokens.pop(0)
            while self.need_more_tokens():
                self.fetch_more_tokens()
147 # Private methods.
    def need_more_tokens(self):
        """Decide whether another token must be scanned before emitting.

        Returns False at end of stream, True when the queue is empty or the
        queued head token may still turn out to be a simple key; otherwise
        falls through returning None, which callers treat as False.
        """
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
160 def fetch_more_tokens(self):
162 # Eat whitespaces and comments until we reach the next token.
163 self.scan_to_next_token()
165 # Remove obsolete possible simple keys.
166 self.stale_possible_simple_keys()
168 # Compare the current indentation and column. It may add some tokens
169 # and decrease the current indentation level.
170 self.unwind_indent(self.reader.column)
172 # Peek the next character.
173 ch = self.reader.peek()
175 # Is it the end of stream?
176 if ch == u'\0':
177 return self.fetch_stream_end()
179 # Is it a directive?
180 if ch == u'%' and self.check_directive():
181 return self.fetch_directive()
183 # Is it the document start?
184 if ch == u'-' and self.check_document_start():
185 return self.fetch_document_start()
187 # Is it the document end?
188 if ch == u'.' and self.check_document_end():
189 return self.fetch_document_end()
191 # TODO: support for BOM within a stream.
192 #if ch == u'\uFEFF':
193 # return self.fetch_bom() <-- issue BOMToken
195 # Note: the order of the following checks is NOT significant.
197 # Is it the flow sequence start indicator?
198 if ch == u'[':
199 return self.fetch_flow_sequence_start()
201 # Is it the flow mapping start indicator?
202 if ch == u'{':
203 return self.fetch_flow_mapping_start()
205 # Is it the flow sequence end indicator?
206 if ch == u']':
207 return self.fetch_flow_sequence_end()
209 # Is it the flow mapping end indicator?
210 if ch == u'}':
211 return self.fetch_flow_mapping_end()
213 # Is it the flow entry indicator?
214 if ch in u',':
215 return self.fetch_flow_entry()
217 # Is it the block entry indicator?
218 if ch in u'-' and self.check_block_entry():
219 return self.fetch_block_entry()
221 # Is it the key indicator?
222 if ch == u'?' and self.check_key():
223 return self.fetch_key()
225 # Is it the value indicator?
226 if ch == u':' and self.check_value():
227 return self.fetch_value()
229 # Is it an alias?
230 if ch == u'*':
231 return self.fetch_alias()
233 # Is it an anchor?
234 if ch == u'&':
235 return self.fetch_anchor()
237 # Is it a tag?
238 if ch == u'!':
239 return self.fetch_tag()
241 # Is it a literal scalar?
242 if ch == u'|' and not self.flow_level:
243 return self.fetch_literal()
245 # Is it a folded scalar?
246 if ch == u'>' and not self.flow_level:
247 return self.fetch_folded()
249 # Is it a single quoted scalar?
250 if ch == u'\'':
251 return self.fetch_single()
253 # Is it a double quoted scalar?
254 if ch == u'\"':
255 return self.fetch_double()
257 # It must be a plain scalar then.
258 if self.check_plain():
259 return self.fetch_plain()
261 # No? It's an error. Let's produce a nice error message.
262 raise ScannerError("while scanning for the next token", None,
263 "found character %r that cannot start any token"
264 % ch.encode('utf-8'), self.reader.get_marker())
266 # Simple keys treatment.
268 def next_possible_simple_key(self):
269 # Return the number of the nearest possible simple key. Actually we
270 # don't need to loop through the whole dictionary. We may replace it
271 # with the following code:
272 # if not self.possible_simple_keys:
273 # return None
274 # return self.possible_simple_keys[
275 # min(self.possible_simple_keys.keys())].token_number
276 min_token_number = None
277 for level in self.possible_simple_keys:
278 key = self.possible_simple_keys[level]
279 if min_token_number is None or key.token_number < min_token_number:
280 min_token_number = key.token_number
281 return min_token_number
283 def stale_possible_simple_keys(self):
284 # Remove entries that are no longer possible simple keys. According to
285 # the YAML specification, simple keys
286 # - should be limited to a single line,
287 # - should be no longer than 1024 characters.
288 # Disabling this procedure will allow simple keys of any length and
289 # height (may cause problems if indentation is broken though).
290 for level in self.possible_simple_keys.keys():
291 key = self.possible_simple_keys[level]
292 if key.line != self.reader.line \
293 or self.reader.index-key.index > 1024:
294 if key.required:
295 raise ScannerError("while scanning a simple key", key.marker,
296 "could not found expected ':'", self.reader.get_marker())
297 del self.possible_simple_keys[level]
299 def save_possible_simple_key(self):
300 # The next token may start a simple key. We check if it's possible
301 # and save its position. This function is called for
302 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
304 # Check if a simple key is required at the current position.
305 required = not self.flow_level and self.indent == self.reader.column
307 # A simple key is required only if it is the first token in the current
308 # line. Therefore it is always allowed.
309 assert self.allow_simple_key or not required
311 # The next token might be a simple key. Let's save it's number and
312 # position.
313 if self.allow_simple_key:
314 self.remove_possible_simple_key()
315 token_number = self.tokens_taken+len(self.tokens)
316 index = self.reader.index
317 line = self.reader.line
318 column = self.reader.column
319 marker = self.reader.get_marker()
320 key = SimpleKey(token_number, required,
321 index, line, column, marker)
322 self.possible_simple_keys[self.flow_level] = key
324 def remove_possible_simple_key(self):
325 # Remove the saved possible key position at the current flow level.
326 if self.flow_level in self.possible_simple_keys:
327 key = self.possible_simple_keys[self.flow_level]
329 # I don't think it's possible, but I could be wrong.
330 assert not key.required
331 #if key.required:
332 # raise ScannerError("while scanning a simple key", key.marker,
333 # "could not found expected ':'", self.reader.get_marker())
335 # Indentation functions.
337 def unwind_indent(self, column):
339 # In flow context, tokens should respect indentation.
340 # Actually the condition should be `self.indent >= column` according to
341 # the spec. But this condition will prohibit intuitively correct
342 # constructions such as
343 # key : {
345 if self.flow_level and self.indent > column:
346 raise ScannerError(None, None,
347 "invalid intendation or unclosed '[' or '{'",
348 self.reader.get_marker())
350 # In block context, we may need to issue the BLOCK-END tokens.
351 while self.indent > column:
352 marker = self.reader.get_marker()
353 self.indent = self.indents.pop()
354 self.tokens.append(BlockEndToken(marker, marker))
356 def add_indent(self, column):
357 # Check if we need to increase indentation.
358 if self.indent < column:
359 self.indents.append(self.indent)
360 self.indent = column
361 return True
362 return False
364 # Fetchers.
366 def fetch_stream_end(self):
368 # Set the current intendation to -1.
369 self.unwind_indent(-1)
371 # Reset everything (not really needed).
372 self.allow_simple_key = False
373 self.possible_simple_keys = {}
375 # Read the token.
376 marker = self.reader.get_marker()
378 # Add END.
379 self.tokens.append(StreamEndToken(marker, marker))
381 # The reader is ended.
382 self.done = True
384 def fetch_directive(self):
386 # Set the current intendation to -1.
387 self.unwind_indent(-1)
389 # Reset simple keys.
390 self.remove_possible_simple_key()
391 self.allow_simple_key = False
393 # Scan and add DIRECTIVE.
394 self.tokens.append(self.scan_directive())
    def fetch_document_start(self):
        # '---' at the start of a line: emit DOCUMENT-START.
        self.fetch_document_indicator(DocumentStartToken)
    def fetch_document_end(self):
        # '...' at the start of a line: emit DOCUMENT-END.
        self.fetch_document_indicator(DocumentEndToken)
402 def fetch_document_indicator(self, TokenClass):
404 # Set the current intendation to -1.
405 self.unwind_indent(-1)
407 # Reset simple keys. Note that there could not be a block collection
408 # after '---'.
409 self.remove_possible_simple_key()
410 self.allow_simple_key = False
412 # Add DOCUMENT-START or DOCUMENT-END.
413 start_marker = self.reader.get_marker()
414 self.reader.forward(3)
415 end_marker = self.reader.get_marker()
416 self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_sequence_start(self):
        # '[' opens a flow sequence.
        self.fetch_flow_collection_start(FlowSequenceStartToken)
    def fetch_flow_mapping_start(self):
        # '{' opens a flow mapping.
        self.fetch_flow_collection_start(FlowMappingStartToken)
424 def fetch_flow_collection_start(self, TokenClass):
426 # '[' and '{' may start a simple key.
427 self.save_possible_simple_key()
429 # Increase the flow level.
430 self.flow_level += 1
432 # Simple keys are allowed after '[' and '{'.
433 self.allow_simple_key = True
435 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
436 start_marker = self.reader.get_marker()
437 self.reader.forward()
438 end_marker = self.reader.get_marker()
439 self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_sequence_end(self):
        # ']' closes a flow sequence.
        self.fetch_flow_collection_end(FlowSequenceEndToken)
    def fetch_flow_mapping_end(self):
        # '}' closes a flow mapping.
        self.fetch_flow_collection_end(FlowMappingEndToken)
447 def fetch_flow_collection_end(self, TokenClass):
449 # Reset possible simple key on the current level.
450 self.remove_possible_simple_key()
452 # Decrease the flow level.
453 self.flow_level -= 1
455 # No simple keys after ']' or '}'.
456 self.allow_simple_key = False
458 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
459 start_marker = self.reader.get_marker()
460 self.reader.forward()
461 end_marker = self.reader.get_marker()
462 self.tokens.append(TokenClass(start_marker, end_marker))
464 def fetch_flow_entry(self):
466 # Simple keys are allowed after ','.
467 self.allow_simple_key = True
469 # Reset possible simple key on the current level.
470 self.remove_possible_simple_key()
472 # Add FLOW-ENTRY.
473 start_marker = self.reader.get_marker()
474 self.reader.forward()
475 end_marker = self.reader.get_marker()
476 self.tokens.append(FlowEntryToken(start_marker, end_marker))
478 def fetch_block_entry(self):
480 # Block context needs additional checks.
481 if not self.flow_level:
483 # Are we allowed to start a new entry?
484 if not self.allow_simple_key:
485 raise ScannerError(None, None,
486 "sequence entries are not allowed here",
487 self.reader.get_marker())
489 # We may need to add BLOCK-SEQUENCE-START.
490 if self.add_indent(self.reader.column):
491 marker = self.reader.get_marker()
492 self.tokens.append(BlockSequenceStartToken(marker, marker))
494 # It's an error for the block entry to occur in the flow context,
495 # but we let the parser detect this.
496 else:
497 pass
499 # Simple keys are allowed after '-'.
500 self.allow_simple_key = True
502 # Reset possible simple key on the current level.
503 self.remove_possible_simple_key()
505 # Add BLOCK-ENTRY.
506 start_marker = self.reader.get_marker()
507 self.reader.forward()
508 end_marker = self.reader.get_marker()
509 self.tokens.append(BlockEntryToken(start_marker, end_marker))
511 def fetch_key(self):
513 # Block context needs additional checks.
514 if not self.flow_level:
516 # Are we allowed to start a key (not nessesary a simple)?
517 if not self.allow_simple_key:
518 raise ScannerError(None, None,
519 "mapping keys are not allowed here",
520 self.reader.get_marker())
522 # We may need to add BLOCK-MAPPING-START.
523 if self.add_indent(self.reader.column):
524 marker = self.reader.get_marker()
525 self.tokens.append(BlockMappingStartToken(marker, marker))
527 # Simple keys are allowed after '?' in the block context.
528 self.allow_simple_key = not self.flow_level
530 # Reset possible simple key on the current level.
531 self.remove_possible_simple_key()
533 # Add KEY.
534 start_marker = self.reader.get_marker()
535 self.reader.forward()
536 end_marker = self.reader.get_marker()
537 self.tokens.append(KeyToken(start_marker, end_marker))
    def fetch_value(self):
        """Handle ':' — queue VALUE, retrofitting a KEY token (and possibly
        BLOCK-MAPPING-START) when a candidate simple key was pending."""
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY: insert it back *before* the tokens scanned since the
            # candidate was recorded (token_number - tokens_taken gives its
            # offset in the pending queue).
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.marker, key.marker))
            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.marker, key.marker))
            # There cannot be two simple keys one after another.
            self.allow_simple_key = False
        # It must be a part of a complex key.
        else:
            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:
                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_marker())
            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add VALUE.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(ValueToken(start_marker, end_marker))
587 def fetch_alias(self):
589 # ALIAS could be a simple key.
590 self.save_possible_simple_key()
592 # No simple keys after ALIAS.
593 self.allow_simple_key = False
595 # Scan and add ALIAS.
596 self.tokens.append(self.scan_anchor(AliasToken))
598 def fetch_anchor(self):
600 # ANCHOR could start a simple key.
601 self.save_possible_simple_key()
603 # No simple keys after ANCHOR.
604 self.allow_simple_key = False
606 # Scan and add ANCHOR.
607 self.tokens.append(self.scan_anchor(AnchorToken))
609 def fetch_tag(self):
611 # TAG could start a simple key.
612 self.save_possible_simple_key()
614 # No simple keys after TAG.
615 self.allow_simple_key = False
617 # Scan and add TAG.
618 self.tokens.append(self.scan_tag())
    def fetch_literal(self):
        # '|' introduces a literal block scalar.
        self.fetch_block_scalar(folded=False)
    def fetch_folded(self):
        # '>' introduces a folded block scalar.
        self.fetch_block_scalar(folded=True)
626 def fetch_block_scalar(self, folded):
628 # A simple key may follow a block scalar.
629 self.allow_simple_key = True
631 # Reset possible simple key on the current level.
632 self.remove_possible_simple_key()
634 # Scan and add SCALAR.
635 self.tokens.append(self.scan_block_scalar(folded))
    def fetch_single(self):
        # "'" introduces a single-quoted flow scalar.
        self.fetch_flow_scalar(double=False)
    def fetch_double(self):
        # '"' introduces a double-quoted flow scalar.
        self.fetch_flow_scalar(double=True)
643 def fetch_flow_scalar(self, double):
645 # A flow scalar could be a simple key.
646 self.save_possible_simple_key()
648 # No simple keys after flow scalars.
649 self.allow_simple_key = False
651 # Scan and add SCALAR.
652 self.tokens.append(self.scan_flow_scalar(double))
654 def fetch_plain(self):
656 # A plain scalar could be a simple key.
657 self.save_possible_simple_key()
659 # No simple keys after plain scalars. But note that `scan_plain` will
660 # change this flag if the scan is finished at the beginning of the
661 # line.
662 self.allow_simple_key = False
664 # Scan and add SCALAR. May change `allow_simple_key`.
665 self.tokens.append(self.scan_plain())
667 # Checkers.
669 def check_directive(self):
671 # DIRECTIVE: ^ '%' ...
672 # The '%' indicator is already checked.
673 if self.reader.column == 0:
674 return True
676 def check_document_start(self):
678 # DOCUMENT-START: ^ '---' (' '|'\n')
679 if self.reader.column == 0:
680 if self.reader.prefix(3) == u'---' \
681 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
682 return True
684 def check_document_end(self):
686 # DOCUMENT-END: ^ '...' (' '|'\n')
687 if self.reader.column == 0:
688 prefix = self.reader.peek(4)
689 if self.reader.prefix(3) == u'...' \
690 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
691 return True
693 def check_block_entry(self):
695 # BLOCK-ENTRY: '-' (' '|'\n')
696 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
698 def check_key(self):
700 # KEY(flow context): '?'
701 if self.flow_level:
702 return True
704 # KEY(block context): '?' (' '|'\n')
705 else:
706 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
708 def check_value(self):
710 # VALUE(flow context): ':'
711 if self.flow_level:
712 return True
714 # VALUE(block context): ':' (' '|'\n')
715 else:
716 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
718 def check_plain(self):
720 # A plain scalar may start with any non-space character except:
721 # '-', '?', ':', ',', '[', ']', '{', '}',
722 # '#', '&', '*', '!', '|', '>', '\'', '\"',
723 # '%', '@', '`'.
725 # It may also start with
726 # '-', '?', ':'
727 # if it is followed by a non-space character.
729 # Note that we limit the last rule to the block context (except the
730 # '-' character) because we want the flow context to be space
731 # independent.
732 ch = self.reader.peek()
733 return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
734 or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
735 and (ch == '-' or (not self.flow_level and ch in u'?:')))
737 # Scanners.
    def scan_to_next_token(self):
        """Skip spaces, comments and line breaks up to the next token.

        A line break in block context re-enables simple keys.  A BOM is
        stripped only when it is the very first character of the stream;
        in-stream BOMs are left for the document (the spec asks for more —
        see the TODO in fetch_more_tokens).
        """
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            # Skip runs of spaces.
            while self.reader.peek() == u' ':
                self.reader.forward()
            # Skip a comment through to the end of the line.
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            if self.scan_line_break():
                # A fresh line: block context may begin a simple key here.
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        """Scan a full '%NAME value' directive line into a DirectiveToken.

        %YAML and %TAG get parsed values; unknown directives are skipped
        and keep value None.
        """
        # See the specification for details.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        name = self.scan_directive_name(start_marker)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        else:
            # Unknown directive: consume its parameters without parsing.
            end_marker = self.reader.get_marker()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_marker)
        return DirectiveToken(name, value, start_marker, end_marker)
793 def scan_directive_name(self, start_marker):
794 # See the specification for details.
795 length = 0
796 ch = self.reader.peek(length)
797 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
798 or ch in u'-_':
799 length += 1
800 ch = self.reader.peek(length)
801 if not length:
802 raise ScannerError("while scanning a directive", start_marker,
803 "expected alphabetic or numeric character, but found %r"
804 % ch.encode('utf-8'), self.reader.get_marker())
805 value = self.reader.prefix(length)
806 self.reader.forward(length)
807 ch = self.reader.peek()
808 if ch not in u'\0 \r\n\x85\u2028\u2029':
809 raise ScannerError("while scanning a directive", start_marker,
810 "expected alphabetic or numeric character, but found %r"
811 % ch.encode('utf-8'), self.reader.get_marker())
812 return value
814 def scan_yaml_directive_value(self, start_marker):
815 # See the specification for details.
816 while self.reader.peek() == u' ':
817 self.reader.forward()
818 major = self.scan_yaml_directive_number(start_marker)
819 if self.reader.peek() != '.':
820 raise ScannerError("while scanning a directive", start_marker,
821 "expected a digit or '.', but found %r"
822 % self.reader.peek().encode('utf-8'),
823 self.reader.get_marker())
824 self.reader.forward()
825 minor = self.scan_yaml_directive_number(start_marker)
826 if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
827 raise ScannerError("while scanning a directive", start_marker,
828 "expected a digit or ' ', but found %r"
829 % self.reader.peek().encode('utf-8'),
830 self.reader.get_marker())
831 return (major, minor)
833 def scan_yaml_directive_number(self, start_marker):
834 # See the specification for details.
835 ch = self.reader.peek()
836 if not (u'0' <= ch <= '9'):
837 raise ScannerError("while scanning a directive", start_marker,
838 "expected a digit, but found %r" % ch.encode('utf-8'),
839 self.reader.get_marker())
840 length = 0
841 while u'0' <= self.reader.peek(length) <= u'9':
842 length += 1
843 value = int(self.reader.prefix(length))
844 self.reader.forward(length)
845 return value
847 def scan_tag_directive_value(self, start_marker):
848 # See the specification for details.
849 while self.reader.peek() == u' ':
850 self.reader.forward()
851 handle = self.scan_tag_directive_handle(start_marker)
852 while self.reader.peek() == u' ':
853 self.reader.forward()
854 prefix = self.scan_tag_directive_prefix(start_marker)
855 return (handle, prefix)
857 def scan_tag_directive_handle(self, start_marker):
858 # See the specification for details.
859 value = self.scan_tag_handle('directive', start_marker)
860 ch = self.reader.peek()
861 if ch != u' ':
862 raise ScannerError("while scanning a directive", start_marker,
863 "expected ' ', but found %r" % ch.encode('utf-8'),
864 self.reader.get_marker())
865 return value
867 def scan_tag_directive_prefix(self, start_marker):
868 # See the specification for details.
869 value = self.scan_tag_uri('directive', start_marker)
870 ch = self.reader.peek()
871 if ch not in u'\0 \r\n\x85\u2028\u2029':
872 raise ScannerError("while scanning a directive", start_marker,
873 "expected ' ', but found %r" % ch.encode('utf-8'),
874 self.reader.get_marker())
875 return value
877 def scan_directive_ignored_line(self, start_marker):
878 # See the specification for details.
879 while self.reader.peek() == u' ':
880 self.reader.forward()
881 if self.reader.peek() == u'#':
882 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
883 self.reader.forward()
884 ch = self.reader.peek()
885 if ch not in u'\0\r\n\x85\u2028\u2029':
886 raise ScannerError("while scanning a directive", start_marker,
887 "expected a comment or a line break, but found %r"
888 % ch.encode('utf-8'), self.reader.get_marker())
889 self.scan_line_break()
891 def scan_anchor(self, TokenClass):
892 # The specification does not restrict characters for anchors and
893 # aliases. This may lead to problems, for instance, the document:
894 # [ *alias, value ]
895 # can be interpteted in two ways, as
896 # [ "value" ]
897 # and
898 # [ *alias , "value" ]
899 # Therefore we restrict aliases to numbers and ASCII letters.
900 start_marker = self.reader.get_marker()
901 indicator = self.reader.peek()
902 if indicator == '*':
903 name = 'alias'
904 else:
905 name = 'anchor'
906 self.reader.forward()
907 length = 0
908 ch = self.reader.peek(length)
909 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
910 or ch in u'-_':
911 length += 1
912 ch = self.reader.peek(length)
913 if not length:
914 raise ScannerError("while scanning an %s" % name, start_marker,
915 "expected alphabetic or numeric character, but found %r"
916 % ch.encode('utf-8'), self.reader.get_marker())
917 value = self.reader.prefix(length)
918 self.reader.forward(length)
919 ch = self.reader.peek()
920 if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
921 raise ScannerError("while scanning an %s" % name, start_marker,
922 "expected alphabetic or numeric character, but found %r"
923 % ch.encode('utf-8'), self.reader.get_marker())
924 end_marker = self.reader.get_marker()
925 return TokenClass(value, start_marker, end_marker)
    def scan_tag(self):
        """Scan a tag: verbatim '!<uri>', bare '!', '!suffix' or
        '!handle!suffix'.

        Returns a TagToken whose value is the (handle, suffix) pair;
        handle is None for verbatim and bare tags.
        """
        # See the specification for details.
        start_marker = self.reader.get_marker()
        ch = self.reader.peek(1)
        if ch == u'<':
            # Verbatim tag: '!<' URI '>'.
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_marker)
            if self.reader.peek() != u'>':
                raise ScannerError("while parsing a tag", start_marker,
                        "expected '>', but found %r" % self.reader.peek().encode('utf-8'),
                        self.reader.get_marker())
            self.reader.forward()
        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
            # A lone '!': the non-specific tag.
            handle = None
            suffix = u'!'
            self.reader.forward()
        else:
            # Look ahead for a second '!' to tell '!handle!suffix'
            # from a plain '!suffix'.
            length = 1
            use_handle = False
            while ch not in u'\0 \r\n\x85\u2028\u2029':
                if ch == u'!':
                    use_handle = True
                    break
                length += 1
                ch = self.reader.peek(length)
            handle = u'!'
            if use_handle:
                handle = self.scan_tag_handle('tag', start_marker)
            else:
                handle = u'!'
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_marker)
        # The tag must be terminated by space, break, or end of stream.
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = (handle, suffix)
        end_marker = self.reader.get_marker()
        return TagToken(value, start_marker, end_marker)
    def scan_block_scalar(self, folded):
        """Scan a literal ('|') or folded ('>') block scalar into a
        ScalarToken with plain=False.

        Handles header indicators (chomping and explicit indentation),
        indentation detection, line folding, and tail chomping.
        """
        # See the specification for details.
        chunks = []
        start_marker = self.reader.get_marker()
        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_marker)
        self.scan_block_scalar_ignored_line(start_marker)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indicator: deduce the indent from the content.
            breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
        line_break = u''
        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if folded and line_break == u'\n'   \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break
        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        # We are done.
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
1038 def scan_block_scalar_indicators(self, start_marker):
1039 # See the specification for details.
1040 chomping = None
1041 increment = None
1042 ch = self.reader.peek()
1043 if ch in u'+-':
1044 if ch == '+':
1045 chomping = True
1046 else:
1047 chomping = False
1048 self.reader.forward()
1049 ch = self.reader.peek()
1050 if ch in u'0123456789':
1051 increment = int(ch)
1052 if increment == 0:
1053 raise ScannerError("while scanning a block scalar", start_marker,
1054 "expected indentation indicator in the range 1-9, but found 0",
1055 self.reader.get_marker())
1056 self.reader.forward()
1057 elif ch in u'0123456789':
1058 increment = int(ch)
1059 if increment == 0:
1060 raise ScannerError("while scanning a block scalar", start_marker,
1061 "expected indentation indicator in the range 1-9, but found 0",
1062 self.reader.get_marker())
1063 self.reader.forward()
1064 ch = self.reader.peek()
1065 if ch in u'+-':
1066 if ch == '+':
1067 chomping = True
1068 else:
1069 chomping = False
1070 self.reader.forward()
1071 ch = self.reader.peek()
1072 if ch not in u'\0 \r\n\x85\u2028\u2029':
1073 raise ScannerError("while scanning a block scalar", start_marker,
1074 "expected chomping or indentation indicators, but found %r"
1075 % ch.encode('utf-8'), self.reader.get_marker())
1076 return chomping, increment
1078 def scan_block_scalar_ignored_line(self, start_marker):
1079 # See the specification for details.
1080 while self.reader.peek() == u' ':
1081 self.reader.forward()
1082 if self.reader.peek() == u'#':
1083 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
1084 self.reader.forward()
1085 ch = self.reader.peek()
1086 if ch not in u'\0\r\n\x85\u2028\u2029':
1087 raise ScannerError("while scanning a block scalar", start_marker,
1088 "expected a comment or a line break, but found %r"
1089 % ch.encode('utf-8'), self.reader.get_marker())
1090 self.scan_line_break()
1092 def scan_block_scalar_indentation(self):
1093 # See the specification for details.
1094 chunks = []
1095 max_indent = 0
1096 end_marker = self.reader.get_marker()
1097 while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1098 if self.reader.peek() != u' ':
1099 chunks.append(self.scan_line_break())
1100 end_marker = self.reader.get_marker()
1101 else:
1102 self.reader.forward()
1103 if self.reader.column > max_indent:
1104 max_indent = self.reader.column
1105 return chunks, max_indent, end_marker
1107 def scan_block_scalar_breaks(self, indent):
1108 # See the specification for details.
1109 chunks = []
1110 end_marker = self.reader.get_marker()
1111 while self.reader.column < indent and self.reader.peek() == u' ':
1112 self.reader.forward()
1113 while self.reader.peek() in u'\r\n\x85\u2028\u2029':
1114 chunks.append(self.scan_line_break())
1115 end_marker = self.reader.get_marker()
1116 while self.reader.column < indent and self.reader.peek() == u' ':
1117 self.reader.forward()
1118 return chunks, end_marker
1120 def scan_flow_scalar(self, double):
1121 # See the specification for details.
1122 chunks = []
1123 start_marker = self.reader.get_marker()
1124 indent = self.indent+1
1125 if indent == 0:
1126 indent = 1
1127 quote = self.reader.peek()
1128 self.reader.forward()
1129 chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
1130 while self.reader.peek() != quote:
1131 chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
1132 chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
1133 self.reader.forward()
1134 end_marker = self.reader.get_marker()
1135 return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
    # Translation of single-character escape sequences inside double-quoted
    # scalars.
    ESCAPE_REPLACEMENTS = {
        u'0':   u'\0',      # null
        u'a':   u'\x07',    # bell
        u'b':   u'\x08',    # backspace
        u't':   u'\x09',    # horizontal tab
        u'\t':  u'\x09',    # an escaped literal tab is a tab as well
        u'n':   u'\x0A',    # line feed
        u'v':   u'\x0B',    # vertical tab
        u'f':   u'\x0C',    # form feed
        u'r':   u'\x0D',    # carriage return
        u'e':   u'\x1B',    # escape
        u' ':   u'\x20',    # space
        u'\"':  u'\"',      # double quote
        u'\\':  u'\\',      # backslash
        u'N':   u'\x85',    # next line (NEL)
        u'_':   u'\xA0',    # non-breaking space
        u'L':   u'\u2028',  # line separator
        u'P':   u'\u2029',  # paragraph separator
    }

    # Escape prefixes that are followed by a fixed number of hex digits.
    ESCAPE_CODES = {
        u'x':   2,
        u'u':   4,
        u'U':   8,
    }
    def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
        # Scan the non-blank portions of a quoted scalar: runs of ordinary
        # characters, '' escapes in single-quoted scalars, and backslash
        # escapes in double-quoted scalars.  Returns the collected chunks;
        # stops (without consuming) at a blank, a break, or the closing
        # quote.
        chunks = []
        while True:
            # Take the longest run of ordinary characters in one step.
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                # In a single-quoted scalar, '' stands for a single quote.
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # A quote of the other kind (or a backslash when single
                # quoted) is an ordinary character.
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape such as \n or \t.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_marker,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is removed, together with the
                    # following indentation.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
            else:
                # A blank, a break, or the closing quote: hand control back
                # to the caller.
                return chunks
1206 def scan_flow_scalar_spaces(self, double, indent, start_marker):
1207 # See the specification for details.
1208 chunks = []
1209 length = 0
1210 while self.reader.peek(length) in u' \t':
1211 length += 1
1212 whitespaces = self.reader.prefix(length)
1213 self.reader.forward(length)
1214 ch = self.reader.peek()
1215 if ch == u'\0':
1216 raise ScannerError("while scanning a quoted scalar", start_marker,
1217 "found unexpected end of stream", self.reader.get_marker())
1218 elif ch in u'\r\n\x85\u2028\u2029':
1219 line_break = self.scan_line_break()
1220 breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
1221 if line_break != u'\n':
1222 chunks.append(line_break)
1223 elif not breaks:
1224 chunks.append(u' ')
1225 chunks.extend(breaks)
1226 else:
1227 chunks.append(whitespaces)
1228 return chunks
1230 def scan_flow_scalar_breaks(self, double, indent, start_marker):
1231 # See the specification for details.
1232 chunks = []
1233 while True:
1234 while self.reader.column < indent and self.reader.peek() == u' ':
1235 self.reader.forward()
1236 if self.reader.column < indent \
1237 and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
1238 s = 's'
1239 if indent == 1:
1240 s = ''
1241 raise ScannerError("while scanning a quoted scalar", start_marker,
1242 "expected %d space%s indentation, but found %r"
1243 % (indent, s, self.reader.peek().encode('utf-8')),
1244 self.reader.get_marker())
1245 while self.reader.peek() in u' \t':
1246 self.reader.forward()
1247 if self.reader.peek() in u'\r\n\x85\u2028\u2029':
1248 chunks.append(self.scan_line_break())
1249 else:
1250 return chunks
1252 def scan_plain(self):
1253 # See the specification for details.
1254 # We add an additional restriction for the flow context:
1255 # plain scalars in the flow context cannot contain ':' and '?'.
1256 # We also keep track of the `allow_simple_key` flag here.
1257 chunks = []
1258 start_marker = self.reader.get_marker()
1259 end_marker = start_marker
1260 indent = self.indent+1
1261 if indent == 0:
1262 indent = 1
1263 spaces = []
1264 while True:
1265 length = 0
1266 if self.reader.peek() == u'#':
1267 break
1268 while True:
1269 ch = self.reader.peek(length)
1270 if ch in u'\0 \t\r\n\x85\u2028\u2029' \
1271 or (not self.flow_level and ch == u':' and
1272 self.reader.peek(length+1) in u'\0 \t\r\n\x28\u2028\u2029') \
1273 or (self.flow_level and ch in u',:?[]{}'):
1274 break
1275 length += 1
1276 if length == 0:
1277 break
1278 self.allow_simple_key = False
1279 chunks.extend(spaces)
1280 chunks.append(self.reader.prefix(length))
1281 self.reader.forward(length)
1282 end_marker = self.reader.get_marker()
1283 spaces = self.scan_plain_spaces(indent)
1284 if not spaces or self.reader.peek() == u'#' \
1285 or self.reader.column < indent:
1286 break
1287 return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
1289 def scan_plain_spaces(self, indent):
1290 # See the specification for details.
1291 # The specification is really confusing about tabs in plain scalars.
1292 # We just forbid them completely. Do not use tabs in YAML!
1293 chunks = []
1294 length = 0
1295 while self.reader.peek(length) in u' ':
1296 length += 1
1297 whitespaces = self.reader.prefix(length)
1298 self.reader.forward(length)
1299 ch = self.reader.peek()
1300 if ch in u'\r\n\x85\u2028\u2029':
1301 line_break = self.scan_line_break()
1302 self.allow_simple_key = True
1303 breaks = []
1304 while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1305 if self.reader.peek() == ' ':
1306 self.reader.forward()
1307 else:
1308 breaks.append(self.scan_line_break())
1309 if line_break != u'\n':
1310 chunks.append(line_break)
1311 elif not breaks:
1312 chunks.append(u' ')
1313 chunks.extend(breaks)
1314 elif whitespaces:
1315 chunks.append(whitespaces)
1316 return chunks
1318 def scan_tag_handle(self, name, start_marker):
1319 # See the specification for details.
1320 # For some strange reasons, the specification does not allow '_' in
1321 # tag handles. I have allowed it anyway.
1322 ch = self.reader.peek()
1323 if ch != u'!':
1324 raise ScannerError("while scanning a %s" % name, start_marker,
1325 "expected '!', but found %r" % ch.encode('utf-8'),
1326 self.reader.get_marker())
1327 length = 1
1328 ch = self.reader.peek(length)
1329 if ch != u' ':
1330 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1331 or ch in u'-_':
1332 length += 1
1333 ch = self.reader.peek(length)
1334 if ch != u'!':
1335 self.reader.forward(length)
1336 raise ScannerError("while scanning a %s" % name, start_marker,
1337 "expected '!', but found %r" % ch.encode('utf-8'),
1338 self.reader.get_marker())
1339 length += 1
1340 value = self.reader.prefix(length)
1341 self.reader.forward(length)
1342 return value
1344 def scan_tag_uri(self, name, start_marker):
1345 # See the specification for details.
1346 # Note: we do not check if URI is well-formed.
1347 chunks = []
1348 length = 0
1349 ch = self.reader.peek(length)
1350 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1351 or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
1352 if ch == u'%':
1353 chunks.append(self.reader.prefix(length))
1354 self.reader.forward(length)
1355 length = 0
1356 chunks.append(self.scan_uri_escapes(name, start_marker))
1357 else:
1358 length += 1
1359 ch = self.reader.peek(length)
1360 if length:
1361 chunks.append(self.reader.prefix(length))
1362 self.reader.forward(length)
1363 length = 0
1364 if not chunks:
1365 raise ScannerError("while parsing a %s" % name, start_marker,
1366 "expected URI, but found %r" % ch.encode('utf-8'),
1367 self.reader.get_marker())
1368 return u''.join(chunks)
    def scan_uri_escapes(self, name, start_marker):
        # Decode a run of %XX escape sequences into a unicode string; the
        # decoded bytes are expected to form valid UTF-8.
        # NOTE: Python 2 only (`unicode`, `except E, exc` syntax).
        bytes = []
        marker = self.reader.get_marker()
        while self.reader.peek() == u'%':
            self.reader.forward()
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_marker,
                            "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                                (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
        return value
1389 def scan_line_break(self):
1390 # Transforms:
1391 # '\r\n' : '\n'
1392 # '\r' : '\n'
1393 # '\n' : '\n'
1394 # '\x85' : '\n'
1395 # '\u2028' : '\u2028'
1396 # '\u2029 : '\u2029'
1397 # default : ''
1398 ch = self.reader.peek()
1399 if ch in u'\r\n\x85':
1400 if self.reader.prefix(2) == u'\r\n':
1401 self.forward(2)
1402 else:
1403 self.reader.forward()
1404 return u'\n'
1405 elif ch in u'\u2028\u2029':
1406 self.reader.forward()
1407 return ch
1408 return u''
1410 #try:
1411 # import psyco
1412 # psyco.bind(Scanner)
1413 #except ImportError:
1414 # pass