3 # YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
4 # RESERVED-DIRECTIVE(name)
5 # DOCUMENT-START, DOCUMENT-END
6 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
7 # FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
9 # ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
11 __all__
= ['Scanner', 'ScannerError']
13 from error
import YAMLError
16 class ScannerError(YAMLError
):
18 # ScannerError: while reading a quoted string
19 # in '...', line 5, column 10:
22 # got unknown quote character '?'
23 # in '...', line 5, column 15:
29 def __init__(self
, token_number
, required
, index
, line
, column
, marker
):
30 self
.token_number
= token_number
31 self
.required
= required
40 def __init__(self
, reader
):
41 """Initialize the scanner."""
42 # The input stream. The Reader class does the dirty work of checking for
43 # BOM and converting the input data to Unicode. It also adds NUL to
46 # Reader supports the following methods
47 # self.reader.peek(k=1) # peek the next k characters
48 # self.reader.forward(k=1) # read the next k characters and move the
52 # Had we reached the end of the stream?
55 # The number of unclosed '{' and '['. `flow_level == 0` means block
59 # List of processed tokens that are not yet emitted.
62 # Number of tokens that were emitted through the `get_token` method.
65 # The current indentation level.
68 # Past indentation levels.
71 # Variables related to simple keys treatment.
73 # A simple key is a key that is not denoted by the '?' indicator.
74 # Example of simple keys:
76 # block simple key: value
78 # : { flow simple key: value }
79 # We emit the KEY token before all keys, so when we find a potential
80 # simple key, we try to locate the corresponding ':' indicator.
81 # Simple keys should be limited to a single line and 1024 characters.
83 # Can a simple key start at the current position? A simple key may
85 # - at the beginning of the line, not counting indentation spaces
87 # - after '{', '[', ',' (in the flow context),
88 # - after '?', ':', '-' (in the block context).
89 # In the block context, this flag also signifies whether a block collection
90 # may start at the current position.
91 self
.allow_simple_key
= True
93 # Keep track of possible simple keys. This is a dictionary. The key
94 # is `flow_level`; there can be no more than one possible simple key
95 # for each level. The value is a SimpleKey record:
96 # (token_number, required, index, line, column, marker)
97 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
99 self
.possible_simple_keys
= {}
101 # Two public methods.
103 def peek_token(self
):
104 """Get the current token."""
105 while self
.need_more_tokens():
106 self
.fetch_more_tokens()
108 return self
.tokens
[0]
111 "Get the current token and remove it from the list of pending tokens."""
112 while self.need_more_tokens():
113 self.fetch_more_tokens()
115 self.tokens_taken += 1
116 return self.tokens.pop(0)
120 def need_more_tokens(self):
125 # The current token may be a potential simple key, so we
126 # need to look further.
127 self.stale_possible_simple_keys()
128 if self.next_possible_simple_key() == self.tokens_taken:
131 def fetch_more_tokens(self):
133 # Eat whitespaces and comments until we reach the next token.
134 self.scan_to_next_token()
136 # Remove obsolete possible simple keys.
137 self.stale_possible_simple_keys()
139 # Compare the current indentation and column. It may add some tokens
140 # and decrease the current indentation level.
141 self.unwind_indent(self.reader.column)
144 #print self.reader.get_marker().get_snippet()
146 # Peek the next character.
147 ch = self.reader.peek()
149 # Is it the end of reader?
151 return self.fetch_end()
154 if ch == u'%' and self.check_directive():
155 return self.fetch_directive()
157 # Is it the document start?
158 if ch == u'-' and self.check_document_start():
159 return self.fetch_document_start()
161 # Is it the document end?
162 if ch == u'.' and self.check_document_end():
163 return self.fetch_document_end()
165 # Note: the order of the following checks is NOT significant.
167 # Is it the flow sequence start indicator?
169 return self.fetch_flow_sequence_start()
171 # Is it the flow mapping start indicator?
173 return self.fetch_flow_mapping_start()
175 # Is it the flow sequence end indicator?
177 return self.fetch_flow_sequence_end()
179 # Is it the flow mapping end indicator?
181 return self.fetch_flow_mapping_end()
183 # Is it the entry indicator?
184 if ch in u'-,' and self.check_entry():
185 return self.fetch_entry()
187 # Is it the key indicator?
188 if ch == u'?' and self.check_key():
189 return self.fetch_key()
191 # Is it the value indicator?
192 if ch == u':' and self.check_value():
193 return self.fetch_value()
197 return self.fetch_alias()
201 return self.fetch_anchor()
205 return self.fetch_tag()
207 # Is it a literal scalar?
208 if ch == u'|' and not self.flow_level:
209 return self.fetch_literal()
211 # Is it a folded scalar?
212 if ch == u'>' and not self.flow_level:
213 return self.fetch_folded()
215 # Is it a single quoted scalar?
217 return self.fetch_single()
219 # Is it a double quoted scalar?
221 return self.fetch_double()
223 # It must be a plain scalar then.
224 if self.check_plain():
225 return self.fetch_plain()
227 # No? It's an error. Let's produce a nice error message.
230 # Simple keys treatment.
def next_possible_simple_key(self):
    """Return the token number of the earliest saved possible simple key.

    ``self.possible_simple_keys`` maps a flow level to a record carrying a
    ``token_number`` attribute.  The smallest ``token_number`` among all
    saved records is returned; ``None`` is returned when nothing is saved.
    """
    # min() raises on an empty sequence, so handle "no candidates" first.
    if not self.possible_simple_keys:
        return None
    return min(key.token_number
               for key in self.possible_simple_keys.values())
247 def stale_possible_simple_keys(self):
248 # Remove entries that are no longer possible simple keys. According to
249 # the YAML specification, simple keys
250 # - should be limited to a single line,
251 # - should be no longer than 1024 characters.
252 # Disabling this procedure will allow simple keys of any length and
253 # height (may cause problems if indentation is broken though).
254 for level in self.possible_simple_keys.keys():
255 key = self.possible_simple_keys[level]
256 if key.line != self.reader.line \
257 or self.reader.index-key.index > 1024:
259 self.fail("simple key
is required
")
260 del self.possible_simple_keys[level]
262 def save_possible_simple_key(self):
263 # The next token may start a simple key. We check if it's possible
264 # and save its position. This function is called for
265 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
267 # Check if a simple key is required at the current position.
268 required = not self.flow_level and self.indent == self.reader.column
270 # The next token might be a simple key. Let's save its number and
272 if self.allow_simple_key:
273 self.remove_possible_simple_key()
274 token_number = self.tokens_taken+len(self.tokens)
275 index = self.reader.index
276 line = self.reader.line
277 column = self.reader.column
278 marker = self.reader.get_marker()
279 key = SimpleKey(token_number, required,
280 index, line, column, marker)
281 self.possible_simple_keys[self.flow_level] = key
283 # A simple key is required at the current position.
285 self.fail("simple key
is required
")
287 def remove_possible_simple_key(self):
288 # Remove the saved possible key position at the current flow level.
289 if self.flow_level in self.possible_simple_keys:
290 key = self.possible_simple_keys[self.flow_level]
292 self.fail("simple key
is required
")
294 # Indentation functions.
def unwind_indent(self, column):
    """Pop indentation levels until the current indent is not above `column`.

    In the flow context (``self.flow_level`` is non-zero) a token that
    starts left of the current indent is reported through ``self.fail``.
    Otherwise, for every level popped from ``self.indents`` a
    BlockEndToken is appended to ``self.tokens``; its start and end
    markers are both the current reader position.
    """
    # In the flow context, tokens should respect indentation.
    if self.flow_level and self.indent > column:
        self.fail("invalid indentation in the flow context")

    # In the block context, we may need to issue the BLOCK-END tokens.
    while self.indent > column:
        marker = self.reader.get_marker()
        self.indent = self.indents.pop()
        self.tokens.append(BlockEndToken(marker, marker))
308 def add_indent(self, column):
309 # Check if we need to increase indentation.
310 if self.indent < column:
311 self.indents.append(self.indent)
320 # Set the current indentation to -1.
321 self.unwind_indent(-1)
323 # Reset everything (not really needed).
324 self.allow_simple_key = False
325 self.possible_simple_keys = {}
328 marker = self.reader.get_marker()
331 self.tokens.append(EndToken(marker, marker))
333 # The reader is ended.
def fetch_directive(self):
    """Prepare the scanner state for a directive and scan it.

    Unwinds the indentation to -1, discards the possible simple key saved
    for the current flow level, forbids simple keys, and then hands over
    to ``scan_directive``.
    """
    # Drop the current indentation to -1.
    self.unwind_indent(-1)

    # Forget any pending simple key and disallow new ones.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Scan the directive itself.
    self.scan_directive()
def fetch_document_start(self):
    """Fetch a DOCUMENT-START token via ``fetch_document_indicator``."""
    self.fetch_document_indicator(DocumentStartToken)
def fetch_document_end(self):
    """Fetch a DOCUMENT-END token via ``fetch_document_indicator``."""
    self.fetch_document_indicator(DocumentEndToken)
def fetch_document_indicator(self, TokenClass):
    """Consume a three-character document indicator and emit its token.

    `TokenClass` is called as ``TokenClass(start_marker, end_marker)`` and
    the result is appended to ``self.tokens``.  Before that the
    indentation is unwound to -1, the possible simple key of the current
    flow level is dropped, and simple keys are disallowed.
    """
    # Set the current indentation to -1.
    self.unwind_indent(-1)

    # Reset simple keys.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    # Add DOCUMENT-START or DOCUMENT-END: the indicator spans 3 characters.
    reader = self.reader
    begin = reader.get_marker()
    reader.forward(3)
    finish = reader.get_marker()
    self.tokens.append(TokenClass(begin, finish))
def fetch_flow_sequence_start(self):
    """Fetch a FLOW-SEQUENCE-START token via ``fetch_flow_collection_start``."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)
def fetch_flow_mapping_start(self):
    """Fetch a FLOW-MAPPING-START token via ``fetch_flow_collection_start``."""
    self.fetch_flow_collection_start(FlowMappingStartToken)
376 def fetch_flow_collection_start(self, TokenClass):
378 # '[' and '{' may start a simple key.
379 self.save_possible_simple_key()
381 # Increase the flow level.
384 # Simple keys are allowed after '[' and '{'.
385 self.allow_simple_key = True
387 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
388 start_marker = self.reader.get_marker()
389 self.reader.forward()
390 end_marker = self.reader.get_marker()
391 self.tokens.append(TokenClass(start_marker, end_marker))
def fetch_flow_sequence_end(self):
    """Fetch a FLOW-SEQUENCE-END token via ``fetch_flow_collection_end``."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)
def fetch_flow_mapping_end(self):
    """Fetch a FLOW-MAPPING-END token via ``fetch_flow_collection_end``."""
    self.fetch_flow_collection_end(FlowMappingEndToken)
399 def fetch_flow_collection_end(self, TokenClass):
401 # Reset possible simple key on the current level.
402 self.remove_possible_simple_key()
404 # Decrease the flow level.
407 # No simple keys after ']' or '}'.
408 self.allow_simple_key = False
410 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
411 start_marker = self.reader.get_marker()
412 self.reader.forward()
413 end_marker = self.reader.get_marker()
414 self.tokens.append(TokenClass(start_marker, end_marker))
def fetch_entry(self):
    """Fetch an ENTRY token ('-' in the block context, ',' in the flow one).

    In the block context the entry must be allowed at this position
    (otherwise ``self.fail`` is called), and a BLOCK-SEQUENCE-START token
    is emitted first when ``add_indent`` reports a deeper indentation.
    Afterwards simple keys are allowed again, the possible simple key of
    the current flow level is dropped, and the one-character indicator is
    consumed and appended to ``self.tokens`` as an EntryToken.
    """
    reader = self.reader

    # Block context needs additional checks.
    if not self.flow_level:
        # Are we allowed to start a new entry?
        if not self.allow_simple_key:
            self.fail("Cannot start a new entry here")

        # We may need to add BLOCK-SEQUENCE-START.
        if self.add_indent(reader.column):
            here = reader.get_marker()
            self.tokens.append(BlockSequenceStartToken(here, here))

    # Simple keys are allowed after '-' and ','.
    self.allow_simple_key = True

    # Reset possible simple key on the current level.
    self.remove_possible_simple_key()

    # Add ENTRY: consume the single indicator character.
    begin = reader.get_marker()
    reader.forward()
    finish = reader.get_marker()
    self.tokens.append(EntryToken(begin, finish))
444 # Block context needs additional checks.
445 if not self.flow_level:
447 # Are we allowed to start a key (not necessarily a simple one)?
448 if not self.allow_simple_key:
449 self.fail("Cannot start a new key here
")
451 # We may need to add BLOCK-MAPPING-START.
452 if self.add_indent(self.reader.column):
453 marker = self.reader.get_marker()
454 self.tokens.append(BlockMappingStartToken(marker, marker))
456 # Simple keys are allowed after '?' in the block context.
457 self.allow_simple_key = not self.flow_level
459 # Reset possible simple key on the current level.
460 self.remove_possible_simple_key()
463 start_marker = self.reader.get_marker()
464 self.reader.forward()
465 end_marker = self.reader.get_marker()
466 self.tokens.append(KeyToken(start_marker, end_marker))
468 def fetch_value(self):
470 # Do we determine a simple key?
471 if self.flow_level in self.possible_simple_keys:
474 key = self.possible_simple_keys[self.flow_level]
475 del self.possible_simple_keys[self.flow_level]
476 self.tokens.insert(key.token_number-self.tokens_taken,
477 KeyToken(key.marker, key.marker))
479 # If this key starts a new block mapping, we need to add
480 # BLOCK-MAPPING-START.
481 if not self.flow_level:
482 if self.add_indent(key.column):
483 self.tokens.insert(key.token_number-self.tokens_taken,
484 BlockMappingStartToken(key.marker, key.marker))
486 # There cannot be two simple keys one after another.
487 self.allow_simple_key = False
489 # It must be a part of a complex key.
492 # Simple keys are allowed after ':' in the block context.
493 self.allow_simple_key = not self.flow_level
495 # Reset possible simple key on the current level.
496 self.remove_possible_simple_key()
499 start_marker = self.reader.get_marker()
500 self.reader.forward()
501 end_marker = self.reader.get_marker()
502 self.tokens.append(ValueToken(start_marker, end_marker))
def fetch_alias(self):
    """Fetch an ALIAS token, recording it as a possible simple key.

    The possible-simple-key bookkeeping runs before simple keys are
    disallowed; the token itself is produced by ``scan_anchor``.
    """
    # An ALIAS may turn out to be a simple key; remember where it starts.
    self.save_possible_simple_key()

    # Nothing directly after an ALIAS may start a simple key.
    self.allow_simple_key = False

    # Scan and add ALIAS.
    self.scan_anchor(AliasToken)
def fetch_anchor(self):
    """Fetch an ANCHOR token, recording it as a possible simple key start.

    The possible-simple-key bookkeeping runs before simple keys are
    disallowed; the token itself is produced by ``scan_anchor``.
    """
    # An ANCHOR may start a simple key; remember where it starts.
    self.save_possible_simple_key()

    # Nothing directly after an ANCHOR may start a simple key.
    self.allow_simple_key = False

    # Scan and add ANCHOR.
    self.scan_anchor(AnchorToken)
528 # TAG could start a simple key.
529 self.save_possible_simple_key()
531 # No simple keys after TAG.
532 self.allow_simple_key = False
def fetch_literal(self):
    """Fetch a literal ('|') block scalar, i.e. a non-folded one."""
    self.fetch_block_scalar(folded=False)
def fetch_folded(self):
    """Fetch a folded ('>') block scalar."""
    self.fetch_block_scalar(folded=True)
def fetch_block_scalar(self, folded):
    """Update simple-key state for a block scalar, then scan it.

    `folded` is passed through unchanged to ``scan_block_scalar``.
    """
    # A block scalar may be followed by a simple key.
    self.allow_simple_key = True

    # Drop the possible simple key saved for the current flow level.
    self.remove_possible_simple_key()

    # Scan and add SCALAR.
    self.scan_block_scalar(folded)
def fetch_single(self):
    """Fetch a single-quoted flow scalar."""
    self.fetch_flow_scalar(double=False)
def fetch_double(self):
    """Fetch a double-quoted flow scalar."""
    self.fetch_flow_scalar(double=True)
def fetch_flow_scalar(self, double):
    """Update simple-key state for a quoted scalar, then scan it.

    `double` is passed through unchanged to ``scan_flow_scalar``.  The
    possible simple key is saved before simple keys are disallowed.
    """
    # A quoted scalar may turn out to be a simple key.
    self.save_possible_simple_key()

    # Nothing directly after a flow scalar may start a simple key.
    self.allow_simple_key = False

    # Scan and add SCALAR.
    self.scan_flow_scalar(double)
571 def fetch_plain(self):
573 # A plain scalar could be a simple key.
574 self.save_possible_simple_key()
576 # No simple keys after plain scalars. But note that `scan_plain` will
577 # change this flag if the scan is finished at the beginning of the
579 self.allow_simple_key = False
581 # Scan and add SCALAR. May change `allow_simple_key`.
586 def check_directive(self):
588 # DIRECTIVE: ^ '%' ...
589 # The '%' indicator is already checked.
590 if self.reader.column == 0:
593 def check_document_start(self):
595 # DOCUMENT-START: ^ '---' (' '|'\n')
596 if self.reader.column == 0:
597 prefix = self.reader.peek(4)
598 if prefix[:3] == u'---' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
601 def check_document_end(self):
603 # DOCUMENT-END: ^ '...' (' '|'\n')
604 if self.reader.column == 0:
605 prefix = self.reader.peek(4)
606 if prefix[:3] == u'...' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
609 def check_entry(self):
611 # ENTRY(flow context): ','
613 return self.reader.peek() == u','
615 # ENTRY(block context): '-' (' '|'\n')
617 prefix = self.reader.peek(2)
618 return prefix[0] == u'-' and prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
622 # KEY(flow context): '?'
626 # KEY(block context): '?' (' '|'\n')
628 prefix = self.reader.peek(2)
629 return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
631 def check_value(self):
633 # VALUE(flow context): ':'
637 # VALUE(block context): ':' (' '|'\n')
639 prefix = self.reader.peek(2)
640 return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
642 def check_plain(self):
647 def scan_to_next_token(self):
650 while self.reader.peek() == u' ':
651 self.reader.forward()
652 if self.reader.peek() == u'#':
653 while self.reader.peek() not in u'\r\n':
654 self.reader.forward()
655 if self.reader.peek() in u'\r\n':
656 self.reader.forward()
657 if not self.flow_level:
658 self.allow_simple_key = True
def scan_directive(self):
    """Consume one directive line and append the matching directive token.

    A line starting with '%YAML ' yields a YAMLDirectiveToken (always
    built with version 1, 1), one starting with '%TAG ' yields a
    TagDirectiveToken, and anything else yields a ReservedDirectiveToken
    with an empty name.  Start and end markers of the token are both the
    position where the directive begins.  The rest of the line is skipped.
    """
    marker = self.reader.get_marker()
    # peek(k) yields the next k characters, so the requested length has to
    # equal the length of the literal it is compared with.
    if self.reader.peek(6) == u'%YAML ':
        self.tokens.append(YAMLDirectiveToken(1, 1, marker, marker))
    elif self.reader.peek(5) == u'%TAG ':
        self.tokens.append(TagDirectiveToken(marker, marker))
    else:
        self.tokens.append(ReservedDirectiveToken('', marker, marker))
    # Skip everything up to the line break or the terminating NUL.
    while self.reader.peek() not in u'\0\r\n':
        self.reader.forward()
    # Step over the line break, but do not move past the NUL terminator
    # (same guard as in scan_block_scalar).
    if self.reader.peek() != u'\0':
        self.reader.forward()
def scan_anchor(self, TokenClass):
    """Skip over an anchor/alias and append a `TokenClass` token for it.

    Characters are consumed until NUL, space, tab, a line break, ',' or
    ':' is next.  The token is built as ``TokenClass('', start, end)``:
    the name is always the empty string here, and the markers are taken
    before and after the consumed characters.
    """
    reader = self.reader
    stop_chars = u'\0 \t\r\n,:'

    begin = reader.get_marker()
    while reader.peek() not in stop_chars:
        reader.forward()
    finish = reader.get_marker()

    self.tokens.append(TokenClass('', begin, finish))
682 start_marker = self.reader.get_marker()
683 while self.reader.peek() not in u'\0 \t\r\n':
684 self.reader.forward()
685 end_marker = self.reader.get_marker()
686 self.tokens.append(TagToken('', start_marker, end_marker))
688 def scan_block_scalar(self, folded):
689 start_marker = self.reader.get_marker()
690 indent = self.indent+1
694 while self.reader.peek() and self.reader.peek() and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
695 self.reader.forward()
696 if self.reader.peek() != u'\0':
697 self.reader.forward()
699 while count < indent and self.reader.peek() == u' ':
700 self.reader.forward()
702 if count < indent and self.reader.peek() not in u'#\r\n\x85\u2028\u2029':
704 self.tokens.append(ScalarToken('', False, start_marker, start_marker))
706 def scan_flow_scalar(self, double):
707 marker = self.reader.get_marker()
708 quote = self.reader.peek()
709 self.reader.forward()
710 while self.reader.peek() != quote:
711 if double and self.reader.peek() == u'\\':
712 self.reader.forward(2)
713 elif not double and self.reader.peek(3)[1:] == u'\'\'':
714 self.reader.forward(3)
716 self.reader.forward(1)
717 self.reader.forward(1)
718 self.tokens.append(ScalarToken('', False, marker, marker))
720 def scan_plain(self):
721 indent = self.indent+1
725 marker = self.reader.get_marker()
727 while self.reader.peek() == u' ':
728 self.reader.forward()
730 while self.reader.peek() not in u'\0\r\n?:,[]{}#' \
731 or (not space and self.reader.peek() == '#') \
732 or (not self.flow_level and self.reader.peek() in '?,[]{}') \
733 or (not self.flow_level and self.reader.peek() == ':' and self.reader.peek(2)[1] not in u' \0\r\n'):
734 space = self.reader.peek() not in u' \t'
735 self.reader.forward()
736 self.allow_simple_key = False
737 if self.reader.peek() not in u'\r\n':
739 while self.reader.peek() in u'\r\n':
740 self.reader.forward()
741 if not self.flow_level:
742 self.allow_simple_key = True
744 while self.reader.peek() == u' ' and count < indent:
745 self.reader.forward()
750 self.tokens.append(ScalarToken('', True, marker, marker))
def invalid_token(self):
    """Report an unrecognised token through ``self.fail``."""
    self.fail("invalid token")
def fail(self, message):
    """Raise a ScannerError constructed from `message`."""
    raise ScannerError(message)
760 # psyco.bind(Scanner)