Some renaming.
[pyyaml/python3.git] / lib / yaml / scanner.py
blobb8344788bac48fcbcca17d2c9b53f45eababc77a
2 # Tokens:
3 # YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
4 # RESERVED-DIRECTIVE(name)
5 # DOCUMENT-START, DOCUMENT-END
6 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
7 # FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
8 # ENTRY, KEY, VALUE
9 # ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
11 __all__ = ['Scanner', 'ScannerError']
13 from error import YAMLError
14 from tokens import *
class ScannerError(YAMLError):
    """Error raised by the scanner when it cannot tokenize the input."""

    # TODO: produce a detailed, two-part report such as:
    #
    #   ScannerError: while reading a quoted string
    #       in '...', line 5, column 10:
    #           key: "valu\?e"
    #                ^
    #   got unknown quote character '?'
    #       in '...', line 5, column 15:
    #           key: "valu\?e"
    #                     ^
class SimpleKey:
    """Record describing the place where a potential simple key starts.

    token_number -- number of the token the key would precede
    required     -- whether a simple key is required at this position
    index, line, column -- reader position at the start of the key
    marker       -- reader marker taken at the same position
    """

    def __init__(self, token_number, required, index, line, column, marker):
        # Reader position and marker of the first character of the key.
        self.marker = marker
        self.column = column
        self.line = line
        self.index = index
        # Bookkeeping used when the KEY token is inserted later on.
        self.required = required
        self.token_number = token_number
class Scanner:
    """Split the character stream supplied by a reader into YAML tokens.

    Tokens are produced lazily.  `peek_token` and `get_token` scan ahead
    only as far as needed to be sure that the token at the head of the
    queue is final: a KEY (and possibly BLOCK-MAPPING-START) token may
    still have to be inserted in front of a potential simple key once the
    matching ':' indicator is found.
    """

    def __init__(self, reader):
        """Initialize the scanner.

        `reader` is the input stream.  The Reader class does the dirty work
        of checking for BOM and converting the input data to Unicode.  It
        also adds NUL to the end.  The scanner uses the following parts of
        the reader interface:
            reader.peek(k=1)     -- peek the next k characters
            reader.forward(k=1)  -- read the next k characters and move the
                                    pointer
            reader.get_marker()  -- marker for the current position
            reader.index, reader.line, reader.column -- current position
        """
        self.reader = reader

        # Had we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies whether a block
        # collection may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, marker)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}

    # Two public methods.

    def peek_token(self):
        """Return the current token without removing it (None if none)."""
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]

    def get_token(self):
        """Return the current token and remove it from the pending list.

        Returns None when there are no pending tokens left.
        """
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)

    # Private methods.

    def need_more_tokens(self):
        """Tell whether more input must be scanned before emitting a token."""
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        return self.next_possible_simple_key() == self.tokens_taken

    def fetch_more_tokens(self):
        """Scan the next token(s) and append them to `self.tokens`."""

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of reader?
        if ch == u'\0':
            return self.fetch_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the entry indicator?
        if ch in u'-,' and self.check_entry():
            return self.fetch_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        self.invalid_token()

    # Simple keys treatment.

    def next_possible_simple_key(self):
        """Return the token number of the nearest possible simple key.

        Returns None when no possible simple key is being tracked.
        """
        if not self.possible_simple_keys:
            return None
        return min([key.token_number
                    for key in self.possible_simple_keys.values()])

    def stale_possible_simple_keys(self):
        """Drop entries that can no longer be simple keys.

        According to the YAML specification, simple keys
        - should be limited to a single line,
        - should be no longer than 1024 characters.
        Disabling this procedure will allow simple keys of any length and
        height (may cause problems if indentation is broken though).

        Raises ScannerError if a stale key was required.
        """
        # Iterate over a snapshot: entries are deleted inside the loop, and
        # deleting while iterating a live dictionary view is an error on
        # Python 3.
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line \
                    or self.reader.index - key.index > 1024:
                if key.required:
                    self.fail("simple key is required")
                del self.possible_simple_keys[level]

    def save_possible_simple_key(self):
        """Remember the current position as a possible simple key start.

        The next token may start a simple key. We check if it's possible
        and save its position. This function is called for
        ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        """

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.reader.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken + len(self.tokens)
            index = self.reader.index
            line = self.reader.line
            column = self.reader.column
            marker = self.reader.get_marker()
            key = SimpleKey(token_number, required,
                    index, line, column, marker)
            self.possible_simple_keys[self.flow_level] = key

        # A simple key is required at the current position.
        elif required:
            self.fail("simple key is required")

    def remove_possible_simple_key(self):
        """Forget the possible simple key saved at the current flow level.

        Raises ScannerError if that key was required.
        """
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]
            if key.required:
                self.fail("simple key is required")
            # Actually drop the record; otherwise a later ':' on the same
            # line would still be matched against this discarded key.
            del self.possible_simple_keys[self.flow_level]

    # Indentation functions.

    def unwind_indent(self, column):
        """Pop indentation levels deeper than `column`, emitting BLOCK-END."""

        # In flow context, tokens should respect indentation.
        if self.flow_level and self.indent > column:
            self.fail("invalid intendation in the flow context")

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            marker = self.reader.get_marker()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(marker, marker))

    def add_indent(self, column):
        """Push a new indentation level if `column` is deeper.

        Returns True when the indentation was increased, False otherwise.
        """
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False

    # Fetchers.

    def fetch_end(self):
        """Emit the END token and mark the scanner as done."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        marker = self.reader.get_marker()

        # Add END.
        self.tokens.append(EndToken(marker, marker))

        # The reader is ended.
        self.done = True

    def fetch_directive(self):
        """Scan a directive line."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.scan_directive()

    def fetch_document_start(self):
        """Emit DOCUMENT-START for '---'."""
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        """Emit DOCUMENT-END for '...'."""
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):
        """Consume a three-character document indicator; emit `TokenClass`."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_marker = self.reader.get_marker()
        self.reader.forward(3)
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_sequence_start(self):
        """Emit FLOW-SEQUENCE-START for '['."""
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        """Emit FLOW-MAPPING-START for '{'."""
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):
        """Consume '[' or '{', enter one more flow level, emit `TokenClass`."""

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_sequence_end(self):
        """Emit FLOW-SEQUENCE-END for ']'."""
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        """Emit FLOW-MAPPING-END for '}'."""
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):
        """Consume ']' or '}', leave one flow level, emit `TokenClass`."""

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_entry(self):
        """Emit ENTRY for '-' (block context) or ',' (flow context)."""

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                self.fail("Cannot start a new entry here")

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockSequenceStartToken(marker, marker))

        # Simple keys are allowed after '-' and ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(EntryToken(start_marker, end_marker))

    def fetch_key(self):
        """Emit KEY for the '?' indicator."""

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                self.fail("Cannot start a new key here")

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockMappingStartToken(marker, marker))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(KeyToken(start_marker, end_marker))

    def fetch_value(self):
        """Emit VALUE for ':', inserting the KEY of a pending simple key."""

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number - self.tokens_taken,
                    KeyToken(key.marker, key.marker))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number - self.tokens_taken,
                            BlockMappingStartToken(key.marker, key.marker))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(ValueToken(start_marker, end_marker))

    def fetch_alias(self):
        """Scan an alias ('*...')."""

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.scan_anchor(AliasToken)

    def fetch_anchor(self):
        """Scan an anchor ('&...')."""

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.scan_anchor(AnchorToken)

    def fetch_tag(self):
        """Scan a tag ('!...')."""

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.scan_tag()

    def fetch_literal(self):
        """Scan a literal ('|') block scalar."""
        self.fetch_block_scalar(folded=False)

    def fetch_folded(self):
        """Scan a folded ('>') block scalar."""
        self.fetch_block_scalar(folded=True)

    def fetch_block_scalar(self, folded):
        """Scan a block scalar; `folded` selects folded vs. literal style."""

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.scan_block_scalar(folded)

    def fetch_single(self):
        """Scan a single-quoted scalar."""
        self.fetch_flow_scalar(double=False)

    def fetch_double(self):
        """Scan a double-quoted scalar."""
        self.fetch_flow_scalar(double=True)

    def fetch_flow_scalar(self, double):
        """Scan a quoted scalar; `double` selects the quoting style."""

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.scan_flow_scalar(double)

    def fetch_plain(self):
        """Scan a plain (unquoted) scalar."""

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.scan_plain()

    # Checkers.

    def check_directive(self):
        """Return True if a directive may start at the current position."""

        # DIRECTIVE:        ^ '%' ...
        # The '%' indicator is already checked.
        return self.reader.column == 0

    def check_document_start(self):
        """Return True if the reader is positioned at a '---' indicator."""

        # DOCUMENT-START:   ^ '---' (' '|'\n')
        if self.reader.column == 0:
            prefix = self.reader.peek(4)
            if prefix[:3] == u'---' \
                    and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
                return True
        return False

    def check_document_end(self):
        """Return True if the reader is positioned at a '...' indicator."""

        # DOCUMENT-END:     ^ '...' (' '|'\n')
        if self.reader.column == 0:
            prefix = self.reader.peek(4)
            if prefix[:3] == u'...' \
                    and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
                return True
        return False

    def check_entry(self):
        """Return True if the next character is an entry indicator."""

        # ENTRY(flow context):      ','
        if self.flow_level:
            return self.reader.peek() == u','

        # ENTRY(block context):     '-' (' '|'\n')
        else:
            prefix = self.reader.peek(2)
            return prefix[0] == u'-' \
                    and prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'

    def check_key(self):
        """Return True if the '?' at the current position is a key indicator."""

        # KEY(flow context):    '?'
        if self.flow_level:
            return True

        # KEY(block context):   '?' (' '|'\n')
        else:
            prefix = self.reader.peek(2)
            return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'

    def check_value(self):
        """Return True if the ':' at the current position is a value indicator."""

        # VALUE(flow context):  ':'
        if self.flow_level:
            return True

        # VALUE(block context): ':' (' '|'\n')
        else:
            prefix = self.reader.peek(2)
            return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'

    def check_plain(self):
        """Return True if a plain scalar may start here (currently always)."""
        return True

    # Scanners.

    def scan_to_next_token(self):
        """Skip spaces, comments and line breaks up to the next token.

        In the block context every skipped line break re-enables simple
        keys.
        """
        found = False
        while not found:
            while self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.peek() == u'#':
                # Stop at NUL as well: a comment on the last line may not
                # be followed by a line break.
                while self.reader.peek() not in u'\0\r\n':
                    self.reader.forward()
            if self.reader.peek() in u'\r\n':
                self.reader.forward()
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True

    def scan_directive(self):
        """Emit a directive token and skip the rest of the directive line.

        The directive arguments are not parsed yet; placeholder values are
        stored in the tokens.
        """
        marker = self.reader.get_marker()
        # `peek(k)` returns k characters, so the length passed must match
        # the length of the literal compared against.
        if self.reader.peek(6) == u'%YAML ':
            self.tokens.append(YAMLDirectiveToken(1, 1, marker, marker))
        elif self.reader.peek(5) == u'%TAG ':
            self.tokens.append(TagDirectiveToken(marker, marker))
        else:
            self.tokens.append(ReservedDirectiveToken('', marker, marker))
        while self.reader.peek() not in u'\0\r\n':
            self.reader.forward()
        # Consume the line break, but never step past the final NUL.
        if self.reader.peek() != u'\0':
            self.reader.forward()

    def scan_anchor(self, TokenClass):
        """Skip an alias/anchor name and emit `TokenClass` (name not kept)."""
        start_marker = self.reader.get_marker()
        while self.reader.peek() not in u'\0 \t\r\n,:':
            self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass('', start_marker, end_marker))

    def scan_tag(self):
        """Skip a tag and emit a TAG token (value not kept)."""
        start_marker = self.reader.get_marker()
        while self.reader.peek() not in u'\0 \t\r\n':
            self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TagToken('', start_marker, end_marker))

    def scan_block_scalar(self, folded):
        """Skip a block scalar and emit a SCALAR token (value not kept).

        `folded` is accepted but not used by the current implementation.
        """
        start_marker = self.reader.get_marker()
        indent = self.indent + 1
        if indent < 1:
            indent = 1
        while True:
            # Skip to the end of the current line.
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
            if self.reader.peek() != u'\0':
                self.reader.forward()
            # Count the indentation of the next line, up to `indent`.
            count = 0
            while count < indent and self.reader.peek() == u' ':
                self.reader.forward()
                count += 1
            # A less indented line that is neither empty nor a comment ends
            # the scalar.
            if count < indent \
                    and self.reader.peek() not in u'#\r\n\x85\u2028\u2029':
                break
        self.tokens.append(ScalarToken('', False, start_marker, start_marker))

    def scan_flow_scalar(self, double):
        """Skip a quoted scalar and emit a SCALAR token (value not kept).

        Raises ScannerError if the stream ends before the closing quote.
        """
        marker = self.reader.get_marker()
        quote = self.reader.peek()
        self.reader.forward()
        while self.reader.peek() != quote:
            # NUL marks the end of the stream; the closing quote is missing.
            if self.reader.peek() == u'\0':
                self.fail("unexpected end of stream in a quoted scalar")
            if double and self.reader.peek() == u'\\':
                self.reader.forward(2)
            elif not double and self.reader.peek(3)[1:] == u'\'\'':
                self.reader.forward(3)
            else:
                self.reader.forward(1)
        self.reader.forward(1)
        self.tokens.append(ScalarToken('', False, marker, marker))

    def scan_plain(self):
        """Skip a plain scalar and emit a SCALAR token (value not kept).

        Re-enables simple keys (block context only) when a line break is
        crossed while scanning.
        """
        indent = self.indent + 1
        if indent < 1:
            indent = 1
        # `space` is True when the previous character was whitespace; a '#'
        # starts a comment only in that case.
        space = False
        marker = self.reader.get_marker()
        while True:
            while self.reader.peek() == u' ':
                self.reader.forward()
                space = True
            while self.reader.peek() not in u'\0\r\n?:,[]{}#' \
                    or (not space and self.reader.peek() == u'#') \
                    or (not self.flow_level
                        and self.reader.peek() in u'?,[]{}') \
                    or (not self.flow_level and self.reader.peek() == u':'
                        and self.reader.peek(2)[1] not in u' \0\r\n'):
                # Remember whether the character being consumed is
                # whitespace (the original test was inverted).
                space = self.reader.peek() in u' \t'
                self.reader.forward()
                self.allow_simple_key = False
            if self.reader.peek() not in u'\r\n':
                break
            while self.reader.peek() in u'\r\n':
                self.reader.forward()
                if not self.flow_level:
                    self.allow_simple_key = True
            count = 0
            while self.reader.peek() == u' ' and count < indent:
                self.reader.forward()
                count += 1
            if count < indent:
                break
            space = True
        self.tokens.append(ScalarToken('', True, marker, marker))

    def invalid_token(self):
        """Report a character that cannot start any token."""
        self.fail("invalid token")

    def fail(self, message):
        """Raise ScannerError with the given message."""
        raise ScannerError(message)
758 #try:
759 # import psyco
760 # psyco.bind(Scanner)
761 #except ImportError:
762 # pass