Parser is done. Add iterator interfaces for Scanner and Parser.
[pyyaml/python3.git] / lib / yaml / scanner.py
blob220a99b69a3ccaf60806fc45319d442176c42f94
2 # Tokens:
3 # YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
4 # RESERVED-DIRECTIVE(name)
5 # DOCUMENT-START, DOCUMENT-END
6 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
7 # FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
8 # ENTRY, KEY, VALUE
9 # ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
11 __all__ = ['Scanner', 'ScannerError']
13 from error import YAMLError
14 from tokens import *
class ScannerError(YAMLError):
    # A scanning error carries up to two (description, marker) pairs:
    # the surrounding context and the concrete problem.  Rendered, it
    # looks like:
    #
    #   while reading a quoted string
    #   in '...', line 5, column 10:
    #     key: "valu\?e"
    #          ^
    #   got unknown quote character '?'
    #   in '...', line 5, column 15:
    #     key: "valu\?e"
    #               ^

    def __init__(self, context=None, context_marker=None,
            problem=None, problem_marker=None):
        self.context = context
        self.context_marker = context_marker
        self.problem = problem
        self.problem_marker = problem_marker

    def __str__(self):
        # Emit each non-None description followed by its marker's text.
        parts = []
        for description, marker in ((self.context, self.context_marker),
                (self.problem, self.problem_marker)):
            if description is not None:
                parts.append(description)
            if marker is not None:
                parts.append(str(marker))
        return '\n'.join(parts)
class SimpleKey:
    # Record of a *potential* simple key: where it starts in the stream
    # and which token number the corresponding KEY token would receive if
    # a ':' is found in time.  See the simple key machinery in Scanner.

    def __init__(self, token_number, required, index, line, column, marker):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.marker = marker
54 class Scanner:
    def __init__(self, reader):
        """Initialize the scanner."""
        # The input stream. The Reader class does the dirty work of checking
        # for BOM and converting the input data to Unicode. It also adds NUL
        # to the end.
        #
        # Reader supports the following methods:
        #   self.reader.peek(i=0)     # peek the next i-th character
        #   self.reader.prefix(l=1)   # peek the next l characters
        #   self.reader.forward(l=1)  # read the next l characters
        #                             #   and move the pointer
        self.reader = reader

        # Had we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Number of tokens that were emitted through the `get` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.
        #
        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies whether a block
        # collection may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, marker)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
119 # Public methods.
121 def check(self, *choices):
122 # Check if the next token is one of the given types.
123 while self.need_more_tokens():
124 self.fetch_more_tokens()
125 if self.tokens:
126 for choice in choices:
127 if isinstance(self.tokens[0], choice):
128 return True
129 return False
131 def peek(self):
132 # Return the next token, but do not delete if from the queue.
133 while self.need_more_tokens():
134 self.fetch_more_tokens()
135 if self.tokens:
136 return self.tokens[0]
138 def get(self):
139 # Return the next token.
140 while self.need_more_tokens():
141 self.fetch_more_tokens()
142 if self.tokens:
143 self.tokens_taken += 1
144 return self.tokens.pop(0)
146 def __iter__(self):
147 # Iterator protocol.
148 while self.need_more_tokens():
149 self.fetch_more_tokens()
150 while self.tokens:
151 self.tokens_taken += 1
152 yield self.tokens.pop(0)
153 while self.need_more_tokens():
154 self.fetch_more_tokens()
156 # Private methods.
    def need_more_tokens(self):
        # Report whether `fetch_more_tokens` must run before the next token
        # can be handed out.  Note: returns None (falsy) rather than False
        # when the queued head token is safe to release.
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        # If the head token's number is still a candidate simple key, we
        # must keep scanning until the matching ':' is found or ruled out.
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
    def fetch_more_tokens(self):
        # Dispatch on the next character and run exactly one fetch_*
        # handler, which appends one or more tokens to `self.tokens`.
        # The first four checks are order-sensitive; the rest are not.

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch in u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch in u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?  ('|' is only special in block context.)
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?  ('>' is only special in block context.)
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.reader.get_marker())
271 # Simple keys treatment.
273 def next_possible_simple_key(self):
274 # Return the number of the nearest possible simple key. Actually we
275 # don't need to loop through the whole dictionary. We may replace it
276 # with the following code:
277 # if not self.possible_simple_keys:
278 # return None
279 # return self.possible_simple_keys[
280 # min(self.possible_simple_keys.keys())].token_number
281 min_token_number = None
282 for level in self.possible_simple_keys:
283 key = self.possible_simple_keys[level]
284 if min_token_number is None or key.token_number < min_token_number:
285 min_token_number = key.token_number
286 return min_token_number
288 def stale_possible_simple_keys(self):
289 # Remove entries that are no longer possible simple keys. According to
290 # the YAML specification, simple keys
291 # - should be limited to a single line,
292 # - should be no longer than 1024 characters.
293 # Disabling this procedure will allow simple keys of any length and
294 # height (may cause problems if indentation is broken though).
295 for level in self.possible_simple_keys.keys():
296 key = self.possible_simple_keys[level]
297 if key.line != self.reader.line \
298 or self.reader.index-key.index > 1024:
299 if key.required:
300 raise ScannerError("while scanning a simple key", key.marker,
301 "could not found expected ':'", self.reader.get_marker())
302 del self.possible_simple_keys[level]
    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position: only
        # in block context, at the current indentation column.
        required = not self.flow_level and self.indent == self.reader.column

        # A simple key is required only if it is the first token in the
        # current line. Therefore it is always allowed.
        assert self.allow_simple_key or not required

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            index = self.reader.index
            line = self.reader.line
            column = self.reader.column
            marker = self.reader.get_marker()
            key = SimpleKey(token_number, required,
                    index, line, column, marker)
            self.possible_simple_keys[self.flow_level] = key
329 def remove_possible_simple_key(self):
330 # Remove the saved possible key position at the current flow level.
331 if self.flow_level in self.possible_simple_keys:
332 key = self.possible_simple_keys[self.flow_level]
334 # I don't think it's possible, but I could be wrong.
335 assert not key.required
336 #if key.required:
337 # raise ScannerError("while scanning a simple key", key.marker,
338 # "could not found expected ':'", self.reader.get_marker())
340 # Indentation functions.
342 def unwind_indent(self, column):
344 # In flow context, tokens should respect indentation.
345 # Actually the condition should be `self.indent >= column` according to
346 # the spec. But this condition will prohibit intuitively correct
347 # constructions such as
348 # key : {
350 if self.flow_level and self.indent > column:
351 raise ScannerError(None, None,
352 "invalid intendation or unclosed '[' or '{'",
353 self.reader.get_marker())
355 # In block context, we may need to issue the BLOCK-END tokens.
356 while self.indent > column:
357 marker = self.reader.get_marker()
358 self.indent = self.indents.pop()
359 self.tokens.append(BlockEndToken(marker, marker))
361 def add_indent(self, column):
362 # Check if we need to increase indentation.
363 if self.indent < column:
364 self.indents.append(self.indent)
365 self.indent = column
366 return True
367 return False
369 # Fetchers.
    def fetch_stream_end(self):
        # Emit STREAM-END and mark the scanner as finished.

        # Set the current indentation to -1 (closes all block collections).
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token position.
        marker = self.reader.get_marker()

        # Add END.
        self.tokens.append(StreamEndToken(marker, marker))

        # The reader is ended.
        self.done = True
    def fetch_directive(self):
        # Handle a '%' directive line.

        # Set the current indentation to -1 (closes all block collections).
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
    def fetch_document_start(self):
        # '---' at the start of a line: emit DOCUMENT-START.
        self.fetch_document_indicator(DocumentStartToken)
    def fetch_document_end(self):
        # '...' at the start of a line: emit DOCUMENT-END.
        self.fetch_document_indicator(DocumentEndToken)
407 def fetch_document_indicator(self, TokenClass):
409 # Set the current intendation to -1.
410 self.unwind_indent(-1)
412 # Reset simple keys. Note that there could not be a block collection
413 # after '---'.
414 self.remove_possible_simple_key()
415 self.allow_simple_key = False
417 # Add DOCUMENT-START or DOCUMENT-END.
418 start_marker = self.reader.get_marker()
419 self.reader.forward(3)
420 end_marker = self.reader.get_marker()
421 self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_sequence_start(self):
        # '[': emit FLOW-SEQUENCE-START.
        self.fetch_flow_collection_start(FlowSequenceStartToken)
    def fetch_flow_mapping_start(self):
        # '{': emit FLOW-MAPPING-START.
        self.fetch_flow_collection_start(FlowMappingStartToken)
429 def fetch_flow_collection_start(self, TokenClass):
431 # '[' and '{' may start a simple key.
432 self.save_possible_simple_key()
434 # Increase the flow level.
435 self.flow_level += 1
437 # Simple keys are allowed after '[' and '{'.
438 self.allow_simple_key = True
440 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
441 start_marker = self.reader.get_marker()
442 self.reader.forward()
443 end_marker = self.reader.get_marker()
444 self.tokens.append(TokenClass(start_marker, end_marker))
    def fetch_flow_sequence_end(self):
        # ']': emit FLOW-SEQUENCE-END.
        self.fetch_flow_collection_end(FlowSequenceEndToken)
    def fetch_flow_mapping_end(self):
        # '}': emit FLOW-MAPPING-END.
        self.fetch_flow_collection_end(FlowMappingEndToken)
452 def fetch_flow_collection_end(self, TokenClass):
454 # Reset possible simple key on the current level.
455 self.remove_possible_simple_key()
457 # Decrease the flow level.
458 self.flow_level -= 1
460 # No simple keys after ']' or '}'.
461 self.allow_simple_key = False
463 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
464 start_marker = self.reader.get_marker()
465 self.reader.forward()
466 end_marker = self.reader.get_marker()
467 self.tokens.append(TokenClass(start_marker, end_marker))
469 def fetch_flow_entry(self):
471 # Simple keys are allowed after ','.
472 self.allow_simple_key = True
474 # Reset possible simple key on the current level.
475 self.remove_possible_simple_key()
477 # Add FLOW-ENTRY.
478 start_marker = self.reader.get_marker()
479 self.reader.forward()
480 end_marker = self.reader.get_marker()
481 self.tokens.append(FlowEntryToken(start_marker, end_marker))
    def fetch_block_entry(self):
        # Handle the '-' block entry indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockSequenceStartToken(marker, marker))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(BlockEntryToken(start_marker, end_marker))
    def fetch_key(self):
        # Handle the '?' key indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockMappingStartToken(marker, marker))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(KeyToken(start_marker, end_marker))
    def fetch_value(self):
        # Handle the ':' value indicator.

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY: insert it at the position recorded when the key
            # candidate was saved (token numbers are queue positions offset
            # by `tokens_taken`).
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.marker, key.marker))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START before the KEY token.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.marker, key.marker))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_marker())

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(ValueToken(start_marker, end_marker))
    def fetch_alias(self):
        # Handle '*': scan an alias name.

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
    def fetch_anchor(self):
        # Handle '&': scan an anchor name.

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
    def fetch_tag(self):
        # Handle '!': scan a tag.

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
    def fetch_literal(self):
        # '|': literal block scalar.
        self.fetch_block_scalar(folded=False)
    def fetch_folded(self):
        # '>': folded block scalar.
        self.fetch_block_scalar(folded=True)
    def fetch_block_scalar(self, folded):
        # Shared implementation for '|' and '>' block scalars.

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(folded))
    def fetch_single(self):
        # "'": single quoted flow scalar.
        self.fetch_flow_scalar(double=False)
    def fetch_double(self):
        # '"': double quoted flow scalar.
        self.fetch_flow_scalar(double=True)
    def fetch_flow_scalar(self, double):
        # Shared implementation for single and double quoted scalars.

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(double))
    def fetch_plain(self):
        # Handle an unquoted (plain) scalar.

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
672 # Checkers.
674 def check_directive(self):
676 # DIRECTIVE: ^ '%' ...
677 # The '%' indicator is already checked.
678 if self.reader.column == 0:
679 return True
    def check_document_start(self):
        # DOCUMENT-START: ^ '---' (' '|'\n')
        # True only at column 0 with '---' followed by a space, tab, break,
        # or end of stream; otherwise falls through (implicit None).
        if self.reader.column == 0:
            if self.reader.prefix(3) == u'---'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True
689 def check_document_end(self):
691 # DOCUMENT-END: ^ '...' (' '|'\n')
692 if self.reader.column == 0:
693 prefix = self.reader.peek(4)
694 if self.reader.prefix(3) == u'...' \
695 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
696 return True
698 def check_block_entry(self):
700 # BLOCK-ENTRY: '-' (' '|'\n')
701 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
703 def check_key(self):
705 # KEY(flow context): '?'
706 if self.flow_level:
707 return True
709 # KEY(block context): '?' (' '|'\n')
710 else:
711 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
713 def check_value(self):
715 # VALUE(flow context): ':'
716 if self.flow_level:
717 return True
719 # VALUE(block context): ':' (' '|'\n')
720 else:
721 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
    def check_plain(self):
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.reader.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in u'?:')))
742 # Scanners.
    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        #
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            # Skip spaces.
            while self.reader.peek() == u' ':
                self.reader.forward()
            # Skip a comment to the end of the line.
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            # Consume a line break, if any; a break in block context means
            # a simple key may start on the new line.
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        # Scan a full '%' directive line and return a DirectiveToken.
        # See the specification for details.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        name = self.scan_directive_name(start_marker)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        else:
            # Unknown (reserved) directive: skip its arguments.
            end_marker = self.reader.get_marker()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_marker)
        return DirectiveToken(name, value, start_marker, end_marker)
786 def scan_directive_name(self, start_marker):
787 # See the specification for details.
788 length = 0
789 ch = self.reader.peek(length)
790 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
791 or ch in u'-_':
792 length += 1
793 ch = self.reader.peek(length)
794 if not length:
795 raise ScannerError("while scanning a directive", start_marker,
796 "expected directive name, but found %r" % ch.encode('utf-8'),
797 self.reader.get_marker())
798 value = self.reader.prefix(length)
799 self.reader.forward(length)
800 ch = self.reader.peek()
801 if ch not in u'\0 \r\n\x85\u2028\u2029':
802 raise ScannerError("while scanning a directive" % name, start_marker,
803 "expected alphabetic or numeric character, but found %r"
804 % ch.encode('utf-8'), self.reader.get_marker())
805 return value
807 def scan_yaml_directive_value(self, start_marker):
808 # See the specification for details.
809 while self.reader.peek() == u' ':
810 self.reader.forward()
811 major = self.scan_yaml_directive_number(start_marker)
812 if self.reader.peek() != '.':
813 raise ScannerError("while scanning a directive", start_marker,
814 "expected a digit or '.', but found %r" % ch.encode('utf-8'),
815 self.reader.get_marker())
816 self.reader.forward()
817 minor = self.scan_yaml_directive_number(start_marker)
818 if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
819 raise ScannerError("while scanning a directive", start_marker,
820 "expected a digit or ' ', but found %r" % ch.encode('utf-8'),
821 self.reader.get_marker())
822 return (major, minor)
824 def scan_yaml_directive_number(self, start_marker):
825 # See the specification for details.
826 ch = self.reader.peek()
827 if not (u'0' <= ch <= '9'):
828 raise ScannerError("while scanning a directive", start_marker,
829 "expected a digit, but found %r" % ch.encode('utf-8'),
830 self.reader.get_marker())
831 length = 0
832 while u'0' <= self.reader.peek(length) <= u'9':
833 length += 1
834 value = int(self.reader.prefix(length))
835 self.reader.forward(length)
836 return value
    def scan_tag_directive_value(self, start_marker):
        # Scan "<handle> <prefix>" after a %TAG directive name and return
        # them as a (handle, prefix) tuple.
        while self.reader.peek() == u' ':
            self.reader.forward()
        handle = self.scan_tag_directive_handle(start_marker)
        while self.reader.peek() == u' ':
            self.reader.forward()
        prefix = self.scan_tag_directive_prefix(start_marker)
        return (handle, prefix)
848 def scan_tag_directive_handle(self, start_marker):
849 # See the specification for details.
850 value = self.scan_tag_handle('directive', start_marker)
851 if self.reader.peek() != u' ':
852 raise ScannerError("while scanning a directive", start_marker,
853 "expected ' ', but found %r" % ch.encode('utf-8'),
854 self.reader.get_marker())
855 return value
    def scan_tag_directive_prefix(self, start_marker):
        # Scan the prefix part of a %TAG directive; it must be followed by
        # a space, a break, or the end of the stream.
        value = self.scan_tag_uri('directive', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return value
    def scan_directive_ignored_line(self, start_marker):
        # Skip trailing spaces and an optional comment after a directive,
        # then consume the line break that terminates the directive line.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()
    def scan_anchor(self, TokenClass):
        # Scan an alias ('*name') or anchor ('&name') and return a token of
        # the given TokenClass (AliasToken or AnchorToken).
        #
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_marker = self.reader.get_marker()
        # The indicator character selects the noun used in error messages.
        indicator = self.reader.peek()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected anchor name, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        # The name must end before whitespace or a flow indicator.
        ch = self.reader.peek()
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        end_marker = self.reader.get_marker()
        return TokenClass(value, start_marker, end_marker)
917 def scan_tag(self):
918 # See the specification for details.
919 start_marker = self.reader.get_marker()
920 ch = self.reader.peek(1)
921 if ch == u'<':
922 handle = None
923 self.reader.forward(2)
924 suffix = self.scan_tag_uri('tag', start_marker)
925 if self.reader.peek() != u'>':
926 raise ScannerError("while parsing a tag", start_marking,
927 "expected '>', but got %r" % self.reader.peek().encode('utf-8'),
928 self.reader.get_marker())
929 self.reader.forward()
930 elif ch in u'\0 \t\r\n\x85\u2028\u2029':
931 handle = None
932 suffix = u'!'
933 self.reader.forward()
934 else:
935 length = 1
936 use_handle = False
937 while ch not in u'\0 \r\n\x85\u2028\u2029':
938 if ch == u'!':
939 use_handle = True
940 break
941 length += 1
942 ch = self.reader.peek(length)
943 handle = u'!'
944 if use_handle:
945 handle = self.scan_tag_handle('tag', start_marker)
946 else:
947 handle = u'!'
948 self.reader.forward()
949 suffix = self.scan_tag_uri('tag', start_marker)
950 ch = self.reader.peek()
951 if ch not in u'\0 \r\n\x85\u2028\u2029':
952 raise ScannerError("while scanning a tag", start_marker,
953 "expected ' ', but found %r" % ch.encode('utf-8'),
954 self.reader.get_marker())
955 value = (handle, suffix)
956 end_marker = self.reader.get_marker()
957 return TagToken(value, start_marker, end_marker)
    def scan_block_scalar(self, folded):
        # Scan a literal ('|') or folded ('>') block scalar; `folded`
        # selects the line-folding behavior.  Returns a SCALAR token with
        # `plain` set to False.

        chunks = []
        start_marker = self.reader.get_marker()

        # Scan the header: the indicator, optional chomping/indentation
        # indicators, and the rest of the header line.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_marker)
        self.scan_block_scalar_ignored_line(start_marker)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indicator: detect indentation from the content.
            breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if folded and line_break == u'\n'  \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail: keep the final break unless stripping ('-'),
        # keep all trailing breaks only when keeping ('+').
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
1028 def scan_block_scalar_indicators(self, start_marker):
1029 # See the specification for details.
1030 chomping = None
1031 increment = None
1032 ch = self.reader.peek()
1033 if ch in u'+-':
1034 if ch == '+':
1035 chomping = True
1036 else:
1037 chomping = False
1038 self.reader.forward()
1039 ch = self.reader.peek()
1040 if ch in u'0123456789':
1041 increment = int(ch)
1042 if increment == 0:
1043 raise ScannerError("while scanning a block scalar", start_marker,
1044 "expected indentation indicator in the range 1-9, but found 0",
1045 self.reader.get_marker())
1046 self.reader.forward()
1047 elif ch in u'0123456789':
1048 increment = int(ch)
1049 if increment == 0:
1050 raise ScannerError("while scanning a block scalar", start_marker,
1051 "expected indentation indicator in the range 1-9, but found 0",
1052 self.reader.get_marker())
1053 self.reader.forward()
1054 ch = self.reader.peek()
1055 if ch in u'+-':
1056 if ch == '+':
1057 chomping = True
1058 else:
1059 chomping = False
1060 self.reader.forward()
1061 ch = self.reader.peek()
1062 if ch not in u'\0 \r\n\x85\u2028\u2029':
1063 raise ScannerError("while scanning a block scalar", start_marker,
1064 "expected chomping or indentation indicators, but found %r"
1065 % ch.encode('utf-8'), self.reader.get_marker())
1066 return chomping, increment
1068 def scan_block_scalar_ignored_line(self, start_marker):
1069 # See the specification for details.
1070 while self.reader.peek() == u' ':
1071 self.reader.forward()
1072 if self.reader.peek() == u'#':
1073 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
1074 self.reader.forward()
1075 ch = self.reader.peek()
1076 if ch not in u'\0\r\n\x85\u2028\u2029':
1077 raise ScannerError("while scanning a block scalar", start_marker,
1078 "expected a comment or a line break, but found %r"
1079 % ch.encode('utf-8'), self.reader.get_marker())
1080 self.scan_line_break()
1082 def scan_block_scalar_indentation(self):
1083 # See the specification for details.
1084 chunks = []
1085 max_indent = 0
1086 end_marker = self.reader.get_marker()
1087 while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1088 if self.reader.peek() != u' ':
1089 chunks.append(self.scan_line_break())
1090 end_marker = self.reader.get_marker()
1091 else:
1092 self.reader.forward()
1093 if self.reader.column > max_indent:
1094 max_indent = self.reader.column
1095 return chunks, max_indent, end_marker
1097 def scan_block_scalar_breaks(self, indent):
1098 # See the specification for details.
1099 chunks = []
1100 end_marker = self.reader.get_marker()
1101 while self.reader.column < indent and self.reader.peek() == u' ':
1102 self.reader.forward()
1103 while self.reader.peek() in u'\r\n\x85\u2028\u2029':
1104 chunks.append(self.scan_line_break())
1105 end_marker = self.reader.get_marker()
1106 while self.reader.column < indent and self.reader.peek() == u' ':
1107 self.reader.forward()
1108 return chunks, end_marker
1110 def scan_flow_scalar(self, double):
1111 # See the specification for details.
1112 chunks = []
1113 start_marker = self.reader.get_marker()
1114 indent = self.indent+1
1115 if indent == 0:
1116 indent = 1
1117 quote = self.reader.peek()
1118 self.reader.forward()
1119 chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
1120 while self.reader.peek() != quote:
1121 chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
1122 chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
1123 self.reader.forward()
1124 end_marker = self.reader.get_marker()
1125 return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
    # Single-character escape sequences allowed in double-quoted scalars:
    # the key is the character following the backslash, the value is its
    # replacement (C-style controls plus NEL, NBSP, LS and PS).
    ESCAPE_REPLACEMENTS = {
        u'0': u'\0',
        u'a': u'\x07',
        u'b': u'\x08',
        u't': u'\x09',
        u'\t': u'\x09',
        u'n': u'\x0A',
        u'v': u'\x0B',
        u'f': u'\x0C',
        u'r': u'\x0D',
        u'e': u'\x1B',
        u' ': u'\x20',
        u'\"': u'\"',
        u'\\': u'\\',
        u'N': u'\x85',
        u'_': u'\xA0',
        u'L': u'\u2028',
        u'P': u'\u2029',
    }

    # Numeric escape introducers mapped to the number of hexadecimal digits
    # that must follow: \xXX, \uXXXX, \UXXXXXXXX.
    ESCAPE_CODES = {
        u'x': 2,
        u'u': 4,
        u'U': 8,
    }
    def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
        # See the specification for details.
        # Scan the non-whitespace portions of a quoted scalar: quote
        # doubling ('') in single-quoted scalars, and backslash escapes
        # (including \xXX, \uXXXX, \UXXXXXXXX) in double-quoted scalars.
        chunks = []
        while True:
            # Take the longest possible run of ordinary characters.
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                # In a single-quoted scalar, '' denotes a literal quote.
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # This character has no special meaning in the current style.
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: a fixed number of hex digits follows.
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_marker,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is a line continuation: drop it.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
            else:
                # Whitespace, a line break, the closing quote or NUL:
                # let the caller decide what comes next.
                return chunks
1196 def scan_flow_scalar_spaces(self, double, indent, start_marker):
1197 # See the specification for details.
1198 chunks = []
1199 length = 0
1200 while self.reader.peek(length) in u' \t':
1201 length += 1
1202 whitespaces = self.reader.prefix(length)
1203 self.reader.forward(length)
1204 ch = self.reader.peek()
1205 if ch == u'\0':
1206 raise ScannerError("while scanning a quoted scalar", start_marker,
1207 "found unexpected end of stream", self.reader.get_marker())
1208 elif ch in u'\r\n\x85\u2028\u2029':
1209 line_break = self.scan_line_break()
1210 breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
1211 if line_break != u'\n':
1212 chunks.append(line_break)
1213 elif not breaks:
1214 chunks.append(u' ')
1215 chunks.extend(breaks)
1216 else:
1217 chunks.append(whitespaces)
1218 return chunks
1220 def scan_flow_scalar_breaks(self, double, indent, start_marker):
1221 # See the specification for details.
1222 chunks = []
1223 while True:
1224 while self.reader.column < indent and self.reader.peek() == u' ':
1225 self.reader.forward()
1226 if self.reader.column < indent \
1227 and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
1228 s = 's'
1229 if indent == 1:
1230 s = ''
1231 raise ScannerError("while scanning a quoted scalar", start_marker,
1232 "expected %d space%s indentation, but found %r"
1233 % (indent, s, self.reader.peek().encode('utf-8')),
1234 self.reader.get_marker())
1235 while self.reader.peek() in u' \t':
1236 self.reader.forward()
1237 if self.reader.peek() in u'\r\n\x85\u2028\u2029':
1238 chunks.append(self.scan_line_break())
1239 else:
1240 return chunks
1242 def scan_plain(self):
1243 # See the specification for details.
1244 # We add an additional restriction for the flow context:
1245 # plain scalars in the flow context cannot contain ':' and '?'.
1246 # We also keep track of the `allow_simple_key` flag here.
1247 chunks = []
1248 start_marker = self.reader.get_marker()
1249 end_marker = start_marker
1250 indent = self.indent+1
1251 if indent == 0:
1252 indent = 1
1253 spaces = []
1254 while True:
1255 length = 0
1256 if self.reader.peek() == u'#':
1257 break
1258 while True:
1259 ch = self.reader.peek(length)
1260 if ch in u'\0 \t\r\n\x85\u2028\u2029' \
1261 or (not self.flow_level and ch == u':' and
1262 self.reader.peek(length+1) in u'\0 \t\r\n\x28\u2028\u2029') \
1263 or (self.flow_level and ch in u',:?[]{}'):
1264 break
1265 length += 1
1266 if length == 0:
1267 break
1268 self.allow_simple_key = False
1269 chunks.extend(spaces)
1270 chunks.append(self.reader.prefix(length))
1271 self.reader.forward(length)
1272 end_marker = self.reader.get_marker()
1273 spaces = self.scan_plain_spaces(indent)
1274 if not spaces or self.reader.peek() == u'#' \
1275 or self.reader.column < indent:
1276 break
1277 return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
1279 def scan_plain_spaces(self, indent):
1280 # See the specification for details.
1281 # The specification is really confusing about tabs in plain scalars.
1282 # We just forbid them completely. Do not use tabs in YAML!
1283 chunks = []
1284 length = 0
1285 while self.reader.peek(length) in u' ':
1286 length += 1
1287 whitespaces = self.reader.prefix(length)
1288 self.reader.forward(length)
1289 ch = self.reader.peek()
1290 if ch in u'\r\n\x85\u2028\u2029':
1291 line_break = self.scan_line_break()
1292 self.allow_simple_key = True
1293 breaks = []
1294 while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1295 if self.reader.peek() == ' ':
1296 self.reader.forward()
1297 else:
1298 breaks.append(self.scan_line_break())
1299 if line_break != u'\n':
1300 chunks.append(line_break)
1301 elif not breaks:
1302 chunks.append(u' ')
1303 chunks.extend(breaks)
1304 elif whitespaces:
1305 chunks.append(whitespaces)
1306 return chunks
1308 def scan_tag_handle(self, name, start_marker):
1309 # See the specification for details.
1310 # For some strange reasons, the specification does not allow '_' in
1311 # tag handles. I have allowed it anyway.
1312 if self.reader.peek() != u'!':
1313 raise ScannerError("while scanning a %s" % name, start_marker,
1314 "expected '!', but found %r" % ch.encode('utf-8'),
1315 self.reader.get_marker())
1316 length = 1
1317 ch = self.reader.peek(length)
1318 if ch != u' ':
1319 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1320 or ch in u'-_':
1321 length += 1
1322 ch = self.reader.peek(length)
1323 if ch != u'!':
1324 self.reader.forward(length)
1325 raise ScannerError("while scanning a %s" % name, start_marker,
1326 "expected '!', but found %r" % ch.encode('utf-8'),
1327 self.reader.get_marker())
1328 length += 1
1329 value = self.reader.prefix(length)
1330 self.reader.forward(length)
1331 return value
1333 def scan_tag_uri(self, name, start_marker):
1334 # See the specification for details.
1335 # Note: we do not check if URI is well-formed.
1336 chunks = []
1337 length = 0
1338 ch = self.reader.peek(length)
1339 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \
1340 or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
1341 if ch == u'%':
1342 chunks.append(self.reader.prefix(length))
1343 self.reader.forward(length)
1344 length = 0
1345 chunks.append(self.scan_uri_escapes(name, start_marker))
1346 else:
1347 length += 1
1348 ch = self.reader.peek(length)
1349 if length:
1350 chunks.append(self.reader.prefix(length))
1351 self.reader.forward(length)
1352 length = 0
1353 if not chunks:
1354 raise ScannerError("while parsing a %s" % name, start_marker,
1355 "expected URI, but found %r" % ch.encode('utf-8'),
1356 self.reader.get_marker())
1357 return u''.join(chunks)
    def scan_uri_escapes(self, name, start_marker):
        # See the specification for details.
        # Decode a run of '%XX' escapes into a unicode value: the raw bytes
        # are collected first and UTF-8 decoded all at once, so multi-byte
        # sequences split across several escapes decode correctly.
        bytes = []
        marker = self.reader.get_marker()
        while self.reader.peek() == u'%':
            self.reader.forward()
            # Each escape must be followed by exactly two hex digits.
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_marker,
                            "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                            (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            # Report the decoding failure at the start of the escape run.
            raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
        return value
1378 def scan_line_break(self):
1379 # Transforms:
1380 # '\r\n' : '\n'
1381 # '\r' : '\n'
1382 # '\n' : '\n'
1383 # '\x85' : '\n'
1384 # '\u2028' : '\u2028'
1385 # '\u2029 : '\u2029'
1386 # default : ''
1387 ch = self.reader.peek()
1388 if ch in u'\r\n\x85':
1389 if self.reader.prefix(2) == u'\r\n':
1390 self.forward(2)
1391 else:
1392 self.reader.forward()
1393 return u'\n'
1394 elif ch in u'\u2028\u2029':
1395 self.reader.forward()
1396 return ch
1397 return u''
1399 #try:
1400 # import psyco
1401 # psyco.bind(Scanner)
1402 #except ImportError:
1403 # pass