To make porting easier, rewrite Parser not using generators.
[pyyaml/python3.git] / lib / yaml / parser.py
blobecf72feecd4386d8687fab1741ad7e3d89a06fa0
2 # The following YAML grammar is LL(1) and is parsed by a recursive descent
3 # parser.
5 # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
6 # implicit_document ::= block_node DOCUMENT-END*
7 # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
8 # block_node_or_indentless_sequence ::=
9 # ALIAS
10 # | properties (block_content | indentless_sequence)?
11 # | block_content
12 # | indentless_sequence
13 # block_node ::= ALIAS
14 # | properties block_content?
15 # | block_content
16 # flow_node ::= ALIAS
17 # | properties flow_content?
18 # | flow_content
19 # properties ::= TAG ANCHOR? | ANCHOR TAG?
20 # block_content ::= block_collection | flow_collection | SCALAR
21 # flow_content ::= flow_collection | SCALAR
22 # block_collection ::= block_sequence | block_mapping
23 # flow_collection ::= flow_sequence | flow_mapping
24 # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
25 # indentless_sequence ::= (BLOCK-ENTRY block_node?)+
26 # block_mapping ::= BLOCK-MAPPING-START
27 # ((KEY block_node_or_indentless_sequence?)?
28 # (VALUE block_node_or_indentless_sequence?)?)*
29 # BLOCK-END
30 # flow_sequence ::= FLOW-SEQUENCE-START
31 # (flow_sequence_entry FLOW-ENTRY)*
32 # flow_sequence_entry?
33 # FLOW-SEQUENCE-END
34 # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
35 # flow_mapping ::= FLOW-MAPPING-START
36 # (flow_mapping_entry FLOW-ENTRY)*
37 # flow_mapping_entry?
38 # FLOW-MAPPING-END
39 # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
41 # FIRST sets:
43 # stream: { STREAM-START }
44 # explicit_document: { DIRECTIVE DOCUMENT-START }
45 # implicit_document: FIRST(block_node)
46 # block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
47 # flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
48 # block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
49 # flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
50 # block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
51 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
52 # block_sequence: { BLOCK-SEQUENCE-START }
53 # block_mapping: { BLOCK-MAPPING-START }
54 # block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
55 # indentless_sequence: { BLOCK-ENTRY }
56 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
57 # flow_sequence: { FLOW-SEQUENCE-START }
58 # flow_mapping: { FLOW-MAPPING-START }
59 # flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
60 # flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
62 __all__ = ['Parser', 'ParserError']
64 from error import MarkedYAMLError
65 from tokens import *
66 from events import *
67 from scanner import *
class ParserError(MarkedYAMLError):
    # Raised by Parser when the token stream does not match the grammar or
    # when a directive is duplicated or unsupported.
    pass
72 class Parser:
73 # Since writing a recursive-descent parser is a straightforward task, we
74 # do not give many comments here.
75 # Note that the parser does not use generators: each parse_* method returns
76 # a single event and stores the method producing the next one in self.state.
78 DEFAULT_TAGS = {
79 u'!': u'!',
80 u'!!': u'tag:yaml.org,2002:',
def __init__(self):
    # Start idle: no event has been produced yet, and the first call to
    # self.state will parse the beginning of the stream.
    self.state = self.parse_stream_start
    self.current_event = None

    # Directive data; both are reset by process_directives().
    self.yaml_version = None
    self.tag_handles = {}

    # `states` is the stack of methods to resume once a nested node has been
    # parsed; `marks` holds the start marks of the collections that are
    # currently open (used in error reports).
    self.states = []
    self.marks = []
def check_event(self, *choices):
    # Check the type of the next event without consuming it.  With no
    # `choices`, any available event counts as a match.
    if self.current_event is None and self.state:
        # Run the current state lazily to produce the next event.
        self.current_event = self.state()
    pending = self.current_event
    if pending is None:
        return False
    if not choices:
        return True
    for event_class in choices:
        if isinstance(pending, event_class):
            return True
    return False
def peek_event(self):
    # Return the next event without consuming it; None when no event is
    # pending and there is no state left to produce one.
    if self.current_event is None and self.state:
        self.current_event = self.state()
    return self.current_event
def get_event(self):
    # Return the next event and consume it, so that the following call
    # advances the parser.
    if self.current_event is None and self.state:
        self.current_event = self.state()
    event, self.current_event = self.current_event, None
    return event
120 # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
121 # implicit_document ::= block_node DOCUMENT-END*
122 # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
def parse_stream_start(self):
    # Consume the first token and turn it into a StreamStartEvent that
    # carries the token's marks and encoding.
    first = self.get_token()
    event = StreamStartEvent(first.start_mark, first.end_mark,
            encoding=first.encoding)

    # The next event comes from the first (possibly implicit) document.
    self.state = self.parse_implicit_document_start
    return event
def parse_implicit_document_start(self):
    # A directive, an explicit document start or the end of the stream
    # means there is no implicit document here.
    if self.check_token(DirectiveToken, DocumentStartToken,
            StreamEndToken):
        return self.parse_document_start()

    # Implicit document: no directives, so only the default tag handles
    # apply (the class-level dict is shared, not copied).
    self.tag_handles = self.DEFAULT_TAGS
    mark = self.peek_token().start_mark
    event = DocumentStartEvent(mark, mark, explicit=False)

    # Parse the document's root node next, then its end.
    self.states.append(self.parse_document_end)
    self.state = self.parse_block_node
    return event
def parse_document_start(self):
    # Parse either the end of the stream or the start of an explicit
    # document (directives followed by a document start token).
    if self.check_token(StreamEndToken):
        closing = self.get_token()
        event = StreamEndEvent(closing.start_mark, closing.end_mark)
        # Nothing may be left open when the stream ends.
        assert not self.states
        assert not self.marks
        self.state = None
        return event

    start_mark = self.peek_token().start_mark
    version, tags = self.process_directives()
    if not self.check_token(DocumentStartToken):
        unexpected = self.peek_token()
        raise ParserError(None, None,
                "expected '<document start>', but found %r"
                % unexpected.id,
                unexpected.start_mark)
    end_mark = self.get_token().end_mark
    event = DocumentStartEvent(start_mark, end_mark,
            explicit=True, version=version, tags=tags)
    self.states.append(self.parse_document_end)
    self.state = self.parse_document_content
    return event
def parse_document_end(self):
    # Emit a DocumentEndEvent.  The event is zero-width and implicit unless
    # one or more document end tokens are present, all of which are consumed.
    start_mark = end_mark = self.peek_token().start_mark
    explicit = False
    while self.check_token(DocumentEndToken):
        end_mark = self.get_token().end_mark
        explicit = True
    event = DocumentEndEvent(start_mark, end_mark, explicit=explicit)

    # Another document or the end of the stream follows.
    self.state = self.parse_document_start
    return event
def parse_document_content(self):
    # Parse the root node of an explicit document.  When the document has no
    # content, an empty scalar stands in for it.
    if not self.check_token(DirectiveToken,
            DocumentStartToken, DocumentEndToken, StreamEndToken):
        return self.parse_block_node()
    placeholder = self.process_empty_scalar(self.peek_token().start_mark)
    self.state = self.states.pop()
    return placeholder
def process_directives(self):
    # Consume all directive tokens before a document start.  Returns
    # (yaml_version, tags), where tags is a copy of the handles declared by
    # TAG directives, or None when there were none.  Afterwards
    # self.tag_handles also contains the default handles.
    self.yaml_version = None
    self.tag_handles = {}
    while self.check_token(DirectiveToken):
        directive = self.get_token()
        if directive.name == u'YAML':
            if self.yaml_version is not None:
                raise ParserError(None, None,
                        "found duplicate YAML directive",
                        directive.start_mark)
            major, minor = directive.value
            if major != 1:
                raise ParserError(None, None,
                        "found incompatible YAML document (version 1.* is required)",
                        directive.start_mark)
            self.yaml_version = directive.value
        elif directive.name == u'TAG':
            handle, prefix = directive.value
            if handle in self.tag_handles:
                raise ParserError(None, None,
                        "duplicate tag handle %r" % handle.encode('utf-8'),
                        directive.start_mark)
            self.tag_handles[handle] = prefix

    # Snapshot the explicitly declared handles before adding the defaults.
    declared = None
    if self.tag_handles:
        declared = self.tag_handles.copy()
    result = (self.yaml_version, declared)

    # Defaults never override a handle declared by the document.
    for handle, prefix in self.DEFAULT_TAGS.items():
        if handle not in self.tag_handles:
            self.tag_handles[handle] = prefix
    return result
241 # block_node_or_indentless_sequence ::= ALIAS
242 # | properties (block_content | indentless_sequence)?
243 # | block_content
244 # | indentless_sequence
245 # block_node ::= ALIAS
246 # | properties block_content?
247 # | block_content
248 # flow_node ::= ALIAS
249 # | properties flow_content?
250 # | flow_content
251 # properties ::= TAG ANCHOR? | ANCHOR TAG?
252 # block_content ::= block_collection | flow_collection | SCALAR
253 # flow_content ::= flow_collection | SCALAR
254 # block_collection ::= block_sequence | block_mapping
255 # flow_collection ::= flow_sequence | flow_mapping
def parse_block_node(self):
    # Parse a node in block context: block collections are allowed.
    event = self.parse_node(block=True)
    return event
def parse_flow_node(self):
    # Parse a node in flow context: parse_node() is called with its defaults,
    # so block collections are not accepted.
    event = self.parse_node()
    return event
def parse_block_node_or_indentless_sequence(self):
    # Parse a block node that may also be a sequence introduced directly by
    # block entries, without a BLOCK-SEQUENCE-START token.
    event = self.parse_node(block=True, indentless_sequence=True)
    return event
def parse_node(self, block=False, indentless_sequence=False):
    # Parse the beginning of a node and return its event: an alias, a
    # scalar, or the start of a collection.  `block` allows block
    # collections; `indentless_sequence` additionally allows a sequence that
    # starts directly with a block entry.

    # An alias is a complete node on its own.
    if self.check_token(AliasToken):
        token = self.get_token()
        event = AliasEvent(token.value, token.start_mark, token.end_mark)
        self.state = self.states.pop()
        return event

    # properties ::= TAG ANCHOR? | ANCHOR TAG?
    anchor = tag = None
    start_mark = end_mark = tag_mark = None
    if self.check_token(AnchorToken):
        token = self.get_token()
        start_mark, end_mark = token.start_mark, token.end_mark
        anchor = token.value
        if self.check_token(TagToken):
            token = self.get_token()
            tag_mark, end_mark = token.start_mark, token.end_mark
            tag = token.value
    elif self.check_token(TagToken):
        token = self.get_token()
        start_mark = tag_mark = token.start_mark
        end_mark = token.end_mark
        tag = token.value
        if self.check_token(AnchorToken):
            token = self.get_token()
            end_mark = token.end_mark
            anchor = token.value

    # Expand a (handle, suffix) pair into a full tag; a tag equal to u'!'
    # is left untouched.
    if tag is not None and tag != u'!':
        handle, suffix = tag
        if handle is None:
            tag = suffix
        elif handle in self.tag_handles:
            tag = self.tag_handles[handle]+suffix
        else:
            raise ParserError("while parsing a node", start_mark,
                    "found undefined tag handle %r" % handle.encode('utf-8'),
                    tag_mark)

    # Without properties the node starts where the next token starts.
    if start_mark is None:
        start_mark = end_mark = self.peek_token().start_mark
    implicit = (tag is None or tag == u'!')

    if indentless_sequence and self.check_token(BlockEntryToken):
        end_mark = self.peek_token().end_mark
        event = SequenceStartEvent(anchor, tag, implicit,
                start_mark, end_mark)
        self.state = self.parse_indentless_sequence_entry
        return event

    if self.check_token(ScalarToken):
        token = self.get_token()
        end_mark = token.end_mark
        # For scalars `implicit` is a pair of flags rather than one flag.
        if (token.plain and tag is None) or tag == u'!':
            implicit = (True, False)
        elif tag is None:
            implicit = (False, True)
        else:
            implicit = (False, False)
        event = ScalarEvent(anchor, tag, implicit, token.value,
                start_mark, end_mark, style=token.style)
        self.state = self.states.pop()
        return event

    # The collection start tokens are only peeked at here; the *_first_*
    # states consume them.
    if self.check_token(FlowSequenceStartToken):
        end_mark = self.peek_token().end_mark
        event = SequenceStartEvent(anchor, tag, implicit,
                start_mark, end_mark, flow_style=True)
        self.state = self.parse_flow_sequence_first_entry
        return event

    if self.check_token(FlowMappingStartToken):
        end_mark = self.peek_token().end_mark
        event = MappingStartEvent(anchor, tag, implicit,
                start_mark, end_mark, flow_style=True)
        self.state = self.parse_flow_mapping_first_key
        return event

    if block and self.check_token(BlockSequenceStartToken):
        end_mark = self.peek_token().start_mark
        event = SequenceStartEvent(anchor, tag, implicit,
                start_mark, end_mark, flow_style=False)
        self.state = self.parse_block_sequence_first_entry
        return event

    if block and self.check_token(BlockMappingStartToken):
        end_mark = self.peek_token().start_mark
        event = MappingStartEvent(anchor, tag, implicit,
                start_mark, end_mark, flow_style=False)
        self.state = self.parse_block_mapping_first_key
        return event

    if anchor is not None or tag is not None:
        # Empty scalars are allowed even if a tag or an anchor is
        # specified.
        event = ScalarEvent(anchor, tag, (implicit, False), u'',
                start_mark, end_mark)
        self.state = self.states.pop()
        return event

    # Neither properties nor content: the node is missing.
    if block:
        node = 'block'
    else:
        node = 'flow'
    token = self.peek_token()
    raise ParserError("while scanning a %s node" % node, start_mark,
            "expected the node content, but found %r" % token.id,
            token.start_mark)
368 # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
def parse_block_sequence_first_entry(self):
    # Consume the token that opens the sequence and remember where the
    # sequence starts (used in error reports), then parse the first entry.
    opening = self.get_token()
    self.marks.append(opening.start_mark)
    first_event = self.parse_block_sequence_entry()
    return first_event
def parse_block_sequence_entry(self):
    # Parse one entry of a block sequence, or the end of the sequence.
    if self.check_token(BlockEntryToken):
        entry = self.get_token()
        if self.check_token(BlockEntryToken, BlockEndToken):
            # An entry indicator with no node after it is an empty scalar.
            self.state = self.parse_block_sequence_entry
            return self.process_empty_scalar(entry.end_mark)
        self.states.append(self.parse_block_sequence_entry)
        return self.parse_block_node()

    if self.check_token(BlockEndToken):
        closing = self.get_token()
        event = SequenceEndEvent(closing.start_mark, closing.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    unexpected = self.peek_token()
    raise ParserError("while scanning a block collection", self.marks[-1],
            "expected <block end>, but found %r" % unexpected.id,
            unexpected.start_mark)
394 # indentless_sequence ::= (BLOCK-ENTRY block_node?)+
def parse_indentless_sequence_entry(self):
    # Parse one entry of an indentless sequence, or its end.  Such a
    # sequence has no closing token: it ends at the first token that is not
    # a block entry, and the end event is zero-width.
    if not self.check_token(BlockEntryToken):
        upcoming = self.peek_token()
        event = SequenceEndEvent(upcoming.start_mark, upcoming.start_mark)
        self.state = self.states.pop()
        return event

    entry = self.get_token()
    if self.check_token(BlockEntryToken,
            KeyToken, ValueToken, BlockEndToken):
        # An entry indicator with no node after it is an empty scalar.
        self.state = self.parse_indentless_sequence_entry
        return self.process_empty_scalar(entry.end_mark)
    self.states.append(self.parse_indentless_sequence_entry)
    return self.parse_block_node()
411 # block_mapping ::= BLOCK-MAPPING-START
412 # ((KEY block_node_or_indentless_sequence?)?
413 # (VALUE block_node_or_indentless_sequence?)?)*
414 # BLOCK-END
def parse_block_mapping_first_key(self):
    # Consume the token that opens the mapping and remember where the
    # mapping starts (used in error reports), then parse the first key.
    opening = self.get_token()
    self.marks.append(opening.start_mark)
    first_event = self.parse_block_mapping_key()
    return first_event
def parse_block_mapping_key(self):
    # Parse a key of a block mapping, or the end of the mapping.
    if self.check_token(KeyToken):
        key_token = self.get_token()
        if self.check_token(KeyToken, ValueToken, BlockEndToken):
            # A key indicator with no node after it is an empty scalar.
            self.state = self.parse_block_mapping_value
            return self.process_empty_scalar(key_token.end_mark)
        self.states.append(self.parse_block_mapping_value)
        return self.parse_block_node_or_indentless_sequence()

    if self.check_token(BlockEndToken):
        closing = self.get_token()
        event = MappingEndEvent(closing.start_mark, closing.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    unexpected = self.peek_token()
    raise ParserError("while scanning a block mapping", self.marks[-1],
            "expected <block end>, but found %r" % unexpected.id,
            unexpected.start_mark)
def parse_block_mapping_value(self):
    # Parse the value that belongs to the key just parsed.  A missing value
    # indicator or a missing node yields an empty scalar.
    if not self.check_token(ValueToken):
        self.state = self.parse_block_mapping_key
        upcoming = self.peek_token()
        return self.process_empty_scalar(upcoming.start_mark)

    value_token = self.get_token()
    if self.check_token(KeyToken, ValueToken, BlockEndToken):
        self.state = self.parse_block_mapping_key
        return self.process_empty_scalar(value_token.end_mark)
    self.states.append(self.parse_block_mapping_key)
    return self.parse_block_node_or_indentless_sequence()
454 # flow_sequence ::= FLOW-SEQUENCE-START
455 # (flow_sequence_entry FLOW-ENTRY)*
456 # flow_sequence_entry?
457 # FLOW-SEQUENCE-END
458 # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
460 # Note that while the production rules for flow_sequence_entry and
461 # flow_mapping_entry are the same, their interpretations are different.
462 # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
463 # generates an inline mapping (set syntax).
def parse_flow_sequence_first_entry(self):
    # Consume the token that opens the flow sequence and remember where the
    # sequence starts (used in error reports).  The first entry is not
    # preceded by an entry separator, hence first=True.
    opening = self.get_token()
    self.marks.append(opening.start_mark)
    first_event = self.parse_flow_sequence_entry(first=True)
    return first_event
def parse_flow_sequence_entry(self, first=False):
    # Parse one entry of a flow sequence, or the end of the sequence.
    # `first` is true only for the entry right after the opening token,
    # which is the one entry that needs no preceding separator.
    if not self.check_token(FlowSequenceEndToken):
        if not first:
            if not self.check_token(FlowEntryToken):
                unexpected = self.peek_token()
                raise ParserError("while scanning a flow sequence", self.marks[-1],
                        "expected ',' or ']', but got %r" % unexpected.id,
                        unexpected.start_mark)
            self.get_token()

        if self.check_token(KeyToken):
            # A key inside a sequence opens a mapping whose pairs are parsed
            # by the parse_flow_sequence_entry_mapping_* states.
            key_token = self.get_token()
            event = MappingStartEvent(None, None, True,
                    key_token.start_mark, key_token.end_mark,
                    flow_style=True)
            self.state = self.parse_flow_sequence_entry_mapping_key
            return event
        if not self.check_token(FlowSequenceEndToken):
            self.states.append(self.parse_flow_sequence_entry)
            return self.parse_flow_node()

    # Reached at the closing token, including right after a trailing
    # separator.
    closing = self.get_token()
    event = SequenceEndEvent(closing.start_mark, closing.end_mark)
    self.state = self.states.pop()
    self.marks.pop()
    return event
def parse_flow_sequence_entry_mapping_key(self):
    # Parse the key of a mapping written inline inside a flow sequence.
    # Returns the key's event; an absent key yields an empty scalar.
    if self.check_token(ValueToken,
            FlowEntryToken, FlowSequenceEndToken):
        # The key is empty.  The KEY token was already consumed by
        # parse_flow_sequence_entry, so no `token` is available here (the
        # previous code referenced an unbound name and raised NameError);
        # place the empty scalar at the start of the next token instead.
        self.state = self.parse_flow_sequence_entry_mapping_value
        return self.process_empty_scalar(self.peek_token().start_mark)
    self.states.append(self.parse_flow_sequence_entry_mapping_value)
    return self.parse_flow_node()
def parse_flow_sequence_entry_mapping_value(self):
    # Parse the value of a mapping written inline inside a flow sequence.
    # A missing value indicator or a missing node yields an empty scalar.
    if not self.check_token(ValueToken):
        self.state = self.parse_flow_sequence_entry_mapping_end
        upcoming = self.peek_token()
        return self.process_empty_scalar(upcoming.start_mark)

    value_token = self.get_token()
    if self.check_token(FlowEntryToken, FlowSequenceEndToken):
        self.state = self.parse_flow_sequence_entry_mapping_end
        return self.process_empty_scalar(value_token.end_mark)
    self.states.append(self.parse_flow_sequence_entry_mapping_end)
    return self.parse_flow_node()
def parse_flow_sequence_entry_mapping_end(self):
    # Close the mapping written inline inside a flow sequence.  No token is
    # consumed, so the end event is zero-width at the next token.
    self.state = self.parse_flow_sequence_entry
    mark = self.peek_token().start_mark
    return MappingEndEvent(mark, mark)
524 # flow_mapping ::= FLOW-MAPPING-START
525 # (flow_mapping_entry FLOW-ENTRY)*
526 # flow_mapping_entry?
527 # FLOW-MAPPING-END
528 # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
def parse_flow_mapping_first_key(self):
    # Consume the token that opens the flow mapping and remember where the
    # mapping starts (used in error reports).  The first key is not preceded
    # by an entry separator, hence first=True.
    opening = self.get_token()
    self.marks.append(opening.start_mark)
    first_event = self.parse_flow_mapping_key(first=True)
    return first_event
def parse_flow_mapping_key(self, first=False):
    # Parse a key of a flow mapping, or the end of the mapping.  `first` is
    # true only for the key right after the opening token, which is the one
    # key that needs no preceding separator.
    if not self.check_token(FlowMappingEndToken):
        if not first:
            if not self.check_token(FlowEntryToken):
                unexpected = self.peek_token()
                raise ParserError("while scanning a flow mapping", self.marks[-1],
                        "expected ',' or '}', but got %r" % unexpected.id,
                        unexpected.start_mark)
            self.get_token()

        if self.check_token(KeyToken):
            key_token = self.get_token()
            if self.check_token(ValueToken,
                    FlowEntryToken, FlowMappingEndToken):
                # A key indicator with no node after it is an empty scalar.
                self.state = self.parse_flow_mapping_value
                return self.process_empty_scalar(key_token.end_mark)
            self.states.append(self.parse_flow_mapping_value)
            return self.parse_flow_node()
        if not self.check_token(FlowMappingEndToken):
            # A node without a key indicator is a key with an empty value.
            self.states.append(self.parse_flow_mapping_empty_value)
            return self.parse_flow_node()

    # Reached at the closing token, including right after a trailing
    # separator.
    closing = self.get_token()
    event = MappingEndEvent(closing.start_mark, closing.end_mark)
    self.state = self.states.pop()
    self.marks.pop()
    return event
def parse_flow_mapping_value(self):
    # Parse the value that belongs to the flow mapping key just parsed.  A
    # missing value indicator or a missing node yields an empty scalar.
    if not self.check_token(ValueToken):
        self.state = self.parse_flow_mapping_key
        upcoming = self.peek_token()
        return self.process_empty_scalar(upcoming.start_mark)

    value_token = self.get_token()
    if self.check_token(FlowEntryToken, FlowMappingEndToken):
        self.state = self.parse_flow_mapping_key
        return self.process_empty_scalar(value_token.end_mark)
    self.states.append(self.parse_flow_mapping_key)
    return self.parse_flow_node()
def parse_flow_mapping_empty_value(self):
    # Produce the empty value for a flow mapping key that was written
    # without a key indicator, then go back to parsing keys.
    self.state = self.parse_flow_mapping_key
    upcoming = self.peek_token()
    return self.process_empty_scalar(upcoming.start_mark)
def process_empty_scalar(self, mark):
    # Build a scalar event with an empty value and no anchor or tag that
    # both starts and ends at `mark`.
    implicit = (True, False)
    return ScalarEvent(None, None, implicit, u'', mark, mark)