# YAML can be parsed by an LL(1) parser!

# We use the following production rules:
# stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
# implicit_document ::= block_node DOCUMENT-END?
# block_node ::= ALIAS | properties? block_content
# flow_node ::= ALIAS | properties? flow_content
# properties ::= TAG ANCHOR? | ANCHOR TAG?
# block_content ::= block_collection | flow_collection | SCALAR
# flow_content ::= flow_collection | SCALAR
# block_collection ::= block_sequence | block_mapping
# block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
# block_mapping ::= BLOCK-MAPPING-START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
# indentless_block_sequence ::= (BLOCK-ENTRY block_node?)+
# flow_collection ::= flow_sequence | flow_mapping
# flow_sequence ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
# flow_mapping ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
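
# For example, the document "- foo" is derived as:
#   stream -> STREAM-START implicit_document STREAM-END
#   implicit_document -> block_node
#   block_node -> block_content -> block_collection -> block_sequence
#   block_sequence -> BLOCK-SEQUENCE-START BLOCK-ENTRY block_node BLOCK-END
#   block_node -> block_content -> SCALAR ("foo")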

# TODO: support for BOM within a stream.
# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END

# FIRST sets:
# stream: { STREAM-START }
# explicit_document: { DIRECTIVE DOCUMENT-START }
# implicit_document: FIRST(block_node)
# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_sequence: { BLOCK-SEQUENCE-START }
# block_mapping: { BLOCK-MAPPING-START }
# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { BLOCK-ENTRY }
# flow_sequence: { FLOW-SEQUENCE-START }
# flow_mapping: { FLOW-MAPPING-START }
# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
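
# Checking the next token against these FIRST sets selects the production to
# expand; for example, a FLOW-SEQUENCE-START token seen in parse_node picks
# the flow_sequence branch.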

__all__ = ['Parser', 'ParserError']

from error import MarkedYAMLError
from tokens import *
from events import *
from scanner import *

class ParserError(MarkedYAMLError):
    pass

class Parser:
    # Since writing a recursive-descent parser is a straightforward task, we
    # do not give many comments here.
    # Note that we use Python generators. If you rewrite the parser in another
    # language, you may replace all 'yield'-s with event handler calls.

    DEFAULT_TAGS = {
        u'!': u'!',
        u'!!': u'tag:yaml.org,2002:',
    }

    def __init__(self):
        self.current_event = None
        self.yaml_version = None
        self.tag_handles = {}
        self.event_generator = self.parse_stream()

    def check_event(self, *choices):
        # Check the type of the next event.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        if self.current_event is not None:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek_event(self):
        # Get the next event.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        return self.current_event

    def get_event(self):
        # Get the next event.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol.
        return self.event_generator
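
    # The three accessors above form the pull interface used by the rest of
    # the library. A hypothetical caller (event classes come from events.py):
    #
    #   if parser.check_event(ScalarEvent):
    #       event = parser.get_event()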

    def parse_stream(self):
        # STREAM-START implicit_document? explicit_document* STREAM-END

        # Parse start of stream.
        token = self.get_token()
        yield StreamStartEvent(token.start_mark, token.end_mark,
                encoding=token.encoding)

        # Parse implicit document.
        if not self.check_token(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            yield DocumentStartEvent(start_mark, end_mark,
                    explicit=False)
            for event in self.parse_block_node():
                yield event
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            explicit = False
            while self.check_token(DocumentEndToken):
                token = self.get_token()
                end_mark = token.end_mark
                explicit = True
            yield DocumentEndEvent(start_mark, end_mark,
                    explicit=explicit)

        # Parse explicit documents.
        while not self.check_token(StreamEndToken):
            token = self.peek_token()
            start_mark = token.start_mark
            version, tags = self.process_directives()
            if not self.check_token(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.peek_token().id,
                        self.peek_token().start_mark)
            token = self.get_token()
            end_mark = token.end_mark
            yield DocumentStartEvent(start_mark, end_mark,
                    explicit=True, version=version, tags=tags)
            if self.check_token(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                yield self.process_empty_scalar(token.end_mark)
            else:
                for event in self.parse_block_node():
                    yield event
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            explicit = False
            while self.check_token(DocumentEndToken):
                token = self.get_token()
                end_mark = token.end_mark
                explicit = True
            yield DocumentEndEvent(start_mark, end_mark,
                    explicit=explicit)

        # Parse end of stream.
        token = self.get_token()
        yield StreamEndEvent(token.start_mark, token.end_mark)
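
    # For example, the stream "--- foo\n--- bar\n" yields roughly:
    #   StreamStart, DocumentStart, Scalar(foo), DocumentEnd,
    #   DocumentStart, Scalar(bar), DocumentEnd, StreamEnd.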

    def process_directives(self):
        # DIRECTIVE*
        self.yaml_version = None
        self.tag_handles = {}
        while self.check_token(DirectiveToken):
            token = self.get_token()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        if self.tag_handles:
            value = self.yaml_version, self.tag_handles.copy()
        else:
            value = self.yaml_version, None
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]
        return value
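
    # For example, the directive "%TAG !e! tag:example.com,2000:app/" adds
    # {u'!e!': u'tag:example.com,2000:app/'} to tag_handles, so a node
    # tagged "!e!foo" later resolves to u'tag:example.com,2000:app/foo'.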

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node ::= ALIAS | properties? block_content
        # flow_node ::= ALIAS | properties? flow_content
        # properties ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content ::= block_collection | flow_collection | SCALAR
        # flow_content ::= flow_collection | SCALAR
        # block_collection ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.check_token(AliasToken):
            token = self.get_token()
            yield AliasEvent(token.value, token.start_mark, token.end_mark)
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            if self.check_token(AnchorToken):
                token = self.get_token()
                start_mark = token.start_mark
                end_mark = token.end_mark
                anchor = token.value
                if self.check_token(TagToken):
                    token = self.get_token()
                    tag_mark = token.start_mark
                    end_mark = token.end_mark
                    tag = token.value
            elif self.check_token(TagToken):
                token = self.get_token()
                start_mark = tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
                if self.check_token(AnchorToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    anchor = token.value
            if tag is not None and tag != u'!':
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            #if tag == u'!':
            #    raise ParserError("while parsing a node", start_mark,
            #            "found non-specific tag '!'", tag_mark,
            #            "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
            if start_mark is None:
                start_mark = end_mark = self.peek_token().start_mark
            event = None
            collection_events = None
            implicit = (tag is None or tag == u'!')
            if indentless_sequence and self.check_token(BlockEntryToken):
                end_mark = self.peek_token().end_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.check_token(ScalarToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    if (token.plain and tag is None) or tag == u'!':
                        implicit = (True, False)
                    elif tag is None:
                        implicit = (False, True)
                    else:
                        implicit = (False, False)
                    event = ScalarEvent(anchor, tag, implicit, token.value,
                            start_mark, end_mark, style=token.style)
                elif self.check_token(FlowSequenceStartToken):
                    end_mark = self.peek_token().end_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    collection_events = self.parse_flow_sequence()
                elif self.check_token(FlowMappingStartToken):
                    end_mark = self.peek_token().end_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    collection_events = self.parse_flow_mapping()
                elif block and self.check_token(BlockSequenceStartToken):
                    end_mark = self.peek_token().start_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    collection_events = self.parse_block_sequence()
                elif block and self.check_token(BlockMappingStartToken):
                    end_mark = self.peek_token().start_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    collection_events = self.parse_block_mapping()
                elif anchor is not None or tag is not None:
                    # Empty scalars are allowed even if a tag or an anchor is
                    # specified.
                    event = ScalarEvent(anchor, tag, (implicit, False), u'',
                            start_mark, end_mark)
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.peek_token()
                    raise ParserError("while scanning a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event
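
    # Tag resolution example: the scanner delivers "!e!foo" as the pair
    # (u'!e!', u'foo'), which the code above expands via tag_handles to
    # u'tag:example.com,2000:app/foo'; a verbatim tag "!<...>" arrives as
    # (None, suffix) and is used as-is.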

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.get_token()
        start_mark = token.start_mark
        while self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while scanning a block collection", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        yield SequenceEndEvent(token.start_mark, token.end_mark)
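
    # For example, "- a\n- b\n" yields Scalar(a) and Scalar(b) between
    # SequenceStart and SequenceEnd; a bare "-" entry produces an empty
    # scalar via process_empty_scalar.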

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        while self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        token = self.peek_token()
        yield SequenceEndEvent(token.start_mark, token.start_mark)
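
    # An indentless sequence sits at the same indentation as its mapping
    # key, e.g.:
    #   key:
    #   - item1
    #   - item2
    # The scanner emits no BLOCK-SEQUENCE-START here, so parse_node
    # dispatches on a bare BLOCK-ENTRY token instead.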

    def parse_block_mapping(self):
        # BLOCK-MAPPING-START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.get_token()
        start_mark = token.start_mark
        while self.check_token(KeyToken, ValueToken):
            if self.check_token(KeyToken):
                token = self.get_token()
                if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            if self.check_token(ValueToken):
                token = self.get_token()
                if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            else:
                token = self.peek_token()
                yield self.process_empty_scalar(token.start_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while scanning a block mapping", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        yield MappingEndEvent(token.start_mark, token.end_mark)
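
    # For example, the document "key:\n" yields Scalar(key) followed by an
    # empty scalar value, so every KEY still gets a matching value event.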

    def parse_flow_sequence(self):
        # flow_sequence ::= FLOW-SEQUENCE-START
        #                   (flow_sequence_entry FLOW-ENTRY)*
        #                   flow_sequence_entry?
        #                   FLOW-SEQUENCE-END
        # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?

        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generates an inline mapping (set syntax).
        token = self.get_token()
        start_mark = token.start_mark
        while not self.check_token(FlowSequenceEndToken):
            if self.check_token(KeyToken):
                token = self.get_token()
                yield MappingStartEvent(None, None, True,
                        token.start_mark, token.end_mark,
                        flow_style=True)
                if not self.check_token(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.check_token(ValueToken):
                    token = self.get_token()
                    if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.peek_token()
                    yield self.process_empty_scalar(token.start_mark)
                token = self.peek_token()
                yield MappingEndEvent(token.start_mark, token.start_mark)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                token = self.peek_token()
                raise ParserError("while scanning a flow sequence", start_mark,
                        "expected ',' or ']', but got %r" % token.id, token.start_mark)
            if self.check_token(FlowEntryToken):
                self.get_token()
        token = self.get_token()
        yield SequenceEndEvent(token.start_mark, token.end_mark)
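
    # For example, "[a: b, c]" wraps the first entry in an inline
    # single-pair mapping: SequenceStart, MappingStart, Scalar(a),
    # Scalar(b), MappingEnd, Scalar(c), SequenceEnd.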

    def parse_flow_mapping(self):
        # flow_mapping ::= FLOW-MAPPING-START
        #                  (flow_mapping_entry FLOW-ENTRY)*
        #                  flow_mapping_entry?
        #                  FLOW-MAPPING-END
        # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.get_token()
        start_mark = token.start_mark
        while not self.check_token(FlowMappingEndToken):
            if self.check_token(KeyToken):
                token = self.get_token()
                if not self.check_token(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.check_token(ValueToken):
                    token = self.get_token()
                    if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.peek_token()
                    yield self.process_empty_scalar(token.start_mark)
            else:
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.peek_token().start_mark)
            if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                token = self.peek_token()
                raise ParserError("while scanning a flow mapping", start_mark,
                        "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.check_token(FlowEntryToken):
                self.get_token()
        if not self.check_token(FlowMappingEndToken):
            token = self.peek_token()
            raise ParserError("while scanning a flow mapping", start_mark,
                    "expected '}', but found %r" % token.id, token.start_mark)
        token = self.get_token()
        yield MappingEndEvent(token.start_mark, token.end_mark)
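
    # For example, "{a: b, c}" sends the lone entry "c" through the else
    # branch above, pairing it with an empty scalar value: MappingStart,
    # Scalar(a), Scalar(b), Scalar(c), Scalar(''), MappingEnd.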

    def process_empty_scalar(self, mark):
        return ScalarEvent(None, None, (True, False), u'', mark, mark)
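
# A minimal usage sketch, assuming the surrounding yaml package: Parser is
# mixed with Reader and Scanner inside a Loader, and the package-level
# helper yaml.parse() drives it ('example.yaml' is a hypothetical file):
#
#   import yaml
#   for event in yaml.parse(open('example.yaml')):
#       print event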