2 # YAML can be parsed by an LL(1) parser!
4 # We use the following production rules:
5 # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
6 # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7 # implicit_document ::= block_node DOCUMENT-END?
8 # block_node ::= ALIAS | properties? block_content
9 # flow_node ::= ALIAS | properties? flow_content
10 # properties ::= TAG ANCHOR? | ANCHOR TAG?
11 # block_content ::= block_collection | flow_collection | SCALAR
12 # flow_content ::= flow_collection | SCALAR
13 # block_collection ::= block_sequence | block_mapping
14 # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
15 # block_mapping ::= BLOCK-MAPPING-START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16 # block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
17 # indentless_block_sequence ::= (BLOCK-ENTRY block_node?)+
18 # flow_collection ::= flow_sequence | flow_mapping
19 # flow_sequence ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20 # flow_mapping ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21 # flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22 # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
24 # TODO: support for BOM within a stream.
25 # stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END
28 # stream: { STREAM-START }
29 # explicit_document: { DIRECTIVE DOCUMENT-START }
30 # implicit_document: FIRST(block_node)
31 # block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
32 # flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
33 # block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
34 # flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
35 # block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
36 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
37 # block_sequence: { BLOCK-SEQUENCE-START }
38 # block_mapping: { BLOCK-MAPPING-START }
39 # block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
40 # indentless_sequence: { BLOCK-ENTRY }
41 # flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
42 # flow_sequence: { FLOW-SEQUENCE-START }
43 # flow_mapping: { FLOW-MAPPING-START }
44 # flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
45 # flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
47 __all__
= ['Parser', 'ParserError']
49 from error
import MarkedYAMLError
class ParserError(MarkedYAMLError):
    """Raised when the token stream violates the YAML grammar.

    Carries context/problem marks from MarkedYAMLError so messages can
    point at the offending position in the input.
    """
    # NOTE(review): the class body was lost in this extract; it declares
    # no members beyond the MarkedYAMLError base -- confirm upstream.
    pass
58 # Since writing a recursive-descent parser is a straightforward task, we
59 # do not give many comments here.
60 # Note that we use Python generators. If you rewrite the parser in another
61 # language, you may replace all 'yield'-s with event handler calls.
65 u
'!!': u
'tag:yaml.org,2002:',
69 self
.current_event
= None
70 self
.yaml_version
= None
72 self
.event_generator
= self
.parse_stream()
def check_event(self, *choices):
    """Check the type of the next event without consuming it.

    Returns True if an event is available and (when *choices* are
    given) it is an instance of one of them; with no *choices*,
    returns True whenever any event remains.  The event stays
    buffered in self.current_event.
    """
    # NOTE(review): the try/except wrapper and return statements were
    # dropped by extraction and restored here -- verify against upstream.
    if self.current_event is None:
        try:
            self.current_event = self.event_generator.next()
        except StopIteration:
            # End of the event stream: leave current_event as None.
            pass
    if self.current_event is not None:
        if not choices:
            return True
        for choice in choices:
            if isinstance(self.current_event, choice):
                return True
    return False
91 if self
.current_event
is None:
93 self
.current_event
= self
.event_generator
.next()
96 return self
.current_event
100 if self
.current_event
is None:
102 self
.current_event
= self
.event_generator
.next()
103 except StopIteration:
105 value
= self
.current_event
106 self
.current_event
= None
111 return self
.event_generator
def parse_stream(self):
    """Generate events for a whole stream.

    stream ::= STREAM-START implicit_document? explicit_document* STREAM-END

    Yields StreamStartEvent, then Document*/node events for each
    document, then StreamEndEvent.  Raises ParserError when an
    explicit document is not introduced by DOCUMENT-START.
    """
    # NOTE(review): dropped 'else:'/'yield event'/'explicit=' lines were
    # restored from the visible control flow -- verify against upstream.

    # Parse start of stream.
    token = self.get_token()
    yield StreamStartEvent(token.start_mark, token.end_mark,
            encoding=token.encoding)

    # Parse implicit document (no '---', no directives).
    if not self.check_token(DirectiveToken, DocumentStartToken,
            StreamEndToken):
        self.tag_handles = self.DEFAULT_TAGS
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        yield DocumentStartEvent(start_mark, end_mark,
                explicit=False)
        for event in self.parse_block_node():
            yield event
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        explicit = False
        while self.check_token(DocumentEndToken):
            token = self.get_token()
            end_mark = token.end_mark
            explicit = True
        yield DocumentEndEvent(start_mark, end_mark,
                explicit=explicit)

    # Parse explicit documents.
    while not self.check_token(StreamEndToken):
        token = self.peek_token()
        start_mark = token.start_mark
        version, tags = self.process_directives()
        if not self.check_token(DocumentStartToken):
            raise ParserError(None, None,
                    "expected '<document start>', but found %r"
                    % self.peek_token().id,
                    self.peek_token().start_mark)
        token = self.get_token()
        end_mark = token.end_mark
        yield DocumentStartEvent(start_mark, end_mark,
                explicit=True, version=version, tags=tags)
        # An empty document body produces an empty scalar.
        if self.check_token(DirectiveToken,
                DocumentStartToken, DocumentEndToken, StreamEndToken):
            yield self.process_empty_scalar(token.end_mark)
        else:
            for event in self.parse_block_node():
                yield event
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        explicit = False
        while self.check_token(DocumentEndToken):
            token = self.get_token()
            end_mark = token.end_mark
            explicit = True
        yield DocumentEndEvent(start_mark, end_mark,
                explicit=explicit)

    # Parse end of stream.
    token = self.get_token()
    yield StreamEndEvent(token.start_mark, token.end_mark)
def process_directives(self):
    """Consume DIRECTIVE* tokens preceding a document.

    Records the %YAML version and %TAG handle mappings, fills in the
    default tag handles, and returns a (version, tags) pair where
    tags is a copy of the explicit handle map or None if no %TAG
    directive appeared.  Raises ParserError on duplicate directives
    or an unsupported major version.
    """
    # NOTE(review): dropped 'if major != 1:', 'else:' and 'return value'
    # lines were restored here -- verify against upstream.
    self.yaml_version = None
    self.tag_handles = {}
    while self.check_token(DirectiveToken):
        token = self.get_token()
        if token.name == u'YAML':
            if self.yaml_version is not None:
                raise ParserError(None, None,
                        "found duplicate YAML directive", token.start_mark)
            major, minor = token.value
            if major != 1:
                raise ParserError(None, None,
                        "found incompatible YAML document (version 1.* is required)",
                        token.start_mark)
            self.yaml_version = token.value
        elif token.name == u'TAG':
            handle, prefix = token.value
            if handle in self.tag_handles:
                raise ParserError(None, None,
                        "duplicate tag handle %r" % handle.encode('utf-8'),
                        token.start_mark)
            self.tag_handles[handle] = prefix
    # Report only explicitly declared handles to the caller.
    if self.tag_handles:
        value = self.yaml_version, self.tag_handles.copy()
    else:
        value = self.yaml_version, None
    # Default handles ('!', '!!') apply unless overridden.
    for key in self.DEFAULT_TAGS:
        if key not in self.tag_handles:
            self.tag_handles[key] = self.DEFAULT_TAGS[key]
    return value
def parse_block_node(self):
    """Parse a node in block context (block collections permitted)."""
    return self.parse_node(block=True)
def parse_flow_node(self):
    """Parse a node in flow context (flow collections and scalars only)."""
    return self.parse_node()
def parse_block_node_or_indentless_sequence(self):
    """Parse a block node, additionally allowing an indentless '-' sequence."""
    return self.parse_node(block=True, indentless_sequence=True)
def parse_node(self, block=False, indentless_sequence=False):
    """Parse a single node and yield its event(s).

    block_node ::= ALIAS | properties? block_content
    flow_node  ::= ALIAS | properties? flow_content
    properties ::= TAG ANCHOR? | ANCHOR TAG?
    block_content ::= block_collection | flow_collection | SCALAR
    flow_content  ::= flow_collection | SCALAR
    block_node_or_indentless_sequence ::= ALIAS | properties?
                                          (block_content | indentless_block_sequence)

    Yields an AliasEvent, or a node-start event followed (for
    collections) by the events of the nested collection parser.
    Raises ParserError on undefined tag handles or missing content.
    """
    # NOTE(review): dropped anchor/tag assignments, 'else:' branches and
    # trailing yields were restored from the visible structure -- verify
    # against upstream.
    if self.check_token(AliasToken):
        token = self.get_token()
        yield AliasEvent(token.value, token.start_mark, token.end_mark)
    else:
        anchor = None
        tag = None
        start_mark = end_mark = tag_mark = None
        # properties ::= TAG ANCHOR? | ANCHOR TAG?
        if self.check_token(AnchorToken):
            token = self.get_token()
            start_mark = token.start_mark
            end_mark = token.end_mark
            anchor = token.value
            if self.check_token(TagToken):
                token = self.get_token()
                tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
        elif self.check_token(TagToken):
            token = self.get_token()
            start_mark = tag_mark = token.start_mark
            end_mark = token.end_mark
            tag = token.value
            if self.check_token(AnchorToken):
                token = self.get_token()
                end_mark = token.end_mark
                anchor = token.value
        if tag is not None and tag != u'!':
            # Tag tokens carry a (handle, suffix) pair; resolve the handle
            # through the handle map declared by %TAG directives.
            handle, suffix = tag
            if handle is not None:
                if handle not in self.tag_handles:
                    raise ParserError("while parsing a node", start_mark,
                            "found undefined tag handle %r" % handle.encode('utf-8'),
                            tag_mark)
                tag = self.tag_handles[handle]+suffix
            else:
                tag = suffix
        #raise ParserError("while parsing a node", start_mark,
        #        "found non-specific tag '!'", tag_mark,
        #        "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
        if start_mark is None:
            start_mark = end_mark = self.peek_token().start_mark
        event = None
        collection_events = None
        # A missing or '!' tag means the resolver may pick the tag.
        implicit = (tag is None or tag == u'!')
        if indentless_sequence and self.check_token(BlockEntryToken):
            end_mark = self.peek_token().end_mark
            event = SequenceStartEvent(anchor, tag, implicit,
                    start_mark, end_mark)
            collection_events = self.parse_indentless_sequence()
        else:
            if self.check_token(ScalarToken):
                token = self.get_token()
                end_mark = token.end_mark
                # implicit is a (plain-ok, non-plain-ok) resolution pair.
                if (token.plain and tag is None) or tag == u'!':
                    implicit = (True, False)
                elif tag is None:
                    implicit = (False, True)
                else:
                    implicit = (False, False)
                event = ScalarEvent(anchor, tag, implicit, token.value,
                        start_mark, end_mark, style=token.style)
            elif self.check_token(FlowSequenceStartToken):
                end_mark = self.peek_token().end_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark, flow_style=True)
                collection_events = self.parse_flow_sequence()
            elif self.check_token(FlowMappingStartToken):
                end_mark = self.peek_token().end_mark
                event = MappingStartEvent(anchor, tag, implicit,
                        start_mark, end_mark, flow_style=True)
                collection_events = self.parse_flow_mapping()
            elif block and self.check_token(BlockSequenceStartToken):
                end_mark = self.peek_token().start_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark, flow_style=False)
                collection_events = self.parse_block_sequence()
            elif block and self.check_token(BlockMappingStartToken):
                end_mark = self.peek_token().start_mark
                event = MappingStartEvent(anchor, tag, implicit,
                        start_mark, end_mark, flow_style=False)
                collection_events = self.parse_block_mapping()
            elif anchor is not None or tag is not None:
                # Empty scalars are allowed even if a tag or an anchor is
                # specified.
                event = ScalarEvent(anchor, tag, (implicit, False), u'',
                        start_mark, end_mark)
            else:
                if block:
                    node = 'block'
                else:
                    node = 'flow'
                token = self.peek_token()
                raise ParserError("while scanning a %s node" % node, start_mark,
                        "expected the node content, but found %r" % token.id,
                        token.start_mark)
        yield event
        if collection_events is not None:
            for event in collection_events:
                yield event
def parse_block_sequence(self):
    """Parse the entries of a block sequence.

    BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END

    Yields one node (or empty scalar) per entry, then SequenceEndEvent.
    The matching SequenceStartEvent is emitted by parse_node().
    Raises ParserError when BLOCK-END is missing.
    """
    # NOTE(review): dropped 'yield event'/'else:' lines were restored --
    # verify against upstream.
    token = self.get_token()
    start_mark = token.start_mark
    while self.check_token(BlockEntryToken):
        token = self.get_token()
        if not self.check_token(BlockEntryToken, BlockEndToken):
            for event in self.parse_block_node():
                yield event
        else:
            # '-' with no content: the entry is an empty scalar.
            yield self.process_empty_scalar(token.end_mark)
    if not self.check_token(BlockEndToken):
        token = self.peek_token()
        raise ParserError("while scanning a block collection", start_mark,
                "expected <block end>, but found %r" % token.id, token.start_mark)
    token = self.get_token()
    yield SequenceEndEvent(token.start_mark, token.end_mark)
def parse_indentless_sequence(self):
    """Parse an indentless block sequence (inside a block mapping).

    (BLOCK-ENTRY block_node?)+

    Yields one node (or empty scalar) per '-' entry, then a
    SequenceEndEvent whose marks are zero-width at the next token.
    """
    # NOTE(review): dropped 'yield event'/'else:' lines were restored --
    # verify against upstream.
    while self.check_token(BlockEntryToken):
        token = self.get_token()
        if not self.check_token(BlockEntryToken,
                KeyToken, ValueToken, BlockEndToken):
            for event in self.parse_block_node():
                yield event
        else:
            yield self.process_empty_scalar(token.end_mark)
    # No closing token: the sequence ends where the next token begins.
    token = self.peek_token()
    yield SequenceEndEvent(token.start_mark, token.start_mark)
def parse_block_mapping(self):
    """Parse the entries of a block mapping.

    BLOCK-MAPPING-START
      ((KEY block_node_or_indentless_sequence?)?
       (VALUE block_node_or_indentless_sequence?)?)*
    BLOCK-END

    Yields alternating key/value nodes (empty scalars where either
    side is omitted), then MappingEndEvent.  Raises ParserError when
    BLOCK-END is missing.
    """
    # NOTE(review): dropped 'yield event'/'else:' lines were restored --
    # verify against upstream.
    token = self.get_token()
    start_mark = token.start_mark
    while self.check_token(KeyToken, ValueToken):
        if self.check_token(KeyToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node_or_indentless_sequence():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node_or_indentless_sequence():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        else:
            # Key with no ':' value: pair it with an empty scalar.
            token = self.peek_token()
            yield self.process_empty_scalar(token.start_mark)
    if not self.check_token(BlockEndToken):
        token = self.peek_token()
        raise ParserError("while scanning a block mapping", start_mark,
                "expected <block end>, but found %r" % token.id, token.start_mark)
    token = self.get_token()
    yield MappingEndEvent(token.start_mark, token.end_mark)
def parse_flow_sequence(self):
    """Parse the entries of a flow sequence '[ ... ]'.

    flow_sequence ::= FLOW-SEQUENCE-START
                      (flow_sequence_entry FLOW-ENTRY)*
                      flow_sequence_entry?
                      FLOW-SEQUENCE-END
    flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    Note that while production rules for both flow_sequence_entry and
    flow_mapping_entry are equal, their interpretations are different.
    For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
    generate an inline mapping (set syntax).

    Yields entry events then SequenceEndEvent; raises ParserError on a
    missing ',' or ']'.
    """
    # NOTE(review): dropped 'yield event'/'else:'/get_token lines were
    # restored -- verify against upstream.
    token = self.get_token()
    start_mark = token.start_mark
    while not self.check_token(FlowSequenceEndToken):
        if self.check_token(KeyToken):
            # '?' entry: a single-pair inline mapping inside the sequence.
            token = self.get_token()
            yield MappingStartEvent(None, None, True,
                    token.start_mark, token.end_mark,
                    flow_style=True)
            if not self.check_token(ValueToken,
                    FlowEntryToken, FlowSequenceEndToken):
                for event in self.parse_flow_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
            if self.check_token(ValueToken):
                token = self.get_token()
                if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            else:
                token = self.peek_token()
                yield self.process_empty_scalar(token.start_mark)
            token = self.peek_token()
            yield MappingEndEvent(token.start_mark, token.start_mark)
        else:
            for event in self.parse_flow_node():
                yield event
        if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
            token = self.peek_token()
            raise ParserError("while scanning a flow sequence", start_mark,
                    "expected ',' or ']', but got %r" % token.id, token.start_mark)
        if self.check_token(FlowEntryToken):
            self.get_token()
    token = self.get_token()
    yield SequenceEndEvent(token.start_mark, token.end_mark)
def parse_flow_mapping(self):
    """Parse the entries of a flow mapping '{ ... }'.

    flow_mapping ::= FLOW-MAPPING-START
                     (flow_mapping_entry FLOW-ENTRY)*
                     flow_mapping_entry?
                     FLOW-MAPPING-END
    flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    Yields alternating key/value events then MappingEndEvent; raises
    ParserError on a missing ',' or '}'.
    """
    # NOTE(review): dropped 'yield event'/'else:'/get_token lines were
    # restored; the trailing "'}'" check is placed after the loop (any
    # in-loop placement would reject multi-entry mappings) -- verify
    # against upstream.
    token = self.get_token()
    start_mark = token.start_mark
    while not self.check_token(FlowMappingEndToken):
        if self.check_token(KeyToken):
            token = self.get_token()
            if not self.check_token(ValueToken,
                    FlowEntryToken, FlowMappingEndToken):
                for event in self.parse_flow_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
            if self.check_token(ValueToken):
                token = self.get_token()
                if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            else:
                token = self.peek_token()
                yield self.process_empty_scalar(token.start_mark)
        else:
            # A bare node is a key whose value is an empty scalar.
            for event in self.parse_flow_node():
                yield event
            yield self.process_empty_scalar(self.peek_token().start_mark)
        if not self.check_token(FlowEntryToken, FlowMappingEndToken):
            token = self.peek_token()
            raise ParserError("while scanning a flow mapping", start_mark,
                    "expected ',' or '}', but got %r" % token.id, token.start_mark)
        if self.check_token(FlowEntryToken):
            self.get_token()
    # Defensive: the loop only exits at FLOW-MAPPING-END.
    if not self.check_token(FlowMappingEndToken):
        token = self.peek_token()
        raise ParserError("while scanning a flow mapping", start_mark,
                "expected '}', but found %r" % token.id, token.start_mark)
    token = self.get_token()
    yield MappingEndEvent(token.start_mark, token.end_mark)
def process_empty_scalar(self, mark):
    """Build a zero-length plain scalar event anchored at *mark*."""
    return ScalarEvent(None, None, (True, False), u'', mark, mark)