2 # Scanner produces tokens of the following types:
5 # DIRECTIVE(name, value)
22 # SCALAR(value, plain, style)
24 # Read comments in the Scanner code for more details.
27 __all__
= ['Scanner', 'ScannerError']
29 from error
import MarkedYAMLError
class ScannerError(MarkedYAMLError):
    """Error raised by the Scanner; carries context/problem marks."""
    pass
class SimpleKey(object):
    """Record of a potential simple key.

    See the simple keys treatment notes in Scanner below.  Stores the
    token number of the candidate KEY token, whether the key is required
    at this position, and the stream position (index, line, column, mark)
    where it starts.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.mark = mark
46 class Scanner(object):
49 """Initialize the scanner."""
50 # It is assumed that Scanner and Reader will have a common descendant.
51 # Reader do the dirty work of checking for BOM and converting the
52 # input data to Unicode. It also adds NUL to the end.
54 # Reader supports the following methods
55 # self.peek(i=0) # peek the next i-th character
56 # self.prefix(l=1) # peek the next l characters
57 # self.forward(l=1) # read the next l characters and move the pointer.
59 # Had we reached the end of the stream?
62 # The number of unclosed '{' and '['. `flow_level == 0` means block
66 # List of processed tokens that are not yet emitted.
69 # Add the STREAM-START token.
70 self
.fetch_stream_start()
72 # Number of tokens that were emitted through the `get_token` method.
75 # The current indentation level.
78 # Past indentation levels.
81 # Variables related to simple keys treatment.
83 # A simple key is a key that is not denoted by the '?' indicator.
84 # Example of simple keys:
86 # block simple key: value
88 # : { flow simple key: value }
89 # We emit the KEY token before all keys, so when we find a potential
90 # simple key, we try to locate the corresponding ':' indicator.
91 # Simple keys should be limited to a single line and 1024 characters.
93 # Can a simple key start at the current position? A simple key may
95 # - at the beginning of the line, not counting indentation spaces
97 # - after '{', '[', ',' (in the flow context),
98 # - after '?', ':', '-' (in the block context).
99 # In the block context, this flag also signifies if a block collection
100 # may start at the current position.
101 self
.allow_simple_key
= True
103 # Keep track of possible simple keys. This is a dictionary. The key
104 # is `flow_level`; there can be no more that one possible simple key
105 # for each level. The value is a SimpleKey record:
106 # (token_number, required, index, line, column, mark)
107 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
108 # '[', or '{' tokens.
109 self
.possible_simple_keys
= {}
113 def check_token(self
, *choices
):
114 # Check if the next token is one of the given types.
115 while self
.need_more_tokens():
116 self
.fetch_more_tokens()
120 for choice
in choices
:
121 if isinstance(self
.tokens
[0], choice
):
125 def peek_token(self
):
126 # Return the next token, but do not delete if from the queue.
127 while self
.need_more_tokens():
128 self
.fetch_more_tokens()
130 return self
.tokens
[0]
133 # Return the next token.
134 while self
.need_more_tokens():
135 self
.fetch_more_tokens()
137 self
.tokens_taken
+= 1
138 return self
.tokens
.pop(0)
142 def need_more_tokens(self
):
147 # The current token may be a potential simple key, so we
148 # need to look further.
149 self
.stale_possible_simple_keys()
150 if self
.next_possible_simple_key() == self
.tokens_taken
:
153 def fetch_more_tokens(self
):
155 # Eat whitespaces and comments until we reach the next token.
156 self
.scan_to_next_token()
158 # Remove obsolete possible simple keys.
159 self
.stale_possible_simple_keys()
161 # Compare the current indentation and column. It may add some tokens
162 # and decrease the current indentation level.
163 self
.unwind_indent(self
.column
)
165 # Peek the next character.
168 # Is it the end of stream?
170 return self
.fetch_stream_end()
173 if ch
== u
'%' and self
.check_directive():
174 return self
.fetch_directive()
176 # Is it the document start?
177 if ch
== u
'-' and self
.check_document_start():
178 return self
.fetch_document_start()
180 # Is it the document end?
181 if ch
== u
'.' and self
.check_document_end():
182 return self
.fetch_document_end()
184 # TODO: support for BOM within a stream.
186 # return self.fetch_bom() <-- issue BOMToken
188 # Note: the order of the following checks is NOT significant.
190 # Is it the flow sequence start indicator?
192 return self
.fetch_flow_sequence_start()
194 # Is it the flow mapping start indicator?
196 return self
.fetch_flow_mapping_start()
198 # Is it the flow sequence end indicator?
200 return self
.fetch_flow_sequence_end()
202 # Is it the flow mapping end indicator?
204 return self
.fetch_flow_mapping_end()
206 # Is it the flow entry indicator?
208 return self
.fetch_flow_entry()
210 # Is it the block entry indicator?
211 if ch
== u
'-' and self
.check_block_entry():
212 return self
.fetch_block_entry()
214 # Is it the key indicator?
215 if ch
== u
'?' and self
.check_key():
216 return self
.fetch_key()
218 # Is it the value indicator?
219 if ch
== u
':' and self
.check_value():
220 return self
.fetch_value()
224 return self
.fetch_alias()
228 return self
.fetch_anchor()
232 return self
.fetch_tag()
234 # Is it a literal scalar?
235 if ch
== u
'|' and not self
.flow_level
:
236 return self
.fetch_literal()
238 # Is it a folded scalar?
239 if ch
== u
'>' and not self
.flow_level
:
240 return self
.fetch_folded()
242 # Is it a single quoted scalar?
244 return self
.fetch_single()
246 # Is it a double quoted scalar?
248 return self
.fetch_double()
250 # It must be a plain scalar then.
251 if self
.check_plain():
252 return self
.fetch_plain()
254 # No? It's an error. Let's produce a nice error message.
255 raise ScannerError("while scanning for the next token", None,
256 "found character %r that cannot start any token"
257 % ch
.encode('utf-8'), self
.get_mark())
259 # Simple keys treatment.
261 def next_possible_simple_key(self
):
262 # Return the number of the nearest possible simple key. Actually we
263 # don't need to loop through the whole dictionary. We may replace it
264 # with the following code:
265 # if not self.possible_simple_keys:
267 # return self.possible_simple_keys[
268 # min(self.possible_simple_keys.keys())].token_number
269 min_token_number
= None
270 for level
in self
.possible_simple_keys
:
271 key
= self
.possible_simple_keys
[level
]
272 if min_token_number
is None or key
.token_number
< min_token_number
:
273 min_token_number
= key
.token_number
274 return min_token_number
276 def stale_possible_simple_keys(self
):
277 # Remove entries that are no longer possible simple keys. According to
278 # the YAML specification, simple keys
279 # - should be limited to a single line,
280 # - should be no longer than 1024 characters.
281 # Disabling this procedure will allow simple keys of any length and
282 # height (may cause problems if indentation is broken though).
283 for level
in self
.possible_simple_keys
.keys():
284 key
= self
.possible_simple_keys
[level
]
285 if key
.line
!= self
.line \
286 or self
.index
-key
.index
> 1024:
288 raise ScannerError("while scanning a simple key", key
.mark
,
289 "could not found expected ':'", self
.get_mark())
290 del self
.possible_simple_keys
[level
]
292 def save_possible_simple_key(self
):
293 # The next token may start a simple key. We check if it's possible
294 # and save its position. This function is called for
295 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
297 # Check if a simple key is required at the current position.
298 required
= not self
.flow_level
and self
.indent
== self
.column
300 # A simple key is required only if it is the first token in the current
301 # line. Therefore it is always allowed.
302 assert self
.allow_simple_key
or not required
304 # The next token might be a simple key. Let's save it's number and
306 if self
.allow_simple_key
:
307 self
.remove_possible_simple_key()
308 token_number
= self
.tokens_taken
+len(self
.tokens
)
309 key
= SimpleKey(token_number
, required
,
310 self
.index
, self
.line
, self
.column
, self
.get_mark())
311 self
.possible_simple_keys
[self
.flow_level
] = key
313 def remove_possible_simple_key(self
):
314 # Remove the saved possible key position at the current flow level.
315 if self
.flow_level
in self
.possible_simple_keys
:
316 key
= self
.possible_simple_keys
[self
.flow_level
]
319 raise ScannerError("while scanning a simple key", key
.mark
,
320 "could not found expected ':'", self
.get_mark())
322 del self
.possible_simple_keys
[self
.flow_level
]
324 # Indentation functions.
326 def unwind_indent(self
, column
):
328 ## In flow context, tokens should respect indentation.
329 ## Actually the condition should be `self.indent >= column` according to
330 ## the spec. But this condition will prohibit intuitively correct
331 ## constructions such as
334 #if self.flow_level and self.indent > column:
335 # raise ScannerError(None, None,
336 # "invalid intendation or unclosed '[' or '{'",
339 # In the flow context, indentation is ignored. We make the scanner less
340 # restrictive then specification requires.
344 # In block context, we may need to issue the BLOCK-END tokens.
345 while self
.indent
> column
:
346 mark
= self
.get_mark()
347 self
.indent
= self
.indents
.pop()
348 self
.tokens
.append(BlockEndToken(mark
, mark
))
350 def add_indent(self
, column
):
351 # Check if we need to increase indentation.
352 if self
.indent
< column
:
353 self
.indents
.append(self
.indent
)
360 def fetch_stream_start(self
):
361 # We always add STREAM-START as the first token and STREAM-END as the
365 mark
= self
.get_mark()
368 self
.tokens
.append(StreamStartToken(mark
, mark
,
369 encoding
=self
.encoding
))
372 def fetch_stream_end(self
):
374 # Set the current intendation to -1.
375 self
.unwind_indent(-1)
377 # Reset everything (not really needed).
378 self
.allow_simple_key
= False
379 self
.possible_simple_keys
= {}
382 mark
= self
.get_mark()
385 self
.tokens
.append(StreamEndToken(mark
, mark
))
387 # The steam is finished.
390 def fetch_directive(self
):
392 # Set the current intendation to -1.
393 self
.unwind_indent(-1)
396 self
.remove_possible_simple_key()
397 self
.allow_simple_key
= False
399 # Scan and add DIRECTIVE.
400 self
.tokens
.append(self
.scan_directive())
402 def fetch_document_start(self
):
403 self
.fetch_document_indicator(DocumentStartToken
)
405 def fetch_document_end(self
):
406 self
.fetch_document_indicator(DocumentEndToken
)
408 def fetch_document_indicator(self
, TokenClass
):
410 # Set the current intendation to -1.
411 self
.unwind_indent(-1)
413 # Reset simple keys. Note that there could not be a block collection
415 self
.remove_possible_simple_key()
416 self
.allow_simple_key
= False
418 # Add DOCUMENT-START or DOCUMENT-END.
419 start_mark
= self
.get_mark()
421 end_mark
= self
.get_mark()
422 self
.tokens
.append(TokenClass(start_mark
, end_mark
))
424 def fetch_flow_sequence_start(self
):
425 self
.fetch_flow_collection_start(FlowSequenceStartToken
)
427 def fetch_flow_mapping_start(self
):
428 self
.fetch_flow_collection_start(FlowMappingStartToken
)
430 def fetch_flow_collection_start(self
, TokenClass
):
432 # '[' and '{' may start a simple key.
433 self
.save_possible_simple_key()
435 # Increase the flow level.
438 # Simple keys are allowed after '[' and '{'.
439 self
.allow_simple_key
= True
441 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
442 start_mark
= self
.get_mark()
444 end_mark
= self
.get_mark()
445 self
.tokens
.append(TokenClass(start_mark
, end_mark
))
447 def fetch_flow_sequence_end(self
):
448 self
.fetch_flow_collection_end(FlowSequenceEndToken
)
450 def fetch_flow_mapping_end(self
):
451 self
.fetch_flow_collection_end(FlowMappingEndToken
)
453 def fetch_flow_collection_end(self
, TokenClass
):
455 # Reset possible simple key on the current level.
456 self
.remove_possible_simple_key()
458 # Decrease the flow level.
461 # No simple keys after ']' or '}'.
462 self
.allow_simple_key
= False
464 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
465 start_mark
= self
.get_mark()
467 end_mark
= self
.get_mark()
468 self
.tokens
.append(TokenClass(start_mark
, end_mark
))
470 def fetch_flow_entry(self
):
472 # Simple keys are allowed after ','.
473 self
.allow_simple_key
= True
475 # Reset possible simple key on the current level.
476 self
.remove_possible_simple_key()
479 start_mark
= self
.get_mark()
481 end_mark
= self
.get_mark()
482 self
.tokens
.append(FlowEntryToken(start_mark
, end_mark
))
484 def fetch_block_entry(self
):
486 # Block context needs additional checks.
487 if not self
.flow_level
:
489 # Are we allowed to start a new entry?
490 if not self
.allow_simple_key
:
491 raise ScannerError(None, None,
492 "sequence entries are not allowed here",
495 # We may need to add BLOCK-SEQUENCE-START.
496 if self
.add_indent(self
.column
):
497 mark
= self
.get_mark()
498 self
.tokens
.append(BlockSequenceStartToken(mark
, mark
))
500 # It's an error for the block entry to occur in the flow context,
501 # but we let the parser detect this.
505 # Simple keys are allowed after '-'.
506 self
.allow_simple_key
= True
508 # Reset possible simple key on the current level.
509 self
.remove_possible_simple_key()
512 start_mark
= self
.get_mark()
514 end_mark
= self
.get_mark()
515 self
.tokens
.append(BlockEntryToken(start_mark
, end_mark
))
519 # Block context needs additional checks.
520 if not self
.flow_level
:
522 # Are we allowed to start a key (not nessesary a simple)?
523 if not self
.allow_simple_key
:
524 raise ScannerError(None, None,
525 "mapping keys are not allowed here",
528 # We may need to add BLOCK-MAPPING-START.
529 if self
.add_indent(self
.column
):
530 mark
= self
.get_mark()
531 self
.tokens
.append(BlockMappingStartToken(mark
, mark
))
533 # Simple keys are allowed after '?' in the block context.
534 self
.allow_simple_key
= not self
.flow_level
536 # Reset possible simple key on the current level.
537 self
.remove_possible_simple_key()
540 start_mark
= self
.get_mark()
542 end_mark
= self
.get_mark()
543 self
.tokens
.append(KeyToken(start_mark
, end_mark
))
545 def fetch_value(self
):
547 # Do we determine a simple key?
548 if self
.flow_level
in self
.possible_simple_keys
:
551 key
= self
.possible_simple_keys
[self
.flow_level
]
552 del self
.possible_simple_keys
[self
.flow_level
]
553 self
.tokens
.insert(key
.token_number
-self
.tokens_taken
,
554 KeyToken(key
.mark
, key
.mark
))
556 # If this key starts a new block mapping, we need to add
557 # BLOCK-MAPPING-START.
558 if not self
.flow_level
:
559 if self
.add_indent(key
.column
):
560 self
.tokens
.insert(key
.token_number
-self
.tokens_taken
,
561 BlockMappingStartToken(key
.mark
, key
.mark
))
563 # There cannot be two simple keys one after another.
564 self
.allow_simple_key
= False
566 # It must be a part of a complex key.
569 # Block context needs additional checks.
570 # (Do we really need them? They will be catched by the parser
572 if not self
.flow_level
:
574 # We are allowed to start a complex value if and only if
575 # we can start a simple key.
576 if not self
.allow_simple_key
:
577 raise ScannerError(None, None,
578 "mapping values are not allowed here",
581 # If this value starts a new block mapping, we need to add
582 # BLOCK-MAPPING-START. It will be detected as an error later by
584 if not self
.flow_level
:
585 if self
.add_indent(self
.column
):
586 mark
= self
.get_mark()
587 self
.tokens
.append(BlockMappingStartToken(mark
, mark
))
589 # Simple keys are allowed after ':' in the block context.
590 self
.allow_simple_key
= not self
.flow_level
592 # Reset possible simple key on the current level.
593 self
.remove_possible_simple_key()
596 start_mark
= self
.get_mark()
598 end_mark
= self
.get_mark()
599 self
.tokens
.append(ValueToken(start_mark
, end_mark
))
601 def fetch_alias(self
):
603 # ALIAS could be a simple key.
604 self
.save_possible_simple_key()
606 # No simple keys after ALIAS.
607 self
.allow_simple_key
= False
609 # Scan and add ALIAS.
610 self
.tokens
.append(self
.scan_anchor(AliasToken
))
612 def fetch_anchor(self
):
614 # ANCHOR could start a simple key.
615 self
.save_possible_simple_key()
617 # No simple keys after ANCHOR.
618 self
.allow_simple_key
= False
620 # Scan and add ANCHOR.
621 self
.tokens
.append(self
.scan_anchor(AnchorToken
))
625 # TAG could start a simple key.
626 self
.save_possible_simple_key()
628 # No simple keys after TAG.
629 self
.allow_simple_key
= False
632 self
.tokens
.append(self
.scan_tag())
634 def fetch_literal(self
):
635 self
.fetch_block_scalar(style
='|')
637 def fetch_folded(self
):
638 self
.fetch_block_scalar(style
='>')
640 def fetch_block_scalar(self
, style
):
642 # A simple key may follow a block scalar.
643 self
.allow_simple_key
= True
645 # Reset possible simple key on the current level.
646 self
.remove_possible_simple_key()
648 # Scan and add SCALAR.
649 self
.tokens
.append(self
.scan_block_scalar(style
))
651 def fetch_single(self
):
652 self
.fetch_flow_scalar(style
='\'')
654 def fetch_double(self
):
655 self
.fetch_flow_scalar(style
='"')
657 def fetch_flow_scalar(self
, style
):
659 # A flow scalar could be a simple key.
660 self
.save_possible_simple_key()
662 # No simple keys after flow scalars.
663 self
.allow_simple_key
= False
665 # Scan and add SCALAR.
666 self
.tokens
.append(self
.scan_flow_scalar(style
))
668 def fetch_plain(self
):
670 # A plain scalar could be a simple key.
671 self
.save_possible_simple_key()
673 # No simple keys after plain scalars. But note that `scan_plain` will
674 # change this flag if the scan is finished at the beginning of the
676 self
.allow_simple_key
= False
678 # Scan and add SCALAR. May change `allow_simple_key`.
679 self
.tokens
.append(self
.scan_plain())
683 def check_directive(self
):
685 # DIRECTIVE: ^ '%' ...
686 # The '%' indicator is already checked.
690 def check_document_start(self
):
692 # DOCUMENT-START: ^ '---' (' '|'\n')
694 if self
.prefix(3) == u
'---' \
695 and self
.peek(3) in u
'\0 \t\r\n\x85\u2028\u2029':
698 def check_document_end(self
):
700 # DOCUMENT-END: ^ '...' (' '|'\n')
702 if self
.prefix(3) == u
'...' \
703 and self
.peek(3) in u
'\0 \t\r\n\x85\u2028\u2029':
706 def check_block_entry(self
):
708 # BLOCK-ENTRY: '-' (' '|'\n')
709 return self
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
713 # KEY(flow context): '?'
717 # KEY(block context): '?' (' '|'\n')
719 return self
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
721 def check_value(self
):
723 # VALUE(flow context): ':'
727 # VALUE(block context): ':' (' '|'\n')
729 return self
.peek(1) in u
'\0 \t\r\n\x85\u2028\u2029'
731 def check_plain(self
):
733 # A plain scalar may start with any non-space character except:
734 # '-', '?', ':', ',', '[', ']', '{', '}',
735 # '#', '&', '*', '!', '|', '>', '\'', '\"',
738 # It may also start with
740 # if it is followed by a non-space character.
742 # Note that we limit the last rule to the block context (except the
743 # '-' character) because we want the flow context to be space
746 return ch
not in u
'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
747 or (self
.peek(1) not in u
'\0 \t\r\n\x85\u2028\u2029'
748 and (ch
== u
'-' or (not self
.flow_level
and ch
in u
'?:')))
752 def scan_to_next_token(self
):
753 # We ignore spaces, line breaks and comments.
754 # If we find a line break in the block context, we set the flag
755 # `allow_simple_key` on.
756 # The byte order mark is stripped if it's the first character in the
757 # stream. We do not yet support BOM inside the stream as the
758 # specification requires. Any such mark will be considered as a part
761 # TODO: We need to make tab handling rules more sane. A good rule is
762 # Tabs cannot precede tokens
763 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
764 # KEY(block), VALUE(block), BLOCK-ENTRY
765 # So the checking code is
767 # self.allow_simple_keys = False
768 # We also need to add the check for `allow_simple_keys == True` to
769 # `unwind_indent` before issuing BLOCK-END.
770 # Scanners for block, flow, and plain scalars need to be modified.
772 if self
.index
== 0 and self
.peek() == u
'\uFEFF':
776 while self
.peek() == u
' ':
778 if self
.peek() == u
'#':
779 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
781 if self
.scan_line_break():
782 if not self
.flow_level
:
783 self
.allow_simple_key
= True
787 def scan_directive(self
):
788 # See the specification for details.
789 start_mark
= self
.get_mark()
791 name
= self
.scan_directive_name(start_mark
)
794 value
= self
.scan_yaml_directive_value(start_mark
)
795 end_mark
= self
.get_mark()
797 value
= self
.scan_tag_directive_value(start_mark
)
798 end_mark
= self
.get_mark()
800 end_mark
= self
.get_mark()
801 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
803 self
.scan_directive_ignored_line(start_mark
)
804 return DirectiveToken(name
, value
, start_mark
, end_mark
)
806 def scan_directive_name(self
, start_mark
):
807 # See the specification for details.
809 ch
= self
.peek(length
)
810 while u
'0' <= ch
<= u
'9' or u
'A' <= ch
<= 'Z' or u
'a' <= ch
<= 'z' \
813 ch
= self
.peek(length
)
815 raise ScannerError("while scanning a directive", start_mark
,
816 "expected alphabetic or numeric character, but found %r"
817 % ch
.encode('utf-8'), self
.get_mark())
818 value
= self
.prefix(length
)
821 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
822 raise ScannerError("while scanning a directive", start_mark
,
823 "expected alphabetic or numeric character, but found %r"
824 % ch
.encode('utf-8'), self
.get_mark())
827 def scan_yaml_directive_value(self
, start_mark
):
828 # See the specification for details.
829 while self
.peek() == u
' ':
831 major
= self
.scan_yaml_directive_number(start_mark
)
832 if self
.peek() != '.':
833 raise ScannerError("while scanning a directive", start_mark
,
834 "expected a digit or '.', but found %r"
835 % self
.peek().encode('utf-8'),
838 minor
= self
.scan_yaml_directive_number(start_mark
)
839 if self
.peek() not in u
'\0 \r\n\x85\u2028\u2029':
840 raise ScannerError("while scanning a directive", start_mark
,
841 "expected a digit or ' ', but found %r"
842 % self
.peek().encode('utf-8'),
844 return (major
, minor
)
846 def scan_yaml_directive_number(self
, start_mark
):
847 # See the specification for details.
849 if not (u
'0' <= ch
<= '9'):
850 raise ScannerError("while scanning a directive", start_mark
,
851 "expected a digit, but found %r" % ch
.encode('utf-8'),
854 while u
'0' <= self
.peek(length
) <= u
'9':
856 value
= int(self
.prefix(length
))
860 def scan_tag_directive_value(self
, start_mark
):
861 # See the specification for details.
862 while self
.peek() == u
' ':
864 handle
= self
.scan_tag_directive_handle(start_mark
)
865 while self
.peek() == u
' ':
867 prefix
= self
.scan_tag_directive_prefix(start_mark
)
868 return (handle
, prefix
)
870 def scan_tag_directive_handle(self
, start_mark
):
871 # See the specification for details.
872 value
= self
.scan_tag_handle('directive', start_mark
)
875 raise ScannerError("while scanning a directive", start_mark
,
876 "expected ' ', but found %r" % ch
.encode('utf-8'),
880 def scan_tag_directive_prefix(self
, start_mark
):
881 # See the specification for details.
882 value
= self
.scan_tag_uri('directive', start_mark
)
884 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
885 raise ScannerError("while scanning a directive", start_mark
,
886 "expected ' ', but found %r" % ch
.encode('utf-8'),
890 def scan_directive_ignored_line(self
, start_mark
):
891 # See the specification for details.
892 while self
.peek() == u
' ':
894 if self
.peek() == u
'#':
895 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
898 if ch
not in u
'\0\r\n\x85\u2028\u2029':
899 raise ScannerError("while scanning a directive", start_mark
,
900 "expected a comment or a line break, but found %r"
901 % ch
.encode('utf-8'), self
.get_mark())
902 self
.scan_line_break()
904 def scan_anchor(self
, TokenClass
):
905 # The specification does not restrict characters for anchors and
906 # aliases. This may lead to problems, for instance, the document:
908 # can be interpteted in two ways, as
911 # [ *alias , "value" ]
912 # Therefore we restrict aliases to numbers and ASCII letters.
913 start_mark
= self
.get_mark()
914 indicator
= self
.peek()
921 ch
= self
.peek(length
)
922 while u
'0' <= ch
<= u
'9' or u
'A' <= ch
<= 'Z' or u
'a' <= ch
<= 'z' \
925 ch
= self
.peek(length
)
927 raise ScannerError("while scanning an %s" % name
, start_mark
,
928 "expected alphabetic or numeric character, but found %r"
929 % ch
.encode('utf-8'), self
.get_mark())
930 value
= self
.prefix(length
)
933 if ch
not in u
'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
934 raise ScannerError("while scanning an %s" % name
, start_mark
,
935 "expected alphabetic or numeric character, but found %r"
936 % ch
.encode('utf-8'), self
.get_mark())
937 end_mark
= self
.get_mark()
938 return TokenClass(value
, start_mark
, end_mark
)
941 # See the specification for details.
942 start_mark
= self
.get_mark()
947 suffix
= self
.scan_tag_uri('tag', start_mark
)
948 if self
.peek() != u
'>':
949 raise ScannerError("while parsing a tag", start_mark
,
950 "expected '>', but found %r" % self
.peek().encode('utf-8'),
953 elif ch
in u
'\0 \t\r\n\x85\u2028\u2029':
960 while ch
not in u
'\0 \r\n\x85\u2028\u2029':
965 ch
= self
.peek(length
)
968 handle
= self
.scan_tag_handle('tag', start_mark
)
972 suffix
= self
.scan_tag_uri('tag', start_mark
)
974 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
975 raise ScannerError("while scanning a tag", start_mark
,
976 "expected ' ', but found %r" % ch
.encode('utf-8'),
978 value
= (handle
, suffix
)
979 end_mark
= self
.get_mark()
980 return TagToken(value
, start_mark
, end_mark
)
982 def scan_block_scalar(self
, style
):
983 # See the specification for details.
991 start_mark
= self
.get_mark()
995 chomping
, increment
= self
.scan_block_scalar_indicators(start_mark
)
996 self
.scan_block_scalar_ignored_line(start_mark
)
998 # Determine the indentation level and go to the first non-empty line.
999 min_indent
= self
.indent
+1
1002 if increment
is None:
1003 breaks
, max_indent
, end_mark
= self
.scan_block_scalar_indentation()
1004 indent
= max(min_indent
, max_indent
)
1006 indent
= min_indent
+increment
-1
1007 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1010 # Scan the inner part of the block scalar.
1011 while self
.column
== indent
and self
.peek() != u
'\0':
1012 chunks
.extend(breaks
)
1013 leading_non_space
= self
.peek() not in u
' \t'
1015 while self
.peek(length
) not in u
'\0\r\n\x85\u2028\u2029':
1017 chunks
.append(self
.prefix(length
))
1018 self
.forward(length
)
1019 line_break
= self
.scan_line_break()
1020 breaks
, end_mark
= self
.scan_block_scalar_breaks(indent
)
1021 if self
.column
== indent
and self
.peek() != u
'\0':
1023 # Unfortunately, folding rules are ambiguous.
1025 # This is the folding according to the specification:
1027 if folded
and line_break
== u
'\n' \
1028 and leading_non_space
and self
.peek() not in u
' \t':
1032 chunks
.append(line_break
)
1034 # This is Clark Evans's interpretation (also in the spec
1037 #if folded and line_break == u'\n':
1039 # if self.peek() not in ' \t':
1040 # chunks.append(u' ')
1042 # chunks.append(line_break)
1044 # chunks.append(line_break)
1049 if chomping
is not False:
1050 chunks
.append(line_break
)
1051 if chomping
is True:
1052 chunks
.extend(breaks
)
1055 return ScalarToken(u
''.join(chunks
), False, start_mark
, end_mark
,
1058 def scan_block_scalar_indicators(self
, start_mark
):
1059 # See the specification for details.
1070 if ch
in u
'0123456789':
1073 raise ScannerError("while scanning a block scalar", start_mark
,
1074 "expected indentation indicator in the range 1-9, but found 0",
1077 elif ch
in u
'0123456789':
1080 raise ScannerError("while scanning a block scalar", start_mark
,
1081 "expected indentation indicator in the range 1-9, but found 0",
1092 if ch
not in u
'\0 \r\n\x85\u2028\u2029':
1093 raise ScannerError("while scanning a block scalar", start_mark
,
1094 "expected chomping or indentation indicators, but found %r"
1095 % ch
.encode('utf-8'), self
.get_mark())
1096 return chomping
, increment
1098 def scan_block_scalar_ignored_line(self
, start_mark
):
1099 # See the specification for details.
1100 while self
.peek() == u
' ':
1102 if self
.peek() == u
'#':
1103 while self
.peek() not in u
'\0\r\n\x85\u2028\u2029':
1106 if ch
not in u
'\0\r\n\x85\u2028\u2029':
1107 raise ScannerError("while scanning a block scalar", start_mark
,
1108 "expected a comment or a line break, but found %r"
1109 % ch
.encode('utf-8'), self
.get_mark())
1110 self
.scan_line_break()
1112 def scan_block_scalar_indentation(self
):
1113 # See the specification for details.
1116 end_mark
= self
.get_mark()
1117 while self
.peek() in u
' \r\n\x85\u2028\u2029':
1118 if self
.peek() != u
' ':
1119 chunks
.append(self
.scan_line_break())
1120 end_mark
= self
.get_mark()
1123 if self
.column
> max_indent
:
1124 max_indent
= self
.column
1125 return chunks
, max_indent
, end_mark
1127 def scan_block_scalar_breaks(self
, indent
):
1128 # See the specification for details.
1130 end_mark
= self
.get_mark()
1131 while self
.column
< indent
and self
.peek() == u
' ':
1133 while self
.peek() in u
'\r\n\x85\u2028\u2029':
1134 chunks
.append(self
.scan_line_break())
1135 end_mark
= self
.get_mark()
1136 while self
.column
< indent
and self
.peek() == u
' ':
1138 return chunks
, end_mark
1140 def scan_flow_scalar(self
, style
):
1141 # See the specification for details.
1142 # Note that we loose indentation rules for quoted scalars. Quoted
1143 # scalars don't need to adhere indentation because " and ' clearly
1144 # mark the beginning and the end of them. Therefore we are less
1145 # restrictive then the specification requires. We only need to check
1146 # that document separators are not included in scalars.
1152 start_mark
= self
.get_mark()
1155 chunks
.extend(self
.scan_flow_scalar_non_spaces(double
, start_mark
))
1156 while self
.peek() != quote
:
1157 chunks
.extend(self
.scan_flow_scalar_spaces(double
, start_mark
))
1158 chunks
.extend(self
.scan_flow_scalar_non_spaces(double
, start_mark
))
1160 end_mark
= self
.get_mark()
1161 return ScalarToken(u
''.join(chunks
), False, start_mark
, end_mark
,
1164 ESCAPE_REPLACEMENTS
= {
1190 def scan_flow_scalar_non_spaces(self
, double
, start_mark
):
1191 # See the specification for details.
1195 while self
.peek(length
) not in u
'\'\"\\\0 \t\r\n\x85\u2028\u2029':
1198 chunks
.append(self
.prefix(length
))
1199 self
.forward(length
)
1201 if not double
and ch
== u
'\'' and self
.peek(1) == u
'\'':
1202 chunks
.append(u
'\'')
1204 elif (double
and ch
== u
'\'') or (not double
and ch
in u
'\"\\'):
1207 elif double
and ch
== u
'\\':
1210 if ch
in self
.ESCAPE_REPLACEMENTS
:
1211 chunks
.append(self
.ESCAPE_REPLACEMENTS
[ch
])
1213 elif ch
in self
.ESCAPE_CODES
:
1214 length
= self
.ESCAPE_CODES
[ch
]
1216 for k
in range(length
):
1217 if self
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1218 raise ScannerError("while scanning a double-quoted scalar", start_mark
,
1219 "expected escape sequence of %d hexdecimal numbers, but found %r" %
1220 (length
, self
.peek(k
).encode('utf-8')), self
.get_mark())
1221 code
= int(self
.prefix(length
), 16)
1222 chunks
.append(unichr(code
))
1223 self
.forward(length
)
1224 elif ch
in u
'\r\n\x85\u2028\u2029':
1225 self
.scan_line_break()
1226 chunks
.extend(self
.scan_flow_scalar_breaks(double
, start_mark
))
1228 raise ScannerError("while scanning a double-quoted scalar", start_mark
,
1229 "found unknown escape character %r" % ch
.encode('utf-8'), self
.get_mark())
def scan_flow_scalar_spaces(self, double, start_mark):
    # Scan a run of spaces/tabs inside a quoted scalar and apply the
    # flow folding rules: a single break folds to a space, while
    # additional breaks are kept.
    # See the specification for details.
    chunks = []
    length = 0
    while self.peek(length) in u' \t':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch == u'\0':
        # The quote was never closed.
        raise ScannerError("while scanning a quoted scalar", start_mark,
                "found unexpected end of stream", self.get_mark())
    elif ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        breaks = self.scan_flow_scalar_breaks(double, start_mark)
        if line_break != u'\n':
            # Non-ASCII breaks are preserved literally.
            chunks.append(line_break)
        elif not breaks:
            # A lone '\n' folds into a single space.
            chunks.append(u' ')
        chunks.extend(breaks)
    else:
        # No break followed: the whitespace is significant.
        chunks.append(whitespaces)
    return chunks
def scan_flow_scalar_breaks(self, double, start_mark):
    # Collect consecutive line breaks (skipping leading blanks on each
    # line) inside a quoted scalar.
    # See the specification for details.
    chunks = []
    while True:
        # Instead of checking indentation, we check for document
        # separators: '---' or '...' at the start of a line would mean
        # the quoted scalar was never closed.
        prefix = self.prefix(3)
        if (prefix == u'---' or prefix == u'...')   \
                and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected document separator", self.get_mark())
        # Leading blanks on a continuation line are discarded.
        while self.peek() in u' \t':
            self.forward()
        if self.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
        else:
            return chunks
def scan_plain(self):
    # Scan a plain (unquoted) scalar and return it as a ScalarToken.
    # See the specification for details.
    # We add an additional restriction for the flow context:
    #   plain scalars in the flow context cannot contain ',', ':' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    chunks = []
    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent+1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    spaces = []
    while True:
        length = 0
        if self.peek() == u'#':
            # A comment terminates the scalar.
            break
        while True:
            ch = self.peek(length)
            if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
                    or (not self.flow_level and ch == u':' and
                        self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                    or (self.flow_level and ch in u',:?[]{}'):
                break
            length += 1
        # It's not clear what we should do with ':' in the flow context.
        if (self.flow_level and ch == u':'
                and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
            self.forward(length)
            raise ScannerError("while scanning a plain scalar", start_mark,
                "found unexpected ':'", self.get_mark(),
                "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
        if length == 0:
            break
        # Once part of a plain scalar is consumed, a simple key can no
        # longer start at this position.
        self.allow_simple_key = False
        chunks.extend(spaces)
        chunks.append(self.prefix(length))
        self.forward(length)
        end_mark = self.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        if not spaces or self.peek() == u'#' \
                or (not self.flow_level and self.column < indent):
            break
    return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
def scan_plain_spaces(self, indent, start_mark):
    # Scan the blanks and line breaks that may separate two fragments
    # of a plain scalar; returns the folded whitespace chunks, or an
    # empty/None result if the scalar cannot continue.
    # See the specification for details.
    # The specification is really confusing about tabs in plain scalars.
    # We just forbid them completely. Do not use tabs in YAML!
    chunks = []
    length = 0
    while self.peek(length) in u' ':
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    if ch in u'\r\n\x85\u2028\u2029':
        line_break = self.scan_line_break()
        # A line break means a simple key may start on the next line.
        self.allow_simple_key = True
        prefix = self.prefix(3)
        if (prefix == u'---' or prefix == u'...')   \
                and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
            # A document separator ends the scalar.
            return
        breaks = []
        while self.peek() in u' \r\n\x85\u2028\u2029':
            if self.peek() == ' ':
                self.forward()
            else:
                breaks.append(self.scan_line_break())
                prefix = self.prefix(3)
                if (prefix == u'---' or prefix == u'...')   \
                        and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                    return
        if line_break != u'\n':
            chunks.append(line_break)
        elif not breaks:
            # A single '\n' folds to one space.
            chunks.append(u' ')
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks
def scan_tag_handle(self, name, start_mark):
    # Scan a tag handle: '!', '!!' or '!word!'.  `name` is only used
    # in error messages ("tag" or "directive").
    # See the specification for details.
    # For some strange reasons, the specification does not allow '_' in
    # tag handles. I have allowed it anyway.
    ch = self.peek()
    if ch != u'!':
        raise ScannerError("while scanning a %s" % name, start_mark,
                "expected '!', but found %r" % ch.encode('utf-8'),
                self.get_mark())
    length = 1
    ch = self.peek(length)
    if ch != u' ':
        # Consume the word characters of a named handle; it must be
        # terminated by a second '!'.
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if ch != u'!':
            self.forward(length)
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        length += 1
    value = self.prefix(length)
    self.forward(length)
    return value
def scan_tag_uri(self, name, start_mark):
    # Scan a tag URI, expanding %XX escapes via scan_uri_escapes.
    # See the specification for details.
    # Note: we do not check if URI is well-formed.
    chunks = []
    length = 0
    ch = self.peek(length)
    while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
            or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
        if ch == u'%':
            # Flush what we have, then decode the escape run.
            chunks.append(self.prefix(length))
            self.forward(length)
            length = 0
            chunks.append(self.scan_uri_escapes(name, start_mark))
        else:
            length += 1
        ch = self.peek(length)
    if length:
        chunks.append(self.prefix(length))
        self.forward(length)
        length = 0
    if not chunks:
        # Not a single URI character was found.
        raise ScannerError("while parsing a %s" % name, start_mark,
                "expected URI, but found %r" % ch.encode('utf-8'),
                self.get_mark())
    return u''.join(chunks)
1411 def scan_uri_escapes(self
, name
, start_mark
):
1412 # See the specification for details.
1414 mark
= self
.get_mark()
1415 while self
.peek() == u
'%':
1418 if self
.peek(k
) not in u
'0123456789ABCDEFabcdef':
1419 raise ScannerError("while scanning a %s" % name
, start_mark
,
1420 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
1421 (self
.peek(k
).encode('utf-8')), self
.get_mark())
1422 bytes
.append(chr(int(self
.prefix(2), 16)))
1425 value
= unicode(''.join(bytes
), 'utf-8')
1426 except UnicodeDecodeError, exc
:
1427 raise ScannerError("while scanning a %s" % name
, start_mark
, str(exc
), mark
)
def scan_line_break(self):
    # Consume one line break and return its normalized form.
    # Transforms:
    #   '\r\n'      :   '\n'
    #   '\r'        :   '\n'
    #   '\n'        :   '\n'
    #   '\x85'      :   '\n'
    #   '\u2028'    :   '\u2028'
    #   '\u2029     :   '\u2029'
    #   default     :   ''
    ch = self.peek()
    if ch in u'\r\n\x85':
        # ASCII-ish breaks all normalize to '\n'; '\r\n' counts as one.
        if self.prefix(2) == u'\r\n':
            self.forward(2)
        else:
            self.forward()
        return u'\n'
    elif ch in u'\u2028\u2029':
        # Unicode line/paragraph separators are kept as-is.
        self.forward()
        return ch
    return u''
1453 # psyco.bind(Scanner)
1454 #except ImportError: