lib/rbyaml/scanner.rb

   1 # Scanner produces tokens of the following types:
   2 # STREAM-START
   3 # STREAM-END
   4 # DIRECTIVE(name, value)
   5 # DOCUMENT-START
   6 # DOCUMENT-END
   7 # BLOCK-SEQUENCE-START
   8 # BLOCK-MAPPING-START
   9 # BLOCK-END
  10 # FLOW-SEQUENCE-START
  11 # FLOW-MAPPING-START
  12 # FLOW-SEQUENCE-END
  13 # FLOW-MAPPING-END
  14 # BLOCK-ENTRY
  15 # FLOW-ENTRY
  16 # KEY
  17 # VALUE
  18 # ALIAS(value)
  19 # ANCHOR(value)
  20 # TAG(value)
  21 # SCALAR(value, plain)
  22 #
  23 # Read comments in the Scanner code for more details.
  24 #
  25
  26 require 'rbyaml/util'
  27 require 'rbyaml/error'
  28 require 'rbyaml/tokens'
  29 require 'rbyaml/constants'
  30
  31 module RbYAML
  # Error raised by the Scanner when the input cannot be turned into tokens.
  class ScannerError < YAMLError; end
  # Error raised when the input contains a byte or character that is not
  # acceptable (see Scanner#check_printable).
  class ReaderError < YAMLError
    # name      - label of the input being read
    # position  - offset reported in the message
    # character - offending character: a string (reported as a byte) or
    #             a numeric character code
    # encoding  - name of the codec reported in the message
    # reason    - human readable explanation
    def initialize(name, position, character, encoding, reason)
      @name = name
      @position = position
      @character = character
      @encoding = encoding
      @reason = reason
    end

    # Builds the message. Values are passed as format arguments (instead of
    # being interpolated into the format string) so that a '%' inside the
    # name or reason cannot corrupt the formatting.
    def to_s
      if @character.__is_str
        # String#to_i parses decimal digits and would always report 0 here;
        # report the value of the first byte instead.
        byte = @character.unpack('C').first.to_i
        "'%s' codec can't decode byte #x%02x: %s\n  in \"%s\", position %s" %
          [@encoding, byte, @reason, @name, @position]
      else
        "unacceptable character #x%04x: %s\n  in \"%s\", position %s" %
          [@character.to_i, @reason, @name, @position]
      end
    end
  end
  51
  # Record kept for each potential simple key. Scanner#save_possible_simple_key
  # fills it with the number of the token that would start the key, the
  # result of `@flow_zero && @indent == @column` as +required+, and the
  # column at which the candidate starts.
  SimpleKey = Struct.new(:token_number, :required, :column)
  53
  54   class Scanner
  55     attr_reader :column, :stream, :stream_pointer, :eof, :buffer, :pointer
  56     def initialize(stream)
  57       # Had we reached the end of the stream?
  58       @done = false
  59
  60       # The number of unclosed '{' and '['. `flow_level == 0` means block
  61       # context.
  62       @flow_level = 0
  63       @flow_zero = true
  64
  65       # List of processed tokens that are not yet emitted.
  66       @tokens = []
  67
  68       # Add the STREAM-START token.
  69       fetch_stream_start
  70
  71       # Number of tokens that were emitted through the `get_token` method.
  72       @tokens_taken = 0
  73
  74       # The current indentation level.
  75       @indent = -1
  76
  77       # Past indentation levels.
  78       @indents = []
  79
  80       # Variables related to simple keys treatment.
  81
  82       # A simple key is a key that is not denoted by the '?' indicator.
  83       # Example of simple keys:
  84       #   ---
  85       #   block simple key: value
  86       #   ? not a simple key:
  87       #   : { flow simple key: value }
  88       # We emit the KEY token before all keys, so when we find a potential
  89       # simple key, we try to locate the corresponding ':' indicator.
  90       # Simple keys should be limited to a single line and 1024 characters.
  91
  92       # Can a simple key start at the current position? A simple key may
  93       # start:
  94       # - at the beginning of the line, not counting indentation spaces
  95       #       (in block context),
  96       # - after '{', '[', ',' (in the flow context),
  97       # - after '?', ':', '-' (in the block context).
  98       # In the block context, this flag also signifies if a block collection
  99       # may start at the current position.
 100       @allow_simple_key = true
 101
 102       # Keep track of possible simple keys. This is a dictionary. The key
 103       # is `flow_level`; there can be no more that one possible simple key
 104       # for each level. The value is a SimpleKey record:
 105       #   (token_number, required, index, line, column, mark)
 106       # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
 107       # '[', or '{' tokens.
 108       @possible_simple_keys = {}
 109
 110       @stream = nil
 111       @stream_pointer = 0
 112       @eof = true
 113       @buffer = ""
 114       @buffer_length = 0
 115       @pointer = 0
 116       @pointer1 = 1
 117       @column = 0
 118       if stream.__is_str
 119         @name = "<string>"
 120         @raw_buffer = stream
 121       else
 122         @stream = stream
 123         @name = stream.respond_to?(:path) ? stream.path : stream.inspect
 124         @eof = false
 125         @raw_buffer = ""
 126       end
 127     end
 128
 129     def peek(index=0)
 130       peekn(index)
 131     end
 132
 133     def peek0
 134       update(1) unless @pointer1 < @buffer_length
 135       @buffer[@pointer]
 136     end
 137
 138     def peek1
 139       update(2) unless @pointer1+1 < @buffer_length
 140       @buffer[@pointer1]
 141     end
 142
 143     def peek2
 144       update(3) unless @pointer1+2 < @buffer_length
 145       @buffer[@pointer1+1]
 146     end
 147
 148     def peek3
 149       update(4) unless @pointer1+3 < @buffer_length
 150       @buffer[@pointer1+2]
 151     end
 152
 153     def peekn(index=0)
 154       pix = @pointer1+index
 155       unless pix < @buffer_length
 156         update(index+1)
 157         pix = @pointer1+index
 158       end
 159       @buffer[pix-1]
 160     end
 161
 162     def prefix(length=1)
 163       update(length) unless @pointer+length < @buffer_length
 164       @buffer[@pointer...@pointer+length]
 165     end
 166
 167     def prefix2()
 168       update(2) unless @pointer1+1 < @buffer_length
 169       @buffer[@pointer..@pointer1]
 170     end
 171
 172     def forward(length=1)
 173       case length
 174         when 0: forward0
 175         when 1: forward1
 176         when 2: forward2
 177         when 3: forward3
 178         when 4: forward4
 179         when 5: forward5
 180         when 6: forward6
 181         else forwardn(length)
 182       end
 183     end
 184
 185     def forward0
 186       update(1) unless @pointer1 < @buffer_length
 187     end
 188
 189     def forward1
 190       update(2) unless @pointer1+1 < @buffer_length
 191       buff = @buffer[@pointer...@pointer1+1]
 192       index = buff.rindex(LINE_BR_REG)
 193       @column = index ? -index : column+1
 194       @pointer += 1
 195       @pointer1 += 1
 196     end
 197
 198     def forward2
 199       update(3) unless @pointer1+2 < @buffer_length
 200       buff = @buffer[@pointer...@pointer1+2]
 201       index = buff.rindex(LINE_BR_REG)
 202       @column = index ? 1-index : column+2
 203       @pointer += 2
 204       @pointer1 += 2
 205     end
 206
 207     def forward3
 208       update(4) unless @pointer1+3 < @buffer_length
 209       buff = @buffer[@pointer...@pointer1+3]
 210       index = buff.rindex(LINE_BR_REG)
 211       @column = index ? 2-index : column+3
 212       @pointer += 3
 213       @pointer1 += 3
 214     end
 215
 216     def forward4
 217       update(5) unless @pointer1+4 < @buffer_length
 218       buff = @buffer[@pointer...@pointer1+4]
 219       index = buff.rindex(LINE_BR_REG)
 220       @column = index ? 3-index : column+4
 221       @pointer += 4
 222       @pointer1 += 4
 223     end
 224
 225     def forward5
 226       update(6) unless @pointer1+5 < @buffer_length
 227       buff = @buffer[@pointer...@pointer1+5]
 228       index = buff.rindex(LINE_BR_REG)
 229       @column = index ? 4-index : column+5
 230       @pointer += 5
 231       @pointer1 += 5
 232     end
 233
 234     def forward6
 235       update(7) unless @pointer1+6 < @buffer_length
 236       buff = @buffer[@pointer...@pointer1+6]
 237       index = buff.rindex(LINE_BR_REG)
 238       @column = index ? 5-index : column+6
 239       @pointer += 6
 240       @pointer1 += 6
 241     end
 242
 243     def forwardn(length)
 244       update(length + 1) unless @pointer1+length < @buffer_length
 245       buff = @buffer[@pointer...@pointer+length]
 246       index = buff.rindex(LINE_BR_REG)
 247       @column = index ? (length-index)-1 : column+length
 248       @pointer += length
 249       @pointer1 += length
 250     end
 251
 252     def check_printable(data)
 253       if NON_PRINTABLE_RE =~ data
 254         position = @buffer.length-@pointer+($~.offset(0)[0])
 255         raise ReaderError.new(@name, position, $&,"unicode","special characters are not allowed"),"special characters are not allowed"
 256       end
 257     end
 258
 259
 260     def update(length)
 261       return if @raw_buffer.nil?
 262       @buffer = @buffer[@pointer..-1]
 263       @pointer = 0
 264       while @buffer.length < length
 265         unless @eof
 266           data = @stream.read(1024)
 267           if data && !data.empty?
 268             @buffer << data
 269             @stream_pointer += data.length
 270             @raw_buffer = ""
 271           else
 272             @eof = true
 273             @buffer << ?\0
 274             @raw_buffer = nil
 275             break
 276           end
 277         else
 278           @buffer << @raw_buffer << ?\0
 279           @raw_buffer = nil
 280           break
 281         end
 282       end
 283       @buffer_length = @buffer.length
 284       if @eof
 285         check_printable(@buffer[(-length)..-2])
 286       else
 287         check_printable(@buffer[(-length)..-1])
 288       end
 289       @pointer1 = @pointer+1
 290     end
 291
 292     def check_token(*choices)
 293       # Check if the next token is one of the given types.
 294       fetch_more_tokens while need_more_tokens
 295       unless @tokens.empty?
 296         return true if choices.empty?
 297         for choice in choices
 298           return true if choice === @tokens[0]
 299         end
 300       end
 301       return false
 302     end
 303
 304     def peek_token
 305       # Return the next token, but do not delete if from the queue.
 306       fetch_more_tokens while need_more_tokens
 307       return @tokens[0] unless @tokens.empty?
 308     end
 309
 310     def get_token
 311       # Return the next token.
 312       fetch_more_tokens while need_more_tokens
 313       unless @tokens.empty?
 314         @tokens_taken += 1
 315         @tokens.shift
 316       end
 317     end
 318
 319     def each_token
 320       fetch_more_tokens while need_more_tokens
 321       while !@tokens.empty?
 322         @tokens_taken += 1
 323         yield @tokens.shift
 324         fetch_more_tokens while need_more_tokens
 325       end
 326     end
 327
 328     def need_more_tokens
 329       return false if @done
 330       @tokens.empty? || next_possible_simple_key == @tokens_taken
 331     end
 332
 333     def fetch_more_tokens
 334       # Eat whitespaces and comments until we reach the next token.
 335       scan_to_next_token
 336
 337       # Remove obsolete possible simple keys.
 338 #      stale_possible_simple_keys
 339
 340       # Compare the current indentation and column. It may add some tokens
 341       # and decrease the current indentation level.
 342       unwind_indent(@column)
 343
 344       # Peek the next character.
 345       ch = peek0
 346       colz = @column == 0
 347
 348       case ch
 349       when ?\0: return fetch_stream_end
 350       when ?': return fetch_single
 351       when ?": return fetch_double
 352       when ??: if !@flow_zero || NULL_OR_OTHER.include?(peek1): return fetch_key end
 353       when ?:: if !@flow_zero || NULL_OR_OTHER.include?(peek1): return fetch_value end
 354       when ?%: if colz: return fetch_stream_end end
 355       when ?-: if colz && ENDING =~ prefix(4): return fetch_document_start; elsif NULL_OR_OTHER.include?(peek1): return fetch_block_entry end
 356       when ?.: if colz && START =~ prefix(4): return fetch_document_end end
 357       when ?[: return fetch_flow_sequence_start
 358       when ?{: return fetch_flow_mapping_start
 359       when ?]: return fetch_flow_sequence_end
 360       when ?}: return fetch_flow_mapping_end
 361       when ?,: return fetch_flow_entry
 362       when ?*: return fetch_alias
 363       when ?&: return fetch_anchor
 364       when ?!: return fetch_tag
 365       when ?|: if @flow_zero: return fetch_literal end
 366       when ?>: if @flow_zero: return fetch_folded end
 367       end
 368       return fetch_plain if BEG =~ prefix(2)
 369       raise ScannerError.new("while scanning for the next token","found character #{ch.chr}(#{ch}) that cannot start any token")
 370     end
 371
 372     # Simple keys treatment.
 373
 374     def next_possible_simple_key
 375       # Return the number of the nearest possible simple key. Actually we
 376       # don't need to loop through the whole dictionary.
 377       @possible_simple_keys.each_value {|key| return key.token_number if key.token_number}
 378       nil
 379     end
 380
 381     def save_possible_simple_key
 382       # The next token may start a simple key. We check if it's possible
 383       # and save its position. This function is called for
 384       #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
 385       # The next token might be a simple key. Let's save it's number and
 386       # position.
 387       @possible_simple_keys[@flow_level] = SimpleKey.new(@tokens_taken+@tokens.length, @flow_zero && @indent == @column,column) if @allow_simple_key
 388     end
 389
 390     # Indentation functions.
 391
 392     def unwind_indent(col)
 393       ## In flow context, tokens should respect indentation.
 394       ## Actually the condition should be `@indent >= column` according to
 395       ## the spec. But this condition will prohibit intuitively correct
 396       ## constructions such as
 397       ## key : {
 398       ## }
 399       #if @flow_level and @indent > column
 400       #    raise ScannerError(nil, nil,
 401       #            "invalid intendation or unclosed '[' or '{'",
 402       #            get_mark)
 403
 404       # In the flow context, indentation is ignored. We make the scanner less
 405       # restrictive then specification requires.
 406       return nil if !@flow_zero
 407       # In block context, we may need to issue the BLOCK-END tokens.
 408       while @indent > col
 409         @indent = @indents.pop
 410         @tokens << BLOCK_END
 411       end
 412     end
 413
 414     def add_indent(col)
 415       # Check if we need to increase indentation.
 416       if @indent < col
 417         @indents << @indent
 418         @indent = col
 419         return true
 420       end
 421       return false
 422     end
 423
 424     # Fetchers.
 425
 426     def fetch_stream_start
 427       # We always add STREAM-START as the first token and STREAM-END as the
 428       # last token.
 429       # Read the token.
 430       # Add STREAM-START.
 431       @tokens << STREAM_START
 432     end
 433
 434
 435     def fetch_stream_end
 436       # Set the current intendation to -1.
 437       unwind_indent(-1)
 438       # Reset everything (not really needed).
 439       @allow_simple_key = false
 440       @possible_simple_keys = {}
 441       # Read the token.
 442       # Add STREAM-END.
 443       @tokens << STREAM_END
 444       # The stream is finished.
 445       @done = true
 446     end
 447
 448     def fetch_directive
 449       # Set the current intendation to -1.
 450       unwind_indent(-1)
 451       # Reset simple keys.
 452       @allow_simple_key = false
 453       # Scan and add DIRECTIVE.
 454       @tokens << scan_directive
 455     end
 456
 457     def fetch_document_start
 458       fetch_document_indicator(DOCUMENT_START)
 459     end
 460
 461     def fetch_document_end
 462       fetch_document_indicator(DOCUMENT_END)
 463     end
 464
 465     def fetch_document_indicator(token)
 466       # Set the current intendation to -1.
 467       unwind_indent(-1)
 468       # Reset simple keys. Note that there could not be a block collection
 469       # after '---'.
 470       @allow_simple_key = false
 471       # Add DOCUMENT-START or DOCUMENT-END.
 472       forward3
 473       @tokens << token
 474     end
 475
 476     def fetch_flow_sequence_start
 477       fetch_flow_collection_start(FLOW_SEQUENCE_START)
 478     end
 479
 480     def fetch_flow_mapping_start
 481       fetch_flow_collection_start(FLOW_MAPPING_START)
 482     end
 483
 484     def fetch_flow_collection_start(token)
 485       # '[' and '{' may start a simple key.
 486       save_possible_simple_key
 487       # Increase the flow level.
 488       @flow_level += 1
 489       @flow_zero = false
 490       # Simple keys are allowed after '[' and '{'.
 491       @allow_simple_key = true
 492       # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
 493       forward1
 494       @tokens << token
 495     end
 496
 497     def fetch_flow_sequence_end
 498       fetch_flow_collection_end(FLOW_SEQUENCE_END)
 499     end
 500
 501     def fetch_flow_mapping_end
 502       fetch_flow_collection_end(FLOW_MAPPING_END)
 503     end
 504
 505     def fetch_flow_collection_end(token)
 506       # Decrease the flow level.
 507       @flow_level -= 1
 508       if @flow_level == 0
 509         @flow_zero = true
 510       end
 511       # No simple keys after ']' or '}'.
 512       @allow_simple_key = false
 513       # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
 514       forward1
 515       @tokens << token
 516     end
 517
 518     def fetch_flow_entry
 519       # Simple keys are allowed after ','.
 520       @allow_simple_key = true
 521       # Add FLOW-ENTRY.
 522       forward1
 523       @tokens << FLOW_ENTRY
 524     end
 525
 526     def fetch_block_entry
 527       # Block context needs additional checks.
 528       if @flow_zero
 529         raise ScannerError.new(nil,"sequence entries are not allowed here") if !@allow_simple_key
 530         # We may need to add BLOCK-SEQUENCE-START.
 531         if add_indent(column)
 532           @tokens << BLOCK_SEQUENCE_START
 533         end
 534         # It's an error for the block entry to occur in the flow context,
 535         # but we let the parser detect this.
 536       end
 537       # Simple keys are allowed after '-'.
 538       @allow_simple_key = true
 539       # Add BLOCK-ENTRY.
 540       forward1
 541       @tokens << BLOCK_ENTRY
 542     end
 543
 544     def fetch_key
 545       # Block context needs additional checks.
 546       if @flow_zero
 547         # Are we allowed to start a key (not nessesary a simple)?
 548         raise ScannerError.new(nil,"mapping keys are not allowed here") if !@allow_simple_key
 549         # We may need to add BLOCK-MAPPING-START.
 550         if add_indent(column)
 551           @tokens << BLOCK_MAPPING_START
 552         end
 553       end
 554       # Simple keys are allowed after '?' in the block context.
 555       @allow_simple_key = @flow_zero
 556       # Add KEY.
 557       forward1
 558       @tokens << KEY
 559     end
 560
 561     def fetch_value
 562       key = @possible_simple_keys[@flow_level]
 563       # Do we determine a simple key?
 564       if key.nil?
 565         # Block context needs additional checks.
 566         # (Do we really need them? They will be catched by the parser
 567         # anyway.)
 568         if @flow_zero
 569           # We are allowed to start a complex value if and only if
 570           # we can start a simple key.
 571           raise ScannerError.new(nil,"mapping values are not allowed here") if !@allow_simple_key
 572           # Simple keys are allowed after ':' in the block context.
 573           @allow_simple_key = true
 574         end
 575       else
 576         # Add KEY.
 577         @possible_simple_keys.delete(@flow_level)
 578
 579         # If this key starts a new block mapping, we need to add
 580         # BLOCK-MAPPING-START.
 581         se = (@flow_zero && add_indent(key.column)) ? [BLOCK_MAPPING_START] : []
 582         se << KEY
 583         @tokens.insert(key.token_number-@tokens_taken,*se)
 584         # There cannot be two simple keys one after another.
 585         @allow_simple_key = false
 586         # It must be a part of a complex key.
 587       end
 588       # Add VALUE.
 589       forward1
 590       @tokens << VALUE
 591     end
 592
 593     def fetch_alias
 594       # ALIAS could be a simple key.
 595       save_possible_simple_key
 596       # No simple keys after ALIAS.
 597       @allow_simple_key = false
 598       # Scan and add ALIAS.
 599       @tokens << scan_anchor(AliasToken)
 600     end
 601
 602     def fetch_anchor
 603       # ANCHOR could start a simple key.
 604       save_possible_simple_key
 605       # No simple keys after ANCHOR.
 606       @allow_simple_key = false
 607       # Scan and add ANCHOR.
 608       @tokens << scan_anchor(AnchorToken)
 609     end
 610
 611     def fetch_tag
 612       # TAG could start a simple key.
 613       save_possible_simple_key
 614       # No simple keys after TAG.
 615       @allow_simple_key = false
 616       # Scan and add TAG.
 617       @tokens << scan_tag
 618     end
 619
 620     def fetch_literal
 621       fetch_block_scalar(?|)
 622     end
 623
 624     def fetch_folded
 625       fetch_block_scalar(?>)
 626     end
 627
 628     def fetch_block_scalar(style)
 629       # A simple key may follow a block scalar.
 630       @allow_simple_key = true
 631       # Scan and add SCALAR.
 632       @tokens << scan_block_scalar(style)
 633     end
 634
 635     def fetch_single
 636       fetch_flow_scalar(?')
 637     end
 638
 639     def fetch_double
 640       fetch_flow_scalar(?")
 641     end
 642
 643     def fetch_flow_scalar(style)
 644       # A flow scalar could be a simple key.
 645       save_possible_simple_key
 646       # No simple keys after flow scalars.
 647       @allow_simple_key = false
 648       # Scan and add SCALAR.
 649       @tokens << scan_flow_scalar(style)
 650     end
 651
 652     def fetch_plain
 653       # A plain scalar could be a simple key.
 654       save_possible_simple_key
 655       # No simple keys after plain scalars. But note that `scan_plain` will
 656       # change this flag if the scan is finished at the beginning of the
 657       # line.
 658       @allow_simple_key = false
 659       # Scan and add SCALAR. May change `allow_simple_key`.
 660       @tokens << scan_plain
 661     end
 662
 663
 664     # Scanners.
 665     def scan_to_next_token
 666       # We ignore spaces, line breaks and comments.
 667       # If we find a line break in the block context, we set the flag
 668       # `allow_simple_key` on.
 669       #
 670       # TODO: We need to make tab handling rules more sane. A good rule is
 671       #   Tabs cannot precede tokens
 672       #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
 673       #   KEY(block), VALUE(block), BLOCK-ENTRY
 674       # So the checking code is
 675       #   if <TAB>:
 676       #       @allow_simple_keys = false
 677       # We also need to add the check for `allow_simple_keys == true` to
 678       # `unwind_indent` before issuing BLOCK-END.
 679       # Scanners for block, flow, and plain scalars need to be modified.
 680       while true
 681         while peek0 == 32
 682           forward1
 683         end
 684         if peek0 == ?#
 685           while !NULL_OR_LINEBR.include?(peek0)
 686             forward1
 687           end
 688         end
 689
 690         if !scan_line_break.empty?
 691           @allow_simple_key = true if @flow_zero
 692         else
 693           break
 694         end
 695       end
 696     end
 697
 698     def scan_directive
 699       # See the specification for details.
 700       forward1
 701       name = scan_directive_name
 702       value = nil
 703       if name == "YAML"
 704         value = scan_yaml_directive_value
 705       elsif name == "TAG"
 706         value = scan_tag_directive_value
 707       else
 708         forward1 while !NULL_OR_LINEBR.include?(peek0)
 709       end
 710       scan_directive_ignored_line
 711       DirectiveToken.new(name, value)
 712     end
 713
 714     def scan_directive_name
 715       # See the specification for details.
 716       length = 0
 717       ch = peek(length)
 718       zlen = true
 719       while ALPHA_REG  =~ ch.chr
 720         zlen = false
 721         length += 1
 722         ch = peek(length)
 723       end
 724       raise ScannerError.new("while scanning a directive","expected alphabetic or numeric character, but found #{ch.to_s}") if zlen
 725       value = prefix(length)
 726       forward(length)
 727       ch = peek0
 728       raise ScannerError.new("while scanning a directive","expected alphabetic or numeric character, but found #{ch.to_s}") if !NULL_BL_LINEBR.include?(ch)
 729       value
 730     end
 731
 732     def scan_yaml_directive_value
 733       # See the specification for details.
 734       forward1 while peek0 == 32
 735       major = scan_yaml_directive_number
 736       raise ScannerError.new("while scanning a directive","expected a digit or '.', but found #{peek.to_s}") if peek0 != ?.
 737       forward1
 738       minor = scan_yaml_directive_number
 739       raise ScannerError.new("while scanning a directive","expected a digit or ' ', but found #{peek.to_s}") if !NULL_BL_LINEBR.include?(peek0)
 740       [major, minor]
 741     end
 742
 743     def scan_yaml_directive_number
 744       # See the specification for details.
 745       ch = peek0
 746       raise ScannerError.new("while scanning a directive","expected a digit, but found #{ch.to_s}") if !(ch.__is_ascii_num)
 747       length = 0
 748       length += 1 while (peek(length).__is_ascii_num)
 749       value = prefix(length)
 750       forward(length)
 751       value
 752     end
 753
 754     def scan_tag_directive_value
 755       # See the specification for details.
 756       forward1 while peek0 == 32
 757       handle = scan_tag_directive_handle
 758       forward1 while peek0 == 32
 759       prefix = scan_tag_directive_prefix
 760       [handle, prefix]
 761     end
 762
 763     def scan_tag_directive_handle
 764       # See the specification for details.
 765       value = scan_tag_handle("directive")
 766       raise ScannerError.new("while scanning a directive","expected ' ', but found #{peek0}") if peek0 != 32
 767       value
 768     end
 769
 770     def scan_tag_directive_prefix
 771       # See the specification for details.
 772       value = scan_tag_uri("directive")
 773       raise ScannerError.new("while scanning a directive","expected ' ', but found #{peek0}") if !NULL_BL_LINEBR.include?(peek0)
 774       value
 775     end
 776
 777     def scan_directive_ignored_line
 778       # See the specification for details.
 779       forward1 while peek0 == 32
 780       if peek0 == ?#
 781           forward1 while !NULL_OR_LINEBR.include?(peek0)
 782       end
 783       ch = peek0
 784       raise ScannerError.new("while scanning a directive","expected a comment or a line break, but found #{peek0.to_s}") if !NULL_OR_LINEBR.include?(peek0)
 785       scan_line_break
 786     end
 787
 788     def scan_anchor(token)
 789       # The specification does not restrict characters for anchors and
 790       # aliases. This may lead to problems, for instance, the document:
 791       #   [ *alias, value ]
 792       # can be interpteted in two ways, as
 793       #   [ "value" ]
 794       # and
 795       #   [ *alias , "value" ]
 796       # Therefore we restrict aliases to numbers and ASCII letters.
 797       name = (peek0 == ?*) ? "alias":"anchor"
 798       forward1
 799       length = 0
 800       chunk_size = 16
 801       while true
 802         chunk = prefix(chunk_size)
 803         if length = (NON_ALPHA =~ chunk)
 804           break
 805         end
 806         chunk_size += 16
 807       end
 808       raise ScannerError.new("while scanning an #{name}","expected alphabetic or numeric character, but found something else...") if length==0
 809       value = prefix(length)
 810       forward(length)
 811       if !NON_ALPHA_OR_NUM.include?(peek0)
 812         raise ScannerError.new("while scanning an #{name}","expected alphabetic or numeric character, but found #{peek0}")
 813       end
 814       token.new(value)
 815     end
 816
 817     def scan_tag
 818       # See the specification for details.
 819       ch = peek1
 820       if ch == ?<
 821         handle = nil
 822         forward2
 823         suffix = scan_tag_uri("tag")
 824         raise ScannerError.new("while parsing a tag","expected '>', but found #{peek.to_s}") if peek0 != ?>
 825         forward1
 826       elsif NULL_T_BL_LINEBR.include?(ch)
 827         handle = nil
 828         suffix = "!"
 829         forward1
 830       else
 831         length = 1
 832         use_handle = false
 833         while !NULL_T_BL_LINEBR.include?(ch)
 834           if ch == ?!
 835             use_handle = true
 836             break
 837           end
 838           length += 1
 839           ch = peek(length)
 840         end
 841         handle = "!"
 842         if use_handle
 843           handle = scan_tag_handle("tag")
 844         else
 845           handle = "!"
 846           forward1
 847         end
 848         suffix = scan_tag_uri("tag")
 849       end
 850       raise ScannerError.new("while scanning a tag","expected ' ', but found #{peek0}") if !NULL_BL_LINEBR.include?(peek0)
 851       value = [handle, suffix]
 852       TagToken.new(value)
 853     end
 854
 855     def scan_block_scalar(style)
 856       # See the specification for details.
 857       folded = style== ?>
 858       chunks = []
 859       # Scan the header.
 860       forward1
 861       chomping, increment = scan_block_scalar_indicators
 862       scan_block_scalar_ignored_line
 863       # Determine the indentation level and go to the first non-empty line.
 864       min_indent = @indent+1
 865       min_indent = 1 if min_indent < 1
 866       if increment.nil?
 867         breaks, max_indent = scan_block_scalar_indentation
 868         indent = [min_indent, max_indent].max
 869       else
 870         indent = min_indent+increment-1
 871         breaks = scan_block_scalar_breaks(indent)
 872       end
 873       line_break = ''
 874       # Scan the inner part of the block scalar.
 875       while column == indent and peek0 != ?\0
 876         chunks += breaks
 877         leading_non_space = !BLANK_T.include?(peek0)
 878         length = 0
 879         length += 1 while !NULL_OR_LINEBR.include?(peek(length))
 880         chunks << prefix(length)
 881         forward(length)
 882         line_break = scan_line_break
 883         breaks = scan_block_scalar_breaks(indent)
 884         if column == indent && peek0 != 0
 885           # Unfortunately, folding rules are ambiguous.
 886           #
 887           # This is the folding according to the specification:
 888           if folded && line_break == "\n" && leading_non_space && !BLANK_T.include?(peek0)
 889             chunks << ' ' if breaks.empty?
 890           else
 891             chunks << line_break
 892           end
 893           # This is Clark Evans's interpretation (also in the spec
 894           # examples):
 895           #
 896           #if folded and line_break == u'\n':
 897           #    if not breaks:
 898           #        if self.peek() not in ' \t':
 899           #            chunks.append(u' ')
 900           #        else:
 901           #            chunks.append(line_break)
 902           #else:
 903           #    chunks.append(line_break)
 904         else
 905           break
 906         end
 907       end
 908
 909       # Chomp the tail.
 910       if chomping
 911         chunks << line_break
 912         chunks += breaks
 913       end
 914
 915       # We are done.
 916       ScalarToken.new(chunks.to_s, false, style)
 917     end
 918
 919     def scan_block_scalar_indicators
 920       # See the specification for details.
 921       chomping = nil
 922       increment = nil
 923       ch = peek0
 924       if PLUS_MIN =~ ch.chr
 925         chomping = ch == ?+
 926         forward1
 927         ch = peek0
 928         if ch.__is_ascii_num
 929           increment = ch.chr.to_i
 930           raise ScannerError.new("while scanning a block scalar","expected indentation indicator in the range 1-9, but found 0") if increment == 0
 931           forward1
 932         end
 933       elsif ch.__is_ascii_num
 934         increment = ch.chr.to_i
 935         raise ScannerError.new("while scanning a block scalar","expected indentation indicator in the range 1-9, but found 0") if increment == 0
 936         forward1
 937         ch = peek0
 938         if PLUS_MIN =~ ch.chr
 939           chomping = ch == ?+
 940           forward1
 941         end
 942       end
 943       raise ScannerError.new("while scanning a block scalar","expected chomping or indentation indicators, but found #{peek0}") if !NULL_BL_LINEBR.include?(peek0)
 944       [chomping, increment]
 945     end
 946
 947     def scan_block_scalar_ignored_line
 948       # See the specification for details.
 949       forward1 while peek0 == 32
 950       if peek0 == ?#
 951           forward1 while !NULL_OR_LINEBR.include?(peek0)
 952       end
 953       raise ScannerError.new("while scanning a block scalar","expected a comment or a line break, but found #{peek0}") if !NULL_OR_LINEBR.include?(peek0)
 954       scan_line_break
 955     end
 956
 957     def scan_block_scalar_indentation
 958       # See the specification for details.
 959       chunks = []
 960       max_indent = 0
 961       while BLANK_OR_LINEBR.include?(peek0)
 962         if peek0 != 32
 963           chunks << scan_line_break
 964         else
 965           forward1
 966           max_indent = column if column > max_indent
 967         end
 968       end
 969       [chunks, max_indent]
 970     end
 971
 972     def scan_block_scalar_breaks(indent)
 973       # See the specification for details.
 974       chunks = []
 975       forward1 while @column < indent && peek0 == 32
 976       while FULL_LINEBR.include?(peek0)
 977         chunks << scan_line_break
 978         forward1 while @column < indent && peek0 == 32
 979       end
 980       chunks
 981     end
 982
 983     def scan_flow_scalar(style)
 984       # See the specification for details.
 985       # Note that we loose indentation rules for quoted scalars. Quoted
 986       # scalars don't need to adhere indentation because " and ' clearly
 987       # mark the beginning and the end of them. Therefore we are less
 988       # restrictive then the specification requires. We only need to check
 989       # that document separators are not included in scalars.
 990       double = style == ?"
 991       chunks = []
 992       quote = peek0
 993       forward1
 994       chunks += scan_flow_scalar_non_spaces(double)
 995       while peek0 != quote
 996         chunks += scan_flow_scalar_spaces(double)
 997         chunks += scan_flow_scalar_non_spaces(double)
 998       end
 999       forward1
1000       ScalarToken.new(chunks.to_s, false, style)
1001     end
1002
1003     def scan_flow_scalar_non_spaces(double)
1004       # See the specification for details.
1005       chunks = []
1006       while true
1007         length = 0
1008         length += 1 while !SPACES_AND_STUFF.include?(peek(length))
1009         if length!=0
1010           chunks << prefix(length)
1011           forward(length)
1012         end
1013         ch = peek0
1014         if !double && ch == ?' && peek1 == ?'
1015           chunks << ?'
1016           forward2
1017         elsif (double && ch == ?') || (!double && DOUBLE_ESC.include?(ch))
1018           chunks << ch
1019           forward1
1020         elsif double && ch == ?\\
1021           forward1
1022           ch = peek0
1023           if UNESCAPES.member?(ch.chr)
1024             chunks << UNESCAPES[ch.chr]
1025             forward1
1026           elsif ESCAPE_CODES.member?(ch.chr)
1027             length = ESCAPE_CODES[ch.chr]
1028             forward1
1029             if NOT_HEXA =~ prefix(length)
1030               raise ScannerError.new("while scanning a double-quoted scalar","expected escape sequence of #{length} hexdecimal numbers, but found something else: #{prefix(length)}}")
1031             end
1032             code = prefix(length).to_i(16).to_s
1033             chunks << code
1034             forward(length)
1035           elsif FULL_LINEBR.include?(ch)
1036             scan_line_break
1037             chunks += scan_flow_scalar_breaks(double)
1038           else
1039             raise ScannerError.new("while scanning a double-quoted scalar","found unknown escape character #{ch}")
1040           end
1041         else
1042           return chunks
1043         end
1044       end
1045     end
1046
1047     def scan_flow_scalar_spaces(double)
1048       # See the specification for details.
1049       chunks = []
1050       length = 0
1051       length += 1 while BLANK_T.include?(peek(length))
1052       whitespaces = prefix(length)
1053       forward(length)
1054       ch = peek0
1055       if ch == ?\0
1056         raise ScannerError.new("while scanning a quoted scalar","found unexpected end of stream")
1057       elsif FULL_LINEBR.include?(ch)
1058         line_break = scan_line_break
1059         breaks = scan_flow_scalar_breaks(double)
1060         if line_break != "\n"
1061           chunks << line_break
1062         elsif breaks.empty?
1063           chunks << ' '
1064         end
1065         chunks += breaks
1066       else
1067         chunks << whitespaces
1068       end
1069       chunks
1070     end
1071
1072     def scan_flow_scalar_breaks(double)
1073       # See the specification for details.
1074       chunks = []
1075       while true
1076         # Instead of checking indentation, we check for document
1077         # separators.
1078         prefix = prefix(3)
1079         if (prefix == "---" || prefix == "...") &&NULL_BL_T_LINEBR.include?(peek3)
1080           raise ScannerError.new("while scanning a quoted scalar","found unexpected document separator")
1081         end
1082         forward1 while BLANK_T.include?(peek0)
1083         if FULL_LINEBR.include?(peek0)
1084           chunks << scan_line_break
1085         else
1086           return chunks
1087         end
1088       end
1089     end
1090
1091     def scan_plain
1092       # See the specification for details.
1093       # We add an additional restriction for the flow context:
1094       #   plain scalars in the flow context cannot contain ',', ':' and '?'.
1095       # We also keep track of the `allow_simple_key` flag here.
1096       # Indentation rules are loosed for the flow context.
1097       chunks = []
1098       indent = @indent+1
1099       # We allow zero indentation for scalars, but then we need to check for
1100       # document separators at the beginning of the line.
1101       #if indent == 0
1102       #    indent = 1
1103       spaces = []
1104       if @flow_zero
1105         f_nzero, r_check = false, R_flowzero
1106       else
1107         f_nzero, r_check = true, R_flownonzero
1108       end
1109
1110       while peek0 != ?#
1111         length = 0
1112         chunk_size = 32
1113         chunk_size += 32 until length = (r_check =~ prefix(chunk_size))
1114         ch = peek(length)
1115         if f_nzero && ch == ?: && !S4.include?(peek(length+1))
1116           forward(length)
1117           raise ScannerError.new("while scanning a plain scalar","found unexpected ':'","Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
1118         end
1119         break if length == 0
1120         @allow_simple_key = false
1121         chunks += spaces
1122         chunks << prefix(length)
1123         forward(length)
1124         spaces = scan_plain_spaces(indent)
1125         break if !spaces || (@flow_zero && @column < indent)
1126       end
1127       return ScalarToken.new(chunks.to_s, true)
1128     end
1129
1130     def scan_plain_spaces(indent)
1131       # See the specification for details.
1132       # The specification is really confusing about tabs in plain scalars.
1133       # We just forbid them completely. Do not use tabs in YAML!
1134       chunks = []
1135       length = 0
1136       length += 1 while peek(length) == 32
1137       whitespaces = prefix(length)
1138       forward(length)
1139       ch = peek0
1140       if FULL_LINEBR.include?(ch)
1141         line_break = scan_line_break
1142         @allow_simple_key = true
1143         return if END_OR_START =~ prefix(4)
1144         breaks = []
1145         while BLANK_OR_LINEBR.include?(peek0)
1146           if peek0 == 32
1147             forward1
1148           else
1149             breaks << scan_line_break
1150             return if END_OR_START =~ prefix(4)
1151           end
1152         end
1153         if line_break != "\n"
1154           chunks << line_break
1155         elsif breaks.nil? || breaks.empty?
1156           chunks << " "
1157         end
1158         chunks += breaks
1159       else
1160         chunks << whitespaces
1161       end
1162       chunks
1163     end
1164
1165
1166     def scan_tag_handle(name)
1167       # See the specification for details.
1168       # For some strange reasons, the specification does not allow '_' in
1169       # tag handles. I have allowed it anyway.
1170       ch = peek0
1171       raise ScannerError.new("while scanning a #{name}","expected '!', but found #{ch}") if ch != ?!
1172       length = 1
1173       ch = peek(length)
1174       if ch != 32
1175         while ALPHA_REG =~ ch.chr
1176           length += 1
1177           ch = peek(length)
1178         end
1179         if ch != ?!
1180           forward(length)
1181           raise ScannerError.new("while scanning a #{name}","expected '!', but found #{ch}")
1182         end
1183         length += 1
1184       end
1185       value = prefix(length)
1186       forward(length)
1187       value
1188     end
1189
1190     def scan_tag_uri(name)
1191       # See the specification for details.
1192       # Note: we do not check if URI is well-formed.
1193       chunks = []
1194       length = 0
1195       ch = peek(length)
1196       while  STRANGE_CHR =~ ch.chr
1197         if ch == ?%
1198           chunks << prefix(length)
1199           forward(length)
1200           length = 0
1201           chunks << scan_uri_escapes(name)
1202         else
1203           length += 1
1204         end
1205         ch = peek(length)
1206       end
1207       if length!=0
1208         chunks << prefix(length)
1209         forward(length)
1210       end
1211
1212       raise ScannerError.new("while parsing a #{name}","expected URI, but found #{ch}") if chunks.empty?
1213       chunks.to_s
1214     end
1215
1216     def scan_uri_escapes(name)
1217       # See the specification for details.
1218       bytes = []
1219       while peek0 == ?%
1220         forward1
1221         raise ScannerError.new("while scanning a #{name}","expected URI escape sequence of 2 hexdecimal numbers, but found #{peek1} and #{peek2}") if HEXA_REG !~ peek1.chr || HEXA_REG !~ peek2.chr
1222         bytes << prefix(2).to_i(16).to_s
1223         forward2
1224       end
1225       bytes.to_s
1226     end
1227
1228     RN = "\r\n"
1229     def scan_line_break
1230       # Transforms:
1231       #   '\r\n'      :   '\n'
1232       #   '\r'        :   '\n'
1233       #   '\n'        :   '\n'
1234       #   '\x85'      :   '\n'
1235       #   default     :   ''
1236       if FULL_LINEBR.include?(peek0)
1237         if prefix2 == RN
1238           forward2
1239         else
1240           forward1
1241         end
1242         return "\n"
1243       end
1244       ""
1245     end
1246   end
1247 end
1248