lib/rexml/parsers/baseparser.rb

   1 require 'rexml/parseexception'
   2 require 'rexml/undefinednamespaceexception'
   3 require 'rexml/source'
   4 require 'set'
   5
   6 module REXML
   7   module Parsers
   8     # = Using the Pull Parser
   9     # <em>This API is experimental, and subject to change.</em>
  10     #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
  11     #  while parser.has_next?
  12     #    res = parser.next
  13     #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
  14     #  end
  15     # See the PullEvent class for information on the content of the results.
  16     # The data is identical to the arguments passed for the various events to
  17     # the StreamListener API.
  18     #
  19     # Notice that:
  20     #  parser = PullParser.new( "<a>BAD DOCUMENT" )
  21     #  while parser.has_next?
  22     #    res = parser.next
  23     #    raise res[1] if res.error?
  24     #  end
  25     #
  26     # Nat Price gave me some good ideas for the API.
  27     class BaseParser
  28       NCNAME_STR= '[\w:][\-\w\d.]*'
  29       NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
  30       UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
  31
  32       NAMECHAR = '[\-\w\d\.:]'
  33       NAME = "([\\w:]#{NAMECHAR}*)"
  34       NMTOKEN = "(?:#{NAMECHAR})+"
  35       NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
  36       REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
  37       REFERENCE_RE = /#{REFERENCE}/
  38
  39       DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
  40       DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
  41       ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
  42       COMMENT_START = /\A<!--/u
  43       COMMENT_PATTERN = /<!--(.*?)-->/um
  44       CDATA_START = /\A<!\[CDATA\[/u
  45       CDATA_END = /^\s*\]\s*>/um
  46       CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
  47       XMLDECL_START = /\A<\?xml\s/u;
  48       XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
  49       INSTRUCTION_START = /\A<\?/u
  50       INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
  51       TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
  52       CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
  53
  54       VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
  55       ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
  56       STANDALONE = /\bstandalone\s*=\s["'](.*?)['"]/um
  57
  58       ENTITY_START = /^\s*<!ENTITY/
  59       IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
  60       ELEMENTDECL_START = /^\s*<!ELEMENT/um
  61       ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
  62       SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
  63       ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
  64       NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
  65       ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
  66       ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
  67       ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
  68       DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
  69       ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
  70       ATTDEF_RE = /#{ATTDEF}/
  71       ATTLISTDECL_START = /^\s*<!ATTLIST/um
  72       ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
  73       NOTATIONDECL_START = /^\s*<!NOTATION/um
  74       PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
  75       SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
  76
  77       TEXT_PATTERN = /\A([^<]*)/um
  78
  79       # Entity constants
  80       PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
  81       SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
  82       PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
  83       EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
  84       NDATADECL = "\\s+NDATA\\s+#{NAME}"
  85       PEREFERENCE = "%#{NAME};"
  86       ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
  87       PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
  88       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
  89       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
  90       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
  91       ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
  92
  93       EREFERENCE = /&(?!#{NAME};)/
  94
  95       DEFAULT_ENTITIES = {
  96         'gt' => [/&gt;/, '&gt;', '>', />/],
  97         'lt' => [/&lt;/, '&lt;', '<', /</],
  98         'quot' => [/&quot;/, '&quot;', '"', /"/],
  99         "apos" => [/&apos;/, "&apos;", "'", /'/]
 100       }
 101
 102
 103       ######################################################################
 104       # These are patterns to identify common markup errors, to make the
 105       # error messages more informative.
 106       ######################################################################
 107       MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
 108
 109       def initialize( source )
 110         self.stream = source
 111       end
 112
 113       def add_listener( listener )
 114         if !defined?(@listeners) or !@listeners
 115           @listeners = []
 116           instance_eval <<-EOL
 117             alias :_old_pull :pull
 118             def pull
 119               event = _old_pull
 120               @listeners.each do |listener|
 121                 listener.receive event
 122               end
 123               event
 124             end
 125           EOL
 126         end
 127         @listeners << listener
 128       end
 129
 130       attr_reader :source
 131
 132       def stream=( source )
 133         @source = SourceFactory.create_from( source )
 134         @closed = nil
 135         @document_status = nil
 136         @tags = []
 137         @stack = []
 138         @entities = []
 139         @nsstack = []
 140       end
 141
 142       def position
 143         if @source.respond_to? :position
 144           @source.position
 145         else
 146           # FIXME
 147           0
 148         end
 149       end
 150
 151       # Returns true if there are no more events
 152       def empty?
 153         return (@source.empty? and @stack.empty?)
 154       end
 155
 156       # Returns true if there are more events.  Synonymous with !empty?
 157       def has_next?
 158         return !(@source.empty? and @stack.empty?)
 159       end
 160
 161       # Push an event back on the head of the stream.  This method
 162       # has (theoretically) infinite depth.
 163       def unshift token
 164         @stack.unshift(token)
 165       end
 166
 167       # Peek at the +depth+ event in the stack.  The first element on the stack
 168       # is at depth 0.  If +depth+ is -1, will parse to the end of the input
 169       # stream and return the last event, which is always :end_document.
 170       # Be aware that this causes the stream to be parsed up to the +depth+
 171       # event, so you can effectively pre-parse the entire document (pull the
 172       # entire thing into memory) using this method.
 173       def peek depth=0
 174         raise %Q[Illegal argument "#{depth}"] if depth < -1
 175         temp = []
 176         if depth == -1
 177           temp.push(pull()) until empty?
 178         else
 179           while @stack.size+temp.size < depth+1
 180             temp.push(pull())
 181           end
 182         end
 183         @stack += temp if temp.size > 0
 184         @stack[depth]
 185       end
 186
      # Returns the next event.  In this class every event is a plain Array
      # whose first element is the event type Symbol (+:start_element+,
      # +:end_element+, +:text+, +:comment+, +:cdata+, +:xmldecl+,
      # +:processing_instruction+, +:start_doctype+, +:end_doctype+,
      # +:entitydecl+, +:attlistdecl+, +:elementdecl+, +:notationdecl+,
      # +:externalentity+, +:end_document+) followed by that event's data.
      #
      # Parsing goes through three phases tracked in @document_status:
      # +nil+ (prolog, before any doctype or content), +:in_doctype+ (inside
      # a doctype's bracketed internal subset) and +:after_doctype+
      # (ordinary content).
      #
      # Raises REXML::ParseException for malformed input and
      # REXML::UndefinedNamespaceException for an undeclared prefix.  Any
      # other exception raised while parsing content is wrapped in a
      # REXML::ParseException.
      def pull
        # A self-closing tag was reported as :start_element on the previous
        # call; deliver its matching :end_element now.
        if @closed
          x, @closed = @closed, nil
          return [ :end_element, x ]
        end
        return [ :end_document ] if empty?
        # Events that were pushed back (unshift/peek) are served first.
        return @stack.shift if @stack.size > 0
        #STDERR.puts @source.encoding
        @source.read if @source.buffer.size<2
        #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
        if @document_status == nil
          # Prolog: look (without consuming) at the leading whitespace run or
          # the first complete <...> construct to decide what comes next.
          #@source.consume( /^\s*/um )
          word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
          word = word[1] unless word.nil?
          #STDERR.puts "WORD = #{word.inspect}"
          case word
          when COMMENT_START
            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
          when XMLDECL_START
            #STDERR.puts "XMLDECL"
            results = @source.match( XMLDECL_PATTERN, true )[1]
            version = VERSION.match( results )
            version = version[1] unless version.nil?
            encoding = ENCODING.match(results)
            encoding = encoding[1] unless encoding.nil?
            # Hand the declared encoding (possibly nil) to the source.
            @source.encoding = encoding
            standalone = STANDALONE.match(results)
            standalone = standalone[1] unless standalone.nil?
            return [ :xmldecl, version, encoding, standalone ]
          when INSTRUCTION_START
            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
          when DOCTYPE_START
            md = @source.match( DOCTYPE_PATTERN, true )
            # Namespace scope for the doctype; ATTLIST handling below adds
            # prefixes from xmlns:* attribute definitions to @nsstack[0].
            @nsstack.unshift(curr_ns=Set.new)
            identity = md[1]
            close = md[2]
            # IDENTITY groups: $1 name, $2 second word, $4 first quoted
            # literal, $6 second quoted literal.
            identity =~ IDENTITY
            name = $1
            raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
            pub_sys = $2.nil? ? nil : $2.strip
            long_name = $4.nil? ? nil : $4.strip
            uri = $6.nil? ? nil : $6.strip
            args = [ :start_doctype, name, pub_sys, long_name, uri ]
            if close == ">"
              # No internal subset: queue :end_doctype so it is returned by
              # the next call.
              @document_status = :after_doctype
              @source.read if @source.buffer.size<2
              md = @source.match(/^\s*/um, true)
              @stack << [ :end_doctype ]
            else
              # The declaration ended with "[": an internal subset follows.
              @document_status = :in_doctype
            end
            return args
          when /^\s+/
            # Leading whitespace only: no event is produced here and the
            # status stays nil; control falls through to the code below.
          else
            # Anything else ends the prolog.
            @document_status = :after_doctype
            @source.read if @source.buffer.size<2
            md = @source.match(/\s*/um, true)
          end
        end
        if @document_status == :in_doctype
          # Peek at everything up to the next ">" to classify the declaration.
          md = @source.match(/\s*(.*?>)/um)
          case md[1]
          when SYSTEMENTITY
            match = @source.match( SYSTEMENTITY, true )[1]
            return [ :externalentity, match ]

          when ELEMENTDECL_START
            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

          when ENTITY_START
            # compact drops the groups that did not participate in the match,
            # so the remaining positions depend on the declaration's form.
            match = @source.match( ENTITYDECL, true ).to_a.compact
            match[0] = :entitydecl
            ref = false
            if match[1] == '%'
              # Parameter entity: remember that and remove the "%" marker.
              ref = true
              match.delete_at 1
            end
            # Now we have to sort out what kind of entity reference this is
            if match[2] == 'SYSTEM'
              # External reference
              match[3] = match[3][1..-2] # PUBID
              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
            elsif match[2] == 'PUBLIC'
              # External reference
              match[3] = match[3][1..-2] # PUBID
              match[4] = match[4][1..-2] # HREF
              # match is [ :entity, name, PUBLIC, pubid, href ]
            else
              # Internal entity: strip the surrounding quotes from the value.
              match[2] = match[2][1..-2]
              match.pop if match.size == 4
              # match is [ :entity, name, value ]
            end
            match << '%' if ref
            return match
          when ATTLISTDECL_START
            md = @source.match( ATTLISTDECL_PATTERN, true )
            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
            element = md[1]
            contents = md[0]

            # Collect attribute name => default value for every definition
            # that is not #IMPLIED.
            pairs = {}
            values = md[0].scan( ATTDEF_RE )
            values.each do |attdef|
              unless attdef[3] == "#IMPLIED"
                attdef.compact!
                val = attdef[3]
                val = attdef[4] if val == "#FIXED "
                pairs[attdef[0]] = val
                # An xmlns:prefix attribute definition makes that prefix
                # known to the namespace check done for start tags.
                if attdef[0] =~ /^xmlns:(.*)/
                  @nsstack[0] << $1
                end
              end
            end
            return [ :attlistdecl, element, pairs, contents ]
          when NOTATIONDECL_START
            md = nil
            if @source.match( PUBLIC )
              md = @source.match( PUBLIC, true )
              vals = [md[1],md[2],md[4],md[6]]
            elsif @source.match( SYSTEM )
              md = @source.match( SYSTEM, true )
              vals = [md[1],md[2],nil,md[4]]
            else
              raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
            end
            return [ :notationdecl, *vals ]
          when CDATA_END
            # Despite its name, CDATA_END matches the "]>" that closes the
            # doctype's internal subset.
            @document_status = :after_doctype
            @source.match( CDATA_END, true )
            return [ :end_doctype ]
          end
        end
        begin
          if @source.buffer[0] == ?<
            if @source.buffer[1] == ?/
              # Closing tag: leave the element's namespace scope and check
              # the name against the most recently opened tag.
              @nsstack.shift
              last_tag = @tags.pop
              #md = @source.match_to_consume( '>', CLOSE_MATCH)
              md = @source.match( CLOSE_MATCH, true )
              raise REXML::ParseException.new( "Missing end tag for "+
                "'#{last_tag}' (got \"#{md[1]}\")",
                @source) unless last_tag == md[1]
              return [ :end_element, last_tag ]
            elsif @source.buffer[1] == ?!
              # "<!" in content is either a comment or a CDATA section.
              md = @source.match(/\A(\s*[^>]*>)/um)
              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
              raise REXML::ParseException.new("Malformed node", @source) unless md
              if md[0][2] == ?-
                md = @source.match( COMMENT_PATTERN, true )
                return [ :comment, md[1] ] if md
              else
                md = @source.match( CDATA_PATTERN, true )
                return [ :cdata, md[1] ] if md
              end
              raise REXML::ParseException.new( "Declarations can only occur "+
                "in the doctype declaration.", @source)
            elsif @source.buffer[1] == ??
              md = @source.match( INSTRUCTION_PATTERN, true )
              return [ :processing_instruction, md[1], md[2] ] if md
              raise REXML::ParseException.new( "Bad instruction declaration",
                @source)
            else
              # Get the next tag
              md = @source.match(TAG_MATCH, true)
              unless md
                # Check for missing attribute quotes
                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
                raise REXML::ParseException.new("malformed XML: missing tag start", @source)
              end
              # TAG_MATCH groups: md[1] qualified name, md[2] prefix,
              # md[4] raw attribute text, md[6] "/" of a self-closing tag.
              attributes = {}
              prefixes = Set.new
              prefixes << md[2] if md[2]
              @nsstack.unshift(curr_ns=Set.new)
              if md[4].size > 0
                attrs = md[4].scan( ATTRIBUTE_PATTERN )
                # $' is whatever follows the last attribute matched by scan;
                # anything but whitespace there means unparseable attributes.
                raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
                # Block arguments: a = qualified name, b = prefix,
                # c = local name, d = quote character, e = value.
                attrs.each { |a,b,c,d,e|
                  if b == "xmlns"
                    if c == "xml"
                      # NOTE(review): d holds the quote character captured by
                      # ATTRIBUTE_PATTERN, not the attribute value (that is
                      # e), so this comparison looks like it can never be
                      # equal - confirm the intended variable.
                      if d != "http://www.w3.org/XML/1998/namespace"
                        msg = "The 'xml' prefix must not be bound to any other namespace "+
                        "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                        raise REXML::ParseException.new( msg, @source, self )
                      end
                    elsif c == "xmlns"
                      msg = "The 'xmlns' prefix must not be declared "+
                      "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                      raise REXML::ParseException.new( msg, @source, self)
                    end
                    # Record the prefix declared by this xmlns:prefix attribute.
                    curr_ns << c
                  elsif b
                    # A prefixed attribute uses its prefix; "xml" is exempt
                    # from the declaration check.
                    prefixes << b unless b == "xml"
                  end
                  attributes[a] = e
                }
              end

              # Verify that all of the prefixes have been defined
              for prefix in prefixes
                unless @nsstack.find{|k| k.member?(prefix)}
                  raise UndefinedNamespaceException.new(prefix,@source,self)
                end
              end

              if md[6]
                # Self-closing tag: remember the name so the next call
                # returns :end_element, and drop its namespace scope now.
                @closed = md[1]
                @nsstack.shift
              else
                @tags.push( md[1] )
              end
              return [ :start_element, md[1], attributes ]
            end
          else
            # Character data up to the next "<".
            md = @source.match( TEXT_PATTERN, true )
            if md[0].length == 0
              @source.match( /(\s+)/, true )
            end
            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
            #return [ :text, "" ] if md[0].length == 0
            # unnormalized = Text::unnormalize( md[1], self )
            # return PullEvent.new( :text, md[1], unnormalized )
            return [ :text, md[1] ]
          end
        rescue REXML::UndefinedNamespaceException
          raise
        rescue REXML::ParseException
          raise
        rescue Exception, NameError => error
          # Any other failure is reported as a ParseException that carries
          # the original error.
          raise REXML::ParseException.new( "Exception parsing",
            @source, self, (error ? error : $!) )
        end
        # NOTE(review): every branch above returns or raises, so this line
        # appears to be unreachable.
        return [ :dummy ]
      end
 422
 423       def entity( reference, entities )
 424         value = nil
 425         value = entities[ reference ] if entities
 426         if not value
 427           value = DEFAULT_ENTITIES[ reference ]
 428           value = value[2] if value
 429         end
 430         unnormalize( value, entities ) if value
 431       end
 432
 433       # Escapes all possible entities
 434       def normalize( input, entities=nil, entity_filter=nil )
 435         copy = input.clone
 436         # Doing it like this rather than in a loop improves the speed
 437         copy.gsub!( EREFERENCE, '&amp;' )
 438         entities.each do |key, value|
 439           copy.gsub!( value, "&#{key};" ) unless entity_filter and
 440                                       entity_filter.include?(entity)
 441         end if entities
 442         copy.gsub!( EREFERENCE, '&amp;' )
 443         DEFAULT_ENTITIES.each do |key, value|
 444           copy.gsub!( value[3], value[1] )
 445         end
 446         copy
 447       end
 448
 449       # Unescapes all possible entities
 450       def unnormalize( string, entities=nil, filter=nil )
 451         rv = string.clone
 452         rv.gsub!( /\r\n?/, "\n" )
 453         matches = rv.scan( REFERENCE_RE )
 454         return rv if matches.size == 0
 455         rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m|
 456           m=$1
 457           m = "0#{m}" if m[0] == ?x
 458           [Integer(m)].pack('U*')
 459         }
 460         matches.collect!{|x|x[0]}.compact!
 461         if matches.size > 0
 462           matches.each do |entity_reference|
 463             unless filter and filter.include?(entity_reference)
 464               entity_value = entity( entity_reference, entities )
 465               if entity_value
 466                 re = /&#{entity_reference};/
 467                 rv.gsub!( re, entity_value )
 468               end
 469             end
 470           end
 471           matches.each do |entity_reference|
 472             unless filter and filter.include?(entity_reference)
 473               er = DEFAULT_ENTITIES[entity_reference]
 474               rv.gsub!( er[0], er[2] ) if er
 475             end
 476           end
 477           rv.gsub!( /&amp;/, '&' )
 478         end
 479         rv
 480       end
 481     end
 482   end
 483 end
 484
 485 =begin
 486   case event[0]
 487   when :start_element
 488   when :text
 489   when :end_element
 490   when :processing_instruction
 491   when :cdata
 492   when :comment
 493   when :xmldecl
 494   when :start_doctype
 495   when :end_doctype
 496   when :externalentity
 497   when :elementdecl
 498   when :entity
 499   when :attlistdecl
 500   when :notationdecl
 501   when :end_doctype
 502   end
 503 =end