lib/rexml/text.rb

   1 require 'rexml/entity'
   2 require 'rexml/doctype'
   3 require 'rexml/child'
   4 require 'rexml/doctype'
   5 require 'rexml/parseexception'
   6
   7 module REXML
   8   # Represents text nodes in an XML document
   9   class Text < Child
  10     include Comparable
  11     # Patterns that are escaped on output, in the order the substitutions occur:
         # each match of SPECIALS[i] is replaced by SUBSTITUTES[i]
         # (see write_with_substitution).
  12     SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
  13     SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
  14     # The reverse direction, used by Text::read_with_substitution: each match of
         # SETUTITSBUS[i] is replaced by the literal character SLAICEPS[i].
  15     SLAICEPS = [ '<', '>', '"', "'", '&' ]
  16     SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
  17
  18     # If +raw+ is true, then REXML leaves the value alone
  19     attr_accessor :raw
  20
         # Default +illegal+ pattern of the constructor: matches a '<', or an '&'
         # that is not followed by an Entity::NAME match or by a numeric character
         # reference.
  21     ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
         # Matches a decimal or hexadecimal numeric character reference.  Group 1
         # captures the number after any leading zeros (for hexadecimal, including
         # the leading 'x').  Used by Text::unnormalize.
  22     NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
  23
  24     # Constructor
  25     # +arg+ if a String, the content is set to the String.  If a Text,
  26     # the object is shallowly cloned.
  27     #
  28     # +respect_whitespace+ (boolean, false) if true, whitespace is
  29     # respected
  30     #
  31     # +parent+ (nil) if this is a Parent object, the parent
  32     # will be set to this.
  33     #
  34     # +raw+ (nil) This argument can be given three values.
  35     # If true, then the value used to construct this object is expected to
  36     # contain no unescaped XML markup, and REXML will not change the text. If
  37     # this value is false, the string may contain any characters, and REXML will
  38     # escape any and all defined entities whose values are contained in the
  39     # text.  If this value is nil (the default), then the raw value of the
  40     # parent will be used as the raw value for this node.  If there is no raw
  41     # value for the parent, and no value is supplied, the default is false.
  42     # Use this field if you have entities defined for some text, and you don't
  43     # want REXML to escape that text in output.
  44     #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
  45     #   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
  46     #   Text.new( "<&", false, nil, true )  #-> Parse exception
  47     #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
  48     #   # Assume that the entity "s" is defined to be "sean"
  49     #   # and that the entity    "r" is defined to be "russell"
  50     #   Text.new( "sean russell" )          #-> "&s; &r;"
  51     #   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
  52     #
  53     # +entity_filter+ (nil) This can be an array of entities to match in the
  54     # supplied text.  This argument is only useful if +raw+ is set to false.
  55     #   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
  56     #   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
  57     # In the last example, the +entity_filter+ argument is ignored.
  58     #
  59     # +illegal+ INTERNAL USE ONLY
  60     def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
  61       entity_filter=nil, illegal=ILLEGAL )
  62
  63       @raw = false
  64
  65       if parent
  66         super( parent )
  67         @raw = parent.raw
  68       else
  69         @parent = nil
  70       end
  71
  72       @raw = raw unless raw.nil?
  73       @entity_filter = entity_filter
  74       @normalized = @unnormalized = nil
  75
  76       if arg.kind_of? String
  77         @string = arg.clone
  78         @string.squeeze!(" \n\t") unless respect_whitespace
  79       elsif arg.kind_of? Text
  80         @string = arg.to_s
  81         @raw = arg.raw
  82       elsif
  83         raise "Illegal argument of type #{arg.type} for Text constructor (#{arg})"
  84       end
  85
  86       @string.gsub!( /\r\n?/, "\n" )
  87
  88       # check for illegal characters
  89       if @raw
  90         if @string =~ illegal
  91           raise "Illegal character '#{$1}' in raw string \"#{@string}\""
  92         end
  93       end
  94     end
  95
  96     def node_type
  97       :text
  98     end
  99
 100     def empty?
 101       @string.size==0
 102     end
 103
 104
 105     def clone
 106       return Text.new(self)
 107     end
 108
 109
 110     # Appends text to this text node.  The text is appended in the +raw+ mode
 111     # of this text node.
 112     def <<( to_append )
 113       @string << to_append.gsub( /\r\n?/, "\n" )
 114     end
 115
 116
 117     # +other+ a String or a Text
 118     # +returns+ the result of (to_s <=> arg.to_s)
 119     def <=>( other )
 120       to_s() <=> other.to_s
 121     end
 122
 123     REFERENCE = /#{Entity::REFERENCE}/ # Regexp built from Entity::REFERENCE; Text::unnormalize scans strings with it
 124     # Returns the string value of this text node.  This string is always
 125     # escaped, meaning that it is a valid XML text node string, and all
 126     # entities that can be escaped, have been inserted.  This method respects
 127     # the entity filter set in the constructor.
 128     #
 129     #   # Assume that the entity "s" is defined to be "sean", and that the
 130     #   # entity "r" is defined to be "russell"
 131     #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
 132     #   t.to_s   #-> "&lt; &amp; &s; russell"
 133     #   t = Text.new( "< & &s; russell", false, nil, false )
 134     #   t.to_s   #-> "&lt; &amp; &s; russell"
 135     #   u = Text.new( "sean russell", false, nil, true )
 136     #   u.to_s   #-> "sean russell"
 137     def to_s
 138       return @string if @raw
 139       return @normalized if @normalized
 140
 141       doctype = nil
 142       if @parent
 143         doc = @parent.document
 144         doctype = doc.doctype if doc
 145       end
 146
 147       @normalized = Text::normalize( @string, doctype, @entity_filter )
 148     end
 149
 150     def inspect
 151       @string.inspect
 152     end
 153
 154     # Returns the string value of this text.  This is the text without
 155     # entities, as it might be used programmatically, or printed to the
 156     # console.  This ignores the 'raw' attribute setting, and any
 157     # entity_filter.
 158     #
 159     #   # Assume that the entity "s" is defined to be "sean", and that the
 160     #   # entity "r" is defined to be "russell"
 161     #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
 162     #   t.value   #-> "< & sean russell"
 163     #   t = Text.new( "< & &s; russell", false, nil, false )
 164     #   t.value   #-> "< & sean russell"
 165     #   u = Text.new( "sean russell", false, nil, true )
 166     #   u.value   #-> "sean russell"
 167     def value
 168       @unnormalized if @unnormalized
 169       doctype = nil
 170       if @parent
 171         doc = @parent.document
 172         doctype = doc.doctype if doc
 173       end
 174       @unnormalized = Text::unnormalize( @string, doctype )
 175     end
 176
 177     # Sets the contents of this text node.  This expects the text to be
 178     # unnormalized.  The cached forms are cleared and +raw+ is reset to false.
 179     #
 180     #   e = Element.new( "a" )
 181     #   e.add_text( "foo" )   # <a>foo</a>
 182     #   e[0].value = "bar"    # <a>bar</a>
 183     #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
 184     def value=( val )
 185       @string = val.gsub( /\r\n?/, "\n" )
 186       @unnormalized = nil
 187       @normalized = nil
 188       @raw = false
 189     end
 190
 191      def wrap(string, width, addnewline=false)
 192        # Recursivly wrap string at width.
 193        return string if string.length <= width
 194        place = string.rindex(' ', width) # Position in string with last ' ' before cutoff
 195        if addnewline then
 196          return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
 197        else
 198          return string[0,place] + "\n" + wrap(string[place+1..-1], width)
 199        end
 200      end
 201
 202     def indent_text(string, level=1, style="\t", indentfirstline=true)
 203       return string if level < 0
 204       new_string = ''
 205       string.each { |line|
 206         indent_string = style * level
 207         new_line = (indent_string + line).sub(/[\s]+$/,'')
 208         new_string << new_line
 209       }
 210       new_string.strip! unless indentfirstline
 211       return new_string
 212     end
 213
 214     # == DEPRECATED
 215     # See REXML::Formatters
 216     #
 217     def write( writer, indent=-1, transitive=false, ie_hack=false )
 218       Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
 219       formatter = if indent > -1
 220           REXML::Formatters::Pretty.new( indent )
 221         else
 222           REXML::Formatters::Default.new
 223         end
 224       formatter.write( self, writer )
 225     end
 226
 227     # FIXME
 228     # This probably won't work properly
 229     def xpath
 230       path = @parent.xpath
 231       path += "/text()"
 232       return path
 233     end
 234
 235     # Writes out text, substituting special characters beforehand.
 236     # +out+ A String, IO, or any other object supporting <<( String )
 237     # +input+ the text to substitute and then write out
 238     #
 239     #   z=utf8.unpack("U*")
 240     #   ascOut=""
 241     #   z.each{|r|
 242     #     if r <  0x100
 243     #       ascOut.concat(r.chr)
 244     #     else
 245     #       ascOut.concat(sprintf("&#x%x;", r))
 246     #     end
 247     #   }
 248     #   puts ascOut
 249     def write_with_substitution out, input
 250       copy = input.clone
 251       # Doing it like this rather than in a loop improves the speed
 252       copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
 253       copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
 254       copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
 255       copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
 256       copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
 257       copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
 258       out << copy
 259     end
 260
 261     # Reads text, substituting entities
 262     def Text::read_with_substitution( input, illegal=nil )
 263       copy = input.clone
 264
 265       if copy =~ illegal
 266         raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
 267       end if illegal
 268
 269       copy.gsub!( /\r\n?/, "\n" )
 270       if copy.include? ?&
 271         copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
 272         copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
 273         copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
 274         copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
 275         copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
 276         copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {|m|
 277           m=$1
 278           #m='0' if m==''
 279           m = "0#{m}" if m[0] == ?x
 280           [Integer(m)].pack('U*')
 281         }
 282       end
 283       copy
 284     end
 285
 286     EREFERENCE = /&(?!#{Entity::NAME};)/ # an '&' not followed by an Entity::NAME match and ';'
 287     # Escapes all possible entities
 288     def Text::normalize( input, doctype=nil, entity_filter=nil )
 289       copy = input
 290       # Doing it like this rather than in a loop improves the speed
 291       #copy = copy.gsub( EREFERENCE, '&amp;' )
 292       copy = copy.gsub( "&", "&amp;" )
 293       if doctype
 294         # Replace all ampersands that aren't part of an entity
 295         doctype.entities.each_value do |entity|
 296           copy = copy.gsub( entity.value,
 297             "&#{entity.name};" ) if entity.value and
 298               not( entity_filter and entity_filter.include?(entity) )
 299         end
 300       else
 301         # Replace all ampersands that aren't part of an entity
 302         DocType::DEFAULT_ENTITIES.each_value do |entity|
 303           copy = copy.gsub(entity.value, "&#{entity.name};" )
 304         end
 305       end
 306       copy
 307     end
 308
 309     # Unescapes all possible entities
 310     def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
 311       rv = string.clone
 312       rv.gsub!( /\r\n?/, "\n" )
 313       matches = rv.scan( REFERENCE )
 314       return rv if matches.size == 0
 315       rv.gsub!( NUMERICENTITY ) {|m|
 316         m=$1
 317         m = "0#{m}" if m[0] == ?x
 318         [Integer(m)].pack('U*')
 319       }
 320       matches.collect!{|x|x[0]}.compact!
 321       if matches.size > 0
 322         if doctype
 323           matches.each do |entity_reference|
 324             unless filter and filter.include?(entity_reference)
 325               entity_value = doctype.entity( entity_reference )
 326               re = /&#{entity_reference};/
 327               rv.gsub!( re, entity_value ) if entity_value
 328             end
 329           end
 330         else
 331           matches.each do |entity_reference|
 332             unless filter and filter.include?(entity_reference)
 333               entity_value = DocType::DEFAULT_ENTITIES[ entity_reference ]
 334               re = /&#{entity_reference};/
 335               rv.gsub!( re, entity_value.value ) if entity_value
 336             end
 337           end
 338         end
 339         rv.gsub!( /&amp;/, '&' )
 340       end
 341       rv
 342     end
 343   end
 344 end