Re-enable spec/library for full CI runs.
[rbx.git] / lib / uri / common.rb
blob256e62a90378a2b269f27398e122ac0fda72ada5
1 # = uri/common.rb
3 # Author:: Akira Yamada <akira@ruby-lang.org>
4 # Revision:: $Id: common.rb 14565 2007-12-24 01:51:49Z drbrain $
5 # License:: 
6 #   You can redistribute it and/or modify it under the same term as Ruby.
9 module URI
10   module REGEXP
11     #
12     # Patterns used to parse URI's
13     #
14     module PATTERN
15       # :stopdoc:
17       # RFC 2396 (URI Generic Syntax)
18       # RFC 2732 (IPv6 Literal Addresses in URL's)
19       # RFC 2373 (IPv6 Addressing Architecture)
21       # alpha         = lowalpha | upalpha
22       ALPHA = "a-zA-Z"
23       # alphanum      = alpha | digit
24       ALNUM = "#{ALPHA}\\d"
26       # hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
27       #                         "a" | "b" | "c" | "d" | "e" | "f"
28       HEX     = "a-fA-F\\d"
29       # escaped       = "%" hex hex
30       ESCAPED = "%[#{HEX}]{2}"
31       # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
32       #                 "(" | ")"
33       # unreserved    = alphanum | mark
34       UNRESERVED = "-_.!~*'()#{ALNUM}"
35       # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
36       #                 "$" | ","
37       # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 
38       #                 "$" | "," | "[" | "]" (RFC 2732)
39       RESERVED = ";/?:@&=+$,\\[\\]"
41       # uric          = reserved | unreserved | escaped
42       URIC = "(?:[#{UNRESERVED}#{RESERVED}]|#{ESCAPED})"
43       # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
44       #                 "&" | "=" | "+" | "$" | ","
45       URIC_NO_SLASH = "(?:[#{UNRESERVED};?:@&=+$,]|#{ESCAPED})"
46       # query         = *uric
47       QUERY = "#{URIC}*"
48       # fragment      = *uric
49       FRAGMENT = "#{URIC}*"
51       # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
52       DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
53       # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
54       TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
55       # hostname      = *( domainlabel "." ) toplabel [ "." ]
56       HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
58       # RFC 2373, APPENDIX B:
59       # IPv6address = hexpart [ ":" IPv4address ]
60       # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
61       # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
62       # hexseq  = hex4 *( ":" hex4)
63       # hex4    = 1*4HEXDIG
64       #
65       # XXX: This definition has a flaw. "::" + IPv4address must be
66       # allowed too.  Here is a replacement.
67       #
68       # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
69       IPV4ADDR = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
70       # hex4     = 1*4HEXDIG
71       HEX4 = "[#{HEX}]{1,4}"
72       # lastpart = hex4 | IPv4address
73       LASTPART = "(?:#{HEX4}|#{IPV4ADDR})"
74       # hexseq1  = *( hex4 ":" ) hex4
75       HEXSEQ1 = "(?:#{HEX4}:)*#{HEX4}"
76       # hexseq2  = *( hex4 ":" ) lastpart
77       HEXSEQ2 = "(?:#{HEX4}:)*#{LASTPART}"
78       # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
79       IPV6ADDR = "(?:#{HEXSEQ2}|(?:#{HEXSEQ1})?::(?:#{HEXSEQ2})?)"
81       # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
82       # unused
84       # ipv6reference = "[" IPv6address "]" (RFC 2732)
85       IPV6REF = "\\[#{IPV6ADDR}\\]"
87       # host          = hostname | IPv4address
88       # host          = hostname | IPv4address | IPv6reference (RFC 2732)
89       HOST = "(?:#{HOSTNAME}|#{IPV4ADDR}|#{IPV6REF})"
90       # port          = *digit
91       PORT = '\d*'
92       # hostport      = host [ ":" port ]
93       HOSTPORT = "#{HOST}(?::#{PORT})?"
95       # userinfo      = *( unreserved | escaped |
96       #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
97       USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})*"
99       # pchar         = unreserved | escaped |
100       #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
101       PCHAR = "(?:[#{UNRESERVED}:@&=+$,]|#{ESCAPED})"
102       # param         = *pchar
103       PARAM = "#{PCHAR}*"
104       # segment       = *pchar *( ";" param )
105       SEGMENT = "#{PCHAR}*(?:;#{PARAM})*"
106       # path_segments = segment *( "/" segment )
107       PATH_SEGMENTS = "#{SEGMENT}(?:/#{SEGMENT})*"
109       # server        = [ [ userinfo "@" ] hostport ]
110       SERVER = "(?:#{USERINFO}@)?#{HOSTPORT}"
111       # reg_name      = 1*( unreserved | escaped | "$" | "," |
112       #                     ";" | ":" | "@" | "&" | "=" | "+" )
113       REG_NAME = "(?:[#{UNRESERVED}$,;:@&=+]|#{ESCAPED})+"
114       # authority     = server | reg_name
115       AUTHORITY = "(?:#{SERVER}|#{REG_NAME})"
117       # rel_segment   = 1*( unreserved | escaped |
118       #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
119       REL_SEGMENT = "(?:[#{UNRESERVED};@&=+$,]|#{ESCAPED})+"
121       # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
122       SCHEME = "[#{ALPHA}][-+.#{ALPHA}\\d]*"
124       # abs_path      = "/"  path_segments
125       ABS_PATH = "/#{PATH_SEGMENTS}"
126       # rel_path      = rel_segment [ abs_path ]
127       REL_PATH = "#{REL_SEGMENT}(?:#{ABS_PATH})?"
128       # net_path      = "//" authority [ abs_path ]
129       NET_PATH   = "//#{AUTHORITY}(?:#{ABS_PATH})?"
131       # hier_part     = ( net_path | abs_path ) [ "?" query ]
132       HIER_PART   = "(?:#{NET_PATH}|#{ABS_PATH})(?:\\?(?:#{QUERY}))?"
133       # opaque_part   = uric_no_slash *uric
134       OPAQUE_PART = "#{URIC_NO_SLASH}#{URIC}*"
136       # absoluteURI   = scheme ":" ( hier_part | opaque_part )
137       ABS_URI   = "#{SCHEME}:(?:#{HIER_PART}|#{OPAQUE_PART})"
138       # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
139       REL_URI = "(?:#{NET_PATH}|#{ABS_PATH}|#{REL_PATH})(?:\\?#{QUERY})?"
141       # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
142       URI_REF = "(?:#{ABS_URI}|#{REL_URI})?(?:##{FRAGMENT})?"
144       # XXX:
145       X_ABS_URI = "
146         (#{PATTERN::SCHEME}):                     (?# 1: scheme)
147         (?:
148            (#{PATTERN::OPAQUE_PART})              (?# 2: opaque)
149         |
150            (?:(?:
151              //(?:
152                  (?:(?:(#{PATTERN::USERINFO})@)?  (?# 3: userinfo)
153                    (?:(#{PATTERN::HOST})(?::(\\d*))?))?(?# 4: host, 5: port)
154                |
155                  (#{PATTERN::REG_NAME})           (?# 6: registry)
156                )
157              |
158              (?!//))                              (?# XXX: '//' is the mark for hostport)
159              (#{PATTERN::ABS_PATH})?              (?# 7: path)
160            )(?:\\?(#{PATTERN::QUERY}))?           (?# 8: query)
161         )
162         (?:\\#(#{PATTERN::FRAGMENT}))?            (?# 9: fragment)
163       "
164       X_REL_URI = "
165         (?:
166           (?:
167             //
168             (?:
169               (?:(#{PATTERN::USERINFO})@)?       (?# 1: userinfo)
170                 (#{PATTERN::HOST})?(?::(\\d*))?  (?# 2: host, 3: port)
171             |
172               (#{PATTERN::REG_NAME})             (?# 4: registry)
173             )
174           )
175         |
176           (#{PATTERN::REL_SEGMENT})              (?# 5: rel_segment)
177         )?
178         (#{PATTERN::ABS_PATH})?                  (?# 6: abs_path)
179         (?:\\?(#{PATTERN::QUERY}))?              (?# 7: query)
180         (?:\\#(#{PATTERN::FRAGMENT}))?           (?# 8: fragment)
181       "
182       # :startdoc:
183     end # PATTERN
185     # :stopdoc:
187     # for URI::split
188     ABS_URI = Regexp.new('^' + PATTERN::X_ABS_URI + '$', #'
189                          Regexp::EXTENDED).freeze
190     REL_URI = Regexp.new('^' + PATTERN::X_REL_URI + '$', #'
191                          Regexp::EXTENDED).freeze
193     # for URI::extract
194     URI_REF     = Regexp.new(PATTERN::URI_REF).freeze
195     ABS_URI_REF = Regexp.new(PATTERN::X_ABS_URI, Regexp::EXTENDED).freeze
196     REL_URI_REF = Regexp.new(PATTERN::X_REL_URI, Regexp::EXTENDED).freeze
198     # for URI::escape/unescape
199     ESCAPED = Regexp.new(PATTERN::ESCAPED).freeze
200     UNSAFE  = Regexp.new("[^#{PATTERN::UNRESERVED}#{PATTERN::RESERVED}]").freeze
202     # for Generic#initialize
203     SCHEME   = Regexp.new("^#{PATTERN::SCHEME}$").freeze #"
204     USERINFO = Regexp.new("^#{PATTERN::USERINFO}$").freeze #"
205     HOST     = Regexp.new("^#{PATTERN::HOST}$").freeze #"
206     PORT     = Regexp.new("^#{PATTERN::PORT}$").freeze #"
207     OPAQUE   = Regexp.new("^#{PATTERN::OPAQUE_PART}$").freeze #"
208     REGISTRY = Regexp.new("^#{PATTERN::REG_NAME}$").freeze #"
209     ABS_PATH = Regexp.new("^#{PATTERN::ABS_PATH}$").freeze #"
210     REL_PATH = Regexp.new("^#{PATTERN::REL_PATH}$").freeze #"
211     QUERY    = Regexp.new("^#{PATTERN::QUERY}$").freeze #"
212     FRAGMENT = Regexp.new("^#{PATTERN::FRAGMENT}$").freeze #"
213     # :startdoc:
214   end # REGEXP
216   module Util # :nodoc:
217     def make_components_hash(klass, array_hash)
218       tmp = {}
219       if array_hash.kind_of?(Array) &&
220           array_hash.size == klass.component.size - 1
221         klass.component[1..-1].each_index do |i|
222           begin
223             tmp[klass.component[i + 1]] = array_hash[i].clone
224           rescue TypeError
225             tmp[klass.component[i + 1]] = array_hash[i]
226           end
227         end
229       elsif array_hash.kind_of?(Hash)
230         array_hash.each do |key, value|
231           begin
232             tmp[key] = value.clone
233           rescue TypeError
234             tmp[key] = value
235           end
236         end
237       else
238         raise ArgumentError, 
239           "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
240       end
241       tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
243       return tmp
244     end
245     module_function :make_components_hash
246   end
248   module Escape
249     include REGEXP
251     #
252     # == Synopsis
253     #
254     #   URI.escape(str [, unsafe])
255     #
256     # == Args
257     #
258     # +str+::
259     #   String to replaces in.
260     # +unsafe+::
261     #   Regexp that matches all symbols that must be replaced with codes.
262     #   By default uses <tt>REGEXP::UNSAFE</tt>.
263     #   When this argument is a String, it represents a character set.
264     #
265     # == Description
266     #
267     # Escapes the string, replacing all unsafe characters with codes.
268     #
269     # == Usage
270     #
271     #   require 'uri'
272     #
273     #   enc_uri = URI.escape("http://example.com/?a=\11\15")
274     #   p enc_uri
275     #   # => "http://example.com/?a=%09%0D"
276     #
277     #   p URI.unescape(enc_uri)
278     #   # => "http://example.com/?a=\t\r"
279     #
280     #   p URI.escape("@?@!", "!?")
281     #   # => "@%3F@%21"
282     #
283     def escape(str, unsafe = UNSAFE)
284       unless unsafe.kind_of?(Regexp)
285         # perhaps unsafe is String object
286         unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false, 'N')
287       end
288       str.gsub(unsafe) do |us|
289         tmp = ''
290         us.each_byte do |uc|
291           tmp << sprintf('%%%02X', uc)
292         end
293         tmp
294       end
295     end
296     alias encode escape
297     #
298     # == Synopsis
299     #
300     #   URI.unescape(str)
301     #
302     # == Args
303     #
304     # +str+::
305     #   Unescapes the string.
306     #
307     # == Usage
308     #
309     #   require 'uri'
310     #
311     #   enc_uri = URI.escape("http://example.com/?a=\11\15")
312     #   p enc_uri
313     #   # => "http://example.com/?a=%09%0D"
314     #
315     #   p URI.unescape(enc_uri)
316     #   # => "http://example.com/?a=\t\r"
317     #
318     def unescape(str)
319       str.gsub(ESCAPED) do
320         $&[1,2].hex.chr
321       end
322     end
323     alias decode unescape
324   end
326   include REGEXP
327   extend Escape
329   @@schemes = {}
330   
331   #
332   # Base class for all URI exceptions.
333   #
334   class Error < StandardError; end
335   #
336   # Not a URI.
337   #
338   class InvalidURIError < Error; end
339   #
340   # Not a URI component.
341   #
342   class InvalidComponentError < Error; end
343   #
344   # URI is valid, bad usage is not.
345   #
346   class BadURIError < Error; end
348   #
349   # == Synopsis
350   #
351   #   URI::split(uri)
352   #
353   # == Args
354   #
355   # +uri+::
356   #   String with URI.
357   #
358   # == Description
359   #
360   # Splits the string into the following parts and returns them as an array:
361   #
362   #   * Scheme
363   #   * Userinfo
364   #   * Host
365   #   * Port
366   #   * Registry
367   #   * Path
368   #   * Opaque
369   #   * Query
370   #   * Fragment
371   # 
372   # == Usage
373   #
374   #   require 'uri'
375   #
376   #   p URI.split("http://www.ruby-lang.org/")
377   #   # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
378   #
379   def self.split(uri)
380     case uri
381     when ''
382       # null uri
384     when ABS_URI
385       scheme, opaque, userinfo, host, port, 
386         registry, path, query, fragment = $~[1..-1]
388       # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
390       # absoluteURI   = scheme ":" ( hier_part | opaque_part )
391       # hier_part     = ( net_path | abs_path ) [ "?" query ]
392       # opaque_part   = uric_no_slash *uric
394       # abs_path      = "/"  path_segments
395       # net_path      = "//" authority [ abs_path ]
397       # authority     = server | reg_name
398       # server        = [ [ userinfo "@" ] hostport ]
400       if !scheme
401         raise InvalidURIError, 
402           "bad URI(absolute but no scheme): #{uri}"
403       end
404       if !opaque && (!path && (!host && !registry))
405         raise InvalidURIError,
406           "bad URI(absolute but no path): #{uri}" 
407       end
409     when REL_URI
410       scheme = nil
411       opaque = nil
413       userinfo, host, port, registry, 
414         rel_segment, abs_path, query, fragment = $~[1..-1]
415       if rel_segment && abs_path
416         path = rel_segment + abs_path
417       elsif rel_segment
418         path = rel_segment
419       elsif abs_path
420         path = abs_path
421       end
423       # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
425       # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
427       # net_path      = "//" authority [ abs_path ]
428       # abs_path      = "/"  path_segments
429       # rel_path      = rel_segment [ abs_path ]
431       # authority     = server | reg_name
432       # server        = [ [ userinfo "@" ] hostport ]
434     else
435       raise InvalidURIError, "bad URI(is not URI?): #{uri}"
436     end
438     path = '' if !path && !opaque # (see RFC2396 Section 5.2)
439     ret = [
440       scheme, 
441       userinfo, host, port,         # X
442       registry,                        # X
443       path,                         # Y
444       opaque,                        # Y
445       query,
446       fragment
447     ]
448     return ret
449   end
451   #
452   # == Synopsis
453   #
454   #   URI::parse(uri_str)
455   #
456   # == Args
457   #
458   # +uri_str+::
459   #   String with URI.
460   #
461   # == Description
462   #
463   # Creates an instance of one of URI's subclasses from the string.
464   #  
465   # == Raises
466   #
467   # URI::InvalidURIError
468   #   Raised if URI given is not a correct one.
469   #
470   # == Usage
471   #
472   #   require 'uri'
473   #
474   #   uri = URI.parse("http://www.ruby-lang.org/")
475   #   p uri
476   #   # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
477   #   p uri.scheme 
478   #   # => "http" 
479   #   p uri.host 
480   #   # => "www.ruby-lang.org" 
481   # 
482   def self.parse(uri)
483     scheme, userinfo, host, port, 
484       registry, path, opaque, query, fragment = self.split(uri)
486     if scheme && @@schemes.include?(scheme.upcase)
487       @@schemes[scheme.upcase].new(scheme, userinfo, host, port, 
488                                    registry, path, opaque, query, 
489                                    fragment)
490     else
491       Generic.new(scheme, userinfo, host, port, 
492                   registry, path, opaque, query, 
493                   fragment)
494     end
495   end
497   #
498   # == Synopsis
499   #
500   #   URI::join(str[, str, ...])
501   #
502   # == Args
503   #
504   # +str+::
505   #   String(s) to work with
506   #
507   # == Description
508   #
509   # Joins URIs.
510   #
511   # == Usage
512   #
513   #   require 'uri'
514   #
515   #   p URI.join("http://localhost/","main.rbx")
516   #   # => #<URI::HTTP:0x2022ac02 URL:http://localhost/main.rbx>
517   #
518   def self.join(*str)
519     u = self.parse(str[0])
520     str[1 .. -1].each do |x|
521       u = u.merge(x)
522     end
523     u
524   end
526   #
527   # == Synopsis
528   #
529   #   URI::extract(str[, schemes][,&blk])
530   #
531   # == Args
532   #
533   # +str+:: 
534   #   String to extract URIs from.
535   # +schemes+::
536   #   Limit URI matching to a specific schemes.
537   #
538   # == Description
539   #
540   # Extracts URIs from a string. If block given, iterates through all matched URIs.
541   # Returns nil if block given or array with matches.
542   #
543   # == Usage
544   #
545   #   require "uri"
546   #
547   #   URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
548   #   # => ["http://foo.example.org/bla", "mailto:test@example.com"]
549   #
550   def self.extract(str, schemes = nil, &block)
551     if block_given?
552       str.scan(regexp(schemes)) { yield $& }
553       nil
554     else
555       result = []
556       str.scan(regexp(schemes)) { result.push $& }
557       result
558     end
559   end
561   #
562   # == Synopsis
563   #
564   #   URI::regexp([match_schemes])
565   #
566   # == Args
567   #
568   # +match_schemes+:: 
569   #   Array of schemes. If given, resulting regexp matches to URIs
570   #   whose scheme is one of the match_schemes.
571   # 
572   # == Description
573   # Returns a Regexp object which matches to URI-like strings.
574   # The Regexp object returned by this method includes arbitrary
575   # number of capture groups (parentheses).  Never rely on their number.
576   # 
577   # == Usage
578   #
579   #   require 'uri'
580   #
581   #   # extract first URI from html_string
582   #   html_string.slice(URI.regexp)
583   # 
584   #   # remove ftp URIs
585   #   html_string.sub(URI.regexp(['ftp']), '')
586   # 
587   #   # You should not rely on the number of parentheses
588   #   html_string.scan(URI.regexp) do |*matches|
589   #     p $&
590   #   end
591   #
592   def self.regexp(schemes = nil)
593     unless schemes
594       ABS_URI_REF
595     else
596       /(?=#{Regexp.union(*schemes)}:)#{PATTERN::X_ABS_URI}/xn
597     end
598   end
602 module Kernel
603   # alias for URI.parse.
604   #
605   # This method is introduced at 1.8.2.
606   def URI(uri_str) # :doc:
607     URI.parse(uri_str)
608   end
609   module_function :URI