# Generic URI-reference splitter, taken from appendix B of RFC 3986
# (http://www.ietf.org/rfc/rfc3986.txt).  The pattern is written out one
# component at a time; the pieces concatenate to the RFC's single expression.
uri_pattern = (
    r'^(([^:/?#]+):)?'  # groups 1-2: scheme, with and without the ":"
    r'(//([^/?#]*))?'   # groups 3-4: authority, with and without the "//"
    r'([^?#]*)'         # group 5:    path
    r'(\?([^#]*))?'     # groups 6-7: query, with and without the "?"
    r'(#(.*))?'         # groups 8-9: fragment, with and without the "#"
)
uri_re = re.compile(uri_pattern)
# Characters that may legally appear in a URI (RFC 3986):
#
# gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
#
# sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
#                  / "*" / "+" / "," / ";" / "="
#
# unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
#
# plus "%", which introduces a percent-encoded octet.  The regex below matches
# any single character outside that set.
#
# The pattern is a raw string so that the backslash in "\]" reaches the regex
# engine explicitly rather than surviving only because "\]" is not a
# recognised string escape (newer interpreters warn about such escapes).
# The pattern text itself is unchanged.
uri_illegal_char_re = re.compile(
    r"[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE)
# Splits a URI authority component into three groups:
#   group 1: userinfo including its trailing "@"  (None when absent)
#   group 2: host
#   group 3: port including its leading ":"       (None when absent)
#
# The host group first tries a bracketed IP literal ("[" ... "]", RFC 3986
# section 3.2.2) so that the colons inside an IPv6 address are not mistaken
# for the port separator.  The lookahead only accepts the bracketed form when
# it is followed by ":" or the end of the authority; anything else falls back
# to the plain "everything up to the first colon" rule, so non-bracketed and
# malformed authorities are split exactly as before.
authority_pattern = r'^([^@]*@)?(\[[^\]]*\](?=:|$)|[^:]*)(:.*)?'
authority_re = re.compile(authority_pattern)
# A percent-encoded octet: "%" followed by exactly two hexadecimal digits
# (either case).  The two digits are captured as group 1.
pct_encoded_re = re.compile(r'%([0-9A-Fa-f]{2})')
pct_encoded_pattern = pct_encoded_re.pattern
# Lookup table indexed by octet value (0-255).  An entry is True exactly when
# that octet is one of the RFC 3986 "unreserved" characters:
# ALPHA / DIGIT / "-" / "." / "_" / "~".
#
# Built with a generator expression so that no loop variable is left behind
# in the module namespace.
_unreserved = list(
    chr(octet) in (
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
        '0123456789'
        '-._~'
    )
    for octet in range(256)
)
# Regex matching any single character that falls inside one of the code-point
# ranges listed in UCSCHAR or IPRIVATE.  Each entry of those lists is unpacked
# as a (first, last) pair of code points and becomes a "first-last" range
# inside one character class.
# NOTE(review): UCSCHAR / IPRIVATE are defined outside this excerpt; the names
# suggest the RFC 3987 `ucschar` and `iprivate` productions -- confirm there.
#
# A generator expression replaces the former map() + tuple-parameter lambda;
# tuple parameters were removed from the language by PEP 3113.
_escapeme_re = re.compile('[%s]' % (''.join(
    u'%s-%s' % (unichr(first), unichr(last))
    for first, last in UCSCHAR + IPRIVATE),))
def _pct_escape_unicode(char_match):
    """Percent-encode the text matched by *char_match*.

    The matched text is encoded as UTF-8 and every resulting octet is written
    as "%" followed by two uppercase hexadecimal digits.

    :param char_match: regex match object; its ``group()`` is the text to
        escape.
    :returns: the percent-encoded string, e.g. ``'%C3%A9'`` for U+00E9.
    """
    # bytearray yields integer octets on both Python 2 and Python 3, so no
    # per-item ord() is needed.
    octets = bytearray(char_match.group().encode('utf-8'))
    # %02X, not %X: a percent-escape always has two hex digits, so octets
    # below 0x10 must be zero-padded.
    return ''.join('%%%02X' % octet for octet in octets)
85 def _pct_encoded_replace_unreserved(mo
):
87 i
= int(mo
.group(1), 16)
91 return mo
.group().upper()
97 def _pct_encoded_replace(mo
):
99 return chr(int(mo
.group(1), 16))
104 def remove_dot_segments(path
):
108 if path
.startswith('../'):
110 elif path
.startswith('./'):
112 elif path
.startswith('/./'):
116 elif path
.startswith('/../'):
119 result_segments
.pop()
123 result_segments
.pop()
124 elif path
== '..' or path
== '.':
130 i
= path
.find('/', i
)
133 result_segments
.append(path
[:i
])
136 return ''.join(result_segments
)
140 if isinstance(uri
, unicode):
141 uri
= _escapeme_re
.sub(_pct_escape_unicode
, uri
).encode('ascii')
143 illegal_mo
= uri_illegal_char_re
.search(uri
)
145 raise ValueError('Illegal characters in URI: %r at position %s' %
146 (illegal_mo
.group(), illegal_mo
.start()))
148 uri_mo
= uri_re
.match(uri
)
150 scheme
= uri_mo
.group(2)
152 raise ValueError('No scheme specified')
154 scheme
= scheme
.lower()
155 if scheme
not in ('http', 'https'):
156 raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri
,))
158 authority
= uri_mo
.group(4)
159 if authority
is None:
160 raise ValueError('Not an absolute URI: %r' % (uri
,))
162 authority_mo
= authority_re
.match(authority
)
163 if authority_mo
is None:
164 raise ValueError('URI does not have a valid authority: %r' % (uri
,))
166 userinfo
, host
, port
= authority_mo
.groups()
173 host
= pct_encoded_re
.sub(_pct_encoded_replace
, host
)
174 host
= unicode(host
, 'utf-8').encode('idna')
180 (scheme
== 'http' and port
== ':80') or
181 (scheme
== 'https' and port
== ':443')):
186 authority
= userinfo
+ host
+ port
188 path
= uri_mo
.group(5)
189 path
= pct_encoded_re
.sub(_pct_encoded_replace_unreserved
, path
)
190 path
= remove_dot_segments(path
)
194 query
= uri_mo
.group(6)
198 fragment
= uri_mo
.group(8)
202 return scheme
+ '://' + authority
+ path
+ query
+ fragment