Getting the file size of all dict files to be downloaded; it comes to roughly 400 MB.
[worddb.git] / libs / openid / urinorm.py
blob5bdbaeff19d67ee1acee935798e0ea8d751537d4
import re
import sys
# Generic URI splitter, from appendix B of RFC 3986
# (http://www.ietf.org/rfc/rfc3986.txt).  Groups used by this module:
#   2 = scheme, 4 = authority, 5 = path, 6 = "?" + query, 8 = "#" + fragment
uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
uri_re = re.compile(uri_pattern)

# Characters that may legally appear in a URI:
#
# gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
#
# sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
#                  / "*" / "+" / "," / ";" / "="
#
# unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
#
# plus "%" (introduces a percent-encoded octet).  The pattern matches any
# single character OUTSIDE that set.  It is a raw string with both brackets
# escaped so that neither the string literal nor the character class
# triggers an invalid-escape or nested-set warning.
uri_illegal_char_re = re.compile(
    r"[^-A-Za-z0-9:/?#\[\]@!$&'()*+,;=._~%]", re.UNICODE)

# Splits an authority into (userinfo including "@", host, port including ":").
# Every part is optional, so this pattern matches any string.
authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
authority_re = re.compile(authority_pattern)

# One percent-encoded octet; group 1 is the two hex digits.
pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
pct_encoded_re = re.compile(pct_encoded_pattern)
# Inclusive (first, last) code point ranges of the non-ASCII characters that
# get percent-escaped in unicode input.
# NOTE(review): names and values appear to mirror the ``ucschar`` and
# ``iprivate`` productions of RFC 3987 -- confirm against the RFC.
UCSCHAR = [
    (0xA0, 0xD7FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
]

IPRIVATE = [
    (0xE000, 0xF8FF),
]

# A "narrow" Python 2 build cannot represent a code point above U+FFFF as a
# single character, so the supplementary-plane ranges are added only when the
# interpreter supports them.  sys.maxunicode is consulted instead of probing
# unichr(0x10000), so the check does not rely on a Python 2-only builtin.
if sys.maxunicode > 0xFFFF:
    UCSCHAR += [
        (0x10000, 0x1FFFD),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
        (0x40000, 0x4FFFD),
        (0x50000, 0x5FFFD),
        (0x60000, 0x6FFFD),
        (0x70000, 0x7FFFD),
        (0x80000, 0x8FFFD),
        (0x90000, 0x9FFFD),
        (0xA0000, 0xAFFFD),
        (0xB0000, 0xBFFFD),
        (0xC0000, 0xCFFFD),
        (0xD0000, 0xDFFFD),
        (0xE1000, 0xEFFFD),
    ]

    IPRIVATE += [
        (0xF0000, 0xFFFFD),
        (0x100000, 0x10FFFD),
    ]
# Lookup table indexed by octet value (0-255): True exactly for the
# "unreserved" characters ALPHA / DIGIT / "-" / "." / "_" / "~", i.e. the
# octets whose percent-encoding may safely be decoded.
_unreserved = [False] * 256
for _ in map(ord, '-._~'
                  '0123456789'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'):
    _unreserved[_] = True
# unichr() was folded into chr() in Python 3; use whichever exists so the
# pattern below can be built on either major version.
try:
    _code_point_to_char = unichr
except NameError:
    _code_point_to_char = chr

# Character class matching any single character that falls in one of the
# UCSCHAR or IPRIVATE ranges.  Built with a generator expression rather than
# a tuple-unpacking lambda, which is a syntax error on Python 3 (PEP 3113).
_escapeme_re = re.compile(u'[%s]' % u''.join(
    u'%s-%s' % (_code_point_to_char(first), _code_point_to_char(last))
    for first, last in UCSCHAR + IPRIVATE))
def _pct_escape_unicode(char_match):
    """Percent-encode the text matched by *char_match* as UTF-8 octets.

    Meant to be used as the replacement callback of ``re.sub``: the whole
    match is encoded to UTF-8 and every resulting octet is written as
    ``%XX`` with uppercase hexadecimal digits.

    :param char_match: regex match object; its entire match is escaped.
    :returns: the percent-encoded text, e.g. ``'%C3%A9'`` for U+00E9.
    """
    # bytearray yields integers on both Python 2 and Python 3, whereas
    # iterating the encoded string directly yields 1-char strings on 2 and
    # ints on 3 (where ord() would then fail).
    utf8_octets = bytearray(char_match.group().encode('utf-8'))
    # %02X keeps every escape two digits wide, even for octets below 0x10.
    return ''.join(['%%%02X' % octet for octet in utf8_octets])
def _pct_encoded_replace_unreserved(mo):
    """Normalize one percent-encoded octet (``re.sub`` callback).

    If the octet is flagged in the ``_unreserved`` table, the escape is
    replaced by the literal character; otherwise the escape is kept with
    its hex digits uppercased.

    :param mo: match object whose group 1 holds the two hex digits.
    :returns: the decoded character, the uppercased escape, or the matched
        text unchanged when the digits cannot be parsed as hexadecimal.
    """
    escape = mo.group()
    try:
        octet = int(mo.group(1), 16)
    except ValueError:
        # Not valid hex: leave the text exactly as it was.
        return escape

    if _unreserved[octet]:
        return chr(octet)
    return escape.upper()
def _pct_encoded_replace(mo):
    """Decode one percent-encoded octet unconditionally (``re.sub`` callback).

    :param mo: match object whose group 1 holds the two hex digits.
    :returns: ``chr()`` of the octet value, or the matched text unchanged
        when the conversion raises ``ValueError``.
    """
    hex_digits = mo.group(1)
    try:
        decoded = chr(int(hex_digits, 16))
    except ValueError:
        decoded = mo.group()
    return decoded
def remove_dot_segments(path):
    """Return *path* with its ``.`` and ``..`` segments resolved.

    The input is consumed from the left.  Leading relative markers are
    dropped, ``/./`` and a trailing ``/.`` collapse to ``/``, and ``/../``
    or a trailing ``/..`` collapse to ``/`` while also discarding the most
    recently emitted segment (if any).  Anything else is copied to the
    output one segment at a time, each segment keeping its leading slash.

    :param path: the path component of a URI.
    :returns: the path with dot segments removed.
    """
    output = []
    remaining = path

    while remaining:
        if remaining.startswith('../'):
            remaining = remaining[3:]
        elif remaining.startswith(('./', '/./')):
            # Either drop a leading "./" or turn "/./x" into "/x".
            remaining = remaining[2:]
        elif remaining == '/.':
            remaining = '/'
        elif remaining.startswith('/../') or remaining == '/..':
            # "/../x" -> "/x"; a bare "/.." -> "/".  Either way step back
            # over the previously emitted segment.
            remaining = remaining[3:] or '/'
            if output:
                output.pop()
        elif remaining in ('..', '.'):
            remaining = ''
        else:
            # Take everything up to (not including) the next "/" that is
            # not the very first character.
            cut = remaining.find('/', 1)
            if cut < 0:
                cut = len(remaining)
            output.append(remaining[:cut])
            remaining = remaining[cut:]

    return ''.join(output)
def urinorm(uri):
    """Return the normalized form of an absolute HTTP or HTTPS URI.

    Normalization steps, in order:

    * unicode input has the characters matched by ``_escapeme_re``
      percent-escaped and is then encoded to ASCII;
    * the scheme and host are lowercased;
    * a host containing percent-escapes is decoded and IDNA-encoded;
    * an empty port (``:``) or the scheme's default port is dropped;
    * in the path, escapes of unreserved characters are decoded, all other
      escapes are uppercased, and dot segments are removed; an empty path
      becomes ``/``;
    * the query and fragment are copied through unchanged.

    :param uri: the URI to normalize (byte string or unicode).
    :returns: the normalized URI.
    :raises ValueError: if the URI contains an illegal character, has no
        scheme, uses a scheme other than http/https, or has no authority.
    """
    if isinstance(uri, unicode):
        escaped = _escapeme_re.sub(_pct_escape_unicode, uri)
        uri = escaped.encode('ascii')

    bad_char = uri_illegal_char_re.search(uri)
    if bad_char:
        raise ValueError('Illegal characters in URI: %r at position %s' %
                         (bad_char.group(), bad_char.start()))

    # uri_re is built entirely from optional parts, so it always matches.
    parts = uri_re.match(uri)
    raw_scheme, raw_authority, raw_path, raw_query, raw_fragment = \
        parts.group(2, 4, 5, 6, 8)

    if raw_scheme is None:
        raise ValueError('No scheme specified')

    scheme = raw_scheme.lower()
    if scheme != 'http' and scheme != 'https':
        raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))

    if raw_authority is None:
        raise ValueError('Not an absolute URI: %r' % (uri,))

    authority_parts = authority_re.match(raw_authority)
    if authority_parts is None:
        raise ValueError('URI does not have a valid authority: %r' % (uri,))

    userinfo, host, port = authority_parts.groups()

    host = host.lower()
    if '%' in host:
        # Decode the escapes, then re-express the host name in IDNA form.
        host = pct_encoded_re.sub(_pct_encoded_replace, host)
        host = unicode(host, 'utf-8').encode('idna')

    # Drop a missing port, a bare ":" and the scheme's well-known port.
    is_default_port = (scheme, port) in (('http', ':80'), ('https', ':443'))
    if not port or port == ':' or is_default_port:
        port = ''

    path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, raw_path)
    path = remove_dot_segments(path) or '/'

    # The captured query and fragment already include their leading "?" and
    # "#", so when present they are never empty.
    return (scheme + '://' + (userinfo or '') + host + port + path +
            (raw_query or '') + (raw_fragment or ''))