Fix the tag.
[python/dscho.git] / Lib / encodings / punycode.py
blob b801a46092fe419177ad0866302fa3cc5df21cc9
1 # -*- coding: utf-8 -*-
2 """ Codec for the Punycode encoding, as specified in RFC 3492
4 Written by Martin v. Löwis.
5 """
7 import codecs
9 ##################### Encoding #####################################
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its basic (ASCII) and extended (non-ASCII) code points.

    Returns a pair ``(base, extended)``: *base* is a ``bytes`` object holding
    the characters with ordinal below 128 in their original order, and
    *extended* is a sorted list of the distinct remaining characters.
    """
    ascii_part = bytes(ord(ch) for ch in str if ord(ch) < 128)
    non_ascii = sorted({ch for ch in str if ord(ch) >= 128})
    return ascii_part, non_ascii
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
def selective_find(str, char, index, pos):
    """Find the next occurrence of char in str after position pos.

    Scanning resumes at pos + 1.  While scanning, index is incremented for
    every character that compares lower than char, so it tracks the
    position of the match among the characters up to and including char.

    Returns (index + 1, pos) for the match, where pos is its position in
    the full string, or (-1, -1) once the end of str is reached.
    """
    length = len(str)
    while True:
        pos += 1
        if pos == length:
            return (-1, -1)
        current = str[pos]
        if current < char:
            # Smaller characters were already inserted, so they count
            # towards the selective index.
            index += 1
        elif current == char:
            return index + 1, pos
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Compute the list of integer deltas for the characters of *extended*
    (processed in the order given) as they occur in *str*.  One delta is
    produced per occurrence of each extended character.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Moving on to a new code point costs one full pass over the
        # (selective length + 1) slots for every code point skipped.
        pending = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            pending += index - prev_index
            deltas.append(pending - 1)
            prev_index = index
            pending = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code
    return deltas
def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return max(1, min(26, 36 * (j + 1) - bias))
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer *N* as a ``bytes`` sequence of characters taken from
    ``digits``, using the per-position thresholds given by ``T(j, bias)``.
    """
    encoded = bytearray()
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # Emit one non-final digit and carry the quotient to the next
        # position.
        N, remainder = divmod(N - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        position += 1
        threshold = T(position, bias)
    # A value below the threshold is the final digit.
    encoded.append(digits[N])
    return bytes(encoded)
def adapt(delta, first, numchars):
    """Return the bias to use for the next delta.

    *delta* is the delta just processed, *first* is true for the very first
    delta (which is damped by 700 instead of halved), and *numchars* is the
    number of code points handled so far.
    """
    scaled = delta // (700 if first else 2)
    scaled += scaled // numchars
    # ((base - tmin) * tmax) // 2 == 455
    offset = 0
    while scaled > 455:
        scaled //= 35  # base - tmin
        offset += 36
    return offset + (36 * scaled // (scaled + 38))
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta in *deltas* as a generalized variable-length integer,
    re-adapting the bias after each one, and return the concatenated bytes.
    *baselen* is the number of basic code points preceding the deltas.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    chunks = []
    bias = 72
    handled = baselen
    first = True
    for delta in deltas:
        chunks.append(generate_generalized_integer(delta, bias))
        handled += 1
        bias = adapt(delta, first, handled)
        first = False
    return b"".join(chunks)
def punycode_encode(text):
    """Encode the string *text* to punycode and return the result as bytes.

    The basic (ASCII) characters come first, followed by a ``-`` delimiter
    and the encoded deltas; the delimiter is omitted when there are no basic
    characters.
    """
    basic, non_basic = segregate(text)
    encoded_deltas = generate_integers(len(basic),
                                       insertion_unsort(text, non_basic))
    if not basic:
        return encoded_deltas
    return b"-".join((basic, encoded_deltas))
126 ##################### Decoding #####################################
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from *extended*,
    starting at index *extpos*.

    Only ``A``-``Z`` (digit values 0-25) and ``0``-``9`` (digit values
    26-35) are accepted as digits, so *extended* must be upper-cased by the
    caller.

    Returns a pair ``(newpos, value)`` where *newpos* is the index just past
    the last character consumed.  If the input is malformed and *errors* is
    not ``"strict"``, *value* is ``None``.

    Raises UnicodeError in ``"strict"`` mode when the string ends in the
    middle of a number or contains a character that is not a valid digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending character,
            # so it lives at extpos-1.  Indexing extended[extpos] reported
            # the wrong character and raised IndexError when the invalid
            # character was the last one in the string.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding (decoding direction)

    Decode the deltas in *extended* one at a time and insert the resulting
    characters into the string *base*, returning the final string.

    If a delta cannot be decoded (non-strict modes), the string built so far
    is returned.  In ``"strict"`` mode a code point above U+10FFFF raises
    UnicodeError; otherwise the code point is reset to ``'?'``.
    """
    code_point = 0x80
    insert_at = -1
    bias = 72
    cursor = 0
    total = len(extended)
    while cursor < total:
        next_cursor, delta = decode_generalized_number(extended, cursor,
                                                       bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        # Every full pass over the len(base) + 1 insertion slots advances
        # the code point by one; the remainder is the insertion index.
        wraps, insert_at = divmod(insert_at + delta + 1, len(base) + 1)
        code_point += wraps
        if code_point > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code_point)
            code_point = ord('?')
        base = base[:insert_at] + chr(code_point) + base[insert_at:]
        bias = adapt(delta, (cursor == 0), len(base))
        cursor = next_cursor
    return base
def punycode_decode(text, errors):
    """Decode punycode *text* and return the resulting string.

    *text* may be a ``str``, a bytes-like object providing ``rfind`` (such
    as ``bytes`` or ``bytearray``), or a ``memoryview``.  Everything before
    the last ``-`` is the basic (ASCII) part, decoded using the *errors*
    handler; everything after it holds the encoded deltas, which are
    upper-cased before decoding.  Without a ``-`` the whole input is
    treated as deltas.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    elif isinstance(text, memoryview):
        # memoryview offers no rfind(); work on a bytes copy instead of
        # failing with AttributeError.
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        extended = str(text[pos+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
195 ### Codec APIs
class Codec(codecs.Codec):
    """Stateless punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return ``(encoded bytes, len(input))``; *errors* is not used."""
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded string, len(input))``.

        Only the 'strict', 'replace' and 'ignore' handlers are supported;
        any other value raises UnicodeError.
        """
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input chunk independently."""

    def encode(self, input, final=False):
        """Return the punycode encoding of *input*; *final* is not used."""
        encoded = punycode_encode(input)
        return encoded
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input chunk independently."""

    def decode(self, input, final=False):
        """Return the decoded string for *input*; *final* is not used.

        Raises UnicodeError unless ``self.errors`` is 'strict', 'replace'
        or 'ignore'.
        """
        handler = self.errors
        if handler in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, handler)
        raise UnicodeError("Unsupported error handling "+handler)
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer using Codec's punycode encode/decode methods."""
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader using Codec's punycode encode/decode methods."""
225 ### encodings module API
227 def getregentry():
228 return codecs.CodecInfo(
229 name='punycode',
230 encode=Codec().encode,
231 decode=Codec().decode,
232 incrementalencoder=IncrementalEncoder,
233 incrementaldecoder=IncrementalDecoder,
234 streamwriter=StreamWriter,
235 streamreader=StreamReader,