1 # -*- coding: utf-8 -*-
2 """ Codec for the Punycode encoding, as specified in RFC 3492
4 Written by Martin v. Löwis.
9 ##################### Encoding #####################################
12 """3.1 Basic code point segregation"""
20 extended
= sorted(extended
)
21 return bytes(base
), extended
23 def selective_len(str, max):
24 """Return the length of str, considering only characters below max."""
31 def selective_find(str, char
, index
, pos
):
32 """Return a pair (index, pos), indicating the next occurrence of
33 char in str. index is the position of the character considering
34 only ordinals up to and including char, and pos is the position in
35 the full string. index/pos is the starting position in the full
49 def insertion_unsort(str, extended
):
50 """3.2 Insertion unsort coding"""
57 curlen
= selective_len(str, char
)
58 delta
= (curlen
+1) * (char
- oldchar
)
60 index
,pos
= selective_find(str,c
,index
,pos
)
63 delta
+= index
- oldindex
64 result
.append(delta
-1)
72 # Punycode parameters: tmin = 1, tmax = 26, base = 36
73 res
= 36 * (j
+ 1) - bias
75 if res
> 26: return 26
78 digits
= b
"abcdefghijklmnopqrstuvwxyz0123456789"
79 def generate_generalized_integer(N
, bias
):
80 """3.3 Generalized variable-length integers"""
86 result
.append(digits
[N
])
88 result
.append(digits
[t
+ ((N
- t
) % (36 - t
))])
89 N
= (N
- t
) // (36 - t
)
92 def adapt(delta
, first
, numchars
):
97 delta
+= delta
// numchars
98 # ((base - tmin) * tmax) // 2 == 455
101 delta
= delta
// 35 # base - tmin
103 bias
= divisions
+ (36 * delta
// (delta
+ 38))
107 def generate_integers(baselen
, deltas
):
108 """3.4 Bias adaptation"""
109 # Punycode parameters: initial bias = 72, damp = 700, skew = 38
112 for points
, delta
in enumerate(deltas
):
113 s
= generate_generalized_integer(delta
, bias
)
115 bias
= adapt(delta
, points
==0, baselen
+points
+1)
118 def punycode_encode(text
):
119 base
, extended
= segregate(text
)
120 deltas
= insertion_unsort(text
, extended
)
121 extended
= generate_integers(len(base
), deltas
)
123 return base
+ b
"-" + extended
126 ##################### Decoding #####################################
128 def decode_generalized_number(extended
, extpos
, bias
, errors
):
129 """3.3 Generalized variable-length integers"""
135 char
= ord(extended
[extpos
])
137 if errors
== "strict":
138 raise UnicodeError("incomplete punicode string")
139 return extpos
+ 1, None
141 if 0x41 <= char
<= 0x5A: # A-Z
143 elif 0x30 <= char
<= 0x39:
144 digit
= char
- 22 # 0x30-26
145 elif errors
== "strict":
146 raise UnicodeError("Invalid extended code point '%s'"
153 return extpos
, result
158 def insertion_sort(base
, extended
, errors
):
159 """3.2 Insertion unsort coding"""
164 while extpos
< len(extended
):
165 newpos
, delta
= decode_generalized_number(extended
, extpos
,
168 # There was an error in decoding. We can't continue because
169 # synchronization is lost.
172 char
+= pos
// (len(base
) + 1)
174 if errors
== "strict":
175 raise UnicodeError("Invalid character U+%x" % char
)
177 pos
= pos
% (len(base
) + 1)
178 base
= base
[:pos
] + chr(char
) + base
[pos
:]
179 bias
= adapt(delta
, (extpos
== 0), len(base
))
183 def punycode_decode(text
, errors
):
184 if isinstance(text
, str):
185 text
= text
.encode("ascii")
186 pos
= text
.rfind(b
"-")
189 extended
= str(text
, "ascii").upper()
191 base
= str(text
[:pos
], "ascii", errors
)
192 extended
= str(text
[pos
+1:], "ascii").upper()
193 return insertion_sort(base
, extended
, errors
)
class Codec(codecs.Codec):
    """Punycode codec: thin wrappers around the module-level
    punycode_encode and punycode_decode helpers."""

    def encode(self, input, errors='strict'):
        """Encode input with punycode_encode.

        Returns a pair (encoded, len(input)).  The errors argument is
        accepted but not consulted.
        """
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Decode input with punycode_decode.

        Returns a pair (decoded, len(input)).  Raises UnicodeError if
        errors is not one of 'strict', 'replace' or 'ignore'.
        """
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that passes each input straight to
    punycode_encode."""

    def encode(self, input, final=False):
        """Return punycode_encode(input); final is not consulted."""
        encoded = punycode_encode(input)
        return encoded
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that passes each input straight to
    punycode_decode, using the error policy stored on the instance."""

    def decode(self, input, final=False):
        """Return punycode_decode(input, self.errors).

        Raises UnicodeError if self.errors is not one of 'strict',
        'replace' or 'ignore'.  final is not consulted.
        """
        policy = self.errors
        if policy in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, policy)
        raise UnicodeError("Unsupported error handling "+policy)
219 class StreamWriter(Codec
,codecs
.StreamWriter
):
222 class StreamReader(Codec
,codecs
.StreamReader
):
225 ### encodings module API
228 return codecs
.CodecInfo(
230 encode
=Codec().encode
,
231 decode
=Codec().decode
,
232 incrementalencoder
=IncrementalEncoder
,
233 incrementaldecoder
=IncrementalDecoder
,
234 streamwriter
=StreamWriter
,
235 streamreader
=StreamReader
,