# Cython -- encoding-related tools
8 if sys
.version_info
[0] >= 3:
9 _unicode
, _str
, _bytes
= str, str, bytes
12 _unicode
, _str
, _bytes
= unicode, str, str
# Empty instances of the text type and of the byte string type.
empty_unicode = _unicode()
empty_bytes = _bytes()

# Bound 'join' of the empty byte string: concatenates a sequence of
# byte strings.
join_bytes = empty_bytes.join
20 class UnicodeLiteralBuilder(object):
21 """Assemble a unicode string.
26 def append(self
, characters
):
27 if isinstance(characters
, _bytes
):
28 # this came from a Py2 string literal in the parser code
29 characters
= characters
.decode("ASCII")
30 assert isinstance(characters
, _unicode
), str(type(characters
))
31 self
.chars
.append(characters
)
33 if sys
.maxunicode
== 65535:
34 def append_charval(self
, char_number
):
35 if char_number
> 65535:
36 # wide Unicode character on narrow platform => replace
38 char_number
-= 0x10000
39 self
.chars
.append( unichr((char_number
// 1024) + 0xD800) )
40 self
.chars
.append( unichr((char_number
% 1024) + 0xDC00) )
42 self
.chars
.append( unichr(char_number
) )
44 def append_charval(self
, char_number
):
45 self
.chars
.append( unichr(char_number
) )
47 def append_uescape(self
, char_number
, escape_string
):
48 self
.append_charval(char_number
)
51 return EncodedString(u
''.join(self
.chars
))
54 return (None, self
.getstring())
57 class BytesLiteralBuilder(object):
58 """Assemble a byte string or char value.
60 def __init__(self
, target_encoding
):
62 self
.target_encoding
= target_encoding
64 def append(self
, characters
):
65 if isinstance(characters
, _unicode
):
66 characters
= characters
.encode(self
.target_encoding
)
67 assert isinstance(characters
, _bytes
), str(type(characters
))
68 self
.chars
.append(characters
)
70 def append_charval(self
, char_number
):
71 self
.chars
.append( unichr(char_number
).encode('ISO-8859-1') )
73 def append_uescape(self
, char_number
, escape_string
):
74 self
.append(escape_string
)
77 # this *must* return a byte string!
78 s
= BytesLiteral(join_bytes(self
.chars
))
79 s
.encoding
= self
.target_encoding
83 # this *must* return a byte string!
84 return self
.getstring()
87 return (self
.getstring(), None)
class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    All input is forwarded to a BytesLiteralBuilder and to a
    UnicodeLiteralBuilder, in that order, so that both spellings of the
    literal are built side by side.
    """

    def __init__(self, target_encoding):
        # 'target_encoding' is only handed to the bytes builder.
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append 'characters' to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Append the character value 'char_number' to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape.

        The bytes builder receives 'escape_string' through append(), the
        unicode builder receives 'char_number' through append_charval().
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the pair (bytes result, unicode result)."""
        bytes_value = self._bytes.getstring()
        unicode_value = self._unicode.getstring()
        return (bytes_value, unicode_value)
112 class EncodedString(_unicode
):
113 # unicode string subclass to keep track of the original encoding.
114 # 'encoding' is None for unicode strings and the source encoding
118 def __deepcopy__(self
, memo
):
121 def byteencode(self
):
122 assert self
.encoding
is not None
123 return self
.encode(self
.encoding
)
125 def utf8encode(self
):
126 assert self
.encoding
is None
127 return self
.encode("UTF-8")
130 def is_unicode(self
):
131 return self
.encoding
is None
133 def contains_surrogates(self
):
134 return string_contains_surrogates(self
)
def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds a code point above 0xFFFF or
    one in the surrogate range 0xD800-0xDFFF, i.e. a character that a
    narrow (UTF-16) CPython build would spell as two separate code
    units.  Works on both wide (UCS-4) and narrow builds.

    Returns True if such a code point is found, False otherwise.
    """
    for character in ustring:
        code_point = ord(character)
        # Values above 0xFFFF can only show up on wide platforms; the
        # surrogate range covers the narrow platform spelling.
        if code_point > 65535 or 0xD800 <= code_point <= 0xDFFF:
            return True
    return False
152 class BytesLiteral(_bytes
):
153 # bytes subclass that is compatible with EncodedString
156 def __deepcopy__(self
, memo
):
159 def byteencode(self
):
163 # fake-recode the string to make it a plain bytes object
164 return self
.decode('ISO-8859-1').encode('ISO-8859-1')
166 def utf8encode(self
):
167 assert False, "this is not a unicode string: %r" % self
170 """Fake-decode the byte string to unicode to support %
171 formatting of unicode strings.
173 return self
.decode('ISO-8859-1')
178 char_from_escape_sequence
= {
188 _c_special
= ('\\', '??', '"') + tuple(map(chr, range(32)))
191 def _to_escape_sequence(s
):
199 # within a character sequence, oct passes much better than hex
200 return ''.join(['\\%03o' % ord(c
) for c
in s
])
def _build_specials_replacer():
    """Build the function that escapes the entries of '_c_special'.

    Returns a callable that takes a byte string and returns it with each
    match of a '_c_special' entry replaced by the ASCII-encoded result of
    _to_escape_sequence() for that entry.
    """
    alternatives = []
    escaped_form = {}
    for special in _c_special:
        # Wrap each character in its own character class; a backslash is
        # doubled so that it stays a literal backslash inside the class.
        char_classes = ['[%s]' % c.replace('\\', '\\\\') for c in special]
        alternatives.append(''.join(char_classes))
        escaped_form[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')

    # A single bytes pattern with one capturing group over all alternatives.
    pattern = ('(%s)' % '|'.join(alternatives)).encode('ASCII')
    substitute = re.compile(pattern).sub

    def replace_specials(m):
        # Look up the replacement for whichever alternative matched.
        return escaped_form[m.group(1)]

    def replace(s):
        return substitute(replace_specials, s)

    return replace
# The replacer is built once here and shared by all callers.
_replace_specials = _build_specials_replacer()
222 c
= c
.decode('ISO-8859-1')
228 if n
< 32 or n
> 127:
229 # hex works well for characters
234 def escape_byte_string(s
):
235 """Escape a byte string so that it can be written into C code.
236 Note that this returns a Unicode string instead which, when
237 encoded as ISO-8859-1, will result in the correct byte sequence
240 s
= _replace_specials(s
)
242 return s
.decode("ASCII") # trial decoding: plain ASCII => done
243 except UnicodeDecodeError:
247 append
, extend
= s_new
.append
, s_new
.extend
250 extend(('\\%3o' % b
).encode('ASCII'))
253 return s_new
.decode('ISO-8859-1')
263 return join_bytes(l
).decode('ISO-8859-1')
265 def split_string_literal(s
, limit
=2000):
266 # MSVC can't handle long string literals.
272 while start
< len(s
):
274 if len(s
) > end
-4 and '\\' in s
[end
-4:end
]:
275 end
-= 4 - s
[end
-4:end
].find('\\') # just before the backslash
276 while s
[end
-1] == '\\':
279 # must have been a long line of backslashes
280 end
= start
+ limit
- (limit
% 2) - 4
282 chunks
.append(s
[start
:end
])
284 return '""'.join(chunks
)
286 def encode_pyunicode_string(s
):
287 """Create Py_UNICODE[] representation of a given unicode string.
289 s
= map(ord, s
) + [0]
291 if sys
.maxunicode
>= 0x10000: # Wide build or Py3.3
294 if code_point
>= 0x10000: # outside of BMP
295 high
, low
= divmod(code_point
- 0x10000, 1024)
296 utf16
.append(high
+ 0xD800)
297 utf16
.append(low
+ 0xDC00)
299 utf16
.append(code_point
)
303 if 0xDC00 <= code_unit
<= 0xDFFF and utf32
and 0xD800 <= utf32
[-1] <= 0xDBFF:
304 high
, low
= utf32
[-1], code_unit
305 utf32
[-1] = ((high
& 0x3FF) << 10) + (low
& 0x3FF) + 0x10000
307 utf32
.append(code_unit
)
311 return ",".join(map(unicode, utf16
)), ",".join(map(unicode, utf32
))