Pin Chrome's shortcut to the Win10 Start menu on install and OS upgrade.
[chromium-blink-merge.git] / third_party / cython / src / Cython / Compiler / StringEncoding.py
blob4d84afa209fc06133a70769e51b48bc3d669a9c6
2 # Cython -- encoding related tools
5 import re
6 import sys
# Pick the text and byte string types of the running interpreter once,
# so that the rest of the module can refer to them by a single name.
IS_PYTHON3 = sys.version_info[0] >= 3

if IS_PYTHON3:
    _unicode, _str, _bytes = str, str, bytes
else:
    _unicode, _str, _bytes = unicode, str, str

# Empty instances of the selected types.  'join_bytes' concatenates an
# iterable of byte strings.
empty_unicode = _unicode()
empty_bytes = _bytes()
join_bytes = empty_bytes.join
try:
    _unichr = unichr  # Python 2
except NameError:
    # Python 3 has no unichr(); chr() returns a text character there.
    _unichr = chr


class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.

    Fragments are collected in 'self.chars' and joined by getstring().
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a text fragment, decoding ASCII byte strings first."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point 'char_number'."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append(_unichr((char_number // 1024) + 0xD800))
                self.chars.append(_unichr((char_number % 1024) + 0xDC00))
            else:
                self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with code point 'char_number'."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append an escaped character by value; 'escape_string' is unused."""
        self.append_charval(char_number)

    def getstring(self):
        """Return everything collected so far as an EncodedString."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes slot is always None."""
        return (None, self.getstring())
try:
    _unichr = unichr  # Python 2
except NameError:
    # Python 3 has no unichr(); chr() returns a text character there.
    _unichr = chr


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Fragments are collected in 'self.chars' as byte strings; text
    fragments are encoded with 'target_encoding' when appended.
    """
    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a fragment, encoding text with the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append the single byte with value 'char_number'.

        The value goes through ISO-8859-1, so the encode step raises
        UnicodeEncodeError for values above 255.
        """
        self.chars.append(_unichr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append the escape text itself; 'char_number' is unused."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a BytesLiteral tagged with the
        target encoding."""
        # this *must* return a byte string!
        s = BytesLiteral(join_bytes(self.chars))
        s.encoding = self.target_encoding
        return s

    def getchar(self):
        """Return the collected bytes; same result as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode slot is always None."""
        return (self.getstring(), None)
class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    Every operation is forwarded to an internal BytesLiteralBuilder and
    an internal UnicodeLiteralBuilder, in that order.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append the same fragment to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Append the same character value to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append an escaped character to both representations."""
        # the bytes side keeps the escape text, the unicode side gets
        # the character itself
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes, unicode) pair assembled so far."""
        bytes_value = self._bytes.getstring()
        unicode_value = self._unicode.getstring()
        return (bytes_value, unicode_value)
class EncodedString(_unicode):
    """Unicode string subclass that keeps track of the original encoding.

    'encoding' is None for unicode strings and the source encoding
    otherwise.
    """
    encoding = None

    def __deepcopy__(self, memo):
        # a deep copy hands back the very same object
        return self

    def byteencode(self):
        """Encode with the recorded source encoding, which must be set."""
        source_encoding = self.encoding
        assert source_encoding is not None
        return self.encode(source_encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no source encoding is set."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """Delegate to string_contains_surrogates() for this string."""
        return string_contains_surrogates(self)
def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    for char in ustring:
        code_point = ord(char)
        # values above 65535 can only happen on wide platforms
        if code_point > 65535 or 0xD800 <= code_point <= 0xDFFF:
            return True
    return False
class BytesLiteral(_bytes):
    """bytes subclass that is compatible with EncodedString."""
    encoding = None
    is_unicode = False

    def __deepcopy__(self, memo):
        # a deep copy hands back the very same object
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if not IS_PYTHON3:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        """Always fails: a byte string has no UTF-8 text encoding here."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')
# Lookup function from the source spelling of a simple escape sequence
# (backslash plus letter) to the character it denotes.  Any other key
# yields None.
char_from_escape_sequence = dict([
    (r'\a', u'\a'),
    (r'\b', u'\b'),
    (r'\f', u'\f'),
    (r'\n', u'\n'),
    (r'\r', u'\r'),
    (r'\t', u'\t'),
    (r'\v', u'\v'),
]).get

# Strings that get replaced by escape sequences: the backslash, '??'
# (presumably because of C trigraphs -- TODO confirm), the double quote
# and every character with a code below 32.
_c_special = ('\\', '??', '"') + tuple([chr(code) for code in range(32)])
191 def _to_escape_sequence(s):
192 if s in '\n\r\t':
193 return repr(s)[1:-1]
194 elif s == '"':
195 return r'\"'
196 elif s == '\\':
197 return r'\\'
198 else:
199 # within a character sequence, oct passes much better than hex
200 return ''.join(['\\%03o' % ord(c) for c in s])
def _build_specials_replacer():
    """Build a function that escapes every '_c_special' entry in a byte string.

    One regular expression matching any of the special strings is
    compiled once, as a bytes pattern.  The returned function substitutes
    each match with its escape sequence from a precomputed table.
    """
    patterns = []
    escaped_by_special = {}
    for special in _c_special:
        # one single-character class per character, with backslashes
        # doubled so they stay literal inside the class
        char_classes = ['[%s]' % c.replace('\\', '\\\\') for c in special]
        patterns.append(''.join(char_classes))
        escaped = _to_escape_sequence(special)
        escaped_by_special[special.encode('ASCII')] = escaped.encode('ASCII')

    alternation = '(%s)' % '|'.join(patterns)
    substitute = re.compile(alternation.encode('ASCII')).sub

    def replace_specials(match):
        return escaped_by_special[match.group(1)]

    def replace(s):
        return substitute(replace_specials, s)

    return replace


_replace_specials = _build_specials_replacer()
def escape_char(c):
    """Return the escaped text for the single character 'c'.

    On Python 3 the argument is first decoded from ISO-8859-1.  Newline,
    carriage return, tab, backslash and the single quote become backslash
    escapes, codes below 32 or above 127 become '\\xHH', and everything
    else is returned unchanged.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        # repr() already spells these as backslash escapes; drop its quotes
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    # hex works well for characters
    return "\\x%02X" % code
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: plain ASCII => done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    # Non-ASCII content: spell every byte value >= 128 as an octal escape.
    if not IS_PYTHON3:
        pieces = []
        for char in s:
            code = ord(char)
            if code < 128:
                pieces.append(char)
            else:
                pieces.append('\\%3o' % code)
        return join_bytes(pieces).decode('ISO-8859-1')

    # Iterating a bytes object yields ints on Python 3.
    escaped = bytearray()
    for byte in s:
        if byte < 128:
            escaped.append(byte)
        else:
            escaped.extend(('\\%3o' % byte).encode('ASCII'))
    return escaped.decode('ISO-8859-1')
def split_string_literal(s, limit=2000):
    """Split 's' into pieces of at most 'limit' characters joined by '""'.

    Strings shorter than 'limit' are returned unchanged.  A split point
    is moved backwards when a backslash occurs in the last four
    characters of a piece, so that the cut lands before the backslash.
    """
    # MSVC can't handle long string literals.
    total = len(s)
    if total < limit:
        return s

    pieces = []
    start = 0
    while start < total:
        end = start + limit
        tail = s[end-4:end]
        if total > end - 4 and '\\' in tail:
            # cut just before the backslash
            end -= 4 - tail.find('\\')
            while s[end-1] == '\\':
                end -= 1
                if end == start:
                    # must have been a long line of backslashes
                    end = start + limit - (limit % 2) - 4
                    break
        pieces.append(s[start:end])
        start = end
    return '""'.join(pieces)
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair (utf16, utf32) of text strings.  Each one holds
    comma-separated decimal values terminated by a 0 entry.  The utf16
    item is empty when both sequences are identical.
    """
    # A list is needed here: map() returns an iterator on Python 3 and
    # cannot be concatenated with a list.
    s = [ord(c) for c in s] + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                # split into a high/low surrogate pair
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                # low surrogate following a high one: merge the pair
                # into a single code point
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    # u"%d" formatting yields text on Python 2 and 3 alike, without the
    # Python 2-only 'unicode' builtin.
    return (u",".join([u"%d" % value for value in utf16]),
            u",".join([u"%d" % value for value in utf32]))