third_party/cython/src/Cython/Compiler/StringEncoding.py

   1 #
   2 #   Cython -- encoding related tools
   3 #
   4
   5 import re
   6 import sys
   7
   8 if sys.version_info[0] >= 3:
   9     _unicode, _str, _bytes = str, str, bytes
  10     IS_PYTHON3 = True
  11 else:
  12     _unicode, _str, _bytes = unicode, str, str
  13     IS_PYTHON3 = False
  14
  15 empty_bytes = _bytes()
  16 empty_unicode = _unicode()
  17
  18 join_bytes = empty_bytes.join
  19
  20 class UnicodeLiteralBuilder(object):
  21     """Assemble a unicode string.
  22     """
  23     def __init__(self):
  24         self.chars = []
  25
  26     def append(self, characters):
  27         if isinstance(characters, _bytes):
  28             # this came from a Py2 string literal in the parser code
  29             characters = characters.decode("ASCII")
  30         assert isinstance(characters, _unicode), str(type(characters))
  31         self.chars.append(characters)
  32
  33     if sys.maxunicode == 65535:
  34         def append_charval(self, char_number):
  35             if char_number > 65535:
  36                 # wide Unicode character on narrow platform => replace
  37                 # by surrogate pair
  38                 char_number -= 0x10000
  39                 self.chars.append( unichr((char_number // 1024) + 0xD800) )
  40                 self.chars.append( unichr((char_number  % 1024) + 0xDC00) )
  41             else:
  42                 self.chars.append( unichr(char_number) )
  43     else:
  44         def append_charval(self, char_number):
  45             self.chars.append( unichr(char_number) )
  46
  47     def append_uescape(self, char_number, escape_string):
  48         self.append_charval(char_number)
  49
  50     def getstring(self):
  51         return EncodedString(u''.join(self.chars))
  52
  53     def getstrings(self):
  54         return (None, self.getstring())
  55
  56
  57 class BytesLiteralBuilder(object):
  58     """Assemble a byte string or char value.
  59     """
  60     def __init__(self, target_encoding):
  61         self.chars = []
  62         self.target_encoding = target_encoding
  63
  64     def append(self, characters):
  65         if isinstance(characters, _unicode):
  66             characters = characters.encode(self.target_encoding)
  67         assert isinstance(characters, _bytes), str(type(characters))
  68         self.chars.append(characters)
  69
  70     def append_charval(self, char_number):
  71         self.chars.append( unichr(char_number).encode('ISO-8859-1') )
  72
  73     def append_uescape(self, char_number, escape_string):
  74         self.append(escape_string)
  75
  76     def getstring(self):
  77         # this *must* return a byte string!
  78         s = BytesLiteral(join_bytes(self.chars))
  79         s.encoding = self.target_encoding
  80         return s
  81
  82     def getchar(self):
  83         # this *must* return a byte string!
  84         return self.getstring()
  85
  86     def getstrings(self):
  87         return (self.getstring(), None)
  88
  89 class StrLiteralBuilder(object):
  90     """Assemble both a bytes and a unicode representation of a string.
  91     """
  92     def __init__(self, target_encoding):
  93         self._bytes   = BytesLiteralBuilder(target_encoding)
  94         self._unicode = UnicodeLiteralBuilder()
  95
  96     def append(self, characters):
  97         self._bytes.append(characters)
  98         self._unicode.append(characters)
  99
 100     def append_charval(self, char_number):
 101         self._bytes.append_charval(char_number)
 102         self._unicode.append_charval(char_number)
 103
 104     def append_uescape(self, char_number, escape_string):
 105         self._bytes.append(escape_string)
 106         self._unicode.append_charval(char_number)
 107
 108     def getstrings(self):
 109         return (self._bytes.getstring(), self._unicode.getstring())
 110
 111
 112 class EncodedString(_unicode):
 113     # unicode string subclass to keep track of the original encoding.
 114     # 'encoding' is None for unicode strings and the source encoding
 115     # otherwise
 116     encoding = None
 117
 118     def __deepcopy__(self, memo):
 119         return self
 120
 121     def byteencode(self):
 122         assert self.encoding is not None
 123         return self.encode(self.encoding)
 124
 125     def utf8encode(self):
 126         assert self.encoding is None
 127         return self.encode("UTF-8")
 128
 129     @property
 130     def is_unicode(self):
 131         return self.encoding is None
 132
 133     def contains_surrogates(self):
 134         return string_contains_surrogates(self)
 135
 136
 137 def string_contains_surrogates(ustring):
 138     """
 139     Check if the unicode string contains surrogate code points
 140     on a CPython platform with wide (UCS-4) or narrow (UTF-16)
 141     Unicode, i.e. characters that would be spelled as two
 142     separate code units on a narrow platform.
 143     """
 144     for c in map(ord, ustring):
 145         if c > 65535:  # can only happen on wide platforms
 146             return True
 147         if 0xD800 <= c <= 0xDFFF:
 148             return True
 149     return False
 150
 151
 152 class BytesLiteral(_bytes):
 153     # bytes subclass that is compatible with EncodedString
 154     encoding = None
 155
 156     def __deepcopy__(self, memo):
 157         return self
 158
 159     def byteencode(self):
 160         if IS_PYTHON3:
 161             return _bytes(self)
 162         else:
 163             # fake-recode the string to make it a plain bytes object
 164             return self.decode('ISO-8859-1').encode('ISO-8859-1')
 165
 166     def utf8encode(self):
 167         assert False, "this is not a unicode string: %r" % self
 168
 169     def __str__(self):
 170         """Fake-decode the byte string to unicode to support %
 171         formatting of unicode strings.
 172         """
 173         return self.decode('ISO-8859-1')
 174
 175     is_unicode = False
 176
 177
 178 char_from_escape_sequence = {
 179     r'\a' : u'\a',
 180     r'\b' : u'\b',
 181     r'\f' : u'\f',
 182     r'\n' : u'\n',
 183     r'\r' : u'\r',
 184     r'\t' : u'\t',
 185     r'\v' : u'\v',
 186     }.get
 187
 188 _c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
 189
 190
 191 def _to_escape_sequence(s):
 192     if s in '\n\r\t':
 193         return repr(s)[1:-1]
 194     elif s == '"':
 195         return r'\"'
 196     elif s == '\\':
 197         return r'\\'
 198     else:
 199         # within a character sequence, oct passes much better than hex
 200         return ''.join(['\\%03o' % ord(c) for c in s])
 201
 202
 203 def _build_specials_replacer():
 204     subexps = []
 205     replacements = {}
 206     for special in _c_special:
 207         regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
 208         subexps.append(regexp)
 209         replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
 210     sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
 211     def replace_specials(m):
 212         return replacements[m.group(1)]
 213     def replace(s):
 214         return sub(replace_specials, s)
 215     return replace
 216
 217 _replace_specials = _build_specials_replacer()
 218
 219
 220 def escape_char(c):
 221     if IS_PYTHON3:
 222         c = c.decode('ISO-8859-1')
 223     if c in '\n\r\t\\':
 224         return repr(c)[1:-1]
 225     elif c == "'":
 226         return "\\'"
 227     n = ord(c)
 228     if n < 32 or n > 127:
 229         # hex works well for characters
 230         return "\\x%02X" % n
 231     else:
 232         return c
 233
 234 def escape_byte_string(s):
 235     """Escape a byte string so that it can be written into C code.
 236     Note that this returns a Unicode string instead which, when
 237     encoded as ISO-8859-1, will result in the correct byte sequence
 238     being written.
 239     """
 240     s = _replace_specials(s)
 241     try:
 242         return s.decode("ASCII") # trial decoding: plain ASCII => done
 243     except UnicodeDecodeError:
 244         pass
 245     if IS_PYTHON3:
 246         s_new = bytearray()
 247         append, extend = s_new.append, s_new.extend
 248         for b in s:
 249             if b >= 128:
 250                 extend(('\\%3o' % b).encode('ASCII'))
 251             else:
 252                 append(b)
 253         return s_new.decode('ISO-8859-1')
 254     else:
 255         l = []
 256         append = l.append
 257         for c in s:
 258             o = ord(c)
 259             if o >= 128:
 260                 append('\\%3o' % o)
 261             else:
 262                 append(c)
 263         return join_bytes(l).decode('ISO-8859-1')
 264
 265 def split_string_literal(s, limit=2000):
 266     # MSVC can't handle long string literals.
 267     if len(s) < limit:
 268         return s
 269     else:
 270         start = 0
 271         chunks = []
 272         while start < len(s):
 273             end = start + limit
 274             if len(s) > end-4 and '\\' in s[end-4:end]:
 275                 end -= 4 - s[end-4:end].find('\\') # just before the backslash
 276                 while s[end-1] == '\\':
 277                     end -= 1
 278                     if end == start:
 279                         # must have been a long line of backslashes
 280                         end = start + limit - (limit % 2) - 4
 281                         break
 282             chunks.append(s[start:end])
 283             start = end
 284         return '""'.join(chunks)
 285
 286 def encode_pyunicode_string(s):
 287     """Create Py_UNICODE[] representation of a given unicode string.
 288     """
 289     s = map(ord, s) + [0]
 290
 291     if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
 292         utf16, utf32 = [], s
 293         for code_point in s:
 294             if code_point >= 0x10000:  # outside of BMP
 295                 high, low = divmod(code_point - 0x10000, 1024)
 296                 utf16.append(high + 0xD800)
 297                 utf16.append(low + 0xDC00)
 298             else:
 299                 utf16.append(code_point)
 300     else:
 301         utf16, utf32 = s, []
 302         for code_unit in s:
 303             if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
 304                 high, low = utf32[-1], code_unit
 305                 utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
 306             else:
 307                 utf32.append(code_unit)
 308
 309     if utf16 == utf32:
 310         utf16 = []
 311     return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))