misc/quote.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """string processing utilities for extracting strings with various kinds of delimiters"""
  23
  24 import logging
  25 import htmlentitydefs
  26
  27 def find_all(searchin, substr):
  28     """returns a list of locations where substr occurs in searchin
  29     locations are not allowed to overlap"""
  30     location = 0
  31     locations = []
  32     while location != -1:
  33         location = searchin.find(substr, location)
  34         if location != -1:
  35             locations.append(location)
  36             location += len(substr)
  37     return locations
  38
  39 def extract(source, startdelim, enddelim, escape=None, startinstring=False, allowreentry=True):
  40     """Extracts a doublequote-delimited string from a string, allowing for backslash-escaping
  41     returns tuple of (quoted string with quotes, still in string at end)"""
  42     # note that this returns the quote characters as well... even internally
  43     instring = startinstring
  44     enteredonce = False
  45     lenstart = len(startdelim)
  46     lenend = len(enddelim)
  47     startdelim_places = find_all(source, startdelim)
  48     if startdelim == enddelim:
  49         enddelim_places = startdelim_places[:]
  50     else:
  51         enddelim_places = find_all(source, enddelim)
  52     if escape is not None:
  53         lenescape = len(escape)
  54         escape_places = find_all(source, escape)
  55         last_escape_pos = -1
  56         # filter escaped escapes
  57         true_escape = False
  58         true_escape_places = []
  59         for escape_pos in escape_places:
  60             if escape_pos - lenescape in escape_places:
  61                 true_escape = not true_escape
  62             else:
  63                 true_escape = True
  64             if true_escape:
  65                 true_escape_places.append(escape_pos)
  66         startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
  67         enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
  68     else:
  69         enddelim_places = [pos + lenend for pos in enddelim_places]
  70     # get a unique sorted list of the significant places in the string
  71     significant_places = dict.fromkeys([0] + startdelim_places + enddelim_places + [len(source)-1]).keys()
  72     significant_places.sort()
  73     extracted = ""
  74     lastpos = None
  75     for pos in significant_places:
  76         if instring and pos in enddelim_places:
  77             # make sure that if startdelim == enddelim we don't get confused and count the same string as start and end
  78             if lastpos == pos - lenstart and lastpos in startdelim_places:
  79                 continue
  80             extracted += source[lastpos:pos]
  81             instring = False
  82             lastpos = pos
  83         if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
  84             instring = True
  85             enteredonce = True
  86             lastpos = pos
  87     if instring:
  88         extracted += source[lastpos:]
  89     return (extracted, instring)
  90
  91 def extractfromlines(lines, startdelim, enddelim, escape):
  92     """Calls extract over multiple lines, remembering whether in the string or not"""
  93     result = ""
  94     instring = 0
  95     for line in lines:
  96         (string, instring) = extract(line, startdelim, enddelim, escape, instring)
  97         result += string
  98         if not instring: break
  99     return result
 100
 101 def extractstr(source):
 102     "Extracts a doublequote-delimited string from a string, allowing for backslash-escaping"
 103     (string, instring) = extract(source, '"', '"', '\\')
 104     return string
 105
 106 def extractcomment(lines):
 107     "Extracts <!-- > XML comments from lines"
 108     return extractfromlines(lines, "<!--", "-->", None)
 109
 110 def extractwithoutquotes(source, startdelim, enddelim, escape=None, startinstring=False, includeescapes=True, allowreentry=True):
 111     """Extracts a doublequote-delimited string from a string, allowing for backslash-escaping
 112     includeescapes can also be a function that takes the whole escaped string and returns the replaced version"""
 113     instring = startinstring
 114     enteredonce = False
 115     lenstart = len(startdelim)
 116     lenend = len(enddelim)
 117     startdelim_places = find_all(source, startdelim)
 118     if startdelim == enddelim:
 119         enddelim_places = startdelim_places[:]
 120     else:
 121         enddelim_places = find_all(source, enddelim)
 122     if escape is not None:
 123         lenescape = len(escape)
 124         escape_places = find_all(source, escape)
 125         last_escape_pos = -1
 126         # filter escaped escapes
 127         true_escape = False
 128         true_escape_places = []
 129         for escape_pos in escape_places:
 130             if escape_pos - lenescape in escape_places:
 131                 true_escape = not true_escape
 132             else:
 133                 true_escape = True
 134             if true_escape:
 135                 true_escape_places.append(escape_pos)
 136         startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
 137         enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
 138     else:
 139         enddelim_places = [pos + lenend for pos in enddelim_places]
 140     # get a unique sorted list of the significant places in the string
 141     significant_places = dict.fromkeys([0] + startdelim_places + enddelim_places + [len(source)-1]).keys()
 142     significant_places.sort()
 143     extracted = ""
 144     lastpos = 0
 145     callable_includeescapes = callable(includeescapes)
 146     checkescapes = callable_includeescapes or not includeescapes
 147     for pos in significant_places:
 148         if instring and pos in enddelim_places and lastpos != pos - lenstart:
 149             section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
 150             section = source[section_start:section_end]
 151             if escape is not None and checkescapes:
 152                 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
 153                 new_section = ""
 154                 last_epos = 0
 155                 for epos in escape_list:
 156                     new_section += section[last_epos:epos]
 157                     if callable_includeescapes:
 158                         replace_escape = includeescapes(section[epos:epos+lenescape+1])
 159                         # TODO: deprecate old method of returning boolean from includeescape, by removing this if block
 160                         if not isinstance(replace_escape, basestring):
 161                             if replace_escape:
 162                                 replace_escape = section[epos:epos+lenescape+1]
 163                             else:
 164                                 replace_escape = section[epos+lenescape:epos+lenescape+1]
 165                         new_section += replace_escape
 166                         last_epos = epos + lenescape + 1
 167                     else:
 168                         last_epos = epos + lenescape
 169                 section = new_section + section[last_epos:]
 170             extracted += section
 171             instring = False
 172             lastpos = pos
 173         if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
 174             instring = True
 175             enteredonce = True
 176             lastpos = pos
 177     if instring:
 178         section_start = lastpos + len(startdelim)
 179         section = source[section_start:]
 180         if escape is not None and not includeescapes:
 181             escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
 182             new_section = ""
 183             last_epos = 0
 184             for epos in escape_list:
 185                 new_section += section[last_epos:epos]
 186                 if callable_includeescapes and includeescapes(section[epos:epos+lenescape+1]):
 187                     last_epos = epos
 188                 else:
 189                     last_epos = epos + lenescape
 190             section = new_section + section[last_epos:]
 191         extracted += section
 192     return (extracted, instring)
 193
 194 def escapequotes(source, escapeescapes=0):
 195     "Returns the same string, with double quotes escaped with backslash"
 196     if escapeescapes:
 197         return source.replace('\\', '\\\\').replace('"', '\\"')
 198     else:
 199         return source.replace('"','\\"')
 200
 201 def escapesinglequotes(source):
 202     "Returns the same string, with single quotes doubled"
 203     return source.replace("'","''")
 204
 205 def htmlentityencode(source):
 206     """encodes source using HTML entities e.g. © -> &copy;"""
 207     output = ""
 208     for char in source:
 209         charnum = ord(char)
 210         if charnum in htmlentitydefs.codepoint2name:
 211             output += "&%s;" % htmlentitydefs.codepoint2name[charnum]
 212         else:
 213             output += str(char)
 214     return output
 215
 216 def htmlentitydecode(source):
 217     """decodes source using HTML entities e.g. &copy; -> ©"""
 218     output = u""
 219     inentity = False
 220     for char in source:
 221         if char == "&":
 222             inentity = True
 223             possibleentity = ""
 224             continue
 225         if inentity:
 226             if char == ";":
 227                 if len(possibleentity) > 0 and possibleentity in htmlentitydefs.name2codepoint:
 228                     output += unichr(htmlentitydefs.name2codepoint[possibleentity])
 229                     inentity = False
 230                 else:
 231                     output += "&" + possibleentity + ";"
 232                     inentity = False
 233             elif char == " ":
 234                 output += "&" + possibleentity + char
 235                 inentity = False
 236             else:
 237                 possibleentity += char
 238         else:
 239             output += char
 240     return output
 241
 242 def javapropertiesencode(source):
 243     """encodes source in the escaped-unicode encoding used by Java .properties files"""
 244     output = ""
 245     for char in source:
 246         charnum = ord(char)
 247         if char in controlchars:
 248             output += controlchars[char]
 249         elif 0 <= charnum < 128:
 250             output += str(char)
 251         else:
 252             output += "\\u%04X" % charnum
 253     return output
 254
 255 def mozillapropertiesencode(source):
 256     """encodes source in the escaped-unicode encoding used by Mozilla .properties files"""
 257     output = ""
 258     for char in source:
 259         charnum = ord(char)
 260         if char in controlchars:
 261             output += controlchars[char]
 262         else:
 263             output += char
 264     return output
 265
 266 propertyescapes = {
 267     # escapes that are self-escaping
 268     "\\": "\\", "'": "'", '"': '"',
 269     # control characters that we keep
 270     "b": "\b", "f": "\f", "t": "\t", "n": "\n", "v": "\v", "a": "\a"
 271     }
 272
 273 controlchars = {
 274     # the reverse of the above...
 275     "\b": "\\b", "\f": "\\f", "\t": "\\t", "\n": "\\n", "\v": "\\v"
 276     }
 277
 278 def escapecontrols(source):
 279     """escape control characters in the given string"""
 280     for key, value in controlchars.iteritems():
 281         source = source.replace(key, value)
 282     return source
 283
 284 def mozillapropertiesdecode(source):
 285     """decodes source from the escaped-unicode encoding used by mozilla .properties files"""
 286     # since the .decode("unicode-escape") routine decodes everything, and we don't want to
 287     # we reimplemented the algorithm from Python Objects/unicode.c in Python here
 288     # and modified it to retain escaped control characters
 289     output = u""
 290     s = 0
 291     if isinstance(source, str):
 292         source = source.decode("utf-8")
 293     def unichr2(i):
 294         """Returns a Unicode string of one character with ordinal 32 <= i, otherwise an escaped control character"""
 295         if 32 <= i:
 296             return unichr(i)
 297         elif unichr(i) in controlchars:
 298             # we just return the character, unescaped
 299             # if people want to escape them they can use escapecontrols
 300             return unichr(i)
 301         else:
 302             return "\\u%04x" % i
 303     while s < len(source):
 304         c = source[s]
 305         if c != '\\':
 306             output += c
 307             s += 1
 308             continue
 309         s += 1
 310         if s >= len(source):
 311             # this is an escape at the end of the line, which implies a continuation...
 312             # return the escape to inform the parser
 313             output += c
 314             continue
 315         c = source[s]
 316         s += 1
 317         if c == '\n': pass
 318         # propertyescapes lookups
 319         elif c in propertyescapes: output += propertyescapes[c]
 320         # \uXXXX escapes
 321         # \UXXXX escapes
 322         elif c in "uU":
 323             digits = 4
 324             x = 0
 325             for digit in range(digits):
 326                 x <<= 4
 327                 if s + digit >= len(source):
 328                     digits = digit
 329                     break
 330                 c = source[s+digit].lower()
 331                 if c.isdigit():
 332                     x += ord(c) - ord('0')
 333                 elif c in "abcdef":
 334                     x += ord(c) - ord('a') + 10
 335                 else:
 336                     break
 337             s += digits
 338             output += unichr2(x)
 339         elif c == "N":
 340             if source[s] != "{":
 341                 logging.warn("Invalid named unicode escape: no { after \\N")
 342                 output += "\\" + c
 343                 continue
 344             s += 1
 345             e = source.find("}", s)
 346             if e == -1:
 347                 logging.warn("Invalid named unicode escape: no } after \\N{")
 348                 output += "\\" + c
 349                 continue
 350             import unicodedata
 351             name = source[s:e]
 352             output += unicodedata.lookup(name)
 353             s = e + 1
 354         else:
 355             output += "\\" + c
 356     return output
 357
 358 def quotestr(source, escapeescapes=0):
 359     "Returns a doublequote-delimited quoted string, escaping double quotes with backslash"
 360     if isinstance(source, list):
 361         firstline = True
 362         for line in source:
 363             if firstline:
 364                 newsource = '"' + escapequotes(line, escapeescapes) + '"'
 365                 firstline = False
 366             else:
 367                 newsource = newsource + '\n' + '"' + escapequotes(line, escapeescapes) + '"'
 368         return newsource
 369     else:
 370         return '"' + escapequotes(source, escapeescapes) + '"'
 371
 372 def singlequotestr(source):
 373     "Returns a doublequote-delimited quoted string, escaping single quotes with themselves"
 374     return "'" + escapesinglequotes(source) + "'"
 375
 376 def eitherquotestr(source):
 377     "Returns a singlequote- or doublequote-delimited string, depending on what quotes it contains"
 378     if '"' in source:
 379         return singlequotestr(source)
 380     else:
 381         return quotestr(source)
 382
 383 def findend(string, substring):
 384     s = string.find(substring)
 385     if s != -1:
 386         s += len(substring)
 387     return s
 388
 389 def rstripeol(string):
 390     return string.rstrip("\r\n")
 391
 392 def stripcomment(comment, startstring="<!--", endstring="-->"):
 393     cstart = comment.find(startstring)
 394     if cstart == -1:
 395         cstart = 0
 396     else:
 397         cstart += len(startstring)
 398     cend = comment.find(endstring, cstart)
 399     return comment[cstart:cend].strip()
 400
 401 def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
 402     return startstring+comment.strip()+endstring
 403
 404 def encodewithdict(unencoded, encodedict):
 405     """encodes certain characters in the string using an encode dictionary"""
 406     encoded = unencoded
 407     for key, value in encodedict.iteritems():
 408         if key in encoded:
 409             encoded = encoded.replace(key, value)
 410     return encoded
 411
 412 def makeutf8(d):
 413     """convert numbers to utf8 codes in the values of a dictionary"""
 414     for key, value in d.items():
 415         if type(value) == int:
 416             d[key] = unichr(value).encode('utf8')
 417     return d
 418
 419 def testcase():
 420     x = ' "this" " is " "a" " test!" '
 421     print extract(x, '"', '"', None)
 422     print extract(x, '"', '"', '!')
 423     print extractwithoutquotes(x, '"', '"', None)
 424     print extractwithoutquotes(x, '"', '"', '!')
 425     print extractwithoutquotes(x, '"', '"', '!', includeescapes=False)
 426
 427 if __name__ == '__main__':
 428     testcase()
 429