A number of methods in UnitChecker were called very, very frequently
[translate_toolkit.git] / misc / quote.py
blob3dd479292291c0eda8e260c074e595bebf0e7169
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2002-2006 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """string processing utilities for extracting strings with various kinds of delimiters"""
24 import logging
25 import htmlentitydefs
def find_all(searchin, substr):
    """Return the start indexes of every non-overlapping occurrence of substr.

    The search runs left to right and resumes just past the end of each
    match, so matches never overlap (find_all("aaaa", "aa") gives [0, 2]).

    An empty substr yields an empty list: str.find reports a zero-length
    match without ever advancing, so scanning for it would never terminate.
    """
    locations = []
    step = len(substr)
    if not step:
        return locations
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        # resume after this match so that occurrences cannot overlap
        location = searchin.find(substr, location + step)
    return locations
def extract(source, startdelim, enddelim, escape=None, startinstring=False, allowreentry=True):
    """Extract the delimited sections of source, delimiters included.

    Sections open at startdelim and close at enddelim; a delimiter that is
    directly preceded by an unescaped escape sequence is ignored.  Text
    outside the sections is dropped and the sections are concatenated.

    source -- the string to scan
    startdelim, enddelim -- opening and closing delimiters (may be equal)
    escape -- escape sequence, or None when delimiters cannot be escaped
    startinstring -- true when source starts inside an already open section
    allowreentry -- when false, at most one section is opened within source

    Returns a tuple of (extracted text with delimiters, whether a section is
    still open at the end of source).
    """
    # note that this returns the quote characters as well... even internally
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        all_escapes = set(escape_places)
        # filter escaped escapes: in a run of adjacent escape sequences every
        # second one is itself escaped and therefore escapes nothing
        true_escape = False
        true_escapes = set()
        for escape_pos in escape_places:
            if escape_pos - lenescape in all_escapes:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escapes.add(escape_pos)
        startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escapes]
        # end places are stored as the index just past the closing delimiter
        enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escapes]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # sets give constant-time membership tests in the scan below
    starts = set(startdelim_places)
    ends = set(enddelim_places)
    # get a unique sorted list of the significant places in the string
    significant_places = sorted(starts | ends | set([0, len(source) - 1]))
    extracted = ""
    # None slices from the beginning of source when we start inside a section
    lastpos = None
    for pos in significant_places:
        if instring and pos in ends:
            # make sure that if startdelim == enddelim we don't get confused and count the same string as start and end
            if lastpos == pos - lenstart and lastpos in starts:
                continue
            extracted += source[lastpos:pos]
            instring = False
            lastpos = pos
        if (not instring) and pos in starts and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # unterminated section: keep everything up to the end of source
        extracted += source[lastpos:]
    return (extracted, instring)
def extractfromlines(lines, startdelim, enddelim, escape):
    """Run extract over successive lines, carrying the in-string state along.

    Processing stops after the first line that ends outside a delimited
    section; the text extracted up to that point is returned as one string.
    """
    pieces = []
    stillopen = 0
    for current in lines:
        piece, stillopen = extract(current, startdelim, enddelim, escape, stillopen)
        pieces.append(piece)
        if not stillopen:
            break
    return "".join(pieces)
def extractstr(source):
    """Return the double-quoted sections of source, quotes included.

    A backslash escapes a quote; the "still in string" flag reported by
    extract is discarded.
    """
    return extract(source, '"', '"', '\\')[0]
def extractcomment(lines):
    """Extract an XML comment (<!-- ... -->) from a sequence of lines."""
    return extractfromlines(lines, startdelim="<!--", enddelim="-->", escape=None)
def extractwithoutquotes(source, startdelim, enddelim, escape=None, startinstring=False, includeescapes=True, allowreentry=True):
    """Extract the delimited sections of source, leaving the delimiters out.

    source -- the string to scan
    startdelim, enddelim -- opening and closing delimiters (may be equal)
    escape -- escape sequence, or None when delimiters cannot be escaped
    startinstring -- true when source starts inside an already open section
    includeescapes -- True keeps escape sequences untouched; a false value
        removes each escape sequence but keeps the character it escaped; a
        callable receives the escape sequence plus the following character
        and returns the replacement text (returning a boolean instead is the
        deprecated way of choosing between keeping and removing the escape)
    allowreentry -- when false, at most one section is opened within source

    Returns a tuple of (extracted text without delimiters, whether a section
    is still open at the end of source).
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # NOTE(review): last_escape_pos is never read afterwards
        last_escape_pos = -1
        # filter escaped escapes: in a run of adjacent escape sequences every
        # second one is itself escaped and therefore escapes nothing
        true_escape = False
        true_escape_places = []
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_places:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        # drop delimiters that directly follow a real escape
        startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
        # end places are stored as the index just past the closing delimiter
        enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # get a unique sorted list of the significant places in the string
    significant_places = dict.fromkeys([0] + startdelim_places + enddelim_places + [len(source)-1]).keys()
    significant_places.sort()
    extracted = ""
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    # escapes only need rewriting when they are filtered by a callable or removed
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        # the lastpos test presumably stops the opening delimiter from being
        # taken as its own closing delimiter when startdelim == enddelim
        if instring and pos in enddelim_places and lastpos != pos - lenstart:
            # bounds of the text between the two delimiters
            section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # positions of the real escapes, relative to the section
                escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        # hand over the escape sequence plus the character after it
                        replace_escape = includeescapes(section[epos:epos+lenescape+1])
                        # TODO: deprecate old method of returning boolean from includeescape, by removing this if block
                        if not isinstance(replace_escape, basestring):
                            if replace_escape:
                                # true: keep escape and escaped character
                                replace_escape = section[epos:epos+lenescape+1]
                            else:
                                # false: keep only the escaped character
                                replace_escape = section[epos+lenescape:epos+lenescape+1]
                        new_section += replace_escape
                        last_epos = epos + lenescape + 1
                    else:
                        # skip the escape sequence, keep what follows it
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # unterminated section: take everything after the opening delimiter
        section_start = lastpos + len(startdelim)
        section = source[section_start:]
        # NOTE(review): this guard uses "not includeescapes" where the loop
        # above uses checkescapes, so a callable includeescapes is never
        # consulted here and the callable branch below looks unreachable
        if escape is not None and not includeescapes:
            escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(section[epos:epos+lenescape+1]):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)
def escapequotes(source, escapeescapes=0):
    """Return source with every double quote preceded by a backslash.

    With a true escapeescapes, existing backslashes are doubled first.
    """
    if not escapeescapes:
        return source.replace('"', '\\"')
    doubled = source.replace('\\', '\\\\')
    return doubled.replace('"', '\\"')
def escapesinglequotes(source):
    """Return source with every single quote doubled."""
    fragments = source.split("'")
    return "''".join(fragments)
def htmlentityencode(source):
    """Encode source using named HTML entities.

    Every character whose code point has a name in
    htmlentitydefs.codepoint2name becomes "&name;" (the copyright sign
    becomes "&copy;"); all other characters are passed through str().
    """
    names = htmlentitydefs.codepoint2name
    pieces = []
    for char in source:
        code = ord(char)
        if code in names:
            pieces.append("&%s;" % names[code])
        else:
            pieces.append(str(char))
    return "".join(pieces)
def htmlentitydecode(source):
    """Decode the named HTML entities in source ("&copy;" becomes the copyright sign).

    A candidate entity starts at "&" and ends at ";".  Candidates that are
    not known entity names, or that are cut short by a space, another "&" or
    the end of the input, are copied to the output unchanged, so no input
    text is ever lost.

    Returns a unicode string.
    """
    output = u""
    inentity = False
    possibleentity = ""
    for char in source:
        if char == "&":
            if inentity:
                # a new "&" abandons the pending candidate: emit it literally
                output += "&" + possibleentity
            inentity = True
            possibleentity = ""
            continue
        if inentity:
            if char == ";":
                if possibleentity in htmlentitydefs.name2codepoint:
                    output += unichr(htmlentitydefs.name2codepoint[possibleentity])
                else:
                    # unknown (or empty) name: keep the text as it was
                    output += "&" + possibleentity + ";"
                inentity = False
            elif char == " ":
                # entity names cannot contain spaces, so this was a plain "&"
                output += "&" + possibleentity + char
                inentity = False
            else:
                possibleentity += char
        else:
            output += char
    if inentity:
        # input ended inside a candidate (e.g. "AT&T"): keep the pending text
        output += "&" + possibleentity
    return output
def javapropertiesencode(source):
    """Encode source in the escaped-unicode form used by Java .properties files.

    Characters listed in controlchars are replaced by their backslash
    escapes, other characters below 128 are kept, and everything else is
    written as a backslash-u escape with four uppercase hex digits.
    """
    pieces = []
    for char in source:
        code = ord(char)
        if char in controlchars:
            pieces.append(controlchars[char])
            continue
        if code < 128:
            pieces.append(str(char))
        else:
            pieces.append("\\u%04X" % code)
    return "".join(pieces)
def mozillapropertiesencode(source):
    """Encode source in the form used by Mozilla .properties files.

    Only the characters listed in controlchars are replaced (by their
    backslash escapes); every other character, including non-ASCII ones, is
    kept as it is.
    """
    return "".join([controlchars.get(char, char) for char in source])
# maps the character that follows a backslash in a .properties value to the
# character the escape stands for
propertyescapes = {
    # escapes that are self-escaping
    "\\": "\\", "'": "'", '"': '"',
    # control characters that we keep
    "b": "\b", "f": "\f", "t": "\t", "n": "\n", "v": "\v", "a": "\a"
}
# maps control characters to their backslash escapes: the reverse of the
# control-character entries of propertyescapes, except that \a is not included
controlchars = {
    "\b": "\\b", "\f": "\\f", "\t": "\\t", "\n": "\\n", "\v": "\\v"
}
def escapecontrols(source):
    """Return source with every character listed in controlchars replaced by its backslash escape."""
    # items() exists on both Python 2 and Python 3 dictionaries; the
    # replacements never produce a control character, so order is irrelevant
    for char, escaped in controlchars.items():
        source = source.replace(char, escaped)
    return source
def mozillapropertiesdecode(source):
    """Decode source from the escaped-unicode encoding used by Mozilla .properties files.

    Byte strings are first decoded as UTF-8.  Then, for each backslash:

    - backslash + newline is dropped
    - a character listed in propertyescapes is replaced by its value
    - backslash + "u" or "U" followed by one to four hex digits gives that
      character; control characters not listed in controlchars are written
      back as an escape instead.  Only the hex digits actually present are
      consumed, and with no hex digit at all the escape is kept verbatim
    - backslash + "N{name}" gives the named character; a malformed escape is
      logged and kept verbatim
    - any other escape is kept verbatim, as is a lone trailing backslash

    Returns a unicode string.  Raises KeyError (from unicodedata.lookup) for
    an unknown character name.
    """
    # since the .decode("unicode-escape") routine decodes everything, and we don't want to
    # we reimplemented the algorithm from Python Objects/unicode.c in Python here
    # and modified it to retain escaped control characters
    output = u""
    s = 0
    if isinstance(source, str):
        source = source.decode("utf-8")

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i, otherwise an escaped control character"""
        if 32 <= i:
            return unichr(i)
        elif unichr(i) in controlchars:
            # we just return the character, unescaped
            # if people want to escape them they can use escapecontrols
            return unichr(i)
        else:
            return "\\u%04x" % i

    hexdigits = "0123456789abcdef"
    end = len(source)
    while s < end:
        c = source[s]
        if c != '\\':
            output += c
            s += 1
            continue
        s += 1
        if s >= end:
            # this is an escape at the end of the line, which implies a continuation...
            # return the escape to inform the parser
            output += c
            continue
        c = source[s]
        s += 1
        if c == '\n':
            pass
        # propertyescapes lookups
        elif c in propertyescapes:
            output += propertyescapes[c]
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            x = 0
            digits = 0
            while digits < 4 and s + digits < end:
                value = hexdigits.find(source[s + digits].lower())
                if value == -1:
                    break
                # shift only once a digit is accepted, so that a short
                # escape keeps its real value
                x = (x << 4) + value
                digits += 1
            if digits:
                s += digits
                output += unichr2(x)
            else:
                # no hex digit at all: not an escape we can decode
                output += "\\" + c
        elif c == "N":
            if s >= end or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output += "\\" + c
                continue
            e = source.find("}", s + 1)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                # s still points at "{", so the brace is copied by the next pass
                output += "\\" + c
                continue
            import unicodedata
            name = source[s + 1:e]
            output += unicodedata.lookup(name)
            s = e + 1
        else:
            output += "\\" + c
    return output
def quotestr(source, escapeescapes=0):
    """Return source as a double-quoted string, with double quotes backslash-escaped.

    source -- a string, or a list of strings; each item of a list is quoted
        separately and the results are joined with newlines (an empty list
        gives an empty string)
    escapeescapes -- passed on to escapequotes: when true, existing
        backslashes are doubled as well
    """
    if isinstance(source, list):
        return '\n'.join(['"' + escapequotes(line, escapeescapes) + '"' for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
def singlequotestr(source):
    """Return source wrapped in single quotes, with embedded single quotes doubled."""
    escaped = escapesinglequotes(source)
    return "'" + escaped + "'"
def eitherquotestr(source):
    """Quote source with single quotes if it contains a double quote, otherwise with double quotes."""
    if '"' not in source:
        return quotestr(source)
    return singlequotestr(source)
def findend(string, substring):
    """Return the index just past the first occurrence of substring in string, or -1 if it is absent."""
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
def rstripeol(string):
    """Return string without any trailing carriage returns and newlines."""
    eolchars = "\r\n"
    return string.rstrip(eolchars)
def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Return the text between startstring and endstring, stripped of surrounding whitespace.

    A missing startstring means the text starts at the beginning of comment;
    a missing endstring means it runs to the end of comment.
    """
    cstart = comment.find(startstring)
    if cstart == -1:
        cstart = 0
    else:
        cstart += len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        # without this, the -1 would be used as a slice bound and silently
        # cut off the last character
        cend = len(comment)
    return comment[cstart:cend].strip()
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Return the whitespace-stripped comment wrapped in startstring and endstring."""
    body = comment.strip()
    return startstring + body + endstring
def encodewithdict(unencoded, encodedict):
    """Return unencoded with every key of encodedict replaced by the corresponding value.

    Replacements are applied one after another in the dictionary's iteration
    order, each on the result of the previous one.
    """
    encoded = unencoded
    # items() exists on both Python 2 and Python 3 dictionaries
    for key, value in encodedict.items():
        # cheap membership test first: most keys are absent from most strings
        if key in encoded:
            encoded = encoded.replace(key, value)
    return encoded
def makeutf8(d):
    """Replace each plain int value of d by the UTF-8 encoding of that code point.

    The dictionary is modified in place and also returned; values of any
    other type are left alone.
    """
    for name in list(d.keys()):
        entry = d[name]
        if type(entry) is int:
            d[name] = unichr(entry).encode('utf8')
    return d
def testcase():
    """Print the results of extract and extractwithoutquotes for a sample string.

    Each call prints one (extracted text, still in string) tuple.
    """
    x = ' "this" " is " "a" " test!" '
    # a parenthesised single argument prints the same under the Python 2
    # print statement and the Python 3 print function
    print(extract(x, '"', '"', None))
    print(extract(x, '"', '"', '!'))
    print(extractwithoutquotes(x, '"', '"', None))
    print(extractwithoutquotes(x, '"', '"', '!'))
    print(extractwithoutquotes(x, '"', '"', '!', includeescapes=False))
# run the demonstration when this module is executed as a script
if __name__ == '__main__':
    testcase()