2 # -*- coding: utf-8 -*-
4 # Copyright 2002-2006 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """string processing utilities for extracting strings with various kinds of delimiters"""
27 def find_all(searchin
, substr
):
28 """returns a list of locations where substr occurs in searchin
29 locations are not allowed to overlap"""
33 location
= searchin
.find(substr
, location
)
35 locations
.append(location
)
36 location
+= len(substr
)
39 def extract(source
, startdelim
, enddelim
, escape
=None, startinstring
=False, allowreentry
=True):
40 """Extracts a doublequote-delimited string from a string, allowing for backslash-escaping
41 returns tuple of (quoted string with quotes, still in string at end)"""
42 # note that this returns the quote characters as well... even internally
43 instring
= startinstring
45 lenstart
= len(startdelim
)
46 lenend
= len(enddelim
)
47 startdelim_places
= find_all(source
, startdelim
)
48 if startdelim
== enddelim
:
49 enddelim_places
= startdelim_places
[:]
51 enddelim_places
= find_all(source
, enddelim
)
52 if escape
is not None:
53 lenescape
= len(escape
)
54 escape_places
= find_all(source
, escape
)
56 # filter escaped escapes
58 true_escape_places
= []
59 for escape_pos
in escape_places
:
60 if escape_pos
- lenescape
in escape_places
:
61 true_escape
= not true_escape
65 true_escape_places
.append(escape_pos
)
66 startdelim_places
= [pos
for pos
in startdelim_places
if pos
- lenescape
not in true_escape_places
]
67 enddelim_places
= [pos
+ lenend
for pos
in enddelim_places
if pos
- lenescape
not in true_escape_places
]
69 enddelim_places
= [pos
+ lenend
for pos
in enddelim_places
]
70 # get a unique sorted list of the significant places in the string
71 significant_places
= dict.fromkeys([0] + startdelim_places
+ enddelim_places
+ [len(source
)-1]).keys()
72 significant_places
.sort()
75 for pos
in significant_places
:
76 if instring
and pos
in enddelim_places
:
77 # make sure that if startdelim == enddelim we don't get confused and count the same string as start and end
78 if lastpos
== pos
- lenstart
and lastpos
in startdelim_places
:
80 extracted
+= source
[lastpos
:pos
]
83 if (not instring
) and pos
in startdelim_places
and not (enteredonce
and not allowreentry
):
88 extracted
+= source
[lastpos
:]
89 return (extracted
, instring
)
91 def extractfromlines(lines
, startdelim
, enddelim
, escape
):
92 """Calls extract over multiple lines, remembering whether in the string or not"""
96 (string
, instring
) = extract(line
, startdelim
, enddelim
, escape
, instring
)
98 if not instring
: break
def extractstr(source):
    """Extract the doublequote-delimited, backslash-escaped string from source.

    Delegates to ``extract`` using a double quote as both the start and the
    end delimiter and a backslash as the escape sequence.

    Returns the first element of the tuple produced by ``extract``, i.e. the
    quoted text with the quote characters still included. The second
    element (whether the scan ended while still inside a string) is not
    needed here and is dropped.
    """
    quoted, _still_in_string = extract(source, '"', '"', '\\')
    # The extracted text is the whole point of this helper, so hand it back
    # to the caller instead of discarding it.
    return quoted
def extractcomment(lines):
    """Extract ``<!-- ... -->`` XML comments from lines.

    Thin wrapper around ``extractfromlines`` that supplies the XML comment
    opener and closer as the delimiters and disables escape handling.
    """
    comment_open = "<!--"
    comment_close = "-->"
    no_escape = None
    return extractfromlines(lines, comment_open, comment_close, no_escape)
110 def extractwithoutquotes(source
, startdelim
, enddelim
, escape
=None, startinstring
=False, includeescapes
=True, allowreentry
=True):
111 """Extracts a doublequote-delimited string from a string, allowing for backslash-escaping
112 includeescapes can also be a function that takes the whole escaped string and returns the replaced version"""
113 instring
= startinstring
115 lenstart
= len(startdelim
)
116 lenend
= len(enddelim
)
117 startdelim_places
= find_all(source
, startdelim
)
118 if startdelim
== enddelim
:
119 enddelim_places
= startdelim_places
[:]
121 enddelim_places
= find_all(source
, enddelim
)
122 if escape
is not None:
123 lenescape
= len(escape
)
124 escape_places
= find_all(source
, escape
)
126 # filter escaped escapes
128 true_escape_places
= []
129 for escape_pos
in escape_places
:
130 if escape_pos
- lenescape
in escape_places
:
131 true_escape
= not true_escape
135 true_escape_places
.append(escape_pos
)
136 startdelim_places
= [pos
for pos
in startdelim_places
if pos
- lenescape
not in true_escape_places
]
137 enddelim_places
= [pos
+ lenend
for pos
in enddelim_places
if pos
- lenescape
not in true_escape_places
]
139 enddelim_places
= [pos
+ lenend
for pos
in enddelim_places
]
140 # get a unique sorted list of the significant places in the string
141 significant_places
= dict.fromkeys([0] + startdelim_places
+ enddelim_places
+ [len(source
)-1]).keys()
142 significant_places
.sort()
145 callable_includeescapes
= callable(includeescapes
)
146 checkescapes
= callable_includeescapes
or not includeescapes
147 for pos
in significant_places
:
148 if instring
and pos
in enddelim_places
and lastpos
!= pos
- lenstart
:
149 section_start
, section_end
= lastpos
+ len(startdelim
), pos
- len(enddelim
)
150 section
= source
[section_start
:section_end
]
151 if escape
is not None and checkescapes
:
152 escape_list
= [epos
- section_start
for epos
in true_escape_places
if section_start
<= epos
<= section_end
]
155 for epos
in escape_list
:
156 new_section
+= section
[last_epos
:epos
]
157 if callable_includeescapes
:
158 replace_escape
= includeescapes(section
[epos
:epos
+lenescape
+1])
159 # TODO: deprecate old method of returning boolean from includeescape, by removing this if block
160 if not isinstance(replace_escape
, basestring
):
162 replace_escape
= section
[epos
:epos
+lenescape
+1]
164 replace_escape
= section
[epos
+lenescape
:epos
+lenescape
+1]
165 new_section
+= replace_escape
166 last_epos
= epos
+ lenescape
+ 1
168 last_epos
= epos
+ lenescape
169 section
= new_section
+ section
[last_epos
:]
173 if (not instring
) and pos
in startdelim_places
and not (enteredonce
and not allowreentry
):
178 section_start
= lastpos
+ len(startdelim
)
179 section
= source
[section_start
:]
180 if escape
is not None and not includeescapes
:
181 escape_list
= [epos
- section_start
for epos
in true_escape_places
if section_start
<= epos
]
184 for epos
in escape_list
:
185 new_section
+= section
[last_epos
:epos
]
186 if callable_includeescapes
and includeescapes(section
[epos
:epos
+lenescape
+1]):
189 last_epos
= epos
+ lenescape
190 section
= new_section
+ section
[last_epos
:]
192 return (extracted
, instring
)
def escapequotes(source, escapeescapes=0):
    """Return source with every double quote escaped by a backslash.

    source        -- the string to escape
    escapeescapes -- when true, existing backslashes are doubled as well, so
                     that they survive a later unescaping pass; when false
                     (the default) only the double quotes are touched
    """
    if escapeescapes:
        # Double the existing backslashes first: doing it after escaping the
        # quotes would also double the backslashes we add for the quotes.
        source = source.replace('\\', '\\\\')
    return source.replace('"', '\\"')
def escapesinglequotes(source):
    """Return source with each single quote doubled (``'`` becomes ``''``)."""
    single_quote = "'"
    doubled_quote = "''"
    return source.replace(single_quote, doubled_quote)
205 def htmlentityencode(source
):
206 """encodes source using HTML entities e.g. © -> ©"""
210 if charnum
in htmlentitydefs
.codepoint2name
:
211 output
+= "&%s;" % htmlentitydefs
.codepoint2name
[charnum
]
216 def htmlentitydecode(source
):
217 """decodes source using HTML entities e.g. © -> ©"""
227 if len(possibleentity
) > 0 and possibleentity
in htmlentitydefs
.name2codepoint
:
228 output
+= unichr(htmlentitydefs
.name2codepoint
[possibleentity
])
231 output
+= "&" + possibleentity
+ ";"
234 output
+= "&" + possibleentity
+ char
237 possibleentity
+= char
242 def javapropertiesencode(source
):
243 """encodes source in the escaped-unicode encoding used by Java .properties files"""
247 if char
in controlchars
:
248 output
+= controlchars
[char
]
249 elif 0 <= charnum
< 128:
252 output
+= "\\u%04X" % charnum
255 def mozillapropertiesencode(source
):
256 """encodes source in the escaped-unicode encoding used by Mozilla .properties files"""
260 if char
in controlchars
:
261 output
+= controlchars
[char
]
267 # escapes that are self-escaping
268 "\\": "\\", "'": "'", '"': '"',
269 # control characters that we keep
270 "b": "\b", "f": "\f", "t": "\t", "n": "\n", "v": "\v", "a": "\a"
274 # the reverse of the above...
275 "\b": "\\b", "\f": "\\f", "\t": "\\t", "\n": "\\n", "\v": "\\v"
def escapecontrols(source):
    """Escape control characters in the given string.

    Every occurrence of each key of the module-level ``controlchars``
    mapping is replaced by the value mapped to it, and the resulting string
    is returned.
    """
    # .items() rather than .iteritems(): same pairs, and it keeps working
    # on interpreters where dicts have no iteritems method.
    for key, value in controlchars.items():
        source = source.replace(key, value)
    # Strings are immutable, so the rebinding above is invisible to the
    # caller unless the result is returned.
    return source
284 def mozillapropertiesdecode(source
):
285 """decodes source from the escaped-unicode encoding used by mozilla .properties files"""
286 # since the .decode("unicode-escape") routine decodes everything, and we don't want to
287 # we reimplemented the algorithm from Python Objects/unicode.c in Python here
288 # and modified it to retain escaped control characters
291 if isinstance(source
, str):
292 source
= source
.decode("utf-8")
294 """Returns a Unicode string of one character with ordinal 32 <= i, otherwise an escaped control character"""
297 elif unichr(i
) in controlchars
:
298 # we just return the character, unescaped
299 # if people want to escape them they can use escapecontrols
303 while s
< len(source
):
311 # this is an escape at the end of the line, which implies a continuation...
312 # return the escape to inform the parser
318 # propertyescapes lookups
319 elif c
in propertyescapes
: output
+= propertyescapes
[c
]
325 for digit
in range(digits
):
327 if s
+ digit
>= len(source
):
330 c
= source
[s
+digit
].lower()
332 x
+= ord(c
) - ord('0')
334 x
+= ord(c
) - ord('a') + 10
341 logging
.warn("Invalid named unicode escape: no { after \\N")
345 e
= source
.find("}", s
)
347 logging
.warn("Invalid named unicode escape: no } after \\N{")
352 output
+= unicodedata
.lookup(name
)
358 def quotestr(source
, escapeescapes
=0):
359 "Returns a doublequote-delimited quoted string, escaping double quotes with backslash"
360 if isinstance(source
, list):
364 newsource
= '"' + escapequotes(line
, escapeescapes
) + '"'
367 newsource
= newsource
+ '\n' + '"' + escapequotes(line
, escapeescapes
) + '"'
370 return '"' + escapequotes(source
, escapeescapes
) + '"'
def singlequotestr(source):
    """Return source as a singlequote-delimited string.

    Single quotes inside source are escaped by ``escapesinglequotes`` before
    the delimiting single quotes are added on both sides.
    """
    delimiter = "'"
    escaped = escapesinglequotes(source)
    return delimiter + escaped + delimiter
376 def eitherquotestr(source
):
377 "Returns a singlequote- or doublequote-delimited string, depending on what quotes it contains"
379 return singlequotestr(source
)
381 return quotestr(source
)
383 def findend(string
, substring
):
384 s
= string
.find(substring
)
def rstripeol(string):
    """Return string without its trailing carriage returns and line feeds."""
    eol_characters = "\r\n"
    return string.rstrip(eol_characters)
392 def stripcomment(comment
, startstring
="<!--", endstring
="-->"):
393 cstart
= comment
.find(startstring
)
397 cstart
+= len(startstring
)
398 cend
= comment
.find(endstring
, cstart
)
399 return comment
[cstart
:cend
].strip()
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Wrap comment in comment delimiters.

    Leading and trailing whitespace is stripped from comment, then
    startstring is put in front of it and endstring after it.
    """
    trimmed = comment.strip()
    wrapped = startstring + trimmed
    wrapped = wrapped + endstring
    return wrapped
404 def encodewithdict(unencoded
, encodedict
):
405 """encodes certain characters in the string using an encode dictionary"""
407 for key
, value
in encodedict
.iteritems():
409 encoded
= encoded
.replace(key
, value
)
413 """convert numbers to utf8 codes in the values of a dictionary"""
414 for key
, value
in d
.items():
415 if type(value
) == int:
416 d
[key
] = unichr(value
).encode('utf8')
420 x
= ' "this" " is " "a" " test!" '
421 print extract(x
, '"', '"', None)
422 print extract(x
, '"', '"', '!')
423 print extractwithoutquotes(x
, '"', '"', None)
424 print extractwithoutquotes(x
, '"', '"', '!')
425 print extractwithoutquotes(x
, '"', '"', '!', includeescapes
=False)
427 if __name__
== '__main__':