fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / oo.py
blobf406518feb07432fbf94e69640b6fb1d34ceea12
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2002-2008 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """
23 Classes that hold units of .oo files (oounit) or entire files (oofile).
25 These are specific .oo files for localisation exported by OpenOffice.org - SDF
26 format (previously known as GSI files). For an overview of the format, see
27 http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html
29 The behaviour in terms of escaping is explained in detail in the programming
30 comments.
31 """
32 # FIXME: add simple test which reads in a file and writes it out again
34 import os
35 import re
36 import sys
37 from translate.misc import quote
38 from translate.misc import wStringIO
39 import warnings
# File normalisation

normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# 256-entry translation table for byte strings: each character listed in
# normalfilenamechars maps to itself and every other character maps to "_".
# The entries are collected in a list and joined once instead of growing a
# string with += for each of the 256 characters.
_normalizeentries = ["_"] * 256
for char in normalfilenamechars:
    _normalizeentries[ord(char)] = char
normalizetable = "".join(_normalizeentries)
class unormalizechar(dict):
    """Translation mapping that lets only a given set of characters through.

    Looking up the ordinal of one of the normal characters gives back that
    character; looking up any other key gives u"_".
    """

    def __init__(self, normalchars):
        """Index every character of normalchars by its ordinal."""
        self.normalchars = dict([(ord(char), char) for char in normalchars])

    def __getitem__(self, key):
        """Return the normal character stored for key, or u"_" if there is none."""
        try:
            return self.normalchars[key]
        except KeyError:
            return u"_"
# Translation mapping for unicode filenames, built from the same set of
# characters as the byte-string table.
unormalizetable = unormalizechar(unicode(normalfilenamechars, "ascii"))
def normalizefilename(filename):
    """Replace every character that is not in normalfilenamechars with "_".

    Byte strings are translated with normalizetable; any other kind of string
    is translated with unormalizetable.
    """
    if not isinstance(filename, str):
        return filename.translate(unormalizetable)
    return filename.translate(normalizetable)
68 # These are functions that deal with escaping and unescaping of the text fields
69 # of the SDF file. These should only be applied to the text column.
70 # The fields quickhelptext and title are assumed to carry no escaping.
72 # The escaping of all strings, except those coming from .xhp (helpcontent2)
73 # sourcefiles, works as follows:
74 # (newline) -> \n
75 # (carriage return) -> \r
76 # (tab) -> \t
77 # Backslash characters (\) and single quotes (') are not consistently escaped,
78 # and are therefore left as they are.
80 # For strings coming from .xhp (helpcontent2) sourcefiles the following
81 # characters are escaped inside XML tags only:
82 # < -> \< when used with lowercase tagnames (with some exceptions)
83 # > -> \> when used with lowercase tagnames (with some exceptions)
84 # " -> \" around XML properties
85 # The following is consistently escaped in .xhp strings (not only in XML tags):
86 # \ -> \\
def escape_text(text):
    r"""Escape text for storage in the text column of an SDF file.

    Newlines, tabs and carriage returns are written as \n, \t and \r.
    Backslashes and quotes are left as they are.
    """
    for char, escaped in (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        text = text.replace(char, escaped)
    return text
def unescape_text(text):
    r"""Unescape text from the SDF text column to be suitable for unit consumption.

    \n, \t and \r become a newline, a tab and a carriage return.  A doubled
    backslash is kept exactly as it is, and the character that follows it is
    never taken for the start of an escape sequence.

    The text is scanned in a single pass.  Doubled backslashes are not parked
    in a placeholder character while the other sequences are replaced, so a
    bell character ("\a") that is really present in the text is returned
    unchanged instead of being rewritten as two backslashes.
    """
    unescaped = {"\\\\": "\\\\", "\\n": "\n", "\\t": "\t", "\\r": "\r"}
    return re.sub(r"\\[\\ntr]", lambda match: unescaped[match.group(0)], text)
# Matches a lowercase XML/HTML-like tag, optionally closing or self-closing and
# with at most one property.  Written as a raw string so that "\-" reaches the
# regex engine as intended instead of being an invalid string escape.
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')

def escape_help_text(text):
    r"""Escape help (.xhp) text as it would be stored in an SDF file.

    Every backslash is doubled first.  After that the <, > and " characters
    are escaped with a backslash, but only inside tags matched by helptagre
    (lowercase tag names).  Some HTML tags make it in in lowercase so those
    are dealt with.  Some OpenOffice.org help tags are not escaped: the tags
    listed in unescaped_tags and any tag starting with one of
    unescaped_prefixes are left as they are.
    """
    unescaped_tags = ["<br>", "<h1>", "</h1>", "<img ...>", "<->", "<empty>", "<ref>", "<references>"]
    unescaped_prefixes = ["<font", "<node", "<help_section"]
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        if tag in unescaped_tags:
            continue
        for prefix in unescaped_prefixes:
            if tag.startswith(prefix):
                break
        else:
            # No exemption applied.  Every occurrence of this exact tag is
            # replaced in one go, so a tag that is found again later in the
            # text no longer matches anything.
            escaped_tag = ("\\<" + tag[1:-1] + "\\>").replace('"', '\\"')
            text = text.replace(tag, escaped_tag)
    return text
def unescape_help_text(text):
    r"""Undo the escaping of help (.xhp) text read from an SDF file.

    \<, \>, \" and \\ are replaced, in that order, by <, >, " and a single
    backslash.
    """
    for escaped, plain in ((r"\<", "<"), (r"\>", ">"), (r'\"', '"'), (r"\\", "\\")):
        text = text.replace(escaped, plain)
    return text
def encode_if_needed_utf8(text):
    """Return text encoded as UTF-8 if it is a unicode string, else unchanged."""
    if not isinstance(text, unicode):
        return text
    return text.encode('UTF-8')
class ooline(object):
    """A single line of an .oo file, holding one translation of one resource."""

    def __init__(self, parts=None):
        """Build the line from its column values, or with every column empty."""
        if parts is not None:
            self.setparts(parts)
            return
        # No parts given: blank all 15 columns.  The text column is assigned
        # through the text property, so it is escaped like any other value.
        (self.project, self.sourcefile, self.dummy, self.resourcetype,
         self.groupid, self.localid, self.helpid, self.platform,
         self.width, self.languageid, self.text, self.helptext,
         self.quickhelptext, self.title, self.timestamp) = [""] * 15

    def setparts(self, parts):
        """Fill in the columns from the tab-delimited parts of a line.

        A warning is issued when there are not exactly 15 parts; missing
        columns are then padded with empty strings and surplus parts dropped.
        The text column is stored as given, without any escaping.
        """
        partcount = len(parts)
        if partcount != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
                (partcount, parts))
            columns = list(parts)[:15]
            columns.extend([""] * (15 - len(columns)))
            parts = tuple(columns)
        (self.project, self.sourcefile, self.dummy, self.resourcetype,
         self.groupid, self.localid, self.helpid, self.platform,
         self.width, self.languageid, self._text, self.helptext,
         self.quickhelptext, self.title, self.timestamp) = parts

    def getparts(self):
        """Return the 15 columns of this line as a tuple (text still escaped)."""
        return (self.project, self.sourcefile, self.dummy, self.resourcetype,
                self.groupid, self.localid, self.helpid, self.platform,
                self.width, self.languageid, self._text, self.helptext,
                self.quickhelptext, self.title, self.timestamp)

    def gettext(self):
        """Return the unescaped text column; .xhp sources use help unescaping."""
        if self.sourcefile.endswith(".xhp"):
            return unescape_help_text(self._text)
        return unescape_text(self._text)

    def settext(self, text):
        """Escape text and store it in the text column; .xhp sources use help escaping."""
        if self.sourcefile.endswith(".xhp"):
            self._text = escape_help_text(text)
            return
        self._text = escape_text(text)

    text = property(gettext, settext)

    def __str__(self):
        """Return the tab-delimited line, encoded as UTF-8 if it is unicode."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return the columns of this line joined with tabs."""
        return "\t".join(self.getparts())

    def getkey(self):
        """Return the tuple of columns that identifies the resource."""
        return (self.project, self.sourcefile, self.resourcetype, self.groupid,
                self.localid, self.platform)
class oounit:
    """The translations of one resource: a number of lines, indexed by language."""

    def __init__(self):
        """Create a unit that holds no lines yet."""
        self.languages = {}
        self.lines = []

    def addline(self, line):
        """Index line under its language id and append it to the list of lines."""
        self.languages[line.languageid] = line
        self.lines.append(line)

    def __str__(self):
        """Return the unit's output, encoded as UTF-8 if it is unicode."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return every line converted with str(), joined by CR LF."""
        return "\r\n".join(map(str, self.lines))
class oofile:
    """An entire .oo file: its lines, grouped into one unit per resource key."""
    UnitClass = oounit

    def __init__(self, input=None):
        """Create an empty file and parse input into it when input is given."""
        self.oolines = []
        self.units = []
        self.ookeys = {}
        self.filename = ""
        self.languages = []
        if input is not None:
            self.parse(input)

    def addline(self, thisline):
        """Add a parsed line, creating the unit for its key if there is none yet."""
        key = thisline.getkey()
        unit = self.ookeys.get(key)
        if unit is None:
            unit = self.UnitClass()
            self.units.append(unit)
            self.ookeys[key] = unit
        unit.addline(thisline)
        self.oolines.append(thisline)
        # Keep the languages in the order in which they were first seen.
        languageid = thisline.languageid
        if languageid not in self.languages:
            self.languages.append(languageid)

    def parse(self, input):
        """Parse the lines of input and add them to the file.

        input is either the source text itself or an object with a read()
        method; such an object is read completely and then closed.  If no
        filename is set yet it is taken from input's name attribute, when
        there is one.  Empty lines are skipped.
        """
        if not self.filename:
            self.filename = getattr(input, 'name', '')
        src = input
        if hasattr(input, "read"):
            src = input.read()
            input.close()
        for rawline in src.split("\n"):
            rawline = quote.rstripeol(rawline)
            if rawline:
                self.addline(ooline(rawline.split("\t")))

    def __str__(self):
        """Return the file's output, encoded as UTF-8 if it is unicode."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return all units in tab-delimited form, each followed by CR LF.

        Two warnings are issued for every unit that holds more than two lines.
        """
        chunks = []
        for unit in self.units:
            linecount = len(unit.lines)
            if linecount > 2:
                warnings.warn("contains %d lines (should be 2 at most): languages %r" % (linecount, unit.languages))
                unitkeys = [line.getkey() for line in unit.lines]
                warnings.warn("contains %d lines (should be 2 at most): keys %r" % (linecount, unitkeys))
            chunks.append(str(unit) + "\r\n")
        return "".join(chunks)
class oomultifile:
    """Presents one huge GSI file as a number of smaller subfiles.

    The multifile is opened when the object is created and stays open until
    close() is called.
    """

    def __init__(self, filename, mode=None, multifilestyle="single"):
        """Open the multifile as a seekable inputfile or a writable outputfile.

        If mode is None the file is read when it exists and written otherwise.
        multifilestyle selects how lines are grouped into subfiles (see
        getsubfilename).  In mode "r" the subfile index is built immediately.
        """
        self.filename = filename
        if mode is None:
            if os.path.exists(filename):
                mode = 'r'
            else:
                mode = 'w'
        self.mode = mode
        self.multifilestyle = multifilestyle
        self.multifilename = os.path.splitext(filename)[0]
        self.multifile = open(filename, mode)
        self.subfilelines = {}
        if mode == "r":
            self.createsubfileindex()

    def close(self):
        """Close the underlying multifile."""
        self.multifile.close()

    def createsubfileindex(self):
        """Read all the lines and record which line numbers belong to which subfile."""
        for linenum, line in enumerate(self.multifile):
            subfile = self.getsubfilename(line)
            self.subfilelines.setdefault(subfile, []).append(linenum)

    def getsubfilename(self, line):
        """Return the name of the subfile that the given line belongs to.

        The first two tab-delimited columns are used as module and filename.
        With multifilestyle "onefile" every line goes into one subfile named
        after the multifile; with "toplevel" there is one subfile per module;
        otherwise there is one subfile per module and directory of the
        filename.  Raises ValueError if the line has fewer than two tabs.
        """
        if line.count("\t") < 2:
            raise ValueError("invalid tab-delimited line: %r" % line)
        lineparts = line.split("\t", 2)
        module, filename = lineparts[0], lineparts[1]
        if self.multifilestyle == "onefile":
            ooname = self.multifilename
        elif self.multifilestyle == "toplevel":
            ooname = module
        else:
            # Accept both kinds of path separator, then drop the last path
            # component (the source file's own name).
            filename = filename.replace("\\", "/")
            fileparts = [module] + filename.split("/")
            ooname = os.path.join(*fileparts[:-1])
        return ooname + os.extsep + "oo"

    def listsubfiles(self):
        """Return the names of the subfiles in the file."""
        return self.subfilelines.keys()

    def __iter__(self):
        """Iterate through the subfile names."""
        for subfile in self.listsubfiles():
            yield subfile

    def __contains__(self, pathname):
        """Check whether pathname is the name of a subfile."""
        return pathname in self.subfilelines

    def getsubfilesrc(self, subfile):
        """Return the lines of the given subfile as one string.

        The multifile is read again from the start.  Raises KeyError if
        subfile is not a known subfile.
        """
        requiredlines = dict.fromkeys(self.subfilelines[subfile])
        self.multifile.seek(0)
        lines = [line for linenum, line in enumerate(self.multifile)
                 if linenum in requiredlines]
        return "".join(lines)

    def openinputfile(self, subfile):
        """Return a pseudo-file object holding the source of the given subfile."""
        subfilesrc = self.getsubfilesrc(subfile)
        inputfile = wStringIO.StringIO(subfilesrc)
        inputfile.filename = subfile
        return inputfile

    def openoutputfile(self, subfile):
        """Return a pseudo-file object whose contents go to the multifile when it is closed."""
        def onclose(contents):
            self.multifile.write(contents)
            self.multifile.flush()
        outputfile = wStringIO.CatchStringOutput(onclose)
        outputfile.filename = subfile
        return outputfile

    def getoofile(self, subfile):
        """Return an oofile built up from the given subfile's lines."""
        subfilesrc = self.getsubfilesrc(subfile)
        oosubfile = oofile()
        oosubfile.filename = subfile
        oosubfile.parse(subfilesrc)
        return oosubfile
if __name__ == '__main__':
    # Used as a script: parse an .oo file from standard input and write it
    # back out to standard output.
    source = sys.stdin.read()
    parsed = oofile()
    parsed.parse(source)
    sys.stdout.write(str(parsed))