2 # -*- coding: utf-8 -*-
4 # Copyright 2002-2008 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 Classes that hold units of .oo files (oounit) or entire files (oofile).
25 These are specific .oo files for localisation exported by OpenOffice.org - SDF
26 format (previously known as GSI files). For an overview of the format, see
27 http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html
29 The behaviour in terms of escaping is explained in detail in the programming
32 # FIXME: add simple test which reads in a file and writes it out again
37 from translate
.misc
import quote
38 from translate
.misc
import wStringIO
# Characters that survive filename normalization unchanged.
normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Translation table for byte strings: one entry per byte value (0-255).
# Bytes listed in normalfilenamechars map to themselves, every other byte
# maps to "_".
normalizetable = ""
for i in map(chr, range(256)):
    if i in normalfilenamechars:
        normalizetable += i
    else:
        normalizetable += "_"
class unormalizechar(dict):
    """Mapping from character ordinals to replacement characters.

    Ordinals of the characters given at construction map to the character
    itself; every other key maps to u"_".
    """

    def __init__(self, normalchars):
        """Remember which characters are to be kept unchanged.

        normalchars is an iterable of single characters.
        """
        # The lookup table must exist before the loop below fills it.
        self.normalchars = {}
        for char in normalchars:
            self.normalchars[ord(char)] = char

    def __getitem__(self, key):
        """Return the character for ordinal key, or u"_" if it is unknown."""
        return self.normalchars.get(key, u"_")
# Unicode counterpart of the byte-string table: maps the ordinals of the
# normal filename characters to themselves and everything else to u"_".
unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))
def normalizefilename(filename):
    """Replace every character outside the normal filename set with "_".

    Byte strings are translated with normalizetable, anything else with
    unormalizetable.
    """
    if isinstance(filename, str):
        table = normalizetable
    else:
        table = unormalizetable
    return filename.translate(table)
68 # These are functions that deal with escaping and unescaping of the text fields
69 # of the SDF file. These should only be applied to the text column.
70 # The fields quickhelptext and title are assumed to carry no escaping.
72 # The escaping of all strings except those coming from .xhp (helpcontent2)
73 # sourcefiles works as follows:
75 # (carriage return) -> \r
77 # Backslash characters (\) and single quotes (') are not consistently escaped,
78 # and are therefore left as they are.
80 # For strings coming from .xhp (helpcontent2) sourcefiles the following
81 # characters are escaped inside XML tags only:
82 # < -> \< when used with lowercase tagnames (with some exceptions)
83 # > -> \> when used with lowercase tagnames (with some exceptions)
84 # " -> \" around XML properties
85 # The following is consistently escaped in .xhp strings (not only in XML tags):
def escape_text(text):
    """Escape newline, tab and carriage return for the SDF text column.

    Each of the three control characters is replaced by a backslash followed
    by the letter n, t or r respectively; everything else is left alone.
    """
    for raw, escaped in (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        text = text.replace(raw, escaped)
    return text
def unescape_text(text):
    """Unescapes SDF text to be suitable for unit consumption.

    The two-character sequences backslash-n, backslash-t and backslash-r
    become a real newline, tab and carriage return.  A doubled backslash is
    left exactly as it is (still doubled) and shields the character after it
    from being read as an escape.

    The text is scanned once from left to right, so no placeholder character
    is needed and BEL characters already present in the text are preserved.
    """
    import re  # function-scope import keeps this block self-contained

    replacements = {"\\\\": "\\\\", "\\n": "\n", "\\t": "\t", "\\r": "\r"}

    def decode(match):
        return replacements[match.group(0)]

    # A backslash followed by a backslash, n, t or r.
    return re.sub(r"\\[\\ntr]", decode, text)
# Matches an opening, closing or self-closing tag whose name is lowercase,
# optionally carrying a single lowercase attribute with a quoted value.
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')

def escape_help_text(text):
    """Escapes the help text as it would be in an SDF file.

    Every backslash is doubled first.  Then, for each tag matched by
    helptagre, the angle brackets and any double quotes inside the tag are
    prefixed with a backslash.  A fixed list of tags, and any tag starting
    with <font, <node or <help_section, is left unescaped.

    Returns the escaped text.
    """
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        # Tags are escaped unless one of the exceptions below applies.
        escapethistag = True
        if tag in ["<br>", "<h1>", "</h1>", "<img ...>", "<->", "<empty>", "<ref>", "<references>"]:
            escapethistag = False
        for skip in ["<font", "<node", "<help_section"]:
            if tag.startswith(skip):
                escapethistag = False
        if escapethistag:
            escaped_tag = ("\\<" + tag[1:-1] + "\\>").replace('"', '\\"')
            text = text.replace(tag, escaped_tag)
    return text
def unescape_help_text(text):
    """Remove the backslash escaping used for help text.

    A backslash in front of <, > or a double quote is dropped, and after
    that every doubled backslash is collapsed into a single one.
    """
    for escaped, plain in ((r"\<", "<"), (r"\>", ">"), (r'\"', '"'), (r"\\", "\\")):
        text = text.replace(escaped, plain)
    return text
def encode_if_needed_utf8(text):
    """Encode a Unicode string to UTF-8; return anything else unchanged."""
    if isinstance(text, unicode):
        return text.encode('UTF-8')
    # Non-unicode values are passed through so that the __str__ callers
    # still get their text back instead of None.
    return text
class ooline(object):
    """this represents one line, one translation in an .oo file"""

    def __init__(self, parts=None):
        """construct an ooline from its parts

        Without parts every one of the 15 columns starts out as an empty
        string; otherwise the columns are taken from parts via setparts().
        """
        if parts is None:
            self.project, self.sourcefile, self.dummy, self.resourcetype, \
                self.groupid, self.localid, self.helpid, self.platform, \
                self.width, self.languageid, self.text, self.helptext, \
                self.quickhelptext, self.title, self.timestamp = [""] * 15
        else:
            self.setparts(parts)

    def setparts(self, parts):
        """create a line from its tab-delimited parts

        A sequence that does not have exactly 15 items triggers a warning and
        is padded with empty strings or truncated to 15 items.
        """
        if len(parts) != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
                          (len(parts), parts))
            newparts = list(parts)
            if len(newparts) < 15:
                newparts = newparts + [""] * (15-len(newparts))
            else:
                newparts = newparts[:15]
            parts = tuple(newparts)
        # The text column is stored raw (still escaped) in _text; the text
        # property does the escaping and unescaping.
        self.project, self.sourcefile, self.dummy, self.resourcetype, \
            self.groupid, self.localid, self.helpid, self.platform, \
            self.width, self.languageid, self._text, self.helptext, \
            self.quickhelptext, self.title, self.timestamp = parts

    def getparts(self):
        """return a list of parts in this line"""
        return (self.project, self.sourcefile, self.dummy, self.resourcetype,
                self.groupid, self.localid, self.helpid, self.platform,
                self.width, self.languageid, self._text, self.helptext,
                self.quickhelptext, self.title, self.timestamp)

    def gettext(self):
        """Obtains the text column and handle escaping."""
        if self.sourcefile.endswith(".xhp"):
            return unescape_help_text(self._text)
        else:
            return unescape_text(self._text)

    def settext(self, text):
        """Sets the text column and handle escaping."""
        if self.sourcefile.endswith(".xhp"):
            self._text = escape_help_text(text)
        else:
            # Without this branch the help-escaped value above would be
            # overwritten immediately.
            self._text = escape_text(text)
    text = property(gettext, settext)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """return a line in tab-delimited form"""
        parts = self.getparts()
        return "\t".join(parts)

    def getkey(self):
        """get the key that identifies the resource"""
        return (self.project, self.sourcefile, self.resourcetype, self.groupid,
                self.localid, self.platform)
195 """this represents a number of translations of a resource"""
197 """construct the oounit"""
def addline(self, line):
    """Register line under its language id and append it to the line list."""
    language = line.languageid
    self.languages[language] = line
    self.lines.append(line)
207 """convert to a string. double check that unicode is handled"""
208 return encode_if_needed_utf8(self
.getoutput())
211 """return the lines in tab-delimited form"""
212 return "\r\n".join([str(line
) for line
in self
.lines
])
215 """this represents an entire .oo file"""
def __init__(self, input=None):
    """constructs the oofile

    Creates the empty containers used by addline() and parse(); if input is
    given it is parsed straight away.
    """
    self.oolines = []    # every parsed line, in input order
    self.units = []      # one unit per distinct key, in order of appearance
    self.ookeys = {}     # key -> unit
    self.filename = ""
    self.languages = []  # language ids in order of first appearance
    if input is not None:
        self.parse(input)
def addline(self, thisline):
    """adds a parsed line to the file

    Lines sharing the same key are collected in one unit; a new unit is
    created only the first time a key is seen.
    """
    key = thisline.getkey()
    element = self.ookeys.get(key, None)
    if element is None:
        element = self.UnitClass()
        self.units.append(element)
        self.ookeys[key] = element
    element.addline(thisline)
    self.oolines.append(thisline)
    if thisline.languageid not in self.languages:
        self.languages.append(thisline.languageid)
def parse(self, input):
    """parses lines and adds them to the file

    input is either a string holding the file contents or an object with a
    read() method; such an object is read completely and then closed.
    """
    if not self.filename:
        self.filename = getattr(input, 'name', '')
    if hasattr(input, "read"):
        src = input.read()
        input.close()
    else:
        src = input
    for line in src.split("\n"):
        line = quote.rstripeol(line)
        # Skip blank lines (e.g. the empty string after a trailing newline)
        # so that they do not become bogus one-column entries.
        if not line:
            continue
        parts = line.split("\t")
        thisline = ooline(parts)
        self.addline(thisline)
258 """convert to a string. double check that unicode is handled"""
259 return encode_if_needed_utf8(self
.getoutput())
262 """converts all the lines back to tab-delimited form"""
264 for oe
in self
.units
:
265 if len(oe
.lines
) > 2:
266 warnings
.warn("contains %d lines (should be 2 at most): languages %r" % (len(oe
.lines
), oe
.languages
))
267 oekeys
= [line
.getkey() for line
in oe
.lines
]
268 warnings
.warn("contains %d lines (should be 2 at most): keys %r" % (len(oe
.lines
), oekeys
))
269 oeline
= str(oe
) + "\r\n"
271 return "".join(lines
)
274 """this takes a huge GSI file and represents it as multiple smaller files..."""
def __init__(self, filename, mode=None, multifilestyle="single"):
    """initialises oomultifile from a seekable inputfile or writable outputfile

    When mode is not given, an existing file is opened for reading and a
    missing one for writing.  Files opened for reading are indexed at once.
    """
    self.filename = filename
    if mode is None:
        if os.path.exists(filename):
            mode = 'r'
        else:
            mode = 'w'
    self.mode = mode
    self.multifilestyle = multifilestyle
    self.multifilename = os.path.splitext(filename)[0]
    self.multifile = open(filename, mode)
    self.subfilelines = {}
    # Only a readable file has lines that can be indexed.
    if mode.startswith("r"):
        self.createsubfileindex()
def createsubfileindex(self):
    """reads in all the lines and works out the subfiles

    Fills self.subfilelines with, for each subfile name, the zero-based
    numbers of the lines in self.multifile that belong to it.
    """
    for linenum, line in enumerate(self.multifile):
        subfile = self.getsubfilename(line)
        if not subfile in self.subfilelines:
            self.subfilelines[subfile] = []
        self.subfilelines[subfile].append(linenum)
def getsubfilename(self, line):
    """looks up the subfile name for the line

    The first two tab-separated columns of line are the module and the
    source filename.  Depending on self.multifilestyle the result is based on
    the multifile's own name ("onefile"), on the module ("toplevel"), or on
    the module plus the directory part of the source filename (any other
    style).  The "oo" extension is appended in every case.

    Raises ValueError if line has fewer than two tab characters.
    """
    if line.count("\t") < 2:
        raise ValueError("invalid tab-delimited line: %r" % line)
    lineparts = line.split("\t", 2)
    module, filename = lineparts[0], lineparts[1]
    if self.multifilestyle == "onefile":
        ooname = self.multifilename
    elif self.multifilestyle == "toplevel":
        ooname = module
    else:
        # Source filenames may use backslashes as directory separators.
        filename = filename.replace("\\", "/")
        fileparts = [module] + filename.split("/")
        ooname = os.path.join(*fileparts[:-1])
    return ooname + os.extsep + "oo"
def listsubfiles(self):
    """Return the names of all subfiles recorded in the subfile index."""
    index = self.subfilelines
    return index.keys()
322 """iterates through the subfile names"""
323 for subfile
in self
.listsubfiles():
def __contains__(self, pathname):
    """Tell whether pathname is one of the subfiles in the subfile index."""
    known_subfiles = self.subfilelines
    return pathname in known_subfiles
def getsubfilesrc(self, subfile):
    """returns the list of lines matching the subfile

    The multifile is rewound and read from the start; the lines whose
    zero-based number is recorded for subfile are returned joined into a
    single string.
    """
    # A dict gives constant-time membership tests on the line numbers.
    requiredlines = dict.fromkeys(self.subfilelines[subfile])
    lines = []
    self.multifile.seek(0)
    for linenum, line in enumerate(self.multifile):
        if linenum in requiredlines:
            lines.append(line)
    return "".join(lines)
def openinputfile(self, subfile):
    """returns a pseudo-file object for the given subfile

    The object is a wStringIO.StringIO over the subfile's lines, with its
    filename attribute set to subfile.
    """
    subfilesrc = self.getsubfilesrc(subfile)
    inputfile = wStringIO.StringIO(subfilesrc)
    inputfile.filename = subfile
    return inputfile
def openoutputfile(self, subfile):
    """returns a pseudo-file object for the given subfile

    The object is a wStringIO.CatchStringOutput whose callback writes the
    collected contents to the multifile and flushes it.
    """
    def onclose(contents):
        self.multifile.write(contents)
        self.multifile.flush()
    outputfile = wStringIO.CatchStringOutput(onclose)
    outputfile.filename = subfile
    return outputfile
def getoofile(self, subfile):
    """returns an oofile built up from the given subfile's lines"""
    subfilesrc = self.getsubfilesrc(subfile)
    oosubfile = oofile()
    # Set the name before parsing so that parse() does not overwrite it.
    oosubfile.filename = subfile
    oosubfile.parse(subfilesrc)
    return oosubfile
if __name__ == '__main__':
    # Round trip: parse the file given on standard input and write it back
    # out on standard output.
    of = oofile()
    of.parse(sys.stdin.read())
    sys.stdout.write(str(of))