storage/oo.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2008 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """
  23 Classes that hold units of .oo files (oounit) or entire files (oofile).
  24
  25 These are specific .oo files for localisation exported by OpenOffice.org - SDF
  26 format (previously known as GSI files). For an overview of the format, see
  27 http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html
  28
  29 The behaviour in terms of escaping is explained in detail in the programming
  30 comments.
  31 """
  32 # FIXME: add simple test which reads in a file and writes it out again
  33
  34 import os
  35 import re
  36 import sys
  37 from translate.misc import quote
  38 from translate.misc import wStringIO
  39 import warnings
  40
  41 # File normalisation
  42
  43 normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  44 normalizetable = ""
  45 for i in map(chr, range(256)):
  46     if i in normalfilenamechars:
  47         normalizetable += i
  48     else:
  49         normalizetable += "_"
  50
  51 class unormalizechar(dict):
  52     def __init__(self, normalchars):
  53         self.normalchars = {}
  54         for char in normalchars:
  55             self.normalchars[ord(char)] = char
  56     def __getitem__(self, key):
  57         return self.normalchars.get(key, u"_")
  58
  59 unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))
  60
  61 def normalizefilename(filename):
  62     """converts any non-alphanumeric (standard roman) characters to _"""
  63     if isinstance(filename, str):
  64         return filename.translate(normalizetable)
  65     else:
  66         return filename.translate(unormalizetable)
  67
  68 # These are functions that deal with escaping and unescaping of the text fields
  69 # of the SDF file. These should only be applied to the text column.
  70 # The fields quickhelptext and title are assumed to carry no escaping.
  71 #
  72 # The escaping of all strings except those coming from .xhp (helpcontent2)
  73 # sourcefiles works as follows:
  74 #   (newline)         ->  \n
  75 #   (carriage return) ->  \r
  76 #   (tab)             ->  \t
  77 # Backslash characters (\) and single quotes (') are not consistently escaped,
  78 # and are therefore left as they are.
  79 #
  80 # For strings coming from .xhp (helpcontent2) sourcefiles the following
  81 # characters are escaped inside XML tags only:
  82 #   <  ->  \<  when used with lowercase tagnames (with some exceptions)
  83 #   >  ->  \>  when used with lowercase tagnames (with some exceptions)
  84 #   "  ->  \"  around XML properties
  85 # The following is consistently escaped in .xhp strings (not only in XML tags):
  86 #   \  ->  \\
  87
  88 def escape_text(text):
  89     """Escapes SDF text to be suitable for unit consumption."""
  90     return text.replace("\n", "\\n").replace("\t", "\\t").replace("\r", "\\r")
  91
  92 def unescape_text(text):
  93     """Unescapes SDF text to be suitable for unit consumption."""
  94     return text.replace("\\\\", "\a").replace("\\n", "\n").replace("\\t", "\t").\
  95            replace("\\r", "\r").replace("\a", "\\\\")
  96
  97 helptagre = re.compile('''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')
  98
  99 def escape_help_text(text):
 100     """Escapes the help text as it would be in an SDF file.
 101
 102     <, >, " are only escaped in <[[:lower:]]> tags. Some HTML tags make it in in
 103     lowercase so those are dealt with. Some OpenOffice.org help tags are not
 104     escaped.
 105     """
 106     text = text.replace("\\", "\\\\")
 107     for tag in helptagre.findall(text):
 108         escapethistag = True
 109         if tag in ["<br>", "<h1>", "</h1>", "<img ...>", "<->", "<empty>", "<ref>", "<references>"]:
 110             escapethistag = False
 111         for skip in ["<font", "<node", "<help_section"]:
 112             if tag.startswith(skip):
 113                 escapethistag = False
 114         if escapethistag:
 115             escaped_tag = ("\\<" + tag[1:-1] + "\\>").replace('"', '\\"')
 116             text = text.replace(tag, escaped_tag)
 117     return text
 118
 119 def unescape_help_text(text):
 120     """Unescapes normal text to be suitable for writing to the SDF file."""
 121     return text.replace(r"\<", "<").replace(r"\>", ">").replace(r'\"', '"').replace(r"\\", "\\")
 122
 123 def encode_if_needed_utf8(text):
 124     """Encode a Unicode string the the specified encoding"""
 125     if isinstance(text, unicode):
 126         return text.encode('UTF-8')
 127     return text
 128
 129
 130 class ooline(object):
 131     """this represents one line, one translation in an .oo file"""
 132     def __init__(self, parts=None):
 133         """construct an ooline from its parts"""
 134         if parts is None:
 135             self.project, self.sourcefile, self.dummy, self.resourcetype, \
 136                 self.groupid, self.localid, self.helpid, self.platform, \
 137                 self.width, self.languageid, self.text, self.helptext, \
 138                 self.quickhelptext, self.title, self.timestamp = [""] * 15
 139         else:
 140             self.setparts(parts)
 141
 142     def setparts(self, parts):
 143         """create a line from its tab-delimited parts"""
 144         if len(parts) != 15:
 145             warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
 146                     (len(parts), parts))
 147             newparts = list(parts)
 148             if len(newparts) < 15:
 149                 newparts = newparts + [""] * (15-len(newparts))
 150             else:
 151                 newparts = newparts[:15]
 152             parts = tuple(newparts)
 153         self.project, self.sourcefile, self.dummy, self.resourcetype, \
 154             self.groupid, self.localid, self.helpid, self.platform, \
 155             self.width, self.languageid, self._text, self.helptext, \
 156             self.quickhelptext, self.title, self.timestamp = parts
 157
 158     def getparts(self):
 159         """return a list of parts in this line"""
 160         return (self.project, self.sourcefile, self.dummy, self.resourcetype,
 161                 self.groupid, self.localid, self.helpid, self.platform,
 162                 self.width, self.languageid, self._text, self.helptext,
 163                 self.quickhelptext, self.title, self.timestamp)
 164
 165     def gettext(self):
 166         """Obtains the text column and handle escaping."""
 167         if self.sourcefile.endswith(".xhp"):
 168             return unescape_help_text(self._text)
 169         else:
 170             return unescape_text(self._text)
 171
 172     def settext(self, text):
 173         """Sets the text column and handle escaping."""
 174         if self.sourcefile.endswith(".xhp"):
 175             self._text = escape_help_text(text)
 176         else:
 177             self._text = escape_text(text)
 178     text = property(gettext, settext)
 179
 180     def __str__(self):
 181         """convert to a string. double check that unicode is handled"""
 182         return encode_if_needed_utf8(self.getoutput())
 183
 184     def getoutput(self):
 185         """return a line in tab-delimited form"""
 186         parts = self.getparts()
 187         return "\t".join(parts)
 188
 189     def getkey(self):
 190         """get the key that identifies the resource"""
 191         return (self.project, self.sourcefile, self.resourcetype, self.groupid,
 192                 self.localid, self.platform)
 193
 194 class oounit:
 195     """this represents a number of translations of a resource"""
 196     def __init__(self):
 197         """construct the oounit"""
 198         self.languages = {}
 199         self.lines = []
 200
 201     def addline(self, line):
 202         """add a line to the oounit"""
 203         self.languages[line.languageid] = line
 204         self.lines.append(line)
 205
 206     def __str__(self):
 207         """convert to a string. double check that unicode is handled"""
 208         return encode_if_needed_utf8(self.getoutput())
 209
 210     def getoutput(self):
 211         """return the lines in tab-delimited form"""
 212         return "\r\n".join([str(line) for line in self.lines])
 213
 214 class oofile:
 215     """this represents an entire .oo file"""
 216     UnitClass = oounit
 217     def __init__(self, input=None):
 218         """constructs the oofile"""
 219         self.oolines = []
 220         self.units = []
 221         self.ookeys = {}
 222         self.filename = ""
 223         self.languages = []
 224         if input is not None:
 225             self.parse(input)
 226
 227     def addline(self, thisline):
 228         """adds a parsed line to the file"""
 229         key = thisline.getkey()
 230         element = self.ookeys.get(key, None)
 231         if element is None:
 232             element = self.UnitClass()
 233             self.units.append(element)
 234             self.ookeys[key] = element
 235         element.addline(thisline)
 236         self.oolines.append(thisline)
 237         if thisline.languageid not in self.languages:
 238             self.languages.append(thisline.languageid)
 239
 240     def parse(self, input):
 241         """parses lines and adds them to the file"""
 242         if not self.filename:
 243             self.filename = getattr(input, 'name', '')
 244         if hasattr(input, "read"):
 245             src = input.read()
 246             input.close()
 247         else:
 248             src = input
 249         for line in src.split("\n"):
 250             line = quote.rstripeol(line)
 251             if not line:
 252                 continue
 253             parts = line.split("\t")
 254             thisline = ooline(parts)
 255             self.addline(thisline)
 256
 257     def __str__(self):
 258         """convert to a string. double check that unicode is handled"""
 259         return encode_if_needed_utf8(self.getoutput())
 260
 261     def getoutput(self):
 262         """converts all the lines back to tab-delimited form"""
 263         lines = []
 264         for oe in self.units:
 265             if len(oe.lines) > 2:
 266                 warnings.warn("contains %d lines (should be 2 at most): languages %r" % (len(oe.lines), oe.languages))
 267                 oekeys = [line.getkey() for line in oe.lines]
 268                 warnings.warn("contains %d lines (should be 2 at most): keys %r" % (len(oe.lines), oekeys))
 269             oeline = str(oe) + "\r\n"
 270             lines.append(oeline)
 271         return "".join(lines)
 272
 273 class oomultifile:
 274     """this takes a huge GSI file and represents it as multiple smaller files..."""
 275     def __init__(self, filename, mode=None, multifilestyle="single"):
 276         """initialises oomultifile from a seekable inputfile or writable outputfile"""
 277         self.filename = filename
 278         if mode is None:
 279             if os.path.exists(filename):
 280                 mode = 'r'
 281             else:
 282                 mode = 'w'
 283         self.mode = mode
 284         self.multifilestyle = multifilestyle
 285         self.multifilename = os.path.splitext(filename)[0]
 286         self.multifile = open(filename, mode)
 287         self.subfilelines = {}
 288         if mode == "r":
 289             self.createsubfileindex()
 290
 291     def createsubfileindex(self):
 292         """reads in all the lines and works out the subfiles"""
 293         linenum = 0
 294         for line in self.multifile:
 295             subfile = self.getsubfilename(line)
 296             if not subfile in self.subfilelines:
 297                 self.subfilelines[subfile] = []
 298             self.subfilelines[subfile].append(linenum)
 299             linenum += 1
 300
 301     def getsubfilename(self, line):
 302         """looks up the subfile name for the line"""
 303         if line.count("\t") < 2:
 304             raise ValueError("invalid tab-delimited line: %r" % line)
 305         lineparts = line.split("\t", 2)
 306         module, filename = lineparts[0], lineparts[1]
 307         if self.multifilestyle == "onefile":
 308             ooname = self.multifilename
 309         elif self.multifilestyle == "toplevel":
 310             ooname = module
 311         else:
 312             filename = filename.replace("\\", "/")
 313             fileparts = [module] + filename.split("/")
 314             ooname = os.path.join(*fileparts[:-1])
 315         return ooname + os.extsep + "oo"
 316
 317     def listsubfiles(self):
 318         """returns a list of subfiles in the file"""
 319         return self.subfilelines.keys()
 320
 321     def __iter__(self):
 322         """iterates through the subfile names"""
 323         for subfile in self.listsubfiles():
 324             yield subfile
 325
 326     def __contains__(self, pathname):
 327         """checks if this pathname is a valid subfile"""
 328         return pathname in self.subfilelines
 329
 330     def getsubfilesrc(self, subfile):
 331         """returns the list of lines matching the subfile"""
 332         lines = []
 333         requiredlines = dict.fromkeys(self.subfilelines[subfile])
 334         linenum = 0
 335         self.multifile.seek(0)
 336         for line in self.multifile:
 337             if linenum in requiredlines:
 338                 lines.append(line)
 339             linenum += 1
 340         return "".join(lines)
 341
 342     def openinputfile(self, subfile):
 343         """returns a pseudo-file object for the given subfile"""
 344         subfilesrc = self.getsubfilesrc(subfile)
 345         inputfile = wStringIO.StringIO(subfilesrc)
 346         inputfile.filename = subfile
 347         return inputfile
 348
 349     def openoutputfile(self, subfile):
 350         """returns a pseudo-file object for the given subfile"""
 351         def onclose(contents):
 352             self.multifile.write(contents)
 353             self.multifile.flush()
 354         outputfile = wStringIO.CatchStringOutput(onclose)
 355         outputfile.filename = subfile
 356         return outputfile
 357
 358     def getoofile(self, subfile):
 359         """returns an oofile built up from the given subfile's lines"""
 360         subfilesrc = self.getsubfilesrc(subfile)
 361         oosubfile = oofile()
 362         oosubfile.filename = subfile
 363         oosubfile.parse(subfilesrc)
 364         return oosubfile
 365
 366 if __name__ == '__main__':
 367     of = oofile()
 368     of.parse(sys.stdin.read())
 369     sys.stdout.write(str(of))