storage/lisa.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2006-2007 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21 #
  22
  23 """Parent class for LISA standards (TMX, TBX, XLIFF)"""
  24
  25 import re
  26
  27 from translate.storage import base
  28 from translate.lang import data
  29 try:
  30     from lxml import etree
  31 except ImportError, e:
  32     raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.")
  33
  34 def getText(node):
  35     """joins together the text from all the text nodes in the nodelist and their children"""
  36     # node.xpath is very slow, so we only use it if there are children
  37     # TODO: consider rewriting by iterating over children
  38     if node:    # The etree way of testing for children
  39         return node.xpath("string()") # specific to lxml.etree
  40     else:
  41         return data.forceunicode(node.text) or u""
  42         # if node.text is none, we want to return "" since the tag is there
  43
  44 def _findAllMatches(text, re_obj):
  45     """generate match objects for all @re_obj matches in @text."""
  46     start = 0
  47     max = len(text)
  48     while start < max:
  49         m = re_obj.search(text, start)
  50         if not m: break
  51         yield m
  52         start = m.end()
  53
  54 placeholders = ['(%[diouxXeEfFgGcrs])', r'(\\+.?)', '(%[0-9]$lx)', '(%[0-9]\$[a-z])', '(<.+?>)']
  55 re_placeholders = [re.compile(ph) for ph in placeholders]
  56 def _getPhMatches(text):
  57     'return list of regexp matchobjects for with all place holders in the @text'
  58     matches = []
  59     for re_ph in re_placeholders:
  60         matches.extend(list(_findAllMatches(text, re_ph)))
  61
  62     # sort them so they come sequentially
  63     matches.sort(lambda a, b: cmp(a.start(), b.start()))
  64     return matches
  65
  66 XML_NS = 'http://www.w3.org/XML/1998/namespace'
  67
  68 def setXMLlang(node, lang):
  69     """Sets the xml:lang attribute on node"""
  70     node.set("{%s}lang" % XML_NS, lang)
  71
  72 def setXMLspace(node, value):
  73     """Sets the xml:space attribute on node"""
  74     node.set("{%s}space" % XML_NS, value)
  75
  76 def namespaced(namespace, name):
  77     """Returns name in Clark notation within the given namespace.
  78
  79     For example namespaced("source") in an XLIFF document might return
  80         {urn:oasis:names:tc:xliff:document:1.1}source
  81     This is needed throughout lxml.
  82     """
  83     if namespace:
  84         return "{%s}%s" % (namespace, name)
  85     else:
  86         return name
  87
  88 class LISAunit(base.TranslationUnit):
  89     """A single unit in the file.
  90 Provisional work is done to make several languages possible."""
  91
  92     #The name of the root element of this unit type:(termEntry, tu, trans-unit)
  93     rootNode = ""
  94     #The name of the per language element of this unit type:(termEntry, tu, trans-unit)
  95     languageNode = ""
  96     #The name of the innermost element of this unit type:(term, seg)
  97     textNode = ""
  98
  99     namespace = None
 100
 101     def __init__(self, source, empty=False):
 102         """Constructs a unit containing the given source string"""
 103         if empty:
 104             return
 105         self.xmlelement = etree.Element(self.rootNode)
 106         #add descrip, note, etc.
 107
 108         super(LISAunit, self).__init__(source)
 109
 110     def __eq__(self, other):
 111         """Compares two units"""
 112         languageNodes = self.getlanguageNodes()
 113         otherlanguageNodes = other.getlanguageNodes()
 114         if len(languageNodes) != len(otherlanguageNodes):
 115             return False
 116         for i in range(len(languageNodes)):
 117             mytext = self.getNodeText(languageNodes[i])
 118             othertext = other.getNodeText(otherlanguageNodes[i])
 119             if mytext != othertext:
 120                 #TODO:^ maybe we want to take children and notes into account
 121                 return False
 122         return True
 123
 124     def namespaced(self, name):
 125         """Returns name in Clark notation.
 126
 127         For example namespaced("source") in an XLIFF document might return
 128             {urn:oasis:names:tc:xliff:document:1.1}source
 129         This is needed throughout lxml.
 130         """
 131         return namespaced(self.namespace, name)
 132
 133     def setsource(self, source, sourcelang='en'):
 134         source = data.forceunicode(source)
 135         languageNodes = self.getlanguageNodes()
 136         sourcelanguageNode = self.createlanguageNode(sourcelang, source, "source")
 137         if len(languageNodes) > 0:
 138             self.xmlelement[0] = sourcelanguageNode
 139         else:
 140             self.xmlelement.append(sourcelanguageNode)
 141
 142     def getsource(self):
 143         return self.getNodeText(self.getlanguageNode(lang=None, index=0))
 144     source = property(getsource, setsource)
 145
 146     def settarget(self, text, lang='xx', append=False):
 147         #XXX: we really need the language - can't really be optional
 148         """Sets the "target" string (second language), or alternatively appends to the list"""
 149         text = data.forceunicode(text)
 150         #Firstly deal with reinitialising to None or setting to identical string
 151         if self.gettarget() == text:
 152             return
 153         languageNodes = self.getlanguageNodes()
 154         assert len(languageNodes) > 0
 155         if not text is None:
 156             languageNode = self.createlanguageNode(lang, text, "target")
 157             if append or len(languageNodes) == 1:
 158                 self.xmlelement.append(languageNode)
 159             else:
 160                 self.xmlelement.insert(1, languageNode)
 161         if not append and len(languageNodes) > 1:
 162             self.xmlelement.remove(languageNodes[1])
 163
 164     def gettarget(self, lang=None):
 165         """retrieves the "target" text (second entry), or the entry in the
 166         specified language, if it exists"""
 167         if lang:
 168             node = self.getlanguageNode(lang=lang)
 169         else:
 170             node = self.getlanguageNode(lang=None, index=1)
 171         return self.getNodeText(node)
 172     target = property(gettarget, settarget)
 173
 174     def createlanguageNode(self, lang, text, purpose=None):
 175         """Returns a xml Element setup with given parameters to represent a
 176         single language entry. Has to be overridden."""
 177         return None
 178
 179     def createPHnodes(self, parent, text):
 180         """Create the text node in parent containing all the ph tags"""
 181         matches = _getPhMatches(text)
 182         if not matches:
 183             parent.text = text
 184             return
 185
 186         # Now we know there will definitely be some ph tags
 187         start = matches[0].start()
 188         pretext = text[:start]
 189         if pretext:
 190             parent.text = pretext
 191         lasttag = parent
 192         for i, m in enumerate(matches):
 193             #pretext
 194             pretext = text[start:m.start()]
 195             # this will never happen with the first ph tag
 196             if pretext:
 197                 lasttag.tail = pretext
 198             #ph node
 199             phnode = etree.SubElement(parent, "ph")
 200             phnode.set("id", str(i+1))
 201             phnode.text = m.group()
 202             lasttag = phnode
 203             start = m.end()
 204         #post text
 205         if text[start:]:
 206             lasttag.tail = text[start:]
 207
 208     def getlanguageNodes(self):
 209         """Returns a list of all nodes that contain per language information."""
 210         return self.xmlelement.findall(self.namespaced(self.languageNode))
 211
 212     def getlanguageNode(self, lang=None, index=None):
 213         """Retrieves a languageNode either by language or by index"""
 214         if lang is None and index is None:
 215             raise KeyError("No criterea for languageNode given")
 216         languageNodes = self.getlanguageNodes()
 217         if lang:
 218             for set in languageNodes:
 219                 if set.get("{%s}lang" % XML_NS) == lang:
 220                     return set
 221         else:#have to use index
 222             if index >= len(languageNodes):
 223                 return None
 224             else:
 225                 return languageNodes[index]
 226         return None
 227
 228     def getNodeText(self, languageNode):
 229         """Retrieves the term from the given languageNode"""
 230         if languageNode is None:
 231             return None
 232         if self.textNode:
 233             terms = languageNode.findall('.//%s' % self.namespaced(self.textNode))
 234             if len(terms) == 0:
 235                 return None
 236             return getText(terms[0])
 237         else:
 238             return getText(languageNode)
 239
 240     def __str__(self):
 241         return etree.tostring(self.xmlelement, pretty_print=True, encoding='utf-8')
 242
 243     def createfromxmlElement(cls, element):
 244         term = cls(None, empty=True)
 245         term.xmlelement = element
 246         return term
 247     createfromxmlElement = classmethod(createfromxmlElement)
 248
 249 class LISAfile(base.TranslationStore):
 250     """A class representing a file store for one of the LISA file formats."""
 251     UnitClass = LISAunit
 252     #The root node of the XML document:
 253     rootNode = ""
 254     #The root node of the content section:
 255     bodyNode = ""
 256     #The XML skeleton to use for empty construction:
 257     XMLskeleton = ""
 258
 259     namespace = None
 260
 261     def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
 262         super(LISAfile, self).__init__(unitclass=unitclass)
 263         self.setsourcelanguage(sourcelanguage)
 264         self.settargetlanguage(targetlanguage)
 265         if inputfile is not None:
 266             self.parse(inputfile)
 267             assert self.document.getroot().tag == self.namespaced(self.rootNode)
 268         else:
 269             # We strip out newlines to ensure that spaces in the skeleton doesn't
 270             # interfere with the the pretty printing of lxml
 271             self.parse(self.XMLskeleton.replace("\n", ""))
 272             self.addheader()
 273
 274     def addheader(self):
 275         """Method to be overridden to initialise headers, etc."""
 276         pass
 277
 278     def namespaced(self, name):
 279         """Returns name in Clark notation.
 280
 281         For example namespaced("source") in an XLIFF document might return
 282             {urn:oasis:names:tc:xliff:document:1.1}source
 283         This is needed throughout lxml.
 284         """
 285         return namespaced(self.namespace, name)
 286
 287     def initbody(self):
 288         """Initialises self.body so it never needs to be retrieved from the XML again."""
 289         self.namespace = self.document.getroot().nsmap.get(None, None)
 290         self.body = self.document.find('//%s' % self.namespaced(self.bodyNode))
 291
 292     def setsourcelanguage(self, sourcelanguage):
 293         """Sets the source language for this store"""
 294         self.sourcelanguage = sourcelanguage
 295
 296     def settargetlanguage(self, targetlanguage):
 297         """Sets the target language for this store"""
 298         self.targetlanguage = targetlanguage
 299
 300     def addsourceunit(self, source):
 301         #TODO: miskien moet hierdie eerder addsourcestring of iets genoem word?
 302         """Adds and returns a new unit with the given string as first entry."""
 303         newunit = self.UnitClass(source)
 304         self.addunit(newunit)
 305         return newunit
 306
 307     def addunit(self, unit):
 308         unit.namespace = self.namespace
 309         self.body.append(unit.xmlelement)
 310         self.units.append(unit)
 311
 312     def __str__(self):
 313         """Converts to a string containing the file's XML"""
 314         return etree.tostring(self.document, pretty_print=True, xml_declaration=True, encoding='utf-8')
 315
 316     def parse(self, xml):
 317         """Populates this object from the given xml string"""
 318         if not hasattr(self, 'filename'):
 319             self.filename = getattr(xml, 'name', '')
 320         if hasattr(xml, "read"):
 321             xml.seek(0)
 322             posrc = xml.read()
 323             xml = posrc
 324         self.document = etree.fromstring(xml).getroottree()
 325         self.encoding = self.document.docinfo.encoding
 326         self.initbody()
 327         assert self.document.getroot().tag == self.namespaced(self.rootNode)
 328         termEntries = self.body.findall('.//%s' % self.namespaced(self.UnitClass.rootNode))
 329         if termEntries is None:
 330             return
 331         for entry in termEntries:
 332             term = self.UnitClass.createfromxmlElement(entry)
 333             term.namespace = self.namespace
 334             self.units.append(term)
 335