fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / lisa.py
bloba23044d6d6e78edfe6ca3258daf69f8d6d8520d9
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright 2006-2007 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 """Parent class for LISA standards (TMX, TBX, XLIFF)"""
25 import re
27 from translate.storage import base
28 from translate.lang import data
29 try:
30 from lxml import etree
31 except ImportError, e:
32 raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.")
def getText(node):
    """Return the text of node, joined with the text of all its descendants.

    For a node with child elements the XPath string() value of the node is
    returned. For a node without children its own text is returned, or an
    empty string when the node has no text.
    """
    # node.xpath is very slow, so we only use it if there are children
    # TODO: consider rewriting by iterating over children
    if len(node):
        # len() counts the child elements. Relying on the truth value of an
        # element for this test is deprecated in lxml, so ask explicitly.
        return node.xpath("string()")  # specific to lxml.etree
    # if node.text is None, we want to return "" since the tag is there
    return data.forceunicode(node.text) or u""
def _findAllMatches(text, re_obj):
    """Generate match objects for all @re_obj matches in @text.

    The text is scanned from left to right; every search starts where the
    previous match ended, so the yielded matches never overlap.
    """
    start = 0
    end = len(text)
    while start < end:
        m = re_obj.search(text, start)
        if not m:
            break
        yield m
        # A zero-width match ends where it started. Step at least one
        # character past its start so the scan always makes progress
        # instead of finding the same empty match forever.
        start = max(m.end(), m.start() + 1)
# Regular expressions for the pieces of text that are treated as place
# holders. Raw strings keep the backslashes intact, and the "$" of the
# positional printf forms is escaped: unescaped it is the end-of-string
# anchor, which made the "%N$lx" pattern impossible to match.
placeholders = [
    r'(%[diouxXeEfFgGcrs])',
    r'(\\+.?)',
    r'(%[0-9]\$lx)',
    r'(%[0-9]\$[a-z])',
    r'(<.+?>)',
]
re_placeholders = [re.compile(ph) for ph in placeholders]


def _getPhMatches(text):
    """Return a list of regexp match objects for all place holders in @text.

    The matches are ordered by their position in @text and never overlap.
    When matches of different patterns overlap, the one starting first is
    kept; of several starting at the same position the longest is kept.
    """
    matches = []
    for re_ph in re_placeholders:
        # None of the patterns can match an empty string, so finditer yields
        # exactly the successive non-overlapping matches of each pattern.
        matches.extend(re_ph.finditer(text))

    # sort them so they come sequentially (longest first on equal start)
    matches.sort(key=lambda m: (m.start(), -m.end()))

    # Drop every match that starts inside an already accepted one, e.g. the
    # "%s" inside '<a href="%s">'. Overlapping matches would be emitted as
    # separate place holders and duplicate part of the text.
    sequential = []
    covered = 0
    for m in matches:
        if m.start() >= covered:
            sequential.append(m)
            covered = m.end()
    return sequential
# Namespace URI used to build the qualified xml:lang and xml:space
# attribute names.
XML_NS = 'http://www.w3.org/XML/1998/namespace'


def setXMLlang(node, lang):
    """Set the xml:lang attribute of node to lang."""
    attribute = "{%s}lang" % XML_NS
    node.set(attribute, lang)
def setXMLspace(node, value):
    """Set the xml:space attribute of node to value."""
    attribute = "{%s}space" % XML_NS
    node.set(attribute, value)
def namespaced(namespace, name):
    """Return name qualified with namespace in Clark notation.

    For example "source" in the XLIFF 1.1 namespace becomes
    {urn:oasis:names:tc:xliff:document:1.1}source
    This is needed throughout lxml.

    When namespace is false (None or an empty string) name is returned
    unchanged.
    """
    if not namespace:
        return name
    return "{%s}%s" % (namespace, name)
class LISAunit(base.TranslationUnit):
    """A single unit in the file.

    The unit wraps one XML element (self.xmlelement) whose language nodes
    hold the text per language; the first language node is treated as the
    source and the second as the target.
    Provisional work is done to make several languages possible."""

    # The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    # The name of the per language element of this unit type:(termEntry, tu, trans-unit)
    languageNode = ""
    # The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    # Namespace used by namespaced() to qualify element names; with None the
    # names are used unqualified.
    namespace = None

    def __init__(self, source, empty=False):
        """Constructs a unit containing the given source string.

        With empty=True nothing is initialised: no xmlelement is created and
        the base class constructor is not called. createfromxmlElement()
        relies on this and attaches an existing element afterwards.
        """
        if empty:
            return
        self.xmlelement = etree.Element(self.rootNode)
        #add descrip, note, etc.

        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units by the text of their language nodes, in order."""
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            if self.getNodeText(mynode) != other.getNodeText(othernode):
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def setsource(self, source, sourcelang='en'):
        """Sets the source string, replacing the first language node if any.

        Without existing language nodes the new node is appended to the unit
        element.
        """
        source = data.forceunicode(source)
        languageNodes = self.getlanguageNodes()
        sourcelanguageNode = self.createlanguageNode(sourcelang, source, "source")
        if len(languageNodes) > 0:
            # Replace the actual first language node. The first child of the
            # element is not necessarily a language node, so assigning to
            # index 0 could overwrite an unrelated child.
            self.xmlelement.replace(languageNodes[0], sourcelanguageNode)
        else:
            self.xmlelement.append(sourcelanguageNode)

    def getsource(self):
        """Returns the text of the first language node (None if there is none)."""
        return self.getNodeText(self.getlanguageNode(lang=None, index=0))
    source = property(getsource, setsource)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively appends to the list

        With text None (and append false) an existing second language node
        is removed. Nothing is changed when the current target already
        equals text.
        """
        #XXX: we really need the language - can't really be optional
        text = data.forceunicode(text)
        #Firstly deal with reinitialising to None or setting to identical string
        if self.gettarget() == text:
            return
        languageNodes = self.getlanguageNodes()
        assert len(languageNodes) > 0
        if text is not None:
            languageNode = self.createlanguageNode(lang, text, "target")
            if append or len(languageNodes) == 1:
                self.xmlelement.append(languageNode)
            else:
                # Swap the old target for the new one in place. Inserting at
                # child index 1 and removing the old node afterwards puts the
                # new node in the wrong position when other children precede
                # the language nodes.
                self.xmlelement.replace(languageNodes[1], languageNode)
        elif not append and len(languageNodes) > 1:
            self.xmlelement.remove(languageNodes[1])

    def gettarget(self, lang=None):
        """retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists"""
        if lang:
            node = self.getlanguageNode(lang=lang)
        else:
            node = self.getlanguageNode(lang=None, index=1)
        return self.getNodeText(node)
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags

        Every place holder found by _getPhMatches() becomes a <ph> child of
        parent with a sequential id starting at 1; the text between the
        place holders is stored as parent.text and as the tails of the <ph>
        elements.
        """
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        for i, m in enumerate(matches):
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phnode = etree.SubElement(parent, "ph")
            phnode.set("id", str(i + 1))
            phnode.text = m.group()
            lasttag = phnode
            start = m.end()
        #post text
        posttext = text[start:]
        if posttext:
            lasttag.tail = posttext

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information."""
        return self.xmlelement.findall(self.namespaced(self.languageNode))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index

        Returns None when no node has the requested xml:lang value or when
        index is beyond the last language node. Raises KeyError when both
        lang and index are None.
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if node.get("{%s}lang" % XML_NS) == lang:
                    return node
        else:#have to use index
            if index >= len(languageNodes):
                return None
            else:
                return languageNodes[index]
        return None

    def getNodeText(self, languageNode):
        """Retrieves the term from the given languageNode

        Returns None when languageNode is None, or when textNode is set but
        no such element exists below languageNode.
        """
        if languageNode is None:
            return None
        if self.textNode:
            terms = languageNode.findall('.//%s' % self.namespaced(self.textNode))
            if len(terms) == 0:
                return None
            return getText(terms[0])
        else:
            return getText(languageNode)

    def __str__(self):
        """Returns the unit's XML, pretty printed and encoded as UTF-8."""
        return etree.tostring(self.xmlelement, pretty_print=True, encoding='utf-8')

    @classmethod
    def createfromxmlElement(cls, element):
        """Returns a unit of this class wrapped around the existing element."""
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats."""
    UnitClass = LISAunit
    # The root node of the XML document:
    rootNode = ""
    # The root node of the content section:
    bodyNode = ""
    # The XML skeleton to use for empty construction:
    XMLskeleton = ""

    # Default namespace of the parsed document; set by initbody().
    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
        """Creates a store from inputfile, or from XMLskeleton when no
        inputfile is given (in which case addheader() is called as well)."""
        super(LISAfile, self).__init__(unitclass=unitclass)
        self.setsourcelanguage(sourcelanguage)
        self.settargetlanguage(targetlanguage)
        if inputfile is None:
            # We strip out newlines to ensure that spaces in the skeleton doesn't
            # interfere with the the pretty printing of lxml
            skeleton = self.XMLskeleton.replace("\n", "")
            self.parse(skeleton)
            self.addheader()
        else:
            self.parse(inputfile)
            assert self.document.getroot().tag == self.namespaced(self.rootNode)

    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the XML again."""
        root = self.document.getroot()
        # The entry under the None key of nsmap is the default namespace.
        self.namespace = root.nsmap.get(None, None)
        bodypath = '//%s' % self.namespaced(self.bodyNode)
        self.body = self.document.find(bodypath)

    def setsourcelanguage(self, sourcelanguage):
        """Sets the source language for this store"""
        self.sourcelanguage = sourcelanguage

    def settargetlanguage(self, targetlanguage):
        """Sets the target language for this store"""
        self.targetlanguage = targetlanguage

    def addsourceunit(self, source):
        """Adds and returns a new unit with the given string as first entry."""
        #TODO: maybe this should rather be called addsourcestring or something?
        newunit = self.UnitClass(source)
        self.addunit(newunit)
        return newunit

    def addunit(self, unit):
        """Adds unit to the store: its element goes into the body and the
        unit takes over the namespace of this store."""
        unit.namespace = self.namespace
        self.body.append(unit.xmlelement)
        self.units.append(unit)

    def __str__(self):
        """Converts to a string containing the file's XML"""
        return etree.tostring(self.document, pretty_print=True, xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Populates this object from the given xml string

        xml may also be an object with a read() method; it is rewound with
        seek(0) and read completely. One unit is added for every unit
        element found below the body.
        """
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            xml.seek(0)
            xml = xml.read()
        self.document = etree.fromstring(xml).getroottree()
        self.encoding = self.document.docinfo.encoding
        self.initbody()
        assert self.document.getroot().tag == self.namespaced(self.rootNode)
        unittag = self.namespaced(self.UnitClass.rootNode)
        termEntries = self.body.findall('.//%s' % unittag)
        # "or ()" keeps the original guard: a None result adds no units.
        for entry in termEntries or ():
            term = self.UnitClass.createfromxmlElement(entry)
            term.namespace = self.namespace
            self.units.append(term)