# -*- coding: utf-8 -*-
#
# Copyright 2006-2007 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Parent class for LISA standards (TMX, TBX, XLIFF)"""
import re

from translate.storage import base
from translate.lang import data

try:
    from lxml import etree
except ImportError:
    raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.")
def getText(node):
    """Return the text of node joined with the text of all its descendants.

    node is an lxml element.  When it has child elements the XPath
    ``string()`` function is used to concatenate every text node below it;
    otherwise node.text is passed through data.forceunicode, and an empty
    unicode string is returned if that result is falsy.
    """
    # node.xpath is very slow, so we only use it if there are children
    # TODO: consider rewriting by iterating over children
    # len() is the explicit child test; truth-testing an lxml element is
    # deprecated and emits a FutureWarning.
    if len(node):
        return node.xpath("string()") # specific to lxml.etree
    # if node.text is None, we want to return "" since the tag is there
    return data.forceunicode(node.text) or u""
def _findAllMatches(text, re_obj):
    """Generate match objects for all @re_obj matches in @text.

    text is the string to scan and re_obj a compiled regular expression.
    Matches are yielded left to right and never overlap one another.
    """
    # finditer performs the repeated "search from the end of the previous
    # match" scan itself and always advances, even past an empty match.
    for m in re_obj.finditer(text):
        yield m
# Regular expressions for the inline placeholders that get wrapped in <ph>
# tags.  Raw strings keep the backslashes literal without relying on Python
# passing unknown escapes such as "\$" through unchanged.
# NOTE(review): the third pattern contains an unescaped "$" (end-of-string
# anchor) followed by "lx", so it can never match as written.  It is left
# untouched: escaping it would make it overlap with the fourth pattern.
placeholders = [
    r'(%[diouxXeEfFgGcrs])',
    r'(\\+.?)',
    r'(%[0-9]$lx)',
    r'(%[0-9]\$[a-z])',
    r'(<.+?>)',
]
# The same patterns, compiled once at import time.
re_placeholders = [re.compile(ph) for ph in placeholders]
def _getPhMatches(text):
    """Return a list of match objects for all placeholders in @text.

    Every pattern in re_placeholders is applied to the whole text and the
    results are merged into a single list ordered by start offset.  Matches
    that come from different patterns may overlap each other.
    """
    matches = []
    for re_ph in re_placeholders:
        matches.extend(_findAllMatches(text, re_ph))

    # Sort them so they come sequentially.  A key function is used instead of
    # a cmp-style comparator, which Python 3 no longer accepts; the sort is
    # stable, so matches with equal start offsets keep their pattern order.
    matches.sort(key=lambda m: m.start())
    return matches
# Namespace URI that the reserved "xml" prefix (xml:lang, xml:space) maps to.
XML_NS = 'http://www.w3.org/XML/1998/namespace'
def setXMLlang(node, lang):
    """Sets the xml:lang attribute on node.

    The attribute name is written in Clark notation using XML_NS, and lang
    is stored as its value via node.set().
    """
    node.set("{%s}lang" % XML_NS, lang)
def setXMLspace(node, value):
    """Sets the xml:space attribute on node.

    The attribute name is written in Clark notation using XML_NS, and value
    is stored unchanged via node.set().
    """
    node.set("{%s}space" % XML_NS, value)
def namespaced(namespace, name):
    """Returns name in Clark notation within the given namespace.

    For example namespaced("source") in an XLIFF document might return
        {urn:oasis:names:tc:xliff:document:1.1}source
    This is needed throughout lxml.

    If namespace is None or empty the bare name is returned, so documents
    without a default namespace are not searched for "{None}name".
    """
    if namespace:
        return "{%s}%s" % (namespace, name)
    return name
class LISAunit(base.TranslationUnit):
    """A single unit in the file.

    Provisional work is done to make several languages possible.

    The unit wraps one lxml element (self.xmlelement) whose children are the
    per-language nodes; the first one is treated as the source and the second
    one as the target.
    """

    # NOTE(review): the four attribute assignments below were missing from
    # the garbled source.  The names are taken from their uses in this class
    # (self.rootNode, self.languageNode, self.textNode, self.namespace); the
    # default values are assumptions -- confirm against the subclasses.

    #The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    #The name of the per language element of this unit type:(termEntry, tu, trans-unit)
    languageNode = ""
    #The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    # XML namespace of the owning document; the store assigns it in addunit()
    # and parse().
    namespace = None

    def __init__(self, source, empty=False):
        """Constructs a unit containing the given source string.

        With empty=True nothing is built: createfromxmlElement() uses this
        to obtain a bare instance and attaches an existing element itself.
        """
        if empty:
            return
        # The element has to exist before the base initialiser runs.
        # NOTE(review): presumably the base class assigns self.source, which
        # goes through setsource() below -- confirm in base.TranslationUnit.
        self.xmlelement = etree.Element(self.rootNode)
        #add descrip, note, etc.

        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units.

        Two units are equal when they have the same number of language nodes
        and the text of each pair of nodes, taken in order, is equal.
        """
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            if self.getNodeText(mynode) != other.getNodeText(othernode):
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def setsource(self, source, sourcelang='en'):
        """Sets the source string, replacing the first child element if any.

        A new language node is created for sourcelang; it overwrites child 0
        of the unit element when language nodes already exist, and is
        appended otherwise.
        """
        source = data.forceunicode(source)
        languageNodes = self.getlanguageNodes()
        sourcelanguageNode = self.createlanguageNode(sourcelang, source, "source")
        if len(languageNodes) > 0:
            self.xmlelement[0] = sourcelanguageNode
        else:
            self.xmlelement.append(sourcelanguageNode)

    def getsource(self):
        """Returns the text of the first language node (None if absent)."""
        return self.getNodeText(self.getlanguageNode(lang=None, index=0))
    source = property(getsource, setsource)

    def settarget(self, text, lang='xx', append=False):
        """Sets the "target" string (second language), or alternatively appends to the list"""
        #XXX: we really need the language - can't really be optional
        text = data.forceunicode(text)
        #Firstly deal with reinitialising to None or setting to identical string
        if self.gettarget() == text:
            return
        languageNodes = self.getlanguageNodes()
        assert len(languageNodes) > 0
        # NOTE(review): the guard on text was missing from the garbled
        # source; it is reconstructed from the comment above about
        # reinitialising to None.
        if text is not None:
            languageNode = self.createlanguageNode(lang, text, "target")
            if append or len(languageNodes) == 1:
                self.xmlelement.append(languageNode)
            else:
                # Put the new target directly after the source node.
                self.xmlelement.insert(1, languageNode)
        # When replacing rather than appending, drop the previous target.
        if not append and len(languageNodes) > 1:
            self.xmlelement.remove(languageNodes[1])

    def gettarget(self, lang=None):
        """retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists"""
        if lang:
            node = self.getlanguageNode(lang=lang)
        else:
            node = self.getlanguageNode(lang=None, index=1)
        return self.getNodeText(node)
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags.

        Every placeholder found by _getPhMatches() becomes a <ph> child of
        parent, numbered from 1 in its "id" attribute; the text between
        placeholders is stored in parent.text and in the tails of the <ph>
        elements.  A placeholder match that overlaps one already emitted is
        skipped, so no part of text is written twice.
        """
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        phcount = 0
        for m in matches:
            if m.start() < start:
                # Different patterns can match overlapping spans (e.g. "%s"
                # inside "<a href='%s'>").  The span up to `start` is already
                # inside the previous <ph>, so emitting this match as well
                # would duplicate text.
                continue
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phcount += 1
            phnode = etree.SubElement(parent, "ph")
            phnode.set("id", str(phcount))
            phnode.text = m.group()
            start = m.end()
            lasttag = phnode
        #posttext
        if text[start:]:
            lasttag.tail = text[start:]

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information."""
        return self.xmlelement.findall(self.namespaced(self.languageNode))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index.

        If lang is given, the first language node whose xml:lang attribute
        equals it is returned; otherwise the node at position index.  None is
        returned when nothing matches or index is past the end.  Raises
        KeyError when neither lang nor index is supplied.
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if node.get("{%s}lang" % XML_NS) == lang:
                    return node
        else:#have to use index
            if index >= len(languageNodes):
                return None
            else:
                return languageNodes[index]
        return None

    def getNodeText(self, languageNode):
        """Retrieves the term from the given languageNode.

        Returns None when languageNode is None, or when the class defines a
        textNode and no such element exists below languageNode.
        """
        if languageNode is None:
            return None
        if self.textNode:
            terms = languageNode.findall('.//%s' % self.namespaced(self.textNode))
            if not terms:
                return None
            return getText(terms[0])
        else:
            return getText(languageNode)

    # NOTE(review): the "def" line of this method was missing from the
    # garbled source; the name __str__ is an assumption.
    def __str__(self):
        """Returns the unit's XML, pretty-printed and encoded as UTF-8."""
        return etree.tostring(self.xmlelement, pretty_print=True, encoding='utf-8')

    @classmethod
    def createfromxmlElement(cls, element):
        """Returns a unit of this class wrapped around an existing element."""
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats."""

    # NOTE(review): the class attribute assignments below were missing from
    # the garbled source.  The names come from their uses in this class
    # (self.UnitClass, self.rootNode, self.bodyNode, self.XMLskeleton,
    # self.namespace); the default values are assumptions -- confirm.
    UnitClass = LISAunit
    #The root node of the XML document:
    rootNode = ""
    #The root node of the content section:
    bodyNode = ""
    #The XML skeleton to use for empty construction:
    XMLskeleton = ""

    # Default namespace of the parsed document; set by initbody().
    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
        """Creates a store, either from inputfile or from the empty skeleton.

        inputfile, when given, is handed to parse().  Otherwise the class's
        XMLskeleton is parsed to obtain an empty document.
        """
        super(LISAfile, self).__init__(unitclass=unitclass)
        self.setsourcelanguage(sourcelanguage)
        self.settargetlanguage(targetlanguage)
        if inputfile is not None:
            self.parse(inputfile)
            assert self.document.getroot().tag == self.namespaced(self.rootNode)
        else:
            # We strip out newlines to ensure that spaces in the skeleton doesn't
            # interfere with the the pretty printing of lxml
            self.parse(self.XMLskeleton.replace("\n", ""))
            # NOTE(review): this call was missing from the garbled source;
            # running the header hook for freshly created documents is an
            # assumption.
            self.addheader()

    # NOTE(review): the "def" line was missing from the garbled source; the
    # method name is reconstructed -- confirm against subclasses.
    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    # NOTE(review): the "def" line was missing from the garbled source; the
    # method name is reconstructed -- confirm against subclasses.
    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the XML again."""
        self.namespace = self.document.getroot().nsmap.get(None, None)
        # './/' is what lxml rewrites a leading '//' to on an ElementTree;
        # spelling it out gives the same result without the FutureWarning.
        self.body = self.document.find('.//%s' % self.namespaced(self.bodyNode))

    def setsourcelanguage(self, sourcelanguage):
        """Sets the source language for this store"""
        self.sourcelanguage = sourcelanguage

    def settargetlanguage(self, targetlanguage):
        """Sets the target language for this store"""
        self.targetlanguage = targetlanguage

    def addsourceunit(self, source):
        """Adds and returns a new unit with the given string as first entry."""
        #TODO: maybe this should rather be called addsourcestring or something?
        newunit = self.UnitClass(source)
        self.addunit(newunit)
        return newunit

    def addunit(self, unit):
        """Adds unit to the store and its element to the document body.

        The unit inherits the store's namespace so that its own lookups use
        the same Clark-notation names as the document.
        """
        unit.namespace = self.namespace
        self.body.append(unit.xmlelement)
        self.units.append(unit)

    # NOTE(review): the "def" line was missing from the garbled source; the
    # name __str__ is taken from the docstring's wording.
    def __str__(self):
        """Converts to a string containing the file's XML"""
        return etree.tostring(self.document, pretty_print=True, xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Populates this object from the given xml string.

        xml may also be a file-like object, in which case its content is
        read first.  One unit is created for every element below the body
        whose tag is the unit class's rootNode.
        """
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            # NOTE(review): the lines between this check and fromstring()
            # were missing from the garbled source; rewinding and reading the
            # whole stream is a reconstruction.
            if hasattr(xml, "seek"):
                xml.seek(0)
            xml = xml.read()
        self.document = etree.fromstring(xml).getroottree()
        self.encoding = self.document.docinfo.encoding
        # self.body and self.namespace are needed by everything below.
        self.initbody()
        assert self.document.getroot().tag == self.namespaced(self.rootNode)
        # findall() always returns a list, so no None check is required.
        termEntries = self.body.findall('.//%s' % self.namespaced(self.UnitClass.rootNode))
        for entry in termEntries:
            term = self.UnitClass.createfromxmlElement(entry)
            term.namespace = self.namespace
            self.units.append(term)