for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / convert / dtd2po.py
blob171369dbfdd2df372e5c14c7b3ed2d745238ea2f
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2002-2006 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """script to convert a mozilla .dtd UTF-8 localization format to a
23 gettext .po localization file using the po and dtd modules, and the
24 dtd2po convertor class which is in this module
25 You can convert back to .dtd using po2dtd.py"""
27 from translate.storage import po
28 from translate.storage import dtd
29 from translate.misc import quote
class dtd2po:
    """Convertor that turns Mozilla .dtd units and stores into gettext PO.

    An entity ending in one of ``labelsuffixes`` that has a sibling entity
    ending in one of ``accesskeysuffixes`` is treated as a "mixed" entity:
    both are combined into a single PO unit whose source carries the access
    key marked with ``&`` inside the label.
    """

    # labelsuffixes and accesskeysuffixes are combined to accelerator notation
    labelsuffixes = (".label", ".title")
    accesskeysuffixes = (".accesskey", ".accessKey", ".akey")

    def __init__(self, blankmsgstr=False, duplicatestyle="msgctxt"):
        """Set up the convertor.

        blankmsgstr: when true, mergestore() leaves every target empty
            instead of filling it from the translated dtd.
        duplicatestyle: passed unchanged to the PO store's
            removeduplicates() by convertstore() and mergestore().
        """
        # comment text of the localization group currently open, if any
        self.currentgroup = None
        self.blankmsgstr = blankmsgstr
        self.duplicatestyle = duplicatestyle

    def convertcomments(self, thedtd, thepo):
        """Copy the location and the comments of thedtd onto thepo.

        Also tracks localization group start/end comments in
        self.currentgroup, so the state persists across successive units.
        """
        entity = quote.rstripeol(thedtd.entity)
        if len(entity) > 0:
            thepo.addlocation(thedtd.entity)
        for commenttype, comment in thedtd.comments:
            # handle groups
            if commenttype == "locgroupstart":
                self.currentgroup = comment.replace('BEGIN', 'GROUP')
            elif commenttype == "locgroupend":
                self.currentgroup = None
            # Group comments deliberately fall through to the "normal
            # comment" branch below: this test is a separate if, not part
            # of the chain above.
            if commenttype == "automaticcomment":
                # already reduced to the bare note text by convertunit()
                thepo.addnote(comment, origin="developer")
            else:
                thepo.addnote(quote.stripcomment(comment), origin="developer")
        # handle group stuff
        if self.currentgroup is not None:
            thepo.addnote(quote.stripcomment(self.currentgroup), origin="translator")
        if entity.endswith(".height") or entity.endswith(".width") or entity.endswith(".size"):
            thepo.addnote("Do not translate this. Only change the numeric values if you need this dialogue box to appear bigger", origin="developer")

    def convertstrings(self, thedtd, thepo):
        """Set thepo.source from the dtd definition and blank thepo.target."""
        # extract the string, get rid of quoting
        unquoted = dtd.unquotefromdtd(thedtd.definition).replace("\r", "")
        # now split the string into lines, dropping blank leading/trailing ones
        lines = unquoted.split('\n')
        while lines and not lines[0].strip():
            del lines[0]
        while lines and not lines[-1].strip():
            del lines[-1]
        if len(lines) > 1:
            # Strip the whitespace around each line break, then append one
            # space to every line except the last.
            firstline = [lines[0].rstrip() + ' ']
            middlelines = [line.strip() + ' ' for line in lines[1:-1]]
            lastline = [lines[-1].lstrip()]
            thepo.source = "\n".join(firstline + middlelines + lastline)
        elif lines:
            thepo.source = lines[0]
        else:
            thepo.source = ""
        thepo.target = ""

    def convertunit(self, thedtd):
        """converts a dtd unit to a po unit, returns None if empty or not for translation

        Note that thedtd is modified in place: a matching localization note
        is rewritten as an "automaticcomment", and a DONT_TRANSLATE note
        blanks the unit's entity and definition.
        """
        if thedtd is None:
            return None
        if getattr(thedtd, "entityparameter", None) == "SYSTEM":
            return None
        thepo = po.pounit(encoding="UTF-8")
        # remove unwanted stuff
        for commentnum, (commenttype, locnote) in enumerate(thedtd.comments):
            # only localization notes need special treatment
            if commenttype != 'locnote':
                continue
            # parse the locnote into the entity and the actual note
            typeend = quote.findend(locnote, 'LOCALIZATION NOTE')
            # parse the id
            idstart = locnote.find('(', typeend)
            if idstart == -1:
                continue
            idend = locnote.find(')', idstart+1)
            entity = locnote[idstart+1:idend].strip()
            # parse the actual note
            actualnotestart = locnote.find(':', idend+1)
            actualnoteend = locnote.find('-->', idend)
            actualnote = locnote[actualnotestart+1:actualnoteend].strip()
            # only process notes that are for this entity
            if thedtd.entity != entity:
                continue
            if actualnote.startswith("DONT_TRANSLATE"):
                # take out the entity, definition and the DONT_TRANSLATE comment
                thedtd.entity = ""
                thedtd.definition = ""
                del thedtd.comments[commentnum]
                # the list was just modified, so stop iterating over it
                break
            # convert it into an automatic comment, to be processed by convertcomments
            thedtd.comments[commentnum] = ("automaticcomment", actualnote)
        # do a standard translation
        self.convertcomments(thedtd, thepo)
        self.convertstrings(thedtd, thepo)
        if thepo.isblank() and not thepo.getlocations():
            return None
        return thepo

    def _findaccesskeypos(self, label, accesskey):
        """Return the index in label to put the '&' marker at, or -1.

        Text between '&' and ';' is skipped so existing &entity; references
        are not muddled. The first match of accesskey.upper() wins; failing
        that, the first match of accesskey.lower() is used.
        """
        upperkey = accesskey.upper()
        lowerkey = accesskey.lower()
        inentity = False
        altcasepos = -1
        for pos, char in enumerate(label):
            if char == '&':
                inentity = True
            elif char == ';':
                inentity = False
            elif not inentity:
                if char == upperkey:
                    # always prefer uppercase
                    return pos
                if char == lowerkey and altcasepos == -1:
                    # only remember the first alternate-case position and
                    # keep looking in the hope of an uppercase match
                    altcasepos = pos
        return altcasepos

    def convertmixedunit(self, labeldtd, accesskeydtd):
        """Combine a label dtd unit and an accesskey dtd unit into one po unit.

        Returns the single converted unit when only one of the two converts,
        and None when the access key is empty or does not occur in the
        label (the caller then converts the units separately).
        """
        labelpo = self.convertunit(labeldtd)
        accesskeypo = self.convertunit(accesskeydtd)
        if labelpo is None:
            return accesskeypo
        if accesskeypo is None:
            return labelpo
        thepo = po.pounit(encoding="UTF-8")
        thepo.addlocations(labelpo.getlocations())
        thepo.addlocations(accesskeypo.getlocations())
        thepo.msgidcomment = thepo._extract_msgidcomments() + labelpo._extract_msgidcomments()
        thepo.msgidcomment = thepo._extract_msgidcomments() + accesskeypo._extract_msgidcomments()
        thepo.addnote(labelpo.getnotes("developer"), "developer")
        thepo.addnote(accesskeypo.getnotes("developer"), "developer")
        thepo.addnote(labelpo.getnotes("translator"), "translator")
        thepo.addnote(accesskeypo.getnotes("translator"), "translator")
        # redo the strings from original dtd...
        # NOTE(review): the decode/encode pair assumes unquotefromdtd returns
        # UTF-8 byte strings, so the search below works on characters - confirm.
        label = dtd.unquotefromdtd(labeldtd.definition).decode('UTF-8')
        accesskey = dtd.unquotefromdtd(accesskeydtd.definition).decode('UTF-8')
        if len(accesskey) == 0:
            return None
        # try and put the & in front of the accesskey in the label...
        accesskeypos = self._findaccesskeypos(label, accesskey)
        if accesskeypos < 0:
            # can't currently mix accesskey if it's not in label
            return None
        label = label[:accesskeypos] + '&' + label[accesskeypos:]
        thepo.source = label.encode("UTF-8", "replace")
        thepo.target = ""
        return thepo

    def findmixedentities(self, thedtdfile):
        """creates self.mixedentities from the dtd file...

        Any previous content of self.mixedentities is discarded. Both the
        label entity and the accesskey entity of each pair become keys,
        each mapped to a fresh empty dict.
        """
        # those entities which have a .label/.title and .accesskey combined
        self.mixedentities = {}
        index = thedtdfile.index
        for entity in index:
            for labelsuffix in self.labelsuffixes:
                if not entity.endswith(labelsuffix):
                    continue
                entitybase = entity[:entity.rfind(labelsuffix)]
                # a matching accesskey entity makes this a mixed entity
                for akeytype in self.accesskeysuffixes:
                    if (entitybase + akeytype) in index:
                        # add both versions to the list of mixed entities
                        self.mixedentities[entity] = {}
                        self.mixedentities[entitybase + akeytype] = {}

    def convertdtdunit(self, thedtdfile, thedtd, mixbucket="dtd"):
        """converts a dtd unit from thedtdfile to a po unit, handling mixed entities along the way...

        self.mixedentities[entity][mixbucket] records the outcome for each
        half of a pair: True once the pair was combined (the other half then
        yields None here), False when combining failed. findmixedentities()
        must have been called first.
        """
        if thedtd.entity in self.mixedentities:
            # use special convertmixed unit which produces one pounit with
            # both combined for the label and None for the accesskey
            alreadymixed = self.mixedentities[thedtd.entity].get(mixbucket, None)
            if alreadymixed:
                # we are successfully throwing this away...
                return None
            elif alreadymixed is None:
                # depending on what we come across first, work out the label and the accesskey
                labeldtd, accesskeydtd = None, None
                labelentity, accesskeyentity = None, None
                for labelsuffix in self.labelsuffixes:
                    if thedtd.entity.endswith(labelsuffix):
                        entitybase = thedtd.entity[:thedtd.entity.rfind(labelsuffix)]
                        for akeytype in self.accesskeysuffixes:
                            if (entitybase + akeytype) in thedtdfile.index:
                                labelentity, labeldtd = thedtd.entity, thedtd
                                accesskeyentity = labelentity[:labelentity.rfind(labelsuffix)]+akeytype
                                accesskeydtd = thedtdfile.index[accesskeyentity]
                                break
                else:
                    # The break above only leaves the inner loop, so this
                    # clause always runs. It changes nothing unless the
                    # entity ends in an accesskey suffix.
                    for akeytype in self.accesskeysuffixes:
                        if thedtd.entity.endswith(akeytype):
                            accesskeyentity, accesskeydtd = thedtd.entity, thedtd
                            for labelsuffix in self.labelsuffixes:
                                labelentity = accesskeyentity[:accesskeyentity.rfind(akeytype)]+labelsuffix
                                if labelentity in thedtdfile.index:
                                    labeldtd = thedtdfile.index[labelentity]
                                    break
                            else:
                                # no label entity exists for this accesskey
                                labelentity = None
                                accesskeyentity = None
                thepo = self.convertmixedunit(labeldtd, accesskeydtd)
                if thepo is not None:
                    if accesskeyentity is not None:
                        self.mixedentities[accesskeyentity][mixbucket] = True
                    if labelentity is not None:
                        self.mixedentities[labelentity][mixbucket] = True
                    return thepo
                else:
                    # otherwise the mix failed. add each one separately and remember they weren't mixed
                    if accesskeyentity is not None:
                        self.mixedentities[accesskeyentity][mixbucket] = False
                    if labelentity is not None:
                        self.mixedentities[labelentity][mixbucket] = False
        return self.convertunit(thedtd)

    def convertstore(self, thedtdfile):
        """Convert a whole dtd store into a new PO store (no translations)."""
        thetargetfile = po.pofile()
        targetheader = thetargetfile.makeheader(charset="UTF-8", encoding="8bit", x_accelerator_marker="&")
        targetheader.addnote("extracted from %s" % thedtdfile.filename, "developer")
        thetargetfile.addunit(targetheader)
        thedtdfile.makeindex()
        self.findmixedentities(thedtdfile)
        # go through the dtd and convert each unit
        for thedtd in thedtdfile.units:
            if thedtd.isnull():
                continue
            thepo = self.convertdtdunit(thedtdfile, thedtd)
            if thepo is not None:
                thetargetfile.addunit(thepo)
        thetargetfile.removeduplicates(self.duplicatestyle)
        return thetargetfile

    def mergestore(self, origdtdfile, translateddtdfile):
        """Build a PO store with sources from origdtdfile and targets from translateddtdfile.

        Targets are left empty when self.blankmsgstr is set or when the
        translated file has no usable unit for the entity.
        """
        thetargetfile = po.pofile()
        # Declare the accelerator marker as convertstore() does: merged
        # sources carry the same '&' access key notation.
        targetheader = thetargetfile.makeheader(charset="UTF-8", encoding="8bit", x_accelerator_marker="&")
        targetheader.addnote("extracted from %s, %s" % (origdtdfile.filename, translateddtdfile.filename), "developer")
        thetargetfile.addunit(targetheader)
        origdtdfile.makeindex()
        # NOTE(review): findmixedentities() rebuilds self.mixedentities from
        # scratch, so the result of this first call is discarded by the
        # second call below - confirm whether the two were meant to merge.
        self.findmixedentities(origdtdfile)
        translateddtdfile.makeindex()
        self.findmixedentities(translateddtdfile)
        # go through the dtd files and convert each unit
        for origdtd in origdtdfile.units:
            if origdtd.isnull():
                continue
            origpo = self.convertdtdunit(origdtdfile, origdtd, mixbucket="orig")
            if origdtd.entity in self.mixedentities:
                mixedentitydict = self.mixedentities[origdtd.entity]
                if "orig" not in mixedentitydict:
                    # this means that the entity is mixed in the translation, but not the original - treat as unmixed
                    mixbucket = "orig"
                    del self.mixedentities[origdtd.entity]
                elif mixedentitydict["orig"]:
                    # the original entity is already mixed successfully
                    mixbucket = "translate"
                else:
                    # mixing the original failed; reuse the "orig" bucket so
                    # the translated unit is converted unmixed as well
                    mixbucket = "orig"
            else:
                mixbucket = "translate"
            if origpo is None:
                # this means its a mixed entity (with accesskey) that's already been dealt with
                continue
            if origdtd.entity in translateddtdfile.index:
                translateddtd = translateddtdfile.index[origdtd.entity]
                translatedpo = self.convertdtdunit(translateddtdfile, translateddtd, mixbucket=mixbucket)
            else:
                translatedpo = None
            if translatedpo is not None and not self.blankmsgstr:
                origpo.target = translatedpo.source
            thetargetfile.addunit(origpo)
        thetargetfile.removeduplicates(self.duplicatestyle)
        return thetargetfile
def convertdtd(inputfile, outputfile, templatefile, pot=False, duplicatestyle="msgctxt"):
    """Read inputfile (and templatefile, if given) as dtd and write PO to outputfile.

    Without a template the input is converted on its own; with a template
    the template supplies the originals and the input the translations.
    Returns 0 without writing anything when the resulting store is empty,
    otherwise 1.
    """
    inputstore = dtd.dtdfile(inputfile)
    convertor = dtd2po(blankmsgstr=pot, duplicatestyle=duplicatestyle)
    if templatefile is not None:
        # template = original dtd, input = translated dtd
        templatestore = dtd.dtdfile(templatefile)
        outputstore = convertor.mergestore(templatestore, inputstore)
    else:
        outputstore = convertor.convertstore(inputstore)
    if not outputstore.isempty():
        outputfile.write(str(outputstore))
        return 1
    return 0
def main(argv=None):
    """Run the dtd to po convertor on the given command line arguments."""
    # imported here so that importing this module does not pull in the
    # command line machinery
    from translate.convert import convert
    # the same convertor handles both the plain and the templated case
    potarget = ("po", convertdtd)
    formats = {"dtd": potarget, ("dtd", "dtd"): potarget}
    parser = convert.ConvertOptionParser(
        formats, usetemplates=True, usepots=True, description=__doc__)
    parser.add_duplicates_option()
    parser.passthrough.append("pot")
    parser.run(argv)
# only run the convertor when executed as a script, not when imported
if __name__ == "__main__":
    main()