convert/dtd2po.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """script to convert a mozilla .dtd UTF-8 localization format to a
  23 gettext .po localization file using the po and dtd modules, and the
  24 dtd2po convertor class which is in this module
  25 You can convert back to .dtd using po2dtd.py"""
  26
  27 from translate.storage import po
  28 from translate.storage import dtd
  29 from translate.misc import quote
  30
  31 class dtd2po:
  32     def __init__(self, blankmsgstr=False, duplicatestyle="msgctxt"):
  33         self.currentgroup = None
  34         self.blankmsgstr = blankmsgstr
  35         self.duplicatestyle = duplicatestyle
  36
  37     def convertcomments(self, thedtd, thepo):
  38         entity = quote.rstripeol(thedtd.entity)
  39         if len(entity) > 0:
  40             thepo.addlocation(thedtd.entity)
  41         for commenttype, comment in thedtd.comments:
  42             # handle groups
  43             if (commenttype == "locgroupstart"):
  44                 groupcomment = comment.replace('BEGIN','GROUP')
  45                 self.currentgroup = groupcomment
  46             elif (commenttype == "locgroupend"):
  47                 groupcomment = comment.replace('END','GROUP')
  48                 self.currentgroup = None
  49             # handle automatic comment
  50             if commenttype == "automaticcomment":
  51                 thepo.addnote(comment, origin="developer")
  52             # handle normal comments
  53             else:
  54                 thepo.addnote(quote.stripcomment(comment), origin="developer")
  55         # handle group stuff
  56         if self.currentgroup is not None:
  57             thepo.addnote(quote.stripcomment(self.currentgroup), origin="translator")
  58         if entity.endswith(".height") or entity.endswith(".width") or entity.endswith(".size"):
  59             thepo.addnote("Do not translate this.  Only change the numeric values if you need this dialogue box to appear bigger", origin="developer")
  60
  61     def convertstrings(self, thedtd, thepo):
  62         # extract the string, get rid of quoting
  63         unquoted = dtd.unquotefromdtd(thedtd.definition).replace("\r", "")
  64         # escape backslashes... but not if they're for a newline
  65         # unquoted = unquoted.replace("\\", "\\\\").replace("\\\\n", "\\n")
  66         # now split the string into lines and quote them
  67         lines = unquoted.split('\n')
  68         while lines and not lines[0].strip():
  69             del lines[0]
  70         while lines and not lines[-1].strip():
  71             del lines[-1]
  72         # quotes have been escaped already by escapeforpo, so just add the start and end quotes
  73         if len(lines) > 1:
  74             thepo.source = "\n".join([lines[0].rstrip() + ' '] + \
  75                     [line.strip() + ' ' for line in lines[1:-1]] + \
  76                     [lines[-1].lstrip()])
  77         elif lines:
  78             thepo.source = lines[0]
  79         else:
  80             thepo.source = ""
  81         thepo.target = ""
  82
  83     def convertunit(self, thedtd):
  84         """converts a dtd unit to a po unit, returns None if empty or not for translation"""
  85         if thedtd is None:
  86             return None
  87         if getattr(thedtd, "entityparameter", None) == "SYSTEM":
  88             return None
  89         thepo = po.pounit(encoding="UTF-8")
  90         # remove unwanted stuff
  91         for commentnum in range(len(thedtd.comments)):
  92             commenttype, locnote = thedtd.comments[commentnum]
  93             # if this is a localization note
  94             if commenttype == 'locnote':
  95                 # parse the locnote into the entity and the actual note
  96                 typeend = quote.findend(locnote,'LOCALIZATION NOTE')
  97                 # parse the id
  98                 idstart = locnote.find('(', typeend)
  99                 if idstart == -1: continue
 100                 idend = locnote.find(')', idstart+1)
 101                 entity = locnote[idstart+1:idend].strip()
 102                 # parse the actual note
 103                 actualnotestart = locnote.find(':', idend+1)
 104                 actualnoteend = locnote.find('-->', idend)
 105                 actualnote = locnote[actualnotestart+1:actualnoteend].strip()
 106                 # if it's for this entity, process it
 107                 if thedtd.entity == entity:
 108                     # if it says don't translate (and nothing more),
 109                     if actualnote.startswith("DONT_TRANSLATE"):
 110                         # take out the entity,definition and the DONT_TRANSLATE comment
 111                         thedtd.entity = ""
 112                         thedtd.definition = ""
 113                         del thedtd.comments[commentnum]
 114                         # finished this for loop
 115                         break
 116                     else:
 117                         # convert it into an automatic comment, to be processed by convertcomments
 118                         thedtd.comments[commentnum] = ("automaticcomment", actualnote)
 119         # do a standard translation
 120         self.convertcomments(thedtd, thepo)
 121         self.convertstrings(thedtd, thepo)
 122         if thepo.isblank() and not thepo.getlocations():
 123             return None
 124         else:
 125             return thepo
 126
 127     # labelsuffixes and accesskeysuffixes are combined to accelerator notation
 128     labelsuffixes = (".label", ".title")
 129     accesskeysuffixes = (".accesskey", ".accessKey", ".akey")
 130
 131     def convertmixedunit(self, labeldtd, accesskeydtd):
 132         labelpo = self.convertunit(labeldtd)
 133         accesskeypo = self.convertunit(accesskeydtd)
 134         if labelpo is None:
 135             return accesskeypo
 136         if accesskeypo is None:
 137             return labelpo
 138         thepo = po.pounit(encoding="UTF-8")
 139         thepo.addlocations(labelpo.getlocations())
 140         thepo.addlocations(accesskeypo.getlocations())
 141         thepo.msgidcomment = thepo._extract_msgidcomments() + labelpo._extract_msgidcomments()
 142         thepo.msgidcomment = thepo._extract_msgidcomments() + accesskeypo._extract_msgidcomments()
 143         thepo.addnote(labelpo.getnotes("developer"), "developer")
 144         thepo.addnote(accesskeypo.getnotes("developer"), "developer")
 145         thepo.addnote(labelpo.getnotes("translator"), "translator")
 146         thepo.addnote(accesskeypo.getnotes("translator"), "translator")
 147         # redo the strings from original dtd...
 148         label = dtd.unquotefromdtd(labeldtd.definition).decode('UTF-8')
 149         accesskey = dtd.unquotefromdtd(accesskeydtd.definition).decode('UTF-8')
 150         if len(accesskey) == 0:
 151             return None
 152         # try and put the & in front of the accesskey in the label...
 153         # make sure to avoid muddling up &amp;-type strings
 154         searchpos = 0
 155         accesskeypos = -1
 156         inentity = 0
 157         accesskeyaltcasepos = -1
 158         while (accesskeypos < 0) and searchpos < len(label):
 159             searchchar = label[searchpos]
 160             if searchchar == '&':
 161                 inentity = 1
 162             elif searchchar == ';':
 163                 inentity = 0
 164             else:
 165                 if not inentity:
 166                     if searchchar == accesskey.upper():
 167                         # always prefer uppercase
 168                         accesskeypos = searchpos
 169                     if searchchar == accesskey.lower():
 170                         # take lower case otherwise...
 171                         if accesskeyaltcasepos == -1:
 172                             # only want to remember first altcasepos
 173                             accesskeyaltcasepos = searchpos
 174                             # note: we keep on looping through in hope of exact match
 175             searchpos += 1
 176         # if we didn't find an exact case match, use an alternate one if available
 177         if accesskeypos == -1:
 178             accesskeypos = accesskeyaltcasepos
 179         # now we want to handle whatever we found...
 180         if accesskeypos >= 0:
 181             label = label[:accesskeypos] + '&' + label[accesskeypos:]
 182             label = label.encode("UTF-8", "replace")
 183         else:
 184             # can't currently mix accesskey if it's not in label
 185             return None
 186         thepo.source = label
 187         thepo.target = ""
 188         return thepo
 189
 190     def findmixedentities(self, thedtdfile):
 191         """creates self.mixedentities from the dtd file..."""
 192         self.mixedentities = {} # those entities which have a .label/.title and .accesskey combined
 193         for entity in thedtdfile.index.keys():
 194             for labelsuffix in self.labelsuffixes:
 195                 if entity.endswith(labelsuffix):
 196                     entitybase = entity[:entity.rfind(labelsuffix)]
 197                     # see if there is a matching accesskey in this line, making this a
 198                     # mixed entity
 199                     for akeytype in self.accesskeysuffixes:
 200                         if thedtdfile.index.has_key(entitybase + akeytype):
 201                             # add both versions to the list of mixed entities
 202                             self.mixedentities[entity] = {}
 203                             self.mixedentities[entitybase+akeytype] = {}
 204                     # check if this could be a mixed entity (labelsuffix and ".accesskey")
 205
 206     def convertdtdunit(self, thedtdfile, thedtd, mixbucket="dtd"):
 207         """converts a dtd unit from thedtdfile to a po unit, handling mixed entities along the way..."""
 208         # keep track of whether accesskey and label were combined
 209         if thedtd.entity in self.mixedentities:
 210             # use special convertmixed unit which produces one pounit with
 211             # both combined for the label and None for the accesskey
 212             alreadymixed = self.mixedentities[thedtd.entity].get(mixbucket, None)
 213             if alreadymixed:
 214                 # we are successfully throwing this away...
 215                 return None
 216             elif alreadymixed is None:
 217                 # depending on what we come across first, work out the label and the accesskey
 218                 labeldtd, accesskeydtd = None, None
 219                 labelentity, accesskeyentity = None, None
 220                 for labelsuffix in self.labelsuffixes:
 221                     if thedtd.entity.endswith(labelsuffix):
 222                         entitybase = thedtd.entity[:thedtd.entity.rfind(labelsuffix)]
 223                         for akeytype in self.accesskeysuffixes:
 224                             if thedtdfile.index.has_key(entitybase + akeytype):
 225                                 labelentity, labeldtd = thedtd.entity, thedtd
 226                                 accesskeyentity = labelentity[:labelentity.rfind(labelsuffix)]+akeytype
 227                                 accesskeydtd = thedtdfile.index[accesskeyentity]
 228                                 break
 229                 else:
 230                     for akeytype in self.accesskeysuffixes:
 231                         if thedtd.entity.endswith(akeytype):
 232                             accesskeyentity, accesskeydtd = thedtd.entity, thedtd
 233                             for labelsuffix in self.labelsuffixes:
 234                                 labelentity = accesskeyentity[:accesskeyentity.rfind(akeytype)]+labelsuffix
 235                                 if thedtdfile.index.has_key(labelentity):
 236                                     labeldtd = thedtdfile.index[labelentity]
 237                                     break
 238                             else:
 239                                 labelentity = None
 240                                 accesskeyentity = None
 241                 thepo = self.convertmixedunit(labeldtd, accesskeydtd)
 242                 if thepo is not None:
 243                     if accesskeyentity is not None:
 244                         self.mixedentities[accesskeyentity][mixbucket] = True
 245                     if labelentity is not None:
 246                         self.mixedentities[labelentity][mixbucket] = True
 247                     return thepo
 248                 else:
 249                     # otherwise the mix failed. add each one separately and remember they weren't mixed
 250                     if accesskeyentity is not None:
 251                         self.mixedentities[accesskeyentity][mixbucket] = False
 252                     if labelentity is not None:
 253                         self.mixedentities[labelentity][mixbucket] = False
 254         return self.convertunit(thedtd)
 255
 256     def convertstore(self, thedtdfile):
 257         thetargetfile = po.pofile()
 258         targetheader = thetargetfile.makeheader(charset="UTF-8", encoding="8bit", x_accelerator_marker="&")
 259         targetheader.addnote("extracted from %s" % thedtdfile.filename, "developer")
 260         thetargetfile.addunit(targetheader)
 261         thedtdfile.makeindex()
 262         self.findmixedentities(thedtdfile)
 263         # go through the dtd and convert each unit
 264         for thedtd in thedtdfile.units:
 265             if thedtd.isnull():
 266                 continue
 267             thepo = self.convertdtdunit(thedtdfile, thedtd)
 268             if thepo is not None:
 269                 thetargetfile.addunit(thepo)
 270         thetargetfile.removeduplicates(self.duplicatestyle)
 271         return thetargetfile
 272
 273     def mergestore(self, origdtdfile, translateddtdfile):
 274         thetargetfile = po.pofile()
 275         targetheader = thetargetfile.makeheader(charset="UTF-8", encoding="8bit")
 276         targetheader.addnote("extracted from %s, %s" % (origdtdfile.filename, translateddtdfile.filename), "developer")
 277         thetargetfile.addunit(targetheader)
 278         origdtdfile.makeindex()
 279         self.findmixedentities(origdtdfile)
 280         translateddtdfile.makeindex()
 281         self.findmixedentities(translateddtdfile)
 282         # go through the dtd files and convert each unit
 283         for origdtd in origdtdfile.units:
 284             if origdtd.isnull():
 285                 continue
 286             origpo = self.convertdtdunit(origdtdfile, origdtd, mixbucket="orig")
 287             if origdtd.entity in self.mixedentities:
 288                 mixedentitydict = self.mixedentities[origdtd.entity]
 289                 if "orig" not in mixedentitydict:
 290                     # this means that the entity is mixed in the translation, but not the original - treat as unmixed
 291                     mixbucket = "orig"
 292                     del self.mixedentities[origdtd.entity]
 293                 elif mixedentitydict["orig"]:
 294                     # the original entity is already mixed successfully
 295                     mixbucket = "translate"
 296                 else:
 297                     # ??
 298                     mixbucket = "orig"
 299             else:
 300                 mixbucket = "translate"
 301             if origpo is None:
 302                 # this means its a mixed entity (with accesskey) that's already been dealt with)
 303                 continue
 304             if origdtd.entity in translateddtdfile.index:
 305                 translateddtd = translateddtdfile.index[origdtd.entity]
 306                 translatedpo = self.convertdtdunit(translateddtdfile, translateddtd, mixbucket=mixbucket)
 307             else:
 308                 translatedpo = None
 309             if origpo is not None:
 310                 if translatedpo is not None and not self.blankmsgstr:
 311                     origpo.target = translatedpo.source
 312                 thetargetfile.addunit(origpo)
 313         thetargetfile.removeduplicates(self.duplicatestyle)
 314         return thetargetfile
 315
 316 def convertdtd(inputfile, outputfile, templatefile, pot=False, duplicatestyle="msgctxt"):
 317     """reads in inputfile and templatefile using dtd, converts using dtd2po, writes to outputfile"""
 318     inputstore = dtd.dtdfile(inputfile)
 319     convertor = dtd2po(blankmsgstr=pot, duplicatestyle=duplicatestyle)
 320     if templatefile is None:
 321         outputstore = convertor.convertstore(inputstore)
 322     else:
 323         templatestore = dtd.dtdfile(templatefile)
 324         outputstore = convertor.mergestore(templatestore, inputstore)
 325     if outputstore.isempty():
 326         return 0
 327     outputfile.write(str(outputstore))
 328     return 1
 329
 330 def main(argv=None):
 331     from translate.convert import convert
 332     formats = {"dtd": ("po", convertdtd), ("dtd", "dtd"): ("po", convertdtd)}
 333     parser = convert.ConvertOptionParser(formats, usetemplates=True, usepots=True, description=__doc__)
 334     parser.add_duplicates_option()
 335     parser.passthrough.append("pot")
 336     parser.run(argv)
 337
 338 if __name__ == '__main__':
 339     main()
 340