convert/html2po.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2004-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21 #
  22
  23 """convert HTML files to Gettext PO localization files
  24
  25 See: http://translate.sourceforge.net/wiki/toolkit/html2po for examples and
  26 usage instructions
  27 """
  28
  29 from translate.storage import po
  30 from translate.storage import html
  31
  32 class html2po:
  33     def convertfile(self, inputfile, filename, includeheader, includeuntagged=False, duplicatestyle="msgid_comment"):
  34         """converts a html file to .po format"""
  35         thetargetfile = po.pofile()
  36         htmlparser = html.htmlfile(includeuntaggeddata=includeuntagged, inputfile=inputfile)
  37         if includeheader:
  38             targetheader = thetargetfile.makeheader(charset="UTF-8", encoding="8bit")
  39             thetargetfile.addunit(targetheader)
  40         for htmlunit in htmlparser.units:
  41             thepo = thetargetfile.addsourceunit(htmlunit.source)
  42             thepo.addlocations(htmlunit.getlocations())
  43         thetargetfile.removeduplicates(duplicatestyle)
  44         return thetargetfile
  45
  46 def converthtml(inputfile, outputfile, templates, includeuntagged=False, pot=False, duplicatestyle="msgctxt"):
  47     """reads in stdin using fromfileclass, converts using convertorclass, writes to stdout"""
  48     convertor = html2po()
  49     outputfilepos = outputfile.tell()
  50     includeheader = outputfilepos == 0
  51     outputstore = convertor.convertfile(inputfile, getattr(inputfile, "name", "unknown"), includeheader, includeuntagged, duplicatestyle=duplicatestyle)
  52     outputfile.write(str(outputstore))
  53     return 1
  54
  55 def main(argv=None):
  56     from translate.convert import convert
  57     from translate.misc import stdiotell
  58     import sys
  59     sys.stdout = stdiotell.StdIOWrapper(sys.stdout)
  60     formats = {"html":("po", converthtml), "htm":("po", converthtml), "xhtml":("po", converthtml), None:("po", converthtml)}
  61     parser = convert.ConvertOptionParser(formats, usepots=True, description=__doc__)
  62     parser.add_option("-u", "--untagged", dest="includeuntagged", default=False, action="store_true",
  63             help="include untagged sections")
  64     parser.passthrough.append("includeuntagged")
  65     parser.add_duplicates_option()
  66     parser.passthrough.append("pot")
  67     parser.run(argv)
  68
  69
  70 if __name__ == '__main__':
  71     main()