convert/csv2po.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2003-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """convert Comma-Separated Value (.csv) files to Gettext PO localization files
  23
  24 See: http://translate.sourceforge.net/wiki/toolkit/csv2po for examples and
  25 usage instructions
  26 """
  27
  28 import sys
  29 from translate.misc import sparse
  30 from translate.storage import po
  31 from translate.storage import csvl10n
  32
  33 def replacestrings(source, *pairs):
  34     for orig, new in pairs:
  35         source = source.replace(orig, new)
  36     return source
  37
  38 def quotecsvstr(source):
  39     return '"' + replacestrings(source, ('\\"','"'), ('"','\\"'), ("\\\\'", "\\'"), ('\\\\n', '\\n')) + '"'
  40
  41 def simplify(string):
  42     return filter(type(string).isalnum, string)
  43     tokens = sparse.SimpleParser().tokenize(string)
  44     return " ".join(tokens)
  45
  46 class csv2po:
  47     """a class that takes translations from a .csv file and puts them in a .po file"""
  48     def __init__(self, templatepo=None, charset=None, duplicatestyle="keep"):
  49         """construct the converter..."""
  50         self.pofile = templatepo
  51         self.charset = charset
  52         self.duplicatestyle = duplicatestyle
  53         if self.pofile is not None:
  54             self.unmatched = 0
  55             self.makeindex()
  56
  57     def makeindex(self):
  58         """makes indexes required for searching..."""
  59         self.commentindex = {}
  60         self.sourceindex = {}
  61         self.simpleindex = {}
  62         self.duplicatecomments = []
  63         for pounit in self.pofile.units:
  64             joinedcomment = " ".join(pounit.getlocations())
  65             source = pounit.source
  66             # the definitive way to match is by source comment (joinedcomment)
  67             if joinedcomment in self.commentindex:
  68                 # unless more than one thing matches...
  69                 self.duplicatecomments.append(joinedcomment)
  70             else:
  71                 self.commentindex[joinedcomment] = pounit
  72             # do simpler matching in case things have been mangled...
  73             simpleid = simplify(source)
  74             # but check for duplicates
  75             if simpleid in self.simpleindex and not (source in self.sourceindex):
  76                 # keep a list of them...
  77                 self.simpleindex[simpleid].append(pounit)
  78             else:
  79                 self.simpleindex[simpleid] = [pounit]
  80             # also match by standard msgid
  81             self.sourceindex[source] = pounit
  82         for comment in self.duplicatecomments:
  83             if comment in self.commentindex:
  84                 del self.commentindex[comment]
  85
  86     def convertunit(self, csvunit):
  87         """converts csv unit to po unit"""
  88         pounit = po.pounit(encoding="UTF-8")
  89         if csvunit.comment:
  90             pounit.addlocation(csvunit.comment)
  91         pounit.source = csvunit.source
  92         pounit.target = csvunit.target
  93         return pounit
  94
  95     def handlecsvunit(self, csvunit):
  96         """handles reintegrating a csv unit into the .po file"""
  97         if len(csvunit.comment.strip()) > 0 and csvunit.comment in self.commentindex:
  98             pounit = self.commentindex[csvunit.comment]
  99         elif csvunit.source in self.sourceindex:
 100             pounit = self.sourceindex[csvunit.source]
 101         elif simplify(csvunit.source) in self.simpleindex:
 102             thepolist = self.simpleindex[simplify(csvunit.source)]
 103             if len(thepolist) > 1:
 104                 csvfilename = getattr(self.csvfile, "filename", "(unknown)")
 105                 matches = "\n  ".join(["possible match: " + pounit.source for pounit in thepolist])
 106                 print >> sys.stderr, "%s - csv entry not found in pofile, multiple matches found:\n  location\t%s\n  original\t%s\n  translation\t%s\n  %s" % (csvfilename, csvunit.comment, csvunit.source, csvunit.target, matches)
 107                 self.unmatched += 1
 108                 return
 109             pounit = thepolist[0]
 110         else:
 111             csvfilename = getattr(self.csvfile, "filename", "(unknown)")
 112             print >> sys.stderr, "%s - csv entry not found in pofile:\n  location\t%s\n  original\t%s\n  translation\t%s" % (csvfilename, csvunit.comment, csvunit.source, csvunit.target)
 113             self.unmatched += 1
 114             return
 115         if pounit.hasplural():
 116             # we need to work out whether we matched the singular or the plural
 117             singularid = pounit.source.strings[0]
 118             pluralid = pounit.source.strings[1]
 119             if csvunit.source == singularid:
 120                 pounit.msgstr[0] = csvunit.target
 121             elif csvunit.source == pluralid:
 122                 pounit.msgstr[1] = csvunit.target
 123             elif simplify(csvunit.source) == simplify(singularid):
 124                 pounit.msgstr[0] = csvunit.target
 125             elif simplify(csvunit.source) == simplify(pluralid):
 126                 pounit.msgstr[1] = csvunit.target
 127             else:
 128                 print >> sys.stderr, "couldn't work out singular or plural: %r, %r, %r" %  \
 129                     (csvunit.source, singularid, pluralid)
 130                 self.unmatched += 1
 131                 return
 132         else:
 133             pounit.target = csvunit.target
 134
 135     def convertstore(self, thecsvfile):
 136         """converts a csvfile to a pofile, and returns it. uses templatepo if given at construction"""
 137         self.csvfile = thecsvfile
 138         if self.pofile is None:
 139             self.pofile = po.pofile()
 140             mergemode = False
 141         else:
 142             mergemode = True
 143         if self.pofile.units and self.pofile.units[0].isheader():
 144             targetheader = self.pofile.units[0]
 145             targetheader.msgstr = [line.replace("CHARSET", "UTF-8").replace("ENCODING", "8bit") for line in targetheader.msgstr]
 146         else:
 147             targetheader = self.pofile.makeheader(charset="UTF-8", encoding="8bit")
 148         targetheader.addnote("extracted from %s" % self.csvfile.filename, "developer")
 149         mightbeheader = True
 150         for csvunit in self.csvfile.units:
 151             if self.charset is not None:
 152                 csvunit.source = csvunit.source.decode(self.charset)
 153                 csvunit.target = csvunit.target.decode(self.charset)
 154             if mightbeheader:
 155                 # ignore typical header strings...
 156                 mightbeheader = False
 157                 if [item.strip().lower() for item in csvunit.comment, csvunit.source, csvunit.target] == \
 158                         ["comment", "original", "translation"]:
 159                     continue
 160                 if len(csvunit.comment.strip()) == 0 and csvunit.source.find("Content-Type:") != -1:
 161                     continue
 162             if mergemode:
 163                 self.handlecsvunit(csvunit)
 164             else:
 165                 pounit = self.convertunit(csvunit)
 166                 self.pofile.addunit(pounit)
 167         self.pofile.removeduplicates(self.duplicatestyle)
 168         return self.pofile
 169
 170 def convertcsv(inputfile, outputfile, templatefile, charset=None, columnorder=None, duplicatestyle="msgctxt"):
 171     """reads in inputfile using csvl10n, converts using csv2po, writes to outputfile"""
 172     inputstore = csvl10n.csvfile(inputfile, fieldnames=columnorder)
 173     if templatefile is None:
 174         convertor = csv2po(charset=charset, duplicatestyle=duplicatestyle)
 175     else:
 176         templatestore = po.pofile(templatefile)
 177         convertor = csv2po(templatestore, charset=charset, duplicatestyle=duplicatestyle)
 178     outputstore = convertor.convertstore(inputstore)
 179     if outputstore.isempty():
 180         return 0
 181     outputfile.write(str(outputstore))
 182     return 1
 183
 184 def main(argv=None):
 185     from translate.convert import convert
 186     formats = {("csv", "po"): ("po", convertcsv), ("csv", "pot"): ("po", convertcsv),
 187             ("csv", None): ("po", convertcsv)}
 188     parser = convert.ConvertOptionParser(formats, usetemplates=True, description=__doc__)
 189     parser.add_option("", "--charset", dest="charset", default=None,
 190         help="set charset to decode from csv files", metavar="CHARSET")
 191     parser.add_option("", "--columnorder", dest="columnorder", default=None,
 192         help="specify the order and position of columns (source,source,target)")
 193     parser.add_duplicates_option()
 194     parser.passthrough.append("charset")
 195     parser.passthrough.append("columnorder")
 196     parser.run(argv)
 197
 198
 199 if __name__ == '__main__':
 200     main()