fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / convert / csv2po.py
blob09599f1e56f2e2d165790c87c86ea457da122291
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2003-2006 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """convert Comma-Separated Value (.csv) files to Gettext PO localization files
24 See: http://translate.sourceforge.net/wiki/toolkit/csv2po for examples and
25 usage instructions
26 """
28 import sys
29 from translate.misc import sparse
30 from translate.storage import po
31 from translate.storage import csvl10n
def replacestrings(source, *pairs):
    """apply a series of string replacements to source, in the order given

    Each element of pairs is an (orig, new) 2-tuple: every occurrence of orig
    is replaced by new. Later pairs operate on the output of earlier ones.
    Returns the resulting string.
    """
    result = source
    for pair in pairs:
        old, replacement = pair
        result = result.replace(old, replacement)
    return result
def quotecsvstr(source):
    """return source wrapped in double quotes, with its escapes normalised

    The replacements are applied strictly in this order:
    1. backslash + double quote becomes a bare double quote
    2. every double quote becomes backslash + double quote
    3. two backslashes + single quote becomes backslash + single quote
    4. two backslashes + n becomes backslash + n
    """
    escaped = source.replace('\\"', '"')
    escaped = escaped.replace('"', '\\"')
    escaped = escaped.replace("\\\\'", "\\'")
    escaped = escaped.replace('\\\\n', '\\n')
    return '"' + escaped + '"'
def simplify(string):
    """return string reduced to its alphanumeric characters only

    Every character for which isalnum() is false (whitespace, punctuation,
    ...) is dropped. The result is used as a loose lookup key, so that
    strings differing only in spacing or punctuation compare equal.
    """
    # Build the result with join so that it is always a real string (and
    # therefore usable as a dictionary key), instead of relying on filter()
    # returning a string. Slicing keeps the separator the same basic string
    # type as the input.
    return string[:0].join([char for char in string if char.isalnum()])
class csv2po:
    """a class that takes translations from a .csv file and puts them in a .po file

    When constructed with a template po store, the csv units are merged into
    the matching units of that store; without one, a new po store is built
    from the csv units. self.unmatched counts the csv units that could not be
    merged into the template.
    """

    def __init__(self, templatepo=None, charset=None, duplicatestyle="keep"):
        """construct the converter...

        templatepo -- po store to merge into, or None to build a new store
        charset -- if not None, used to decode the csv source and target text
        duplicatestyle -- passed to the po store's removeduplicates()
        """
        self.pofile = templatepo
        self.charset = charset
        self.duplicatestyle = duplicatestyle
        # always initialise the counter so it can be read even when no
        # template was supplied
        self.unmatched = 0
        if self.pofile is not None:
            self.makeindex()

    def makeindex(self):
        """makes indexes required for searching...

        commentindex maps joined location comments to a po unit (comments
        shared by more than one unit are removed again), sourceindex maps the
        exact source text to a po unit, and simpleindex maps the simplified
        source text to a list of po units.
        """
        self.commentindex = {}
        self.sourceindex = {}
        self.simpleindex = {}
        self.duplicatecomments = []
        for pounit in self.pofile.units:
            joinedcomment = " ".join(pounit.getlocations())
            source = pounit.source
            # the definitive way to match is by source comment (joinedcomment)
            if joinedcomment in self.commentindex:
                # unless more than one thing matches...
                self.duplicatecomments.append(joinedcomment)
            else:
                self.commentindex[joinedcomment] = pounit
            # do simpler matching in case things have been mangled...
            simpleid = simplify(source)
            # but check for duplicates
            if simpleid in self.simpleindex and not (source in self.sourceindex):
                # keep a list of them...
                self.simpleindex[simpleid].append(pounit)
            else:
                self.simpleindex[simpleid] = [pounit]
            # also match by standard msgid
            self.sourceindex[source] = pounit
        # an ambiguous comment cannot identify a unit, so drop it entirely
        for comment in self.duplicatecomments:
            if comment in self.commentindex:
                del self.commentindex[comment]

    def convertunit(self, csvunit):
        """converts csv unit to po unit"""
        pounit = po.pounit(encoding="UTF-8")
        if csvunit.comment:
            pounit.addlocation(csvunit.comment)
        pounit.source = csvunit.source
        pounit.target = csvunit.target
        return pounit

    def _csvfilename(self):
        """returns the filename of the csv store being converted, or "(unknown)" if unavailable"""
        return getattr(getattr(self, "csvfile", None), "filename", "(unknown)")

    def _findpounit(self, csvunit):
        """returns the po unit that csvunit belongs to, or None if there is no unique match

        Lookup order: location comment, exact source text, simplified source
        text. A failed lookup is reported on stderr and counted in
        self.unmatched.
        """
        if len(csvunit.comment.strip()) > 0 and csvunit.comment in self.commentindex:
            return self.commentindex[csvunit.comment]
        if csvunit.source in self.sourceindex:
            return self.sourceindex[csvunit.source]
        simpleid = simplify(csvunit.source)
        if simpleid not in self.simpleindex:
            message = "%s - csv entry not found in pofile:\n location\t%s\n original\t%s\n translation\t%s" % (self._csvfilename(), csvunit.comment, csvunit.source, csvunit.target)
            print >> sys.stderr, message
            self.unmatched += 1
            return None
        thepolist = self.simpleindex[simpleid]
        if len(thepolist) > 1:
            # several template units simplify to the same text: refuse to guess
            matches = "\n ".join(["possible match: " + candidate.source for candidate in thepolist])
            message = "%s - csv entry not found in pofile, multiple matches found:\n location\t%s\n original\t%s\n translation\t%s\n %s" % (self._csvfilename(), csvunit.comment, csvunit.source, csvunit.target, matches)
            print >> sys.stderr, message
            self.unmatched += 1
            return None
        return thepolist[0]

    def handlecsvunit(self, csvunit):
        """handles reintegrating a csv unit into the .po file"""
        pounit = self._findpounit(csvunit)
        if pounit is None:
            return
        if not pounit.hasplural():
            pounit.target = csvunit.target
            return
        # we need to work out whether we matched the singular or the plural
        singularid = pounit.source.strings[0]
        pluralid = pounit.source.strings[1]
        if csvunit.source == singularid:
            pounit.msgstr[0] = csvunit.target
        elif csvunit.source == pluralid:
            pounit.msgstr[1] = csvunit.target
        elif simplify(csvunit.source) == simplify(singularid):
            pounit.msgstr[0] = csvunit.target
        elif simplify(csvunit.source) == simplify(pluralid):
            pounit.msgstr[1] = csvunit.target
        else:
            print >> sys.stderr, "couldn't work out singular or plural: %r, %r, %r" % \
                (csvunit.source, singularid, pluralid)
            self.unmatched += 1
            return

    def convertstore(self, thecsvfile):
        """converts a csvfile to a pofile, and returns it. uses templatepo if given at construction"""
        self.csvfile = thecsvfile
        if self.pofile is None:
            self.pofile = po.pofile()
            mergemode = False
        else:
            mergemode = True
        if self.pofile.units and self.pofile.units[0].isheader():
            targetheader = self.pofile.units[0]
            # fill in the placeholders of a template header
            targetheader.msgstr = [line.replace("CHARSET", "UTF-8").replace("ENCODING", "8bit") for line in targetheader.msgstr]
        else:
            # NOTE(review): the unit returned by makeheader() is not explicitly
            # added to the store here - confirm that this is intended
            targetheader = self.pofile.makeheader(charset="UTF-8", encoding="8bit")
        # use the same fallback as the error reports when the csv store has
        # no filename, rather than failing with an AttributeError
        targetheader.addnote("extracted from %s" % self._csvfilename(), "developer")
        mightbeheader = True
        for csvunit in self.csvfile.units:
            if self.charset is not None:
                csvunit.source = csvunit.source.decode(self.charset)
                csvunit.target = csvunit.target.decode(self.charset)
            if mightbeheader:
                # ignore typical header strings... (only the first row is checked)
                mightbeheader = False
                columns = [item.strip().lower() for item in (csvunit.comment, csvunit.source, csvunit.target)]
                if columns == ["comment", "original", "translation"]:
                    continue
                if len(csvunit.comment.strip()) == 0 and csvunit.source.find("Content-Type:") != -1:
                    continue
            if mergemode:
                self.handlecsvunit(csvunit)
            else:
                pounit = self.convertunit(csvunit)
                self.pofile.addunit(pounit)
        self.pofile.removeduplicates(self.duplicatestyle)
        return self.pofile
def convertcsv(inputfile, outputfile, templatefile, charset=None, columnorder=None, duplicatestyle="msgctxt"):
    """reads in inputfile using csvl10n, converts using csv2po, writes to outputfile

    templatefile may be None, in which case a new po store is created rather
    than merging into a template. Returns 0 (writing nothing) when the
    converted store is empty, and 1 once the store has been written.
    """
    inputstore = csvl10n.csvfile(inputfile, fieldnames=columnorder)
    templatestore = None
    if templatefile is not None:
        templatestore = po.pofile(templatefile)
    convertor = csv2po(templatestore, charset=charset, duplicatestyle=duplicatestyle)
    outputstore = convertor.convertstore(inputstore)
    if not outputstore.isempty():
        outputfile.write(str(outputstore))
        return 1
    return 0
def main(argv=None):
    """command-line entry point: builds the option parser and runs the conversion

    argv is handed to the parser's run() unchanged. The charset and
    columnorder options are registered as passthrough options, so they reach
    convertcsv as keyword arguments of the same name.
    """
    from translate.convert import convert
    formats = {("csv", "po"): ("po", convertcsv), ("csv", "pot"): ("po", convertcsv),
               ("csv", None): ("po", convertcsv)}
    parser = convert.ConvertOptionParser(formats, usetemplates=True, description=__doc__)
    parser.add_option("", "--charset", dest="charset", default=None,
        help="set charset to decode from csv files", metavar="CHARSET")
    # the csv columns are comment, source and target; the help text used to
    # list "source" twice
    parser.add_option("", "--columnorder", dest="columnorder", default=None,
        help="specify the order and position of columns (comment,source,target)")
    parser.add_duplicates_option()
    parser.passthrough.append("charset")
    parser.passthrough.append("columnorder")
    parser.run(argv)
# run the converter only when executed as a script, not when imported
if __name__ == "__main__":
    main()