fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / tools / detectencoding
blobb8abf15ba0a8ea05017b4f2e70a1506b89544766
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2004 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """takes a .po translation file and produces statistics to help work out what encoding it is in"""
24 import sys
25 import os
26 from translate.storage import po
27 from translate import __version__
28 import encodings
29 import codecs
30 import string
class detector:
    """Gathers the strings of .po files and helps guess their encoding.

    The translations are kept undecoded (they only need a ``decode`` method)
    and are decoded with every candidate encoding in turn.  Three tests are
    offered: character statistics (testcharstats), exact dictionary word
    matches (testwordstats) and fuzzy dictionary matches for one given
    encoding (fuzzywordmatch).

    All settings are read from the ``options`` object handed to the
    constructor; the attributes used are dictfile, dictencoding, exclude,
    limitencodings, outputencoding, outputscript, ignorecase, unique and
    verbose.
    """

    # string.letters only exists on Python 2; fall back to the ASCII letters
    # elsewhere so that the class can still be defined
    _letters = getattr(string, "letters", string.ascii_letters)
    # the character categories that can be excluded, by name
    excludechoices = {
        'letters': _letters,
        'digits': string.digits,
        'whitespace': string.whitespace,
        'punctuation': string.punctuation,
        'std': _letters + string.digits + string.whitespace + string.punctuation,
    }

    def __init__(self, filenames, options):
        """constructs a detector with the given filenames and options

        All the files (and directories) in filenames are read immediately.
        The dictionary file, if one is given in options.dictfile, is decoded
        with options.dictencoding and kept in self.dictstring."""
        self.options = options
        self.alloriginals, self.alltranslations = [], []
        self.recursefiles(filenames)
        self.dictstring = None
        if options.dictfile:
            dictfile = codecs.open(options.dictfile, 'r', options.dictencoding)
            try:
                self.dictstring = dictfile.read()
            finally:
                # close the dictionary even if it could not be decoded
                dictfile.close()
        self.excludestring = ""
        if options.exclude:
            for excludecategory in options.exclude:
                if excludecategory in self.excludechoices:
                    self.excludestring += self.excludechoices[excludecategory]

    def listencodings(self):
        """lists the encodings we are using (all if none have been given on cmdline)

        A comma-separated options.limitencodings is split into names, with
        surrounding whitespace and empty entries dropped.  Otherwise every
        codec module known to encodings.aliases is returned once, sorted."""
        if self.options.limitencodings is not None:
            names = [name.strip() for name in self.options.limitencodings.split(",")]
            return [name for name in names if name]
        # many aliases point to the same module, so remove the duplicates
        return sorted(set(encodings.aliases.aliases.values()))

    def processoriginal(self, original):
        """adds the given original string to the list of originals"""
        self.alloriginals.append(original)

    def processtranslation(self, translation):
        """adds the given translation to the list of translations"""
        self.alltranslations.append(translation)

    def processfile(self, infile):
        """processes all the strings in the given po file

        infile is an open file object; header and blank elements are
        skipped."""
        pof = po.pofile()
        pof.fromlines(infile.readlines())
        for poe in pof.poelements:
            if poe.isheader() or poe.isblank():
                continue
            msgid = po.getunquotedstr(poe.msgid)
            if isinstance(poe.msgstr, dict):
                # msgstr is a dict here; only the entry with key 0 is used
                msgstr = po.getunquotedstr(poe.msgstr[0])
            else:
                msgstr = po.getunquotedstr(poe.msgstr)
            self.processoriginal(msgid)
            self.processtranslation(msgstr)

    def recursefiles(self, filenames):
        """reads in the filenames given and extracts their translations

        Directories are walked recursively; names that do not exist are
        reported on stderr and skipped."""
        for filename in filenames:
            if not os.path.exists(filename):
                sys.stderr.write("cannot process %s: does not exist\n" % filename)
            elif os.path.isdir(filename):
                self.handledir(filename)
            else:
                self.readfile(filename)

    def readfile(self, filename):
        """reads in the given file and processes it"""
        infile = open(filename)
        try:
            self.processfile(infile)
        finally:
            # do not leak the handle if the file cannot be parsed
            infile.close()
        if self.options.verbose:
            print("read %s" % filename)

    def readfiles(self, arg, dirname, filenames):
        """reads in the given files in the given directory and processes them

        arg is ignored (the signature is that of an os.path.walk callback);
        entries of filenames that are directories are skipped."""
        for filename in filenames:
            pathname = os.path.join(dirname, filename)
            if not os.path.isdir(pathname):
                self.readfile(pathname)

    def handledir(self, dirname):
        """walks through the directory structure and reads in all the files"""
        # os.walk replaces os.path.walk, which newer Pythons no longer have
        for dirpath, dirnames, filenames in os.walk(dirname):
            self.readfiles(None, dirpath, filenames)

    def updatecountmap(self, charstring, excludestring="", countmap=None):
        """makes a countmap of the characters in the string, excluding those in excludestring

        starts with countmap if one is given (it is updated in place) and
        returns the countmap, a dict of char -> number of occurrences"""
        if countmap is None:
            countmap = {}
        for char in charstring:
            if char in excludestring:
                continue
            countmap[char] = countmap.get(char, 0) + 1
        return countmap

    def getcountmapdelta(self, countmap1, countmap2):
        """returns a delta representing the difference between the two countmaps

        The counts of countmap1 are first scaled (with integer division) to
        the total of countmap2.  The delta is the sum of the absolute
        differences per character; a character missing from one map counts
        with its full (scaled) count.  Empty countmaps are allowed."""
        total1 = sum(countmap1.values())
        total2 = sum(countmap2.values())
        delta = 0
        for char in countmap1:
            # total1 cannot be zero here: the loop only runs if countmap1 has entries
            adjustedcount = (countmap1[char] * total2) // total1
            if char in countmap2:
                delta += abs(adjustedcount - countmap2[char])
            else:
                delta += adjustedcount
        for char in countmap2:
            if char not in countmap1:
                delta += countmap2[char]
        return delta

    def countmaptostring(self, countmap):
        """returns a string with the characters in countmap sorted by count

        The most frequent character comes first; characters with the same
        count are in descending character order."""
        countpairs = [(count, char) for char, count in countmap.items()]
        countpairs.sort(reverse=True)
        return "".join([char for count, char in countpairs])

    def encodeattempt(self, charstring):
        """encode what can be encoded in the output encoding, add the rest on at the end in a repr

        If the whole string encodes with options.outputencoding the result
        of that is returned.  Otherwise each character is tried on its own
        and the result is the encodable ones, a space, and the repr of the
        ones that failed."""
        outputencoding = self.options.outputencoding
        try:
            return charstring.encode(outputencoding)
        except Exception:
            # fall back to doing it char by char (this is best-effort output,
            # so any codec failure is handled, not only UnicodeError)
            encoded = ""
            failed = ""
            for char in charstring:
                try:
                    encoded += char.encode(outputencoding)
                except Exception:
                    failed += char
            return encoded + " " + repr(failed)

    def makecountmap(self, encoding):
        """makes a countmap for all the translations using the encoding

        Translations that cannot be decoded with the encoding (for whatever
        reason, including the encoding being unknown) are left out, so the
        result may be empty."""
        countmap = {}
        for translation in self.alltranslations:
            try:
                decoded = translation.decode(encoding)
            except Exception:
                continue
            self.updatecountmap(decoded, self.excludestring, countmap)
        return countmap

    def testcharstats(self):
        """produces char distribution for each encoding, and dict, and shows statistical match

        For each encoding a line "name (delta): chars" is printed, with the
        characters sorted by frequency.  If a dictionary is available the
        lines are sorted by the delta to the dictionary's distribution,
        otherwise alphabetically.  Encodings that could not be compared get
        a delta of 99999999."""
        dictcountmap = {}
        if self.dictstring:
            self.updatecountmap(self.dictstring, self.excludestring, dictcountmap)
        validencodings = {}
        encodingdeltas = {}
        for encoding in self.listencodings():
            countmap = self.makecountmap(encoding)
            validencodings[encoding] = countmap
            if dictcountmap and countmap:
                encodingdeltas[encoding] = self.getcountmapdelta(dictcountmap, countmap)
            else:
                # nothing to compare (no dictionary, or nothing could be
                # decoded): keep it at the bottom of the ranking
                encodingdeltas[encoding] = 99999999
        if self.dictstring:
            sortedstring = self.countmaptostring(dictcountmap)
            print("dict: %s" % self.encodeattempt(sortedstring))
            deltas = sorted([(delta, encoding) for encoding, delta in encodingdeltas.items()])
            validkeys = [encoding for delta, encoding in deltas]
        else:
            # sort alphabetically
            validkeys = sorted(validencodings)
        if encodingdeltas:
            keylen = max([len("%s (%d):" % (key, encodingdeltas[key])) for key in validencodings])
        else:
            keylen = 0
        for validencoding in validkeys:
            sortedstring = self.countmaptostring(validencodings[validencoding])
            label = "%s (%d):" % (validencoding, encodingdeltas[validencoding])
            # pad the labels so that the character strings line up
            print("%s %s" % (label.ljust(keylen), self.encodeattempt(sortedstring)))

    def findwords(self, dictmap, encoding):
        """finds all words in the translations that when decoded with encoding match in the dictmap

        Returns the number of matching words, or the number of different
        matching words if options.unique is set.  Raises UnicodeDecodeError
        if any translation cannot be decoded with the encoding (other
        decoding failures are converted to UnicodeDecodeError too)."""
        uniquewordsfound = {}
        wordsfound = 0
        for translation in self.alltranslations:
            try:
                decoded = translation.decode(encoding)
            except UnicodeDecodeError:
                raise
            except Exception:
                # sys.exc_info avoids the version-specific "except X, e" syntax
                error = sys.exc_info()[1]
                raise UnicodeDecodeError(encoding, translation, 0, 0, str(error))
            if self.options.ignorecase:
                decoded = decoded.lower()
            for word in decoded.split():
                if word not in dictmap:
                    continue
                if self.options.verbose:
                    if (not self.options.unique) or (word not in uniquewordsfound):
                        print(self.encodeattempt(word))
                uniquewordsfound[word] = 1
                wordsfound += 1
        # use the detector's own options rather than a module-level global
        if self.options.unique:
            return len(uniquewordsfound)
        else:
            return wordsfound

    def _makeignoremap(self):
        """returns a translate() table that maps every excluded character to a space"""
        ignoremap = {}
        for excludedchar in self.excludestring:
            ignoremap[ord(excludedchar)] = u' '
        return ignoremap

    def testwordstats(self):
        """produces word count for each encoding, shows matches to dict

        Dictionary words that consist only of excluded characters are not
        used.  Encodings that cannot decode all the translations, and those
        without any matches, are not listed; the others are printed in
        ascending order of their count."""
        ignoremap = self._makeignoremap()
        dictmap = {}
        if self.dictstring:
            for dictword in self.dictstring.split():
                if dictword.translate(ignoremap).isspace():
                    continue
                if self.options.ignorecase:
                    dictword = dictword.lower()
                dictmap[dictword] = 0
        print("%d words in dictionary" % len(dictmap))
        encodingcounts = {}
        for encoding in self.listencodings():
            try:
                encodingcounts[encoding] = self.findwords(dictmap, encoding)
            except UnicodeDecodeError:
                continue
        counts = sorted([(count, encoding) for encoding, count in encodingcounts.items()])
        for count, encoding in counts:
            if count:
                print("%s: %d" % (encoding, count))

    def fuzzy(self, word, includestring):
        """return a version of word including all the characters in includestring, with sequences of other characters replaced by a space"""
        chars = []
        for char in word:
            if char in includestring:
                chars.append(char)
            else:
                chars.append(' ')
        # the dots cleverly help us catch start and end spaces
        fuzzyword = '.' + ''.join(chars) + '.'
        return ' '.join(fuzzyword.split())[1:-1]

    def updatecharmap(self, charmap, word, dictword, ignoremap):
        """updates the given charmap with the changes from word to dictword (using ignoremap)

        charmap maps each differing part of a translated word to a dict of
        dictionary part -> number of times seen.  dictword is either the one
        dictionary word that matched, or a dict of candidate words; in the
        latter case nothing is recorded."""
        # check if the word actually matches
        if dictword == word:
            return
        if isinstance(dictword, dict):
            if word not in dictword:
                # there is ambiguity, so we can't tell which one was meant
                print("multiple matches: not drawing conclusions. %r, %r" % (word, dictword))
            return
        # blank out the excluded characters: what remains are the parts to compare
        wordparts = word.translate(ignoremap).split()
        dictparts = dictword.translate(ignoremap).split()
        for wordpart, dictpart in zip(wordparts, dictparts):
            if wordpart == dictpart:
                continue
            transmap = charmap.setdefault(wordpart, {})
            transmap[dictpart] = transmap.get(dictpart, 0) + 1

    def writescript(self, charmap, encoding):
        """writes a script to a file that replaces chars in a po file as defined by charmap

        The script goes to options.outputscript.  Only parts that always
        matched the same dictionary part are replaced; the replacements are
        written in descending order of their number of matches."""
        replacements = []
        for wordpart, transmap in charmap.items():
            # only handle exact matches...
            if len(transmap) == 1:
                dictpart, count = list(transmap.items())[0]
                replacements.append((count, dictpart, wordpart))
        replacements.sort(reverse=True)
        scriptfile = open(self.options.outputscript, 'w')
        try:
            # the indentation in these strings is part of the generated code
            scriptfile.write("# created by translate.tools.detectencoding\n")
            scriptfile.write("from translate.convert import poreplace\n")
            scriptfile.write("class pocharmap(poreplace.poreplace):\n")
            scriptfile.write("    def convertstring(self, postr):\n")
            scriptfile.write("        postr = postr.decode(%r)\n" % encoding)
            for count, dictpart, wordpart in replacements:
                scriptfile.write("        postr = postr.replace(%r, %r) # %d matches\n" % (wordpart, dictpart, count))
            scriptfile.write("        postr = postr.encode(%r)\n" % encoding)
            scriptfile.write("        return postr\n")
            scriptfile.write("if __name__ == '__main__':\n")
            scriptfile.write("    poreplace.main(pocharmap)\n")
        finally:
            scriptfile.close()

    def fuzzywordmatch(self, encoding):
        """does fuzzy word match for given encoding, and shows correspondence to dict

        Words are compared by their fuzzy() form, which keeps the characters
        in the exclude string and blanks out the others.  For every
        translated word that matches a dictionary word this way, the
        differing parts are collected and reported; if options.outputscript
        is set a conversion script is written as well.  A translation that
        cannot be decoded with the encoding raises the decoding error."""
        ignoremap = self._makeignoremap()
        dictmap = {}
        if self.dictstring:
            for dictword in self.dictstring.split():
                if dictword.translate(ignoremap).isspace():
                    continue
                if self.options.ignorecase:
                    dictword = dictword.lower()
                fuzzyword = self.fuzzy(dictword, self.excludestring)
                # dictmap will contain a string if there is only one fuzzymatch
                # otherwise it will contain a dict
                if fuzzyword not in dictmap:
                    dictmap[fuzzyword] = dictword
                elif isinstance(dictmap[fuzzyword], dict):
                    dictmap[fuzzyword][dictword] = True
                elif dictmap[fuzzyword] != dictword:
                    # a repeated dictionary word is not a second candidate
                    dictmap[fuzzyword] = {dictmap[fuzzyword]: True, dictword: True}
        print("%d words in dictionary" % len(dictmap))
        uniquewordsfound = {}
        charmap = {}
        wordsfound = 0
        for translation in self.alltranslations:
            decoded = translation.decode(encoding)
            if self.options.ignorecase:
                decoded = decoded.lower()
            for word in decoded.split():
                fuzzyword = self.fuzzy(word, self.excludestring)
                if fuzzyword not in dictmap:
                    continue
                wordsfound += 1
                dictword = dictmap[fuzzyword]
                if self.options.verbose:
                    if (not self.options.unique) or (word not in uniquewordsfound):
                        print("%r %r %r" % (word, fuzzyword, dictword))
                self.updatecharmap(charmap, word, dictword, ignoremap)
                uniquewordsfound[word] = fuzzyword
        # use the detector's own options rather than a module-level global
        if self.options.unique:
            print("fuzzy match on encoding %s produced %d unique words" % (encoding, len(uniquewordsfound)))
        else:
            print("fuzzy match on encoding %s produced %d words" % (encoding, wordsfound))
        if self.options.outputscript:
            self.writescript(charmap, encoding)
        for wordpart, transmap in charmap.items():
            if len(transmap) == 1:
                dictpart, count = list(transmap.items())[0]
                print("char %r in translations always found to match char %r in dict (%d times)" %
                      (wordpart, dictpart, count))
            else:
                counts = sorted([(count, dictpart) for dictpart, count in transmap.items()])
                print("char %r in translations matches to the following characters in dict:" % wordpart)
                for count, dictpart in counts:
                    print(" %r: %d" % (dictpart, count))
if __name__ == '__main__':
    # command line entry point: parse the options, read the files given as
    # arguments and run each of the tests that was asked for.
    # NOTE: options stays a module-level name on purpose, as code elsewhere in
    # this file may refer to it as a global.
    try:
        import optparse
    except ImportError:
        # use the copy shipped with translate when the interpreter has none
        from translate.misc import optparse
    optparser = optparse.OptionParser(version="%prog "+__version__.ver)
    optparser.add_option("", "--exclude", dest="exclude", type="choice",
        action="append", choices=list(detector.excludechoices),
        help="exclude certain common characters (%s)" % ", ".join(detector.excludechoices))
    optparser.add_option("", "--dict", dest="dictfile",
        action="store", default=None,
        help="use a dictionary/wordlist to choose the best encoding(s)")
    optparser.add_option("", "--dictencoding", dest="dictencoding",
        action="store", default="utf8",
        help="the encoding of the dictionary/wordlist")
    optparser.add_option("", "--outputencoding", dest="outputencoding",
        action="store", default="utf8",
        help="the encoding of the output")
    optparser.add_option("", "--matchwords", dest="matchwords",
        action="store_true", default=False,
        help="match words to the dictionary")
    optparser.add_option("", "--matchchars", dest="matchchars",
        action="store_true", default=False,
        help="match chars to the dictionary")
    optparser.add_option("", "--fuzzymatch", dest="fuzzymatch",
        action="store", default=None,
        help="match words to the dictionary using a fuzzy algorithm and the given encoding...")
    optparser.add_option("", "--outputscript", dest="outputscript",
        action="store", default=None,
        help="produce a script based on the fuzzy match, to convert files with")
    optparser.add_option("", "--limitencodings", dest="limitencodings",
        action="store", default=None,
        help="only use the encodings specified")
    # the help text used to be a garbled copy of the one for --unique
    optparser.add_option("-i", "--ignorecase", dest="ignorecase",
        action="store_true", default=False,
        help="ignore case when matching words to the dictionary")
    optparser.add_option("-u", "--unique", dest="unique",
        action="store_true", default=False,
        help="only use unique words found, not total")
    optparser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", default=False,
        help="verbose (print out lots of strings)")
    (options, args) = optparser.parse_args()
    if not (options.matchchars or options.matchwords or options.fuzzymatch):
        optparser.error("you should specify at least one of matchchars, matchwords or fuzzymatch")
    d = detector(args, options)
    if options.matchchars:
        d.testcharstats()
    if options.matchwords:
        d.testwordstats()
    if options.fuzzymatch:
        # the value of --fuzzymatch is the encoding to decode the files with
        d.fuzzywordmatch(options.fuzzymatch)