tools/detectencoding

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2004 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """takes a .po translation file and produces statistics to help work out what encoding it is in"""
  23
  24 import sys
  25 import os
  26 from translate.storage import po
  27 from translate import __version__
  28 import encodings
  29 import codecs
  30 import string
  31
class detector:
  """the detector class encompasses all the functionality required to detect encodings"""
  # maps each exclusion category name to the string of characters it covers;
  # 'std' is the concatenation of the other four categories
  excludechoices={'letters':string.letters, 'digits':string.digits,
                  'whitespace':string.whitespace, 'punctuation':string.punctuation,
                  'std':string.letters+string.digits+string.whitespace+string.punctuation}
  37
  38   def __init__(self, filenames, options):
  39     """constructs a detector with the given filenames and options"""
  40     self.options = options
  41     self.alloriginals, self.alltranslations = [], []
  42     self.recursefiles(filenames)
  43     if options.dictfile:
  44       self.dictstring = codecs.open(options.dictfile, 'r', options.dictencoding).read()
  45     else:
  46       self.dictstring = None
  47     self.excludestring = ""
  48     if options.exclude:
  49       for excludecategory in options.exclude:
  50         if excludecategory in self.excludechoices:
  51           self.excludestring += self.excludechoices[excludecategory]
  52
  def listencodings(self):
    """lists the encodings we are using (all if none have been given on cmdline)

    When the limitencodings option is set, returns its comma-separated names
    in the order given, with surrounding whitespace stripped and empty
    entries dropped. Otherwise returns the sorted, de-duplicated codec module
    names found in encodings.aliases.aliases.
    """
    limitencodings = self.options.limitencodings
    if limitencodings is not None:
      # "utf8, latin1" must mean the same as "utf8,latin1": a name with a
      # leading space is not a valid codec name
      names = [name.strip() for name in limitencodings.split(",")]
      return [name for name in names if name]
    # several aliases point at the same codec module, so collapse duplicates
    uniquemodules = {}
    for module in encodings.aliases.aliases.values():
      uniquemodules[module] = True
    modules = list(uniquemodules.keys())
    modules.sort()
    return modules
  62
  def processoriginal(self, original):
    """records one original (source) string in self.alloriginals"""
    collected = self.alloriginals
    collected.append(original)
  66
  def processtranslation(self, translation):
    """records one translated string in self.alltranslations"""
    collected = self.alltranslations
    collected.append(translation)
  70
  def processfile(self, infile):
    """processes all the strings in the given po file

    infile is an open file-like object whose lines are parsed as a po file.
    For every element that is neither a header nor blank, the unquoted msgid
    is passed to processoriginal and the unquoted msgstr to
    processtranslation.
    """
    pof = po.pofile()
    pof.fromlines(infile.readlines())
    for poe in pof.poelements:
      if poe.isheader() or poe.isblank():
        continue
      rawmsgstr = poe.msgstr
      if isinstance(rawmsgstr, dict):
        # dict-valued msgstr (presumably plural forms - confirm against the
        # po module): only the entry under key 0 is used
        rawmsgstr = rawmsgstr[0]
      # unquote both before recording either, so the two lists stay in step
      # if unquoting fails
      msgid = po.getunquotedstr(poe.msgid)
      msgstr = po.getunquotedstr(rawmsgstr)
      self.processoriginal(msgid)
      self.processtranslation(msgstr)
  86
  def recursefiles(self, filenames):
    """reads every given path: directories are walked, files are read

    Paths that do not exist are reported on stderr and skipped.
    """
    for filename in filenames:
      if os.path.isdir(filename):
        self.handledir(filename)
      elif os.path.exists(filename):
        self.readfile(filename)
      else:
        sys.stderr.write("cannot process %s: does not exist\n" % filename)
  97
  def readfile(self, filename):
    """reads in the given file and processes it

    Opens filename, hands the open file to processfile and closes it again,
    also when processfile raises. With the verbose option set, prints a
    "read <filename>" line after a successful read.
    """
    infile = open(filename)
    try:
      self.processfile(infile)
    finally:
      # do not leak the handle when parsing fails
      infile.close()
    if self.options.verbose:
      print("read %s" % filename)
 105
 106   def readfiles(self, arg, dirname, filenames):
 107     """reads in the given files in the given directory and processes them"""
 108     for filename in filenames:
 109       pathname = os.path.join(dirname, filename)
 110       if not os.path.isdir(pathname):
 111         self.readfile(pathname)
 112
 113   def handledir(self, dirname):
 114     """walks through the directory structure and reads in all the files"""
 115     os.path.walk(dirname, self.readfiles, None)
 116
 117   def updatecountmap(self, charstring, excludestring="", countmap=None):
 118     """makes a countmap of the characters in the string, excluding those in excludestring
 119     starts with countmap if one is given"""
 120     if countmap is None: countmap = {}
 121     for char in charstring:
 122       if char in excludestring: continue
 123       if char in countmap: countmap[char] += 1
 124       else: countmap[char] = 1
 125     return countmap
 126
 127   def getcountmapdelta(self, countmap1, countmap2):
 128     """returns a delta representing the difference between the two countmaps"""
 129     total1 = reduce(int.__add__, countmap1.values())
 130     total2 = reduce(int.__add__, countmap2.values())
 131     delta = 0
 132     for char in countmap1:
 133       adjustedcount = (countmap1[char]*total2)/total1
 134       if char in countmap2:
 135         delta += abs(adjustedcount - countmap2[char])
 136       else:
 137         delta += adjustedcount
 138     for char in countmap2:
 139       if char not in countmap1:
 140         delta += countmap2[char]
 141     return delta
 142
 143   def countmaptostring(self, countmap):
 144     """returns a string with the characters in countmap sorted by count"""
 145     countpairs = [(count, char) for char,count in countmap.iteritems()]
 146     countpairs.sort()
 147     countpairs.reverse()
 148     return "".join([char for count,char in countpairs])
 149
 150   def encodeattempt(self, charstring):
 151     """encode what can be encoding in encoding, add the rest on at the end in a repr"""
 152     try:
 153       return charstring.encode(self.options.outputencoding)
 154     except:
 155       encoded = ""
 156       failed = ""
 157       for char in charstring:
 158         try:
 159           encoded += char.encode(self.options.outputencoding)
 160         except:
 161           failed += char
 162       return encoded + " " + repr(failed)
 163
 164   def makecountmap(self, encoding):
 165     """makes a countmap for all the translations using the encoding"""
 166     countmap = {}
 167     for translation in self.alltranslations:
 168       try:
 169         decoded = translation.decode(encoding)
 170       except UnicodeDecodeError:
 171         continue
 172       except:
 173         continue
 174       self.updatecountmap(decoded, self.excludestring, countmap)
 175     return countmap
 176
 177   def testcharstats(self):
 178     """produces char distribution for each encoding, and dict, and shows statistical match"""
 179     dictcountmap = {}
 180     if self.dictstring:
 181       self.updatecountmap(self.dictstring, self.excludestring, dictcountmap)
 182     validencodings = {}
 183     encodingdeltas = {}
 184     for encoding in self.listencodings():
 185       encodingdeltas[encoding] = 99999999
 186       countmap = self.makecountmap(encoding)
 187       validencodings[encoding] = countmap
 188       if dictcountmap:
 189         encodingdeltas[encoding] = self.getcountmapdelta(dictcountmap, countmap)
 190     if self.dictstring:
 191       sortedstring = self.countmaptostring(dictcountmap)
 192       print "dict:", self.encodeattempt(sortedstring)
 193       deltas = [(delta, encoding) for encoding, delta in encodingdeltas.iteritems()]
 194       deltas.sort()
 195       validkeys = [encoding for delta, encoding in deltas]
 196     else:
 197       # sort alphabetically
 198       validkeys = validencodings.keys()
 199       validkeys.sort()
 200     if encodingdeltas:
 201       keylen = max([len("%s (%d):" % (key, encodingdeltas[key])) for key in validencodings if key in encodingdeltas])
 202     else:
 203       keylen = 0
 204     for validencoding in validkeys:
 205       sortedstring = self.countmaptostring(validencodings[validencoding])
 206       validencoding = "%s (%d):" % (validencoding, encodingdeltas[validencoding])
 207       validencoding += " "*(keylen-len(validencoding))
 208       print validencoding, self.encodeattempt(sortedstring)
 209
 210   def findwords(self, dictmap, encoding):
 211     """finds all words in the translations that when decoded with encoding match in the dictmap"""
 212     uniquewordsfound = {}
 213     wordsfound = 0
 214     for translation in self.alltranslations:
 215       try:
 216         decoded = translation.decode(encoding)
 217       except UnicodeDecodeError:
 218         raise
 219       except Exception, e:
 220         raise UnicodeDecodeError(encoding, translation, 0, 0, str(e))
 221       if self.options.ignorecase:
 222         decoded = decoded.lower()
 223       decodedwords = decoded.split()
 224       for word in decodedwords:
 225         if word in dictmap:
 226           if self.options.verbose:
 227             if (not self.options.unique) or (word not in uniquewordsfound):
 228               print self.encodeattempt(word)
 229           uniquewordsfound[word] = 1
 230           wordsfound += 1
 231     if options.unique:
 232       return len(uniquewordsfound)
 233     else:
 234       return wordsfound
 235
 236   def testwordstats(self):
 237     """produces word count for each encoding, shows matches to dict"""
 238     ignoremap = {}
 239     for excludedchar in self.excludestring:
 240       ignoremap[ord(excludedchar)] = u' '
 241     dictmap = {}
 242     if self.dictstring:
 243       for dictword in self.dictstring.split():
 244         if not dictword.translate(ignoremap).isspace():
 245           if self.options.ignorecase:
 246             dictword = dictword.lower()
 247           dictmap[dictword] = 0
 248     print "%d words in dictionary" % len(dictmap)
 249     encodingcounts = {}
 250     for encoding in self.listencodings():
 251       try:
 252         encodingcounts[encoding] = self.findwords(dictmap, encoding)
 253       except UnicodeDecodeError:
 254         continue
 255     counts = [(count, encoding) for encoding, count in encodingcounts.iteritems()]
 256     counts.sort()
 257     validkeys = [encoding for count, encoding in counts]
 258     for validencoding in validkeys:
 259       count = encodingcounts[validencoding]
 260       if count:
 261         print "%s: %d" % (validencoding, count)
 262
 263   def fuzzy(self, word, includestring):
 264     """return a version of word including all the characters in includestring, with sequences of other characters replaced by a space"""
 265     # the dots cleverly help us catch start and end spaces
 266     fuzzyword = '.'
 267     for char in word:
 268       if char in includestring:
 269         fuzzyword += char
 270       else:
 271         fuzzyword += ' '
 272     fuzzyword += '.'
 273     return ' '.join(fuzzyword.split())[1:-1]
 274
 275   def updatecharmap(self, charmap, word, dictword, ignoremap):
 276     """updates the given charmap with the changes from word to dictword (using ignoremap)"""
 277     # check if the word actually matches
 278     if dictword == word: return
 279     elif isinstance(dictword, dict):
 280       if word in dictword: return
 281       # otherwise check that there is no ambiguity
 282       print "multiple matches: not drawing conclusions. %r, %r" % (word, dictword)
 283       return
 284     wordparts = word.translate(ignoremap).split()
 285     dictparts = dictword.translate(ignoremap).split()
 286     for wordpart, dictpart in zip(wordparts, dictparts):
 287       if wordpart != dictpart:
 288         if wordpart not in charmap:
 289           charmap[wordpart] = {}
 290         transmap = charmap[wordpart]
 291         if dictpart in transmap:
 292           transmap[dictpart] += 1
 293         else:
 294           transmap[dictpart] = 1
 295
 296   def writescript(self, charmap, encoding):
 297     """writes a script to a file that replaces chars in a po file as defined by charmap"""
 298     scriptfile = open(self.options.outputscript, 'w')
 299     scriptfile.write("# created by translate.tools.detectencoding\n")
 300     scriptfile.write("from translate.convert import poreplace\n")
 301     scriptfile.write("class pocharmap(poreplace.poreplace):\n")
 302     scriptfile.write("  def convertstring(self, postr):\n")
 303     scriptfile.write("    postr = postr.decode(%r)\n" % encoding)
 304     replacements = []
 305     for wordpart, transmap in charmap.iteritems():
 306       # only handle exact matches...
 307       if len(transmap) == 1:
 308         dictpart, count = transmap.items()[0]
 309         replacements.append((count, dictpart, wordpart))
 310     replacements.sort()
 311     replacements.reverse()
 312     for count, dictpart, wordpart in replacements:
 313       scriptfile.write("    postr = postr.replace(%r, %r)  # %d matches\n" % (wordpart, dictpart, count))
 314     scriptfile.write("    postr = postr.encode(%r)\n" % encoding)
 315     scriptfile.write("    return postr\n")
 316     scriptfile.write("if __name__ == '__main__':\n")
 317     scriptfile.write("  poreplace.main(pocharmap)\n")
 318     scriptfile.close()
 319
 320   def fuzzywordmatch(self, encoding):
 321     """does fuzzy word match for given encoding, and shows correspondence to dict"""
 322     ignoremap = {}
 323     for excludedchar in self.excludestring:
 324       ignoremap[ord(excludedchar)] = u' '
 325     dictmap = {}
 326     if self.dictstring:
 327       for dictword in self.dictstring.split():
 328         if not dictword.translate(ignoremap).isspace():
 329           if self.options.ignorecase:
 330             dictword = dictword.lower()
 331           fuzzyword = self.fuzzy(dictword, self.excludestring)
 332           # dictmap will contain a string if there is only one fuzzymatch
 333           # otherwise it will contain a dict
 334           if fuzzyword in dictmap:
 335             if isinstance(dictmap[fuzzyword], dict):
 336               dictmap[fuzzyword][dictword] = True
 337             else:
 338               dictmap[fuzzyword] = {dictmap[fuzzyword]:True, dictword:True}
 339           else:
 340             dictmap[self.fuzzy(dictword, self.excludestring)] = dictword
 341     print "%d words in dictionary" % len(dictmap)
 342     uniquewordsfound = {}
 343     charmap = {}
 344     wordsfound = 0
 345     for translation in self.alltranslations:
 346       decoded = translation.decode(encoding)
 347       if self.options.ignorecase:
 348         decoded = decoded.lower()
 349       decodedwords = decoded.split()
 350       for word in decodedwords:
 351         fuzzyword = self.fuzzy(word, self.excludestring)
 352         if fuzzyword in dictmap:
 353           wordsfound += 1
 354           dictword = dictmap[fuzzyword]
 355           if self.options.verbose:
 356             if (not self.options.unique) or (word not in uniquewordsfound):
 357               print repr(word), repr(fuzzyword), repr(dictword)
 358           self.updatecharmap(charmap, word, dictword, ignoremap)
 359           uniquewordsfound[word] = fuzzyword
 360     if options.unique:
 361       print "fuzzy match on encoding %s produced %d unique words" % (encoding, len(uniquewordsfound))
 362     else:
 363       print "fuzzy match on encoding %s produced %d words" % (encoding, wordsfound)
 364     if options.outputscript:
 365       self.writescript(charmap, encoding)
 366     for wordpart, transmap in charmap.iteritems():
 367       if len(transmap) == 1:
 368         dictpart, count = transmap.items()[0]
 369         print "char %r in translations always found to match char %r in dict (%d times)" % \
 370               (wordpart, dictpart, count)
 371       else:
 372         counts = [(count, dictpart) for dictpart, count in transmap.iteritems()]
 373         counts.sort()
 374         dictparts = [dictpart for count, dictpart in counts]
 375         print "char %r in translations matches to the following characters in dict:" % wordpart
 376         for dictpart in dictparts:
 377           count = transmap[dictpart]
 378           print "  %r: %d" % (dictpart, count)
 379
 380 if __name__ == '__main__':
 381   try:
 382     import optparse
 383   except ImportError:
 384     from translate.misc import optparse
 385   optparser = optparse.OptionParser(version="%prog "+__version__.ver)
 386   optparser.add_option("", "--exclude", dest="exclude", type="choice",
 387     action="append", choices=detector.excludechoices.keys(),
 388     help="exclude certain common characters (%s)" % ", ".join(detector.excludechoices))
 389   optparser.add_option("", "--dict", dest="dictfile",
 390     action="store", default=None,
 391     help="use a dictionary/wordlist to choose the best encoding(s)")
 392   optparser.add_option("", "--dictencoding", dest="dictencoding",
 393     action="store", default="utf8",
 394     help="the encoding of the dictionary/wordlist")
 395   optparser.add_option("", "--outputencoding", dest="outputencoding",
 396     action="store", default="utf8",
 397     help="the encoding of the output")
 398   optparser.add_option("", "--matchwords", dest="matchwords",
 399     action="store_true", default=False,
 400     help="match words to the dictionary")
 401   optparser.add_option("", "--matchchars", dest="matchchars",
 402     action="store_true", default=False,
 403     help="match chars to the dictionary")
 404   optparser.add_option("", "--fuzzymatch", dest="fuzzymatch",
 405     action="store", default=None,
 406     help="match words to the dictionary using a fuzzy algorithm and the given encoding...")
 407   optparser.add_option("", "--outputscript", dest="outputscript",
 408     action="store", default=None,
 409     help="produce a script based on the fuzzy match, to convert files with")
 410   optparser.add_option("", "--limitencodings", dest="limitencodings",
 411     action="store", default=None,
 412     help="only use the encodings specified")
 413   optparser.add_option("-i", "--ignorecase", dest="ignorecase",
 414     action="store_true", default=False,
 415     help="only use ignorecase words found, not total")
 416   optparser.add_option("-u", "--unique", dest="unique",
 417     action="store_true", default=False,
 418     help="only use unique words found, not total")
 419   optparser.add_option("-v", "--verbose", dest="verbose",
 420     action="store_true", default=False,
 421     help="verbose (print out lots of strings)")
 422   (options, args) = optparser.parse_args()
 423   if not (options.matchchars or options.matchwords or options.fuzzymatch):
 424     optparser.error("you should specify at least one of matchchars, matchwords or fuzzymatch")
 425   d = detector(args, options)
 426   if options.matchchars:
 427     d.testcharstats()
 428   if options.matchwords:
 429     d.testwordstats()
 430   if options.fuzzymatch:
 431     d.fuzzywordmatch(options.fuzzymatch)
 432