2 # -*- coding: utf-8 -*-
4 # Copyright 2004 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """takes a .po translation file and produces statistics to help work out what encoding it is in"""
26 from translate
.storage
import po
27 from translate
import __version__
33 """the detector class encompasses all the functionality required to detect encodings"""
# Character classes that can be left out of the statistics; the keys are the
# values accepted by the --exclude command line option.
# string.letters only exists on Python 2, so fall back to
# string.ascii_letters where it is not available.
excludechoices = {
    'letters': getattr(string, 'letters', string.ascii_letters),
    'digits': string.digits,
    'whitespace': string.whitespace,
    'punctuation': string.punctuation,
}
# 'std' is the concatenation of all the categories above
excludechoices['std'] = (excludechoices['letters'] + excludechoices['digits'] +
                         excludechoices['whitespace'] + excludechoices['punctuation'])
38 def __init__(self
, filenames
, options
):
39 """constructs a detector with the given filenames and options"""
40 self
.options
= options
41 self
.alloriginals
, self
.alltranslations
= [], []
42 self
.recursefiles(filenames
)
44 self
.dictstring
= codecs
.open(options
.dictfile
, 'r', options
.dictencoding
).read()
46 self
.dictstring
= None
47 self
.excludestring
= ""
49 for excludecategory
in options
.exclude
:
50 if excludecategory
in self
.excludechoices
:
51 self
.excludestring
+= self
.excludechoices
[excludecategory
]
def listencodings(self):
    """lists the encodings we are using (all if none have been given on cmdline)

    Returns the comma-separated names from options.limitencodings if that
    is set, otherwise a sorted list of every distinct codec name that
    encodings.aliases has an alias for.
    """
    limitencodings = self.options.limitencodings
    if limitencodings is not None:
        # tolerate spaces after the commas and stray/trailing commas
        names = [name.strip() for name in limitencodings.split(",")]
        return [name for name in names if name]
    # several aliases point at the same codec, so drop the duplicates
    return sorted(set(encodings.aliases.aliases.values()))
def processoriginal(self, original):
    """adds the given original string to the list of originals"""
    self.alloriginals.append(original)
def processtranslation(self, translation):
    """adds the given translation to the list of translations"""
    self.alltranslations.append(translation)
def processfile(self, infile):
    """processes all the strings in the given po file

    infile: an open file-like object; all its lines are read

    For every element that is neither a header nor blank, the unquoted
    msgid is passed to processoriginal and the unquoted msgstr to
    processtranslation.
    """
    # NOTE(review): the line creating the po file object is not visible in
    # this copy of the source; po.pofile() is assumed - confirm
    pof = po.pofile()
    pof.fromlines(infile.readlines())
    for poe in pof.poelements:
        if poe.isheader() or poe.isblank():
            continue
        msgstr = poe.msgstr
        if isinstance(msgstr, dict):
            # msgstr can be a dict (presumably plural forms); only entry 0 is used
            msgstr = msgstr[0]
        self.processoriginal(po.getunquotedstr(poe.msgid))
        self.processtranslation(po.getunquotedstr(msgstr))
def recursefiles(self, filenames):
    """reads in the filenames given and extracts their translations

    Directories are handed to handledir, other existing paths to readfile.
    A path that does not exist is reported on stderr and skipped.
    """
    for filename in filenames:
        if not os.path.exists(filename):
            sys.stderr.write("cannot process %s: does not exist\n" % filename)
        elif os.path.isdir(filename):
            self.handledir(filename)
        else:
            self.readfile(filename)
def readfile(self, filename):
    """reads in the given file and processes it

    The file is opened, passed to processfile and always closed again.
    If options.verbose is set, a "read ..." line is printed afterwards.
    """
    infile = open(filename)
    # close the file even if processing it fails
    try:
        self.processfile(infile)
    finally:
        infile.close()
    if self.options.verbose:
        print("read %s" % filename)
def readfiles(self, arg, dirname, filenames):
    """reads in the given files in the given directory and processes them

    arg: unused; present because handledir passes this method to a
      directory walker as the visiting function
    dirname: the directory that contains the entries
    filenames: names of the entries in dirname; directories are skipped
    """
    for filename in filenames:
        pathname = os.path.join(dirname, filename)
        if os.path.isdir(pathname):
            continue
        self.readfile(pathname)
def handledir(self, dirname):
    """walks through the directory structure and reads in all the files

    Calls readfiles once for dirname and once for every directory below it.
    """
    # os.path.walk no longer exists on Python 3; os.walk is available on both
    for dirpath, dirnames, filenames in os.walk(dirname):
        self.readfiles(None, dirpath, filenames)
def updatecountmap(self, charstring, excludestring="", countmap=None):
    """makes a countmap of the characters in the string, excluding those in excludestring

    starts with countmap if one is given (it is updated in place);
    returns the countmap, a dict mapping each character to its count
    """
    if countmap is None:
        countmap = {}
    for char in charstring:
        if char in excludestring:
            continue
        countmap[char] = countmap.get(char, 0) + 1
    return countmap
def getcountmapdelta(self, countmap1, countmap2):
    """returns a delta representing the difference between the two countmaps

    The counts of countmap1 are first scaled (with integer arithmetic) to
    the total of countmap2. The delta is the sum of the absolute differences
    per character; a character missing from one map counts fully.
    Empty countmaps are allowed.
    """
    # sum() copes with an empty countmap, unlike reduce() without a start value
    total1 = sum(countmap1.values())
    total2 = sum(countmap2.values())
    delta = 0
    for char, count1 in countmap1.items():
        # floor division keeps this an integer on every Python version
        adjustedcount = (count1 * total2) // total1
        delta += abs(adjustedcount - countmap2.get(char, 0))
    for char, count2 in countmap2.items():
        if char not in countmap1:
            delta += count2
    return delta
def countmaptostring(self, countmap):
    """returns a string with the characters in countmap sorted by count

    The most frequent character comes first; characters with the same
    count are in descending character order.
    """
    countpairs = [(count, char) for char, count in countmap.items()]
    countpairs.sort()
    countpairs.reverse()
    return "".join([char for count, char in countpairs])
def encodeattempt(self, charstring):
    """encode what can be encoded in the output encoding, add the rest on at the end in a repr

    Returns charstring encoded with options.outputencoding. If that fails,
    returns the encodable characters (encoded one by one), followed by a
    space and the repr of the characters that could not be encoded.
    """
    outputencoding = self.options.outputencoding
    try:
        return charstring.encode(outputencoding)
    except UnicodeEncodeError:
        pass
    # start from an empty encoded string so the pieces can be appended to it
    encoded = charstring[:0].encode(outputencoding)
    failed = ""
    for char in charstring:
        try:
            encoded += char.encode(outputencoding)
        except UnicodeEncodeError:
            failed += char
    # the repr itself may not be encodable either, so escape what is left
    return encoded + (" " + repr(failed)).encode(outputencoding, "backslashreplace")
def makecountmap(self, encoding):
    """makes a countmap for all the translations using the encoding

    Returns a dict mapping each character (outside self.excludestring) of
    the decoded translations to the number of times it occurs.
    """
    countmap = {}
    for translation in self.alltranslations:
        try:
            decoded = translation.decode(encoding)
        except Exception:
            # NOTE(review): the body of this handler is not visible in this
            # copy of the source; skipping the translation is assumed - confirm.
            # Every exception is caught because listencodings offers every
            # codec Python has an alias for, and presumably not all of them
            # fail with a UnicodeDecodeError on unsuitable input.
            continue
        self.updatecountmap(decoded, self.excludestring, countmap)
    return countmap
177 def testcharstats(self
):
178 """produces char distribution for each encoding, and dict, and shows statistical match"""
181 self
.updatecountmap(self
.dictstring
, self
.excludestring
, dictcountmap
)
184 for encoding
in self
.listencodings():
185 encodingdeltas
[encoding
] = 99999999
186 countmap
= self
.makecountmap(encoding
)
187 validencodings
[encoding
] = countmap
189 encodingdeltas
[encoding
] = self
.getcountmapdelta(dictcountmap
, countmap
)
191 sortedstring
= self
.countmaptostring(dictcountmap
)
192 print "dict:", self
.encodeattempt(sortedstring
)
193 deltas
= [(delta
, encoding
) for encoding
, delta
in encodingdeltas
.iteritems()]
195 validkeys
= [encoding
for delta
, encoding
in deltas
]
197 # sort alphabetically
198 validkeys
= validencodings
.keys()
201 keylen
= max([len("%s (%d):" % (key
, encodingdeltas
[key
])) for key
in validencodings
if key
in encodingdeltas
])
204 for validencoding
in validkeys
:
205 sortedstring
= self
.countmaptostring(validencodings
[validencoding
])
206 validencoding
= "%s (%d):" % (validencoding
, encodingdeltas
[validencoding
])
207 validencoding
+= " "*(keylen
-len(validencoding
))
208 print validencoding
, self
.encodeattempt(sortedstring
)
def findwords(self, dictmap, encoding):
    """finds all words in the translations that when decoded with encoding match in the dictmap

    dictmap: a mapping whose keys are the dictionary words
    encoding: the encoding to decode the translations with

    Returns the number of distinct matching words if options.unique is set,
    otherwise the total number of matches.
    Raises UnicodeDecodeError if a translation cannot be decoded; any other
    error while decoding is turned into a UnicodeDecodeError as well.
    """
    # NOTE(review): the total counter and the non-unique return value are
    # reconstructed - only "return len(uniquewordsfound)" is visible in this
    # copy of the source; confirm
    uniquewordsfound = {}
    wordsfound = 0
    for translation in self.alltranslations:
        try:
            decoded = translation.decode(encoding)
        except UnicodeDecodeError:
            raise
        except Exception as e:
            # give callers a single exception type to handle
            raise UnicodeDecodeError(encoding, translation, 0, 0, str(e))
        if self.options.ignorecase:
            decoded = decoded.lower()
        for word in decoded.split():
            if word not in dictmap:
                continue
            if self.options.verbose:
                if (not self.options.unique) or (word not in uniquewordsfound):
                    print(self.encodeattempt(word))
            uniquewordsfound[word] = 1
            wordsfound += 1
    if self.options.unique:
        return len(uniquewordsfound)
    return wordsfound
236 def testwordstats(self
):
237 """produces word count for each encoding, shows matches to dict"""
239 for excludedchar
in self
.excludestring
:
240 ignoremap
[ord(excludedchar
)] = u
' '
243 for dictword
in self
.dictstring
.split():
244 if not dictword
.translate(ignoremap
).isspace():
245 if self
.options
.ignorecase
:
246 dictword
= dictword
.lower()
247 dictmap
[dictword
] = 0
248 print "%d words in dictionary" % len(dictmap
)
250 for encoding
in self
.listencodings():
252 encodingcounts
[encoding
] = self
.findwords(dictmap
, encoding
)
253 except UnicodeDecodeError:
255 counts
= [(count
, encoding
) for encoding
, count
in encodingcounts
.iteritems()]
257 validkeys
= [encoding
for count
, encoding
in counts
]
258 for validencoding
in validkeys
:
259 count
= encodingcounts
[validencoding
]
261 print "%s: %d" % (validencoding
, count
)
def fuzzy(self, word, includestring):
    """return a version of word including all the characters in includestring, with sequences of other characters replaced by a space

    A leading or trailing sequence of other characters also becomes a
    single space.
    """
    # the dots cleverly help us catch start and end spaces
    fuzzychars = ["."]
    for char in word:
        if char in includestring:
            fuzzychars.append(char)
        else:
            fuzzychars.append(" ")
    fuzzychars.append(".")
    # split/join collapses each run of spaces; the slice removes the dots again
    return ' '.join("".join(fuzzychars).split())[1:-1]
def updatecharmap(self, charmap, word, dictword, ignoremap):
    """updates the given charmap with the changes from word to dictword (using ignoremap)

    charmap: dict mapping a part of a translated word to a dict that counts
      the dictionary parts it was seen to correspond to; updated in place
    word: the word as found in the translations
    dictword: the matching dictionary word, or a dict of several candidates
    ignoremap: translation table that turns the characters to ignore into
      spaces, which splits the words into the parts that are compared
    """
    if isinstance(dictword, dict):
        if word not in dictword:
            # several candidates and none is the word itself: ambiguous
            print("multiple matches: not drawing conclusions. %r, %r" % (word, dictword))
        return
    # nothing to learn if the word actually matches
    if dictword == word:
        return
    wordparts = word.translate(ignoremap).split()
    dictparts = dictword.translate(ignoremap).split()
    for wordpart, dictpart in zip(wordparts, dictparts):
        if wordpart != dictpart:
            transmap = charmap.setdefault(wordpart, {})
            transmap[dictpart] = transmap.get(dictpart, 0) + 1
def writescript(self, charmap, encoding):
    """writes a script to a file that replaces chars in a po file as defined by charmap

    charmap: dict mapping a part found in the translations to a dict that
      counts the dictionary parts it corresponds to
    encoding: the encoding the generated script decodes and encodes with

    The script is written to options.outputscript. It only contains
    replacements for parts with exactly one corresponding dictionary part,
    the most frequent replacement first.
    """
    # only handle exact matches...
    replacements = []
    for wordpart, transmap in charmap.items():
        if len(transmap) == 1:
            for dictpart, count in transmap.items():
                replacements.append((count, dictpart, wordpart))
    replacements.sort()
    replacements.reverse()
    # the method must be indented inside the class and its statements inside
    # the method, otherwise the generated script is not valid Python
    lines = [
        "# created by translate.tools.detectencoding\n",
        "from translate.convert import poreplace\n",
        "class pocharmap(poreplace.poreplace):\n",
        "    def convertstring(self, postr):\n",
        "        postr = postr.decode(%r)\n" % encoding,
    ]
    for count, dictpart, wordpart in replacements:
        lines.append("        postr = postr.replace(%r, %r) # %d matches\n" % (wordpart, dictpart, count))
    lines.extend([
        "        postr = postr.encode(%r)\n" % encoding,
        "        return postr\n",
        "if __name__ == '__main__':\n",
        "    poreplace.main(pocharmap)\n",
    ])
    scriptfile = open(self.options.outputscript, 'w')
    # close the file even if writing fails
    try:
        scriptfile.writelines(lines)
    finally:
        scriptfile.close()
def fuzzywordmatch(self, encoding):
    """does fuzzy word match for given encoding, and shows correspondence to dict

    Dictionary words and translated words (decoded with encoding) are
    reduced to their fuzzy form, which keeps only the characters in
    self.excludestring. Words with the same fuzzy form are considered to
    match, and the parts in which they differ are collected in a charmap.
    Prints the number of matches and what each differing part corresponds
    to; if options.outputscript is set, writescript is called with the
    charmap as well.
    """
    # use the options of this detector, not a module level global
    options = self.options
    ignoremap = {}
    for excludedchar in self.excludestring:
        ignoremap[ord(excludedchar)] = u' '
    # dictmap will contain a string if there is only one fuzzymatch
    # otherwise it will contain a dict
    dictmap = {}
    # without a dictionary (dictstring is None) there are simply no words
    for dictword in (self.dictstring or u'').split():
        if dictword.translate(ignoremap).isspace():
            continue
        if options.ignorecase:
            dictword = dictword.lower()
        fuzzyword = self.fuzzy(dictword, self.excludestring)
        known = dictmap.get(fuzzyword)
        if known is None:
            dictmap[fuzzyword] = dictword
        elif isinstance(known, dict):
            known[dictword] = True
        elif known != dictword:
            # a second, different word with the same fuzzy form; a repeat of
            # the same dictionary word is not an ambiguity
            dictmap[fuzzyword] = {known: True, dictword: True}
    print("%d words in dictionary" % len(dictmap))
    uniquewordsfound = {}
    wordsfound = 0
    charmap = {}
    for translation in self.alltranslations:
        decoded = translation.decode(encoding)
        if options.ignorecase:
            decoded = decoded.lower()
        for word in decoded.split():
            fuzzyword = self.fuzzy(word, self.excludestring)
            if fuzzyword not in dictmap:
                continue
            wordsfound += 1
            dictword = dictmap[fuzzyword]
            if options.verbose:
                if (not options.unique) or (word not in uniquewordsfound):
                    print("%r %r %r" % (word, fuzzyword, dictword))
            self.updatecharmap(charmap, word, dictword, ignoremap)
            uniquewordsfound[word] = fuzzyword
    if options.unique:
        print("fuzzy match on encoding %s produced %d unique words" % (encoding, len(uniquewordsfound)))
    else:
        print("fuzzy match on encoding %s produced %d words" % (encoding, wordsfound))
    if options.outputscript:
        self.writescript(charmap, encoding)
    for wordpart, transmap in charmap.items():
        if len(transmap) == 1:
            for dictpart, count in transmap.items():
                print("char %r in translations always found to match char %r in dict (%d times)" %
                      (wordpart, dictpart, count))
        else:
            counts = [(count, dictpart) for dictpart, count in transmap.items()]
            counts.sort()
            print("char %r in translations matches to the following characters in dict:" % wordpart)
            for count, dictpart in counts:
                print(" %r: %d" % (dictpart, count))
if __name__ == '__main__':
    # Command line entry point: parse the options, read the given files and
    # run the requested tests.
    # NOTE(review): the lines before the translate.misc import are not visible
    # in this copy of the source; preferring the standard library optparse
    # and falling back on the bundled copy is assumed - confirm
    try:
        import optparse
    except ImportError:
        from translate.misc import optparse
    excludenames = sorted(detector.excludechoices)
    optparser = optparse.OptionParser(version="%prog "+__version__.ver)
    optparser.add_option("--exclude", dest="exclude", type="choice",
                         action="append", choices=excludenames,
                         help="exclude certain common characters (%s)" % ", ".join(excludenames))
    optparser.add_option("--dict", dest="dictfile",
                         action="store", default=None,
                         help="use a dictionary/wordlist to choose the best encoding(s)")
    optparser.add_option("--dictencoding", dest="dictencoding",
                         action="store", default="utf8",
                         help="the encoding of the dictionary/wordlist")
    optparser.add_option("--outputencoding", dest="outputencoding",
                         action="store", default="utf8",
                         help="the encoding of the output")
    optparser.add_option("--matchwords", dest="matchwords",
                         action="store_true", default=False,
                         help="match words to the dictionary")
    optparser.add_option("--matchchars", dest="matchchars",
                         action="store_true", default=False,
                         help="match chars to the dictionary")
    optparser.add_option("--fuzzymatch", dest="fuzzymatch",
                         action="store", default=None,
                         help="match words to the dictionary using a fuzzy algorithm and the given encoding...")
    optparser.add_option("--outputscript", dest="outputscript",
                         action="store", default=None,
                         help="produce a script based on the fuzzy match, to convert files with")
    optparser.add_option("--limitencodings", dest="limitencodings",
                         action="store", default=None,
                         help="only use the encodings specified")
    optparser.add_option("-i", "--ignorecase", dest="ignorecase",
                         action="store_true", default=False,
                         help="ignore case when matching words to the dictionary")
    optparser.add_option("-u", "--unique", dest="unique",
                         action="store_true", default=False,
                         help="only use unique words found, not total")
    optparser.add_option("-v", "--verbose", dest="verbose",
                         action="store_true", default=False,
                         help="verbose (print out lots of strings)")
    # options stays a module level name: code in this file may refer to it
    (options, args) = optparser.parse_args()
    if not (options.matchchars or options.matchwords or options.fuzzymatch):
        optparser.error("you should specify at least one of matchchars, matchwords or fuzzymatch")
    if (options.matchwords or options.fuzzymatch) and not options.dictfile:
        # the word based tests have nothing to match against without a dictionary
        optparser.error("matchwords and fuzzymatch need a dictionary (--dict)")
    d = detector(args, options)
    if options.matchchars:
        d.testcharstats()
    if options.matchwords:
        d.testwordstats()
    if options.fuzzymatch:
        d.fuzzywordmatch(options.fuzzymatch)