tools/poterminology.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""reads a set of .po or .pot files to produce a pootle-terminology.pot"""
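
# Outline: clean() strips accelerators, format strings and XML/HTML markup
# from messages; processfile() collects the cleaned words and phrases into a
# glossary, folding simple English plurals; outputterminology() applies the
# --*-needed thresholds, drops redundant subphrases, then sorts and writes
# the resulting terminology .pot file.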

from translate.lang import factory as lang_factory
from translate.misc import optrecurse
from translate.storage import po
from translate.storage import factory
import os
import re
import sys

class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    # handles c-format and python-format
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML elements (<foo>text</foo> => text)
    xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
    # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
    xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                           flags=re.UNICODE | re.IGNORECASE)
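    # illustrative matches for the patterns above: formatpat catches "%s",
    # "%(name)s" and "%1$0.2f"; xmlelpat catches "<b>", "</p>" and
    # "<!-- comment -->"; xmlentpat catches "&amp;", "&#32;" and "&#x20;"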

    sortorders = ["frequency", "dictionary", "length"]

    files = 0
    units = 0

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
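        # e.g. (hypothetical invocations):
        #   poterminology po/              -> input "po/", output "pootle-terminology.pot"
        #   poterminology po/ gloss.pot    -> input "po/", output "gloss.pot"
        #   poterminology a.po b.po gl.pot -> inputs ["a.po", "b.po"], output "gl.pot"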
        if args and not options.input:
            if not options.output and len(args) > 1:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        if args and not options.output:
            options.output = args[-1]
            args = args[:-1]
        if not options.output:
            options.output = "pootle-terminology.pot"
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
            if options.inputmin is None:
                options.inputmin = 1
        elif options.inputmin is None:
            options.inputmin = 2
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
                "\n input directory is searched for PO files, terminology PO file is output file"
        else:
            super(TerminologyOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        self.usepsyco(options)
        self.recursiveprocess(options)

    def recursiveprocess(self, options):
        """recurse through directories and process files"""
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output, "pootle-terminology.pot")
        self.stopwords = {}
        self.stoprelist = []
        actions = {'+': frozenset(), ':': frozenset(['skip']),
                   '<': frozenset(['phrase']), '=': frozenset(['word']),
                   '>': frozenset(['word', 'skip']),
                   '@': frozenset(['word', 'phrase'])}
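        # stopword file syntax, as parsed below: '#' starts a comment line,
        # '/' introduces a regex (matching words are treated as 'word' and
        # 'phrase'), any other first character must be an action code from
        # the table above; 'skip' words don't count toward --term-words and
        # may not start or end a phrase, 'word' suppresses the single-word
        # term, and 'phrase' stops phrase building at that word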
        if options.stopwordfile is not None:
            stopfile = open(options.stopwordfile, "r")
            try:
                for stopline in stopfile:
                    stoptype = stopline[0]
                    if stoptype == '#' or stoptype == "\n":
                        continue
                    elif stoptype == '/':
                        self.stoprelist.append(re.compile(stopline[1:-1] + '$'))
                    else:
                        self.stopwords[stopline[1:-1]] = actions[stoptype]
            except KeyError, character:
                self.warning("Bad line in stopword list %s starts with %s" % (options.stopwordfile, character), options, sys.exc_info())
            stopfile.close()
        self.glossary = {}
        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            success = True
            try:
                self.processfile(None, options, fullinputpath)
            except Exception, error:
                if isinstance(error, KeyboardInterrupt):
                    raise
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        self.outputterminology(options)

    def clean(self, string, options):
        """returns the cleaned string that contains the text to be matched"""
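        # illustrative example, assuming "&" was passed via --accelerator:
        #   "&Save %s as <b>HTML</b>" -> "Save   as  HTML"
        # (the leftover space runs are harmless; words() retokenizes later)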
        for accelerator in options.accelchars:
            string = string.replace(accelerator, "")
        string = self.formatpat.sub(" ", string)
        string = self.xmlelpat.sub(" ", string)
        string = self.xmlentpat.sub(" ", string)
        string = string.strip()
        return string

    def addphrases(self, words, skips, translation, partials=True):
        """adds (sub)phrases with non-skipwords and more than one word"""
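        # e.g. words=["select", "all", "files"] records "select all files"
        # and, with partials=True, the leading subphrase "select all";
        # trailing subphrases such as "all files" come from the caller
        # popping words off the front before calling this again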
        if (len(words) > skips + 1 and
            'skip' not in self.stopwords.get(words[0], frozenset()) and
            'skip' not in self.stopwords.get(words[-1], frozenset())):
            self.glossary.setdefault(' '.join(words), []).append(translation)
        if partials:
            part = list(words)
            while len(part) > 2:
                if 'skip' in self.stopwords.get(part.pop(), frozenset()):
                    skips -= 1
                if (len(part) > skips + 1 and
                    'skip' not in self.stopwords.get(part[0], frozenset()) and
                    'skip' not in self.stopwords.get(part[-1], frozenset())):
                    self.glossary.setdefault(' '.join(part), []).append(translation)

    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file"""
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        sourcelang = lang_factory.getlanguage(options.sourcelanguage)
        rematchignore = frozenset(('word', 'phrase'))
        defaultignore = frozenset()
        for unit in inputfile.units:
            self.units += 1
            if unit.isheader():
                continue
            if unit.hasplural():
                continue
            if not options.invert:
                source = self.clean(unit.source, options)
                target = self.clean(unit.target, options)
            else:
                target = self.clean(unit.source, options)
                source = self.clean(unit.target, options)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    if options.ignorecase or (options.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if word in self.stopwords:
                        ignore = self.stopwords[word]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(word) is not None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plurals
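                        # e.g. "files" is folded into an existing "file"
                        # entry (and vice versa), so simple English plurals
                        # don't split a term's counts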
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if options.termlength > 1:
                        if 'phrase' in ignore:
                            # add trailing phrases in previous words
                            while len(words) > 2:
                                if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            if len(words) > options.termlength + skips:
                                while len(words) > options.termlength + skips:
                                    if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                        skips -= 1
                                self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
                if options.termlength > 1:
                    # add trailing phrases in sentence after reaching end
                    while options.termlength > 1 and len(words) > 2:
                        if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                            skips -= 1
                        self.addphrases(words, skips, translation)

    def outputterminology(self, options):
        """saves the generated terminology glossary"""
        termfile = po.pofile()
        terms = {}
        locre = re.compile(r":[0-9]+$")
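        # strips trailing line numbers, e.g. "gtk/main.c:123" -> "gtk/main.c",
        # so locations are counted per original source file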
        print >> sys.stderr, ("%d terms from %d units in %d files" %
                              (len(self.glossary), self.units, self.files))
        for term, translations in self.glossary.iteritems():
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = {}
            termunit = po.pounit(term)
            locations = {}
            sourcenotes = {}
            transnotes = {}
            targets = {}
            fullmsg = False
            for source, target, unit, filename in translations:
                sources[source] = 1
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                if term.lower() == self.clean(unit.source, options).lower():
                    fullmsg = True
                    target = self.clean(unit.target, options)
                    if options.ignorecase or (options.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.settarget(target)
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes[unit.getnotes("source code")] = None
                        transnotes[unit.getnotes("translator")] = None
                else:
                    unit.settarget("")
                unit.setsource(term)
                termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    locations.setdefault(locre.sub("", loc))
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < options.inputmin or numlocs < options.locmin:
                continue
            if fullmsg:
                if numsources < options.fullmsgmin:
                    continue
            elif numsources < options.substrmin:
                continue
            if len(targets.keys()) > 1:
                txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                                 for target, files in targets.iteritems()])
                if termunit.gettarget().find('};') < 0:
                    termunit.settarget(txt)
                    termunit.markfuzzy()
                else:
                    # target already contains annotated terms; keep it and add a note
                    termunit.addnote(txt, "translator")
            locmax = 2 * options.locmin
            if numlocs > locmax:
                for location in locations.keys()[0:locmax]:
                    termunit.addlocation(location)
                termunit.addlocation("(poterminology) %d more locations"
                                     % (numlocs - locmax))
            else:
                for location in locations.keys():
                    termunit.addlocation(location)
            for sourcenote in sourcenotes.keys():
                termunit.addnote(sourcenote, "source code")
            for transnote in transnotes.keys():
                termunit.addnote(transnote, "translator")
            for filename, count in filecounts.iteritems():
                termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
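            # score: number of input files dominates (weighted 10x), distinct
            # source messages break ties; this tuple also drives subphrase
            # reduction and the "frequency" sort below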
            terms[term] = ((10 * numfiles) + numsources, termunit)
        # reduce subphrases
        termlist = terms.keys()
        print >> sys.stderr, "%d terms after thresholding" % len(termlist)
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            if len(words) <= 2:
                continue
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
        termitems = terms.values()
        if options.sortorders is None:
            options.sortorders = self.sortorders
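        # orders are popped and applied last-to-first; with Python's stable
        # sorts, the first order requested thus becomes the primary key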
        while len(options.sortorders) > 0:
            order = options.sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                self.warning("unknown sort order %s" % order, options)
        for count, unit in termitems:
            termfile.units.append(unit)
        open(options.output, "w").write(str(termfile))

def main():
    formats = {"po": ("po", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="make all terms lowercase")
    parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
                      action="store_true", default=False, help="fold \"Title Case\" to lowercase")
    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignore the given accelerator characters when matching")
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default=3,
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
                      help="omit terms appearing in fewer than MIN input files (default 1, or 2 if multiple input files)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default=1,
                      help="omit full-message terms appearing in fewer than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default=2,
                      help="omit substring-only terms appearing in fewer than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default=2,
                      help="omit terms appearing in fewer than MIN different original source files (default 2)", metavar="MIN")
    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=parser.sortorders, metavar="ORDER",
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
    parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
                      help="name of file containing stopword list", metavar="FILENAME")
    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()


if __name__ == '__main__':
    main()
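
# Example invocation (hypothetical paths):
#   poterminology --ignore-case --sort dictionary po/myproject terms.pot
# reads the PO files under po/myproject and writes a dictionary-sorted
# terminology glossary to terms.pot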