tools/poterminology.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""reads a set of .po or .pot files to produce a pootle-terminology.pot"""
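
# Outline: clean() strips accelerators, format strings and XML/HTML markup
# from messages; processfile() collects the cleaned words and phrases into a
# glossary, folding simple English plurals; outputterminology() applies the
# --*-needed thresholds, drops redundant subphrases, then sorts and writes
# the resulting terminology .pot file.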

from translate.lang import factory as lang_factory
from translate.misc import optrecurse
from translate.storage import po
from translate.storage import factory
import os
import re
import sys

class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    # handles c-format and python-format
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML elements (<foo>text</foo> => text)
    xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
    # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
    xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                           flags=re.UNICODE | re.IGNORECASE)
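    # illustrative matches for the patterns above: formatpat catches "%s",
    # "%(name)s" and "%1$0.2f"; xmlelpat catches "<b>", "</p>" and
    # "<!-- comment -->"; xmlentpat catches "&amp;", "&#32;" and "&#x20;"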

    sortorders = ["frequency", "dictionary", "length"]

    files = 0
    units = 0

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
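        # e.g. (hypothetical invocations):
        #   poterminology po/              -> input "po/", output "pootle-terminology.pot"
        #   poterminology po/ gloss.pot    -> input "po/", output "gloss.pot"
        #   poterminology a.po b.po gl.pot -> inputs ["a.po", "b.po"], output "gl.pot"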
        if args and not options.input:
            if not options.output and len(args) > 1:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        if args and not options.output:
            options.output = args[-1]
            args = args[:-1]
        if not options.output:
            options.output = "pootle-terminology.pot"
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
            if options.inputmin is None:
                options.inputmin = 1
        elif options.inputmin is None:
            options.inputmin = 2
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
                "\n input directory is searched for PO files, terminology PO file is output file"
        else:
            super(TerminologyOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        self.usepsyco(options)
        self.recursiveprocess(options)

    def recursiveprocess(self, options):
        """recurse through directories and process files"""
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output, "pootle-terminology.pot")
        self.stopwords = {}
        self.stoprelist = []
        actions = {'+': frozenset(), ':': frozenset(['skip']),
                   '<': frozenset(['phrase']), '=': frozenset(['word']),
                   '>': frozenset(['word', 'skip']),
                   '@': frozenset(['word', 'phrase'])}
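        # stopword file syntax, as parsed below: '#' starts a comment line,
        # '/' introduces a regex (matching words are treated as 'word' and
        # 'phrase'), any other first character must be an action code from
        # the table above; 'skip' words don't count toward --term-words and
        # may not start or end a phrase, 'word' suppresses the single-word
        # term, and 'phrase' stops phrase building at that word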
        if options.stopwordfile is not None:
            stopfile = open(options.stopwordfile, "r")
            try:
                for stopline in stopfile:
                    stoptype = stopline[0]
                    if stoptype == '#' or stoptype == "\n":
                        continue
                    elif stoptype == '/':
                        self.stoprelist.append(re.compile(stopline[1:-1] + '$'))
                    else:
                        self.stopwords[stopline[1:-1]] = actions[stoptype]
            except KeyError, character:
                self.warning("Bad line in stopword list %s starts with %s" % (options.stopwordfile, character), options, sys.exc_info())
            stopfile.close()
        self.glossary = {}
        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            success = True
            try:
                self.processfile(None, options, fullinputpath)
            except Exception, error:
                if isinstance(error, KeyboardInterrupt):
                    raise
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        self.outputterminology(options)

    def clean(self, string, options):
        """returns the cleaned string that contains the text to be matched"""
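        # illustrative example, assuming "&" was passed via --accelerator:
        #   "&Save %s as <b>HTML</b>" -> "Save   as  HTML"
        # (the leftover space runs are harmless; words() retokenizes later)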
        for accelerator in options.accelchars:
            string = string.replace(accelerator, "")
        string = self.formatpat.sub(" ", string)
        string = self.xmlelpat.sub(" ", string)
        string = self.xmlentpat.sub(" ", string)
        string = string.strip()
        return string

    def addphrases(self, words, skips, translation, partials=True):
        """adds (sub)phrases with non-skipwords and more than one word"""
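        # e.g. words=["select", "all", "files"] records "select all files"
        # and, with partials=True, the leading subphrase "select all";
        # trailing subphrases such as "all files" come from the caller
        # popping words off the front before calling this again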
        if (len(words) > skips + 1 and
            'skip' not in self.stopwords.get(words[0], frozenset()) and
            'skip' not in self.stopwords.get(words[-1], frozenset())):
            self.glossary.setdefault(' '.join(words), []).append(translation)
        if partials:
            part = list(words)
            while len(part) > 2:
                if 'skip' in self.stopwords.get(part.pop(), frozenset()):
                    skips -= 1
                if (len(part) > skips + 1 and
                    'skip' not in self.stopwords.get(part[0], frozenset()) and
                    'skip' not in self.stopwords.get(part[-1], frozenset())):
                    self.glossary.setdefault(' '.join(part), []).append(translation)

    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file"""
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        sourcelang = lang_factory.getlanguage(options.sourcelanguage)
        rematchignore = frozenset(('word', 'phrase'))
        defaultignore = frozenset()
        for unit in inputfile.units:
            self.units += 1
            if unit.isheader():
                continue
            if unit.hasplural():
                continue
            if not options.invert:
                source = self.clean(unit.source, options)
                target = self.clean(unit.target, options)
            else:
                target = self.clean(unit.source, options)
                source = self.clean(unit.target, options)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    if options.ignorecase or (options.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if word in self.stopwords:
                        ignore = self.stopwords[word]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(word) is not None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plurals
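                        # e.g. "files" is folded into an existing "file"
                        # entry (and vice versa), so simple English plurals
                        # don't split a term's counts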
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if options.termlength > 1:
                        if 'phrase' in ignore:
                            # add trailing phrases in previous words
                            while len(words) > 2:
                                if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            if len(words) > options.termlength + skips:
                                while len(words) > options.termlength + skips:
                                    if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                        skips -= 1
                                self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
                if options.termlength > 1:
                    # add trailing phrases in sentence after reaching end
                    while options.termlength > 1 and len(words) > 2:
                        if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                            skips -= 1
                        self.addphrases(words, skips, translation)

    def outputterminology(self, options):
        """saves the generated terminology glossary"""
        termfile = po.pofile()
        terms = {}
        locre = re.compile(r":[0-9]+$")
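        # strips trailing line numbers, e.g. "gtk/main.c:123" -> "gtk/main.c",
        # so locations are counted per original source file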
        print >> sys.stderr, ("%d terms from %d units in %d files" %
                              (len(self.glossary), self.units, self.files))
        for term, translations in self.glossary.iteritems():
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = {}
            termunit = po.pounit(term)
            locations = {}
            sourcenotes = {}
            transnotes = {}
            targets = {}
            fullmsg = False
            for source, target, unit, filename in translations:
                sources[source] = 1
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                if term.lower() == self.clean(unit.source, options).lower():
                    fullmsg = True
                    target = self.clean(unit.target, options)
                    if options.ignorecase or (options.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.settarget(target)
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes[unit.getnotes("source code")] = None
                        transnotes[unit.getnotes("translator")] = None
                else:
                    unit.settarget("")
                unit.setsource(term)
                termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    locations.setdefault(locre.sub("", loc))
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < options.inputmin or numlocs < options.locmin:
                continue
            if fullmsg:
                if numsources < options.fullmsgmin:
                    continue
            elif numsources < options.substrmin:
                continue
            if len(targets.keys()) > 1:
                txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                                 for target, files in targets.iteritems()])
                if termunit.gettarget().find('};') < 0:
                    termunit.settarget(txt)
                    termunit.markfuzzy()
                else:
                    # target already contains annotated terms; keep it and add a note
                    termunit.addnote(txt, "translator")
            locmax = 2 * options.locmin
            if numlocs > locmax:
                for location in locations.keys()[0:locmax]:
                    termunit.addlocation(location)
                termunit.addlocation("(poterminology) %d more locations"
                                     % (numlocs - locmax))
            else:
                for location in locations.keys():
                    termunit.addlocation(location)
            for sourcenote in sourcenotes.keys():
                termunit.addnote(sourcenote, "source code")
            for transnote in transnotes.keys():
                termunit.addnote(transnote, "translator")
            for filename, count in filecounts.iteritems():
                termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
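            # score: number of input files dominates (weighted 10x), distinct
            # source messages break ties; this tuple also drives subphrase
            # reduction and the "frequency" sort below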
            terms[term] = ((10 * numfiles) + numsources, termunit)
        # reduce subphrases
        termlist = terms.keys()
        print >> sys.stderr, "%d terms after thresholding" % len(termlist)
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            if len(words) <= 2:
                continue
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
        termitems = terms.values()
        if options.sortorders is None:
            options.sortorders = self.sortorders
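        # orders are popped and applied last-to-first; with Python's stable
        # sorts, the first order requested thus becomes the primary key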
        while len(options.sortorders) > 0:
            order = options.sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                self.warning("unknown sort order %s" % order, options)
        for count, unit in termitems:
            termfile.units.append(unit)
        open(options.output, "w").write(str(termfile))

def main():
    formats = {"po": ("po", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="make all terms lowercase")
    parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
                      action="store_true", default=False, help="fold \"Title Case\" to lowercase")
    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignore the given accelerator characters when matching")
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default=3,
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
                      help="omit terms appearing in fewer than MIN input files (default 1, or 2 if multiple input files)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default=1,
                      help="omit full-message terms appearing in fewer than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default=2,
                      help="omit substring-only terms appearing in fewer than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default=2,
                      help="omit terms appearing in fewer than MIN different original source files (default 2)", metavar="MIN")
    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=parser.sortorders, metavar="ORDER",
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
    parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
                      help="name of file containing stopword list", metavar="FILENAME")
    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()


if __name__ == '__main__':
    main()
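
# Example invocation (hypothetical paths):
#   poterminology --ignore-case --sort dictionary po/myproject terms.pot
# reads the PO files under po/myproject and writes a dictionary-sorted
# terminology glossary to terms.pot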