tools/pogrep.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2008 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """Grep XLIFF, Gettext PO and TMX localization files
  23
  24 Matches are output to snippet files of the same type which can then be reviewed
  25 and later merged using pomerge
  26
  27 See: http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and
  28 usage instructions
  29 """
  30
  31 from translate.storage import factory
  32 from translate.misc import optrecurse
  33 from translate.misc.multistring import multistring
  34 from translate.lang import data
  35 import re
  36 import locale
  37
  38 class GrepFilter:
  39     def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False, invertmatch=False, accelchar=None, encoding='utf-8', includeheader=False):
  40         """builds a checkfilter using the given checker"""
  41         if isinstance(searchstring, unicode):
  42             self.searchstring = searchstring
  43         else:
  44             self.searchstring = searchstring.decode(encoding)
  45         self.searchstring = data.normalize(self.searchstring)
  46         if searchparts:
  47             # For now we still support the old terminology, except for the old 'source'
  48             # which has a new meaning now.
  49             self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
  50             self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
  51             self.search_notes =  ('notes' in searchparts) or ('comment' in searchparts)
  52             self.search_locations = 'locations' in searchparts
  53         else:
  54             self.search_source = True
  55             self.search_target = True
  56             self.search_notes = False
  57             self.search_locations = False
  58         self.ignorecase = ignorecase
  59         if self.ignorecase:
  60             self.searchstring = self.searchstring.lower()
  61         self.useregexp = useregexp
  62         if self.useregexp:
  63             self.searchpattern = re.compile(self.searchstring)
  64         self.invertmatch = invertmatch
  65         self.accelchar = accelchar
  66         self.includeheader = includeheader
  67
  68     def matches(self, teststr):
  69         teststr = data.normalize(teststr)
  70         if self.ignorecase:
  71             teststr = teststr.lower()
  72         if self.accelchar:
  73             teststr = re.sub(self.accelchar + self.accelchar, "#", teststr)
  74             teststr = re.sub(self.accelchar, "", teststr)
  75         if self.useregexp:
  76             found = self.searchpattern.search(teststr)
  77         else:
  78             found = teststr.find(self.searchstring) != -1
  79         if self.invertmatch:
  80             found = not found
  81         return found
  82
  83     def filterunit(self, unit):
  84         """runs filters on an element"""
  85         if unit.isheader(): return []
  86
  87         if self.search_source:
  88             if isinstance(unit.source, multistring):
  89                 strings = unit.source.strings
  90             else:
  91                 strings = [unit.source]
  92             for string in strings:
  93                 if self.matches(string):
  94                     return True
  95
  96         if self.search_target:
  97             if isinstance(unit.target, multistring):
  98                 strings = unit.target.strings
  99             else:
 100                 strings = [unit.target]
 101             for string in strings:
 102                 if self.matches(string):
 103                     return True
 104
 105         if self.search_notes:
 106             return self.matches(unit.getnotes())
 107         if self.search_locations:
 108             return self.matches(u" ".join(unit.getlocations()))
 109         return False
 110
 111     def filterfile(self, thefile):
 112         """runs filters on a translation file object"""
 113         thenewfile = type(thefile)()
 114         for unit in thefile.units:
 115             if self.filterunit(unit):
 116                 thenewfile.addunit(unit)
 117         if self.includeheader and thenewfile.units > 0:
 118             if thefile.units[0].isheader():
 119                 thenewfile.units.insert(0, thefile.units[0])
 120             else:
 121                 thenewfile.units.insert(0, thenewfile.makeheader())
 122         return thenewfile
 123
 124 class GrepOptionParser(optrecurse.RecursiveOptionParser):
 125     """a specialized Option Parser for the grep tool..."""
 126     def parse_args(self, args=None, values=None):
 127         """parses the command line options, handling implicit input/output args"""
 128         (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
 129         # some intelligence as to what reasonable people might give on the command line
 130         if args:
 131             options.searchstring = args[0]
 132             args = args[1:]
 133         else:
 134             self.error("At least one argument must be given for the search string")
 135         if args and not options.input:
 136             if not options.output:
 137                 options.input = args[:-1]
 138                 args = args[-1:]
 139             else:
 140                 options.input = args
 141                 args = []
 142         if args and not options.output:
 143             options.output = args[-1]
 144             args = args[:-1]
 145         if args:
 146             self.error("You have used an invalid combination of --input, --output and freestanding args")
 147         if isinstance(options.input, list) and len(options.input) == 1:
 148             options.input = options.input[0]
 149         return (options, args)
 150
 151     def set_usage(self, usage=None):
 152         """sets the usage string - if usage not given, uses getusagestring for each option"""
 153         if usage is None:
 154             self.usage = "%prog searchstring " + " ".join([self.getusagestring(option) for option in self.option_list])
 155         else:
 156             super(GrepOptionParser, self).set_usage(usage)
 157
 158     def run(self):
 159         """parses the arguments, and runs recursiveprocess with the resulting options"""
 160         (options, args) = self.parse_args()
 161         options.inputformats = self.inputformats
 162         options.outputoptions = self.outputoptions
 163         options.checkfilter = GrepFilter(options.searchstring, options.searchparts, options.ignorecase, options.useregexp, options.invertmatch, options.accelchar, locale.getpreferredencoding(), options.includeheader)
 164         self.usepsyco(options)
 165         self.recursiveprocess(options)
 166
 167 def rungrep(inputfile, outputfile, templatefile, checkfilter):
 168     """reads in inputfile, filters using checkfilter, writes to outputfile"""
 169     fromfile = factory.getobject(inputfile)
 170     tofile = checkfilter.filterfile(fromfile)
 171     if tofile.isempty():
 172         return False
 173     outputfile.write(str(tofile))
 174     return True
 175
 176 def cmdlineparser():
 177     formats = {"po":("po", rungrep), "pot":("pot", rungrep),
 178             "xliff":("xliff", rungrep), "xlf":("xlf", rungrep), "xlff":("xlff", rungrep),
 179             "tmx":("tmx", rungrep),
 180             None:("po", rungrep)}
 181     parser = GrepOptionParser(formats)
 182     parser.add_option("", "--search", dest="searchparts",
 183         action="append", type="choice", choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment" ],
 184         metavar="SEARCHPARTS", help="searches the given parts (source, target, notes and locations)")
 185     parser.add_option("-I", "--ignore-case", dest="ignorecase",
 186         action="store_true", default=False, help="ignore case distinctions")
 187     parser.add_option("-e", "--regexp", dest="useregexp",
 188         action="store_true", default=False, help="use regular expression matching")
 189     parser.add_option("-v", "--invert-match", dest="invertmatch",
 190         action="store_true", default=False, help="select non-matching lines")
 191     parser.add_option("", "--accelerator", dest="accelchar",
 192         action="store", type="choice", choices=["&", "_", "~"],
 193         metavar="ACCELERATOR", help="ignores the given accelerator when matching")
 194     parser.add_option("", "--header", dest="includeheader",
 195         action="store_true", default=False,
 196         help="include a PO header in the output")
 197     parser.set_usage()
 198     parser.passthrough.append('checkfilter')
 199     parser.description = __doc__
 200     return parser
 201
 202 def main():
 203     parser = cmdlineparser()
 204     parser.run()
 205
 206 if __name__ == '__main__':
 207     main()