bin/find-german-comments

   1 #!/usr/bin/env python
   2 ########################################################################
   3 #
   4 #  Copyright (c) 2010 Jonas Jensen, Miklos Vajna
   5 #
   6 #  Permission is hereby granted, free of charge, to any person
   7 #  obtaining a copy of this software and associated documentation
   8 #  files (the "Software"), to deal in the Software without
   9 #  restriction, including without limitation the rights to use,
  10 #  copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 #  copies of the Software, and to permit persons to whom the
  12 #  Software is furnished to do so, subject to the following
  13 #  conditions:
  14 #
  15 #  The above copyright notice and this permission notice shall be
  16 #  included in all copies or substantial portions of the Software.
  17 #
  18 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20 #  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21 #  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24 #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25 #  OTHER DEALINGS IN THE SOFTWARE.
  26 #
  27 ########################################################################
  28
  29
  30 import sys, re, subprocess, os, optparse, string
  31
  32 class Parser:
  33     """
  34     This parser extracts comments from source files, tries to guess
  35     their language and then prints out the german ones.
  36     """
  37     def __init__(self):
  38         self.strip = string.punctuation + " \n"
  39         self.text_cat = self.start_text_cat()
  40         op = optparse.OptionParser()
  41         op.set_usage("%prog [options] <rootdir>\n\n" +
  42             "Searches for german comments in cxx/hxx source files inside a given root\n" +
  43             "directory recursively.")
  44         op.add_option("-f", "--filenames-only", action="store_true", dest="filenames_only", default=False,
  45             help="Only print the filenames of files containing German comments")
  46         op.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
  47             help="Turn on verbose mode (print only positives progress to stderr)")
  48         op.add_option("-l", "--line-numbers", action="store_true", dest="line_numbers", default=False,
  49             help="Prints the filenames and line numbers only.")
  50         op.add_option("-L", "--line-numbers-pos", action="store_true", dest="line_numbers_pos", default=False,
  51             help="Prints the filenames and line numbers only (if positive).")
  52         op.add_option("-t", "--threshold", action="store", dest="THRESHOLD", default=0,
  53             help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
  54         self.options, args = op.parse_args()
  55         try:
  56             dir = args[0]
  57         except IndexError:
  58             dir = "."
  59         self.check_source_files(dir)
  60
  61     def get_comments(self, filename):
  62         """
  63         Extracts the source code comments.
  64         """
  65         linenum = 0
  66         if self.options.verbose:
  67             sys.stderr.write("processing file '%s'...\n" % filename)
  68         sock = open(filename)
  69         # add an empty line to trigger the output of collected oneliner
  70         # comment group
  71         lines = sock.readlines() + ["\n"]
  72         sock.close()
  73
  74         in_comment = False
  75         buf = []
  76         count = 1
  77         for i in lines:
  78             if "//" in i and not in_comment:
  79                 # if we find a new //-style comment, then we
  80                 # just append it to a previous one if: there is
  81                 # only whitespace before the // mark that is
  82                 # necessary to make comments longer, giving
  83                 # more reliable output
  84                 if not len(re.sub("(.*)//.*", r"\1", i).strip(self.strip)):
  85                     s = re.sub(".*// ?", "", i).strip(self.strip)
  86                     if len(s):
  87                         buf.append(s)
  88                 else:
  89                     # otherwise it's an independent //-style comment in the next line
  90                     yield (count, "\n    ".join(buf))
  91                     buf = [re.sub(".*// ?", "", i.strip(self.strip))]
  92             elif "//" not in i and not in_comment and len(buf) > 0:
  93                 # first normal line after a // block
  94                 yield (count, "\n    ".join(buf))
  95                 buf = []
  96             elif "/*" in i and "*/" not in i and not in_comment:
  97                 # start of a real multiline comment
  98                 in_comment = True
  99                 linenum = count
 100                 s = re.sub(".*/\*+", "", i.strip(self.strip))
 101                 if len(s):
 102                     buf.append(s.strip(self.strip))
 103             elif in_comment and not "*/" in i:
 104                 # in multiline comment
 105                 s = re.sub("^( |\|)*\*?", "", i)
 106                 if len(s.strip(self.strip)):
 107                     buf.append(s.strip(self.strip))
 108             elif "*/" in i and in_comment:
 109                 # end of multiline comment
 110                 in_comment = False
 111                 s = re.sub(r"\*+/.*", "", i.strip(self.strip))
 112                 if len(s):
 113                     buf.append(s)
 114                 yield (count, "\n    ".join(buf))
 115                 buf = []
 116             elif "/*" in i and "*/" in i:
 117                 # c-style oneliner comment
 118                 yield (count, re.sub(".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
 119             count += 1
 120
 121     def start_text_cat(self):
 122         cwd = os.getcwd()
 123         # change to our directory
 124         os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
 125         sock = subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 126         os.chdir(cwd)
 127         return sock
 128
 129     def get_lang(self, s):
 130         """ the output is 'german' or 'english' or 'german or english'. when
 131         unsure, just don't warn, there are strings where you just can't
 132         teremine the results reliably, like '#110680#' """
 133
 134         self.text_cat.stdin.write(s)
 135         self.text_cat.stdin.write("\n")
 136         self.text_cat.stdin.flush()
 137         lang = self.text_cat.stdout.readline().strip()
 138         return lang
 139
 140     def is_german(self, s):
 141         """
 142         determines if a string is german or not
 143         """
 144         # for short strings we can't do reliable recognition, so skip
 145         # short strings and less than 4 words
 146         s = s.replace('\n', ' ')
 147         if len(s) < 32 or len(s.split()) < 4:
 148             return False
 149         return "german" == self.get_lang(s)
 150
 151     def check_file(self, path):
 152         """
 153         checks each comment in a file
 154         """
 155         def tab_calc (string):
 156             START = 40 #Default of 10 tabs
 157             if len(string) >= START:
 158                 return 1
 159             diff = START - len(string)
 160             if diff % 4 is not 0:
 161                 padding = 1
 162             else:
 163                 padding = 0
 164             return (diff/4)+padding
 165
 166         if self.options.line_numbers or self.options.line_numbers_pos:
 167             TABS = "\t"*10
 168             path_linenums = []
 169             for linenum, s in self.get_comments(path):
 170                 if self.is_german(s):
 171                     path_linenums.append(linenum)
 172             valid = len(path_linenums) > int(self.options.THRESHOLD)
 173             if self.options.line_numbers:
 174                 sys.stderr.write("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
 175             if valid:
 176                 if self.options.line_numbers_pos:
 177                     sys.stderr.write("%s ... %s positives\n" % (path, str(len(path_linenums))))
 178                     return
 179                 if len(path) + (len(path_linenums)*4) > 75:
 180                     print "%s:\n" % path
 181                     while(path_linenums):
 182                         i = 0
 183                         numline = []
 184                         while i < 10:
 185                             try:
 186                                 numline.append(path_linenums[0])
 187                                 path_linenums.remove(path_linenums[0])
 188                             except IndexError:
 189                                 i = 10
 190                             i += 1
 191                         numline = [str(i) for i in numline]
 192                         print "%s%s" % (TABS, ",".join(numline))
 193                 else:
 194                     if self.options.line_numbers:
 195                         path_linenums = [str(i) for i in path_linenums]
 196                         print "%s:%s%s" % (path, "\t"*tab_calc(path), ",".join(path_linenums))
 197
 198         elif not self.options.filenames_only:
 199             for linenum, s in self.get_comments(path):
 200                 if self.is_german(s):
 201                     print "%s:%s: %s" % (path, linenum, s)
 202         else:
 203             fnames = set([])
 204             for linenum, s in self.get_comments(path):
 205                 if self.is_german(s):
 206                     # Make sure we print each filename only once
 207                     fnames.add(path)
 208             # Print the filenames
 209             for f in fnames:
 210                 print f
 211
 212     def first_elem(self, path):
 213         lastElem = os.path.dirname(path)
 214         done = False
 215         while not done:
 216             nextElem = os.path.split(lastElem)[0]
 217             if nextElem is not '':
 218                 lastElem = nextElem
 219             else:
 220                 done = True
 221         return lastElem
 222
 223     def check_source_files(self, directory):
 224         """
 225         checks each _tracked_ file in a directory recursively
 226         """
 227         sock = os.popen(r"git ls-files '%s' |egrep '\.(c|h)xx$'" % directory)
 228         lines = sock.readlines()
 229         sock.close()
 230
 231         # Helps to speedup a global scan
 232         directory_whitelist = {
 233             "UnoControls" : 1,
 234             "accessibility" : 1,
 235             "android" : 1,
 236             "animations" : 1,
 237             "avmedia" : 1,
 238             "basctl" : 1,
 239             "basebmp" : 1,
 240             "basegfx" : 1,
 241             "basic" : 1,
 242             "binaryurp" : 1,
 243             "bridges" : 1,
 244             "canvas" : 1,
 245             "chart2" : 1,
 246             "cli_ure" : 1,
 247             "codemaker" : 1,
 248             "comphelper" : 1,
 249             "compilerplugins" : 1,
 250             "configmgr" : 1,
 251             "connectivity" : 1,
 252             "cppcanvas" : 1,
 253             "cppu" : 1,
 254             "cppuhelper" : 1,
 255             "cpputools" : 1,
 256             "cui" : 1,
 257             "dbaccess" : 1,
 258             "desktop" : 1,
 259             "drawinglayer" : 1,
 260             "dtrans" : 1,
 261             "editeng" : 1,
 262             "embeddedobj" : 1,
 263             "embedserv" : 1,
 264             "eventattacher" : 1,
 265             "extensions" : 1,
 266             "external" : 1,
 267             "filter" : 1,
 268             "forms" : 1,
 269             "formula" : 1,
 270             "fpicker" : 1,
 271             "framework" : 1,
 272             "helpcompiler" : 1,
 273             "hwpfilter" : 1,
 274             "i18npool" : 0, #
 275             "i18nlangtag" : 1,
 276             "i18nutil" : 1,
 277             "idl" : 1,
 278             "idlc" : 1,
 279             "include" : 0, #
 280             "io" : 1,
 281             "javaunohelper" : 1,
 282             "jvmaccess" : 1,
 283             "jvmfwk" : 1,
 284             "l10ntools" : 1,
 285             "lingucomponent" : 1,
 286             "linguistic" : 1,
 287             "lotuswordpro" : 1,
 288             "mysqlc" : 1,
 289             "o3tl" : 1,
 290             "odk" : 1,
 291             "officecfg" : 1,
 292             "oox" : 1,
 293             "package" : 1,
 294             "postprocess" : 1,
 295             "pyuno" : 1,
 296             "registry" : 1,
 297             "remotebridges" : 1,
 298             "reportdesign" : 0, #
 299             "rsc" : 0, #
 300             "sal" : 1,
 301             "salhelper" : 1,
 302             "sax" : 1,
 303             "sc" : 0, #
 304             "scaddins" : 0, #
 305             "sccomp" : 1,
 306             "scripting" : 1,
 307             "sd" : 1,
 308             "sdext" : 1,
 309             "sfx2" : 0, #
 310             "shell" : 1,
 311             "setup_native" : 1,
 312             "sot" : 1,
 313             "slideshow" : 1,
 314             "smoketest" : 1,
 315             "solenv" : 1,
 316             "soltools" : 1,
 317             "starmath" : 1,
 318             "stoc" : 0, #
 319             "store" : 1,
 320             "svgio" : 1,
 321             "svl" : 1,
 322             "svtools" : 1,
 323             "svx" : 0, #
 324             "sw" : 0, #
 325             "test" : 1,
 326             "testtools" : 1,
 327             "toolkit" : 1,
 328             "tools" : 1,
 329             "touch" : 1,
 330             "tubes" : 1,
 331             "ucb" : 1,
 332             "ucbhelper" : 1,
 333             "unodevtools" : 1,
 334             "unotest" : 1,
 335             "unoidl" : 1,
 336             "unotools" : 1,
 337             "unoxml" : 1,
 338             "uui" : 1,
 339             "vbahelper" : 1,
 340             "vcl" : 1,
 341             "winaccessibility" : 1,
 342             "writerfilter" : 1,
 343             "writerperfect" : 1,
 344             "xmlhelp" : 1,
 345             "xmloff" : 1,
 346             "xmlreader" : 1,
 347             "xmlsecurity" : 1,
 348             "xmlscript" : 1,
 349         }
 350
 351         if not directory is '.':
 352             sys.stderr.write("Warning: pass an absolute path to the top-level in order to use the faster white-list search\n")
 353
 354         for path in lines:
 355             baseDir = self.first_elem(path)
 356
 357             # Support searching within sub directories
 358             if directory is '.':
 359                 self.check_file(path.strip())
 360             elif not baseDir in directory_whitelist:
 361                 print ("Missing path %s " % baseDir)
 362             elif directory_whitelist[baseDir] is 0:
 363 #                print ("Scan path %s " % baseDir)
 364                 self.check_file(path.strip())
 365
 366 try:
 367     Parser()
 368 except KeyboardInterrupt:
 369     print "Interrupted!"
 370     sys.exit(0)
 371
 372 # vim:set shiftwidth=4 softtabstop=4 expandtab: