bin/find-german-comments

   1 #!/usr/bin/env python3
   2 ########################################################################
   3 #
   4 #  Copyright (c) 2010 Jonas Jensen, Miklos Vajna
   5 #
   6 #  Permission is hereby granted, free of charge, to any person
   7 #  obtaining a copy of this software and associated documentation
   8 #  files (the "Software"), to deal in the Software without
   9 #  restriction, including without limitation the rights to use,
  10 #  copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 #  copies of the Software, and to permit persons to whom the
  12 #  Software is furnished to do so, subject to the following
  13 #  conditions:
  14 #
  15 #  The above copyright notice and this permission notice shall be
  16 #  included in all copies or substantial portions of the Software.
  17 #
  18 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20 #  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21 #  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24 #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25 #  OTHER DEALINGS IN THE SOFTWARE.
  26 #
  27 ########################################################################
  28
  29
  30 import sys
  31 import re
  32 import subprocess
  33 import os
  34 import argparse
  35 import string
  36
  37 class Parser:
  38     """
  39     This parser extracts comments from source files, tries to guess
  40     their language and then prints out the German ones.
  41     """
  42     def __init__(self):
  43         self.strip = string.punctuation + " \n"
  44         self.text_cat = self.start_text_cat()
  45         parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
  46         parser.add_argument("-f", "--filenames-only", action="store_true",
  47             help="Only print the filenames of files containing German comments")
  48         parser.add_argument("-v", "--verbose", action="store_true",
  49             help="Turn on verbose mode (print only positives progress to stderr)")
  50         parser.add_argument("-l", "--line-numbers", action="store_true",
  51             help="Prints the filenames and line numbers only.")
  52         parser.add_argument("-L", "--line-numbers-pos", action="store_true",
  53             help="Prints the filenames and line numbers only (if positive).")
  54         parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
  55             help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
  56         parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
  57         self.args = parser.parse_args()
  58         self.check_source_files(self.args.directory)
  59
  60     def get_comments(self, filename):
  61         """
  62         Extracts the source code comments.
  63         """
  64         linenum = 0
  65         if self.args.verbose:
  66             print("processing file '%s'...\n" % filename)
  67         sock = open(filename)
  68         # add an empty line to trigger the output of collected oneliner
  69         # comment group
  70         lines = sock.readlines() + ["\n"]
  71         sock.close()
  72
  73         in_comment = False
  74         buf = []
  75         count = 1
  76         for i in lines:
  77             if "//" in i and not in_comment:
  78                 # if we find a new //-style comment, then we
  79                 # just append it to a previous one if: there is
  80                 # only whitespace before the // mark that is
  81                 # necessary to make comments longer, giving
  82                 # more reliable output
  83                 if not len(re.sub("(.*)//.*", r"\1", i).strip(self.strip)):
  84                     s = re.sub(".*// ?", "", i).strip(self.strip)
  85                     if len(s):
  86                         buf.append(s)
  87                 else:
  88                     # otherwise it's an independent //-style comment in the next line
  89                     yield (count, "\n    ".join(buf))
  90                     buf = [re.sub(".*// ?", "", i.strip(self.strip))]
  91             elif "//" not in i and not in_comment and len(buf) > 0:
  92                 # first normal line after a // block
  93                 yield (count, "\n    ".join(buf))
  94                 buf = []
  95             elif "/*" in i and "*/" not in i and not in_comment:
  96                 # start of a real multiline comment
  97                 in_comment = True
  98                 linenum = count
  99                 s = re.sub(".*/\*+", "", i.strip(self.strip))
 100                 if len(s):
 101                     buf.append(s.strip(self.strip))
 102             elif in_comment and not "*/" in i:
 103                 # in multiline comment
 104                 s = re.sub("^( |\|)*\*?", "", i)
 105                 if len(s.strip(self.strip)):
 106                     buf.append(s.strip(self.strip))
 107             elif "*/" in i and in_comment:
 108                 # end of multiline comment
 109                 in_comment = False
 110                 s = re.sub(r"\*+/.*", "", i.strip(self.strip))
 111                 if len(s):
 112                     buf.append(s)
 113                 yield (count, "\n    ".join(buf))
 114                 buf = []
 115             elif "/*" in i and "*/" in i:
 116                 # c-style oneliner comment
 117                 yield (count, re.sub(".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
 118             count += 1
 119
 120     def start_text_cat(self):
 121         cwd = os.getcwd()
 122         # change to our directory
 123         os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
 124         sock = subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 125         os.chdir(cwd)
 126         return sock
 127
 128     def get_lang(self, s):
 129         """ the output is 'german' or 'english' or 'german or english'. when
 130         unsure, just don't warn, there are strings where you just can't
 131         teremine the results reliably, like '#110680#' """
 132
 133         self.text_cat.stdin.write(bytes(s, 'utf-8'))
 134         self.text_cat.stdin.write(bytes("\n", 'utf-8'))
 135         self.text_cat.stdin.flush()
 136         lang = self.text_cat.stdout.readline().strip()
 137         return lang
 138
 139     def is_german(self, s):
 140         """
 141         determines if a string is German or not
 142         """
 143         # for short strings we can't do reliable recognition, so skip
 144         # short strings and less than 4 words
 145         s = s.replace('\n', ' ')
 146         if len(s) < 32 or len(s.split()) < 4:
 147             return False
 148         return self.get_lang(s) == b"german"
 149
 150     def check_file(self, path):
 151         """
 152         checks each comment in a file
 153         """
 154         def tab_calc(path):
 155             START = 40 #Default of 10 tabs
 156             if len(path) >= START:
 157                 return 1
 158             diff = START - len(path)
 159             if diff % 4 is not 0:
 160                 padding = 1
 161             else:
 162                 padding = 0
 163             return (diff/4)+padding
 164
 165         if self.args.line_numbers or self.args.line_numbers_pos:
 166             TABS = "\t"*10
 167             path_linenums = []
 168             for linenum, s in self.get_comments(path):
 169                 if self.is_german(s):
 170                     path_linenums.append(linenum)
 171             valid = len(path_linenums) > int(self.args.threshold)
 172             if self.args.line_numbers:
 173                 print("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
 174             if valid:
 175                 if self.args.line_numbers_pos:
 176                     print("%s ... %s positives\n" % (path, str(len(path_linenums))))
 177                     return
 178                 if len(path) + (len(path_linenums)*4) > 75:
 179                     print("%s:\n" % path)
 180                     while path_linenums:
 181                         i = 0
 182                         numline = []
 183                         while i < 10:
 184                             try:
 185                                 numline.append(path_linenums[0])
 186                                 path_linenums.remove(path_linenums[0])
 187                             except IndexError:
 188                                 i = 10
 189                             i += 1
 190                         numline = [str(i) for i in numline]
 191                         print("%s%s" % (TABS, ",".join(numline)))
 192                 else:
 193                     if self.args.line_numbers:
 194                         path_linenums = [str(i) for i in path_linenums]
 195                         print("%s:%s%s" % (path, "\t"*int(tab_calc(path)), ",".join(path_linenums)))
 196
 197         elif not self.args.filenames_only:
 198             for linenum, s in self.get_comments(path):
 199                 if self.is_german(s):
 200                     print("%s:%s: %s" % (path, linenum, s))
 201         else:
 202             fnames = set([])
 203             for linenum, s in self.get_comments(path):
 204                 if self.is_german(s):
 205                     # Make sure we print each filename only once
 206                     fnames.add(path)
 207             # Print the filenames
 208             for f in fnames:
 209                 print(f)
 210
 211     def first_elem(self, path):
 212         """
 213         Returns the root directory in our repo of a given path, so we can check against the whitelist.
 214         """
 215         lastElem = os.path.dirname(path)
 216         done = False
 217         while not done:
 218             nextElem = os.path.split(lastElem)[0]
 219             if nextElem is not '':
 220                 lastElem = nextElem
 221             else:
 222                 done = True
 223         return lastElem
 224
 225     def check_source_files(self, directory):
 226         """
 227         checks each _tracked_ file in a directory recursively
 228         """
 229
 230         # top-level project directory -> use whitelist.
 231         globalscan = False
 232         if os.path.exists(directory + "/.git/config"):
 233            globalscan = True
 234
 235         # Change into the given dir, so "git ls-tree" does work.
 236         os.chdir(directory)
 237
 238         sock = os.popen(r"git ls-tree -r HEAD --name-only |egrep '\.(c|cc|cpp|cxx|h|hxx|mm)$'")
 239         lines = sock.readlines()
 240         sock.close()
 241
 242         # Helps to speedup a global scan
 243         directory_whitelist = {
 244             "ure" : 1,
 245             "ios" : 1,
 246             "bean" : 1,
 247             "apple_remote" : 1,
 248             "UnoControls" : 1,
 249             "accessibility" : 1,
 250             "android" : 1,
 251             "animations" : 1,
 252             "avmedia" : 1,
 253             "basctl" : 1,
 254             "basegfx" : 1,
 255             "basic" : 1,
 256             "binaryurp" : 1,
 257             "bridges" : 1,
 258             "canvas" : 1,
 259             "chart2" : 1,
 260             "cli_ure" : 1,
 261             "codemaker" : 1,
 262             "comphelper" : 1,
 263             "compilerplugins" : 1,
 264             "configmgr" : 1,
 265             "connectivity" : 1,
 266             "cppcanvas" : 1,
 267             "cppu" : 1,
 268             "cppuhelper" : 1,
 269             "cpputools" : 1,
 270             "cui" : 1,
 271             "dbaccess" : 1,
 272             "desktop" : 1,
 273             "drawinglayer" : 1,
 274             "dtrans" : 1,
 275             "editeng" : 1,
 276             "embeddedobj" : 1,
 277             "embedserv" : 1,
 278             "eventattacher" : 1,
 279             "extensions" : 1,
 280             "external" : 1,
 281             "filter" : 1,
 282             "forms" : 1,
 283             "formula" : 1,
 284             "fpicker" : 1,
 285             "framework" : 1,
 286             "helpcompiler" : 1,
 287             "hwpfilter" : 1,
 288             "i18npool" : 1,
 289             "i18nlangtag" : 1,
 290             "i18nutil" : 1,
 291             "idl" : 1,
 292             "idlc" : 1,
 293             "include" : 1,
 294             "io" : 1,
 295             "javaunohelper" : 1,
 296             "jvmaccess" : 1,
 297             "jvmfwk" : 1,
 298             "jurt" : 1,
 299             "l10ntools" : 1,
 300             "libreofficekit" : 1,
 301             "lingucomponent" : 1,
 302             "linguistic" : 1,
 303             "lotuswordpro" : 1,
 304             "mysqlc" : 1,
 305             "o3tl" : 1,
 306             "odk" : 1,
 307             "officecfg" : 1,
 308             "onlineupdate" : 1,
 309             "opencl" : 1,
 310             "oox" : 1,
 311             "package" : 1,
 312             "postprocess" : 1,
 313             "pyuno" : 1,
 314             "registry" : 1,
 315             "remotebridges" : 1,
 316             "reportdesign" : 1,
 317             "rsc" : 1,
 318             "sal" : 1,
 319             "salhelper" : 1,
 320             "sax" : 1,
 321             "sc" : 1,
 322             "scaddins" : 1,
 323             "sccomp" : 1,
 324             "scripting" : 1,
 325             "sd" : 1,
 326             "sdext" : 1,
 327             "sfx2" : 1,
 328             "shell" : 1,
 329             "setup_native" : 1,
 330             "sot" : 1,
 331             "slideshow" : 1,
 332             "smoketest" : 1,
 333             "solenv" : 1,
 334             "soltools" : 1,
 335             "starmath" : 1,
 336             "stoc" : 1,
 337             "store" : 1,
 338             "svgio" : 1,
 339             "svl" : 1,
 340             "svtools" : 1,
 341             "svx" : 1,
 342             "sw" : 1,
 343             "test" : 1,
 344             "testtools" : 1,
 345             "toolkit" : 1,
 346             "tools" : 1,
 347             "touch" : 1,
 348             "ucb" : 1,
 349             "ucbhelper" : 1,
 350             "unodevtools" : 1,
 351             "unotest" : 1,
 352             "unoidl" : 1,
 353             "unotools" : 1,
 354             "unoxml" : 1,
 355             "uui" : 1,
 356             "vbahelper" : 1,
 357             "vcl" : 1,
 358             "winaccessibility" : 1,
 359             "writerfilter" : 1,
 360             "writerperfect" : 1,
 361             "xmlhelp" : 1,
 362             "xmloff" : 1,
 363             "xmlreader" : 1,
 364             "xmlsecurity" : 1,
 365             "xmlscript" : 1,
 366         }
 367
 368         if globalscan:
 369             print("Scanning all files globally:")
 370         elif directory == '.':
 371             print("Scanning all files in our current directory:")
 372         else:
 373             print("Scanning all files in", directory + ":")
 374
 375         num_checked = 0
 376
 377         for path in lines:
 378             baseDir = self.first_elem(path)
 379             # If we have an globalscan use the whitelist.
 380             if globalscan:
 381                 if not baseDir in directory_whitelist:
 382                     sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
 383                     sys.exit(1)
 384                 elif directory_whitelist[baseDir] is 0:
 385                     self.check_file(path.strip())
 386                     num_checked = num_checked + 1
 387                 elif directory_whitelist[baseDir] is 1:
 388                     sys.stderr.write("Skipping whitelisted directory %s\n" % baseDir)
 389                     directory_whitelist[baseDir] = 2
 390             elif not globalscan:
 391                 self.check_file(path.strip())
 392                 num_checked = num_checked + 1
 393
 394         print("Scanned %s files\n" % num_checked)
 395
 396 try:
 397     Parser()
 398 except KeyboardInterrupt:
 399     print("Interrupted!")
 400     sys.exit(0)
 401
 402 # vim:set shiftwidth=4 softtabstop=4 expandtab: