bin/find-german-comments

   1 #!/usr/bin/env python3
   2 ########################################################################
   3 #
   4 #  Copyright (c) 2010 Jonas Jensen, Miklos Vajna
   5 #
   6 #  Permission is hereby granted, free of charge, to any person
   7 #  obtaining a copy of this software and associated documentation
   8 #  files (the "Software"), to deal in the Software without
   9 #  restriction, including without limitation the rights to use,
  10 #  copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 #  copies of the Software, and to permit persons to whom the
  12 #  Software is furnished to do so, subject to the following
  13 #  conditions:
  14 #
  15 #  The above copyright notice and this permission notice shall be
  16 #  included in all copies or substantial portions of the Software.
  17 #
  18 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20 #  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21 #  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24 #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25 #  OTHER DEALINGS IN THE SOFTWARE.
  26 #
  27 ########################################################################
  28
  29
  30 import sys
  31 import re
  32 import subprocess
  33 import os
  34 import argparse
  35 import string
  36
  37 class Parser:
  38     """
  39     This parser extracts comments from source files, tries to guess
  40     their language and then prints out the German ones.
  41     """
  42     def __init__(self):
  43         self.strip = string.punctuation + " \n"
  44         self.text_cat = self.start_text_cat()
  45         parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
  46         parser.add_argument("-f", "--filenames-only", action="store_true",
  47             help="Only print the filenames of files containing German comments")
  48         parser.add_argument("-v", "--verbose", action="store_true",
  49             help="Turn on verbose mode (print only positives progress to stderr)")
  50         parser.add_argument("-l", "--line-numbers", action="store_true",
  51             help="Prints the filenames and line numbers only.")
  52         parser.add_argument("-L", "--line-numbers-pos", action="store_true",
  53             help="Prints the filenames and line numbers only (if positive).")
  54         parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
  55             help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
  56         parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
  57         self.args = parser.parse_args()
  58         self.check_source_files(self.args.directory)
  59
  60     def get_comments(self, filename):
  61         """
  62         Extracts the source code comments.
  63         """
  64         if self.args.verbose:
  65             print("processing file '%s'...\n" % filename)
  66         sock = open(filename)
  67         # add an empty line to trigger the output of collected oneliner
  68         # comment group
  69         lines = sock.readlines() + ["\n"]
  70         sock.close()
  71
  72         in_comment = False
  73         buf = []
  74         count = 1
  75         for i in lines:
  76             if "//" in i and not in_comment:
  77                 # if we find a new //-style comment, then we
  78                 # just append it to a previous one if: there is
  79                 # only whitespace before the // mark that is
  80                 # necessary to make comments longer, giving
  81                 # more reliable output
  82                 if not len(re.sub("(.*)//.*", r"\1", i).strip(self.strip)):
  83                     s = re.sub(".*// ?", "", i).strip(self.strip)
  84                     if len(s):
  85                         buf.append(s)
  86                 else:
  87                     # otherwise it's an independent //-style comment in the next line
  88                     yield (count, "\n    ".join(buf))
  89                     buf = [re.sub(".*// ?", "", i.strip(self.strip))]
  90             elif "//" not in i and not in_comment and len(buf) > 0:
  91                 # first normal line after a // block
  92                 yield (count, "\n    ".join(buf))
  93                 buf = []
  94             elif "/*" in i and "*/" not in i and not in_comment:
  95                 # start of a real multiline comment
  96                 in_comment = True
  97                 s = re.sub(r".*/\*+", "", i.strip(self.strip))
  98                 if len(s):
  99                     buf.append(s.strip(self.strip))
 100             elif in_comment and not "*/" in i:
 101                 # in multiline comment
 102                 s = re.sub(r"^( |\|)*\*?", "", i)
 103                 if len(s.strip(self.strip)):
 104                     buf.append(s.strip(self.strip))
 105             elif "*/" in i and in_comment:
 106                 # end of multiline comment
 107                 in_comment = False
 108                 s = re.sub(r"\*+/.*", "", i.strip(self.strip))
 109                 if len(s):
 110                     buf.append(s)
 111                 yield (count, "\n    ".join(buf))
 112                 buf = []
 113             elif "/*" in i and "*/" in i:
 114                 # c-style oneliner comment
 115                 yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
 116             count += 1
 117
 118     def start_text_cat(self):
 119         cwd = os.getcwd()
 120         # change to our directory
 121         os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
 122         sock = subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 123         os.chdir(cwd)
 124         return sock
 125
 126     def get_lang(self, s):
 127         """ the output is 'german' or 'english' or 'german or english'. When
 128         unsure, just don't warn, there are strings where you just can't
 129         determine the results reliably, like '#110680#' """
 130
 131         self.text_cat.stdin.write(bytes(s, 'utf-8'))
 132         self.text_cat.stdin.write(bytes("\n", 'utf-8'))
 133         self.text_cat.stdin.flush()
 134         lang = self.text_cat.stdout.readline().strip()
 135         return lang
 136
 137     def is_german(self, s):
 138         """
 139         determines if a string is German or not
 140         """
 141         # for short strings we can't do reliable recognition, so skip
 142         # short strings and less than 4 words
 143         s = s.replace('\n', ' ')
 144         if len(s) < 32 or len(s.split()) < 4:
 145             return False
 146         return self.get_lang(s) == b"german"
 147
 148     def check_file(self, path):
 149         """
 150         checks each comment in a file
 151         """
 152         def tab_calc(path):
 153             START = 40 #Default of 10 tabs
 154             if len(path) >= START:
 155                 return 1
 156             diff = START - len(path)
 157             if (diff % 4) != 0:
 158                 padding = 1
 159             else:
 160                 padding = 0
 161             return (diff/4)+padding
 162
 163         if self.args.line_numbers or self.args.line_numbers_pos:
 164             TABS = "\t"*10
 165             path_linenums = []
 166             for linenum, s in self.get_comments(path):
 167                 if self.is_german(s):
 168                     path_linenums.append(linenum)
 169             valid = len(path_linenums) > int(self.args.threshold)
 170             if self.args.line_numbers:
 171                 print("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
 172             if valid:
 173                 if self.args.line_numbers_pos:
 174                     print("%s ... %s positives\n" % (path, str(len(path_linenums))))
 175                     return
 176                 if len(path) + (len(path_linenums)*4) > 75:
 177                     print("%s:\n" % path)
 178                     while path_linenums:
 179                         i = 0
 180                         numline = []
 181                         while i < 10:
 182                             try:
 183                                 numline.append(path_linenums[0])
 184                                 path_linenums.remove(path_linenums[0])
 185                             except IndexError:
 186                                 i = 10
 187                             i += 1
 188                         numline = [str(i) for i in numline]
 189                         print("%s%s" % (TABS, ",".join(numline)))
 190                 else:
 191                     if self.args.line_numbers:
 192                         path_linenums = [str(i) for i in path_linenums]
 193                         print("%s:%s%s" % (path, "\t"*int(tab_calc(path)), ",".join(path_linenums)))
 194
 195         elif not self.args.filenames_only:
 196             for linenum, s in self.get_comments(path):
 197                 if self.is_german(s):
 198                     print("%s:%s: %s" % (path, linenum, s))
 199         else:
 200             fnames = set([])
 201             for linenum, s in self.get_comments(path):
 202                 if self.is_german(s):
 203                     # Make sure we print each filename only once
 204                     fnames.add(path)
 205             # Print the filenames
 206             for f in fnames:
 207                 print(f)
 208
 209     def first_elem(self, path):
 210         """
 211         Returns the root directory in our repo of a given path, so we can check against the allowlist.
 212         """
 213         lastElem = os.path.dirname(path)
 214         done = False
 215         while not done:
 216             nextElem = os.path.split(lastElem)[0]
 217             if nextElem != '':
 218                 lastElem = nextElem
 219             else:
 220                 done = True
 221         return lastElem
 222
 223     def check_source_files(self, directory):
 224         """
 225         checks each _tracked_ file in a directory recursively
 226         """
 227
 228         # top-level project directory -> use allowlist.
 229         globalscan = False
 230         if os.path.exists(directory + "/.git/config"):
 231            globalscan = True
 232
 233         # Change into the given dir, so "git ls-tree" does work.
 234         os.chdir(directory)
 235
 236         sock = os.popen(r"git ls-tree -r HEAD --name-only | grep -E '\.(c|cc|cpp|cxx|h|hxx|mm)$'")
 237         lines = sock.readlines()
 238         sock.close()
 239
 240         # Helps to speedup a global scan
 241         directory_allowlist = {
 242             "ure" : 1,
 243             "ios" : 1,
 244             "bean" : 1,
 245             "apple_remote" : 1,
 246             "UnoControls" : 1,
 247             "accessibility" : 1,
 248             "android" : 1,
 249             "animations" : 1,
 250             "avmedia" : 1,
 251             "basctl" : 1,
 252             "basegfx" : 1,
 253             "basic" : 1,
 254             "binaryurp" : 1,
 255             "bridges" : 1,
 256             "canvas" : 1,
 257             "chart2" : 1,
 258             "cli_ure" : 1,
 259             "codemaker" : 1,
 260             "comphelper" : 1,
 261             "compilerplugins" : 1,
 262             "configmgr" : 1,
 263             "connectivity" : 1,
 264             "cppcanvas" : 1,
 265             "cppu" : 1,
 266             "cppuhelper" : 1,
 267             "cpputools" : 1,
 268             "cui" : 1,
 269             "dbaccess" : 1,
 270             "desktop" : 1,
 271             "drawinglayer" : 1,
 272             "editeng" : 1,
 273             "embeddedobj" : 1,
 274             "embedserv" : 1,
 275             "eventattacher" : 1,
 276             "extensions" : 1,
 277             "external" : 1,
 278             "filter" : 1,
 279             "forms" : 1,
 280             "formula" : 1,
 281             "fpicker" : 1,
 282             "framework" : 1,
 283             "helpcompiler" : 1,
 284             "hwpfilter" : 1,
 285             "i18npool" : 1,
 286             "i18nlangtag" : 1,
 287             "i18nutil" : 1,
 288             "idl" : 1,
 289             "idlc" : 1,
 290             "include" : 1,
 291             "io" : 1,
 292             "javaunohelper" : 1,
 293             "jvmaccess" : 1,
 294             "jvmfwk" : 1,
 295             "jurt" : 1,
 296             "l10ntools" : 1,
 297             "libreofficekit" : 1,
 298             "lingucomponent" : 1,
 299             "linguistic" : 1,
 300             "lotuswordpro" : 1,
 301             "mysqlc" : 1,
 302             "o3tl" : 1,
 303             "odk" : 1,
 304             "officecfg" : 1,
 305             "opencl" : 1,
 306             "oox" : 1,
 307             "package" : 1,
 308             "postprocess" : 1,
 309             "pyuno" : 1,
 310             "registry" : 1,
 311             "remotebridges" : 1,
 312             "reportdesign" : 1,
 313             "rsc" : 1,
 314             "sal" : 1,
 315             "salhelper" : 1,
 316             "sax" : 1,
 317             "sc" : 1,
 318             "scaddins" : 1,
 319             "sccomp" : 1,
 320             "scripting" : 1,
 321             "sd" : 1,
 322             "sdext" : 1,
 323             "sfx2" : 1,
 324             "shell" : 1,
 325             "setup_native" : 1,
 326             "sot" : 1,
 327             "slideshow" : 1,
 328             "smoketest" : 1,
 329             "solenv" : 1,
 330             "soltools" : 1,
 331             "starmath" : 1,
 332             "stoc" : 1,
 333             "store" : 1,
 334             "svgio" : 1,
 335             "svl" : 1,
 336             "svtools" : 1,
 337             "svx" : 1,
 338             "sw" : 1,
 339             "test" : 1,
 340             "testtools" : 1,
 341             "toolkit" : 1,
 342             "tools" : 1,
 343             "touch" : 1,
 344             "ucb" : 1,
 345             "ucbhelper" : 1,
 346             "unodevtools" : 1,
 347             "unotest" : 1,
 348             "unoidl" : 1,
 349             "unotools" : 1,
 350             "unoxml" : 1,
 351             "uui" : 1,
 352             "vbahelper" : 1,
 353             "vcl" : 1,
 354             "winaccessibility" : 1,
 355             "writerperfect" : 1,
 356             "xmlhelp" : 1,
 357             "xmloff" : 1,
 358             "xmlreader" : 1,
 359             "xmlsecurity" : 1,
 360             "xmlscript" : 1,
 361         }
 362
 363         if globalscan:
 364             print("Scanning all files globally:")
 365         elif directory == '.':
 366             print("Scanning all files in our current directory:")
 367         else:
 368             print("Scanning all files in", directory + ":")
 369
 370         num_checked = 0
 371
 372         for path in lines:
 373             baseDir = self.first_elem(path)
 374             # If we have a globalscan use the allowlist.
 375             if globalscan:
 376                 if not baseDir in directory_allowlist:
 377                     sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
 378                     sys.exit(1)
 379                 elif directory_allowlist[baseDir] == 0:
 380                     self.check_file(path.strip())
 381                     num_checked = num_checked + 1
 382                 elif directory_allowlist[baseDir] == 1:
 383                     sys.stderr.write("Skipping excluded directory %s\n" % baseDir)
 384                     directory_allowlist[baseDir] = 2
 385             elif not globalscan:
 386                 self.check_file(path.strip())
 387                 num_checked = num_checked + 1
 388
 389         print("Scanned %s files\n" % num_checked)
 390
 391 try:
 392     Parser()
 393 except KeyboardInterrupt:
 394     print("Interrupted!")
 395     sys.exit(0)
 396
 397 # vim:set shiftwidth=4 softtabstop=4 expandtab: