2 ########################################################################
4 # Copyright (c) 2010 Jonas Jensen, Miklos Vajna
6 # Permission is hereby granted, free of charge, to any person
7 # obtaining a copy of this software and associated documentation
8 # files (the "Software"), to deal in the Software without
9 # restriction, including without limitation the rights to use,
10 # copy, modify, merge, publish, distribute, sublicense, and/or sell
11 # copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 # OTHER DEALINGS IN THE SOFTWARE.
27 ########################################################################
39 This parser extracts comments from source files, tries to guess
40 their language and then prints out the German ones.
# Set-up code of the parser object: prepares the strip set and the text_cat
# handle, parses the command line and then starts the scan right away.
# NOTE(review): the enclosing `def` line is not among the lines shown here;
# presumably this is the constructor -- confirm.
#
# Characters trimmed from both ends of comment fragments: ASCII punctuation,
# the space character and the newline character.
43 self.strip = string.punctuation + " \n"
# Handle returned by start_text_cat(); get_lang() writes to its stdin and
# reads from its stdout.
44 self.text_cat = self.start_text_cat()
45 parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
46 parser.add_argument("-f", "--filenames-only", action="store_true",
47 help="Only print the filenames of files containing German comments")
48 parser.add_argument("-v", "--verbose", action="store_true",
49 help="Turn on verbose mode (print only positives progress to stderr)")
50 parser.add_argument("-l", "--line-numbers", action="store_true",
51 help="Prints the filenames and line numbers only.")
52 parser.add_argument("-L", "--line-numbers-pos", action="store_true",
53 help="Prints the filenames and line numbers only (if positive).")
# Converted to int by argparse (default 0); check_file() compares it with the
# number of flagged comments of a file.
54 parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
55 help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
# Optional positional argument; falls back to the current directory ('.').
56 parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
57 self.args = parser.parse_args()
# The scan of the chosen directory is started from here, i.e. as part of the
# same set-up code that parsed the arguments.
58 self.check_source_files(self.args.directory)
# Generator over the comments found in `filename`.
#
# Yields (count, text) tuples: `text` is the comment content with the comment
# markers and surrounding punctuation/whitespace (self.strip) removed; `count`
# is consumed by check_file() as the line number to report.
# //-comments that stand alone on consecutive lines are collected in `buf` and
# emitted as one item; a /* ... */ comment is emitted as one item as well.
#
# NOTE(review): `sock`, `in_comment`, `buf`, `count` and the loop variable `i`
# are assigned on lines that are not among the ones shown here -- verify their
# initial values and the point where `count` is advanced against the full file.
60 def get_comments(self, filename):
62 Extracts the source code comments.
65 print("processing file '%s'...\n" % filename)
67 # add an empty line to trigger the output of collected oneliner
69 lines = sock.readlines() + ["\n"]
76 if "//" in i and not in_comment:
77 # if we find a new //-style comment, then we
78 # just append it to a previous one if: there is
79 # only whitespace before the // mark that is
80 # necessary to make comments longer, giving
81 # more reliable output
# Keep only the text in front of the // marker and strip it; an empty result
# means nothing but whitespace/punctuation precedes the comment on this line.
82 if not len(re.sub("(.*)//.*", r"\1", i).strip(self.strip)):
# Text after the marker (and one optional following space), stripped.
83 s = re.sub(".*// ?", "", i).strip(self.strip)
87 # otherwise it's an independent //-style comment in the next line
88 yield (count, "\n ".join(buf))
# Start a fresh group that contains only the comment of the current line.
89 buf = [re.sub(".*// ?", "", i.strip(self.strip))]
90 elif "//" not in i and not in_comment and len(buf) > 0:
91 # first normal line after a // block
92 yield (count, "\n ".join(buf))
94 elif "/*" in i and "*/" not in i and not in_comment:
95 # start of a real multiline comment
# Drop everything up to and including the opening "/*" (and any extra '*').
97 s = re.sub(r".*/\*+", "", i.strip(self.strip))
99 buf.append(s.strip(self.strip))
100 elif in_comment and not "*/" in i:
101 # in multiline comment
# Remove leading spaces and '|' characters plus one optional '*', i.e. the
# decoration commonly put in front of the lines of a block comment.
102 s = re.sub(r"^( |\|)*\*?", "", i)
# Lines that are empty after stripping are not added to the comment.
103 if len(s.strip(self.strip)):
104 buf.append(s.strip(self.strip))
105 elif "*/" in i and in_comment:
106 # end of multiline comment
# Keep only the text in front of the closing "*/" marker.
108 s = re.sub(r"\*+/.*", "", i.strip(self.strip))
111 yield (count, "\n ".join(buf))
113 elif "/*" in i and "*/" in i:
114 # c-style oneliner comment
# The captured group is the text between "/*" and "*/" on this line.
115 yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
# Launches the external text_cat program as a child process with piped stdin
# and stdout.
# The program and its "LM" directory are given as relative paths, hence the
# preceding change into the directory that contains the running script
# (derived from sys.argv[0]).
# NOTE(review): the return statement is not among the lines shown here; the
# caller stores the result in self.text_cat and uses its .stdin/.stdout, so
# presumably the Popen object is returned. Also confirm whether the previous
# working directory is restored afterwards.
118 def start_text_cat(self):
120 # change to our directory
121 os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
# Both pipes are opened without a text mode, so data is exchanged as bytes.
122 sock = subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
# Asks the running text_cat process for the language of `s`.
# `s` (a str) is sent UTF-8 encoded, followed by a newline; one line of the
# answer is read back and stripped of surrounding whitespace.
# The answer comes from a binary pipe and is therefore bytes -- is_german()
# compares it with b"german".
# NOTE(review): the return statement is not among the lines shown here;
# presumably `lang` is returned -- confirm.
126 def get_lang(self, s):
127 """ the output is 'german' or 'english' or 'german or english'. When
128 unsure, just don't warn, there are strings where you just can't
129 determine the results reliably, like '#110680#' """
131 self.text_cat.stdin.write(bytes(s, 'utf-8'))
132 self.text_cat.stdin.write(bytes("\n", 'utf-8'))
# Flush so the request actually reaches the child before we block on its reply.
133 self.text_cat.stdin.flush()
# Exactly one line is read per request; is_german() replaces embedded newlines
# in `s` beforehand -- presumably so that one request yields one answer line.
134 lang = self.text_cat.stdout.readline().strip()
# Tells whether text_cat classifies `s` as German.
# Only the exact answer b"german" counts; any other answer (for example the
# mixed 'german or english' mentioned in get_lang) gives False.
137 def is_german(self, s):
139 determines if a string is German or not
141 # for short strings we can't do reliable recognition, so skip
142 # short strings and less than 4 words
# Multi-line comments are flattened to a single line before classification.
143 s = s.replace('\n', ' ')
# Too little text: fewer than 32 characters or fewer than 4 whitespace-separated
# words. NOTE(review): the statement executed in this case is not among the
# lines shown here; presumably it returns False -- confirm.
144 if len(s) < 32 or len(s.split()) < 4:
146 return self.get_lang(s) == b"german"
# Runs the German-comment detection on the file `path` and reports the result
# according to the command-line flags:
#   * --line-numbers / --line-numbers-pos: collect the line numbers of all
#     German comments and print a per-file summary;
#   * neither of those and not --filenames-only: print one
#     "path:line: comment" entry per German comment;
#   * otherwise: filenames-only output (see the comments at the end).
# NOTE(review): several lines of this method (the header of the nested helper,
# some branch headers, the chunking loop) are not among the lines shown here;
# the comments below describe the visible statements only.
148 def check_file(self, path):
150 checks each comment in a file
# The next lines belong to the helper that is called as tab_calc(path) further
# down; its result is multiplied with "\t" to pad the output after the path.
# `padding` is assigned on a line that is not shown here.
153 START = 40 #Default of 10 tabs
154 if len(path) >= START:
156 diff = START - len(path)
# `/` produces a float in Python 3; the caller converts the result with int().
161 return (diff/4)+padding
163 if self.args.line_numbers or self.args.line_numbers_pos:
166 for linenum, s in self.get_comments(path):
167 if self.is_german(s):
168 path_linenums.append(linenum)
# The file only counts as a hit when it has more flagged comments than the
# --threshold value.
169 valid = len(path_linenums) > int(self.args.threshold)
170 if self.args.line_numbers:
171 print("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
173 if self.args.line_numbers_pos:
174 print("%s ... %s positives\n" % (path, str(len(path_linenums))))
# Width estimate (path length plus 4 characters per line number): beyond 75
# the path is printed on a line of its own, followed by the numbers in chunks.
176 if len(path) + (len(path_linenums)*4) > 75:
177 print("%s:\n" % path)
# Move the first remaining line number into the current chunk; list.remove()
# deletes the first matching value.
183 numline.append(path_linenums[0])
184 path_linenums.remove(path_linenums[0])
# Convert to strings so the chunk can be joined with commas.
188 numline = [str(i) for i in numline]
189 print("%s%s" % (TABS, ",".join(numline)))
191 if self.args.line_numbers:
192 path_linenums = [str(i) for i in path_linenums]
193 print("%s:%s%s" % (path, "\t"*int(tab_calc(path)), ",".join(path_linenums)))
195 elif not self.args.filenames_only:
196 for linenum, s in self.get_comments(path):
197 if self.is_german(s):
198 print("%s:%s: %s" % (path, linenum, s))
201 for linenum, s in self.get_comments(path):
202 if self.is_german(s):
203 # Make sure we print each filename only once
205 # Print the filenames
# Helper used by check_source_files() to obtain the key that is looked up in
# the directory allowlist.
# NOTE(review): only two statements are shown here (no loop header, no return
# statement); the description of the intermediate steps below is limited to
# what they do -- confirm the termination condition and the returned value
# against the full file.
209 def first_elem(self, path):
211 Returns the root directory in our repo of a given path, so we can check against the allowlist.
# Start from the directory part of `path` (file name removed).
213 lastElem = os.path.dirname(path)
# os.path.split(...)[0] is the parent of `lastElem`, i.e. one level further up.
216 nextElem = os.path.split(lastElem)[0]
# Scans the git-tracked source files below `directory` and runs check_file()
# on them, printing the number of checked files at the end.
# When `directory` contains ".git/config" it is treated as the top-level
# project directory and the per-directory allowlist below decides which files
# are checked.
# NOTE(review): the lines that set the global-scan flag, change the working
# directory, initialise `num_checked` and open the loop over the file list are
# not among the lines shown here.
223 def check_source_files(self, directory):
225 checks each _tracked_ file in a directory recursively
228 # top-level project directory -> use allowlist.
230 if os.path.exists(directory + "/.git/config"):
233 # Change into the given dir, so "git ls-tree" does work.
# Lists the files tracked in HEAD and keeps those ending in .c, .cc, .cpp,
# .cxx, .h, .hxx or .mm; the filtering is done by an external grep through a
# shell pipeline, so both git and grep have to be available.
236 sock = os.popen(r"git ls-tree -r HEAD --name-only | grep -E '\.(c|cc|cpp|cxx|h|hxx|mm)$'")
237 lines = sock.readlines()
240 # Helps to speedup a global scan
# Values as used by the dispatch further down: 0 = check the files of this
# directory, 1 = skip it (message not printed yet), 2 = skip it (message
# already printed).
241 directory_allowlist = {
261 "compilerplugins" : 1,
297 "libreofficekit" : 1,
298 "lingucomponent" : 1,
354 "winaccessibility" : 1,
364 print("Scanning all files globally:")
365 elif directory == '.':
366 print("Scanning all files in our current directory:")
368 print("Scanning all files in", directory + ":")
# Key for the allowlist lookup, derived from the path of the current file.
373 baseDir = self.first_elem(path)
374 # If we have a globalscan use the allowlist.
# A directory that is missing from the allowlist is reported on stderr.
376 if not baseDir in directory_allowlist:
377 sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
379 elif directory_allowlist[baseDir] == 0:
# `path` comes from readlines() and still carries its trailing newline.
380 self.check_file(path.strip())
381 num_checked = num_checked + 1
382 elif directory_allowlist[baseDir] == 1:
383 sys.stderr.write("Skipping excluded directory %s\n" % baseDir)
# Switch to state 2 so the skip message is printed only once per directory.
384 directory_allowlist[baseDir] = 2
# NOTE(review): presumably the branch taken when no global scan is active, in
# which every listed file is checked -- confirm against the full file.
386 self.check_file(path.strip())
387 num_checked = num_checked + 1
389 print("Scanned %s files\n" % num_checked)
# Script-level handler: when the run is interrupted from the keyboard (Ctrl-C)
# a short notice is printed.
# NOTE(review): the matching `try:` and the statement it guards are not among
# the lines shown here.
393 except KeyboardInterrupt:
394 print("Interrupted!")
397 # vim:set shiftwidth=4 softtabstop=4 expandtab: