2 ########################################################################
4 # Copyright (c) 2010 Jonas Jensen, Miklos Vajna
6 # Permission is hereby granted, free of charge, to any person
7 # obtaining a copy of this software and associated documentation
8 # files (the "Software"), to deal in the Software without
9 # restriction, including without limitation the rights to use,
10 # copy, modify, merge, publish, distribute, sublicense, and/or sell
11 # copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 # OTHER DEALINGS IN THE SOFTWARE.
27 ########################################################################
30 import sys
, re
, subprocess
, os
, optparse
, string
34 This parser extracts comments from source files, tries to guess
35 their language and then prints out the German ones.
38 self
.strip
= string
.punctuation
+ " \n"
39 self
.text_cat
= self
.start_text_cat()
40 op
= optparse
.OptionParser()
41 op
.set_usage("%prog [options] <rootdir>\n\n" +
42 "Searches for german comments in cxx/hxx source files inside a given root\n" +
43 "directory recursively.")
44 op
.add_option("-f", "--filenames-only", action
="store_true", dest
="filenames_only", default
=False,
45 help="Only print the filenames of files containing German comments")
46 op
.add_option("-v", "--verbose", action
="store_true", dest
="verbose", default
=False,
47 help="Turn on verbose mode (print only positives progress to stderr)")
48 op
.add_option("-l", "--line-numbers", action
="store_true", dest
="line_numbers", default
=False,
49 help="Prints the filenames and line numbers only.")
50 op
.add_option("-L", "--line-numbers-pos", action
="store_true", dest
="line_numbers_pos", default
=False,
51 help="Prints the filenames and line numbers only (if positive).")
52 op
.add_option("-t", "--threshold", action
="store", dest
="THRESHOLD", default
=0,
53 help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
54 self
.options
, args
= op
.parse_args()
59 self
.check_source_files(dir)
61 def get_comments(self
, filename
):
63 Extracts the source code comments.
66 if self
.options
.verbose
:
67 sys
.stderr
.write("processing file '%s'...\n" % filename
)
69 # add an empty line to trigger the output of collected oneliner
71 lines
= sock
.readlines() + ["\n"]
78 if "//" in i
and not in_comment
:
79 # if we find a new //-style comment, then we
80 # just append it to the previous one if there is
81 # only whitespace before the // mark; this is
82 # necessary to make comments longer, giving
83 # more reliable output
84 if not len(re
.sub("(.*)//.*", r
"\1", i
).strip(self
.strip
)):
85 s
= re
.sub(".*// ?", "", i
).strip(self
.strip
)
89 # otherwise it's an independent //-style comment in the next line
90 yield (count
, "\n ".join(buf
))
91 buf
= [re
.sub(".*// ?", "", i
.strip(self
.strip
))]
92 elif "//" not in i
and not in_comment
and len(buf
) > 0:
93 # first normal line after a // block
94 yield (count
, "\n ".join(buf
))
96 elif "/*" in i
and "*/" not in i
and not in_comment
:
97 # start of a real multiline comment
100 s
= re
.sub(".*/\*+", "", i
.strip(self
.strip
))
102 buf
.append(s
.strip(self
.strip
))
103 elif in_comment
and not "*/" in i
:
104 # in multiline comment
105 s
= re
.sub("^( |\|)*\*?", "", i
)
106 if len(s
.strip(self
.strip
)):
107 buf
.append(s
.strip(self
.strip
))
108 elif "*/" in i
and in_comment
:
109 # end of multiline comment
111 s
= re
.sub(r
"\*+/.*", "", i
.strip(self
.strip
))
114 yield (count
, "\n ".join(buf
))
116 elif "/*" in i
and "*/" in i
:
117 # c-style oneliner comment
118 yield (count
, re
.sub(".*/\*(.*)\*/.*", r
"\1", i
).strip(self
.strip
))
def start_text_cat(self):
    """Start the text_cat helper as a child process and return it.

    The helper lives in the ``text_cat`` directory next to this script
    (the directory of ``sys.argv[0]``).  It is started with that directory
    as its working directory so the relative ``text_cat/LM`` argument
    resolves, and with stdin/stdout connected to pipes so the caller can
    write to and read from it.

    Returns the ``subprocess.Popen`` object.
    """
    # NOTE(review): the visible code called os.chdir() on the whole process;
    # passing cwd= confines the directory change to the child, so later
    # relative paths used by this script still refer to the user's directory.
    # Confirm nothing else relied on the chdir side effect.
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    # Absolute executable path: the lookup of a relative program path with
    # respect to cwd= is platform dependent.
    executable = os.path.join(script_dir, "text_cat", "text_cat")
    return subprocess.Popen(
        [executable, "-s", "-d", "text_cat/LM"],
        cwd=script_dir,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)
def get_lang(self, s):
    """Ask the text_cat child process for the language of the string *s*.

    The string and a terminating newline are written to the child's stdin,
    the pipe is flushed, and one line is read back from its stdout.

    According to the original note, the output is 'german' or 'english' or
    'german or english'.  When unsure, just don't warn: there are strings
    where you can't determine the result reliably, like '#110680#'.

    Returns the reply line with surrounding whitespace stripped.
    """
    pipe = self.text_cat
    pipe.stdin.write(s)
    pipe.stdin.write("\n")
    # Flush before reading, otherwise the child may never see the request
    # and the readline() below would block.
    pipe.stdin.flush()
    # Hand the answer back to the caller; is_german() compares it to "german".
    return pipe.stdout.readline().strip()
def is_german(self, s):
    """Determine whether the string *s* is German.

    Newlines in *s* are replaced by spaces before any check.  Strings
    shorter than 32 characters or with fewer than 4 words are never
    reported, because reliable recognition is not possible on them.

    Returns True only when get_lang() answers exactly "german"; any other
    answer counts as not German.
    """
    flattened = s.replace('\n', ' ')
    # Too little text to classify: skip it instead of asking text_cat.
    if len(flattened) < 32 or len(flattened.split()) < 4:
        return False
    return self.get_lang(flattened) == "german"
151 def check_file(self
, path
):
153 checks each comment in a file
155 def tab_calc (string
):
156 START
= 40 #Default of 10 tabs
157 if len(string
) >= START
:
159 diff
= START
- len(string
)
160 if diff
% 4 is not 0:
164 return (diff
/4)+padding
166 if self
.options
.line_numbers
or self
.options
.line_numbers_pos
:
169 for linenum
, s
in self
.get_comments(path
):
170 if self
.is_german(s
):
171 path_linenums
.append(linenum
)
172 valid
= len(path_linenums
) > int(self
.options
.THRESHOLD
)
173 if self
.options
.line_numbers
:
174 sys
.stderr
.write("%s ... %s positives -- %s\n" % (path
, str(len(path_linenums
)), str(valid
)))
176 if self
.options
.line_numbers_pos
:
177 sys
.stderr
.write("%s ... %s positives\n" % (path
, str(len(path_linenums
))))
179 if len(path
) + (len(path_linenums
)*4) > 75:
181 while(path_linenums
):
186 numline
.append(path_linenums
[0])
187 path_linenums
.remove(path_linenums
[0])
191 numline
= [str(i
) for i
in numline
]
192 print "%s%s" % (TABS
, ",".join(numline
))
194 if self
.options
.line_numbers
:
195 path_linenums
= [str(i
) for i
in path_linenums
]
196 print "%s:%s%s" % (path
, "\t"*tab_calc(path
), ",".join(path_linenums
))
198 elif not self
.options
.filenames_only
:
199 for linenum
, s
in self
.get_comments(path
):
200 if self
.is_german(s
):
201 print "%s:%s: %s" % (path
, linenum
, s
)
204 for linenum
, s
in self
.get_comments(path
):
205 if self
.is_german(s
):
206 # Make sure we print each filename only once
208 # Print the filenames
def first_elem(self, path):
    """Return the top-most directory component of *path*.

    Starting from the directory part of *path*, the head returned by
    os.path.split() is taken repeatedly until nothing is left above it.
    For 'sw/source/core/doc.cxx' this yields 'sw'; for a path without a
    directory part it yields ''.  The caller uses the result as the key
    into its directory whitelist.
    """
    top = os.path.dirname(path)
    while True:
        parent = os.path.split(top)[0]
        # Compare by value, not identity ('is not' on a string literal is
        # implementation dependent).  Also stop when split() returns its
        # input unchanged (e.g. '/'), which would otherwise never become ''.
        if parent == '' or parent == top:
            return top
        top = parent
223 def check_source_files(self
, directory
):
225 checks each _tracked_ file in a directory recursively
227 sock
= os
.popen(r
"git ls-files '%s' |egrep '\.(c|h)xx$'" % directory
)
228 lines
= sock
.readlines()
231 # Helps to speedup a global scan
232 directory_whitelist
= {
249 "compilerplugins" : 1,
285 "lingucomponent" : 1,
298 "reportdesign" : 0, #
341 "winaccessibility" : 1,
351 if not directory
is '.':
352 sys
.stderr
.write("Warning: pass an absolute path to the top-level in order to use the faster white-list search\n")
355 baseDir
= self
.first_elem(path
)
357 # Support searching within sub directories
359 self
.check_file(path
.strip())
360 elif not baseDir
in directory_whitelist
:
361 print ("Missing path %s " % baseDir
)
362 elif directory_whitelist
[baseDir
] is 0:
363 # print ("Scan path %s " % baseDir)
364 self
.check_file(path
.strip())
368 except KeyboardInterrupt:
372 # vim:set shiftwidth=4 softtabstop=4 expandtab: