Branch libreoffice-5-0-4
[LibreOffice.git] / bin / find-german-comments
blob76ebe0d0d5dac84a278e4ea92a5fbce667cb23a6
1 #!/usr/bin/env python
2 ########################################################################
4 # Copyright (c) 2010 Jonas Jensen, Miklos Vajna
6 # Permission is hereby granted, free of charge, to any person
7 # obtaining a copy of this software and associated documentation
8 # files (the "Software"), to deal in the Software without
9 # restriction, including without limitation the rights to use,
10 # copy, modify, merge, publish, distribute, sublicense, and/or sell
11 # copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following
13 # conditions:
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 # OTHER DEALINGS IN THE SOFTWARE.
27 ########################################################################
30 import sys, re, subprocess, os, optparse, string
class Parser:
    """
    Extract comments from C/C++ source files, guess their language with the
    external ``text_cat`` helper and print out the German ones.

    Instantiating the class runs the whole tool: it starts ``text_cat``,
    parses the command line and scans the requested directory.
    """

    def __init__(self):
        # Characters removed from both ends of every comment fragment.
        self.strip = string.punctuation + " \n"
        self.text_cat = self.start_text_cat()
        op = optparse.OptionParser()
        op.set_usage("%prog [options] <rootdir>\n\n" +
            "Searches for german comments in cxx/hxx source files inside a given root\n" +
            "directory recursively.")
        op.add_option("-f", "--filenames-only", action="store_true", dest="filenames_only", default=False,
            help="Only print the filenames of files containing German comments")
        op.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
            help="Turn on verbose mode (print only positives progress to stderr)")
        op.add_option("-l", "--line-numbers", action="store_true", dest="line_numbers", default=False,
            help="Prints the filenames and line numbers only.")
        op.add_option("-L", "--line-numbers-pos", action="store_true", dest="line_numbers_pos", default=False,
            help="Prints the filenames and line numbers only (if positive).")
        op.add_option("-t", "--threshold", action="store", dest="THRESHOLD", default=0,
            help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
        self.options, args = op.parse_args()
        # Default to the current directory when no root directory is given.
        root = args[0] if args else "."
        self.check_source_files(root)

    def get_comments(self, filename):
        """
        Extract the source code comments of a file.

        Yields ``(line_number, text)`` tuples.  Consecutive ``//`` lines are
        merged into one comment so that the language guess has more text to
        work with.  The reported line number is the line on which the
        comment was emitted: the line following a ``//`` group, or the
        closing line of a ``/* ... */`` comment.
        """
        if self.options.verbose:
            sys.stderr.write("processing file '%s'...\n" % filename)
        # NOTE(review): on Python 3 this decodes with the locale encoding and
        # may raise UnicodeDecodeError on legacy-encoded sources -- confirm
        # which interpreter the tool is run with.
        with open(filename) as source:
            # add an empty line to trigger the output of a collected
            # one-liner comment group at the end of the file
            lines = source.readlines() + ["\n"]

        in_comment = False
        buf = []
        for count, line in enumerate(lines, 1):
            if "//" in line and not in_comment:
                # A new //-style comment is appended to the previous one
                # only if nothing but whitespace/punctuation precedes the
                # // mark; longer comments give more reliable output.
                if not len(re.sub(r"(.*)//.*", r"\1", line).strip(self.strip)):
                    s = re.sub(r".*// ?", "", line).strip(self.strip)
                    if len(s):
                        buf.append(s)
                else:
                    # otherwise it's an independent //-style comment
                    # trailing some code: flush what was collected so far
                    yield (count, "\n ".join(buf))
                    buf = [re.sub(r".*// ?", "", line.strip(self.strip))]
            elif "//" not in line and not in_comment and len(buf) > 0:
                # first normal line after a // block
                yield (count, "\n ".join(buf))
                buf = []
            elif "/*" in line and "*/" not in line and not in_comment:
                # start of a real multiline comment
                in_comment = True
                s = re.sub(r".*/\*+", "", line.strip(self.strip))
                if len(s):
                    buf.append(s.strip(self.strip))
            elif in_comment and "*/" not in line:
                # inside a multiline comment: drop the leading " * " / "|"
                s = re.sub(r"^( |\|)*\*?", "", line)
                if len(s.strip(self.strip)):
                    buf.append(s.strip(self.strip))
            elif "*/" in line and in_comment:
                # end of multiline comment
                in_comment = False
                s = re.sub(r"\*+/.*", "", line.strip(self.strip))
                if len(s):
                    buf.append(s)
                yield (count, "\n ".join(buf))
                buf = []
            elif "/*" in line and "*/" in line:
                # c-style oneliner comment
                yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", line).strip(self.strip))

    def start_text_cat(self):
        """
        Start the ``text_cat`` language guesser as a child process.

        The helper and its language models are addressed relative to the
        directory containing this script, so the working directory is
        switched there while the process is spawned and restored afterwards
        (also when spawning fails).  Returns the ``subprocess.Popen`` object
        with stdin/stdout pipes.
        """
        cwd = os.getcwd()
        # change to our directory
        os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
        try:
            # universal_newlines makes the pipes text-mode on Python 3 so
            # that get_lang() can write and read plain strings.
            return subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                universal_newlines=True)
        finally:
            os.chdir(cwd)

    def get_lang(self, s):
        """
        Ask text_cat for the language of ``s``.

        The output is 'german' or 'english' or 'german or english'. When
        unsure, just don't warn: there are strings where you just can't
        determine the result reliably, like '#110680#'.
        """
        self.text_cat.stdin.write(s)
        self.text_cat.stdin.write("\n")
        self.text_cat.stdin.flush()
        lang = self.text_cat.stdout.readline().strip()
        return lang

    def is_german(self, s):
        """
        Determine if a string is german or not.
        """
        # for short strings we can't do reliable recognition, so skip
        # short strings and less than 4 words
        s = s.replace('\n', ' ')
        if len(s) < 32 or len(s.split()) < 4:
            return False
        return "german" == self.get_lang(s)

    @staticmethod
    def _tab_calc(text):
        """
        Return the number of tabs that pads ``text`` to column 40
        (10 tabs of width 4); at least one tab is always returned.
        """
        START = 40 #Default of 10 tabs
        if len(text) >= START:
            return 1
        diff = START - len(text)
        padding = 1 if diff % 4 != 0 else 0
        # Integer division: the result is used as a string repeat count.
        return (diff // 4) + padding

    def check_file(self, path):
        """
        Check each comment in a file and report the German ones in the
        format selected by the command line options.
        """
        opts = self.options
        if opts.line_numbers or opts.line_numbers_pos:
            TABS = "\t"*10
            path_linenums = [linenum for linenum, s in self.get_comments(path)
                             if self.is_german(s)]
            valid = len(path_linenums) > int(opts.THRESHOLD)
            if opts.line_numbers:
                sys.stderr.write("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
            if not valid:
                return
            if opts.line_numbers_pos:
                sys.stderr.write("%s ... %s positives\n" % (path, str(len(path_linenums))))
                return
            numbers = [str(n) for n in path_linenums]
            if len(path) + (len(numbers)*4) > 75:
                # Too wide for one line: path first, then ten numbers per line.
                print("%s:\n" % path)
                for start in range(0, len(numbers), 10):
                    print("%s%s" % (TABS, ",".join(numbers[start:start + 10])))
            else:
                print("%s:%s%s" % (path, "\t"*self._tab_calc(path), ",".join(numbers)))

        elif not opts.filenames_only:
            for linenum, s in self.get_comments(path):
                if self.is_german(s):
                    print("%s:%s: %s" % (path, linenum, s))
        else:
            # Print the filename once, as soon as one German comment is found.
            if any(self.is_german(s) for _, s in self.get_comments(path)):
                print(path)

    def first_elem(self, path):
        """
        Return the top-most directory component of ``path``
        (an empty string when the path has no directory part).
        """
        lastElem = os.path.dirname(path)
        while True:
            nextElem = os.path.split(lastElem)[0]
            # Stop at the top; the second test prevents an endless loop on
            # absolute paths, where splitting the root yields the root again.
            if nextElem == '' or nextElem == lastElem:
                break
            lastElem = nextElem
        return lastElem

    def check_source_files(self, directory):
        """
        Check each _tracked_ cxx/hxx file in a directory recursively.

        Raises OSError when git cannot be executed.
        """
        # Pass the directory as a separate argument (no shell involved) so
        # that quotes or shell metacharacters in it cannot break the command.
        git = subprocess.Popen(["git", "ls-files", "--", directory],
            stdout=subprocess.PIPE, universal_newlines=True)
        listing = git.communicate()[0]
        lines = [name for name in listing.split("\n")
                 if name.endswith((".cxx", ".hxx"))]

        # Helps to speedup a global scan: only top-level directories mapped
        # to 0 are scanned, those mapped to 1 are skipped.
        directory_whitelist = {
            "UnoControls" : 1,
            "accessibility" : 1,
            "android" : 1,
            "animations" : 1,
            "avmedia" : 1,
            "basctl" : 1,
            "basebmp" : 1,
            "basegfx" : 1,
            "basic" : 1,
            "binaryurp" : 1,
            "bridges" : 1,
            "canvas" : 1,
            "chart2" : 1,
            "cli_ure" : 1,
            "codemaker" : 1,
            "comphelper" : 1,
            "compilerplugins" : 1,
            "configmgr" : 1,
            "connectivity" : 1,
            "cppcanvas" : 1,
            "cppu" : 1,
            "cppuhelper" : 1,
            "cpputools" : 1,
            "cui" : 1,
            "dbaccess" : 1,
            "desktop" : 1,
            "drawinglayer" : 1,
            "dtrans" : 1,
            "editeng" : 1,
            "embeddedobj" : 1,
            "embedserv" : 1,
            "eventattacher" : 1,
            "extensions" : 1,
            "external" : 1,
            "filter" : 1,
            "forms" : 1,
            "formula" : 1,
            "fpicker" : 1,
            "framework" : 1,
            "helpcompiler" : 1,
            "hwpfilter" : 1,
            "i18npool" : 0, #
            "i18nlangtag" : 1,
            "i18nutil" : 1,
            "idl" : 1,
            "idlc" : 1,
            "include" : 0, #
            "io" : 1,
            "javaunohelper" : 1,
            "jvmaccess" : 1,
            "jvmfwk" : 1,
            "l10ntools" : 1,
            "lingucomponent" : 1,
            "linguistic" : 1,
            "lotuswordpro" : 1,
            "mysqlc" : 1,
            "o3tl" : 1,
            "odk" : 1,
            "officecfg" : 1,
            "oox" : 1,
            "package" : 1,
            "postprocess" : 1,
            "pyuno" : 1,
            "registry" : 1,
            "remotebridges" : 1,
            "reportdesign" : 0, #
            "rsc" : 0, #
            "sal" : 1,
            "salhelper" : 1,
            "sax" : 1,
            "sc" : 0, #
            "scaddins" : 0, #
            "sccomp" : 1,
            "scripting" : 1,
            "sd" : 1,
            "sdext" : 1,
            "sfx2" : 0, #
            "shell" : 1,
            "setup_native" : 1,
            "sot" : 1,
            "slideshow" : 1,
            "smoketest" : 1,
            "solenv" : 1,
            "soltools" : 1,
            "starmath" : 1,
            "stoc" : 0, #
            "store" : 1,
            "svgio" : 1,
            "svl" : 1,
            "svtools" : 1,
            "svx" : 0, #
            "sw" : 0, #
            "test" : 1,
            "testtools" : 1,
            "toolkit" : 1,
            "tools" : 1,
            "touch" : 1,
            "tubes" : 1,
            "ucb" : 1,
            "ucbhelper" : 1,
            "unodevtools" : 1,
            "unotest" : 1,
            "unoidl" : 1,
            "unotools" : 1,
            "unoxml" : 1,
            "uui" : 1,
            "vbahelper" : 1,
            "vcl" : 1,
            "winaccessibility" : 1,
            "writerfilter" : 1,
            "writerperfect" : 1,
            "xmlhelp" : 1,
            "xmloff" : 1,
            "xmlreader" : 1,
            "xmlsecurity" : 1,
            "xmlscript" : 1,
        }

        # With '.' every listed file is checked; any other argument goes
        # through the whitelist lookup below.
        scan_everything = directory == '.'
        if not scan_everything:
            sys.stderr.write("Warning: pass an absolute path to the top-level in order to use the faster white-list search\n")

        for line in lines:
            path = line.strip()
            baseDir = self.first_elem(path)

            # Support searching within sub directories
            if scan_everything:
                self.check_file(path)
            elif baseDir not in directory_whitelist:
                print("Missing path %s " % baseDir)
            elif directory_whitelist[baseDir] == 0:
                self.check_file(path)
if __name__ == "__main__":
    # Constructing Parser runs the whole tool (option parsing and scan);
    # Ctrl-C ends the run quietly with a zero exit status.
    try:
        Parser()
    except KeyboardInterrupt:
        print("Interrupted!")
        sys.exit(0)
372 # vim:set shiftwidth=4 softtabstop=4 expandtab: