dcerpc-nt: add UNION_ALIGN_TO... helpers
[wireshark-sm.git] / tools / check_spelling.py
blob5b9631c88d8833f57cb77acbd1cb65011b1487d5
1 #!/usr/bin/env python3
2 # Wireshark - Network traffic analyzer
3 # By Gerald Combs <gerald@wireshark.org>
4 # Copyright 1998 Gerald Combs
6 # SPDX-License-Identifier: GPL-2.0-or-later
8 import os
9 import sys
10 import re
11 import subprocess
12 import argparse
13 import signal
14 import glob
16 from spellchecker import SpellChecker
17 from collections import Counter
18 from html.parser import HTMLParser
19 import urllib.request
21 # Looks for spelling errors among strings found in source or documentation files.
22 # N.B.,
23 # - To run this script, you should install pyspellchecker (not spellchecker) using pip.
24 # - Because of colouring, you may want to pipe into less -R
27 # TODO: check structured doxygen comments?
29 # For text colouring/highlighting.
# For text colouring/highlighting.
class bcolors:
    """ANSI escape sequences used to colour/highlight terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    ADDED = '\033[45m'       # magenta background
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'         # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
# Try to exit soon after Ctrl-C is pressed.
should_exit = False   # polled by the long-running loops below

# SIGINT handler: just set the flag; the scanning loops check it and exit cleanly.
def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')

signal.signal(signal.SIGINT, signal_handler)
# Create spellchecker, and augment with some Wireshark words.
# Set up our dict with words from text file.
# N.B. relative path - the script expects to be run from the repo root.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')

# Track words that were not found.
missing_words = []
# Break a camelCase identifier into its component words.
def camelCaseSplit(identifier):
    # Word boundaries are lower->upper transitions, and the start of a
    # Capitalised run at the end of an all-caps sequence (e.g. 'HTTPServer').
    boundary_re = r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return [match.group(0) for match in re.finditer(boundary_re, identifier)]
# A File object contains all of the strings to be checked for a given file.
class File:
    """Holds the strings extracted from one file, plus the spell-check logic
    that examines them (relies on the module-level 'spell' checker and the
    'wiki_db' / 'missing_words' globals)."""

    def __init__(self, file):
        self.file = file
        # Strings/words gathered from the file, checked later by spellCheck().
        self.values = []

        filename, extension = os.path.splitext(file)
        # TODO: add '.lua'? Would also need to check string and comment formats...
        # Code files get comment-stripping and string extraction; anything
        # else is treated as documentation (every word checked).
        self.code_file = extension in {'.c', '.cpp', '.h' }

        with open(file, 'r', encoding="utf8") as f:
            contents = f.read()

            if self.code_file:
                # Remove comments so as not to trip up RE.
                contents = removeComments(contents)

            # Find protocol name and add to dict.
            # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
            matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
            for m in matches:
                protocol = m.group(3)
                # Add to dict.
                spell.word_frequency.load_words([protocol])
                spell.known([protocol])
                print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)

    # Add a string found in this file.
    def add(self, value):
        # On Windows, store raw utf-8 bytes to avoid console-encoding issues.
        self.values.append(value.encode('utf-8') if sys.platform.startswith('win') else value)

    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
    def checkMultiWords(self, word):
        if len(word) < 6:
            return False

        # Don't consider if mixed cases.
        if not (word.islower() or word.isupper()):
            # But make an exception if only the first letter is uppercase..
            if not word == (word[0].upper() + word[1:]):
                return False

        # Try splitting into 2 words recognised at various points.
        # Allow 3-letter words.
        length = len(word)
        for idx in range(3, length-3):
            word1 = word[0:idx]
            word2 = word[idx:]

            if not spell.unknown([word1, word2]):
                return True

        # No 2-way split worked; try splitting into several known words.
        return self.checkMultiWordsRecursive(word)

    # If word before 'id' is recognised, accept word.
    def wordBeforeId(self, word):
        if word.lower().endswith('id'):
            if not spell.unknown([word[0:len(word)-2]]):
                return True
            else:
                return False
        # NOTE: implicitly returns None (falsy) when the word does not end
        # in 'id' - callers only rely upon the truthiness of the result.

    def checkMultiWordsRecursive(self, word):
        # Can 'word' be split into a sequence of known words of >= 4 letters?
        length = len(word)
        if length < 4:
            return False

        for idx in range(4, length+1):
            w = word[0:idx]
            if not spell.unknown([w]):
                if idx == len(word):
                    return True
                else:
                    if self.checkMultiWordsRecursive(word[idx:]):
                        return True

        return False

    def numberPlusUnits(self, word):
        # Accept tokens like '100ms' or '2gb': digits followed by a known
        # unit suffix (compared case-insensitively).
        m = re.search(r'^([0-9]+)([a-zA-Z]+)$', word)
        if m:
            if m.group(2).lower() in { "bit", "bits", "gb", "kbps", "gig", "mb", "th", "mhz", "v", "hz", "k",
                                       "mbps", "m", "g", "ms", "nd", "nds", "rd", "kb", "kbit", "ghz",
                                       "khz", "km", "ms", "usec", "sec", "gbe", "ns", "ksps", "qam", "mm" }:
                return True
        return False

    # Check the spelling of all the words we have found
    def spellCheck(self):

        num_values = len(self.values)
        for value_index,v in enumerate(self.values):
            # Bail out promptly if Ctrl-C was pressed.
            if should_exit:
                exit(1)

            v = str(v)

            # Sometimes parentheses used to show optional letters, so don't leave space
            #if re.compile(r"^[\S]*\(").search(v):
            #    v = v.replace('(', '')
            #if re.compile(r"\S\)").search(v):
            #    v = v.replace(')', '')

            # Ignore includes.
            if v.endswith('.h'):
                continue

            # Store original (as want to include for context in error report).
            original = str(v)

            # Replace most punctuation with spaces, and eliminate common format specifiers.
            v = v.replace('.', ' ')
            v = v.replace(',', ' ')
            v = v.replace('`', ' ')
            v = v.replace(':', ' ')
            v = v.replace(';', ' ')
            v = v.replace('"', ' ')
            v = v.replace('\\', ' ')
            v = v.replace('+', ' ')
            v = v.replace('|', ' ')
            v = v.replace('(', ' ')
            v = v.replace(')', ' ')
            v = v.replace('[', ' ')
            v = v.replace(']', ' ')
            v = v.replace('{', ' ')
            v = v.replace('}', ' ')
            v = v.replace('<', ' ')
            v = v.replace('>', ' ')
            v = v.replace('_', ' ')
            v = v.replace('-', ' ')
            v = v.replace('/', ' ')
            v = v.replace('!', ' ')
            v = v.replace('?', ' ')
            v = v.replace('=', ' ')
            v = v.replace('*', ' ')
            v = v.replace('%u', '')
            v = v.replace('%d', '')
            v = v.replace('%s', '')
            v = v.replace('%', ' ')
            v = v.replace('#', ' ')
            v = v.replace('&', ' ')
            v = v.replace('@', ' ')
            v = v.replace('$', ' ')
            v = v.replace('^', ' ')
            v = v.replace('®', '')
            v = v.replace("'", ' ')
            v = v.replace('"', ' ')
            v = v.replace('~', ' ')

            # Split into words.
            value_words = v.split()
            # Further split up any camelCase words.
            words = []
            for w in value_words:
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            for word in words:
                # Strip trailing digits from word.
                word = word.rstrip('1234567890')

                # Quote marks found in some of the docs...
                word = word.replace('“', '')
                word = word.replace('”', '')

                # Single and collective possession
                if word.endswith("’s"):
                    word = word[:-2]
                if word.endswith("s’"):
                    word = word[:-2]

                # Numbers with unit suffixes (e.g. '100ms') are fine.
                if self.numberPlusUnits(word):
                    continue

                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word):
                    # Highlight words that appeared in Wikipedia list.
                    print(bcolors.BOLD if word in wiki_db else '',
                          self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
                          ' -> ', '?')

                    # TODO: this can be interesting, but takes too long!
                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                    global missing_words
                    missing_words.append(word)
# Turn escaped whitespace sequences - the literal two-character forms
# '\n', '\r' and '\t' - into plain spaces so they don't glue words together.
def removeWhitespaceControl(code_string):
    for escape in ('\\n', '\\r', '\\t'):
        code_string = code_string.replace(escape, ' ')
    return code_string
# Remove any contractions from the given string.
def removeContractions(code_string):
    contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll",
                     "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve",
                     "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s",
                     "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s",
                     "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s",
                     "we’ll", "asciidoctor’s", "protocol’s", "microsoft’s", "wasn’t" ]
    for c in contractions:
        # Strip the lower-case and Capitalised forms, each with both the
        # typographic (’) and the ASCII (') apostrophe.
        for variant in (c, c.capitalize(), c.replace('’', "'"), c.capitalize().replace('’', "'")):
            code_string = code_string.replace(variant, "")
    return code_string
# Strip C-style (/* ... */) and C++-style (// ...) comments from source text.
def removeComments(code_string):
    c_comment = re.compile(r"/\*.*?\*/", re.DOTALL)
    # Avoid matching // where it is allowed, e.g., https://www... or file:///...
    cpp_comment = re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\")(?<!\"\s\s)(?<!file:/)(?<!\,\s)//.*?\n")
    code_string = c_comment.sub("", code_string)
    return cpp_comment.sub("", code_string)
# Return the individual words appearing in C++ (//) and C (/* */) comments.
def getCommentWords(code_string):
    words = []
    # Gather C++ comment contents first, then C comment contents,
    # splitting each match into whitespace-separated words.
    for pattern in (r'//\s(.*?)\n', r'/\*(.*?)\*/'):
        for match in re.finditer(pattern, code_string):
            words += match.group(1).split()
    return words
# Remove escaped/quoted character sequences that would otherwise confuse
# the string-extraction REs.
def removeSingleQuotes(code_string):
    replacements = [
        ('\\\\', " "),        # separate at escaped backslashes
        ('\"\\\\\"', ""),
        ("\\\"", " "),        # escaped double quote
        ("'\"'", ""),         # character literal containing a quote
        ('…', ' '),           # ellipsis character
    ]
    for old, new in replacements:
        code_string = code_string.replace(old, new)
    return code_string
# Remove hex constants (0x...) from the string - their digit runs are not
# meaningful to a spell check.
def removeHexSpecifiers(code_string):
    # A single linear re.sub pass removes every hex literal at once.
    # The previous implementation looped search()+replace() until no match
    # remained, re-scanning the whole string for each distinct literal
    # (quadratic on hex-heavy generated files) for the same end result.
    return re.sub(r'0x[0-9a-fA-F]*', '', code_string)
# Create a File object that knows about all of the strings in the given file.
def findStrings(filename, check_comments=False):
    with open(filename, 'r', encoding="utf8") as f:
        contents = f.read()

    # Remove comments & embedded quotes so as not to trip up RE.
    contents = removeContractions(contents)
    contents = removeWhitespaceControl(contents)
    contents = removeSingleQuotes(contents)
    contents = removeHexSpecifiers(contents)

    # Create file object.
    file = File(filename)

    # What we check depends upon file type.
    if file.code_file:
        # May want to check comments for selected dissectors
        if check_comments:
            for word in getCommentWords(contents):
                file.add(word)

        contents = removeComments(contents)

        # Code, so only check the contents of string literals.
        for match in re.finditer(r'\"([^\"]*)\"', contents):
            file.add(match.group(1))
    else:
        # A documentation file, so examine all words.
        for word in contents.split():
            file.add(word)

    return file
# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        return False

    # Only .c files are ever reported as generated here.
    if not filename.endswith('.c'):
        return False

    # These files are generated, but the notice is further in than we scan below.
    if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'):
        return True

    if filename.endswith('packet-woww.c'):
        return True

    # Phrases that mark a file as having been produced by a tool.
    markers = ('Generated automatically',
               'Autogenerated from',
               'is autogenerated',
               'automatically generated by Pidl',
               'Created by: The Qt Meta Object Compiler',
               'This file was generated',
               'This filter was automatically generated',
               'This file is auto generated, do not edit!',
               'this file is automatically generated')

    # The comment to say that it's generated is near the top, so give up once
    # we get a few lines down.  'with' guarantees the handle is closed on
    # every path - the original open()/close() pairs leaked the handle if a
    # read raised.
    with open(filename, 'r', encoding="utf8") as f_read:
        for line_no, line in enumerate(f_read):
            if line_no > 10:
                return False
            if any(marker in line for marker in markers):
                return True

    # OK, looks like a hand-written file!
    return False
# Decide whether the given file is one we should spell-check at all.
def isAppropriateFile(filename):
    root, extension = os.path.splitext(filename)
    # Skip anything CMake-related.
    if 'CMake' in filename:
        return False
    # TODO: add , '.lua' ?
    # Source and documentation extensions, plus README files with no extension.
    return extension in { '.adoc', '.c', '.cpp', '.pod', '.txt' } or root.endswith('README')
# Return the list of checkable (appropriate, non-generated) files under folder.
def findFilesInFolder(folder, recursive=True):
    files_to_check = []

    if recursive:
        for root, subfolders, files in os.walk(folder):
            for f in files:
                if should_exit:
                    # Return what we have gathered so far.  The previous bare
                    # 'return' yielded None, which crashed callers that do
                    # 'files += findFilesInFolder(...)'.
                    return files_to_check
                f = os.path.join(root, f)
                if isAppropriateFile(f) and not isGeneratedFile(f):
                    files_to_check.append(f)
    else:
        # Non-recursive: only the folder's immediate entries, in sorted order.
        for f in sorted(os.listdir(folder)):
            f = os.path.join(folder, f)
            if isAppropriateFile(f) and not isGeneratedFile(f):
                files_to_check.append(f)

    return files_to_check
# Spell-check a single file.
def checkFile(filename, check_comments=False):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        print(filename, 'does not exist!')
        return

    # Extract the strings, then check them.
    findStrings(filename, check_comments).spellCheck()
#################################################################
# Main logic.

# command-line args. Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='append',
                    help='specify individual file to test')
parser.add_argument('--folder', action='append',
                    help='specify folder to test')
parser.add_argument('--glob', action='append',
                    help='specify glob to test - should give in "quotes"')
# NOTE: default='' is falsy, so this behaves like the usual store_true
# default of False at the 'not args.no_recurse' use sites below.
parser.add_argument('--no-recurse', action='store_true', default='',
                    help='do not recurse inside chosen folder(s)')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--comments', action='store_true',
                    help='check comments in source files')
parser.add_argument('--no-wikipedia', action='store_true',
                    help='skip checking known bad words from wikipedia - can be slow')
# Typo fixed in help text: 'workds' -> 'words'.
parser.add_argument('--show-most-common', action='store', default='100',
                    help='number of most common not-known words to display')


args = parser.parse_args()
class TypoSourceDocumentParser(HTMLParser):
    """Collect the text found inside the document's <pre> element(s)."""

    def __init__(self):
        super().__init__()
        self.capturing = False   # currently inside a <pre> block?
        self.content = ''        # accumulated <pre> text

    def handle_starttag(self, tag, attrs):
        # Start capturing at <pre>; leave the flag alone for other tags.
        self.capturing = (tag == 'pre') or self.capturing

    def handle_endtag(self, tag):
        # Stop capturing when the <pre> block closes.
        if tag == 'pre':
            self.capturing = False

    def handle_data(self, data):
        if self.capturing:
            self.content += data
# Fetch some common misspellings from wikipedia so we will definitely flag them.
wiki_db = dict()
if not args.no_wikipedia:
    print('Fetching Wikipedia\'s list of common misspellings.')
    req_headers = { 'User-Agent': 'Wireshark check-wikipedia-typos' }
    req = urllib.request.Request('https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines', headers=req_headers)
    try:
        response = urllib.request.urlopen(req)
        content = response.read()
        content = content.decode('UTF-8', 'replace')

        # Extract the "<pre>...</pre>" part of the document.
        parser = TypoSourceDocumentParser()
        parser.feed(content)
        content = parser.content.strip()

        # Each line has the form 'typo->correction'; key the dict by the typo.
        wiki_db = dict(line.lower().split('->', maxsplit=1) for line in content.splitlines())
        del wiki_db['cmo']       # All false positives.
        del wiki_db['ect']       # Too many false positives.
        del wiki_db['thru']      # We'll let that one thru. ;-)
        del wiki_db['sargeant']  # All false positives.

        # Remove each typo word from the spellchecker's known-word dict so
        # it will be flagged wherever it appears.
        removed = 0
        for word in wiki_db:
            try:
                if should_exit:
                    exit(1)
                spell.word_frequency.remove_words([word])
                #print('Removed', word)
                removed += 1
            except Exception:
                # Word wasn't in the dict to begin with - nothing to remove.
                pass

        print('Removed', removed, 'known bad words')
    except Exception:
        # Typo fixed in this message: was 'mispellings'.
        print('Failed to fetch and/or parse Wikipedia misspellings!')
# Get files from wherever command-line args indicate.
files = []
if args.file:
    # Add specified file(s)
    for f in args.file:
        if not os.path.isfile(f):
            print('Chosen file', f, 'does not exist.')
            exit(1)
        else:
            files.append(f)
if args.commits:
    # Get files affected by specified number of commits.
    # NOTE: this assignment replaces any files already added via --file.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files
    files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))
if args.open:
    # Unstaged changes.
    # NOTE: this assignment also replaces any files gathered above.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
    # Merge staged into unstaged, avoiding duplicates.
    for f in files_staged:
        if f not in files:
            files.append(f)
if args.glob:
    # Add specified file(s)
    for g in args.glob:
        for f in glob.glob(g):
            if not os.path.isfile(f):
                print('Chosen file', f, 'does not exist.')
                exit(1)
            else:
                files.append(f)

if args.folder:
    for folder in args.folder:
        if not os.path.isdir(folder):
            print('Folder', folder, 'not found!')
            exit(1)
        # Find files from folder.
        print('Looking for files in', folder)
        files += findFilesInFolder(folder, not args.no_recurse)

# By default, scan dissector files.
if not args.file and not args.open and not args.commits and not args.glob and not args.folder:
    # By default, scan dissectors directory
    folder = os.path.join('epan', 'dissectors')
    # Find files from folder.
    print('Looking for files in', folder)
    files = findFilesInFolder(folder, not args.no_recurse)
# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.folder or args.commits or args.open or args.glob:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')


# Now check the chosen files.
for f in files:
    # Check this file.
    checkFile(f, check_comments=args.comments)
    # But get out if control-C has been pressed.
    if should_exit:
        exit(1)


# Show the most commonly not-recognised words (up to --show-most-common of them).
print('')
counter = Counter(missing_words).most_common(int(args.show_most_common))
if len(counter) > 0:
    for c in counter:
        print(c[0], ':', c[1])

# Show error count.  Each unrecognised occurrence counts as one issue.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')