2 # Wireshark - Network traffic analyzer
3 # By Gerald Combs <gerald@wireshark.org>
4 # Copyright 1998 Gerald Combs
6 # SPDX-License-Identifier: GPL-2.0-or-later
16 from spellchecker
import SpellChecker
17 from collections
import Counter
18 from html
.parser
import HTMLParser
21 # Looks for spelling errors among strings found in source or documentation files.
23 # - To run this script, you should install pyspellchecker (not spellchecker) using pip.
24 # - Because of colouring, you may want to pipe into less -R
27 # TODO: check structured doxygen comments?
29 # For text colouring/highlighting.
# Try to exit soon after Ctrl-C is pressed.
# SIGINT handler: report the interrupt so the scan can stop early.
# NOTE(review): lines between the 'def' and the print are not visible in
# this view — presumably a flag is set there so the main loop can stop;
# confirm against the full file.
def signal_handler(sig, frame):
    print('You pressed Ctrl+C - exiting')

# Install the handler so Ctrl-C interrupts long scans promptly.
signal.signal(signal.SIGINT, signal_handler)
# Create spellchecker, and augment with some Wireshark words.
# Set up our dict with words from text file.
# This global checker is shared by all File objects below; loading the
# project word list keeps Wireshark-specific vocabulary from being flagged.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')
61 # Track words that were not found.
# Break a camelCase identifier into its constituent words.
def camelCaseSplit(identifier):
    # Each chunk runs up to a lower->Upper boundary, an Upper->UpperLower
    # boundary (the end of an acronym), or the end of the string.  The
    # pattern has no capturing groups, so findall yields the whole matches.
    boundary = r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return re.findall(boundary, identifier)
71 # A File object contains all of the strings to be checked for a given file.
    # Gather the checkable strings from one source/documentation file.
    # file: path of the file to scan.
    # NOTE(review): several lines of this method are not visible in this
    # view (e.g. the read of the file contents and the loop that binds
    # 'protocol' from the regex matches) — the gap comments below mark them.
    def __init__(self, file):
        filename, extension = os.path.splitext(file)
        # TODO: add '.lua'? Would also need to check string and comment formats...
        # Only C/C++ sources and headers are treated as code files;
        # everything else is scanned as documentation.
        self.code_file = extension in {'.c', '.cpp', '.h' }
        with open(file, 'r', encoding="utf8") as f:
            # (not visible: presumably 'contents' is read from 'f' here)
            # Remove comments so as not to trip up RE.
            contents = removeComments(contents)
            # Find protocol name and add to dict.
            # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
            matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
            # (not visible: iteration over 'matches' that binds 'protocol')
            # Teach the checker the protocol's name so it is not reported.
            spell.word_frequency.load_words([protocol])
            spell.known([protocol])
            print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)
99 # Add a string found in this file.
100 def add(self
, value
):
101 self
.values
.append(value
.encode('utf-8') if sys
.platform
.startswith('win') else value
)
    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
    # NOTE(review): several lines are not visible in this view (the early
    # returns, 'length', and the 'word1'/'word2' split) — gaps marked below.
    def checkMultiWords(self, word):
        # Don't consider if mixed cases.
        if not (word.islower() or word.isupper()):
            # But make an exception if only the first letter is uppercase..
            if not word == (word[0].upper() + word[1:]):
                # (not visible: presumably an early return rejecting mixed case)
        # Try splitting into 2 words recognised at various points.
        # Allow 3-letter words.
        # (not visible: 'length' is presumably len(word) — confirm)
        for idx in range(3, length-3):
            # (not visible: 'word1'/'word2' are presumably 'word' split at idx)
            if not spell.unknown([word1, word2]):
                # (not visible: presumably returns True when both halves are known)
        # Fall back to splitting recursively into several known words.
        return self.checkMultiWordsRecursive(word)
    # If word before 'id' is recognised, accept word.
    # NOTE(review): the return statements are not visible in this view.
    def wordBeforeId(self, word):
        if word.lower().endswith('id'):
            # Strip the trailing 'id' and check the remainder against the dict.
            if not spell.unknown([word[0:len(word)-2]]):
                # (not visible: presumably 'return True' here, False otherwise)
    # Can the word be split, at some point, into a known word plus a
    # recognisable remainder?  Recursive helper for checkMultiWords().
    # NOTE(review): 'length', 'w' and the return statements are not visible
    # in this view — gaps marked below.
    def checkMultiWordsRecursive(self, word):
        # (not visible: 'length' is presumably len(word) — confirm)
        for idx in range(4, length+1):
            # (not visible: 'w' is presumably the prefix word[0:idx])
            if not spell.unknown([w]):
                # Prefix is a known word — does the remainder also check out?
                if self.checkMultiWordsRecursive(word[idx:]):
                    # (not visible: presumably 'return True')
    # Accept a token that is a number followed by a unit suffix (e.g. '100mb').
    # NOTE(review): the 'if m:' guard and the return statements are not
    # visible in this view.
    def numberPlusUnits(self, word):
        m = re.search(r'^([0-9]+)([a-zA-Z]+)$', word)
        # Known unit suffixes (note "ms" appears twice in the original set).
        if m.group(2).lower() in { "bit", "bits", "gb", "kbps", "gig", "mb", "th", "mhz", "v", "hz", "k",
                                   "mbps", "m", "g", "ms", "nd", "nds", "rd", "kb", "kbit", "ghz",
                                   "khz", "km", "ms", "usec", "sec", "gbe", "ns", "ksps", "qam", "mm" }:
    # Check the spelling of all the words we have found
    # NOTE(review): a number of lines are not visible in this view (the
    # binding of 'original', the 'words' accumulator, the per-word loop
    # header, the possessive-stripping bodies and the tail of the print()
    # call) — gaps are marked below.
    def spellCheck(self):
        num_values = len(self.values)
        for value_index,v in enumerate(self.values):
            # Sometimes parentheses used to show optional letters, so don't leave space
            #if re.compile(r"^[\S]*\(").search(v):
            #    v = v.replace('(', '')
            #if re.compile(r"\S\)").search(v):
            #    v = v.replace(')', '')

            # Store original (as want to include for context in error report).
            # (not visible: presumably 'original = v' here)

            # Replace most punctuation with spaces, and eliminate common format specifiers.
            v = v.replace('.', ' ')
            v = v.replace(',', ' ')
            v = v.replace('`', ' ')
            v = v.replace(':', ' ')
            v = v.replace(';', ' ')
            v = v.replace('"', ' ')
            v = v.replace('\\', ' ')
            v = v.replace('+', ' ')
            v = v.replace('|', ' ')
            v = v.replace('(', ' ')
            v = v.replace(')', ' ')
            v = v.replace('[', ' ')
            v = v.replace(']', ' ')
            v = v.replace('{', ' ')
            v = v.replace('}', ' ')
            v = v.replace('<', ' ')
            v = v.replace('>', ' ')
            v = v.replace('_', ' ')
            v = v.replace('-', ' ')
            v = v.replace('/', ' ')
            v = v.replace('!', ' ')
            v = v.replace('?', ' ')
            v = v.replace('=', ' ')
            v = v.replace('*', ' ')
            v = v.replace('%u', '')
            v = v.replace('%d', '')
            v = v.replace('%s', '')
            v = v.replace('%', ' ')
            v = v.replace('#', ' ')
            v = v.replace('&', ' ')
            v = v.replace('@', ' ')
            v = v.replace('$', ' ')
            v = v.replace('^', ' ')
            v = v.replace('®', '')
            v = v.replace("'", ' ')
            v = v.replace('"', ' ')
            v = v.replace('~', ' ')

            value_words = v.split()
            # Further split up any camelCase words.
            # (not visible: presumably 'words = []' is initialised here)
            for w in value_words:
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            # (not visible: the loop header over 'words' binding 'word')
            # Strip trailing digits from word.
            word = word.rstrip('1234567890')

            # Quote marks found in some of the docs...
            word = word.replace('“', '')
            word = word.replace('”', '')

            # Single and collective possession
            if word.endswith("’s"):
                # (not visible: presumably the possessive suffix is stripped)
            if word.endswith("s’"):
                # (not visible: presumably the possessive suffix is stripped)

            # Accept numbers with unit suffixes, e.g. '100mb'.
            if self.numberPlusUnits(word):
                # (not visible: presumably 'continue')

            # Report only words long enough to matter that neither the
            # checker nor the multi-word/ID heuristics recognise.
            if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word):
                # Highlight words that appeared in Wikipedia list.
                print(bcolors.BOLD if word in wiki_db else '',
                      self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
                # TODO: this can be interesting, but takes too long!
                # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                # (not visible: the closing of this print() call)
                missing_words.append(word)
# Replace escaped whitespace sequences (as they appear inside string
# literals) with spaces so they don't get glued onto neighbouring words.
# NOTE(review): the return statement is not visible in this view.
def removeWhitespaceControl(code_string):
    code_string = code_string.replace('\\n', ' ')
    code_string = code_string.replace('\\r', ' ')
    code_string = code_string.replace('\\t', ' ')
# Remove any contractions from the given string.
# (Left in place, they would be mangled by the later punctuation stripping.)
# NOTE(review): the return statement is not visible in this view.
def removeContractions(code_string):
    contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll",
                     "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve",
                     "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s",
                     "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s",
                     "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s",
                     "we’ll", "asciidoctor’s", "protocol’s", "microsoft’s", "wasn’t" ]
    for c in contractions:
        # Remove every casing and quote-style variant of the contraction.
        code_string = code_string.replace(c, "")
        code_string = code_string.replace(c.capitalize(), "")
        code_string = code_string.replace(c.replace('’', "'"), "")
        code_string = code_string.replace(c.capitalize().replace('’', "'"), "")
# Strip C- and C++-style comments from the given string.
# NOTE(review): the return statement is not visible in this view.
def removeComments(code_string):
    code_string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "" , code_string) # C-style comment
    # Avoid matching // where it is allowed, e.g., https://www... or file:///...
    code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\")(?<!\"\s\s)(?<!file:/)(?<!\,\s)//.*?\n" ) ,"" , code_string) # C++-style comment
# Extract the words appearing in C/C++ comments in the given string.
# NOTE(review): the 'words' accumulator, the loop headers over the matches
# and the return statement are not visible in this view.
def getCommentWords(code_string):
    # C++-style // comments.
    matches = re.finditer(r'//\s(.*?)\n', code_string)
    words += m.group(1).split()
    # C-style /* */ comments.
    matches = re.finditer(r'/\*(.*?)\*/', code_string)
    words += m.group(1).split()
# Remove escaped/quoted character sequences that would confuse the
# string-extraction regular expressions.
# NOTE(review): further replacements and the return statement are not
# visible in this view.
def removeSingleQuotes(code_string):
    code_string = code_string.replace('\\\\', " ") # Separate at \\
    code_string = code_string.replace('\"\\\\\"', "")
    code_string = code_string.replace("\\\"", " ")
    code_string = code_string.replace("'\"'", "")
    code_string = code_string.replace('…', ' ')
# Remove hex literals (0x...) so their letter digits are not spell-checked.
# NOTE(review): only one search/replace is visible here — presumably this
# repeats until no hex numbers remain, and the return is not visible.
def removeHexSpecifiers(code_string):
    # Find all hex numbers
    m = re.search(r'(0x[0-9a-fA-F]*)', code_string)
    code_string = code_string.replace(m.group(0), "")
# Create a File object that knows about all of the strings in the given file.
# check_comments: also collect words from comments in code files.
# NOTE(review): several lines are not visible in this view (reading the
# file into 'contents', the code/documentation branch headers, the
# file.add() calls and the return of 'file') — gaps marked below.
def findStrings(filename, check_comments=False):
    with open(filename, 'r', encoding="utf8") as f:
        # (not visible: presumably 'contents = f.read()' here)
    # Remove comments & embedded quotes so as not to trip up RE.
    contents = removeContractions(contents)
    contents = removeWhitespaceControl(contents)
    contents = removeSingleQuotes(contents)
    contents = removeHexSpecifiers(contents)

    # Create file object.
    file = File(filename)

    # What we check depends upon file type.
    # May want to check comments for selected dissectors
    comment_words = getCommentWords(contents)
    for w in comment_words:
        # (not visible: presumably comment words are added to 'file' here)
    contents = removeComments(contents)

    # Code so only checking strings.
    matches = re.finditer(r'\"([^\"]*)\"', contents)

    # A documentation file, so examine all words.
    for w in contents.split():
        # (not visible: presumably file.add(w), then 'file' is returned)
# Test for whether the given file was automatically generated.
# NOTE(review): the return statements and the close of 'f_read' are not
# visible in this view.
def isGeneratedFile(filename):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
    # Only C files carry the generated-file notices checked below.
    if not filename.endswith('.c'):
    # This file is generated, but notice is further in than want to check for all files
    if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'):
    if filename.endswith('packet-woww.c'):
    # Scan the start of the file for a generator's notice.
    f_read = open(os.path.join(filename), 'r', encoding="utf8")
    for line_no,line in enumerate(f_read):
        # The comment to say that it's generated is near the top, so give up once
        # get a few lines down.
        # Look for any of the various notices emitted by code generators.
        if (line.find('Generated automatically') != -1 or
            line.find('Autogenerated from') != -1 or
            line.find('is autogenerated') != -1 or
            line.find('automatically generated by Pidl') != -1 or
            line.find('Created by: The Qt Meta Object Compiler') != -1 or
            line.find('This file was generated') != -1 or
            line.find('This filter was automatically generated') != -1 or
            line.find('This file is auto generated, do not edit!') != -1 or
            line.find('this file is automatically generated') != -1):
    # OK, looks like a hand-written file!
# Decide, from the name alone, whether a file should be spell-checked.
# NOTE(review): the body of the CMake branch is not visible in this view
# (presumably an early 'return False').
def isAppropriateFile(filename):
    file, extension = os.path.splitext(filename)
    # Skip build-system files.
    if filename.find('CMake') != -1:
    # TODO: add , '.lua' ?
    return extension in { '.adoc', '.c', '.cpp', '.pod', '.txt' } or file.endswith('README')
# Collect the checkable, non-generated files under the given folder.
# NOTE(review): the 'files_to_check' initialisation, the recursive/flat
# branch headers and the inner 'for f in files:' loop header are not
# visible in this view.
def findFilesInFolder(folder, recursive=True):
    # Recursive: walk the whole tree.
    for root, subfolders, files in os.walk(folder):
        f = os.path.join(root, f)
        if isAppropriateFile(f) and not isGeneratedFile(f):
            files_to_check.append(f)
    # Non-recursive: just the folder's direct entries, sorted by name.
    for f in sorted(os.listdir(folder)):
        f = os.path.join(folder, f)
        if isAppropriateFile(f) and not isGeneratedFile(f):
            files_to_check.append(f)
    return files_to_check
# Check the given file.
# NOTE(review): the early return after the missing-file message and the
# call that performs the actual spell-check are not visible in this view.
def checkFile(filename, check_comments=False):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        print(filename, 'does not exist!')
    # Build the File object (collects the strings to be checked).
    file = findStrings(filename, check_comments)
#################################################################
# command-line args. Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
# Fix: corrected the misspelled help text for --show-most-common
# ('workds' -> 'words').
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='append',
                    help='specify individual file to test')
parser.add_argument('--folder', action='append',
                    help='specify folder to test')
parser.add_argument('--glob', action='append',
                    help='specify glob to test - should give in "quotes"')
# N.B. default is '' (falsy) rather than False; callers only test truthiness.
parser.add_argument('--no-recurse', action='store_true', default='',
                    help='do not recurse inside chosen folder(s)')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--comments', action='store_true',
                    help='check comments in source files')
parser.add_argument('--no-wikipedia', action='store_true',
                    help='skip checking known bad words from wikipedia - can be slow')
parser.add_argument('--show-most-common', action='store', default='100',
                    help='number of most common not-known words to display')

args = parser.parse_args()
# Minimal HTML parser used to capture the text of the fetched Wikipedia
# page (the machine-readable misspelling list lives in a <pre> block).
# NOTE(review): the '__init__' header, the tag checks inside the handlers
# and the body of handle_data are not visible in this view — the
# statements below are shown as they appear.
class TypoSourceDocumentParser(HTMLParser):
        self.capturing = False

    def handle_starttag(self, tag, attrs):
        # (not visible: presumably guarded by a check for the <pre> tag)
        self.capturing = True

    def handle_endtag(self, tag):
        self.capturing = False

    def handle_data(self, data):
        # (not visible: presumably accumulates 'data' while capturing)
# Fetch some common misspellings from wikipedia so we will definitely flag them.
# NOTE(review): the surrounding try/except, the parser.feed() call and the
# loop that removes each bad word from the checker are not visible in this
# view — gaps marked below.
if not args.no_wikipedia:
    print('Fetching Wikipedia\'s list of common misspellings.')
    req_headers = { 'User-Agent': 'Wireshark check-wikipedia-typos' }
    req = urllib.request.Request('https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines', headers=req_headers)
    response = urllib.request.urlopen(req)
    content = response.read()
    content = content.decode('UTF-8', 'replace')

    # Extract the "<pre>...</pre>" part of the document.
    parser = TypoSourceDocumentParser()
    content = parser.content.strip()

    # Each line is 'misspelling->correction'; key on the misspelling.
    wiki_db = dict(line.lower().split('->', maxsplit=1) for line in content.splitlines())
    del wiki_db['cmo'] # All false positives.
    del wiki_db['ect'] # Too many false positives.
    del wiki_db['thru'] # We'll let that one thru. ;-)
    del wiki_db['sargeant'] # All false positives.

    # Remove each word from dict
    # (not visible: the loop over wiki_db binding 'word' and counting 'removed')
    spell.word_frequency.remove_words([word])
    #print('Removed', word)
    print('Removed', removed, 'known bad words')
    # (not visible: the except clause this message belongs to)
    print('Failed to fetch and/or parse Wikipedia mispellings!')
# Get files from wherever command-line args indicate.
# NOTE(review): many branch headers ('if args.file:', 'elif args.commits:',
# 'elif args.open:', etc.) and the 'files' accumulator initialisation are
# not visible in this view — the gap comments below mark them.

# Add specified file(s)
if not os.path.isfile(f):
    print('Chosen file', f, 'does not exist.')

# Get files affected by specified number of commits.
command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
files = [f.decode('utf-8')
         for f in subprocess.check_output(command).splitlines()]
# Keep only existing, checkable, non-generated files.
files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))

# Unstaged changes.
command = ['git', 'diff', '--name-only']
files = [f.decode('utf-8')
         for f in subprocess.check_output(command).splitlines()]
files = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files))

# Staged changes.
command = ['git', 'diff', '--staged', '--name-only']
files_staged = [f.decode('utf-8')
                for f in subprocess.check_output(command).splitlines()]
files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
for f in files_staged:
    # (not visible: presumably staged files are merged into 'files' here)

# Add specified file(s)
for f in glob.glob(g):
    if not os.path.isfile(f):
        print('Chosen file', f, 'does not exist.')

for folder in args.folder:
    if not os.path.isdir(folder):
        print('Folder', folder, 'not found!')
    # Find files from folder.
    print('Looking for files in', folder)
    files += findFilesInFolder(folder, not args.no_recurse)

# By default, scan dissector files.
if not args.file and not args.open and not args.commits and not args.glob and not args.folder:
    # By default, scan dissectors directory
    folder = os.path.join('epan', 'dissectors')
    # Find files from folder.
    print('Looking for files in', folder)
    files = findFilesInFolder(folder, not args.no_recurse)

# If scanning a subset of files, list them here.
if args.file or args.folder or args.commits or args.open or args.glob:
    print(' '.join(files), '\n')
    # (not visible: the branch structure around the next two messages)
print('No files to check.\n')
print('All dissector modules\n')

# Now check the chosen files.
# (not visible: the loop over 'files' binding 'f')
checkFile(f, check_comments=args.comments)
# But get out if control-C has been pressed.

# Show the most commonly not-recognised words.
counter = Counter(missing_words).most_common(int(args.show_most_common))
# (not visible: the loop over 'counter' binding 'c')
print(c[0], ':', c[1])

# Summary line.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')