dcerpc-nt: add UNION_ALIGN_TO... helpers
[wireshark-sm.git] / tools / check_spelling.py
blob5b9631c88d8833f57cb77acbd1cb65011b1487d5
1 #!/usr/bin/env python3
2 # Wireshark - Network traffic analyzer
3 # By Gerald Combs <gerald@wireshark.org>
4 # Copyright 1998 Gerald Combs
6 # SPDX-License-Identifier: GPL-2.0-or-later
8 import os
9 import sys
10 import re
11 import subprocess
12 import argparse
13 import signal
14 import glob
16 from spellchecker import SpellChecker
17 from collections import Counter
18 from html.parser import HTMLParser
19 import urllib.request
21 # Looks for spelling errors among strings found in source or documentation files.
22 # N.B.,
23 # - To run this script, you should install pyspellchecker (not spellchecker) using pip.
24 # - Because of colouring, you may want to pipe into less -R
27 # TODO: check structured doxygen comments?
29 # For text colouring/highlighting.
# For text colouring/highlighting.
class bcolors:
    """ANSI escape sequences used to colour/highlight terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    ADDED = '\033[45m'       # magenta background
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'         # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
# Try to exit soon after Ctrl-C is pressed.
should_exit = False   # polled by the long-running loops below

# SIGINT handler: just set the flag; the scanning loops check it and exit cleanly.
def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')

signal.signal(signal.SIGINT, signal_handler)
# Create spellchecker, and augment with some Wireshark words.
# Set up our dict with words from text file.
# N.B. relative path - the script expects to be run from the repo root.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')

# Track words that were not found.
missing_words = []
# Break a camelCase identifier into its component words.
def camelCaseSplit(identifier):
    # Word boundaries are lower->upper transitions, and the start of a
    # Capitalised run at the end of an all-caps sequence (e.g. 'HTTPServer').
    boundary_re = r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return [match.group(0) for match in re.finditer(boundary_re, identifier)]
# A File object contains all of the strings to be checked for a given file.
class File:
    """Holds the strings extracted from one file, plus the spell-check logic
    that examines them (relies on the module-level 'spell' checker and the
    'wiki_db' / 'missing_words' globals)."""

    def __init__(self, file):
        self.file = file
        # Strings/words gathered from the file, checked later by spellCheck().
        self.values = []

        filename, extension = os.path.splitext(file)
        # TODO: add '.lua'? Would also need to check string and comment formats...
        # Code files get comment-stripping and string extraction; anything
        # else is treated as documentation (every word checked).
        self.code_file = extension in {'.c', '.cpp', '.h' }

        with open(file, 'r', encoding="utf8") as f:
            contents = f.read()

            if self.code_file:
                # Remove comments so as not to trip up RE.
                contents = removeComments(contents)

            # Find protocol name and add to dict.
            # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
            matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
            for m in matches:
                protocol = m.group(3)
                # Add to dict.
                spell.word_frequency.load_words([protocol])
                spell.known([protocol])
                print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)

    # Add a string found in this file.
    def add(self, value):
        # On Windows, store raw utf-8 bytes to avoid console-encoding issues.
        self.values.append(value.encode('utf-8') if sys.platform.startswith('win') else value)

    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
    def checkMultiWords(self, word):
        if len(word) < 6:
            return False

        # Don't consider if mixed cases.
        if not (word.islower() or word.isupper()):
            # But make an exception if only the first letter is uppercase..
            if not word == (word[0].upper() + word[1:]):
                return False

        # Try splitting into 2 words recognised at various points.
        # Allow 3-letter words.
        length = len(word)
        for idx in range(3, length-3):
            word1 = word[0:idx]
            word2 = word[idx:]

            if not spell.unknown([word1, word2]):
                return True

        # No 2-way split worked; try splitting into several known words.
        return self.checkMultiWordsRecursive(word)

    # If word before 'id' is recognised, accept word.
    def wordBeforeId(self, word):
        if word.lower().endswith('id'):
            if not spell.unknown([word[0:len(word)-2]]):
                return True
            else:
                return False
        # NOTE: implicitly returns None (falsy) when the word does not end
        # in 'id' - callers only rely upon the truthiness of the result.

    def checkMultiWordsRecursive(self, word):
        # Can 'word' be split into a sequence of known words of >= 4 letters?
        length = len(word)
        if length < 4:
            return False

        for idx in range(4, length+1):
            w = word[0:idx]
            if not spell.unknown([w]):
                if idx == len(word):
                    return True
                else:
                    if self.checkMultiWordsRecursive(word[idx:]):
                        return True

        return False

    def numberPlusUnits(self, word):
        # Accept tokens like '100ms' or '2gb': digits followed by a known
        # unit suffix (compared case-insensitively).
        m = re.search(r'^([0-9]+)([a-zA-Z]+)$', word)
        if m:
            if m.group(2).lower() in { "bit", "bits", "gb", "kbps", "gig", "mb", "th", "mhz", "v", "hz", "k",
                                       "mbps", "m", "g", "ms", "nd", "nds", "rd", "kb", "kbit", "ghz",
                                       "khz", "km", "ms", "usec", "sec", "gbe", "ns", "ksps", "qam", "mm" }:
                return True
        return False

    # Check the spelling of all the words we have found
    def spellCheck(self):

        num_values = len(self.values)
        for value_index,v in enumerate(self.values):
            # Bail out promptly if Ctrl-C was pressed.
            if should_exit:
                exit(1)

            v = str(v)

            # Sometimes parentheses used to show optional letters, so don't leave space
            #if re.compile(r"^[\S]*\(").search(v):
            #    v = v.replace('(', '')
            #if re.compile(r"\S\)").search(v):
            #    v = v.replace(')', '')

            # Ignore includes.
            if v.endswith('.h'):
                continue

            # Store original (as want to include for context in error report).
            original = str(v)

            # Replace most punctuation with spaces, and eliminate common format specifiers.
            v = v.replace('.', ' ')
            v = v.replace(',', ' ')
            v = v.replace('`', ' ')
            v = v.replace(':', ' ')
            v = v.replace(';', ' ')
            v = v.replace('"', ' ')
            v = v.replace('\\', ' ')
            v = v.replace('+', ' ')
            v = v.replace('|', ' ')
            v = v.replace('(', ' ')
            v = v.replace(')', ' ')
            v = v.replace('[', ' ')
            v = v.replace(']', ' ')
            v = v.replace('{', ' ')
            v = v.replace('}', ' ')
            v = v.replace('<', ' ')
            v = v.replace('>', ' ')
            v = v.replace('_', ' ')
            v = v.replace('-', ' ')
            v = v.replace('/', ' ')
            v = v.replace('!', ' ')
            v = v.replace('?', ' ')
            v = v.replace('=', ' ')
            v = v.replace('*', ' ')
            v = v.replace('%u', '')
            v = v.replace('%d', '')
            v = v.replace('%s', '')
            v = v.replace('%', ' ')
            v = v.replace('#', ' ')
            v = v.replace('&', ' ')
            v = v.replace('@', ' ')
            v = v.replace('$', ' ')
            v = v.replace('^', ' ')
            v = v.replace('®', '')
            v = v.replace("'", ' ')
            v = v.replace('"', ' ')
            v = v.replace('~', ' ')

            # Split into words.
            value_words = v.split()
            # Further split up any camelCase words.
            words = []
            for w in value_words:
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            for word in words:
                # Strip trailing digits from word.
                word = word.rstrip('1234567890')

                # Quote marks found in some of the docs...
                word = word.replace('“', '')
                word = word.replace('”', '')

                # Single and collective possession
                if word.endswith("’s"):
                    word = word[:-2]
                if word.endswith("s’"):
                    word = word[:-2]

                # Numbers with unit suffixes (e.g. '100ms') are fine.
                if self.numberPlusUnits(word):
                    continue

                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word):
                    # Highlight words that appeared in Wikipedia list.
                    print(bcolors.BOLD if word in wiki_db else '',
                          self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
                          ' -> ', '?')

                    # TODO: this can be interesting, but takes too long!
                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                    global missing_words
                    missing_words.append(word)
# Turn escaped whitespace sequences - the literal two-character forms
# '\n', '\r' and '\t' - into plain spaces so they don't glue words together.
def removeWhitespaceControl(code_string):
    for escape in ('\\n', '\\r', '\\t'):
        code_string = code_string.replace(escape, ' ')
    return code_string
# Remove any contractions from the given string.
def removeContractions(code_string):
    contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll",
                     "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve",
                     "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s",
                     "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s",
                     "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s",
                     "we’ll", "asciidoctor’s", "protocol’s", "microsoft’s", "wasn’t" ]
    for c in contractions:
        # Strip the lower-case and Capitalised forms, each with both the
        # typographic (’) and the ASCII (') apostrophe.
        for variant in (c, c.capitalize(), c.replace('’', "'"), c.capitalize().replace('’', "'")):
            code_string = code_string.replace(variant, "")
    return code_string
# Strip C-style (/* ... */) and C++-style (// ...) comments from source text.
def removeComments(code_string):
    c_comment = re.compile(r"/\*.*?\*/", re.DOTALL)
    # Avoid matching // where it is allowed, e.g., https://www... or file:///...
    cpp_comment = re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\")(?<!\"\s\s)(?<!file:/)(?<!\,\s)//.*?\n")
    code_string = c_comment.sub("", code_string)
    return cpp_comment.sub("", code_string)
# Return the individual words appearing in C++ (//) and C (/* */) comments.
def getCommentWords(code_string):
    words = []
    # Gather C++ comment contents first, then C comment contents,
    # splitting each match into whitespace-separated words.
    for pattern in (r'//\s(.*?)\n', r'/\*(.*?)\*/'):
        for match in re.finditer(pattern, code_string):
            words += match.group(1).split()
    return words
# Remove escaped/quoted character sequences that would otherwise confuse
# the string-extraction REs.
def removeSingleQuotes(code_string):
    replacements = [
        ('\\\\', " "),        # separate at escaped backslashes
        ('\"\\\\\"', ""),
        ("\\\"", " "),        # escaped double quote
        ("'\"'", ""),         # character literal containing a quote
        ('…', ' '),           # ellipsis character
    ]
    for old, new in replacements:
        code_string = code_string.replace(old, new)
    return code_string
# Remove hex constants (0x...) from the string - their digit runs are not
# meaningful to a spell check.
def removeHexSpecifiers(code_string):
    # A single linear re.sub pass removes every hex literal at once.
    # The previous implementation looped search()+replace() until no match
    # remained, re-scanning the whole string for each distinct literal
    # (quadratic on hex-heavy generated files) for the same end result.
    return re.sub(r'0x[0-9a-fA-F]*', '', code_string)
# Create a File object that knows about all of the strings in the given file.
def findStrings(filename, check_comments=False):
    with open(filename, 'r', encoding="utf8") as f:
        contents = f.read()

    # Remove comments & embedded quotes so as not to trip up RE.
    contents = removeContractions(contents)
    contents = removeWhitespaceControl(contents)
    contents = removeSingleQuotes(contents)
    contents = removeHexSpecifiers(contents)

    # Create file object.
    file = File(filename)

    # What we check depends upon file type.
    if file.code_file:
        # May want to check comments for selected dissectors
        if check_comments:
            for word in getCommentWords(contents):
                file.add(word)

        contents = removeComments(contents)

        # Code, so only check the contents of string literals.
        for match in re.finditer(r'\"([^\"]*)\"', contents):
            file.add(match.group(1))
    else:
        # A documentation file, so examine all words.
        for word in contents.split():
            file.add(word)

    return file
# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        return False

    # Only .c files are ever reported as generated here.
    if not filename.endswith('.c'):
        return False

    # These files are generated, but the notice is further in than we scan below.
    if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'):
        return True

    if filename.endswith('packet-woww.c'):
        return True

    # Phrases that mark a file as having been produced by a tool.
    markers = ('Generated automatically',
               'Autogenerated from',
               'is autogenerated',
               'automatically generated by Pidl',
               'Created by: The Qt Meta Object Compiler',
               'This file was generated',
               'This filter was automatically generated',
               'This file is auto generated, do not edit!',
               'this file is automatically generated')

    # The comment to say that it's generated is near the top, so give up once
    # we get a few lines down.  'with' guarantees the handle is closed on
    # every path - the original open()/close() pairs leaked the handle if a
    # read raised.
    with open(filename, 'r', encoding="utf8") as f_read:
        for line_no, line in enumerate(f_read):
            if line_no > 10:
                return False
            if any(marker in line for marker in markers):
                return True

    # OK, looks like a hand-written file!
    return False
# Decide whether the given file is one we should spell-check at all.
def isAppropriateFile(filename):
    root, extension = os.path.splitext(filename)
    # Skip anything CMake-related.
    if 'CMake' in filename:
        return False
    # TODO: add , '.lua' ?
    # Source and documentation extensions, plus README files with no extension.
    return extension in { '.adoc', '.c', '.cpp', '.pod', '.txt' } or root.endswith('README')
# Return the list of checkable (appropriate, non-generated) files under folder.
def findFilesInFolder(folder, recursive=True):
    files_to_check = []

    if recursive:
        for root, subfolders, files in os.walk(folder):
            for f in files:
                if should_exit:
                    # Return what we have gathered so far.  The previous bare
                    # 'return' yielded None, which crashed callers that do
                    # 'files += findFilesInFolder(...)'.
                    return files_to_check
                f = os.path.join(root, f)
                if isAppropriateFile(f) and not isGeneratedFile(f):
                    files_to_check.append(f)
    else:
        # Non-recursive: only the folder's immediate entries, in sorted order.
        for f in sorted(os.listdir(folder)):
            f = os.path.join(folder, f)
            if isAppropriateFile(f) and not isGeneratedFile(f):
                files_to_check.append(f)

    return files_to_check
# Spell-check a single file.
def checkFile(filename, check_comments=False):
    # Check file exists - e.g. may have been deleted in a recent commit.
    if not os.path.exists(filename):
        print(filename, 'does not exist!')
        return

    # Extract the strings, then check them.
    findStrings(filename, check_comments).spellCheck()
#################################################################
# Main logic.

# command-line args. Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='append',
                    help='specify individual file to test')
parser.add_argument('--folder', action='append',
                    help='specify folder to test')
parser.add_argument('--glob', action='append',
                    help='specify glob to test - should give in "quotes"')
# NOTE: default='' is falsy, so this behaves like the usual store_true
# default of False at the 'not args.no_recurse' use sites below.
parser.add_argument('--no-recurse', action='store_true', default='',
                    help='do not recurse inside chosen folder(s)')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--comments', action='store_true',
                    help='check comments in source files')
parser.add_argument('--no-wikipedia', action='store_true',
                    help='skip checking known bad words from wikipedia - can be slow')
# Typo fixed in help text: 'workds' -> 'words'.
parser.add_argument('--show-most-common', action='store', default='100',
                    help='number of most common not-known words to display')


args = parser.parse_args()
class TypoSourceDocumentParser(HTMLParser):
    """Collect the text found inside the document's <pre> element(s)."""

    def __init__(self):
        super().__init__()
        self.capturing = False   # currently inside a <pre> block?
        self.content = ''        # accumulated <pre> text

    def handle_starttag(self, tag, attrs):
        # Start capturing at <pre>; leave the flag alone for other tags.
        self.capturing = (tag == 'pre') or self.capturing

    def handle_endtag(self, tag):
        # Stop capturing when the <pre> block closes.
        if tag == 'pre':
            self.capturing = False

    def handle_data(self, data):
        if self.capturing:
            self.content += data
# Fetch some common misspellings from wikipedia so we will definitely flag them.
wiki_db = dict()
if not args.no_wikipedia:
    print('Fetching Wikipedia\'s list of common misspellings.')
    req_headers = { 'User-Agent': 'Wireshark check-wikipedia-typos' }
    req = urllib.request.Request('https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines', headers=req_headers)
    try:
        response = urllib.request.urlopen(req)
        content = response.read()
        content = content.decode('UTF-8', 'replace')

        # Extract the "<pre>...</pre>" part of the document.
        parser = TypoSourceDocumentParser()
        parser.feed(content)
        content = parser.content.strip()

        # Each line has the form 'typo->correction'; key the dict by the typo.
        wiki_db = dict(line.lower().split('->', maxsplit=1) for line in content.splitlines())
        del wiki_db['cmo']       # All false positives.
        del wiki_db['ect']       # Too many false positives.
        del wiki_db['thru']      # We'll let that one thru. ;-)
        del wiki_db['sargeant']  # All false positives.

        # Remove each typo word from the spellchecker's known-word dict so
        # it will be flagged wherever it appears.
        removed = 0
        for word in wiki_db:
            try:
                if should_exit:
                    exit(1)
                spell.word_frequency.remove_words([word])
                #print('Removed', word)
                removed += 1
            except Exception:
                # Word wasn't in the dict to begin with - nothing to remove.
                pass

        print('Removed', removed, 'known bad words')
    except Exception:
        # Typo fixed in this message: was 'mispellings'.
        print('Failed to fetch and/or parse Wikipedia misspellings!')
# Get files from wherever command-line args indicate.
files = []
if args.file:
    # Add specified file(s)
    for f in args.file:
        if not os.path.isfile(f):
            print('Chosen file', f, 'does not exist.')
            exit(1)
        else:
            files.append(f)
if args.commits:
    # Get files affected by specified number of commits.
    # NOTE: this assignment replaces any files already added via --file.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files
    files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))
if args.open:
    # Unstaged changes.
    # NOTE: this assignment also replaces any files gathered above.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    # Filter files.
    files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
    # Merge staged into unstaged, avoiding duplicates.
    for f in files_staged:
        if f not in files:
            files.append(f)
if args.glob:
    # Add specified file(s)
    for g in args.glob:
        for f in glob.glob(g):
            if not os.path.isfile(f):
                print('Chosen file', f, 'does not exist.')
                exit(1)
            else:
                files.append(f)

if args.folder:
    for folder in args.folder:
        if not os.path.isdir(folder):
            print('Folder', folder, 'not found!')
            exit(1)
        # Find files from folder.
        print('Looking for files in', folder)
        files += findFilesInFolder(folder, not args.no_recurse)

# By default, scan dissector files.
if not args.file and not args.open and not args.commits and not args.glob and not args.folder:
    # By default, scan dissectors directory
    folder = os.path.join('epan', 'dissectors')
    # Find files from folder.
    print('Looking for files in', folder)
    files = findFilesInFolder(folder, not args.no_recurse)
# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.folder or args.commits or args.open or args.glob:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')


# Now check the chosen files.
for f in files:
    # Check this file.
    checkFile(f, check_comments=args.comments)
    # But get out if control-C has been pressed.
    if should_exit:
        exit(1)


# Show the most commonly not-recognised words (up to --show-most-common of them).
print('')
counter = Counter(missing_words).most_common(int(args.show_most_common))
if len(counter) > 0:
    for c in counter:
        print(c[0], ':', c[1])

# Show error count.  Each unrecognised occurrence counts as one issue.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')