#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
# This utility scans the dissector code for URLs, then attempts to
# fetch the links. The results are shown in stdout, but also, at
# the end of the run, written to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is also copied to failures_last_run.txt
#
# N.B. preferred form of RFC link is e.g., https://tools.ietf.org/html/rfc4349
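#
# Each checked link is reported (and written to the files above) roughly in the
# form produced by Link.__str__ below; file name and URL here are illustrative:
#   FAILED epan/dissectors/packet-foo.c:123 https://example.com/spec status-code=404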
# TODO:
# - option to write back to dissector file when there is a failure?
# - optionally parse previous/recent successes.txt and avoid fetching them again?
# - make sure URLs are really within comments in code?
# - use urllib.parse or similar to better check URLs?
# - improve regex to allow '+' in URL (like confluence uses)
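#
# Illustrative invocations (script path shown generically; run from the
# top-level source directory so relative 'epan/dissectors' paths resolve):
#   <this_script>                      # scan all of epan/dissectors
#   <this_script> --commits 5          # only files touched by the last 5 commits
#   <this_script> --file packet-tcp.c --verbose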
import argparse
import aiohttp
import asyncio
import os
import re
import shutil
import signal
import subprocess


# Try to exit soon after Ctrl-C is pressed.
should_exit = False


def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')
    try:
        tasks = asyncio.all_tasks()
    except RuntimeError:
        # We haven't yet started the async link checking, so we can exit directly.
        exit(1)
    # Ignore further SIGINTs while we're cancelling the running tasks.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    for t in tasks:
        t.cancel()


signal.signal(signal.SIGINT, signal_handler)
class FailedLookup:

    def __init__(self):
        # Fake values that will be queried (in place of a fetched response's values).
        self.status = 0
        self.headers = {}
        self.headers['content-type'] = '<NONE>'

    def __str__(self):
        s = ('FailedLookup: status=' + str(self.status) +
             ' content-type=' + self.headers['content-type'])
        return s
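

# Note: FailedLookup exposes the same 'status' and 'headers' attributes that are
# read from a real aiohttp response, so Link.__str__() and Link.validate() below
# can treat successful and failed lookups uniformly.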
# Dictionary from url -> result
cached_lookups = {}


class Link(object):

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False

    def __str__(self):
        epan_idx = self.file.find('epan')
        if epan_idx == -1:
            filename = self.file
        else:
            filename = self.file[epan_idx:]
        s = ('SUCCESS ' if self.success else 'FAILED ') + \
            filename + ':' + str(self.line_number) + ' ' + self.url
        if self.r and self.r.status:
            s += " status-code=" + str(self.r.status)
            if 'content-type' in self.r.headers:
                s += (' content-type="' +
                      self.r.headers['content-type'] + '"')
        else:
            s += ' <No response Received>'
        return s
    def validate(self):
        global cached_lookups
        global should_exit
        if should_exit:
            return
        self.tested = True

        if self.url in cached_lookups:
            self.r = cached_lookups[self.url]
        else:
            self.r = FailedLookup()

        if self.r.status < 200 or self.r.status >= 300:
            self.success = False
        else:
            self.success = True

        if (args.verbose or not self.success) and not should_exit:
            print(self)


links = []
files = []
all_urls = set()
def find_links_in_file(filename):
    if os.path.isdir(filename):
        return

    with open(filename, 'r', encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            # TODO: not currently matching URLs that contain '+', e.g.
            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
            urls = re.findall(
                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

            for url in urls:
                # Lop off any trailing chars that are not part of it
                url = url.rstrip(").',")

                # A url must have a period somewhere
                if '.' not in url:
                    continue
                global links, all_urls
                links.append(Link(filename, line_number, url))
                all_urls.add(url)
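

# For reference: on a source line such as
#     /* See https://tools.ietf.org/html/rfc4349. */
# the regex captures 'https://tools.ietf.org/html/rfc4349.' and the rstrip()
# above then removes the stray trailing '.' before the URL is recorded.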
# Scan the given folder for links to test. Recurses.
def find_links_in_folder(folder):
    files_to_check = []
    for root, subfolders, files in os.walk(folder):
        for f in files:
            if should_exit:
                return
            file = os.path.join(root, f)
            if file.endswith('.c') or file.endswith('.adoc'):
                files_to_check.append(file)

    # Deal with files in sorted order.
    for file in sorted(files_to_check):
        find_links_in_file(file)
async def populate_cache(sem, session, url):
    global cached_lookups
    if should_exit:
        return
    async with sem:
        try:
            async with session.get(url) as r:
                cached_lookups[url] = r
                if args.verbose:
                    print('checking ', url, ': success', sep='')

        except (asyncio.CancelledError, ValueError, ConnectionError, Exception):
            cached_lookups[url] = FailedLookup()
            if args.verbose:
                print('checking ', url, ': failed', sep='')
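

# Because all_urls is a set, each distinct URL is fetched at most once; every
# Link object that references the same URL is later validated against the
# shared entry in cached_lookups.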
async def check_all_links(links):
    sem = asyncio.Semaphore(50)
    timeout = aiohttp.ClientTimeout(total=25)
    connector = aiohttp.TCPConnector(limit=30)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
    async with aiohttp.ClientSession(connector=connector, headers=headers, timeout=timeout) as session:
        tasks = [populate_cache(sem, session, u) for u in all_urls]
        try:
            await asyncio.gather(*tasks)
        except asyncio.CancelledError:
            await session.close()

    # Now check each link against the cached lookup results.
    for link in links:
        link.validate()
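

# The Semaphore(50), TCPConnector(limit=30) and 25-second total timeout above
# bound how many fetches run concurrently and how long any single fetch may
# take; adjust them if runs are too slow or remote servers start rate-limiting.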
#################################################################

# command-line args. Controls which dissector files should be scanned.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check URL links in dissectors')
parser.add_argument('--file', action='append',
                    help='specify individual dissector file to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--verbose', action='store_true',
                    help='when enabled, show more output')
parser.add_argument('--docs', action='store_true',
                    help='when enabled, also check document folders')

args = parser.parse_args()
def is_dissector_file(filename):
    p = re.compile(r'.*(packet|file)-.*\.c')
    return p.match(filename)
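

# e.g. 'epan/dissectors/packet-tcp.c' matches this pattern, while files such as
# 'CMakeLists.txt' or 'README.md' (illustrative examples) do not.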
# Get files from wherever command-line args indicate.
if args.file:
    # Add specified file(s)
    for f in args.file:
        if not os.path.isfile(f) and not f.startswith('epan'):
            f = os.path.join('epan', 'dissectors', f)
        if not os.path.isfile(f):
            print('Chosen file', f, 'does not exist.')
            exit(1)
        else:
            files.append(f)
            find_links_in_file(f)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Fetch links from files (dissectors files only)
    files = list(filter(is_dissector_file, files))
    for f in files:
        find_links_in_file(f)
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    files = list(filter(is_dissector_file, files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    files_staged = list(filter(is_dissector_file, files_staged))
    for f in files:
        find_links_in_file(f)
    for f in files_staged:
        if f not in files:
            find_links_in_file(f)
            files.append(f)
elif args.docs:
    # Find links from doc folder(s)
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'doc'))
else:
    # Find links from dissector folder.
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'epan', 'dissectors'))
# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    if not args.docs:
        print('All dissector modules\n')
    else:
        print('Document sources')

asyncio.run(check_all_links(links))
# Write failures to a file. Back up any previous first though.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for link in links:
        if link.tested and not link.success:
            f_f.write(str(link) + '\n')
# And successes.
with open('successes.txt', 'w') as f_s:
    for link in links:
        if link.tested and link.success:
            f_s.write(str(link) + '\n')
# Count and show overall stats.
passed, failed = 0, 0
for link in links:
    if not link.tested:
        continue
    if link.success:
        passed += 1
    else:
        failed += 1

print('--------------------------------------------------------------------------------------------------')
print(len(links), 'links checked: ', passed, 'passed,', failed, 'failed')