#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import argparse
import aiohttp
import asyncio
import os
import re
import shutil
import signal
import subprocess
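
# Note: aiohttp is the only third-party import above (install it with e.g.
# 'pip install aiohttp'); everything else comes from the standard library.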

# This utility scans the dissector code for URLs, then attempts to
# fetch the links. The results are shown in stdout, but also, at
# the end of the run, written to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is also copied to failures_last_run.txt

# N.B. preferred form of RFC link is e.g., https://tools.ietf.org/html/rfc4349

# TODO:
# - option to write back to dissector file when there is a failure?
# - optionally parse previous/recent successes.txt and avoid fetching them again?
# - make sure URLs are really within comments in code?
# - use urllib.parse or similar to better check URLs?
# - improve regex to allow '+' in URL (like confluence uses)
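
# Example invocations (a sketch; relative paths assume the script is run from
# the top of the Wireshark source tree):
#   tools/check_dissector_urls.py                      # scan all of epan/dissectors
#   tools/check_dissector_urls.py --file packet-ip.c   # scan a single dissector
#   tools/check_dissector_urls.py --commits 5          # files touched by the last 5 commits
#   tools/check_dissector_urls.py --docs               # scan the doc folder instead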

# Try to exit soon after Ctrl-C is pressed.
should_exit = False


def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')
    try:
        tasks = asyncio.all_tasks()
    except RuntimeError:
        # we haven't yet started the async link checking, we can exit directly
        exit(1)
    # ignore further SIGINTs while we're cancelling the running tasks
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    for t in tasks:
        t.cancel()
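
# Install the handler up front: before the event loop exists, Ctrl+C exits
# directly; once the link checks are running, it cancels the outstanding tasks.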
signal.signal(signal.SIGINT, signal_handler)


class FailedLookup:

    def __init__(self):
        # Fake values that will be queried (mirroring an aiohttp response object)
        self.status = 0
        self.headers = {}
        self.headers['content-type'] = '<NONE>'

    def __str__(self):
        s = ('FailedLookup: status=' + str(self.status) +
             ' content-type=' + self.headers['content-type'])
        return s


# Dictionary from url -> result
cached_lookups = {}


class Link(object):

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False

    def __str__(self):
        epan_idx = self.file.find('epan')
        if epan_idx == -1:
            filename = self.file
        else:
            filename = self.file[epan_idx:]
        s = ('SUCCESS ' if self.success else 'FAILED  ') + \
            filename + ':' + str(self.line_number) + '   ' + self.url
        if True:  # self.r:
            if self.r.status:
                s += "   status-code=" + str(self.r.status)
                if 'content-type' in self.r.headers:
                    s += (' content-type="' +
                          self.r.headers['content-type'] + '"')
            else:
                s += '   <No response Received>'
        return s

    def validate(self):
        global cached_lookups
        global should_exit
        if should_exit:
            return
        self.tested = True
        if self.url in cached_lookups:
            self.r = cached_lookups[self.url]
        else:
            self.r = FailedLookup()

        if self.r.status < 200 or self.r.status >= 300:
            self.success = False
        else:
            self.success = True

        if (args.verbose or not self.success) and not should_exit:
            print(self)


links = []
files = []
all_urls = set()


def find_links_in_file(filename):
    if os.path.isdir(filename):
        return

    with open(filename, 'r', encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            # TODO: not matching
            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
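            # The allowed character set below excludes characters such as ')'
            # and ',' that usually just delimit a URL inside a comment;
            # percent-escapes (%xx) are accepted, but '+' is not (hence the TODO).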
            urls = re.findall(
                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

            for url in urls:
                # Lop off any trailing chars that are not part of it
                url = url.rstrip(").',")

                # A url must have a period somewhere
                if '.' not in url:
                    continue
                global links, all_urls
                links.append(Link(filename, line_number, url))
                all_urls.add(url)


# Scan the given folder for links to test. Recurses.
def find_links_in_folder(folder):
    files_to_check = []
    for root, subfolders, files in os.walk(folder):
        for f in files:
            if should_exit:
                return
            file = os.path.join(root, f)
            if file.endswith('.c') or file.endswith('.adoc'):
                files_to_check.append(file)

    # Deal with files in sorted order.
    for file in sorted(files_to_check):
        find_links_in_file(file)


async def populate_cache(sem, session, url):
    global cached_lookups
    if should_exit:
        return
    async with sem:
        try:
            async with session.get(url) as r:
                cached_lookups[url] = r
                if args.verbose:
                    print('checking ', url, ': success', sep='')
        except (asyncio.CancelledError, ValueError, ConnectionError, Exception):
            cached_lookups[url] = FailedLookup()
            if args.verbose:
                print('checking ', url, ': failed', sep='')
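

# Fetch every URL that was found, then validate each Link against the cached
# results. Concurrency is kept modest: at most 50 requests in flight (the
# semaphore), a pool of 30 connections, and a 25-second total timeout per request.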
async def check_all_links(links):
    sem = asyncio.Semaphore(50)
    timeout = aiohttp.ClientTimeout(total=25)
    connector = aiohttp.TCPConnector(limit=30)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
    async with aiohttp.ClientSession(connector=connector, headers=headers, timeout=timeout) as session:
        tasks = [populate_cache(sem, session, u) for u in all_urls]
        try:
            await asyncio.gather(*tasks)
        except asyncio.CancelledError:
            await session.close()

    for link in links:
        link.validate()


#################################################################
# Main logic.

# command-line args. Controls which dissector files should be scanned.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check URL links in dissectors')
parser.add_argument('--file', action='append',
                    help='specify individual dissector file to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--verbose', action='store_true',
                    help='when enabled, show more output')
parser.add_argument('--docs', action='store_true',
                    help='when enabled, also check document folders')

args = parser.parse_args()
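

# Treat a file as a dissector source file only if its name matches packet-*.c
# or file-*.c; used below to filter the file lists that git reports.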
def is_dissector_file(filename):
    p = re.compile(r'.*(packet|file)-.*\.c')
    return p.match(filename)


# Get files from wherever command-line args indicate.
if args.file:
    # Add specified file(s)
    for f in args.file:
        if not os.path.isfile(f) and not f.startswith('epan'):
            f = os.path.join('epan', 'dissectors', f)
        if not os.path.isfile(f):
            print('Chosen file', f, 'does not exist.')
            exit(1)
        else:
            files.append(f)
            find_links_in_file(f)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Fetch links from files (dissector files only)
    files = list(filter(is_dissector_file, files))
    for f in files:
        find_links_in_file(f)
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    files = list(filter(is_dissector_file, files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    files_staged = list(filter(is_dissector_file, files_staged))
    for f in files:
        find_links_in_file(f)
    for f in files_staged:
        if f not in files:
            find_links_in_file(f)
            files.append(f)
elif args.docs:
    # Find links from doc folder(s)
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'doc'))
else:
    # Find links from dissector folder.
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'epan', 'dissectors'))


# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    if not args.docs:
        print('All dissector modules\n')
    else:
        print('Document sources')

asyncio.run(check_all_links(links))

# Write failures to a file. Back up any previous first though.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for link in links:
        if link.tested and not link.success:
            f_f.write(str(link) + '\n')
# And successes
with open('successes.txt', 'w') as f_s:
    for link in links:
        if link.tested and link.success:
            f_s.write(str(link) + '\n')


# Count and show overall stats.
passed, failed = 0, 0
for link in links:
    if link.tested:
        if link.success:
            passed += 1
        else:
            failed += 1

print('--------------------------------------------------------------------------------------------------')
print(len(links), 'links checked: ', passed, 'passed,', failed, 'failed')