#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
# This utility scans the dissector code for URLs, then attempts to
# fetch the links. The results are shown in stdout, but also, at
# the end of the run, written to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is also copied to failures_last_run.txt
#
# N.B. preferred form of RFC link is e.g., https://tools.ietf.org/html/rfc4349
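#
# Each checked link is reported (and written to the files above) roughly in the
# form produced by Link.__str__ below; file name and URL here are illustrative:
#   FAILED epan/dissectors/packet-foo.c:123 https://example.com/spec status-code=404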
# TODO:
# - option to write back to dissector file when there is a failure?
# - optionally parse previous/recent successes.txt and avoid fetching them again?
# - make sure URLs are really within comments in code?
# - use urllib.parse or similar to better check URLs?
# - improve regex to allow '+' in URL (like confluence uses)
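#
# Illustrative invocations (script path shown generically; run from the
# top-level source directory so relative 'epan/dissectors' paths resolve):
#   <this_script>                      # scan all of epan/dissectors
#   <this_script> --commits 5          # only files touched by the last 5 commits
#   <this_script> --file packet-tcp.c --verbose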
import argparse
import aiohttp
import asyncio
import os
import re
import shutil
import signal
import subprocess


# Try to exit soon after Ctrl-C is pressed.
should_exit = False


def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')
    try:
        tasks = asyncio.all_tasks()
    except RuntimeError:
        # We haven't yet started the async link checking, so we can exit directly.
        exit(1)
    # Ignore further SIGINTs while we're cancelling the running tasks.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    for t in tasks:
        t.cancel()


signal.signal(signal.SIGINT, signal_handler)
class FailedLookup:

    def __init__(self):
        # Fake values that will be queried (in place of a fetched response's values).
        self.status = 0
        self.headers = {}
        self.headers['content-type'] = '<NONE>'

    def __str__(self):
        s = ('FailedLookup: status=' + str(self.status) +
             ' content-type=' + self.headers['content-type'])
        return s
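

# Note: FailedLookup exposes the same 'status' and 'headers' attributes that are
# read from a real aiohttp response, so Link.__str__() and Link.validate() below
# can treat successful and failed lookups uniformly.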
# Dictionary from url -> result
cached_lookups = {}


class Link(object):

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False

    def __str__(self):
        epan_idx = self.file.find('epan')
        if epan_idx == -1:
            filename = self.file
        else:
            filename = self.file[epan_idx:]
        s = ('SUCCESS ' if self.success else 'FAILED ') + \
            filename + ':' + str(self.line_number) + ' ' + self.url
        if self.r and self.r.status:
            s += " status-code=" + str(self.r.status)
            if 'content-type' in self.r.headers:
                s += (' content-type="' +
                      self.r.headers['content-type'] + '"')
        else:
            s += ' <No response Received>'
        return s
    def validate(self):
        global cached_lookups
        global should_exit
        if should_exit:
            return
        self.tested = True

        if self.url in cached_lookups:
            self.r = cached_lookups[self.url]
        else:
            self.r = FailedLookup()

        if self.r.status < 200 or self.r.status >= 300:
            self.success = False
        else:
            self.success = True

        if (args.verbose or not self.success) and not should_exit:
            print(self)


links = []
files = []
all_urls = set()
def find_links_in_file(filename):
    if os.path.isdir(filename):
        return

    with open(filename, 'r', encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            # TODO: not currently matching URLs that contain '+', e.g.
            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
            urls = re.findall(
                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

            for url in urls:
                # Lop off any trailing chars that are not part of it
                url = url.rstrip(").',")

                # A url must have a period somewhere
                if '.' not in url:
                    continue
                global links, all_urls
                links.append(Link(filename, line_number, url))
                all_urls.add(url)
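

# For reference: on a source line such as
#     /* See https://tools.ietf.org/html/rfc4349. */
# the regex captures 'https://tools.ietf.org/html/rfc4349.' and the rstrip()
# above then removes the stray trailing '.' before the URL is recorded.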
# Scan the given folder for links to test. Recurses.
def find_links_in_folder(folder):
    files_to_check = []
    for root, subfolders, files in os.walk(folder):
        for f in files:
            if should_exit:
                return
            file = os.path.join(root, f)
            if file.endswith('.c') or file.endswith('.adoc'):
                files_to_check.append(file)

    # Deal with files in sorted order.
    for file in sorted(files_to_check):
        find_links_in_file(file)
async def populate_cache(sem, session, url):
    global cached_lookups
    if should_exit:
        return
    async with sem:
        try:
            async with session.get(url) as r:
                cached_lookups[url] = r
                if args.verbose:
                    print('checking ', url, ': success', sep='')

        except (asyncio.CancelledError, ValueError, ConnectionError, Exception):
            cached_lookups[url] = FailedLookup()
            if args.verbose:
                print('checking ', url, ': failed', sep='')
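

# Because all_urls is a set, each distinct URL is fetched at most once; every
# Link object that references the same URL is later validated against the
# shared entry in cached_lookups.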
async def check_all_links(links):
    sem = asyncio.Semaphore(50)
    timeout = aiohttp.ClientTimeout(total=25)
    connector = aiohttp.TCPConnector(limit=30)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
    async with aiohttp.ClientSession(connector=connector, headers=headers, timeout=timeout) as session:
        tasks = [populate_cache(sem, session, u) for u in all_urls]
        try:
            await asyncio.gather(*tasks)
        except asyncio.CancelledError:
            await session.close()

    # Now check each link against the cached lookup results.
    for link in links:
        link.validate()
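

# The Semaphore(50), TCPConnector(limit=30) and 25-second total timeout above
# bound how many fetches run concurrently and how long any single fetch may
# take; adjust them if runs are too slow or remote servers start rate-limiting.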
#################################################################

# command-line args. Controls which dissector files should be scanned.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check URL links in dissectors')
parser.add_argument('--file', action='append',
                    help='specify individual dissector file to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--verbose', action='store_true',
                    help='when enabled, show more output')
parser.add_argument('--docs', action='store_true',
                    help='when enabled, also check document folders')

args = parser.parse_args()
def is_dissector_file(filename):
    p = re.compile(r'.*(packet|file)-.*\.c')
    return p.match(filename)
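

# e.g. 'epan/dissectors/packet-tcp.c' matches this pattern, while files such as
# 'CMakeLists.txt' or 'README.md' (illustrative examples) do not.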
# Get files from wherever command-line args indicate.
if args.file:
    # Add specified file(s)
    for f in args.file:
        if not os.path.isfile(f) and not f.startswith('epan'):
            f = os.path.join('epan', 'dissectors', f)
        if not os.path.isfile(f):
            print('Chosen file', f, 'does not exist.')
            exit(1)
        else:
            files.append(f)
            find_links_in_file(f)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Fetch links from files (dissectors files only)
    files = list(filter(is_dissector_file, files))
    for f in files:
        find_links_in_file(f)
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    files = list(filter(is_dissector_file, files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    files_staged = list(filter(is_dissector_file, files_staged))
    for f in files:
        find_links_in_file(f)
    for f in files_staged:
        if f not in files:
            find_links_in_file(f)
            files.append(f)
elif args.docs:
    # Find links from doc folder(s)
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'doc'))
else:
    # Find links from dissector folder.
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'epan', 'dissectors'))
# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    if not args.docs:
        print('All dissector modules\n')
    else:
        print('Document sources')

asyncio.run(check_all_links(links))
# Write failures to a file. Back up any previous first though.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for link in links:
        if link.tested and not link.success:
            f_f.write(str(link) + '\n')
# And successes.
with open('successes.txt', 'w') as f_s:
    for link in links:
        if link.tested and link.success:
            f_s.write(str(link) + '\n')
# Count and show overall stats.
passed, failed = 0, 0
for link in links:
    if not link.tested:
        continue
    if link.success:
        passed += 1
    else:
        failed += 1

print('--------------------------------------------------------------------------------------------------')
print(len(links), 'links checked: ', passed, 'passed,', failed, 'failed')