dcerpc-nt: add UNION_ALIGN_TO... helpers
[wireshark-sm.git] / tools / make-manuf.py
blobd4c77942c1970773253c97c0dabc561faa13815c
1 #!/usr/bin/env python3
3 # Wireshark - Network traffic analyzer
4 # By Gerald Combs <gerald@wireshark.org>
5 # Copyright 1998 Gerald Combs
7 # SPDX-License-Identifier: GPL-2.0-or-later
8 '''Update the "manuf" file.
10 Make-manuf creates a file containing ethernet OUIs and their company
11 IDs from the databases at IEEE.
12 '''
14 import csv
15 import html
16 import io
17 import os
18 import re
19 import sys
20 import urllib.request, urllib.error, urllib.parse
# PyICU is an optional dependency. shorten() only consults have_icu to
# decide whether it can truncate names by grapheme cluster.
try:
    # Use the grapheme or segments module instead?
    import icu
except ImportError:
    have_icu = False
else:
    have_icu = True
def exit_msg(msg=None, status=1):
    '''Print an optional message plus the module usage text, then exit.

    msg: text written to stderr first, followed by a blank line. Skipped
        when None.
    status: process exit status handed to sys.exit().

    Raises SystemExit; this function never returns.
    '''
    if msg is not None:
        sys.stderr.write(msg + '\n\n')
    # The module docstring is None when docstrings are stripped (python -OO).
    # Concatenating it unguarded would raise TypeError and mask both the
    # message above and the requested exit status.
    if __doc__ is not None:
        sys.stderr.write(__doc__ + '\n')
    sys.exit(status)
def open_url(url):
    '''Return the contents of one registry file as a str.

    url: a sequence whose first item is the base URL and whose second item
        is the file name, e.g. ["https://host/oui/", "oui.csv"].

    When a command line argument is present it is treated as a local
    directory and the file name (url[1]) is read from there instead of
    being downloaded. Otherwise the parts of url are joined into a single
    URL and fetched over the network.

    Both paths decode the data as UTF-8 (undecodable bytes are replaced)
    and drop U+200E LEFT-TO-RIGHT MARK characters, so a local copy and a
    fresh download of the same file give the same result.

    A failed download terminates the script through exit_msg(). Errors
    while reading a local file propagate to the caller.
    '''
    if len(sys.argv) > 1:
        url_path = os.path.join(sys.argv[1], url[1])
        # Be explicit about the encoding: the platform default is not
        # necessarily UTF-8, and the registry files contain non-ASCII names.
        with open(url_path, encoding='UTF-8', errors='replace') as url_fd:
            body = url_fd.read()
    else:
        # The base URL normally ends in '/', so strip it before joining to
        # avoid requesting ".../oui//oui.csv".
        url_path = '/'.join([url[0].rstrip('/')] + list(url[1:]))

        req_headers = { 'User-Agent': 'Wireshark make-manuf' }
        try:
            req = urllib.request.Request(url_path, headers=req_headers)
            with urllib.request.urlopen(req) as response:
                body = response.read().decode('UTF-8', 'replace')
        except Exception as err:
            # Keep the underlying reason so failures can be diagnosed.
            exit_msg('Error opening {}: {}'.format(url_path, err))

    return body.replace('\u200e', '')
# Regular expression alternation of business types and other filler words
# that shorten() strips from manufacturer names. The patterns are applied
# after punctuation has been removed, so "A/S" is matched as "a s", etc.
# More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
general_terms = '|'.join((
    r' a +s\b',  # A/S and A.S. but not "As" as in "Connect As".
    r' ab\b',  # Also follows "Oy", which is covered below.
    r' ag\b',
    r' b ?v\b',
    r' closed joint stock company\b',
    r' co\b',
    r' company\b',
    r' corp\b',
    r' corporation\b',
    r' corporate\b',
    r' de c ?v\b',  # Follows "S.A.", which is covered separately below.
    r' gmbh\b',
    r' holding\b',
    r' inc\b',
    r' incorporated\b',
    r' jsc\b',
    r' kg\b',
    r' k k\b',  # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
    r' limited\b',
    r' llc\b',
    r' ltd\b',
    r' n ?v\b',
    r' oao\b',
    r' of\b',
    r' open joint stock company\b',
    r' ooo\b',
    r' oü\b',
    r' oy\b',
    r' oyj\b',
    r' plc\b',
    r' pty\b',
    r' pvt\b',
    r' s ?a ?r ?l\b',
    r' s ?a\b',
    r' s ?p ?a\b',
    r' sp ?k\b',
    r' s ?r ?l\b',
    r' systems\b',
    r'\bthe\b',
    r' zao\b',
    r' z ?o ?o\b',
))
# Chinese company names tend to start with the location, skip it
# (non-exhaustive list). Entries are lower case because shorten() compares
# them against the lower-cased first word of the name.
skip_start = [
    'shengzen', 'shenzhen', 'beijing',
    'shanghai', 'wuhan', 'hangzhou',
    'guangxi', 'guangdong', 'chengdu',
]
# Special cases handled directly: shorten() replaces a cleaned-up name that
# exactly matches a key with the corresponding value.
special_case = {
    "Advanced Micro Devices": "AMD",
    # 杭州德澜科技有限公司(HangZhou Delan Technology Co.,Ltd)
    "杭州德澜科技有限公司": "DelanTech",
}
def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names.

    manuf: the organization name as read from a registry file.

    Returns a two-item list [short, long]:
      short - the name with parenthesized text, punctuation, general
              business terms, a leading location word and all whitespace
              removed, truncated to at most 12 characters.
      long  - the whitespace-normalized original name (title-cased if it
              was all upper case), or None when the short name already
              equals the original ignoring case.

    Exits the script with status 1 if nothing is left of the name.
    '''
    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Convert all caps to title case
    if manuf.isupper():
        manuf = manuf.title()
    # Remove the contents of parenthesis as ancillary data
    manuf = re.sub(r"\(.*\)", '', manuf)
    # Remove the contents of fullwidth parenthesis (mostly in Asian names).
    # The delimiters are U+FF08 and U+FF09, spelled as escapes so they
    # cannot be mistaken for (or converted to) ASCII parentheses: an ASCII
    # "(.*)" is a capture group that matches, and would delete, the whole
    # name.
    manuf = re.sub("\uff08.*\uff09", '', manuf)
    # Remove "a" before removing punctuation ("Aruba, a Hewlett [...]" etc.)
    manuf = manuf.replace(" a ", " ")
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(r"[\"',./:()+-]", ' ', manuf)
    # XXX For some reason including the double angle brackets in the above
    # regex makes it bomb
    manuf = re.sub(r"[«»“”]", ' ', manuf)
    # & isn't needed when Standalone
    manuf = manuf.replace(" & ", " ")
    # Remove business types and other general terms ("the", "inc", "plc", etc.)
    plain_manuf = re.sub(general_terms, '', manuf, flags=re.IGNORECASE)
    # ...but make sure we don't remove everything.
    if plain_manuf.strip(' '):
        manuf = plain_manuf

    manuf = manuf.strip()

    # Check for special case
    manuf = special_case.get(manuf, manuf)

    # XXX: Some of the entries have Chinese city or other location
    # names written with spaces between each character, like
    # Bei jing, Wu Han, Shen Zhen, etc. We should remove that too.
    split = manuf.split()
    if len(split) > 1 and split[0].lower() in skip_start:
        manuf = ' '.join(split[1:])

    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    if len(manuf) < 1:
        sys.stderr.write('Manufacturer "{}" shortened to nothing.\n'.format(orig_manuf))
        sys.exit(1)

    # Truncate names to a reasonable length, say, 12 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 12 bytes. It might also be less than 12 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717

    # Truncate by code points
    trunc_len = 12

    if have_icu:
        # Truncate by grapheme clusters: cut at the last of the first
        # trunc_len boundaries reported by the break iterator.
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)
        bounds = bounds[0:trunc_len]
        trunc_len = bounds[-1]

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return [manuf, None]

    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', orig_manuf)
    # If company is all caps, convert to mixed case (so it doesn't look
    # like we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return [manuf, mixed_manuf]
# Registry kinds returned by prefix_to_oui(): large (24-bit), medium (28-bit)
# and small (36-bit) address blocks. main() also writes these strings
# verbatim into the generated C registry table.
MA_L, MA_M, MA_S = 'MA_L', 'MA_M', 'MA_S'
def prefix_to_oui(prefix, prefix_map):
    '''Convert a hex assignment string to a colon-separated prefix.

    prefix: the assignment as hex digits without separators; 6, 7 or 9
        digits (24, 28 or 36 bits).
    prefix_map: dict updated in place. For 28- and 36-bit assignments the
        leading 24-bit prefix ("AA:BB:CC") is mapped to the registry kind.

    Returns a tuple (oui, kind):
      24 bits - ("AA:BB:CC", MA_L); prefix_map is left untouched.
      28 bits - ("AA:BB:CC:D0:00:00/28", MA_M)
      36 bits - ("AA:BB:CC:DD:E0:00/36", MA_S)

    Raises ValueError for any other prefix length.
    '''
    # Each hex digit carries four bits.
    pfx_len = len(prefix) * 4
    prefix24 = prefix[:6]
    oui24 = ':'.join(hi + lo for hi, lo in zip(prefix24[0::2], prefix24[1::2]))

    if pfx_len == 24:
        # 24-bit OUI assignment, no mask
        return oui24, MA_L

    # Other lengths which require a mask.
    if pfx_len == 28:
        kind = MA_M
    elif pfx_len == 36:
        kind = MA_S
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            'Unsupported prefix "{}": expected 24, 28 or 36 bits, got {}'.format(prefix, pfx_len))

    # Pad to a full 48-bit address so the mask applies to six octets.
    oui = prefix.ljust(12, '0')
    oui = ':'.join(hi + lo for hi, lo in zip(oui[0::2], oui[1::2]))
    prefix_map[oui24] = kind

    return '{}/{:d}'.format(oui, pfx_len), kind
def _write_manuf_table(manuf_fd, c_type, c_name, entries, num_octets, pad_to, trailer):
    '''Write one C array of manufacturer entries, sorted by prefix.

    manuf_fd: writable text file object.
    c_type, c_name: element type and variable name of the C array.
    entries: dict mapping a colon-separated prefix string to a
        [short, long] pair; a false long name falls back to the short one.
    num_octets: number of leading octets of the prefix to emit.
    pad_to: column to which the line is padded before the long name.
    trailer: text written after the last entry to close the array.
    '''
    manuf_fd.write("static const {} {}[] = {{\n".format(c_type, c_name))
    for oui in sorted(entries):
        short, long_name = entries[oui]
        if not long_name:
            long_name = short
        # Octet i of "AA:BB:CC:..." occupies characters 3*i and 3*i + 1.
        octets = ', '.join('0x' + oui[3 * i:3 * i + 2] for i in range(num_octets))
        line = " {{ {{ {} }}, \"{}\", ".format(octets, short)
        line = line.ljust(pad_to)
        line += "\"{}\" }},\n".format(long_name.replace('"', '\\"'))
        manuf_fd.write(line)
    manuf_fd.write(trailer)


def main():
    '''Merge the IEEE registries and write epan/manuf-data.c.

    Reads each registry through open_url(), shortens the organization
    names, and writes four C tables (registry kinds per 24-bit prefix, plus
    the 24-, 28- and 36-bit name tables) to manuf-data.c relative to this
    script. Per-registry statistics are printed to stdout.

    Terminates through exit_msg() if a registry or the overall total has
    fewer entries than expected, or if the output file cannot be opened.
    '''
    manuf_path = os.path.join(os.path.dirname(__file__), '..', 'epan', 'manuf-data.c')

    ieee_d = {
        'OUI': { 'url': ["https://standards-oui.ieee.org/oui/", "oui.csv"], 'min_entries': 1000 },
        'CID': { 'url': ["https://standards-oui.ieee.org/cid/", "cid.csv"], 'min_entries': 75 },
        'IAB': { 'url': ["https://standards-oui.ieee.org/iab/", "iab.csv"], 'min_entries': 1000 },
        'OUI28': { 'url': ["https://standards-oui.ieee.org/oui28/", "mam.csv"], 'min_entries': 1000 },
        'OUI36': { 'url': ["https://standards-oui.ieee.org/oui36/", "oui36.csv"], 'min_entries': 1000 },
    }
    oui_d = {
        MA_L: { '00:00:00' : ['00:00:00', 'Officially Xerox, but 0:0:0:0:0:0 is more common'] },
        MA_M: {},
        MA_S: {},
    }

    min_total = 35000 # 35830 as of 2018-09-05
    # NOTE(review): despite the name this sums the per-registry 'total'
    # counters, which include skipped rows - confirm that is intended.
    total_added = 0

    # Add IEEE entries from each of their databases
    ieee_db_l = ['OUI', 'OUI28', 'OUI36', 'CID', 'IAB']

    # map a 24-bit prefix to MA-M/MA-S or none (MA-L by default)
    prefix_map = {}

    for db in ieee_db_l:
        db_info = ieee_d[db]
        db_url = db_info['url']
        db_info['skipped'] = 0
        db_info['added'] = 0
        db_info['total'] = 0
        print('Merging {} data from {}'.format(db, db_url))
        body = open_url(db_url)
        ieee_csv = csv.reader(body.splitlines())

        # Pop the title row. An empty body simply yields no rows, which the
        # minimum-entry check below reports.
        next(ieee_csv, None)
        for ieee_row in ieee_csv:
            #Registry,Assignment,Organization Name,Organization Address
            #IAB,0050C2DD6,Transas Marine Limited,Datavagen 37 Askim Vastra Gotaland SE 436 32
            if len(ieee_row) < 3:
                # Blank or truncated line: nothing to merge.
                continue
            oui, kind = prefix_to_oui(ieee_row[1].upper(), prefix_map)
            manuf = ieee_row[2].strip()
            # The Organization Name field occasionally contains HTML entities. Undo them.
            manuf = html.unescape(manuf)
            # "Watts A\S"
            manuf = manuf.replace('\\', '/')
            if manuf == 'IEEE Registration Authority':
                # These are held for subdivision into MA-M/MA-S
                continue
            # Entries named 'Private' are not skipped.
            if oui in oui_d[kind]:
                # The first registry to claim a prefix wins.
                print('{} - {} IEEE "{}" in favor of "{}"'.format(oui, 'Skipping', manuf, oui_d[kind][oui]))
                db_info['skipped'] += 1
            else:
                oui_d[kind][oui] = shorten(manuf)
                db_info['added'] += 1
            db_info['total'] += 1

        if db_info['total'] < db_info['min_entries']:
            exit_msg("Too few {} entries. Got {}, wanted {}".format(db, db_info['total'], db_info['min_entries']))
        total_added += db_info['total']

    if total_added < min_total:
        exit_msg("Too few total entries ({})".format(total_added))

    try:
        manuf_fd = io.open(manuf_path, 'w', encoding='UTF-8')
    except Exception:
        exit_msg("Couldn't open manuf file for writing ({}) ".format(manuf_path))

    # The context manager closes the file even if writing fails part way.
    with manuf_fd:
        manuf_fd.write('''/*
 * This file was generated by running ./tools/make-manuf.py.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * The data below has been assembled from the following sources:
 *
 * The IEEE public OUI listings available from:
 * <http://standards-oui.ieee.org/oui/oui.csv>
 * <http://standards-oui.ieee.org/cid/cid.csv>
 * <http://standards-oui.ieee.org/iab/iab.csv>
 * <http://standards-oui.ieee.org/oui28/mam.csv>
 * <http://standards-oui.ieee.org/oui36/oui36.csv>
 *
 */

''')

        # Write the prefix map
        manuf_fd.write("static const manuf_registry_t ieee_registry_table[] = {\n")
        for oui in sorted(prefix_map):
            manuf_fd.write(" {{ {{ 0x{}, 0x{}, 0x{} }}, {} }},\n".format(oui[0:2], oui[3:5], oui[6:8], prefix_map[oui]))
        manuf_fd.write("};\n\n")

        # Write the MA-L, MA-M and MA-S tables. They differ only in the
        # number of octets per entry and the column used for alignment.
        _write_manuf_table(manuf_fd, 'manuf_oui24_t', 'global_manuf_oui24_table',
                           oui_d[MA_L], 3, 44, "};\n\n")
        _write_manuf_table(manuf_fd, 'manuf_oui28_t', 'global_manuf_oui28_table',
                           oui_d[MA_M], 4, 50, "};\n\n")
        _write_manuf_table(manuf_fd, 'manuf_oui36_t', 'global_manuf_oui36_table',
                           oui_d[MA_S], 5, 56, "};\n")

    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' added', ieee_d[db]['added']))
    print('{:<20}: {}'.format('Total added', total_added))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' total', ieee_d[db]['total']))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' skipped', ieee_d[db]['skipped']))
# Regenerate the manuf data only when this file is run as a script, so
# importing it has no side effects.
if __name__ == '__main__':
    main()