tools/make-manuf.py

   1 #!/usr/bin/env python3
   2 #
   3 # Wireshark - Network traffic analyzer
   4 # By Gerald Combs <gerald@wireshark.org>
   5 # Copyright 1998 Gerald Combs
   6 #
   7 # SPDX-License-Identifier: GPL-2.0-or-later
   8 '''Update the "manuf" file.
   9
  10 Make-manuf creates a file containing ethernet OUIs and their company
  11 IDs from the databases at IEEE.
  12 '''
  13
  14 import csv
  15 import html
  16 import io
  17 import os
  18 import re
  19 import sys
  20 import urllib.request, urllib.error, urllib.parse
  21
  22 have_icu = False
  23 try:
  24     # Use the grapheme or segments module instead?
  25     import icu
  26     have_icu = True
  27 except ImportError:
  28     pass
  29
  30 def exit_msg(msg=None, status=1):
  31     if msg is not None:
  32         sys.stderr.write(msg + '\n\n')
  33     sys.stderr.write(__doc__ + '\n')
  34     sys.exit(status)
  35
  36 def open_url(url):
  37     '''Open a URL.
  38     Returns a tuple containing the body and response dict. The body is a
  39     str in Python 3 and bytes in Python 2 in order to be compatible with
  40     csv.reader.
  41     '''
  42
  43     if len(sys.argv) > 1:
  44         url_path = os.path.join(sys.argv[1], url[1])
  45         url_fd = open(url_path)
  46         body = url_fd.read()
  47         url_fd.close()
  48     else:
  49         url_path = '/'.join(url)
  50
  51         req_headers = { 'User-Agent': 'Wireshark make-manuf' }
  52         try:
  53             req = urllib.request.Request(url_path, headers=req_headers)
  54             response = urllib.request.urlopen(req)
  55             body = response.read().decode('UTF-8', 'replace').replace(u'\u200e', '')
  56         except Exception:
  57             exit_msg('Error opening ' + url_path)
  58
  59     return body
  60
  61 # These are applied after punctuation has been removed.
  62 # More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
  63 general_terms = '|'.join([
  64     ' a +s\\b', # A/S and A.S. but not "As" as in "Connect As".
  65     ' ab\\b', # Also follows "Oy", which is covered below.
  66     ' ag\\b',
  67     ' b ?v\\b',
  68     ' closed joint stock company\\b',
  69     ' co\\b',
  70     ' company\\b',
  71     ' corp\\b',
  72     ' corporation\\b',
  73     ' corporate\\b',
  74     ' de c ?v\\b', # Follows "S.A.", which is covered separately below.
  75     ' gmbh\\b',
  76     ' holding\\b',
  77     ' inc\\b',
  78     ' incorporated\\b',
  79     ' jsc\\b',
  80     ' kg\\b',
  81     ' k k\\b', # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
  82     ' limited\\b',
  83     ' llc\\b',
  84     ' ltd\\b',
  85     ' n ?v\\b',
  86     ' oao\\b',
  87     ' of\\b',
  88     ' open joint stock company\\b',
  89     ' ooo\\b',
  90     ' oü\\b',
  91     ' oy\\b',
  92     ' oyj\\b',
  93     ' plc\\b',
  94     ' pty\\b',
  95     ' pvt\\b',
  96     ' s ?a ?r ?l\\b',
  97     ' s ?a\\b',
  98     ' s ?p ?a\\b',
  99     ' sp ?k\\b',
 100     ' s ?r ?l\\b',
 101     ' systems\\b',
 102     '\\bthe\\b',
 103     ' zao\\b',
 104     ' z ?o ?o\\b'
 105     ])
 106
 107 # Chinese company names tend to start with the location, skip it (non-exhaustive list).
 108 skip_start = [
 109     'shengzen',
 110     'shenzhen',
 111     'beijing',
 112     'shanghai',
 113     'wuhan',
 114     'hangzhou',
 115     'guangxi',
 116     'guangdong',
 117     'chengdu',
 118 ]
 119
 120 # Special cases handled directly
 121 special_case = {
 122     "Advanced Micro Devices": "AMD",
 123     "杭州德澜科技有限公司": "DelanTech" # 杭州德澜科技有限公司（HangZhou Delan Technology Co.,Ltd）
 124 }
 125
 126 def shorten(manuf):
 127     '''Convert a long manufacturer name to abbreviated and short names'''
 128     # Normalize whitespace.
 129     manuf = ' '.join(manuf.split())
 130     orig_manuf = manuf
 131     # Convert all caps to title case
 132     if manuf.isupper():
 133         manuf = manuf.title()
 134     # Remove the contents of parenthesis as ancillary data
 135     manuf = re.sub(r"\(.*\)", '', manuf)
 136     # Remove the contents of fullwidth parenthesis (mostly in Asian names)
 137     manuf = re.sub(r"（.*）", '', manuf)
 138     # Remove "a" before removing punctuation ("Aruba, a Hewlett [...]" etc.)
 139     manuf = manuf.replace(" a ", " ")
 140     # Remove any punctuation
 141     # XXX Use string.punctuation? Note that it includes '-' and '*'.
 142     manuf = re.sub(r"[\"',./:()+-]", ' ', manuf)
 143     # XXX For some reason including the double angle brackets in the above
 144     # regex makes it bomb
 145     manuf = re.sub(r"[«»“”]", ' ', manuf)
 146     # & isn't needed when Standalone
 147     manuf = manuf.replace(" & ", " ")
 148     # Remove business types and other general terms ("the", "inc", "plc", etc.)
 149     plain_manuf = re.sub(general_terms, '', manuf, flags=re.IGNORECASE)
 150     # ...but make sure we don't remove everything.
 151     if not all(s == ' ' for s in plain_manuf):
 152         manuf = plain_manuf
 153
 154     manuf = manuf.strip()
 155
 156     # Check for special case
 157     if manuf in special_case.keys():
 158         manuf = special_case[manuf]
 159
 160     # XXX: Some of the entries have Chinese city or other location
 161     # names written with spaces between each character, like
 162     # Bei jing, Wu Han, Shen Zhen, etc. We should remove that too.
 163     split = manuf.split()
 164     if len(split) > 1 and split[0].lower() in skip_start:
 165         manuf = ' '.join(split[1:])
 166
 167     # Remove all spaces
 168     manuf = re.sub(r'\s+', '', manuf)
 169
 170     if len(manuf) < 1:
 171         sys.stderr.write('Manufacturer "{}" shortened to nothing.\n'.format(orig_manuf))
 172         sys.exit(1)
 173
 174     # Truncate names to a reasonable length, say, 12 characters. If
 175     # the string contains UTF-8, this may be substantially more than
 176     # 12 bytes. It might also be less than 12 visible characters. Plain
 177     # Python slices Unicode strings by code point, which is better
 178     # than raw bytes but not as good as grapheme clusters. PyICU
 179     # supports grapheme clusters. https://bugs.python.org/issue30717
 180     #
 181
 182     # Truncate by code points
 183     trunc_len = 12
 184
 185     if have_icu:
 186         # Truncate by grapheme clusters
 187         bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
 188         bi_ci.setText(manuf)
 189         bounds = list(bi_ci)
 190         bounds = bounds[0:trunc_len]
 191         trunc_len = bounds[-1]
 192
 193     manuf = manuf[:trunc_len]
 194
 195     if manuf.lower() == orig_manuf.lower():
 196         # Original manufacturer name was short and simple.
 197         return [manuf, None]
 198
 199     mixed_manuf = orig_manuf
 200     # At least one entry has whitespace in front of a period.
 201     mixed_manuf = re.sub(r'\s+\.', '.', mixed_manuf)
 202     #If company is all caps, convert to mixed case (so it doesn't look like we're screaming the company name)
 203     if mixed_manuf.upper() == mixed_manuf:
 204         mixed_manuf = mixed_manuf.title()
 205
 206     return [manuf, mixed_manuf]
 207
 208 MA_L = 'MA_L'
 209 MA_M = 'MA_M'
 210 MA_S = 'MA_S'
 211
 212 def prefix_to_oui(prefix, prefix_map):
 213     pfx_len = int(len(prefix) * 8 / 2)
 214     prefix24 = prefix[:6]
 215     oui24 = ':'.join(hi + lo for hi, lo in zip(prefix24[0::2], prefix24[1::2]))
 216
 217     if pfx_len == 24:
 218         # 24-bit OUI assignment, no mask
 219         return oui24, MA_L
 220
 221     # Other lengths which require a mask.
 222     oui = prefix.ljust(12, '0')
 223     oui = ':'.join(hi + lo for hi, lo in zip(oui[0::2], oui[1::2]))
 224     if pfx_len == 28:
 225         kind = MA_M
 226     elif pfx_len == 36:
 227         kind = MA_S
 228     prefix_map[oui24] = kind
 229
 230     return '{}/{:d}'.format(oui, int(pfx_len)), kind
 231
 232 def main():
 233     manuf_path = os.path.join(os.path.dirname(__file__), '..', 'epan', 'manuf-data.c')
 234
 235     ieee_d = {
 236         'OUI':   { 'url': ["https://standards-oui.ieee.org/oui/", "oui.csv"], 'min_entries': 1000 },
 237         'CID':   { 'url': ["https://standards-oui.ieee.org/cid/", "cid.csv"], 'min_entries': 75 },
 238         'IAB':   { 'url': ["https://standards-oui.ieee.org/iab/", "iab.csv"], 'min_entries': 1000 },
 239         'OUI28': { 'url': ["https://standards-oui.ieee.org/oui28/", "mam.csv"], 'min_entries': 1000 },
 240         'OUI36': { 'url': ["https://standards-oui.ieee.org/oui36/", "oui36.csv"], 'min_entries': 1000 },
 241     }
 242     oui_d = {
 243         MA_L: { '00:00:00' : ['00:00:00', 'Officially Xerox, but 0:0:0:0:0:0 is more common'] },
 244         MA_M: {},
 245         MA_S: {},
 246     }
 247
 248     min_total = 35000 # 35830 as of 2018-09-05
 249     total_added = 0
 250
 251     # Add IEEE entries from each of their databases
 252     ieee_db_l = ['OUI', 'OUI28', 'OUI36', 'CID', 'IAB']
 253
 254     # map a 24-bit prefix to MA-M/MA-S or none (MA-L by default)
 255     prefix_map = {}
 256
 257     for db in ieee_db_l:
 258         db_url = ieee_d[db]['url']
 259         ieee_d[db]['skipped'] = 0
 260         ieee_d[db]['added'] = 0
 261         ieee_d[db]['total'] = 0
 262         print('Merging {} data from {}'.format(db, db_url))
 263         body = open_url(db_url)
 264         ieee_csv = csv.reader(body.splitlines())
 265
 266         # Pop the title row.
 267         next(ieee_csv)
 268         for ieee_row in ieee_csv:
 269             #Registry,Assignment,Organization Name,Organization Address
 270             #IAB,0050C2DD6,Transas Marine Limited,Datavagen 37 Askim Vastra Gotaland SE 436 32
 271             oui, kind = prefix_to_oui(ieee_row[1].upper(), prefix_map)
 272             manuf = ieee_row[2].strip()
 273             # The Organization Name field occasionally contains HTML entities. Undo them.
 274             manuf = html.unescape(manuf)
 275             # "Watts A\S"
 276             manuf = manuf.replace('\\', '/')
 277             if manuf == 'IEEE Registration Authority':
 278                 # These are held for subdivision into MA-M/MA-S
 279                 continue
 280             #if manuf == 'Private':
 281             #    continue
 282             if oui in oui_d[kind]:
 283                 action = 'Skipping'
 284                 print('{} - {} IEEE "{}" in favor of "{}"'.format(oui, action, manuf, oui_d[kind][oui]))
 285                 ieee_d[db]['skipped'] += 1
 286             else:
 287                 oui_d[kind][oui] = shorten(manuf)
 288                 ieee_d[db]['added'] += 1
 289             ieee_d[db]['total'] += 1
 290
 291         if ieee_d[db]['total'] < ieee_d[db]['min_entries']:
 292             exit_msg("Too few {} entries. Got {}, wanted {}".format(db, ieee_d[db]['total'], ieee_d[db]['min_entries']))
 293         total_added += ieee_d[db]['total']
 294
 295     if total_added < min_total:
 296         exit_msg("Too few total entries ({})".format(total_added))
 297
 298     try:
 299         manuf_fd = io.open(manuf_path, 'w', encoding='UTF-8')
 300     except Exception:
 301         exit_msg("Couldn't open manuf file for reading ({}) ".format(manuf_path))
 302
 303     manuf_fd.write('''/*
 304  * This file was generated by running ./tools/make-manuf.py.
 305  *
 306  * SPDX-License-Identifier: GPL-2.0-or-later
 307  *
 308  * The data below has been assembled from the following sources:
 309  *
 310  * The IEEE public OUI listings available from:
 311  * <http://standards-oui.ieee.org/oui/oui.csv>
 312  * <http://standards-oui.ieee.org/cid/cid.csv>
 313  * <http://standards-oui.ieee.org/iab/iab.csv>
 314  * <http://standards-oui.ieee.org/oui28/mam.csv>
 315  * <http://standards-oui.ieee.org/oui36/oui36.csv>
 316  *
 317  */
 318
 319 ''')
 320
 321     # Write the prefix map
 322     manuf_fd.write("static const manuf_registry_t ieee_registry_table[] = {\n")
 323     keys = list(prefix_map.keys())
 324     keys.sort()
 325     for oui in keys:
 326         manuf_fd.write("    {{ {{ 0x{}, 0x{}, 0x{} }}, {} }},\n".format(oui[0:2], oui[3:5], oui[6:8], prefix_map[oui]))
 327     manuf_fd.write("};\n\n")
 328
 329     # write the MA-L table
 330     manuf_fd.write("static const manuf_oui24_t global_manuf_oui24_table[] = {\n")
 331     keys = list(oui_d[MA_L].keys())
 332     keys.sort()
 333     for oui in keys:
 334         short = oui_d[MA_L][oui][0]
 335         if oui_d[MA_L][oui][1]:
 336             long = oui_d[MA_L][oui][1]
 337         else:
 338             long = short
 339         line = "    {{ {{ 0x{}, 0x{}, 0x{} }}, \"{}\", ".format(oui[0:2], oui[3:5], oui[6:8], short)
 340         sep = 44 - len(line)
 341         if sep <= 0:
 342             sep = 0
 343         line += sep * ' '
 344         line += "\"{}\" }},\n".format(long.replace('"', '\\"'))
 345         manuf_fd.write(line)
 346     manuf_fd.write("};\n\n")
 347
 348     # write the MA-M table
 349     manuf_fd.write("static const manuf_oui28_t global_manuf_oui28_table[] = {\n")
 350     keys = list(oui_d[MA_M].keys())
 351     keys.sort()
 352     for oui in keys:
 353         short = oui_d[MA_M][oui][0]
 354         if oui_d[MA_M][oui][1]:
 355             long = oui_d[MA_M][oui][1]
 356         else:
 357             long = short
 358         line = "    {{ {{ 0x{}, 0x{}, 0x{}, 0x{} }}, \"{}\", ".format(oui[0:2], oui[3:5], oui[6:8], oui[9:11], short)
 359         sep = 50 - len(line)
 360         if sep <= 0:
 361             sep = 0
 362         line += sep * ' '
 363         line += "\"{}\" }},\n".format(long.replace('"', '\\"'))
 364         manuf_fd.write(line)
 365     manuf_fd.write("};\n\n")
 366
 367     #write the MA-S table
 368     manuf_fd.write("static const manuf_oui36_t global_manuf_oui36_table[] = {\n")
 369     keys = list(oui_d[MA_S].keys())
 370     keys.sort()
 371     for oui in keys:
 372         short = oui_d[MA_S][oui][0]
 373         if oui_d[MA_S][oui][1]:
 374             long = oui_d[MA_S][oui][1]
 375         else:
 376             long = short
 377         line = "    {{ {{ 0x{}, 0x{}, 0x{}, 0x{}, 0x{} }}, \"{}\", ".format(oui[0:2], oui[3:5], oui[6:8], oui[9:11], oui[12:14], short)
 378         sep = 56 - len(line)
 379         if sep <= 0:
 380             sep = 0
 381         line += sep * ' '
 382         line += "\"{}\" }},\n".format(long.replace('"', '\\"'))
 383         manuf_fd.write(line)
 384     manuf_fd.write("};\n")
 385
 386     manuf_fd.close()
 387
 388     for db in ieee_d:
 389         print('{:<20}: {}'.format('IEEE ' + db + ' added', ieee_d[db]['added']))
 390     print('{:<20}: {}'.format('Total added', total_added))
 391
 392     print()
 393     for db in ieee_d:
 394         print('{:<20}: {}'.format('IEEE ' + db + ' total', ieee_d[db]['total']))
 395
 396     print()
 397     for db in ieee_d:
 398         print('{:<20}: {}'.format('IEEE ' + db + ' skipped', ieee_d[db]['skipped']))
 399
 400 if __name__ == '__main__':
 401     main()