clang/tools/include-mapping/cppreference_parser.py

   1 #!/usr/bin/env python3
   2 # ===- cppreference_parser.py -  ------------------------------*- python -*--===#
   3 #
   4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 # See https://llvm.org/LICENSE.txt for license information.
   6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 #
   8 # ===------------------------------------------------------------------------===#
   9
  10 from bs4 import BeautifulSoup, NavigableString
  11
  12 import collections
  13 import multiprocessing
  14 import os
  15 import re
  16 import signal
  17 import sys
  18
  19
  20 class Symbol:
  21     def __init__(self, name, namespace, headers):
  22         # unqualifed symbol name, e.g. "move"
  23         self.name = name
  24         # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
  25         # None for C symbols.
  26         self.namespace = namespace
  27         # a list of corresponding headers
  28         self.headers = headers
  29
  30     def __lt__(self, other):
  31         if self.namespace != other.namespace:
  32             return str(self.namespace) < str(other.namespace)
  33         return self.name < other.name
  34
  35
  36 def _HasClass(tag, *classes):
  37     for c in tag.get("class", []):
  38         if c in classes:
  39             return True
  40     return False
  41
  42
  43 def _ParseSymbolPage(symbol_page_html, symbol_name):
  44     """Parse symbol page and retrieve the include header defined in this page.
  45     The symbol page provides header for the symbol, specifically in
  46     "Defined in header <header>" section. An example:
  47
  48     <tr class="t-dsc-header">
  49       <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
  50     </td></tr>
  51
  52     Returns a list of headers.
  53     """
  54     headers = set()
  55     all_headers = set()
  56
  57     soup = BeautifulSoup(symbol_page_html, "html.parser")
  58     # Rows in table are like:
  59     #   Defined in header <foo>      .t-dsc-header
  60     #   Defined in header <bar>      .t-dsc-header
  61     #   decl1                        .t-dcl
  62     #   Defined in header <baz>      .t-dsc-header
  63     #   decl2                        .t-dcl
  64     for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
  65         current_headers = []
  66         was_decl = False
  67         for row in table.select("tr"):
  68             if _HasClass(row, "t-dcl", "t-dsc"):
  69                 was_decl = True
  70                 # Symbols are in the first cell.
  71                 found_symbols = row.find("td").stripped_strings
  72                 if not symbol_name in found_symbols:
  73                     continue
  74                 headers.update(current_headers)
  75             elif _HasClass(row, "t-dsc-header"):
  76                 # If we saw a decl since the last header, this is a new block of headers
  77                 # for a new block of decls.
  78                 if was_decl:
  79                     current_headers = []
  80                 was_decl = False
  81                 # There are also .t-dsc-header for "defined in namespace".
  82                 if not "Defined in header " in row.text:
  83                     continue
  84                 # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
  85                 for header_code in row.find_all("code"):
  86                     current_headers.append(header_code.text)
  87                     all_headers.add(header_code.text)
  88     # If the symbol was never named, consider all named headers.
  89     return headers or all_headers
  90
  91
  92 def _ParseIndexPage(index_page_html):
  93     """Parse index page.
  94     The index page lists all std symbols and hrefs to their detailed pages
  95     (which contain the defined header). An example:
  96
  97     <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
  98     <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
  99
 100     Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
 101     """
 102     symbols = []
 103     soup = BeautifulSoup(index_page_html, "html.parser")
 104     for symbol_href in soup.select("a[title]"):
 105         # Ignore annotated symbols like "acos<>() (std::complex)".
 106         # These tend to be overloads, and we the primary is more useful.
 107         # This accidentally accepts begin/end despite the (iterator) caption: the
 108         # (since C++11) note is first. They are good symbols, so the bug is unfixed.
 109         caption = symbol_href.next_sibling
 110         variant = None
 111         if isinstance(caption, NavigableString) and "(" in caption:
 112             variant = caption.text.strip(" ()")
 113         symbol_tt = symbol_href.find("tt")
 114         if symbol_tt:
 115             symbols.append(
 116                 (
 117                     symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
 118                     symbol_href["href"],
 119                     variant,
 120                 )
 121             )
 122     return symbols
 123
 124
 125 def _ReadSymbolPage(path, name):
 126     with open(path) as f:
 127         return _ParseSymbolPage(f.read(), name)
 128
 129
 130 def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
 131     """Get all symbols listed in the index page. All symbols should be in the
 132     given namespace.
 133
 134     Returns a list of Symbols.
 135     """
 136
 137     # Workflow steps:
 138     #   1. Parse index page which lists all symbols to get symbol
 139     #      name (unqualified name) and its href link to the symbol page which
 140     #      contains the defined header.
 141     #   2. Parse the symbol page to get the defined header.
 142     index_page_path = os.path.join(root_dir, index_page_name)
 143     with open(index_page_path, "r") as f:
 144         # Read each symbol page in parallel.
 145         results = []  # (symbol_name, promise of [header...])
 146         for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
 147             # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
 148             # FIXME: use these as a fallback rather than ignoring entirely.
 149             variants_for_symbol = variants_to_accept.get(
 150                 (namespace or "") + symbol_name, ()
 151             )
 152             if variant and variant not in variants_for_symbol:
 153                 continue
 154             path = os.path.join(root_dir, symbol_page_path)
 155             if os.path.isfile(path):
 156                 results.append(
 157                     (
 158                         symbol_name,
 159                         pool.apply_async(_ReadSymbolPage, (path, symbol_name)),
 160                     )
 161                 )
 162             else:
 163                 sys.stderr.write(
 164                     "Discarding information for symbol: %s. Page %s does not exist.\n"
 165                     % (symbol_name, path)
 166                 )
 167
 168         # Build map from symbol name to a set of headers.
 169         symbol_headers = collections.defaultdict(set)
 170         for symbol_name, lazy_headers in results:
 171             symbol_headers[symbol_name].update(lazy_headers.get())
 172
 173     symbols = []
 174     for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]):
 175         symbols.append(Symbol(name, namespace, list(headers)))
 176     return symbols
 177
 178
 179 def signal_ignore_initializer():
 180     return signal.signal(signal.SIGINT, signal.SIG_IGN)
 181
 182
 183 def GetSymbols(parse_pages):
 184     """Get all symbols by parsing the given pages.
 185
 186     Args:
 187       parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)
 188     """
 189     # By default we prefer the non-variant versions, as they're more common. But
 190     # there are some symbols, whose variant is more common. This list describes
 191     # those symbols.
 192     variants_to_accept = {
 193         # std::remove<> has variant algorithm.
 194         "std::remove": ("algorithm"),
 195     }
 196     symbols = []
 197     # Run many workers to process individual symbol pages under the symbol index.
 198     # Don't allow workers to capture Ctrl-C.
 199     pool = multiprocessing.Pool(initializer=signal_ignore_initializer)
 200     try:
 201         for root_dir, page_name, namespace in parse_pages:
 202             symbols.extend(
 203                 _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept)
 204             )
 205     finally:
 206         pool.terminate()
 207         pool.join()
 208     return sorted(symbols)