2 # ===- cppreference_parser.py - ------------------------------*- python -*--===#
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===------------------------------------------------------------------------===#
10 from bs4
import BeautifulSoup
, NavigableString
13 import multiprocessing
21 def __init__(self
, name
, namespace
, headers
):
22 # unqualified symbol name, e.g. "move"
24 # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
26 self
.namespace
= namespace
27 # a list of corresponding headers
28 self
.headers
= headers
def __lt__(self, other):
    """Order symbols by namespace first, then by unqualified name.

    Namespaces are compared through str(), so a non-string namespace value
    (e.g. None) can still be ordered against a string one without raising.
    The name is consulted only when both namespaces compare equal.
    """
    own_ns = self.namespace
    their_ns = other.namespace
    if own_ns != their_ns:
        # Differing namespaces decide the order on their own; names are
        # deliberately not used as a tie-breaker in this branch.
        own_key, their_key = str(own_ns), str(their_ns)
        return own_key < their_key
    return self.name < other.name
36 def _HasClass(tag
, *classes
):
37 for c
in tag
.get("class", []):
43 def _ParseSymbolPage(symbol_page_html
, symbol_name
):
44 """Parse symbol page and retrieve the include header defined in this page.
45 The symbol page provides header for the symbol, specifically in
46 "Defined in header <header>" section. An example:
48 <tr class="t-dsc-header">
49 <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
52 Returns a list of headers.
57 soup
= BeautifulSoup(symbol_page_html
, "html.parser")
58 # Rows in table are like:
59 # Defined in header <foo> .t-dsc-header
60 # Defined in header <bar> .t-dsc-header
62 # Defined in header <baz> .t-dsc-header
64 for table
in soup
.select("table.t-dcl-begin, table.t-dsc-begin"):
67 for row
in table
.select("tr"):
68 if _HasClass(row
, "t-dcl", "t-dsc"):
70 # Symbols are in the first cell.
71 found_symbols
= row
.find("td").stripped_strings
72 if not symbol_name
in found_symbols
:
74 headers
.update(current_headers
)
75 elif _HasClass(row
, "t-dsc-header"):
76 # If we saw a decl since the last header, this is a new block of headers
77 # for a new block of decls.
81 # There are also .t-dsc-header for "defined in namespace".
82 if not "Defined in header " in row
.text
:
84 # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
85 for header_code
in row
.find_all("code"):
86 current_headers
.append(header_code
.text
)
87 all_headers
.add(header_code
.text
)
88 # If the symbol was never named, consider all named headers.
89 return headers
or all_headers
92 def _ParseIndexPage(index_page_html
):
94 The index page lists all std symbols and hrefs to their detailed pages
95 (which contain the defined header). An example:
97 <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
98 <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
100 Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
103 soup
= BeautifulSoup(index_page_html
, "html.parser")
104 for symbol_href
in soup
.select("a[title]"):
105 # Ignore annotated symbols like "acos<>() (std::complex)".
106 # These tend to be overloads, and the primary is more useful.
107 # This accidentally accepts begin/end despite the (iterator) caption: the
108 # (since C++11) note is first. They are good symbols, so the bug is unfixed.
109 caption
= symbol_href
.next_sibling
111 if isinstance(caption
, NavigableString
) and "(" in caption
:
112 variant
= caption
.text
.strip(" ()")
113 symbol_tt
= symbol_href
.find("tt")
117 symbol_tt
.text
.rstrip("<>()"), # strip any trailing <>()
def _ReadSymbolPage(path, name):
    """Load the symbol page stored at `path` and parse it for `name`.

    The file is opened in text mode with the platform default encoding, read
    in full, and its contents are passed to _ParseSymbolPage together with
    `name`. Returns whatever _ParseSymbolPage returns.
    """
    with open(path) as page_file:
        page_html = page_file.read()
        return _ParseSymbolPage(page_html, name)
130 def _GetSymbols(pool
, root_dir
, index_page_name
, namespace
, variants_to_accept
):
131 """Get all symbols listed in the index page. All symbols should be in the
134 Returns a list of Symbols.
138 # 1. Parse index page which lists all symbols to get symbol
139 # name (unqualified name) and its href link to the symbol page which
140 # contains the defined header.
141 # 2. Parse the symbol page to get the defined header.
142 index_page_path
= os
.path
.join(root_dir
, index_page_name
)
143 with
open(index_page_path
, "r") as f
:
144 # Read each symbol page in parallel.
145 results
= [] # (symbol_name, promise of [header...])
146 for symbol_name
, symbol_page_path
, variant
in _ParseIndexPage(f
.read()):
147 # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
148 # FIXME: use these as a fallback rather than ignoring entirely.
149 variants_for_symbol
= variants_to_accept
.get(
150 (namespace
or "") + symbol_name
, ()
152 if variant
and variant
not in variants_for_symbol
:
154 path
= os
.path
.join(root_dir
, symbol_page_path
)
155 if os
.path
.isfile(path
):
159 pool
.apply_async(_ReadSymbolPage
, (path
, symbol_name
)),
164 "Discarding information for symbol: %s. Page %s does not exist.\n"
165 % (symbol_name
, path
)
168 # Build map from symbol name to a set of headers.
169 symbol_headers
= collections
.defaultdict(set)
170 for symbol_name
, lazy_headers
in results
:
171 symbol_headers
[symbol_name
].update(lazy_headers
.get())
174 for name
, headers
in sorted(symbol_headers
.items(), key
=lambda t
: t
[0]):
175 symbols
.append(Symbol(name
, namespace
, list(headers
)))
def signal_ignore_initializer():
    """Make the calling process ignore SIGINT (Ctrl-C).

    Passed as the `initializer` of the multiprocessing pool so that worker
    processes do not react to Ctrl-C themselves.

    Returns the value produced by signal.signal(), i.e. the handler that was
    installed for SIGINT before this call.
    """
    previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    return previous_handler
183 def GetSymbols(parse_pages
):
184 """Get all symbols by parsing the given pages.
187 parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)
189 # By default we prefer the non-variant versions, as they're more common. But
190 # there are some symbols, whose variant is more common. This list describes
192 variants_to_accept
= {
193 # std::remove<> has variant algorithm.
194 "std::remove": ("algorithm"),
197 # Run many workers to process individual symbol pages under the symbol index.
198 # Don't allow workers to capture Ctrl-C.
199 pool
= multiprocessing
.Pool(initializer
=signal_ignore_initializer
)
201 for root_dir
, page_name
, namespace
in parse_pages
:
203 _GetSymbols(pool
, root_dir
, page_name
, namespace
, variants_to_accept
)
208 return sorted(symbols
)