[RISCV] Add Qualcomm uC Xqcics(Conditional Select) extension (#119504)
[llvm-project.git] / clang / tools / include-mapping / cppreference_parser.py
blob9101f3dbff0f944765f1ad26de0fe8c55aa2f329
1 #!/usr/bin/env python3
2 # ===- cppreference_parser.py - ------------------------------*- python -*--===#
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===------------------------------------------------------------------------===#
10 from bs4 import BeautifulSoup, NavigableString, Tag
12 import collections
13 import multiprocessing
14 import os
15 import re
16 import signal
17 import sys
class Symbol:
    """A standard-library symbol together with the headers that provide it."""

    def __init__(self, name, namespace, headers):
        # Unqualified symbol name, e.g. "move".
        self.name = name
        # Namespace of the symbol (with trailing "::"), e.g. "std::", or ""
        # for the global scope. None for C symbols.
        self.namespace = namespace
        # A list of corresponding headers.
        self.headers = headers

    def __lt__(self, other):
        """Order symbols by namespace first, then by unqualified name."""
        same_namespace = self.namespace == other.namespace
        if same_namespace:
            return self.name < other.name
        # Namespaces are compared as strings so that None (C symbols) can be
        # ordered against real namespace strings.
        return str(self.namespace) < str(other.namespace)
36 def _HasClass(tag, *classes):
37 for c in tag.get("class", []):
38 if c in classes:
39 return True
40 return False
def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
    """Parse symbol page and retrieve the include header defined in this page.

    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified symbol name to look for in declaration rows.
      qual_name: qualified symbol name, accepted as an alternative spelling.

    Returns a set of header names. If no declaration row names the symbol,
    every header mentioned on the page is returned instead.
    """
    headers = set()
    all_headers = set()
    wanted_names = (symbol_name, qual_name)

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
        current_headers = []
        was_decl = False
        for row in table.select("tr"):
            if _HasClass(row, "t-dcl", "t-dsc"):
                was_decl = True
                # Symbols are in the first cell. find() returns None when the
                # row has no <td>; such a row cannot name the symbol, so skip
                # it instead of failing on a None attribute access.
                first_cell = row.find("td")
                if first_cell is None:
                    continue
                if any(sym in wanted_names for sym in first_cell.stripped_strings):
                    headers.update(current_headers)
            elif _HasClass(row, "t-dsc-header"):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped
                # in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers
def _ParseSymbolVariant(caption):
    """Extract the variant annotation that follows a symbol link, if any.

    `caption` is the node right after the link in the index page. Returns the
    variant text (e.g. "locale" or "std::complex"), or None when the caption
    carries no parenthesised annotation.
    """
    has_annotation = isinstance(caption, NavigableString) and "(" in caption
    if not has_annotation:
        return None

    # The whole annotation lives in this text node: (locale), (algorithm), etc.
    if ")" in caption.text:
        return caption.text.strip(" ()")

    # Otherwise the annotation is split over three nodes:
    # "(" + <code>std::complex</code> + ")".
    code_tag = caption.next_sibling
    if not (isinstance(code_tag, Tag) and code_tag.name == "code"):
        return None
    closing = code_tag.next_sibling
    if isinstance(closing, NavigableString) and closing.text.startswith(")"):
        return code_tag.text
    return None
def _ParseIndexPage(index_page_html):
    """Parse index page.

    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    """
    entries = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for link in soup.select("a[title]"):
        # Annotated symbols like "acos<>() (std::complex)" tend to be
        # overloads, and the primary is usually more useful; the annotation is
        # reported as the variant so that callers can filter on it.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug
        # is unfixed.
        variant = _ParseSymbolVariant(link.next_sibling)
        name_tag = link.find("tt")
        if not name_tag:
            continue
        # Strip any trailing <>() from the displayed name.
        bare_name = name_tag.text.rstrip("<>()")
        entries.append((bare_name, link["href"], variant))
    return entries
def _ReadSymbolPage(path, name, qual_name):
    """Read the symbol page at `path` and return the headers found for it.

    Args:
      path: filesystem path of the symbol's HTML page.
      name: unqualified symbol name.
      qual_name: qualified symbol name.

    Returns whatever _ParseSymbolPage returns for the page contents.
    """
    # Decode explicitly as UTF-8 rather than relying on the locale's default
    # encoding, which differs between platforms.
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name, qual_name)
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: worker pool; symbol pages are parsed through pool.apply_async.
      root_dir: directory that the index page and symbol hrefs are relative to.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace prefix of the listed symbols (None for C symbols).
      variants_to_accept: map from qualified symbol name to the variants that
        should be kept for that symbol.

    Returns a list of Symbols, ordered by name, each with a sorted header list.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    # Decode explicitly as UTF-8 rather than relying on the locale default,
    # and release the file handle before waiting on the workers.
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_page_html = f.read()

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in _ParseIndexPage(index_page_html):
        # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        qualified_symbol_name = (namespace or "") + symbol_name
        variants_for_symbol = variants_to_accept.get(qualified_symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        if os.path.isfile(path):
            results.append(
                (
                    symbol_name,
                    pool.apply_async(
                        _ReadSymbolPage, (path, symbol_name, qualified_symbol_name)
                    ),
                )
            )
        else:
            sys.stderr.write(
                "Discarding information for symbol: %s. Page %s does not exist.\n"
                % (symbol_name, path)
            )

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers too: iterating a set of strings has no stable order
    # across interpreter runs, which would make the result non-reproducible.
    return [
        Symbol(name, namespace, sorted(headers))
        for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0])
    ]
def signal_ignore_initializer():
    """Make the calling process ignore SIGINT; return the previous handler."""
    previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    return previous_handler
def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a sorted list of Symbols collected from all the given pages.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    #
    # Each value is a one-element tuple (note the trailing comma). Without the
    # comma the value is a plain string, and the `in` test applied to it would
    # match substrings of the variant name instead of whole names.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        "std::remove": ("algorithm",),
        # These functions don't have a generic version, and all variants are
        # defined in <chrono>
        "std::chrono::abs": ("std::chrono::duration",),
        "std::chrono::ceil": ("std::chrono::duration",),
        "std::chrono::floor": ("std::chrono::duration",),
        "std::chrono::from_stream": ("std::chrono::day",),
        "std::chrono::round": ("std::chrono::duration",),
        # Same, but in <filesystem>
        "std::filesystem::begin": ("std::filesystem::directory_iterator",),
        "std::filesystem::end": ("std::filesystem::directory_iterator",),
        "std::ranges::get": ("std::ranges::subrange",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=signal_ignore_initializer)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(
                _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept)
            )
    finally:
        # Always tear the workers down, even if parsing a page raised.
        pool.terminate()
        pool.join()
    return sorted(symbols)