[NFC][Coroutines] Use structured binding with llvm::enumerate in CoroSplit (#116879)
[llvm-project.git] / libcxx / utils / generate_width_estimation_table.py
blobf81f0ba77489eb79b79ff095c1978cafcdad60b7
1 #!/usr/bin/env python
2 # ===----------------------------------------------------------------------===##
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===----------------------------------------------------------------------===##
10 # The code is based on
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
13 # Copyright (c) Microsoft Corporation.
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16 from io import StringIO
17 from pathlib import Path
18 from dataclasses import dataclass
19 from typing import Optional
20 import re
21 import sys
24 @dataclass
25 class PropertyRange:
26 lower: int = -1
27 upper: int = -1
28 prop: str = None
31 @dataclass
32 class Entry:
33 lower: int = -1
34 offset: int = -1
37 LINE_REGEX = re.compile(
38 r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
42 def filterProperty(element: PropertyRange) -> Optional[PropertyRange]:
43 ### Matches property predicate?
44 if element.prop in ["W", "F"]:
45 return element
47 ### Matches hardcode ranges predicate?
49 # Yijing Hexagram Symbols
50 if element.lower >= 0x4DC0 and element.upper <= 0x4DFF:
51 return element
53 # Miscellaneous Symbols and Pictographs
54 if element.lower >= 0x1F300 and element.upper <= 0x1F5FF:
55 return element
57 # Supplemental Symbols and Pictographs
58 if element.lower >= 0x1F900 and element.upper <= 0x1F9FF:
59 return element
61 return None
64 def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
65 result = PropertyRange()
66 if m := LINE_REGEX.match(inputLine):
67 lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
68 result.lower = int(lower_str, base=16)
69 result.upper = result.lower
70 if upper_str is not None:
71 result.upper = int(upper_str, base=16)
72 return result
74 else:
75 return None
78 def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
79 """
80 Merges overlapping and consecutive ranges to one range.
82 Since the input properties are filtered the exact property isn't
83 interesting anymore. The properties in the output are merged to aid
84 debugging.
85 Merging the ranges results in fewer ranges in the output table,
86 reducing binary and improving lookup performance.
87 """
88 result = list()
89 for x in input:
90 if (
91 len(result)
92 and x.lower > result[-1].lower
93 and x.lower <= result[-1].upper + 1
95 result[-1].upper = max(result[-1].upper, x.upper)
96 result[-1].prop += f" {x.prop}"
97 continue
98 result.append(x)
99 return result
102 DATA_ARRAY_TEMPLATE = r"""
103 /// The entries of the characters with an estimated width of 2.
105 /// Contains the entries for [format.string.std]/12
106 /// - Any code point with the East_Asian_Width="W" or East_Asian_Width="F"
107 /// Derived Extracted Property as described by UAX #44
108 /// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols)
109 /// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs)
110 /// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs)
112 /// The data is generated from
113 /// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
114 /// - The "overrides" in [format.string.std]/12
116 /// The format of EastAsianWidth.txt is two fields separated by a semicolon.
117 /// Field 0: Unicode code point value or range of code point values
118 /// Field 1: East_Asian_Width property, consisting of one of the following values:
119 /// "A", "F", "H", "N", "Na", "W"
120 /// - All code points, assigned or unassigned, that are not listed
121 /// explicitly are given the value "N".
122 /// - The unassigned code points in the following blocks default to "W":
123 /// CJK Unified Ideographs Extension A: U+3400..U+4DBF
124 /// CJK Unified Ideographs: U+4E00..U+9FFF
125 /// CJK Compatibility Ideographs: U+F900..U+FAFF
126 /// - All undesignated code points in Planes 2 and 3, whether inside or
127 /// outside of allocated blocks, default to "W":
128 /// Plane 2: U+20000..U+2FFFD
129 /// Plane 3: U+30000..U+3FFFD
131 /// The table is similar to the table
132 /// __extended_grapheme_custer_property_boundary::__entries
133 /// which explains the details of these classes. The only difference is this
134 /// table lacks a property, thus having more bits available for the size.
136 /// The maximum code point that has an estimated width of 2 is U+3FFFD. This
137 /// value can be encoded in 18 bits. Thus the upper 3 bits of the code point
138 /// are always 0. These 3 bits are used to enlarge the offset range. This
139 /// optimization reduces the table in Unicode 15 from 184 to 104 entries,
140 /// saving 320 bytes.
142 /// The data has 2 values:
143 /// - bits [0, 13] The size of the range, allowing 16384 elements.
144 /// - bits [14, 31] The lower bound code point of the range. The upper bound of
145 /// the range is lower bound + size.
146 _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
147 {entries}}};
149 /// The upper bound entry of EastAsianWidth.txt.
151 /// Values greater than this value may have more than 18 significant bits.
152 /// They always have a width of 1. This property makes it possible to store
153 /// the table in its compact form.
154 inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x};
156 /// Returns the estimated width of a Unicode code point.
158 /// \\pre The code point is a valid Unicode code point.
159 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{
160 // Since __table_upper_bound contains the unshifted range do the
161 // comparison without shifting.
162 if (__code_point > __table_upper_bound) [[unlikely]]
163 return 1;
165 // When the code-point is less than the first element in the table
166 // the lookup is quite expensive. Since quite some scripts are in
167 // that range, it makes sense to validate that first.
168 // The std_format_spec_string_unicode benchmark gives a measurable
169 // improvement.
170 if (__code_point < (__entries[0] >> 14))
171 return 1;
173 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
174 if (__i == 0)
175 return 1;
177 --__i;
178 uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
179 return 1 + (__code_point <= __upper_bound);
183 TABLES_HPP_TEMPLATE = """
184 // -*- C++ -*-
185 //===----------------------------------------------------------------------===//
187 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
188 // See https://llvm.org/LICENSE.txt for license information.
189 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
191 //===----------------------------------------------------------------------===//
193 // WARNING, this entire header is generated by
194 // utils/generate_width_estimation_table.py
195 // DO NOT MODIFY!
197 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
199 // See Terms of Use <https://www.unicode.org/copyright.html>
200 // for definitions of Unicode Inc.'s Data Files and Software.
202 // NOTICE TO USER: Carefully read the following legal agreement.
203 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
204 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
205 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
206 // TERMS AND CONDITIONS OF THIS AGREEMENT.
207 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
208 // THE DATA FILES OR SOFTWARE.
210 // COPYRIGHT AND PERMISSION NOTICE
212 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
213 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
215 // Permission is hereby granted, free of charge, to any person obtaining
216 // a copy of the Unicode data files and any associated documentation
217 // (the "Data Files") or Unicode software and any associated documentation
218 // (the "Software") to deal in the Data Files or Software
219 // without restriction, including without limitation the rights to use,
220 // copy, modify, merge, publish, distribute, and/or sell copies of
221 // the Data Files or Software, and to permit persons to whom the Data Files
222 // or Software are furnished to do so, provided that either
223 // (a) this copyright and permission notice appear with all copies
224 // of the Data Files or Software, or
225 // (b) this copyright and permission notice appear in associated
226 // Documentation.
228 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
229 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
230 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
231 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
232 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
233 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
234 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
235 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
236 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
237 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
239 // Except as contained in this notice, the name of a copyright holder
240 // shall not be used in advertising or otherwise to promote the sale,
241 // use or other dealings in these Data Files or Software without prior
242 // written authorization of the copyright holder.
244 #ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
245 #define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
247 #include <__algorithm/ranges_upper_bound.h>
248 #include <__config>
249 #include <__cstddef/ptrdiff_t.h>
250 #include <cstdint>
252 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
253 # pragma GCC system_header
254 #endif
256 _LIBCPP_BEGIN_NAMESPACE_STD
258 #if _LIBCPP_STD_VER >= 20
260 namespace __width_estimation_table {{
261 {content}
262 }} // namespace __width_estimation_table
264 #endif // _LIBCPP_STD_VER >= 20
266 _LIBCPP_END_NAMESPACE_STD
268 #endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H"""
271 def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
272 # The maximum value that can be encoded in the available bits in the
273 # __entries table.
274 upper_bound = 0x3FFFF
275 # The maximum offset in an __entries entry. Larger offsets will be
276 # splitted and stored in multiple entries.
277 chunk = 16384
278 result = list[Entry]()
279 high = -1
280 for range in sorted(ranges, key=lambda x: x.lower):
281 # Validate overlapping ranges
282 assert range.lower > high
283 high = range.upper
284 assert high <= upper_bound
286 while True:
287 e = Entry(range.lower, range.upper - range.lower)
288 if e.offset < chunk:
289 result.append(e)
290 break
291 e.offset = chunk - 1
292 result.append(e)
293 range.lower += chunk
294 return result
297 cpp_entrytemplate = " 0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
300 def generate_cpp_data(ranges: list[PropertyRange], upper_bound: int) -> str:
301 result = StringIO()
302 table = property_ranges_to_table(ranges)
303 result.write(
304 DATA_ARRAY_TEMPLATE.format(
305 size=len(table),
306 entries=", //\n".join(
308 cpp_entrytemplate.format(
309 x.lower << 14 | x.offset,
310 x.lower,
311 x.lower + x.offset,
312 x.offset + 1,
314 for x in table
317 upper_bound=upper_bound,
321 return result.getvalue()
324 def generate_data_tables() -> str:
326 Generate Unicode data for [format.string.std]/12
328 east_asian_width_path = (
329 Path(__file__).absolute().parent / "data" / "unicode" / "EastAsianWidth.txt"
332 properties = list()
333 with east_asian_width_path.open(encoding="utf-8") as f:
334 properties.extend(
335 list(
336 filter(
337 filterProperty,
338 [x for line in f if (x := parsePropertyLine(line))],
342 # The range U+4DC0 - U+4DFF is neutral and should not be in the table
343 # The range U+1F300 - U+1F5FF is partly in the range, for example
344 # 1F300..1F320;W # So [33] CYCLONE..SHOOTING STAR
345 # 1F321..1F32C;N # So [12] THERMOMETER..WIND BLOWING FACE
346 # 1F32D..1F335;W # So [9] HOT DOG..CACTUS
347 # The first and last ranges are present, but the second isn't
349 # Validate the hardcode ranges are present
351 # Yijing Hexagram Symbols
352 for i in range(0x4DC0, 0x4DFF + 1):
353 assert [x for x in properties if i >= x.lower and i <= x.upper]
355 # Miscellaneous Symbols and Pictographs
356 for i in range(0x1F300, 0x1F5FF + 1):
357 assert [x for x in properties if i >= x.lower and i <= x.upper]
359 # Miscellaneous Symbols and Pictographs
360 for i in range(0x1F900, 0x1F9FF + 1):
361 assert [x for x in properties if i >= x.lower and i <= x.upper]
363 data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))
365 return "\n".join([generate_cpp_data(data, data[-1].upper)])
368 if __name__ == "__main__":
369 if len(sys.argv) == 2:
370 sys.stdout = open(sys.argv[1], "w")
371 print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables()))