2 # ===----------------------------------------------------------------------===##
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===----------------------------------------------------------------------===##
10 # The code is based on
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
13 # Copyright (c) Microsoft Corporation.
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16 from io
import StringIO
17 from pathlib
import Path
18 from dataclasses
import dataclass
, field
19 from typing
import Optional
37 LINE_REGEX
= re
.compile(
38 r
"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
42 def filterProperty(element
: PropertyRange
) -> Optional
[PropertyRange
]:
43 ### Matches property predicate?
44 if element
.prop
in ["W", "F"]:
47 ### Matches hardcode ranges predicate?
49 # Yijing Hexagram Symbols
50 if element
.lower
>= 0x4DC0 and element
.upper
<= 0x4DFF:
53 # Miscellaneous Symbols and Pictographs
54 if element
.lower
>= 0x1F300 and element
.upper
<= 0x1F5FF:
57 # Supplemental Symbols and Pictographs
58 if element
.lower
>= 0x1F900 and element
.upper
<= 0x1F9FF:
64 def parsePropertyLine(inputLine
: str) -> Optional
[PropertyRange
]:
65 result
= PropertyRange()
66 if m
:= LINE_REGEX
.match(inputLine
):
67 lower_str
, upper_str
, result
.prop
= m
.group("lower", "upper", "prop")
68 result
.lower
= int(lower_str
, base
=16)
69 result
.upper
= result
.lower
70 if upper_str
is not None:
71 result
.upper
= int(upper_str
, base
=16)
78 def compactPropertyRanges(input: list[PropertyRange
]) -> list[PropertyRange
]:
80 Merges overlapping and consecutive ranges to one range.
82 Since the input properties are filtered the exact property isn't
83 interesting anymore. The properties in the output are merged to aid
85 Merging the ranges results in fewer ranges in the output table,
86 reducing binary and improving lookup performance.
92 and x
.lower
> result
[-1].lower
93 and x
.lower
<= result
[-1].upper
+ 1
95 result
[-1].upper
= max(result
[-1].upper
, x
.upper
)
96 result
[-1].prop
+= f
" {x.prop}"
102 DATA_ARRAY_TEMPLATE
= """
103 /// The entries of the characters with an estimated width of 2.
105 /// Contains the entries for [format.string.std]/12
106 /// - Any code point with the East_Asian_Width="W" or East_Asian_Width="F"
107 /// Derived Extracted Property as described by UAX #44
108 /// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols)
109 /// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs)
110 /// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs)
112 /// The data is generated from
113 /// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
114 /// - The "overrides" in [format.string.std]/12
116 /// The format of EastAsianWidth.txt is two fields separated by a semicolon.
117 /// Field 0: Unicode code point value or range of code point values
118 /// Field 1: East_Asian_Width property, consisting of one of the following values:
119 /// "A", "F", "H", "N", "Na", "W"
120 /// - All code points, assigned or unassigned, that are not listed
121 /// explicitly are given the value "N".
122 /// - The unassigned code points in the following blocks default to "W":
123 /// CJK Unified Ideographs Extension A: U+3400..U+4DBF
124 /// CJK Unified Ideographs: U+4E00..U+9FFF
125 /// CJK Compatibility Ideographs: U+F900..U+FAFF
126 /// - All undesignated code points in Planes 2 and 3, whether inside or
127 /// outside of allocated blocks, default to "W":
128 /// Plane 2: U+20000..U+2FFFD
129 /// Plane 3: U+30000..U+3FFFD
131 /// The table is similar to the table
132 /// __extended_grapheme_custer_property_boundary::__entries
133 /// which explains the details of these classes. The only difference is this
134 /// table lacks a property, thus having more bits available for the size.
136 /// The maximum code point that has an estimated width of 2 is U+3FFFD. This
137 /// value can be encoded in 18 bits. Thus the upper 3 bits of the code point
138 /// are always 0. These 3 bits are used to enlarge the offset range. This
139 /// optimization reduces the table in Unicode 15 from 184 to 104 entries,
140 /// saving 320 bytes.
142 /// The data has 2 values:
143 /// - bits [0, 13] The size of the range, allowing 16384 elements.
144 /// - bits [14, 31] The lower bound code point of the range. The upper bound of
145 /// the range is lower bound + size.
146 inline constexpr uint32_t __entries[{size}] = {{
149 /// The upper bound entry of EastAsianWidth.txt.
151 /// Values greater than this value may have more than 18 significant bits.
152 /// They always have a width of 1. This property makes it possible to store
153 /// the table in its compact form.
154 inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x};
156 /// Returns the estimated width of a Unicode code point.
158 /// \pre The code point is a valid Unicode code point.
159 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{
160 // Since __table_upper_bound contains the unshifted range do the
161 // comparison without shifting.
162 if (__code_point > __table_upper_bound) [[unlikely]]
165 // When the code-point is less than the first element in the table
166 // the lookup is quite expensive. Since quite some scripts are in
167 // that range, it makes sense to validate that first.
168 // The std_format_spec_string_unicode benchmark gives a measurable
170 if (__code_point < (__entries[0] >> 14))
173 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
178 uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
179 return 1 + (__code_point <= __upper_bound);
183 TABLES_HPP_TEMPLATE
= """
185 //===----------------------------------------------------------------------===//
187 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
188 // See https://llvm.org/LICENSE.txt for license information.
189 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
191 //===----------------------------------------------------------------------===//
193 // WARNING, this entire header is generated by
194 // utils/generate_width_estimation_table.py
197 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
199 // See Terms of Use <https://www.unicode.org/copyright.html>
200 // for definitions of Unicode Inc.'s Data Files and Software.
202 // NOTICE TO USER: Carefully read the following legal agreement.
203 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
204 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
205 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
206 // TERMS AND CONDITIONS OF THIS AGREEMENT.
207 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
208 // THE DATA FILES OR SOFTWARE.
210 // COPYRIGHT AND PERMISSION NOTICE
212 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
213 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
215 // Permission is hereby granted, free of charge, to any person obtaining
216 // a copy of the Unicode data files and any associated documentation
217 // (the "Data Files") or Unicode software and any associated documentation
218 // (the "Software") to deal in the Data Files or Software
219 // without restriction, including without limitation the rights to use,
220 // copy, modify, merge, publish, distribute, and/or sell copies of
221 // the Data Files or Software, and to permit persons to whom the Data Files
222 // or Software are furnished to do so, provided that either
223 // (a) this copyright and permission notice appear with all copies
224 // of the Data Files or Software, or
225 // (b) this copyright and permission notice appear in associated
228 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
229 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
230 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
231 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
232 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
233 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
234 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
235 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
236 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
237 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
239 // Except as contained in this notice, the name of a copyright holder
240 // shall not be used in advertising or otherwise to promote the sale,
241 // use or other dealings in these Data Files or Software without prior
242 // written authorization of the copyright holder.
244 #ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
245 #define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
247 #include <__algorithm/ranges_upper_bound.h>
252 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
253 # pragma GCC system_header
256 _LIBCPP_BEGIN_NAMESPACE_STD
258 #if _LIBCPP_STD_VER >= 20
260 namespace __width_estimation_table {{
262 }} // namespace __width_estimation_table
264 #endif //_LIBCPP_STD_VER >= 20
266 _LIBCPP_END_NAMESPACE_STD
268 #endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H"""
271 def property_ranges_to_table(ranges
: list[PropertyRange
]) -> list[Entry
]:
272 # The maximum value that can be encoded in the available bits in the
274 upper_bound
= 0x3FFFF
275 # The maximum offset in an __entries entry. Larger offsets will be
276 # splitted and stored in multiple entries.
278 result
= list[Entry
]()
280 for range in sorted(ranges
, key
=lambda x
: x
.lower
):
281 # Validate overlapping ranges
282 assert range.lower
> high
284 assert high
<= upper_bound
287 e
= Entry(range.lower
, range.upper
- range.lower
)
297 cpp_entrytemplate
= " 0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
300 def generate_cpp_data(ranges
: list[PropertyRange
], upper_bound
: int) -> str:
302 table
= property_ranges_to_table(ranges
)
304 DATA_ARRAY_TEMPLATE
.format(
306 entries
=", //\n".join(
308 cpp_entrytemplate
.format(
309 x
.lower
<< 14 | x
.offset
,
317 upper_bound
=upper_bound
,
321 return result
.getvalue()
324 def generate_data_tables() -> str:
326 Generate Unicode data for [format.string.std]/12
328 east_asian_width_path
= (
329 Path(__file__
).absolute().parent
/ "data" / "unicode" / "EastAsianWidth.txt"
333 with east_asian_width_path
.open(encoding
="utf-8") as f
:
338 [x
for line
in f
if (x
:= parsePropertyLine(line
))],
342 # The range U+4DC0 - U+4DFF is neutral and should not be in the table
343 # The range U+1F300 - U+1F5FF is partly in the range, for example
344 # 1F300..1F320;W # So [33] CYCLONE..SHOOTING STAR
345 # 1F321..1F32C;N # So [12] THERMOMETER..WIND BLOWING FACE
346 # 1F32D..1F335;W # So [9] HOT DOG..CACTUS
347 # The first and last ranges are present, but the second isn't
349 # Validate the hardcode ranges are present
351 # Yijing Hexagram Symbols
352 for i
in range(0x4DC0, 0x4DFF + 1):
353 assert [x
for x
in properties
if i
>= x
.lower
and i
<= x
.upper
]
355 # Miscellaneous Symbols and Pictographs
356 for i
in range(0x1F300, 0x1F5FF + 1):
357 assert [x
for x
in properties
if i
>= x
.lower
and i
<= x
.upper
]
359 # Miscellaneous Symbols and Pictographs
360 for i
in range(0x1F900, 0x1F9FF + 1):
361 assert [x
for x
in properties
if i
>= x
.lower
and i
<= x
.upper
]
363 data
= compactPropertyRanges(sorted(properties
, key
=lambda x
: x
.lower
))
365 return "\n".join([generate_cpp_data(data
, data
[-1].upper
)])
368 if __name__
== "__main__":
369 if len(sys
.argv
) == 2:
370 sys
.stdout
= open(sys
.argv
[1], "w")
371 print(TABLES_HPP_TEMPLATE
.lstrip().format(content
=generate_data_tables()))