libcxx/utils/generate_width_estimation_table.py

   1 #!/usr/bin/env python
   2 # ===----------------------------------------------------------------------===##
   3 #
   4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 # See https://llvm.org/LICENSE.txt for license information.
   6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 #
   8 # ===----------------------------------------------------------------------===##
   9
  10 # The code is based on
  11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
  12 #
  13 # Copyright (c) Microsoft Corporation.
  14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  15
  16 from io import StringIO
  17 from pathlib import Path
  18 from dataclasses import dataclass
  19 from typing import Optional
  20 import re
  21 import sys
  22
  23
  24 @dataclass
  25 class PropertyRange:
  26     lower: int = -1
  27     upper: int = -1
  28     prop: str = None
  29
  30
  31 @dataclass
  32 class Entry:
  33     lower: int = -1
  34     offset: int = -1
  35
  36
  37 LINE_REGEX = re.compile(
  38     r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
  39 )
  40
  41
  42 def filterProperty(element: PropertyRange) -> Optional[PropertyRange]:
  43     ### Matches property predicate?
  44     if element.prop in ["W", "F"]:
  45         return element
  46
  47     ### Matches hardcode ranges predicate?
  48
  49     # Yijing Hexagram Symbols
  50     if element.lower >= 0x4DC0 and element.upper <= 0x4DFF:
  51         return element
  52
  53     # Miscellaneous Symbols and Pictographs
  54     if element.lower >= 0x1F300 and element.upper <= 0x1F5FF:
  55         return element
  56
  57     # Supplemental Symbols and Pictographs
  58     if element.lower >= 0x1F900 and element.upper <= 0x1F9FF:
  59         return element
  60
  61     return None
  62
  63
  64 def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
  65     result = PropertyRange()
  66     if m := LINE_REGEX.match(inputLine):
  67         lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
  68         result.lower = int(lower_str, base=16)
  69         result.upper = result.lower
  70         if upper_str is not None:
  71             result.upper = int(upper_str, base=16)
  72         return result
  73
  74     else:
  75         return None
  76
  77
  78 def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
  79     """
  80     Merges overlapping and consecutive ranges to one range.
  81
  82     Since the input properties are filtered the exact property isn't
  83     interesting anymore. The properties in the output are merged to aid
  84     debugging.
  85     Merging the ranges results in fewer ranges in the output table,
  86     reducing binary and improving lookup performance.
  87     """
  88     result = list()
  89     for x in input:
  90         if (
  91             len(result)
  92             and x.lower > result[-1].lower
  93             and x.lower <= result[-1].upper + 1
  94         ):
  95             result[-1].upper = max(result[-1].upper, x.upper)
  96             result[-1].prop += f" {x.prop}"
  97             continue
  98         result.append(x)
  99     return result
 100
 101
 102 DATA_ARRAY_TEMPLATE = """
 103 /// The entries of the characters with an estimated width of 2.
 104 ///
 105 /// Contains the entries for [format.string.std]/12
 106 ///  -  Any code point with the East_Asian_Width="W" or East_Asian_Width="F"
 107 ///     Derived Extracted Property as described by UAX #44
 108 /// - U+4DC0 - U+4DFF (Yijing Hexagram Symbols)
 109 /// - U+1F300 - U+1F5FF (Miscellaneous Symbols and Pictographs)
 110 /// - U+1F900 - U+1F9FF (Supplemental Symbols and Pictographs)
 111 ///
 112 /// The data is generated from
 113 /// - https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
 114 /// - The "overrides" in [format.string.std]/12
 115 ///
 116 /// The format of EastAsianWidth.txt is two fields separated by a semicolon.
 117 /// Field 0: Unicode code point value or range of code point values
 118 /// Field 1: East_Asian_Width property, consisting of one of the following values:
 119 ///         "A", "F", "H", "N", "Na", "W"
 120 ///  - All code points, assigned or unassigned, that are not listed
 121 ///      explicitly are given the value "N".
 122 ///  - The unassigned code points in the following blocks default to "W":
 123 ///         CJK Unified Ideographs Extension A: U+3400..U+4DBF
 124 ///         CJK Unified Ideographs:             U+4E00..U+9FFF
 125 ///         CJK Compatibility Ideographs:       U+F900..U+FAFF
 126 ///  - All undesignated code points in Planes 2 and 3, whether inside or
 127 ///      outside of allocated blocks, default to "W":
 128 ///         Plane 2:                            U+20000..U+2FFFD
 129 ///         Plane 3:                            U+30000..U+3FFFD
 130 ///
 131 /// The table is similar to the table
 132 ///  __extended_grapheme_custer_property_boundary::__entries
 133 /// which explains the details of these classes. The only difference is this
 134 /// table lacks a property, thus having more bits available for the size.
 135 ///
 136 /// The maximum code point that has an estimated width of 2 is U+3FFFD. This
 137 /// value can be encoded in 18 bits. Thus the upper 3 bits of the code point
 138 /// are always 0. These 3 bits are used to enlarge the offset range. This
 139 /// optimization reduces the table in Unicode 15 from 184 to 104 entries,
 140 /// saving 320 bytes.
 141 ///
 142 /// The data has 2 values:
 143 /// - bits [0, 13] The size of the range, allowing 16384 elements.
 144 /// - bits [14, 31] The lower bound code point of the range. The upper bound of
 145 ///   the range is lower bound + size.
 146 inline constexpr uint32_t __entries[{size}] = {{
 147 {entries}}};
 148
 149 /// The upper bound entry of EastAsianWidth.txt.
 150 ///
 151 /// Values greater than this value may have more than 18 significant bits.
 152 /// They always have a width of 1. This property makes it possible to store
 153 /// the table in its compact form.
 154 inline constexpr uint32_t __table_upper_bound = 0x{upper_bound:08x};
 155
 156 /// Returns the estimated width of a Unicode code point.
 157 ///
 158 /// \pre The code point is a valid Unicode code point.
 159 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int __estimated_width(const char32_t __code_point) noexcept {{
 160   // Since __table_upper_bound contains the unshifted range do the
 161   // comparison without shifting.
 162   if (__code_point > __table_upper_bound) [[unlikely]]
 163     return 1;
 164
 165   // When the code-point is less than the first element in the table
 166   // the lookup is quite expensive. Since quite some scripts are in
 167   // that range, it makes sense to validate that first.
 168   // The std_format_spec_string_unicode benchmark gives a measurable
 169   // improvement.
 170   if (__code_point < (__entries[0] >> 14))
 171     return 1;
 172
 173   ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
 174   if (__i == 0)
 175     return 1;
 176
 177   --__i;
 178   uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
 179   return 1 + (__code_point <= __upper_bound);
 180 }}
 181 """
 182
 183 TABLES_HPP_TEMPLATE = """
 184 // -*- C++ -*-
 185 //===----------------------------------------------------------------------===//
 186 //
 187 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 188 // See https://llvm.org/LICENSE.txt for license information.
 189 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 190 //
 191 //===----------------------------------------------------------------------===//
 192
 193 // WARNING, this entire header is generated by
 194 // utils/generate_width_estimation_table.py
 195 // DO NOT MODIFY!
 196
 197 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 198 //
 199 // See Terms of Use <https://www.unicode.org/copyright.html>
 200 // for definitions of Unicode Inc.'s Data Files and Software.
 201 //
 202 // NOTICE TO USER: Carefully read the following legal agreement.
 203 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
 204 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
 205 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
 206 // TERMS AND CONDITIONS OF THIS AGREEMENT.
 207 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
 208 // THE DATA FILES OR SOFTWARE.
 209 //
 210 // COPYRIGHT AND PERMISSION NOTICE
 211 //
 212 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
 213 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
 214 //
 215 // Permission is hereby granted, free of charge, to any person obtaining
 216 // a copy of the Unicode data files and any associated documentation
 217 // (the "Data Files") or Unicode software and any associated documentation
 218 // (the "Software") to deal in the Data Files or Software
 219 // without restriction, including without limitation the rights to use,
 220 // copy, modify, merge, publish, distribute, and/or sell copies of
 221 // the Data Files or Software, and to permit persons to whom the Data Files
 222 // or Software are furnished to do so, provided that either
 223 // (a) this copyright and permission notice appear with all copies
 224 // of the Data Files or Software, or
 225 // (b) this copyright and permission notice appear in associated
 226 // Documentation.
 227 //
 228 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 229 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 230 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 231 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 232 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 233 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 234 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 235 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 236 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 237 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 238 //
 239 // Except as contained in this notice, the name of a copyright holder
 240 // shall not be used in advertising or otherwise to promote the sale,
 241 // use or other dealings in these Data Files or Software without prior
 242 // written authorization of the copyright holder.
 243
 244 #ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
 245 #define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H
 246
 247 #include <__algorithm/ranges_upper_bound.h>
 248 #include <__config>
 249 #include <cstddef>
 250 #include <cstdint>
 251
 252 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 253 #  pragma GCC system_header
 254 #endif
 255
 256 _LIBCPP_BEGIN_NAMESPACE_STD
 257
 258 #if _LIBCPP_STD_VER >= 20
 259
 260 namespace __width_estimation_table {{
 261 {content}
 262 }} // namespace __width_estimation_table
 263
 264 #endif //_LIBCPP_STD_VER >= 20
 265
 266 _LIBCPP_END_NAMESPACE_STD
 267
 268 #endif // _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H"""
 269
 270
 271 def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
 272     # The maximum value that can be encoded in the available bits in the
 273     # __entries table.
 274     upper_bound = 0x3FFFF
 275     # The maximum offset in an __entries entry. Larger offsets will be
 276     # splitted and stored in multiple entries.
 277     chunk = 16384
 278     result = list[Entry]()
 279     high = -1
 280     for range in sorted(ranges, key=lambda x: x.lower):
 281         # Validate overlapping ranges
 282         assert range.lower > high
 283         high = range.upper
 284         assert high <= upper_bound
 285
 286         while True:
 287             e = Entry(range.lower, range.upper - range.lower)
 288             if e.offset < chunk:
 289                 result.append(e)
 290                 break
 291             e.offset = chunk - 1
 292             result.append(e)
 293             range.lower += chunk
 294     return result
 295
 296
 297 cpp_entrytemplate = "    0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
 298
 299
 300 def generate_cpp_data(ranges: list[PropertyRange], upper_bound: int) -> str:
 301     result = StringIO()
 302     table = property_ranges_to_table(ranges)
 303     result.write(
 304         DATA_ARRAY_TEMPLATE.format(
 305             size=len(table),
 306             entries=", //\n".join(
 307                 [
 308                     cpp_entrytemplate.format(
 309                         x.lower << 14 | x.offset,
 310                         x.lower,
 311                         x.lower + x.offset,
 312                         x.offset + 1,
 313                     )
 314                     for x in table
 315                 ]
 316             ),
 317             upper_bound=upper_bound,
 318         )
 319     )
 320
 321     return result.getvalue()
 322
 323
 324 def generate_data_tables() -> str:
 325     """
 326     Generate Unicode data for [format.string.std]/12
 327     """
 328     east_asian_width_path = (
 329         Path(__file__).absolute().parent / "data" / "unicode" / "EastAsianWidth.txt"
 330     )
 331
 332     properties = list()
 333     with east_asian_width_path.open(encoding="utf-8") as f:
 334         properties.extend(
 335             list(
 336                 filter(
 337                     filterProperty,
 338                     [x for line in f if (x := parsePropertyLine(line))],
 339                 )
 340             )
 341         )
 342     # The range U+4DC0 - U+4DFF is neutral and should not be in the table
 343     # The range U+1F300 - U+1F5FF is partly in the range, for example
 344     #   1F300..1F320;W   # So    [33] CYCLONE..SHOOTING STAR
 345     #   1F321..1F32C;N   # So    [12] THERMOMETER..WIND BLOWING FACE
 346     #   1F32D..1F335;W   # So     [9] HOT DOG..CACTUS
 347     # The first and last ranges are present, but the second isn't
 348
 349     # Validate the hardcode ranges are present
 350
 351     # Yijing Hexagram Symbols
 352     for i in range(0x4DC0, 0x4DFF + 1):
 353         assert [x for x in properties if i >= x.lower and i <= x.upper]
 354
 355     # Miscellaneous Symbols and Pictographs
 356     for i in range(0x1F300, 0x1F5FF + 1):
 357         assert [x for x in properties if i >= x.lower and i <= x.upper]
 358
 359     # Miscellaneous Symbols and Pictographs
 360     for i in range(0x1F900, 0x1F9FF + 1):
 361         assert [x for x in properties if i >= x.lower and i <= x.upper]
 362
 363     data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))
 364
 365     return "\n".join([generate_cpp_data(data, data[-1].upper)])
 366
 367
 368 if __name__ == "__main__":
 369     if len(sys.argv) == 2:
 370         sys.stdout = open(sys.argv[1], "w")
 371     print(TABLES_HPP_TEMPLATE.lstrip().format(content=generate_data_tables()))