libcxx/utils/generate_indic_conjunct_break_table.py

   1 #!/usr/bin/env python
   2 # ===----------------------------------------------------------------------===##
   3 #
   4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 # See https://llvm.org/LICENSE.txt for license information.
   6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 #
   8 # ===----------------------------------------------------------------------===##
   9
  10 # The code is based on
  11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
  12 #
  13 # Copyright (c) Microsoft Corporation.
  14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  15
  16 from io import StringIO
  17 from pathlib import Path
  18 from dataclasses import dataclass
  19 from typing import Optional
  20 import re
  21 import sys
  22
  23
  24 @dataclass
  25 class PropertyRange:
  26     lower: int = -1
  27     upper: int = -1
  28     prop: str = None
  29
  30
  31 @dataclass
  32 class Entry:
  33     lower: int = -1
  34     offset: int = -1
  35     prop: int = -1
  36
  37
  38 LINE_REGEX = re.compile(
  39     r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*InCB;\s*(?P<prop>\w+)"
  40 )
  41
  42 def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
  43     result = PropertyRange()
  44     if m := LINE_REGEX.match(inputLine):
  45         lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
  46         result.lower = int(lower_str, base=16)
  47         result.upper = result.lower
  48         if upper_str is not None:
  49             result.upper = int(upper_str, base=16)
  50         return result
  51
  52     else:
  53         return None
  54
  55
  56
  57 def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
  58     """
  59     Merges consecutive ranges with the same property to one range.
  60
  61     Merging the ranges results in fewer ranges in the output table,
  62     reducing binary and improving lookup performance.
  63     """
  64     result = list()
  65     for x in input:
  66         if (
  67             len(result)
  68             and result[-1].prop == x.prop
  69             and result[-1].upper + 1 == x.lower
  70         ):
  71             result[-1].upper = x.upper
  72             continue
  73         result.append(x)
  74     return result
  75
  76
  77 PROP_VALUE_ENUMERATOR_TEMPLATE = "  __{}"
  78 PROP_VALUE_ENUM_TEMPLATE = """
  79 enum class __property : uint8_t {{
  80   // Values generated from the data files.
  81 {enumerators},
  82
  83   // The code unit has none of above properties.
  84   __none
  85 }};
  86 """
  87
  88 DATA_ARRAY_TEMPLATE = """
  89 /// The entries of the indic conjunct break property table.
  90 ///
  91 /// The data is generated from
  92 /// -  https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
  93 ///
  94 /// The data has 3 values
  95 /// - bits [0, 1] The property. One of the values generated from the datafiles
  96 ///   of \\ref __property
  97 /// - bits [2, 10] The size of the range.
  98 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
  99 ///   the range is lower bound + size.
 100 ///
 101 /// The 9 bits for the size allow a maximum range of 512 elements. Some ranges
 102 /// in the Unicode tables are larger. They are stored in multiple consecutive
 103 /// ranges in the data table. An alternative would be to store the sizes in a
 104 /// separate 16-bit value. The original MSVC STL code had such an approach, but
 105 /// this approach uses less space for the data and is about 4% faster in the
 106 /// following benchmark.
 107 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
 108 // clang-format off
 109 _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
 110 {entries}}};
 111 // clang-format on
 112
 113 /// Returns the indic conjuct break property of a code point.
 114 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
 115   // The algorithm searches for the upper bound of the range and, when found,
 116   // steps back one entry. This algorithm is used since the code point can be
 117   // anywhere in the range. After a lower bound is found the next step is to
 118   // compare whether the code unit is indeed in the range.
 119   //
 120   // Since the entry contains a code unit, size, and property the code point
 121   // being sought needs to be adjusted. Just shifting the code point to the
 122   // proper position doesn't work; suppose an entry has property 0, size 1,
 123   // and lower bound 3. This results in the entry 0x1810.
 124   // When searching for code point 3 it will search for 0x1800, find 0x1810
 125   // and moves to the previous entry. Thus the lower bound value will never
 126   // be found.
 127   // The simple solution is to set the bits belonging to the property and
 128   // size. Then the upper bound for code point 3 will return the entry after
 129   // 0x1810. After moving to the previous entry the algorithm arrives at the
 130   // correct entry.
 131   ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
 132   if (__i == 0)
 133     return __property::__none;
 134
 135   --__i;
 136   uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 2) & 0b1'1111'1111);
 137   if (__code_point <= __upper_bound)
 138     return static_cast<__property>(__entries[__i] & 0b11);
 139
 140   return __property::__none;
 141 }}
 142 """
 143
 144 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
 145 // -*- C++ -*-
 146 //===----------------------------------------------------------------------===//
 147 //
 148 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 149 // See https://llvm.org/LICENSE.txt for license information.
 150 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 151 //
 152 //===----------------------------------------------------------------------===//
 153
 154 // WARNING, this entire header is generated by
 155 // utils/generate_indic_conjunct_break_table.py
 156 // DO NOT MODIFY!
 157
 158 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 159 //
 160 // See Terms of Use <https://www.unicode.org/copyright.html>
 161 // for definitions of Unicode Inc.'s Data Files and Software.
 162 //
 163 // NOTICE TO USER: Carefully read the following legal agreement.
 164 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
 165 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
 166 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
 167 // TERMS AND CONDITIONS OF THIS AGREEMENT.
 168 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
 169 // THE DATA FILES OR SOFTWARE.
 170 //
 171 // COPYRIGHT AND PERMISSION NOTICE
 172 //
 173 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
 174 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
 175 //
 176 // Permission is hereby granted, free of charge, to any person obtaining
 177 // a copy of the Unicode data files and any associated documentation
 178 // (the "Data Files") or Unicode software and any associated documentation
 179 // (the "Software") to deal in the Data Files or Software
 180 // without restriction, including without limitation the rights to use,
 181 // copy, modify, merge, publish, distribute, and/or sell copies of
 182 // the Data Files or Software, and to permit persons to whom the Data Files
 183 // or Software are furnished to do so, provided that either
 184 // (a) this copyright and permission notice appear with all copies
 185 // of the Data Files or Software, or
 186 // (b) this copyright and permission notice appear in associated
 187 // Documentation.
 188 //
 189 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 190 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 191 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 192 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 193 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 194 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 195 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 196 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 197 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 198 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 199 //
 200 // Except as contained in this notice, the name of a copyright holder
 201 // shall not be used in advertising or otherwise to promote the sale,
 202 // use or other dealings in these Data Files or Software without prior
 203 // written authorization of the copyright holder.
 204
 205 #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
 206 #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
 207
 208 #include <__algorithm/ranges_upper_bound.h>
 209 #include <__config>
 210 #include <__iterator/access.h>
 211 #include <cstddef>
 212 #include <cstdint>
 213
 214 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 215 #  pragma GCC system_header
 216 #endif
 217
 218 _LIBCPP_BEGIN_NAMESPACE_STD
 219
 220 #if _LIBCPP_STD_VER >= 20
 221
 222 namespace __indic_conjunct_break {{
 223 {content}
 224 }} // namespace __indic_conjunct_break
 225
 226 #endif //_LIBCPP_STD_VER >= 20
 227
 228 _LIBCPP_END_NAMESPACE_STD
 229
 230 #endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H"""
 231
 232
 233 def property_ranges_to_table(
 234     ranges: list[PropertyRange], props: list[str]
 235 ) -> list[Entry]:
 236     assert len(props) < 4
 237     result = list[Entry]()
 238     high = -1
 239     for range in sorted(ranges, key=lambda x: x.lower):
 240         # Validate overlapping ranges
 241         assert range.lower > high
 242         high = range.upper
 243
 244         while True:
 245             e = Entry(range.lower, range.upper - range.lower, props.index(range.prop))
 246             if e.offset <= 511:
 247                 result.append(e)
 248                 break
 249             e.offset = 511
 250             result.append(e)
 251             range.lower += 512
 252     return result
 253
 254
 255 cpp_entrytemplate = "    0x{:08x}"
 256
 257
 258 def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
 259     result = StringIO()
 260     prop_values = sorted(set(x.prop for x in ranges))
 261     table = property_ranges_to_table(ranges, prop_values)
 262     enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values]
 263     result.write(
 264         PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values))
 265     )
 266     result.write(
 267         DATA_ARRAY_TEMPLATE.format(
 268             prop_name=prop_name,
 269             size=len(table),
 270             entries=",\n".join(
 271                 [
 272                     cpp_entrytemplate.format(x.lower << 11 | x.offset << 2 | x.prop)
 273                     for x in table
 274                 ]
 275             ),
 276         )
 277     )
 278
 279     return result.getvalue()
 280
 281
 282 def generate_data_tables() -> str:
 283     """
 284     Generate Unicode data for inclusion into <format> from
 285     - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 286
 287     These files are expected to be stored in the same directory as this script.
 288     """
 289     root = Path(__file__).absolute().parent / "data" / "unicode"
 290     derived_core_path = root / "DerivedCoreProperties.txt"
 291
 292     indic_conjunct_break = list()
 293     with derived_core_path.open(encoding="utf-8") as f:
 294         indic_conjunct_break_ranges = compactPropertyRanges(
 295             [x for line in f if (x := parsePropertyLine(line))]
 296         )
 297
 298     indic_conjunct_break_data = generate_cpp_data("Grapheme_Break", indic_conjunct_break_ranges)
 299     return "\n".join([indic_conjunct_break_data])
 300
 301
 302 if __name__ == "__main__":
 303     if len(sys.argv) == 2:
 304         sys.stdout = open(sys.argv[1], "w")
 305     print(
 306         MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
 307             content=generate_data_tables()
 308         )
 309     )