libcxx/utils/generate_extended_grapheme_cluster_table.py

   1 #!/usr/bin/env python
   2 # ===----------------------------------------------------------------------===##
   3 #
   4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 # See https://llvm.org/LICENSE.txt for license information.
   6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 #
   8 # ===----------------------------------------------------------------------===##
   9
  10 # The code is based on
  11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
  12 #
  13 # Copyright (c) Microsoft Corporation.
  14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  15
  16 from io import StringIO
  17 from pathlib import Path
  18 from dataclasses import dataclass, field
  19 from typing import Optional
  20 import re
  21 import sys
  22
  23
  24 @dataclass
  25 class PropertyRange:
  26     lower: int = -1
  27     upper: int = -1
  28     prop: str = None
  29
  30
  31 @dataclass
  32 class Entry:
  33     lower: int = -1
  34     offset: int = -1
  35     prop: int = -1
  36
  37
  38 LINE_REGEX = re.compile(
  39     r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)"
  40 )
  41
  42
  43 def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
  44     result = PropertyRange()
  45     if m := LINE_REGEX.match(inputLine):
  46         lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
  47         result.lower = int(lower_str, base=16)
  48         result.upper = result.lower
  49         if upper_str is not None:
  50             result.upper = int(upper_str, base=16)
  51         return result
  52
  53     else:
  54         return None
  55
  56
  57 def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
  58     """
  59     Merges consecutive ranges with the same property to one range.
  60
  61     Merging the ranges results in fewer ranges in the output table,
  62     reducing binary and improving lookup performance.
  63     """
  64     result = list()
  65     for x in input:
  66         if (
  67             len(result)
  68             and result[-1].prop == x.prop
  69             and result[-1].upper + 1 == x.lower
  70         ):
  71             result[-1].upper = x.upper
  72             continue
  73         result.append(x)
  74     return result
  75
  76
  77 PROP_VALUE_ENUMERATOR_TEMPLATE = "  __{}"
  78 PROP_VALUE_ENUM_TEMPLATE = """
  79 enum class __property : uint8_t {{
  80   // Values generated from the data files.
  81 {enumerators},
  82
  83   // The properies below aren't stored in the "database".
  84
  85   // Text position properties.
  86   __sot,
  87   __eot,
  88
  89   // The code unit has none of above properties.
  90   __none
  91 }};
  92 """
  93
  94 DATA_ARRAY_TEMPLATE = """
  95 /// The entries of the extended grapheme cluster bondary property table.
  96 ///
  97 /// The data is generated from
  98 /// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
  99 /// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
 100 ///
 101 /// The data has 3 values
 102 /// - bits [0, 3] The property. One of the values generated from the datafiles
 103 ///   of \\ref __property
 104 /// - bits [4, 10] The size of the range.
 105 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
 106 ///   the range is lower bound + size.
 107 ///
 108 /// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
 109 /// in the Unicode tables are larger. They are stored in multiple consecutive
 110 /// ranges in the data table. An alternative would be to store the sizes in a
 111 /// separate 16-bit value. The original MSVC STL code had such an approach, but
 112 /// this approach uses less space for the data and is about 4% faster in the
 113 /// following benchmark.
 114 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
 115 inline constexpr uint32_t __entries[{size}] = {{
 116 {entries}}};
 117
 118 /// Returns the extended grapheme cluster bondary property of a code point.
 119 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
 120   // The algorithm searches for the upper bound of the range and, when found,
 121   // steps back one entry. This algorithm is used since the code point can be
 122   // anywhere in the range. After a lower bound is found the next step is to
 123   // compare whether the code unit is indeed in the range.
 124   //
 125   // Since the entry contains a code unit, size, and property the code point
 126   // being sought needs to be adjusted. Just shifting the code point to the
 127   // proper position doesn't work; suppose an entry has property 0, size 1,
 128   // and lower bound 3. This results in the entry 0x1810.
 129   // When searching for code point 3 it will search for 0x1800, find 0x1810
 130   // and moves to the previous entry. Thus the lower bound value will never
 131   // be found.
 132   // The simple solution is to set the bits belonging to the property and
 133   // size. Then the upper bound for code point 3 will return the entry after
 134   // 0x1810. After moving to the previous entry the algorithm arrives at the
 135   // correct entry.
 136   ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
 137   if (__i == 0)
 138     return __property::__none;
 139
 140   --__i;
 141   uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
 142   if (__code_point <= __upper_bound)
 143     return static_cast<__property>(__entries[__i] & 0xf);
 144
 145   return __property::__none;
 146 }}
 147 """
 148
 149 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
 150 // -*- C++ -*-
 151 //===----------------------------------------------------------------------===//
 152 //
 153 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 154 // See https://llvm.org/LICENSE.txt for license information.
 155 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 156 //
 157 //===----------------------------------------------------------------------===//
 158
 159 // WARNING, this entire header is generated by
 160 // utils/generate_extended_grapheme_cluster_table.py
 161 // DO NOT MODIFY!
 162
 163 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 164 //
 165 // See Terms of Use <https://www.unicode.org/copyright.html>
 166 // for definitions of Unicode Inc.'s Data Files and Software.
 167 //
 168 // NOTICE TO USER: Carefully read the following legal agreement.
 169 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
 170 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
 171 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
 172 // TERMS AND CONDITIONS OF THIS AGREEMENT.
 173 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
 174 // THE DATA FILES OR SOFTWARE.
 175 //
 176 // COPYRIGHT AND PERMISSION NOTICE
 177 //
 178 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
 179 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
 180 //
 181 // Permission is hereby granted, free of charge, to any person obtaining
 182 // a copy of the Unicode data files and any associated documentation
 183 // (the "Data Files") or Unicode software and any associated documentation
 184 // (the "Software") to deal in the Data Files or Software
 185 // without restriction, including without limitation the rights to use,
 186 // copy, modify, merge, publish, distribute, and/or sell copies of
 187 // the Data Files or Software, and to permit persons to whom the Data Files
 188 // or Software are furnished to do so, provided that either
 189 // (a) this copyright and permission notice appear with all copies
 190 // of the Data Files or Software, or
 191 // (b) this copyright and permission notice appear in associated
 192 // Documentation.
 193 //
 194 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 195 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 196 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 197 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 198 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 199 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 200 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 201 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 202 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 203 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 204 //
 205 // Except as contained in this notice, the name of a copyright holder
 206 // shall not be used in advertising or otherwise to promote the sale,
 207 // use or other dealings in these Data Files or Software without prior
 208 // written authorization of the copyright holder.
 209
 210 #ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
 211 #define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
 212
 213 #include <__algorithm/ranges_upper_bound.h>
 214 #include <__config>
 215 #include <__iterator/access.h>
 216 #include <cstddef>
 217 #include <cstdint>
 218
 219 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 220 #  pragma GCC system_header
 221 #endif
 222
 223 _LIBCPP_BEGIN_NAMESPACE_STD
 224
 225 #if _LIBCPP_STD_VER >= 20
 226
 227 namespace __extended_grapheme_custer_property_boundary {{
 228 {content}
 229 }} // namespace __extended_grapheme_custer_property_boundary
 230
 231 #endif //_LIBCPP_STD_VER >= 20
 232
 233 _LIBCPP_END_NAMESPACE_STD
 234
 235 #endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
 236
 237
 238 def property_ranges_to_table(
 239     ranges: list[PropertyRange], props: list[str]
 240 ) -> list[Entry]:
 241     assert len(props) < 16
 242     result = list[Entry]()
 243     high = -1
 244     for range in sorted(ranges, key=lambda x: x.lower):
 245         # Validate overlapping ranges
 246         assert range.lower > high
 247         high = range.upper
 248
 249         while True:
 250             e = Entry(range.lower, range.upper - range.lower, props.index(range.prop))
 251             if e.offset <= 127:
 252                 result.append(e)
 253                 break
 254             e.offset = 127
 255             result.append(e)
 256             range.lower += 128
 257     return result
 258
 259
 260 cpp_entrytemplate = "    0x{:08x}"
 261
 262
 263 def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
 264     result = StringIO()
 265     prop_values = sorted(set(x.prop for x in ranges))
 266     table = property_ranges_to_table(ranges, prop_values)
 267     enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values]
 268     result.write(
 269         PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values))
 270     )
 271     result.write(
 272         DATA_ARRAY_TEMPLATE.format(
 273             prop_name=prop_name,
 274             size=len(table),
 275             entries=",\n".join(
 276                 [
 277                     cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop)
 278                     for x in table
 279                 ]
 280             ),
 281         )
 282     )
 283
 284     return result.getvalue()
 285
 286
 287 def generate_data_tables() -> str:
 288     """
 289     Generate Unicode data for inclusion into <format> from
 290     GraphemeBreakProperty.txt and emoji-data.txt.
 291
 292     GraphemeBreakProperty.txt can be found at
 293     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
 294
 295     emoji-data.txt can be found at
 296     https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
 297
 298     Both files are expected to be in the same directory as this script.
 299     """
 300     gbp_data_path = (
 301         Path(__file__).absolute().parent
 302         / "data"
 303         / "unicode"
 304         / "GraphemeBreakProperty.txt"
 305     )
 306     emoji_data_path = (
 307         Path(__file__).absolute().parent / "data" / "unicode" / "emoji-data.txt"
 308     )
 309     gbp_ranges = list()
 310     emoji_ranges = list()
 311     with gbp_data_path.open(encoding="utf-8") as f:
 312         gbp_ranges = compactPropertyRanges(
 313             [x for line in f if (x := parsePropertyLine(line))]
 314         )
 315     with emoji_data_path.open(encoding="utf-8") as f:
 316         emoji_ranges = compactPropertyRanges(
 317             [x for line in f if (x := parsePropertyLine(line))]
 318         )
 319
 320     [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"]
 321     gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges)
 322     return "\n".join([gpb_cpp_data])
 323
 324
 325 if __name__ == "__main__":
 326     if len(sys.argv) == 2:
 327         sys.stdout = open(sys.argv[1], "w")
 328     print(
 329         MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
 330             content=generate_data_tables()
 331         )
 332     )