libcxx/utils/generate_extended_grapheme_cluster_test.py

   1 #!/usr/bin/env python
   2 # ===----------------------------------------------------------------------===##
   3 #
   4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 # See https://llvm.org/LICENSE.txt for license information.
   6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 #
   8 # ===----------------------------------------------------------------------===##
   9
  10 # The code is based on
  11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
  12 #
  13 # Copyright (c) Microsoft Corporation.
  14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  15
  16 from pathlib import Path
  17 from dataclasses import dataclass, field
  18 from typing import Optional, TextIO
  19 import sys
  20
  21
  22 @dataclass
  23 class BreakTestItem:
  24     code_points: list[int] = field(default_factory=list)
  25     encoded: str = ""
  26     breaks_utf8: list[int] = field(default_factory=list)
  27     breaks_utf16: list[int] = field(default_factory=list)
  28     breaks_utf32: list[int] = field(default_factory=list)
  29
  30
  31 class CommentLine:
  32     pass
  33
  34
  35 class EOF:
  36     pass
  37
  38
  39 def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
  40     result = BreakTestItem()
  41     code_point = -1
  42     utf8 = 0
  43     utf16 = 0
  44     utf32 = 0
  45
  46     while True:
  47         c = input.read(1)
  48         if c == "\N{DIVISION SIGN}":
  49             # The line starts with a division sign, don't add it to the output.
  50             if code_point != -1:
  51                 result.code_points.append(code_point)
  52                 code_point = -1
  53                 result.breaks_utf8.append(utf8)
  54                 result.breaks_utf16.append(utf16)
  55                 result.breaks_utf32.append(utf32)
  56
  57             assert input.read(1).isspace()
  58             continue
  59         if c == "\N{MULTIPLICATION SIGN}":
  60             assert input.read(1).isspace()
  61             continue
  62         if c.isalnum():
  63             while next := input.read(1):
  64                 if next.isalnum():
  65                     c += next
  66                 else:
  67                     assert next.isspace()
  68                     break
  69             i = int(c, base=16)
  70             if code_point == -1:
  71                 code_point = i
  72
  73             result.encoded += f"\\U{i:08x}"
  74             c = chr(i)
  75             utf8 += c.encode().__len__()
  76             # Since we only care about the number of code units the byte order
  77             # doesn't matter. The byte order is specified to avoid the BOM
  78             utf16 += int(c.encode("utf-16-le").__len__() / 2)
  79             utf32 += int(c.encode("utf-32-le").__len__() / 4)
  80             continue
  81         if c == "#":
  82             input.readline()
  83             return result
  84         if c == "\n":
  85             return result
  86         if c == "":
  87             return None
  88         assert False
  89
  90
  91 cpp_template = """// -*- C++ -*-
  92 //===----------------------------------------------------------------------===//
  93 //
  94 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  95 // See https://llvm.org/LICENSE.txt for license information.
  96 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  97 //
  98 //===----------------------------------------------------------------------===//
  99
 100 // WARNING, this entire header is generated by
 101 // utils/generate_extended_grapheme_cluster_test.py
 102 // DO NOT MODIFY!
 103
 104 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
 105 //
 106 // See Terms of Use <https://www.unicode.org/copyright.html>
 107 // for definitions of Unicode Inc.'s Data Files and Software.
 108 //
 109 // NOTICE TO USER: Carefully read the following legal agreement.
 110 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
 111 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
 112 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
 113 // TERMS AND CONDITIONS OF THIS AGREEMENT.
 114 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
 115 // THE DATA FILES OR SOFTWARE.
 116 //
 117 // COPYRIGHT AND PERMISSION NOTICE
 118 //
 119 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
 120 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
 121 //
 122 // Permission is hereby granted, free of charge, to any person obtaining
 123 // a copy of the Unicode data files and any associated documentation
 124 // (the "Data Files") or Unicode software and any associated documentation
 125 // (the "Software") to deal in the Data Files or Software
 126 // without restriction, including without limitation the rights to use,
 127 // copy, modify, merge, publish, distribute, and/or sell copies of
 128 // the Data Files or Software, and to permit persons to whom the Data Files
 129 // or Software are furnished to do so, provided that either
 130 // (a) this copyright and permission notice appear with all copies
 131 // of the Data Files or Software, or
 132 // (b) this copyright and permission notice appear in associated
 133 // Documentation.
 134 //
 135 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 136 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 137 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 138 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 139 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 140 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 141 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 142 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 143 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 144 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 145 //
 146 // Except as contained in this notice, the name of a copyright holder
 147 // shall not be used in advertising or otherwise to promote the sale,
 148 // use or other dealings in these Data Files or Software without prior
 149 // written authorization of the copyright holder.
 150
 151 #ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
 152 #define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
 153
 154 #include <array>
 155 #include <string_view>
 156 #include <vector>
 157
 158 #include "test_macros.h"
 159
 160 template <class CharT>
 161 struct data {{
 162   /// The input to parse.
 163   std::basic_string_view<CharT> input;
 164
 165   /// The first code point all extended grapheme clusters in the input.
 166   std::vector<char32_t> code_points;
 167
 168   /// The offset of the last code units of the extended grapheme clusters in the input.
 169   ///
 170   /// The vector has the same number of entries as \\ref code_points.
 171   std::vector<std::size_t> breaks;
 172 }};
 173
 174 /// The data for UTF-8.
 175 std::array<data<char>, {0}> data_utf8 = {{{{
 176 {1}}}}};
 177
 178 /// The data for UTF-16.
 179 ///
 180 /// Note that most of the data for the UTF-16 and UTF-32 are identical. However
 181 /// since the size of the code units differ the breaks can contain different
 182 /// values.
 183 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
 184 std::array<data<wchar_t>, {0}> data_utf16 = {{{{
 185 {2}}}}};
 186
 187 /// The data for UTF-8.
 188 ///
 189 /// Note that most of the data for the UTF-16 and UTF-32 are identical. However
 190 /// since the size of the code units differ the breaks can contain different
 191 /// values.
 192 std::array<data<wchar_t>, {0}> data_utf32 = {{{{
 193 {3}}}}};
 194 #endif // TEST_HAS_NO_WIDE_CHARACTERS
 195
 196 #endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""
 197
# One row of a generated C++ array. After str.format the row reads
#      {<input>, {<code points>}, {<breaks>}}
# The doubled braces are str.format escapes for the literal C++ braces.
cpp_test_data_line_template = "     {{{}, {{{}}}, {{{}}}}}"
 199
 200
 201 def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
 202     return cpp_test_data_line_template.format(
 203         f'"{line.encoded}"',
 204         ", ".join([str(x) for x in line.code_points]),
 205         ", ".join([str(x) for x in line.breaks_utf8]),
 206     )
 207
 208
 209 def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
 210     return cpp_test_data_line_template.format(
 211         f'L"{line.encoded}"',
 212         ", ".join([str(x) for x in line.code_points]),
 213         ", ".join([str(x) for x in line.breaks_utf16]),
 214     )
 215
 216
 217 def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
 218     return cpp_test_data_line_template.format(
 219         f'L"{line.encoded}"',
 220         ", ".join([str(x) for x in line.code_points]),
 221         ", ".join([str(x) for x in line.breaks_utf32]),
 222     )
 223
 224
"""
Generate test data from "GraphemeBreakTest.txt"
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the "data/unicode" directory
located next to this script
"""
 231
 232
 233 def generate_all() -> str:
 234     test_data_path = Path(__file__)
 235     test_data_path = test_data_path.absolute()
 236     test_data_path = (
 237         test_data_path.parent / "data" / "unicode" / "GraphemeBreakTest.txt"
 238     )
 239     lines = list()
 240     with open(test_data_path, mode="rt", encoding="utf-8") as file:
 241         while line := parseBreakTestLine(file):
 242             if len(line.encoded) > 0:
 243                 lines.append(line)
 244     return cpp_template.format(
 245         len(lines),
 246         ",\n".join(map(lineToCppDataLineUtf8, lines)),
 247         ",\n".join(map(lineToCppDataLineUtf16, lines)),
 248         ",\n".join(map(lineToCppDataLineUtf32, lines)),
 249     )
 250
 251
 252 if __name__ == "__main__":
 253     if len(sys.argv) == 2:
 254         sys.stdout = open(sys.argv[1], "w")
 255     print(generate_all())