[mlir][ods] Store the pointer to the anchor element (NFC)
[llvm-project.git] / libcxx / utils / generate_extended_grapheme_cluster_test.py
blobc263cdd69cd0bf03714200be270fa3a194ff4e44
1 #!/usr/bin/env python
2 # ===----------------------------------------------------------------------===##
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===----------------------------------------------------------------------===##
10 # The code is based on
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
13 # Copyright (c) Microsoft Corporation.
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16 from pathlib import Path
17 from dataclasses import dataclass, field
18 from typing import Optional, TextIO
19 from array import array
20 import sys
@dataclass
class BreakTestItem:
    """One parsed test line of the grapheme break test data.

    All three ``breaks_*`` lists are filled in lock-step with
    ``code_points``: one entry is appended to each of them every time a
    cluster is closed.
    """

    # First code point of every extended grapheme cluster on the line.
    code_points: list[int] = field(default_factory=list)

    # Every code point of the line as a concatenation of ``\UXXXXXXXX``
    # escapes, ready to be placed inside a C++ string literal.
    encoded: str = ""

    # Running number of code units consumed when each cluster ends, counted
    # in UTF-8, UTF-16 and UTF-32 code units respectively.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
class CommentLine:
    """Empty marker type for a comment line.

    NOTE(review): nothing in the visible part of this script references this
    class; confirm whether it is still needed.
    """
class EOF:
    """Empty marker type for the end of the input.

    NOTE(review): nothing in the visible part of this script references this
    class; confirm whether it is still needed.
    """
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse the next line of the grapheme break test data from ``input``.

    A data line is a whitespace-separated sequence of hexadecimal code points
    and break markers, optionally followed by a ``#`` comment::

        DIVISION SIGN        a break is allowed at this position
        MULTIPLICATION SIGN  no break is allowed at this position

    The whole line is read at once and validated with real exceptions instead
    of ``assert`` statements, so the parser behaves the same under
    ``python -O`` and never skips a read that lived inside an assertion.

    Args:
        input: Text stream positioned at the start of a line. (The name
            shadows the builtin but is kept for interface compatibility.)

    Returns:
        The parsed item; it is empty (``encoded == ""``) for blank and
        comment-only lines. ``None`` is returned only once the stream is
        exhausted, so a final line without a trailing newline is still
        reported.

    Raises:
        ValueError: If a token is neither a break marker nor a hexadecimal
            code point, or if the code point cannot be encoded.
    """
    raw_line = input.readline()
    if not raw_line:
        # readline() only yields "" at the end of the stream.
        return None

    result = BreakTestItem()
    # First code point of the cluster currently being read; -1 when no
    # cluster is pending.
    cluster_start = -1
    utf8 = 0
    utf16 = 0
    utf32 = 0
    escapes = []

    # Everything from the first "#" onwards is a comment.
    payload = raw_line.partition("#")[0]
    for token in payload.split():
        if token == "\N{DIVISION SIGN}":
            # A break closes the pending cluster. The division sign at the
            # start of the line has no pending cluster, so nothing is added.
            if cluster_start != -1:
                result.code_points.append(cluster_start)
                cluster_start = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
            continue
        if token == "\N{MULTIPLICATION SIGN}":
            continue

        try:
            value = int(token, base=16)
        except ValueError:
            raise ValueError(
                f"unexpected token {token!r} in break test line {raw_line!r}"
            ) from None

        if cluster_start == -1:
            cluster_start = value

        escapes.append(f"\\U{value:08x}")
        character = chr(value)
        utf8 += len(character.encode())
        # Only the number of code units matters, so the byte order is
        # irrelevant; it is specified to keep the BOM out of the count.
        utf16 += len(character.encode("utf-16-le")) // 2
        utf32 += len(character.encode("utf-32-le")) // 4

    result.encoded = "".join(escapes)
    return result
# Text of the generated C++ header, used with str.format():
#   {0}  number of test items (size of each std::array)
#   {1}  initializer rows of the UTF-8 table
#   {2}  initializer rows of the UTF-16 table
#   {3}  initializer rows of the UTF-32 table
# Literal C++ braces are doubled so that str.format() leaves them alone.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point of all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One initializer row of a data table: the string literal, the code points
# that start each cluster, and the break offsets.
cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Render ``line`` as one initializer row of the UTF-8 data table.

    The row consists of a narrow string literal holding the escaped input,
    the cluster-leading code points, and the break offsets counted in UTF-8
    code units.
    """
    literal = '"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Render ``line`` as one initializer row of the UTF-16 data table.

    The row consists of a wide (``L"..."``) string literal holding the
    escaped input, the cluster-leading code points, and the break offsets
    counted in UTF-16 code units.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Render ``line`` as one initializer row of the UTF-32 data table.

    The row consists of a wide (``L"..."``) string literal holding the
    escaped input, the cluster-leading code points, and the break offsets
    counted in UTF-32 code units.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
223 Generate test data from "GraphemeBreakTest.txt"
224 This file can be downloaded from:
225 https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
226 This script looks for GraphemeBreakTest.txt in the data/unicode directory next to this script
def generate_all() -> str:
    """Return the complete text of the generated C++ test header.

    The test data is read from ``data/unicode/GraphemeBreakTest.txt``
    relative to the directory containing this script. Lines that yield no
    encoded input (blank and comment-only lines) are left out.
    """
    script_dir = Path(__file__).absolute().parent
    test_data_path = script_dir / "data" / "unicode" / "GraphemeBreakTest.txt"

    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        # parseBreakTestLine() returns None at the end of the input, which
        # is the sentinel that stops the iteration.
        parsed_items = iter(lambda: parseBreakTestLine(file), None)
        lines = [item for item in parsed_items if len(item.encoded) > 0]

    utf8_rows = ",\n".join(lineToCppDataLineUtf8(item) for item in lines)
    utf16_rows = ",\n".join(lineToCppDataLineUtf16(item) for item in lines)
    utf32_rows = ",\n".join(lineToCppDataLineUtf32(item) for item in lines)
    return cpp_template.format(len(lines), utf8_rows, utf16_rows, utf32_rows)
# Script entry point: write the generated header to the file named by the
# single command-line argument, or to standard output when none is given.
if __name__ == "__main__":
    # Generate first so that a failure does not leave a truncated output file.
    generated = generate_all()
    if len(sys.argv) == 2:
        # The context manager flushes and closes the file; sys.stdout is no
        # longer rebound to a handle that is never closed.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(generated, file=output)
    else:
        print(generated)