[flang][runtime] Make defined formatted I/O process format elementally (#74150)
[llvm-project.git] / libcxx / utils / generate_extended_grapheme_cluster_test.py
blobe0a6003ecd53c1c99d102bae152624d81108211a
1 #!/usr/bin/env python
2 # ===----------------------------------------------------------------------===##
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===----------------------------------------------------------------------===##
10 # The code is based on
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
13 # Copyright (c) Microsoft Corporation.
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16 from pathlib import Path
17 from dataclasses import dataclass, field
18 from typing import Optional, TextIO
19 import sys
@dataclass
class BreakTestItem:
    """Parsed contents of a single line of the grapheme break test data.

    Instances start out empty and are filled in while a line is parsed.
    """

    # First code point of every extended grapheme cluster on the line.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line, each written as a "\UXXXXXXXX" escape.
    encoded: str = field(default="")
    # Running number of code units seen at each cluster boundary, one list
    # per encoding; each list grows in step with code_points.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
class CommentLine:
    """Empty marker type.

    NOTE(review): nothing in the visible part of this file references this
    class; presumably it was meant to tag comment-only lines -- confirm
    before relying on it.
    """
class EOF:
    """Empty marker type.

    NOTE(review): nothing in the visible part of this file references this
    class; presumably it was meant to signal end of input -- confirm before
    relying on it.
    """
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Read one line from *input* and parse it as a grapheme break test line.

    A line is a whitespace-separated sequence of DIVISION SIGN tokens (a
    break), MULTIPLICATION SIGN tokens (no break) and hexadecimal code
    points, optionally followed by a "#" comment that runs to the end of
    the line.

    Args:
        input: Text stream positioned at the start of a line. Exactly one
            line is consumed per call.

    Returns:
        A BreakTestItem describing the line, or None once the stream is
        exhausted. Blank and comment-only lines produce an item whose
        fields are all empty. A final line without a trailing newline is
        still parsed and returned.

    Raises:
        ValueError: If the line contains a token that is neither a break
            marker nor a hexadecimal number.
    """
    line = input.readline()
    if not line:
        return None

    result = BreakTestItem()
    # First code point of the cluster currently being read; None while no
    # code point has been seen since the last break.
    cluster_start: Optional[int] = None
    # Running code unit totals for the line in each encoding.
    utf8 = utf16 = utf32 = 0

    # Drop the trailing comment, then split on whitespace. Working on a
    # whole line keeps a token that directly precedes the newline from
    # pulling the following line into this item.
    for token in line.partition("#")[0].split():
        if token == "\N{MULTIPLICATION SIGN}":
            continue

        if token == "\N{DIVISION SIGN}":
            # The break that opens a line has no cluster before it, so
            # there is nothing to record for it.
            if cluster_start is not None:
                result.code_points.append(cluster_start)
                cluster_start = None
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)
            continue

        # Raise explicitly instead of using assert so malformed data is
        # still rejected when Python runs with -O.
        if not token.isalnum():
            raise ValueError(f"unexpected token {token!r} in break test line")

        value = int(token, base=16)
        if cluster_start is None:
            cluster_start = value

        result.encoded += f"\\U{value:08x}"
        char = chr(value)
        utf8 += len(char.encode("utf-8"))
        # Only the number of code units matters, so the byte order is
        # irrelevant; naming one explicitly keeps the BOM out of the count.
        utf16 += len(char.encode("utf-16-le")) // 2
        utf32 += len(char.encode("utf-32-le")) // 4

    return result
# Skeleton of the generated C++ header, filled in with str.format():
#   {0} - number of test entries (size of each std::array)
#   {1} - initializer lines for the UTF-8 data
#   {2} - initializer lines for the UTF-16 data
#   {3} - initializer lines for the UTF-32 data
# Literal C++ braces are doubled so that str.format() leaves them alone.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
  /// The input to parse.
  std::basic_string_view<CharT> input;

  /// The first code point of all extended grapheme clusters in the input.
  std::vector<char32_t> code_points;

  /// The offset of the last code units of the extended grapheme clusters in the input.
  ///
  /// The vector has the same number of entries as \\ref code_points.
  std::vector<std::size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""
# One aggregate initializer of the generated data arrays. The three format
# slots are, in order: the quoted input string literal, the comma-separated
# code points and the comma-separated break offsets. Doubled braces are
# literal C++ braces, so the result has the shape ` {input, {...}, {...}}`.
cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Render *line* as one C++ initializer of the UTF-8 data array.

    The input is emitted as a narrow string literal and paired with the
    UTF-8 break offsets.
    """
    literal = '"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Render *line* as one C++ initializer of the UTF-16 data array.

    The input is emitted as a wide (L-prefixed) string literal and paired
    with the UTF-16 break offsets.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Render *line* as one C++ initializer of the UTF-32 data array.

    The input is emitted as a wide (L-prefixed) string literal, the same
    literal form used for UTF-16, and paired with the UTF-32 break offsets.
    """
    literal = 'L"' + line.encoded + '"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
"""
Generate test data from "GraphemeBreakTest.txt"
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the "data/unicode" directory
next to this script
"""
def generate_all() -> str:
    """Build the complete text of the generated C++ test header.

    The test data is read from data/unicode/GraphemeBreakTest.txt, located
    relative to the directory containing this script. Parsed lines whose
    encoded text is empty are left out of the result.

    Returns:
        cpp_template filled in with the entry count and the UTF-8, UTF-16
        and UTF-32 data lines.
    """
    script_dir = Path(__file__).absolute().parent
    test_data_path = script_dir / "data" / "unicode" / "GraphemeBreakTest.txt"

    lines = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while True:
            item = parseBreakTestLine(file)
            if not item:
                break
            if item.encoded:
                lines.append(item)

    utf8_rows = ",\n".join(lineToCppDataLineUtf8(item) for item in lines)
    utf16_rows = ",\n".join(lineToCppDataLineUtf16(item) for item in lines)
    utf32_rows = ",\n".join(lineToCppDataLineUtf32(item) for item in lines)
    return cpp_template.format(len(lines), utf8_rows, utf16_rows, utf32_rows)
# Script entry point: write the generated header to the file named by the
# single command-line argument, or to standard output when none is given.
if __name__ == "__main__":
    # Generate first so a failure cannot leave a truncated output file behind.
    header = generate_all()
    if len(sys.argv) == 2:
        # A context manager closes the file deterministically and leaves
        # sys.stdout untouched.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)