2 # ===----------------------------------------------------------------------===##
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 # ===----------------------------------------------------------------------===##
10 # The code is based on
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
13 # Copyright (c) Microsoft Corporation.
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16 from pathlib
import Path
17 from dataclasses
import dataclass
, field
18 from typing
import Optional
, TextIO
19 from array
import array
25 code_points
: list[int] = field(default_factory
=list)
27 breaks_utf8
: list[int] = field(default_factory
=list)
28 breaks_utf16
: list[int] = field(default_factory
=list)
29 breaks_utf32
: list[int] = field(default_factory
=list)
40 def parseBreakTestLine(input: TextIO
) -> Optional
[BreakTestItem
]:
41 result
= BreakTestItem()
49 if c
== "\N{DIVISION SIGN}":
50 # The line starts with a division sign, don't add it to the output.
52 result
.code_points
.append(code_point
)
54 result
.breaks_utf8
.append(utf8
)
55 result
.breaks_utf16
.append(utf16
)
56 result
.breaks_utf32
.append(utf32
)
58 assert input.read(1).isspace()
60 if c
== "\N{MULTIPLICATION SIGN}":
61 assert input.read(1).isspace()
64 while next
:= input.read(1):
74 result
.encoded
+= f
"\\U{i:08x}"
76 utf8
+= c
.encode().__len
__()
77 # Since we only care about the number of code units the byte order
78 # doesn't matter. The byte order is specified to avoid the BOM
79 utf16
+= int(c
.encode("utf-16-le").__len
__() / 2)
80 utf32
+= int(c
.encode("utf-32-le").__len
__() / 4)
92 cpp_template
= """// -*- C++ -*-
93 //===----------------------------------------------------------------------===//
95 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
96 // See https://llvm.org/LICENSE.txt for license information.
97 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
99 //===----------------------------------------------------------------------===//
101 // WARNING, this entire header is generated by
102 // utils/generate_extended_grapheme_cluster_test.py
105 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
107 // See Terms of Use <https://www.unicode.org/copyright.html>
108 // for definitions of Unicode Inc.'s Data Files and Software.
110 // NOTICE TO USER: Carefully read the following legal agreement.
111 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
112 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
113 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
114 // TERMS AND CONDITIONS OF THIS AGREEMENT.
115 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
116 // THE DATA FILES OR SOFTWARE.
118 // COPYRIGHT AND PERMISSION NOTICE
120 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
121 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
123 // Permission is hereby granted, free of charge, to any person obtaining
124 // a copy of the Unicode data files and any associated documentation
125 // (the "Data Files") or Unicode software and any associated documentation
126 // (the "Software") to deal in the Data Files or Software
127 // without restriction, including without limitation the rights to use,
128 // copy, modify, merge, publish, distribute, and/or sell copies of
129 // the Data Files or Software, and to permit persons to whom the Data Files
130 // or Software are furnished to do so, provided that either
131 // (a) this copyright and permission notice appear with all copies
132 // of the Data Files or Software, or
133 // (b) this copyright and permission notice appear in associated
136 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
137 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
138 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
139 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
140 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
141 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
142 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
143 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
144 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
145 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
147 // Except as contained in this notice, the name of a copyright holder
148 // shall not be used in advertising or otherwise to promote the sale,
149 // use or other dealings in these Data Files or Software without prior
150 // written authorization of the copyright holder.
152 #ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
153 #define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
156 #include <string_view>
159 #include "test_macros.h"
161 template <class CharT>
163 /// The input to parse.
164 std::basic_string_view<CharT> input;
166 /// The first code point all extended grapheme clusters in the input.
167 std::vector<char32_t> code_points;
169 /// The offset of the last code units of the extended grapheme clusters in the input.
171 /// The vector has the same number of entries as \\ref code_points.
172 std::vector<std::size_t> breaks;
175 /// The data for UTF-8.
176 std::array<data<char>, {0}> data_utf8 = {{{{
179 /// The data for UTF-16.
181 /// Note that most of the data for the UTF-16 and UTF-32 are identical. However
182 /// since the size of the code units differ the breaks can contain different
184 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
185 std::array<data<wchar_t>, {0}> data_utf16 = {{{{
188 /// The data for UTF-8.
190 /// Note that most of the data for the UTF-16 and UTF-32 are identical. However
191 /// since the size of the code units differ the breaks can contain different
193 std::array<data<wchar_t>, {0}> data_utf32 = {{{{
195 #endif // TEST_HAS_NO_WIDE_CHARACTERS
197 #endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""
199 cpp_test_data_line_template
= " {{{}, {{{}}}, {{{}}}}}"
202 def lineToCppDataLineUtf8(line
: BreakTestItem
) -> str:
203 return cpp_test_data_line_template
.format(
205 ", ".join([str(x
) for x
in line
.code_points
]),
206 ", ".join([str(x
) for x
in line
.breaks_utf8
]),
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Format one break-test item as a row for the UTF-16 data table.

    The three slots of ``cpp_test_data_line_template`` receive, in order:
    ``line.encoded`` wrapped in an ``L"..."`` literal, the entries of
    ``line.code_points`` joined with ", ", and the entries of
    ``line.breaks_utf16`` joined with ", ".
    """
    wide_literal = f'L"{line.encoded}"'
    code_point_list = ", ".join(map(str, line.code_points))
    break_list = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(
        wide_literal, code_point_list, break_list
    )
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Format one break-test item as a row for the UTF-32 data table.

    Fills ``cpp_test_data_line_template`` with ``line.encoded`` wrapped in an
    ``L"..."`` literal, then the ", "-joined ``line.code_points``, then the
    ", "-joined ``line.breaks_utf32``.
    """
    columns = (
        f'L"{line.encoded}"',
        ", ".join(map(str, line.code_points)),
        ", ".join(map(str, line.breaks_utf32)),
    )
    return cpp_test_data_line_template.format(*columns)
227 Generate test data from "GraphemeBreakTest.txt"
228 This file can be downloaded from:
229 https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
230 This script looks for GraphemeBreakTest.txt in the data/unicode directory next to this script
234 def generate_all() -> str:
235 test_data_path
= Path(__file__
)
236 test_data_path
= test_data_path
.absolute()
238 test_data_path
.parent
/ "data" / "unicode" / "GraphemeBreakTest.txt"
241 with
open(test_data_path
, mode
="rt", encoding
="utf-8") as file:
242 while line
:= parseBreakTestLine(file):
243 if len(line
.encoded
) > 0:
245 return cpp_template
.format(
247 ",\n".join(map(lineToCppDataLineUtf8
, lines
)),
248 ",\n".join(map(lineToCppDataLineUtf16
, lines
)),
249 ",\n".join(map(lineToCppDataLineUtf32
, lines
)),
253 if __name__
== "__main__":
254 if len(sys
.argv
) == 2:
255 sys
.stdout
= open(sys
.argv
[1], "w")
256 print(generate_all())