3 # Script to generate tables for libstdc++ std::format width estimation.
5 # This file is part of GCC.
7 # GCC is free software; you can redistribute it and/or modify it under
8 # the terms of the GNU General Public License as published by the Free
9 # Software Foundation; either version 3, or (at your option) any later
12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 # You should have received a copy of the GNU General Public License
18 # along with GCC; see the file COPYING3. If not see
19 # <http://www.gnu.org/licenses/>.
21 # To update the libstdc++ static data in <bits/unicode-data.h> download the latest:
22 # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
23 # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
24 # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
25 # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
26 # Then run this script and save the output to
27 # ../../libstdc++-v3/include/bits/unicode-data.h
34 self
= os
.path
.basename(__file__
)
35 print("// Generated by contrib/unicode/{}, do not edit.".format(self
))
37 // Copyright The GNU Toolchain Authors.
39 // This file is part of the GNU ISO C++ Library. This library is free
40 // software; you can redistribute it and/or modify it under the
41 // terms of the GNU General Public License as published by the
42 // Free Software Foundation; either version 3, or (at your option)
45 // This library is distributed in the hope that it will be useful,
46 // but WITHOUT ANY WARRANTY; without even the implied warranty of
47 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
48 // GNU General Public License for more details.
50 // Under Section 7 of GPL version 3, you are granted additional
51 // permissions described in the GCC Runtime Library Exception, version
52 // 3.1, as published by the Free Software Foundation.
54 // You should have received a copy of the GNU General Public License and
55 // a copy of the GCC Runtime Library Exception along with this program;
56 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
57 // <http://www.gnu.org/licenses/>.
59 /** @file bits/unicode-data.h
60 * This is an internal header file, included by other library headers.
61 * Do not attempt to use it directly. @headername{format}
64 print("#ifndef _GLIBCXX_GET_UNICODE_DATA")
65 print('# error "This is not a public header, do not include it directly"')
66 print("#elif _GLIBCXX_GET_UNICODE_DATA != 160000")
67 print('# error "Version mismatch for Unicode static data"')
70 # Process a list and return a list of tuples (index, val) which are the elements
71 # in the list that have a different val from the previous element.
72 # e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
73 # and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
def find_edges(vals, init=None):
    """Return the (index, value) pairs at which *vals* changes value.

    Each element of *vals* is compared with the element before it; the
    first element is compared with *init*.  Every element that differs
    from its predecessor is reported as a tuple (index, value), in order.

    With the default init=None a leading run of None values is therefore
    not reported.  An empty *vals* gives an empty list.
    """
    edges = []
    prev = init
    for i, v in enumerate(vals):
        if v != prev:
            edges.append((i, v))
            prev = v
    return edges
85 # Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Record *val* for one code point or an inclusive range of code points.

    *code_points* is hexadecimal text: either a single value such as
    "232A", or a range written "first..last" such as "1100..115F" where
    both ends are included.  The corresponding entries of the module-level
    list all_code_points are overwritten with *val*.

    Raises ValueError if *code_points* has more than one ".." separator or
    if a field is not valid hexadecimal.
    """
    r = code_points.split("..")
    if len(r) == 1:
        c = int(r[0], base=16)
        all_code_points[c] = val
    elif len(r) == 2:
        begin = int(r[0], base=16)
        # The data files give inclusive ranges; make the end exclusive.
        end = int(r[1], base=16) + 1
        # The replacement has exactly as many items as the slice, so the
        # list keeps its length.
        all_code_points[begin:end] = [val] * (end - begin)
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))
102 # By default every code point has width 1. This is what the C++ standard says,
103 # even though the Unicode standard says some code points have width 0.
104 all_code_points
= [1] * (1 + 0x10FFFF)
106 # Extract all code points with East_Asian_Width=W or East_Asian_Width=F
# The Unicode data files are UTF-8.  Name the encoding so that reading does
# not depend on the locale, and close the file once it has been consumed.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Drop the trailing "# ..." comment, leaving "code_points ; class".
        line = line.split("#")[0]
        # Only Wide (W) and Fullwidth (F) code points get width 2.
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)
115 # The C++ standard also gives width 2 to the following ranges:
116 # U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
117 process_code_points("4DC0..4DFF", 2)
118 # U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
119 process_code_points("1F300..1F5FF", 2)
120 # U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
121 process_code_points("1F900..1F9FF", 2)
123 # Create a list that only contains the code points that have a different width
124 # to the previous code point.
125 edges
= find_edges(all_code_points
, 1)
127 # Table for std::__unicode::__format_width(char32_t)
129 print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
130 print(" // from EastAsianWidth.txt from the Unicode standard.");
131 print(" inline constexpr char32_t __width_edges[] = {", end
="")
132 for i
, e
in enumerate(edges
):
138 print("{:#x},".format(c
), end
="")
141 # By default every code point has Grapheme_Cluster_Break=Other.
142 all_code_points
= ["Other"] * (1 + 0x10FFFF)
144 # Extract Grapheme_Cluster_Break property for all code points.
# The Unicode data files are UTF-8.  Name the encoding so that reading does
# not depend on the locale, and close the file once it has been consumed.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Drop the trailing "# ..." comment; a data line then splits into
        # two fields, e.g. "0600..0605", "Prepend"
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())
154 edges
= find_edges(all_code_points
)
# Number each Grapheme_Cluster_Break value in order of first appearance in
# the edge list.  "Other" is the default property and is always 0.
gcb_props = {"Other": 0}
for _, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of low bits needed to hold the largest property value, i.e.
# ceil(log2(len(gcb_props))), computed exactly with integer arithmetic.
shift_bits = (len(gcb_props) - 1).bit_length()
161 # Enum definition for std::__unicode::_Gcb_property
163 print(" enum class _Gcb_property {")
164 for p
in gcb_props
.items():
165 print(" _Gcb_{} = {},".format(p
[0],p
[1]))
168 # Tables for std::__unicode::_Grapheme_cluster_state
170 print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
171 print(" // from GraphemeBreakProperty.txt from the Unicode standard.");
172 print(" // Entries are (code_point << shift_bits) + property.")
173 print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits
))
174 print(" inline constexpr uint32_t __gcb_edges[] = {", end
="")
175 for i
, e
in enumerate(edges
):
181 x
= (c
<< shift_bits
) + gcb_props
[p
]
182 print("{0:#x},".format(x
), end
="")
185 # By default every code point has Indic_Conjunct_Break=None.
186 all_code_points
= [None] * (1 + 0x10FFFF)
188 # Extract Indic_Conjunct_Break property for all code points.
# The Unicode data files are UTF-8.  Name the encoding so that reading does
# not depend on the locale, and close the file once it has been consumed.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            # The middle field is the literal "InCB" tag and is not needed.
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())
199 # Table for std::__unicode::__is_incb_linker
200 # This table is tiny, so just contains the list of code points.
201 print(" inline constexpr char32_t __incb_linkers[] = {\n ", end
="")
# Collect the Linker code points first, then print each one and reset its
# entry to None so that it no longer appears as a property value in the list.
linkers = [cp for cp, prop in enumerate(all_code_points) if prop == "Linker"]
for i in linkers:
    print(" 0x{:04x},".format(i), end="")
    all_code_points[i] = None
207 edges
= find_edges(all_code_points
)
209 incb_props
= {None:0, "Consonant":1, "Extend":2}
210 print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
211 # Table for std::__unicode::__incb_property
212 print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
213 print(" // from DerivedCoreProperties.txt from the Unicode standard.");
214 print(" // Entries are (code_point << 2) + property.")
215 print(" inline constexpr uint32_t __incb_edges[] = {", end
="")
216 for i
, e
in enumerate(edges
):
222 x
= (c
<< 2) + incb_props
[p
]
223 print("{0:#x},".format(x
), end
="")
226 # By default every code point has Emoji=No.
227 all_code_points
= [False] * (1 + 0x10FFFF)
229 # Extract Emoji=Extended_Pictographic for all code points.
# emoji-data.txt is UTF-8.  Name the encoding so that reading does not depend
# on the locale, and close the file once it has been consumed.
with open("emoji-data.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)
238 edges
= find_edges(all_code_points
, False)
240 # Table for std::__unicode::__is_extended_pictographic
241 print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
242 print(" // from emoji-data.txt from the Unicode standard.");
243 print(" inline constexpr char32_t __xpicto_edges[] = {", end
="")
244 for i
, e
in enumerate(edges
):
250 print("{:#x},".format(c
), end
="")
253 # <bits/unicode.h> gives an error if this macro is left defined.
254 # Do this last, so that the generated output is not usable unless we reach here.
255 print("#undef _GLIBCXX_GET_UNICODE_DATA")