contrib/unicode/gen_libstdcxx_unicode_data.py

   1 #!/usr/bin/env python3
   2 #
   3 # Script to generate tables for libstdc++ std::format width estimation.
   4 #
   5 # This file is part of GCC.
   6 #
   7 # GCC is free software; you can redistribute it and/or modify it under
   8 # the terms of the GNU General Public License as published by the Free
   9 # Software Foundation; either version 3, or (at your option) any later
  10 # version.
  11 #
  12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 # for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GCC; see the file COPYING3.  If not see
  19 # <http://www.gnu.org/licenses/>.
  20
  21 # To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
  22 # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
  23 # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
  24 # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
  25 # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
  26 # Then run this script and save the output to
  27 # ../../libstdc++-v3/include/bits/unicode-data.h
  28
  29 import sys
  30 import re
  31 import math
  32 import os
  33
  34 self = os.path.basename(__file__)
  35 print("// Generated by contrib/unicode/{}, do not edit.".format(self))
  36 print("""
  37 // Copyright The GNU Toolchain Authors.
  38 //
  39 // This file is part of the GNU ISO C++ Library.  This library is free
  40 // software; you can redistribute it and/or modify it under the
  41 // terms of the GNU General Public License as published by the
  42 // Free Software Foundation; either version 3, or (at your option)
  43 // any later version.
  44
  45 // This library is distributed in the hope that it will be useful,
  46 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  47 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  48 // GNU General Public License for more details.
  49
  50 // Under Section 7 of GPL version 3, you are granted additional
  51 // permissions described in the GCC Runtime Library Exception, version
  52 // 3.1, as published by the Free Software Foundation.
  53
  54 // You should have received a copy of the GNU General Public License and
  55 // a copy of the GCC Runtime Library Exception along with this program;
  56 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  57 // <http://www.gnu.org/licenses/>.
  58
  59 /** @file bits/unicode-data.h
  60  *  This is an internal header file, included by other library headers.
  61  *  Do not attempt to use it directly. @headername{format}
  62  */
  63 """)
  64 print("#ifndef _GLIBCXX_GET_UNICODE_DATA")
  65 print('# error "This is not a public header, do not include it directly"')
  66 print("#elif _GLIBCXX_GET_UNICODE_DATA != 160000")
  67 print('# error "Version mismatch for Unicode static data"')
  68 print("#endif\n")
  69
  70 # Process a list and return a list of tuples (index, val) which are the elements
  71 # in the list that have a different val from the previous element.
  72 # e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
  73 # and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
  74 def find_edges(vals, init = None):
  75     edges = []
  76     prev_val = init
  77     for i, v in enumerate(vals):
  78         if v != prev_val:
  79             edges.append((i,v))
  80             prev_val = v
  81     return edges
  82
  83 all_code_points = []
  84
  85 # Process a code point value or range of code point values with given property.
  86 def process_code_points(code_points, val):
  87     # Example arguments:
  88     # 1100..115F, x
  89     # 232A, y
  90
  91     r = code_points.split("..")
  92     if len(r) == 1:
  93         c = int(r[0], base=16)
  94         all_code_points[c] = val
  95     elif len(r) == 2:
  96         begin = int(r[0], base=16)
  97         end = int(r[1], base=16) + 1
  98         all_code_points[begin:end] = [val] * (end - begin)
  99     else:
 100         raise ValueError
 101
 102 # By default every code point has width 1. This is what the C++ standard says,
 103 # even though the Unicode standard says some code points have width 0.
 104 all_code_points = [1] * (1 + 0x10FFFF)
 105
 106 # Extract all code points with East_Asian_Width=W or East_Asian_Width=F
 107 for line in open("EastAsianWidth.txt", "r"):
 108     # Example lines:
 109     # 3000           ; F
 110     # 3001..3003     ; W
 111     line = line.split("#")[0]
 112     if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
 113         process_code_points(line.split(";")[0], 2)
 114
 115 # The C++ standard also gives width 2 to the following ranges:
 116 # U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
 117 process_code_points("4DC0..4DFF", 2)
 118 # U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
 119 process_code_points("1F300..1F5FF", 2)
 120 # U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
 121 process_code_points("1F900..1F9FF", 2)
 122
 123 # Create a list that only contains the code points that have a different width
 124 # to the previous code point.
 125 edges = find_edges(all_code_points, 1)
 126
 127 # Table for std::__unicode::__format_width(char32_t)
 128
 129 print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
 130 print("  // from EastAsianWidth.txt from the Unicode standard.");
 131 print("  inline constexpr char32_t __width_edges[] = {", end="")
 132 for i, e in enumerate(edges):
 133     if i % 8:
 134         print(" ", end="")
 135     else:
 136         print("\n    ", end="")
 137     c,_ = e
 138     print("{:#x},".format(c), end="")
 139 print("\n  };\n")
 140
 141 # By default every code point has Grapheme_Cluster_Break=Other.
 142 all_code_points = ["Other"] * (1 + 0x10FFFF)
 143
 144 # Extract Grapheme_Cluster_Break property for all code points.
 145 for line in open("GraphemeBreakProperty.txt", "r"):
 146     # Example lines:
 147     # "0600..0605", "Prepend"
 148     # "00AD", "Control"
 149     line = line.split("#")[0]
 150     if re.match(r'^[\dA-Fa-f][^;]+;', line):
 151         code_points, grapheme_property = line.split(";")
 152         process_code_points(code_points, grapheme_property.strip())
 153
 154 edges = find_edges(all_code_points)
 155 gcb_props = {"Other":0}
 156 for c, p in edges:
 157     if p not in gcb_props:
 158         gcb_props[p] = len(gcb_props)
 159 shift_bits = int(math.ceil(math.log2(len(gcb_props))))
 160
 161 # Enum definition for std::__unicode::_Gcb_property
 162
 163 print("  enum class _Gcb_property {")
 164 for p in gcb_props.items():
 165     print("    _Gcb_{} = {},".format(p[0],p[1]))
 166 print("  };\n")
 167
 168 # Tables for std::__unicode::_Grapheme_cluster_state
 169
 170 print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
 171 print("  // from GraphemeBreakProperty.txt from the Unicode standard.");
 172 print("  // Entries are (code_point << shift_bits) + property.")
 173 print("  inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
 174 print("  inline constexpr uint32_t __gcb_edges[] = {", end="")
 175 for i, e in enumerate(edges):
 176     if i % 6:
 177         print(" ", end="")
 178     else:
 179         print("\n    ", end="")
 180     c, p = e
 181     x = (c << shift_bits) + gcb_props[p]
 182     print("{0:#x},".format(x), end="")
 183 print("\n  };\n")
 184
 185 # By default every code point has Indic_Conjunct_Break=None.
 186 all_code_points = [None] * (1 + 0x10FFFF)
 187
 188 # Extract Indic_Conjunct_Break property for all code points.
 189 for line in open("DerivedCoreProperties.txt", "r"):
 190     # Example lines:
 191     # 094D       ; InCB; Linker
 192     # 0B71       ; InCB; Consonant
 193     # 0300..034E ; InCB; Extend
 194     line = line.split("#")[0]
 195     if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
 196         code_points, _, incb_property = line.split(";")
 197         process_code_points(code_points, incb_property.strip())
 198
 199 # Table for std::__unicode::__is_incb_linker
 200 # This table is tiny, so just contains the list of code points.
 201 print("  inline constexpr char32_t __incb_linkers[] = {\n   ", end="")
 202 for i in [i for i,p in enumerate(all_code_points) if p == "Linker"]:
 203     print(" 0x{:04x},".format(i), end="")
 204     all_code_points[i] = None
 205 print("\n  };\n")
 206
 207 edges = find_edges(all_code_points)
 208
 209 incb_props = {None:0, "Consonant":1, "Extend":2}
 210 print("  enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
 211 # Table for std::__unicode::__incb_property
 212 print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
 213 print("  // from DerivedCoreProperties.txt from the Unicode standard.");
 214 print("  // Entries are (code_point << 2) + property.")
 215 print("  inline constexpr uint32_t __incb_edges[] = {", end="")
 216 for i, e in enumerate(edges):
 217     if i % 6:
 218         print(" ", end="")
 219     else:
 220         print("\n    ", end="")
 221     c, p = e
 222     x = (c << 2) + incb_props[p]
 223     print("{0:#x},".format(x), end="")
 224 print("\n  };\n")
 225
 226 # By default every code point has Emoji=No.
 227 all_code_points = [False] * (1 + 0x10FFFF)
 228
 229 # Extract Emoji=Extended_Pictographic for all code points.
 230 for line in open("emoji-data.txt", "r"):
 231     # Example lines:
 232     # 1100..115F ; Extended_Pictographic
 233     # 232A       ; Extended_Pictographic
 234     line = line.split("#")[0]
 235     if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
 236         process_code_points(line.split(";")[0], True)
 237
 238 edges = find_edges(all_code_points, False)
 239
 240 # Table for std::__unicode::__is_extended_pictographic
 241 print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
 242 print("  // from emoji-data.txt from the Unicode standard.");
 243 print("  inline constexpr char32_t __xpicto_edges[] = {", end="")
 244 for i, e in enumerate(edges):
 245     if i % 8:
 246         print(" ", end="")
 247     else:
 248         print("\n    ", end="")
 249     c,_ = e
 250     print("{:#x},".format(c), end="")
 251 print("\n  };\n")
 252
 253 # <bits/unicode.h> gives an error if this macro is left defined.
 254 # Do this last, so that the generated output is not usable unless we reach here.
 255 print("#undef _GLIBCXX_GET_UNICODE_DATA")