2 # Copyright (C) 1998, 1999 Tom Tromey
3 # Copyright (C) 2001 Red Hat Software
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2, or (at your option)
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, see <http://www.gnu.org/licenses/>.
19 gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
20 See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
22 I consider the output of this program to be unrestricted.
# Parse the three required positional command-line arguments.
parser = argparse.ArgumentParser(
    description="Generate test cases for case mapping from Unicode data"
)
parser.add_argument("UNICODE-VERSION")
parser.add_argument("UnicodeData.txt")
parser.add_argument("SpecialCasing.txt")
args = parser.parse_args(argv[1:])
# The argument names contain "-" and ".", so they are not valid Python
# identifiers and are read back from the namespace with getattr().
version = getattr(args, "UNICODE-VERSION")
filename_udata = getattr(args, "UnicodeData.txt")
filename_casing = getattr(args, "SpecialCasing.txt")
# Names of fields in Unicode data table.
# Each name is the index of that field in the list obtained by splitting
# one record on ";".
(
    CODE,
    NAME,
    CATEGORY,
    COMBINING_CLASSES,
    BIDI_CATEGORY,
    DECOMPOSITION,
    DECIMAL_VALUE,
    DIGIT_VALUE,
    NUMERIC_VALUE,
    MIRRORED,
    OLD_NAME,
    COMMENT,
    UPPER,
    LOWER,
    TITLE,
) = range(15)
# Names of fields in the SpecialCasing table
# Each name is the index of that field in the list obtained by splitting
# one record on ";". CASE_CONDITION is the fifth field, so it only exists
# in records that have five fields.
CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)
def make_hex(codes):
    """Convert white-space-separated hexadecimal code points to a string.

    Any extra white space is ignored, so an empty or all-blank field
    yields the empty string.

    Args:
        codes: String of code points written in hexadecimal and separated
            by white space, e.g. "0046 0066".

    Returns:
        The string made of the corresponding characters, in order.

    Raises:
        ValueError: If a token is not valid hexadecimal or lies outside
            the range accepted by chr().
    """
    return "".join(chr(int(c, 16)) for c in codes.split())
# process_one(code, fields): store the case mappings of one code point.
# `code` is used as the key into the upper/lower/title tables and is passed
# to chr(); `fields` is indexed with the UnicodeData field names (CATEGORY,
# UPPER, LOWER, TITLE). The category is read into type_ first.
# NOTE(review): the conditions that select between the three assignment
# groups below (original lines 61, 65 and 69) are missing from this listing,
# so the notes on each group describe only the visible assignments.
59 def process_one(code
, fields
):
60 type_
= fields
[CATEGORY
]
# First group: upper and title come from the UPPER and TITLE fields, while
# lower is the character itself.
62 upper
[code
] = make_hex(fields
[UPPER
])
63 lower
[code
] = chr(code
)
64 title
[code
] = make_hex(fields
[TITLE
])
# Second group: lower and title come from the LOWER and TITLE fields, while
# upper is the character itself.
66 lower
[code
] = make_hex(fields
[LOWER
])
67 upper
[code
] = chr(code
)
68 title
[code
] = make_hex(fields
[TITLE
])
# Third group: upper and lower come from the UPPER and LOWER fields.
# NOTE(review): title is filled from the LOWER field here, not from TITLE
# and not from the character itself -- confirm this is intended.
70 upper
[code
] = make_hex(fields
[UPPER
])
71 lower
[code
] = make_hex(fields
[LOWER
])
72 title
[code
] = make_hex(fields
[LOWER
])
# Read the file named by filename_udata as UTF-8 and feed every record to
# process_one(). A record is one line split on ";" with each field stripped.
# NOTE(review): several original lines of this section (75-77, 79-80, 87,
# 91-92, 94, 96-98, 102-103 and 105) are missing from this listing: the
# loop header, the field-count check that uses the message below, and the
# statements that update last_code. The comments here cover only what is
# visible.
74 with
open(filename_udata
, encoding
="utf-8") as fileobj
:
78 fields
= [f
.strip() for f
in line
.split(";")]
81 "Entry for %s has wrong number of fields (%d)" % (
82 fields
[CODE
], len(fields
)))
# The CODE field holds the code point in hexadecimal.
84 code
= int(fields
[CODE
], 16)
# A jump of more than one past last_code means some code points have no
# record of their own.
86 if code
> last_code
+ 1:
88 if fields
[NAME
].endswith("Last>"):
89 # Fill the gap with the last character read,
90 # since this was a range specified in the char database
93 # The gap represents undefined characters. Only the type
95 gfields
= ['', '', 'Cn', '0', '', '', '', '', '', '', '',
# Call process_one() for the skipped code points, rewriting the CODE field
# of gfields (as at least four lowercase hex digits) for each one.
99 while last_code
< code
:
100 gfields
[CODE
] = "%04x" % last_code
101 process_one(last_code
, gfields
)
# Finally handle the record that was actually read.
104 process_one(code
, fields
)
# Read the file named by filename_casing as UTF-8 and overwrite the
# upper/lower/title entries of every code point it lists, then hand the
# tables to print_tests().
# NOTE(review): several original lines of this section (108-109, 112-114,
# 119, 122-123 and 125-126) are missing from this listing: the loop header,
# the empty-line skip, the statement that uses the error message, and the
# test that implements "ignore conditional special cases". The comments
# here cover only what is visible.
107 with
open(filename_casing
, encoding
="utf-8") as fileobj
:
110 # strip comments and skip empty lines
111 line
= line
.split("#", 1)[0].strip()
115 # all lines end with ";" so just remove it
116 line
= line
.rstrip(";").rstrip()
117 fields
= [f
.strip() for f
in line
.split(";")]
# A record has four fields, or five when the CASE_CONDITION field is present.
118 if len(fields
) not in (4, 5):
120 "Entry for %s has wrong number of fields (%d)" % (
121 fields
[CASE_CODE
], len(fields
)))
124 # Ignore conditional special cases - we'll handle them manually
# The CASE_CODE field holds the code point in hexadecimal.
127 code
= int(fields
[CASE_CODE
], 16)
# These assignments replace whatever process_one() stored for this code
# point; the fields may hold several code points, which make_hex() joins.
129 upper
[code
] = make_hex(fields
[CASE_UPPER
])
130 lower
[code
] = make_hex(fields
[CASE_LOWER
])
131 title
[code
] = make_hex(fields
[CASE_TITLE
])
# Pass the collected tables on; note the argument order (upper, title, lower).
133 print_tests(version
, upper
, title
, lower
)
# print_tests(version, upper, title, lower): produce the test-case text.
# The text below (original lines 138-184) is a template whose "{}"
# placeholder is filled with `version` through str.format(). It carries the
# hand-written tr_TR, lt_LT, Greek and ligature cases as tab-separated rows.
# NOTE(review): the statement that opens this template (original line 137)
# is missing from this listing, so how the formatted text is emitted cannot
# be confirmed from here.
# NOTE(review): the trailing remark on the "I\u0307" rows (original lines
# 145 and 148) repeats the dotless-i wording although the second column of
# those rows is a plain "i" -- it looks copy-pasted; confirm before relying
# on it.
136 def print_tests(version
, upper
, title
, lower
):
138 # Test cases generated from Unicode {} data
139 # by gen-casemap-txt.py. Do not edit.
141 # Some special hand crafted tests
143 tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
144 tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
145 tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
146 tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
147 tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
148 tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
149 # Test reordering of YPOGEGRAMMENI across other accents
150 \t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
151 \t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
152 # Handling of final and nonfinal sigma
153 \tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
154 \tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
155 \tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
156 # Lithuanian rule of i followed by letter with dot. Not at all sure
157 # about the titlecase part here
158 lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
159 lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
160 lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
161 lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
162 lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
163 lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
164 lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
165 lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
166 lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
167 lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
168 lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
169 lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
170 lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
171 lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
172 lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
173 lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
174 lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
175 lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
176 lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
177 lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
178 lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
179 lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
180 # Special case not at initial position
181 \ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
183 # Now the automatic tests
184 #""".format(version
))
# Automatic tests: for every integer i in range(0x10ffff), look up its
# entries in upper, lower and title (a missing key yields "") and, when at
# least one of the three is non-empty, print one tab-separated row: an empty
# first column, the character, its lower, title and upper forms, then the
# code point in hexadecimal as a trailing "#" remark.
# NOTE(review): original lines 187 and 189, which surround the "Greek sigma"
# remark, are missing from this listing; presumably they skip that
# character -- confirm against the full file.
# NOTE(review): range(0x10ffff) stops at 0x10fffe, so U+10FFFF itself is
# never visited -- confirm this is intended.
186 for i
in range(0x10ffff):
188 # Greek sigma needs special tests
191 up
= upper
.get(i
, "")
192 lo
= lower
.get(i
, "")
193 ti
= title
.get(i
, "")
195 if any([up
, lo
, ti
]):
196 print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i
), lo
, ti
, up
, i
))
# Script entry point: run main() with the full argument vector and use its
# return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))