Deprecate GValueArray
[glib.git] / tests / gen-casemap-txt.pl
blob0b9fc1d6b49d47ff85316697c6978d8c45119253
1 #! /usr/bin/perl -w
3 # Copyright (C) 1998, 1999 Tom Tromey
4 # Copyright (C) 2001 Red Hat Software
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
9 # any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 # 02111-1307, USA.
21 # gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
22 # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
23 # I consider the output of this program to be unrestricted. Use it as
24 # you will.
26 require 5.006;
27 use utf8;
29 if (@ARGV != 3) {
30 $0 =~ s@.*/@@;
31 die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
34 use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
36 # Names of fields in Unicode data table.
37 $CODE = 0;
38 $NAME = 1;
39 $CATEGORY = 2;
40 $COMBINING_CLASSES = 3;
41 $BIDI_CATEGORY = 4;
42 $DECOMPOSITION = 5;
43 $DECIMAL_VALUE = 6;
44 $DIGIT_VALUE = 7;
45 $NUMERIC_VALUE = 8;
46 $MIRRORED = 9;
47 $OLD_NAME = 10;
48 $COMMENT = 11;
49 $UPPER = 12;
50 $LOWER = 13;
51 $TITLE = 14;
53 # Names of fields in the SpecialCasing table
54 $CASE_CODE = 0;
55 $CASE_LOWER = 1;
56 $CASE_TITLE = 2;
57 $CASE_UPPER = 3;
58 $CASE_CONDITION = 4;
60 my @upper;
61 my @title;
62 my @lower;
64 binmode STDOUT, ":utf8";
65 open (INPUT, "< $ARGV[1]") || exit 1;
67 $last_code = -1;
68 while (<INPUT>)
70 chop;
71 @fields = split (';', $_, 30);
72 if ($#fields != 14)
74 printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
77 $code = hex ($fields[$CODE]);
79 if ($code > $last_code + 1)
81 # Found a gap.
82 if ($fields[$NAME] =~ /Last>/)
84 # Fill the gap with the last character read,
85 # since this was a range specified in the char database
86 @gfields = @fields;
88 else
90 # The gap represents undefined characters. Only the type
91 # matters.
92 @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
93 '', '', '', '');
95 for (++$last_code; $last_code < $code; ++$last_code)
97 $gfields{$CODE} = sprintf ("%04x", $last_code);
98 &process_one ($last_code, @gfields);
101 &process_one ($code, @fields);
102 $last_code = $code;
105 close INPUT;
107 open (INPUT, "< $ARGV[2]") || exit 1;
109 while (<INPUT>)
111 my $code;
113 chop;
115 next if /^#/;
116 next if /^\s*$/;
118 s/\s*#.*//;
120 @fields = split ('\s*;\s*', $_, 30);
122 $raw_code = $fields[$CASE_CODE];
123 $code = hex ($raw_code);
125 if ($#fields != 4 && $#fields != 5)
127 printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
128 next;
131 if (defined $fields[5]) {
132 # Ignore conditional special cases - we'll handle them manually
133 next;
136 $upper[$code] = &make_hex ($fields[$CASE_UPPER]);
137 $lower[$code] = &make_hex ($fields[$CASE_LOWER]);
138 $title[$code] = &make_hex ($fields[$CASE_TITLE]);
141 close INPUT;
143 print <<EOT;
144 # Test cases generated from Unicode $ARGV[0] data
145 # by gen-case-tests.pl. Do not edit.
147 # Some special hand crafted tests
149 tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
150 tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
151 tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
152 tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
153 tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
154 tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
155 # Test reordering of YPOGEGRAMMENI across other accents
156 \t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
157 \t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
158 # Handling of final and nonfinal sigma
159 ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
160 ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
161 ΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
162 # Lithuanian rule of i followed by letter with dot. Not at all sure
163 # about the titlecase part here
164 lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
165 lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
166 lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
167 lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
168 lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
169 lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
170 lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
171 lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
172 lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
173 lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
174 lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
175 lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
176 lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
177 lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
178 lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
179 lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
180 lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
181 lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
182 lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
183 lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
184 lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
185 lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
186 # Special case not at initial position
187 \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
189 # Now the automatic tests
192 &print_tests;
194 exit 0;
196 # Process a single character.
197 sub process_one
199 my ($code, @fields) = @_;
201 my $type = $fields[$CATEGORY];
202 if ($type eq 'Ll')
204 $upper[$code] = make_hex ($fields[$UPPER]);
205 $lower[$code] = pack ("U", $code);
206 $title[$code] = make_hex ($fields[$TITLE]);
208 elsif ($type eq 'Lu')
210 $lower[$code] = make_hex ($fields[$LOWER]);
211 $upper[$code] = pack ("U", $code);
212 $title[$code] = make_hex ($fields[$TITLE]);
215 if ($type eq 'Lt')
217 $upper[$code] = make_hex ($fields[$UPPER]);
218 $lower[$code] = pack ("U", hex ($fields[$LOWER]));
219 $title[$code] = make_hex ($fields[$LOWER]);
223 sub print_tests
225 for ($i = 0; $i < 0x10ffff; $i++) {
226 if ($i == 0x3A3) {
227 # Greek sigma needs special tests
228 next;
231 my $lower = $lower[$i];
232 my $title = $title[$i];
233 my $upper = $upper[$i];
235 if (defined $upper || defined $lower || defined $title) {
236 printf "\t%s\t%s\t%s\t%s\t# %4X\n",
237 pack ("U", $i),
238 (defined $lower ? $lower : ""),
239 (defined $title ? $title : ""),
240 (defined $upper ? $upper : ""),
246 sub make_hex
248 my $codes = shift;
250 $codes =~ s/^\s+//;
251 $codes =~ s/\s+$//;
253 if ($codes eq "0" || $codes eq "") {
254 return "";
255 } else {
256 return pack ("U*", map { hex ($_) } split /\s+/, $codes);