update dev300-m58
[ooovba.git] / sal / textenc / generate / gb180302000.pl
blobada6608a4ceb2a326047c0d09cc07baad9e6ab6f
1 #!/usr/bin/perl
2 #*************************************************************************
4 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 #
6 # Copyright 2008 by Sun Microsystems, Inc.
8 # OpenOffice.org - a multi-platform office productivity suite
10 # $RCSfile: gb180302000.pl,v $
12 # $Revision: 1.4 $
14 # This file is part of OpenOffice.org.
16 # OpenOffice.org is free software: you can redistribute it and/or modify
17 # it under the terms of the GNU Lesser General Public License version 3
18 # only, as published by the Free Software Foundation.
20 # OpenOffice.org is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU Lesser General Public License version 3 for more details
24 # (a copy is included in the LICENSE file that accompanied this code).
26 # You should have received a copy of the GNU Lesser General Public License
27 # version 3 along with OpenOffice.org. If not, see
28 # <http://www.openoffice.org/license.html>
29 # for a copy of the LGPLv3 License.
31 #*************************************************************************
33 # The following files must be available in a ./input subdir:
35 # <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
36 # gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
37 # "modified version="3" date="2001-02-21""
$id = 'Gb180302000';    # encoding name: embedded in the emitted C identifiers;
                        # lc($id) names the .tab output and this script's .pl
# Format a code point for diagnostics as "U+" followed by its upper-case
# hexadecimal value, zero-padded to at least four digits.
#
#   $_[0]  numeric code point
#
# Returns the formatted string.
sub printUtf32
{
    my ($code_point) = @_;
    my $text = sprintf("U+%04X", $code_point);
    return $text;
}
# Format a GB 18030 byte sequence for diagnostics as upper-case hex, two
# digits per byte and no separators.
#
#   @_  the bytes of the sequence: one, two or four numeric values
#
# A defined third argument selects the four-byte form, otherwise a defined
# second argument selects the two-byte form, otherwise only the first byte
# is printed.
sub printGb
{
    my ($format, $byte_count)
        = defined($_[2]) ? ("%02X%02X%02X%02X", 4)
        : defined($_[1]) ? ("%02X%02X", 2)
        :                  ("%02X", 1);
    return sprintf($format, @_[0 .. $byte_count - 1]);
}
# Running totals filled in while the input is parsed; all start at zero.
#   $gb_map_2_count   two-byte mappings stored in @gb_map_2
#   $gb_map_4_count   individually listed four-byte mappings
#   $gb_map_4_ranges  number of four-byte codes covered by the stored ranges
#   $gb_map_4_max     highest four-byte linear code seen so far
#   $uni_map_count    entries stored in @uni_map
#   $range_count      number of stored ranges
($gb_map_2_count, $gb_map_4_count, $gb_map_4_ranges,
 $gb_map_4_max, $uni_map_count, $range_count) = (0) x 6;
# Convert the four bytes of a GB 18030 four-byte sequence into its linear
# index: 81 30 81 30 yields 0 and every step of the last byte adds one.
# The second and fourth bytes count from 0x30 (ten values each), the first
# and third from 0x81, consistent with the bounds 81 30 81 30 .. FE 39 FE 39
# that range elements are required to state.
#
#   @_  the four numeric byte values, in sequence order
#
# Returns the linear index.
sub linearGb4
{
    my ($b1, $b2, $b3, $b4) = @_;
    my $linear = ($b1 - 0x81) * 12600
        + ($b2 - 0x30) * 1260
        + ($b3 - 0x81) * 10
        + ($b4 - 0x30);
    return $linear;
}

# Read input/gb-18030-2000.xml line by line and fill the mapping tables:
#   @gb_map_2   two-byte index      -> Unicode code point
#   @gb_map_4   four-byte linear    -> Unicode code point (single mappings)
#   @uni_map    Unicode code point  -> packed GB 18030 bytes
#   @range_*    one entry per <range> element that is kept
# Dies on malformed lines, on duplicate mappings and on inconsistent ranges.
{
    $filename = "gb-18030-2000.xml";
    my $path = "input/" . $filename;
    open(my $in, "<", $path) or die "Cannot read $path: $!";
    while (<$in>)
    {
        if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single-byte mapping: only verified to be the identity, nothing
            # is stored for it.
            my ($utf32, $gb1) = (hex($1), hex($2));
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            # Two-byte mapping.  Each lead byte owns 190 slots: trail bytes
            # up to 0x7E occupy slots 0..62 (counted from 0x40), higher
            # trail bytes occupy slots 63.. (counted from 0x80).
            my ($utf32, $gb1, $gb2) = (hex($1), hex($2), hex($3));
            my $gb_code = ($gb1 - 0x81) * 190
                + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = ($gb1 << 8) | $gb2;
            ++$uni_map_count;
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            # Individually listed four-byte mapping.
            my $utf32 = hex($1);
            my ($gb1, $gb2, $gb3, $gb4) = (hex($2), hex($3), hex($4), hex($5));
            my $gb_code = linearGb4($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32]
                = ($gb1 << 24) | ($gb2 << 16) | ($gb3 << 8) | $gb4;
            ++$uni_map_count;
        }
        elsif (/<a /)
        {
            # An <a> element that matched none of the accepted shapes.
            die "Bad format";
        }
        elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my ($utf32_first, $utf32_last) = (hex($1), hex($2));
            my $linear_first = linearGb4(hex($3), hex($4), hex($5), hex($6));
            my $linear_last = linearGb4(hex($7), hex($8), hex($9), hex($10));

            # Both sides of a range must span the same number of codes.
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";

            # The range with linear bounds 189000..1237575 (bytes
            # 90 30 81 30 .. E3 32 9A 35) is not stored -- presumably the
            # range beyond U+FFFF, which the generated tables do not cover.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is stretched to U+DFFF on the
                # Unicode side only, so a walk over the code points that
                # jumps to the end of a range also steps over
                # U+D800..U+DFFF.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif (/<range /)
        {
            # A <range> element that did not match the accepted shape.
            die "Bad format";
        }
    }
    close $in;
}
# Report the totals gathered while parsing on standard output.
print "gb_map_2_count = $gb_map_2_count"
    . ", gb_map_4_count = $gb_map_4_count"
    . ", gb_map_4_ranges = $gb_map_4_ranges"
    . ", gb_map_4_max = $gb_map_4_max"
    . ", uni_map_count = $uni_map_count\n";

# The two-byte table must hold exactly 23940 entries.
die "Bad gb_map_2_count != 23940"
    unless $gb_map_2_count == 23940;

# Single mappings and ranges together must cover the four-byte linear codes
# 0 .. $gb_map_4_max without gaps or overlaps.
die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges"
    unless $gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1;

# Every code point from U+0080 to U+FFFF, except U+D800..U+DFFF, must be
# accounted for by a stored mapping or a stored range.
my $expected_unicode_total = 0x10000 - (0xE000 - 0xD800) - 0x80;
die "Bad uni_map_count"
    unless $uni_map_count + $gb_map_4_ranges == $expected_unicode_total;
# Append the individually mapped four-byte entries to @gb_map_2, in linear
# order, and record in @gb_nonrangedataindex the @gb_map_2 position reached
# at the start and after each range.  Ranges themselves contribute no data
# entries; a gap in @gb_map_4 must coincide with the start of the next
# stored range, otherwise the input is rejected.
$range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;

# $gb_map_4_max is the highest linear code and therefore inclusive; the
# earlier consistency check ties it to count + ranges - 1.  Bounding the
# walk by the number of codes makes sure the topmost code is visited even
# when it is a single mapping rather than the end of a range.
my $gb_map_4_end = $gb_map_4_count + $gb_map_4_ranges;
for ($gb_code = 0; $gb_code < $gb_map_4_end; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        ($gb_code == $range_linear_first[$range_index]) or die "Bad input";
        # Jump to the last code of the range; the loop increment then moves
        # on to the first code after it.
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
# Every stored range must have been consumed by the walk.
($range_index == $range_count) or die "Bad input";
# Create the output file (lower-cased $id plus ".tab") on handle OUT, which
# the rest of the script writes to.
$filename = lc($id) . ".tab";
open(OUT, ">", $filename) or die "Cannot write $filename: $!";

# Turn the leading '#' comment block of this script's own source file
# (lower-cased $id plus ".pl") into a C comment at the top of the output.
# Copying stops at the first line that does not start with '#'.
{
    $filename = lc($id) . ".pl";
    open(my $script, "<", $filename) or die "Cannot read $filename: $!";
    my $first_rule = 1;
    while (<$script>)
    {
        if (/^\#!.*$/)
        {
            # The interpreter line is not copied.
        }
        elsif (/^\#(\*.*)$/)
        {
            if ($first_rule == 1)
            {
                # First row of stars: prefix '/' to open the C comment.
                print OUT "/", $1, "\n";
                $first_rule = 0;
            }
            else
            {
                # Later row of stars: replace its last character by '/' to
                # close the C comment.
                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif (/^\# (.*)$/)
        {
            print OUT " *", $1, "\n";
        }
        elsif (/^\#(.*)$/)
        {
            print OUT " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    close $script;
}
# Emit the include preamble of the generated file: a blank line, then each
# header wrapped in its include guard and followed by a blank line.
print OUT join("\n",
               '',
               '#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H',
               '#include "convertgb18030.h"',
               '#endif',
               '',
               '#ifndef _SAL_TYPES_H_',
               '#include "sal/types.h"',
               '#endif',
               '',
               '');
# Emit the GB 18030 -> Unicode data table: every entry of @gb_map_2 below
# $gb_map_2_count as a four-digit hex constant, eight entries per row.
print OUT "static sal_Unicode const aImpl" . $id . "ToUnicodeData[] = {\n ";
my $last_to_unicode_entry = $gb_map_2_count - 1;
for my $entry (0 .. $last_to_unicode_entry)
{
    printf OUT "0x%04X,", $gb_map_2[$entry];
    # Start a new row after every eighth entry, except after the final one.
    next if $entry == $last_to_unicode_entry;
    print OUT "\n " if ($entry + 1) % 8 == 0;
}
print OUT "\n};\n\n";
# Emit the GB 18030 -> Unicode range table.  Each row holds the data-table
# index recorded for the range, its first linear code, one past its last
# linear code, and its first Unicode code point.  A row starting with -1
# closes the table.
print OUT "static ImplGb180302000ToUnicodeRange const\n aImpl"
    . $id
    . "ToUnicodeRanges[] = {\n";
for my $row (0 .. $range_count - 1)
{
    my @fields = ($gb_nonrangedataindex[$row],
                  $range_linear_first[$row],
                  $range_linear_last[$row] + 1,
                  $range_uni_first[$row]);
    printf OUT " { %d, %d, %d, 0x%04X },\n", @fields;
}
print OUT " { -1, 0, 0, 0 }\n};\n\n";
# Emit the Unicode -> GB 18030 data table for U+0080..U+FFFF, six entries
# per row, and record in @uni_nonrangedataindex the table position reached
# at the start and after each range.  A code point without an entry in
# @uni_map must be the first code point of the next stored range; the walk
# then jumps to that range's last code point.
print OUT "static sal_uInt32 const aImplUnicodeTo" . $id . "Data[] = {\n ";
$index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $index;
for ($utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
{
    if (!defined($uni_map[$utf32]))
    {
        ($utf32 == $range_uni_first[$range_index]) or die "Bad input";
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $index;
        next;
    }

    # Start a new row before every sixth entry, except the very first.
    print OUT "\n " if $index > 0 && $index % 6 == 0;

    # Values that fit in 16 bits are printed with four hex digits, all
    # others with eight.
    my $packed = $uni_map[$utf32];
    my $entry_format = $packed <= 0xFFFF ? " 0x%04X," : "0x%08X,";
    printf OUT $entry_format, $packed;
    ++$index;
}
# Every stored range must have been consumed by the walk.
($range_index == $range_count) or die "Bad input";
print OUT "\n};\n\n";
# Emit the Unicode -> GB 18030 range table.  Each row holds the data-table
# index recorded for the range, its first and last Unicode code point, and
# its first linear four-byte code.
print OUT "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
    $id,
    "Ranges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index)
{
    printf OUT " { %d, 0x%04X, 0x%04X, %d },\n",
        $uni_nonrangedataindex[$range_index],
        $range_uni_first[$range_index],
        $range_uni_last[$range_index],
        $range_linear_first[$range_index];
}
print OUT "};\n";

# Buffered output is flushed on close, so a failed close means the generated
# file may be incomplete.  $filename no longer names the output file at this
# point, hence the name is rebuilt from $id.
close(OUT) or die "Cannot write " . lc($id) . ".tab: $!";