Version 4.2.0.1, tag libreoffice-4.2.0.1
[LibreOffice.git] / sal / textenc / generate / gb180302000.pl
blob410e7b6067f2ccf76d377706cc250b00dc1d8402
1 #!/usr/bin/perl
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # This file incorporates work covered by the following license notice:
11 # Licensed to the Apache Software Foundation (ASF) under one or more
12 # contributor license agreements. See the NOTICE file distributed
13 # with this work for additional information regarding copyright
14 # ownership. The ASF licenses this file to you under the Apache
15 # License, Version 2.0 (the "License"); you may not use this file
16 # except in compliance with the License. You may obtain a copy of
17 # the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 # The following files must be available in a ./input subdir:
22 # <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
23 # gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
24 # "modified version="3" date="2001-02-21""
# Base name of this converter.  Lower-cased it gives the names of this script
# (<id>.pl) and of the generated table file (<id>.tab); as written it is
# embedded in the identifiers of the generated C arrays.
$id = "Gb180302000";
# Format a Unicode code point for diagnostics.
#
# Parameters:
#   $_[0] - the code point as a number.
# Returns:
#   A string of the form "U+XXXX" (upper-case hex, at least four digits).
sub printUtf32
{
    my ($utf32) = @_;
    return sprintf("U+%04X", $utf32);
}
# Format a GB 18030 byte sequence for diagnostics.
#
# Parameters:
#   one, two or four byte values, passed as separate numeric arguments.
# Returns:
#   The bytes as concatenated two-digit upper-case hex, e.g. "8140".
#
# The length is decided by which arguments are defined: a defined third
# argument selects the four-byte form, otherwise a defined second argument
# selects the two-byte form, otherwise the single-byte form is used.
sub printGb
{
    my ($gb1, $gb2, $gb3, $gb4) = @_;

    if (defined($gb3))
    {
        return sprintf("%02X%02X%02X%02X", $gb1, $gb2, $gb3, $gb4);
    }
    elsif (defined($gb2))
    {
        return sprintf("%02X%02X", $gb1, $gb2);
    }
    else
    {
        return sprintf("%02X", $gb1);
    }
}
# Running totals filled in while the mapping file is parsed.
$gb_map_2_count = 0;    # two-byte codes stored in @gb_map_2
$gb_map_4_count = 0;    # individually listed four-byte codes in @gb_map_4
$gb_map_4_ranges = 0;   # four-byte codes covered by recorded <range> elements
$gb_map_4_max = 0;      # highest linear four-byte index seen (inclusive)
$uni_map_count = 0;     # Unicode values stored in @uni_map

$range_count = 0;       # number of recorded <range> elements
# Read input/gb-18030-2000.xml and collect its mappings into the global
# tables used by the rest of the script:
#   @gb_map_2   Unicode value of each two-byte code, by linear index
#   @gb_map_4   Unicode value of each individually listed four-byte code,
#               by linear index
#   @uni_map    packed GB 18030 bytes of each mapped Unicode value
#   @range_*    one entry per recorded <range> element
# Single-byte <a> elements are only verified (byte value == code point).
# A line that contains "<a " or "<range " but matches none of the expected
# shapes aborts the run with "Bad format".
{
    # Linear index of a four-byte code.  The weights 12600/1260/10/1 treat
    # bytes 1 and 3 as counting up from 0x81 and bytes 2 and 4 as counting
    # up from 0x30 (ten values each).
    my $linear4 = sub
    {
        my ($b1, $b2, $b3, $b4) = @_;
        return ($b1 - 0x81) * 12600
            + ($b2 - 0x30) * 1260
            + ($b3 - 0x81) * 10
            + ($b4 - 0x30);
    };

    $filename = "gb-18030-2000.xml";
    open(my $in, '<', "input/" . $filename)
        or die "Cannot read " . $filename . ": $!";
    while (<$in>)
    {
        if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
        {
            # Single byte: must map to the identical code point.
            my $utf32 = hex($1);
            my $gb1 = hex($2);
            ($utf32 == $gb1)
                or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
        {
            my ($utf32, $gb1, $gb2) = map { hex($_) } ($1, $2, $3);

            # 190 trail positions per lead byte: 0x40..0x7E occupy 0..62,
            # 0x80 and above continue at 63 (0x7F is skipped).
            my $gb_code = ($gb1 - 0x81) * 190
                + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
        {
            my ($utf32, $gb1, $gb2, $gb3, $gb4)
                = map { hex($_) } ($1, $2, $3, $4, $5);

            my $gb_code = $linear4->($gb1, $gb2, $gb3, $gb4);
            !defined($gb_map_4[$gb_code])
                or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);

            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        elsif (/<a /)
        {
            die "Bad format";
        }
        elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
        {
            my ($utf32_first, $utf32_last, @gb)
                = map { hex($_) } ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);

            my $linear_first = $linear4->(@gb[0 .. 3]);
            my $linear_last = $linear4->(@gb[4 .. 7]);

            # A range must be equally long on both sides.
            ($utf32_last - $utf32_first == $linear_last - $linear_first)
                or die "Bad range";

            # The range 189000..1237575 (90 30 81 30 .. E3 32 9A 35) is
            # deliberately not recorded.
            # NOTE(review): presumably the supplementary-plane range, which
            # the 16-bit tables generated below cannot hold - confirm.
            if ($linear_first != 189000 || $linear_last != 1237575)
            {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is stretched to U+DFFF so that
                # the later walk over Unicode values steps across
                # U+D800..U+DFFF, which have no mapping.
                $range_uni_last[$range_count]
                    = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last
                    if ($linear_last > $gb_map_4_max);
            }
        }
        elsif (/<range /)
        {
            die "Bad format";
        }
    }
    close($in);
}
# Report the totals gathered from the mapping file and verify them.
print "gb_map_2_count = ", $gb_map_2_count,
    ", gb_map_4_count = ", $gb_map_4_count,
    ", gb_map_4_ranges = ", $gb_map_4_ranges,
    ", gb_map_4_max = ", $gb_map_4_max,
    ", uni_map_count = ", $uni_map_count, "\n";

# The two-byte table must be completely populated (23940 entries).
($gb_map_2_count == 23940) or die "Bad gb_map_2_count != 23940";

# Single mappings plus recorded ranges must cover every linear four-byte
# index from 0 up to and including $gb_map_4_max.
($gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1)
    or die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges";

# Every value below 0x10000 except 0x00..0x7F and 0xD800..0xDFFF must be
# covered, either by an explicit mapping or by a recorded range.
($uni_map_count + $gb_map_4_ranges == 0x10000 - (0xE000 - 0xD800) - 0x80)
    or die "Bad uni_map_count";
# Append the individually mapped four-byte codes to @gb_map_2 (directly after
# the two-byte entries) in linear order, skipping over the recorded ranges.
# $gb_nonrangedataindex[$i] records the @gb_map_2 index at which the data
# preceding range $i ends, i.e. where the data following range $i - 1 starts.
$range_index = 0;
$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
# $gb_map_4_max is the highest linear index itself, so the bound is inclusive;
# with an exclusive bound a final single mapping would be silently omitted.
for ($gb_code = 0; $gb_code <= $gb_map_4_max; ++$gb_code)
{
    if (defined($gb_map_4[$gb_code]))
    {
        $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
    }
    else
    {
        # An unmapped index is only legal as the first code of the next
        # recorded range; jump to that range's last code.
        ($range_index < $range_count
         && $gb_code == $range_linear_first[$range_index])
            or die "Bad input";
        $gb_code = $range_linear_last[$range_index];
        ++$range_index;
        $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
    }
}
($range_index == $range_count) or die "Bad input";
# Create the output table file, named after the lower-cased $id.  The OUT
# handle stays open for the table-writing code that follows.
$filename = lc($id) . ".tab";
open(OUT, '>', $filename) or die "Cannot write " . $filename . ": $!";

# Copy the leading comment header of this script into the output file,
# rewritten as a C comment.  Copying stops at the first line that does not
# start with '#'.
#   "#!..."   shebang line: skipped
#   "#*..."   first occurrence opens the comment ("/*..."); every later one
#             has its last character replaced by "/" to close it
#   "# text"  emitted as " *text"
#   "#text"   emitted as " *text"
# NOTE(review): the comment delimiters are only produced by "#*..." lines;
# confirm that the script header still contains such lines.
{
    $filename = lc($id) . ".pl";
    open(my $script, '<', $filename)
        or die "Cannot read " . $filename . ": $!";
    my $first = 1;
    while (my $line = <$script>)
    {
        if ($line =~ /^\#!.*$/)
        {
            next;
        }
        elsif ($line =~ /^\#(\*.*)$/)
        {
            if ($first == 1)
            {
                print OUT "/", $1, "\n";
                $first = 0;
            }
            else
            {
                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
            }
        }
        elsif ($line =~ /^\# (.*)$/)
        {
            print OUT " *", $1, "\n";
        }
        elsif ($line =~ /^\#(.*)$/)
        {
            print OUT " *", $1, "\n";
        }
        else
        {
            last;
        }
    }
    close($script);
}
# Emit the include preamble of the generated file.
print OUT "\n",
    "#ifndef INCLUDED_RTL_TEXTENC_CONVERTGB18030_H\n",
    "#include \"convertgb18030.h\"\n",
    "#endif\n",
    "\n",
    "#ifndef _SAL_TYPES_H_\n",
    "#include \"sal/types.h\"\n",
    "#endif\n",
    "\n";
# Emit the GB 18030 -> Unicode data array: every entry of @gb_map_2 up to
# $gb_map_2_count, eight values per output line.
print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
for ($gb_code = 0; $gb_code < $gb_map_2_count; ++$gb_code)
{
    printf OUT "0x%04X,", $gb_map_2[$gb_code];

    # Wrap after every eighth value, but not after the very last one.
    my $is_last = ($gb_code == $gb_map_2_count - 1);
    if ($gb_code % 8 == 7 && !$is_last)
    {
        print OUT "\n ";
    }
}
print OUT "\n};\n\n";
# Emit the GB 18030 -> Unicode range table.  Each row holds the data-array
# index recorded for the range, the range's first linear four-byte index,
# one past its last linear index, and its first Unicode value.  A row
# starting with -1 terminates the table.
print OUT "static ImplGb180302000ToUnicodeRange const\n aImpl",
    $id,
    "ToUnicodeRanges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index)
{
    printf OUT " { %d, %d, %d, 0x%04X },\n",
        $gb_nonrangedataindex[$range_index],
        $range_linear_first[$range_index],
        $range_linear_last[$range_index] + 1,
        $range_uni_first[$range_index];
}
print OUT " { -1, 0, 0, 0 }\n};\n\n";
# Emit the Unicode -> GB 18030 data array: the packed byte value of every
# explicitly mapped Unicode value from 0x80 to 0xFFFF, six per output line.
# Values covered by a recorded range are skipped; $uni_nonrangedataindex[$i]
# records the data index reached when range $i is encountered (entry 0 is
# preset to 0 before the walk).
# NOTE(review): the two format strings differ by a single leading space in
# this copy; the padding may have been wider upstream - confirm.
print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
my $index = 0;
$range_index = 0;
$uni_nonrangedataindex[$range_index] = $index;
for ($utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
{
    if (defined($uni_map[$utf32]))
    {
        # Start a new line before the 7th, 13th, ... value.
        if ($index > 0 && $index % 6 == 0)
        {
            print OUT "\n ";
        }
        my $bytes = $uni_map[$utf32];
        # Two-byte codes are printed with four hex digits, four-byte codes
        # with eight.
        my $format = ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,");
        printf OUT $format, $bytes;
        ++$index;
    }
    else
    {
        # An unmapped value is only legal as the start of the next recorded
        # range; jump to that range's last value.
        ($range_index < $range_count
         && $utf32 == $range_uni_first[$range_index])
            or die "Bad input";
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        $uni_nonrangedataindex[$range_index] = $index;
    }
}
($range_index == $range_count) or die "Bad input";
print OUT "\n};\n\n";
# Emit the Unicode -> GB 18030 range table.  Each row holds the data-array
# index recorded for the range, the range's first and last Unicode value,
# and its first linear four-byte index.
print OUT "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
    $id,
    "Ranges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index)
{
    printf OUT " { %d, 0x%04X, 0x%04X, %d },\n",
        $uni_nonrangedataindex[$range_index],
        $range_uni_first[$range_index],
        $range_uni_last[$range_index],
        $range_linear_first[$range_index];
}
print OUT "};\n";

# Buffered write errors only surface when the handle is closed, so the
# result of close is checked.
close(OUT) or die "Cannot close " . lc($id) . ".tab: $!";