Fix typo in 9b54bd30006c008b4a951331b273613d5bac3abf
[pm.git] / intl / unicharutil / tools / gentransliterate.pl
blob486e0afacf845ad9a3dd5cf8c60edf10d1d00400
1 #!/usr/bin/perl
3 # This Source Code Form is subject to the terms of the Mozilla Public
4 # License, v. 2.0. If a copy of the MPL was not distributed with this
5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Fixed preamble of the generated properties file: licence text, the
# "generated file" notice and the first two entries. It is printed to OUT
# unchanged, ahead of everything else the script writes.
# The heredoc terminator is unquoted, so the text is subject to interpolation.
7 $header = <<END_OF_HEADER;
8 # This Source Code Form is subject to the terms of the Mozilla Public
9 # License, v. 2.0. If a copy of the MPL was not distributed with this
10 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
13 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
14 # PLEASE DO NOT MODIFY THIS FILE BY HAND
16 entity.list.name=transliterate
17 entity.169=(c)
20 # Here are the windows-1252 characters from the range 0x80 - 0x9F
22 END_OF_HEADER
# Hand-maintained transliterations, written to OUT right after the header.
# Before this text is printed, every line that starts with a single '#' is
# removed from it, so the character-name comments below never reach the output
# file; the '##' marker lines and the entity.N=... lines do.
# The heredoc terminator is unquoted (interpolating), which is why a literal
# backslash is spelled '\\' here: '\\u0020' is emitted as '\u0020'.
24 $handcoded = <<END_OF_HANDCODED;
25 # EURO SIGN
26 entity.8364=EUR
27 # SINGLE LOW-9 QUOTATION MARK
28 entity.8218=,
29 # LATIN SMALL LETTER F WITH HOOK
30 entity.402=f
31 # DOUBLE LOW-9 QUOTATION MARK
32 entity.8222="
33 # DAGGER
34 entity.8224=+
35 # DOUBLE DAGGER
36 entity.8225=++
37 # MODIFIER LETTER CIRCUMFLEX ACCENT
38 entity.710=^
39 # PER MILLE SIGN
40 entity.8240=0/00
41 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
42 entity.8249=<
43 # LATIN CAPITAL LIGATURE OE
44 entity.338=OE
45 # LEFT SINGLE QUOTATION MARK
46 entity.8216='
47 # RIGHT SINGLE QUOTATION MARK
48 entity.8217='
49 # LEFT DOUBLE QUOTATION MARK
50 entity.8220="
51 # RIGHT DOUBLE QUOTATION MARK
52 entity.8221="
53 # BULLET
54 entity.8226=.
55 # EN DASH
56 entity.8211=--
57 # EM DASH
58 entity.8212=---
59 # SMALL TILDE
60 entity.732=~
61 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
62 entity.8250=>
63 # LATIN SMALL LIGATURE OE
64 entity.339=oe
65 # U+2000 EN QUAD
66 entity.8192=\\u0020
67 # U+2001 EM QUAD
68 entity.8193=\\u0020
69 # U+2010 HYPHEN
70 entity.8208=-
71 # U+2011 NON-BREAKING HYPHEN
72 entity.8209=-
73 # U+2012 FIGURE DASH
74 entity.8210=-
75 # U+2015 HORIZONTAL BAR
76 entity.8213=--
77 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
78 entity.8203=
79 # U+2061, ApplyFunction, character showing function application in presentation tagging
80 entity.8289=
81 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
82 entity.8290=
83 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals
84 entity.8518=d
85 # U+2212, MINUS SIGN, official Unicode minus sign
86 entity.8722=-
87 # Hebrew punctuation
88 # U+05BE HEBREW PUNCTUATION MAQAF
89 entity.1470=-
90 # U+05C0 HEBREW PUNCTUATION PASEQ
91 entity.1472=|
92 # U+05C3 HEBREW PUNCTUATION SOF PASUQ
93 entity.1475=:
94 # U+05F3 HEBREW PUNCTUATION GERESH
95 entity.1523='
96 # U+05F4 HEBREW PUNCTUATION GERSHAYIM
97 entity.1524="
99 ## End of hand coded section
100 ## Below are generated from the unicode character database
102 END_OF_HANDCODED
# Decomposition lookup: code point (hex string) => decomposition field.
# The rest of the script accesses it as a hash ($table{...}), so a hash, not
# an array, is what has to be initialised here.
%table = ();
# Build an ASCII approximation of a Latin letter from its character name,
# e.g. "LATIN SMALL LETTER A WITH ACUTE" gives "a'".
#
# $cmt    - the character name.
# Returns - an apostrophe if the name contains "PRECEDED BY APOSTROPHE", then
#           the word following "CAPITAL LETTER" (as is) and the word following
#           "SMALL LETTER" (lower-cased), then one ASCII mark for every
#           diacritic word of the name, in the order the words appear.
#           Words that are not listed as diacritics contribute nothing.
sub FromLatinComment
{
    my ($cmt) = @_;

    # ASCII stand-in for each diacritic word.
    my %mark_for = (
        DOT        => ".",
        DIAERESIS  => "\"",
        BREVE      => "(",
        ACUTE      => "\'",
        GRAVE      => "`",
        TILDE      => "~",
        CARON      => "(",
        HOOK       => "?",
        CEDILLA    => ",",
        MACRON     => "-",
        CIRCUMFLEX => "^",
        RING       => "*",
        OGONEK     => ";",
        LINE       => "_",
        COMMA      => ",",
        STROKE     => "/",
        HORN       => "+",
    );

    # Everything is lexical on purpose: the main loop keeps its own data in
    # package variables (notably @f) that this sub must not overwrite.
    my $char = "";
    $char = "\'" if $cmt =~ /PRECEDED BY APOSTROPHE/;
    $char .= $1 if $cmt =~ /CAPITAL LETTER ([A-Z]*)/;
    $char .= lc($1) if $cmt =~ /SMALL LETTER ([A-Z]*)/;

    # Walk every word of the name; an empty word (from a doubled space) is
    # simply skipped rather than ending the scan.
    for my $word (split(/ /, $cmt)) {
        $char .= $mark_for{$word} if exists $mark_for{$word};
    }

    return $char;
}
# Print a diagnostic line on standard output.
#
# $warning - the message; it is written between "WARNING: " and " \n".
sub warning
{
    my $message = shift;
    print "WARNING: " . $message . " \n";
}
# Debugging variant of output(): print one entity line on standard output
# without any filtering.
#
# $u    - code point as a hex string (not used in the printed line).
# $cmt  - character name (not used in the printed line).
# $udec - code point as a decimal number.
# $str  - replacement text.
sub doutput
{
    my ($u, $cmt, $udec, $str) = @_;
    # The "# U+$u $cmt" comment line is deliberately not printed; it was only
    # ever useful while debugging.
    print join("", "entity.", $udec, "=", $str, "\n");
}
# Write one "entity.<decimal>=<text>" line to OUT, provided the text is free of
# non-ASCII \uXXXX escapes.
#
# $u    - code point as a hex string (only passed along on the retry).
# $cmt  - character name.
# $udec - code point as a decimal number.
# $str  - candidate replacement text.
#
# When the text still contains a non-ASCII escape, the only fallback is for
# names containing both LATIN and LETTER but not LONG: the replacement is then
# rebuilt from the name and tried again. Anything else is silently dropped.
sub output
{
    my ($u, $cmt, $udec, $str) = @_;

    unless (decomposeIntoNonASCII($str)) {
        # don't print out comments - for debugging purposes only
        # print OUT "# U+$u $cmt\n";
        print OUT "entity.$udec=$str\n";
        return;
    }

    return unless $cmt =~ /LATIN/ && $cmt =~ /LETTER/ && $cmt !~ /LONG/;

    output($u, $cmt, $udec, FromLatinComment($cmt));
}
# Tell whether a replacement text still contains a non-ASCII escape.
#
# $dec    - text that may contain \uXXXX escapes (upper-case hex digits).
# Returns - the result of matching for a \uXXXX escape whose value is 0080 or
#           higher: first digit non-zero, or second digit non-zero, or
#           "00" followed by 8-F.
sub decomposeIntoNonASCII
{
    my $text = shift;
    return $text =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;
}
# Replace accent code points in a decomposition string by the code point of an
# ASCII look-alike, e.g. "0041 0300" becomes "0041 0060".
#
# $dec    - decomposition: hex code points separated by spaces.
# Returns - the same string with the code points listed below replaced; all
#           other code points are left untouched.
sub foldcombining
{
    my ($dec) = @_;

    # ASCII stand-ins, as hex code points.
    my $grave       = "0060";
    my $acute       = "0027";
    my $hat         = "005E";
    my $tilde       = "007E";
    my $overscore   = "002D";   ## should be 00AF but we can only handle ASCII now
    my $umlaut      = "0022";   ## should be 00A8 but we can only handle ASCII now
    my $doubleacute = "0022";
    my $dot         = "002E";
    my $doublegrave = "0060 0060";

    # Applied one after the other, in this order. 00B0, 00B8, 0306, 0309,
    # 030A and 030C have no stand-in and are deliberately left alone, as is
    # the rest of the 03xx range.
    my @folds = (
        [ "00A8", $umlaut ],
        [ "00AF", $overscore ],
        [ "00B4", $acute ],
        [ "00B7", $dot ],
        [ "0300", $grave ],
        [ "0301", $acute ],
        [ "0302", $hat ],
        [ "0303", $tilde ],
        [ "0304", $overscore ],
        [ "0305", $overscore ],
        [ "0307", $dot ],
        [ "0308", $umlaut ],
        [ "030B", $doubleacute ],
        [ "030D", $acute ],
        [ "030E", $doubleacute ],
        [ "030F", $doublegrave ],
    );

    for my $fold (@folds) {
        my ($from, $to) = @$fold;
        $dec =~ s/$from/$to/g;
    }

    return $dec;
}
# Expand one code point through the decomposition table, recursively.
#
# $dec    - a code point as a hex string (or an already expanded string).
# Returns - $dec itself when the table has no entry for it; otherwise the
#           table entry with its <tag> removed and its accents folded by
#           foldcombining(), expanded again in case that result is a key too.
sub rdecompose
{
    my ($dec) = @_;
    return $dec unless exists $table{$dec};

    my $expanded = $table{$dec};
    $expanded =~ s/<[a-zA-Z]*>//g;
    $expanded = foldcombining($expanded);

    # Recurse on the expansion itself: a multi-code-point expansion is never a
    # table key, so it comes back unchanged, while a single code point may be
    # expanded further.
    return $expanded if $expanded eq $dec;
    return rdecompose($expanded);
}

# Turn a decomposition field into replacement text.
#
# $removeprefix - formatting tag to strip, e.g. "<wide>"; "" for none. A space
#                 is appended and every match of the result is removed, so
#                 with "" all spaces disappear.
# $dec          - decomposition: optional tag plus hex code points.
# Returns       - "\u0020" / "\u005C" for a lone space / backslash code point;
#                 otherwise the code points are expanded through rdecompose(),
#                 written as \uXXXX, and those below 0080 replaced by the
#                 character itself, with all spaces removed at the end.
sub decompose
{
    my ($removeprefix, $dec) = @_;
    $removeprefix .= " ";
    $dec =~ s/$removeprefix//g;

    # A lone space or backslash must stay escaped in a properties file.
    return "\\u0020" if $dec eq "0020";
    return "\\u005C" if $dec eq "005C";

    $dec =~ s{2044}{/}g;    # 2044 is written as a plain slash
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
    $dec =~ s/ //g;

    return $dec;
}

######################################################################
#
# Open the unicode database file
#
######################################################################
# The file is read twice (table pass, then output pass), hence two handles.
open(UNICODATA, "<", "UnicodeData-Latest.txt")
    || die "cannot find UnicodeData-Latest.txt";

open(UNICODATA2, "<", "UnicodeData-Latest.txt")
    || die "cannot find UnicodeData-Latest.txt";
######################################################################
#
# Open the output file
#
######################################################################
open(OUT, ">", "../tables/transliterate.properties")
    || die "cannot open output ../tables/transliterate.properties file";

print OUT $header;

# remove comments from $handcoded
$handcoded =~ s/^#[^#].*\n//mg;
print OUT $handcoded;

######################################################################
#
# Process the file line by line
#
######################################################################
# First pass: remember the decomposition of every character above 256 that
# has one, keyed by its hex code point, for rdecompose() to consult.
# NOTE(review): the cut-off is "> 256", so U+0100 itself is left out - confirm
# whether ">= 256" was intended.
while (my $line = <UNICODATA2>) {
    chomp $line;
    my @fields = split(/;/, $line);
    # The code point of *this* line decides; it is field 0.
    my $udec = hex($fields[0]);
    if (($udec > 256) && ($fields[5] ne "")) {
        $table{$fields[0]} = $fields[5];
    }
}
# Second pass over the Unicode data: emit a transliteration for every
# character above 128 for which one can be derived.
#
# Formatting tags of a tagged decomposition, tested in this order; the first
# one found in the decomposition field decides. A handler receives the
# decomposition field and returns the text to emit, or undef to emit nothing.
# Tags listed without a handler are ignored on purpose.
my @tag_rules = (
    [ qr/<wide>/,     sub { decompose("<wide>", $_[0]) } ],
    [ qr/<narrow>/,   undef ],    # ignore non ASCII decomposition
    [ qr/<circle>/,   sub { decompose("<circle>", "(" . $_[0] . ")") } ],
    [ qr/<fraction>/, sub { decompose("<fraction>", $_[0]) } ],
    [ qr/<small>/,    sub { decompose("<small>", $_[0]) } ],
    [ qr/<vertical>/, undef ],
    [ qr/<super>/,    sub { "^(" . decompose("<super>", $_[0]) . ")" } ],
    [ qr/<sub>/,      sub { "v(" . decompose("<sub>", $_[0]) . ")" } ],
    [ qr/<font>/,     sub { decompose("<font>", $_[0]) } ],
    [ qr/<square>/,   undef ],
    [ qr/<compat>/,   sub { decompose("<compat>", $_[0]) } ],
    [ qr/<isolated>/, undef ],
    [ qr/<medial>/,   undef ],
    [ qr/<final>/,    undef ],
    [ qr/<initial>/,  undef ],
    # Only the no-break form of a plain space is transliterated.
    [ qr/<noBreak>/,  sub { $_[0] eq "<noBreak> 0020" ? "\\u0020" : undef } ],
);

while (my $line = <UNICODATA>) {
    chomp $line;

    ######################################################################
    #
    # Get value from fields
    #
    ######################################################################
    my @fields = split(/;/, $line);
    my $u   = $fields[0];    # The unicode value
    my $cmt = $fields[1];    # The comment
    my $dec = $fields[5];    # The decomposed value
    my ($d1, $d2, $d3) = @fields[6, 7, 8];    # the three numeric-value fields
    my $udec = hex($u);

    # Only characters above 128 are handled (128 itself is skipped as well).
    next unless $udec > 128;

    if ($dec ne "") {
        # have decomposition
        if ($dec =~ /</) {
            # formatted decomposition: dispatch on the tag
            my $known = 0;
            for my $rule (@tag_rules) {
                my ($tag, $handler) = @$rule;
                next unless $dec =~ $tag;
                $known = 1;
                my $str = $handler ? $handler->($dec) : undef;
                output($u, $cmt, $udec, $str) if defined $str;
                last;
            }
            # A tag that is not in the list gets reported.
            warning($line) unless $known;
        }
        elsif ($cmt =~ /LATIN/) {
            # decomposition without format code
            output($u, $cmt, $udec, decompose("", foldcombining($dec)));
        }
        # Untagged decompositions of every other script (CYRILLIC, GREEK,
        # ARABIC, CJK, HEBREW, the Indic scripts, TIBETAN, MYANMAR, KATAKANA,
        # HIRAGANA, ...) are ignored.
        next;
    }

    # do not have decomposition
    if ($d1 ne "") {
        # are numeric characters
        output($u, $cmt, $udec, $d1);
    }
    elsif ($d2 ne "" || $d3 ne "") {
        # Second numeric field wins over the third; circled characters get
        # ( ), all others [ ].
        my $value = $d2 ne "" ? $d2 : $d3;
        my ($open, $close) = $cmt =~ /CIRCLED/ ? ("(", ")") : ("[", "]");
        output($u, $cmt, $udec, $open . $value . $close);
    }
    # not numeric characters: nothing to emit
}
######################################################################
#
# Close files
#
######################################################################
# Both read handles on the Unicode data file are closed, under the names they
# were opened with.
close(UNICODATA);
close(UNICODATA2);
# Buffered write errors only surface when the handle is closed, so check it.
close(OUT)
    || die "cannot close output ../tables/transliterate.properties file: $!";