Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / intl / unicharutil / tools / gentransliterate.pl
blob2b15481f920730a57cb9c453e1556674bd9fc9a4
1 #!/usr/bin/perl
3 # ***** BEGIN LICENSE BLOCK *****
4 # Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 # The contents of this file are subject to the Mozilla Public License Version
7 # 1.1 (the "License"); you may not use this file except in compliance with
8 # the License. You may obtain a copy of the License at
9 # http://www.mozilla.org/MPL/
11 # Software distributed under the License is distributed on an "AS IS" basis,
12 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 # for the specific language governing rights and limitations under the
14 # License.
16 # The Original Code is mozilla.org code.
18 # The Initial Developer of the Original Code is
19 # Netscape Communications Corporation.
20 # Portions created by the Initial Developer are Copyright (C) 1999
21 # the Initial Developer. All Rights Reserved.
23 # Contributor(s):
25 # Alternatively, the contents of this file may be used under the terms of
26 # either the GNU General Public License Version 2 or later (the "GPL"), or
27 # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 # in which case the provisions of the GPL or the LGPL are applicable instead
29 # of those above. If you wish to allow use of your version of this file only
30 # under the terms of either the GPL or the LGPL, and not to allow others to
31 # use your version of this file under the terms of the MPL, indicate your
32 # decision by deleting the provisions above and replace them with the notice
33 # and other provisions required by the GPL or the LGPL. If you do not delete
34 # the provisions above, a recipient may use your version of this file under
35 # the terms of any one of the MPL, the GPL or the LGPL.
37 # ***** END LICENSE BLOCK *****
# $header holds the fixed preamble of the generated properties file. It is
# written to OUT verbatim before anything else: the licence block, the
# "generated file, do not edit" notice, the entity list name and the single
# entity.169 mapping.
39 $header = <<END_OF_HEADER;
40 # ***** BEGIN LICENSE BLOCK *****
41 # Version: MPL 1.1/GPL 2.0/LGPL 2.1
43 # The contents of this file are subject to the Mozilla Public License Version
44 # 1.1 (the "License"); you may not use this file except in compliance with
45 # the License. You may obtain a copy of the License at
46 # http://www.mozilla.org/MPL/
48 # Software distributed under the License is distributed on an "AS IS" basis,
49 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
50 # for the specific language governing rights and limitations under the
51 # License.
53 # The Original Code is mozilla.org code.
55 # The Initial Developer of the Original Code is
56 # Netscape Communications Corporation.
57 # Portions created by the Initial Developer are Copyright (C) 1999
58 # the Initial Developer. All Rights Reserved.
60 # Contributor(s):
62 # Alternatively, the contents of this file may be used under the terms of
63 # either the GNU General Public License Version 2 or later (the "GPL"), or
64 # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
65 # in which case the provisions of the GPL or the LGPL are applicable instead
66 # of those above. If you wish to allow use of your version of this file only
67 # under the terms of either the GPL or the LGPL, and not to allow others to
68 # use your version of this file under the terms of the MPL, indicate your
69 # decision by deleting the provisions above and replace them with the notice
70 # and other provisions required by the GPL or the LGPL. If you do not delete
71 # the provisions above, a recipient may use your version of this file under
72 # the terms of any one of the MPL, the GPL or the LGPL.
74 # ***** END LICENSE BLOCK *****
77 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
78 # PLEASE DO NOT MODIFY THIS FILE BY HAND
80 entity.list.name=transliterate
81 entity.169=(c)
84 # Here are the windows-1252 characters from the range 0x80 - 0x9F
86 END_OF_HEADER
# $handcoded holds the hand-maintained "entity.<decimal code point>=<ASCII>"
# mappings that are written to OUT right after $header. Before it is printed,
# the script strips the lines that start with a single '#', so the per-entry
# comments below are for maintainers only; lines that start with '##' are kept
# in the generated file. The heredoc interpolates, so "\\u0020" is emitted as
# "\u0020".
88 $handcoded = <<END_OF_HANDCODED;
89 # EURO SIGN
90 entity.8364=EUR
91 # SINGLE LOW-9 QUOTATION MARK
92 entity.8218=,
93 # LATIN SMALL LETTER F WITH HOOK
94 entity.402=f
95 # DOUBLE LOW-9 QUOTATION MARK
96 entity.8222="
97 # DAGGER
98 entity.8224=+
99 # DOUBLE DAGGER
100 entity.8225=++
101 # MODIFIER LETTER CIRCUMFLEX ACCENT
102 entity.710=^
103 # PER MILLE SIGN
104 entity.8240=0/00
105 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
106 entity.8249=<
107 # LATIN CAPITAL LIGATURE OE
108 entity.338=OE
109 # LEFT SINGLE QUOTATION MARK
110 entity.8216='
111 # RIGHT SINGLE QUOTATION MARK
112 entity.8217='
113 # LEFT DOUBLE QUOTATION MARK
114 entity.8220="
115 # RIGHT DOUBLE QUOTATION MARK
116 entity.8221="
117 # BULLET
118 entity.8226=.
119 # EN DASH
120 entity.8211=--
121 # EM DASH
122 entity.8212=---
123 # SMALL TILDE
124 entity.732=~
125 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
126 entity.8250=>
127 # LATIN SMALL LIGATURE OE
128 entity.339=oe
129 # U+2000 EN QUAD
130 entity.8192=\\u0020
131 # U+2001 EM QUAD
132 entity.8193=\\u0020
133 # U+2010 HYPHEN
134 entity.8208=-
135 # U+2011 NON-BREAKING HYPHEN
136 entity.8209=-
137 # U+2012 FIGURE DASH
138 entity.8210=-
139 # U+2015 HORIZONTAL BAR
140 entity.8213=--
141 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
142 entity.8203=
143 # U+2061, ApplyFunction, character showing function application in presentation tagging
144 entity.8289=
145 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
146 entity.8290=
147 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals
148 entity.8518=d
149 # U+2212, MINUS SIGN, official Unicode minus sign
150 entity.8722=-
151 # Hebrew punctuation
152 # U+05BE HEBREW PUNCTUATION MAQAF
153 entity.1470=-
154 # U+05C0 HEBREW PUNCTUATION PASEQ
155 entity.1472=|
156 # U+05C3 HEBREW PUNCTUATION SOF PASUQ
157 entity.1475=:
158 # U+05F3 HEBREW PUNCTUATION GERESH
159 entity.1523='
160 # U+05F4 HEBREW PUNCTUATION GERSHAYIM
161 entity.1524="
163 ## End of hand coded section
164 ## Below are generated from the unicode character database
166 END_OF_HANDCODED
# Decomposition lookup table, keyed by the hexadecimal code point string and
# holding that character's decomposition field. Every reader and writer in
# this script uses the hash %table, so that is the variable to reset here
# (the previous "@table = ()" initialised an unrelated, unused array).
%table = ();
# Build an ASCII approximation of a Latin letter from its Unicode character
# name (the "comment" column of the database).
#
#   $cmt     character name, e.g. "LATIN SMALL LETTER A WITH GRAVE"
#   returns  the base letter(s) followed by one ASCII mark per accent word
#            found in the name, e.g. "a`"; "" when nothing is recognised
#
# The result is assembled in this order: an apostrophe for "PRECEDED BY
# APOSTROPHE", the letters after "CAPITAL LETTER", the lower-cased letters
# after "SMALL LETTER", then the marks in the order their words appear.
sub FromLatinComment
{
   my ($cmt) = (@_);

   # Accent word => ASCII stand-in. Words that are not listed here (LATIN,
   # WITH, ABOVE, single letters, ...) contribute nothing.
   my %mark_for = (
      "DOT"        => ".",
      "DIAERESIS"  => "\"",
      "BREVE"      => "(",
      "ACUTE"      => "\'",
      "GRAVE"      => "`",
      "TILDE"      => "~",
      "CARON"      => "(",
      "HOOK"       => "?",
      "CEDILLA"    => ",",
      "MACRON"     => "-",
      "CIRCUMFLEX" => "^",
      "RING"       => "*",
      "OGONEK"     => ";",
      "LINE"       => "_",
      "COMMA"      => ",",
      "STROKE"     => "/",
      "HORN"       => "+",
   );

   # All working variables are lexical: the earlier version wrote to the
   # package variables $char, @f and $item, and @f is also the field array
   # of the main loop.
   my $char = "";
   if($cmt =~ /PRECEDED BY APOSTROPHE/) {
      $char = "\'";
   }
   if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
      $char .= $1;
   }
   if($cmt =~ /SMALL LETTER ([A-Z]*)/) {
      $char .= lc($1);
   }

   # Visit every word. A foreach is used instead of "while ($item = shift ...)"
   # because that form stopped at the first empty or "0" word and silently
   # dropped all marks after it.
   foreach my $item (split(/ / , $cmt)) {
      $char .= $mark_for{$item} if exists $mark_for{$item};
   }

   return $char;
}
# Report a problem on standard output.
#   $message  text to show after the "WARNING: " prefix
sub warning
{
   my ($message) = @_;
   print "WARNING: " . $message . " \n";
}
# Debugging twin of output(): prints one "entity.<decimal>=<string>" line to
# standard output instead of the OUT file, without any ASCII check.
#   $u     code point as a hex string (unused; kept for a parallel signature)
#   $cmt   character name (unused unless the debug comment is re-enabled)
#   $udec  code point as a decimal number
#   $str   replacement text
sub doutput
{
   my ($u, $cmt, $udec, $str) = @_;
   # The "# U+<hex> <name>" comment line is intentionally not printed;
   # it was for debugging purposes only.
   print join("", "entity.", $udec, "=", $str, "\n");
}
# Write one mapping line to the OUT file, provided the replacement is pure
# ASCII.
#   $u     code point as a hex string
#   $cmt   character name
#   $udec  code point as a decimal number (used in the entity key)
#   $str   proposed replacement text, possibly containing \uXXXX escapes
#
# When $str still contains a non-ASCII \uXXXX escape it is not written.
# For names containing LATIN and LETTER but not LONG, a replacement derived
# from the name by FromLatinComment() is tried instead; every other
# character is dropped silently.
sub output
{
   my ($u, $cmt, $udec, $str) = @_;

   unless (decomposeIntoNonASCII($str)) {
      # don't print out comments - for debugging purposes only
      # print OUT "# U+$u $cmt\n";
      print OUT "entity.$udec=$str\n";
      return;
   }

   my $is_latin_letter = ($cmt =~ "LATIN") && ($cmt =~ "LETTER") && !($cmt =~ "LONG");
   return unless $is_latin_letter;

   output($u, $cmt, $udec, FromLatinComment($cmt));
}
# Tell whether a replacement string still contains a \uXXXX escape (upper-case
# hex digits) whose value is 0x0080 or above, i.e. a non-ASCII character.
#   $dec     replacement text
#   returns  the result of the pattern match: true/false in scalar context
sub decomposeIntoNonASCII
{
   my ($text) = @_;
   return $text =~ m{
      \\u
      (
          [1-9A-F] [0-9A-F] [0-9A-F]     # first digit non-zero
        | [0-9A-F] [1-9A-F] [0-9A-F]     # second digit non-zero
        | 00 [8-9A-F]                    # 0080 .. 00FF
      )
      [0-9A-F]
   }x;
}
# Replace the code points of accent characters in a decomposition string
# with the code points of their closest ASCII look-alikes.
#
#   $dec     space-separated hexadecimal code points, e.g. "0041 0300"
#   returns  the same string with every listed accent replaced,
#            e.g. "0041 0060"; unlisted code points are left alone
#
# Compared with the earlier chain of substitutions this version
#   * keeps all of its working data lexical (nine package globals used to be
#     set on every call),
#   * matches whole four-digit tokens only, so the tail of a longer code
#     point such as "10300" is no longer rewritten.
sub foldcombining
{
   my ($dec) = (@_);

   my $grave       = "0060";
   my $acute       = "0027";
   my $hat         = "005E";
   my $tilde       = "007E";
   my $overscore   = "002D"; ## should be 00AF but we can only handle ASCII now
   my $umlaut      = "0022"; ## should be 00A8 but we can only handle ASCII now
   my $doubleacute = "0022";
   my $dot         = "002E";
   my $doublegrave = "0060 0060";

   # 00B0, 00B8, 0306, 0309, 030A and 030C have no ASCII stand-in here and
   # are deliberately passed through unchanged.
   my %ascii_for = (
      "00A8" => $umlaut,
      "00AF" => $overscore,
      "00B4" => $acute,
      "00B7" => $dot,
      "0300" => $grave,
      "0301" => $acute,
      "0302" => $hat,
      "0303" => $tilde,
      "0304" => $overscore,
      "0305" => $overscore,
      "0307" => $dot,
      "0308" => $umlaut,
      "030B" => $doubleacute,
      "030D" => $acute,
      "030E" => $doubleacute,
      "030F" => $doublegrave,
   );

   # None of the replacement values is itself a key, so a single pass gives
   # the same result as applying the mappings one after another.
   $dec =~ s/\b([0-9A-F]{4})\b/exists $ascii_for{$1} ? $ascii_for{$1} : $1/eg;

   return $dec;
}
# Recursively expand one code point through the decomposition table.
#
#   $dec     a code point as a hexadecimal string, e.g. "01D5"
#   returns  $dec itself when %table has no entry for it; otherwise the
#            entry with its <tag> removed, accents folded to ASCII by
#            foldcombining(), and every code point in it expanded again
#
# The earlier version looked the *whole decomposition string* up in %table
# ("$table{$t}"), which is never a key, so any code point that was present
# in the table came back as undef. Each code point of the expansion is now
# looked up on its own.
sub rdecompose
{
   my ($dec) = (@_);
   return $dec unless defined($dec) && exists $table{$dec};

   my $expansion = $table{$dec};
   $expansion =~ s/<[a-zA-Z]*>//g;          # drop a formatting tag such as <compat>
   $expansion = foldcombining($expansion);
   # Terminates as long as the table has no cyclic decompositions.
   $expansion =~ s/\b([0-9A-F]{4,6})\b/rdecompose($1)/eg;
   return $expansion;
}
# Turn a decomposition field into the replacement text for the properties
# file.
#
#   $removeprefix  formatting tag to strip, e.g. "<wide>"; "" for none
#   $dec           decomposition field, e.g. "<wide> 0041"
#   returns        the replacement text: ASCII characters written literally,
#                  anything else left as a \uXXXX escape
#
# A lone SPACE or BACKSLASH is returned in escaped form ("\u0020",
# "\u005C"). Otherwise 2044 becomes "/", each four-digit code point is
# expanded through rdecompose(), escapes in the range 00-7F are converted
# to the character itself, and all blanks are removed.
#
# Changes from the earlier version: the prefix is matched literally
# (\Q...\E) instead of being interpreted as a regular expression, the
# package global $k is gone, and the /e flags that evaluated constant
# replacements were dropped.
sub decompose
{
   my ($removeprefix, $dec) = (@_);
   $removeprefix .= " ";

   # With an empty prefix this removes every blank, as it always did.
   $dec =~ s/\Q$removeprefix\E//g;

   return "\\u0020" if $dec eq "0020";
   return "\\u005C" if $dec eq "005C";

   $dec =~ s/2044/\//g;
   $dec =~ s/([0-9A-F]{4})/rdecompose($1)/eg;
   $dec =~ s/([0-9A-F]{4})/\\u$1/g;
   $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
   $dec =~ s/ //g;

   return $dec;
}
######################################################################
#
# Open the unicode database file
#
######################################################################
# The database is read twice, so it is opened on two handles: UNICODATA2
# feeds the table-building pass and UNICODATA the main pass. The
# three-argument form of open keeps the mode separate from the file name,
# and $! tells the user why an open failed.
open ( UNICODATA , "<", "UnicodeData-Latest.txt")
   || die "cannot find UnicodeData-Latest.txt: $!";

open ( UNICODATA2 , "<", "UnicodeData-Latest.txt")
   || die "cannot find UnicodeData-Latest.txt: $!";
######################################################################
#
# Open the output file
#
######################################################################
open ( OUT , ">", "../tables/transliterate.properties")
   || die "cannot open output ../tables/transliterate.properties file: $!";

print OUT $header;

# remove comments from $handcoded: lines starting with a single '#' are
# dropped, lines starting with '##' are kept. The look-ahead replaces the
# former "[^#]", which could match the newline of a bare "#" line and then
# swallow the following line as well.
$handcoded =~ s/^#(?!#).*\n//mg;
print OUT $handcoded;
######################################################################
#
# Process the file line by line
#
######################################################################
# First pass over the database: record the decomposition field (column 5)
# of every character above 256 in %table, keyed by the hex code point in
# column 0.
#
# NOTE(review): $u is only assigned in the second pass, so hex($u) is 0
# here, the test below never succeeds and %table stays empty. The intended
# expression is presumably hex($f[0]). It is left untouched on purpose:
# filling the table changes what rdecompose() returns and therefore the
# generated file, so rdecompose() must be verified before this is corrected.
while(<UNICODATA2>) {
   chop;
   @f = split(/;/ , $_);
   $udec = hex($u);
   next unless $udec > 256;
   next if $f[5] eq "";
   $table{$f[0]} = $f[5];
}
# Second pass over the database: emit one mapping per non-ASCII character
# that can be approximated in ASCII.
#
# Changes from the earlier version: chomp instead of chop (chop would eat a
# data character on a final line without a newline), the ASCII boundary is
# 127 rather than 128 so that it matches the "not ASCII" intent, the
# per-record variables are lexical, and the long runs of empty "ignore"
# branches are expressed as data.

# Formatting tags in the order they are tested; the first one found in the
# decomposition field decides what happens.
my @known_tags = qw(wide narrow circle fraction small vertical super sub
                    font square compat isolated medial final initial noBreak);
# Tags whose decomposition is used as it is.
my %plain_tag = map { $_ => 1 } qw(wide fraction small font compat);
# narrow, vertical, square, isolated, medial, final and initial are
# recognised but produce no output.

while(<UNICODATA>) {
   chomp;
   ######################################################################
   #
   # Get value from fields
   #
   ######################################################################
   my @f    = split(/;/ , $_);
   my $u    = $f[0];   # The unicode value
   my $cmt  = $f[1];   # The comment
   my $dec  = $f[5];   # The decomposed value
   # Columns 6-8 are treated as numeric values of the character
   # (presumably decimal digit, digit and numeric value -- confirm against
   # the database format).
   my $d1   = $f[6];
   my $d2   = $f[7];
   my $d3   = $f[8];
   my $udec = hex($u);

   # ASCII characters need no transliteration.
   next unless $udec > 127;

   if($dec ne "") {
      # have decomposition
      if($dec =~ /</) {
         # formated decomposition
         my ($tag) = grep { index($dec, "<$_>") >= 0 } @known_tags;

         if(!defined $tag) {
            # a '<' without any tag known to this script
            warning($_);
         } elsif($plain_tag{$tag}) {
            output($u,$cmt,$udec,decompose("<$tag>", $dec));
         } elsif($tag eq "circle") {
            output($u,$cmt,$udec,decompose("<circle>", "(".$dec.")"));
         } elsif($tag eq "super") {
            output($u,$cmt,$udec,"^(".decompose("<super>", $dec).")");
         } elsif($tag eq "sub") {
            output($u,$cmt,$udec,"v(".decompose("<sub>", $dec).")");
         } elsif($tag eq "noBreak") {
            # only a no-break SPACE is mapped; other no-break forms are ignored
            if($dec eq "<noBreak> 0020") {
               output($u,$cmt,$udec,"\\u0020");
            }
         }
         # every other known tag: ignore
      } elsif($cmt =~ /LATIN/) {
         # decomposition without format code; only Latin characters are
         # handled, all other scripts are ignored
         $dec = foldcombining($dec);
         output($u,$cmt,$udec,decompose("", $dec));
      }
   } else {
      # do not have decomposition
      if($d1 ne "") {
         # are numeric characters
         output($u,$cmt,$udec,$d1);
      } elsif(($d2 ne "") || ($d3 ne "")) {
         my $value = ($d2 ne "") ? $d2 : $d3;
         # circled characters use ( ), others use [ ]
         my ($open, $close) = ($cmt =~ /CIRCLED/) ? ("(", ")") : ("[", "]");
         output($u,$cmt,$udec,$open.$value.$close);
      }
      # else: not numeric characters, nothing to emit
   } # end of have/not decomposition
}
######################################################################
#
# Close files
#
######################################################################
# Close the handles that were actually opened (the former "UNIDATA" never
# existed). The close of the output file is checked because buffered write
# errors only show up at this point.
close(UNICODATA);
close(UNICODATA2);
close(OUT)
   || die "cannot close output ../tables/transliterate.properties file: $!";