intl/unicharutil/tools/gentransliterate.pl

   1 #!/usr/bin/perl
   2 #
   3 # This Source Code Form is subject to the terms of the Mozilla Public
   4 # License, v. 2.0. If a copy of the MPL was not distributed with this
   5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
   6
   7 $header = <<END_OF_HEADER;
   8 # This Source Code Form is subject to the terms of the Mozilla Public
   9 # License, v. 2.0. If a copy of the MPL was not distributed with this
  10 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  11
  12 #
  13 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
  14 # PLEASE DO NOT MODIFY THIS FILE BY HAND
  15 #
  16 entity.list.name=transliterate
  17 entity.169=(c)
  18 #
  19 #
  20 # Here are the windows-1252 characters from the range 0x80 - 0x9F
  21 #
  22 END_OF_HEADER
  23
  24 $handcoded = <<END_OF_HANDCODED;
  25 # EURO SIGN
  26 entity.8364=EUR
  27 # SINGLE LOW-9 QUOTATION MARK
  28 entity.8218=,
  29 # LATIN SMALL LETTER F WITH HOOK
  30 entity.402=f
  31 # DOUBLE LOW-9 QUOTATION MARK
  32 entity.8222="
  33 # DAGGER
  34 entity.8224=+
  35 # DOUBLE DAGGER
  36 entity.8225=++
  37 # MODIFIER LETTER CIRCUMFLEX ACCENT
  38 entity.710=^
  39 # PER MILLE SIGN
  40 entity.8240=0/00
  41 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  42 entity.8249=<
  43 # LATIN CAPITAL LIGATURE OE
  44 entity.338=OE
  45 # LEFT SINGLE QUOTATION MARK
  46 entity.8216='
  47 # RIGHT SINGLE QUOTATION MARK
  48 entity.8217='
  49 # LEFT DOUBLE QUOTATION MARK
  50 entity.8220="
  51 # RIGHT DOUBLE QUOTATION MARK
  52 entity.8221="
  53 # BULLET
  54 entity.8226=.
  55 # EN DASH
  56 entity.8211=--
  57 # EM DASH
  58 entity.8212=---
  59 # SMALL TILDE
  60 entity.732=~
  61 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  62 entity.8250=>
  63 # LATIN SMALL LIGATURE OE
  64 entity.339=oe
  65 # U+2000 EN QUAD
  66 entity.8192=\\u0020
  67 # U+2001 EM QUAD
  68 entity.8193=\\u0020
  69 # U+2010 HYPHEN
  70 entity.8208=-
  71 # U+2011 NON-BREAKING HYPHEN
  72 entity.8209=-
  73 # U+2012 FIGURE DASH
  74 entity.8210=-
  75 # U+2015 HORIZONTAL BAR
  76 entity.8213=--
  77 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
  78 entity.8203=
  79 # U+2061, ApplyFunction, character showing function application in presentation tagging
  80 entity.8289=
  81 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
  82 entity.8290=
  83 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals
  84 entity.8518=d
  85 # U+2212, MINUS SIGN, official Unicode minus sign
  86 entity.8722=-
  87 # Hebrew punctuation
  88 # U+05BE HEBREW PUNCTUATION MAQAF
  89 entity.1470=-
  90 # U+05C0 HEBREW PUNCTUATION PASEQ
  91 entity.1472=|
  92 # U+05C3 HEBREW PUNCTUATION SOF PASUQ
  93 entity.1475=:
  94 # U+05F3 HEBREW PUNCTUATION GERESH
  95 entity.1523='
  96 # U+05F4 HEBREW PUNCTUATION GERSHAYIM
  97 entity.1524="
  98 ##
  99 ## End of hand coded section
 100 ## Below are generated from the unicode character database
 101 ##
 102 END_OF_HANDCODED
 103
 104 @table = ();
 105 sub FromLatinComment
 106 {
 107   my ($cmt) = (@_);
 108   $char = "";
 109   if($cmt =~ /PRECEDED BY APOSTROPHE/) {
 110       $char = "\'";
 111   }
 112   if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
 113       $char = $char . $1;
 114   }
 115   if($cmt =~ /SMALL LETTER ([A-Z]*)/) {
 116       $char = $char . lc($1);
 117   }
 118   @f = split(/ / , $cmt);
 119   while($item = shift @f) {
 120      if($item eq "DOT") {
 121        $char .= ".";
 122      } elsif ($item eq "DIAERESIS") {
 123        $char .= "\"";
 124      } elsif ($item eq "BREVE") {
 125        $char .= "(";
 126      } elsif ($item eq "ACUTE") {
 127        $char .= "\'";
 128      } elsif ($item eq "GRAVE") {
 129        $char .= "`";
 130      } elsif ($item eq "TILDE") {
 131        $char .= "~";
 132      } elsif ($item eq "CARON") {
 133        $char .= "(";
 134      } elsif ($item eq "HOOK") {
 135        $char .= "?";
 136      } elsif ($item eq "CEDILLA") {
 137        $char .= ",";
 138      } elsif ($item eq "MACRON") {
 139        $char .= "-";
 140      } elsif ($item eq "CIRCUMFLEX") {
 141        $char .= "^";
 142      } elsif ($item eq "RING") {
 143        $char .= "*";
 144      } elsif ($item eq "OGONEK") {
 145        $char .= ";";
 146      } elsif ($item eq "LINE") {
 147        $char .= "_";
 148      } elsif ($item eq "COMMA") {
 149        $char .= ",";
 150      } elsif ($item eq "STROKE") {
 151        $char .= "/";
 152      } elsif ($item eq "HORN") {
 153        $char .= "+";
 154      } elsif ($item =~ /^(LATIN|CAPITAL|SMALL|LETTER|WITH|ABOVE|BELOW|INVERTED|MIDDLE|AND|BY|APOSTROPHE|[A-Z])$/) {
 155        # ignore
 156      } else {
 157        #print "AAAA $item\n";
 158      }
 159   }
 160
 161   return $char;
 162 }
 163 sub warning
 164 {
 165   my ($warning) = (@_);
 166   print "WARNING: $warning \n";
 167 }
 168 sub doutput
 169 {
 170   my ($u, $cmt, $udec, $str) = (@_);
 171   # don't print out comments - for debugging purposes only
 172   # print "# U+$u $cmt\n";
 173   print "entity.$udec=$str\n";
 174 }
 175 sub output
 176 {
 177   my ($u, $cmt, $udec, $str) = (@_);
 178   if(decomposeIntoNonASCII($str)) {
 179     if(($cmt =~ "LATIN")  && ($cmt =~ "LETTER") && !($cmt =~ "LONG")) {
 180        $str = FromLatinComment($cmt);
 181        output($u,$cmt,$udec,$str);
 182     }
 183   } else {
 184     # don't print out comments - for debugging purposes only
 185     # print OUT "# U+$u $cmt\n";
 186     print OUT "entity.$udec=$str\n";
 187   }
 188 }
 189
 190 sub decomposeIntoNonASCII
 191 {
 192   my ($dec) = (@_);
 193   return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;
 194 }
 195
 196 sub foldcombining
 197 {
 198   my ($dec) = (@_);
 199   $grave = "0060";
 200   $acute = "0027";
 201   $hat = "005E";
 202   $hat = "005E";
 203   $tilde = "007E";
 204   $overscore = "002D"; ## should be 00AF but we can only handle ASCII now
 205   $umlaut = "0022"; ## should be 00A8 but we can only handle ASCII now
 206   $doubleacute = "0022";
 207   $dot = "002E";
 208   $doublegrave = "0060 0060";
 209
 210
 211   $dec =~ s/00A8/$umlaut/eg;
 212   $dec =~ s/00AF/$overscore/eg;
 213  # $dec =~ s/00B0//eg;
 214   $dec =~ s/00B4/$acute/eg;
 215   $dec =~ s/00B7/$dot/eg;
 216  # $dec =~ s/00B8//eg;
 217   $dec =~ s/0300/$grave/eg;
 218   $dec =~ s/0301/$acute/eg;
 219   $dec =~ s/0302/$hat/eg;
 220   $dec =~ s/0303/$tilde/eg;
 221   $dec =~ s/0304/$overscore/eg;
 222   $dec =~ s/0305/$overscore/eg;
 223  #$dec =~ s/0306/?/eg;
 224   $dec =~ s/0307/$dot/eg;
 225   $dec =~ s/0308/$umlaut/eg;
 226  #$dec =~ s/0309/?/eg;
 227  #$dec =~ s/030A/?/eg;
 228   $dec =~ s/030B/$doubleacute/eg;
 229  #$dec =~ s/030C/?/eg;
 230   $dec =~ s/030D/$acute/eg;
 231   $dec =~ s/030E/$doubleacute/eg;
 232   $dec =~ s/030F/$doublegrave/eg;
 233
 234  # $dec =~ s/03[0-9A-F][0-9A-F]//eg; ## drop others
 235   return $dec;
 236 }
 237 sub rdecompose
 238 {
 239   my ($dec) = (@_);
 240   if(exists $table{$dec}) {
 241     $t = $table{$dec};
 242     $t =~ s/<[a-zA-Z]*>//eg;
 243     $t = foldcombining($t);
 244     return rdecompose( $table{$t});
 245   }
 246   return $dec;
 247 }
 248 sub decompose
 249 {
 250   my ($removeprefix, $dec) = (@_);
 251   $removeprefix .= " ";
 252
 253   $dec =~ s/$removeprefix//eg;
 254   if($dec eq "0020") {
 255    $dec = "\\u0020";
 256   } elsif($dec eq "005C") {
 257    $dec = "\\u005C";
 258   } else {
 259    $k = "\/";
 260    $dec =~ s/2044/$k/eg;
 261    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
 262    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
 263    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
 264    $dec =~ s/ //eg;
 265   }
 266   return $dec;
 267 }
 268
 269 ######################################################################
 270 #
 271 # Open the unicode database file
 272 #
 273 ######################################################################
 274 open ( UNICODATA , "< UnicodeData-Latest.txt")
 275    || die "cannot find UnicodeData-Latest.txt";
 276
 277 open ( UNICODATA2 , "< UnicodeData-Latest.txt")
 278    || die "cannot find UnicodeData-Latest.txt";
 279 ######################################################################
 280 #
 281 # Open the output file
 282 #
 283 ######################################################################
 284 open ( OUT , "> ../tables/transliterate.properties")
 285   || die "cannot open output ../tables/transliterate.properties file";
 286
 287 print OUT $header;
 288
 289 # remove comments from $handcoded
 290 $handcoded =~ s/^#[^#].*\n//mg;
 291 print OUT $handcoded;
 292
 293 ######################################################################
 294 #
 295 # Process the file line by line
 296 #
 297 ######################################################################
 298 while(<UNICODATA2>) {
 299    chop;
 300    @f = split(/;/ , $_);
 301    $udec = hex($u);
 302    if(($udec > 256 ) && ($f[5] ne "")) {
 303      $table{$f[0]}=$f[5];
 304    }
 305 }
 306 while(<UNICODATA>) {
 307    chop;
 308    ######################################################################
 309    #
 310    # Get value from fields
 311    #
 312    ######################################################################
 313    @f = split(/;/ , $_);
 314    $u = $f[0];    # The unicode value
 315    $cmt = $f[1];  # The comment
 316    $dec = $f[5];  # The decomposed value
 317    $d1 = $f[6];
 318    $d2 = $f[7];
 319    $d3 = $f[8];
 320    $udec = hex($u);
 321
 322    if($udec > 128)
 323    {
 324      # not ASCII
 325      if($dec ne "")
 326      {
 327        # have decomposition
 328        if($dec =~ /</)  {
 329            # formated decomposition
 330            if($dec =~ /<wide>/)  {
 331               output($u,$cmt,$udec,&decompose("<wide>", $dec));
 332            } elsif($dec =~ /<narrow>/)  {
 333               # ignore non ASCII decomposition
 334               # warning($_);
 335            } elsif($dec =~ /<circle>/)  {
 336               output($u,$cmt,$udec,&decompose("<circle>", "(".$dec.")"));
 337            } elsif($dec =~ /<fraction>/)  {
 338               output($u,$cmt,$udec,&decompose("<fraction>", $dec));
 339            } elsif($dec =~ /<small>/)  {
 340               output($u,$cmt,$udec,&decompose("<small>", $dec));
 341            } elsif($dec =~ /<vertical>/)  {
 342               # warning($_);
 343            } elsif($dec =~ /<super>/)  {
 344               output($u,$cmt,$udec,"^(".&decompose("<super>", $dec).")");
 345            } elsif($dec =~ /<sub>/)  {
 346               output($u,$cmt,$udec,"v(".&decompose("<sub>", $dec).")");
 347            } elsif($dec =~ /<font>/)  {
 348                output($u,$cmt,$udec,&decompose("<font>", $dec));
 349            } elsif($dec =~ /<square>/)  {
 350               # ignore <square>
 351               # warning($_);
 352            } elsif($dec =~ /<compat>/)  {
 353                output($u,$cmt,$udec,&decompose("<compat>", $dec));
 354            } elsif($dec =~ /<isolated>/)  {
 355               # ignore <isolated>
 356               # warning($_);
 357            } elsif($dec =~ /<medial>/)  {
 358               # ignore <medial>
 359               # warning($_);
 360            } elsif($dec =~ /<final>/)  {
 361               # ignore <final>
 362               # warning($_);
 363            } elsif($dec =~ /<initial>/)  {
 364               # ignore <initial>
 365               # warning($_);
 366            } elsif($dec =~ /<noBreak>/)  {
 367              if($dec eq "<noBreak> 0020")
 368              {
 369                output($u,$cmt,$udec,"\\u0020");
 370              } else {
 371               # ignore
 372               # warning($_);
 373              }
 374            } else {
 375              warning($_);
 376            }
 377        } else {
 378          # decomposition without format code
 379          if($cmt =~ /LATIN/) {
 380            $dec = foldcombining($dec);
 381               output($u,$cmt,$udec,&decompose("", $dec));
 382          } elsif($cmt =~ /CYRILLIC/) {
 383               # ignore
 384               # warning($_);
 385          } elsif($cmt =~ /GREEK/) {
 386               # ignore
 387               # warning($_);
 388          } elsif($cmt =~ /ARABIC/) {
 389               # ignore
 390               # warning($_);
 391          } elsif($cmt =~ /CJK/) {
 392               # ignore
 393               # warning($_);
 394          } elsif($cmt =~ /HEBREW/) {
 395               # ignore
 396               # warning($_);
 397          } elsif($cmt =~ /DEVANAGARI/) {
 398               # ignore
 399               # warning($_);
 400          } elsif($cmt =~ /BENGALI/) {
 401               # ignore
 402               # warning($_);
 403          } elsif($cmt =~ /GURMUKHI/) {
 404               # ignore
 405               # warning($_);
 406          } elsif($cmt =~ /ORIYA/) {
 407               # ignore
 408               # warning($_);
 409          } elsif($cmt =~ /TAMIL/) {
 410               # ignore
 411               # warning($_);
 412          } elsif($cmt =~ /TELUGU/) {
 413               # ignore
 414               # warning($_);
 415          } elsif($cmt =~ /KANNADA/) {
 416               # ignore
 417               # warning($_);
 418          } elsif($cmt =~ /MALAYALAM/) {
 419               # ignore
 420               # warning($_);
 421          } elsif($cmt =~ /SINHALA/) {
 422               # ignore
 423               # warning($_);
 424          } elsif($cmt =~ /TIBETAN/) {
 425               # ignore
 426               # warning($_);
 427          } elsif($cmt =~ /MYANMAR/) {
 428               # ignore
 429               # warning($_);
 430          } elsif($cmt =~ /KATAKANA/) {
 431               # ignore
 432               # warning($_);
 433          } elsif($cmt =~ /HIRAGANA/) {
 434               # ignore
 435               # warning($_);
 436          } else {
 437               # ignore
 438               # warning($_);
 439          }
 440        }
 441      } else {
 442        # do not have decomposition
 443        if ($d1 ne "")
 444        {
 445          # are numeric characters
 446          output($u,$cmt,$udec,$d1);
 447        } elsif ($d2 ne "") {
 448          if($cmt =~ /CIRCLED/) {
 449            # circled
 450            output($u,$cmt,$udec,"(".$d2.")");
 451          } else {
 452            # others, use [ ]
 453            output($u,$cmt,$udec,"[".$d2."]");
 454          }
 455        } elsif ($d3 ne "") {
 456          if($cmt =~ /CIRCLED/) {
 457            # circled
 458            output($u,$cmt,$udec,"(".$d3.")");
 459          } else {
 460            # others, use [ ]
 461            output($u,$cmt,$udec,"[".$d3."]");
 462          }
 463        } else {
 464          # not numeric characters
 465
 466        } # end of no decomposition
 467      } # end of have/not decomposition
 468    }
 469 }
 470 ######################################################################
 471 #
 472 # Close files
 473 #
 474 ######################################################################
 475 close(UNIDATA);
 476 close(OUT);
 477