intl/unicharutil/tools/gentransliterate.pl

   1 #!/usr/bin/perl
   2 #
   3 # ***** BEGIN LICENSE BLOCK *****
   4 # Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5 #
   6 # The contents of this file are subject to the Mozilla Public License Version
   7 # 1.1 (the "License"); you may not use this file except in compliance with
   8 # the License. You may obtain a copy of the License at
   9 # http://www.mozilla.org/MPL/
  10 #
  11 # Software distributed under the License is distributed on an "AS IS" basis,
  12 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13 # for the specific language governing rights and limitations under the
  14 # License.
  15 #
  16 # The Original Code is mozilla.org code.
  17 #
  18 # The Initial Developer of the Original Code is
  19 # Netscape Communications Corporation.
  20 # Portions created by the Initial Developer are Copyright (C) 1999
  21 # the Initial Developer. All Rights Reserved.
  22 #
  23 # Contributor(s):
  24 #
  25 # Alternatively, the contents of this file may be used under the terms of
  26 # either the GNU General Public License Version 2 or later (the "GPL"), or
  27 # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  28 # in which case the provisions of the GPL or the LGPL are applicable instead
  29 # of those above. If you wish to allow use of your version of this file only
  30 # under the terms of either the GPL or the LGPL, and not to allow others to
  31 # use your version of this file under the terms of the MPL, indicate your
  32 # decision by deleting the provisions above and replace them with the notice
  33 # and other provisions required by the GPL or the LGPL. If you do not delete
  34 # the provisions above, a recipient may use your version of this file under
  35 # the terms of any one of the MPL, the GPL or the LGPL.
  36 #
  37 # ***** END LICENSE BLOCK *****
  38
  39 $header = <<END_OF_HEADER;
  40 # ***** BEGIN LICENSE BLOCK *****
  41 # Version: MPL 1.1/GPL 2.0/LGPL 2.1
  42 #
  43 # The contents of this file are subject to the Mozilla Public License Version
  44 # 1.1 (the "License"); you may not use this file except in compliance with
  45 # the License. You may obtain a copy of the License at
  46 # http://www.mozilla.org/MPL/
  47 #
  48 # Software distributed under the License is distributed on an "AS IS" basis,
  49 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  50 # for the specific language governing rights and limitations under the
  51 # License.
  52 #
  53 # The Original Code is mozilla.org code.
  54 #
  55 # The Initial Developer of the Original Code is
  56 # Netscape Communications Corporation.
  57 # Portions created by the Initial Developer are Copyright (C) 1999
  58 # the Initial Developer. All Rights Reserved.
  59 #
  60 # Contributor(s):
  61 #
  62 # Alternatively, the contents of this file may be used under the terms of
  63 # either the GNU General Public License Version 2 or later (the "GPL"), or
  64 # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  65 # in which case the provisions of the GPL or the LGPL are applicable instead
  66 # of those above. If you wish to allow use of your version of this file only
  67 # under the terms of either the GPL or the LGPL, and not to allow others to
  68 # use your version of this file under the terms of the MPL, indicate your
  69 # decision by deleting the provisions above and replace them with the notice
  70 # and other provisions required by the GPL or the LGPL. If you do not delete
  71 # the provisions above, a recipient may use your version of this file under
  72 # the terms of any one of the MPL, the GPL or the LGPL.
  73 #
  74 # ***** END LICENSE BLOCK *****
  75
  76 #
  77 # THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
  78 # PLEASE DO NOT MODIFY THIS FILE BY HAND
  79 #
  80 entity.list.name=transliterate
  81 entity.169=(c)
  82 #
  83 #
  84 # Here are the windows-1252 characters from the range 0x80 - 0x9F
  85 #
  86 END_OF_HEADER
  87
  88 $handcoded = <<END_OF_HANDCODED;
  89 # EURO SIGN
  90 entity.8364=EUR
  91 # SINGLE LOW-9 QUOTATION MARK
  92 entity.8218=,
  93 # LATIN SMALL LETTER F WITH HOOK
  94 entity.402=f
  95 # DOUBLE LOW-9 QUOTATION MARK
  96 entity.8222="
  97 # DAGGER
  98 entity.8224=+
  99 # DOUBLE DAGGER
 100 entity.8225=++
 101 # MODIFIER LETTER CIRCUMFLEX ACCENT
 102 entity.710=^
 103 # PER MILLE SIGN
 104 entity.8240=0/00
 105 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 106 entity.8249=<
 107 # LATIN CAPITAL LIGATURE OE
 108 entity.338=OE
 109 # LEFT SINGLE QUOTATION MARK
 110 entity.8216='
 111 # RIGHT SINGLE QUOTATION MARK
 112 entity.8217='
 113 # LEFT DOUBLE QUOTATION MARK
 114 entity.8220="
 115 # RIGHT DOUBLE QUOTATION MARK
 116 entity.8221="
 117 # BULLET
 118 entity.8226=.
 119 # EN DASH
 120 entity.8211=--
 121 # EM DASH
 122 entity.8212=---
 123 # SMALL TILDE
 124 entity.732=~
 125 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 126 entity.8250=>
 127 # LATIN SMALL LIGATURE OE
 128 entity.339=oe
 129 # U+2000 EN QUAD
 130 entity.8192=\\u0020
 131 # U+2001 EM QUAD
 132 entity.8193=\\u0020
 133 # U+2010 HYPHEN
 134 entity.8208=-
 135 # U+2011 NON-BREAKING HYPHEN
 136 entity.8209=-
 137 # U+2012 FIGURE DASH
 138 entity.8210=-
 139 # U+2015 HORIZONTAL BAR
 140 entity.8213=--
 141 # U+200B, ZERO WIDTH SPACE (a.k.a. InvisibleComma)
 142 entity.8203=
 143 # U+2061, ApplyFunction, character showing function application in presentation tagging
 144 entity.8289=
 145 # U+2062, InvisibleTimes, marks multiplication when it is understood without a mark
 146 entity.8290=
 147 # U+2146, DifferentialD, d for use in differentials, e.g., within integrals
 148 entity.8518=d
 149 # U+2212, MINUS SIGN, official Unicode minus sign
 150 entity.8722=-
 151 # Hebrew punctuation
 152 # U+05BE HEBREW PUNCTUATION MAQAF
 153 entity.1470=-
 154 # U+05C0 HEBREW PUNCTUATION PASEQ
 155 entity.1472=|
 156 # U+05C3 HEBREW PUNCTUATION SOF PASUQ
 157 entity.1475=:
 158 # U+05F3 HEBREW PUNCTUATION GERESH
 159 entity.1523='
 160 # U+05F4 HEBREW PUNCTUATION GERSHAYIM
 161 entity.1524="
 162 ##
 163 ## End of hand coded section
 164 ## Below are generated from the unicode character database
 165 ##
 166 END_OF_HANDCODED
 167
 168 @table = ();
 169 sub FromLatinComment
 170 {
 171   my ($cmt) = (@_);
 172   $char = "";
 173   if($cmt =~ /PRECEDED BY APOSTROPHE/) {
 174       $char = "\'";
 175   }
 176   if($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
 177       $char = $char . $1;
 178   }
 179   if($cmt =~ /SMALL LETTER ([A-Z]*)/) {
 180       $char = $char . lc($1);
 181   }
 182   @f = split(/ / , $cmt);
 183   while($item = shift @f) {
 184      if($item eq "DOT") {
 185        $char .= ".";
 186      } elsif ($item eq "DIAERESIS") {
 187        $char .= "\"";
 188      } elsif ($item eq "BREVE") {
 189        $char .= "(";
 190      } elsif ($item eq "ACUTE") {
 191        $char .= "\'";
 192      } elsif ($item eq "GRAVE") {
 193        $char .= "`";
 194      } elsif ($item eq "TILDE") {
 195        $char .= "~";
 196      } elsif ($item eq "CARON") {
 197        $char .= "(";
 198      } elsif ($item eq "HOOK") {
 199        $char .= "?";
 200      } elsif ($item eq "CEDILLA") {
 201        $char .= ",";
 202      } elsif ($item eq "MACRON") {
 203        $char .= "-";
 204      } elsif ($item eq "CIRCUMFLEX") {
 205        $char .= "^";
 206      } elsif ($item eq "RING") {
 207        $char .= "*";
 208      } elsif ($item eq "OGONEK") {
 209        $char .= ";";
 210      } elsif ($item eq "LINE") {
 211        $char .= "_";
 212      } elsif ($item eq "COMMA") {
 213        $char .= ",";
 214      } elsif ($item eq "STROKE") {
 215        $char .= "/";
 216      } elsif ($item eq "HORN") {
 217        $char .= "+";
 218      } elsif ($item =~ /^(LATIN|CAPITAL|SMALL|LETTER|WITH|ABOVE|BELOW|INVERTED|MIDDLE|AND|BY|APOSTROPHE|[A-Z])$/) {
 219        # ignore
 220      } else {
 221        #print "AAAA $item\n";
 222      }
 223   }
 224
 225   return $char;
 226 }
 227 sub warning
 228 {
 229   my ($warning) = (@_);
 230   print "WARNING: $warning \n";
 231 }
 232 sub doutput
 233 {
 234   my ($u, $cmt, $udec, $str) = (@_);
 235   # don't print out comments - for debugging purposes only
 236   # print "# U+$u $cmt\n";
 237   print "entity.$udec=$str\n";
 238 }
 239 sub output
 240 {
 241   my ($u, $cmt, $udec, $str) = (@_);
 242   if(decomposeIntoNonASCII($str)) {
 243     if(($cmt =~ "LATIN")  && ($cmt =~ "LETTER") && !($cmt =~ "LONG")) {
 244        $str = FromLatinComment($cmt);
 245        output($u,$cmt,$udec,$str);
 246     }
 247   } else {
 248     # don't print out comments - for debugging purposes only
 249     # print OUT "# U+$u $cmt\n";
 250     print OUT "entity.$udec=$str\n";
 251   }
 252 }
 253
 254 sub decomposeIntoNonASCII
 255 {
 256   my ($dec) = (@_);
 257   return $dec =~ /\\u([1-9A-F][0-9A-F][0-9A-F]|[0-9A-F][1-9A-F][0-9A-F]|00[8-9A-F])[0-9A-F]/;
 258 }
 259
 260 sub foldcombining
 261 {
 262   my ($dec) = (@_);
 263   $grave = "0060";
 264   $acute = "0027";
 265   $hat = "005E";
 266   $hat = "005E";
 267   $tilde = "007E";
 268   $overscore = "002D"; ## should be 00AF but we can only handle ASCII now
 269   $umlaut = "0022"; ## should be 00A8 but we can only handle ASCII now
 270   $doubleacute = "0022";
 271   $dot = "002E";
 272   $doublegrave = "0060 0060";
 273
 274
 275   $dec =~ s/00A8/$umlaut/eg;
 276   $dec =~ s/00AF/$overscore/eg;
 277  # $dec =~ s/00B0//eg;
 278   $dec =~ s/00B4/$acute/eg;
 279   $dec =~ s/00B7/$dot/eg;
 280  # $dec =~ s/00B8//eg;
 281   $dec =~ s/0300/$grave/eg;
 282   $dec =~ s/0301/$acute/eg;
 283   $dec =~ s/0302/$hat/eg;
 284   $dec =~ s/0303/$tilde/eg;
 285   $dec =~ s/0304/$overscore/eg;
 286   $dec =~ s/0305/$overscore/eg;
 287  #$dec =~ s/0306/?/eg;
 288   $dec =~ s/0307/$dot/eg;
 289   $dec =~ s/0308/$umlaut/eg;
 290  #$dec =~ s/0309/?/eg;
 291  #$dec =~ s/030A/?/eg;
 292   $dec =~ s/030B/$doubleacute/eg;
 293  #$dec =~ s/030C/?/eg;
 294   $dec =~ s/030D/$acute/eg;
 295   $dec =~ s/030E/$doubleacute/eg;
 296   $dec =~ s/030F/$doublegrave/eg;
 297
 298  # $dec =~ s/03[0-9A-F][0-9A-F]//eg; ## drop others
 299   return $dec;
 300 }
 301 sub rdecompose
 302 {
 303   my ($dec) = (@_);
 304   if(exists $table{$dec}) {
 305     $t = $table{$dec};
 306     $t =~ s/<[a-zA-Z]*>//eg;
 307     $t = foldcombining($t);
 308     return rdecompose( $table{$t});
 309   }
 310   return $dec;
 311 }
 312 sub decompose
 313 {
 314   my ($removeprefix, $dec) = (@_);
 315   $removeprefix .= " ";
 316
 317   $dec =~ s/$removeprefix//eg;
 318   if($dec eq "0020") {
 319    $dec = "\\u0020";
 320   } elsif($dec eq "005C") {
 321    $dec = "\\u005C";
 322   } else {
 323    $k = "\/";
 324    $dec =~ s/2044/$k/eg;
 325    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
 326    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
 327    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
 328    $dec =~ s/ //eg;
 329   }
 330   return $dec;
 331 }
 332
 333 ######################################################################
 334 #
 335 # Open the unicode database file
 336 #
 337 ######################################################################
 338 open ( UNICODATA , "< UnicodeData-Latest.txt")
 339    || die "cannot find UnicodeData-Latest.txt";
 340
 341 open ( UNICODATA2 , "< UnicodeData-Latest.txt")
 342    || die "cannot find UnicodeData-Latest.txt";
 343 ######################################################################
 344 #
 345 # Open the output file
 346 #
 347 ######################################################################
 348 open ( OUT , "> ../tables/transliterate.properties")
 349   || die "cannot open output ../tables/transliterate.properties file";
 350
 351 print OUT $header;
 352
 353 # remove comments from $handcoded
 354 $handcoded =~ s/^#[^#].*\n//mg;
 355 print OUT $handcoded;
 356
 357 ######################################################################
 358 #
 359 # Process the file line by line
 360 #
 361 ######################################################################
 362 while(<UNICODATA2>) {
 363    chop;
 364    @f = split(/;/ , $_);
 365    $udec = hex($u);
 366    if(($udec > 256 ) && ($f[5] ne "")) {
 367      $table{$f[0]}=$f[5];
 368    }
 369 }
 370 while(<UNICODATA>) {
 371    chop;
 372    ######################################################################
 373    #
 374    # Get value from fields
 375    #
 376    ######################################################################
 377    @f = split(/;/ , $_);
 378    $u = $f[0];    # The unicode value
 379    $cmt = $f[1];  # The comment
 380    $dec = $f[5];  # The decomposed value
 381    $d1 = $f[6];
 382    $d2 = $f[7];
 383    $d3 = $f[8];
 384    $udec = hex($u);
 385
 386    if($udec > 128)
 387    {
 388      # not ASCII
 389      if($dec ne "")
 390      {
 391        # have decomposition
 392        if($dec =~ /</)  {
 393            # formated decomposition
 394            if($dec =~ /<wide>/)  {
 395               output($u,$cmt,$udec,&decompose("<wide>", $dec));
 396            } elsif($dec =~ /<narrow>/)  {
 397               # ignore non ASCII decomposition
 398               # warning($_);
 399            } elsif($dec =~ /<circle>/)  {
 400               output($u,$cmt,$udec,&decompose("<circle>", "(".$dec.")"));
 401            } elsif($dec =~ /<fraction>/)  {
 402               output($u,$cmt,$udec,&decompose("<fraction>", $dec));
 403            } elsif($dec =~ /<small>/)  {
 404               output($u,$cmt,$udec,&decompose("<small>", $dec));
 405            } elsif($dec =~ /<vertical>/)  {
 406               # warning($_);
 407            } elsif($dec =~ /<super>/)  {
 408               output($u,$cmt,$udec,"^(".&decompose("<super>", $dec).")");
 409            } elsif($dec =~ /<sub>/)  {
 410               output($u,$cmt,$udec,"v(".&decompose("<sub>", $dec).")");
 411            } elsif($dec =~ /<font>/)  {
 412                output($u,$cmt,$udec,&decompose("<font>", $dec));
 413            } elsif($dec =~ /<square>/)  {
 414               # ignore <square>
 415               # warning($_);
 416            } elsif($dec =~ /<compat>/)  {
 417                output($u,$cmt,$udec,&decompose("<compat>", $dec));
 418            } elsif($dec =~ /<isolated>/)  {
 419               # ignore <isolated>
 420               # warning($_);
 421            } elsif($dec =~ /<medial>/)  {
 422               # ignore <medial>
 423               # warning($_);
 424            } elsif($dec =~ /<final>/)  {
 425               # ignore <final>
 426               # warning($_);
 427            } elsif($dec =~ /<initial>/)  {
 428               # ignore <initial>
 429               # warning($_);
 430            } elsif($dec =~ /<noBreak>/)  {
 431              if($dec eq "<noBreak> 0020")
 432              {
 433                output($u,$cmt,$udec,"\\u0020");
 434              } else {
 435               # ignore
 436               # warning($_);
 437              }
 438            } else {
 439              warning($_);
 440            }
 441        } else {
 442          # decomposition without format code
 443          if($cmt =~ /LATIN/) {
 444            $dec = foldcombining($dec);
 445               output($u,$cmt,$udec,&decompose("", $dec));
 446          } elsif($cmt =~ /CYRILLIC/) {
 447               # ignore
 448               # warning($_);
 449          } elsif($cmt =~ /GREEK/) {
 450               # ignore
 451               # warning($_);
 452          } elsif($cmt =~ /ARABIC/) {
 453               # ignore
 454               # warning($_);
 455          } elsif($cmt =~ /CJK/) {
 456               # ignore
 457               # warning($_);
 458          } elsif($cmt =~ /HEBREW/) {
 459               # ignore
 460               # warning($_);
 461          } elsif($cmt =~ /DEVANAGARI/) {
 462               # ignore
 463               # warning($_);
 464          } elsif($cmt =~ /BENGALI/) {
 465               # ignore
 466               # warning($_);
 467          } elsif($cmt =~ /GURMUKHI/) {
 468               # ignore
 469               # warning($_);
 470          } elsif($cmt =~ /ORIYA/) {
 471               # ignore
 472               # warning($_);
 473          } elsif($cmt =~ /TAMIL/) {
 474               # ignore
 475               # warning($_);
 476          } elsif($cmt =~ /TELUGU/) {
 477               # ignore
 478               # warning($_);
 479          } elsif($cmt =~ /KANNADA/) {
 480               # ignore
 481               # warning($_);
 482          } elsif($cmt =~ /MALAYALAM/) {
 483               # ignore
 484               # warning($_);
 485          } elsif($cmt =~ /SINHALA/) {
 486               # ignore
 487               # warning($_);
 488          } elsif($cmt =~ /TIBETAN/) {
 489               # ignore
 490               # warning($_);
 491          } elsif($cmt =~ /MYANMAR/) {
 492               # ignore
 493               # warning($_);
 494          } elsif($cmt =~ /KATAKANA/) {
 495               # ignore
 496               # warning($_);
 497          } elsif($cmt =~ /HIRAGANA/) {
 498               # ignore
 499               # warning($_);
 500          } else {
 501               # ignore
 502               # warning($_);
 503          }
 504        }
 505      } else {
 506        # do not have decomposition
 507        if ($d1 ne "")
 508        {
 509          # are numeric characters
 510          output($u,$cmt,$udec,$d1);
 511        } elsif ($d2 ne "") {
 512          if($cmt =~ /CIRCLED/) {
 513            # circled
 514            output($u,$cmt,$udec,"(".$d2.")");
 515          } else {
 516            # others, use [ ]
 517            output($u,$cmt,$udec,"[".$d2."]");
 518          }
 519        } elsif ($d3 ne "") {
 520          if($cmt =~ /CIRCLED/) {
 521            # circled
 522            output($u,$cmt,$udec,"(".$d3.")");
 523          } else {
 524            # others, use [ ]
 525            output($u,$cmt,$udec,"[".$d3."]");
 526          }
 527        } else {
 528          # not numeric characters
 529
 530        } # end of no decomposition
 531      } # end of have/not decomposition
 532    }
 533 }
 534 ######################################################################
 535 #
 536 # Close files
 537 #
 538 ######################################################################
 539 close(UNIDATA);
 540 close(OUT);
 541