intl/unicharutil/tools/genUnicodePropertyData.pl

   1 #!/usr/bin/env perl
   2
   3 # This Source Code Form is subject to the terms of the Mozilla Public
   4 # License, v. 2.0. If a copy of the MPL was not distributed with this
   5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
   6
   7 # This tool is used to prepare lookup tables of Unicode character properties
   8 # needed by gfx code to support text shaping operations. The properties are
   9 # read from the Unicode Character Database and compiled into multi-level arrays
  10 # for efficient lookup.
  11 #
  12 # To regenerate the tables in nsUnicodePropertyData.cpp:
  13 #
  14 # (1) Download the current Unicode data files from
  15 #
  16 #         http://www.unicode.org/Public/UNIDATA/
  17 #
  18 #     NB: not all the files are actually needed; currently, we require
  19 #       - UnicodeData.txt
  20 #       - Scripts.txt
  21 #       - EastAsianWidth.txt
  22 #       - BidiMirroring.txt
  23 #       - HangulSyllableType.txt
  24 #       - ReadMe.txt (to record version/date of the UCD)
  25 #       - Unihan_Variants.txt (from Unihan.zip)
  26 #     though this may change if we find a need for additional properties.
  27 #
  28 #     The Unicode data files listed above should be together in one directory.
  29 #
  30 #     We also require the file
  31 #        http://www.unicode.org/Public/security/latest/xidmodifications.txt
  32 #     This file should be in a sub-directory "security" immediately below the
  33 #        directory containing the other Unicode data files.
  34 #
  35 #     We also require the latest data file for UTR50, currently revision-13:
  36 #        http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt
  37 #     This file should be in a sub-directory "vertical" immediately below the
  38 #        directory containing the other Unicode data files.
  39 #
  40 #
  41 # (2) Run this tool using a command line of the form
  42 #
  43 #         perl genUnicodePropertyData.pl \
  44 #                 /path/to/harfbuzz/src  \
  45 #                 /path/to/UCD-directory
  46 #
  47 #     This will generate (or overwrite!) the files
  48 #
  49 #         nsUnicodePropertyData.cpp
  50 #         nsUnicodeScriptCodes.h
  51 #
  52 #     in the current directory.
  53
  54 use strict;
  55 use List::Util qw(first);
  56
  57 if ($#ARGV != 1) {
  58     print <<__EOT;
  59 # Run this tool using a command line of the form
  60 #
  61 #     perl genUnicodePropertyData.pl \\
  62 #             /path/to/harfbuzz/src  \\
  63 #             /path/to/UCD-directory
  64 #
  65 # where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
  66 # and UCD-directory is a directory containing the current Unicode Character
  67 # Database files (UnicodeData.txt, etc), available from
  68 # http://www.unicode.org/Public/UNIDATA/, with additional resources as
  69 # detailed in the source comments.
  70 #
  71 # This will generate (or overwrite!) the files
  72 #
  73 #     nsUnicodePropertyData.cpp
  74 #     nsUnicodeScriptCodes.h
  75 #
  76 # in the current directory.
  77 __EOT
  78     exit 0;
  79 }
  80
  81 # load HB_Script and HB_Category constants
  82
  83 # NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
  84 # script codes as used by Glib/Pango/etc.
  85 # We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
  86 # compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.
  87
  88 # CHECK that this matches Pango source (as found for example at
  89 # http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
  90 # for as many codes as that defines (currently up through Unicode 5.1)
  91 # and the GLib enumeration
  92 # http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
  93 # (currently defined up through Unicode 6.0).
  94 # Constants beyond these may be regarded as unstable for now, but we don't actually
  95 # depend on the specific values.
  96 my %scriptCode = (
  97   INVALID => -1,
  98   COMMON => 0,
  99   INHERITED => 1,
 100   ARABIC => 2,
 101   ARMENIAN => 3,
 102   BENGALI => 4,
 103   BOPOMOFO => 5,
 104   CHEROKEE => 6,
 105   COPTIC => 7,
 106   CYRILLIC => 8,
 107   DESERET => 9,
 108   DEVANAGARI => 10,
 109   ETHIOPIC => 11,
 110   GEORGIAN => 12,
 111   GOTHIC => 13,
 112   GREEK => 14,
 113   GUJARATI => 15,
 114   GURMUKHI => 16,
 115   HAN => 17,
 116   HANGUL => 18,
 117   HEBREW => 19,
 118   HIRAGANA => 20,
 119   KANNADA => 21,
 120   KATAKANA => 22,
 121   KHMER => 23,
 122   LAO => 24,
 123   LATIN => 25,
 124   MALAYALAM => 26,
 125   MONGOLIAN => 27,
 126   MYANMAR => 28,
 127   OGHAM => 29,
 128   OLD_ITALIC => 30,
 129   ORIYA => 31,
 130   RUNIC => 32,
 131   SINHALA => 33,
 132   SYRIAC => 34,
 133   TAMIL => 35,
 134   TELUGU => 36,
 135   THAANA => 37,
 136   THAI => 38,
 137   TIBETAN => 39,
 138   CANADIAN_ABORIGINAL => 40,
 139   YI => 41,
 140   TAGALOG => 42,
 141   HANUNOO => 43,
 142   BUHID => 44,
 143   TAGBANWA => 45,
 144 # unicode 4.0 additions
 145   BRAILLE => 46,
 146   CYPRIOT => 47,
 147   LIMBU => 48,
 148   OSMANYA => 49,
 149   SHAVIAN => 50,
 150   LINEAR_B => 51,
 151   TAI_LE => 52,
 152   UGARITIC => 53,
 153 # unicode 4.1 additions
 154   NEW_TAI_LUE => 54,
 155   BUGINESE => 55,
 156   GLAGOLITIC => 56,
 157   TIFINAGH => 57,
 158   SYLOTI_NAGRI => 58,
 159   OLD_PERSIAN => 59,
 160   KHAROSHTHI => 60,
 161 # unicode 5.0 additions
 162   UNKNOWN => 61,
 163   BALINESE => 62,
 164   CUNEIFORM => 63,
 165   PHOENICIAN => 64,
 166   PHAGS_PA => 65,
 167   NKO => 66,
 168 # unicode 5.1 additions
 169   KAYAH_LI => 67,
 170   LEPCHA => 68,
 171   REJANG => 69,
 172   SUNDANESE => 70,
 173   SAURASHTRA => 71,
 174   CHAM => 72,
 175   OL_CHIKI => 73,
 176   VAI => 74,
 177   CARIAN => 75,
 178   LYCIAN => 76,
 179   LYDIAN => 77,
 180 # unicode 5.2 additions
 181   AVESTAN => 78,
 182   BAMUM => 79,
 183   EGYPTIAN_HIEROGLYPHS => 80,
 184   IMPERIAL_ARAMAIC => 81,
 185   INSCRIPTIONAL_PAHLAVI => 82,
 186   INSCRIPTIONAL_PARTHIAN => 83,
 187   JAVANESE => 84,
 188   KAITHI => 85,
 189   LISU => 86,
 190   MEETEI_MAYEK => 87,
 191   OLD_SOUTH_ARABIAN => 88,
 192   OLD_TURKIC => 89,
 193   SAMARITAN => 90,
 194   TAI_THAM => 91,
 195   TAI_VIET => 92,
 196 # unicode 6.0 additions
 197   BATAK => 93,
 198   BRAHMI => 94,
 199   MANDAIC => 95,
 200 # unicode 6.1 additions
 201   CHAKMA => 96,
 202   MEROITIC_CURSIVE => 97,
 203   MEROITIC_HIEROGLYPHS => 98,
 204   MIAO => 99,
 205   SHARADA => 100,
 206   SORA_SOMPENG => 101,
 207   TAKRI => 102,
 208 # unicode 7.0 additions
 209   BASSA_VAH => 103,
 210   CAUCASIAN_ALBANIAN => 104,
 211   DUPLOYAN => 105,
 212   ELBASAN => 106,
 213   GRANTHA => 107,
 214   KHOJKI => 108,
 215   KHUDAWADI => 109,
 216   LINEAR_A => 110,
 217   MAHAJANI => 111,
 218   MANICHAEAN => 112,
 219   MENDE_KIKAKUI => 113,
 220   MODI => 114,
 221   MRO => 115,
 222   NABATAEAN => 116,
 223   OLD_NORTH_ARABIAN => 117,
 224   OLD_PERMIC => 118,
 225   PAHAWH_HMONG => 119,
 226   PALMYRENE => 120,
 227   PAU_CIN_HAU => 121,
 228   PSALTER_PAHLAVI => 122,
 229   SIDDHAM => 123,
 230   TIRHUTA => 124,
 231   WARANG_CITI => 125,
 232
 233 # additional "script" code, not from Unicode (but matches ISO 15924's Zmth tag)
 234   MATHEMATICAL_NOTATION => 126,
 235 );
 236
 237 my $sc = -1;
 238 my $cc = -1;
 239 my %catCode;
 240 my @scriptCodeToTag;
 241 my @scriptCodeToName;
 242
 243 sub readHarfBuzzHeader
 244 {
 245     my $file = shift;
 246     open FH, "< $ARGV[0]/$file" or die "can't open harfbuzz header $ARGV[0]/$file\n";
 247     while (<FH>) {
 248         s/CANADIAN_SYLLABICS/CANADIAN_ABORIGINAL/; # harfbuzz and unicode disagree on this name :(
 249         if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
 250             unless (exists $scriptCode{$1}) {
 251                 warn "unknown script name $1 found in $file\n";
 252                 next;
 253             }
 254             $sc = $scriptCode{$1};
 255             $scriptCodeToTag[$sc] = $2;
 256             $scriptCodeToName[$sc] = $1;
 257         }
 258         if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
 259             $cc++;
 260             $catCode{$1} = $cc;
 261         }
 262     }
 263     close FH;
 264 }
 265
 266 &readHarfBuzzHeader("hb-common.h");
 267 &readHarfBuzzHeader("hb-unicode.h");
 268
 269 die "didn't find HarfBuzz script codes\n" if $sc == -1;
 270 die "didn't find HarfBuzz category codes\n" if $cc == -1;
 271
 272 # Additional code not present in HarfBuzz headers:
 273 $sc = $scriptCode{"MATHEMATICAL_NOTATION"};
 274 $scriptCodeToTag[$sc] = "'Z','m','t','h'";
 275 $scriptCodeToName[$sc] = "MATHEMATICAL_NOTATION";
 276
 277 my %xidmodCode = (
 278 'inclusion'         => 0,
 279 'recommended'       => 1,
 280 'default-ignorable' => 2,
 281 'historic'          => 3,
 282 'limited-use'       => 4,
 283 'not-NFKC'          => 5,
 284 'not-xid'           => 6,
 285 'obsolete'          => 7,
 286 'technical'         => 8,
 287 'not-chars'         => 9
 288 );
 289
 290 my %bidicategoryCode = (
 291   "L"   =>  "0", # Left-to-Right
 292   "R"   =>  "1", # Right-to-Left
 293   "EN"  =>  "2", # European Number
 294   "ES"  =>  "3", # European Number Separator
 295   "ET"  =>  "4", # European Number Terminator
 296   "AN"  =>  "5", # Arabic Number
 297   "CS"  =>  "6", # Common Number Separator
 298   "B"   =>  "7", # Paragraph Separator
 299   "S"   =>  "8", # Segment Separator
 300   "WS"  =>  "9", # Whitespace
 301   "ON"  => "10", # Other Neutrals
 302   "LRE" => "11", # Left-to-Right Embedding
 303   "LRO" => "12", # Left-to-Right Override
 304   "AL"  => "13", # Right-to-Left Arabic
 305   "RLE" => "14", # Right-to-Left Embedding
 306   "RLO" => "15", # Right-to-Left Override
 307   "PDF" => "16", # Pop Directional Format
 308   "NSM" => "17", # Non-Spacing Mark
 309   "BN"  => "18"  # Boundary Neutral
 310 );
 311
 312 my %verticalOrientationCode = (
 313   'U' => 0,  #   U - Upright, the same orientation as in the code charts
 314   'R' => 1,  #   R - Rotated 90 degrees clockwise compared to the code charts
 315   'Tu' => 2, #   Tu - Transformed typographically, with fallback to Upright
 316   'Tr' => 3  #   Tr - Transformed typographically, with fallback to Rotated
 317 );
 318
 319 # initialize default properties
 320 my @script;
 321 my @category;
 322 my @combining;
 323 my @eaw;
 324 my @mirror;
 325 my @hangul;
 326 my @casemap;
 327 my @xidmod;
 328 my @numericvalue;
 329 my @hanVariant;
 330 my @bidicategory;
 331 my @fullWidth;
 332 my @verticalOrientation;
 333 for (my $i = 0; $i < 0x110000; ++$i) {
 334     $script[$i] = $scriptCode{"UNKNOWN"};
 335     $category[$i] = $catCode{"UNASSIGNED"};
 336     $combining[$i] = 0;
 337     $casemap[$i] = 0;
 338     $xidmod[$i] = $xidmodCode{"not-chars"};
 339     $numericvalue[$i] = -1;
 340     $hanVariant[$i] = 0;
 341     $bidicategory[$i] = $bidicategoryCode{"L"};
 342     $fullWidth[$i] = 0;
 343     $verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
 344 }
 345
 346 # blocks where the default for bidi category is not L
 347 for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
 348   $bidicategory[$i] = $bidicategoryCode{"AL"};
 349 }
 350 for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
 351   $bidicategory[$i] = $bidicategoryCode{"R"};
 352 }
 353 for my $i (0x20A0..0x20CF) {
 354   $bidicategory[$i] = $bidicategoryCode{"ET"};
 355 }
 356
 357 my %ucd2hb = (
 358 'Cc' => 'CONTROL',
 359 'Cf' => 'FORMAT',
 360 'Cn' => 'UNASSIGNED',
 361 'Co' => 'PRIVATE_USE',
 362 'Cs' => 'SURROGATE',
 363 'Ll' => 'LOWERCASE_LETTER',
 364 'Lm' => 'MODIFIER_LETTER',
 365 'Lo' => 'OTHER_LETTER',
 366 'Lt' => 'TITLECASE_LETTER',
 367 'Lu' => 'UPPERCASE_LETTER',
 368 'Mc' => 'SPACING_MARK',
 369 'Me' => 'ENCLOSING_MARK',
 370 'Mn' => 'NON_SPACING_MARK',
 371 'Nd' => 'DECIMAL_NUMBER',
 372 'Nl' => 'LETTER_NUMBER',
 373 'No' => 'OTHER_NUMBER',
 374 'Pc' => 'CONNECT_PUNCTUATION',
 375 'Pd' => 'DASH_PUNCTUATION',
 376 'Pe' => 'CLOSE_PUNCTUATION',
 377 'Pf' => 'FINAL_PUNCTUATION',
 378 'Pi' => 'INITIAL_PUNCTUATION',
 379 'Po' => 'OTHER_PUNCTUATION',
 380 'Ps' => 'OPEN_PUNCTUATION',
 381 'Sc' => 'CURRENCY_SYMBOL',
 382 'Sk' => 'MODIFIER_SYMBOL',
 383 'Sm' => 'MATH_SYMBOL',
 384 'So' => 'OTHER_SYMBOL',
 385 'Zl' => 'LINE_SEPARATOR',
 386 'Zp' => 'PARAGRAPH_SEPARATOR',
 387 'Zs' => 'SPACE_SEPARATOR'
 388 );
 389
 390 # read ReadMe.txt
 391 my @versionInfo;
 392 open FH, "< $ARGV[1]/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
 393 while (<FH>) {
 394     chomp;
 395     push @versionInfo, $_;
 396 }
 397 close FH;
 398
 399 my $kTitleToUpper = 0x80000000;
 400 my $kUpperToLower = 0x40000000;
 401 my $kLowerToTitle = 0x20000000;
 402 my $kLowerToUpper = 0x10000000;
 403 my $kCaseMapCharMask = 0x001fffff;
 404
 405 # read UnicodeData.txt
 406 open FH, "< $ARGV[1]/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
 407 while (<FH>) {
 408     chomp;
 409     my @fields = split /;/;
 410     if ($fields[1] =~ /First/) {
 411         my $first = hex "0x$fields[0]";
 412         $_ = <FH>;
 413         @fields = split /;/;
 414         if ($fields[1] =~ /Last/) {
 415             my $last = hex "0x$fields[0]";
 416             do {
 417                 $category[$first] = $catCode{$ucd2hb{$fields[2]}};
 418                 $combining[$first] = $fields[3];
 419                 $bidicategory[$first] = $bidicategoryCode{$fields[4]};
 420                 unless (length($fields[7]) == 0) {
 421                   $numericvalue[$first] = $fields[7];
 422                 }
 423                 if ($fields[1] =~ /CJK/) {
 424                   @hanVariant[$first] = 3;
 425                 }
 426                 $first++;
 427             } while ($first <= $last);
 428         } else {
 429             die "didn't find Last code for range!\n";
 430         }
 431     } else {
 432         my $usv = hex "0x$fields[0]";
 433         $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
 434         $combining[$usv] = $fields[3];
 435         my $upper = hex $fields[12];
 436         my $lower = hex $fields[13];
 437         my $title = hex $fields[14];
 438         # we only store one mapping for each character,
 439         # but also record what kind of mapping it is
 440         if ($upper && $lower) {
 441             $casemap[$usv] |= $kTitleToUpper;
 442             $casemap[$usv] |= ($usv ^ $upper);
 443         }
 444         elsif ($lower) {
 445             $casemap[$usv] |= $kUpperToLower;
 446             $casemap[$usv] |= ($usv ^ $lower);
 447         }
 448         elsif ($title && ($title != $upper)) {
 449             $casemap[$usv] |= $kLowerToTitle;
 450             $casemap[$usv] |= ($usv ^ $title);
 451         }
 452         elsif ($upper) {
 453             $casemap[$usv] |= $kLowerToUpper;
 454             $casemap[$usv] |= ($usv ^ $upper);
 455         }
 456         $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
 457         unless (length($fields[7]) == 0) {
 458           $numericvalue[$usv] = $fields[7];
 459         }
 460         if ($fields[1] =~ /CJK/) {
 461           @hanVariant[$usv] = 3;
 462         }
 463         if ($fields[5] =~ /^<narrow>/) {
 464           my $wideChar = hex(substr($fields[5], 9));
 465           die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
 466           $fullWidth[$usv] = $wideChar;
 467         }
 468         elsif ($fields[5] =~ /^<wide>/) {
 469           my $narrowChar = hex(substr($fields[5], 7));
 470           die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
 471           $fullWidth[$narrowChar] = $usv;
 472         }
 473     }
 474 }
 475 close FH;
 476
 477 # read Scripts.txt
 478 open FH, "< $ARGV[1]/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
 479 push @versionInfo, "";
 480 while (<FH>) {
 481     chomp;
 482     push @versionInfo, $_;
 483     last if /Date:/;
 484 }
 485 while (<FH>) {
 486     if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
 487         my $script = uc($3);
 488         warn "unknown script $script" unless exists $scriptCode{$script};
 489         $script = $scriptCode{$script};
 490         my $start = hex "0x$1";
 491         my $end = (defined $2) ? hex "0x$2" : $start;
 492         for (my $i = $start; $i <= $end; ++$i) {
 493             $script[$i] = $script;
 494         }
 495     }
 496 }
 497 close FH;
 498
 499 # read EastAsianWidth.txt
 500 my %eawCode = (
 501   'A' => 0, #         ; Ambiguous
 502   'F' => 1, #         ; Fullwidth
 503   'H' => 2, #         ; Halfwidth
 504   'N' => 3, #         ; Neutral
 505   'NA'=> 4, #         ; Narrow
 506   'W' => 5  #         ; Wide
 507 );
 508 open FH, "< $ARGV[1]/EastAsianWidth.txt" or die "can't open UCD file EastAsianWidth.txt\n";
 509 push @versionInfo, "";
 510 while (<FH>) {
 511     chomp;
 512     push @versionInfo, $_;
 513     last if /Date:/;
 514 }
 515 while (<FH>) {
 516     s/#.*//;
 517     if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
 518         my $eaw = uc($3);
 519         warn "unknown EAW code $eaw" unless exists $eawCode{$eaw};
 520         $eaw = $eawCode{$eaw};
 521         my $start = hex "0x$1";
 522         my $end = (defined $2) ? hex "0x$2" : $start;
 523         for (my $i = $start; $i <= $end; ++$i) {
 524             $eaw[$i] = $eaw;
 525         }
 526     }
 527 }
 528 close FH;
 529
 530 # read BidiMirroring.txt
 531 my @offsets = ();
 532 push @offsets, 0;
 533
 534 open FH, "< $ARGV[1]/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
 535 push @versionInfo, "";
 536 while (<FH>) {
 537     chomp;
 538     push @versionInfo, $_;
 539     last if /Date:/;
 540 }
 541 while (<FH>) {
 542     s/#.*//;
 543     if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
 544         my $mirrorOffset = hex("0x$2") - hex("0x$1");
 545         my $offsetIndex = first { $offsets[$_] eq $mirrorOffset } 0..$#offsets;
 546         if ($offsetIndex == undef) {
 547             die "too many offset codes\n" if scalar @offsets == 31;
 548             push @offsets, $mirrorOffset;
 549             $offsetIndex = $#offsets;
 550         }
 551         $mirror[hex "0x$1"] = $offsetIndex;
 552     }
 553 }
 554 close FH;
 555
 556 # read HangulSyllableType.txt
 557 my %hangulType = (
 558   'L'   => 0x01,
 559   'V'   => 0x02,
 560   'T'   => 0x04,
 561   'LV'  => 0x03,
 562   'LVT' => 0x07
 563 );
 564 open FH, "< $ARGV[1]/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
 565 push @versionInfo, "";
 566 while (<FH>) {
 567     chomp;
 568     push @versionInfo, $_;
 569     last if /Date:/;
 570 }
 571 while (<FH>) {
 572     s/#.*//;
 573     if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
 574         my $hangul = uc($3);
 575         warn "unknown Hangul syllable type" unless exists $hangulType{$hangul};
 576         $hangul = $hangulType{$hangul};
 577         my $start = hex "0x$1";
 578         my $end = (defined $2) ? hex "0x$2" : $start;
 579         for (my $i = $start; $i <= $end; ++$i) {
 580             $hangul[$i] = $hangul;
 581         }
 582     }
 583 }
 584 close FH;
 585
 586 # read xidmodifications.txt
 587 open FH, "< $ARGV[1]/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt\n";
 588 push @versionInfo, "";
 589 while (<FH>) {
 590   chomp;
 591   unless (/\xef\xbb\xbf/) {
 592     push @versionInfo, $_;
 593   }
 594   last if /Generated:/;
 595 }
 596 while (<FH>) {
 597   if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
 598     my $xidmod = $3;
 599     warn "unknown Identifier Modification $xidmod" unless exists $xidmodCode{$xidmod};
 600     $xidmod = $xidmodCode{$xidmod};
 601     my $start = hex "0x$1";
 602     my $end = (defined $2) ? hex "0x$2" : $start;
 603     for (my $i = $start; $i <= $end; ++$i) {
 604       $xidmod[$i] = $xidmod;
 605     }
 606   }
 607 }
 608 close FH;
 609 # special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
 610 $xidmod[0x30FB] = 1;
 611
 612 open FH, "< $ARGV[1]/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
 613 push @versionInfo, "";
 614 while (<FH>) {
 615   chomp;
 616   push @versionInfo, $_;
 617   last if /Date:/;
 618 }
 619 my $savedusv = 0;
 620 my $hasTC = 0;
 621 my $hasSC = 0;
 622 while (<FH>) {
 623   chomp;
 624   if (m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
 625     my $usv = hex "0x$1";
 626     if ($usv != $savedusv) {
 627       unless ($savedusv == 0) {
 628         if ($hasTC && !$hasSC) {
 629           $hanVariant[$savedusv] = 1;
 630         } elsif (!$hasTC && $hasSC) {
 631           $hanVariant[$savedusv] = 2;
 632         }
 633       }
 634       $savedusv = $usv;
 635       $hasTC = 0;
 636       $hasSC = 0;
 637     }
 638     if ($2 eq "Traditional") {
 639       $hasTC = 1;
 640     }
 641     if ($2 eq "Simplified") {
 642       $hasSC = 1;
 643     }
 644   }
 645 }
 646 close FH;
 647
 648 # read VerticalOrientation-13.txt
 649 open FH, "< $ARGV[1]/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt\n";
 650 push @versionInfo, "";
 651 while (<FH>) {
 652     chomp;
 653     push @versionInfo, $_;
 654     last if /Date:/;
 655 }
 656 while (<FH>) {
 657     chomp;
 658     s/#.*//;
 659     if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
 660         my $vo = $3;
 661         warn "unknown Vertical_Orientation code $vo"
 662             unless exists $verticalOrientationCode{$vo};
 663         $vo = $verticalOrientationCode{$vo};
 664         my $start = hex "0x$1";
 665         my $end = (defined $2) ? hex "0x$2" : $start;
 666         for (my $i = $start; $i <= $end; ++$i) {
 667             $verticalOrientation[$i] = $vo;
 668         }
 669     }
 670 }
 671 close FH;
 672
 673 my $timestamp = gmtime();
 674
 675 open DATA_TABLES, "> nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";
 676
 677 my $licenseBlock = q[
 678 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
 679 /* This Source Code Form is subject to the terms of the Mozilla Public
 680  * License, v. 2.0. If a copy of the MPL was not distributed with this
 681  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 682
 683 /*
 684  * Derived from the Unicode Character Database by genUnicodePropertyData.pl
 685  *
 686  * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
 687  */
 688 ];
 689
 690 my $versionInfo = join("\n", @versionInfo);
 691
 692 print DATA_TABLES <<__END;
 693 $licenseBlock
 694 /*
 695  * Created on $timestamp from UCD data files with version info:
 696  *
 697
 698 $versionInfo
 699
 700  *
 701  * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 702  */
 703
 704 #include <stdint.h>
 705 #include "harfbuzz/hb.h"
 706
 707 __END
 708
 709 open HEADER, "> nsUnicodeScriptCodes.h" or die "unable to open nsUnicodeScriptCodes.h for output";
 710
 711 print HEADER <<__END;
 712 $licenseBlock
 713 /*
 714  * Created on $timestamp from UCD data files with version info:
 715  *
 716
 717 $versionInfo
 718
 719  *
 720  * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 721  */
 722
 723 #ifndef NS_UNICODE_SCRIPT_CODES
 724 #define NS_UNICODE_SCRIPT_CODES
 725
 726 __END
 727
 728 print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
 729 for (my $i = 0; $i < scalar @scriptCodeToTag; ++$i) {
 730   printf DATA_TABLES "  HB_TAG(%s)", $scriptCodeToTag[$i];
 731   print DATA_TABLES $i < $#scriptCodeToTag ? ",\n" : "\n";
 732 }
 733 print DATA_TABLES "};\n\n";
 734
 735 our $totalData = 0;
 736
 737 print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
 738 for (my $i = 0; $i < scalar @offsets; ++$i) {
 739     printf DATA_TABLES "  $offsets[$i]";
 740     print DATA_TABLES $i < $#offsets ? ",\n" : "\n";
 741 }
 742 print DATA_TABLES "};\n\n";
 743
 744 print HEADER "#pragma pack(1)\n\n";
 745
 746 sub sprintCharProps1
 747 {
 748   my $usv = shift;
 749   return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
 750 }
 751 my $type = q/
 752 struct nsCharProps1 {
 753   unsigned char mMirrorOffsetIndex:5;
 754   unsigned char mHangulType:3;
 755   unsigned char mCombiningClass:8;
 756 };
 757 /;
 758 &genTables("CharProp1", $type, "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
 759
 760 sub sprintCharProps2
 761 {
 762   my $usv = shift;
 763   return sprintf("{%d,%d,%d,%d,%d,%d,%d},",
 764                  $script[$usv], $eaw[$usv], $category[$usv],
 765                  $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv],
 766                  $verticalOrientation[$usv]);
 767 }
 768 $type = q/
 769 struct nsCharProps2 {
 770   unsigned char mScriptCode:8;
 771   unsigned char mEAW:3;
 772   unsigned char mCategory:5;
 773   unsigned char mBidiCategory:5;
 774   unsigned char mXidmod:4;
 775   signed char   mNumericValue:5;
 776   unsigned char mVertOrient:2;
 777 };
 778 /;
 779 &genTables("CharProp2", $type, "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);
 780
 781 print HEADER "#pragma pack()\n\n";
 782
 783 sub sprintHanVariants
 784 {
 785   my $baseUsv = shift;
 786   my $varShift = 0;
 787   my $val = 0;
 788   while ($varShift < 8) {
 789     $val |= $hanVariant[$baseUsv++] << $varShift;
 790     $varShift += 2;
 791   }
 792   return sprintf("0x%02x,", $val);
 793 }
 794 &genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
 795
 796 sub sprintFullWidth
 797 {
 798   my $usv = shift;
 799   return sprintf("0x%04x,", $fullWidth[$usv]);
 800 }
 801 &genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
 802
 803 sub sprintCasemap
 804 {
 805   my $usv = shift;
 806   return sprintf("0x%08x,", $casemap[$usv]);
 807 }
 808 &genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);
 809
 810 print STDERR "Total data = $totalData\n";
 811
 812 printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
 813 printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
 814 printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
 815 printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
 816 printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
 817
 818 sub genTables
 819 {
 820   my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
 821
 822   print DATA_TABLES "#define k${prefix}MaxPlane  $maxPlane\n";
 823   print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
 824   print DATA_TABLES "#define k${prefix}CharBits  $charBits\n";
 825
 826   my $indexLen = 1 << $indexBits;
 827   my $charsPerPage = 1 << $charBits;
 828   my %charIndex = ();
 829   my %pageMapIndex = ();
 830   my @pageMap = ();
 831   my @char = ();
 832
 833   my $planeMap = "\x00" x $maxPlane;
 834   foreach my $plane (0 .. $maxPlane) {
 835     my $pageMap = "\x00" x $indexLen * 2;
 836     foreach my $page (0 .. $indexLen - 1) {
 837         my $charValues = "";
 838         for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
 839             my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
 840             $charValues .= &$func($usv);
 841         }
 842         chop $charValues;
 843
 844         unless (exists $charIndex{$charValues}) {
 845             $charIndex{$charValues} = scalar keys %charIndex;
 846             $char[$charIndex{$charValues}] = $charValues;
 847         }
 848         substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
 849     }
 850
 851     unless (exists $pageMapIndex{$pageMap}) {
 852         $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
 853         $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
 854     }
 855     if ($plane > 0) {
 856         substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
 857     }
 858   }
 859
 860   if ($maxPlane) {
 861     print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
 862     print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
 863     print DATA_TABLES "};\n\n";
 864   }
 865
 866   my $chCount = scalar @char;
 867   my $pmBits = $chCount > 255 ? 16 : 8;
 868   my $pmCount = scalar @pageMap;
 869   if ($maxPlane == 0) {
 870     die "there should only be one pageMap entry!" if $pmCount > 1;
 871     print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
 872   } else {
 873     print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
 874   }
 875   for (my $i = 0; $i < scalar @pageMap; ++$i) {
 876     print DATA_TABLES $maxPlane > 0 ? "  {" : "  ";
 877     print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
 878     print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
 879   }
 880   print DATA_TABLES "};\n\n";
 881
 882   print HEADER "$typedef\n\n" if $typedef ne '';
 883
 884   my $pageLen = $charsPerPage / $charsPerEntry;
 885   print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
 886   for (my $i = 0; $i < scalar @char; ++$i) {
 887     print DATA_TABLES "  {";
 888     print DATA_TABLES $char[$i];
 889     print DATA_TABLES $i < $#char ? "},\n" : "}\n";
 890   }
 891   print DATA_TABLES "};\n\n";
 892
 893   my $dataSize = $pmCount * $indexLen * $pmBits/8 +
 894                  $chCount * $pageLen * $bytesPerEntry +
 895                  $maxPlane;
 896   $totalData += $dataSize;
 897
 898   print STDERR "Data for $prefix = $dataSize\n";
 899 }
 900
 901 print DATA_TABLES <<__END;
 902 /*
 903  * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 904  */
 905 __END
 906
 907 close DATA_TABLES;
 908
 909 print HEADER "enum {\n";
 910 for (my $i = 0; $i < scalar @scriptCodeToName; ++$i) {
 911   print HEADER "  MOZ_SCRIPT_", $scriptCodeToName[$i], " = ", $i, ",\n";
 912 }
 913 print HEADER "\n  MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
 914 print HEADER "\n  MOZ_SCRIPT_INVALID = -1\n";
 915 print HEADER "};\n\n";
 916
 917 print HEADER <<__END;
 918 #endif
 919 /*
 920  * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 921  */
 922 __END
 923
 924 close HEADER;
 925