Fix typo in 9b54bd30006c008b4a951331b273613d5bac3abf
[pm.git] / intl / unicharutil / tools / genUnicodePropertyData.pl
blobc7015104423996df8d33770b5efb7f6e3adcd203
1 #!/usr/bin/env perl
3 # This Source Code Form is subject to the terms of the Mozilla Public
4 # License, v. 2.0. If a copy of the MPL was not distributed with this
5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 # This tool is used to prepare lookup tables of Unicode character properties
8 # needed by gfx code to support text shaping operations. The properties are
9 # read from the Unicode Character Database and compiled into multi-level arrays
10 # for efficient lookup.
12 # To regenerate the tables in nsUnicodePropertyData.cpp:
14 # (1) Download the current Unicode data files from
16 # http://www.unicode.org/Public/UNIDATA/
18 # NB: not all the files are actually needed; currently, we require
19 # - UnicodeData.txt
20 # - Scripts.txt
21 # - EastAsianWidth.txt
22 # - BidiMirroring.txt
23 # - HangulSyllableType.txt
24 # - ReadMe.txt (to record version/date of the UCD)
25 # - Unihan_Variants.txt (from Unihan.zip)
26 # though this may change if we find a need for additional properties.
28 # The Unicode data files listed above should be together in one directory.
30 # We also require the file
31 # http://www.unicode.org/Public/security/latest/xidmodifications.txt
32 # This file should be in a sub-directory "security" immediately below the
33 # directory containing the other Unicode data files.
35 # We also require the latest data file for UTR50, currently revision-13:
36 # http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt
37 # This file should be in a sub-directory "vertical" immediately below the
38 # directory containing the other Unicode data files.
41 # (2) Run this tool using a command line of the form
43 # perl genUnicodePropertyData.pl \
44 # /path/to/harfbuzz/src \
45 # /path/to/UCD-directory
47 # This will generate (or overwrite!) the files
49 # nsUnicodePropertyData.cpp
50 # nsUnicodeScriptCodes.h
52 # in the current directory.
54 use strict;
55 use List::Util qw(first);
# Exactly two arguments are required: the HarfBuzz source directory and the
# UCD directory. Anything else is a usage error, so the help text goes to
# STDERR and the exit status is non-zero, letting calling scripts notice.
if (@ARGV != 2) {
    print STDERR <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \\
# /path/to/harfbuzz/src \\
# /path/to/UCD-directory
#
# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/, with additional resources as
# detailed in the source comments.
#
# This will generate (or overwrite!) the files
#
# nsUnicodePropertyData.cpp
# nsUnicodeScriptCodes.h
#
# in the current directory.
__EOT
    exit 1;
}
# load HB_Script and HB_Category constants

# NOTE that HB_SCRIPT_* constants are now "tag" values, NOT sequentially-allocated
# script codes as used by Glib/Pango/etc.
# We therefore define a set of MOZ_SCRIPT_* constants that are script _codes_
# compatible with those libraries, and map these to HB_SCRIPT_* _tags_ as needed.

# CHECK that this matches Pango source (as found for example at
# http://git.gnome.org/browse/pango/tree/pango/pango-script.h)
# for as many codes as that defines (currently up through Unicode 5.1)
# and the GLib enumeration
# http://developer.gnome.org/glib/2.30/glib-Unicode-Manipulation.html#GUnicodeScript
# (currently defined up through Unicode 6.0).
# Constants beyond these may be regarded as unstable for now, but we don't actually
# depend on the specific values.
#
# Maps an upper-cased script name to its numeric script code; the code is also
# the index used for @scriptCodeToTag / @scriptCodeToName.
my %scriptCode = (
    INVALID                => -1,
    COMMON                 => 0,
    INHERITED              => 1,
    ARABIC                 => 2,
    ARMENIAN               => 3,
    BENGALI                => 4,
    BOPOMOFO               => 5,
    CHEROKEE               => 6,
    COPTIC                 => 7,
    CYRILLIC               => 8,
    DESERET                => 9,
    DEVANAGARI             => 10,
    ETHIOPIC               => 11,
    GEORGIAN               => 12,
    GOTHIC                 => 13,
    GREEK                  => 14,
    GUJARATI               => 15,
    GURMUKHI               => 16,
    HAN                    => 17,
    HANGUL                 => 18,
    HEBREW                 => 19,
    HIRAGANA               => 20,
    KANNADA                => 21,
    KATAKANA               => 22,
    KHMER                  => 23,
    LAO                    => 24,
    LATIN                  => 25,
    MALAYALAM              => 26,
    MONGOLIAN              => 27,
    MYANMAR                => 28,
    OGHAM                  => 29,
    OLD_ITALIC             => 30,
    ORIYA                  => 31,
    RUNIC                  => 32,
    SINHALA                => 33,
    SYRIAC                 => 34,
    TAMIL                  => 35,
    TELUGU                 => 36,
    THAANA                 => 37,
    THAI                   => 38,
    TIBETAN                => 39,
    CANADIAN_ABORIGINAL    => 40,
    YI                     => 41,
    TAGALOG                => 42,
    HANUNOO                => 43,
    BUHID                  => 44,
    TAGBANWA               => 45,
    # unicode 4.0 additions
    BRAILLE                => 46,
    CYPRIOT                => 47,
    LIMBU                  => 48,
    OSMANYA                => 49,
    SHAVIAN                => 50,
    LINEAR_B               => 51,
    TAI_LE                 => 52,
    UGARITIC               => 53,
    # unicode 4.1 additions
    NEW_TAI_LUE            => 54,
    BUGINESE               => 55,
    GLAGOLITIC             => 56,
    TIFINAGH               => 57,
    SYLOTI_NAGRI           => 58,
    OLD_PERSIAN            => 59,
    KHAROSHTHI             => 60,
    # unicode 5.0 additions
    UNKNOWN                => 61,
    BALINESE               => 62,
    CUNEIFORM              => 63,
    PHOENICIAN             => 64,
    PHAGS_PA               => 65,
    NKO                    => 66,
    # unicode 5.1 additions
    KAYAH_LI               => 67,
    LEPCHA                 => 68,
    REJANG                 => 69,
    SUNDANESE              => 70,
    SAURASHTRA             => 71,
    CHAM                   => 72,
    OL_CHIKI               => 73,
    VAI                    => 74,
    CARIAN                 => 75,
    LYCIAN                 => 76,
    LYDIAN                 => 77,
    # unicode 5.2 additions
    AVESTAN                => 78,
    BAMUM                  => 79,
    EGYPTIAN_HIEROGLYPHS   => 80,
    IMPERIAL_ARAMAIC       => 81,
    INSCRIPTIONAL_PAHLAVI  => 82,
    INSCRIPTIONAL_PARTHIAN => 83,
    JAVANESE               => 84,
    KAITHI                 => 85,
    LISU                   => 86,
    MEETEI_MAYEK           => 87,
    OLD_SOUTH_ARABIAN      => 88,
    OLD_TURKIC             => 89,
    SAMARITAN              => 90,
    TAI_THAM               => 91,
    TAI_VIET               => 92,
    # unicode 6.0 additions
    BATAK                  => 93,
    BRAHMI                 => 94,
    MANDAIC                => 95,
    # unicode 6.1 additions
    CHAKMA                 => 96,
    MEROITIC_CURSIVE       => 97,
    MEROITIC_HIEROGLYPHS   => 98,
    MIAO                   => 99,
    SHARADA                => 100,
    SORA_SOMPENG           => 101,
    TAKRI                  => 102,
    # unicode 7.0 additions
    BASSA_VAH              => 103,
    CAUCASIAN_ALBANIAN     => 104,
    DUPLOYAN               => 105,
    ELBASAN                => 106,
    GRANTHA                => 107,
    KHOJKI                 => 108,
    KHUDAWADI              => 109,
    LINEAR_A               => 110,
    MAHAJANI               => 111,
    MANICHAEAN             => 112,
    MENDE_KIKAKUI          => 113,
    MODI                   => 114,
    MRO                    => 115,
    NABATAEAN              => 116,
    OLD_NORTH_ARABIAN      => 117,
    OLD_PERMIC             => 118,
    PAHAWH_HMONG           => 119,
    PALMYRENE              => 120,
    PAU_CIN_HAU            => 121,
    PSALTER_PAHLAVI        => 122,
    SIDDHAM                => 123,
    TIRHUTA                => 124,
    WARANG_CITI            => 125,

    # additional "script" code, not from Unicode (but matches ISO 15924's Zmth tag)
    MATHEMATICAL_NOTATION  => 126,
);
# State filled in by readHarfBuzzHeader.
my $sc = -1;            # last script code recorded; -1 means no HB_SCRIPT_* line matched
my $cc = -1;            # last category code assigned; -1 means no category line matched
my %catCode;            # general-category name => sequential code
my @scriptCodeToTag;    # script code => HB_TAG argument text, e.g. 'L','a','t','n'
my @scriptCodeToName;   # script code => script name
# Scan one HarfBuzz header, named relative to the directory given in $ARGV[0],
# and record the constants it declares:
#   - each "HB_SCRIPT_xxx = HB_TAG('a','b','c','d')," line stores the tag text
#     and the name in @scriptCodeToTag / @scriptCodeToName at the index that
#     %scriptCode assigns to that script; names missing from %scriptCode are
#     reported with a warning and skipped;
#   - each line mentioning HB_UNICODE_GENERAL_CATEGORY_xxx receives the next
#     sequential code in %catCode.
# $sc and $cc are left holding the last script / category code seen, so the
# caller can detect that nothing was found. Dies if the header cannot be opened.
sub readHarfBuzzHeader
{
    my $file = shift;
    my $path = "$ARGV[0]/$file";
    # Three-argument open with a lexical handle: the path is never parsed for
    # mode characters and the handle cannot clash with other readers.
    open my $header, '<', $path or die "can't open harfbuzz header $path: $!\n";
    while (<$header>) {
        s/CANADIAN_SYLLABICS/CANADIAN_ABORIGINAL/; # harfbuzz and unicode disagree on this name :(
        if (m/HB_SCRIPT_([A-Z_]+)\s*=\s*HB_TAG\s*\(('.','.','.','.')\)\s*,/) {
            my ($name, $tag) = ($1, $2);
            unless (exists $scriptCode{$name}) {
                warn "unknown script name $name found in $file\n";
                next;
            }
            $sc = $scriptCode{$name};
            $scriptCodeToTag[$sc] = $tag;
            $scriptCodeToName[$sc] = $name;
        }
        if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
            $cc++;
            $catCode{$1} = $cc;
        }
    }
    close $header;
}
# Pull script tags and general-category codes out of the HarfBuzz headers.
readHarfBuzzHeader("hb-common.h");
readHarfBuzzHeader("hb-unicode.h");

# Both counters start at -1 and only move when a matching line was seen.
die "didn't find HarfBuzz script codes\n" if $sc == -1;
die "didn't find HarfBuzz category codes\n" if $cc == -1;

# Additional code not present in HarfBuzz headers:
$sc = $scriptCode{"MATHEMATICAL_NOTATION"};
$scriptCodeToTag[$sc] = "'Z','m','t','h'";
$scriptCodeToName[$sc] = "MATHEMATICAL_NOTATION";
# Identifier-modification keyword (third field of xidmodifications.txt) =>
# numeric code stored in @xidmod.
my %xidmodCode = (
    'inclusion'         => 0,
    'recommended'       => 1,
    'default-ignorable' => 2,
    'historic'          => 3,
    'limited-use'       => 4,
    'not-NFKC'          => 5,
    'not-xid'           => 6,
    'obsolete'          => 7,
    'technical'         => 8,
    'not-chars'         => 9
);
# Bidi class abbreviation (UnicodeData.txt field 4) => code stored in
# @bidicategory.
# NOTE(review): a class missing from this table is stored as undef and later
# formatted by "%d" as 0, the same code as "L" -- confirm that is acceptable
# for any classes the UCD version in use adds.
my %bidicategoryCode = (
    "L"   => "0",  # Left-to-Right
    "R"   => "1",  # Right-to-Left
    "EN"  => "2",  # European Number
    "ES"  => "3",  # European Number Separator
    "ET"  => "4",  # European Number Terminator
    "AN"  => "5",  # Arabic Number
    "CS"  => "6",  # Common Number Separator
    "B"   => "7",  # Paragraph Separator
    "S"   => "8",  # Segment Separator
    "WS"  => "9",  # Whitespace
    "ON"  => "10", # Other Neutrals
    "LRE" => "11", # Left-to-Right Embedding
    "LRO" => "12", # Left-to-Right Override
    "AL"  => "13", # Right-to-Left Arabic
    "RLE" => "14", # Right-to-Left Embedding
    "RLO" => "15", # Right-to-Left Override
    "PDF" => "16", # Pop Directional Format
    "NSM" => "17", # Non-Spacing Mark
    "BN"  => "18"  # Boundary Neutral
);
# Vertical_Orientation value (UTR50 data file) => code stored in
# @verticalOrientation.
my %verticalOrientationCode = (
    'U'  => 0, # U - Upright, the same orientation as in the code charts
    'R'  => 1, # R - Rotated 90 degrees clockwise compared to the code charts
    'Tu' => 2, # Tu - Transformed typographically, with fallback to Upright
    'Tr' => 3  # Tr - Transformed typographically, with fallback to Rotated
);
# initialize default properties
#
# One array per property, indexed by code point. @eaw, @mirror and @hangul are
# deliberately left unpopulated here; they are only filled for code points that
# their data files list.
my @script;
my @category;
my @combining;
my @eaw;
my @mirror;
my @hangul;
my @casemap;
my @xidmod;
my @numericvalue;
my @hanVariant;
my @bidicategory;
my @fullWidth;
my @verticalOrientation;
for my $i (0 .. 0x10FFFF) {
    $script[$i] = $scriptCode{"UNKNOWN"};
    $category[$i] = $catCode{"UNASSIGNED"};
    $combining[$i] = 0;
    $casemap[$i] = 0;
    $xidmod[$i] = $xidmodCode{"not-chars"};
    $numericvalue[$i] = -1;
    $hanVariant[$i] = 0;
    $bidicategory[$i] = $bidicategoryCode{"L"};
    $fullWidth[$i] = 0;
    $verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
}
# blocks where the default for bidi category is not L
for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
    $bidicategory[$i] = $bidicategoryCode{"AL"};
}
for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
    $bidicategory[$i] = $bidicategoryCode{"R"};
}
for my $i (0x20A0..0x20CF) {
    $bidicategory[$i] = $bidicategoryCode{"ET"};
}
# Two-letter UCD General_Category abbreviation => suffix of the corresponding
# HB_UNICODE_GENERAL_CATEGORY_* constant (the key used in %catCode).
my %ucd2hb = (
    'Cc' => 'CONTROL',
    'Cf' => 'FORMAT',
    'Cn' => 'UNASSIGNED',
    'Co' => 'PRIVATE_USE',
    'Cs' => 'SURROGATE',
    'Ll' => 'LOWERCASE_LETTER',
    'Lm' => 'MODIFIER_LETTER',
    'Lo' => 'OTHER_LETTER',
    'Lt' => 'TITLECASE_LETTER',
    'Lu' => 'UPPERCASE_LETTER',
    'Mc' => 'SPACING_MARK',
    'Me' => 'ENCLOSING_MARK',
    'Mn' => 'NON_SPACING_MARK',
    'Nd' => 'DECIMAL_NUMBER',
    'Nl' => 'LETTER_NUMBER',
    'No' => 'OTHER_NUMBER',
    'Pc' => 'CONNECT_PUNCTUATION',
    'Pd' => 'DASH_PUNCTUATION',
    'Pe' => 'CLOSE_PUNCTUATION',
    'Pf' => 'FINAL_PUNCTUATION',
    'Pi' => 'INITIAL_PUNCTUATION',
    'Po' => 'OTHER_PUNCTUATION',
    'Ps' => 'OPEN_PUNCTUATION',
    'Sc' => 'CURRENCY_SYMBOL',
    'Sk' => 'MODIFIER_SYMBOL',
    'Sm' => 'MATH_SYMBOL',
    'So' => 'OTHER_SYMBOL',
    'Zl' => 'LINE_SEPARATOR',
    'Zp' => 'PARAGRAPH_SEPARATOR',
    'Zs' => 'SPACE_SEPARATOR'
);
# read ReadMe.txt
# Every line is kept verbatim; the collected text is later embedded in the
# generated files as the record of which UCD version was used.
my @versionInfo;
open my $readmeFh, '<', "$ARGV[1]/ReadMe.txt" or die "can't open Unicode ReadMe.txt file: $!\n";
while (<$readmeFh>) {
    chomp;
    push @versionInfo, $_;
}
close $readmeFh;
# Flag bits OR-ed into each @casemap entry to say which kind of mapping it
# holds; the bits under $kCaseMapCharMask carry (code point XOR mapped char).
my $kTitleToUpper = 0x80000000;
my $kUpperToLower = 0x40000000;
my $kLowerToTitle = 0x20000000;
my $kLowerToUpper = 0x10000000;
my $kCaseMapCharMask = 0x001fffff;
# read UnicodeData.txt
#
# Each record is a ';'-separated line. Fields used here: 0 code point, 1 name,
# 2 general category, 3 combining class, 4 bidi class, 5 decomposition,
# 7 digit value, 12/13/14 upper/lower/title case mappings.
open my $ucdFh, '<', "$ARGV[1]/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt: $!\n";
while (<$ucdFh>) {
    chomp;
    my @fields = split /;/;
    if ($fields[1] =~ /First/) {
        # A "<..., First>" record is completed by the "<..., Last>" record on
        # the following line; every code point in between shares its values.
        my $first = hex "0x$fields[0]";
        $_ = <$ucdFh>;
        die "didn't find Last code for range!\n" unless defined $_;
        chomp;
        @fields = split /;/;
        if ($fields[1] =~ /Last/) {
            my $last = hex "0x$fields[0]";
            for my $cp ($first .. $last) {
                $category[$cp] = $catCode{$ucd2hb{$fields[2]}};
                $combining[$cp] = $fields[3];
                $bidicategory[$cp] = $bidicategoryCode{$fields[4]};
                unless (length($fields[7]) == 0) {
                    $numericvalue[$cp] = $fields[7];
                }
                if ($fields[1] =~ /CJK/) {
                    # scalar element, not a one-element slice
                    $hanVariant[$cp] = 3;
                }
            }
        } else {
            die "didn't find Last code for range!\n";
        }
    } else {
        my $usv = hex "0x$fields[0]";
        $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
        $combining[$usv] = $fields[3];
        my $upper = hex $fields[12];
        my $lower = hex $fields[13];
        my $title = hex $fields[14];
        # we only store one mapping for each character,
        # but also record what kind of mapping it is
        if ($upper && $lower) {
            $casemap[$usv] |= $kTitleToUpper;
            $casemap[$usv] |= ($usv ^ $upper);
        }
        elsif ($lower) {
            $casemap[$usv] |= $kUpperToLower;
            $casemap[$usv] |= ($usv ^ $lower);
        }
        elsif ($title && ($title != $upper)) {
            $casemap[$usv] |= $kLowerToTitle;
            $casemap[$usv] |= ($usv ^ $title);
        }
        elsif ($upper) {
            $casemap[$usv] |= $kLowerToUpper;
            $casemap[$usv] |= ($usv ^ $upper);
        }
        $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
        unless (length($fields[7]) == 0) {
            $numericvalue[$usv] = $fields[7];
        }
        if ($fields[1] =~ /CJK/) {
            # scalar element, not a one-element slice
            $hanVariant[$usv] = 3;
        }
        # <narrow>/<wide> compatibility decompositions pair a halfwidth and a
        # fullwidth form; @fullWidth is indexed by the narrow character and
        # holds the wide one. The substr offset skips the tag and the space.
        if ($fields[5] =~ /^<narrow>/) {
            my $wideChar = hex(substr($fields[5], 9));
            die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
            $fullWidth[$usv] = $wideChar;
        }
        elsif ($fields[5] =~ /^<wide>/) {
            my $narrowChar = hex(substr($fields[5], 7));
            die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
            $fullWidth[$narrowChar] = $usv;
        }
    }
}
close $ucdFh;
# read Scripts.txt
open my $scriptsFh, '<', "$ARGV[1]/Scripts.txt" or die "can't open UCD file Scripts.txt: $!\n";
push @versionInfo, "";
# Copy the file's leading lines, up to and including its "Date:" line, into
# the version record.
while (<$scriptsFh>) {
    chomp;
    push @versionInfo, $_;
    last if /Date:/;
}
# Data lines look like "XXXX[..YYYY] ; Script_Name ...".
while (<$scriptsFh>) {
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
        my ($firstHex, $lastHex, $name) = ($1, $2, uc($3));
        warn "unknown script $name" unless exists $scriptCode{$name};
        my $code = $scriptCode{$name};
        my $start = hex "0x$firstHex";
        my $end = defined($lastHex) ? hex("0x$lastHex") : $start;
        for my $i ($start .. $end) {
            $script[$i] = $code;
        }
    }
}
close $scriptsFh;
# read EastAsianWidth.txt
# East_Asian_Width value => code stored in @eaw.
my %eawCode = (
    'A'  => 0, #         ; Ambiguous
    'F'  => 1, #         ; Fullwidth
    'H'  => 2, #         ; Halfwidth
    'N'  => 3, #         ; Neutral
    'NA' => 4, #         ; Narrow
    'W'  => 5  #         ; Wide
);
open my $eawFh, '<', "$ARGV[1]/EastAsianWidth.txt" or die "can't open UCD file EastAsianWidth.txt: $!\n";
push @versionInfo, "";
# Copy the file's leading lines, up to and including its "Date:" line, into
# the version record.
while (<$eawFh>) {
    chomp;
    push @versionInfo, $_;
    last if /Date:/;
}
while (<$eawFh>) {
    chomp;      # keep a line terminator out of the captured value
    s/#.*//;
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
        my ($firstHex, $lastHex, $width) = ($1, $2, uc($3));
        warn "unknown EAW code $width" unless exists $eawCode{$width};
        my $code = $eawCode{$width};
        my $start = hex "0x$firstHex";
        my $end = defined($lastHex) ? hex("0x$lastHex") : $start;
        for my $i ($start .. $end) {
            $eaw[$i] = $code;
        }
    }
}
close $eawFh;
# read BidiMirroring.txt
# @offsets collects the distinct (mirror - char) differences; @mirror stores,
# per code point, the index of its difference in @offsets. Slot 0 holds
# offset 0.
my @offsets = ();
push @offsets, 0;

open my $mirrorFh, '<', "$ARGV[1]/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt: $!\n";
push @versionInfo, "";
# Copy the file's leading lines, up to and including its "Date:" line, into
# the version record.
while (<$mirrorFh>) {
    chomp;
    push @versionInfo, $_;
    last if /Date:/;
}
while (<$mirrorFh>) {
    s/#.*//;
    if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
        my $char = hex("0x$1");
        my $mirrorOffset = hex("0x$2") - $char;
        my $offsetIndex = first { $offsets[$_] == $mirrorOffset } 0..$#offsets;
        # first() yields undef when nothing matched. Test definedness rather
        # than "== undef", which is a numeric comparison with 0 and would also
        # treat a genuine index 0 as "not found".
        unless (defined $offsetIndex) {
            die "too many offset codes\n" if scalar @offsets == 31;
            push @offsets, $mirrorOffset;
            $offsetIndex = $#offsets;
        }
        $mirror[$char] = $offsetIndex;
    }
}
close $mirrorFh;
# read HangulSyllableType.txt
# Hangul_Syllable_Type value => bit pattern stored in @hangul; LV and LVT are
# the bitwise OR of their component flags.
my %hangulType = (
    'L'   => 0x01,
    'V'   => 0x02,
    'T'   => 0x04,
    'LV'  => 0x03,
    'LVT' => 0x07
);
open my $hangulFh, '<', "$ARGV[1]/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt: $!\n";
push @versionInfo, "";
# Copy the file's leading lines, up to and including its "Date:" line, into
# the version record.
while (<$hangulFh>) {
    chomp;
    push @versionInfo, $_;
    last if /Date:/;
}
while (<$hangulFh>) {
    chomp;      # keep a line terminator out of the captured value
    s/#.*//;
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
        my ($firstHex, $lastHex, $type) = ($1, $2, uc($3));
        warn "unknown Hangul syllable type" unless exists $hangulType{$type};
        my $code = $hangulType{$type};
        my $start = hex "0x$firstHex";
        my $end = defined($lastHex) ? hex("0x$lastHex") : $start;
        for my $i ($start .. $end) {
            $hangul[$i] = $code;
        }
    }
}
close $hangulFh;
# read xidmodifications.txt
open my $xidFh, '<', "$ARGV[1]/security/xidmodifications.txt" or die "can't open UCD file xidmodifications.txt: $!\n";
push @versionInfo, "";
# Copy the leading lines, up to and including the "Generated:" line, into the
# version record, leaving out any line that contains a UTF-8 byte-order mark.
while (<$xidFh>) {
    chomp;
    unless (/\xef\xbb\xbf/) {
        push @versionInfo, $_;
    }
    last if /Generated:/;
}
# Data lines have three ';'-separated fields; the third is the modification
# keyword looked up in %xidmodCode.
while (<$xidFh>) {
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+[^ ]+\s+;\s+([^ ]+)/) {
        my ($firstHex, $lastHex, $keyword) = ($1, $2, $3);
        warn "unknown Identifier Modification $keyword" unless exists $xidmodCode{$keyword};
        my $code = $xidmodCode{$keyword};
        my $start = hex "0x$firstHex";
        my $end = defined($lastHex) ? hex("0x$lastHex") : $start;
        for my $i ($start .. $end) {
            $xidmod[$i] = $code;
        }
    }
}
close $xidFh;
# special case U+30FB KATAKANA MIDDLE DOT -- see bug 857490
$xidmod[0x30FB] = 1;
# read Unihan_Variants.txt
open my $unihanFh, '<', "$ARGV[1]/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip): $!\n";
push @versionInfo, "";
# Copy the file's leading lines, up to and including its "Date:" line, into
# the version record.
while (<$unihanFh>) {
    chomp;
    push @versionInfo, $_;
    last if /Date:/;
}
# Entries for one code point are consecutive. While reading them we note
# whether a kTraditionalVariant and/or a kSimplifiedVariant entry was seen,
# and store the result once the run for that code point is over:
#   1 = only a Traditional variant listed, 2 = only a Simplified variant
#   listed; otherwise the existing value is left alone.
my $savedusv = 0;
my $hasTC = 0;
my $hasSC = 0;
my $storeHanVariant = sub {
    return if $savedusv == 0;
    if ($hasTC && !$hasSC) {
        $hanVariant[$savedusv] = 1;
    } elsif (!$hasTC && $hasSC) {
        $hanVariant[$savedusv] = 2;
    }
};
while (<$unihanFh>) {
    chomp;
    if (m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
        my $usv = hex("0x$1");
        my $kind = $2;
        if ($usv != $savedusv) {
            $storeHanVariant->();
            $savedusv = $usv;
            $hasTC = 0;
            $hasSC = 0;
        }
        $hasTC = 1 if $kind eq "Traditional";
        $hasSC = 1 if $kind eq "Simplified";
    }
}
# The final code point has no following entry to trigger the store above, so
# flush it explicitly; without this its classification is silently dropped.
$storeHanVariant->();
close $unihanFh;
# read VerticalOrientation-13.txt
open my $vertFh, '<', "$ARGV[1]/vertical/VerticalOrientation-13.txt" or die "can't open UTR50 data file VerticalOrientation-13.txt: $!\n";
push @versionInfo, "";
# Copy the file's leading lines, up to and including its "Date:" line, into
# the version record.
while (<$vertFh>) {
    chomp;
    push @versionInfo, $_;
    last if /Date:/;
}
while (<$vertFh>) {
    chomp;
    s/#.*//;
    if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
        my ($firstHex, $lastHex, $vo) = ($1, $2, $3);
        warn "unknown Vertical_Orientation code $vo"
            unless exists $verticalOrientationCode{$vo};
        my $code = $verticalOrientationCode{$vo};
        my $start = hex "0x$firstHex";
        my $end = defined($lastHex) ? hex("0x$lastHex") : $start;
        for my $i ($start .. $end) {
            $verticalOrientation[$i] = $code;
        }
    }
}
close $vertFh;
# Open both output files and write their common prologue: license text, the
# generation timestamp and the version lines gathered from the input files.
# NOTE(review): the C comment delimiters inside the text below were restored
# so the emitted prologue is a well-formed comment -- compare with a
# previously generated nsUnicodePropertyData.cpp before committing.
my $timestamp = gmtime();

open DATA_TABLES, '>', "nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";

my $licenseBlock = q[
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Derived from the Unicode Character Database by genUnicodePropertyData.pl
 *
 * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
 */
];

my $versionInfo = join("\n", @versionInfo);

print DATA_TABLES <<__END;
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#include <stdint.h>
#include "harfbuzz/hb.h"

__END

open HEADER, '>', "nsUnicodeScriptCodes.h" or die "unable to open nsUnicodeScriptCodes.h for output";

print HEADER <<__END;
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#ifndef NS_UNICODE_SCRIPT_CODES
#define NS_UNICODE_SCRIPT_CODES

__END
# Script code => HarfBuzz tag table, one HB_TAG(...) entry per script code.
print DATA_TABLES "static const uint32_t sScriptCodeToTag[] = {\n";
for my $i (0 .. $#scriptCodeToTag) {
    printf DATA_TABLES " HB_TAG(%s)", $scriptCodeToTag[$i];
    print DATA_TABLES ($i < $#scriptCodeToTag ? ",\n" : "\n");
}
print DATA_TABLES "};\n\n";

# Running total of the table sizes reported by genTables.
our $totalData = 0;

# The distinct mirroring offsets collected from BidiMirroring.txt.
print DATA_TABLES "static const int16_t sMirrorOffsets[] = {\n";
for my $i (0 .. $#offsets) {
    # plain print: the value must not be interpreted as a printf format
    print DATA_TABLES " $offsets[$i]";
    print DATA_TABLES ($i < $#offsets ? ",\n" : "\n");
}
print DATA_TABLES "};\n\n";

print HEADER "#pragma pack(1)\n\n";
# genTables callback: format the nsCharProps1 initializer for one code point
# (mirror-offset index, Hangul type, combining class). Values that were never
# set are formatted by "%d" as 0.
sub sprintCharProps1
{
    my $usv = shift;
    return sprintf("{%d,%d,%d}, ", $mirror[$usv], $hangul[$usv], $combining[$usv]);
}
# C declaration written to the header for the CharProp1 value type; its
# bit-fields take the three values produced by sprintCharProps1, in order.
my $type = q/
struct nsCharProps1 {
  unsigned char mMirrorOffsetIndex:5;
  unsigned char mHangulType:3;
  unsigned char mCombiningClass:8;
};
/;
genTables("CharProp1", $type, "nsCharProps1", 11, 5, \&sprintCharProps1, 1, 2, 1);
# genTables callback: format the nsCharProps2 initializer for one code point
# (script, East Asian width, general category, bidi category, xidmod,
# numeric value, vertical orientation).
sub sprintCharProps2
{
    my $usv = shift;
    return sprintf("{%d,%d,%d,%d,%d,%d,%d},",
                   $script[$usv], $eaw[$usv], $category[$usv],
                   $bidicategory[$usv], $xidmod[$usv], $numericvalue[$usv],
                   $verticalOrientation[$usv]);
}
# C declaration written to the header for the CharProp2 value type; its
# bit-fields take the seven values produced by sprintCharProps2, in order.
$type = q/
struct nsCharProps2 {
  unsigned char mScriptCode:8;
  unsigned char mEAW:3;
  unsigned char mCategory:5;
  unsigned char mBidiCategory:5;
  unsigned char mXidmod:4;
  signed char mNumericValue:5;
  unsigned char mVertOrient:2;
};
/;
genTables("CharProp2", $type, "nsCharProps2", 11, 5, \&sprintCharProps2, 16, 4, 1);

print HEADER "#pragma pack()\n\n";
# genTables callback: pack the 2-bit @hanVariant values of four consecutive
# code points, starting at the given one, into a single byte (first code point
# in the lowest two bits) and format it as a hex literal.
sub sprintHanVariants
{
    my $baseUsv = shift;
    my $val = 0;
    for (my $varShift = 0; $varShift < 8; $varShift += 2) {
        $val |= $hanVariant[$baseUsv++] << $varShift;
    }
    return sprintf("0x%02x,", $val);
}
# Han variant table: each uint8_t entry covers four code points.
genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
# genTables callback: format the @fullWidth entry of one code point as a
# 16-bit hex literal.
sub sprintFullWidth
{
    my $usv = shift;
    return sprintf("0x%04x,", $fullWidth[$usv]);
}
# Full-width mapping table: plane 0 only (maxPlane 0).
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
# genTables callback: format the @casemap entry of one code point (flag bits
# plus XOR-encoded mapping) as a 32-bit hex literal.
sub sprintCasemap
{
    my $usv = shift;
    return sprintf("0x%08x,", $casemap[$usv]);
}
# Case-mapping table, followed by the size summary and the flag constants
# needed to decode its entries.
genTables("CaseMap", "", "uint32_t", 11, 5, \&sprintCasemap, 1, 4, 1);

print STDERR "Total data = $totalData\n";

printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
# Emit one property as a multi-level lookup table (plane -> page -> value) in
# which identical pages and identical page maps are stored only once.
#
# Arguments:
#   $prefix        name fragment used in the generated identifiers
#                  (k<prefix>MaxPlane, s<prefix>Planes, s<prefix>Pages, ...)
#   $typedef       C declaration text printed to HEADER, or '' for none
#   $type          C type of the entries of s<prefix>Values
#   $indexBits     log2 of the number of pages per plane
#   $charBits      log2 of the number of code points per page
#   $func          code ref called with a code point; returns the text of one
#                  entry including a one-character trailing separator
#   $maxPlane      highest plane covered (0 = plane 0 only)
#   $bytesPerEntry entry size, used only for the size statistics
#   $charsPerEntry number of code points described by each entry
#
# Writes the tables to DATA_TABLES, adds the computed size to $totalData and
# reports it on STDERR.
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
    print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
    print DATA_TABLES "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex = ();     # page text => index into @char
    my %pageMapIndex = ();  # packed page map => index into @pageMap
    my @pageMap = ();
    my @char = ();

    # One byte per plane above plane 0, holding that plane's page-map index.
    my $planeMap = "\x00" x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        # Two bytes per page. The parentheses matter: "x" and "*" share one
        # precedence level, so without them this would be (string x n) * 2.
        my $pageMap = "\x00" x ($indexLen * 2);
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            chop $charValues;   # drop the last character of the final entry

            unless (exists $charIndex{$charValues}) {
                my $newIndex = scalar keys %charIndex;
                $charIndex{$charValues} = $newIndex;
                $char[$newIndex] = $charValues;
            }
            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
        }

        unless (exists $pageMapIndex{$pageMap}) {
            my $newIndex = scalar keys %pageMapIndex;
            $pageMapIndex{$pageMap} = $newIndex;
            $pageMap[$newIndex] = $pageMap;
        }
        if ($plane > 0) {
            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
        }
    }

    if ($maxPlane) {
        print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
        print DATA_TABLES "};\n\n";
    }

    my $chCount = scalar @char;
    my $pmBits = $chCount > 255 ? 16 : 8;   # element width needed for a page index
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    for my $i (0 .. $#pageMap) {
        print DATA_TABLES ($maxPlane > 0 ? " {" : " ");
        print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
        print DATA_TABLES ($maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n");
    }
    print DATA_TABLES "};\n\n";

    print HEADER "$typedef\n\n" if $typedef ne '';

    my $pageLen = $charsPerPage / $charsPerEntry;
    print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    for my $i (0 .. $#char) {
        print DATA_TABLES " {";
        print DATA_TABLES $char[$i];
        print DATA_TABLES ($i < $#char ? "},\n" : "}\n");
    }
    print DATA_TABLES "};\n\n";

    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
}
# Closing banner of nsUnicodePropertyData.cpp. The close is checked because
# buffered write errors only surface at that point.
print DATA_TABLES <<__END;
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

close DATA_TABLES or die "error writing nsUnicodePropertyData.cpp: $!\n";
# MOZ_SCRIPT_* enumeration: one constant per index of @scriptCodeToName, then
# the count and the INVALID marker.
print HEADER "enum {\n";
for my $i (0 .. $#scriptCodeToName) {
    print HEADER " MOZ_SCRIPT_", $scriptCodeToName[$i], " = ", $i, ",\n";
}
print HEADER "\n MOZ_NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
print HEADER "\n MOZ_SCRIPT_INVALID = -1\n";
print HEADER "};\n\n";
# Close the include guard of nsUnicodeScriptCodes.h and finish the file. The
# close is checked because buffered write errors only surface at that point.
print HEADER <<__END;
#endif
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

close HEADER or die "error writing nsUnicodeScriptCodes.h: $!\n";