workspace/kcontrol/kfontinst/viewpart/generate-unicode-tables.pl

   1 #!/usr/bin/perl -w
   2 #
   3 # Note: This file is taken, and modified, from gucharmap/gen-guch-unicode-tables.pl - svn revision 1040
   4 #
   5 # $Id$
   6 #
# generates in the current directory:
#  - UnicodeBlocks.h
#  - UnicodeCategories.h
#  - UnicodeScripts.h
# (generation of unicode-names.h, unicode-nameslist.h and unicode-unihan.h,
#  inherited from gucharmap, is currently disabled)
#
# usage: ./generate-unicode-tables.pl UNICODE-VERSION DIRECTORY
# where DIRECTORY contains UnicodeData.txt Blocks.txt Scripts.txt
# (Unihan.zip and NamesList.txt are only needed by the disabled steps)
  17 #
  18
  19 use strict;
  20 use vars ('$UNZIP', '$ICONV');
  21
  22 # if these things aren't in your path you can put full paths to them here
  23 $UNZIP = 'unzip';
  24 $ICONV = 'iconv';
  25
  26 sub process_unicode_data_txt ($);
  27 sub process_unihan_zip ($);
  28 sub process_nameslist_txt ($);
  29 sub process_blocks_txt ($);
  30 sub process_scripts_txt ($);
  31
  32 $| = 1;  # flush stdout buffer
  33
  34 if (@ARGV != 2)
  35 {
  36     $0 =~ s@.*/@@;
  37     die <<EOF
  38
  39 Usage: $0 UNICODE-VERSION DIRECTORY
  40
  41 DIRECTORY should contain the following Unicode data files:
  42 UnicodeData.txt Unihan.zip NamesList.txt Blocks.txt Scripts.txt
  43
  44 which can be found at http://www.unicode.org/Public/UNIDATA/
  45
  46 EOF
  47 }
  48
  49 my ($unicodedata_txt, $unihan_zip, $nameslist_txt, $blocks_txt, $scripts_txt);
  50
  51 my $v = $ARGV[0];
  52 my $d = $ARGV[1];
  53 opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
  54 for my $f (readdir ($dir))
  55 {
  56     $unicodedata_txt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/);
  57 #     $unihan_zip = "$d/$f" if ($f =~ /Unihan.*\.zip/);
  58 #     $nameslist_txt = "$d/$f" if ($f =~ /NamesList.*\.txt/);
  59     $blocks_txt = "$d/$f" if ($f =~ /Blocks.*\.txt/);
  60     $scripts_txt = "$d/$f" if ($f =~ /Scripts.*\.txt/);
  61 }
  62
  63 defined $unicodedata_txt or die "Did not find $d/UnicodeData.txt";
  64 # defined $unihan_zip or die "Did not find $d/Unihan.zip";
  65 # defined $nameslist_txt or die "Did not find $d/NamesList.txt";
  66 defined $blocks_txt or die "Did not find $d/Blocks.txt";
  67 defined $scripts_txt or die "Did not find $d/Scripts.txt";
  68
  69 process_unicode_data_txt ($unicodedata_txt);
  70 # process_nameslist_txt ($nameslist_txt);
  71 process_blocks_txt ($blocks_txt);
  72 process_scripts_txt ($scripts_txt);
  73 # process_unihan_zip ($unihan_zip);
  74
  75 exit;
  76
  77
  78 #------------------------#
  79
  80 sub process_unicode_data_txt ($)
  81 {
  82     my ($unicodedata_txt) = @_;
  83
  84     # part 1: names
  85
  86     open (my $unicodedata, $unicodedata_txt) or die;
  87 #     open (my $out, "> unicode-names.h") or die;
  88
  89     print "processing $unicodedata_txt...";
  90 #
  91 #     print $out "/* unicode-names.h */\n";
  92 #     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
  93 #     print $out "/* Generated by $0 */\n";
  94 #     print $out "/* Generated from UCD version $v */\n\n";
  95 #
  96 #     print $out "#ifndef UNICODE_NAMES_H\n";
  97 #     print $out "#define UNICODE_NAMES_H\n\n";
  98 #
  99 #     print $out "#include <glib/gunicode.h>\n\n";
 100 #     print $out "#include \"gucharmap-intl.h\"\n\n";
 101 #
 102 #     my @unicode_pairs;
 103 #     my %names;
 104 #
 105 #     while (my $line = <$unicodedata>)
 106 #     {
 107 #         chomp $line;
 108 #         $line =~ /^([^;]+);([^;]+)/ or die;
 109 #
 110 #         my $hex = $1;
 111 #         my $name = $2;
 112 #
 113 #         $names{$name} = 1;
 114 #         push @unicode_pairs, [$hex, $name];
 115 #     }
 116 #
 117 #     print $out "static const char unicode_names_strings[] = \\\n";
 118 #
 119 #     my $offset = 0;
 120 #
 121 #     foreach my $name (sort keys %names) {
 122 #       print $out "  \"$name\\0\"\n";
 123 #       $names{$name} = $offset;
 124 #       $offset += length($name) + 1;
 125 #     }
 126 #
 127 #     undef $offset;
 128 #
 129 #     print $out ";\n";
 130 #
 131 #     print $out "typedef struct _UnicodeName UnicodeName;\n\n";
 132 #
 133 #     print $out "static const struct _UnicodeName\n";
 134 #     print $out "{\n";
 135 #     print $out "  gunichar index;\n";
 136 #     print $out "  guint32 name_offset;\n";
 137 #     print $out "} \n";
 138 #     print $out "unicode_names[] =\n";
 139 #     print $out "{\n";
 140 #
 141 #     my $first_line = 1;
 142 #
 143 #     foreach my $pair (@unicode_pairs) {
 144 #       if (!$first_line) {
 145 #           print $out ",\n";
 146 #       } else {
 147 #           $first_line = 0;
 148 #       }
 149 #
 150 #       my ($hex, $name) = @{$pair};
 151 #       my $offset = $names{$name};
 152 #       print $out "  {0x$hex, $offset}";
 153 #     }
 154 #
 155 #     print $out "\n};\n\n";
 156 #
 157 #     print $out <<EOT;
 158 # static inline const char * unicode_name_get_name(const UnicodeName *entry)
 159 # {
 160 #   guint32 offset = entry->name_offset;
 161 #   return unicode_names_strings + offset;
 162 # }
 163 #
 164 # EOT
 165 #
 166 #     print $out "#endif  /* #ifndef UNICODE_NAMES_H */\n";
 167 #
 168 #     undef %names;
 169 #     undef @unicode_pairs;
 170 #
 171 #     close ($unicodedata);
 172 #     close ($out);
 173
 174     # part 2: categories
 175
 176     open ($unicodedata, $unicodedata_txt) or die;
 177     open (my $out, "> UnicodeCategories.h") or die;
 178
 179     # Map general category code onto symbolic name.
 180     my %mappings =
 181     (
 182         # Normative.
 183         'Lu' => "UNICODE_UPPERCASE_LETTER",
 184         'Ll' => "UNICODE_LOWERCASE_LETTER",
 185         'Lt' => "UNICODE_TITLECASE_LETTER",
 186         'Mn' => "UNICODE_NON_SPACING_MARK",
 187         'Mc' => "UNICODE_COMBINING_MARK",
 188         'Me' => "UNICODE_ENCLOSING_MARK",
 189         'Nd' => "UNICODE_DECIMAL_NUMBER",
 190         'Nl' => "UNICODE_LETTER_NUMBER",
 191         'No' => "UNICODE_OTHER_NUMBER",
 192         'Zs' => "UNICODE_SPACE_SEPARATOR",
 193         'Zl' => "UNICODE_LINE_SEPARATOR",
 194         'Zp' => "UNICODE_PARAGRAPH_SEPARATOR",
 195         'Cc' => "UNICODE_CONTROL",
 196         'Cf' => "UNICODE_FORMAT",
 197         'Cs' => "UNICODE_SURROGATE",
 198         'Co' => "UNICODE_PRIVATE_USE",
 199         'Cn' => "UNICODE_UNASSIGNED",
 200
 201         # Informative.
 202         'Lm' => "UNICODE_MODIFIER_LETTER",
 203         'Lo' => "UNICODE_OTHER_LETTER",
 204         'Pc' => "UNICODE_CONNECT_PUNCTUATION",
 205         'Pd' => "UNICODE_DASH_PUNCTUATION",
 206         'Ps' => "UNICODE_OPEN_PUNCTUATION",
 207         'Pe' => "UNICODE_CLOSE_PUNCTUATION",
 208         'Pi' => "UNICODE_INITIAL_PUNCTUATION",
 209         'Pf' => "UNICODE_FINAL_PUNCTUATION",
 210         'Po' => "UNICODE_OTHER_PUNCTUATION",
 211         'Sm' => "UNICODE_MATH_SYMBOL",
 212         'Sc' => "UNICODE_CURRENCY_SYMBOL",
 213         'Sk' => "UNICODE_MODIFIER_SYMBOL",
 214         'So' => "UNICODE_OTHER_SYMBOL"
 215     );
 216
 217     # these shouldn't be -1
 218     my ($codepoint, $last_codepoint, $start_codepoint) = (-999, -999, -999);
 219
 220     my ($category, $last_category) = ("FAKE1", "FAKE2");
 221     my ($started_range, $finished_range) = (undef, undef);
 222
 223     print $out "/* UnicodeCategories.h */\n";
 224     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
 225     print $out "/* Generated by $0 */\n";
 226     print $out "/* Generated from UCD version $v */\n\n";
 227
 228     print $out "#ifndef UNICODE_CATEGORIES_H\n";
 229     print $out "#define UNICODE_CATEGORIES_H\n\n";
 230     print $out "#include <QtCore/qglobal.h>\n\n";
 231     print $out "enum EUnicodeCategory\n";
 232     print $out "{\n";
 233     print $out "    UNICODE_UPPERCASE_LETTER,\n";
 234     print $out "    UNICODE_LOWERCASE_LETTER,\n";
 235     print $out "    UNICODE_TITLECASE_LETTER,\n";
 236     print $out "    UNICODE_NON_SPACING_MARK,\n";
 237     print $out "    UNICODE_COMBINING_MARK,\n";
 238     print $out "    UNICODE_ENCLOSING_MARK,\n";
 239     print $out "    UNICODE_DECIMAL_NUMBER,\n";
 240     print $out "    UNICODE_LETTER_NUMBER,\n";
 241     print $out "    UNICODE_OTHER_NUMBER,\n";
 242     print $out "    UNICODE_SPACE_SEPARATOR,\n";
 243     print $out "    UNICODE_LINE_SEPARATOR,\n";
 244     print $out "    UNICODE_PARAGRAPH_SEPARATOR,\n";
 245     print $out "    UNICODE_CONTROL,\n";
 246     print $out "    UNICODE_FORMAT,\n";
 247     print $out "    UNICODE_SURROGATE,\n";
 248     print $out "    UNICODE_PRIVATE_USE,\n";
 249     print $out "    UNICODE_UNASSIGNED,\n";
 250     print $out "    UNICODE_MODIFIER_LETTER,\n";
 251     print $out "    UNICODE_OTHER_LETTER,\n";
 252     print $out "    UNICODE_CONNECT_PUNCTUATION,\n";
 253     print $out "    UNICODE_DASH_PUNCTUATION,\n";
 254     print $out "    UNICODE_OPEN_PUNCTUATION,\n";
 255     print $out "    UNICODE_CLOSE_PUNCTUATION,\n";
 256     print $out "    UNICODE_INITIAL_PUNCTUATION,\n";
 257     print $out "    UNICODE_FINAL_PUNCTUATION,\n";
 258     print $out "    UNICODE_OTHER_PUNCTUATION,\n";
 259     print $out "    UNICODE_MATH_SYMBOL,\n";
 260     print $out "    UNICODE_CURRENCY_SYMBOL,\n";
 261     print $out "    UNICODE_MODIFIER_SYMBOL,\n";
 262     print $out "    UNICODE_OTHER_SYMBOL,\n";
 263     print $out "\n";
 264     print $out "    UNICODE_INVALID\n";
 265     print $out "};\n\n";
 266     print $out "struct TUnicodeCategory\n";
 267     print $out "{\n";
 268     print $out "    quint32 start;\n";
 269     print $out "    quint32 end;\n";
 270     print $out "    EUnicodeCategory category;\n";
 271     print $out "};\n\n";
 272     print $out "static const TUnicodeCategory constUnicodeCategoryList[] =\n";
 273     print $out "{\n";
 274
 275     while (my $line = <$unicodedata>)
 276     {
 277         $line =~ /^([0-9A-F]*);([^;]*);([^;]*);/ or die;
 278         my $codepoint = hex ($1);
 279         my $name = $2;
 280         my $category = $mappings{$3};
 281
 282         if ($finished_range
 283             or ($category ne $last_category)
 284             or (not $started_range and $codepoint != $last_codepoint + 1))
 285         {
 286             if ($last_codepoint >= 0) {
 287                 printf $out ("    { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
 288             }
 289
 290             $start_codepoint = $codepoint;
 291         }
 292
 293         if ($name =~ /^<.*First>$/) {
 294             $started_range = 1;
 295             $finished_range = undef;
 296         }
 297         elsif ($name =~ /^<.*Last>$/) {
 298             $started_range = undef;
 299             $finished_range = 1;
 300         }
 301         elsif ($finished_range) {
 302             $finished_range = undef;
 303         }
 304
 305         $last_codepoint = $codepoint;
 306         $last_category = $category;
 307     }
 308     printf $out ("    { 0x%4.4X, 0x%4.4X, \%s },\n", $start_codepoint, $last_codepoint, $last_category);
 309     printf $out "    { 0x0, 0x0, UNICODE_INVALID }\n";
 310     print $out "};\n\n";
 311
 312     print $out "#endif\n";
 313
 314     close ($out);
 315     print " done.\n";
 316 }
 317
 318 #------------------------#
 319
 320 # XXX should do kFrequency too
 321 sub process_unihan_zip ($)
 322 {
 323     my ($unihan_zip) = @_;
 324
 325     open (my $unihan, "$UNZIP -c $unihan_zip |") or die;
 326     open (my $out, "> unicode-unihan.h") or die;
 327
 328     print "processing $unihan_zip";
 329
 330     print $out "/* unicode-unihan.h */\n";
 331     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
 332     print $out "/* Generated by $0 */\n";
 333     print $out "/* Generated from UCD version $v */\n\n";
 334
 335     print $out "#ifndef UNICODE_UNIHAN_H\n";
 336     print $out "#define UNICODE_UNIHAN_H\n\n";
 337
 338     print $out "#include <glib/gunicode.h>\n\n";
 339
 340     print $out "typedef struct _Unihan Unihan;\n\n";
 341
 342     print $out "static const struct _Unihan\n";
 343     print $out "{\n";
 344     print $out "  gunichar index;\n";
 345     print $out "  gint32 kDefinition;\n";
 346     print $out "  gint32 kCantonese;\n";
 347     print $out "  gint32 kMandarin;\n";
 348     print $out "  gint32 kTang;\n";
 349     print $out "  gint32 kKorean;\n";
 350     print $out "  gint32 kJapaneseKun;\n";
 351     print $out "  gint32 kJapaneseOn;\n";
 352     print $out "} \n";
 353     print $out "unihan[] =\n";
 354     print $out "{\n";
 355
 356     my @strings;
 357     my $offset = 0;
 358
 359     my $wc = 0;
 360     my ($kDefinition, $kCantonese, $kMandarin, $kTang, $kKorean, $kJapaneseKun, $kJapaneseOn);
 361
 362     my $i = 0;
 363     while (my $line = <$unihan>)
 364     {
 365         chomp $line;
 366         $line =~ /^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$/ or next;
 367
 368         my $new_wc = hex ($1);
 369         my $field = $2;
 370
 371         my $value = $3;
 372         $value =~ s/\\/\\\\/g;
 373         $value =~ s/\"/\\"/g;
 374
 375         if ($new_wc != $wc)
 376         {
 377             if (defined $kDefinition or defined $kCantonese or defined $kMandarin
 378                 or defined $kTang or defined $kKorean or defined $kJapaneseKun
 379                 or defined $kJapaneseOn)
 380             {
 381                 printf $out ("  { 0x%04X, \%d, \%d, \%d, \%d, \%d, \%d, \%d },\n",
 382                              $wc,
 383                              (defined($kDefinition) ? $kDefinition : -1),
 384                              (defined($kCantonese) ? $kCantonese: -1),
 385                              (defined($kMandarin) ? $kMandarin : -1),
 386                              (defined($kTang) ? $kTang : -1),
 387                              (defined($kKorean) ? $kKorean : -1),
 388                              (defined($kJapaneseKun) ? $kJapaneseKun : -1),
 389                              (defined($kJapaneseOn) ? $kJapaneseOn : -1));
 390             }
 391
 392             $wc = $new_wc;
 393
 394             undef $kDefinition;
 395             undef $kCantonese;
 396             undef $kMandarin;
 397             undef $kTang;
 398             undef $kKorean;
 399             undef $kJapaneseKun;
 400             undef $kJapaneseOn;
 401         }
 402
 403         for my $f qw(kDefinition kCantonese kMandarin
 404                      kTang kKorean kJapaneseKun kJapaneseOn) {
 405
 406             if ($field eq $f) {
 407                 push @strings, $value;
 408                 my $last_offset = $offset;
 409                 $offset += length($value) + 1;
 410                 $value = $last_offset;
 411                 last;
 412             }
 413         }
 414
 415         if ($field eq "kDefinition") {
 416             $kDefinition = $value;
 417         }
 418         elsif ($field eq "kCantonese") {
 419             $kCantonese = $value;
 420         }
 421         elsif ($field eq "kMandarin") {
 422             $kMandarin = $value;
 423         }
 424         elsif ($field eq "kTang") {
 425             $kTang = $value;
 426         }
 427         elsif ($field eq "kKorean") {
 428             $kKorean = $value;
 429         }
 430         elsif ($field eq "kJapaneseKun") {
 431             $kJapaneseKun = $value;
 432         }
 433         elsif ($field eq "kJapaneseOn") {
 434             $kJapaneseOn = $value;
 435         }
 436
 437         if ($i++ % 32768 == 0) {
 438             print ".";
 439         }
 440     }
 441
 442     print $out "};\n\n";
 443
 444     print $out "static const char unihan_strings[] = \\\n";
 445
 446     for my $s (@strings) {
 447         print $out "  \"$s\\0\"\n";
 448     }
 449     print $out ";\n\n";
 450
 451     print $out "static const Unihan *_get_unihan (gunichar uc)\n;";
 452
 453     for my $name qw(kDefinition kCantonese kMandarin
 454                     kTang kKorean kJapaneseKun kJapaneseOn) {
 455     print $out <<EOT;
 456
 457 static inline const char * unihan_get_$name (const Unihan *uh)
 458 {
 459     gint32 offset = uh->$name;
 460     if (offset == -1)
 461       return NULL;
 462     return unihan_strings + offset;
 463 }
 464
 465 G_CONST_RETURN gchar *
 466 gucharmap_get_unicode_$name (gunichar uc)
 467 {
 468   const Unihan *uh = _get_unihan (uc);
 469   if (uh == NULL)
 470     return NULL;
 471   else
 472     return unihan_get_$name (uh);
 473 }
 474
 475 EOT
 476     }
 477
 478     print $out "#endif  /* #ifndef UNICODE_UNIHAN_H */\n";
 479
 480     close ($unihan);
 481     close ($out);
 482
 483     print " done.\n";
 484 }
 485
 486 #------------------------#
 487
 488 # $nameslist_hash =
 489 # {
 490 #     0x0027 => { '=' => {
 491 #                          'index'  => 30,
 492 #                          'values' => [ 'APOSTROPHE-QUOTE', 'APL quote' ]
 493 #                        }
 494 #                 '*' => {
 495 #                          'index'  => 50,
 496 #                          'values' => [ 'neutral (vertical) glyph with mixed usage',
 497 #                                        '2019 is preferred for apostrophe',
 498 #                                        'preferred characters in English for paired quotation marks are 2018 & 2019'
 499 #                                      ]
 500 #                         }
 501 #                  # etc
 502 #                }
 503 #     # etc
 504 # };
 505 #
 506 sub process_nameslist_txt ($)
 507 {
 508     my ($nameslist_txt) = @_;
 509
 510     open (my $nameslist, "$ICONV -f 'ISO8859-1' -t 'UTF-8' $nameslist_txt |") or die;
 511
 512     print "processing $nameslist_txt...";
 513
 514     my ($equal_i, $ex_i, $star_i, $pound_i, $colon_i) = (0, 0, 0, 0, 0);
 515     my $wc = 0;
 516
 517     my $nameslist_hash;
 518
 519     while (my $line = <$nameslist>)
 520     {
 521         chomp ($line);
 522
 523         if ($line =~ /^@/)
 524         {
 525             next;
 526         }
 527         elsif ($line =~ /^([0-9A-F]+)/)
 528         {
 529             $wc = hex ($1);
 530         }
 531         elsif ($line =~ /^\s+=\s+(.+)$/)
 532         {
 533             my $value = $1;
 534             $value =~ s/\\/\\\\/g;
 535             $value =~ s/\"/\\"/g;
 536
 537             if (not defined $nameslist_hash->{$wc}->{'='}->{'index'}) {
 538                 $nameslist_hash->{$wc}->{'='}->{'index'} = $equal_i;
 539             }
 540             push (@{$nameslist_hash->{$wc}->{'='}->{'values'}}, $value);
 541
 542             $equal_i++;
 543         }
 544         elsif ($line =~ /^\s+\*\s+(.+)$/)
 545         {
 546             my $value = $1;
 547             $value =~ s/\\/\\\\/g;
 548             $value =~ s/\"/\\"/g;
 549
 550             if (not defined $nameslist_hash->{$wc}->{'*'}->{'index'}) {
 551                 $nameslist_hash->{$wc}->{'*'}->{'index'} = $star_i;
 552             }
 553             push (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}, $value);
 554
 555             $star_i++;
 556         }
 557         elsif ($line =~ /^\s+#\s+(.+)$/)
 558         {
 559             my $value = $1;
 560             $value =~ s/\\/\\\\/g;
 561             $value =~ s/\"/\\"/g;
 562
 563             if (not defined $nameslist_hash->{$wc}->{'#'}->{'index'}) {
 564                 $nameslist_hash->{$wc}->{'#'}->{'index'} = $pound_i;
 565             }
 566             push (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}, $value);
 567
 568             $pound_i++;
 569         }
 570         elsif ($line =~ /^\s+:\s+(.+)$/)
 571         {
 572             my $value = $1;
 573             $value =~ s/\\/\\\\/g;
 574             $value =~ s/\"/\\"/g;
 575
 576             if (not defined $nameslist_hash->{$wc}->{':'}->{'index'}) {
 577                 $nameslist_hash->{$wc}->{':'}->{'index'} = $colon_i;
 578             }
 579             push (@{$nameslist_hash->{$wc}->{':'}->{'values'}}, $value);
 580
 581             $colon_i++;
 582         }
 583         elsif ($line =~ /^\s+x\s+.*([0-9A-F]{4,6})\)$/)  # this one is different
 584         {
 585             my $value = hex ($1);
 586
 587             if (not defined $nameslist_hash->{$wc}->{'x'}->{'index'}) {
 588                 $nameslist_hash->{$wc}->{'x'}->{'index'} = $ex_i;
 589             }
 590             push (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}, $value);
 591
 592             $ex_i++;
 593         }
 594     }
 595
 596     close ($nameslist);
 597
 598     open (my $out, "> unicode-nameslist.h") or die;
 599
 600     print $out "/* unicode-nameslist.h */\n";
 601     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
 602     print $out "/* Generated by $0 */\n";
 603     print $out "/* Generated from UCD version $v */\n\n";
 604
 605     print $out "#ifndef UNICODE_NAMESLIST_H\n";
 606     print $out "#define UNICODE_NAMESLIST_H\n\n";
 607
 608     print $out "#include <glib/gunicode.h>\n\n";
 609
 610     print $out "typedef struct _UnicharString UnicharString;\n";
 611     print $out "typedef struct _UnicharUnichar UnicharUnichar;\n";
 612     print $out "typedef struct _NamesList NamesList;\n\n";
 613
 614     print $out "struct _UnicharString\n";
 615     print $out "{\n";
 616     print $out "  gunichar index;\n";
 617     print $out "  const gchar *value;\n";
 618     print $out "}; \n\n";
 619
 620     print $out "struct _UnicharUnichar\n";
 621     print $out "{\n";
 622     print $out "  gunichar index;\n";
 623     print $out "  gunichar value;\n";
 624     print $out "}; \n\n";
 625
 626     print $out "struct _NamesList\n";
 627     print $out "{\n";
 628     print $out "  gunichar index;\n";
 629     print $out "  gint equals_index;  /* -1 means */\n";
 630     print $out "  gint stars_index;   /* this character */\n";
 631     print $out "  gint exes_index;    /* doesn't */\n";
 632     print $out "  gint pounds_index;  /* have any */\n";
 633     print $out "  gint colons_index;\n";
 634     print $out "};\n\n";
 635
 636     print $out "static const UnicharString names_list_equals[] = \n";
 637     print $out "{\n";
 638     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
 639     {
 640         next if not exists $nameslist_hash->{$wc}->{'='};
 641         for my $value (@{$nameslist_hash->{$wc}->{'='}->{'values'}}) {
 642             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
 643         }
 644     }
 645     print $out "  { (gunichar)(-1), 0 }\n";
 646     print $out "};\n\n";
 647
 648     print $out "static const UnicharString names_list_stars[] = \n";
 649     print $out "{\n";
 650     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
 651     {
 652         next if not exists $nameslist_hash->{$wc}->{'*'};
 653         for my $value (@{$nameslist_hash->{$wc}->{'*'}->{'values'}}) {
 654             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
 655         }
 656     }
 657     print $out "  { (gunichar)(-1), 0 }\n";
 658     print $out "};\n\n";
 659
 660     print $out "static const UnicharString names_list_pounds[] = \n";
 661     print $out "{\n";
 662     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
 663     {
 664         next if not exists $nameslist_hash->{$wc}->{'#'};
 665         for my $value (@{$nameslist_hash->{$wc}->{'#'}->{'values'}}) {
 666             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
 667         }
 668     }
 669     print $out "  { (gunichar)(-1), 0 }\n";
 670     print $out "};\n\n";
 671
 672     print $out "static const UnicharUnichar names_list_exes[] = \n";
 673     print $out "{\n";
 674     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
 675     {
 676         next if not exists $nameslist_hash->{$wc}->{'x'};
 677         for my $value (@{$nameslist_hash->{$wc}->{'x'}->{'values'}}) {
 678             printf $out (qq/  { 0x%04X, 0x%04X },\n/, $wc, $value);
 679         }
 680     }
 681     print $out "  { (gunichar)(-1), 0 }\n";
 682     print $out "};\n\n";
 683
 684     print $out "static const UnicharString names_list_colons[] = \n";
 685     print $out "{\n";
 686     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
 687     {
 688         next if not exists $nameslist_hash->{$wc}->{':'};
 689         for my $value (@{$nameslist_hash->{$wc}->{':'}->{'values'}}) {
 690             printf $out (qq/  { 0x%04X, "\%s" },\n/, $wc, $value);
 691         }
 692     }
 693     print $out "  { (gunichar)(-1), 0 }\n";
 694     print $out "};\n\n";
 695
 696     print $out "static const NamesList names_list[] =\n";
 697     print $out "{\n";
 698     for $wc (sort {$a <=> $b} keys %{$nameslist_hash})
 699     {
 700         my $eq    = exists $nameslist_hash->{$wc}->{'='}->{'index'} ? $nameslist_hash->{$wc}->{'='}->{'index'} : -1;
 701         my $star  = exists $nameslist_hash->{$wc}->{'*'}->{'index'} ? $nameslist_hash->{$wc}->{'*'}->{'index'} : -1;
 702         my $ex    = exists $nameslist_hash->{$wc}->{'x'}->{'index'} ? $nameslist_hash->{$wc}->{'x'}->{'index'} : -1;
 703         my $pound = exists $nameslist_hash->{$wc}->{'#'}->{'index'} ? $nameslist_hash->{$wc}->{'#'}->{'index'} : -1;
 704         my $colon = exists $nameslist_hash->{$wc}->{':'}->{'index'} ? $nameslist_hash->{$wc}->{':'}->{'index'} : -1;
 705
 706         printf $out ("  { 0x%04X, \%d, \%d, \%d, \%d, \%d },\n", $wc, $eq, $star, $ex, $pound, $colon);
 707     }
 708     print $out "};\n\n";
 709
 710     print $out "#endif  /* #ifndef UNICODE_NAMESLIST_H */\n";
 711
 712     close ($out);
 713
 714     print " done.\n";
 715 }
 716
 717 #------------------------#
 718
 719 sub process_blocks_txt ($)
 720 {
 721     my ($blocks_txt) = @_;
 722
 723     open (my $blocks, $blocks_txt) or die;
 724     open (my $out, "> UnicodeBlocks.h") or die;
 725
 726     print "processing $blocks_txt...";
 727
 728     print $out "/* UnicodeBlocks.h */\n";
 729     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
 730     print $out "/* Generated by $0 */\n";
 731     print $out "/* Generated from UCD version $v */\n\n";
 732
 733     print $out "#ifndef __UNICODE_BLOCKS_H__\n";
 734     print $out "#define __UNICODE_BLOCKS_H__\n\n";
 735
 736     print $out "#include <QtCore/qglobal.h>\n";
 737     print $out "#include <klocalizedstring.h>\n\n";
 738
 739     print $out "struct TUnicodeBlock\n";
 740     print $out "{\n";
 741     print $out "    quint32    start,\n";
 742     print $out "               end;\n";
 743     print $out "    const char *blockName;\n";
 744     print $out "};\n\n";
 745     print $out "static const struct TUnicodeBlock constUnicodeBlocks[] =\n";
 746     print $out "{\n";
 747     while (my $line = <$blocks>)
 748     {
 749         $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$/ or next;
 750         print $out qq/    { 0x$1, 0x$2, I18N_NOOP("$3") },\n/;
 751     }
 752     print $out "    { 0x0, 0x0, NULL }\n";
 753     print $out "};\n\n";
 754
 755     print $out "#endif\n\n";
 756
 757     close ($blocks);
 758     close ($out);
 759
 760     print " done.\n";
 761 }
 762
 763 #------------------------#
 764
 765 sub process_scripts_txt ($)
 766 {
 767     my ($scripts_txt) = @_;
 768
 769     my %script_hash;
 770     my %scripts;
 771
 772     open (my $scripts, $scripts_txt) or die;
 773     open (my $out, "> UnicodeScripts.h") or die;
 774
 775     print "processing $scripts_txt...";
 776
 777     while (my $line = <$scripts>)
 778     {
 779         my ($start, $end, $raw_script);
 780
 781         if ($line =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\S+)/)
 782         {
 783             $start = hex ($1);
 784             $end = hex ($2);
 785             $raw_script = $3;
 786         }
 787         elsif ($line =~ /^([0-9A-F]+)\s+;\s+(\S+)/)
 788         {
 789             $start = hex ($1);
 790             $end = $start;
 791             $raw_script = $2;
 792         }
 793         else
 794         {
 795             next;
 796         }
 797
 798         my $script = $raw_script;
 799         $script =~ tr/_/ /;
 800         $script =~ s/(\w+)/\u\L$1/g;
 801
 802         $script_hash{$start} = { 'end' => $end, 'script' => $script };
 803         $scripts{$script} = 1;
 804     }
 805
 806     close ($scripts);
 807
 808     # Adds Common to make sure works with UCD <= 4.0.0
 809     $scripts{"Common"} = 1;
 810
 811     print $out "/* UnicodeScripts.h */\n";
 812     print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
 813     print $out "/* Generated by $0 */\n";
 814     print $out "/* Generated from UCD version $v */\n\n";
 815
 816     print $out "#ifndef __UNICODE_SCRIPTS_H__\n";
 817     print $out "#define __UNICODE_SCRIPTS_H__\n\n";
 818
 819     print $out "#include <QtCore/qglobal.h>\n";
 820     print $out "#include <klocalizedstring.h>\n\n";
 821
 822     print $out "static const char *constUnicodeScriptList[] =\n";
 823     print $out "{\n";
 824     my $i = 0;
 825     for my $script (sort keys %scripts)
 826     {
 827         $scripts{$script} = $i;
 828         print $out qq/    I18N_NOOP("$script"),\n/;
 829         $i++;
 830     }
 831     print $out "    NULL\n";
 832     print $out "};\n\n";
 833
 834     print $out "struct TUnicodeScript\n";
 835     print $out "{\n";
 836     print $out "    quint32 start,\n";
 837     print $out "            end;\n";
 838     print $out "    int     scriptIndex;   /* index into constUnicodeScriptList */\n";
 839     print $out "};\n\n";
 840     print $out "static const TUnicodeScript constUnicodeScripts[] =\n";
 841     print $out "{\n";
 842     for my $start (sort { $a <=> $b } keys %script_hash)
 843     {
 844         printf $out (qq/    { 0x%04X, 0x%04X, \%2d },\n/,
 845                      $start, $script_hash{$start}->{'end'}, $scripts{$script_hash{$start}->{'script'}});
 846     }
 847     printf $out "    { 0x0, 0x0, -1 }\n";
 848     print $out "};\n\n";
 849
 850     print $out "#endif\n\n";
 851
 852     close ($out);
 853     print " done.\n";
 854 }