newlib/libc/string/uniset

   1 #!/usr/bin/perl
   2 # Uniset -- Unicode subset manager -- Markus Kuhn
   3 # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
   4
   5 require 5.014;
   6 use open ':utf8';
   7 use FindBin qw($RealBin);  # to find directory where this file is located
   8
   9 binmode(STDOUT, ":utf8");
  10 binmode(STDIN, ":utf8");
  11
  12 my (%name, %invname, %category, %comment);
  13
  14 print <<End if $#ARGV < 0;
  15 Uniset -- Unicode subset manager -- Markus Kuhn
  16
  17 Uniset merges and subtracts Unicode subsets. It can output and
  18 analyse the resulting character set in various formats.
  19
  20 Uniset understand the following command-line arguments:
  21
  22 Commands to define a set of characters:
  23
  24   + filename   add the character set described in the file to the set
  25   - filename   remove the character set described in the file from the set
  26   +: filename  add the characters in the UTF-8 file to the set
  27   -: filename  remove the characters in the UTF-8 file from the set
  28   +xxxx..yyyy  add the range to the set (xxxx and yyyy are hex numbers)
  29   -xxxx..yyyy  remove the range from the set (xxxx and yyyy are hex numbers)
  30   +cat=Xx      add all Unicode characters with category code Xx
  31   -cat=Xx      remove all Unicode characters with category code Xx
  32   -cat!=Xx     remove all Unicode characters without category code Xx
  33   clean        remove any elements that do not appear in the Unicode database
  34   unknown      remove any elements that do appear in the Unicode database
  35
  36 Command to output descriptions of the constructed set of characters:
  37
  38   table        write a full table with one line per character
  39   compact      output the set in compact MES format
  40   c            output the set as C interval array
  41   nr           output the number of characters
  42   sources      output a table that shows the number of characters contributed
  43                by the various combinations of input sets added with +.
  44   utf8-list    output a list of all characters encoded in UTF-8
  45
  46 Commands to tailor the following output commands:
  47
  48   html         write HTML tables instead of plain text
  49   ucs          add the unicode character itself to the table (UTF-8 in
  50                plain table, numeric character reference in HTML)
  51
  52 Formats of character set input files read by the + and - command:
  53
  54 Empty lines, white space at the start and end of the line and any
  55 comment text following a \# are ignored. The following formats are
  56 recognized
  57
  58 xx yyyy             xx is the hex code in an 8-bit character set and yyyy
  59                     is the corresponding Unicode value. Both can optionally
  60                     be prefixed by 0x. This is the format used in the
  61                     files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
  62
  63 yyyy                yyyy (optionally prefixed with 0x) is a Unicode character
  64                     belonging to the specified subset.
  65
  66 yyyy-yyyy           a range of Unicode characters belonging to
  67 yyyy..yyyy          the specified subset.
  68
  69 xx yy yy yy-yy yy   xx denotes a row (high-byte) and the yy specify
  70                     corresponding low bytes or with a hyphen also ranges of
  71                     low bytes in the Unicode values that belong to this
  72                     subset. This is also the format that is generated by
  73                     the compact command.
  74 End
  75 exit 1 if $#ARGV < 0;
  76
  77
  78 # Subroutine to identify whether the ISO 10646/Unicode character code
  79 # ucs belongs into the East Asian Wide (W) or East Asian FullWidth
  80 # (F) category as defined in Unicode Technical Report #11.
  81
  82 sub iswide ($) {
  83     my $ucs = shift(@_);
  84
  85     return ($ucs >= 0x1100 &&
  86             ($ucs <= 0x115f ||                     # Hangul Jamo
  87              $ucs == 0x2329 || $ucs == 0x232a ||
  88              ($ucs >= 0x2e80 && $ucs <= 0xa4cf &&
  89               $ucs != 0x303f) ||                   # CJK .. Yi
  90              ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables
  91              ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs
  92              ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms
  93              ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms
  94              ($ucs >= 0xffe0 && $ucs <= 0xffe6) ||
  95              ($ucs >= 0x20000 && $ucs <= 0x2fffd) ||
  96              ($ucs >= 0x30000 && $ucs <= 0x3fffd)));
  97 }
  98
  99 # Return the Unicode name that belongs to a given character code
 100
 101 # Jamo short names, see Unicode 3.0, table 4-4, page 86
 102
 103 my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
 104              'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
 105 my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
 106              'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
 107              'EU', 'YI', 'I'); # 1161..1175
 108 my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
 109              'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
 110              'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
 111
 112 sub name {
 113     my $ucs = shift(@_);
 114
 115     # The intervals used here reflect Unicode Version 3.2
 116     if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
 117         ($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
 118         ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
 119         return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
 120     }
 121
 122     if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
 123         my $s = $ucs - 0xac00;
 124         my $l = 0x1100 + int($s / (21 * 28));
 125         my $v = 0x1161 + int(($s % (21 * 28)) / 28);
 126         my $t = 0x11a7 + $s % 28;
 127         return "HANGUL SYLLABLE " .
 128             ($lname[int($s / (21 * 28))] .
 129              $vname[int(($s % (21 * 28)) / 28)] .
 130              $tname[$s % 28 - 1]);
 131     }
 132
 133     return $name{$ucs};
 134 }
 135
 136 sub is_unicode {
 137     my $ucs = shift(@_);
 138
 139     # The intervals used here reflect Unicode Version 3.2
 140     if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
 141         ($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
 142         ($ucs >=  0xac00 && $ucs <=  0xd7a3) ||
 143         ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
 144         return 1;
 145     }
 146
 147     return exists $name{$ucs};
 148 }
 149
 150 my @search_path = ();
 151 if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") {
 152     push @search_path, "$1/share/uniset";
 153 } else {
 154     push @search_path, $RealBin;
 155 }
 156
 157 sub search_open {
 158     my ($mode, $fn) = @_;
 159     my $file;
 160     return $file if open($file, $mode, $fn);
 161     return undef if $fn =~ m|/|;
 162     for my $path (@search_path) {
 163         return $file if open($file, $mode, "$path/$fn");
 164     }
 165     return undef;
 166 }
 167
 168 my $html = 0;
 169 my $image = 0;
 170 my $adducs = 0;
 171 my $unicodedata = "UnicodeData.txt";
 172 my $blockdata = "Blocks.txt";
 173
 174 # read list of all Unicode names
 175 my $data = search_open('<', $unicodedata);
 176 unless ($data) {
 177     die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
 178          "Please make sure that you have downloaded the file\n" .
 179          "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
 180 }
 181 while (<$data>) {
 182     if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) {
 183         next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
 184         $ucs = hex($1);
 185         $name{$ucs} = $2;
 186         $invname{$2} = $ucs;
 187         $category{$ucs} = $3;
 188         $comment{$ucs} = $12;
 189     } else {
 190         die("Syntax error in line '$_' in file '$unicodedata'\n");
 191     }
 192 }
 193 close($data);
 194
 195 # read list of all Unicode blocks
 196 $data = search_open('<', $blockdata);
 197 unless ($data) {
 198     die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
 199          "Please make sure that you have downloaded the file\n" .
 200          "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
 201 }
 202 my $blocks = 0;
 203 my (@blockstart, @blockend, @blockname);
 204 while (<$data>) {
 205     if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) {
 206         $blockstart[$blocks] = hex($1);
 207         $blockend  [$blocks] = hex($2);
 208         $blockname [$blocks] = $3;
 209         $blocks++;
 210     } elsif (/^\s*\#/ || /^\s*$/) {
 211         # ignore comments and empty lines
 212     } else {
 213         die("Syntax error in line '$_' in file '$blockdata'\n");
 214     }
 215 }
 216 close($data);
 217 if ($blockend[$blocks-1] < 0x110000) {
 218     $blockstart[$blocks] = 0x110000;
 219     $blockend  [$blocks] = 0x7FFFFFFF;
 220     $blockname [$blocks] = "Beyond Plane 16";
 221     $blocks++;
 222 }
 223
 224 # process command line arguments
 225 while ($_ = shift(@ARGV)) {
 226     if (/^html$/) {
 227         $html = 1;
 228     } elsif (/^ucs$/) {
 229         $adducs = 1;
 230     } elsif (/^img$/) {
 231         $html = 1;
 232         $image = 1;
 233     } elsif (/^template$/) {
 234         $template = shift(@ARGV);
 235         open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n");
 236         while (<TEMPLATE>) {
 237             if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
 238                 open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
 239                 while (<INCLUDE>) {
 240                     print $_;
 241                 }
 242                 close(INCLUDE);
 243             } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
 244                 open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
 245                 while (<INCLUDE>) {
 246                     s/&/&amp;/g;
 247                     s/</&lt;/g;
 248                     print $_;
 249                 }
 250                 close(INCLUDE);
 251             } else {
 252                 print $_;
 253             }
 254         }
 255         close(TEMPLATE);
 256     } elsif (/^\+cat=(.+)$/) {
 257         # add characters with given category
 258         $cat = $1;
 259         for $i (keys(%category)) {
 260             $used{$i} = "[${cat}]" if $category{$i} eq $cat;
 261         }
 262     } elsif (/^\-cat=(.+)$/) {
 263         # remove characters with given category
 264         $cat = $1;
 265         for $i (keys(%category)) {
 266             delete $used{$i} if $category{$i} eq $cat;
 267         }
 268     } elsif (/^\-cat!=(.+)$/) {
 269         # remove characters without given category
 270         $cat = $1;
 271         for $i (keys(%category)) {
 272             delete $used{$i} unless $category{$i} eq $cat;
 273         }
 274     } elsif (/^([+-]):(.*)/) {
 275         $remove = $1 eq "-";
 276         $setfile = $2;
 277         $setfile = shift(@ARGV) if $setfile eq "";
 278         push(@SETS, $setfile);
 279         open(SET, $setfile) || die("Can't open set file '$setfile': $!\n");
 280         $setname = $setfile;
 281         while (<SET>) {
 282             while ($_) {
 283                 $i = ord($_);
 284                 $used{$i} .= "[${setname}]" unless $remove;
 285                 delete $used{$i} if $remove;
 286                 $_ = substr($_, 1);
 287             }
 288         }
 289         close SET;
 290     } elsif (/^([+-])(.*)/) {
 291         $remove = $1 eq "-";
 292         $setfile = $2;
 293         $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
 294         if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
 295             # handle intervall specification on command line
 296             $first = hex($1);
 297             $last = hex($3);
 298             for ($i = $first; $i <= $last; $i++) {
 299                 $used{$i} .= "[ARG]" unless $remove;
 300                 delete $used{$i} if $remove;
 301             }
 302             next;
 303         }
 304         $setfile = shift(@ARGV) if $setfile eq "";
 305         push(@SETS, $setfile);
 306         my $setf = search_open('<', $setfile);
 307         die("Can't open set file '$setfile': $!\n") unless $setf;
 308         $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
 309         $setname = $setfile;
 310         $setname =~ s/([^.\[\]]*)\..*/$1/;
 311         while (<$setf>) {
 312             if (/^<code_set_name>/) {
 313                 # handle ISO 15897 (POSIX registry) charset mapping format
 314                 undef $comment_char;
 315                 undef $escape_char;
 316                 while (<$setf>) {
 317                     if ($comment_char && /^$comment_char/) {
 318                         # remove comments
 319                         $_ = $`;
 320                     }
 321                     next if (/^\032?\s*$/);                                             # skip empty lines
 322                     if (/^<comment_char> (\S)$/) {
 323                         $comment_char = $1;
 324                     } elsif (/^<escape_char> (\S)$/) {
 325                         $escape_char = $1;
 326                     } elsif (/^(END )?CHARMAP$/) {
 327                         #ignore
 328                     } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
 329                         $used{hex($2)} .= "[${setname}{$1}]" unless $remove;
 330                         delete $used{hex($2)} if $remove;
 331                     } else {
 332                         die("Syntax error in line $. in file '$setfile':\n'$_'\n");
 333                     }
 334                 }
 335                 next;
 336             } elsif (/^STARTFONT /) {
 337                 # handle X11 BDF file
 338                 while (<$setf>) {
 339                     if (/^ENCODING\s+([0-9]+)/) {
 340                         $used{$1} .= "[${setname}]" unless $remove;
 341                         delete $used{$1} if $remove;
 342                     }
 343                 }
 344                 next;
 345             }
 346             tr/a-z/A-Z/;           # make input uppercase
 347             if ($cedf) {
 348                 if ($. > 4) {
 349                     if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
 350                         # handle Kosta Kosti's trans CEDF format
 351                         next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
 352                         $ucs = $invname{$2};
 353                         die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
 354                         $used{$ucs} .= "[${setname}{$1}]" unless $remove;
 355                         delete $used{$ucs} if $remove;
 356                     } else {
 357                         die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
 358                     }
 359                 }
 360                 next;
 361             }
 362             if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
 363                 # ignore ftp.unicode.org mapping file lines with #UNDEFINED
 364                 next;
 365             }
 366             s/^([^\#]*)\#.*$/$1/;  # remove comments
 367             next if (/^\032?\s*$/);     # skip empty lines
 368             if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
 369                 # handle entry from a ftp.unicode.org mapping file
 370                 $used{hex($4)} .= "[${setname}{$2}]" unless $remove;
 371                 delete $used{hex($4)} if $remove;
 372             } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
 373                 # handle interval specification
 374                 $first = hex($2);
 375                 $last = hex($5);
 376                 for ($i = $first; $i <= $last; $i++) {
 377                     $used{$i} .= "[${setname}]" unless $remove;
 378                     delete $used{$i} if $remove;
 379                 }
 380             } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
 381                 # handle lines from P10 MES draft
 382                 $row = $1;
 383                 $cols = $_;
 384                 $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
 385                 $cols =~ tr/,//d;
 386                 @cols = split(/\s+/, $cols);
 387                 for (@cols) {
 388                     if (/^(..)$/) {
 389                         $first = hex("$row$1");
 390                         $last  = $first;
 391                     } elsif (/^(..)-(..)$/) {
 392                         $first = hex("$row$1");
 393                         $last  = hex("$row$2");
 394                     } else {
 395                         die ("this should never happen '$_'");
 396                     }
 397                     for ($i = $first; $i <= $last; $i++) {
 398                         $used{$i} .= "[${setname}]" unless $remove;
 399                         delete $used{$i} if $remove;
 400                     }
 401                 }
 402             } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
 403                 # handle single character
 404                 $used{hex($2)} .= "[${setname}]" unless $remove;
 405                 delete $used{hex($2)} if $remove;
 406             } else {
 407                 die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
 408             }
 409         }
 410         close $setf;
 411     } elsif (/^loadimages$/ || /^loadbigimages$/) {
 412         if (/^loadimages$/) {
 413             $prefix = "Small.Glyphs";
 414         } else {
 415             $prefix = "Glyphs";
 416         }
 417         $total = 0;
 418         for $i (keys(%used)) {
 419             next if ($name{$i} eq "<control>");
 420             $total++;
 421         }
 422         $count = 0;
 423         $| = 1;
 424         for $i (sort({$a <=> $b} keys(%used))) {
 425             next if ($name{$i} eq "<control>");
 426             $count++;
 427             $j = sprintf("%04X", $i);
 428             $j =~ /(..)(..)/;
 429             $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
 430             print("\r$count/$total: $gif");
 431             system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
 432             select(undef, undef, undef, 0.2);
 433         }
 434         print("\n");
 435         exit 0;
 436     } elsif (/^giftable/) {
 437         # form a table of glyphs (requires pbmtools installed)
 438         $count = 0;
 439         for $i (keys(%used)) {
 440             $count++ unless $name{$i} eq "<control>";
 441         }
 442         $width = int(sqrt($count/sqrt(2)) + 0.5);
 443         $width = $1 if /^giftable([0-9]+)$/;
 444         system("rm -f tmp-*.pnm table.pnm~ table.pnm");
 445         $col = 0;
 446         $row = 0;
 447         for $i (sort({$a <=> $b} keys(%used))) {
 448             next if ($name{$i} eq "<control>");
 449             $j = sprintf("%04X", $i);
 450             $j =~ /(..)(..)/;
 451             $gif = "Small.Glyphs/$1/U$j.gif";
 452             $pnm = sprintf("tmp-%02x.pnm", $col);
 453             $fallback = "Small.Glyphs/FF/UFFFD.gif";
 454             system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
 455             if (++$col == $width) {
 456                 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
 457                 if ($row == 0) {
 458                     system("mv tmp-row.pnm table.pnm");
 459                 } else {
 460                     system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
 461                 }
 462                 $row++;
 463                 $col = 0;
 464                 system("rm -f tmp-*.pnm table.pnm~");
 465             }
 466         }
 467         if ($col > 0) {
 468             system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
 469             if ($row == 0) {
 470                 system("mv tmp-row.pnm table.pnm");
 471             } else {
 472                 system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
 473             }
 474         }
 475         system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
 476         system("rm -f tmp-*.pnm table.pnm~ table.pnm");
 477     } elsif (/^table$/) {
 478         # go through all used names to print full table
 479         print "<TABLE border=2>\n" if $html;
 480         for $i (sort({$a <=> $b} keys(%used))) {
 481             next if ($name{$i} eq "<control>");
 482             if ($html) {
 483                 $sources = $used{$i};
 484                 $sources =~ s/\]\[/, /g;
 485                 $sources =~ s/^\[//g;
 486                 $sources =~ s/\]$//g;
 487                 $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
 488                 $j = sprintf("%04X", $i);
 489                 $j =~ /(..)(..)/;
 490                 $gif = "Small.Glyphs/$1/U$j.gif";
 491                 print "<TR>";
 492                 print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
 493                 printf("<TD>&#%d;", $i) if $adducs;
 494                 print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
 495                 print " ($comment{$i})" if $comment{$i};
 496                 print "</SAMP><TD><SMALL>$sources</SMALL>\n";
 497             } else {
 498                 printf("%04X \# ", $i);
 499                 print pack("U", $i) . " " if $adducs;
 500                 print name($i) ."\n";
 501             }
 502         }
 503         print "</TABLE>\n" if $html;
 504     } elsif (/^imgblock$/) {
 505         $width = 16;
 506         $width = $1 if /giftable([0-9]+)/;
 507         $col = 0;
 508         $subline = "";
 509         print "\n<P><TABLE cellspacing=0 cellpadding=0>";
 510         for $i (sort({$a <=> $b} keys(%used))) {
 511             print "<TR>" if $col == 0;
 512             $j = sprintf("%04X", $i);
 513             $j =~ /(..)(..)/;
 514             $gif = "Small.Glyphs/$1/U$j.gif";
 515             $alt = name($i);
 516             print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
 517             $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
 518             if (++$col == $width) {
 519                 print "<TR align=center>$subline";
 520                 $col = 0;
 521                 $subline = "";
 522             }
 523         }
 524         print "<TR align=center>$subline" if ($col > 0);
 525         print "</TABLE>\n";
 526     } elsif (/^sources$/) {
 527         # count how many characters are attributed to the various source set combinations
 528         print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
 529         for $i (keys(%used)) {
 530             next if ($name{$i} eq "<control>");
 531             $sources = $used{$i};
 532             $sources =~ s/\]\[/, /g;
 533             $sources =~ s/^\[//g;
 534             $sources =~ s/\]$//g;
 535             $sources =~ s/\{(..)\}//g;
 536             $contribs{$sources} += 1;
 537         }
 538         for $j (keys(%contribs)) {
 539             print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
 540         }
 541         print "</TABLE>\n" if $html;
 542     } elsif (/^compact$/) {
 543         # print compact table in P10 MES format
 544         print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
 545         print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
 546         print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
 547         $current_row = '';
 548         $start_col = '';
 549         $last_col = '';
 550         for $i (sort({$a <=> $b} keys(%used))) {
 551             next if ($name{$i} eq "<control>");
 552             $row = sprintf("%02X", $i >> 8);
 553             $col = sprintf("%02X", $i & 0xff);
 554             if ($row ne $current_row) {
 555                 if (($last_col ne '') and ($last_col ne $start_col)) {
 556                     print "-$last_col";
 557                     print "</SAMP>" if $html;
 558                 }
 559                 print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
 560                 print "\n  $row\t" unless $html;
 561                 $len = 0;
 562                 $current_row = $row;
 563                 $start_col = '';
 564             }
 565             if ($start_col eq '') {
 566                 print "$col";
 567                 $len += 2;
 568                 $start_col = $col;
 569                 $last_col = $col;
 570             } elsif (hex($col) == hex($last_col) + 1) {
 571                 $last_col = $col;
 572             } else {
 573                 if ($last_col ne $start_col) {
 574                     print "-$last_col";
 575                     $len += 3;
 576                 }
 577                 if ($len > 60 && !$html) {
 578                     print "\n  $row\t";
 579                     $len = 0;
 580                 };
 581                 print " " if $len;
 582                 print "$col";
 583                 $len += 2 + !! $len;
 584                 $start_col = $col;
 585                 $last_col = $col;
 586             }
 587         }
 588         if (($last_col ne '') and ($last_col ne $start_col)) {
 589             print "-$last_col";
 590             print "</SAMP>" if $html;
 591         }
 592         print "\n" if ($current_row ne '');
 593         print "</TABLE>\n" if $html;
 594         print "\n";
 595     } elsif (/^c$/) {
 596         # print table as C interval array
 597         print "{";
 598         $last_i = '';
 599         $columns = 3;
 600         $col = $columns;
 601         for $i (sort({$a <=> $b} keys(%used))) {
 602             next if ($name{$i} eq "<control>");
 603             if ($last_i eq '') {
 604                 if (++$col > $columns) { $col = 1; print "\n "; }
 605                 printf(" { 0x%04X, ", $i);
 606                 $last_i = $i;
 607             } elsif ($i == $last_i + 1) {
 608                 $last_i = $i;
 609             } else {
 610                 printf("0x%04X },", $last_i);
 611                 if (++$col > $columns) { $col = 1; print "\n "; }
 612                 printf(" { 0x%04X, ", $i);
 613                 $last_i = $i;
 614             }
 615         }
 616         if ($last_i ne '') {
 617             printf("0x%04X }", $last_i);
 618         }
 619         print "\n};\n";
 620     } elsif (/^utf8-list$/) {
 621         $col = 0;
 622         $block = 0;
 623         $last = -1;
 624         for $i (sort({$a <=> $b} keys(%used))) {
 625             next if ($name{$i} eq "<control>");
 626             while ($blockend[$block] < $i && $block < $blocks - 1) {
 627                 $block++;
 628             }
 629             if ($last <= $blockend[$block-1] &&
 630                 $i < $blockstart[$block]) {
 631                 print "\n" if ($col);
 632                 printf "\nFree block (U+%04X-U+%04X):\n\n",
 633                     $blockend[$block-1] + 1, $blockstart[$block] - 1;
 634                 $col = 0;
 635             }
 636             if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
 637                 print "\n" if ($col);
 638                 printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
 639                 $blockstart[$block], $blockend[$block];
 640                 $col = 0;
 641             }
 642             if ($category{$i} eq 'Mn') {
 643                 # prefix non-spacing character with U+25CC DOTTED CIRCLE
 644                 print "\x{25CC}";
 645             } elsif ($category{$i} eq 'Me') {
 646                 # prefix enclosing non-spacing character with space
 647                 print " ";
 648             }
 649             print pack("U", $i);
 650             $col += 1 + iswide($i);
 651             if ($col >= 64) {
 652                 print "\n";
 653                 $col = 0;
 654             }
 655             $last = $i;
 656         }
 657         print "\n" if ($col);
 658     } elsif (/^collections$/) {
 659         $block = 0;
 660         $last = -1;
 661         for $i (sort({$a <=> $b} keys(%used))) {
 662             next if ($name{$i} eq "<control>");
 663             while ($blockend[$block] < $i && $block < $blocks - 1) {
 664                 $block++;
 665             }
 666             if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
 667                 print $blockname[$block],
 668                   " " x (40 - length($blockname[$block]));
 669                 printf "%04X-%04X\n",
 670                   $blockstart[$block], $blockend[$block];
 671             }
 672             $last = $i;
 673         }
 674     } elsif (/^nr$/) {
 675         print "<P>" if $html;
 676         print "# " unless $html;
 677         print "Number of characters in above table: ";
 678         $count = 0;
 679         for $i (keys(%used)) {
 680             $count++ unless $name{$i} eq "<control>";
 681         }
 682         print $count;
 683         print "\n";
 684     } elsif (/^clean$/) {
 685         # remove characters from set that are not in $unicodedata
 686         for $i (keys(%used)) {
 687             delete $used{$i} unless is_unicode($i);
 688         }
 689     } elsif (/^unknown$/) {
 690         # remove characters from set that are in $unicodedata
 691         for $i (keys(%used)) {
 692             delete $used{$i} if is_unicode($i);
 693         }
 694     } else {
 695         die("Unknown command line command '$_'\n");
 696     };
 697 }