2 # Uniset -- Unicode subset manager -- Markus Kuhn
3 # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
7 use FindBin
qw($RealBin); # to find directory where this file is located
9 binmode(STDOUT, ":utf8");
10 binmode(STDIN, ":utf8");
12 my (%name, %invname, %category, %comment);
14 print <<End if $#ARGV < 0;
15 Uniset -- Unicode subset manager -- Markus Kuhn
17 Uniset merges and subtracts Unicode subsets. It can output and
18 analyse the resulting character set in various formats.
20 Uniset understand the following command-line arguments:
22 Commands to define a set of characters:
24 + filename add the character set described in the file to the set
25 - filename remove the character set described in the file from the set
26 +: filename add the characters in the UTF-8 file to the set
27 -: filename remove the characters in the UTF-8 file from the set
28 +xxxx..yyyy add the range to the set (xxxx and yyyy are hex numbers)
29 -xxxx..yyyy remove the range from the set (xxxx and yyyy are hex numbers)
30 +cat=Xx add all Unicode characters with category code Xx
31 -cat=Xx remove all Unicode characters with category code Xx
32 -cat!=Xx remove all Unicode characters without category code Xx
33 clean remove any elements that do not appear in the Unicode database
34 unknown remove any elements that do appear in the Unicode database
36 Command to output descriptions of the constructed set of characters:
38 table write a full table with one line per character
39 compact output the set in compact MES format
40 c output the set as C interval array
41 nr output the number of characters
42 sources output a table that shows the number of characters contributed
43 by the various combinations of input sets added with +.
44 utf8-list output a list of all characters encoded in UTF-8
46 Commands to tailor the following output commands:
48 html write HTML tables instead of plain text
49 ucs add the unicode character itself to the table (UTF-8 in
50 plain table, numeric character reference in HTML)
52 Formats of character set input files read by the + and - command:
54 Empty lines, white space at the start and end of the line and any
55 comment text following a \# are ignored. The following formats are
58 xx yyyy xx is the hex code in an 8-bit character set and yyyy
59 is the corresponding Unicode value. Both can optionally
60 be prefixed by 0x. This is the format used in the
61 files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
63 yyyy yyyy (optionally prefixed with 0x) is a Unicode character
64 belonging to the specified subset.
66 yyyy-yyyy a range of Unicode characters belonging to
67 yyyy..yyyy the specified subset.
69 xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify
70 corresponding low bytes or with a hyphen also ranges of
71 low bytes in the Unicode values that belong to this
72 subset. This is also the format that is generated by
78 # Subroutine to identify whether the ISO 10646/Unicode character code
79 # ucs belongs into the East Asian Wide (W) or East Asian FullWidth
80 # (F) category as defined in Unicode Technical Report #11.
85 return ($ucs >= 0x1100 &&
86 ($ucs <= 0x115f || # Hangul Jamo
87 $ucs == 0x2329 || $ucs == 0x232a ||
88 ($ucs >= 0x2e80 && $ucs <= 0xa4cf &&
89 $ucs != 0x303f) || # CJK .. Yi
90 ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables
91 ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs
92 ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms
93 ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms
94 ($ucs >= 0xffe0 && $ucs <= 0xffe6) ||
95 ($ucs >= 0x20000 && $ucs <= 0x2fffd) ||
96 ($ucs >= 0x30000 && $ucs <= 0x3fffd)));
99 # Return the Unicode name that belongs to a given character code
101 # Jamo short names, see Unicode 3.0, table 4-4, page 86
103 my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
104 'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
105 my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
106 'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
107 'EU', 'YI', 'I'); # 1161..1175
108 my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
109 'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
110 'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
115 # The intervals used here reflect Unicode Version 3.2
116 if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
117 ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
118 ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
119 return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
122 if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
123 my $s = $ucs - 0xac00;
124 my $l = 0x1100 + int($s / (21 * 28));
125 my $v = 0x1161 + int(($s % (21 * 28)) / 28);
126 my $t = 0x11a7 + $s % 28;
127 return "HANGUL SYLLABLE " .
128 ($lname[int($s / (21 * 28))] .
129 $vname[int(($s % (21 * 28)) / 28)] .
130 $tname[$s % 28 - 1]);
139 # The intervals used here reflect Unicode Version 3.2
140 if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
141 ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
142 ($ucs >= 0xac00 && $ucs <= 0xd7a3) ||
143 ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
147 return exists $name{$ucs};
150 my @search_path = ();
151 if ($RealBin =~ m
|^(.*)/bin\z| && -d "$1/share
/uniset
") {
152 push @search_path, "$1/share/uniset
";
154 push @search_path, $RealBin;
158 my ($mode, $fn) = @_;
160 return $file if open($file, $mode, $fn);
161 return undef if $fn =~ m|/|;
162 for my $path (@search_path) {
163 return $file if open($file, $mode, "$path/$fn");
171 my $unicodedata = "UnicodeData
.txt
";
172 my $blockdata = "Blocks
.txt
";
174 # read list of all Unicode names
175 my $data = search_open('<', $unicodedata);
177 die ("Can
't open Unicode database '$unicodedata':\n$!\n\n" .
178 "Please make sure that you have downloaded the file\n" .
179 "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
182 if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) {
183 next if $2 ne '<control
>' && substr($2, 0, 1) eq '<';
187 $category{$ucs} = $3;
188 $comment{$ucs} = $12;
190 die("Syntax error in line '$_' in file '$unicodedata'\n");
195 # read list of all Unicode blocks
196 $data = search_open('<', $blockdata);
198 die ("Can't
open Unicode blockname list
'$blockdata':\n$!\n\n" .
199 "Please make sure that you have downloaded the file
\n" .
200 "http
://www
.unicode
.org
/Public/UNIDATA
/Blocks
.txt
\n");
203 my (@blockstart, @blockend, @blockname);
205 if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) {
206 $blockstart[$blocks] = hex($1);
207 $blockend [$blocks] = hex($2);
208 $blockname [$blocks] = $3;
210 } elsif (/^\s*\#/ || /^\s*$/) {
211 # ignore comments and empty lines
213 die("Syntax error
in line
'$_' in file
'$blockdata'\n");
217 if ($blockend[$blocks-1] < 0x110000) {
218 $blockstart[$blocks] = 0x110000;
219 $blockend [$blocks] = 0x7FFFFFFF;
220 $blockname [$blocks] = "Beyond Plane
16";
224 # process command line arguments
225 while ($_ = shift(@ARGV)) {
233 } elsif (/^template$/) {
234 $template = shift(@ARGV);
235 open(TEMPLATE, $template) || die("Can
't open template file '$template': $!\n");
237 if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
238 open(INCLUDE, $1) || die("Can't
open template include file
'$1': $!\n");
243 } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
244 open(INCLUDE, $1) || die("Can
't open template include file '$1': $!\n");
256 } elsif (/^\+cat=(.+)$/) {
257 # add characters with given category
259 for $i (keys(%category)) {
260 $used{$i} = "[${cat}]" if $category{$i} eq $cat;
262 } elsif (/^\-cat=(.+)$/) {
263 # remove characters with given category
265 for $i (keys(%category)) {
266 delete $used{$i} if $category{$i} eq $cat;
268 } elsif (/^\-cat!=(.+)$/) {
269 # remove characters without given category
271 for $i (keys(%category)) {
272 delete $used{$i} unless $category{$i} eq $cat;
274 } elsif (/^([+-]):(.*)/) {
277 $setfile = shift(@ARGV) if $setfile eq "";
278 push(@SETS, $setfile);
279 open(SET, $setfile) || die("Can't
open set file
'$setfile': $!\n");
284 $used{$i} .= "[${setname
}]" unless $remove;
285 delete $used{$i} if $remove;
290 } elsif (/^([+-])(.*)/) {
293 $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
294 if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
295 # handle intervall specification on command line
298 for ($i = $first; $i <= $last; $i++) {
299 $used{$i} .= "[ARG
]" unless $remove;
300 delete $used{$i} if $remove;
304 $setfile = shift(@ARGV) if $setfile eq "";
305 push(@SETS, $setfile);
306 my $setf = search_open('<', $setfile);
307 die("Can
't open set file '$setfile': $!\n") unless $setf;
308 $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
310 $setname =~ s/([^.\[\]]*)\..*/$1/;
312 if (/^<code_set_name>/) {
313 # handle ISO 15897 (POSIX registry) charset mapping format
317 if ($comment_char && /^$comment_char/) {
321 next if (/^\032?\s*$/); # skip empty lines
322 if (/^<comment_char> (\S)$/) {
324 } elsif (/^<escape_char> (\S)$/) {
326 } elsif (/^(END )?CHARMAP$/) {
328 } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
329 $used{hex($2)} .= "[${setname}{$1}]" unless $remove;
330 delete $used{hex($2)} if $remove;
332 die("Syntax error in line $. in file '$setfile':\n'$_'\n");
336 } elsif (/^STARTFONT /) {
337 # handle X11 BDF file
339 if (/^ENCODING\s+([0-9]+)/) {
340 $used{$1} .= "[${setname}]" unless $remove;
341 delete $used{$1} if $remove;
346 tr/a-z/A-Z/; # make input uppercase
349 if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
350 # handle Kosta Kosti's trans CEDF format
351 next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
353 die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
354 $used{$ucs} .= "[${setname}{$1}]" unless $remove;
355 delete $used{$ucs} if $remove;
357 die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
362 if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
363 # ignore ftp.unicode.org mapping file lines with #UNDEFINED
366 s/^([^\#]*)\#.*$/$1/; # remove comments
367 next if (/^\032?\s*$/); # skip empty lines
368 if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
369 # handle entry from a ftp.unicode.org mapping file
370 $used{hex($4)} .= "[${setname}{$2}]" unless $remove;
371 delete $used{hex($4)} if $remove;
372 } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
373 # handle interval specification
376 for ($i = $first; $i <= $last; $i++) {
377 $used{$i} .= "[${setname}]" unless $remove;
378 delete $used{$i} if $remove;
380 } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
381 # handle lines from P10 MES draft
384 $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
386 @cols = split(/\s+/, $cols);
389 $first = hex("$row$1");
391 } elsif (/^(..)-(..)$/) {
392 $first = hex("$row$1");
393 $last = hex("$row$2");
395 die ("this should never happen '$_'");
397 for ($i = $first; $i <= $last; $i++) {
398 $used{$i} .= "[${setname}]" unless $remove;
399 delete $used{$i} if $remove;
402 } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
403 # handle single character
404 $used{hex($2)} .= "[${setname}]" unless $remove;
405 delete $used{hex($2)} if $remove;
407 die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
411 } elsif (/^loadimages$/ || /^loadbigimages$/) {
412 if (/^loadimages$/) {
413 $prefix = "Small.Glyphs";
418 for $i (keys(%used)) {
419 next if ($name{$i} eq "<control>");
424 for $i (sort({$a <=> $b} keys(%used))) {
425 next if ($name{$i} eq "<control>");
427 $j = sprintf("%04X", $i);
429 $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
430 print("\r$count/$total: $gif");
431 system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
432 select(undef, undef, undef, 0.2);
436 } elsif (/^giftable/) {
437 # form a table of glyphs (requires pbmtools installed)
439 for $i (keys(%used)) {
440 $count++ unless $name{$i} eq "<control>";
442 $width = int(sqrt($count/sqrt(2)) + 0.5);
443 $width = $1 if /^giftable([0-9]+)$/;
444 system("rm -f tmp-*.pnm table.pnm~ table.pnm");
447 for $i (sort({$a <=> $b} keys(%used))) {
448 next if ($name{$i} eq "<control>");
449 $j = sprintf("%04X", $i);
451 $gif = "Small.Glyphs/$1/U$j.gif";
452 $pnm = sprintf("tmp-%02x.pnm", $col);
453 $fallback = "Small.Glyphs/FF/UFFFD.gif";
454 system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
455 if (++$col == $width) {
456 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
458 system("mv tmp-row.pnm table.pnm");
460 system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
464 system("rm -f tmp-*.pnm table.pnm~");
468 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
470 system("mv tmp-row.pnm table.pnm");
472 system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
475 system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
476 system("rm -f tmp-*.pnm table.pnm~ table.pnm");
477 } elsif (/^table$/) {
478 # go through all used names to print full table
479 print "<TABLE border=2>\n" if $html;
480 for $i (sort({$a <=> $b} keys(%used))) {
481 next if ($name{$i} eq "<control>");
483 $sources = $used{$i};
484 $sources =~ s/\]\[/, /g;
485 $sources =~ s/^\[//g;
486 $sources =~ s/\]$//g;
487 $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
488 $j = sprintf("%04X", $i);
490 $gif = "Small.Glyphs/$1/U$j.gif";
492 print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
493 printf("<TD>&#%d;", $i) if $adducs;
494 print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
495 print " ($comment{$i})" if $comment{$i};
496 print "</SAMP><TD><SMALL>$sources</SMALL>\n";
498 printf("%04X \# ", $i);
499 print pack("U", $i) . " " if $adducs;
500 print name($i) ."\n";
503 print "</TABLE>\n" if $html;
504 } elsif (/^imgblock$/) {
506 $width = $1 if /giftable([0-9]+)/;
509 print "\n<P><TABLE cellspacing=0 cellpadding=0>";
510 for $i (sort({$a <=> $b} keys(%used))) {
511 print "<TR>" if $col == 0;
512 $j = sprintf("%04X", $i);
514 $gif = "Small.Glyphs/$1/U$j.gif";
516 print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
517 $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
518 if (++$col == $width) {
519 print "<TR align=center>$subline";
524 print "<TR align=center>$subline" if ($col > 0);
526 } elsif (/^sources$/) {
527 # count how many characters are attributed to the various source set combinations
528 print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
529 for $i (keys(%used)) {
530 next if ($name{$i} eq "<control>");
531 $sources = $used{$i};
532 $sources =~ s/\]\[/, /g;
533 $sources =~ s/^\[//g;
534 $sources =~ s/\]$//g;
535 $sources =~ s/\{(..)\}//g;
536 $contribs{$sources} += 1;
538 for $j (keys(%contribs)) {
539 print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
541 print "</TABLE>\n" if $html;
542 } elsif (/^compact$/) {
543 # print compact table in P10 MES format
544 print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
545 print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
546 print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
550 for $i (sort({$a <=> $b} keys(%used))) {
551 next if ($name{$i} eq "<control>");
552 $row = sprintf("%02X", $i >> 8);
553 $col = sprintf("%02X", $i & 0xff);
554 if ($row ne $current_row) {
555 if (($last_col ne '') and ($last_col ne $start_col)) {
557 print "</SAMP>" if $html;
559 print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
560 print "\n $row\t" unless $html;
565 if ($start_col eq '') {
570 } elsif (hex($col) == hex($last_col) + 1) {
573 if ($last_col ne $start_col) {
577 if ($len > 60 && !$html) {
588 if (($last_col ne '') and ($last_col ne $start_col)) {
590 print "</SAMP>" if $html;
592 print "\n" if ($current_row ne '');
593 print "</TABLE>\n" if $html;
596 # print table as C interval array
601 for $i (sort({$a <=> $b} keys(%used))) {
602 next if ($name{$i} eq "<control>");
604 if (++$col > $columns) { $col = 1; print "\n "; }
605 printf(" { 0x%04X, ", $i);
607 } elsif ($i == $last_i + 1) {
610 printf("0x%04X },", $last_i);
611 if (++$col > $columns) { $col = 1; print "\n "; }
612 printf(" { 0x%04X, ", $i);
617 printf("0x%04X }", $last_i);
620 } elsif (/^utf8-list$/) {
624 for $i (sort({$a <=> $b} keys(%used))) {
625 next if ($name{$i} eq "<control>");
626 while ($blockend[$block] < $i && $block < $blocks - 1) {
629 if ($last <= $blockend[$block-1] &&
630 $i < $blockstart[$block]) {
631 print "\n" if ($col);
632 printf "\nFree block (U+%04X-U+%04X):\n\n",
633 $blockend[$block-1] + 1, $blockstart[$block] - 1;
636 if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
637 print "\n" if ($col);
638 printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
639 $blockstart[$block], $blockend[$block];
642 if ($category{$i} eq 'Mn') {
643 # prefix non-spacing character with U+25CC DOTTED CIRCLE
645 } elsif ($category{$i} eq 'Me') {
646 # prefix enclosing non-spacing character with space
650 $col += 1 + iswide($i);
657 print "\n" if ($col);
658 } elsif (/^collections$/) {
661 for $i (sort({$a <=> $b} keys(%used))) {
662 next if ($name{$i} eq "<control>");
663 while ($blockend[$block] < $i && $block < $blocks - 1) {
666 if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
667 print $blockname[$block],
668 " " x (40 - length($blockname[$block]));
669 printf "%04X-%04X\n",
670 $blockstart[$block], $blockend[$block];
675 print "<P>" if $html;
676 print "# " unless $html;
677 print "Number of characters in above table: ";
679 for $i (keys(%used)) {
680 $count++ unless $name{$i} eq "<control>";
684 } elsif (/^clean$/) {
685 # remove characters from set that are not in $unicodedata
686 for $i (keys(%used)) {
687 delete $used{$i} unless is_unicode($i);
689 } elsif (/^unknown$/) {
690 # remove characters from set that are in $unicodedata
691 for $i (keys(%used)) {
692 delete $used{$i} if is_unicode($i);
695 die("Unknown command line command '$_'\n");