Cygwin: mmap: allow remapping part of an existing anonymous mapping
[newlib-cygwin.git] / newlib / libc / string / uniset
blobb118dd66315f1e3fb0a24b558339a4c06c93b0bb
1 #!/usr/bin/perl
2 # Uniset -- Unicode subset manager -- Markus Kuhn
3 # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
5 require 5.014;
6 use open ':utf8';
7 use FindBin qw($RealBin); # to find directory where this file is located
9 binmode(STDOUT, ":utf8");
10 binmode(STDIN, ":utf8");
12 my (%name, %invname, %category, %comment);
14 print <<End if $#ARGV < 0;
15 Uniset -- Unicode subset manager -- Markus Kuhn
17 Uniset merges and subtracts Unicode subsets. It can output and
18 analyse the resulting character set in various formats.
20 Uniset understand the following command-line arguments:
22 Commands to define a set of characters:
24 + filename add the character set described in the file to the set
25 - filename remove the character set described in the file from the set
26 +: filename add the characters in the UTF-8 file to the set
27 -: filename remove the characters in the UTF-8 file from the set
28 +xxxx..yyyy add the range to the set (xxxx and yyyy are hex numbers)
29 -xxxx..yyyy remove the range from the set (xxxx and yyyy are hex numbers)
30 +cat=Xx add all Unicode characters with category code Xx
31 -cat=Xx remove all Unicode characters with category code Xx
32 -cat!=Xx remove all Unicode characters without category code Xx
33 clean remove any elements that do not appear in the Unicode database
34 unknown remove any elements that do appear in the Unicode database
36 Command to output descriptions of the constructed set of characters:
38 table write a full table with one line per character
39 compact output the set in compact MES format
40 c output the set as C interval array
41 nr output the number of characters
42 sources output a table that shows the number of characters contributed
43 by the various combinations of input sets added with +.
44 utf8-list output a list of all characters encoded in UTF-8
46 Commands to tailor the following output commands:
48 html write HTML tables instead of plain text
49 ucs add the unicode character itself to the table (UTF-8 in
50 plain table, numeric character reference in HTML)
52 Formats of character set input files read by the + and - command:
54 Empty lines, white space at the start and end of the line and any
55 comment text following a \# are ignored. The following formats are
56 recognized
58 xx yyyy xx is the hex code in an 8-bit character set and yyyy
59 is the corresponding Unicode value. Both can optionally
60 be prefixed by 0x. This is the format used in the
61 files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
63 yyyy yyyy (optionally prefixed with 0x) is a Unicode character
64 belonging to the specified subset.
66 yyyy-yyyy a range of Unicode characters belonging to
67 yyyy..yyyy the specified subset.
69 xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify
70 corresponding low bytes or with a hyphen also ranges of
71 low bytes in the Unicode values that belong to this
72 subset. This is also the format that is generated by
73 the compact command.
74 End
75 exit 1 if $#ARGV < 0;
78 # Subroutine to identify whether the ISO 10646/Unicode character code
79 # ucs belongs into the East Asian Wide (W) or East Asian FullWidth
80 # (F) category as defined in Unicode Technical Report #11.
82 sub iswide ($) {
83 my $ucs = shift(@_);
85 return ($ucs >= 0x1100 &&
86 ($ucs <= 0x115f || # Hangul Jamo
87 $ucs == 0x2329 || $ucs == 0x232a ||
88 ($ucs >= 0x2e80 && $ucs <= 0xa4cf &&
89 $ucs != 0x303f) || # CJK .. Yi
90 ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables
91 ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs
92 ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms
93 ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms
94 ($ucs >= 0xffe0 && $ucs <= 0xffe6) ||
95 ($ucs >= 0x20000 && $ucs <= 0x2fffd) ||
96 ($ucs >= 0x30000 && $ucs <= 0x3fffd)));
99 # Return the Unicode name that belongs to a given character code
101 # Jamo short names, see Unicode 3.0, table 4-4, page 86
103 my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
104 'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
105 my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
106 'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
107 'EU', 'YI', 'I'); # 1161..1175
108 my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
109 'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
110 'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
112 sub name {
113 my $ucs = shift(@_);
115 # The intervals used here reflect Unicode Version 3.2
116 if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
117 ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
118 ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
119 return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
122 if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
123 my $s = $ucs - 0xac00;
124 my $l = 0x1100 + int($s / (21 * 28));
125 my $v = 0x1161 + int(($s % (21 * 28)) / 28);
126 my $t = 0x11a7 + $s % 28;
127 return "HANGUL SYLLABLE " .
128 ($lname[int($s / (21 * 28))] .
129 $vname[int(($s % (21 * 28)) / 28)] .
130 $tname[$s % 28 - 1]);
133 return $name{$ucs};
136 sub is_unicode {
137 my $ucs = shift(@_);
139 # The intervals used here reflect Unicode Version 3.2
140 if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
141 ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
142 ($ucs >= 0xac00 && $ucs <= 0xd7a3) ||
143 ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
144 return 1;
147 return exists $name{$ucs};
150 my @search_path = ();
151 if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") {
152 push @search_path, "$1/share/uniset";
153 } else {
154 push @search_path, $RealBin;
157 sub search_open {
158 my ($mode, $fn) = @_;
159 my $file;
160 return $file if open($file, $mode, $fn);
161 return undef if $fn =~ m|/|;
162 for my $path (@search_path) {
163 return $file if open($file, $mode, "$path/$fn");
165 return undef;
168 my $html = 0;
169 my $image = 0;
170 my $adducs = 0;
171 my $unicodedata = "UnicodeData.txt";
172 my $blockdata = "Blocks.txt";
174 # read list of all Unicode names
175 my $data = search_open('<', $unicodedata);
176 unless ($data) {
177 die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
178 "Please make sure that you have downloaded the file\n" .
179 "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
181 while (<$data>) {
182 if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) {
183 next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
184 $ucs = hex($1);
185 $name{$ucs} = $2;
186 $invname{$2} = $ucs;
187 $category{$ucs} = $3;
188 $comment{$ucs} = $12;
189 } else {
190 die("Syntax error in line '$_' in file '$unicodedata'\n");
193 close($data);
195 # read list of all Unicode blocks
196 $data = search_open('<', $blockdata);
197 unless ($data) {
198 die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
199 "Please make sure that you have downloaded the file\n" .
200 "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
202 my $blocks = 0;
203 my (@blockstart, @blockend, @blockname);
204 while (<$data>) {
205 if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) {
206 $blockstart[$blocks] = hex($1);
207 $blockend [$blocks] = hex($2);
208 $blockname [$blocks] = $3;
209 $blocks++;
210 } elsif (/^\s*\#/ || /^\s*$/) {
211 # ignore comments and empty lines
212 } else {
213 die("Syntax error in line '$_' in file '$blockdata'\n");
216 close($data);
217 if ($blockend[$blocks-1] < 0x110000) {
218 $blockstart[$blocks] = 0x110000;
219 $blockend [$blocks] = 0x7FFFFFFF;
220 $blockname [$blocks] = "Beyond Plane 16";
221 $blocks++;
224 # process command line arguments
225 while ($_ = shift(@ARGV)) {
226 if (/^html$/) {
227 $html = 1;
228 } elsif (/^ucs$/) {
229 $adducs = 1;
230 } elsif (/^img$/) {
231 $html = 1;
232 $image = 1;
233 } elsif (/^template$/) {
234 $template = shift(@ARGV);
235 open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n");
236 while (<TEMPLATE>) {
237 if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
238 open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
239 while (<INCLUDE>) {
240 print $_;
242 close(INCLUDE);
243 } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
244 open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
245 while (<INCLUDE>) {
246 s/&/&amp;/g;
247 s/</&lt;/g;
248 print $_;
250 close(INCLUDE);
251 } else {
252 print $_;
255 close(TEMPLATE);
256 } elsif (/^\+cat=(.+)$/) {
257 # add characters with given category
258 $cat = $1;
259 for $i (keys(%category)) {
260 $used{$i} = "[${cat}]" if $category{$i} eq $cat;
262 } elsif (/^\-cat=(.+)$/) {
263 # remove characters with given category
264 $cat = $1;
265 for $i (keys(%category)) {
266 delete $used{$i} if $category{$i} eq $cat;
268 } elsif (/^\-cat!=(.+)$/) {
269 # remove characters without given category
270 $cat = $1;
271 for $i (keys(%category)) {
272 delete $used{$i} unless $category{$i} eq $cat;
274 } elsif (/^([+-]):(.*)/) {
275 $remove = $1 eq "-";
276 $setfile = $2;
277 $setfile = shift(@ARGV) if $setfile eq "";
278 push(@SETS, $setfile);
279 open(SET, $setfile) || die("Can't open set file '$setfile': $!\n");
280 $setname = $setfile;
281 while (<SET>) {
282 while ($_) {
283 $i = ord($_);
284 $used{$i} .= "[${setname}]" unless $remove;
285 delete $used{$i} if $remove;
286 $_ = substr($_, 1);
289 close SET;
290 } elsif (/^([+-])(.*)/) {
291 $remove = $1 eq "-";
292 $setfile = $2;
293 $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
294 if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
295 # handle intervall specification on command line
296 $first = hex($1);
297 $last = hex($3);
298 for ($i = $first; $i <= $last; $i++) {
299 $used{$i} .= "[ARG]" unless $remove;
300 delete $used{$i} if $remove;
302 next;
304 $setfile = shift(@ARGV) if $setfile eq "";
305 push(@SETS, $setfile);
306 my $setf = search_open('<', $setfile);
307 die("Can't open set file '$setfile': $!\n") unless $setf;
308 $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
309 $setname = $setfile;
310 $setname =~ s/([^.\[\]]*)\..*/$1/;
311 while (<$setf>) {
312 if (/^<code_set_name>/) {
313 # handle ISO 15897 (POSIX registry) charset mapping format
314 undef $comment_char;
315 undef $escape_char;
316 while (<$setf>) {
317 if ($comment_char && /^$comment_char/) {
318 # remove comments
319 $_ = $`;
321 next if (/^\032?\s*$/); # skip empty lines
322 if (/^<comment_char> (\S)$/) {
323 $comment_char = $1;
324 } elsif (/^<escape_char> (\S)$/) {
325 $escape_char = $1;
326 } elsif (/^(END )?CHARMAP$/) {
327 #ignore
328 } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
329 $used{hex($2)} .= "[${setname}{$1}]" unless $remove;
330 delete $used{hex($2)} if $remove;
331 } else {
332 die("Syntax error in line $. in file '$setfile':\n'$_'\n");
335 next;
336 } elsif (/^STARTFONT /) {
337 # handle X11 BDF file
338 while (<$setf>) {
339 if (/^ENCODING\s+([0-9]+)/) {
340 $used{$1} .= "[${setname}]" unless $remove;
341 delete $used{$1} if $remove;
344 next;
346 tr/a-z/A-Z/; # make input uppercase
347 if ($cedf) {
348 if ($. > 4) {
349 if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
350 # handle Kosta Kosti's trans CEDF format
351 next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
352 $ucs = $invname{$2};
353 die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
354 $used{$ucs} .= "[${setname}{$1}]" unless $remove;
355 delete $used{$ucs} if $remove;
356 } else {
357 die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
360 next;
362 if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
363 # ignore ftp.unicode.org mapping file lines with #UNDEFINED
364 next;
366 s/^([^\#]*)\#.*$/$1/; # remove comments
367 next if (/^\032?\s*$/); # skip empty lines
368 if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
369 # handle entry from a ftp.unicode.org mapping file
370 $used{hex($4)} .= "[${setname}{$2}]" unless $remove;
371 delete $used{hex($4)} if $remove;
372 } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
373 # handle interval specification
374 $first = hex($2);
375 $last = hex($5);
376 for ($i = $first; $i <= $last; $i++) {
377 $used{$i} .= "[${setname}]" unless $remove;
378 delete $used{$i} if $remove;
380 } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
381 # handle lines from P10 MES draft
382 $row = $1;
383 $cols = $_;
384 $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
385 $cols =~ tr/,//d;
386 @cols = split(/\s+/, $cols);
387 for (@cols) {
388 if (/^(..)$/) {
389 $first = hex("$row$1");
390 $last = $first;
391 } elsif (/^(..)-(..)$/) {
392 $first = hex("$row$1");
393 $last = hex("$row$2");
394 } else {
395 die ("this should never happen '$_'");
397 for ($i = $first; $i <= $last; $i++) {
398 $used{$i} .= "[${setname}]" unless $remove;
399 delete $used{$i} if $remove;
402 } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
403 # handle single character
404 $used{hex($2)} .= "[${setname}]" unless $remove;
405 delete $used{hex($2)} if $remove;
406 } else {
407 die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
410 close $setf;
411 } elsif (/^loadimages$/ || /^loadbigimages$/) {
412 if (/^loadimages$/) {
413 $prefix = "Small.Glyphs";
414 } else {
415 $prefix = "Glyphs";
417 $total = 0;
418 for $i (keys(%used)) {
419 next if ($name{$i} eq "<control>");
420 $total++;
422 $count = 0;
423 $| = 1;
424 for $i (sort({$a <=> $b} keys(%used))) {
425 next if ($name{$i} eq "<control>");
426 $count++;
427 $j = sprintf("%04X", $i);
428 $j =~ /(..)(..)/;
429 $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
430 print("\r$count/$total: $gif");
431 system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
432 select(undef, undef, undef, 0.2);
434 print("\n");
435 exit 0;
436 } elsif (/^giftable/) {
437 # form a table of glyphs (requires pbmtools installed)
438 $count = 0;
439 for $i (keys(%used)) {
440 $count++ unless $name{$i} eq "<control>";
442 $width = int(sqrt($count/sqrt(2)) + 0.5);
443 $width = $1 if /^giftable([0-9]+)$/;
444 system("rm -f tmp-*.pnm table.pnm~ table.pnm");
445 $col = 0;
446 $row = 0;
447 for $i (sort({$a <=> $b} keys(%used))) {
448 next if ($name{$i} eq "<control>");
449 $j = sprintf("%04X", $i);
450 $j =~ /(..)(..)/;
451 $gif = "Small.Glyphs/$1/U$j.gif";
452 $pnm = sprintf("tmp-%02x.pnm", $col);
453 $fallback = "Small.Glyphs/FF/UFFFD.gif";
454 system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
455 if (++$col == $width) {
456 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
457 if ($row == 0) {
458 system("mv tmp-row.pnm table.pnm");
459 } else {
460 system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
462 $row++;
463 $col = 0;
464 system("rm -f tmp-*.pnm table.pnm~");
467 if ($col > 0) {
468 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
469 if ($row == 0) {
470 system("mv tmp-row.pnm table.pnm");
471 } else {
472 system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
475 system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
476 system("rm -f tmp-*.pnm table.pnm~ table.pnm");
477 } elsif (/^table$/) {
478 # go through all used names to print full table
479 print "<TABLE border=2>\n" if $html;
480 for $i (sort({$a <=> $b} keys(%used))) {
481 next if ($name{$i} eq "<control>");
482 if ($html) {
483 $sources = $used{$i};
484 $sources =~ s/\]\[/, /g;
485 $sources =~ s/^\[//g;
486 $sources =~ s/\]$//g;
487 $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
488 $j = sprintf("%04X", $i);
489 $j =~ /(..)(..)/;
490 $gif = "Small.Glyphs/$1/U$j.gif";
491 print "<TR>";
492 print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
493 printf("<TD>&#%d;", $i) if $adducs;
494 print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
495 print " ($comment{$i})" if $comment{$i};
496 print "</SAMP><TD><SMALL>$sources</SMALL>\n";
497 } else {
498 printf("%04X \# ", $i);
499 print pack("U", $i) . " " if $adducs;
500 print name($i) ."\n";
503 print "</TABLE>\n" if $html;
504 } elsif (/^imgblock$/) {
505 $width = 16;
506 $width = $1 if /giftable([0-9]+)/;
507 $col = 0;
508 $subline = "";
509 print "\n<P><TABLE cellspacing=0 cellpadding=0>";
510 for $i (sort({$a <=> $b} keys(%used))) {
511 print "<TR>" if $col == 0;
512 $j = sprintf("%04X", $i);
513 $j =~ /(..)(..)/;
514 $gif = "Small.Glyphs/$1/U$j.gif";
515 $alt = name($i);
516 print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
517 $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
518 if (++$col == $width) {
519 print "<TR align=center>$subline";
520 $col = 0;
521 $subline = "";
524 print "<TR align=center>$subline" if ($col > 0);
525 print "</TABLE>\n";
526 } elsif (/^sources$/) {
527 # count how many characters are attributed to the various source set combinations
528 print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
529 for $i (keys(%used)) {
530 next if ($name{$i} eq "<control>");
531 $sources = $used{$i};
532 $sources =~ s/\]\[/, /g;
533 $sources =~ s/^\[//g;
534 $sources =~ s/\]$//g;
535 $sources =~ s/\{(..)\}//g;
536 $contribs{$sources} += 1;
538 for $j (keys(%contribs)) {
539 print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
541 print "</TABLE>\n" if $html;
542 } elsif (/^compact$/) {
543 # print compact table in P10 MES format
544 print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
545 print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
546 print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
547 $current_row = '';
548 $start_col = '';
549 $last_col = '';
550 for $i (sort({$a <=> $b} keys(%used))) {
551 next if ($name{$i} eq "<control>");
552 $row = sprintf("%02X", $i >> 8);
553 $col = sprintf("%02X", $i & 0xff);
554 if ($row ne $current_row) {
555 if (($last_col ne '') and ($last_col ne $start_col)) {
556 print "-$last_col";
557 print "</SAMP>" if $html;
559 print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
560 print "\n $row\t" unless $html;
561 $len = 0;
562 $current_row = $row;
563 $start_col = '';
565 if ($start_col eq '') {
566 print "$col";
567 $len += 2;
568 $start_col = $col;
569 $last_col = $col;
570 } elsif (hex($col) == hex($last_col) + 1) {
571 $last_col = $col;
572 } else {
573 if ($last_col ne $start_col) {
574 print "-$last_col";
575 $len += 3;
577 if ($len > 60 && !$html) {
578 print "\n $row\t";
579 $len = 0;
581 print " " if $len;
582 print "$col";
583 $len += 2 + !! $len;
584 $start_col = $col;
585 $last_col = $col;
588 if (($last_col ne '') and ($last_col ne $start_col)) {
589 print "-$last_col";
590 print "</SAMP>" if $html;
592 print "\n" if ($current_row ne '');
593 print "</TABLE>\n" if $html;
594 print "\n";
595 } elsif (/^c$/) {
596 # print table as C interval array
597 print "{";
598 $last_i = '';
599 $columns = 3;
600 $col = $columns;
601 for $i (sort({$a <=> $b} keys(%used))) {
602 next if ($name{$i} eq "<control>");
603 if ($last_i eq '') {
604 if (++$col > $columns) { $col = 1; print "\n "; }
605 printf(" { 0x%04X, ", $i);
606 $last_i = $i;
607 } elsif ($i == $last_i + 1) {
608 $last_i = $i;
609 } else {
610 printf("0x%04X },", $last_i);
611 if (++$col > $columns) { $col = 1; print "\n "; }
612 printf(" { 0x%04X, ", $i);
613 $last_i = $i;
616 if ($last_i ne '') {
617 printf("0x%04X }", $last_i);
619 print "\n};\n";
620 } elsif (/^utf8-list$/) {
621 $col = 0;
622 $block = 0;
623 $last = -1;
624 for $i (sort({$a <=> $b} keys(%used))) {
625 next if ($name{$i} eq "<control>");
626 while ($blockend[$block] < $i && $block < $blocks - 1) {
627 $block++;
629 if ($last <= $blockend[$block-1] &&
630 $i < $blockstart[$block]) {
631 print "\n" if ($col);
632 printf "\nFree block (U+%04X-U+%04X):\n\n",
633 $blockend[$block-1] + 1, $blockstart[$block] - 1;
634 $col = 0;
636 if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
637 print "\n" if ($col);
638 printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
639 $blockstart[$block], $blockend[$block];
640 $col = 0;
642 if ($category{$i} eq 'Mn') {
643 # prefix non-spacing character with U+25CC DOTTED CIRCLE
644 print "\x{25CC}";
645 } elsif ($category{$i} eq 'Me') {
646 # prefix enclosing non-spacing character with space
647 print " ";
649 print pack("U", $i);
650 $col += 1 + iswide($i);
651 if ($col >= 64) {
652 print "\n";
653 $col = 0;
655 $last = $i;
657 print "\n" if ($col);
658 } elsif (/^collections$/) {
659 $block = 0;
660 $last = -1;
661 for $i (sort({$a <=> $b} keys(%used))) {
662 next if ($name{$i} eq "<control>");
663 while ($blockend[$block] < $i && $block < $blocks - 1) {
664 $block++;
666 if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
667 print $blockname[$block],
668 " " x (40 - length($blockname[$block]));
669 printf "%04X-%04X\n",
670 $blockstart[$block], $blockend[$block];
672 $last = $i;
674 } elsif (/^nr$/) {
675 print "<P>" if $html;
676 print "# " unless $html;
677 print "Number of characters in above table: ";
678 $count = 0;
679 for $i (keys(%used)) {
680 $count++ unless $name{$i} eq "<control>";
682 print $count;
683 print "\n";
684 } elsif (/^clean$/) {
685 # remove characters from set that are not in $unicodedata
686 for $i (keys(%used)) {
687 delete $used{$i} unless is_unicode($i);
689 } elsif (/^unknown$/) {
690 # remove characters from set that are in $unicodedata
691 for $i (keys(%used)) {
692 delete $used{$i} if is_unicode($i);
694 } else {
695 die("Unknown command line command '$_'\n");