5 # unicode coverage analyzator
6 # (c)2004,2005 Stepan Roh (PUBLIC DOMAIN)
7 # usage: ./unicover.pl unicode_data_file blocks_file sfd_file1 label1 (sfd_file2 label2...)
8 # unicode data file can be downloaded from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
9 # blocks file can be downloaded from http://www.unicode.org/Public/UNIDATA/Blocks.txt
11 sub parse_blocks_file
($);
12 sub parse_unicode_data_file
($);
13 sub parse_sfd_file
($);
23 $Data::Dumper
::Indent
= 1;
24 $Data::Dumper
::Sortkeys
= 1;
25 $Data::Dumper
::Purity
= 1;
28 # map (start dec => ( 'name' => block name, 'end' => end dec, 'coverage' => ( sfd_file => coverage ), 'disabled_count' => number of disabled glyphs )
32 sub parse_blocks_file
($) {
33 my ($blocks_file) = @_;
35 open (F
, $blocks_file) || die "Unable to open $blocks_file : $!\n";
37 next if (/^\s*(#|$)/);
38 my ($start, $end, $name) = ($_ =~ /^(.*?)\.\.(.*?);\s*(.*?)\s*$/);
39 $start = hex ($start);
41 $blocks{$start}{'name'} = $name;
42 $blocks{$start}{'end'} = $end;
50 foreach $block_start (keys %blocks) {
51 my ($block_end) = $blocks{$block_start}{'end'};
52 if (($dec_enc >= $block_start) && ($dec_enc <= $block_end)) {
53 $blocks{$block_start}{'disabled_count'}++;
59 sub disable_char_range
($$) {
60 my ($range_start, $range_end) = @_;
62 my $cur_enc = $range_start;
63 while ($cur_enc <= $range_end) {
64 my $cur_block_start = -1;
65 foreach $block_start (keys %blocks) {
66 my ($block_end) = $blocks{$block_start}{'end'};
67 if (($cur_enc >= $block_start) && ($cur_enc <= $block_end)) {
68 $cur_block_start = $block_start;
72 return if ($cur_block_start == -1);
73 while (($cur_enc <= $range_end) && ($cur_enc <= $blocks{$cur_block_start}{'end'})) {
74 $blocks{$cur_block_start}{'disabled_count'}++;
80 sub parse_unicode_data_file
($) {
83 open (F
, $ud_file) || die "Unable to open $ud_file : $!\n";
86 next if (/^\s*(#|$)/);
87 my ($enc, $name) = split (/;/);
89 if ($prev_enc + 1 < $enc) {
90 disable_char_range
($prev_enc + 1, $enc - 1);
92 disable_char
($enc) if ($name =~ /^</);
93 $chars{$enc} = 1 if ($name !~ /^</);
96 # find last possible character
97 $last_enc = $prev_enc;
98 foreach $block_start (keys %blocks) {
99 my ($block_end) = $blocks{$block_start}{'end'};
100 $last_enc = $block_end if ($block_end > $last_enc);
102 if ($prev_enc + 1 <= $last_enc) {
103 disable_char_range
($prev_enc + 1, $last_enc);
108 sub inc_coverage
($$) {
109 my ($sfd_file, $dec_enc) = @_;
111 foreach $block_start (keys %blocks) {
112 my ($block_end) = $blocks{$block_start}{'end'};
113 if (($dec_enc >= $block_start) && ($dec_enc <= $block_end)) {
114 if (exists $chars{$dec_enc}) {
115 $blocks{$block_start}{'coverage'}{$sfd_file}++;
122 sub parse_sfd_file
($) {
125 open (F
, $sfd_file) || die "Unable to open $sfd_file : $!\n";
130 if (/^StartChar:\s*(\S+)\s*$/) {
134 } elsif (/^Colour:/) {
135 # XXX this is quick'n'dirty hack to detect non-empty glyphs
137 } elsif (/^Encoding:\s*\d+\s*(\d+)\s*\d+\s*$/) {
139 } elsif ($curenc && !$empty && /^EndChar\s*/) {
140 inc_coverage
($sfd_file, $curenc);
146 # TODO: formats would be better
147 sub print_coverage
() {
149 This is the Unicode coverage file for DejaVu fonts
152 Control and similar characters are discounted from totals.
156 foreach $sfd_file (@sfd_files) {
157 my $label = $sfd_files{$sfd_file};
158 printf "%-19s", $label;
161 foreach $block_start (sort { $a <=> $b } keys %blocks) {
162 my ($block_end) = $blocks{$block_start}{'end'};
163 my ($name) = $blocks{$block_start}{'name'};
164 my ($disabled) = $blocks{$block_start}{'disabled_count'};
165 $disabled = 0 if (!defined $disabled);
166 my ($length) = $block_end - $block_start + 1 - $disabled;
167 printf "U+%04x %-40s", $block_start, $name;
168 foreach $sfd_file (@sfd_files) {
169 my ($coverage) = $blocks{$block_start}{'coverage'}{$sfd_file};
170 $coverage = 0 if (!defined $coverage);
171 my ($percent) = ($length != 0) ?
($coverage/$length * 100) : 0;
173 printf " %3d%%", $percent;
177 printf " %-13s", "($coverage/$length)";
184 print STDERR
"usage: unicode_data_file blocks_file sfd_file1 label1 (sfd_file2 label2...)\n";
188 $unicode_data_file = shift @ARGV;
189 $blocks_file = shift @ARGV;
193 $sfd_file = shift @ARGV;
194 $label = shift @ARGV;
195 push (@sfd_files, $sfd_file);
196 $sfd_files{$sfd_file} = $label;
199 parse_blocks_file
($blocks_file);
200 parse_unicode_data_file
($unicode_data_file);
201 foreach $sfd_file (@sfd_files) {
202 parse_sfd_file
($sfd_file);
207 print STDERR Data
::Dumper
->Dump([\
%blocks], ['*blocks']);