* added U+233D-U+233E, U+234B, U+2352, U+236B-U+236D, U+2388
[dejavu.git] / dejavu-fonts / langcover.pl
blob42551a51cdbc8fcf5a65221ca9b4e2b1b7f14df6
1 #!/usr/bin/perl -w
3 # $Id$
5 # language coverage analyzator
6 # (c)2006 Stepan Roh (PUBLIC DOMAIN)
7 # usage: ./langcover.pl fc-lang_dir sfd_file1 label1 (sfd_file2 label2...)
8 # files from http://webcvs.freedesktop.org/fontconfig/fontconfig/fc-lang/ should be downloaded to fc-lang directory
10 use FileHandle;
12 sub parse_fc_lang_dir($);
13 sub parse_orth_file($;$);
14 sub parse_sfd_file($);
15 sub inc_coverage($$);
16 sub print_coverage();
18 # map (language code => ( 'name' => name, 'chars' => list of glyphs, 'coverage' => ( sfd_file => coverage ) )
19 %langs = ();
21 sub parse_fc_lang_dir($) {
22 my ($fc_lang_dir) = @_;
24 opendir(DIR, $fc_lang_dir) || die "Unable to open $fc_lang_dir : $!\n";
25 my @orth_files = map { "$fc_lang_dir/$_" } grep { /\.orth$/ } readdir(DIR);
26 closedir(DIR);
28 foreach $orth_file (@orth_files) {
29 parse_orth_file($orth_file);
33 # missing or UTF-8 lang names
34 %lang_names = (
35 'gn' => 'Guarani',
36 'ja' => 'Japanese',
37 'nb' => 'Norwegian Bokmal',
38 'no' => 'Norwegian (Bokmal)',
39 'se' => 'North Sami',
40 'sma' => 'South Sami',
41 'smj' => 'Lule Sami',
42 'smn' => 'Inari Sami',
43 'sms' => 'Skolt Sami',
44 'vo' => 'Volapuk',
45 'zh-tw' => 'Chinese (traditional)',
48 sub parse_orth_file($;$) {
49 my ($orth_file, $lang) = @_;
51 if (!defined $lang) {
52 ($lang) = ($orth_file =~ m,/([^/]*)\.[^./]*$,);
53 $lang =~ tr/_/-/;
55 # XXX some names in orth files have different language codes
56 my $orth_lang = $lang;
57 $orth_lang = 'kw' if ($orth_lang eq 'ay');
58 $orth_lang = 'kw' if ($orth_lang eq 'fj');
59 $orth_lang = 'eth' if ($orth_lang eq 'gez');
60 $orth_lang = 'hi' if ($orth_lang eq 'pa');
61 $orth_lang = 'cu' if ($orth_lang eq 'sco');
62 $orth_lang = 'af' if ($orth_lang eq 'sm');
63 $orth_lang = 'smj' if ($orth_lang eq 'sms');
64 $orth_lang = 'ge' if ($orth_lang eq 'te');
65 if (exists($lang_names{$lang})) {
66 $langs{$lang}{'name'} = $lang_names{$lang};
68 my $f = new FileHandle($orth_file) || die "Unable to open $orth_file : $!\n";
69 while (<$f>) {
70 if (!exists($langs{$lang}{'name'})) {
71 if (/^#\s*(.*?)\s*\($lang\)/i) {
72 $langs{$lang}{'name'} = $1;
73 next;
75 if (/^#\s*(.*?)\s*\($orth_lang\)/i) {
76 $langs{$lang}{'name'} = $1;
77 next;
80 next if (/^\s*(#|$)/);
81 if (/^\s*include\s+(\S+)/) {
82 my $include = $1;
83 my $include_file;
84 ($include_file = $orth_file) =~ s,/[^/]+$,/$include,;
85 parse_orth_file($include_file, $lang);
86 next;
88 my ($start) = ($_ =~ /^\s*(\S+)/);
89 if ($lang eq 'ibo') {
90 # XXX ibo.orth 1ee1 -> 1ee4 (https://bugs.freedesktop.org/show_bug.cgi?id=6237)
91 $start = '1ee4' if ($start eq '1ee1');
93 my $end = $start;
94 if ($start =~ /-/) {
95 ($start, $end) = split(/-/, $start);
97 # XXX ab.orth 0re1 -> 04e1 (https://bugs.freedesktop.org/show_bug.cgi?id=6238)
98 $end = '04e1' if ($end eq '0re1');
99 $start = hex ($start);
100 $end = hex ($end);
101 for (my $dec_enc = $start; $dec_enc <= $end; $dec_enc++) {
102 $langs{$lang}{'chars'}{$dec_enc} = 1;
105 $f->close();
108 sub inc_coverage($$) {
109 my ($sfd_file, $dec_enc) = @_;
111 foreach $lang (keys %langs) {
112 if (exists $langs{$lang}{'chars'}{$dec_enc}) {
113 $langs{$lang}{'coverage'}{$sfd_file}++;
118 sub parse_sfd_file($) {
119 my ($sfd_file) = @_;
121 open (F, $sfd_file) || die "Unable to open $sfd_file : $!\n";
122 my $curchar = '';
123 my $curenc = '';
124 my $empty = 0;
125 while (<F>) {
126 if (/^StartChar:\s*(\S+)\s*$/) {
127 $curchar = $1;
128 $curenc = '';
129 $empty = 0;
130 } elsif (/^Colour:/) {
131 # XXX this is quick'n'dirty hack to detect non-empty glyphs
132 $empty = 1;
133 } elsif (/^Encoding:\s*\d+\s*(\d+)\s*\d+\s*$/) {
134 $curenc = $1;
135 } elsif ($curenc && !$empty && /^EndChar\s*/) {
136 inc_coverage ($sfd_file, $curenc);
139 close (F);
142 # TODO: formats would be better
143 sub print_coverage() {
144 print <<END;
145 This is the language coverage file for DejaVu fonts
146 (\$Id\$)
149 print " ";
150 foreach $sfd_file (@sfd_files) {
151 my $label = $sfd_files{$sfd_file};
152 printf "%-19s", $label;
154 print "\n";
155 foreach $lang (sort keys %langs) {
156 my $name = $langs{$lang}{'name'};
157 my $length = keys %{$langs{$lang}{'chars'}};
158 printf "%-6s %-40s", $lang, $name;
159 foreach $sfd_file (@sfd_files) {
160 my ($coverage) = $langs{$lang}{'coverage'}{$sfd_file};
161 $coverage = 0 if (!defined $coverage);
162 my ($percent) = ($length != 0) ? ($coverage/$length * 100) : 0;
163 if ($percent > 0) {
164 printf " %3d%%", $percent;
165 } else {
166 print " ";
168 printf " %-13s", "($coverage/$length)";
170 print "\n";
174 if (@ARGV < 3) {
175 print STDERR "usage: fc-lang_dir sfd_file1 label1 (sfd_file2 label2...)\n";
176 exit 1;
179 $fc_lang_dir = shift @ARGV;
180 @sfd_files = ();
181 %sfd_files = ();
182 while (@ARGV) {
183 $sfd_file = shift @ARGV;
184 $label = shift @ARGV;
185 push (@sfd_files, $sfd_file);
186 $sfd_files{$sfd_file} = $label;
189 parse_fc_lang_dir($fc_lang_dir);
190 foreach $sfd_file (@sfd_files) {
191 parse_sfd_file($sfd_file);
193 print_coverage();