5 # language coverage analyzator
6 # (c)2006 Stepan Roh (PUBLIC DOMAIN)
7 # usage: ./langcover.pl fc-lang_dir sfd_file1 label1 (sfd_file2 label2...)
8 # files from http://webcvs.freedesktop.org/fontconfig/fontconfig/fc-lang/ should be downloaded to fc-lang directory
12 sub parse_fc_lang_dir
($);
13 sub parse_orth_file
($;$);
14 sub parse_sfd_file
($);
18 # map (language code => ( 'name' => name, 'chars' => list of glyphs, 'coverage' => ( sfd_file => coverage ) )
21 sub parse_fc_lang_dir
($) {
22 my ($fc_lang_dir) = @_;
24 opendir(DIR
, $fc_lang_dir) || die "Unable to open $fc_lang_dir : $!\n";
25 my @orth_files = map { "$fc_lang_dir/$_" } grep { /\
.orth
$/ } readdir(DIR
);
28 foreach $orth_file (@orth_files) {
29 parse_orth_file
($orth_file);
33 # missing or UTF-8 lang names
37 'nb' => 'Norwegian Bokmal',
38 'no' => 'Norwegian (Bokmal)',
40 'sma' => 'South Sami',
42 'smn' => 'Inari Sami',
43 'sms' => 'Skolt Sami',
45 'zh-tw' => 'Chinese (traditional)',
48 sub parse_orth_file
($;$) {
49 my ($orth_file, $lang) = @_;
52 ($lang) = ($orth_file =~ m
,/([^/]*)\
.[^./]*$,);
55 # XXX some names in orth files have different language codes
56 my $orth_lang = $lang;
57 $orth_lang = 'kw' if ($orth_lang eq 'ay');
58 $orth_lang = 'kw' if ($orth_lang eq 'fj');
59 $orth_lang = 'eth' if ($orth_lang eq 'gez');
60 $orth_lang = 'hi' if ($orth_lang eq 'pa');
61 $orth_lang = 'cu' if ($orth_lang eq 'sco');
62 $orth_lang = 'af' if ($orth_lang eq 'sm');
63 $orth_lang = 'smj' if ($orth_lang eq 'sms');
64 $orth_lang = 'ge' if ($orth_lang eq 'te');
65 if (exists($lang_names{$lang})) {
66 $langs{$lang}{'name'} = $lang_names{$lang};
68 my $f = new FileHandle
($orth_file) || die "Unable to open $orth_file : $!\n";
70 if (!exists($langs{$lang}{'name'})) {
71 if (/^#\s*(.*?)\s*\($lang\)/i) {
72 $langs{$lang}{'name'} = $1;
75 if (/^#\s*(.*?)\s*\($orth_lang\)/i) {
76 $langs{$lang}{'name'} = $1;
80 next if (/^\s*(#|$)/);
81 if (/^\s*include\s+(\S+)/) {
84 ($include_file = $orth_file) =~ s
,/[^/]+$,/$include,;
85 parse_orth_file
($include_file, $lang);
88 my ($start) = ($_ =~ /^\s*(\S+)/);
90 # XXX ibo.orth 1ee1 -> 1ee4 (https://bugs.freedesktop.org/show_bug.cgi?id=6237)
91 $start = '1ee4' if ($start eq '1ee1');
95 ($start, $end) = split(/-/, $start);
97 # XXX ab.orth 0re1 -> 04e1 (https://bugs.freedesktop.org/show_bug.cgi?id=6238)
98 $end = '04e1' if ($end eq '0re1');
99 $start = hex ($start);
101 for (my $dec_enc = $start; $dec_enc <= $end; $dec_enc++) {
102 $langs{$lang}{'chars'}{$dec_enc} = 1;
108 sub inc_coverage
($$) {
109 my ($sfd_file, $dec_enc) = @_;
111 foreach $lang (keys %langs) {
112 if (exists $langs{$lang}{'chars'}{$dec_enc}) {
113 $langs{$lang}{'coverage'}{$sfd_file}++;
118 sub parse_sfd_file
($) {
121 open (F
, $sfd_file) || die "Unable to open $sfd_file : $!\n";
126 if (/^StartChar:\s*(\S+)\s*$/) {
130 } elsif (/^Colour:/) {
131 # XXX this is quick'n'dirty hack to detect non-empty glyphs
133 } elsif (/^Encoding:\s*\d+\s*(\d+)\s*\d+\s*$/) {
135 } elsif ($curenc && !$empty && /^EndChar\s*/) {
136 inc_coverage
($sfd_file, $curenc);
142 # TODO: formats would be better
143 sub print_coverage
() {
145 This is the language coverage file for DejaVu fonts
150 foreach $sfd_file (@sfd_files) {
151 my $label = $sfd_files{$sfd_file};
152 printf "%-19s", $label;
155 foreach $lang (sort keys %langs) {
156 my $name = $langs{$lang}{'name'};
157 my $length = keys %{$langs{$lang}{'chars'}};
158 printf "%-6s %-40s", $lang, $name;
159 foreach $sfd_file (@sfd_files) {
160 my ($coverage) = $langs{$lang}{'coverage'}{$sfd_file};
161 $coverage = 0 if (!defined $coverage);
162 my ($percent) = ($length != 0) ?
($coverage/$length * 100) : 0;
164 printf " %3d%%", $percent;
168 printf " %-13s", "($coverage/$length)";
175 print STDERR
"usage: fc-lang_dir sfd_file1 label1 (sfd_file2 label2...)\n";
179 $fc_lang_dir = shift @ARGV;
183 $sfd_file = shift @ARGV;
184 $label = shift @ARGV;
185 push (@sfd_files, $sfd_file);
186 $sfd_files{$sfd_file} = $label;
189 parse_fc_lang_dir
($fc_lang_dir);
190 foreach $sfd_file (@sfd_files) {
191 parse_sfd_file
($sfd_file);