po/check_po.pl

   1 #!/usr/bin/env perl
   2 #
   3 # check_po.pl  -  check po file translations for likely errors
   4 #
   5 # Written by David W. Pfitzner dwp@mso.anu.edu.au
   6 # This script is hereby placed in the Public Domain.
   7 #
   8 # Various checks on po file translations:
   9 # - printf-style format strings;
  10 # - differences in trailing newlines;
  11 # - empty (non-fuzzy) msgid;
  12 # - likely whitespace errors on joining multi-line entries
  13 # Ignores all fuzzy entries.
  14 #
  15 # Options:
  16 #  -x  Don't do standard checks above (eg, just check one of below).
  17 #  -n  Check newlines within strings; ie, that have equal numbers
  18 #      of newlines in msgstr and msgid.  (Optional because this may
  19 #      happen legitimately.)
  20 #  -w  Check leading whitespace.  Sometimes whitespace is simply
  21 #      spacing (eg, for widget labels etc), or punctuation differences,
  22 #      so this may be ok.
  23 #  -W  Check trailing whitespace.  See -w above.
  24 #  -p  Check trailing punctuation.
  25 #  -c  Check capitalization of first non-whitespace character
  26 #      (only if [a-zA-Z]).
  27 #  -e  Check on empty (c.q. new) msgstr
  28 #
  29 # Reads stdin (or filename args, via <>), writes any problems to stdout.
  30 #
  31 # Modified by Davide Pagnin nightmare@freeciv.it to support plural forms
  32 #
  33 # Version: 0.41     (2002-06-06)
  34
  35 # TODO: This script needs to be able to handle Farsi's %Id flag for
  36 #       number format specifiers.  More information on how it works, see
  37 #       http://www.gnu.org/software/hello/manual/gettext/c_002dformat.html
  38 #       It's possible someone has already made this change... look around
  39 #       for an updated version of this script.
  40
  41 use strict;
  42 use vars qw($opt_c $opt_n $opt_p $opt_w $opt_W $opt_x $opt_e);
  43 use Getopt::Std;
  44
  45 getopts('cnpwWxe');
  46
  47 # Globals, for current po entry:
  48 #
  49 # Note that msgid and msgstr have newlines represented by the
  50 # two characters '\' and 'n' (and similarly for other escapes).
  51
  52 my @amsgid;                     # lines exactly as in input
  53 my @amsgstr;
  54 my $entryline;                  # lineno where entry starts
  55 my $msgid;                      # lines joined by ""
  56 my $msgstr;
  57 my $is_fuzzy;
  58 my $is_cformat;
  59 my $state;                      # From constant values below.
  60 my $did_print;                  # Whether we have printed this entry, to
  61                                 # print only once for multiple problems.
  62
  63 use constant S_LOOKING_START => 0;   # looking for start of entry
  64 use constant S_DOING_MSGID   => 1;   # doing msgid part
  65 use constant S_DOING_MSGSTR  => 2;   # doing msgstr part
  66
  67 # Initialize or reinitalize globals to prepare for new entry:
  68 sub new_entry {
  69     @amsgid = ();
  70     @amsgstr = ();
  71     $msgid = undef;
  72     $msgstr = undef;
  73     $entryline = 0;
  74     $is_fuzzy = 0;
  75     $is_cformat = 0;
  76     $did_print = 0;
  77     $state = S_LOOKING_START;
  78 }
  79
  80 # Nicely print either a "msgid" or "msgstr" (name is one of these)
  81 # with given array of data.
  82 sub print_one {
  83     my $name = shift;
  84     print "  $name \"", join("\"\n  \"", @_), "\"\n";
  85 }
  86
  87 # Print a problem (args like print()), preceeded by entry unless
  88 # we have already printed that: label, and msgid and msgstr.
  89 #
  90 sub print_problem {
  91     unless ($did_print) {
  92         print "ENTRY:", ($ARGV eq "-" ? "" : " ($ARGV, line $entryline)"), "\n";
  93         print_one("msgid", @amsgid);
  94         print_one("msgstr", @amsgstr);
  95         $did_print = 1;
  96     }
  97     print "*** ", @_;
  98 }
  99
 100 # Check final newline: probably, translations should end in a newline
 101 # if and only if the original string does.
 102 # (See also check_trailing_whitespace and check_num_newlines below.)
 103 #
 104 sub check_trailing_newlines {
 105     if ($opt_x) { return; }
 106
 107     my ($ichar, $schar);
 108
 109     $ichar = (length($msgid)>=2) ? substr($msgid, -2, 2) : "";
 110     $schar = (length($msgstr)>=2) ? substr($msgstr, -2, 2) : "";
 111
 112     if ($ichar eq "\\n" && $schar ne "\\n") {
 113         print_problem "Missing trailing newline\n";
 114     }
 115     if ($ichar ne "\\n" && $schar eq "\\n") {
 116         print_problem "Extra trailing newline\n";
 117     }
 118
 119 }
 120
 121 # Check leading whitespace.  In general, any leading whitespace should
 122 # be the same in msgstr and msgid -- but not always.
 123 #
 124 sub check_leading_whitespace {
 125     unless ($opt_w) { return; }
 126
 127     my ($id, $str);
 128
 129     if ($msgid =~ m/^(\s+)/) {
 130         $id = $1;
 131     } else {
 132         $id = "";
 133     }
 134     if ($msgstr =~ m/^(\s+)/) {
 135         $str = $1;
 136     } else {
 137         $str = "";
 138     }
 139     if ($id ne $str) {
 140         print_problem "Different leading whitespace\n";
 141     }
 142 }
 143
 144 # Check trailing whitespace.  In general, any trailing whitespace should
 145 # be the same in msgstr and msgid -- but not always.
 146 #
 147 sub check_trailing_whitespace {
 148     unless ($opt_W) { return; }
 149
 150     my ($id, $str);
 151
 152     if ($msgid =~ m/((?:\s|\\n)+)$/) {
 153         $id = $1;
 154     } else {
 155         $id = "";
 156     }
 157     if ($msgstr =~ m/((?:\s|\\n)+)$/) {
 158         $str = $1;
 159     } else {
 160         $str = "";
 161     }
 162     if ($id ne $str) {
 163         print_problem "Different trailing whitespace\n";
 164     }
 165 }
 166
 167 # Check equal numbers of newlines.  In general ... etc.
 168 #
 169 sub check_num_newlines {
 170     unless ($opt_n) { return; }
 171
 172     my $num_i = ($msgid =~ m(\\n)g);
 173     my $num_s = ($msgstr =~ m(\\n)g);
 174
 175     if ($num_i != $num_s) {
 176         print_problem "Mismatch in newline count\n";
 177     }
 178
 179 }
 180
 181 # Check capitalization of first non-whitespace character (for [a-zA-Z]
 182 # only).  In general ... etc.
 183 #
 184 sub check_leading_capitalization {
 185     unless ($opt_c) { return; }
 186
 187     my ($id, $str);
 188
 189     if ($msgid =~ m/^\s*([a-zA-Z])/) {
 190         $id = $1;
 191     }
 192     if ($msgstr =~ m/^\s*([a-zA-Z])/) {
 193         $str = $1;
 194     }
 195     if (defined($id) && defined($str)) {
 196         if (($id =~ /^[a-z]$/ && $str =~ /^[A-Z]$/) ||
 197             ($id =~ /^[A-Z]$/ && $str =~ /^[a-z]$/)) {
 198             print_problem "Different leading capitalization\n";
 199         }
 200     }
 201 }
 202
 203 # Check trailing 'punctuation' characters (ignoring trailing whitespace).
 204 # In general .. etc.
 205 #
 206 sub check_trailing_punctuation {
 207     unless ($opt_p) { return; }
 208
 209     my ($id, $str);
 210
 211     # Might want more characters:
 212     if ($msgid =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) {
 213         $id = $1;
 214     } else {
 215         $id = "";
 216     }
 217     if ($msgstr =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) {
 218         $str = $1;
 219     } else {
 220         $str = "";
 221     }
 222     ##print "$id $str\n";
 223     if ($id ne $str) {
 224         print_problem "Different trailing punctuation\n";
 225     }
 226 }
 227
 228 # Check that multiline strings have whitespace separation, since
 229 # otherwise, eg:
 230 #   msgstr "this is a multiline"
 231 #          "string"
 232 # expands to:
 233 #   "this is a multilinestring"
 234 #
 235 sub check_whitespace_joins {
 236     if ($opt_x) { return; }
 237
 238     my $ok = 1;
 239     my $i = 0;
 240
 241     foreach my $aref (\@amsgid, \@amsgstr) {
 242         my $prev = undef;
 243       LINE:
 244         foreach my $line (@$aref) {
 245             if (defined($prev)
 246                 && length($prev)
 247                 && $prev !~ /\s$/
 248                 && $prev !~ /\\n$/
 249                 && $line !~ /^\s/
 250                 && $line !~ /^\\n/)
 251             {
 252                 $ok = 0;
 253                 last LINE;
 254             }
 255             $prev = $line;
 256         }
 257         if (!$ok) {
 258             print_problem("Possible non-whitespace line-join problem in ",
 259                           ($i==0 ? "msgid" : "msgstr"), " \n");
 260         }
 261         $i++;
 262     }
 263 }
 264
 265 # Check printf-style format entries.
 266 # Non-trivial, because translation strings may use format specifiers
 267 # out of order, or skip some specifiers etc.  Also gettext marks
 268 # anything with '%' as cformat, though not all are.
 269 #
 270 sub check_cformat {
 271     unless ($is_cformat) { return; }
 272     if ($opt_x)          { return; }
 273
 274     my (@iform, @sform);
 275     @iform = ($msgid =~ m/\%[0-9\.\$]*[a-z]/g);
 276     @sform = ($msgstr =~ m/\%[0-9\.\$]*[a-z]/g);
 277
 278     ##print join("::", @iform), "\n";
 279     ##print join("::", @sform), "\n";
 280
 281     my $js;                     # index in sform
 282     my $j;                      # index into iform
 283   SFORM:
 284     for ($js=0; $js < @sform; $js++) {
 285         my $sf = $sform[$js];
 286         my $sf_orig = $sf;
 287         if ($sf =~ s/^\%([0-9]+)\$(.*[a-z])$/\%$2/) {
 288             $j = $1-1;
 289         } else {
 290             $j = $js;
 291         }
 292         if ($j > $#iform) {
 293             print_problem("Format number mismatch for $sf_orig [msgstr:",
 294                           ($js+1), "]\n");
 295             next SFORM;
 296         }
 297         my $if = $iform[$j];
 298         if ($sf ne $if) {
 299             print_problem("Format mismatch: $sf_orig [msgstr:", ($js+1), "]",
 300                           " vs $if [msgid:", ($j+1), "]\n");
 301         }
 302     }
 303 }
 304
 305 # Run all individual checks on current entry, reporting any problems.
 306 sub check_entry {
 307     if ($is_fuzzy) {
 308         return;
 309     }
 310     $msgid = join("", @amsgid);
 311     $msgstr = join("", @amsgstr);
 312
 313     unless ($opt_x) {
 314         if (length($msgid)==0) {
 315             print_problem "Zero length msgid\n";
 316         }
 317     }
 318     if (length($msgstr)==0) {
 319         unless ($opt_e) { return; }
 320         print_problem "Untranslated msgid\n";
 321     }
 322     check_cformat;
 323     check_whitespace_joins;
 324     check_num_newlines;
 325     check_leading_whitespace;
 326     check_trailing_newlines;
 327     check_trailing_whitespace;
 328     check_leading_capitalization;
 329     check_trailing_punctuation;
 330 }
 331
 332 new_entry;
 333
 334 LINE:
 335 while(<>) {
 336     if ( m(^\s*$) ) {
 337         if ($state==S_DOING_MSGSTR) {
 338             check_entry;
 339             new_entry;
 340         }
 341         next LINE;
 342     }
 343     if ( m(^\#, fuzzy) ) {
 344         $is_fuzzy = 1;
 345     }
 346     if ( m(^\#, .*c-format) ) {
 347         # .* is because can have fuzzy, c-format
 348         $is_cformat = 1;
 349     }
 350     if ( m(^\#) ) {
 351         next LINE;
 352     }
 353     if ( m(^msgid \"(.*)\"$) ) {
 354         $entryline = $.;
 355         @amsgid = ($1);
 356         $state = S_DOING_MSGID;
 357         next LINE;
 358     }
 359     if ( m(^msgid_plural \"(.*)\"$) ) {
 360         $entryline = $.;
 361         @amsgid = ($1);
 362         $state = S_DOING_MSGID;
 363         next LINE;
 364     }
 365     if ( m(^msgstr \"(.*)\"$) ) {
 366         @amsgstr = ($1);
 367         $state = S_DOING_MSGSTR;
 368         next LINE;
 369     }
 370     if ( m(^msgstr\[[0-5]\] \"(.*)\"$) ) {
 371         @amsgstr = ($1);
 372         $state = S_DOING_MSGSTR;
 373         next LINE;
 374     }
 375     if ( m(^\"(.*)\"$) ) {
 376         if ($state==S_DOING_MSGID) {
 377             push @amsgid, $1;
 378         } elsif($state==S_DOING_MSGSTR) {
 379             push @amsgstr, $1;
 380         } else {
 381             die "Looking at string $_ in bad state $state,";
 382         }
 383         next LINE;
 384     }
 385     die "Unexpected at $.: ", $_;
 386 }