i18nlangtag/source/isolang/langid.pl

   1 : # -*- perl -*-  vim: ft=perl
   2 eval 'exec perl -w -S $0 ${1+"$@"}'
   3 if 0;
   4 #
   5 # This file is part of the LibreOffice project.
   6 #
   7 # This Source Code Form is subject to the terms of the Mozilla Public
   8 # License, v. 2.0. If a copy of the MPL was not distributed with this
   9 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  10 #
  11 # This file incorporates work covered by the following license notice:
  12 #
  13 #   Licensed to the Apache Software Foundation (ASF) under one or more
  14 #   contributor license agreements. See the NOTICE file distributed
  15 #   with this work for additional information regarding copyright
  16 #   ownership. The ASF licenses this file to you under the Apache
  17 #   License, Version 2.0 (the "License"); you may not use this file
  18 #   except in compliance with the License. You may obtain a copy of
  19 #   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  20 #
  21
  22 # See Usage() below or invoke without arguments for short instructions.
  23 # For long instructions use the source, Luke ;-)
  24
  25 use strict;
  26
  27 sub Usage()
  28 {
  29     print STDERR
  30         "\n",
  31         "langid - a hackish utility to lookup lang.h language defines and LangIDs,\n",
  32         "isolang.cxx ISO639/ISO3166 mapping, locale data files, langtab.hrc language\n",
  33         "listbox entries, langlist.mk, file_ooo.scp registry name, languages.pm and\n",
  34         "msi-encodinglist.txt\n\n",
  35
  36         "Usage: $0 [--single] {language string} | {LangID} | {primarylanguage sublanguage} | {language-country}\n\n",
  37
  38         "A language string will be used as a generic string match in all searched files.\n",
  39         "You may enclose the language string in word delimiters,\n",
  40         "e.g. \\blanguage_german\\b for a specific match.\n",
  41         "If the language string expression matches more than one define,\n",
  42         "e.g. as in 'german', all matching defines will be processed.\n",
  43         "If the language string does not match a define or an identifier in\n",
  44         "langtab.hrc, a generic string match of the listbox entries will be tried.\n\n",
  45
  46         "Numeric values of LangID,primarylanguage,sublanguage can be given\n",
  47         "decimal, hexadecimal (leading 0x), octal (leading 0) or binary (leading 0b).\n",
  48         "The exact language_define of an exact match will be used in remaining lookups.\n\n",
  49
  50         "A language-country pair will lookup a xx-YY mapping from isolang.cxx,\n",
  51         "for example: 'en-US' or 'de-' or '-CH',\n",
  52         "xx and YY can be given case insensitive, will be lowered-uppered internally,\n",
  53         "and xx and YY themselves may be regular expressions.\n",
  54         "Also here a list of matches will be processed.\n\n",
  55
  56         "If option --single is given, only the first match will be processed.\n\n";
  57 }
  58
  59 my $SRC_ROOT = $ENV{"SRC_ROOT"};
  60 if (!defined($SRC_ROOT))
  61 {
  62     print "\nNeed \$SRC_ROOT, please set your LibreOffice environment!\n";
  63     Usage();
  64     exit 1;
  65 }
  66
  67 my $LANGUAGE_MASK_PRIMARY = 0x03ff;
  68
  69 sub getPrimaryLanguage($)
  70 {
  71     my($lcid) = @_;
  72     return $lcid & $LANGUAGE_MASK_PRIMARY;
  73 }
  74
  75 sub getSubLanguage($)
  76 {
  77     my($lcid) = @_;
  78     return $lcid >> 10;
  79 }
  80
  81 sub makeLangID($$)
  82 {
  83     my( $sub, $pri) = @_;
  84     return ($sub << 10) | $pri;
  85 }
  86
  87
  88 # Note that a regex needs a duplicated pair of backslashes to produce a literal
  89 # \\ like in \\\\* to search for zero or more \ backslashes.
  90 # @addregex can be an optional "block to grep" definition
  91 # (regex-to-start-block, regex-to-end-block, regex-to-find-in-block)
  92 sub grepFile($$$$$@)
  93 {
  94     my( $regex, $path, $module, $name, $printmsg, @addregex) = @_;
  95     my @result;
  96     my $found = 0;
  97     my $areopen = 0;
  98     my $arecloser = '';
  99     # Try module under current working directory first to catch local
 100     # modifications.
 101     my $file = "./$module/$name";
 102     if (!($found = open( IN, $file)))
 103     {
 104         # Then with the given path.
 105         $file = "$path/$module/$name";
 106         if (!($found = open( IN, $file)))
 107         {
 108             print "No $file\n";
 109         }
 110     }
 111     if ($found)
 112     {
 113         $found = 0;
 114         while (my $line = <IN>)
 115         {
 116             if ($line =~ /$regex/)
 117             {
 118                 if (!$found)
 119                 {
 120                     $found = 1;
 121                     print "$file:\n";
 122                 }
 123                 chomp( $line);
 124                 print "$line\n";
 125                 push( @result, $line);
 126             }
 127             elsif (@addregex)
 128             {
 129                 # By convention first element is opener, second element is closer.
 130                 if (!$areopen)
 131                 {
 132                     if ($line =~ /$addregex[0]/)
 133                     {
 134                         $areopen = 1;
 135                         $arecloser = $addregex[1];
 136                     }
 137                 }
 138                 if ($areopen)
 139                 {
 140                     for (my $i = 2; $i < @addregex; ++$i)
 141                     {
 142                         if ($line =~ /$addregex[$i]/)
 143                         {
 144                             if (!$found)
 145                             {
 146                                 $found = 1;
 147                                 print "$file:\n";
 148                             }
 149                             chomp( $line);
 150                             print "$line\n";
 151                             push( @result, $line);
 152                         }
 153                     }
 154                     if ($line =~ /$arecloser/)
 155                     {
 156                         $areopen = 0;
 157                     }
 158                 }
 159             }
 160         }
 161         close( IN);
 162     }
 163     if (!$found && $printmsg) {
 164         print "Not found in $file\n";
 165         #print "Not found in $file for $regex @addregex\n";
 166     }
 167     return @result;
 168 }
 169
 170
 171 sub main()
 172 {
 173     my( $lcid, @parts, $grepdef, $options, $single);
 174     $grepdef = 0;
 175     $single = 0;
 176     for ($options = 0; $options < @ARGV && $ARGV[$options] =~ /^--/; ++$options)
 177     {
 178         if ($ARGV[$options] eq '--single') { $single = 1; }
 179         else { print "Unknown option: $ARGV[$options]\n"; }
 180     }
 181     if (@ARGV == 1 + $options)
 182     {
 183         # 0x hex, 0b bin, 0 oct
 184         if ($ARGV[$options] =~ /^0/) {
 185             $lcid = oct( $ARGV[0]); }
 186         elsif ($ARGV[$options] =~ /^[0-9]/) {
 187             $lcid = $ARGV[$options]; }
 188         else
 189         {
 190             $grepdef = $ARGV[$options];
 191             $lcid = 0;
 192         }
 193         $parts[0] = getPrimaryLanguage( $lcid);
 194         $parts[1] = getSubLanguage( $lcid);
 195     }
 196     elsif (@ARGV == 2 + $options)
 197     {
 198         for (my $i = $options; $i < 2 + $options; ++$i)
 199         {
 200             if ($ARGV[$i] =~ /^0/) {
 201                 $parts[$i] = oct( $ARGV[$i]); }
 202             else {
 203                 $parts[$i] = $ARGV[$i]; }
 204         }
 205         $lcid = makeLangID( $parts[1], $parts[0]);
 206     }
 207     else
 208     {
 209         Usage();
 210         return 1;
 211     }
 212     my $modifier = "(?i)";
 213     my (@resultlist, @greplist, $result);
 214     # If no string was given on the command line, but value(s) were, lookup the
 215     # LangID value to obtain the define identifier.
 216     if ($grepdef)
 217     {
 218         # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
 219         @resultlist = grepFile(
 220             $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
 221             "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
 222     }
 223     else
 224     {
 225         printf( "LangID: 0x%04X (dec %d), primary: 0x%03x, sub 0x%02x\n", $lcid,
 226                 $lcid, $parts[0], $parts[1]);
 227         my $buf = sprintf( "0x%04X", $lcid);
 228         # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
 229         @resultlist = grepFile(
 230             '^\s*#\s*define\s+\w+\s+LanguageType\(' . $buf . '\)',
 231             "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
 232     }
 233     for $result (@resultlist)
 234     {
 235         # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
 236         if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/)
 237         {
 238             push( @greplist, '\b' . $1 . '\b');
 239             $modifier = "";     # complete identifier now case sensitive
 240             if ($single) {
 241                 last; }
 242         }
 243     }
 244     # If the string given is of the form xx-yy lookup a language,country pair
 245     # to obtain the define identifier. xx and yy may themselves be regexps.
 246     # xx- is a short form for 'xx-.*' and -yy a short form for '.*-yy'
 247     # Note that -Latn for '.*-Latn' also works, accidentally.
 248     if ($grepdef =~ /^(.*)-$/) {
 249         $grepdef = $1 . "-.*"; }
 250     if ($grepdef =~ /^-(.*)$/) {
 251         $grepdef = ".*-" . $1; }
 252     if ($grepdef =~ /^([^-]{2,3})-([^-]{2,2})$/)    # catches also .*-.*
 253     {
 254         my $lang = $1;
 255         my $coun = $2;
 256         $lang = lc($lang);
 257         $coun = uc($coun);
 258         #     { LANGUAGE_AFRIKAANS,                   "af", "ZA", 0     },
 259         @resultlist = grepFile(
 260             '^\s*\{\s*\w+\s*,\s*"' . $lang . '"\s*,\s*"'  . $coun . '"\s*,\s*\w+\s*\}\s*,',
 261             "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
 262         for $result (@resultlist)
 263         {
 264             if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
 265             {
 266                 push( @greplist, '\b' . $1 . '\b');
 267                 $modifier = "";     # complete identifier now case sensitive
 268                 if ($single) {
 269                     last; }
 270             }
 271         }
 272         $grepdef = 0;
 273     }
 274     # Same for lll-Ssss or lll-Ssss-CC language tag.
 275     if ($grepdef =~ /^([^-]{2,3})-([^-]{4,4})$/ || $grepdef =~ /^([^-]{2,3})-([^-]{4,4})-([^-]{2,2})$/)
 276     {
 277         my $lang = $1;
 278         my $scri = $2;
 279         my $coun = $3;
 280         if (!defined($coun)) {
 281             $coun = ""; }
 282         $lang = lc($lang);
 283         $scri = ucfirst(lc($scri));
 284         $coun = uc($coun);
 285         #     { LANGUAGE_SERBIAN_LATIN_SERBIA,                "sr-Latn", "RS", 0     },
 286         @resultlist = grepFile(
 287             '^\s*\{\s*\w+\s*,\s*"' . $lang . '-' . $scri . '"\s*,\s*"'  . $coun . '"\s*,\s*\w+\s*\}\s*,',
 288             "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
 289         for $result (@resultlist)
 290         {
 291             if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
 292             {
 293                 push( @greplist, '\b' . $1 . '\b');
 294                 $modifier = "";     # complete identifier now case sensitive
 295                 if ($single) {
 296                     last; }
 297             }
 298         }
 299         $grepdef = 0;
 300     }
 301     # And for any other language tag that MUST match case.
 302     if ($grepdef =~ /^[^-]+-/)
 303     {
 304         #     { LANGUAGE_CATALAN_VALENCIAN,       "ca-ES-valencia", "ES", "ca-valencia" },
 305         @resultlist = grepFile(
 306             '^\s*\{\s*\w+\s*,\s*"' . $grepdef . '"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,',
 307             "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
 308         for $result (@resultlist)
 309         {
 310             if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/)
 311             {
 312                 push( @greplist, '\b' . $1 . '\b');
 313                 $modifier = "";     # complete identifier now case sensitive
 314                 if ($single) {
 315                     last; }
 316             }
 317         }
 318         $grepdef = 0;
 319     }
 320     if (!@greplist && $grepdef) {
 321         push( @greplist, $grepdef); }
 322     for $grepdef (@greplist)
 323     {
 324         print "\nUsing: " . $grepdef . "\n";
 325
 326         # Decimal LCID, was needed for Langpack.ulf but isn't used anymore,
 327         # keep just in case we'd need it again.
 328         # #define LANGUAGE_AFRIKAANS                  0x0436
 329         @resultlist = grepFile(
 330             $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
 331             "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
 332         my @lcidlist;
 333         for $result (@resultlist)
 334         {
 335             # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
 336             if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/)
 337             {
 338                 push( @lcidlist, oct( $2));
 339             }
 340         }
 341
 342         my @allresultslist;
 343         #     { LANGUAGE_AFRIKAANS,                   "af", "ZA", 0     },
 344         @resultlist = grepFile(
 345             $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,',
 346             "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
 347         push( @allresultslist, @resultlist);
 348         #     { LANGUAGE_SERBIAN_LATIN_SERBIA,                "sr-Latn", "RS", 0     },
 349         @resultlist = grepFile(
 350             $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*}\s*,',
 351             "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
 352         push( @allresultslist, @resultlist);
 353         #     { LANGUAGE_CATALAN_VALENCIAN,       "ca-ES-valencia", "ES", "ca-valencia" },
 354         @resultlist = grepFile(
 355             $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,',
 356             "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
 357         push( @allresultslist, @resultlist);
 358
 359         my @langtaggreplist;
 360         for $result (@allresultslist)
 361         {
 362             my $loca;
 363             #     { LANGUAGE_AFRIKAANS,                   "af", "ZA", 0     },
 364             #     { LANGUAGE_SERBIAN_LATIN_SERBIA,                "sr-Latn", "RS", 0     },
 365             if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/ ||
 366                 $result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
 367             {
 368                 my $lang = $2;
 369                 my $coun = $3;
 370                 if ($coun)
 371                 {
 372                     $loca = $lang . "_" . $coun;
 373                     push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?');
 374                 }
 375                 else
 376                 {
 377                     $loca = $lang;
 378                     push( @langtaggreplist, '\b' . $lang . '\b');
 379                 }
 380             }
 381             #     { LANGUAGE_CATALAN_VALENCIAN,       "ca-ES-valencia", "ES", "ca-valencia" },
 382             if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/)
 383             {
 384                 $loca = $2;
 385                 my $lang = $4;
 386                 my $coun = $3;
 387                 if ($lang)
 388                 {
 389                     if ($coun)
 390                     {
 391                         push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?');
 392                     }
 393                     else
 394                     {
 395                         push( @langtaggreplist, '\b' . $lang . '\b');
 396                     }
 397                 }
 398             }
 399             if ($loca)
 400             {
 401                 $loca =~ s/-/_/g;
 402                 my $file = "$SRC_ROOT/i18npool/source/localedata/data/$loca.xml";
 403                 my $found = open( LD, $file);
 404                 if ($found)
 405                 {
 406                     print "Found $file:\n";
 407                     my $on = 0;
 408                     while (my $line = <LD>)
 409                     {
 410                         if ($line =~ /<(Language|Country|Variant)>/) {
 411                             $on = 1; }
 412                         if ($on) {
 413                             print $line; }
 414                         if ($line =~ /<\/(Language|Country|Variant)>/) {
 415                             $on = 0; }
 416                     }
 417                     close( LD);
 418                 }
 419                 else {
 420                     print "No $file\n"; }
 421             }
 422         }
 423
 424         # Find any special treatment, may need inspection then.
 425         # $grepdef already has \b word delimiters.
 426         grepFile(
 427             $modifier . $grepdef,
 428             "$SRC_ROOT", "i18nlangtag", "source/isolang/mslangid.cxx", 1, ());
 429
 430         my $module = "svtools";
 431         my $name = "inc/langtab.hrc";
 432         #    { NC_("STR_ARR_SVT_LANGUAGE_TABLE", "Afrikaans (South Africa)") , LANGUAGE_AFRIKAANS },
 433         # lookup define
 434         @resultlist = grepFile(
 435             $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*"\s*\)\s*,.*' . $grepdef . '.*\}',
 436             "$SRC_ROOT", $module, $name, 1, ());
 437         # lookup string
 438         if (!@resultlist) {
 439             grepFile(
 440                 $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*' . $grepdef . '.*"\s*\)\s*,.*\}',
 441                 "$SRC_ROOT", $module, $name, 1, ()); }
 442
 443         for my $langtag (@langtaggreplist)
 444         {
 445             # Name (xxx) = "/registry/spool/org/openoffice/Office/Common-ctl.xcu";
 446             grepFile(
 447                 '^\s*Name\s*\(' . $langtag . '\)\s*=',
 448                 "$SRC_ROOT", "scp2", "source/ooo/file_ooo.scp", 1, ());
 449
 450             # completelangiso=af ar as-IN ... zu
 451             grepFile(
 452                 '^\s*completelangiso\s*=\s*(\s*([a-z]{2,3})(-[A-Z][A-Z])?)*' . $langtag . '',
 453                 "$SRC_ROOT", "solenv", "inc/langlist.mk", 1,
 454                 # Also grep the list of tags, one per line, \ backslash continued.
 455                 ('^\s*completelangiso\s*=', '^\s*$', '^\s*' . $langtag . '\s*\\\\*$'));
 456
 457             # af    1252  1078   # Afrikaans
 458             grepFile(
 459                 '^\s*' . $langtag . '',
 460                 "$SRC_ROOT", "l10ntools", "source/ulfconv/msi-encodinglist.txt", 1, ());
 461
 462             # 27:af:afrikaans
 463             grepFile(
 464                 '^\d*:' . $langtag . '',
 465                 "$SRC_ROOT", "bin", "lo-xlate-lang", 1, ());
 466         }
 467     }
 468     return 0;
 469 }
 470
 471 main();