bump product version to 4.2.0.1
[LibreOffice.git] / i18nlangtag / source / isolang / langid.pl
blobc9d248d5041a9d6e2e38ced45299f96deb0a2a40
1 : # -*- perl -*- vim: ft=perl
2 eval 'exec perl -w -S $0 ${1+"$@"}'
3 if 0;
5 # This file is part of the LibreOffice project.
7 # This Source Code Form is subject to the terms of the Mozilla Public
8 # License, v. 2.0. If a copy of the MPL was not distributed with this
9 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 # This file incorporates work covered by the following license notice:
13 # Licensed to the Apache Software Foundation (ASF) under one or more
14 # contributor license agreements. See the NOTICE file distributed
15 # with this work for additional information regarding copyright
16 # ownership. The ASF licenses this file to you under the Apache
17 # License, Version 2.0 (the "License"); you may not use this file
18 # except in compliance with the License. You may obtain a copy of
19 # the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 # See Usage() below or invoke without arguments for short instructions.
23 # For long instructions use the source, Luke ;-)
25 use strict;
# Print the short usage instructions of this script to STDERR.
# Takes no arguments; the script name shown is taken from $0.
sub Usage()
{
    # One list element per output chunk; they are emitted back to back.
    my @help = (
        "\n",
        "langid - a hackish utility to lookup lang.h language defines and LangIDs,\n",
        "isolang.cxx ISO639/ISO3166 mapping, locale data files, langtab.src language\n",
        "listbox entries, langlist.mk, file_ooo.scp registry name, languages.pm and\n",
        "msi-encodinglist.txt\n\n",

        "Usage: $0 [--single] {language string} | {LangID} | {primarylanguage sublanguage} | {language-country}\n\n",

        "A language string will be used as a generic string match in all searched files.\n",
        "You may enclose the language string in word delimiters,\n",
        "e.g. \\blanguage_german\\b for a specific match.\n",
        "If the language string expression matches more than one define,\n",
        "e.g. as in 'german', all matching defines will be processed.\n",
        "If the language string does not match a define or an identifier in\n",
        "langtab.src, a generic string match of the listbox entries will be tried.\n\n",

        "Numeric values of LangID,primarylanguage,sublanguage can be given\n",
        "decimal, hexadecimal (leading 0x), octal (leading 0) or binary (leading 0b).\n",
        "The exact language_define of an exact match will be used in remaining lookups.\n\n",

        "A language-country pair will lookup a xx-YY mapping from isolang.cxx,\n",
        "for example: 'en-US' or 'de-' or '-CH',\n",
        "xx and YY can be given case insensitive, will be lowered-uppered internally,\n",
        "and xx and YY themselves may be regular expressions.\n",
        "Also here a list of matches will be processed.\n\n",

        "If option --single is given, only the first match will be processed.\n\n",
    );
    print STDERR @help;
}
# Root of the source tree, taken from the environment.  All lookups in
# main() are done relative to it, so give up right away if it is missing.
my $SRC_ROOT = $ENV{"SRC_ROOT"};
unless (defined $SRC_ROOT)
{
    print "\nNeed \$SRC_ROOT, please set your LibreOffice environment!\n";
    Usage();
    exit 1;
}

# Bit mask applied to a LangID by getPrimaryLanguage().
my $LANGUAGE_MASK_PRIMARY = 0x03ff;
# Return the primary language part of a LangID, i.e. the bits selected by
# $LANGUAGE_MASK_PRIMARY.
sub getPrimaryLanguage($)
{
    my $langid = shift;
    my $primary = $langid & $LANGUAGE_MASK_PRIMARY;
    return $primary;
}
# Return the sublanguage part of a LangID, i.e. everything above the
# lowest 10 bits.
sub getSubLanguage($)
{
    my $langid = shift;
    my $sublang = $langid >> 10;
    return $sublang;
}
# Combine a sublanguage and a primary language value into a LangID:
# the sublanguage is shifted left by 10 bits and OR-ed with the primary.
# Note the argument order: sublanguage first, primary language second.
sub makeLangID($$)
{
    my( $sublang, $primary) = @_;
    my $upper = $sublang << 10;
    return $upper | $primary;
}
# Search a file line by line, print every matching line and return them.
#
# Parameters:
#   $regex     pattern; each line matching it is reported.
#   $path      root directory that is tried if the file does not exist
#              below the current working directory.
#   $module    module directory, relative to "." respectively $path.
#   $name      file name relative to the module directory.
#   $printmsg  if true, print "Not found in <file>" when nothing matched.
#   @addregex  optional additional patterns.  By convention the first
#              element opens a region, the second element closes it, and
#              all further elements are matched against the lines inside
#              an open region.  Only lines that did not match $regex are
#              looked at here.
#
# Output goes to STDOUT: the file name once before the first hit, then each
# hit; "No <file>" if the file could not be opened at either location.
#
# Returns the list of reported lines with the trailing newline removed.  A
# line that matches several of the additional patterns is reported once per
# matching pattern.
sub grepFile($$$$$@)
{
    my( $regex, $path, $module, $name, $printmsg, @addregex) = @_;
    my @result;
    my $found = 0;
    my $areopen = 0;
    my $arecloser = '';
    # Lexical handle and three-argument open: the file name is taken
    # literally (no mode characters, pipes or whitespace stripping) and no
    # global filehandle is shared with other code.
    my $in;
    # Try module under current working directory first to catch local
    # modifications.
    my $file = "./$module/$name";
    if (!($found = open( $in, '<', $file)))
    {
        # Then with the given path.
        $file = "$path/$module/$name";
        if (!($found = open( $in, '<', $file)))
        {
            print "No $file\n";
        }
    }
    if ($found)
    {
        # From here on $found means "at least one line was reported".
        $found = 0;
        while (my $line = <$in>)
        {
            if ($line =~ /$regex/)
            {
                if (!$found)
                {
                    $found = 1;
                    print "$file:\n";
                }
                chomp( $line);
                print "$line\n";
                push( @result, $line);
            }
            elsif (@addregex)
            {
                # By convention first element is opener, second element is closer.
                if (!$areopen)
                {
                    if ($line =~ /$addregex[0]/)
                    {
                        $areopen = 1;
                        $arecloser = $addregex[1];
                    }
                }
                if ($areopen)
                {
                    for (my $i = 2; $i < @addregex; ++$i)
                    {
                        if ($line =~ /$addregex[$i]/)
                        {
                            if (!$found)
                            {
                                $found = 1;
                                print "$file:\n";
                            }
                            chomp( $line);
                            print "$line\n";
                            push( @result, $line);
                        }
                    }
                    # The closer is also tested against the line that
                    # opened the region.
                    if ($line =~ /$arecloser/)
                    {
                        $areopen = 0;
                    }
                }
            }
        }
        close( $in);
    }
    if (!$found && $printmsg) {
        print "Not found in $file\n";
        #print "Not found in $file for $regex @addregex\n";
    }
    return @result;
}
# Script driver: interpret @ARGV and print everything that can be found
# about the requested language(s).
#
# Accepted arguments after the optional leading "--" options:
#   - one value: a LangID (decimal, or with a leading 0 / 0x / 0b), or a
#     language string / language tag used as a regular expression;
#   - two values: primary language and sublanguage, combined to a LangID.
# Option --single restricts processing to the first matching define.
#
# For every resulting search expression the following files are searched
# via grepFile(): lang.h, isolang.cxx, mslangid.cxx, langtab.src,
# file_ooo.scp, langlist.mk and msi-encodinglist.txt; additionally the
# matching locale data XML file is opened and its Language/Country/Variant
# elements are printed.
#
# Returns 1 after printing the usage text for an unsupported number of
# arguments, 0 otherwise.
sub main()
{
    my( $lcid, @parts, $grepdef, $options, $single);
    $grepdef = 0;
    $single = 0;
    # Leading "--" arguments are options; $options ends up as the index of
    # the first non-option argument.
    for ($options = 0; $options < @ARGV && $ARGV[$options] =~ /^--/; ++$options)
    {
        if ($ARGV[$options] eq '--single') { $single = 1; }
        else { print "Unknown option: $ARGV[$options]\n"; }
    }
    if (@ARGV == 1 + $options)
    {
        # 0x hex, 0b bin, 0 oct
        # The value is the first argument after the options, not $ARGV[0].
        if ($ARGV[$options] =~ /^0/) {
            $lcid = oct( $ARGV[$options]); }
        elsif ($ARGV[$options] =~ /^[0-9]/) {
            $lcid = $ARGV[$options]; }
        else
        {
            $grepdef = $ARGV[$options];
            $lcid = 0;
        }
        $parts[0] = getPrimaryLanguage( $lcid);
        $parts[1] = getSubLanguage( $lcid);
    }
    elsif (@ARGV == 2 + $options)
    {
        for (my $i = $options; $i < 2 + $options; ++$i)
        {
            # Store relative to the first non-option argument so that
            # $parts[0] is the primary language and $parts[1] the
            # sublanguage regardless of how many options precede them.
            if ($ARGV[$i] =~ /^0/) {
                $parts[$i - $options] = oct( $ARGV[$i]); }
            else {
                $parts[$i - $options] = $ARGV[$i]; }
        }
        $lcid = makeLangID( $parts[1], $parts[0]);
    }
    else
    {
        Usage();
        return 1;
    }
    my $modifier = "(?i)";
    my (@resultlist, @greplist, $result);
    # If no string was given on the command line, but value(s) were, lookup the
    # LangID value to obtain the define identifier.
    if ($grepdef)
    {
        # #define LANGUAGE_AFRIKAANS 0x0436
        @resultlist = grepFile(
            $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
            "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
    }
    else
    {
        printf( "LangID: 0x%04X (dec %d), primary: 0x%03x, sub 0x%02x\n", $lcid,
            $lcid, $parts[0], $parts[1]);
        my $buf = sprintf( "0x%04X", $lcid);
        # #define LANGUAGE_AFRIKAANS 0x0436
        @resultlist = grepFile(
            '^\s*#\s*define\s+\w+\s+' . $buf,
            "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
    }
    for $result (@resultlist)
    {
        # #define LANGUAGE_AFRIKAANS 0x0436
        if ($result =~ /^\s*#\s*define\s+(\w+)\s+(0x[0-9a-fA-F]+)/)
        {
            push( @greplist, '\b' . $1 . '\b');
            $modifier = ""; # complete identifier now case sensitive
            if ($single) {
                last; }
        }
    }
    # If the string given is of the form xx-yy lookup a language,country pair
    # to obtain the define identifier. xx and yy themselves may be regexps.
    # xx- is a short form for 'xx-.*' and -yy a short form for '.*-yy'
    # Note that -Latn for '.*-Latn' also works, accidentally.
    if ($grepdef =~ /^(.*)-$/) {
        $grepdef = $1 . "-.*"; }
    if ($grepdef =~ /^-(.*)$/) {
        $grepdef = ".*-" . $1; }
    if ($grepdef =~ /^([^-]{2,3})-([^-]{2,2})$/) # catches also .*-.*
    {
        my $lang = $1;
        my $coun = $2;
        $lang = lc($lang);
        $coun = uc($coun);
        # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 },
        @resultlist = grepFile(
            '^\s*\{\s*\w+\s*,\s*"' . $lang . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
        for $result (@resultlist)
        {
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
            {
                push( @greplist, '\b' . $1 . '\b');
                $modifier = ""; # complete identifier now case sensitive
                if ($single) {
                    last; }
            }
        }
        # Handled as a tag; keep the later tag checks and the generic
        # fallback from seeing it again.
        $grepdef = 0;
    }
    # Same for lll-Ssss or lll-Ssss-CC language tag.
    if ($grepdef =~ /^([^-]{2,3})-([^-]{4,4})$/ || $grepdef =~ /^([^-]{2,3})-([^-]{4,4})-([^-]{2,2})$/)
    {
        my $lang = $1;
        my $scri = $2;
        my $coun = $3;   # undefined if the tag has no country part
        if (!defined($coun)) {
            $coun = ""; }
        $lang = lc($lang);
        $scri = ucfirst(lc($scri));
        $coun = uc($coun);
        # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 },
        @resultlist = grepFile(
            '^\s*\{\s*\w+\s*,\s*"' . $lang . '-' . $scri . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
        for $result (@resultlist)
        {
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
            {
                push( @greplist, '\b' . $1 . '\b');
                $modifier = ""; # complete identifier now case sensitive
                if ($single) {
                    last; }
            }
        }
        $grepdef = 0;
    }
    # And for any other language tag that MUST match case.
    if ($grepdef =~ /^[^-]+-/)
    {
        # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" },
        @resultlist = grepFile(
            '^\s*\{\s*\w+\s*,\s*"' . $grepdef . '"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
        for $result (@resultlist)
        {
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/)
            {
                push( @greplist, '\b' . $1 . '\b');
                $modifier = ""; # complete identifier now case sensitive
                if ($single) {
                    last; }
            }
        }
        $grepdef = 0;
    }
    # No define identifier was found: fall back to the plain string given.
    if (!@greplist && $grepdef) {
        push( @greplist, $grepdef); }
    for $grepdef (@greplist)
    {
        print "\nUsing: " . $grepdef . "\n";

        # Decimal LCID, was needed for Langpack.ulf but isn't used anymore,
        # keep just in case we'd need it again.
        # #define LANGUAGE_AFRIKAANS 0x0436
        @resultlist = grepFile(
            $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
            "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
        my @lcidlist;
        for $result (@resultlist)
        {
            # #define LANGUAGE_AFRIKAANS 0x0436
            if ($result =~ /^\s*#\s*define\s+(\w+)\s+(0x[0-9a-fA-F]+)/)
            {
                push( @lcidlist, oct( $2));
            }
        }

        my @allresultslist;
        # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 },
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
        push( @allresultslist, @resultlist);
        # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 },
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
        push( @allresultslist, @resultlist);
        # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" },
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
        push( @allresultslist, @resultlist);

        # Expressions built from the isolang.cxx entries, used further down
        # for the lookups in file_ooo.scp, langlist.mk and
        # msi-encodinglist.txt.
        my @langtaggreplist;
        for $result (@allresultslist)
        {
            my $loca;
            # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 },
            # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 },
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/ ||
                $result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
            {
                my $lang = $2;
                my $coun = $3;
                if ($coun)
                {
                    $loca = $lang . "_" . $coun;
                    push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?');
                }
                else
                {
                    $loca = $lang;
                    push( @langtaggreplist, '\b' . $lang . '\b');
                }
            }
            # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" },
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/)
            {
                $loca = $2;
                my $lang = $4;
                my $coun = $3;
                if ($lang)
                {
                    if ($coun)
                    {
                        push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?');
                    }
                    else
                    {
                        push( @langtaggreplist, '\b' . $lang . '\b');
                    }
                }
            }
            if ($loca)
            {
                # Locale data file names use '_' where the tag uses '-'.
                $loca =~ s/-/_/g;
                my $file = "$SRC_ROOT/i18npool/source/localedata/data/$loca.xml";
                my $found = open( my $ld, '<', $file);
                if ($found)
                {
                    print "Found $file:\n";
                    # Print from a line with an opening Language, Country or
                    # Variant tag through the line with the closing tag.
                    my $on = 0;
                    while (my $line = <$ld>)
                    {
                        if ($line =~ /<(Language|Country|Variant)>/) {
                            $on = 1; }
                        if ($on) {
                            print $line; }
                        if ($line =~ /<\/(Language|Country|Variant)>/) {
                            $on = 0; }
                    }
                    close( $ld);
                }
                else {
                    print "No $file\n"; }
            }
        }

        # case LANGUAGE_ARABIC_SAUDI_ARABIA & LANGUAGE_MASK_PRIMARY :
        grepFile(
            $modifier . '^\s*case\s*.*' . $grepdef . '.*(\s*&\s*\w+)?\s*:',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/mslangid.cxx", 1, ());

        my $module = "svtools";
        my $name = "source/misc/langtab.src";
        # < "Afrikaans" ; LANGUAGE_AFRIKAANS ; > ;
        # lookup define
        @resultlist = grepFile(
            $modifier . '^\s*<\s*".*"\s*;\s*.*' . $grepdef . '.*\s*;\s*>\s*;',
            "$SRC_ROOT", $module, $name, 1, ());
        # lookup string
        if (!@resultlist) {
            grepFile(
                $modifier . '^\s*<\s*".*' . $grepdef . '.*"\s*;\s*.*\s*;\s*>\s*;',
                "$SRC_ROOT", $module, $name, 1, ()); }

        for my $langtag (@langtaggreplist)
        {
            # Name (xxx) = "/registry/spool/org/openoffice/Office/Common-ctl.xcu";
            grepFile(
                '^\s*Name\s*\(' . $langtag . '\)\s*=',
                "$SRC_ROOT", "scp2", "source/ooo/file_ooo.scp", 1, ());

            # completelangiso=af ar as-IN ... zu
            grepFile(
                '^\s*completelangiso\s*=\s*(\s*([a-z]{2,3})(-[A-Z][A-Z])?)*' . $langtag . '',
                "$SRC_ROOT", "solenv", "inc/langlist.mk", 1,
                # needs a duplicated pair of backslashes to produce a literal \\
                ('^\s*completelangiso\s*=', '^\s*$', '^\s*' . $langtag . '\s*\\\\*$'));

            # af 1252 1078 # Afrikaans
            grepFile(
                '^\s*' . $langtag . '',
                "$SRC_ROOT", "l10ntools", "source/ulfconv/msi-encodinglist.txt", 1, ());
        }
    }
    return 0;
}
# main() returns 1 after printing the usage text and 0 otherwise; pass that
# on as the exit status instead of always exiting with 0.
exit main();