Bump version to 24.04.3.4
[LibreOffice.git] / i18nlangtag / source / isolang / langid.pl
blob061c69288e073679cce4395c17bb1e352f0ed30e
1 : # -*- perl -*- vim: ft=perl
2 eval 'exec perl -w -S $0 ${1+"$@"}'
3 if 0;
5 # This file is part of the LibreOffice project.
7 # This Source Code Form is subject to the terms of the Mozilla Public
8 # License, v. 2.0. If a copy of the MPL was not distributed with this
9 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 # This file incorporates work covered by the following license notice:
13 # Licensed to the Apache Software Foundation (ASF) under one or more
14 # contributor license agreements. See the NOTICE file distributed
15 # with this work for additional information regarding copyright
16 # ownership. The ASF licenses this file to you under the Apache
17 # License, Version 2.0 (the "License"); you may not use this file
18 # except in compliance with the License. You may obtain a copy of
19 # the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 # See Usage() below or invoke without arguments for short instructions.
23 # For long instructions use the source, Luke ;-)
25 use strict;
# Print the short usage instructions for this utility to STDERR.
# Takes no arguments; the text names the script via $0.
sub Usage()
{
    # Collect the help text first, then emit it with a single print.
    my @help = (
        "\n",
        "langid - a hackish utility to lookup lang.h language defines and LangIDs,\n",
        "isolang.cxx ISO639/ISO3166 mapping, locale data files, langtab.hrc language\n",
        "listbox entries, langlist.mk, file_ooo.scp registry name, languages.pm and\n",
        "msi-encodinglist.txt\n\n",

        "Usage: $0 [--single] {language string} | {LangID} | {primarylanguage sublanguage} | {language-country}\n\n",

        "A language string will be used as a generic string match in all searched files.\n",
        "You may enclose the language string in word delimiters,\n",
        "e.g. \\blanguage_german\\b for a specific match.\n",
        "If the language string expression matches more than one define,\n",
        "e.g. as in 'german', all matching defines will be processed.\n",
        "If the language string does not match a define or an identifier in\n",
        "langtab.hrc, a generic string match of the listbox entries will be tried.\n\n",

        "Numeric values of LangID,primarylanguage,sublanguage can be given\n",
        "decimal, hexadecimal (leading 0x), octal (leading 0) or binary (leading 0b).\n",
        "The exact language_define of an exact match will be used in remaining lookups.\n\n",

        "A language-country pair will lookup a xx-YY mapping from isolang.cxx,\n",
        "for example: 'en-US' or 'de-' or '-CH',\n",
        "xx and YY can be given case insensitive, will be lowered-uppered internally,\n",
        "and xx and YY themselves may be regular expressions.\n",
        "Also here a list of matches will be processed.\n\n",

        "If option --single is given, only the first match will be processed.\n\n",
    );
    print STDERR @help;
}
# Root of the source tree; every file lookup below is made relative to it.
my $SRC_ROOT = $ENV{"SRC_ROOT"};
unless (defined $SRC_ROOT)
{
    # Without the environment variable nothing can be looked up: explain,
    # show the usage text and give up with a failure status.
    print "\nNeed \$SRC_ROOT, please set your LibreOffice environment!\n";
    Usage();
    exit 1;
}
# Bit mask that selects the primary language bits (the low 10 bits) of a LangID.
my $LANGUAGE_MASK_PRIMARY = 0x03ff;

# Return the primary language part of a LangID.
#   $lcid - numeric LangID
# The result is $lcid masked with $LANGUAGE_MASK_PRIMARY.
sub getPrimaryLanguage($)
{
    my $langid = shift;
    my $primary = $langid & $LANGUAGE_MASK_PRIMARY;
    return $primary;
}
# Return the sublanguage part of a LangID.
#   $lcid - numeric LangID
# The result is everything above the low 10 bits, shifted down.
sub getSubLanguage($)
{
    my $langid = $_[0];
    my $sublang = $langid >> 10;
    return $sublang;
}
# Compose a LangID from its two parts.
#   $sub - sublanguage value, placed above the low 10 bits
#   $pri - primary language value, placed in the low bits
# Note the argument order: sublanguage first, primary language second.
sub makeLangID($$)
{
    my $sublang = shift;
    my $primary = shift;
    # The shift always yields a number, so the OR below is numeric even if
    # $primary arrived as a string from the command line.
    my $shifted = $sublang << 10;
    return $shifted | $primary;
}
# Search a file line by line and print and collect every matching line.
#
#   $regex    - regular expression a line has to match
#   $path     - root directory used if the file is not found under the
#               current working directory
#   $module   - module directory below the root
#   $name     - file name, relative to the module directory
#   $printmsg - if true, print a "Not found in ..." message if nothing matched
#   @addregex - optional "block to grep" definition
#               (regex-to-start-block, regex-to-end-block, regex-to-find-in-block...)
#
# Returns the list of matching lines with the trailing newline removed. The
# file name is printed once before the first match, each match is printed on
# its own line.
#
# Note that a regex needs a duplicated pair of backslashes to produce a literal
# \\ like in \\\\* to search for zero or more \ backslashes.
#
# The file is opened with three-argument open() on a lexical handle, so the
# file name is never interpreted as an open mode and no global handle is
# shared between calls.
sub grepFile($$$$$@)
{
    my( $regex, $path, $module, $name, $printmsg, @addregex) = @_;
    my @result;
    my $found = 0;
    my $areopen = 0;
    my $arecloser = '';
    my $in;
    # Try module under current working directory first to catch local
    # modifications.
    my $file = "./$module/$name";
    if (!($found = open( $in, '<', $file)))
    {
        # Then with the given path.
        $file = "$path/$module/$name";
        if (!($found = open( $in, '<', $file)))
        {
            print "No $file\n";
        }
    }
    if ($found)
    {
        # From here on $found tells whether any line matched, not whether
        # the file could be opened.
        $found = 0;
        while (my $line = <$in>)
        {
            if ($line =~ /$regex/)
            {
                if (!$found)
                {
                    $found = 1;
                    print "$file:\n";
                }
                chomp( $line);
                print "$line\n";
                push( @result, $line);
            }
            elsif (@addregex)
            {
                # By convention first element is opener, second element is closer.
                if (!$areopen)
                {
                    if ($line =~ /$addregex[0]/)
                    {
                        $areopen = 1;
                        $arecloser = $addregex[1];
                    }
                }
                if ($areopen)
                {
                    # All remaining elements are expressions to find in the block.
                    for (my $i = 2; $i < @addregex; ++$i)
                    {
                        if ($line =~ /$addregex[$i]/)
                        {
                            if (!$found)
                            {
                                $found = 1;
                                print "$file:\n";
                            }
                            chomp( $line);
                            print "$line\n";
                            push( @result, $line);
                        }
                    }
                    if ($line =~ /$arecloser/)
                    {
                        $areopen = 0;
                    }
                }
            }
        }
        close( $in);
    }
    if (!$found && $printmsg) {
        print "Not found in $file\n";
        #print "Not found in $file for $regex @addregex\n";
    }
    return @result;
}
# Entry point: interpret the command line and run all lookups.
#
# Leading arguments starting with "--" are options; --single restricts
# processing to the first match, unknown options are reported and ignored.
# After the options exactly one argument (a LangID value or a language string)
# or exactly two arguments (primary language and sublanguage values) are
# expected; anything else prints the usage text.
#
# Returns 1 if the usage text was printed because of a wrong number of
# arguments, 0 otherwise.
sub main()
{
    my( $lcid, @parts, $grepdef, $options, $single);
    $grepdef = 0;
    $single = 0;
    # $options ends up as the index of the first non-option argument.
    for ($options = 0; $options < @ARGV && $ARGV[$options] =~ /^--/; ++$options)
    {
        if ($ARGV[$options] eq '--single') { $single = 1; }
        else { print "Unknown option: $ARGV[$options]\n"; }
    }
    if (@ARGV == 1 + $options)
    {
        # 0x hex, 0b bin, 0 oct
        # Convert the argument that follows the options, not $ARGV[0], which
        # is an option string if any option was given.
        if ($ARGV[$options] =~ /^0/) {
            $lcid = oct( $ARGV[$options]); }
        elsif ($ARGV[$options] =~ /^[0-9]/) {
            $lcid = $ARGV[$options]; }
        else
        {
            $grepdef = $ARGV[$options];
            $lcid = 0;
        }
        $parts[0] = getPrimaryLanguage( $lcid);
        $parts[1] = getSubLanguage( $lcid);
    }
    elsif (@ARGV == 2 + $options)
    {
        # Store primary language in $parts[0] and sublanguage in $parts[1]
        # regardless of how many options precede the two values.
        for (my $i = 0; $i < 2; ++$i)
        {
            my $arg = $ARGV[$options + $i];
            if ($arg =~ /^0/) {
                $parts[$i] = oct( $arg); }
            else {
                $parts[$i] = $arg; }
        }
        $lcid = makeLangID( $parts[1], $parts[0]);
    }
    else
    {
        Usage();
        return 1;
    }
    my $modifier = "(?i)";
    my (@resultlist, @greplist, $result);
    # If no string was given on the command line, but value(s) were, lookup the
    # LangID value to obtain the define identifier.
    if ($grepdef)
    {
        # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
        @resultlist = grepFile(
            $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
            "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
    }
    else
    {
        printf( "LangID: 0x%04X (dec %d), primary: 0x%03x, sub 0x%02x\n", $lcid,
                $lcid, $parts[0], $parts[1]);
        my $buf = sprintf( "0x%04X", $lcid);
        # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
        @resultlist = grepFile(
            '^\s*#\s*define\s+\w+\s+LanguageType\(' . $buf . '\)',
            "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
    }
    for $result (@resultlist)
    {
        # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
        if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/)
        {
            push( @greplist, '\b' . $1 . '\b');
            $modifier = "";     # complete identifier now case sensitive
            if ($single) {
                last; }
        }
    }
    # If the string given is of the form xx-yy lookup a language,country pair
    # to obtain the define identifier. xx and yy may themselves be regexps.
    # xx- is a short form for 'xx-.*' and -yy a short form for '.*-yy'
    # Note that -Latn for '.*-Latn' also works, accidentally.
    if ($grepdef =~ /^(.*)-$/) {
        $grepdef = $1 . "-.*"; }
    if ($grepdef =~ /^-(.*)$/) {
        $grepdef = ".*-" . $1; }
    if ($grepdef =~ /^([^-]{2,3})-([^-]{2,2})$/)    # catches also .*-.*
    {
        my $lang = $1;
        my $coun = $2;
        $lang = lc($lang);
        $coun = uc($coun);
        # { LANGUAGE_AFRIKAANS,                   "af", "ZA", 0     },
        @resultlist = grepFile(
            '^\s*\{\s*\w+\s*,\s*"' . $lang . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
        for $result (@resultlist)
        {
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
            {
                push( @greplist, '\b' . $1 . '\b');
                $modifier = "";     # complete identifier now case sensitive
                if ($single) {
                    last; }
            }
        }
        $grepdef = 0;
    }
    # Same for lll-Ssss or lll-Ssss-CC language tag.
    if ($grepdef =~ /^([^-]{2,3})-([^-]{4,4})$/ || $grepdef =~ /^([^-]{2,3})-([^-]{4,4})-([^-]{2,2})$/)
    {
        my $lang = $1;
        my $scri = $2;
        my $coun = $3;
        # No country was given if the two-part form matched.
        if (!defined($coun)) {
            $coun = ""; }
        $lang = lc($lang);
        $scri = ucfirst(lc($scri));
        $coun = uc($coun);
        # { LANGUAGE_SERBIAN_LATIN_SERBIA,        "sr-Latn", "RS", 0     },
        @resultlist = grepFile(
            '^\s*\{\s*\w+\s*,\s*"' . $lang . '-' . $scri . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
        for $result (@resultlist)
        {
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
            {
                push( @greplist, '\b' . $1 . '\b');
                $modifier = "";     # complete identifier now case sensitive
                if ($single) {
                    last; }
            }
        }
        $grepdef = 0;
    }
    # And for any other language tag that MUST match case.
    if ($grepdef =~ /^[^-]+-/)
    {
        # { LANGUAGE_CATALAN_VALENCIAN,       "ca-ES-valencia", "ES", "ca-valencia" },
        @resultlist = grepFile(
            '^\s*\{\s*\w+\s*,\s*"' . $grepdef . '"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ());
        for $result (@resultlist)
        {
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/)
            {
                push( @greplist, '\b' . $1 . '\b');
                $modifier = "";     # complete identifier now case sensitive
                if ($single) {
                    last; }
            }
        }
        $grepdef = 0;
    }
    # Fall back to the string as given if no define identifier was obtained.
    if (!@greplist && $grepdef) {
        push( @greplist, $grepdef); }
    for $grepdef (@greplist)
    {
        print "\nUsing: " . $grepdef . "\n";

        # Decimal LCID, was needed for Langpack.ulf but isn't used anymore,
        # keep just in case we'd need it again.
        # #define LANGUAGE_AFRIKAANS                  0x0436
        @resultlist = grepFile(
            $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef,
            "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ());
        my @lcidlist;
        for $result (@resultlist)
        {
            # #define LANGUAGE_AFRIKAANS                  LanguageType(0x0436)
            if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/)
            {
                push( @lcidlist, oct( $2));
            }
        }

        my @allresultslist;
        # { LANGUAGE_AFRIKAANS,                   "af", "ZA", 0     },
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
        push( @allresultslist, @resultlist);
        # { LANGUAGE_SERBIAN_LATIN_SERBIA,        "sr-Latn", "RS", 0     },
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
        push( @allresultslist, @resultlist);
        # { LANGUAGE_CATALAN_VALENCIAN,       "ca-ES-valencia", "ES", "ca-valencia" },
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,',
            "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ());
        push( @allresultslist, @resultlist);

        my @langtaggreplist;
        for $result (@allresultslist)
        {
            my $loca;
            # { LANGUAGE_AFRIKAANS,                   "af", "ZA", 0     },
            # { LANGUAGE_SERBIAN_LATIN_SERBIA,        "sr-Latn", "RS", 0     },
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/ ||
                $result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/)
            {
                my $lang = $2;
                my $coun = $3;
                if ($coun)
                {
                    $loca = $lang . "_" . $coun;
                    push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?');
                }
                else
                {
                    $loca = $lang;
                    push( @langtaggreplist, '\b' . $lang . '\b');
                }
            }
            # { LANGUAGE_CATALAN_VALENCIAN,       "ca-ES-valencia", "ES", "ca-valencia" },
            if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/)
            {
                $loca = $2;
                my $lang = $4;
                my $coun = $3;
                if ($lang)
                {
                    if ($coun)
                    {
                        push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?');
                    }
                    else
                    {
                        push( @langtaggreplist, '\b' . $lang . '\b');
                    }
                }
            }
            if ($loca)
            {
                # Locale data file names use '_' where the tag uses '-'.
                $loca =~ s/-/_/g;
                my $file = "$SRC_ROOT/i18npool/source/localedata/data/$loca.xml";
                if (open( my $ld, '<', $file))
                {
                    print "Found $file:\n";
                    # Print every line from an opening Language, Country or
                    # Variant tag up to and including the closing one.
                    my $on = 0;
                    while (my $line = <$ld>)
                    {
                        if ($line =~ /<(Language|Country|Variant)>/) {
                            $on = 1; }
                        if ($on) {
                            print $line; }
                        if ($line =~ /<\/(Language|Country|Variant)>/) {
                            $on = 0; }
                    }
                    close( $ld);
                }
                else {
                    print "No $file\n"; }
            }
        }

        # Find any special treatment, may need inspection then.
        # $grepdef already has \b word delimiters.
        grepFile(
            $modifier . $grepdef,
            "$SRC_ROOT", "i18nlangtag", "source/isolang/mslangid.cxx", 1, ());

        my $module = "svtools";
        my $name = "inc/langtab.hrc";
        # { NC_("STR_ARR_SVT_LANGUAGE_TABLE", "Afrikaans (South Africa)") , LANGUAGE_AFRIKAANS },
        # lookup define
        @resultlist = grepFile(
            $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*"\s*\)\s*,.*' . $grepdef . '.*\}',
            "$SRC_ROOT", $module, $name, 1, ());
        # lookup string
        if (!@resultlist) {
            grepFile(
                $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*' . $grepdef . '.*"\s*\)\s*,.*\}',
                "$SRC_ROOT", $module, $name, 1, ()); }

        for my $langtag (@langtaggreplist)
        {
            # Name (xxx) = "/registry/spool/org/openoffice/Office/Common-ctl.xcu";
            grepFile(
                '^\s*Name\s*\(' . $langtag . '\)\s*=',
                "$SRC_ROOT", "scp2", "source/ooo/file_ooo.scp", 1, ());

            # completelangiso=af ar as-IN ... zu
            grepFile(
                '^\s*completelangiso\s*=\s*(\s*([a-z]{2,3})(-[A-Z][A-Z])?)*' . $langtag . '',
                "$SRC_ROOT", "solenv", "inc/langlist.mk", 1,
                # Also grep the list of tags, one per line, \ backslash continued.
                ('^\s*completelangiso\s*=', '^\s*$', '^\s*' . $langtag . '\s*\\\\*$'));

            # af    1252  1078   # Afrikaans
            grepFile(
                '^\s*' . $langtag . '',
                "$SRC_ROOT", "l10ntools", "source/ulfconv/msi-encodinglist.txt", 1, ());

            # 27:af:afrikaans
            grepFile(
                '^\d*:' . $langtag . '',
                "$SRC_ROOT", "bin", "lo-xlate-lang", 1, ());
        }
    }
    return 0;
}
# Run the lookups and hand main()'s return value to the caller as the exit
# status, so that a usage error is reported as a failure instead of being
# silently discarded.
exit( main());