Version 4.0.0.1, tag libreoffice-4.0.0.1
[LibreOffice.git] / i18npool / source / isolang / lcid.awk
blob371cd64106d6a3edf85385a213f71be789ba95bf
1 #!/usr/bin/awk -f
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # This file incorporates work covered by the following license notice:
11 # Licensed to the Apache Software Foundation (ASF) under one or more
12 # contributor license agreements. See the NOTICE file distributed
13 # with this work for additional information regarding copyright
14 # ownership. The ASF licenses this file to you under the Apache
15 # License, Version 2.0 (the "License"); you may not use this file
16 # except in compliance with the License. You may obtain a copy of
17 # the License at http://www.apache.org/licenses/LICENSE-2.0 .
19 # Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
20 # Run in i18npool/source/isolang
22 # outputs new #define LANGUAGE_... 0x... and also some commented out substrings
23 # that were matched in already existing defines.
25 # ATTENTION! The sed filter in the command line examples below ensures that a
26 # '|' border is drawn by html2text in data tables, and nowhere else, on which
27 # this awk script relies. This script also heavily relies on the column layout
28 # encountered. Should MS decide to change their layout or their CSS names
29 # ("data..."), this would probably break. Should html2text decide that the last
30 # border="..." attribute encountered wins instead of the first, this may break
31 # also.
33 # sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
35 # After html2text it is best to clean up the file so that it contains _only_
36 # the table entries, but this is not necessary because entries are filtered. Check the output.
38 # Expects input from the saved page of one of
40 # (1)
41 # http://www.microsoft.com/globaldev/reference/lcid-all.mspx
42 # filtered through ``html2text -nobs ...'', generated table:
43 # blank,name,hex,dec,blank fields:
44 # |Afrikaans_-_South_Africa___|0436___|1078___|
46 # complete command line:
47 # lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
50 # (2)
51 # http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
52 # filtered through ``html2text -nobs ...'', generated table:
53 # blank,name,hex,dec,inputlocales,collection,blank fields:
54 # |Afrikaans |0436 |1078 |0436:00000409, |Basic |
56 # complete command line:
57 # lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
60 # (3)
61 # http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
62 # filtered through ``html2text -nobs ...'', generated table:
63 # blank,hex,locale,name,blank fields:
64 # |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
66 # complete command line:
67 # lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
69 # Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
# Setup: load the LANGUAGE_* definitions that already exist in lang.h into
# lang[HEX] = NAME, then configure field splitting and the per-layout column
# tables that the record rules and the END rule read.
BEGIN {
    langheader = "../../inc/i18npool/lang.h"
    # getline returns 1 per line read, 0 at end of file and -1 on error;
    # keep the status so an unreadable header is not mistaken for an empty one.
    while ((status = (getline < langheader)) > 0)
    {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, HEX being the value without its leading "0x"
            lang[toupper(substr($3,3))] = toupper($2)
        }
    }
    if (status < 0)
    {
        # Without the existing definitions every input entry would be
        # reported as new, so stop instead of producing misleading output.
        # Note that awk still runs the END rule after an exit in BEGIN.
        print "Cannot read " langheader " (run in i18npool/source/isolang)." >>"/dev/stderr"
        exit(1)
    }
    close(langheader)

    # html2text table follows; a single-character FS other than a space is
    # taken literally, so no escaping of the pipe is needed.
    FS = "|"

    # Input layout identifiers; 0 means "not detected yet".
    filetype = 0
    lcid_all = 1
    xp_lcid = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid] = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field number of the language name for each layout.
    namefield[lcid_all] = 2
    namefield[xp_lcid] = 2
    namefield[nls_238z] = 4

    # Field number of the hexadecimal LCID for each layout.
    hexfield[lcid_all] = 3
    hexfield[xp_lcid] = 3
    hexfield[nls_238z] = 2

    # Field number of the locale string for each layout, 0 if there is none.
    locfield[lcid_all] = 0
    locfield[xp_lcid] = 0
    locfield[nls_238z] = 3
}
# Records with fewer than 5 '|'-separated fields cannot be table rows.
(NF < 5) { next }

# Layout detection, active until a layout has been recognized: a 5-field row
# whose first cell starts with "0x" is nls_238z, a 5-field row whose first
# cell starts with "Afrikaans" is lcid_all, and a 7-field row is xp_lcid.
# Rows seen before a layout is recognized are skipped.
!filetype {
    if (NF == 5)
    {
        if ($2 ~ /^0x/)
            filetype = nls_238z
        else if ($2 ~ /^Afrikaans/)
            filetype = lcid_all
    }
    else if (NF == 7)
        filetype = xp_lcid
    if (!filetype)
        next
    # Column numbers of the name, hex and locale cells for this layout.
    name = namefield[filetype]
    hex = hexfield[filetype]
    loc = locfield[filetype]
}

# Normalize the cells of interest.
{
    # Drop everything up to and including the first colon of the name ...
    gsub( /^[^:]*:/, "", $name)
    # ... and everything from the first period onwards.
    gsub( /\..*/, "", $name)
    # Strip the leading and trailing blanks or underscores used as padding.
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# Remove a leading "0x" so the value compares with the keys of lang[].
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# Anything that is not purely hexadecimal is not a data row; remember it for
# the report and skip it. This has to happen before zero padding, otherwise
# an empty cell would be padded into a seemingly valid value.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

# if only 464 (or 64, or 4) instead of 0464, pad to four digits to match lang.h
(length($hex) < 4) { $hex = substr( "0000", 1, 4 - length($hex)) $hex }

# all[HEX]=string
{ all[toupper($hex)] = $name }

# Keep the locale string, where the layout has one, as a C comment.
(loc) { comment[toupper($hex)] = " /* " $loc " */" }

# new hex: newlang[HEX]=string
!(toupper($hex) in lang) { newlang[toupper($hex)] = $name }
# Report: for every LCID that is not yet known, print the raw table entry, the
# existing definitions that contain one of the words of its name, and a
# proposed #define. Afterwards list known definitions that are missing from
# the input, and the table rows that were filtered out.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"

    # every new language
    for (code in newlang)
    {
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[code], code, comment[code])

        # Build the identifier from the alphanumeric words of the name,
        # upper-cased and joined by underscores.
        nwords = split(newlang[code], words, /[^A-Za-z0-9]/)
        ident = ""
        for (w = 1; w <= nwords; w++)
        {
            # Adjacent separators produce empty words; ignore them.
            if (!length(words[w]))
                continue

            upword = toupper(words[w])
            if (ident != "")
                ident = ident "_"
            ident = ident upword

            # Show every existing definition whose name contains this word.
            # The word is purely alphanumeric, so a plain substring search
            # finds exactly what a regular expression match would.
            for (known in lang)
            {
                if (index(lang[known], upword))
                    printf( "// %-50s %s\n", words[w] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (code in lang)
    {
        if (code in all)
            continue
        print "// not in input file: " code " " lang[code]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (key in filtered)
        print "// filtered: " key " " filtered[key]
}