Version 3.6.0.2, tag libreoffice-3.6.0.2
[LibreOffice.git] / i18npool / source / isolang / lcid.awk
blobb8209e7a585e53973aaa9f136fe917efa6d9b80f
1 #!/usr/bin/awk -f
3 # Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
4 # Run in i18npool/source/isolang
6 # outputs new #define LANGUAGE_... 0x... and also some commented out substrings
7 # that were matched in already existing defines.
9 # ATTENTION! The sed filter in the command line examples below ensures that a
10 # '|' border is drawn by html2text in data tables, and nowhere else, on which
11 # this awk script relies. This script also heavily relies on the column layout
12 # encountered. Should MS decide to change their layout or their CSS names
13 # ("data..."), this would probably break. Should html2text decide that the last
14 # border="..." attribute encountered wins instead of the first, this may break
15 # also.
17 # sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
19 # After html2text best if file cleaned up to _only_ contain the table entries,
20 # but not necessary, entries are filtered. Check output.
22 # Expects input from the saved page of one of
24 # (1)
25 # http://www.microsoft.com/globaldev/reference/lcid-all.mspx
26 # filtered through ``html2text -nobs ...'', generated table:
27 # blank,name,hex,dec,blank fields:
28 # |Afrikaans_-_South_Africa___|0436___|1078___|
30 # complete command line:
31 # lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
34 # (2)
35 # http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
36 # filtered through ``html2text -nobs ...'', generated table:
37 # blank,name,hex,dec,inputlocales,collection,blank fields:
38 # |Afrikaans |0436 |1078 |0436:00000409, |Basic |
40 # complete command line:
41 # lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
44 # (3)
45 # http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
46 # filtered through ``html2text -nobs ...'', generated table:
47 # blank,hex,locale,name,blank fields:
48 # |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
50 # complete command line:
51 # lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
53 # Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
# Set-up phase: load the LANGUAGE_* definitions already present in lang.h and
# initialise the field separator and the per-layout column tables used by the
# main rules.
BEGIN {
    langfile = "../../inc/i18npool/lang.h"

    # FS is still the default here, so each header line is split on
    # whitespace: $2 is the macro name, $3 the 0x... value.
    while ((rc = (getline < langfile)) > 0) {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/) {
            # lang[HEX]=NAME (HEX without the leading "0x", both upper-cased)
            lang[toupper(substr($3,3))] = toupper($2)
        }
    }
    # getline yields -1 when the file cannot be read. Without a hint the
    # script would silently report every input entry as new.
    if (rc < 0)
        print "Warning: cannot read " langfile "; all input entries will be reported as new." >>"/dev/stderr"
    close(langfile)

    # html2text table follows.
    # A single-character FS other than space is taken literally, so no
    # escaping is needed (and "\|" is not a defined string escape).
    FS = "|"

    # Input layouts; 0 means "not recognised yet".
    filetype = 0
    lcid_all = 1
    xp_lcid  = 2
    nls_238z = 3

    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid]  = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field index of the language name for each layout.
    namefield[lcid_all] = 2
    namefield[xp_lcid]  = 2
    namefield[nls_238z] = 4

    # Field index of the hexadecimal LCID for each layout.
    hexfield[lcid_all] = 3
    hexfield[xp_lcid]  = 3
    hexfield[nls_238z] = 2

    # Field index of the locale string; 0 means the layout has none.
    locfield[lcid_all] = 0
    locfield[xp_lcid]  = 0
    locfield[nls_238z] = 3
}
# Lines with fewer than five '|'-separated fields are not table rows.
NF < 5 { next }

# As long as no layout has been recognised, try to identify it from the
# current row: seven fields mean xp_lcid; five fields mean nls_238z when the
# second field starts with "0x", or lcid_all when it starts with "Afrikaans".
# Rows seen before a layout is recognised are skipped.
!filetype {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all

    if (!filetype)
        next

    # Column positions for the recognised layout.
    name = namefield[filetype]
    hex  = hexfield[filetype]
    loc  = locfield[filetype]
}
# Clean up the name, hex and (if the layout has one) locale fields of the
# current table row.
{
    # Drop a leading "xxx:" prefix and everything from the first '.' onwards.
    gsub( /^[^:]*:/, "", $name)
    gsub( /\..*/, "", $name)
    # Trim the blank/underscore padding html2text puts around cell contents.
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}

# Strip a leading "0x" so the value has the same form as the lang[] keys.
($hex ~ /^0x/) { $hex = substr( $hex, 3) }

# If only 464 (or 64) instead of 0464, zero-pad to four digits to match
# lang.h. Only genuine hex values are padded, so empty or non-hex cells reach
# the filter below unchanged instead of being turned into "0...".
($hex ~ /^[0-9a-fA-F][0-9a-fA-F]*$/) {
    while (length($hex) < 4)
        $hex = "0" $hex
}

# Anything that is not a plain hex number is remembered for the report in
# END and otherwise ignored.
($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }
# Record the current row under its upper-cased hex code:
#   all[HEX]     = name             every entry seen in the input
#   comment[HEX] = " /* locale */"  only for layouts with a locale column
#   newlang[HEX] = name             only for codes not yet defined in lang.h
{
    hexkey = toupper($hex)
    all[hexkey] = $name
    if (loc)
        comment[hexkey] = " /* " $loc " */"
    if (!(hexkey in lang))
        newlang[hexkey] = $name
}
# Report phase. Prints, in this order:
#   - the recognised input layout,
#   - for every code not in lang.h: the raw name, existing definitions that
#     contain one of its words (as // comments), and a proposed #define,
#   - lang.h codes that did not occur in the input,
#   - the rows that were filtered out because their hex field was not hex.
# Exits with status 1 if no input layout was recognised.
END {
    if (filetype == 0) {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"

    # every new language
    for (code in newlang) {
        title = newlang[code]
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", title, code, comment[code])

        # Build the identifier from the alphanumeric words of the name,
        # upper-cased and joined with '_'.
        nwords = split(title, words, /[^A-Za-z0-9]/)
        ident = ""
        sep = ""
        for (w = 1; w <= nwords; ++w) {
            if (!length(words[w]))
                continue
            upword = toupper(words[w])
            ident = ident sep upword
            sep = "_"
            # List existing definitions whose name contains this word.
            for (known in lang) {
                if (lang[known] ~ upword)
                    printf( "// %-50s %s\n", words[w] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (code in lang) {
        if (code in all)
            continue
        print "// not in input file: " code " " lang[code]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (bad in filtered)
        print "// filtered: " bad " " filtered[bad]
}