# Creating the file ZhConversion.php used for Simplified/Traditional
# Chinese conversion. It gets the basic conversion table from the Unihan
# database, and constructs the phrase tables using phrase libraries in
# the SCIM packages. There are also special tables used for adjustment.
6 # Some data in the file simp2trad.manual was taken from the following
8 # Requirement: you need to set your locale to zh_CN.UTF-8 (or any
9 # other utf-8 locales, I suppose)
# Default goal: the generated conversion table, the two "not sure" review
# lists, and the combined word list.
# Declared phony so that a stray file named "all" cannot suppress the build.
.PHONY: all
all: ZhConversion.php tradphrases.notsure simpphrases.notsure wordlist
# Download the Unihan database archive and unpack it.
# Unihan.txt is a prerequisite of unihan.t2s.t and unihan.s2t.t, so this
# recipe needs its own target instead of running as part of "all".
# NOTE(review): assumes Unihan.zip unpacks to a file named Unihan.txt in the
# current directory -- confirm against the archive actually served.
Unihan.txt:
	wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
	unzip -q Unihan.zip
# Fetch the scim-tables 0.4.3 source tarball, copy zh/EZ.txt.in out of it
# into the current directory, then remove the tarball and the unpacked tree.
# EZ.txt.in is a prerequisite of wordlist and tphrase.t, so the recipe is
# attached to that target.
EZ.txt.in:
	wget http://freedesktop.org/~suzhe/sources/scim-tables-0.4.3.tar.gz
	tar zxvf scim-tables-0.4.3.tar.gz > /dev/null
	cp scim-tables-0.4.3/zh/EZ.txt.in .
	rm -rf scim-tables-0.4.3*
# Fetch the scim-chinese 0.4.2 source tarball, copy data/phrase_lib.txt out
# of it into the current directory, then remove the tarball and the unpacked
# tree. phrase_lib.txt is a prerequisite of wordlist, so the recipe is
# attached to that target.
phrase_lib.txt:
	wget http://freedesktop.org/~suzhe/scim-chinese/scim-chinese-0.4.2.tar.gz
	tar zxvf scim-chinese-0.4.2.tar.gz > /dev/null
	cp scim-chinese-0.4.2/data/phrase_lib.txt .
	rm -rf scim-chinese-0.4.2*
# Fetch the libtabe 0.2.3 tarball and copy tsi-src/tsi.src out of it into
# the current directory. tsi.src is a prerequisite of wordlist and
# tphrase.t, so the recipe is attached to that target.
tsi.src:
	wget http://unc.dl.sourceforge.net/sourceforge/libtabe/libtabe-0.2.3.tgz
	tar zxvf libtabe-0.2.3.tgz > /dev/null
	cp libtabe/tsi-src/tsi.src .
# Build the combined word list from the three phrase sources:
#   1. tsi.src, converted from Big5 to UTF-8, with "# " markers and
#      everything from the first " <digit>" onwards removed;
#   2. phrase_lib.txt, keeping the text before the tab-separated number and
#      dropping the first five lines;
#   3. EZ.txt.in, taking everything after the BEGIN_TABLE line, dropping
#      columns 1-8 and anything after the first tab, and keeping only lines
#      of two or more characters.
# The result is sorted, de-duplicated and stripped of spaces. That final pass
# writes to the scratch file "t", which is then moved over the target so the
# cleaned list is what ends up in "wordlist".
wordlist: phrase_lib.txt EZ.txt.in tsi.src
	iconv -c -f big5 -t utf8 tsi.src | sed 's/# //g' | sed 's/[ ][0-9].*//' > wordlist
	sed 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | sed '1,5d' >>wordlist
	sed '1,/BEGIN_TABLE/d' EZ.txt.in | colrm 1 8 | sed 's/\t.*//' | grep "^...*" >> wordlist
	sort wordlist | uniq | sed 's/ //g' > t
	mv t wordlist
# Compile the printutf8 helper from its single C source file.
printutf8: printutf8.c
	gcc -o $@ $<
# Extract the kSimplifiedVariant records from Unihan.txt: drop lines that
# contain '#', remove the field name, and pass the rest through ./printutf8.
unihan.t2s.t: Unihan.txt printutf8
	grep kSimplifiedVariant $< | \
	    sed '/#/d' | \
	    sed 's/kSimplifiedVariant//' | \
	    ./printutf8 > $@
# Merge the manual traditional->simplified table with the Unihan-derived one,
# letting manual entries win.
# tmp1 starts as a copy of unihan.t2s.t. For each word produced by keeping
# the first 10 columns of trad2simp.manual (colrm 11), every tmp1 line that
# begins with that word is deleted. The manual file followed by the surviving
# tmp1 lines becomes trad2simp.t.
trad2simp.t: trad2simp.manual unihan.t2s.t
	cp unihan.t2s.t tmp1
	for I in `colrm 11 < trad2simp.manual` ; do sed "/^$$I/d" tmp1 > tmp2; mv tmp2 tmp1; done
	cat trad2simp.manual tmp1 > trad2simp.t
# Extract the kTraditionalVariant records from Unihan.txt: drop lines that
# contain '#', remove the field name, and pass the rest through ./printutf8.
unihan.s2t.t: Unihan.txt printutf8
	grep kTraditionalVariant $< | \
	    sed '/#/d' | \
	    sed 's/kTraditionalVariant//' | \
	    ./printutf8 > $@
# Merge the manual simplified->traditional table with the Unihan-derived one,
# letting manual entries win.
# tmp1 starts as a copy of unihan.s2t.t. For each word produced by keeping
# the first 10 columns of simp2trad.manual (colrm 11), every tmp1 line that
# begins with that word is deleted. The manual file followed by the surviving
# tmp1 lines becomes simp2trad.t.
simp2trad.t: unihan.s2t.t simp2trad.manual
	cp unihan.s2t.t tmp1
	for I in `colrm 11 < simp2trad.manual` ; do sed "/^$$I/d" tmp1 > tmp2; mv tmp2 tmp1; done
	cat simp2trad.manual tmp1 > simp2trad.t
# Select the lines of trad2simp.t that are at least 19 characters long and
# rewrite them as PHP array entries: the leading "U+" code becomes an opening
# quote, the first "|U+" code becomes "=>", remaining "|U+" codes are removed,
# and the next "|" becomes the closing quote plus a comma.
t2s_1tomany.t: trad2simp.t
	grep -s ".\{19,\}" $< | \
	    sed 's/U+...../"/' | \
	    sed 's/|U+...../"=>"/' | \
	    sed 's/|U+.....//g' | \
	    sed 's/|/",/' > $@
# Build the one-to-one traditional->simplified entries.
# Step 1: take the trad2simp.t lines with fewer than three '|' separators and
#         rewrite them as "x"=>"y", array entries.
# Step 2: for s2t_1tomany.t entries whose value is 2, 3 or 4 characters long,
#         append a reversed "value-char"=>"key" entry for every value
#         character except the first.
# Step 3: sort and de-duplicate. The result goes through the scratch file "t"
#         and is moved back, so the target holds the de-duplicated list.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	sed "/.*|.*|.*|.*/d" trad2simp.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' > t2s_1to1.t
	grep '"."=>"..",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> t2s_1to1.t
	sort t2s_1to1.t | uniq > t
	mv t t2s_1to1.t
# Select the lines of simp2trad.t that are at least 19 characters long and
# rewrite them as PHP array entries: the leading "U+" code becomes an opening
# quote, the first "|U+" code becomes "=>", remaining "|U+" codes are removed,
# and the next "|" becomes the closing quote plus a comma.
s2t_1tomany.t: simp2trad.t
	grep -s ".\{19,\}" $< | \
	    sed 's/U+...../"/' | \
	    sed 's/|U+...../"=>"/' | \
	    sed 's/|U+.....//g' | \
	    sed 's/|/",/' > $@
# Build the one-to-one simplified->traditional entries.
# Step 1: take the simp2trad.t lines with fewer than three '|' separators and
#         rewrite them as "x"=>"y", array entries.
# Step 2: for t2s_1tomany.t entries whose value is 2, 3 or 4 characters long,
#         append a reversed "value-char"=>"key" entry for every value
#         character except the first.
# Step 3: sort and de-duplicate. The result goes through the scratch file "t"
#         and is moved back, so the target holds the de-duplicated list.
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	sed "/.*|.*|.*|.*/d" simp2trad.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' > s2t_1to1.t
	grep '"."=>"..",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> s2t_1to1.t
	sort s2t_1to1.t | uniq > t
	mv t s2t_1to1.t
# Collect candidate phrases into the scratch file "t", then sort and
# de-duplicate them into the target.
#   - From EZ.txt.in: drop columns 1-8 and all tabs, keep lines that start
#     with 2-4 characters followed by a digit, then delete every digit.
#   - From tsi.src (Big5 -> UTF-8): cut each line at the first " <digit>",
#     remove '#' and spaces, keep lines of at least two characters.
tphrase.t: EZ.txt.in tsi.src
	colrm 1 8 < EZ.txt.in | \
	    sed 's/\t//g' | \
	    grep "^.\{2,4\}[0-9]" | \
	    sed 's/[0-9]//g' > t
	iconv -c -f big5 -t utf8 tsi.src | \
	    sed 's/ [0-9].*//g' | \
	    sed 's/[# ]//g'| \
	    grep "^.\{2,4\}" >> t
	sort t | uniq > $@
# For every distinct character produced by splitting the value side of
# s2t_1tomany.t, collect the tphrase.t lines that match it.
# "|| true" keeps the rule from failing when the loop's last grep finds
# nothing.
alltradphrases.t: tphrase.t s2t_1tomany.t
	for i in `cat s2t_1tomany.t | sed 's/.*=>".//' | sed 's/"//g' |sed 's/,/\n/' | sed 's/\(.\)/\1\n/g' |sort | uniq`; do \
	    grep -s $$i tphrase.t; \
	done > $@ || true
# Keep the two-character lines of alltradphrases.t, sorted and de-duplicated.
tradphrases_2.t: alltradphrases.t
	grep "^..$$" $< | sort | uniq > $@
# Keep the three-character lines of alltradphrases.t, then drop those that
# match any entry of tradphrases_2.t.
# t3 collects the lines matched by some two-character entry. diff marks with
# ">" the lines present only in tradphrases_3.t, and those are written to the
# scratch file "t", which is moved back over the target.
# tradphrases_2.t is read by the recipe, so it is listed as a prerequisite.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	cat alltradphrases.t | grep "^...$$" | sort | uniq > tradphrases_3.t
	for i in `cat tradphrases_2.t`; do grep $$i tradphrases_3.t; done | sort | uniq > t3 || true
	diff t3 tradphrases_3.t | grep ">" | sed 's/> //' > t
	mv t tradphrases_3.t
# Keep the four-character lines of alltradphrases.t, then drop those that
# match any entry of tradphrases_2.t, and after that those that match any
# entry of tradphrases_3.t.
# Each pass gathers the matching lines in t3, takes the lines diff marks with
# ">" (present only in tradphrases_4.t) into the scratch file "t", and moves
# "t" back over the target so the next pass sees the filtered list.
# tradphrases_2.t and tradphrases_3.t are read by the recipe, so they are
# listed as prerequisites.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	cat alltradphrases.t | grep "^....$$" | sort | uniq > tradphrases_4.t
	for i in `cat tradphrases_2.t`; do grep $$i tradphrases_4.t; done | sort | uniq > t3 || true
	diff t3 tradphrases_4.t | grep ">" | sed 's/> //' > t
	mv t tradphrases_4.t
	for i in `cat tradphrases_3.t`; do grep $$i tradphrases_4.t; done | sort | uniq > t3 || true
	diff t3 tradphrases_4.t | grep ">" | sed 's/> //' > t
	mv t tradphrases_4.t
# Combine the manual list with the 2-, 3- and 4-character lists (sorted,
# de-duplicated), then filter it against the key characters of
# t2s_1tomany.t.
# The loop greps the combined list for each key character. The loop output is
# diffed against the list and the lines diff marks with "<" are kept. They
# are written to the scratch file "t", which is moved back over the target.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t | sort | uniq > tradphrases.t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i tradphrases.t; done | diff tradphrases.t - | grep '<' | sed 's/< //' > t
	mv t tradphrases.t
# Write the sorted, de-duplicated 2-, 3- and 4-character lists to the scratch
# file "t", grep it for each key character of t2s_1tomany.t, diff the loop
# output against "t", and keep the lines diff marks with ">".
tradphrases.notsure: tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases_2.t tradphrases_3.t tradphrases_4.t | sort | uniq > t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
	    grep $$i t; \
	done | diff t - | grep '>' | sed 's/> //' > $@
# Strip tabs, digits and ASCII letters from phrase_lib.txt and keep the lines
# that are 2 to 4 characters long afterwards.
# ph.t is the prerequisite of allsimpphrases.t, so the recipe is attached to
# that target, with the file it reads as its prerequisite.
ph.t: phrase_lib.txt
	sed 's/[\t0-9a-zA-Z]//g' phrase_lib.txt | grep "^.\{2,4\}$$" > ph.t
# For every distinct character produced by splitting the value side of
# t2s_1tomany.t, append the ph.t lines that match it to the target. The
# target is removed first because the loop appends.
# t2s_1tomany.t is read by the recipe, so it is listed as a prerequisite.
# "|| true" keeps the rule from failing when the loop's last grep finds
# nothing, matching the alltradphrases.t rule.
allsimpphrases.t: ph.t t2s_1tomany.t
	rm -f allsimpphrases.t
	for i in `cat t2s_1tomany.t | sed 's/.*=>".//' | sed 's/"//g' | sed 's/,/\n/' | sed 's/\(.\)/\1\n/g' | sort | uniq `; do grep $$i ph.t >> allsimpphrases.t; done || true
# Keep the two-character lines of allsimpphrases.t, sorted and de-duplicated.
simpphrases_2.t: allsimpphrases.t
	grep "^..$$" $< | sort | uniq > $@
# Keep the three-character lines of allsimpphrases.t, then drop those that
# match any entry of simpphrases_2.t.
# t3 collects the lines matched by some two-character entry. diff marks with
# ">" the lines present only in simpphrases_3.t, and those are written to the
# scratch file "t", which is moved back over the target.
# simpphrases_2.t is read by the recipe, so it is listed as a prerequisite.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	cat allsimpphrases.t | grep "^...$$" | sort | uniq > simpphrases_3.t
	for i in `cat simpphrases_2.t`; do grep $$i simpphrases_3.t; done | sort | uniq > t3 || true
	diff t3 simpphrases_3.t | grep ">" | sed 's/> //' > t
	mv t simpphrases_3.t
# Keep the four-character lines of allsimpphrases.t, then drop those that
# match any entry of simpphrases_2.t, and after that those that match any
# entry of simpphrases_3.t.
# Each pass gathers the matching lines in t3, takes the lines diff marks with
# ">" (present only in simpphrases_4.t) into the scratch file "t", and moves
# "t" back over the target so the next pass sees the filtered list.
# The first pass pipes the loop output straight into sort | uniq > t3, as the
# second pass and the tradphrases_4.t rule do, rather than appending to a
# leftover "t".
# simpphrases_2.t and simpphrases_3.t are read by the recipe, so they are
# listed as prerequisites.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	cat allsimpphrases.t | grep "^....$$" | sort | uniq > simpphrases_4.t
	for i in `cat simpphrases_2.t`; do grep $$i simpphrases_4.t; done | sort | uniq > t3 || true
	diff t3 simpphrases_4.t | grep ">" | sed 's/> //' > t
	mv t simpphrases_4.t
	for i in `cat simpphrases_3.t`; do grep $$i simpphrases_4.t; done | sort | uniq > t3 || true
	diff t3 simpphrases_4.t | grep ">" | sed 's/> //' > t
	mv t simpphrases_4.t
# Concatenate the 2-, 3- and 4-character simplified lists, then filter the
# result against the key characters of t2s_1tomany.t.
# The loop greps the combined list for each key character. The loop output is
# diffed against the list and the lines diff marks with "<" are kept. They
# are written to the scratch file "t", which is moved back over the target.
simpphrases.t: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > simpphrases.t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i simpphrases.t; done | diff simpphrases.t - | grep '<' | sed 's/< //' > t
	mv t simpphrases.t
# Concatenate the 2-, 3- and 4-character simplified lists into the scratch
# file "t", grep it for each key character of t2s_1tomany.t, diff the loop
# output against "t", and keep the lines diff marks with ">".
simpphrases.notsure: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
	    grep $$i t; \
	done | diff t - | grep '>' | sed 's/> //' > $@
# Truncate each t2s_1tomany.t line to its first seven characters and close it
# with '",', then append the one-to-one entries from t2s_1to1.t.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	sed 's/\(.......\).*/\1",/' $< > $@
	cat t2s_1to1.t >> $@
# Truncate each s2t_1tomany.t line to its first seven characters and close it
# with '",', then append the one-to-one entries from s2t_1to1.t.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	sed 's/\(.......\).*/\1",/' $< > $@
	cat s2t_1to1.t >> $@
# Generate a PHP script that loads trad2simp1to1.t as the $trad2simp array,
# embeds tradphrases.t as the string $str, and echoes strtr($str, $trad2simp).
# "$$" is make's escape for a literal "$" in the generated PHP.
trad2simp.php: trad2simp1to1.t tradphrases.t
	printf '<?php\n$$trad2simp=array(' > $@
	cat trad2simp1to1.t >> $@
	printf ');\n$$str=\n"' >> $@
	cat tradphrases.t >> $@
	printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' >> $@
# Generate a PHP script that loads simp2trad1to1.t as the $simp2trad array,
# embeds simpphrases.t as the string $str, and echoes strtr($str, $simp2trad).
# "$$" is make's escape for a literal "$" in the generated PHP.
simp2trad.php: simp2trad1to1.t simpphrases.t
	printf '<?php\n$$simp2trad=array(' > $@
	cat simp2trad1to1.t >> $@
	printf ');\n$$str=\n"' >> $@
	cat simpphrases.t >> $@
	printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' >> $@
# Build the simplified->traditional phrase entries.
# tmp1 holds the output of trad2simp.php, each line wrapped as '"..." => '.
# tmp2 holds the tradphrases.t lines, each wrapped as '"...",'.
# paste joins the two files line by line, and the tab-separated pairs from
# toTW.manual are appended as "a"=>"b", entries.
simp2trad.phrases.t: trad2simp.php tradphrases.t toTW.manual
	php -f trad2simp.php | sed 's/\(.*\)/"\1" => /' > tmp1
	sed 's/\(.*\)/"\1",/' tradphrases.t > tmp2
	paste tmp1 tmp2 > $@
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toTW.manual >> $@
# Build the traditional->simplified phrase entries.
# tmp1 holds the output of simp2trad.php, each line wrapped as '"..." => '.
# tmp2 holds the simpphrases.t lines, each wrapped as '"...",'.
# paste joins the two files line by line, and the tab-separated pairs from
# toCN.manual are appended as "a"=>"b", entries.
trad2simp.phrases.t: simp2trad.php simpphrases.t toCN.manual
	php -f simp2trad.php | sed 's/\(.*\)/"\1" => /' > tmp1
	sed 's/\(.*\)/"\1",/' simpphrases.t > tmp2
	paste tmp1 tmp2 > $@
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toCN.manual >> $@
# Assemble the final ZhConversion.php: a header comment followed by four PHP
# arrays, $zh2TW, $zh2CN, $zh2HK and $zh2SG. "$$" is make's escape for a
# literal "$" in the generated PHP. The bare "echo" lines emit the blank
# lines of the generated file.
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t toHK.manual toSG.manual
# File header.
	printf '<?php\n/**\n * Simplified/Traditional Chinese conversion tables\n' > $@
	printf ' *\n * Automatically generated using code and data in includes/zhtable/\n' >> $@
	printf ' * Do not modify directly! \n *\n * @package MediaWiki\n*/\n\n' >> $@
# $zh2TW: one-to-one simplified->traditional entries plus the phrase entries.
	printf '$$zh2TW=array(\n' >> $@
	cat simp2trad1to1.t >> $@
	echo >> $@
	cat simp2trad.phrases.t >> $@
	echo >> $@
	echo ');' >> $@
	echo >> $@
	echo >> $@
# $zh2CN: one-to-one traditional->simplified entries plus the phrase entries.
	printf '$$zh2CN=array(\n' >> $@
	cat trad2simp1to1.t >> $@
	echo >> $@
	cat trad2simp.phrases.t >> $@
	echo >> $@
	printf ');' >> $@
	echo >> $@
	echo >> $@
# $zh2HK: tab-separated pairs from toHK.manual.
	printf '$$zh2HK=array(\n' >> $@
	sed 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toHK.manual >> $@
	echo >> $@
	printf ');' >> $@
	echo >> $@
	echo >> $@
# $zh2SG: tab-separated pairs from toSG.manual.
	printf '$$zh2SG=array(\n' >> $@
	sed 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSG.manual >> $@
	echo >> $@
	printf ');' >> $@
	echo >> $@
	printf '?>' >> $@
# Remove the generated table, the helper PHP scripts and all intermediate
# files. The scratch file "t" is listed explicitly because "*.t" does not
# match it. Declared phony so a file named "clean" cannot block the rule.
.PHONY: clean
clean:
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t t3 *.t trad2simp.php simp2trad.php