# Creates the file ZhConversion.php used for Simplified/Traditional
# Chinese conversion. It gets the basic conversion table from the Unihan
# database, and constructs the phrase tables using phrase libraries in
# the SCIM packages and the libtabe package. There are also special
# tables used for adjustment.
# Text tools run with LANG forced to a zh_CN UTF-8 locale (presumably so
# that "." in the patterns below matches one Chinese character -- confirm).
ZH_LOCALE_ENV = LANG=zh_CN.UTF8
GREP = $(ZH_LOCALE_ENV) grep
SED = $(ZH_LOCALE_ENV) sed
DIFF = $(ZH_LOCALE_ENV) diff
# Installation directory (the trailing slash is part of the value).
INSTDIR = /usr/local/share/zhdaemons/
# Default goal: the PHP conversion table, the two "notsure" phrase lists,
# the word list and the four per-variant dictionaries.
all: ZhConversion.php \
     tradphrases.notsure simpphrases.notsure \
     wordlist \
     toCN.dict toTW.dict toHK.dict toSG.dict
# Download the Unihan archive; -nc leaves an existing local copy alone.
# NOTE(review): the rule header for this recipe line is not visible in
# this excerpt -- confirm against the complete Makefile.
	wget -nc ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
# Download the scim-tables 0.5.1 tarball (kept if already present) and
# stream the EZ table source out of it into EZ.txt.in.
# NOTE(review): the rule header for these recipe lines is not visible in
# this excerpt -- confirm against the complete Makefile.
	wget -nc http://easynews.dl.sourceforge.net/sourceforge/scim/scim-tables-0.5.1.tar.gz
	tar -xzf scim-tables-0.5.1.tar.gz -O scim-tables-0.5.1/zh/EZ.txt.in > EZ.txt.in
# Download the scim-pinyin 0.5.0 tarball (kept if already present) and
# stream its phrase library out into phrase_lib.txt.
# NOTE(review): the rule header for these recipe lines is not visible in
# this excerpt -- confirm against the complete Makefile.
	wget -nc http://easynews.dl.sourceforge.net/sourceforge/scim/scim-pinyin-0.5.0.tar.gz
	tar -xzf scim-pinyin-0.5.0.tar.gz -O scim-pinyin-0.5.0/data/phrase_lib.txt > phrase_lib.txt
# Download the libtabe 0.2.3 tarball (kept if already present) and stream
# its tsi.src word database out into tsi.src.
# NOTE(review): the rule header for these recipe lines is not visible in
# this excerpt -- confirm against the complete Makefile.
	wget -nc http://unc.dl.sourceforge.net/sourceforge/libtabe/libtabe-0.2.3.tgz
	tar -xzf libtabe-0.2.3.tgz -O libtabe/tsi-src/tsi.src > tsi.src
# Word list assembled from the three phrase sources:
#   1. tsi.src: converted from Big5 to UTF-8 (-c drops unconvertible
#      characters), "# " markers removed, everything from the first
#      " <digit>" onwards removed.
#   2. phrase_lib.txt: the tab + number suffix stripped, first five lines
#      dropped.
#   3. EZ.txt.in: everything up to and including the BEGIN_TABLE line
#      dropped, columns 1-8 removed, text after the first tab removed, and
#      only lines of at least two characters kept.
# The combined list is then sorted, de-duplicated and stripped of spaces.
# NOTE(review): the visible recipe leaves the final result in scratch file
# "t"; confirm that a following step moves it back to wordlist.
wordlist: phrase_lib.txt EZ.txt.in tsi.src
	iconv -c -f big5 -t utf8 tsi.src | \
	    $(SED) 's/# //g' | \
	    $(SED) 's/[ ][0-9].*//' > $@
	$(SED) 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | \
	    $(SED) '1,5d' >> $@
	$(SED) '1,/BEGIN_TABLE/d' EZ.txt.in | \
	    colrm 1 8 | \
	    $(SED) 's/\t.*//' | \
	    $(GREP) "^...*" >> $@
	sort $@ | uniq | $(SED) 's/ //g' > t
# Build the printutf8 helper from its single C source with $(CC).
printutf8: printutf8.c
	$(CC) -o $@ printutf8.c
# Traditional->Simplified table from Unihan: take the kSimplifiedVariant
# records, drop any line containing "#", remove the field name, and pass
# the remainder through the printutf8 helper.
unihan.t2s.t: Unihan.txt printutf8
	$(GREP) kSimplifiedVariant Unihan.txt | \
	    $(SED) '/#/d' | \
	    $(SED) 's/kSimplifiedVariant//' | \
	    ./printutf8 > $@
# Merge the manual overrides into the Unihan-derived table: for each key
# (the first ten columns of every line in trad2simp.manual) delete the
# lines of tmp1 that start with that key, then emit the manual entries
# followed by what is left of tmp1.
# NOTE(review): nothing visible in this excerpt creates tmp1 (presumably a
# copy of unihan.t2s.t) -- confirm against the complete Makefile.
trad2simp.t: trad2simp.manual unihan.t2s.t
	for key in `colrm 11 < trad2simp.manual` ; do \
	    $(SED) "/^$$key/d" tmp1 > tmp2 ; \
	    mv tmp2 tmp1 ; \
	done
	cat trad2simp.manual tmp1 > $@
# Simplified->Traditional table from Unihan: take the kTraditionalVariant
# records, drop any line containing "#", remove the field name, and pass
# the remainder through the printutf8 helper.
unihan.s2t.t: Unihan.txt printutf8
	$(GREP) kTraditionalVariant Unihan.txt | \
	    $(SED) '/#/d' | \
	    $(SED) 's/kTraditionalVariant//' | \
	    ./printutf8 > $@
# Merge the manual overrides into the Unihan-derived table: for each key
# (the first ten columns of every line in simp2trad.manual) delete the
# lines of tmp1 that start with that key, then emit the manual entries
# followed by what is left of tmp1.
# NOTE(review): nothing visible in this excerpt creates tmp1 (presumably a
# copy of unihan.s2t.t) -- confirm against the complete Makefile.
simp2trad.t: unihan.s2t.t simp2trad.manual
	for key in `colrm 11 < simp2trad.manual` ; do \
	    $(SED) "/^$$key/d" tmp1 > tmp2 ; \
	    mv tmp2 tmp1 ; \
	done
	cat simp2trad.manual tmp1 > $@
# One-to-many traditional->simplified entries: keep only lines of 19 or
# more characters from trad2simp.t, then rewrite the "U+xxxxx" / "|"
# delimited record into a PHP array entry of the form "X"=>"YZ...",
# (exact input layout comes from printutf8 -- not visible here).
t2s_1tomany.t: trad2simp.t
	$(GREP) -s ".\{19,\}" trad2simp.t | \
	    $(SED) 's/U+...../"/' | \
	    $(SED) 's/|U+...../"=>"/' | \
	    $(SED) 's/|U+.....//g' | \
	    $(SED) 's/|/",/' > $@
# One-to-one traditional->simplified entries.
#   * First command: drop every trad2simp.t line containing three or more
#     "|" separators and rewrite the rest as "X"=>"Y", entries.
#   * Remaining commands: for s2t_1tomany.t entries whose right-hand side
#     has 2, 3 or 4 characters, emit a reversed entry "Tn"=>"S", for each
#     right-hand character after the first.
# NOTE(review): the visible recipe ends with the sorted, de-duplicated
# result in scratch file "t"; confirm a following step moves it back.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" trad2simp.t | \
	    $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | \
	    $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | \
	    $(SED) 's/|/",/' > $@
	$(GREP) '"."=>"..",' s2t_1tomany.t | \
	    $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' s2t_1tomany.t | \
	    $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' s2t_1tomany.t | \
	    $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' s2t_1tomany.t | \
	    $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' s2t_1tomany.t | \
	    $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' s2t_1tomany.t | \
	    $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@
	sort $@ | uniq > t
# One-to-many simplified->traditional entries: keep only lines of 19 or
# more characters from simp2trad.t, then rewrite the "U+xxxxx" / "|"
# delimited record into a PHP array entry of the form "X"=>"YZ...",
# (exact input layout comes from printutf8 -- not visible here).
s2t_1tomany.t: simp2trad.t
	$(GREP) -s ".\{19,\}" simp2trad.t | \
	    $(SED) 's/U+...../"/' | \
	    $(SED) 's/|U+...../"=>"/' | \
	    $(SED) 's/|U+.....//g' | \
	    $(SED) 's/|/",/' > $@
# One-to-one simplified->traditional entries.
#   * First command: drop every simp2trad.t line containing three or more
#     "|" separators and rewrite the rest as "X"=>"Y", entries.
#   * Remaining commands: for t2s_1tomany.t entries whose right-hand side
#     has 2, 3 or 4 characters, emit a reversed entry "Sn"=>"T", for each
#     right-hand character after the first.
# NOTE(review): the visible recipe ends with the sorted, de-duplicated
# result in scratch file "t"; confirm a following step moves it back.
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" simp2trad.t | \
	    $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | \
	    $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | \
	    $(SED) 's/|/",/' > $@
	$(GREP) '"."=>"..",' t2s_1tomany.t | \
	    $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' t2s_1tomany.t | \
	    $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' t2s_1tomany.t | \
	    $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' t2s_1tomany.t | \
	    $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' t2s_1tomany.t | \
	    $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' t2s_1tomany.t | \
	    $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@
	sort $@ | uniq > t
# Candidate traditional phrases, collected in scratch file "t" and then
# sorted and de-duplicated into the target:
#   * EZ.txt.in: columns 1-8 removed, tabs removed, lines that start with
#     2-4 characters followed by a digit kept, digits stripped.
#   * tsi.src: converted from Big5 to UTF-8, everything from " <digit>"
#     onwards removed, "#" and spaces removed, lines of at least two
#     characters kept.
tphrase.t: EZ.txt.in tsi.src
	colrm 1 8 < EZ.txt.in | \
	    $(SED) 's/\t//g' | \
	    $(GREP) "^.\{2,4\}[0-9]" | \
	    $(SED) 's/[0-9]//g' > t
	iconv -c -f big5 -t utf8 tsi.src | \
	    $(SED) 's/ [0-9].*//g' | \
	    $(SED) 's/[# ]//g'| \
	    $(GREP) "^.\{2,4\}" >> t
	sort t | uniq > $@
# Every tphrase.t line that contains one of the "ambiguous" traditional
# characters. The character list is the right-hand side of each
# s2t_1tomany.t entry minus its first character, split one character per
# line, sorted and de-duplicated. "|| true" keeps make going when the loop
# ends on a grep that found nothing.
alltradphrases.t: tphrase.t s2t_1tomany.t
	for i in `cat s2t_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' |$(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' |sort | uniq`; do \
	    $(GREP) -s $$i tphrase.t ; \
	done > $@ || true
# Two-character phrases from alltradphrases.t, sorted and de-duplicated.
tradphrases_2.t: alltradphrases.t
	$(GREP) "^..$$" alltradphrases.t | sort | uniq > $@
# Three-character phrases from alltradphrases.t, minus those that contain
# any two-character phrase from tradphrases_2.t:
#   1. select and de-duplicate the three-character lines;
#   2. collect into t3 every such line matched by a two-character phrase
#      ("|| true" tolerates a final grep with no match);
#   3. keep the lines diff reports as present only in the full list.
# tradphrases_2.t is read by step 2, so it is declared as a prerequisite
# (it was previously missing, which breaks parallel or direct builds).
# NOTE(review): the visible recipe leaves the result in scratch file "t";
# confirm a following step moves it back to the target.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	cat alltradphrases.t | $(GREP) "^...$$" | sort | uniq > $@
	for i in `cat tradphrases_2.t`; do \
	    $(GREP) $$i $@ ; \
	done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
# Four-character phrases from alltradphrases.t, filtered twice: first
# against the two-character phrases, then against the three-character
# phrases. Each pass collects the matching lines into t3 and keeps the
# lines diff reports as present only in tradphrases_4.t.
# tradphrases_2.t and tradphrases_3.t are read by the loops, so both are
# declared as prerequisites (they were previously missing, which breaks
# parallel or direct builds).
# NOTE(review): each pass leaves its result in scratch file "t"; steps that
# move "t" back to the target between and after the passes appear to be
# missing from this excerpt -- confirm against the complete Makefile.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	cat alltradphrases.t | $(GREP) "^....$$" | sort | uniq > $@
	for i in `cat tradphrases_2.t`; do \
	    $(GREP) $$i $@ ; \
	done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	for i in `cat tradphrases_3.t`; do \
	    $(GREP) $$i $@ ; \
	done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
# Final traditional phrase list: the manual list plus the 2-, 3- and
# 4-character lists, sorted and de-duplicated. The second command greps the
# list for each left-hand character of t2s_1tomany.t and keeps the lines
# diff marks with "<" (present in the list, absent from the grep output).
# NOTE(review): the visible recipe leaves the filtered result in scratch
# file "t"; confirm a following step moves it back to the target.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t | \
	    sort | uniq > $@
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
	    $(GREP) $$i $@ ; \
	done | \
	    $(DIFF) $@ - | \
	    $(GREP) '<' | \
	    $(SED) 's/< //' > t
# Traditional phrases needing review: combine the 2-, 3- and 4-character
# lists into scratch file "t", grep it for each left-hand character of
# t2s_1tomany.t, and keep the lines diff marks with ">" (present in the
# grep output side of the comparison).
tradphrases.notsure: tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases_2.t tradphrases_3.t tradphrases_4.t | sort | uniq > t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
	    $(GREP) $$i t ; \
	done | \
	    $(DIFF) t - | \
	    $(GREP) '>' | \
	    $(SED) 's/> //' > $@
# Strip tabs, digits and ASCII letters from phrase_lib.txt and keep the
# lines that are exactly 2-4 characters long.
# NOTE(review): the rule header for this recipe line is not visible in
# this excerpt -- confirm against the complete Makefile.
	$(SED) 's/[\t0-9a-zA-Z]//g' phrase_lib.txt | \
	    $(GREP) "^.\{2,4\}$$" > ph.t
# Every ph.t line that contains one of the "ambiguous" simplified
# characters. The character list is the right-hand side of each
# t2s_1tomany.t entry minus its first character, split one character per
# line, sorted and de-duplicated.
# Changes from the previous form:
#   * t2s_1tomany.t is read by the recipe, so it is now a declared
#     prerequisite.
#   * The loop output is redirected once (replacing "rm -f" + ">>"), so the
#     target exists even when nothing matches.
#   * "|| true" mirrors alltradphrases.t: the loop's exit status is that of
#     the last grep, which is non-zero when the final character has no
#     matching phrase, and that must not abort the build.
allsimpphrases.t: ph.t t2s_1tomany.t
	for i in `cat t2s_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq `; do \
	    $(GREP) $$i ph.t ; \
	done > $@ || true
# Two-character phrases from allsimpphrases.t, sorted and de-duplicated.
simpphrases_2.t: allsimpphrases.t
	$(GREP) "^..$$" allsimpphrases.t | sort | uniq > $@
# Three-character phrases from allsimpphrases.t, minus those that contain
# any two-character phrase from simpphrases_2.t:
#   1. select and de-duplicate the three-character lines;
#   2. collect into t3 every such line matched by a two-character phrase
#      ("|| true" tolerates a final grep with no match);
#   3. keep the lines diff reports as present only in the full list.
# simpphrases_2.t is read by step 2, so it is declared as a prerequisite
# (it was previously missing, which breaks parallel or direct builds).
# NOTE(review): the visible recipe leaves the result in scratch file "t";
# confirm a following step moves it back to the target.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	cat allsimpphrases.t | $(GREP) "^...$$" | sort | uniq > $@
	for i in `cat simpphrases_2.t`; do \
	    $(GREP) $$i $@ ; \
	done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
# Four-character phrases from allsimpphrases.t, filtered against the
# two-character and then the three-character simplified phrases; each pass
# keeps the lines diff reports as present only in simpphrases_4.t.
# simpphrases_2.t and simpphrases_3.t are read by the loops, so both are
# declared as prerequisites (they were previously missing, which breaks
# parallel or direct builds).
# NOTE(review): several steps appear to be missing from this excerpt --
# nothing visible clears "t" before the first loop appends to it, turns
# "t" into t3 before the first diff, or moves "t" back to the target after
# each diff. Confirm against the complete Makefile.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	cat allsimpphrases.t | $(GREP) "^....$$" | sort | uniq > $@
	for i in `cat simpphrases_2.t`; do \
	    $(GREP) $$i $@ >> t ; \
	done || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	for i in `cat simpphrases_3.t`; do \
	    $(GREP) $$i $@ ; \
	done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
# Final simplified phrase list: the 2-, 3- and 4-character lists
# concatenated (no sorting here). The second command greps the list for
# each left-hand character of t2s_1tomany.t and keeps the lines diff marks
# with "<" (present in the list, absent from the grep output).
# NOTE(review): the visible recipe leaves the filtered result in scratch
# file "t"; confirm a following step moves it back to the target.
simpphrases.t: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
	    $(GREP) $$i $@ ; \
	done | \
	    $(DIFF) $@ - | \
	    $(GREP) '<' | \
	    $(SED) 's/< //' > t
# Simplified phrases needing review: concatenate the 2-, 3- and
# 4-character lists into scratch file "t", grep it for each left-hand
# character of t2s_1tomany.t, and keep the lines diff marks with ">".
simpphrases.notsure: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
	    $(GREP) $$i t ; \
	done | \
	    $(DIFF) t - | \
	    $(GREP) '>' | \
	    $(SED) 's/> //' > $@
# Character-level traditional->simplified table: each one-to-many entry is
# cut down to its first seven characters and closed with '",' (i.e. only
# the first target character is kept), followed by the one-to-one entries.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	$(SED) 's/\(.......\).*/\1",/' t2s_1tomany.t > $@
	cat t2s_1to1.t >> $@
# Character-level simplified->traditional table: each one-to-many entry is
# cut down to its first seven characters and closed with '",' (i.e. only
# the first target character is kept), followed by the one-to-one entries.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	$(SED) 's/\(.......\).*/\1",/' s2t_1tomany.t > $@
	cat s2t_1to1.t >> $@
# Generate a throw-away PHP script that defines $trad2simp from the
# character table, puts the traditional phrase list in $str, and echoes
# strtr($str, $trad2simp). "$$" is make's escape for a literal "$".
trad2simp.php: trad2simp1to1.t tradphrases.t
	printf '<?php\n$$trad2simp=array(' > $@
	cat trad2simp1to1.t >> $@
	printf ');\n$$str=\n"' >> $@
	cat tradphrases.t >> $@
	printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' >> $@
# Generate a throw-away PHP script that defines $simp2trad from the
# character table, puts the simplified phrase list in $str, and echoes
# strtr($str, $simp2trad). "$$" is make's escape for a literal "$".
simp2trad.php: simp2trad1to1.t simpphrases.t
	printf '<?php\n$$simp2trad=array(' > $@
	cat simp2trad1to1.t >> $@
	printf ');\n$$str=\n"' >> $@
	cat simpphrases.t >> $@
	printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' >> $@
# Phrase-level simplified->traditional entries. tmp1 holds the output of
# trad2simp.php wrapped as '"..." => ', tmp2 holds each traditional phrase
# wrapped as '"...",'; paste joins them line by line. The tab-separated
# pairs from toTW.manual are appended as "a"=>"b", entries.
simp2trad.phrases.t: trad2simp.php tradphrases.t toTW.manual
	php -f trad2simp.php | $(SED) 's/\(.*\)/"\1" => /' > tmp1
	$(SED) 's/\(.*\)/"\1",/' tradphrases.t > tmp2
	paste tmp1 tmp2 > $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toTW.manual >> $@
# Phrase-level traditional->simplified entries. tmp1 holds the output of
# simp2trad.php wrapped as '"..." => ', tmp2 holds each simplified phrase
# wrapped as '"...",'; paste joins them line by line. The tab-separated
# pairs from toCN.manual are appended as "a"=>"b", entries.
trad2simp.phrases.t: simp2trad.php simpphrases.t toCN.manual
	php -f simp2trad.php | $(SED) 's/\(.*\)/"\1" => /' > tmp1
	$(SED) 's/\(.*\)/"\1",/' simpphrases.t > tmp2
	paste tmp1 tmp2 > $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toCN.manual >> $@
# zh-CN dictionary: the character table followed by the phrase table, with
# commas, spaces and tabs removed and the first "=>" of each line replaced
# by a tab.
toCN.dict: trad2simp1to1.t trad2simp.phrases.t
	$(SED) 's/[, \t]//g' trad2simp1to1.t | $(SED) 's/=>/\t/' > $@
	$(SED) 's/[, \t]//g' trad2simp.phrases.t | $(SED) 's/=>/\t/' >> $@
# zh-TW dictionary: the character table followed by the phrase table, with
# commas, spaces and tabs removed and the first "=>" of each line replaced
# by a tab.
toTW.dict: simp2trad1to1.t simp2trad.phrases.t
	$(SED) 's/[, \t]//g' simp2trad1to1.t | $(SED) 's/=>/\t/' > $@
	$(SED) 's/[, \t]//g' simp2trad.phrases.t | $(SED) 's/=>/\t/' >> $@
# zh-HK dictionary: toHK.manual with spaces removed and each side of the
# tab-separated pair wrapped in double quotes.
toHK.dict: toHK.manual
	$(SED) 's/ //g' toHK.manual | \
	    $(SED) 's/\(^.*\)\t\(.*\)/"\1"\t"\2"/' > $@
# zh-SG dictionary: toSG.manual with spaces removed and each side of the
# tab-separated pair wrapped in double quotes.
toSG.dict: toSG.manual
	$(SED) 's/ //g' toSG.manual | \
	    $(SED) 's/\(^.*\)\t\(.*\)/"\1"\t"\2"/' > $@
# Assemble the generated PHP file: a doc-comment header followed by four
# arrays. $zh2TW and $zh2CN each hold a character table plus a phrase
# table; $zh2HK and $zh2SG are built straight from their tab-separated
# manual files. "$$" is make's escape for a literal "$"; the bare "echo"
# commands only emit the blank lines between sections.
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t toHK.manual toSG.manual
	printf '<?php\n/**\n * Simplified/Traditional Chinese conversion tables\n' > $@
	printf ' *\n * Automatically generated using code and data in includes/zhtable/\n' >> $@
	printf ' * Do not modify directly! \n *\n * @package MediaWiki\n*/\n\n' >> $@
	printf '$$zh2TW=array(\n' >> $@
	cat simp2trad1to1.t >> $@
	echo >> $@
	cat simp2trad.phrases.t >> $@
	echo >> $@
	echo ');' >> $@
	echo >> $@
	echo >> $@
	printf '$$zh2CN=array(\n' >> $@
	cat trad2simp1to1.t >> $@
	echo >> $@
	cat trad2simp.phrases.t >> $@
	echo >> $@
	printf ');' >> $@
	echo >> $@
	echo >> $@
	printf '$$zh2HK=array(\n' >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toHK.manual >> $@
	echo >> $@
	printf ');' >> $@
	echo >> $@
	echo >> $@
	printf '$$zh2SG=array(\n' >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSG.manual >> $@
	echo >> $@
	printf ');' >> $@
	echo >> $@
	printf '?>' >> $@
# Full cleanup: runs both the temporary-file and the download cleanup goals.
clean: cleantmp cleandl
252 # Stuff unpacked from the files fetched by wget
# Temporary files and other trash.
# The bare scratch file "t" is listed explicitly: many recipes write to it
# and the "*.t" glob does not match a name without the ".t" suffix.
# NOTE(review): the rule header for this recipe is not visible in this
# excerpt -- confirm against the complete Makefile.
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t t3 *.t trad2simp.php simp2trad.php *.dict printutf8 *~ \
	simpphrases.notsure tradphrases.notsure wordlist
265 scim-tables-0.5
.1.
tar.gz \
266 scim-pinyin-0.5
.0.
tar.gz \