# Creating the file ZhConversion.php used for Simplified/Traditional
# Chinese conversion. It gets the basic conversion table from the Unihan
# database, and constructs the phrase tables using phrase libraries in
# the SCIM packages and the libtabe package. There are also special
# tables used for adjustment.
# Text-processing tools, each run with LANG=zh_CN.UTF8 (presumably so that
# the regular expressions in the recipes work on characters, not bytes).
GREP := LANG=zh_CN.UTF8 grep
SED  := LANG=zh_CN.UTF8 sed
DIFF := LANG=zh_CN.UTF8 diff

# Versions of the SCIM source packages the phrase libraries are taken from.
SCIM_TABLES_VER := 0.5.8
SCIM_PINYIN_VER := 0.5.91

# Installation directory
INSTDIR := /usr/local/share/zhdaemons/
# Top-level goal: the generated PHP table, the two "not sure" phrase reports,
# the word list and every per-variant dictionary.
# Declared phony so that a stray file named "all" cannot mask it.
.PHONY: all
all: ZhConversion.php \
     tradphrases.notsure simpphrases.notsure \
     wordlist \
     toHans.dict toHant.dict toCN.dict toTW.dict toHK.dict toSG.dict
# Download Unihan database and Traditional Chinese / Simplified Chinese phrases files
#
# The download needs its own rule header: without one the wget line would be
# taken as part of the preceding rule's recipe, and nothing would tell make
# how to obtain Unihan.zip.  -nc keeps an already downloaded copy.
Unihan.zip:
	wget -nc ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
# Fetch the scim-tables source tarball from the selected SourceForge mirror.
# The remote file name equals the target name, hence $@ in the URL.
scim-tables-$(SCIM_TABLES_VER).tar.gz:
	wget -nc http://$(SF_MIRROR).dl.sourceforge.net/sourceforge/scim/$@
# Fetch the scim-pinyin source tarball from the selected SourceForge mirror.
# The remote file name equals the target name, hence $@ in the URL.
scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
	wget -nc http://$(SF_MIRROR).dl.sourceforge.net/sourceforge/scim/$@
# Fetch the libtabe source tarball from the selected SourceForge mirror.
# The remote file name equals the target name, hence $@ in the URL.
libtabe-$(LIBTABE_VER).tgz:
	wget -nc http://$(SF_MIRROR).dl.sourceforge.net/sourceforge/libtabe/$@
# Extract the files from the compressed archives
#
# Unihan.txt is pulled out of Unihan.zip.  The rule previously had no recipe,
# so make had no way to produce the file.
# NOTE(review): assumes the archive contains a member named Unihan.txt -
# confirm for the Unihan release being downloaded.
# unzip restores the archived timestamp, so touch the result to keep it newer
# than the zip and avoid re-extracting on every run.
Unihan.txt: Unihan.zip
	unzip -oq Unihan.zip Unihan.txt
	touch $@
# Stream tables/zh/EZ-Big.txt.in out of the scim-tables tarball (-O writes the
# member to stdout) and store it under the shorter name EZ.txt.in.
EZ.txt.in: scim-tables-$(SCIM_TABLES_VER).tar.gz
	tar -xzf $< -O scim-tables-$(SCIM_TABLES_VER)/tables/zh/EZ-Big.txt.in > $@
# Stream tables/zh/Wubi.txt.in out of the scim-tables tarball (-O writes the
# member to stdout).
Wubi.txt.in: scim-tables-$(SCIM_TABLES_VER).tar.gz
	tar -xzf $< -O scim-tables-$(SCIM_TABLES_VER)/tables/zh/Wubi.txt.in > $@
# Stream tables/zh/Ziranma.txt.in out of the scim-tables tarball (-O writes
# the member to stdout).
Ziranma.txt.in: scim-tables-$(SCIM_TABLES_VER).tar.gz
	tar -xzf $< -O scim-tables-$(SCIM_TABLES_VER)/tables/zh/Ziranma.txt.in > $@
# Stream data/phrase_lib.txt out of the scim-pinyin tarball (-O writes the
# member to stdout).
phrase_lib.txt: scim-pinyin-$(SCIM_PINYIN_VER).tar.gz
	tar -xzf $< -O scim-pinyin-$(SCIM_PINYIN_VER)/data/phrase_lib.txt > $@
# Stream libtabe/tsi-src/tsi.src out of the libtabe tarball (-O writes the
# member to stdout).
tsi.src: libtabe-$(LIBTABE_VER).tgz
	tar -xzf $< -O libtabe/tsi-src/tsi.src > $@
# Build the word list from three sources:
#   1. tsi.src, converted from Big5 to UTF-8, with "# " markers and the
#      trailing numeric fields removed;
#   2. phrase_lib.txt, keeping the text before the tab + number and dropping
#      the first five lines;
#   3. the part of EZ.txt.in after the BEGIN_TABLE line, minus the first
#      eight columns and anything after a tab, keeping lines of at least two
#      characters.
# The combined list is then sorted, de-duplicated and stripped of spaces.
# sort cannot write back to the file it is reading, so the result goes to the
# scratch file t and is moved into place afterwards; without the final mv the
# cleaned list never reached the target.
wordlist: phrase_lib.txt EZ.txt.in tsi.src
	iconv -c -f big5 -t utf8 tsi.src | $(SED) 's/# //g' | $(SED) 's/[ ][0-9].*//' > $@
	$(SED) 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | $(SED) '1,5d' >> $@
	$(SED) '1,/BEGIN_TABLE/d' EZ.txt.in | colrm 1 8 | $(SED) 's/\t.*//' | $(GREP) "^...*" >> $@
	sort $@ | uniq | $(SED) 's/ //g' > t
	mv t $@
# Compile the printutf8 helper, which the Unihan rules pipe their data through.
printutf8: printutf8.c
	$(CC) -o $@ $<
# Take the kSimplifiedVariant lines of Unihan.txt, drop any line containing
# '#', remove the field name and pass the rest through ./printutf8.
unihan.t2s.t: Unihan.txt printutf8
	$(GREP) kSimplifiedVariant $< | \
		$(SED) -e '/#/d' -e 's/kSimplifiedVariant//' | \
		./printutf8 > $@
# Merge the manual traditional->simplified table with the Unihan-derived one.
# For each entry of trad2simp.manual (first 10 columns, via "colrm 11") every
# line of the working copy that starts with it is deleted; the manual file is
# then placed in front of what is left.
# The working copy tmp1 must first be seeded from unihan.t2s.t: without the cp
# the loop ran on whatever tmp1 an earlier rule left behind, or on no file.
trad2simp.t: trad2simp.manual unihan.t2s.t
	cp unihan.t2s.t tmp1
	for I in `colrm 11 < trad2simp.manual` ; do $(SED) "/^$$I/d" tmp1 > tmp2; mv tmp2 tmp1; done
	cat trad2simp.manual tmp1 > $@
# Take the kTraditionalVariant lines of Unihan.txt, drop any line containing
# '#', remove the field name and pass the rest through ./printutf8.
unihan.s2t.t: Unihan.txt printutf8
	$(GREP) kTraditionalVariant $< | \
		$(SED) -e '/#/d' -e 's/kTraditionalVariant//' | \
		./printutf8 > $@
# Merge the manual simplified->traditional table with the Unihan-derived one.
# For each entry of simp2trad.manual (first 10 columns, via "colrm 11") every
# line of the working copy that starts with it is deleted; the manual file is
# then placed in front of what is left.
# The working copy tmp1 must first be seeded from unihan.s2t.t: without the cp
# the loop ran on whatever tmp1 an earlier rule left behind, or on no file.
simp2trad.t: unihan.s2t.t simp2trad.manual
	cp unihan.s2t.t tmp1
	for I in `colrm 11 < simp2trad.manual` ; do $(SED) "/^$$I/d" tmp1 > tmp2; mv tmp2 tmp1; done
	cat simp2trad.manual tmp1 > $@
# One-to-many traditional->simplified entries: keep the lines of trad2simp.t
# that are at least 19 characters long and rewrite them as "key"=>"values",
# by replacing the U+xxxxx code point fields and '|' separators.
t2s_1tomany.t: trad2simp.t
	$(GREP) -s ".\{19,\}" $< | \
		$(SED) -e 's/U+...../"/' \
		       -e 's/|U+...../"=>"/' \
		       -e 's/|U+.....//g' \
		       -e 's/|/",/' > $@
# One-to-one traditional->simplified entries.
#   * Lines of trad2simp.t with fewer than three '|' separators are rewritten
#     as "key"=>"value", entries.
#   * Each s2t_1tomany.t entry with a 2-, 3- or 4-character value adds one
#     reversed entry per value character after the first
#     ("that character"=>"key",).
# The list is then sorted and de-duplicated.  sort cannot write back to the
# file it is reading, so the result goes to the scratch file t and is moved
# into place; without the final mv the duplicates stayed in the target.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" trad2simp.t | $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | $(SED) 's/|/",/' > $@
	$(GREP) '"."=>"..",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' s2t_1tomany.t | $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@
	sort $@ | uniq > t
	mv t $@
# One-to-many simplified->traditional entries: keep the lines of simp2trad.t
# that are at least 19 characters long and rewrite them as "key"=>"values",
# by replacing the U+xxxxx code point fields and '|' separators.
s2t_1tomany.t: simp2trad.t
	$(GREP) -s ".\{19,\}" $< | \
		$(SED) -e 's/U+...../"/' \
		       -e 's/|U+...../"=>"/' \
		       -e 's/|U+.....//g' \
		       -e 's/|/",/' > $@
# One-to-one simplified->traditional entries.
#   * Lines of simp2trad.t with fewer than three '|' separators are rewritten
#     as "key"=>"value", entries.
#   * Each t2s_1tomany.t entry with a 2-, 3- or 4-character value adds one
#     reversed entry per value character after the first
#     ("that character"=>"key",).
# The list is then sorted and de-duplicated.  sort cannot write back to the
# file it is reading, so the result goes to the scratch file t and is moved
# into place; without the final mv the duplicates stayed in the target.
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" simp2trad.t | $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | $(SED) 's/|/",/' > $@
	$(GREP) '"."=>"..",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"...",' t2s_1tomany.t | $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@
	sort $@ | uniq > t
	mv t $@
# Candidate traditional phrases, gathered in the scratch file t from
#   * EZ.txt.in: first eight columns and tabs removed, lines starting with
#     2-4 characters followed by a digit kept, digits then stripped;
#   * tsi.src: converted from Big5 to UTF-8, numeric tail, '#' and spaces
#     removed, lines of at least two characters kept.
# The sorted, de-duplicated union becomes the target.
tphrase.t: EZ.txt.in tsi.src
	colrm 1 8 < $< | \
		$(SED) 's/\t//g' | \
		$(GREP) "^.\{2,4\}[0-9]" | \
		$(SED) 's/[0-9]//g' > t
	iconv -c -f big5 -t utf8 $(word 2,$^) | \
		$(SED) 's/ [0-9].*//g' | \
		$(SED) 's/[# ]//g'| \
		$(GREP) "^.\{2,4\}" >> t
	sort t | uniq > $@
# For every character that follows the first one in the value of an
# s2t_1tomany.t entry, collect the tphrase.t lines containing it.  "|| true"
# keeps the build going when the last grep finds nothing.  Lines matching a
# pattern in tradphrases_exclude.manual are then removed via a temporary
# file; the ';' before mv is deliberate, so the move happens even when grep
# selects no line.
alltradphrases.t: tphrase.t s2t_1tomany.t tradphrases_exclude.manual
	for i in `cat s2t_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' |$(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' |sort | uniq`; do \
		$(GREP) -s $$i tphrase.t; \
	done > $@ || true
	$(GREP) -vf tradphrases_exclude.manual < $@ > alltradphrases.tt; mv alltradphrases.tt $@
# The two-character phrases of alltradphrases.t, sorted and de-duplicated.
tradphrases_2.t: alltradphrases.t
	$(GREP) "^..$$" < $< | sort | uniq > $@
# The three-character phrases of alltradphrases.t, minus those that contain a
# phrase listed in tradphrases_2.t.
#   1. select, sort and de-duplicate the three-character lines;
#   2. t3 = the lines of the target that match some two-character phrase;
#   3. diff t3 against the target and keep the lines only the target has.
# The survivors are written to the scratch file t and then moved into place;
# without the mv the unfiltered list stayed in the target.
# tradphrases_2.t is read by the recipe, so it is listed as a prerequisite.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	cat alltradphrases.t | $(GREP) "^...$$" | sort | uniq > $@
	for i in `cat tradphrases_2.t`; do $(GREP) $$i $@ ; done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	mv t $@
# The four-character phrases of alltradphrases.t, minus those that contain a
# phrase listed in tradphrases_2.t or tradphrases_3.t.
# Each filter pass collects the matching lines in t3, diffs t3 against the
# target and keeps the lines only the target has.  The survivors go to the
# scratch file t and are moved into place before the next pass; without the
# mv steps neither pass had any effect on the target.
# Both shorter-phrase files are read by the recipe, so they are listed as
# prerequisites.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	cat alltradphrases.t | $(GREP) "^....$$" | sort | uniq > $@
	for i in `cat tradphrases_2.t`; do $(GREP) $$i $@ ; done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	mv t $@
	for i in `cat tradphrases_3.t`; do $(GREP) $$i $@ ; done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	mv t $@
# Final traditional phrase list: the manual phrases plus the 2-, 3- and
# 4-character lists, sorted and de-duplicated.  The loop greps the list for
# the key character of every t2s_1tomany.t entry; the diff against the list
# itself, filtered on '<', is written to the scratch file t and then moved
# into place.  Without the mv that filtered result was discarded.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t | sort | uniq > $@
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@ ; done | $(DIFF) $@ - | $(GREP) '<' | $(SED) 's/< //' > t
	mv t $@
# Report built from the 2-, 3- and 4-character traditional phrase lists: they
# are merged (sorted, de-duplicated) into the scratch file t, the loop greps t
# for the key character of every t2s_1tomany.t entry, and the '>' lines of the
# diff between t and that grep output are kept.
tradphrases.notsure: tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat $(filter tradphrases_%.t,$^) | sort | uniq > t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
		$(GREP) $$i t ; \
	done | $(DIFF) t - | $(GREP) '>' | $(SED) 's/> //' > $@
# Phrases from phrase_lib.txt: strip tabs, digits and ASCII letters, then keep
# the lines that are exactly 2 to 4 characters long.
# The recipe needs its own rule header; without one it would run as part of
# the preceding rule, and allsimpphrases.t's prerequisite ph.t would have no
# rule.
ph.t: phrase_lib.txt
	$(SED) 's/[\t0-9a-zA-Z]//g' $< | $(GREP) "^.\{2,4\}$$" > $@
# Phrases from the Wubi table: drop everything up to and including the
# BEGIN_TABLE line, cut the first eight columns and anything after a tab, and
# keep lines of at least two characters.
# The recipe needs its own rule header (same shape as the Ziranma.t rule);
# without one it would run as part of the preceding rule, and
# allsimpphrases.t's prerequisite Wubi.t would have no rule.
Wubi.t: Wubi.txt.in
	$(SED) '1,/BEGIN_TABLE/d' $< | colrm 1 8 | $(SED) 's/\t.*//' | $(GREP) "^...*" > $@
# Phrases from the Ziranma table: drop everything up to and including the
# BEGIN_TABLE line, cut the first eight columns and anything after a tab, and
# keep lines of at least two characters.
Ziranma.t: Ziranma.txt.in
	$(SED) '1,/BEGIN_TABLE/d' $< | \
		colrm 1 8 | \
		$(SED) 's/\t.*//' | \
		$(GREP) "^...*" > $@
# For every character that follows the first one in the value of a
# t2s_1tomany.t entry, collect the lines of Wubi.t, Ziranma.t and ph.t that
# contain it; then remove the lines matching simpphrases_exclude.manual.
#   * The target is truncated up front (rather than removed), so it exists
#     even when no grep ever runs.
#   * "|| true" on each loop, as in the alltradphrases.t rule: a loop's exit
#     status is that of its last grep, and "no match" must not fail the build.
#   * The ';' before mv is deliberate, so the move happens even when grep
#     selects no line.
allsimpphrases.t: t2s_1tomany.t ph.t Wubi.t Ziranma.t simpphrases_exclude.manual
	: > $@
	for i in `cat t2s_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq `; do $(GREP) $$i Wubi.t >> $@; done || true
	for i in `cat t2s_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq `; do $(GREP) $$i Ziranma.t >> $@; done || true
	for i in `cat t2s_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq `; do $(GREP) $$i ph.t >> $@; done || true
	cat $@ | $(GREP) -vf simpphrases_exclude.manual > allsimpphrases.tt; mv allsimpphrases.tt $@
# The two-character phrases of allsimpphrases.t, sorted and de-duplicated.
simpphrases_2.t: allsimpphrases.t
	$(GREP) "^..$$" < $< | sort | uniq > $@
# The three-character phrases of allsimpphrases.t, minus those that contain a
# phrase listed in simpphrases_2.t.
#   1. select, sort and de-duplicate the three-character lines;
#   2. t3 = the lines of the target that match some two-character phrase;
#   3. diff t3 against the target and keep the lines only the target has.
# The survivors are written to the scratch file t and then moved into place;
# without the mv the unfiltered list stayed in the target.
# simpphrases_2.t is read by the recipe, so it is listed as a prerequisite.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	cat allsimpphrases.t | $(GREP) "^...$$" | sort | uniq > $@
	for i in `cat simpphrases_2.t`; do $(GREP) $$i $@ ; done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	mv t $@
# The four-character phrases of allsimpphrases.t, minus those that contain a
# phrase listed in simpphrases_2.t or simpphrases_3.t.
# Both passes now have the same shape as in the tradphrases_4.t rule: the
# matching lines are piped straight into a sorted, de-duplicated t3.  The
# first pass used to append to the shared scratch file t, which may hold
# leftovers from another rule, and never rebuilt t3 from it.
# After each diff the survivors are moved from t into the target; without the
# mv steps neither pass had any effect on it.
# Both shorter-phrase files are read by the recipe, so they are listed as
# prerequisites.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	cat allsimpphrases.t | $(GREP) "^....$$" | sort | uniq > $@
	for i in `cat simpphrases_2.t`; do $(GREP) $$i $@ ; done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	mv t $@
	for i in `cat simpphrases_3.t`; do $(GREP) $$i $@ ; done | sort | uniq > t3 || true
	$(DIFF) t3 $@ | $(GREP) ">" | $(SED) 's/> //' > t
	mv t $@
# Final simplified phrase list: the manual phrases followed by the 2-, 3- and
# 4-character lists.  The loop greps the list for the key character of every
# t2s_1tomany.t entry; the diff against the list itself, filtered on '<', is
# written to the scratch file t and then moved into place.  Without the mv
# that filtered result was discarded.
simpphrases.t: simpphrases.manual simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases.manual simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@ ; done | $(DIFF) $@ - | $(GREP) '<' | $(SED) 's/< //' > t
	mv t $@
# Report built from the 2-, 3- and 4-character simplified phrase lists: they
# are concatenated into the scratch file t, the loop greps t for the key
# character of every t2s_1tomany.t entry, and the '>' lines of the diff
# between t and that grep output are kept.
simpphrases.notsure: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat $(filter simpphrases_%.t,$^) > t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do \
		$(GREP) $$i t ; \
	done | $(DIFF) t - | $(GREP) '>' | $(SED) 's/> //' > $@
# Character-level traditional->simplified table.  Each t2s_1tomany.t entry is
# cut down to its first seven characters plus '",', the one-to-one entries
# are appended, and the lines matching trad2simp_noconvert.manual are removed
# via the scratch file tt.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t trad2simp_noconvert.manual
	$(SED) 's/\(.......\).*/\1",/' $< > $@
	cat t2s_1to1.t >> $@
	$(GREP) -vf trad2simp_noconvert.manual < $@ > tt
	mv tt $@
# Character-level simplified->traditional table.
#   1. cut each s2t_1tomany.t entry down to its first seven characters plus
#      '",' (scratch file tt);
#   2. cut two column ranges out of simp2trad.manual (simp2tradcharsrc.t and
#      simp2tradchardest.t) and keep the src lines that match a dest pattern
#      (simp2tradrepeatedchar.t);
#   3. drop from tt the lines matching those repeated characters, append the
#      one-to-one entries, and finally remove the lines matching
#      simp2trad_noconvert.manual.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t simp2trad.manual simp2trad_noconvert.manual
	$(SED) 's/\(.......\).*/\1",/' $< > tt
	colrm 1 7 < simp2trad.manual | colrm 3 > simp2tradcharsrc.t
	colrm 1 17 < simp2trad.manual | colrm 3 > simp2tradchardest.t
	$(GREP) -f simp2tradchardest.t < simp2tradcharsrc.t > simp2tradrepeatedchar.t
	$(GREP) -vf simp2tradrepeatedchar.t < tt > $@
	cat s2t_1to1.t >> $@
	$(GREP) -vf simp2trad_noconvert.manual < $@ > tt
	mv tt $@
# Generate a throw-away PHP script: $trad2simp holds the character table,
# $str holds the traditional phrase list, and the script echoes
# strtr($str, $trad2simp).  "$$" is make's escape for a literal "$".
trad2simp.php: trad2simp1to1.t tradphrases.t
	printf '<?php\n$$trad2simp=array(' > $@
	cat $< >> $@
	printf ');\n$$str=\n"' >> $@
	cat $(word 2,$^) >> $@
	printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' >> $@
# Generate a throw-away PHP script: $simp2trad holds the character table,
# $str holds the simplified phrase list, and the script echoes
# strtr($str, $simp2trad).  "$$" is make's escape for a literal "$".
simp2trad.php: simp2trad1to1.t simpphrases.t
	printf '<?php\n$$simp2trad=array(' > $@
	cat $< >> $@
	printf ');\n$$str=\n"' >> $@
	cat $(word 2,$^) >> $@
	printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' >> $@
# Phrase-level simplified->traditional table.  tmp1 holds the output of
# trad2simp.php wrapped as '"..." => ', tmp2 holds the traditional phrases
# wrapped as '"...",'; paste joins them line by line.
simp2trad.phrases.t: trad2simp.php tradphrases.t
	php -f $< | $(SED) 's/\(.*\)/"\1" => /' > tmp1
	$(SED) 's/\(.*\)/"\1",/' < $(word 2,$^) > tmp2
	paste tmp1 tmp2 > $@
# Phrase-level traditional->simplified table.  tmp1 holds the output of
# simp2trad.php wrapped as '"..." => ', tmp2 holds the simplified phrases
# wrapped as '"...",'; paste joins them line by line.
trad2simp.phrases.t: simp2trad.php simpphrases.t
	php -f $< | $(SED) 's/\(.*\)/"\1" => /' > tmp1
	$(SED) 's/\(.*\)/"\1",/' < $(word 2,$^) > tmp2
	paste tmp1 tmp2 > $@
# Dictionary for conversion to Simplified Chinese.  The two generated tables
# lose their commas, spaces and tabs and get "=>" replaced by a tab; the
# entries of toSimp.manual lose their spaces and get both tab-separated
# fields wrapped in double quotes.
toHans.dict: trad2simp1to1.t trad2simp.phrases.t toSimp.manual
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' < trad2simp1to1.t > $@
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' < trad2simp.phrases.t >> $@
	$(SED) -e 's/ //g' -e 's/\(^.*\)\t\(.*\)/"\1"\t"\2"/' < toSimp.manual >> $@
# Dictionary for conversion to Traditional Chinese.  The two generated tables
# lose their commas, spaces and tabs and get "=>" replaced by a tab; the
# entries of toTrad.manual lose their spaces and get both tab-separated
# fields wrapped in double quotes.
toHant.dict: simp2trad1to1.t simp2trad.phrases.t toTrad.manual
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' < simp2trad1to1.t > $@
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' < simp2trad.phrases.t >> $@
	$(SED) -e 's/ //g' -e 's/\(^.*\)\t\(.*\)/"\1"\t"\2"/' < toTrad.manual >> $@
# Regional dictionaries (TW, HK, CN, SG).  Each toXX.dict is made from the
# matching toXX.manual alone: spaces are removed and the two tab-separated
# fields are wrapped in double quotes.  A static pattern rule covers the four
# targets; it is limited to the names listed, so toHans.dict and toHant.dict
# are not affected.
toTW.dict toHK.dict toCN.dict toSG.dict: to%.dict: to%.manual
	cat $< | $(SED) 's/ //g' | $(SED) 's/\(^.*\)\t\(.*\)/"\1"\t"\2"/' > $@
# Assemble the final PHP file: a header comment followed by six arrays.
#   $zh2Hant / $zh2Hans : character table, blank line, phrase table, then the
#                         manual table rewritten as "key" => "value", lines
#   $zh2TW, $zh2HK, $zh2CN, $zh2SG : the respective manual table only
# "$$" is make's escape for a literal "$".  The last array is closed by a
# blank line and ');' without a trailing newline, unlike the other five.
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t toSimp.manual toTrad.manual toCN.manual toHK.manual toSG.manual toTW.manual
	printf '<?php\n/**\n * Simplified / Traditional Chinese conversion tables\n' > $@
	printf ' *\n * Automatically generated using code and data in includes/zhtable/\n' >> $@
	printf ' * Do not modify directly!\n */\n\n' >> $@
	printf '$$zh2Hant = array(\n' >> $@
	cat simp2trad1to1.t >> $@
	echo >> $@
	cat simp2trad.phrases.t >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toTrad.manual >> $@
	echo ');' >> $@
	echo >> $@
	printf '$$zh2Hans = array(\n' >> $@
	cat trad2simp1to1.t >> $@
	echo >> $@
	cat trad2simp.phrases.t >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSimp.manual >> $@
	echo ');' >> $@
	echo >> $@
	printf '$$zh2TW = array(\n' >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toTW.manual >> $@
	echo ');' >> $@
	echo >> $@
	printf '$$zh2HK = array(\n' >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toHK.manual >> $@
	echo ');' >> $@
	echo >> $@
	printf '$$zh2CN = array(\n' >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toCN.manual >> $@
	echo ');' >> $@
	echo >> $@
	printf '$$zh2SG = array(\n' >> $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSG.manual >> $@
	echo >> $@
	printf ');' >> $@
# Housekeeping targets.  None of them names a real file, so they are declared
# phony.  cleantmp and cleandl were listed as prerequisites of clean but had
# no rule headers of their own; they are spelled out here.
.PHONY: clean cleantmp cleandl

clean: cleantmp cleandl

# Stuff unpacked from the files fetched by wget (the targets of the
# extraction rules in this file), then temporary files and other trash.
cleantmp:
	rm -f Unihan.txt EZ.txt.in Wubi.txt.in Ziranma.txt.in phrase_lib.txt tsi.src
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t3 *.t trad2simp.php simp2trad.php *.dict printutf8 *~ \
	simpphrases.notsure tradphrases.notsure wordlist

# The downloaded archives themselves.
cleandl:
	rm -f Unihan.zip \
	scim-tables-$(SCIM_TABLES_VER).tar.gz \
	scim-pinyin-$(SCIM_PINYIN_VER).tar.gz \
	libtabe-$(LIBTABE_VER).tgz