# -*- coding: utf-8 -*-
import os, re, shutil, sys, platform
import tarfile
import zipfile

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # narrow Python 2 build: emit astral characters as UTF-16
        # surrogate pairs
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            return _unichr(0xD7C0 + (i >> 10)) + \
                   _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr

def unichr2(*args):
    """ Convert hex strings like 'U+4E00<tag' to characters. """
    return [unichr(int(i.split('<')[0][2:], 16)) for i in args]

def unichr3(*args):
    """ Convert hex strings like 'U+4E00' to characters, skipping blanks. """
    return [unichr(int(i[2:7], 16)) for i in args if i[2:7]]
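
# Illustration (not in the upstream script): sample conversions, assuming
# Unihan-style fields such as 'U+4E00<kHanYu' and manual-table fields such
# as 'U+4E00'.
#   unichr2('U+4E00<kHanYu')  -> ['一']  (hex code point before any '<' tag)
#   unichr3('U+4E00', '')     -> ['一']  (empty fields are skipped)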

UNIHAN_VER = '6.3.0'  # assumed value; set to the Unihan release you target
SF_MIRROR = 'dfn'  # assumed value; any SourceForge mirror prefix
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'  # assumed value; the libtabe release to fetch

def download(url, dest):
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        # we use wget instead of urlretrieve under Linux,
        # because wget can display details such as download progress
        os.system('wget %s -O %s' % (url, dest))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
        print('Download complete.\n')
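
# Note: download() treats any existing file as current; it never checks
# timestamps or sizes. Delete the local copy to force a fresh download.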

def uncompress(fp, member, encoding='U8'):
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    if pyversion[:1] in ['2']:
        fc = open(name, 'rb', encoding, 'ignore')
    else:
        fc = open(name, 'r', encoding=encoding, errors='ignore')
    return fc

unzip = lambda path, member, encoding='U8': \
    uncompress(zipfile.ZipFile(path), member, encoding)

untargz = lambda path, member, encoding='U8': \
    uncompress(tarfile.open(path, 'r:gz'), member, encoding)
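
# Usage sketch (hypothetical member path): both helpers extract a single
# archive member into the current directory, remove the extracted top-level
# directory if there was one, and return the file opened for reading, e.g.
#   fp = untargz('scim-tables-0.5.13.tar.gz',
#                'scim-tables-0.5.13/tables/zh/Wubi.txt.in')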

def parserCore(fp, pos, beginmark=None, endmark=None):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            start = True
            continue
        elif endmark and line.startswith(endmark):
            break
        if start and not line.startswith('#'):
            elems = line.split()
            if len(elems) < 2:
                continue
            elif len(elems[0]) > 1 and len(elems[pos]) > 1:  # words only
                mlist.add(elems[pos])
    return mlist
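
# parserCore() returns the set of column `pos` entries, skipping comment
# lines, one-character keys, and one-character values; with beginmark and
# endmark set, it only scans the region between those markers.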

def tablesParser(path, name):
    """ Read a file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')

ezbigParser = lambda path: tablesParser(path, 'EZ-Big.txt.in')
wubiParser = lambda path: tablesParser(path, 'Wubi.txt.in')
zrmParser = lambda path: tablesParser(path, 'Ziranma.txt.in')

def phraseParser(path):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 0)
125 """ Read tsi.src and parse it. """
126 src
= 'libtabe/tsi-src/tsi.src'
127 fp
= untargz(path
, src
, 'big5hkscs')
128 return parserCore(fp
, 0)

def unihanParser(path):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith('#'):
            continue
        elems = line.split()
        if len(elems) < 3:
            continue
        type = elems.pop(1)
        elems = unichr2(*elems)
        if type == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif type == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return (t2s, s2t)
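
# Note the direction: a kTraditionalVariant entry lists the traditional
# variants of its (simplified) head character, so it populates s2t;
# kSimplifiedVariant entries populate t2s the same way.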

def applyExcludes(mlist, path):
    """ Apply exclude rules from path to mlist. """
    if pyversion[:1] in ['2']:
        excludes = open(path, 'rb', 'U8').read().split()
    else:
        excludes = open(path, 'r', encoding='U8').read().split()
    excludes = [word.split('#')[0].strip() for word in excludes]
    excludes = '|'.join(excludes)
    excptn = re.compile('.*(?:%s).*' % excludes)
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist

def charManualTable(path):
    fp = open(path, 'r', encoding='U8')
    for line in fp:
        elems = line.split('#')[0].split('|')
        elems = unichr3(*elems)
        if len(elems) > 1:
            yield elems[0], elems[1:]

def toManyRules(src_table):
    """ Collect characters that have more than one conversion target. """
    tomany = set()
    if pyversion[:1] in ['2']:
        for (f, t) in src_table.iteritems():
            for i in range(1, len(t)):
                tomany.add(t[i])
    else:
        for (f, t) in src_table.items():
            for i in range(1, len(t)):
                tomany.add(t[i])
    return tomany

def removeRules(path, table):
    fp = open(path, 'r', encoding='U8')
    texc = list()
    for line in fp:
        elems = line.split('=>')
        f = t = elems[0].strip()
        if len(elems) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            table.pop(f, None)
        if t:
            texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    if pyversion[:1] in ['2']:
        for (tmp_f, tmp_t) in table.copy().iteritems():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    else:
        for (tmp_f, tmp_t) in table.copy().items():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    return table

def customRules(path):
    fp = open(path, 'r', encoding='U8')
    ret = dict()
    for line in fp:
        line = line.rstrip('\r\n')
        if '#' in line:
            line = line.split('#')[0].rstrip()
        elems = line.split('\t')
        if len(elems) > 1:
            ret[elems[0]] = elems[1]
    return ret

def dictToSortedList(src_table, pos):
    return sorted(src_table.items(), key=lambda m: (m[pos], m[1 - pos]))
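
# Sorting on m[pos] first: pos=0 orders by source (used for the 1-to-1
# character tables below), pos=1 orders by target (used for the word tables).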

def translate(text, conv_table):
    i = 0
    while i < len(text):
        # try the longest possible match at position i first
        for j in range(len(text) - i, 0, -1):
            f = text[i:][:j]
            t = conv_table.get(f)
            if t:
                # replace the match, then continue after the replacement
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text
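
# Illustration (hypothetical table, not upstream data): translate() is a
# greedy longest-match scanner. With conv_table = {'AB': 'X', 'A': 'Y'},
# translate('ABA', conv_table) matches 'AB' -> 'X' before 'A' -> 'Y' and
# returns 'XY'; after each hit it skips past the replacement text.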

def manualWordsTable(path, conv_table, reconv_table):
    fp = open(path, 'r', encoding='U8')
    reconv_table = {}
    wordlist = [line.split('#')[0].strip() for line in fp]
    wordlist = list(set(wordlist))
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
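
# Roughly: each manual phrase maps back to itself, both from its converted
# spelling and (when a blind reverse conversion would corrupt it) from its
# original spelling, so the phrase survives round-trip conversion intact.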

def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    wordlist = list(src_wordlist)
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    while wordlist:
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        while new_word_len == word_len:
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if not reconv_table.get(new_word) and \
               (test_word != word or
                (tomanyptn.search(word) and
                 word != translate(new_word, reconv_table))):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table
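
# Roughly: a word rule is emitted only if it adds information, i.e. its
# converted form is not already claimed in reconv_table and either the
# existing tables alone would mis-reconvert the word, or the word contains
# a one-to-many character and fails the round-trip test.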

def PHPArray(table):
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)
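
# Example output line (hypothetical pair): PHPArray([('萬', '万')]) yields
#   '萬' => '万',
# Pairs with an empty source or target are dropped.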

def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t)
                       in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()])
    else:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the superset tables for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word-to-word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word-to-word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # generate the default word-to-word tables
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # final sorted lists: character rules first, then word rules
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems()
                         if f != t])
    else:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + \
             dictToSortedList(t2s_word2word, 1)

    if pyversion[:1] in ['2']:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems()
                         if f != t])
    else:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + \
             dictToSortedList(s2t_word2word, 1)

    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    toTW = dictToSortedList(customRules('toTW.manual'), 1)
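
    # Note: the regional variants (toCN, toHK, toTW) come purely from manual
    # rule files; only toHans/toHant combine Unihan-derived character tables
    # with the generated word tables.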

    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray(toHant) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray(toHans) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray(toTW) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray(toHK) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray(toCN) \
        + '\n);\n'

    if pyversion[:1] in ['2']:
        f = open(os.path.join('..', '..', '..', 'includes',
                              'ZhConversion.php'), 'wb', encoding='utf8')
    else:
        f = open(os.path.join('..', '..', '..', 'includes',
                              'ZhConversion.php'), 'w', buffering=4096,
                 encoding='utf8')
    print('Writing ZhConversion.php ...')
    f.write(php)
    f.close()

    # Remove temporary files
    print('Deleting temporary files ...')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')

if __name__ == '__main__':
    main()