2 # -*- coding: utf-8 -*-
12 pyversion
= platform
.python_version()
13 islinux
= platform
.system().lower() == 'linux'
15 if pyversion
[:3] in ['2.6', '2.7']:
16 import urllib
as urllib_request
20 if sys
.maxunicode
< 0x10000:
25 return _unichr(0xD7C0 + (i
>> 10)) + _unichr(0xDC00 + (i
& 0x3FF))
26 elif pyversion
[:2] == '3.':
27 import urllib
.request
as urllib_request
32 return [unichr(int(i
.split('<')[0][2:], 16)) for i
in args
]
36 return [unichr(int(i
[2:7], 16)) for i
in args
if i
[2:7]]
41 SCIM_TABLES_VER
= '0.5.13'
42 SCIM_PINYIN_VER
= '0.5.92'
47 def download(url
, dest
):
48 if os
.path
.isfile(dest
):
49 print('File %s is up to date.' % dest
)
53 # we use wget instead of urlretrieve under Linux,
54 # because wget can display details like download progress
55 os
.system('wget %s -O %s' % (url
, dest
))
57 print('Downloading from [%s] ...' % url
)
58 urllib_request
.urlretrieve(url
, dest
)
59 print('Download complete.\n')
63 def uncompress(fp
, member
, encoding
='U8'):
64 name
= member
.rsplit('/', 1)[-1]
65 print('Extracting %s ...' % name
)
67 shutil
.move(member
, name
)
69 shutil
.rmtree(member
.split('/', 1)[0])
70 if pyversion
[:1] in ['2']:
71 fc
= open(name
, 'rb', encoding
, 'ignore')
73 fc
= open(name
, 'r', encoding
=encoding
, errors
='ignore')
def unzip(path, member, encoding='U8'):
    """Open the zip archive at ``path`` and hand it to ``uncompress``.

    ``member`` and ``encoding`` are forwarded unchanged; the value returned
    by ``uncompress`` is returned to the caller.
    """
    archive = zipfile.ZipFile(path)
    return uncompress(archive, member, encoding)
def untargz(path, member, encoding='U8'):
    """Open the gzip-compressed tarball at ``path`` and hand it to ``uncompress``.

    ``member`` and ``encoding`` are forwarded unchanged; the value returned
    by ``uncompress`` is returned to the caller.
    """
    archive = tarfile.open(path, 'r:gz')
    return uncompress(archive, member, encoding)
83 def parserCore(fp
, pos
, beginmark
=None, endmark
=None):
84 if beginmark
and endmark
:
90 if beginmark
and line
.startswith(beginmark
):
93 elif endmark
and line
.startswith(endmark
):
95 if start
and not line
.startswith('#'):
99 elif len(elems
[0]) > 1 and len(elems
[pos
]) > 1: # words only
100 mlist
.add(elems
[pos
])
def tablesParser(path, name):
    """Extract one table file from the scim-tables tarball and parse it.

    ``path`` is the tarball passed to ``untargz``; ``name`` is the file name
    below ``tables/zh/`` in the ``scim-tables-<SCIM_TABLES_VER>`` directory.
    Returns whatever ``parserCore`` produces with ``pos=1`` and the
    ``BEGIN_TABLE`` / ``END_TABLE`` markers.
    """
    # SCIM_TABLES_VER is only read here, so no ``global`` declaration is needed.
    member = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    table_file = untargz(path, member, 'U8')
    return parserCore(
        table_file,
        pos=1,
        beginmark='BEGIN_TABLE',
        endmark='END_TABLE',
    )
def ezbigParser(path):
    """Parse ``EZ-Big.txt.in`` from the scim-tables tarball at ``path``."""
    return tablesParser(path, 'EZ-Big.txt.in')
def wubiParser(path):
    """Parse ``Wubi.txt.in`` from the scim-tables tarball at ``path``."""
    return tablesParser(path, 'Wubi.txt.in')
def zrmParser(path):
    """Parse ``Ziranma.txt.in`` from the scim-tables tarball at ``path``."""
    return tablesParser(path, 'Ziranma.txt.in')
def phraseParser(path):
    """Extract ``phrase_lib.txt`` from the scim-pinyin tarball and parse it.

    ``path`` is the tarball passed to ``untargz``.  Returns whatever
    ``parserCore`` produces with ``pos=0`` and no begin/end markers.
    """
    # SCIM_PINYIN_VER is only read here, so no ``global`` declaration is needed.
    member = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    phrase_file = untargz(path, member, 'U8')
    return parserCore(phrase_file, pos=0)
125 """ Read tsi.src and parse it. """
126 src
= 'libtabe/tsi-src/tsi.src'
127 fp
= untargz(path
, src
, 'big5hkscs')
128 return parserCore(fp
, 0)
131 def unihanParser(path
):
132 """ Read Unihan_Variants.txt and parse it. """
133 fp
= unzip(path
, 'Unihan_Variants.txt', 'U8')
137 if line
.startswith('#'):
144 elems
= unichr2(*elems
)
145 if type == 'kTraditionalVariant':
146 s2t
[elems
[0]] = elems
[1:]
147 elif type == 'kSimplifiedVariant':
148 t2s
[elems
[0]] = elems
[1:]
153 def applyExcludes(mlist
, path
):
154 """ Apply exclude rules from path to mlist. """
155 if pyversion
[:1] in ['2']:
156 excludes
= open(path
, 'rb', 'U8').read().split()
158 excludes
= open(path
, 'r', encoding
='U8').read().split()
159 excludes
= [word
.split('#')[0].strip() for word
in excludes
]
160 excludes
= '|'.join(excludes
)
161 excptn
= re
.compile('.*(?:%s).*' % excludes
)
162 diff
= [mword
for mword
in mlist
if excptn
.search(mword
)]
163 mlist
.difference_update(diff
)
167 def charManualTable(path
):
168 fp
= open(path
, 'r', encoding
='U8')
170 elems
= line
.split('#')[0].split('|')
171 elems
= unichr3(*elems
)
173 yield elems
[0], elems
[1:]
176 def toManyRules(src_table
):
178 if pyversion
[:1] in ['2']:
179 for (f
, t
) in src_table
.iteritems():
180 for i
in range(1, len(t
)):
183 for (f
, t
) in src_table
.items():
184 for i
in range(1, len(t
)):
189 def removeRules(path
, table
):
190 fp
= open(path
, 'r', encoding
='U8')
193 elems
= line
.split('=>')
194 f
= t
= elems
[0].strip()
197 f
= f
.strip('"').strip("'")
198 t
= t
.strip('"').strip("'")
206 texcptn
= re
.compile('^(?:%s)$' % '|'.join(texc
))
207 if pyversion
[:1] in ['2']:
208 for (tmp_f
, tmp_t
) in table
.copy().iteritems():
209 if texcptn
.match(tmp_t
):
212 for (tmp_f
, tmp_t
) in table
.copy().items():
213 if texcptn
.match(tmp_t
):
218 def customRules(path
):
219 fp
= open(path
, 'r', encoding
='U8')
222 line
= line
.rstrip('\r\n')
224 line
= line
.split('#')[0].rstrip()
225 elems
= line
.split('\t')
227 ret
[elems
[0]] = elems
[1]
def dictToSortedList(src_table, pos):
    """Return the items of ``src_table`` as a sorted list of (key, value) pairs.

    ``pos`` selects the primary sort field: 0 orders by key and then value,
    1 orders by value and then key.
    """
    secondary = 1 - pos

    def sort_key(pair):
        return pair[pos], pair[secondary]

    pairs = list(src_table.items())
    pairs.sort(key=sort_key)
    return pairs
235 def translate(text
, conv_table
):
238 for j
in range(len(text
) - i
, 0, -1):
240 t
= conv_table
.get(f
)
242 text
= text
[:i
] + t
+ text
[i
:][j
:]
249 def manualWordsTable(path
, conv_table
, reconv_table
):
250 fp
= open(path
, 'r', encoding
='U8')
251 reconv_table
= reconv_table
.copy()
253 wordlist
= [line
.split('#')[0].strip() for line
in fp
]
254 wordlist
= list(set(wordlist
))
255 wordlist
.sort(key
=lambda w
: (len(w
), w
), reverse
=True)
257 word
= wordlist
.pop()
258 new_word
= translate(word
, conv_table
)
259 rcv_word
= translate(word
, reconv_table
)
261 reconv_table
[word
] = out_table
[word
] = word
262 reconv_table
[new_word
] = out_table
[new_word
] = word
266 def defaultWordsTable(src_wordlist
, src_tomany
, char_conv_table
,
268 wordlist
= list(src_wordlist
)
269 wordlist
.sort(key
=lambda w
: (len(w
), w
), reverse
=True)
271 word_reconv_table
= {}
272 conv_table
= char_conv_table
.copy()
273 reconv_table
= char_reconv_table
.copy()
274 tomanyptn
= re
.compile('(?:%s)' % '|'.join(src_tomany
))
276 conv_table
.update(word_conv_table
)
277 reconv_table
.update(word_reconv_table
)
278 word
= wordlist
.pop()
279 new_word_len
= word_len
= len(word
)
280 while new_word_len
== word_len
:
281 test_word
= translate(word
, reconv_table
)
282 new_word
= translate(word
, conv_table
)
283 if not reconv_table
.get(new_word
) and \
284 (test_word
!= word
or
285 (tomanyptn
.search(word
) and
286 word
!= translate(new_word
, reconv_table
))):
287 word_conv_table
[word
] = new_word
288 word_reconv_table
[new_word
] = word
290 word
= wordlist
.pop()
293 new_word_len
= len(word
)
294 return word_reconv_table
298 lines
= ['\'%s\' => \'%s\',' % (f
, t
) for (f
, t
) in table
if f
and t
]
299 return '\n'.join(lines
)
304 url
= 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
305 han_dest
= 'Unihan-%s.zip' % UNIHAN_VER
306 download(url
, han_dest
)
308 sfurlbase
= 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR
310 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
311 url
= sfurlbase
+ 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
312 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
313 download(url
, tbe_dest
)
315 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
316 url
= sfurlbase
+ 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
317 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
318 download(url
, pyn_dest
)
320 # Get libtabe-$(LIBTABE_VER).tgz:
321 url
= sfurlbase
+ 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
322 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
323 download(url
, lbt_dest
)
326 (t2s_1tomany
, s2t_1tomany
) = unihanParser(han_dest
)
328 t2s_1tomany
.update(charManualTable('symme_supp.manual'))
329 t2s_1tomany
.update(charManualTable('trad2simp.manual'))
330 s2t_1tomany
.update((t
[0], [f
]) for (f
, t
) in charManualTable('symme_supp.manual'))
331 s2t_1tomany
.update(charManualTable('simp2trad.manual'))
333 if pyversion
[:1] in ['2']:
334 t2s_1to1
= dict([(f
, t
[0]) for (f
, t
) in t2s_1tomany
.iteritems()])
335 s2t_1to1
= dict([(f
, t
[0]) for (f
, t
) in s2t_1tomany
.iteritems()])
337 t2s_1to1
= dict([(f
, t
[0]) for (f
, t
) in t2s_1tomany
.items()])
338 s2t_1to1
= dict([(f
, t
[0]) for (f
, t
) in s2t_1tomany
.items()])
340 s_tomany
= toManyRules(t2s_1tomany
)
341 t_tomany
= toManyRules(s2t_1tomany
)
344 t2s_1to1
= removeRules('trad2simp_noconvert.manual', t2s_1to1
)
345 s2t_1to1
= removeRules('simp2trad_noconvert.manual', s2t_1to1
)
347 # the superset for word to word conversion
348 t2s_1to1_supp
= t2s_1to1
.copy()
349 s2t_1to1_supp
= s2t_1to1
.copy()
350 t2s_1to1_supp
.update(customRules('trad2simp_supp_set.manual'))
351 s2t_1to1_supp
.update(customRules('simp2trad_supp_set.manual'))
353 # word to word manual rules
354 t2s_word2word_manual
= manualWordsTable('simpphrases.manual',
355 s2t_1to1_supp
, t2s_1to1_supp
)
356 t2s_word2word_manual
.update(customRules('toSimp.manual'))
357 s2t_word2word_manual
= manualWordsTable('tradphrases.manual',
358 t2s_1to1_supp
, s2t_1to1_supp
)
359 s2t_word2word_manual
.update(customRules('toTrad.manual'))
361 # word to word rules from input methods
364 t_wordlist
.update(ezbigParser(tbe_dest
),
366 s_wordlist
.update(wubiParser(tbe_dest
),
368 phraseParser(pyn_dest
))
371 s_wordlist
= applyExcludes(s_wordlist
, 'simpphrases_exclude.manual')
372 t_wordlist
= applyExcludes(t_wordlist
, 'tradphrases_exclude.manual')
374 s2t_supp
= s2t_1to1_supp
.copy()
375 s2t_supp
.update(s2t_word2word_manual
)
376 t2s_supp
= t2s_1to1_supp
.copy()
377 t2s_supp
.update(t2s_word2word_manual
)
380 t2s_word2word
= defaultWordsTable(s_wordlist
, s_tomany
,
381 s2t_1to1_supp
, t2s_supp
)
382 t2s_word2word
.update(t2s_word2word_manual
)
383 s2t_word2word
= defaultWordsTable(t_wordlist
, t_tomany
,
384 t2s_1to1_supp
, s2t_supp
)
385 s2t_word2word
.update(s2t_word2word_manual
)
389 if pyversion
[:1] in ['2']:
390 t2s_1to1
= dict([(f
, t
) for (f
, t
) in t2s_1to1
.iteritems() if f
!= t
])
392 t2s_1to1
= dict([(f
, t
) for (f
, t
) in t2s_1to1
.items() if f
!= t
])
393 toHans
= dictToSortedList(t2s_1to1
, 0) + dictToSortedList(t2s_word2word
, 1)
395 if pyversion
[:1] in ['2']:
396 s2t_1to1
= dict([(f
, t
) for (f
, t
) in s2t_1to1
.iteritems() if f
!= t
])
398 s2t_1to1
= dict([(f
, t
) for (f
, t
) in s2t_1to1
.items() if f
!= t
])
399 toHant
= dictToSortedList(s2t_1to1
, 0) + dictToSortedList(s2t_word2word
, 1)
401 toCN
= dictToSortedList(customRules('toCN.manual'), 1)
403 toHK
= dictToSortedList(customRules('toHK.manual'), 1)
405 toTW
= dictToSortedList(customRules('toTW.manual'), 1)
410 * Simplified / Traditional Chinese conversion tables
412 * Automatically generated using code and data in maintenance/language/zhtable/
413 * Do not modify directly!
418 namespace MediaWiki\Languages\Data;
421 public static $zh2Hant = [\n'''
422 php
+= PHPArray(toHant
) \
423 + '\n];\n\npublic static $zh2Hans = [\n' \
425 + '\n];\n\npublic static $zh2TW = [\n' \
427 + '\n];\n\npublic static $zh2HK = [\n' \
429 + '\n];\n\npublic static $zh2CN = [\n' \
433 if pyversion
[:1] in ['2']:
434 f
= open(os
.path
.join('..', '..', '..', 'languages', 'data', 'ZhConversion.php'), 'wb', encoding
='utf8')
436 f
= open(os
.path
.join('..', '..', '..', 'languages', 'data', 'ZhConversion.php'), 'w', buffering
=4096, encoding
='utf8')
437 print ('Writing ZhConversion.php ... ')
441 # Remove temporary files
442 print ('Deleting temporary files ... ')
443 os
.remove('EZ-Big.txt.in')
444 os
.remove('phrase_lib.txt')
446 os
.remove('Unihan_Variants.txt')
447 os
.remove('Wubi.txt.in')
448 os
.remove('Ziranma.txt.in')
451 if __name__
== '__main__':