Merge "Remove not used private member variable mParserWarnings from OutputPage"
[mediawiki.git] / maintenance / language / zhtable / Makefile.py
blob5924c66270e294aaa2cbffcb601ab681f9afc369
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @author Philip
4 import os
5 import platform
6 import re
7 import shutil
8 import sys
9 import tarfile
10 import zipfile
# Runtime environment detection used throughout the script.
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    # Python 2: alias urllib under the Python 3 style name so the rest of
    # the code can use urllib_request uniformly, and make open()
    # encoding-aware by rebinding it to codecs.open.
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow Unicode build: the builtin unichr() cannot produce
        # astral-plane characters, so emit a UTF-16 surrogate pair instead.
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    # Python 3: unichr() is gone; chr() already covers all codepoints.
    import urllib.request as urllib_request
    unichr = chr
def unichr2(*args):
    """Convert codepoint specs like ``U+4E00`` (optionally annotated with a
    trailing ``<...`` tag) into their corresponding characters."""
    return [unichr(int(spec.split('<')[0][2:], 16)) for spec in args]
def unichr3(*args):
    """Convert codepoint specs into characters using exactly the five hex
    digits at positions 2-6, skipping entries too short to carry them."""
    return [unichr(int(spec[2:7], 16)) for spec in args if spec[2:7]]
# DEFINE
UNIHAN_VER = '6.3.0'        # Unicode Unihan database release to download
SF_MIRROR = 'dfn'           # SourceForge download mirror prefix
SCIM_TABLES_VER = '0.5.13'  # scim-tables release (EZ-Big/Wubi/Ziranma data)
SCIM_PINYIN_VER = '0.5.92'  # scim-pinyin release (phrase_lib.txt)
LIBTABE_VER = '0.2.3'       # libtabe release (tsi.src)
# END OF DEFINE
def download(url, dest):
    """Fetch *url* into the local file *dest*.

    If *dest* already exists the download is skipped entirely.  Returns
    None in all cases.
    """
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    # NOTE: `global islinux` was declared here, but it is unnecessary for a
    # read-only module-level lookup and has been removed.
    if islinux:
        # We use wget instead of urlretrieve under Linux, because wget can
        # display details like download progress.
        os.system('wget %s -O %s' % (url, dest))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')
def uncompress(fp, member, encoding='U8'):
    """Extract *member* from the open archive *fp* into the current
    directory, flatten it to its basename, and return it opened for
    reading as text in *encoding* (decode errors ignored)."""
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        # Drop the now-empty directory tree the archive created.
        shutil.rmtree(member.split('/', 1)[0])
    if pyversion[:1] in ['2']:
        return open(name, 'rb', encoding, 'ignore')
    return open(name, 'r', encoding=encoding, errors='ignore')
def unzip(path, member, encoding='U8'):
    """Extract *member* from the zip archive at *path* and return an open
    text handle (PEP 8: def instead of a lambda assignment)."""
    return uncompress(zipfile.ZipFile(path), member, encoding)


def untargz(path, member, encoding='U8'):
    """Extract *member* from the gzipped tarball at *path* and return an
    open text handle."""
    return uncompress(tarfile.open(path, 'r:gz'), member, encoding)
def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect column *pos* of every multi-character data row in *fp*.

    When both *beginmark* and *endmark* are given, only rows between those
    marker lines are considered.  Lines starting with '#' are skipped, and
    an entry is kept only when both the first column and column *pos* are
    longer than one character (i.e. words, not single characters).
    """
    collecting = not (beginmark and endmark)
    words = set()
    for raw in fp:
        if beginmark and raw.startswith(beginmark):
            collecting = True
            continue
        if endmark and raw.startswith(endmark):
            break
        if not collecting or raw.startswith('#'):
            continue
        fields = raw.split()
        if len(fields) >= 2 and len(fields[0]) > 1 and len(fields[pos]) > 1:
            words.add(fields[pos])
    return words
def tablesParser(path, name):
    """Read the table file *name* from a scim-tables tarball at *path* and
    return the set of words it defines.

    The needless ``global SCIM_TABLES_VER`` declaration was removed: a
    read-only module-level lookup does not require it.
    """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
def ezbigParser(path):
    """Parse the EZ-Big input-method table (PEP 8: def, not lambda)."""
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """Parse the Wubi input-method table."""
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """Parse the Ziranma input-method table."""
    return tablesParser(path, 'Ziranma.txt.in')
def phraseParser(path):
    """Read phrase_lib.txt from a scim-pinyin tarball at *path* and return
    the set of phrases it defines.

    The needless ``global SCIM_PINYIN_VER`` declaration was removed: a
    read-only module-level lookup does not require it.
    """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 0)
def tsiParser(path):
    """Read tsi.src (Big5-HKSCS encoded) from the libtabe tarball at *path*
    and return the set of words it defines."""
    handle = untargz(path, 'libtabe/tsi-src/tsi.src', 'big5hkscs')
    return parserCore(handle, 0)
def unihanParser(path):
    """Read Unihan_Variants.txt from the Unihan zip at *path*.

    Returns a ``(t2s, s2t)`` pair of dicts mapping a character to the list
    of its variant characters, built from the kSimplifiedVariant and
    kTraditionalVariant fields respectively.

    Fixes: the local variable previously shadowed the builtin ``type``;
    the redundant ``else`` after ``continue`` was dropped.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith('#'):
            continue
        elems = line.split()
        if len(elems) < 3:
            continue
        # Field layout: codepoint, variant kind, one or more variant codepoints.
        vtype = elems.pop(1)
        elems = unichr2(*elems)
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return (t2s, s2t)
def applyExcludes(mlist, path):
    """Remove from the set *mlist* (in place) every word matching one of
    the exclude patterns read from *path*; return *mlist*."""
    if pyversion[:1] in ['2']:
        raw = open(path, 'rb', 'U8').read().split()
    else:
        raw = open(path, 'r', encoding='U8').read().split()
    patterns = [token.split('#')[0].strip() for token in raw]
    # A word is excluded when it contains any pattern as a substring.
    excptn = re.compile('.*(?:%s).*' % '|'.join(patterns))
    matched = [word for word in mlist if excptn.search(word)]
    mlist.difference_update(matched)
    return mlist
def charManualTable(path):
    """Yield ``(char, variants)`` pairs parsed from the manual table at
    *path*; lines hold '|'-separated codepoint specs, '#' starts a comment."""
    handle = open(path, 'r', encoding='U8')
    for raw in handle:
        fields = unichr3(*raw.split('#')[0].split('|'))
        if len(fields) > 1:
            yield fields[0], fields[1:]
def toManyRules(src_table):
    """Return the set of all secondary variants in *src_table*.

    *src_table* maps a character to a list of variants; every variant
    after the first marks a one-to-many conversion target.

    The duplicated Python 2/3 branches (iteritems vs items) collapsed into
    one loop: ``dict.values()`` behaves identically on both, and the key
    was never used.
    """
    tomany = set()
    for variants in src_table.values():
        tomany.update(variants[1:])
    return tomany
def removeRules(path, table):
    """Apply the no-convert rule file at *path* to *table* (in place).

    Each line has the form ``"from" => "to"`` (or just ``"from"``): the
    ``from`` key is dropped from *table*, and afterwards every entry whose
    value exactly matches any collected ``to`` word is dropped as well.
    Returns the mutated *table*.

    Fixes: the bare ``except:`` now catches only the expected ``KeyError``;
    the duplicated Python 2/3 branches are unified via ``.copy().items()``,
    which is safe to iterate while mutating on both versions.
    """
    fp = open(path, 'r', encoding='U8')
    texc = list()
    for line in fp:
        elems = line.split('=>')
        f = t = elems[0].strip()
        if len(elems) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop(f)
            except KeyError:
                # The rule may name a key that is not present; ignore.
                pass
        if t:
            texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    for (key, value) in table.copy().items():
        if texcptn.match(value):
            table.pop(key)
    return table
def customRules(path):
    """Load tab-separated source→target pairs from the manual rule file at
    *path*; '#' starts a comment, lines without a tab are ignored."""
    rules = dict()
    handle = open(path, 'r', encoding='U8')
    for raw in handle:
        raw = raw.rstrip('\r\n')
        if '#' in raw:
            raw = raw.split('#')[0].rstrip()
        fields = raw.split('\t')
        if len(fields) > 1:
            rules[fields[0]] = fields[1]
    return rules
def dictToSortedList(src_table, pos):
    """Return the (key, value) pairs of *src_table* sorted primarily by
    element *pos* of each pair, secondarily by the other element."""
    def order(pair):
        return (pair[pos], pair[1 - pos])
    return sorted(src_table.items(), key=order)
def translate(text, conv_table):
    """Convert *text* using *conv_table* with greedy longest-match
    substring replacement, scanning left to right.

    After a replacement, scanning resumes at the last character of the
    inserted text, so a replacement's tail can participate in a further
    match exactly as in the original algorithm.
    """
    i = 0
    while i < len(text):
        # Try the longest candidate starting at i first.
        j = len(text) - i
        while j > 0:
            replacement = conv_table.get(text[i:i + j])
            if replacement:
                text = text[:i] + replacement + text[i + j:]
                i += len(replacement) - 1
                break
            j -= 1
        i += 1
    return text
def manualWordsTable(path, conv_table, reconv_table):
    # Build word-level reverse-conversion rules from the manual phrase list
    # at *path*; *conv_table* performs the forward (character) conversion.
    #
    # NOTE(review): the *reconv_table* parameter is shadowed by the empty
    # dict on the next line, so the caller's argument is never consulted —
    # confirm whether that is intentional before relying on it.
    fp = open(path, 'r', encoding='U8')
    reconv_table = {}
    # Strip comments, deduplicate, then sort longest-first so that pop()
    # yields the shortest remaining word on each iteration.
    wordlist = [line.split('#')[0].strip() for line in fp]
    wordlist = list(set(wordlist))
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            # Round-tripping through the table built so far would change
            # the word, so pin an identity rule for it.
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    # Derive a word-level reverse-conversion table for the words in
    # *src_wordlist*, starting from the character-level tables.  A word
    # earns its own entry only when plain character conversion is ambiguous
    # for it: converting it back does not round-trip, or it contains a
    # character from *src_tomany* (a one-to-many mapping source) and still
    # fails to round-trip after forward conversion.
    wordlist = list(src_wordlist)
    # Longest words first, so that pop() processes shortest words first and
    # their rules are in place before longer words are examined.
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    while wordlist:
        # Fold the rules discovered so far into the working tables before
        # starting on the next batch of equally-long words.
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        # Inner loop: process every word of the current length with the
        # working tables frozen, so same-length words don't affect each other.
        while new_word_len == word_len:
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if not reconv_table.get(new_word) and \
               (test_word != word or
                (tomanyptn.search(word) and
                 word != translate(new_word, reconv_table))):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                # Word list exhausted; leave both loops.
                break
            new_word_len = len(word)
    return word_reconv_table
def PHPArray(table):
    """Render the (from, to) pairs of *table* as the body lines of a PHP
    array literal, one ``'from' => 'to',`` entry per line; pairs with an
    empty side are skipped."""
    entries = ['\'%s\' => \'%s\',' % pair for pair in table if pair[0] and pair[1]]
    return '\n'.join(entries)
def main():
    """Download the upstream data files, build the Simplified/Traditional
    Chinese conversion tables, and write includes/ZhConversion.php."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    # Character-level one-to-many variant tables from the Unihan database.
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    # Layer the hand-maintained character tables on top of Unihan data.
    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # Collapse to one-to-one tables by keeping only the first variant.
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()])
    else:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    # Characters that appear as secondary variants (one-to-many sources).
    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the supper set for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems() if f != t])
    else:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems() if f != t])
    else:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray(toHant) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray(toHans) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray(toTW) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray(toHK) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray(toCN) \
        + '\n);\n'

    # Write the generated PHP into the MediaWiki includes directory,
    # relative to maintenance/language/zhtable/.
    if pyversion[:1] in ['2']:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'wb', encoding='utf8')
    else:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'w', buffering=4096, encoding='utf8')
    print ('Writing ZhConversion.php ... ')
    f.write(php)
    f.close()

    # Remove temporary files
    print ('Deleting temporary files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')
# Run the build only when executed as a script, not on import.
if __name__ == '__main__':
    main()