Merge "Remove not used private member variable mParserWarnings from OutputPage"
[mediawiki.git] / maintenance / language / zhtable / Makefile.py
blob5924c66270e294aaa2cbffcb601ab681f9afc369
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @author Philip
4 import os
5 import platform
6 import re
7 import shutil
8 import sys
9 import tarfile
10 import zipfile
# Runtime environment detection used throughout the script.
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    # Python 2: alias urllib under the Python 3 style name so the rest of
    # the code can use urllib_request uniformly, and make open()
    # encoding-aware by rebinding it to codecs.open.
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow Unicode build: the builtin unichr() cannot produce
        # astral-plane characters, so emit a UTF-16 surrogate pair instead.
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    # Python 3: unichr() is gone; chr() already covers all codepoints.
    import urllib.request as urllib_request
    unichr = chr
def unichr2(*args):
    """Convert codepoint specs like ``U+4E00`` (optionally annotated with a
    trailing ``<...`` tag) into their corresponding characters."""
    return [unichr(int(spec.split('<')[0][2:], 16)) for spec in args]
def unichr3(*args):
    """Convert codepoint specs into characters using exactly the five hex
    digits at positions 2-6, skipping entries too short to carry them."""
    return [unichr(int(spec[2:7], 16)) for spec in args if spec[2:7]]
# DEFINE
UNIHAN_VER = '6.3.0'        # Unicode Unihan database release to download
SF_MIRROR = 'dfn'           # SourceForge download mirror prefix
SCIM_TABLES_VER = '0.5.13'  # scim-tables release (EZ-Big/Wubi/Ziranma data)
SCIM_PINYIN_VER = '0.5.92'  # scim-pinyin release (phrase_lib.txt)
LIBTABE_VER = '0.2.3'       # libtabe release (tsi.src)
# END OF DEFINE
def download(url, dest):
    """Fetch *url* into the local file *dest*.

    If *dest* already exists the download is skipped entirely.  Returns
    None in all cases.
    """
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    # NOTE: `global islinux` was declared here, but it is unnecessary for a
    # read-only module-level lookup and has been removed.
    if islinux:
        # We use wget instead of urlretrieve under Linux, because wget can
        # display details like download progress.
        os.system('wget %s -O %s' % (url, dest))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')
def uncompress(fp, member, encoding='U8'):
    """Extract *member* from the open archive *fp* into the current
    directory, flatten it to its basename, and return it opened for
    reading as text in *encoding* (decode errors ignored)."""
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        # Drop the now-empty directory tree the archive created.
        shutil.rmtree(member.split('/', 1)[0])
    if pyversion[:1] in ['2']:
        return open(name, 'rb', encoding, 'ignore')
    return open(name, 'r', encoding=encoding, errors='ignore')
def unzip(path, member, encoding='U8'):
    """Extract *member* from the zip archive at *path* and return an open
    text handle (PEP 8: def instead of a lambda assignment)."""
    return uncompress(zipfile.ZipFile(path), member, encoding)


def untargz(path, member, encoding='U8'):
    """Extract *member* from the gzipped tarball at *path* and return an
    open text handle."""
    return uncompress(tarfile.open(path, 'r:gz'), member, encoding)
def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect column *pos* of every multi-character data row in *fp*.

    When both *beginmark* and *endmark* are given, only rows between those
    marker lines are considered.  Lines starting with '#' are skipped, and
    an entry is kept only when both the first column and column *pos* are
    longer than one character (i.e. words, not single characters).
    """
    collecting = not (beginmark and endmark)
    words = set()
    for raw in fp:
        if beginmark and raw.startswith(beginmark):
            collecting = True
            continue
        if endmark and raw.startswith(endmark):
            break
        if not collecting or raw.startswith('#'):
            continue
        fields = raw.split()
        if len(fields) >= 2 and len(fields[0]) > 1 and len(fields[pos]) > 1:
            words.add(fields[pos])
    return words
def tablesParser(path, name):
    """Read the table file *name* from a scim-tables tarball at *path* and
    return the set of words it defines.

    The needless ``global SCIM_TABLES_VER`` declaration was removed: a
    read-only module-level lookup does not require it.
    """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
def ezbigParser(path):
    """Parse the EZ-Big input-method table (PEP 8: def, not lambda)."""
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """Parse the Wubi input-method table."""
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """Parse the Ziranma input-method table."""
    return tablesParser(path, 'Ziranma.txt.in')
def phraseParser(path):
    """Read phrase_lib.txt from a scim-pinyin tarball at *path* and return
    the set of phrases it defines.

    The needless ``global SCIM_PINYIN_VER`` declaration was removed: a
    read-only module-level lookup does not require it.
    """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 0)
def tsiParser(path):
    """Read tsi.src (Big5-HKSCS encoded) from the libtabe tarball at *path*
    and return the set of words it defines."""
    handle = untargz(path, 'libtabe/tsi-src/tsi.src', 'big5hkscs')
    return parserCore(handle, 0)
def unihanParser(path):
    """Read Unihan_Variants.txt from the Unihan zip at *path*.

    Returns a ``(t2s, s2t)`` pair of dicts mapping a character to the list
    of its variant characters, built from the kSimplifiedVariant and
    kTraditionalVariant fields respectively.

    Fixes: the local variable previously shadowed the builtin ``type``;
    the redundant ``else`` after ``continue`` was dropped.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith('#'):
            continue
        elems = line.split()
        if len(elems) < 3:
            continue
        # Field layout: codepoint, variant kind, one or more variant codepoints.
        vtype = elems.pop(1)
        elems = unichr2(*elems)
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return (t2s, s2t)
def applyExcludes(mlist, path):
    """Remove from the set *mlist* (in place) every word matching one of
    the exclude patterns read from *path*; return *mlist*."""
    if pyversion[:1] in ['2']:
        raw = open(path, 'rb', 'U8').read().split()
    else:
        raw = open(path, 'r', encoding='U8').read().split()
    patterns = [token.split('#')[0].strip() for token in raw]
    # A word is excluded when it contains any pattern as a substring.
    excptn = re.compile('.*(?:%s).*' % '|'.join(patterns))
    matched = [word for word in mlist if excptn.search(word)]
    mlist.difference_update(matched)
    return mlist
def charManualTable(path):
    """Yield ``(char, variants)`` pairs parsed from the manual table at
    *path*; lines hold '|'-separated codepoint specs, '#' starts a comment."""
    handle = open(path, 'r', encoding='U8')
    for raw in handle:
        fields = unichr3(*raw.split('#')[0].split('|'))
        if len(fields) > 1:
            yield fields[0], fields[1:]
def toManyRules(src_table):
    """Return the set of all secondary variants in *src_table*.

    *src_table* maps a character to a list of variants; every variant
    after the first marks a one-to-many conversion target.

    The duplicated Python 2/3 branches (iteritems vs items) collapsed into
    one loop: ``dict.values()`` behaves identically on both, and the key
    was never used.
    """
    tomany = set()
    for variants in src_table.values():
        tomany.update(variants[1:])
    return tomany
def removeRules(path, table):
    """Apply the no-convert rule file at *path* to *table* (in place).

    Each line has the form ``"from" => "to"`` (or just ``"from"``): the
    ``from`` key is dropped from *table*, and afterwards every entry whose
    value exactly matches any collected ``to`` word is dropped as well.
    Returns the mutated *table*.

    Fixes: the bare ``except:`` now catches only the expected ``KeyError``;
    the duplicated Python 2/3 branches are unified via ``.copy().items()``,
    which is safe to iterate while mutating on both versions.
    """
    fp = open(path, 'r', encoding='U8')
    texc = list()
    for line in fp:
        elems = line.split('=>')
        f = t = elems[0].strip()
        if len(elems) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop(f)
            except KeyError:
                # The rule may name a key that is not present; ignore.
                pass
        if t:
            texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    for (key, value) in table.copy().items():
        if texcptn.match(value):
            table.pop(key)
    return table
def customRules(path):
    """Load tab-separated source→target pairs from the manual rule file at
    *path*; '#' starts a comment, lines without a tab are ignored."""
    rules = dict()
    handle = open(path, 'r', encoding='U8')
    for raw in handle:
        raw = raw.rstrip('\r\n')
        if '#' in raw:
            raw = raw.split('#')[0].rstrip()
        fields = raw.split('\t')
        if len(fields) > 1:
            rules[fields[0]] = fields[1]
    return rules
def dictToSortedList(src_table, pos):
    """Return the (key, value) pairs of *src_table* sorted primarily by
    element *pos* of each pair, secondarily by the other element."""
    def order(pair):
        return (pair[pos], pair[1 - pos])
    return sorted(src_table.items(), key=order)
def translate(text, conv_table):
    """Convert *text* using *conv_table* with greedy longest-match
    substring replacement, scanning left to right.

    After a replacement, scanning resumes at the last character of the
    inserted text, so a replacement's tail can participate in a further
    match exactly as in the original algorithm.
    """
    i = 0
    while i < len(text):
        # Try the longest candidate starting at i first.
        j = len(text) - i
        while j > 0:
            replacement = conv_table.get(text[i:i + j])
            if replacement:
                text = text[:i] + replacement + text[i + j:]
                i += len(replacement) - 1
                break
            j -= 1
        i += 1
    return text
def manualWordsTable(path, conv_table, reconv_table):
    # Build word-level reverse-conversion rules from the manual phrase list
    # at *path*; *conv_table* performs the forward (character) conversion.
    #
    # NOTE(review): the *reconv_table* parameter is shadowed by the empty
    # dict on the next line, so the caller's argument is never consulted —
    # confirm whether that is intentional before relying on it.
    fp = open(path, 'r', encoding='U8')
    reconv_table = {}
    # Strip comments, deduplicate, then sort longest-first so that pop()
    # yields the shortest remaining word on each iteration.
    wordlist = [line.split('#')[0].strip() for line in fp]
    wordlist = list(set(wordlist))
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            # Round-tripping through the table built so far would change
            # the word, so pin an identity rule for it.
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    # Derive a word-level reverse-conversion table for the words in
    # *src_wordlist*, starting from the character-level tables.  A word
    # earns its own entry only when plain character conversion is ambiguous
    # for it: converting it back does not round-trip, or it contains a
    # character from *src_tomany* (a one-to-many mapping source) and still
    # fails to round-trip after forward conversion.
    wordlist = list(src_wordlist)
    # Longest words first, so that pop() processes shortest words first and
    # their rules are in place before longer words are examined.
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    while wordlist:
        # Fold the rules discovered so far into the working tables before
        # starting on the next batch of equally-long words.
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        # Inner loop: process every word of the current length with the
        # working tables frozen, so same-length words don't affect each other.
        while new_word_len == word_len:
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if not reconv_table.get(new_word) and \
               (test_word != word or
                (tomanyptn.search(word) and
                 word != translate(new_word, reconv_table))):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                # Word list exhausted; leave both loops.
                break
            new_word_len = len(word)
    return word_reconv_table
def PHPArray(table):
    """Render the (from, to) pairs of *table* as the body lines of a PHP
    array literal, one ``'from' => 'to',`` entry per line; pairs with an
    empty side are skipped."""
    entries = ['\'%s\' => \'%s\',' % pair for pair in table if pair[0] and pair[1]]
    return '\n'.join(entries)
def main():
    """Download the upstream data files, build the Simplified/Traditional
    Chinese conversion tables, and write includes/ZhConversion.php."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    # Character-level one-to-many variant tables from the Unihan database.
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    # Layer the hand-maintained character tables on top of Unihan data.
    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # Collapse to one-to-one tables by keeping only the first variant.
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()])
    else:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    # Characters that appear as secondary variants (one-to-many sources).
    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the supper set for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems() if f != t])
    else:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems() if f != t])
    else:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray(toHant) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray(toHans) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray(toTW) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray(toHK) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray(toCN) \
        + '\n);\n'

    # Write the generated PHP into the MediaWiki includes directory,
    # relative to maintenance/language/zhtable/.
    if pyversion[:1] in ['2']:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'wb', encoding='utf8')
    else:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'w', buffering=4096, encoding='utf8')
    print ('Writing ZhConversion.php ... ')
    f.write(php)
    f.close()

    # Remove temporary files
    print ('Deleting temporary files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')
# Run the build only when executed as a script, not on import.
if __name__ == '__main__':
    main()