maintenance/language/zhtable/Makefile.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # @author Philip
   4 import tarfile as tf
   5 import zipfile as zf
   6 import os, re, shutil, sys, platform
   7
   8 pyversion = platform.python_version()
   9 islinux = platform.system().lower() == 'linux'
  10
  11 if pyversion[:3] in ['2.6', '2.7']:
  12     import urllib as urllib_request
  13     import codecs
  14     open = codecs.open
  15     _unichr = unichr
  16     if sys.maxunicode < 0x10000:
  17         def unichr(i):
  18             if i < 0x10000:
  19                 return _unichr(i)
  20             else:
  21                 return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
  22 elif pyversion[:2] == '3.':
  23     import urllib.request as urllib_request
  24     unichr = chr
  25
  26 def unichr2( *args ):
  27     return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]
  28
  29 def unichr3( *args ):
  30     return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
  31
# DEFINE
# Versions of the upstream data sets.  They are interpolated into the
# download URLs and archive names built in main(), and (for the scim
# packages) into the archive member paths used by the parser helpers.
UNIHAN_VER = '6.2.0'        # directory under unicode.org/Public/ holding Unihan.zip
SF_MIRROR = 'dfn'           # host prefix of <mirror>.dl.sourceforge.net
SCIM_TABLES_VER = '0.5.11'  # scim-tables tarball version
SCIM_PINYIN_VER = '0.5.92'  # scim-pinyin tarball version
LIBTABE_VER = '0.2.3'       # libtabe tarball version
# END OF DEFINE
  39
  40 def download( url, dest ):
  41     if os.path.isfile( dest ):
  42         print( 'File %s is up to date.' % dest )
  43         return
  44     global islinux
  45     if islinux:
  46         # we use wget instead urlretrieve under Linux,
  47         # because wget could display details like download progress
  48         os.system( 'wget %s -O %s' % ( url, dest ) )
  49     else:
  50         print( 'Downloading from [%s] ...' % url )
  51         urllib_request.urlretrieve( url, dest )
  52         print( 'Download complete.\n' )
  53     return
  54
  55 def uncompress( fp, member, encoding = 'U8' ):
  56     name = member.rsplit( '/', 1 )[-1]
  57     print( 'Extracting %s ...' % name )
  58     fp.extract( member )
  59     shutil.move( member, name )
  60     if '/' in member:
  61         shutil.rmtree( member.split( '/', 1 )[0] )
  62     return open( name, 'rb', encoding, 'ignore' )
  63
  64 unzip = lambda path, member, encoding = 'U8': \
  65         uncompress( zf.ZipFile( path ), member, encoding )
  66
  67 untargz = lambda path, member, encoding = 'U8': \
  68         uncompress( tf.open( path, 'r:gz' ), member, encoding )
  69
  70 def parserCore( fp, pos, beginmark = None, endmark = None ):
  71     if beginmark and endmark:
  72         start = False
  73     else: start = True
  74     mlist = set()
  75     for line in fp:
  76         if beginmark and line.startswith( beginmark ):
  77             start = True
  78             continue
  79         elif endmark and line.startswith( endmark ):
  80             break
  81         if start and not line.startswith( '#' ):
  82             elems = line.split()
  83             if len( elems ) < 2:
  84                 continue
  85             elif len( elems[0] ) > 1 and \
  86                 len( elems[pos] ) > 1: # words only
  87                 mlist.add( elems[pos] )
  88     return mlist
  89
  90 def tablesParser( path, name ):
  91     """ Read file from scim-tables and parse it. """
  92     global SCIM_TABLES_VER
  93     src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
  94     fp = untargz( path, src, 'U8' )
  95     return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
  96
  97 ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
  98 wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
  99 zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )
 100
 101 def phraseParser( path ):
 102     """ Read phrase_lib.txt and parse it. """
 103     global SCIM_PINYIN_VER
 104     src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
 105     dst = 'phrase_lib.txt'
 106     fp = untargz( path, src, 'U8' )
 107     return parserCore( fp, 0 )
 108
 109 def tsiParser( path ):
 110     """ Read tsi.src and parse it. """
 111     src = 'libtabe/tsi-src/tsi.src'
 112     dst = 'tsi.src'
 113     fp = untargz( path, src, 'big5hkscs' )
 114     return parserCore( fp, 0 )
 115
 116 def unihanParser( path ):
 117     """ Read Unihan_Variants.txt and parse it. """
 118     fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
 119     t2s = dict()
 120     s2t = dict()
 121     for line in fp:
 122         if line.startswith( '#' ):
 123             continue
 124         else:
 125             elems = line.split()
 126             if len( elems ) < 3:
 127                 continue
 128             type = elems.pop( 1 )
 129             elems = unichr2( *elems )
 130             if type == 'kTraditionalVariant':
 131                 s2t[elems[0]] = elems[1:]
 132             elif type == 'kSimplifiedVariant':
 133                 t2s[elems[0]] = elems[1:]
 134     fp.close()
 135     return ( t2s, s2t )
 136
 137 def applyExcludes( mlist, path ):
 138     """ Apply exclude rules from path to mlist. """
 139     excludes = open( path, 'rb', 'U8' ).read().split()
 140     excludes = [word.split( '#' )[0].strip() for word in excludes]
 141     excludes = '|'.join( excludes )
 142     excptn = re.compile( '.*(?:%s).*' % excludes )
 143     diff = [mword for mword in mlist if excptn.search( mword )]
 144     mlist.difference_update( diff )
 145     return mlist
 146
 147 def charManualTable( path ):
 148     fp = open( path, 'rb', 'U8' )
 149     ret = {}
 150     for line in fp:
 151         elems = line.split( '#' )[0].split( '|' )
 152         elems = unichr3( *elems )
 153         if len( elems ) > 1:
 154             ret[elems[0]] = elems[1:]
 155     return ret
 156
 157 def toManyRules( src_table ):
 158     tomany = set()
 159     for ( f, t ) in src_table.iteritems():
 160         for i in range( 1, len( t ) ):
 161             tomany.add( t[i] )
 162     return tomany
 163
 164 def removeRules( path, table ):
 165     fp = open( path, 'rb', 'U8' )
 166     texc = list()
 167     for line in fp:
 168         elems = line.split( '=>' )
 169         f = t = elems[0].strip()
 170         if len( elems ) == 2:
 171             t = elems[1].strip()
 172         f = f.strip('"').strip("'")
 173         t = t.strip('"').strip("'")
 174         if f:
 175             try:
 176                 table.pop( f )
 177             except:
 178                 pass
 179         if t:
 180             texc.append( t )
 181     texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
 182     for (tmp_f, tmp_t) in table.copy().iteritems():
 183         if texcptn.match( tmp_t ):
 184             table.pop( tmp_f )
 185     return table
 186
 187 def customRules( path ):
 188     fp = open( path, 'rb', 'U8' )
 189     ret = dict()
 190     for line in fp:
 191         elems = line.split( '#' )[0].split()
 192         if len( elems ) > 1:
 193             ret[elems[0]] = elems[1]
 194     return ret
 195
 196 def dictToSortedList( src_table, pos ):
 197     return sorted( src_table.items(), key = lambda m: m[pos] )
 198
 199 def translate( text, conv_table ):
 200     i = 0
 201     while i < len( text ):
 202         for j in range( len( text ) - i, 0, -1 ):
 203             f = text[i:][:j]
 204             t = conv_table.get( f )
 205             if t:
 206                 text = text[:i] + t + text[i:][j:]
 207                 i += len(t) - 1
 208                 break
 209         i += 1
 210     return text
 211
 212 def manualWordsTable( path, conv_table, reconv_table ):
 213     fp = open( path, 'rb', 'U8' )
 214     reconv_table = {}
 215     wordlist = [line.split( '#' )[0].strip() for line in fp]
 216     wordlist = list( set( wordlist ) )
 217     wordlist.sort( key = len, reverse = True )
 218     while wordlist:
 219         word = wordlist.pop()
 220         new_word = translate( word, conv_table )
 221         rcv_word = translate( word, reconv_table )
 222         if word != rcv_word:
 223             reconv_table[word] = word
 224         reconv_table[new_word] = word
 225     return reconv_table
 226
 227 def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
 228     wordlist = list( src_wordlist )
 229     wordlist.sort( key = len, reverse = True )
 230     word_conv_table = {}
 231     word_reconv_table = {}
 232     conv_table = char_conv_table.copy()
 233     reconv_table = char_reconv_table.copy()
 234     tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
 235     while wordlist:
 236         conv_table.update( word_conv_table )
 237         reconv_table.update( word_reconv_table )
 238         word = wordlist.pop()
 239         new_word_len = word_len = len( word )
 240         while new_word_len == word_len:
 241             add = False
 242             test_word = translate( word, reconv_table )
 243             new_word = translate( word, conv_table )
 244             if not reconv_table.get( new_word ) \
 245                and ( test_word != word \
 246                or ( tomanyptn.search( word ) \
 247                and word != translate( new_word, reconv_table ) ) ):
 248                 word_conv_table[word] = new_word
 249                 word_reconv_table[new_word] = word
 250             try:
 251                 word = wordlist.pop()
 252             except IndexError:
 253                 break
 254             new_word_len = len(word)
 255     return word_reconv_table
 256
 257 def PHPArray( table ):
 258     lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
 259     return '\n'.join(lines)
 260
 261 def main():
 262     #Get Unihan.zip:
 263     url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
 264     han_dest = 'Unihan.zip'
 265     download( url, han_dest )
 266
 267     # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
 268     url  = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
 269     tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
 270     download( url, tbe_dest )
 271
 272     # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
 273     url  = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
 274     pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
 275     download( url, pyn_dest )
 276
 277     # Get libtabe-$(LIBTABE_VER).tgz:
 278     url  = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
 279     lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
 280     download( url, lbt_dest )
 281
 282     # Unihan.txt
 283     ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )
 284
 285     t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
 286     s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )
 287
 288     t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
 289     s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )
 290
 291     s_tomany = toManyRules( t2s_1tomany )
 292     t_tomany = toManyRules( s2t_1tomany )
 293
 294     # noconvert rules
 295     t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
 296     s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )
 297
 298     # the supper set for word to word conversion
 299     t2s_1to1_supp = t2s_1to1.copy()
 300     s2t_1to1_supp = s2t_1to1.copy()
 301     t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
 302     s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )
 303
 304     # word to word manual rules
 305     t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
 306     t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
 307     s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
 308     s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )
 309
 310     # word to word rules from input methods
 311     t_wordlist = set()
 312     s_wordlist = set()
 313     t_wordlist.update( ezbigParser( tbe_dest ),
 314                        tsiParser( lbt_dest ) )
 315     s_wordlist.update( wubiParser( tbe_dest ),
 316                        zrmParser( tbe_dest ),
 317                        phraseParser( pyn_dest ) )
 318
 319     # exclude
 320     s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
 321     t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )
 322
 323     s2t_supp = s2t_1to1_supp.copy()
 324     s2t_supp.update( s2t_word2word_manual )
 325     t2s_supp = t2s_1to1_supp.copy()
 326     t2s_supp.update( t2s_word2word_manual )
 327
 328     # parse list to dict
 329     t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
 330     t2s_word2word.update( t2s_word2word_manual )
 331     s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
 332     s2t_word2word.update( s2t_word2word_manual )
 333
 334     # Final tables
 335     # sorted list toHans
 336     t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
 337     toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
 338     # sorted list toHant
 339     s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
 340     toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
 341     # sorted list toCN
 342     toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
 343     # sorted list toHK
 344     toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
 345     # sorted list toSG
 346     toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
 347     # sorted list toTW
 348     toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )
 349
 350     # Get PHP Array
 351     php = '''<?php
 352 /**
 353  * Simplified / Traditional Chinese conversion tables
 354  *
 355  * Automatically generated using code and data in includes/zhtable/
 356  * Do not modify directly!
 357  *
 358  * @file
 359  */
 360
 361 $zh2Hant = array(\n'''
 362     php += PHPArray( toHant ) \
 363         +  '\n);\n\n$zh2Hans = array(\n' \
 364         +  PHPArray( toHans ) \
 365         +  '\n);\n\n$zh2TW = array(\n' \
 366         +  PHPArray( toTW ) \
 367         +  '\n);\n\n$zh2HK = array(\n' \
 368         +  PHPArray( toHK ) \
 369         +  '\n);\n\n$zh2CN = array(\n' \
 370         +  PHPArray( toCN ) \
 371         +  '\n);\n\n$zh2SG = array(\n' \
 372         +  PHPArray( toSG ) \
 373         +  '\n);\n'
 374
 375     f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' )
 376     print ('Writing ZhConversion.php ... ')
 377     f.write( php )
 378     f.close()
 379
 380     # Remove temporary files
 381     print ('Deleting temporary files ... ')
 382     os.remove('EZ-Big.txt.in')
 383     os.remove('phrase_lib.txt')
 384     os.remove('tsi.src')
 385     os.remove('Unihan_Variants.txt')
 386     os.remove('Wubi.txt.in')
 387     os.remove('Ziranma.txt.in')
 388
 389
# Build the conversion tables only when run as a script, not when imported.
if __name__ == '__main__':
    main()