2 # -*- coding: utf-8 -*-
6 import os
, re
, shutil
, sys
, platform
# Interpreter version and host-OS probes; used below to select the
# urllib flavor (py2 vs py3) and to prefer wget on Linux downloads.
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'
if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow Unicode build: unichr() cannot produce astral characters
        # directly, so compose them as a UTF-16 surrogate pair.
        # 0xD7C0 == 0xD800 - (0x10000 >> 10), so this is the standard
        # high/low surrogate formula.
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    # NOTE(review): unichr2()/unichr3() below call unichr(); on Python 3
    # that name must be bound to chr — confirm against upstream.
    unichr = chr
def unichr2( *args ):
    """ Convert Unihan-style refs like 'U+4E00<...' to characters.

    Each arg is expected to look like 'U+XXXX' optionally followed by
    '<'-delimited annotations; the hex after the 'U+' prefix is decoded.
    """
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]
def unichr3( *args ):
    """ Convert fixed-width 'U+XXXXX'-style refs to characters.

    Only the 5 hex digits at i[2:7] are used; entries too short to
    yield any digits are skipped.
    """
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
# Upstream data-package versions; used both to build the SourceForge
# download URLs and the archive-internal paths extracted below.
SCIM_TABLES_VER = '0.5.11'
SCIM_PINYIN_VER = '0.5.92'
def download( url, dest ):
    """ Fetch url into local file dest; no-op when dest already exists. """
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress
        # NOTE(review): url/dest are interpolated unquoted into a shell
        # command; fine for this script's hard-coded URLs, unsafe for
        # untrusted input.
        os.system( 'wget %s -O %s' % ( url, dest ) )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
def uncompress( fp, member, encoding = 'U8' ):
    """ Extract member from open archive fp into the current directory,
    flatten it to its basename, and return it reopened for reading.

    fp is a ZipFile or TarFile object (both provide .extract()).
    """
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        # Drop the archive's now-unneeded top-level directory.
        shutil.rmtree( member.split( '/', 1 )[0] )
    # NOTE(review): 4-arg open(name, mode, encoding, errors) matches
    # codecs.open, not the builtin — presumably `open` is rebound at
    # module level; confirm against upstream.
    return open( name, 'rb', encoding, 'ignore' )
# PEP 8 (E731): named lambdas replaced by def statements; same signatures.
def unzip( path, member, encoding = 'U8' ):
    """ Extract member from the zip archive at path and return it
    reopened for reading (see uncompress). """
    return uncompress( zf.ZipFile( path ), member, encoding )

def untargz( path, member, encoding = 'U8' ):
    """ Extract member from the gzipped tar archive at path and return
    it reopened for reading (see uncompress). """
    return uncompress( tf.open( path, 'r:gz' ), member, encoding )
def parserCore( fp, pos, beginmark = None, endmark = None ):
    """ Collect column `pos` of multi-char entries from the lines of fp.

    When beginmark/endmark are given, only lines between a line starting
    with beginmark and one starting with endmark are considered.
    Comment lines ('#') and one-column lines are skipped, as are entries
    where either the key or the selected column is a single character.
    Returns a set of words.
    """
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1 and \
                 len( elems[pos] ) > 1: # words only
                mlist.add( elems[pos] )
    return mlist
def tablesParser( path, name ):
    """ Read file `name` from the scim-tables archive at `path` and
    parse its BEGIN_TABLE/END_TABLE section into a set of words. """
    # Read-only access to the module constant needs no `global` statement.
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
# PEP 8 (E731): named lambdas replaced by def statements; same signatures.
def ezbigParser( path ):
    """ Parse the EZ-Big table (traditional-Chinese words). """
    return tablesParser( path, 'EZ-Big.txt.in' )

def wubiParser( path ):
    """ Parse the Wubi table (simplified-Chinese words). """
    return tablesParser( path, 'Wubi.txt.in' )

def zrmParser( path ):
    """ Parse the Ziranma table (simplified-Chinese words). """
    return tablesParser( path, 'Ziranma.txt.in' )
def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    # Unused local `dst` removed; read-only constant needs no `global`.
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )
def tsiParser( path ):
    """ Read tsi.src and parse it. """
    member = 'libtabe/tsi-src/tsi.src'
    # tsi.src ships in Big5-HKSCS, not UTF-8.
    archive_fp = untargz( path, member, 'big5hkscs' )
    return parserCore( archive_fp, 0 )
def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it.

    Returns ( t2s, s2t ): dicts mapping a character to its list of
    simplified resp. traditional variant characters.
    """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        else:
            elems = line.split()
            if len( elems ) < 3:
                continue
            # Renamed from `type`: avoid shadowing the builtin.
            vtype = elems.pop( 1 )
            elems = unichr2( *elems )
            if vtype == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif vtype == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    return ( t2s, s2t )
def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist.

    Words in path (one per line, '#' starts a comment) are joined into
    one alternation; any member of mlist containing one of them is
    removed in place. Returns mlist so callers can assign the result.
    """
    excludes = open( path, 'rb', 'U8' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    excludes = '|'.join( excludes )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    # Callers rebind their variable to the return value, so the set
    # must be returned explicitly.
    return mlist
def charManualTable( path ):
    """ Read a manual char table: lines of 'U+XXXXX|U+XXXXX|...' with
    optional '#' comments; returns {char: [variant chars]}. """
    fp = open( path, 'rb', 'U8' )
    ret = {}
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1:]
    return ret
def toManyRules( src_table ):
    """ Collect every secondary target char of one-to-many mappings.

    src_table maps char -> list of variants; all variants after the
    first are returned as a set.
    """
    tomany = set()
    # items() instead of iteritems(): the script also supports
    # Python 3, where iteritems() does not exist.
    for ( f, t ) in src_table.items():
        for i in range( 1, len( t ) ):
            tomany.add( t[i] )
    return tomany
def removeRules( path, table ):
    """ Remove conversion rules listed in path from table (in place).

    Each line is either `"from"` (drop that key) or `"from" => "to"`;
    every `to` word also blacklists any remaining rule whose target
    matches it exactly. Returns the pruned table.
    """
    fp = open( path, 'rb', 'U8' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            # pop with default instead of a bare try/except: same
            # best-effort semantics, no silent swallowing of other errors.
            table.pop( f, None )
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    # items() instead of iteritems() for Python 3 compatibility; iterate
    # a copy because we mutate table while scanning.
    for ( tmp_f, tmp_t ) in table.copy().items():
        if texcptn.match( tmp_t ):
            table.pop( tmp_f )
    return table
def customRules( path ):
    """ Read a two-column manual rule file into {from: to}.

    '#' starts a comment; lines with fewer than two columns are ignored.
    """
    fp = open( path, 'rb', 'U8' )
    ret = dict()
    for line in fp:
        elems = line.split( '#' )[0].split()
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret
def dictToSortedList( src_table, pos ):
    """ Return src_table's (key, value) items sorted by element `pos`
    of each pair (0 = key, 1 = value). """
    def item_key( pair ):
        return pair[pos]
    return sorted( src_table.items(), key = item_key )
def translate( text, conv_table ):
    """ Convert text using conv_table, longest match first.

    At each position the longest substring present in conv_table is
    replaced; scanning resumes after the replacement.
    """
    i = 0
    while i < len( text ):
        # Try the longest candidate first, down to a single character.
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                # Skip over the inserted replacement (the +1 below
                # advances the final character).
                i += len( t ) - 1
                break
        i += 1
    return text
def manualWordsTable( path, conv_table, reconv_table ):
    """ Build word-level reconversion rules from a manual word list.

    Each word from path is converted with conv_table; the resulting
    word maps back to the original in reconv_table. When the word does
    not round-trip through reconv_table it is also pinned to itself.

    NOTE(review): this appears to mutate the caller's reconv_table in
    place and return it — confirm against upstream before relying on
    the aliasing.
    """
    fp = open( path, 'rb', 'U8' )
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if rcv_word != word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    """ Derive word-level reconversion rules from a word list.

    A word gets a rule when its char-by-char conversion is not safely
    reversible: either the reverse translation already differs, or the
    word contains a one-to-many target char (src_tomany) whose round
    trip is lossy. Words are processed shortest-first, in length
    batches, so shorter rules feed into longer words' translations.
    Returns {converted word: original word}.
    """
    wordlist = list( src_wordlist )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        # Fold the rules found so far into the working tables before
        # starting the next length batch.
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
               and ( test_word != word \
               or ( tomanyptn.search( word ) \
               and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len( word )
        # NOTE(review): the word popped at a length boundary appears to
        # be overwritten by the outer loop's pop — verify against
        # upstream whether that word is intentionally re-fetched.
    return word_reconv_table
def PHPArray( table ):
    """ Render (from, to) pairs as the body of a PHP associative-array
    literal, one `'from' => 'to',` line per pair; pairs with an empty
    side are dropped. """
    rendered = []
    for ( f, t ) in table:
        if f and t:
            rendered.append( '\'%s\' => \'%s\',' % ( f, t ) )
    return '\n'.join( rendered )
263 url
= 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
264 han_dest
= 'Unihan.zip'
265 download( url
, han_dest
)
267 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
268 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
269 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
270 download( url
, tbe_dest
)
272 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
273 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
274 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
275 download( url
, pyn_dest
)
277 # Get libtabe-$(LIBTABE_VER).tgz:
278 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
279 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
280 download( url
, lbt_dest
)
283 ( t2s_1tomany
, s2t_1tomany
) = unihanParser( han_dest
)
285 t2s_1tomany
.update( charManualTable( 'trad2simp.manual' ) )
286 s2t_1tomany
.update( charManualTable( 'simp2trad.manual' ) )
288 t2s_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in t2s_1tomany
.iteritems()] )
289 s2t_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in s2t_1tomany
.iteritems()] )
291 s_tomany
= toManyRules( t2s_1tomany
)
292 t_tomany
= toManyRules( s2t_1tomany
)
295 t2s_1to1
= removeRules( 'trad2simp_noconvert.manual', t2s_1to1
)
296 s2t_1to1
= removeRules( 'simp2trad_noconvert.manual', s2t_1to1
)
298 # the supper set for word to word conversion
299 t2s_1to1_supp
= t2s_1to1
.copy()
300 s2t_1to1_supp
= s2t_1to1
.copy()
301 t2s_1to1_supp
.update( customRules( 'trad2simp_supp_set.manual' ) )
302 s2t_1to1_supp
.update( customRules( 'simp2trad_supp_set.manual' ) )
304 # word to word manual rules
305 t2s_word2word_manual
= manualWordsTable( 'simpphrases.manual', s2t_1to1_supp
, t2s_1to1_supp
)
306 t2s_word2word_manual
.update( customRules( 'toSimp.manual' ) )
307 s2t_word2word_manual
= manualWordsTable( 'tradphrases.manual', t2s_1to1_supp
, s2t_1to1_supp
)
308 s2t_word2word_manual
.update( customRules( 'toTrad.manual' ) )
310 # word to word rules from input methods
313 t_wordlist
.update( ezbigParser( tbe_dest
),
314 tsiParser( lbt_dest
) )
315 s_wordlist
.update( wubiParser( tbe_dest
),
316 zrmParser( tbe_dest
),
317 phraseParser( pyn_dest
) )
320 s_wordlist
= applyExcludes( s_wordlist
, 'simpphrases_exclude.manual' )
321 t_wordlist
= applyExcludes( t_wordlist
, 'tradphrases_exclude.manual' )
323 s2t_supp
= s2t_1to1_supp
.copy()
324 s2t_supp
.update( s2t_word2word_manual
)
325 t2s_supp
= t2s_1to1_supp
.copy()
326 t2s_supp
.update( t2s_word2word_manual
)
329 t2s_word2word
= defaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
330 t2s_word2word
.update( t2s_word2word_manual
)
331 s2t_word2word
= defaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
332 s2t_word2word
.update( s2t_word2word_manual
)
336 t2s_1to1
= dict( [( f
, t
) for ( f
, t
) in t2s_1to1
.iteritems() if f
!= t
] )
337 toHans
= dictToSortedList( t2s_1to1
, 0 ) + dictToSortedList( t2s_word2word
, 1 )
339 s2t_1to1
= dict( [( f
, t
) for ( f
, t
) in s2t_1to1
.iteritems() if f
!= t
] )
340 toHant
= dictToSortedList( s2t_1to1
, 0 ) + dictToSortedList( s2t_word2word
, 1 )
342 toCN
= dictToSortedList( customRules( 'toCN.manual' ), 1 )
344 toHK
= dictToSortedList( customRules( 'toHK.manual' ), 1 )
346 toSG
= dictToSortedList( customRules( 'toSG.manual' ), 1 )
348 toTW
= dictToSortedList( customRules( 'toTW.manual' ), 1 )
353 * Simplified / Traditional Chinese conversion tables
355 * Automatically generated using code and data in includes/zhtable/
356 * Do not modify directly!
361 $zh2Hant = array(\n'''
362 php
+= PHPArray( toHant
) \
363 + '\n);\n\n$zh2Hans = array(\n' \
364 + PHPArray( toHans
) \
365 + '\n);\n\n$zh2TW = array(\n' \
367 + '\n);\n\n$zh2HK = array(\n' \
369 + '\n);\n\n$zh2CN = array(\n' \
371 + '\n);\n\n$zh2SG = array(\n' \
375 f
= open( os
.path
.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding
= 'utf8' )
376 print ('Writing ZhConversion.php ... ')
380 # Remove temporary files
381 print ('Deleting temporary files ... ')
382 os
.remove('EZ-Big.txt.in')
383 os
.remove('phrase_lib.txt')
385 os
.remove('Unihan_Variants.txt')
386 os
.remove('Wubi.txt.in')
387 os
.remove('Ziranma.txt.in')
if __name__ == '__main__':
    main()