2 # -*- coding: utf-8 -*-
6 import os
, re
, shutil
, sys
, platform
# Probe the running interpreter once; these drive the version-specific
# imports below and (presumably) the wget-vs-urlretrieve choice in
# download() — TODO confirm where islinux is consumed, its use is not
# visible in this excerpt.
pyversion = platform.python_version()  # e.g. '2.7.18' or '3.9.1'
islinux = platform.system().lower() == 'linux'
# Python 2.6/2.7 vs 3.x compatibility shims: pick the matching urllib module
# and (presumably) set up helpers that turn 'U+XXXX'-style tokens into
# characters.
# NOTE(review): several original lines are missing from this excerpt — in
# particular the `def` headers that the bare `return` statements below belong
# to — recover the full block from version control before editing it.
if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    # [missing lines]
    if sys.maxunicode < 0x10000:
        # Narrow Unicode build: build a UTF-16 surrogate pair by hand.
        # 0xD7C0 + (i >> 10) is arithmetically 0xD800 + ((i - 0x10000) >> 10).
        # [missing enclosing def header]
        return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    # [missing lines]
    # Parses fields like 'U+4E00<...' — hex code point before any '<'.
    # [missing enclosing def header]
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]
    # Parses fixed-width 'U+XXXXX' fields, skipping empties.
    # [missing enclosing def header]
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
# Data-source pins: which SourceForge mirror to download from, and which
# upstream release of each SCIM data package to fetch and unpack below.
SF_MIRROR = 'easynews'
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
# NOTE(review): LIBTABE_VER is referenced later in this file but its
# definition is not visible in this excerpt — confirm it exists upstream.
def download( url, dest ):
    """Fetch `url` into local file `dest`, skipping files already present.

    NOTE(review): a few original lines are missing from this excerpt —
    presumably an early return after the up-to-date message and the
    `islinux` branch selecting between the two download paths below,
    which otherwise look mutually exclusive. Recover before editing.
    """
    if os.path.isfile( dest ):
        print( 'File %s up to date.' % dest )
    # we use wget instead urlretrieve under Linux,
    # because wget could display details like download progress
    os.system('wget %s' % url)
    # Portable fallback path: plain urllib retrieval.
    print( 'Downloading from [%s] ...' % url )
    urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
def uncompress( fp, member, encoding = 'U8' ):
    """Pull `member` out of open archive `fp`, move it into the cwd, and
    return it as an open, decoded file object.

    NOTE(review): two original lines are missing from this excerpt —
    presumably the actual extraction call before the move, and a guard
    before the rmtree — confirm against version control.
    """
    # Final file name is the last path component of the archive member.
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    shutil.move( member, name )
    # Remove the top-level directory the member was extracted into.
    shutil.rmtree( member.split( '/', 1 )[0] )
    # Four-argument open (name, mode, encoding, errors): `open` is presumably
    # rebound (codecs.open?) elsewhere in the file — the builtin open would
    # reject this signature. TODO confirm.
    return open( name, 'rb', encoding, 'ignore' )
def unzip( path, member, encoding = 'U8' ):
    """Extract `member` from the zip archive at `path` and return an open,
    decoded file object for it (see uncompress()).

    Replaces the original `unzip = lambda ...` assignment: PEP 8 (E731)
    prefers a def, which also gives the callable a proper __name__ in
    tracebacks. Signature and behavior are unchanged.
    """
    return uncompress( zf.ZipFile( path ), member, encoding )
def untargz( path, member, encoding = 'U8' ):
    """Extract `member` from the gzipped tarball at `path` and return an
    open, decoded file object for it (see uncompress()).

    Replaces the original `untargz = lambda ...` assignment: PEP 8 (E731)
    prefers a def, which also gives the callable a proper __name__ in
    tracebacks. Signature and behavior are unchanged.
    """
    return uncompress( tf.open( path, 'r:gz' ), member, encoding )
def parserCore( fp, pos, beginmark = None, endmark = None ):
    """Collect field `pos` of each data line of `fp` into `mlist`
    (a set, judging by the .add() call below).

    When beginmark/endmark are given, only the region between a line
    starting with beginmark and a line starting with endmark is scanned.

    NOTE(review): several original lines are missing from this excerpt —
    the `start` flag initialisation, the loop header over `fp`, the split
    of `line` into `elems`, and the return. Fragments below keep their
    original order; recover the full body before editing.
    """
    if beginmark and endmark:
    # [missing lines — loop over fp starts somewhere here]
    if beginmark and line.startswith( beginmark ):
    elif endmark and line.startswith( endmark ):
    # '#' lines are comments in the data files
    if start and not line.startswith( '#' ):
    # only keep entries whose first field is more than one character
    elif len( elems[0] ) > 1:
    mlist.add( elems[pos] )
def tablesParser( path, name ):
    """Read table file `name` from the scim-tables tarball at `path` and
    parse the BEGIN_TABLE/END_TABLE region, returning field 1 of each entry.

    (Removed a needless `global SCIM_TABLES_VER`: `global` is only required
    when assigning to a module-level name, not when reading it.)
    """
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
def ezbigParser( path ):
    """Parse the EZ-Big table (used below to build the traditional wordlist)."""
    return tablesParser( path, 'EZ-Big.txt.in' )

def wubiParser( path ):
    """Parse the Wubi table (used below to build the simplified wordlist)."""
    return tablesParser( path, 'Wubi.txt.in' )

def zrmParser( path ):
    """Parse the Ziranma table (used below to build the simplified wordlist)."""
    return tablesParser( path, 'Ziranma.txt.in' )

# (The three wrappers above replace `name = lambda path: ...` assignments:
# PEP 8 (E731) prefers defs; behavior is unchanged.)
def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    # (Removed a needless `global SCIM_PINYIN_VER` — reads don't need it —
    # and an unused local `dst = 'phrase_lib.txt'`.)
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )
def tsiParser( path ):
    """ Read tsi.src and parse it. """
    # NOTE(review): one original line is missing from this excerpt between
    # the two statements below — confirm against version control.
    src = 'libtabe/tsi-src/tsi.src'
    # tsi.src is Big5-encoded, hence the explicit codec.
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )
def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it. """
    # NOTE(review): only fragments of this function are visible in this
    # excerpt — the dict initialisations, the loop over `fp`, the field
    # split producing `elems`, and the return are missing. Fragments are
    # kept in original order with reconstructed indentation.
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    # skip comment lines of the Unihan data file
    if line.startswith( '#' ):
    # second field is the variant kind (note: `type` shadows the builtin)
    type = elems.pop( 1 )
    # convert the remaining 'U+XXXX'-style fields into characters
    elems = unichr2( *elems )
    if type == 'kTraditionalVariant':
        s2t[elems[0]] = elems[1:]
    elif type == 'kSimplifiedVariant':
        t2s[elems[0]] = elems[1:]
def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist. """
    # Each whitespace-separated token of the exclude file is a word,
    # optionally followed by a '#' comment; build one alternation pattern
    # out of all of them.
    excludes = open( path, 'rb', 'U8' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    excludes = '|'.join( excludes )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    # Drop every word that matches any exclude pattern.
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    # Bug fix: callers assign the result (`s_wordlist = applyExcludes(...)`
    # below), so the pruned set must be returned — without this the
    # wordlists would silently become None.
    return mlist
def charManualTable( path ):
    """Parse a manual per-character mapping file: fields separated by '|',
    '#' starts a comment; builds `ret` mapping first char to alternatives.

    NOTE(review): fragments only — the `ret` initialisation, the loop
    header over `fp`, a guard, and the return are missing from this
    excerpt; fragments kept in original order.
    """
    fp = open( path, 'rb', 'U8' )
    # strip the trailing comment, then split the mapping fields on '|'
    elems = line.split( '#' )[0].split( '|' )
    # decode 'U+XXXXX'-style fields into characters
    elems = unichr3( *elems )
    ret[elems[0]] = elems[1:]
def toManyRules( src_table ):
    """Collect the alternative (2nd..nth) targets of one-to-many mappings.

    NOTE(review): fragments only — the accumulator initialisation, the
    inner loop body, and the return are missing from this excerpt.
    """
    # NOTE(review): iteritems() is Python-2-only although the file also
    # advertises a 3.x branch above — verify which interpreters run this.
    for ( f, t ) in src_table.iteritems():
        # values with more than one candidate are "one-to-many"
        for i in range( 1, len( t ) ):
def removeRules( path, table ):
    """Drop entries from `table` according to the rule file at `path`.

    Rule lines look like `src => dst` (quotes optional); a line without
    '=>' names a single key.

    NOTE(review): fragments only — the `texc` accumulation, the
    table.pop() calls, and the return are missing from this excerpt;
    fragments kept in original order.
    """
    fp = open( path, 'rb', 'U8' )
    elems = line.split( '=>' )
    # with no '=>' both source and target default to the single field
    f = t = elems[0].strip()
    if len( elems ) == 2:
    # strip surrounding double or single quotes from both sides
    f = f.strip('"').strip("'")
    t = t.strip('"').strip("'")
    # exact-match pattern of targets that force removal of matching entries
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    # NOTE(review): iteritems() is Python-2-only despite the 3.x branch
    # earlier in the file — verify.
    for (tmp_f, tmp_t) in table.copy().iteritems():
        if texcptn.match( tmp_t ):
def customRules( path ):
    """Parse a simple two-column mapping file ('#' starts a comment).

    NOTE(review): fragments only — the `ret` initialisation, the loop
    header over `fp`, a guard, and the return are missing from this
    excerpt.
    """
    fp = open( path, 'rb', 'U8' )
    # strip the trailing comment, split the rest on whitespace
    elems = line.split( '#' )[0].split()
    ret[elems[0]] = elems[1]
def dictToSortedList( src_table, pos ):
    """Return src_table's (key, value) pairs as a list sorted by pair
    element `pos` (0 = sort by key, 1 = sort by value)."""
    def pair_key( pair ):
        return pair[pos]
    pairs = list( src_table.items() )
    pairs.sort( key = pair_key )
    return pairs
def translate( text, conv_table ):
    """Greedy longest-match replacement of `text` through `conv_table`.

    At each position the longest remaining substring is tried first and
    spliced out for its mapping when one exists.

    NOTE(review): fragments only — the counter initialisation/advance,
    the `f` slice assignment, the `if t:` guard, and the return are
    missing from this excerpt; fragments kept in original order.
    """
    while i < len( text ):
        # try the longest candidate substring first
        for j in range( len( text ) - i, 0, -1 ):
            t = conv_table.get( f )
            # splice the translation in place of the matched slice
            text = text[:i] + t + text[i:][j:]
def manualWordsTable( path, conv_table, reconv_table ):
    """Build a word-level reconversion table from the manual phrase file
    at `path` (one phrase per line, '#' starts a comment).

    NOTE(review): fragments only — at least the loop header, a guard
    around the reconv_table updates, and the return are missing from this
    excerpt; fragments kept in original order.
    """
    fp = open( path, 'rb', 'U8' )
    # one phrase per line: drop comments, dedupe, sort longest-first
    # (pop() then consumes the list from the shortest end)
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    word = wordlist.pop()
    # round-trip the word through both direction tables
    new_word = translate( word, conv_table )
    rcv_word = translate( word, reconv_table )
    reconv_table[word] = word
    reconv_table[new_word] = word
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    """Derive word-level conversion rules from a wordlist plus the
    character-level tables, keeping only words whose conversion adds
    information over the per-character tables. Returns the reverse table.

    NOTE(review): fragments only — the `word_conv_table` initialisation,
    the outer loop header, and the bookkeeping around the pops (likely a
    try/except on list exhaustion) are missing from this excerpt;
    fragments kept in original order with reconstructed indentation.
    """
    wordlist = list( src_wordlist )
    # longest words first; pop() then consumes shortest-first
    wordlist.sort( key = len, reverse = True )
    word_reconv_table = {}
    # start from copies of the char-level tables and grow them with the
    # word-level rules discovered below
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    # pattern matching any character that has a one-to-many conversion
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    # fold the rules found so far back into the working tables
    conv_table.update( word_conv_table )
    reconv_table.update( word_reconv_table )
    word = wordlist.pop()
    new_word_len = word_len = len( word )
    # process a run of words of equal length together
    while new_word_len == word_len:
        test_word = translate( word, reconv_table )
        new_word = translate( word, conv_table )
        # keep a word rule only when it is not already implied and either
        # the round trip changes the word or a one-to-many char makes the
        # reverse translation ambiguous
        if not reconv_table.get( new_word ) \
            and ( test_word != word \
            or ( tomanyptn.search( word ) \
            and word != translate( new_word, reconv_table ) ) ):
            word_conv_table[word] = new_word
            word_reconv_table[new_word] = word
        word = wordlist.pop()
        new_word_len = len(word)
    return word_reconv_table
def PHPArray( table ):
    """Serialise (key, value) pairs as the body of a PHP array literal.

    Pairs with an empty key or empty value are skipped; each kept pair
    becomes a line of the form 'key' => 'value', joined with newlines.
    """
    rendered = []
    for ( key, value ) in table:
        if key and value:
            rendered.append( '\'%s\' => \'%s\',' % ( key, value ) )
    return '\n'.join( rendered )
# ---------------------------------------------------------------------------
# Main build sequence.
# NOTE(review): this excerpt is fragmentary — a number of original lines are
# missing throughout, including the definitions of LIBTABE_VER and of
# t_wordlist/s_wordlist, and the opening of the `php` header string; the
# indentation below is reconstructed. Recover the full file before editing.
# ---------------------------------------------------------------------------

# Get Unihan.zip:
url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
han_dest = 'Unihan.zip'
download( url, han_dest )

# Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
download( url, tbe_dest )

# Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
download( url, pyn_dest )

# Get libtabe-$(LIBTABE_VER).tgz:
# NOTE(review): LIBTABE_VER is not defined anywhere visible in this excerpt.
url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
download( url, lbt_dest )

# Character-level one-to-many tables from Unihan, refined with manual data.
( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )

# One-to-one tables keep only the first candidate of each mapping.
# NOTE(review): iteritems() is Python-2-only despite the 3.x branch above.
t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )

# Characters that are alternative targets of one-to-many rules.
s_tomany = toManyRules( t2s_1tomany )
t_tomany = toManyRules( s2t_1tomany )

# Apply the no-convert exclusion files to the one-to-one tables.
t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

# the superset used for word to word conversion
t2s_1to1_supp = t2s_1to1.copy()
s2t_1to1_supp = s2t_1to1.copy()
t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

# word to word manual rules
t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

# word to word rules from input methods
# NOTE(review): the initialisation of t_wordlist/s_wordlist (presumably
# empty sets) is missing from this excerpt.
t_wordlist.update( ezbigParser( tbe_dest ),
                   tsiParser( lbt_dest ) )
s_wordlist.update( wubiParser( tbe_dest ),
                   zrmParser( tbe_dest ),
                   phraseParser( pyn_dest ) )

# Prune the wordlists with the manual exclude files.
s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

s2t_supp = s2t_1to1_supp.copy()
s2t_supp.update( s2t_word2word_manual )
t2s_supp = t2s_1to1_supp.copy()
t2s_supp.update( t2s_word2word_manual )

# Derived word-to-word tables; manual rules win on conflict (applied last).
t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
t2s_word2word.update( t2s_word2word_manual )
s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
s2t_word2word.update( s2t_word2word_manual )

# Final sorted output lists; identity mappings are dropped first.
t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )

s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )

# Region-specific variant tables from the manual rule files.
toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )

toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )

toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )

toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )

# NOTE(review): the four lines below look like the interior and closing of
# a triple-quoted PHP header string whose opening (`php = '''<?php ...`) is
# missing from this excerpt — do not read them as Python statements.
 * Simplified / Traditional Chinese conversion tables
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
$zh2Hant = array(\n'''
# Assemble the PHP source.
# NOTE(review): the PHPArray( toTW/toHK/toCN/toSG ) terms and the closing
# piece of this expression are missing from this excerpt (note the trailing
# continuation backslash on the last visible line).
php += PHPArray( toHant ) \
    + '\n);\n\n$zh2Hans = array(\n' \
    + PHPArray( toHans ) \
    + '\n);\n\n$zh2TW = array(\n' \
    + '\n);\n\n$zh2HK = array(\n' \
    + '\n);\n\n$zh2CN = array(\n' \
    + '\n);\n\n$zh2SG = array(\n' \

# Write the generated table out. The keyword-encoding open() call suggests
# `open` is rebound (codecs.open?) earlier in the file — TODO confirm.
f = open( 'ZhConversion.php', 'wb', encoding = 'utf8' )
print ('Writing ZhConversion.php ... ')

# Clean up the extracted intermediate files.
# NOTE(review): at least one removal line is missing from this excerpt.
print ('Deleting temp files ... ')
os.remove('EZ-Big.txt.in')
os.remove('phrase_lib.txt')
os.remove('Unihan_Variants.txt')
os.remove('Wubi.txt.in')
os.remove('Ziranma.txt.in')
# Script entry point.
# NOTE(review): the guarded body is missing from this excerpt — presumably
# it invokes the module's main routine; recover from version control.
if __name__ == '__main__':