2 # -*- coding: utf-8 -*-
6 import os
, re
, shutil
, sys
, platform
# Interpreter version string (e.g. '2.7.18' or '3.8.10'); prefix-compared
# throughout this script to select Python 2 vs Python 3 code paths.
pyversion = platform.python_version()
# True when running on Linux; presumably guards the wget download path
# mentioned further below — confirm against the download() branch.
islinux = platform.system().lower() == 'linux'
11 if pyversion
[:3] in ['2.6', '2.7']:
12 import urllib
as urllib_request
16 if sys
.maxunicode
< 0x10000:
21 return _unichr( 0xD7C0 + ( i
>>10 ) ) + _unichr( 0xDC00 + ( i
& 0x3FF ) )
22 elif pyversion
[:2] == '3.':
23 import urllib
.request
as urllib_request
27 return [unichr( int( i
.split('<')[0][2:], 16 ) ) for i
in args
]
30 return [unichr( int( i
[2:7], 16 ) ) for i
in args
if i
[2:7]]
# Upstream release versions used to build the download URLs and the
# directory names inside the fetched tarballs.
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
40 def download( url
, dest
):
41 if os
.path
.isfile( dest
):
42 print( 'File %s is up to date.' % dest
)
46 # we use wget instead urlretrieve under Linux,
47 # because wget could display details like download progress
48 os
.system( 'wget %s -O %s' % ( url
, dest
) )
50 print( 'Downloading from [%s] ...' % url
)
51 urllib_request
.urlretrieve( url
, dest
)
52 print( 'Download complete.\n' )
55 def uncompress( fp
, member
, encoding
= 'U8' ):
56 name
= member
.rsplit( '/', 1 )[-1]
57 print( 'Extracting %s ...' % name
)
59 shutil
.move( member
, name
)
61 shutil
.rmtree( member
.split( '/', 1 )[0] )
62 if pyversion
[:1] in ['2']:
63 fc
= open( name
, 'rb', encoding
, 'ignore' )
65 fc
= open( name
, 'r', encoding
= encoding
, errors
= 'ignore' )
def unzip( path, member, encoding = 'U8' ):
    """ Open the zip archive at path and uncompress member from it. """
    # A proper def instead of a lambda assigned to a name (PEP 8, E731);
    # call signature and behavior are unchanged.
    return uncompress( zf.ZipFile( path ), member, encoding )
def untargz( path, member, encoding = 'U8' ):
    """ Open the gzipped tarball at path and uncompress member from it. """
    # A proper def instead of a lambda assigned to a name (PEP 8, E731);
    # call signature and behavior are unchanged.
    return uncompress( tf.open( path, 'r:gz' ), member, encoding )
74 def parserCore( fp
, pos
, beginmark
= None, endmark
= None ):
75 if beginmark
and endmark
:
80 if beginmark
and line
.startswith( beginmark
):
83 elif endmark
and line
.startswith( endmark
):
85 if start
and not line
.startswith( '#' ):
89 elif len( elems
[0] ) > 1 and \
90 len( elems
[pos
] ) > 1: # words only
91 mlist
.add( elems
[pos
] )
def tablesParser( path, name ):
    """ Read file from scim-tables and parse it.

    path -- local path of the scim-tables tarball
    name -- table file name inside the tarball's tables/zh/ directory
    """
    # 'global SCIM_TABLES_VER' removed: the constant is only read here,
    # and module globals are readable without the declaration.
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
def ezbigParser( path ):
    """ Parse the EZ-Big table out of the scim-tables tarball at path. """
    # def instead of a lambda assigned to a name (PEP 8, E731); unchanged behavior.
    return tablesParser( path, 'EZ-Big.txt.in' )
def wubiParser( path ):
    """ Parse the Wubi table out of the scim-tables tarball at path. """
    # def instead of a lambda assigned to a name (PEP 8, E731); unchanged behavior.
    return tablesParser( path, 'Wubi.txt.in' )
def zrmParser( path ):
    """ Parse the Ziranma table out of the scim-tables tarball at path. """
    # def instead of a lambda assigned to a name (PEP 8, E731); unchanged behavior.
    return tablesParser( path, 'Ziranma.txt.in' )
def phraseParser( path ):
    """ Read phrase_lib.txt and parse it.

    path -- local path of the scim-pinyin tarball
    """
    # Removed the redundant 'global SCIM_PINYIN_VER' (the constant is only
    # read, never assigned) and the unused local 'dst'.
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )
113 def tsiParser( path
):
114 """ Read tsi.src and parse it. """
115 src
= 'libtabe/tsi-src/tsi.src'
117 fp
= untargz( path
, src
, 'big5hkscs' )
118 return parserCore( fp
, 0 )
120 def unihanParser( path
):
121 """ Read Unihan_Variants.txt and parse it. """
122 fp
= unzip( path
, 'Unihan_Variants.txt', 'U8' )
126 if line
.startswith( '#' ):
132 type = elems
.pop( 1 )
133 elems
= unichr2( *elems
)
134 if type == 'kTraditionalVariant':
135 s2t
[elems
[0]] = elems
[1:]
136 elif type == 'kSimplifiedVariant':
137 t2s
[elems
[0]] = elems
[1:]
141 def applyExcludes( mlist
, path
):
142 """ Apply exclude rules from path to mlist. """
143 if pyversion
[:1] in ['2']:
144 excludes
= open( path
, 'rb', 'U8' ).read().split()
146 excludes
= open( path
, 'r', encoding
= 'U8' ).read().split()
147 excludes
= [word
.split( '#' )[0].strip() for word
in excludes
]
148 excludes
= '|'.join( excludes
)
149 excptn
= re
.compile( '.*(?:%s).*' % excludes
)
150 diff
= [mword
for mword
in mlist
if excptn
.search( mword
)]
151 mlist
.difference_update( diff
)
154 def charManualTable( path
):
155 fp
= open( path
, 'r', encoding
= 'U8' )
158 elems
= line
.split( '#' )[0].split( '|' )
159 elems
= unichr3( *elems
)
161 ret
[elems
[0]] = elems
[1:]
164 def toManyRules( src_table
):
166 if pyversion
[:1] in ['2']:
167 for ( f
, t
) in src_table
.iteritems():
168 for i
in range( 1, len( t
) ):
171 for ( f
, t
) in src_table
.items():
172 for i
in range( 1, len( t
) ):
176 def removeRules( path
, table
):
177 fp
= open( path
, 'r', encoding
= 'U8' )
180 elems
= line
.split( '=>' )
181 f
= t
= elems
[0].strip()
182 if len( elems
) == 2:
184 f
= f
.strip('"').strip("'")
185 t
= t
.strip('"').strip("'")
193 texcptn
= re
.compile( '^(?:%s)$' % '|'.join( texc
) )
194 if pyversion
[:1] in ['2']:
195 for (tmp_f
, tmp_t
) in table
.copy().iteritems():
196 if texcptn
.match( tmp_t
):
199 for (tmp_f
, tmp_t
) in table
.copy().items():
200 if texcptn
.match( tmp_t
):
204 def customRules( path
):
205 fp
= open( path
, 'r', encoding
= 'U8' )
208 elems
= line
.split( '#' )[0].split()
210 ret
[elems
[0]] = elems
[1]
def dictToSortedList( src_table, pos ):
    """ Return the table's ( key, value ) pairs as a list sorted by the
    element at index pos of each pair (0 = key, 1 = value). """
    def sort_key( pair ):
        return pair[pos]
    return sorted( src_table.items(), key = sort_key )
216 def translate( text
, conv_table
):
218 while i
< len( text
):
219 for j
in range( len( text
) - i
, 0, -1 ):
221 t
= conv_table
.get( f
)
223 text
= text
[:i
] + t
+ text
[i
:][j
:]
229 def manualWordsTable( path
, conv_table
, reconv_table
):
230 fp
= open( path
, 'r', encoding
= 'U8' )
232 wordlist
= [line
.split( '#' )[0].strip() for line
in fp
]
233 wordlist
= list( set( wordlist
) )
234 wordlist
.sort( key
= len, reverse
= True )
236 word
= wordlist
.pop()
237 new_word
= translate( word
, conv_table
)
238 rcv_word
= translate( word
, reconv_table
)
240 reconv_table
[word
] = word
241 reconv_table
[new_word
] = word
244 def defaultWordsTable( src_wordlist
, src_tomany
, char_conv_table
, char_reconv_table
):
245 wordlist
= list( src_wordlist
)
246 wordlist
.sort( key
= len, reverse
= True )
248 word_reconv_table
= {}
249 conv_table
= char_conv_table
.copy()
250 reconv_table
= char_reconv_table
.copy()
251 tomanyptn
= re
.compile( '(?:%s)' % '|'.join( src_tomany
) )
253 conv_table
.update( word_conv_table
)
254 reconv_table
.update( word_reconv_table
)
255 word
= wordlist
.pop()
256 new_word_len
= word_len
= len( word
)
257 while new_word_len
== word_len
:
259 test_word
= translate( word
, reconv_table
)
260 new_word
= translate( word
, conv_table
)
261 if not reconv_table
.get( new_word
) \
262 and ( test_word
!= word \
263 or ( tomanyptn
.search( word
) \
264 and word
!= translate( new_word
, reconv_table
) ) ):
265 word_conv_table
[word
] = new_word
266 word_reconv_table
[new_word
] = word
268 word
= wordlist
.pop()
271 new_word_len
= len(word
)
272 return word_reconv_table
def PHPArray( table ):
    """ Render ( key, value ) pairs as the newline-joined body of a PHP
    array literal; pairs with an empty key or value are skipped. """
    rendered = []
    for key, value in table:
        if key and value:
            rendered.append( "'%s' => '%s'," % ( key, value ) )
    return '\n'.join( rendered )
280 url
= 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
281 han_dest
= 'Unihan.zip'
282 download( url
, han_dest
)
284 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
285 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
286 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
287 download( url
, tbe_dest
)
289 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
290 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
291 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
292 download( url
, pyn_dest
)
294 # Get libtabe-$(LIBTABE_VER).tgz:
295 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
296 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
297 download( url
, lbt_dest
)
300 ( t2s_1tomany
, s2t_1tomany
) = unihanParser( han_dest
)
302 t2s_1tomany
.update( charManualTable( 'trad2simp.manual' ) )
303 s2t_1tomany
.update( charManualTable( 'simp2trad.manual' ) )
305 if pyversion
[:1] in ['2']:
306 t2s_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in t2s_1tomany
.iteritems()] )
307 s2t_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in s2t_1tomany
.iteritems()] )
309 t2s_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in t2s_1tomany
.items()] )
310 s2t_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in s2t_1tomany
.items()] )
312 s_tomany
= toManyRules( t2s_1tomany
)
313 t_tomany
= toManyRules( s2t_1tomany
)
316 t2s_1to1
= removeRules( 'trad2simp_noconvert.manual', t2s_1to1
)
317 s2t_1to1
= removeRules( 'simp2trad_noconvert.manual', s2t_1to1
)
319 # the supper set for word to word conversion
320 t2s_1to1_supp
= t2s_1to1
.copy()
321 s2t_1to1_supp
= s2t_1to1
.copy()
322 t2s_1to1_supp
.update( customRules( 'trad2simp_supp_set.manual' ) )
323 s2t_1to1_supp
.update( customRules( 'simp2trad_supp_set.manual' ) )
325 # word to word manual rules
326 t2s_word2word_manual
= manualWordsTable( 'simpphrases.manual', s2t_1to1_supp
, t2s_1to1_supp
)
327 t2s_word2word_manual
.update( customRules( 'toSimp.manual' ) )
328 s2t_word2word_manual
= manualWordsTable( 'tradphrases.manual', t2s_1to1_supp
, s2t_1to1_supp
)
329 s2t_word2word_manual
.update( customRules( 'toTrad.manual' ) )
331 # word to word rules from input methods
334 t_wordlist
.update( ezbigParser( tbe_dest
),
335 tsiParser( lbt_dest
) )
336 s_wordlist
.update( wubiParser( tbe_dest
),
337 zrmParser( tbe_dest
),
338 phraseParser( pyn_dest
) )
341 s_wordlist
= applyExcludes( s_wordlist
, 'simpphrases_exclude.manual' )
342 t_wordlist
= applyExcludes( t_wordlist
, 'tradphrases_exclude.manual' )
344 s2t_supp
= s2t_1to1_supp
.copy()
345 s2t_supp
.update( s2t_word2word_manual
)
346 t2s_supp
= t2s_1to1_supp
.copy()
347 t2s_supp
.update( t2s_word2word_manual
)
350 t2s_word2word
= defaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
351 t2s_word2word
.update( t2s_word2word_manual
)
352 s2t_word2word
= defaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
353 s2t_word2word
.update( s2t_word2word_manual
)
357 if pyversion
[:1] in ['2']:
358 t2s_1to1
= dict( [( f
, t
) for ( f
, t
) in t2s_1to1
.iteritems() if f
!= t
] )
360 t2s_1to1
= dict( [( f
, t
) for ( f
, t
) in t2s_1to1
.items() if f
!= t
] )
361 toHans
= dictToSortedList( t2s_1to1
, 0 ) + dictToSortedList( t2s_word2word
, 1 )
363 if pyversion
[:1] in ['2']:
364 s2t_1to1
= dict( [( f
, t
) for ( f
, t
) in s2t_1to1
.iteritems() if f
!= t
] )
366 s2t_1to1
= dict( [( f
, t
) for ( f
, t
) in s2t_1to1
.items() if f
!= t
] )
367 toHant
= dictToSortedList( s2t_1to1
, 0 ) + dictToSortedList( s2t_word2word
, 1 )
369 toCN
= dictToSortedList( customRules( 'toCN.manual' ), 1 )
371 toHK
= dictToSortedList( customRules( 'toHK.manual' ), 1 )
373 toSG
= dictToSortedList( customRules( 'toSG.manual' ), 1 )
375 toTW
= dictToSortedList( customRules( 'toTW.manual' ), 1 )
380 * Simplified / Traditional Chinese conversion tables
382 * Automatically generated using code and data in maintenance/language/zhtable/
383 * Do not modify directly!
388 $zh2Hant = array(\n'''
389 php
+= PHPArray( toHant
) \
390 + '\n);\n\n$zh2Hans = array(\n' \
391 + PHPArray( toHans
) \
392 + '\n);\n\n$zh2TW = array(\n' \
394 + '\n);\n\n$zh2HK = array(\n' \
396 + '\n);\n\n$zh2CN = array(\n' \
398 + '\n);\n\n$zh2SG = array(\n' \
402 if pyversion
[:1] in ['2']:
403 f
= open( os
.path
.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding
= 'utf8' )
405 f
= open( os
.path
.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering
= 4096, encoding
= 'utf8' )
406 print ('Writing ZhConversion.php ... ')
410 # Remove temporary files
411 print ('Deleting temporary files ... ')
412 os
.remove('EZ-Big.txt.in')
413 os
.remove('phrase_lib.txt')
415 os
.remove('Unihan_Variants.txt')
416 os
.remove('Wubi.txt.in')
417 os
.remove('Ziranma.txt.in')
420 if __name__
== '__main__':