# -*- coding: utf-8 -*-
#
# Downloads Unihan and SCIM/libtabe data and generates Simplified/Traditional
# Chinese conversion tables (written to ZhConversion.php at the bottom of the
# file).
# NOTE(review): this view of the file is missing many original lines; the
# gaps are marked below and should be checked against version control.

import tarfile, zipfile
import os, re, shutil, sys, platform

# Interpreter version string, e.g. '2.7.3' or '3.1.2'.
pyversion = platform.python_version()
# True on Linux; used to prefer wget (shows download progress) over
# urlretrieve when fetching files.
islinux = platform.system().lower() == 'linux' or False

if pyversion[:3] in ['2.5', '2.6', '2.7']:
    # Python 2: urllib provides urlretrieve directly.
    import urllib as urllib_request
    # NOTE(review): the enclosing helper definition (presumably a unichr
    # wrapper for narrow Unicode builds) is on lines missing from this view;
    # the fragment below encodes a supplementary-plane codepoint as a UTF-16
    # surrogate pair.  Confirm the full definition against version control.
    if sys.maxunicode >= 0x10000 or i < 0x10000:
        return unichr(0xD7C0+(i>>10)) + unichr(0xDC00+(i&0x3FF))
elif pyversion[:2] == '3.':
    # Python 3: urlretrieve lives in urllib.request.
    import urllib.request as urllib_request
# SourceForge mirror used for the scim/libtabe downloads below.
SF_MIRROR = 'easynews'
# Upstream tarball versions to fetch.
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
# NOTE(review): LIBTABE_VER is referenced later in this file but its
# assignment is not visible in this view -- presumably on a missing line here.
def GetFileFromURL( url, dest ):
    # Download url to the local file dest, skipping the download when the
    # file already exists.
    # NOTE(review): several original lines are missing from this view --
    # likely an early 'return' after the up-to-date message and an
    # 'if islinux:'/'else:' split between the wget and urlretrieve branches.
    # Confirm against version control.
    if os.path.isfile(dest):
        print( 'File %s up to date.' % dest )
    # we use wget instead urlretrieve under Linux,
    # because wget will display details like download progress
    os.system('wget %s' % url)
    print( 'Downloading from [%s] ...' % url )
    urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
def GetFileFromUnihan( path ):
    # Pull Unihan_Variants.txt out of the Unihan.zip archive at 'path' into
    # the current directory.
    print( 'Extracting files from %s ...' % path )
    text = zipfile.ZipFile(path).read('Unihan_Variants.txt')
    # uniopen is a unicode-aware open() chosen by the version check at the
    # top of the file (its definition is not visible in this view).
    uhfile = uniopen('Unihan_Variants.txt', 'w')
    # NOTE(review): the lines that write 'text' to 'uhfile' and close it are
    # missing from this view.
def GetFileFromTar( path, member, rename ):
    # Extract 'member' from the gzipped tarball at 'path', move it to
    # 'rename' in the current directory, then delete the extracted tree.
    print( 'Extracting %s from %s ...' % (rename, path) )
    tarfile.open(path, 'r:gz').extract(member)
    shutil.move(member, rename)
    # Top-level directory the member was extracted under.
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
def ReadBIG5File( dest ):
    # Re-encode 'dest' from Big5-HKSCS to UTF-8 in place; undecodable bytes
    # become line breaks so they act as entry separators.
    print( 'Reading and decoding %s ...' % dest )
    f1 = uniopen( dest, 'r', encoding='big5hkscs', errors='replace' )
    # NOTE(review): the line reading f1 into 'text' (and closing f1) is
    # missing from this view.
    # errors='replace' above marks undecodable input with U+FFFD; split on it.
    text = text.replace( '\ufffd', '\n' )
    f2 = uniopen( dest, 'w', encoding='utf8' )
    # NOTE(review): the write/close/return lines are missing from this view.
# NOTE(review): the enclosing 'def' header (presumably ReadFile( dest )) is
# on a line missing from this view; these statements open 'dest' as UTF-8
# text via the version-dependent uniopen helper.
    print( 'Reading and decoding %s ...' % dest )
    f = uniopen( dest, 'r', encoding='utf8' )
def ReadUnihanFile( dest ):
    # Parse Unihan_Variants.txt into two lists of (codepoint, variants)
    # string pairs: kSimplifiedVariant rows feed t2s_code and
    # kTraditionalVariant rows feed s2t_code.
    print( 'Reading and decoding %s ...' % dest )
    f = uniopen( dest, 'r', encoding='utf8' )
    # NOTE(review): the lines reading the file, initialising t2s_code /
    # s2t_code and opening the per-line loop (and the comment-skip body) are
    # missing from this view.
    if line.startswith('#'):
    elif not line.find('kSimplifiedVariant') == -1:
        # Row shape: 'U+XXXX<TAB>kSimplifiedVariant<TAB>U+YYYY ...'.
        temp = line.split('kSimplifiedVariant')
        t2s_code.append( ( temp[0].strip(), temp[1].strip() ) )
    elif not line.find('kTraditionalVariant') == -1:
        temp = line.split('kTraditionalVariant')
        s2t_code.append( ( temp[0].strip(), temp[1].strip() ) )
    return ( t2s_code, s2t_code )
def RemoveRows( text, num ):
    """Strip the first *num* lines (each with its trailing whitespace) from
    *text* and return the remainder.

    Fixes: the pattern is now a raw string (the non-raw ``'\\s'`` escape is a
    SyntaxWarning on modern Python), the count is passed by keyword (its
    positional form is deprecated), and the result is returned explicitly --
    callers use ``text = RemoveRows(text, 5)``.
    """
    text = re.sub( r'.*\s*', '', text, count = num )
    return text
def RemoveOneCharConv( text ):
    """Delete every line of *text* that holds exactly one character and
    return the cleaned text.

    Single characters are covered by the char-to-char tables, so the word
    lists drop them.  Fixes: raw-string pattern (avoids the invalid-escape
    warning on ``'\\s'``) and an explicit return -- callers reassign the
    result (``text = RemoveOneCharConv(text)``).
    """
    one_char_line = re.compile( r'^.\s*$', re.MULTILINE )
    text = one_char_line.sub( '', text )
    return text
def ConvertToChar( code ):
    """Map a Unihan codepoint token such as 'U+4E00' to its character.

    An optional '<...' annotation tail is discarded before the hex digits
    after the 'U+' prefix are decoded via the module's unichr2 helper.
    """
    codepoint = code.split('<')[0]
    return unichr2( int( codepoint[2:], 16 ) )
def GetDefaultTable( code_table ):
    # Build { from_char: [to_char, ...] } from Unihan (codepoint, variants)
    # string pairs as produced by ReadUnihanFile.
    # NOTE(review): the result-dict initialisation and the return are on
    # lines missing from this view.
    for ( f, t ) in code_table:
        # NOTE(review): a guard line between the loop header and the body
        # appears to be missing from this view.
        from_char = ConvertToChar( f )
        # The variants field is a space-separated list of 'U+XXXX' tokens.
        to_chars = [ConvertToChar( code ) for code in t.split()]
        char_table[from_char] = to_chars
def GetManualTable( dest ):
    # Build { from_char: [to_char, ...] } from a manual table file whose
    # rows look like '|U+XXXXX|U+YYYYY|U+ZZZZZ|'.
    text = ReadFile( dest )
    # NOTE(review): the lines splitting 'text' into rows, initialising the
    # result dict and opening the per-row loop are missing from this view.
    elem = elem.strip('|')
    # First field is the source codepoint, the rest are its targets.
    temp2 = elem.split( '|', 1 )
    # 'U+XXXXX' -> character; unichr2 (defined near the top of the file)
    # handles codepoints outside the BMP.
    from_char = unichr2( int( temp2[0][2:7], 16 ) )
    to_chars = [unichr2( int( code[2:7], 16 ) ) for code in temp2[1].split('|')]
    char_table[from_char] = to_chars
def GetValidTable( src_table ):
    # Collapse a one-to-many table to one-to-one by keeping only the first
    # conversion candidate for each source char.
    # NOTE(review): the initialisation and return of 'valid_table' are on
    # lines missing from this view.
    for f, t in src_table.items():
        valid_table[f] = t[0]
def GetToManyRules( src_table ):
    # Mark every char that appears as a non-first conversion candidate,
    # i.e. a target reachable only through a one-to-many rule.
    # NOTE(review): the initialisation and return of 'tomany_table' are on
    # lines missing from this view.
    for f, t in src_table.items():
        # Candidates after index 0 are the "to-many" targets.
        for i in range(1, len(t)):
            tomany_table[t[i]] = True
def RemoveRules( dest, table ):
    # Drop conversion rules listed in the manual no-convert file 'dest' from
    # 'table'.  Rules are written as 'from => to'; either side may be empty.
    # NOTE(review): many original lines are missing from this view (the row
    # iteration, the matching logic against 'table' and the return value) --
    # confirm the full body against version control before relying on this.
    text = ReadFile( dest )
    # Normalise a rule row: trim whitespace and strip both quote styles.
    elem = elem.strip().replace( '"', '' ).replace( '\'', '' )
    if elem.startswith( '=>' ):
        # Only a target given: remove rules converting *to* this char.
        t = elem.replace( '=>', '' ).strip()
    elif elem.endswith( '=>' ):
        # Only a source given: remove rules converting *from* this char.
        f = elem.replace( '=>', '' ).strip()
    temp2 = elem.split( '=>' )
    # Iterate over a copy so entries can be deleted while scanning.
    for temp_f, temp_t in table.copy().items():
def DictToSortedList1( src_table ):
    """Return src_table's (key, value) pairs as a list sorted by key."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[0] )
    return pairs
def DictToSortedList2( src_table ):
    """Return src_table's (key, value) pairs as a list sorted by value."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[1] )
    return pairs
def Converter( string, conv_table ):
    # Convert 'string' using longest-match-first lookup in conv_table.
    # NOTE(review): key lines are missing from this view (initialising the
    # scan index 'i', slicing the candidate substring 'f', testing the
    # lookup result and advancing the index / returning), so only the
    # visible skeleton is annotated here -- confirm against version control.
    while i < len(string):
        # Try the longest remaining substring first, shrinking one char at
        # a time towards a single character.
        for j in range(len(string) - i, 0, -1):
            t = conv_table.get( f )
            # Splice the replacement over the matched span.
            string = string[:i] + t + string[i:][j:]
def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    # Derive a word-level reconversion table: keep only words whose round
    # trip through the char tables is ambiguous, i.e. where char-by-char
    # conversion would not restore the original word.
    # NOTE(review): several original lines are missing from this view
    # (initialisation of word_conv_table / conv_table / reconv_table, parts
    # of the outer loop over word lengths and the per-char loop), so the
    # nesting below is a best-effort reconstruction -- confirm against
    # version control.
    wordlist = list( set( src_wordlist ) )
    # Longest words first so longest-match conversion sees them first.
    wordlist.sort( key = len, reverse = True )
    word_reconv_table = {}
    # Char rules are layered on top of the word rules accumulated so far.
    conv_table.update( word_conv_table )
    conv_table.update( char_conv_table )
    reconv_table.update( word_reconv_table )
    reconv_table.update( char_reconv_table )
    word = wordlist.pop()
    new_word_len = word_len = len(word)
    # Process all words of the current length as one batch.
    while new_word_len == word_len:
        # Does any char of the word take part in a one-to-many rule?
        rvt_test = rvt_test or src_tomany.get(char)
        test_word = Converter( word, reconv_table )
        new_word = Converter( word, conv_table )
        if not reconv_table.get( new_word ):
            if not test_word == word:
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            # NOTE(review): a guard (likely involving rvt_test) around the
            # round-trip check below appears to be on missing lines.
            rvt_word = Converter( new_word, reconv_table )
            if not rvt_word == word:
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
        word = wordlist.pop()
        new_word_len = len(word)
    return word_reconv_table
def GetManualWordsTable( src_wordlist, conv_table ):
    # Build { converted_word: original_word } for a manually curated list.
    # Strip trailing '#' comments from each entry.
    src_wordlist = [items.split('#')[0].strip() for items in src_wordlist]
    wordlist = list( set( src_wordlist ) )
    wordlist.sort( key = len, reverse = True )
    # NOTE(review): the initialisation of 'reconv_table', the loop over
    # 'wordlist' and the return are on lines missing from this view.
    word = wordlist.pop()
    new_word = Converter( word, conv_table )
    reconv_table[new_word] = word
def CustomRules( dest ):
    # Read a manual rules file and pair its whitespace-separated tokens up
    # as { from: to }.
    text = ReadFile( dest )
    # NOTE(review): the lines splitting 'text' into the token list 'temp'
    # and initialising / returning 'ret' are missing from this view.
    for i in range( 0, len( temp ), 2 ):
        ret[temp[i]] = temp[i + 1]
def GetPHPArray( table ):
    """Render (from, to) pairs as the body of a PHP array literal.

    One "'from' => 'to'," entry per line; pairs with an empty source or
    target are skipped.
    """
    entries = []
    for source, target in table:
        if source and target:
            entries.append( '\'%s\' => \'%s\',' % (source, target) )
    return '\n'.join( entries )
def RemoveSameChar( src_table ):
    # Drop identity mappings (from == to) from a conversion table.
    # NOTE(review): the result-dict initialisation, the filter body and the
    # return are on lines missing from this view.
    for f, t in src_table.items():
# NOTE(review): these top-level statements read like the body of a main
# driver whose 'def' header is on a line missing from this view.

# Get Unihan.zip, the Unicode Han-variant database.
url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
han_dest = 'Unihan.zip'
GetFileFromURL( url, han_dest )

# Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
GetFileFromURL( url, tbe_dest )

# Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
GetFileFromURL( url, pyn_dest )

# Get libtabe-$(LIBTABE_VER).tgz:
url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
GetFileFromURL( url, lbt_dest )

# Extract the needed files from the compressed archives.
# Unihan.txt Simp. & Trad
GetFileFromUnihan( han_dest )
# Harvest word lists from the input-method tables.  Traditional words feed
# t_wordlist, simplified words feed s_wordlist.
# NOTE(review): the 'dst = ...' assignments for the first two sources are on
# lines missing from this view.

# EZ-Big.txt.in: traditional-Chinese word list.
src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
GetFileFromTar( tbe_dest, src, dst )
text = ReadFile( dst )
# Keep only the body between the BEGIN_TABLE / END_TABLE markers.
text = text.split( 'BEGIN_TABLE' )[1].strip()
text = text.split( 'END_TABLE' )[0].strip()
# Drop the input-method key column (everything up to the tab).
text = re.sub( '.*\t', '', text )
text = RemoveOneCharConv( text )
t_wordlist.extend( text.split() )

# Wubi.txt.in: simplified-Chinese word list.
src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
GetFileFromTar( tbe_dest, src, dst )
text = ReadFile( dst )
text = text.split( 'BEGIN_TABLE' )[1].strip()
text = text.split( 'END_TABLE' )[0].strip()
# Keep the middle (word) column of 'key<TAB>word<TAB>frequency' rows.
text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
text = RemoveOneCharConv( text )
s_wordlist.extend( text.split() )

# Ziranma.txt.in Simp
src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
dst = 'Ziranma.txt.in'
GetFileFromTar( tbe_dest, src, dst )
text = ReadFile( dst )
text = text.split( 'BEGIN_TABLE' )[1].strip()
text = text.split( 'END_TABLE' )[0].strip()
text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text )
text = RemoveOneCharConv( text )
s_wordlist.extend( text.split() )

# phrase_lib.txt Simp
src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
dst = 'phrase_lib.txt'
GetFileFromTar( pyn_dest, src, dst )
text = ReadFile( 'phrase_lib.txt' )
# Keep the phrase column; drop the frequency and anything after it.
text = re.sub( '(.*)\t\d\d*.*', '\g<1>', text )
# Skip the header rows.
text = RemoveRows( text, 5 )
text = RemoveOneCharConv( text )
s_wordlist.extend( text.split() )

# tsi.src: traditional-Chinese phrases from libtabe (Big5-encoded).
src = 'libtabe/tsi-src/tsi.src'
GetFileFromTar( lbt_dest, src, dst )
text = ReadBIG5File( 'tsi.src' )
# Drop per-phrase frequency numbers and '# ' prefixes.
text = re.sub( ' \d.*', '', text.replace('# ', ''))
text = RemoveOneCharConv( text )
t_wordlist.extend( text.split() )

# remove duplicate elements
t_wordlist = list( set( t_wordlist ) )
s_wordlist = list( set( s_wordlist ) )
355 # simpphrases_exclude.manual Simp
356 text
= ReadFile( 'simpphrases_exclude.manual' )
358 s_string
= '\n'.join( s_wordlist
)
360 s_string
= re
.sub( '.*%s.*\n' % elem
, '', s_string
)
361 s_wordlist
= s_string
.split('\n')
363 # tradphrases_exclude.manual Trad
364 text
= ReadFile( 'tradphrases_exclude.manual' )
366 t_string
= '\n'.join( t_wordlist
)
368 t_string
= re
.sub( '.*%s.*\n' % elem
, '', t_string
)
369 t_wordlist
= t_string
.split('\n')
371 # Make char to char convertion table
372 # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
373 ( t2s_code
, s2t_code
) = ReadUnihanFile( 'Unihan_Variants.txt' )
374 # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
376 t2s_1tomany
.update( GetDefaultTable( t2s_code
) )
377 t2s_1tomany
.update( GetManualTable( 'trad2simp.manual' ) )
380 s2t_1tomany
.update( GetDefaultTable( s2t_code
) )
381 s2t_1tomany
.update( GetManualTable( 'simp2trad.manual' ) )
382 # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
383 t2s_1to1
= GetValidTable( t2s_1tomany
)
384 s_tomany
= GetToManyRules( t2s_1tomany
)
385 # dict s2t_1to1; s2t_trans
386 s2t_1to1
= GetValidTable( s2t_1tomany
)
387 t_tomany
= GetToManyRules( s2t_1tomany
)
388 # remove noconvert rules
389 t2s_1to1
= RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1
)
390 s2t_1to1
= RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1
)
392 # Make word to word convertion table
393 t2s_1to1_supp
= t2s_1to1
.copy()
394 s2t_1to1_supp
= s2t_1to1
.copy()
395 # trad2simp_supp_set.manual
396 t2s_1to1_supp
.update( CustomRules( 'trad2simp_supp_set.manual' ) )
397 # simp2trad_supp_set.manual
398 s2t_1to1_supp
.update( CustomRules( 'simp2trad_supp_set.manual' ) )
400 text
= ReadFile( 'simpphrases.manual' )
401 s_wordlist_manual
= text
.split('\n')
402 t2s_word2word_manual
= GetManualWordsTable(s_wordlist_manual
, s2t_1to1_supp
)
403 t2s_word2word_manual
.update( CustomRules( 'toSimp.manual' ) )
405 text
= ReadFile( 'tradphrases.manual' )
406 t_wordlist_manual
= text
.split('\n')
407 s2t_word2word_manual
= GetManualWordsTable(t_wordlist_manual
, t2s_1to1_supp
)
408 s2t_word2word_manual
.update( CustomRules( 'toTrad.manual' ) )
410 s2t_supp
= s2t_1to1_supp
.copy()
411 s2t_supp
.update( s2t_word2word_manual
)
412 t2s_supp
= t2s_1to1_supp
.copy()
413 t2s_supp
.update( t2s_word2word_manual
)
414 t2s_word2word
= GetDefaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
416 t2s_word2word
.update( t2s_word2word_manual
)
418 s2t_word2word
= GetDefaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
420 s2t_word2word
.update( s2t_word2word_manual
)
424 t2s_1to1
= RemoveSameChar( t2s_1to1
)
425 s2t_1to1
= RemoveSameChar( s2t_1to1
)
426 toHans
= DictToSortedList1( t2s_1to1
) + DictToSortedList2( t2s_word2word
)
428 toHant
= DictToSortedList1( s2t_1to1
) + DictToSortedList2( s2t_word2word
)
430 toCN
= DictToSortedList2( CustomRules( 'toCN.manual' ) )
432 toHK
= DictToSortedList2( CustomRules( 'toHK.manual' ) )
434 toSG
= DictToSortedList2( CustomRules( 'toSG.manual' ) )
436 toTW
= DictToSortedList2( CustomRules( 'toTW.manual' ) )
# Assemble ZhConversion.php: a PHP header followed by one array per table.
# NOTE(review): the assignment opening the PHP-header triple-quoted string
# (presumably php = '''<?php ...) is on lines missing from this view; the
# bare lines below are the visible tail of that string literal and no
# comments may be inserted among them.
* Simplified / Traditional Chinese conversion tables
* Automatically generated using code and data in includes/zhtable/
* Do not modify directly!
$zh2Hant = array(\n'''
php += GetPHPArray( toHant )
php += '\n);\n\n$zh2Hans = array(\n'
php += GetPHPArray( toHans )
php += '\n);\n\n$zh2TW = array(\n'
php += GetPHPArray( toTW )
php += '\n);\n\n$zh2HK = array(\n'
php += GetPHPArray( toHK )
php += '\n);\n\n$zh2CN = array(\n'
php += GetPHPArray( toCN )
php += '\n);\n\n$zh2SG = array(\n'
php += GetPHPArray( toSG )
# NOTE(review): the 'php += ...' closing the final array is on a line
# missing from this view.
f = uniopen( 'ZhConversion.php', 'w', encoding = 'utf8' )
print ('Writing ZhConversion.php ... ')
# NOTE(review): the f.write(php) / f.close() lines are missing from this view.
# Clean up the temporary files extracted above.
print ('Deleting temp files ... ')
os.remove('EZ.txt.in')
os.remove('phrase_lib.txt')
os.remove('Unihan_Variants.txt')
os.remove('Wubi.txt.in')
os.remove('Ziranma.txt.in')
# NOTE(review): removal of the remaining temp files (e.g. tsi.src) appears
# to be on lines missing from this view.

# Script entry point; the guarded call (presumably main()) is on a line
# missing from this view.
if __name__ == '__main__':