2 # Run this script with Python 3 (historically nicknamed "Python 3000").
3 import tarfile
, zipfile
4 import os
, re
, shutil
, urllib
.request
# Release versions of the scim-tables and scim-pinyin source tarballs. They
# are interpolated into the download URLs, the local archive names and the
# archive member paths used further down in this script.
8 SCIM_TABLES_VER
= '0.5.9'
9 SCIM_PINYIN_VER
= '0.5.91'
def GetFileFromURL(url, dest):
    """Download `url` to the local path `dest` unless `dest` already exists.

    Args:
        url: Address of the resource to fetch.
        dest: Local file path to save to. An existing regular file at this
            path is treated as up to date and is left untouched.
    """
    if os.path.isfile(dest):
        print('File %s up to date.' % dest)
        # Stop here: otherwise the cached copy would be reported as current
        # and then fetched again anyway.
        return
    print('Downloading from [%s] ...' % url)
    urllib.request.urlretrieve(url, dest)
    print('Download complete.\n')
def GetFileFromZip(path):
    """Extract every member of the zip archive `path` into the current directory.

    Args:
        path: Path of the zip archive to unpack.
    """
    print('Extracting files from %s ...' % path)
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(path) as archive:
        archive.extractall()
def GetFileFromTar(path, member, rename):
    """Pull one member out of a gzip-compressed tar archive and save it as `rename`.

    The member is first extracted under the current directory with its full
    archive path, then moved to `rename`; finally the top-level directory
    that the extraction created is removed.

    Args:
        path: Path of the ``.tar.gz`` / ``.tgz`` archive.
        member: '/'-separated path of the wanted file inside the archive.
        rename: Destination path for the extracted file.
    """
    print('Extracting %s from %s ...' % (rename, path))
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(path, 'r:gz') as archive:
        archive.extract(member)
    shutil.move(member, rename)
    # Only the first path component needs removing; rmtree takes the rest.
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
def ReadBIG5File(dest):
    """Decode the Big5-HKSCS file `dest`, rewrite it in place as UTF-8, and return its text.

    Bytes that cannot be decoded become U+FFFD while reading (``errors='replace'``)
    and are then turned into newlines.

    Args:
        dest: Path of the file to convert; it is overwritten with the UTF-8 text.

    Returns:
        The decoded text, exactly as written back to `dest`.
    """
    print('Reading and decoding %s ...' % dest)
    with open(dest, 'r', encoding='big5hkscs', errors='replace') as f1:
        text = f1.read()
    text = text.replace('\ufffd', '\n')
    # Reopen only after the reader is closed: mode 'w' truncates the file.
    with open(dest, 'w', encoding='utf8') as f2:
        f2.write(text)
    return text
def ReadFile(dest):
    """Return the complete text of the UTF-8 encoded file `dest`.

    Args:
        dest: Path of the file to read.

    Returns:
        The decoded file content as one string.
    """
    print('Reading and decoding %s ...' % dest)
    with open(dest, 'r', encoding='utf8') as f:
        return f.read()
def ReadUnihanFile(dest):
    """Collect the simplified/traditional variant records from a Unihan data file.

    Lines starting with '#' are skipped. A line containing the tag
    ``kSimplifiedVariant`` contributes to the first list, a line containing
    ``kTraditionalVariant`` to the second; every other line is ignored.

    Args:
        dest: Path of the UTF-8 encoded data file.

    Returns:
        A tuple ``(t2s_code, s2t_code)``. Each is a list of ``(left, right)``
        pairs holding the stripped text found before and after the tag.
    """
    print('Reading and decoding %s ...' % dest)
    t2s_code = []
    s2t_code = []
    with open(dest, 'r', encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):
                # Comment line in the data file.
                continue
            elif 'kSimplifiedVariant' in line:
                temp = line.split('kSimplifiedVariant')
                t2s_code.append((temp[0].strip(), temp[1].strip()))
            elif 'kTraditionalVariant' in line:
                temp = line.split('kTraditionalVariant')
                s2t_code.append((temp[0].strip(), temp[1].strip()))
    return (t2s_code, s2t_code)
def RemoveRows(text, num):
    """Return `text` with its first `num` rows removed.

    A "row" is a run of non-newline characters together with all whitespace
    that follows it, so blank lines after a row are removed with it.

    Args:
        text: The text to trim.
        num: Number of leading rows to drop. As with ``re.sub``, 0 means
            "no limit" and therefore removes every row.

    Returns:
        The remaining text.
    """
    # Raw string keeps `\s` a regex escape; `count` is passed by keyword
    # because positional use is deprecated in recent Python versions.
    return re.sub(r'.*\s*', '', text, count=num)
def RemoveOneCharConv(text):
    """Return `text` with every line that holds a single character blanked out.

    A line matches when it consists of exactly one character, optionally
    followed by whitespace. The line break itself may remain, so callers that
    want the surviving words should split the result on whitespace.

    Args:
        text: Newline-separated entries.

    Returns:
        The text with single-character lines emptied.
    """
    preg = re.compile(r'^.\s*$', re.MULTILINE)
    return preg.sub('', text)
def ConvertToChar(code):
    """Turn a code point string such as ``U+4E00`` into the character it names.

    Everything from the first '<' onward is discarded, then the two-character
    prefix is dropped and the rest is parsed as a hexadecimal number.
    """
    head, _, _ = code.partition('<')
    hex_digits = head[2:]
    return chr(int(hex_digits, 16))
def GetDefaultTable(code_table):
    """Build a character mapping from pairs of code point strings.

    Args:
        code_table: Iterable of ``(f, t)`` pairs, where ``f`` is one code
            point string and ``t`` is a whitespace-separated list of code
            point strings, each in the form ConvertToChar() accepts.

    Returns:
        A dict mapping the character for ``f`` to the list of characters for
        the codes in ``t``. Pairs with an empty ``f`` or no codes in ``t``
        are skipped.
    """
    char_table = {}
    for (f, t) in code_table:
        if not f:
            continue
        to_chars = [ConvertToChar(code) for code in t.split()]
        if not to_chars:
            # An empty target list would break consumers that index it.
            continue
        char_table[ConvertToChar(f)] = to_chars
    return char_table
# GetManualTable: load a hand-maintained character table from the file `dest`.
# Each entry has the form CODE|CODE|...: the first code is the source
# character and the remaining codes are its targets. Characters 2..6 of every
# code are parsed as a hexadecimal code point (presumably "U+XXXX" notation --
# TODO confirm). The result is stored as char_table[source] = [targets].
# NOTE(review): `elem` and `char_table` are never bound in the lines visible
# here, and no return statement is visible although callers pass the result
# to dict.update().
96 def GetManualTable( dest
):
97 text
= ReadFile( dest
)
# Drop leading/trailing separators so the split below yields only codes.
101 elem
= elem
.strip('|')
# Split once: temp2[0] is the source code, temp2[1] the '|'-joined targets.
# An entry without any '|' would raise IndexError at temp2[1].
103 temp2
= elem
.split( '|', 1 )
104 from_char
= chr( int( temp2
[0][2:7], 16 ) )
105 to_chars
= [chr( int( code
[2:7], 16 ) ) for code
in temp2
[1].split('|')]
106 char_table
[from_char
] = to_chars
def GetValidTable(src_table):
    """Reduce a one-to-many table to one-to-one by keeping the first target.

    Args:
        src_table: Dict mapping each key to a non-empty sequence of targets.

    Returns:
        A new dict mapping each key to the first element of its sequence.
    """
    return {f: t[0] for f, t in src_table.items()}
def GetToManyRules(src_table):
    """Collect every target that is not the first choice of its source.

    Args:
        src_table: Dict mapping each key to a sequence of targets.

    Returns:
        A dict whose keys are all targets found at index 1 or later in any
        sequence; every value is ``True``.
    """
    return {alt: True for t in src_table.values() for alt in t[1:]}
# RemoveRules: apply the rules read from the file `dest` to the mapping `table`.
# Each rule item `elem` is stripped of whitespace and of both kinds of quote
# characters, then classified by where '=>' appears:
#   '=>X'  -> only a target `t` is given
#   'X=>'  -> only a source `f` is given
#   other  -> the item is split on '=>' into `temp2`
# NOTE(review): how `text` is split into `elem` items, what is done with
# `f`, `t` and `temp2`, the body of the final loop, and the return value are
# not visible here. Callers rebind the table to the result.
122 def RemoveRules( dest
, table
):
123 text
= ReadFile( dest
)
128 elem
= elem
.strip().replace( '"', '' ).replace( '\'', '' )
130 if elem
.startswith( '=>' ):
131 t
= elem
.replace( '=>', '' ).strip()
132 elif elem
.endswith( '=>' ):
133 f
= elem
.replace( '=>', '' ).strip()
135 temp2
= elem
.split( '=>' )
# Iterates over a snapshot of `table`, which allows the original to be
# modified during the loop (presumably entries are deleted -- TODO confirm).
151 for temp_f
, temp_t
in table
.copy().items():
def DictToSortedList1(src_table):
    """Return the ``(key, value)`` pairs of `src_table` as a list ordered by key."""
    pairs = list(src_table.items())
    pairs.sort(key=lambda pair: pair[0])
    return pairs
def DictToSortedList2(src_table):
    """Return the ``(key, value)`` pairs of `src_table` as a list ordered by value."""
    pairs = list(src_table.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs
# Converter: rewrite `string` using the lookup dict `conv_table`.
# Scans positions `i` across the string; at each position candidate lengths
# `j` are tried from the longest possible down to 1, and a hit `t` from
# conv_table replaces the `j` characters starting at `i`.
# NOTE(review): the initialisation and advancement of `i`, the binding of the
# lookup key `f` (presumably the j-character slice at i -- TODO confirm), the
# guard around the replacement, and the return statement are not visible here.
162 def Converter( string
, conv_table
):
# len(string) is re-evaluated each pass because replacements can change it.
164 while i
< len(string
):
# Longest candidate first, so longer table entries take precedence.
165 for j
in range(len(string
) - i
, 0, -1):
167 t
= conv_table
.get( f
)
# Splice: text before i + replacement + text after the j matched characters.
169 string
= string
[:i
] + t
+ string
[i
:][j
:]
# GetDefaultWordsTable: derive word-level conversion rules from a word list.
#   src_wordlist      - words to examine (duplicates are ignored)
#   src_tomany        - dict consulted with .get(char) to set `rvt_test`
#   char_conv_table   - character-level table merged into `conv_table`
#   char_reconv_table - table merged into `reconv_table` (the reverse direction)
# Returns `word_reconv_table`, which maps each converted word back to the
# source word it came from.
# NOTE(review): the creation of `word_conv_table`, `conv_table` and
# `reconv_table`, the enclosing loop(s), the iteration that binds `char`, and
# the branches between the visible statements are not shown here; the
# comments below describe only what the visible statements do.
175 def GetDefaultWordsTable( src_wordlist
, src_tomany
, char_conv_table
, char_reconv_table
):
# De-duplicate, then sort longest-first so that pop() hands out the shortest
# remaining word.
176 wordlist
= list( set( src_wordlist
) )
177 wordlist
.sort( key
= len, reverse
= True )
179 word_reconv_table
= {}
# Refresh the working tables. The character-level entries are applied last
# and therefore override word-level entries that share a key.
183 conv_table
.update( word_conv_table
)
184 conv_table
.update( char_conv_table
)
185 reconv_table
.update( word_reconv_table
)
186 reconv_table
.update( char_reconv_table
)
187 word
= wordlist
.pop()
# Handle one batch of equal-length words before the tables are refreshed.
188 new_word_len
= word_len
= len(word
)
189 while new_word_len
== word_len
:
# rvt_test turns truthy once any `char` has an entry in src_tomany.
192 rvt_test
= rvt_test
or src_tomany
.get(char
)
# test_word: the word pushed through the reverse table;
# new_word: the word pushed through the forward table.
193 test_word
= Converter( word
, reconv_table
)
194 new_word
= Converter( word
, conv_table
)
# Only consider a rule when the converted word has no reverse entry yet.
195 if not reconv_table
.get( new_word
):
# The reverse table would alter the source word itself: record an explicit rule.
196 if not test_word
== word
:
197 word_conv_table
[word
] = new_word
198 word_reconv_table
[new_word
] = word
# Round-trip check: converting new_word back does not reproduce the source
# word, so an explicit rule is recorded.
200 rvt_word
= Converter( new_word
, reconv_table
)
201 if not rvt_word
== word
:
202 word_conv_table
[word
] = new_word
203 word_reconv_table
[new_word
] = word
# NOTE(review): pop() raises IndexError on an empty list; any guard for that
# is not visible here.
205 word
= wordlist
.pop()
208 new_word_len
= len(word
)
209 return word_reconv_table
# GetManualWordsTable: build a reverse lookup for manually listed words.
# Each input item is cut at its first '#' and stripped; the words are then
# de-duplicated and sorted longest-first. A `word` is run through Converter()
# with `conv_table` and recorded as reconv_table[converted] = word.
# NOTE(review): the creation of `reconv_table`, the loop around pop(), and
# the return statement are not visible here; callers call .update() on the
# result.
211 def GetManualWordsTable( src_wordlist
, conv_table
):
# Keep only the part before a '#' (treated as a trailing remark) on each item.
212 src_wordlist
= [items
.split('#')[0].strip() for items
in src_wordlist
]
213 wordlist
= list( set( src_wordlist
) )
214 wordlist
.sort( key
= len, reverse
= True )
# pop() takes from the end of the list, i.e. the shortest remaining word.
217 word
= wordlist
.pop()
218 new_word
= Converter( word
, conv_table
)
219 reconv_table
[new_word
] = word
# CustomRules: read the rule file `dest` and pair up consecutive items of
# `temp` into a dict (temp[0] -> temp[1], temp[2] -> temp[3], ...).
# NOTE(review): how `temp` is derived from `text` is not visible here
# (presumably text.split() -- TODO confirm), and no return statement is
# visible although callers use the result as a dict.
222 def CustomRules( dest
):
223 text
= ReadFile( dest
)
# An odd number of items would raise IndexError at temp[i + 1].
225 ret
= {temp
[i
]: temp
[i
+ 1] for i
in range( 0, len( temp
), 2 )}
def GetPHPArray(table):
    """Render ``(source, target)`` pairs as the body of a PHP array literal.

    Each pair becomes one ``'source' => 'target',`` line; the lines are joined
    with newlines and carry no trailing newline.
    """
    entry_format = '\'%s\' => \'%s\','
    rendered = []
    for source, target in table:
        rendered.append(entry_format % (source, target))
    return '\n'.join(rendered)
# RemoveSameChar: walks the (source, target) pairs of `src_table`.
# NOTE(review): the loop body and the return statement are not visible here.
# Judging by the name it presumably drops entries whose source and target are
# identical -- TODO confirm. Callers rebind the table to the result.
233 def RemoveSameChar( src_table
):
235 for f
, t
in src_table
.items():
# Download step: fetch each source archive into the current directory.
# GetFileFromURL() is given the local file name as its destination.
# NOTE(review): SF_MIRROR and LIBTABE_VER are not defined in the lines visible here.
# Get Unihan.zip:
242 url
= 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
243 han_dest
= 'Unihan.zip'
244 GetFileFromURL( url
, han_dest
)
246 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
247 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
248 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
249 GetFileFromURL( url
, tbe_dest
)
251 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
252 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
253 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
254 GetFileFromURL( url
, pyn_dest
)
256 # Get libtabe-$(LIBTABE_VER).tgz:
257 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
258 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
259 GetFileFromURL( url
, lbt_dest
)
261 # Extract the needed files from the compressed archives
# Each source below is extracted, reduced to one phrase per line, stripped of
# single-character lines, and appended to the traditional (t_wordlist) or
# simplified (s_wordlist) word list.
# NOTE(review): the creation of t_wordlist / s_wordlist and some `dst`
# assignments are not visible in this section.
263 # Unihan.txt Simp. & Trad
264 GetFileFromZip( han_dest
)
# EZ-Big.txt.in Trad
271 src
= 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
273 GetFileFromTar( tbe_dest
, src
, dst
)
274 text
= ReadFile( dst
)
# Keep only the rows between the BEGIN_TABLE and END_TABLE markers.
275 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
276 text
= text
.split( 'END_TABLE' )[0].strip()
# Drop everything up to and including the last tab on each row.
277 text
= re
.sub( '.*\t', '', text
)
278 text
= RemoveOneCharConv( text
)
279 t_wordlist
.extend( text
.split() )
# Wubi.txt.in Simp
282 src
= 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
284 GetFileFromTar( tbe_dest
, src
, dst
)
285 text
= ReadFile( dst
)
286 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
287 text
= text
.split( 'END_TABLE' )[0].strip()
# Keep the field between the last two tabs; the trailing digits are dropped.
288 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
289 text
= RemoveOneCharConv( text
)
290 s_wordlist
.extend( text
.split() )
292 # Ziranma.txt.in Simp
293 src
= 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
294 dst
= 'Ziranma.txt.in'
295 GetFileFromTar( tbe_dest
, src
, dst
)
296 text
= ReadFile( dst
)
297 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
298 text
= text
.split( 'END_TABLE' )[0].strip()
299 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
300 text
= RemoveOneCharConv( text
)
301 s_wordlist
.extend( text
.split() )
303 # phrase_lib.txt Simp
304 src
= 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
305 dst
= 'phrase_lib.txt'
306 GetFileFromTar( pyn_dest
, src
, dst
)
307 text
= ReadFile( 'phrase_lib.txt' )
# Keep the text before the last tab that is followed by a digit.
308 text
= re
.sub( '(.*)\t\d\d*.*', '\g<1>', text
)
# Skip the first five rows (presumably a file header -- TODO confirm).
309 text
= RemoveRows( text
, 5 )
310 text
= RemoveOneCharConv( text
)
311 s_wordlist
.extend( text
.split() )
# tsi.src Trad (Big5 encoded; converted to UTF-8 by ReadBIG5File)
314 src
= 'libtabe/tsi-src/tsi.src'
316 GetFileFromTar( lbt_dest
, src
, dst
)
317 text
= ReadBIG5File( 'tsi.src' )
# Remove every '# ' marker, then cut each row at the first space followed by a digit.
318 text
= re
.sub( ' \d.*', '', text
.replace('# ', ''))
319 text
= RemoveOneCharConv( text
)
320 t_wordlist
.extend( text
.split() )
322 # remove duplicate elements
323 t_wordlist
= list( set( t_wordlist
) )
324 s_wordlist
= list( set( s_wordlist
) )
# The two passes below join a word list into one newline-separated string and
# delete every line that contains an exclusion item `elem`.
# NOTE(review): the loops that bind `elem` from `text` are not visible here.
# `elem` is inserted into the pattern unescaped, so regex metacharacters in
# the exclusion files act as regex syntax. The pattern also requires a
# trailing '\n', so a match on the last line (which has none) is not removed.
326 # simpphrases_exclude.manual Simp
327 text
= ReadFile( 'simpphrases_exclude.manual' )
329 s_string
= '\n'.join( s_wordlist
)
331 s_string
= re
.sub( '.*%s.*\n' % elem
, '', s_string
)
332 s_wordlist
= s_string
.split('\n')
334 # tradphrases_exclude.manual Trad
335 text
= ReadFile( 'tradphrases_exclude.manual' )
337 t_string
= '\n'.join( t_wordlist
)
339 t_string
= re
.sub( '.*%s.*\n' % elem
, '', t_string
)
340 t_wordlist
= t_string
.split('\n')
342 # Make char to char conversion table
343 # Unihan.txt, lists t2s_code, s2t_code = [ ( 'U+XXXX', 'U+YYYY( U+ZZZZ) ... ' ), ... ]
344 ( t2s_code
, s2t_code
) = ReadUnihanFile( 'Unihan.txt' )
345 # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
# Manual entries are applied after the Unihan-derived ones, so they override
# any entry with the same source character.
# NOTE(review): the creation of t2s_1tomany / s2t_1tomany is not visible here.
347 t2s_1tomany
.update( GetDefaultTable( t2s_code
) )
348 t2s_1tomany
.update( GetManualTable( 'trad2simp.manual' ) )
351 s2t_1tomany
.update( GetDefaultTable( s2t_code
) )
352 s2t_1tomany
.update( GetManualTable( 'simp2trad.manual' ) )
353 # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
# s_tomany collects the non-first targets of the trad->simp table.
354 t2s_1to1
= GetValidTable( t2s_1tomany
)
355 s_tomany
= GetToManyRules( t2s_1tomany
)
356 # dict s2t_1to1; s2t_trans
357 s2t_1to1
= GetValidTable( s2t_1tomany
)
358 t_tomany
= GetToManyRules( s2t_1tomany
)
359 # remove noconvert rules
360 t2s_1to1
= RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1
)
361 s2t_1to1
= RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1
)
363 # Make word to word conversion table
# The *_supp tables are copies of the 1-to-1 character tables extended with
# supplementary rules; the plain 1-to-1 tables themselves stay unchanged.
364 t2s_1to1_supp
= t2s_1to1
.copy()
365 s2t_1to1_supp
= s2t_1to1
.copy()
366 # trad2simp_supp_set.manual
367 t2s_1to1_supp
.update( CustomRules( 'trad2simp_supp_set.manual' ) )
368 # simp2trad_supp_set.manual
369 s2t_1to1_supp
.update( CustomRules( 'simp2trad_supp_set.manual' ) )
# Manual simplified phrases: each is converted with the simp->trad table and
# stored in reverse, giving trad->simp word rules; toSimp.manual overrides them.
371 text
= ReadFile( 'simpphrases.manual' )
372 s_wordlist_manual
= text
.split('\n')
373 t2s_word2word_manual
= GetManualWordsTable(s_wordlist_manual
, s2t_1to1_supp
)
374 t2s_word2word_manual
.update( CustomRules( 'toSimp.manual' ) )
# Same for the manual traditional phrases, in the opposite direction.
376 text
= ReadFile( 'tradphrases.manual' )
377 t_wordlist_manual
= text
.split('\n')
378 s2t_word2word_manual
= GetManualWordsTable(t_wordlist_manual
, t2s_1to1_supp
)
379 s2t_word2word_manual
.update( CustomRules( 'toTrad.manual' ) )
# Combined character + manual-word tables, passed below as the reverse table.
381 s2t_supp
= s2t_1to1_supp
.copy()
382 s2t_supp
.update( s2t_word2word_manual
)
383 t2s_supp
= t2s_1to1_supp
.copy()
384 t2s_supp
.update( t2s_word2word_manual
)
# Derive the automatic word rules, then let the manual rules override them.
385 t2s_word2word
= GetDefaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
387 t2s_word2word
.update( t2s_word2word_manual
)
389 s2t_word2word
= GetDefaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
391 s2t_word2word
.update( s2t_word2word_manual
)
# Final tables: character rules sorted by source, followed by word rules
# sorted by target; the regional tables come straight from their manual files.
395 t2s_1to1
= RemoveSameChar( t2s_1to1
)
396 s2t_1to1
= RemoveSameChar( s2t_1to1
)
397 toHans
= DictToSortedList1( t2s_1to1
) + DictToSortedList2( t2s_word2word
)
399 toHant
= DictToSortedList1( s2t_1to1
) + DictToSortedList2( s2t_word2word
)
401 toCN
= DictToSortedList2( CustomRules( 'toCN.manual' ) )
403 toHK
= DictToSortedList2( CustomRules( 'toHK.manual' ) )
405 toSG
= DictToSortedList2( CustomRules( 'toSG.manual' ) )
407 toTW
= DictToSortedList2( CustomRules( 'toTW.manual' ) )
412 * Simplified / Traditional Chinese conversion tables
414 * Automatically generated using code and data in includes/zhtable/
415 * Do not modify directly!
418 $zh2Hant = array(\n'''
# Append the six PHP arrays to the `php` source text, in the order
# $zh2Hant, $zh2Hans, $zh2TW, $zh2HK, $zh2CN, $zh2SG. Each separator string
# closes the previous array and opens the next one.
# NOTE(review): the start of the `php` string, the closing of the last array,
# and the write/close of the output file are not visible here.
419 php
+= GetPHPArray( toHant
)
420 php
+= '\n);\n\n$zh2Hans = array(\n'
421 php
+= GetPHPArray( toHans
)
422 php
+= '\n);\n\n$zh2TW = array(\n'
423 php
+= GetPHPArray( toTW
)
424 php
+= '\n);\n\n$zh2HK = array(\n'
425 php
+= GetPHPArray( toHK
)
426 php
+= '\n);\n\n$zh2CN = array(\n'
427 php
+= GetPHPArray( toCN
)
428 php
+= '\n);\n\n$zh2SG = array(\n'
429 php
+= GetPHPArray( toSG
)
# The generated file is written as UTF-8 into the current directory.
432 f
= open( 'ZhConversion.php', 'w', encoding
= 'utf8' )
433 print ('Writing ZhConversion.php ... ')
437 if __name__
== '__main__':