1 """ Unicode Mapping Parser and Codec Generator.
3 This script parses Unicode mapping files as available from the Unicode
4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5 modules from them. The codecs use the standard character mapping codec
6 to actually apply the mapping.
8 Synopsis: gencodec.py dir codec_prefix
10 All files in dir are scanned and those producing non-empty mappings
11 will be written to <codec_prefix><mapname>.py with <mapname> being the
12 first part of the map's filename ('a' in a.b.c.txt) converted to
13 lowercase with hyphens replaced by underscores.
15 The tool also writes marshalled versions of the mapping tables to the
16 same location (with .mapping extension).
18 Written by Marc-Andre Lemburg (mal@lemburg.com).
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21 (c) Copyright Guido van Rossum, 2000.
25 import re
,os
,time
,marshal
27 # Create numeric tables or character based ones ?
30 mapRE
= re
.compile('((?:0x[0-9a-fA-F]+\+?)+)'
32 '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
37 len=len, filter=filter,range=range):
39 """ Converts code combinations to either a single code integer
40 or a tuple of integers.
42 meta-codes (in angular brackets, e.g. <LR> and <RL>) are
45 Empty codes or illegal ones are returned as None.
53 for i
in range(len(l
)):
58 l
= filter(lambda x
: x
is not None, l
)
64 def readmap(filename
):
66 f
= open(filename
,'r')
76 if not line
or line
[0] == '#':
80 #print '* not matched: %s' % repr(line)
82 enc
,uni
,comment
= m
.groups()
94 enc2uni
[enc
] = (uni
,comment
)
96 enc2uni
[enc
] = (uni
,comment
)
97 # If there are more identity-mapped entries than unmapped entries,
98 # it pays to generate an identity dictionary first, and add explicit
99 # mappings to None for the rest
100 if len(identity
)>=len(unmapped
):
102 enc2uni
[enc
] = (None, "")
103 enc2uni
['IDENTITY'] = 256
115 return '(' + ', '.join(map(lambda t
: '0x%04x' % t
, t
)) + ')'
127 return repr(unichr(t
))
128 return repr(''.join(map(unichr, t
)))
143 return repr(unichr(t
))
144 return repr(''.join(map(chr, t
)))
146 def codegen(name
,map,comments
=1):
148 """ Returns Python source for the given map.
150 Comments are included in the source, if comments is true (default).
155 """ Python Character Mapping Codec generated from '%s' with gencodec.py.
157 Written by Marc-Andre Lemburg (mal@lemburg.com).
159 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
160 (c) Copyright 2000 Guido van Rossum.
168 class Codec(codecs.Codec):
170 def encode(self,input,errors='strict'):
172 return codecs.charmap_encode(input,errors,encoding_map)
174 def decode(self,input,errors='strict'):
176 return codecs.charmap_decode(input,errors,decoding_map)
178 class StreamWriter(Codec,codecs.StreamWriter):
181 class StreamReader(Codec,codecs.StreamReader):
184 ### encodings module API
188 return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
194 if map.has_key("IDENTITY"):
195 l
.append("decoding_map = codecs.make_identity_dict(range(%d))"
197 l
.append("decoding_map.update({")
201 l
.append("decoding_map = {")
204 mappings
= map.items()
208 for e
,value
in mappings
:
216 append('\t%s: %s,\t# %s' % (key
,unicoderepr(u
),c
))
218 append('\t%s: %s,' % (key
,unicoderepr(u
)))
221 # Split the definition into parts so that the Python
222 # parser doesn't dump core
227 append('decoding_map.update({')
237 encoding_map = codecs.make_encoding_map(decoding_map)
241 def pymap(name
,map,pyfile
,comments
=1):
243 code
= codegen(name
,map,comments
)
248 def marshalmap(name
,map,marshalfile
):
251 for e
,(u
,c
) in map.items():
253 f
= open(marshalfile
,'wb')
257 def convertdir(dir,prefix
='',comments
=1):
259 mapnames
= os
.listdir(dir)
260 for mapname
in mapnames
:
261 name
= os
.path
.split(mapname
)[1]
262 name
= name
.replace('-','_')
263 name
= name
.split('.')[0]
265 codefile
= name
+ '.py'
266 marshalfile
= name
+ '.mapping'
267 print 'converting %s to %s and %s' % (mapname
,
269 prefix
+ marshalfile
)
271 map = readmap(os
.path
.join(dir,mapname
))
273 print '* map is empty; skipping'
275 pymap(mapname
, map, prefix
+ codefile
,comments
)
276 marshalmap(mapname
, map, prefix
+ marshalfile
)
278 print '* conversion failed'
280 def rewritepythondir(dir,prefix
='',comments
=1):
282 mapnames
= os
.listdir(dir)
283 for mapname
in mapnames
:
284 if not mapname
.endswith('.mapping'):
286 codefile
= mapname
[:-len('.mapping')] + '.py'
287 print 'converting %s to %s' % (mapname
,
290 map = marshal
.load(open(os
.path
.join(dir,mapname
),
293 print '* map is empty; skipping'
295 pymap(mapname
, map, prefix
+ codefile
,comments
)
296 except ValueError, why
:
297 print '* conversion failed: %s' % why
299 if __name__
== '__main__':
303 apply(convertdir
,tuple(sys
.argv
[1:]))
305 apply(rewritepythondir
,tuple(sys
.argv
[1:]))