1 """ Unicode Mapping Parser and Codec Generator.
3 This script parses Unicode mapping files as available from the Unicode
4 site (ftp.unicode.org) and creates Python codec modules from them. The
5 codecs use the standard character mapping codec to actually apply the
8 Synopsis: gencodec.py dir codec_prefix
10 All files in dir are scanned and those producing non-empty mappings
11 will be written to <codec_prefix><mapname>.py with <mapname> being the
12 first part of the map's filename ('a' in a.b.c.txt) converted to
13 lowercase with hyphens replaced by underscores.
15 The tool also writes marshalled versions of the mapping tables to the
16 same location (with .mapping extension).
18 Written by Marc-Andre Lemburg (mal@lemburg.com).
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
24 import string
,re
,os
,time
,marshal
26 # Create numeric tables or character based ones ?
29 mapRE
= re
.compile('((?:0x[0-9a-fA-F]+\+?)+)'
31 '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
37 split
=string
.split
,atoi
=string
.atoi
,len=len,
38 filter=filter,range=range):
40 """ Converts code combinations to either a single code integer
41 or a tuple of integers.
43 meta-codes (in angular brackets, e.g. <LR> and <RL>) are
46 Empty codes or illegal ones are returned as None.
54 for i
in range(len(l
)):
59 l
= filter(lambda x
: x
is not None, l
)
69 f
= open(filename
,'r')
75 if not line
or line
[0] == '#':
79 #print '* not matched: %s' % repr(line)
81 enc
,uni
,comment
= m
.groups()
89 enc2uni
[enc
] = (uni
,comment
)
102 return '(' + join(map(lambda t
: '0x%04x' % t
, t
),', ') + ')'
116 return repr(unichr(t
))
117 return repr(join(map(unichr, t
),''))
134 return repr(unichr(t
))
135 return repr(join(map(chr, t
),''))
137 def codegen(name
,map,comments
=1):
139 """ Returns Python source for the given map.
141 Comments are included in the source, if comments is true (default).
146 """ Python Character Mapping Codec generated from '%s'.
148 Written by Marc-Andre Lemburg (mal@lemburg.com).
150 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
158 class Codec(codecs.Codec):
160 def encode(self,input,errors='strict'):
162 return codecs.charmap_encode(input,errors,encoding_map)
164 def decode(self,input,errors='strict'):
166 return codecs.charmap_decode(input,errors,decoding_map)
168 class StreamWriter(Codec,codecs.StreamWriter):
171 class StreamReader(Codec,codecs.StreamReader):
174 ### encodings module API
178 return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
185 mappings
= map.items()
190 for e
,value
in mappings
:
198 append('\t%s: %s,\t# %s' % (key
,unicoderepr(u
),c
))
200 append('\t%s: %s,' % (key
,unicoderepr(u
)))
203 # Split the definition into parts so that the Python
204 # parser doesn't dump core
209 append('map.update({')
220 for k,v in decoding_map.items():
223 return string
.join(l
,'\n')
225 def pymap(name
,map,pyfile
,comments
=1):
227 code
= codegen(name
,map,comments
)
232 def marshalmap(name
,map,marshalfile
):
235 for e
,(u
,c
) in map.items():
237 f
= open(marshalfile
,'wb')
241 def convertdir(dir,prefix
='',comments
=1):
243 mapnames
= os
.listdir(dir)
244 for mapname
in mapnames
:
245 name
= os
.path
.split(mapname
)[1]
246 name
= string
.replace(name
,'-','_')
247 name
= string
.split(name
, '.')[0]
248 name
= string
.lower(name
)
249 codefile
= name
+ '.py'
250 marshalfile
= name
+ '.mapping'
251 print 'converting %s to %s and %s' % (mapname
,
253 prefix
+ marshalfile
)
255 map = readmap(os
.path
.join(dir,mapname
))
257 print '* map is empty; skipping'
259 pymap(mapname
, map, prefix
+ codefile
,comments
)
260 marshalmap(mapname
, map, prefix
+ marshalfile
)
262 print '* conversion failed'
264 def rewritepythondir(dir,prefix
='',comments
=1):
266 mapnames
= os
.listdir(dir)
267 for mapname
in mapnames
:
268 if mapname
[-len('.mapping'):] != '.mapping':
270 codefile
= mapname
[:-len('.mapping')] + '.py'
271 print 'converting %s to %s' % (mapname
,
274 map = marshal
.load(open(os
.path
.join(dir,mapname
),
277 print '* map is empty; skipping'
279 pymap(mapname
, map, prefix
+ codefile
,comments
)
280 except ValueError, why
:
281 print '* conversion failed: %s' % why
283 if __name__
== '__main__':
287 apply(convertdir
,tuple(sys
.argv
[1:]))
289 apply(rewritepythondir
,tuple(sys
.argv
[1:]))