More installation info. Bump alpha version.
[python/dscho.git] / Tools / scripts / gencodec.py
blob46563dfff9c287fdb0f6c99c6680ff0e3c090c24
1 """ Unicode Mapping Parser and Codec Generator.
3 This script parses Unicode mapping files as available from the Unicode
4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5 modules from them. The codecs use the standard character mapping codec
6 to actually apply the mapping.
8 Synopsis: gencodec.py dir codec_prefix
10 All files in dir are scanned and those producing non-empty mappings
11 will be written to <codec_prefix><mapname>.py with <mapname> being the
12 first part of the map's filename ('a' in a.b.c.txt) converted to
13 lowercase with hyphens replaced by underscores.
15 The tool also writes marshalled versions of the mapping tables to the
16 same location (with .mapping extension).
18 Written by Marc-Andre Lemburg (mal@lemburg.com).
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21 (c) Copyright Guido van Rossum, 2000.
23 """#"
25 import re,os,time,marshal
# Create numeric tables or character based ones ?
# When true, keyrepr() and unicoderepr() write codes as hex integer
# literals; otherwise they write them as character/string reprs.
numeric = 1

# Matches one data line of a Unicode mapping file:
#   group 1: encoded code(s) -- one or more 0x... values joined by '+'
#   group 2: Unicode code(s) and/or <meta> codes joined by '+' (may be empty)
#   group 3: optional trailing comment, including the leading '#'
# Raw strings keep the regex escapes (\+ and \s) away from Python's own
# string escape processing; the resulting pattern is unchanged.
# NOTE(review): 'a-fA-Z' in group 2 looks like a typo for 'a-fA-F'.  It is
# left alone because tightening it would make lines with bad hex digits
# parse as "undefined" instead of failing in parsecodes() -- confirm first.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes (e.g. '0x0041') joined
        by '+'.  In a combination, meta-codes (in angular brackets,
        e.g. <LR> and <RL>) and any other parts which are not valid
        hexadecimal numbers are ignored.

        Returns None for an empty codes string, an integer if exactly
        one code remains and a tuple of integers otherwise.

        Raises ValueError if codes consists of one single part which
        is not a valid hexadecimal number.

        The len, filter and range parameters appear to exist only to
        bind the builtins as local names; range is no longer used but
        is kept so that the signature stays the same.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # meta-code or empty part (e.g. after a trailing '+')
            values.append(None)
    # filter() returns an iterator on Python 3, so build a real list
    # before asking for its length.
    values = list(filter(lambda x: x is not None, values))
    if len(values) == 1:
        return values[0]
    return tuple(values)
def readmap(filename):

    """ Parses the Unicode mapping file filename.

        Returns a dictionary which maps encoded codes (integers, or
        tuples of integers for code sequences) to (unicode, comment)
        tuples.  unicode is the result of parsecodes() for the Unicode
        column (None if that column is empty) and comment is the text
        following the '#' on that line ('' if there is none).

        Lines which are empty, start with '#' or do not match mapRE
        are skipped.  If a code is listed more than once, the last
        line wins.

        If, for the codes below 256, there are at least as many
        identity mappings as codes missing from the file, the identity
        mappings are left out, every missing code is mapped to
        (None, "") and the special key 'IDENTITY' is set to 256.

        Raises ValueError if parsecodes() cannot convert a code.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        f.close()
    enc2uni = {}
    # identity-mapped codes below 256 -> comment
    identity = {}
    # Codes below 256 not seen in the file so far.  This has to be a
    # real list because entries are removed from it while parsing.
    unmapped = list(range(256))
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            # strip the leading '#'
            comment = comment[1:]
        # Code sequences (tuples) take no part in the single code
        # identity/unmapped bookkeeping.
        if not isinstance(enc, tuple) and enc < 256:
            # A code may be listed more than once; only its first
            # occurrence is still in unmapped.
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity[enc] = comment
                # drop an earlier explicit entry for the same code
                if enc in enc2uni:
                    del enc2uni[enc]
            else:
                enc2uni[enc] = (uni,comment)
                # drop an earlier identity entry for the same code
                if enc in identity:
                    del identity[enc]
        else:
            enc2uni[enc] = (uni,comment)
    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first and add
    # explicit mappings to None for the rest
    if len(identity)>=len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
    else:
        # No identity dictionary will be generated, so the identity
        # mappings have to be listed explicitly or they would be lost.
        for enc in identity.keys():
            enc2uni[enc] = (enc, identity[enc])

    return enc2uni
def hexrepr(t):

    """ Returns t formatted as Python source text.

        None gives 'None', a single integer gives a hex literal which
        is zero padded to at least four digits (e.g. '0x0041') and a
        sequence of integers gives the hex literals of its items,
        separated by ', ' and enclosed in parentheses.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # no length: t is a single code
        return '0x%04x' % t
    return '(' + ', '.join(['0x%04x' % code for code in t]) + ')'
def unicoderepr(t):

    """ Returns the source text used for the Unicode value t.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise it is the repr() of the
        Unicode character with code t, or, for a sequence of codes,
        the repr() of the string made up of those characters.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # no length: t is a single code
        return repr(unichr(t))
    return repr(''.join(map(unichr, t)))
def keyrepr(t):

    """ Returns the source text used for the encoded key t.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise a single code gives the
        repr() of the corresponding character (an 8-bit character for
        codes below 256, a Unicode character for all others) and a
        sequence of codes gives the repr() of the string built by
        applying chr() to every code.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # no length: t is a single code
        if t < 256:
            return repr(chr(t))
        return repr(unichr(t))
    return repr(''.join(map(chr, t)))
def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map is a dictionary with encoded codes as keys; the values are
        (unicode, comment) tuples or plain unicode values (which are
        treated as having no comment).  If map contains the key
        'IDENTITY', the generated decoding map starts out as
        codecs.make_identity_dict(range(map['IDENTITY'])) and is then
        updated with the remaining entries.

        Comments are included in the source, if comments is true (default).

        The entries are written in sorted key order, at most 4096 per
        dictionary literal.

        Note: an 'IDENTITY' entry is removed from map in place, so
        callers which use the dictionary again afterwards (convertdir()
        does) see it without that entry.

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright 2000 Guido van Rossum.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map
''' % name,
        ]

    # closer is the text which ends the dictionary literal currently
    # being written: '}' for the plain assignment, '})' for an
    # update() call.
    if "IDENTITY" in map:
        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
                 % map["IDENTITY"])
        l.append("decoding_map.update({")
        closer = '})'
        del map["IDENTITY"]
    else:
        l.append("decoding_map = {")
        closer = '}'

    # Integer keys sort before tuple keys, which is the order Python 2
    # gives for mixed keys; the explicit sort key keeps that order
    # where integers and tuples cannot be compared with each other.
    mappings = sorted(map.items(),
                      key=lambda item: (isinstance(item[0], tuple), item[0]))
    append = l.append
    count = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            # plain value without a comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            append(closer)
            append('decoding_map.update({')
            closer = '})'
            count = 0
    append(closer)
    append('''
### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)
''')
    return '\n'.join(l)
def pymap(name,map,pyfile,comments=1):

    """ Writes the Python source which codegen(name,map,comments)
        returns to the file pyfile.

        The file is opened in 'w' mode and is closed again even if
        writing fails.

    """
    code = codegen(name,map,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        f.close()
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        name is not used.  Values which unpack into two items are
        stored as (unicode, comment) tuples.  Values which cannot be
        unpacked at all (such as the integer stored under the
        'IDENTITY' key by readmap()) are stored unchanged instead of
        raising a TypeError.

        The file is opened in 'wb' mode and is closed again even if
        marshalling fails.

    """
    d = {}
    for e,value in map.items():
        try:
            (u,c) = value
        except TypeError:
            # not a pair, e.g. the IDENTITY marker
            d[e] = value
        else:
            d[e] = (u,c)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        f.close()
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        The codec name for a file is derived from its file name:
        hyphens are replaced by underscores, everything from the first
        '.' on is dropped and the rest is converted to lowercase.  The
        generated module is written to prefix + name + '.py' and the
        marshalled mapping to prefix + name + '.mapping'.  comments is
        passed on to pymap().

        Directory entries which are not files are skipped.  A file
        producing an empty mapping, or one for which the conversion
        raises ValueError, is reported on stdout; processing then
        continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir,mapname)
        if not os.path.isfile(mappathname):
            # e.g. a subdirectory: there is nothing to read
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        # print with one parenthesised argument gives the same output
        # as a statement and as a function call
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, prefix + codefile,comments)
                marshalmap(mapname, map, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec modules from the marshalled mappings in dir.

        Every file in dir whose name ends in '.mapping' is loaded with
        marshal and, unless the loaded mapping is empty, written as
        Python source to prefix + <name without '.mapping'> + '.py'.
        All other directory entries are ignored.  comments is passed
        on to pymap().

        A file for which loading or conversion raises EOFError or
        ValueError is reported on stdout; processing then continues
        with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        # print with one parenthesised argument gives the same output
        # as a statement and as a function call
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # NOTE: marshal is not safe against malformed or malicious
            # data; only use .mapping files from a trusted source.
            f = open(os.path.join(dir,mapname),'rb')
            try:
                map = marshal.load(f)
            finally:
                f.close()
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, prefix + codefile,comments)
        except (EOFError, ValueError) as why:
            # EOFError: truncated or empty marshal file
            print('* conversion failed: %s' % why)
if __name__ == '__main__':

    import sys
    # Command line: gencodec.py dir codec_prefix
    # The arguments are handed to the conversion function as they are.
    # Change the constant below to regenerate the modules from existing
    # .mapping files (rewritepythondir) instead of parsing map files.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])