Bump version to 1.0.
[python/dscho.git] / Tools / scripts / gencodec.py
blob45b69b0abd1c19f8fd987117bd72e4d2e2f73dcd
1 """ Unicode Mapping Parser and Codec Generator.
3 This script parses Unicode mapping files as available from the Unicode
4 site (ftp.unicode.org) and creates Python codec modules from them. The
5 codecs use the standard character mapping codec to actually apply the
6 mapping.
8 Synopsis: gencodec.py dir codec_prefix
10 All files in dir are scanned and those producing non-empty mappings
11 will be written to <codec_prefix><mapname>.py with <mapname> being the
12 first part of the map's filename ('a' in a.b.c.txt) converted to
13 lowercase with hyphens replaced by underscores.
15 The tool also writes marshalled versions of the mapping tables to the
16 same location (with .mapping extension).
18 Written by Marc-Andre Lemburg (mal@lemburg.com).
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
22 """#"
24 import string,re,os,time,marshal
# Create numeric tables or character based ones ?
numeric = 1

# Pattern for one data line of a Unicode mapping file.
#   group 1: one or more '+'-joined hex codes in the source encoding
#   group 2: zero or more '+'-joined hex Unicode codes and/or meta-codes
#            in angular brackets (e.g. <LR>); may be empty
#   group 3: optional trailing comment, including the leading '#'
# Raw strings keep the regex escapes (\+ and \s) out of Python's own
# string-escape processing.  Both code columns accept hex digits only.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
def parsecodes(codes,
               split=None, atoi=int, len=len,
               filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x030A'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None.  This also
        holds when codes consists of a single meta-code or of
        meta-codes only.

        split and atoi may be given as callables taking
        (string, separator) and (string, base); by default the string's
        own split method and the builtin int are used.  filter and
        range are accepted for backward compatibility only and are not
        used.

    """
    if not codes:
        return None
    if split is None:
        parts = codes.split('+')
    else:
        parts = split(codes, '+')
    values = []
    for part in parts:
        try:
            values.append(atoi(part, 16))
        except ValueError:
            # Meta-code, empty fragment (e.g. after a trailing '+') or
            # otherwise illegal code: leave it out of the result.
            pass
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
def readmap(filename,
            strip=None):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoding codes to (unicode code, comment) tuples.

        Blank lines, lines starting with '#' and lines not matched by
        mapRE are skipped.  Codes are converted with parsecodes(); the
        comment is stored without its leading '#' ('' if there is
        none).  Entries whose encoding code equals the Unicode code
        are left out of the result.

        strip may be given as a callable taking a string; by default
        the string's own strip method is used.

    """
    f = open(filename,'r')
    try:
        lines = f.readlines()
    finally:
        # Release the file even if reading fails.
        f.close()
    enc2uni = {}
    for line in lines:
        if strip is None:
            line = line.strip()
        else:
            line = strip(line)
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if not comment:
            comment = ''
        else:
            # Drop the leading '#'.
            comment = comment[1:]
        if enc != uni:
            enc2uni[enc] = (uni,comment)
    return enc2uni
def hexrepr(t,
            join=None):

    """ Returns t formatted as Python source text.

        None gives 'None', a single integer gives '0x%04x' of it and
        a sequence of integers gives a parenthesized, comma separated
        list of such hex literals.

        join may be given as a callable taking (sequence, separator);
        by default the separator's own join method is used.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        return '0x%04x' % t
    items = ['0x%04x' % code for code in t]
    if join is None:
        body = ', '.join(items)
    else:
        body = join(items, ', ')
    return '(' + body + ')'
def unicoderepr(t,
                join=None):

    """ Returns the source text used for a Unicode side value t of
        the decoding map.

        None gives 'None'.  If the module level flag numeric is true,
        the value is formatted by hexrepr().  Otherwise the repr() of
        the corresponding Unicode character is returned, or of the
        concatenated characters if t is a sequence of codes.

        join may be given as a callable taking (sequence, separator);
        by default the separator's own join method is used.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        return repr(unichr(t))
    chars = map(unichr, t)
    if join is None:
        text = ''.join(chars)
    else:
        text = join(chars, '')
    return repr(text)
def keyrepr(t,
            join=None):

    """ Returns the source text used for an encoding side key t of
        the decoding map.

        None gives 'None'.  If the module level flag numeric is true,
        the key is formatted by hexrepr().  Otherwise a single code
        below 256 gives the repr() of the 8-bit character, a single
        code of 256 or above gives the repr() of the Unicode character
        and a sequence of codes gives the repr() of the concatenated
        8-bit characters.

        join may be given as a callable taking (sequence, separator);
        by default the separator's own join method is used.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        if t < 256:
            return repr(chr(t))
        else:
            return repr(unichr(t))
    chars = map(chr, t)
    if join is None:
        text = ''.join(chars)
    else:
        text = join(chars, '')
    return repr(text)
def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map is a dictionary mapping encoding codes to either
        (unicode code, comment) tuples or plain Unicode codes.

        Comments are included in the source, if comments is true (default).

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s'.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map

decoding_map = {
''' % name,
        ]
    mappings = list(map.items())
    mappings.sort()
    append = l.append
    i = 0
    splits = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            # Plain code without a comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i = i + 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            # Continue filling the dictionary defined by the generated
            # module; it is named decoding_map there.
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    # Close the initial dictionary literal or the last update() call
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
### Encoding Map

encoding_map = {}
for k,v in decoding_map.items():
    encoding_map[v] = k
''')
    return '\n'.join(l)
def pymap(name,map,pyfile,comments=1):

    """ Writes the Python source generated by codegen() for the given
        name, map and comments setting to the file pyfile.

    """
    code = codegen(name,map,comments)
    f = open(pyfile,'w')
    try:
        f.write(code)
    finally:
        # Release the file even if writing fails.
        f.close()
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        map must map keys to (unicode code, comment) pairs; the
        entries are copied into a new dictionary which is then dumped
        with marshal.  name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    f = open(marshalfile,'wb')
    try:
        marshal.dump(d,f)
    finally:
        # Release the file even if dumping fails.
        f.close()
def convertdir(dir,prefix='',comments=1):

    """ Converts every entry of the directory dir into a codec module
        and a marshalled mapping file.

        The output base name is the entry's file name with hyphens
        replaced by underscores, cut at the first '.' and lowercased;
        prefix is prepended to the resulting '.py' and '.mapping'
        names.  Entries producing an empty map are skipped.  A
        ValueError raised during conversion is reported and the next
        entry is processed.

    """
    for mapname in os.listdir(dir):
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0].lower()
        codefile = stem + '.py'
        marshalfile = stem + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(os.path.join(dir,mapname))
            if mapping:
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError:
            print('* conversion failed')
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec modules from the marshalled mapping files
        found in the directory dir.

        Only entries ending in '.mapping' are processed; the output
        name is prefix plus the entry name with '.mapping' replaced by
        '.py'.  Empty maps are skipped.  A ValueError raised during
        conversion is reported and the next entry is processed.

    """
    import sys
    suffix = '.mapping'
    for mapname in os.listdir(dir):
        if mapname[-len(suffix):] != suffix:
            continue
        codefile = mapname[:-len(suffix)] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # NOTE(review): marshal offers no protection against
            # malformed or malicious data; only run this on .mapping
            # files from a trusted source.
            f = open(os.path.join(dir,mapname),'rb')
            try:
                mapping = marshal.load(f)
            finally:
                # Release the file even if loading fails.
                f.close()
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
        except ValueError:
            # Fetch the exception via sys.exc_info() so that no
            # version specific "except ... , name" syntax is needed.
            why = sys.exc_info()[1]
            print('* conversion failed: %s' % why)
if __name__ == '__main__':

    import sys
    # All command line arguments are handed on positionally.
    args = tuple(sys.argv[1:])
    # Hard-wired switch: the first branch converts mapping files, the
    # second one regenerates modules from .mapping files.
    if 1:
        convertdir(*args)
    else:
        rewritepythondir(*args)
289 apply(rewritepythondir,tuple(sys.argv[1:]))