1 #!/usr/local/bin/python
""" Utility for parsing HTML entity definitions available from:

      http://www.w3.org/ as e.g.
      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from stdin, output is written to stdout in the form of a
    Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
# Matches one SGML entity declaration of the form
#
#     <!ENTITY name CDATA "charcode" -- comment -->
#
# and captures (name, charcode, comment).  The comment group is non-greedy
# and may span several lines.  The pattern is a raw string so that regex
# escapes such as \w reach the re module untouched instead of depending on
# Python passing unknown string escapes through.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
def parse(text, pos=0, endpos=None):
    """Collect all entity declarations found in *text*.

    text   -- string holding the entity definitions.
    pos    -- index at which scanning starts (default 0).
    endpos -- index at which scanning stops; None means len(text).

    Returns a dictionary mapping each entity name to a
    (charcode, comment) tuple holding the strings captured by entityRE.
    If a name is declared more than once, the last declaration wins.
    """
    if endpos is None:
        # The compiled pattern's search() needs an integer end position.
        endpos = len(text)
    d = {}
    while True:
        m = entityRE.search(text, pos, endpos)
        if m is None:
            # No further declarations in the remaining text.
            break
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
        # Resume scanning right behind the declaration just consumed.
        pos = m.end()
    return d
def writefile(f, defs):
    """Write *defs* to *f* as a Python snippet defining "entitydefs".

    f    -- writable text file object.
    defs -- dictionary mapping entity name to a (charcode, comment) tuple.

    Entries are written sorted by name, one per line, each followed by
    its comment as a trailing "#" remark.  A numeric character reference
    ("&#NNN;") with a code below 256 is written as a one-character string
    literal using an octal escape; any other charcode is written as the
    repr() of the original string.
    """
    f.write("entitydefs = {\n")
    # Sort so the generated snippet is stable from run to run.
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Drop the leading "&#" and the final character (assumed to
            # be the terminating ";") to get the decimal code.
            code = int(charcode[2:-1])
            if code < 256:
                charcode = "'\\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Collapse whitespace runs (including line breaks) to one space;
        # a line break would otherwise end the "#" remark prematurely.
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, charcode, comment))
    # Close the dictionary literal so the output is valid Python.
    f.write('\n}\n')
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # Without arguments the script works as a filter: it reads the entity
    # definitions from stdin and writes the snippet to stdout.
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    if len(sys.argv) > 2:
        # The with-statement closes (and thereby flushes) the output file.
        with open(sys.argv[2], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)