| #!/usr/bin/env python | |
""" Utility for parsing HTML entity definitions available from:

      http://www.w3.org/ as e.g.
      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from the file named by the first command line argument,
    or from stdin if no argument is given. Output is written to the file
    named by the second argument, or to stdout if it is omitted. The output
    is a Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
| import re,sys | |
| import TextTools | |
# Matches one SGML entity declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# capturing the entity name, the quoted value and the comment text (which
# may span several lines).  Written as a raw string so that ``\w`` reaches
# the regex engine without relying on Python passing through an unknown
# string escape.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text, pos=0, endpos=None):
    """Extract entity declarations from *text*.

    Scans ``text[pos:endpos]`` for declarations matched by ``entityRE``.

    Parameters:
        text   -- string containing the entity definitions
        pos    -- index at which scanning starts (default 0)
        endpos -- index at which scanning stops; None means len(text)

    Returns a dictionary mapping each entity name to a tuple
    ``(charcode, comment)`` holding the quoted value and the comment text
    of the declaration.  If a name is declared more than once, the last
    declaration wins.
    """
    # NOTE: the caller-supplied start offset is honoured; it must not be
    # reset to 0 here.
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer yields successive non-overlapping matches, i.e. the same
    # sequence as repeatedly searching from the end of the previous match.
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d
def writefile(f, defs):
    """Write *defs* to *f* as a Python ``entitydefs`` dictionary literal.

    Parameters:
        f    -- object with a ``write(str)`` method (e.g. an open text file)
        defs -- dictionary mapping entity name to ``(charcode, comment)``,
                as returned by parse()

    One line is written per entity, sorted by entity name.  A value of the
    form ``&#NNN;`` with NNN below 256 is written as a one-character string
    literal using an octal escape; every other value is written as the
    repr() of the value text.  The comment is appended as a ``#`` comment
    with all whitespace runs collapsed to single spaces.
    """
    f.write("entitydefs = {\n")
    # sorted() works on lists and on dictionary views alike, whereas calling
    # .sort() on the result of items() only works where items() is a list.
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Strip the leading '&#' and the trailing ';' to get the number.
            code = int(charcode[2:-1])
            if code < 256:
                # Raw string: emits a literal backslash followed by the
                # octal digits, e.g. '\240' for code 160.
                charcode = r"'\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Collapse line breaks and whitespace runs into single spaces so the
        # comment fits on one line.  Done with str methods instead of the
        # external TextTools.collapse helper so that no third-party package
        # is needed here.
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, charcode, comment))
    f.write('\n}\n')
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # Without arguments the script acts as a filter from stdin to stdout.

    # Read the whole input before touching the output, so that an output
    # path identical to the input path is not truncated before it is read.
    # Files opened here are closed by the with-blocks; the standard streams
    # are left open.
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)