MSWSP: make printing of type optional in str_CBaseStorageVariant()
[wireshark-wip.git] / tools / html2text.py
blob9ae6c66fb61bbb6fc72e7eeea57ff7d000d56c82
1 #!/usr/bin/env python
2 """html2text: Turn HTML into equivalent Markdown-structured text."""
3 __version__ = "2.35-Wireshark"
4 __author__ = "Aaron Swartz (me@aaronsw.com)"
5 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
8 # NOTE:
9 # This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
10 # Changes:
11 # Options can now be configured from the command line.
12 # SKIP_LINKS and INPUT_ENCODING options have been added.
13 # The script now requires Python 2.3
15 # TODO:
16 # Support decoded entities with unifiable.
17 # Relative URL resolution
18 # Indent sections and lists similar to elinks/links/lynx
20 if not hasattr(__builtins__, 'True'): True, False = 1, 0
21 import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
22 import sgmllib
23 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
24 from optparse import OptionParser
26 try: from textwrap import wrap
27 except: pass
# Command-line interface.  `options` and `args` remain None until a command
# line has actually been parsed (by html2text_file() or the __main__ block).
oparser = OptionParser()
options = None
args = None

# Character handling.
oparser.add_option(
    "--force-unicode",
    dest="UNICODE_SNOB",
    action="store_true",
    default=False,
    help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
)

# Placement of the collected link references.
oparser.add_option(
    "--links-after-paragraphs",
    dest="LINKS_EACH_PARAGRAPH",
    action="store_true",
    default=False,
    help="Put the links after each paragraph instead of at the end. [default: False]",
)

# Line wrapping (0 disables it).
oparser.add_option(
    "--width",
    dest="BODY_WIDTH",
    type="int",
    default=78,
    help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
)

# Link suppression.
oparser.add_option(
    "--no-internal-links",
    dest="SKIP_INTERNAL_LINKS",
    action="store_true",
    default=False,
    help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
)

oparser.add_option(
    "--no-links",
    dest="SKIP_LINKS",
    action="store_true",
    default=False,
    help='''Don't show links. [default: False]''',
)

# Decoding of file / stdin input.
oparser.add_option(
    "--input-encoding",
    dest="INPUT_ENCODING",
    type="string",
    default='utf-8',
    help='''Force the encoding of the input file. [default: utf-8]''',
)
81 ### Entity Nonsense ###
83 def name2cp(k):
84 if k == 'apos': return ord("'")
85 if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
86 return htmlentitydefs.name2codepoint[k]
87 else:
88 k = htmlentitydefs.entitydefs[k]
89 if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
90 return ord(codecs.latin_1_decode(k)[0])
# Entity name -> plain ASCII stand-in, used unless --force-unicode is given.
unifiable = {
    # quotes and punctuation
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ',
    'rarr': '->', 'larr': '<-', 'middot': '*',
    'ndash': '-',
    # ligatures
    'oelig': 'oe', 'aelig': 'ae',
    # accented vowels
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a', 'atilde': 'a',
    'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o',
    'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}

# The same replacements keyed by code point, for numeric character references.
unifiable_n = {}

for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
    """Translate the body of a numeric character reference into text.

    *name* is the reference without its leading "&#" and trailing ";":
    decimal digits, or "x"/"X" followed by hexadecimal digits.

    Returns the ASCII replacement from `unifiable_n` when the code point has
    one and options.UNICODE_SNOB is false, otherwise the corresponding
    Unicode character.  A malformed or out-of-range reference is returned as
    literal "&#...;" text instead of raising.
    """
    literal = "&#" + name + ";"

    try:
        if name[:1] in ('x', 'X'):
            c = int(name[1:], 16)
        else:
            c = int(name)
    except ValueError:
        # Both the patched sgmllib.charref pattern and r_unescape accept hex
        # digits without the "x" prefix (e.g. "&#abc;") and a bare "x", so
        # unparsable bodies do reach this function.
        return literal

    if not options.UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]

    try:
        return unichr(c)
    except (ValueError, OverflowError):
        # Code point outside the range unichr() supports.
        return literal
def entityref(c):
    """Translate the named entity *c* (given without "&" and ";") into text.

    Returns the ASCII replacement from `unifiable` when one exists and
    options.UNICODE_SNOB is false.  Otherwise returns the Unicode character
    for the entity, or "&" + c when name2cp() does not know the name.
    """
    prefer_ascii = not options.UNICODE_SNOB
    if prefer_ascii and c in unifiable:
        return unifiable[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)
def replaceEntities(s):
    """Substitution callback for r_unescape.

    *s* is a match object whose group 1 holds the reference body.  Bodies
    starting with "#" go to charref() (without the "#"); everything else goes
    to entityref().
    """
    body = s.group(1)
    if body[0] != "#":
        return entityref(body)
    return charref(body[1:])
# Matches "&...;" references: an optional "#", an optional "x"/"X", then
# either hex digits or a word of 1 to 8 characters.  Group 1 is the body.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")


def unescape(s):
    """Return *s* with each r_unescape match replaced via replaceEntities()."""
    return re.sub(r_unescape, replaceEntities, s)
def fixattrs(attrs):
    """Return *attrs* with the entities in every attribute value unescaped.

    *attrs* is a sequence of (name, value) pairs.  A false value (None or an
    empty sequence) is returned unchanged; otherwise a new list is built.
    Per the original author's note this works around a bug in sgmllib.py.
    """
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]
147 ### End Entity Nonsense ###
def onlywhite(line):
    """Tell whether *line* consists solely of space characters.

    Returns False as soon as a character other than ' ' is found.  Otherwise
    returns *line* itself, which is true for a non-empty run of spaces and
    false for the empty string -- callers only test the result's truth value.

    Characters are compared with ==/!= rather than `is`: identity of string
    objects is an implementation detail, and a unicode ' ' is never the same
    object as a str ' ' literal.
    """
    for c in line:
        if c != ' ':
            return False
    return line
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    *text* is split on newlines.  Each non-empty line that does not start
    with a space, '-' or '*' is wrapped to options.BODY_WIDTH columns and
    followed by a blank line.  Lines that do start with one of those
    characters are copied through unwrapped, unless they contain nothing but
    spaces, in which case they are dropped.  Runs of empty lines are limited
    so that at most two consecutive newlines are emitted.

    Returns *text* unchanged when options.BODY_WIDTH is 0.
    """
    if not options.BODY_WIDTH:
        return text

    # NOTE: if the module-level "from textwrap import wrap" failed, `wrap` is
    # unbound and this line raises NameError rather than AssertionError.
    assert wrap, "Requires Python 2.3."

    pieces = []
    newlines = 0
    for para in text.split("\n"):
        if not para:
            if newlines < 2:
                pieces.append("\n")
                newlines += 1
        elif para[0] not in (' ', '-', '*'):
            # Equality test on purpose: the previous identity test
            # (`is not ' '`) is always true for unicode input, so indented
            # lines and list items were being re-wrapped.
            for line in wrap(para, options.BODY_WIDTH):
                pieces.append(line + "\n")
            pieces.append("\n")
            newlines = 2
        elif not onlywhite(para):
            pieces.append(para + "\n")
            newlines = 1
    return ''.join(pieces)
def hn(tag):
    """Return the heading level of *tag*, or 0 if it is not a heading tag.

    A heading tag is "h" followed by a single digit from 1 to 9, so
    hn("h3") == 3.  Every other input -- including "h0", "hr" and the empty
    string -- yields 0, so the result is always an int that can be used both
    as a truth value and as a repeat count.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        n = int(tag[1])
    except ValueError:
        return 0
    if 1 <= n <= 9:
        return n
    return 0
class _html2text(sgmllib.SGMLParser):
    """SGML parser that writes Markdown-structured text for the HTML fed to it.

    Output chunks are handed to the `out` callable given to the constructor.
    With out=None they are accumulated in self.outtext, which close() returns.
    Behaviour is controlled by the module-level `options` object.
    """

    def __init__(self, out=sys.stdout.write):
        """Create a converter writing through *out* (None: collect in outtext)."""
        sgmllib.SGMLParser.__init__(self)

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out
        self.outtext = u''
        self.quiet = 0       # > 0 while inside <head>, <style> or <script>
        self.p_p = 0         # newlines pending before the next output (0, 1 or 2)
        self.outcount = 0    # number of chunks written by o()
        self.start = 1       # set when the next output begins a fresh block
        self.space = 0       # a collapsed leading space is pending
        self.a = []          # link/image attribute dicts awaiting their reference line
        self.astack = []     # open <a> tags: attribute dict, or None for skipped links
        self.acount = 0      # last link reference number handed out
        self.list = []       # stack of open <ol>/<ul> lists
        self.blockquote = 0  # blockquote nesting depth
        self.pre = 0         # true while inside <pre>
        self.startpre = 0    # set on <pre>, cleared by the next output
        self.lastWasNL = 0   # true when the last chunk written ended in "\n"
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None   # inner text of the <abbr> being defined, else None
        self.abbr_list = {}     # abbreviation text -> definition, written at the end

    def outtextf(self, s):
        """Default sink: append *s* to self.outtext."""
        self.outtext += s

    def close(self):
        """Finish parsing, flush pending output and return self.outtext."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        """Emit the text for numeric character reference *c*."""
        self.o(charref(c))

    def handle_entityref(self, c):
        """Emit the text for named entity *c*."""
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        """Route every opening tag to handle_tag()."""
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        """Route every closing tag to handle_tag()."""
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            If the set of attributes is not found, returns None
        """
        if 'href' not in attrs:
            return None

        for i, a in enumerate(self.a):
            if 'href' in a and a['href'] == attrs['href']:
                if 'title' in a or 'title' in attrs:
                    # When either side has a title, both must have the same one.
                    if ('title' in a and 'title' in attrs and
                            a['title'] == attrs['title']):
                        return i
                else:
                    return i
        return None

    def handle_tag(self, tag, attrs, start):
        """Update parser state and emit Markdown markup for one tag.

        *tag* is the tag name, *attrs* its (name, value) pairs (None for a
        closing tag) and *start* is 1 for an opening tag, 0 for a closing one.
        """
        attrs = fixattrs(attrs)

        if hn(tag):
            self.p()
            if start:
                self.o(hn(tag) * "#" + ' ')

        if tag in ['p', 'div']:
            self.p()

        # NOTE(review): a Markdown hard line break needs two trailing spaces;
        # only one is visible in this copy of the file -- whitespace inside
        # literals may have been collapsed.  Confirm against upstream.
        if tag == "br" and start:
            self.o(" \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start:
                self.quiet += 1
            elif self.quiet > 0:
                # Never drop below zero: o() treats any non-zero value as
                # "quiet", so a stray closing tag would otherwise silence all
                # following output.
                self.quiet -= 1

        if tag in ["body"]:
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']:
            self.o("_")
        if tag in ['strong', 'b']:
            self.o("**")
        if tag == "code" and not self.pre:
            self.o('`')  #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                attrs = dict(attrs)

                self.abbr_title = None
                self.abbr_data = ''
                if 'title' in attrs:
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title is not None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                # Back to None (not ''), so o() stops appending every later
                # chunk of output to abbr_data once the <abbr> is closed.
                self.abbr_data = None

        if tag == "a":
            if start:
                attrs = dict(attrs)
                if 'href' in attrs and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    # Placeholder keeps astack balanced with the closing tag.
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if a:
                        i = self.previousIndex(a)
                        if i is not None:
                            # Same target seen before: reuse its number.
                            a = self.a[i]
                        else:
                            self.acount += 1
                            a['count'] = self.acount
                            a['outcount'] = self.outcount
                            self.a.append(a)
                        self.o("][" + repr(a['count']) + "]")

        if tag == "img" and start:
            attrs = dict(attrs)
            if 'src' in attrs:
                # Images share the link reference list, keyed by 'href'.
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("][" + repr(attrs['count']) + "]")

        if tag == 'dl' and start:
            self.p()
        if tag == 'dt' and not start:
            self.pbr()
        if tag == 'dd' and start:
            self.o(' ')
        if tag == 'dd' and not start:
            self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name': tag, 'num': 0})
            else:
                if self.list:
                    self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list:
                    li = self.list[-1]
                else:
                    # <li> outside any list: treat as a bullet item.
                    li = {'name': 'ul', 'num': 0}
                self.o(" " * len(self.list))  #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul":
                    self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(repr(li['num']) + ". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ["table", "tr"] and start:
            self.p()
        if tag == 'td':
            self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request a single line break before the next output, unless more is pending."""
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        """Request a paragraph break (two newlines) before the next output."""
        self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Write *data*, preceded by whatever spacing and prefixes are pending.

        puredata -- true for document text: outside <pre>, runs of whitespace
                    are collapsed and a leading space is deferred.
        force    -- true to go on even when *data* is empty.  The value 'end'
                    additionally flushes the remaining link references and
                    the abbreviation definitions.

        Nothing is written while self.quiet is non-zero.
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if puredata and not self.pre:
                data = re.sub(r'\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force:
                return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            # Prefix repeated after every newline written inside a blockquote.
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote:
                bq += " "

            if self.pre:
                # NOTE(review): Markdown code blocks need a four-space indent;
                # only one space is visible in this copy of the file --
                # possibly collapsed whitespace.  Confirm against upstream.
                bq += " "
                data = data.replace("\n", "\n" + bq)

            if self.start:
                # First output of a fresh block: drop pending spacing.
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out(('\n' + bq) * self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL:
                    self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end":
                    self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out(" [" + repr(link['count']) + "]: " + link['href'])  #TODO: base href
                        if 'title' in link:
                            self.out(" (" + link['title'] + ")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa:
                    self.out("\n")  # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out(" *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1

    def handle_data(self, data):
        """Emit document text, collapsing whitespace outside <pre>."""
        # An escaped closing tag inside script text arrives here as data, not
        # as an end tag, so the quiet counter is adjusted by hand.  It is
        # never taken below zero, for the same reason as in handle_tag().
        if r'\/script>' in data and self.quiet > 0:
            self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data):
        """Ignore declarations the base parser does not understand."""
        pass
def wrapwrite(text):
    """Write *text* to standard output, encoded as UTF-8."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
def html2text_file(html, out=wrapwrite):
    """Convert *html*, sending each output chunk to *out*.

    If the module-level options have not been set yet, the command line is
    parsed first to obtain them.  Returns whatever the converter's close()
    returns: the collected text when out is None, otherwise the (empty)
    internal buffer.
    """
    global options, args
    if args is None or options is None:
        (options, args) = oparser.parse_args()

    converter = _html2text(out)
    for chunk in (html, ""):
        converter.feed(chunk)
    return converter.close()
def html2text(html):
    """Return *html* converted to Markdown-structured text, wrapped by optwrap()."""
    unwrapped = html2text_file(html, None)
    return optwrap(unwrapped)
# Script entry point: convert the file or URL named by the first argument,
# or standard input when no argument is given, and write the result to stdout.
if __name__ == "__main__":
    (options, args) = oparser.parse_args()
    if len(args) > 0:
        arg = args[0]
        if arg.startswith('http://') or arg.startswith('https://'):
            j = urllib.urlopen(arg)
            try:
                try:
                    # feedparser is optional; without it assume UTF-8.
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                text = j.read()
                encoding = enc(j.headers, text)[0]
            finally:
                # Release the connection even if reading fails.
                j.close()
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            f = open(arg, 'r')
            try:
                data = f.read().decode(options.INPUT_ENCODING)
            finally:
                f.close()
    else:
        data = sys.stdin.read().decode(options.INPUT_ENCODING)
    wrapwrite(html2text(data))