"""html2text: Turn HTML into equivalent Markdown-structured text."""

# Module metadata: release tag, original author, licence and contributors.
__version__ = "2.35-Wireshark"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = [
    "Martin 'Joey' Schulze",
    "Ricardo Reyes",
]
9 # This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
11 # Options can now be configured from the command line.
12 # SKIP_LINKS and INPUT_ENCODING options have been added.
13 # The script now requires Python 2.3
16 # Support decoded entities with unifiable.
17 # Relative URL resolution
18 # Indent sections and lists similar to elinks/links/lynx
20 if not hasattr(__builtins__
, 'True'): True, False = 1, 0
21 import re
, sys
, urllib
, htmlentitydefs
, codecs
, StringIO
, types
23 sgmllib
.charref
= re
.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
24 from optparse
import OptionParser
26 try: from textwrap
import wrap
29 oparser
= OptionParser()
38 help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
42 "--links-after-paragraphs",
44 dest
="LINKS_EACH_PARAGRAPH",
46 help="Put the links after each paragraph instead of at the end. [default: False]",
54 help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
58 "--no-internal-links",
60 dest
="SKIP_INTERNAL_LINKS",
62 help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
70 help='''Don't show links. [default: False]''',
76 dest
="INPUT_ENCODING",
78 help='''Force the encoding of the input file. [default: utf-8]''',
81 ### Entity Nonsense ###
84 if k
== 'apos': return ord("'")
85 if hasattr(htmlentitydefs
, "name2codepoint"): # requires Python 2.3
86 return htmlentitydefs
.name2codepoint
[k
]
88 k
= htmlentitydefs
.entitydefs
[k
]
89 if k
.startswith("&#") and k
.endswith(";"): return int(k
[2:-1]) # not in latin-1
90 return ord(codecs
.latin_1_decode(k
)[0])
# Plain-ASCII stand-ins for named HTML entities, keyed by entity name.
# Consulted when the UNICODE_SNOB option is off.
unifiable = {
    # Quotation marks
    'rsquo': "'",
    'lsquo': "'",
    'rdquo': '"',
    'ldquo': '"',
    # Symbols, dashes and arrows
    'copy': '(C)',
    'mdash': '--',
    'nbsp': ' ',
    'rarr': '->',
    'larr': '<-',
    'middot': '*',
    'ndash': '-',
    # Ligatures
    'oelig': 'oe',
    'aelig': 'ae',
    # Accented vowels collapse to their base letter
    'agrave': 'a',
    'aacute': 'a',
    'acirc': 'a',
    'atilde': 'a',
    'auml': 'a',
    'aring': 'a',
    'egrave': 'e',
    'eacute': 'e',
    'ecirc': 'e',
    'euml': 'e',
    'igrave': 'i',
    'iacute': 'i',
    'icirc': 'i',
    'iuml': 'i',
    'ograve': 'o',
    'oacute': 'o',
    'ocirc': 'o',
    'otilde': 'o',
    'ouml': 'o',
    'ugrave': 'u',
    'uacute': 'u',
    'ucirc': 'u',
    'uuml': 'u',
}
103 for k
in unifiable
.keys():
104 unifiable_n
[name2cp(k
)] = unifiable
[k
]
109 if name
[0] in ['x','X']:
110 c
= int(name
[1:], 16)
114 if not options
.UNICODE_SNOB
and c
in unifiable_n
.keys():
115 return unifiable_n
[c
]
122 if not options
.UNICODE_SNOB
and c
in unifiable
.keys():
126 except KeyError: return "&" + c
127 else: return unichr(name2cp(c
))
129 def replaceEntities(s
):
132 return charref(s
[1:])
133 else: return entityref(s
)
# Matches one entity or character reference ("&name;", "&#NN;", "&#xHH;").
# Group 1 captures everything between the "&" and the ";".
r_unescape = re.compile(
    r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));"
)
137 return r_unescape
.sub(replaceEntities
, s
)
140 # Fix bug in sgmllib.py
141 if not attrs
: return attrs
144 newattrs
.append((attr
[0], unescape(attr
[1])))
147 ### End Entity Nonsense ###
150 """Return true if the line does only consist of whitespace characters."""
152 if c
is not ' ' and c
is not ' ':
157 """Wrap all paragraphs in the provided text."""
159 if not options
.BODY_WIDTH
:
162 assert wrap
, "Requires Python 2.3."
165 for para
in text
.split("\n"):
167 if para
[0] is not ' ' and para
[0] is not '-' and para
[0] is not '*':
168 for line
in wrap(para
, options
.BODY_WIDTH
):
169 result
+= line
+ "\n"
173 if not onlywhite(para
):
174 result
+= para
+ "\n"
183 if tag
[0] == 'h' and len(tag
) == 2:
186 if n
in range(1, 10): return n
187 except ValueError: return 0
189 class _html2text(sgmllib
.SGMLParser
):
190 def __init__(self
, out
=sys
.stdout
.write
):
191 sgmllib
.SGMLParser
.__init
__(self
)
193 if out
is None: self
.out
= self
.outtextf
209 self
.abbr_title
= None # current abbreviation definition
210 self
.abbr_data
= None # last inner HTML (for abbr being defined)
211 self
.abbr_list
= {} # stack of abbreviations to write later
213 def outtextf(self
, s
):
217 sgmllib
.SGMLParser
.close(self
)
224 def handle_charref(self
, c
):
227 def handle_entityref(self
, c
):
def unknown_starttag(self, tag, attrs):
    """Forward an opening tag to ``handle_tag`` with the start flag set to 1."""
    opening = 1
    self.handle_tag(tag, attrs, opening)
def unknown_endtag(self, tag):
    """Forward a closing tag to ``handle_tag`` with no attributes and start flag 0."""
    no_attrs = None
    closing = 0
    self.handle_tag(tag, no_attrs, closing)
236 def previousIndex(self
, attrs
):
237 """ returns the index of certain set of attributes (of a link) in the
240 If the set of attributes is not found, returns None
242 if not attrs
.has_key('href'): return None
249 if a
.has_key('href') and a
['href'] == attrs
['href']:
250 if a
.has_key('title') or attrs
.has_key('title'):
251 if (a
.has_key('title') and attrs
.has_key('title') and
252 a
['title'] == attrs
['title']):
259 def handle_tag(self
, tag
, attrs
, start
):
261 attrs
= fixattrs(attrs
)
265 if start
: self
.o(hn(tag
)*"#" + ' ')
267 if tag
in ['p', 'div']: self
.p()
269 if tag
== "br" and start
: self
.o(" \n")
271 if tag
== "hr" and start
:
276 if tag
in ["head", "style", 'script']:
277 if start
: self
.quiet
+= 1
278 else: self
.quiet
-= 1
281 self
.quiet
= 0 # sites like 9rules.com never close <head>
283 if tag
== "blockquote":
285 self
.p(); self
.o('> ', 0, 1); self
.start
= 1
291 if tag
in ['em', 'i', 'u']: self
.o("_")
292 if tag
in ['strong', 'b']: self
.o("**")
293 if tag
== "code" and not self
.pre
: self
.o('`') #TODO: `` `this` ``
297 for (x
, y
) in attrs
: attrsD
[x
] = y
300 self
.abbr_title
= None
302 if attrs
.has_key('title'):
303 self
.abbr_title
= attrs
['title']
305 if self
.abbr_title
!= None:
306 self
.abbr_list
[self
.abbr_data
] = self
.abbr_title
307 self
.abbr_title
= None
313 for (x
, y
) in attrs
: attrsD
[x
] = y
315 if attrs
.has_key('href') and not (options
.SKIP_LINKS
or (options
.SKIP_INTERNAL_LINKS
and attrs
['href'].startswith('#'))):
316 self
.astack
.append(attrs
)
319 self
.astack
.append(None)
322 a
= self
.astack
.pop()
324 i
= self
.previousIndex(a
)
329 a
['count'] = self
.acount
330 a
['outcount'] = self
.outcount
332 self
.o("][" + `a
['count']`
+ "]")
334 if tag
== "img" and start
:
336 for (x
, y
) in attrs
: attrsD
[x
] = y
338 if attrs
.has_key('src'):
339 attrs
['href'] = attrs
['src']
340 alt
= attrs
.get('alt', '')
341 i
= self
.previousIndex(attrs
)
346 attrs
['count'] = self
.acount
347 attrs
['outcount'] = self
.outcount
351 self
.o("]["+`attrs
['count']`
+"]")
353 if tag
== 'dl' and start
: self
.p()
354 if tag
== 'dt' and not start
: self
.pbr()
355 if tag
== 'dd' and start
: self
.o(' ')
356 if tag
== 'dd' and not start
: self
.pbr()
358 if tag
in ["ol", "ul"]:
360 self
.list.append({'name':tag
, 'num':0})
362 if self
.list: self
.list.pop()
369 if self
.list: li
= self
.list[-1]
370 else: li
= {'name':'ul', 'num':0}
371 self
.o(" "*len(self
.list)) #TODO: line up <ol><li>s > 9 correctly.
372 if li
['name'] == "ul": self
.o("* ")
373 elif li
['name'] == "ol":
375 self
.o(`li
['num']`
+". ")
380 if tag
in ["table", "tr"] and start
: self
.p()
381 if tag
== 'td': self
.pbr()
392 if self
.p_p
== 0: self
.p_p
= 1
def p(self):
    """Request a paragraph break by setting the ``p_p`` counter to 2."""
    # NOTE(review): p_p appears to be the number of newline prefixes the
    # output routine emits before the next text; confirm against o().
    self.p_p = 2
396 def o(self
, data
, puredata
=0, force
=0):
397 if self
.abbr_data
is not None: self
.abbr_data
+= data
400 if puredata
and not self
.pre
:
401 data
= re
.sub('\s+', ' ', data
)
402 if data
and data
[0] == ' ':
405 if not data
and not force
: return
408 #self.out(" :") #TODO: not output when already one there
411 bq
= (">" * self
.blockquote
)
412 if not (force
and data
and data
[0] == ">") and self
.blockquote
: bq
+= " "
416 data
= data
.replace("\n", "\n"+bq
)
431 self
.out(('\n'+bq
)*self
.p_p
)
435 if not self
.lastWasNL
: self
.out(' ')
438 if self
.a
and ((self
.p_p
== 2 and options
.LINKS_EACH_PARAGRAPH
) or force
== "end"):
439 if force
== "end": self
.out("\n")
443 if self
.outcount
> link
['outcount']:
444 self
.out(" ["+`link
['count']`
+"]: " + link
['href']) #TODO: base href
445 if link
.has_key('title'): self
.out(" ("+link
['title']+")")
450 if self
.a
!= newa
: self
.out("\n") # Don't need an extra line when nothing was done.
454 if self
.abbr_list
and force
== "end":
455 for abbr
, definition
in self
.abbr_list
.items():
456 self
.out(" *[" + abbr
+ "]: " + definition
+ "\n")
460 self
.lastWasNL
= data
and data
[-1] == '\n'
463 def handle_data(self
, data
):
464 if r
'\/script>' in data
: self
.quiet
-= 1
def unknown_decl(self, data):
    """Ignore the declaration: nothing is emitted and no state is changed."""
    return None
def wrapwrite(text):
    """Write *text* to standard output encoded as UTF-8."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
471 def html2text_file(html
, out
=wrapwrite
):
472 global options
, args
, oparser
473 if options
is None or args
is None:
474 (options
, args
) = oparser
.parse_args(None, None)
482 return optwrap(html2text_file(html
, None))
484 if __name__
== "__main__":
485 (options
, args
) = oparser
.parse_args()
488 if arg
.startswith('http://'):
489 j
= urllib
.urlopen(arg
)
491 from feedparser
import _getCharacterEncoding
as enc
493 enc
= lambda x
, y
: ('utf-8', 1)
495 encoding
= enc(j
.headers
, text
)[0]
496 if encoding
== 'us-ascii': encoding
= 'utf-8'
497 data
= text
.decode(encoding
)
500 data
= open(arg
, 'r').read().decode(options
.INPUT_ENCODING
)
502 data
= sys
.stdin
.read().decode(options
.INPUT_ENCODING
)
503 wrapwrite(html2text(data
))