fix picture fetching
[rofl0r-twatscrape.git] / soup_parser.py
blob86ee1a4cf1777756071f5ba79dbbb351afb469cd
1 from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
2 import sys
3 #import gumbo
# Name of the BeautifulSoup parser backend, kept as a module-level global.
# soupify_bs4 declares this name global and reassigns it to 'html.parser'
# on every call, so 'lxml' is only the value seen before the first call.
5 parser = 'lxml'
def soupify_bs4(html, nohtml=False):
    """Parse *html* with BeautifulSoup and return the resulting soup.

    Unless *nohtml* is true, the markup is wrapped in
    ``<html><body>...</body></html>`` before parsing.

    Side effect: the module-level ``parser`` global is set to
    ``'html.parser'`` on every call.
    """
    global parser
    # The backend is forced here, so whatever value the global held before
    # this call is never what gets passed to BeautifulSoup.
    parser = 'html.parser'

    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % html

    try:
        return BeautifulSoup(markup, parser)
    except FeatureNotFound:
        # NOTE(review): with the backend already forced to 'html.parser'
        # this fallback looks unreachable in practice -- confirm before
        # removing it.
        parser = 'html.parser'
        return BeautifulSoup(markup, parser)
def soupify_gumbo(html, nohtml=False):
    """Parse *html* with gumbo and return the resulting soup.

    Unless *nohtml* is true, the markup is wrapped in
    ``<html><body>...</body></html>`` before parsing.

    When the parsed soup has no ``body``, the original *html* and the
    soup's repr are printed to stdout between ``AAAA`` / ``BBBB`` markers
    as a debugging aid; the soup is returned either way.

    Any exception raised while parsing is re-raised unchanged after the
    original *html* has been written to stdout.
    """
    # NOTE(review): the module-level ``import gumbo`` is commented out in
    # this file, so unless ``gumbo`` is bound elsewhere the lookup below
    # raises NameError (which the handler then dumps and re-raises).
    htm = html if nohtml else '<html><body>%s</body></html>' % html
    try:
        soup = gumbo.soup_parse(htm)
        if not soup.body:
            # Single-argument, parenthesised print produces the same output
            # as the Python 2 print statement and is also valid Python 3.
            print("AAAA")
            print(html)
            print("BBBB")
            print(repr(soup))
        return soup
    except Exception:
        # Dump the offending input so the failure can be reproduced, then
        # let the original exception propagate with its traceback intact.
        sys.stdout.write(html)
        raise
def soupify(html, nohtml=False):
    """Parse *html* into a soup object using the active backend.

    Both arguments are forwarded positionally to ``soupify_bs4``;
    ``soupify_gumbo`` takes the same arguments and is the alternative
    backend this dispatcher could call instead.
    """
    soup = soupify_bs4(html, nohtml)
    return soup