# -*- coding: utf-8 -*-
import os
import re

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
# Convert every article page found under 'www.symbols.com/encyclopedia' into
# a flat text file: the KonaBody <div> has its <a>, <img> and <p> content
# replaced by ⟦...⟧ placeholders, entities are decoded, remaining tags are
# stripped, and the result is written to a file named after the category.
#
# NOTE(review): the listing this script was recovered from is elided.  The
# names `b`, `href`, `s` and (in the anchor loop) `hulb` are bound by
# statements that are not visible here, and any guards around the regex
# matches (e.g. `if m:`) are likewise not visible.  Those references are
# kept exactly as found; restore the missing statements before running.
for root, dirs, files in os.walk('www.symbols.com/encyclopedia'):
    # `files` was unpacked but never iterated while a per-file name was used
    # below; the per-file loop is restored here under a non-builtin name.
    for filename in files:
        # Only *.html pages are processed, and index.html is skipped.
        if not re.match(r'.*\.html$', filename) or filename == "index.html":
            continue

        # Context manager so the page handle is closed as soon as it is read.
        with open(os.path.join(root, filename)) as page:
            text = page.read()

        p = BeautifulSoup(text)
        div = p.find('div', attrs={"class": "KonaBody"})

        # Category is the first space-delimited token of `b`'s text.
        # NOTE(review): `b` is assigned on an elided line -- confirm source.
        category = b.text.split(' ')[0]

        # Replace each anchor with a placeholder.
        # NOTE(review): `href` and `hulb` come from elided lines; `m` is not
        # consumed by any visible statement in this loop.
        for img in div.findAll('a'):
            m = re.search(r'\/(\d[\d\w]*)\.\w+$', href)
            img.replaceWith(u' ⟦%s⟧ ' % (hulb))

        # Replace each image with an "img#<name>" placeholder, where <name>
        # is the last path component of `href` without its extension.
        # NOTE(review): `href` comes from an elided line, and `m` is None
        # when the pattern does not match -- a guard may have been elided.
        for img in div.findAll('img'):
            m = re.search(r'\/([\d\w]+)\.\w+$', href)
            hulb = "img#" + m.group(1)
            img.replaceWith(u' ⟦%s⟧ ' % (hulb.strip()))

        # Replace non-blank `s` nodes with a placeholder of their own text.
        # NOTE(review): `s` is bound on an elided line (presumably derived
        # from `t`) -- confirm before running.
        for t in div.findAll('p'):
            if s.text and s.text != ' ':
                s.replaceWith(u' ⟦%s⟧ ' % (s.text.strip()))

        # Re-parse the serialized <div> so HTML entities become characters.
        x = str(BeautifulStoneSoup(
            str(div), convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Strip whatever markup is left, then collapse whitespace runs.
        x = re.sub(r'<[^\>]+?>', '', x)
        x = re.sub('[ \n\t\r]+', ' ', x)

        # Output goes to the current directory; ':' is replaced by '_' in
        # the category to form the file name.
        with open(category.replace(':', '_'), 'w') as out:
            out.write(x.strip())