dependencies
[nltk_ontology_framework.git] / util / create_symbol_corpus.py
blobd042269bf2986575a39f2eeed54fecd0303ba011
# -*- coding: utf-8 -*-
"""Build a plain-text corpus from a local mirror of the symbols.com encyclopedia.

Walks ``www.symbols.com/encyclopedia`` (relative to the current directory),
and for every ``*.html`` page except ``index.html``:

* takes the ``<div class="KonaBody">`` element as the page body;
* uses the first space-separated word of the body's first bold element as
  the page category, then removes that element;
* replaces links, images and inline markup inside paragraphs with
  ``⟦label⟧`` placeholders;
* strips the remaining tags, collapses whitespace and writes the text to a
  file in the current directory named after the category (``:`` -> ``_``).

Pages that do not have the expected structure are reported on stderr and
skipped instead of aborting the whole run.
"""
import os
import re
import sys

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

MIRROR_ROOT = 'www.symbols.com/encyclopedia'

# Final path component of a link target, when its name starts with a digit.
SYMBOL_LINK_RE = re.compile(r'\/(\d[\d\w]*)\.\w+$')
# Final path component of an image source, without its extension.
IMAGE_SRC_RE = re.compile(r'\/([\d\w]+)\.\w+$')
# Any markup tag still present once the placeholders have been inserted.
TAG_RE = re.compile(r'<[^\>]+?>')
WHITESPACE_RE = re.compile(r'[ \n\t\r]+')

for root, _dirs, files in os.walk(MIRROR_ROOT):
    for name in files:
        if not name.endswith('.html') or name == 'index.html':
            continue

        path = os.path.join(root, name)
        # Close the page as soon as it has been read rather than leaking
        # one open handle per file.
        with open(path) as page:
            soup = BeautifulSoup(page.read())

        div = soup.find('div', attrs={"class": "KonaBody"})
        if div is None:
            sys.stderr.write('skipping %s: no KonaBody div\n' % path)
            continue

        # The first span of the page is discarded; tolerate pages without one.
        span = soup.find('span')
        if span is not None:
            span.decompose()

        heading = div.find('b')
        category = heading.text.split(' ')[0] if heading is not None else ''
        if not category:
            sys.stderr.write('skipping %s: no category heading\n' % path)
            continue
        heading.decompose()

        # Links: "#<name>" when the target looks like a symbol page,
        # otherwise the link text. Anchors without an href (e.g. named
        # anchors) fall back to their text instead of raising KeyError.
        for anchor in div.findAll('a'):
            m = SYMBOL_LINK_RE.search(anchor.get('href') or '')
            if m:
                label = "#" + m.group(1)
            else:
                label = anchor.text
            anchor.replaceWith(u' ⟦%s⟧ ' % label)

        # Images: "img#<name>" taken from the source path; images for which
        # no label can be derived are dropped.
        for img in div.findAll('img'):
            m = IMAGE_SRC_RE.search(img.get('src') or '')
            if m:
                label = "img#" + m.group(1)
            else:
                label = img.text
            if label:
                img.replaceWith(u' ⟦%s⟧ ' % label.strip())
            else:
                img.replaceWith('')

        # Any other markup nested in a paragraph is reduced to its text.
        for para in div.findAll('p'):
            for child in para.findAll():
                if child.text and child.text != '    ':
                    child.replaceWith(u' ⟦%s⟧ ' % child.text.strip())
                else:
                    child.replaceWith('')

        # Re-parse the serialized body so HTML entities are converted, then
        # drop the remaining tags and normalise runs of whitespace.
        text = str(BeautifulStoneSoup(
            str(div), convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        text = TAG_RE.sub('', text)
        text = WHITESPACE_RE.sub(' ', text)

        # NOTE(review): pages whose categories are equal write to the same
        # file, so a later page overwrites an earlier one -- confirm that
        # categories are unique across the mirror.
        with open(category.replace(':', '_'), 'w') as out:
            out.write(text.strip())