util/create_symbol_corpus.py

   1 # -*- coding: utf-8 -*-
   2 from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
   3
   4 import os
   5 import re
   6
   7 for root, dirs, files in os.walk('www.symbols.com/encyclopedia'):
   8     for file in files:
   9         if re.match('.*\.html$', file) and file != "index.html":
  10             text = open(os.path.join(root, file)).read()
  11             p = BeautifulSoup(text)
  12             div = p.find('div', attrs={"class": "KonaBody"})
  13             span = p.find('span')
  14             span.decompose()
  15             b = div.find('b')
  16             category = b.text.split(' ')[0]
  17             b.decompose()
  18
  19             #print '-'*79
  20             #print category
  21
  22             for img in div.findAll('a'):
  23                 href = img['href']
  24                 m = re.search('\/(\d[\d\w]*)\.\w+$', href)
  25                 if m:
  26                     hulb = "#"+m.group(1)
  27                 else:
  28                     hulb = img.text
  29                 img.replaceWith(u' ⟦%s⟧ '% (hulb))
  30             for img in div.findAll('img'):
  31                 href = img['src']
  32                 m = re.search('\/([\d\w]+)\.\w+$', href)
  33                 if m:
  34                     hulb = "img#"+m.group(1)
  35                 else:
  36                     hulb = img.text
  37                 if hulb:
  38                     img.replaceWith(u' ⟦%s⟧ '% (hulb.rstrip().lstrip()))
  39                 else:
  40                     img.replaceWith('')
  41             for t in div.findAll('p'):
  42                 for s in t.findAll():
  43                     if s.text and s.text != '&nbsp;&nbsp;&nbsp;&nbsp;':
  44                         s.replaceWith(u' ⟦%s⟧ ' % (s.text.rstrip().lstrip()))
  45                     else:
  46                         s.replaceWith('')
  47
  48
  49             x = str(BeautifulStoneSoup(str(div), convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
  50
  51             x = re.sub('<[^\>]+?>', '', x)
  52             x = re.sub('[ \n\t\r]+', ' ', x)
  53             with open(category.replace(':', '_'), 'w') as file:
  54                 file.write(x.rstrip().lstrip())
  55             #print div
  56