dicts/wiktionary.py

   1 import set_paths
   2
   3 import urllib2, urlparse
   4 from BeautifulSoup import BeautifulSoup
   5
   6 WIKTIONARY_DUMP_PAGE = "http://download.wikimedia.org/backup-index.html"
   7 def get_wiktionary_list():
   8     page = urllib2.urlopen(WIKTIONARY_DUMP_PAGE)
   9     soup = BeautifulSoup(page)
  10     dicts = {}
  11     for incident in soup('span', { "class": "done"}):
  12         try:
  13             link = incident.parent("a")[0]
  14         except IndexError:
  15             continue # some private wiki
  16         if not link.string.endswith("wiktionary"): continue
  17         if link.string in dicts:
  18             print "already visited", link.string
  19             continue
  20
  21         dicts[link.string] = urlparse.urljoin(
  22             WIKTIONARY_DUMP_PAGE,
  23             "%s/latest/%s-latest-pages-articles.xml.bz2" % (
  24                 link.string, link.string
  25             )
  26         )
  27     return dicts
  28
  29 def get_file_size(url):
  30     response = urllib2.urlopen(url)
  31     size =  response.info()['Content-Length']
  32     return int(size)
  33
  34 #get_wiktionary_list()
  35
  36 def get_total_dicts_size():
  37     size = 0
  38     for url in get_wiktionary_list().values():
  39         s = get_file_size(url)
  40         size += s
  41         print url, s
  42     print "Total size:", size
  43     return size
  44
  45 get_total_dicts_size()