webfaction and repo.or.cz deployment done
[worddb.git] / dicts / wiktionary.py
blob4b5e1d11c82012e73ed5b5b7b30f77f7b9ce625a
1 import set_paths
3 import urllib2, urlparse
4 from BeautifulSoup import BeautifulSoup
6 WIKTIONARY_DUMP_PAGE = "http://download.wikimedia.org/backup-index.html"
7 def get_wiktionary_list():
8 page = urllib2.urlopen(WIKTIONARY_DUMP_PAGE)
9 soup = BeautifulSoup(page)
10 dicts = {}
11 for incident in soup('span', { "class": "done"}):
12 try:
13 link = incident.parent("a")[0]
14 except IndexError:
15 continue # some private wiki
16 if not link.string.endswith("wiktionary"): continue
17 if link.string in dicts:
18 print "already visited", link.string
19 continue
21 dicts[link.string] = urlparse.urljoin(
22 WIKTIONARY_DUMP_PAGE,
23 "%s/latest/%s-latest-pages-articles.xml.bz2" % (
24 link.string, link.string
27 return dicts
def get_file_size(url):
    """Return the size in bytes that the server reports for *url*.

    Opens *url* and reads the ``Content-Length`` response header; the
    response body itself is never read.

    Args:
        url: Address of the resource to measure.

    Returns:
        The ``Content-Length`` header value converted to an int.

    Raises:
        KeyError: If the response carries no ``Content-Length`` header.
        ValueError: If the header value is not an integer.
    """
    response = urllib2.urlopen(url)
    try:
        size = response.info()['Content-Length']
    finally:
        # Only the headers are needed; close right away so the connection is
        # not kept open for every URL that gets measured.
        response.close()
    return int(size)
34 #get_wiktionary_list()
36 def get_total_dicts_size():
37 size = 0
38 for url in get_wiktionary_list().values():
39 s = get_file_size(url)
40 size += s
41 print url, s
42 print "Total size:", size
43 return size
# Runs unconditionally: there is no __main__ guard, so this call executes
# when the file is run as a script and also whenever it is imported.
get_total_dicts_size()