import urllib2, urlparse
from BeautifulSoup import BeautifulSoup

WIKTIONARY_DUMP_PAGE = "http://download.wikimedia.org/backup-index.html"

def get_wiktionary_list():
    """Scrape the backup index page and map each wiktionary wiki to its dump URL."""
    page = urllib2.urlopen(WIKTIONARY_DUMP_PAGE)
    soup = BeautifulSoup(page)
    dicts = {}
    for incident in soup('span', {"class": "done"}):
        link = incident.parent("a")[0]
        if not link.string:
            continue  # some private wiki
        if not link.string.endswith("wiktionary"): continue
        if link.string in dicts:
            print "already visited", link.string
            continue
        # Resolve the dump file URL relative to the index page.
        dicts[link.string] = urlparse.urljoin(
            WIKTIONARY_DUMP_PAGE,
            "%s/latest/%s-latest-pages-articles.xml.bz2" % (
                link.string, link.string))
    return dicts

def get_file_size(url):
    """Return the Content-Length reported for the given URL, in bytes."""
    response = urllib2.urlopen(url)
    size = response.info()['Content-Length']
    return int(size)

#get_wiktionary_list()

def get_total_dicts_size():
    """Add up the reported sizes of all wiktionary dump files."""
    size = 0
    for url in get_wiktionary_list().values():
        s = get_file_size(url)
        size += s
    print "Total size:", size

get_total_dicts_size()
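As a small illustration (not part of the original script), the same helpers can also report a per-wiki breakdown instead of only the grand total; the function name below is hypothetical:

def print_dict_sizes():
    # Hypothetical helper: list each wiktionary dump with its reported size.
    for name, url in sorted(get_wiktionary_list().items()):
        print name, get_file_size(url), "bytes"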