changelog date typo
[eidogo.git] / titles / scrape.py
blob51eef288864a8d393f750395300af9fede394403
1 #!/usr/bin/env python2.5
3 import sys
4 import sqlite3
5 import urllib2
6 from BeautifulSoup import BeautifulSoup
7 import re
8 import os
9 import sgfparser
# Open (or create) the titles database next to this script.
# isolation_level=None puts sqlite3 in autocommit mode, so every INSERT is
# durable immediately and a crash mid-scrape doesn't lose recorded rows.
curdir = os.path.dirname(os.path.abspath(__file__))
con = sqlite3.connect(os.path.join(curdir, 'titles.db'), isolation_level=None)
cur = con.cursor()
# Schema: "notices" remembers topic-page rows already processed (by text
# signature); "games" indexes each downloaded SGF by its header fields.
# NOTE: the original web-extracted text had lost the ");" terminators that
# close each CREATE TABLE — restored here so executescript is valid SQL.
cur.executescript("""
create table if not exists notices (
    sig text primary key
);
create table if not exists games (
    fn text primary key,
    dt text,
    ev text,
    pw text,
    pb text,
    re text
);
""")
# Landing page listing recent title-match notices; each table row links to a
# sub-page which in turn links to the SGF game records.
base_url = "http://igo-kisen.hp.infoseek.co.jp/"
# base_url = "http://localhost/eidogo/titles/"  # local mirror for testing
page = urllib2.urlopen(base_url + "topics.html")
soup = BeautifulSoup(page)

# The notices live in the single 1050px-wide table on the page.
trs = soup.find("table", {"width": "1050"}).findAll("tr")
dt = ""  # most recent date cell seen; rows showing "&nbsp;" inherit it

re_tags = re.compile("<[^>]+>")  # strips HTML tags when building row signatures

scraped_subpages = []  # sub-pages already fetched during this run
40 for tr in trs:
41 tds = tr.findAll("td")
42 if (len(tds) == 0):
43 continue
45 td0 = tds.pop(0).contents[0]
46 if (td0 != "&nbsp;"):
47 dt = td0
49 sig = [dt]
50 for td in tds:
51 sig.append(re_tags.sub("", str(td.contents[0])))
53 sig = ' '.join(sig)
55 cur.execute("select * from notices where sig=?", (sig,))
56 if (cur.fetchone()):
57 continue
59 cur.execute("insert into notices (sig) values (?)", (sig,))
61 subpage_fn = tr.a['href']
63 if (scraped_subpages.count(subpage_fn) > 0):
64 continue
66 print subpage_fn
67 scraped_subpages.append(subpage_fn)
68 try:
69 subpage = urllib2.urlopen(base_url + subpage_fn)
70 except:
71 print "! not found"
72 continue
73 subsoup = BeautifulSoup(subpage)
75 sgf_path = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../sgf/titles'))
77 for a in subsoup.findAll("a"):
78 fn = a['href']
79 if (not fn.endswith(".sgf")):
80 continue
81 cur.execute("select * from games where fn=?", (fn,))
82 if (cur.fetchone()):
83 continue
84 try:
85 raw_sgf = urllib2.urlopen(base_url + fn).read()
86 except:
87 print " ! " + fn + " not found"
88 continue
89 if (len(raw_sgf) > 0):
90 print " " + fn
91 sgf = sgfparser.Cursor(raw_sgf)
92 info = sgf.getRootNode(0)
93 cur.execute("insert into games (fn, dt, ev, pw, pb, re) values (?,?,?,?,?,?)",
94 (fn, info['DT'][0], info['EV'][0], info['PW'][0] + ' ' + info['WR'][0], info['PB'][0] + ' ' + info['BR'][0], info['RE'][0]))
95 f = open(os.path.join(sgf_path, fn), "w")
96 f.write(raw_sgf)
97 f.close()
# Regenerate the static HTML fragment listing the 250 most recent games.
cur.execute("select * from games order by dt desc limit 250")
rows = cur.fetchall()

f = open(os.path.join(curdir, 'titles.html'), "w")
f.write("<table id='tourney-games'><tr><th>Date</th><th>Event</th><th>White</th><th>Black</th><th>Result</th></tr>")
for i, row in enumerate(rows):
    # Zebra striping: rows alternate starting with 'odd' (same order the
    # original ""/odd/even toggle produced).
    cl = " class='odd'" if i % 2 == 0 else " class='even'"
    fn = row[0].replace(".sgf", "")  # viewer links use the name sans extension
    f.write("<tr" + cl + ">")
    for col in row[1:]:
        # Every cell links to the game viewer for this file.
        f.write("<td><a href='./#titles/" + fn + "'>" + col + "</a></td>")
    f.write("</tr>")
f.write("</table>")
f.close()