3 Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
4 input, builds a set of rules from that list, then answers questions about
5 fetchability of other URLs.
18 return self
.last_checked
22 self
.last_checked
= time
.time()
24 def set_url(self
, url
):
27 ## self.url = urlmisc.canonical_url(url)
31 self
.parse(urllib
.urlopen(self
.url
).readlines())
33 def parse(self
, lines
):
34 import regsub
, string
, regex
37 if self
.debug
: print '>', line
,
38 # blank line terminates current record
42 # remove optional comment and strip line
43 line
= string
.strip(line
[:string
.find(line
, '#')])
46 line
= regsub
.split(line
, ' *: *')
48 line
[0] = string
.lower(line
[0])
49 if line
[0] == 'user-agent':
50 # this record applies to this user agent
51 if self
.debug
: print '>> user-agent:', line
[1]
52 active
.append(line
[1])
53 if not self
.rules
.has_key(line
[1]):
54 self
.rules
[line
[1]] = []
55 elif line
[0] == 'disallow':
57 if self
.debug
: print '>> disallow:', line
[1]
59 self
.rules
[agent
].append(regex
.compile(line
[1]))
63 if self
.debug
: print '>> allow', agent
64 self
.rules
[agent
] = []
66 if self
.debug
: print '>> unknown:', line
70 # returns true if agent is allowed to fetch url
71 def can_fetch(self
, agent
, url
):
74 if not self
.rules
.has_key(ag
): ag
= '*'
75 if not self
.rules
.has_key(ag
):
76 if self
.debug
: print '>> allowing', url
, 'fetch by', agent
78 path
= urlparse
.urlparse(url
)[2]
79 for rule
in self
.rules
[ag
]:
80 if rule
.match(path
) != -1:
81 if self
.debug
: print '>> disallowing', url
, 'fetch by', agent
83 if self
.debug
: print '>> allowing', url
, 'fetch by', agent
87 rp
= RobotFileParser()
89 rp
.set_url('http://www.automatrix.com/robots.txt')
92 print rp
.can_fetch('*', 'http://www.calendar.com/concerts/')
93 print rp
.can_fetch('Musi-Cal-Robot',
94 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
96 print rp
.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
97 print rp
.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')