\section{\module{robotparser} ---
         Parser for robots.txt}
\declaremodule{standard}{robotparser}
\modulesynopsis{Accepts as input a list of lines or a URL that refers to a
                robots.txt file, parses it, builds a set of rules from that
                input, and answers questions about the fetchability of
                other URLs.}
\sectionauthor{Skip Montanaro}{skip@mojam.com}

\index{WWW}
\index{World-Wide Web}
\index{URL}
\index{robots.txt}
This module provides a single class, \class{RobotFileParser}, which answers
questions about whether or not a particular user agent can fetch a URL on
the web site that published the \file{robots.txt} file.  For more details on
the structure of \file{robots.txt} files, see
\url{http://info.webcrawler.com/mak/projects/robots/norobots.html}.
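A \file{robots.txt} file is a plain text file consisting of \code{User-agent}
lines that name a robot and \code{Disallow} lines that list URL prefixes the
named robot should not fetch.  As a point of reference, a minimal file might
look like the following (the path shown is purely illustrative):

\begin{verbatim}
# Keep all robots out of the cgi-bin directory.
User-agent: *
Disallow: /cgi-bin/
\end{verbatim}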
\begin{classdesc}{RobotFileParser}{}

This class provides a set of methods to read, parse, and answer questions
about a single \file{robots.txt} file.
\begin{methoddesc}{set_url}{url}
Sets the URL referring to a \file{robots.txt} file.
\end{methoddesc}
\begin{methoddesc}{read}{}
Reads the \file{robots.txt} URL and feeds it to the parser.
\end{methoddesc}
\begin{methoddesc}{parse}{lines}
Parses the \var{lines} argument, a list of lines from a \file{robots.txt}
file.
\end{methoddesc}
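When the contents of a \file{robots.txt} file have already been obtained by
some other means, they can be handed to \method{parse()} directly.  The
following is only a sketch; the rules and the host name are made up for
illustration:

\begin{verbatim}
import robotparser

# Hypothetical rules: keep every robot out of /private/.
lines = [
    "User-agent: *",
    "Disallow: /private/",
]

rp = robotparser.RobotFileParser()
rp.parse(lines)

print rp.can_fetch("*", "http://www.example.com/private/data.html")  # false value
print rp.can_fetch("*", "http://www.example.com/index.html")         # true value
\end{verbatim}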
\begin{methoddesc}{can_fetch}{useragent, url}
Returns true if the \var{useragent} is allowed to fetch the \var{url}
according to the rules contained in the parsed \file{robots.txt} file.
\end{methoddesc}
\begin{methoddesc}{mtime}{}
Returns the time the \file{robots.txt} file was last fetched.  This is
useful for long-running web spiders that need to check for new
\file{robots.txt} files periodically.
\end{methoddesc}
\begin{methoddesc}{modified}{}
Sets the time the \file{robots.txt} file was last fetched to the current
time.
\end{methoddesc}
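A long-running spider might record the fetch time with \method{modified()}
right after each successful \method{read()}, and compare \method{mtime()}
against the clock before deciding whether to re-fetch the rules.  This is
only a sketch; the refresh interval and the host name are arbitrary choices
for illustration:

\begin{verbatim}
import time
import robotparser

REFRESH_INTERVAL = 3600         # re-fetch the rules every hour (arbitrary)

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")   # hypothetical site
rp.read()
rp.modified()                   # remember when the rules were fetched

def may_fetch(url, useragent="ExampleSpider"):
    # Re-read the robots.txt file if the cached copy has grown stale.
    if time.time() - rp.mtime() > REFRESH_INTERVAL:
        rp.read()
        rp.modified()
    return rp.can_fetch(useragent, url)
\end{verbatim}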
\end{classdesc}
The following example demonstrates basic use of the \class{RobotFileParser}
class.
\begin{verbatim}
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
False
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
True
\end{verbatim}