#!@PERL@

# checklinks.pl
#
# This script extracts all links from an HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.7 Sept 30, 1998
#
# HISTORY
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with 3XX error codes better and follows the Location:
#       properly.
#     - Added the -x flag that only checks http:// links.
#
# 0.7 - For http://www.viunga.se/main.html the script didn't realize that this
#       had no path, only a document. Now it does.
24 $in="";
26 argv:
27 if($ARGV[0] eq "-v" ) {
28 $verbose = 1;
29 shift @ARGV;
30 goto argv;
32 elsif($ARGV[0] eq "-i" ) {
33 $usestdin = 1;
34 shift @ARGV;
35 goto argv;
37 elsif($ARGV[0] eq "-l" ) {
38 $linenumber = 1;
39 shift @ARGV;
40 goto argv;
42 elsif($ARGV[0] eq "-h" ) {
43 $help = 1;
44 shift @ARGV;
45 goto argv;
47 elsif($ARGV[0] eq "-x" ) {
48 $external = 1;
49 shift @ARGV;
50 goto argv;
53 $geturl = $ARGV[0];
if(($geturl eq "") || $help) {
    print "Usage: $0 [-hilvx] <full URL>\n",
        " Use a trailing slash for directory URLs!\n",
        " -h This help text\n",
        " -i Read the initial page from stdin\n",
        " -l Line number report for BAD links\n",
        " -v Verbose mode\n",
        " -x Check non-local (external?) links only\n";
    exit;
}
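# A hypothetical example invocation (the URL is made up): check every link on
# the page, print line numbers for BAD links and a summary at the end:
#
#   checklinks.pl -v -l http://www.example.com/docs/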
if($ARGV[1] eq "-") {
    print "We use stdin!\n";
    $usestdin = 1;
}
# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
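# For reference, the curl options used above are: -s (silent mode, no
# progress meter), -m 20 (maximum 20 seconds per transfer), -I (make a HEAD
# request and show only the headers), -i (include the response headers in the
# output) and -x (use the given proxy).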
# Parse the input URL and split it into the relevant parts:

sub SplitURL {
    my $inurl = $_[0];

    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = $4;
    }
    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = "";

        if($getpath !~ /\//) {
            $getpath ="";
            $getdocument = $3;
        }
    }
    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}
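# For reference, a hypothetical URL such as
# http://www.example.com/docs/sub/index.html is split by SplitURL() into:
#   protocol = "http", server = "www.example.com",
#   path = "docs/sub", document = "index.html"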
120 &SplitURL($geturl);
122 #print "protocol = $getprotocol\n";
123 #print "server = $getserver\n";
124 #print "path = $getpath\n";
125 #print "document = $getdocument\n";
126 #exit;
if(!$usestdin) {
    open(HEADGET, "$linkcheck $geturl|") ||
        die "Couldn't get web page for some reason";
  headget:
    while(<HEADGET>) {
        # print $_;
        if($_ =~ /HTTP\/.*3\d\d /) {
            $pagemoved=1;
        }
        elsif($pagemoved &&
              ($_ =~ /^Location: (.*)/)) {
            $geturl = $1;

            &SplitURL($geturl);

            $pagemoved++;
            last headget;
        }
    }
    close(HEADGET);

    if($pagemoved == 1) {
        print "Page is moved but we don't know where. Did you forget the ",
            "trailing slash?\n";
        exit;
    }

    open(WEBGET, "$htmlget $geturl|") ||
        die "Couldn't get web page for some reason";

    while(<WEBGET>) {
        $line = $_;
        push @indoc, $line;
        $line=~ s/\n//g;
        $line=~ s/\r//g;
        # print $line."\n";
        $in=$in.$line;
    }

    close(WEBGET);
}
else {
    while(<STDIN>) {
        $line = $_;
        push @indoc, $line;
        $line=~ s/\n//g;
        $line=~ s/\r//g;
        $in=$in.$line;
    }
}

#print length($in)."\n";
sub LinkWorks {
    my $check = $_[0];

    # URL encode:
    # $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

    @doc = `$linkcheck \"$check\"`;

    $head = 1;

    # print "COMMAND: $linkcheck \"$check\"\n";
    # print $doc[0]."\n";

  boo:
    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
        $error = $1;

        if($error < 400 ) {
            return "GOOD";
        }
        else {
            if($head && ($error >= 500)) {
                # This server doesn't like HEAD!
                @doc = `$linkcheckfull \"$check\"`;
                $head = 0;
                goto boo;
            }
            return "BAD";
        }
    }
    return "BAD";
}
sub GetLinks {
    my $in = $_[0];
    my @result;

  getlinkloop:
    while($in =~ /[^<]*(<[^>]+>)/g ) {
        # we have a tag in $1
        $tag = $1;

        if($tag =~ /^<!--/) {
            # this is a comment tag, ignore it
        }
        else {
            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
                $url=$2;
                if($url =~ /^\"(.*)\"$/) {
                    # this was a "string", now $1 has the quotes removed:
                    $url=$1;
                }

                $url =~ s/([^\#]*)\#.*/$1/g;

                if($url eq "") {
                    # if the link was nothing but a #-link it may now have
                    # been emptied completely, so then we skip the rest
                    next getlinkloop;
                }

                if($done{$url}) {
                    # if this url already is done, do next
                    $done{$url}++;
                    next getlinkloop;
                }

                $done{$url} = 1; # this is "done"

                push @result, $url;
                if($tag =~ /< *([^ ]+)/) {
                    # print "TAG: $1\n";
                    $tagtype{$url}=$1;
                }
            }
        }
    }
    return @result;
}
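# For reference, GetLinks() picks up URLs from attributes like these
# (hypothetical examples):
#   <a href="page.html">, <img src="pic.gif">,
#   <body background="back.gif">, <applet archive="foo.jar">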
@links = &GetLinks($in);

linkloop:
for(@links) {
    $url = $_;

    if($url =~ /^([^:]+):/) {
        $prot = $1;
        # if($prot !~ /(http|ftp|gopher)/i) {
        if($prot !~ /http/i) {
            # this is an unsupported protocol, we ignore this
            next linkloop;
        }
        $link = $url;
    }
    else {
        if($external) {
            next linkloop;
        }

        # this is a link on the same server:
        if($url =~ /^\//) {
            # from root
            $link = "$getprotocol://$getserver$url";
        }
        else {
            # from the scanned page's dir
            $nyurl=$url;

            if(length($getpath) &&
               ($getpath !~ /\/$/) &&
               ($nyurl !~ /^\//)) {
                # lacks ending slash, add one to the document part:
                $nyurl = "/".$nyurl;
            }

            $link = "$getprotocol://$getserver/$getpath$nyurl";
        }
    }
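    # For reference, with a hypothetical base URL of http://www.example.com/docs/
    # (path "docs", document ""), a relative link such as "pics/logo.gif" is
    # resolved to http://www.example.com/docs/pics/logo.gif, while "/top.html"
    # becomes http://www.example.com/top.html.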
    #print "test $link\n";
    #$success = "GOOD";

    $success = &LinkWorks($link);

    $count = $done{$url};

    $allcount += $count;

    print "$success $count <".$tagtype{$url}."> $link $url\n";
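    # The report line above looks something like this (hypothetical output):
    #   GOOD 2 <a> http://www.example.com/docs/about.html about.html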
    # If bad and -l, present the line numbers where it is used
    if("BAD" eq $success) {
        $badlinks++;
        if($linenumber) {
            $line = 1;
            for(@indoc) {
                if($_ =~ /\Q$url\E/) {
                    print " line $line\n";
                }
                $line++;
            }
        }
    }
}

if($verbose) {
    print "$allcount links were checked";
    if($badlinks > 0) {
        print ", $badlinks were found bad";
    }
    print "\n";
}