# This script extracts all links from an HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.7 Sept 30, 1998
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with error codes 3XX better and follows the Location:
#       header.
#     - Added the -x flag that only checks http:// -links
#
# 0.7 - Ok, http://www.viunga.se/main.html showed that the parser didn't
#       realize a URL like this has no path, just a document. Now it does.
# Set a flag for each command line option given
# (flag variable names inferred from the usage text below):
if($ARGV[0] eq "-v" )    { $verbose = 1; shift @ARGV; }
elsif($ARGV[0] eq "-i" ) { $usestdin = 1; shift @ARGV; }
elsif($ARGV[0] eq "-l" ) { $linenumber = 1; shift @ARGV; }
elsif($ARGV[0] eq "-h" ) { $help = 1; shift @ARGV; }
elsif($ARGV[0] eq "-x" ) { $external = 1; shift @ARGV; }
$geturl = $ARGV[0];

if(($geturl eq "") || $help) {
    print "Usage: $0 [-hilvx] <full URL>\n",
    " Use a trailing slash for directory URLs!\n",
    " -h This help text\n",
    " -i Read the initial page from stdin\n",
    " -l Line number report for BAD links\n",
    " -v Verbose mode\n",
    " -x Check non-local (external?) links only\n";
    exit;
}
if($usestdin) {
    print "We use stdin!\n";
}
# This is necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";
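# A sketch of what these command lines expand to once a URL is appended
# (the URL below is hypothetical, just for illustration):
#
#   curl -s -m 20 -I http://www.example.com/   # $linkcheck: HEAD, headers only
#   curl -s -m 20 -i http://www.example.com/   # $linkcheckfull: GET, headers + body
#   curl -s http://www.example.com/            # $htmlget: GET, body only
#
# -s silences the progress meter, -m 20 gives up after 20 seconds,
# -I sends a HEAD request and -i includes response headers in the output.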
# Parse the input URL and split it into the relevant parts:
$inurl = $geturl;

if($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
    ($getprotocol, $getserver, $getpath, $getdocument) = ($1, $2, $3, $4);
}
elsif($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
    ($getprotocol, $getserver, $getpath, $getdocument) = ($1, $2, $3, "");
    if($getpath !~ /\//) {
        # a lone part with no slash is a document, not a path (the 0.7 fix):
        ($getpath, $getdocument) = ("", $getpath);
    }
}
elsif($inurl =~ /^([^:]+):\/\/(.*)/ ) {
    # server only, no path and no document:
    ($getprotocol, $getserver, $getpath, $getdocument) = ($1, $2, "", "");
}
else {
    print "Couldn't parse the specified URL, retry please!\n";
    exit;
}
#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
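# For illustration (hypothetical URL): "http://www.example.com/docs/index.html"
# splits via the first pattern into protocol "http", server "www.example.com",
# path "docs" and document "index.html", while "http://www.example.com/main.html"
# matches the second pattern and, since "main.html" holds no slash, ends up
# as the document with an empty path.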
open(HEADGET, "$linkcheck $geturl|") ||
    die "Couldn't get web page for some reason";
while(<HEADGET>) {
    if($_ =~ /HTTP\/.*3\d\d/) {
        # a 3XX response code: the page has moved
        $pagemoved = 1;
    }
    elsif($pagemoved &&
          ($_ =~ /^Location: (.*)/)) {
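        # Illustrative headers this loop reacts to (hypothetical response):
        #   HTTP/1.1 301 Moved Permanently
        #   Location: http://www.example.com/new/
        # The 3XX status line sets $pagemoved; Location: supplies the new URL in $1.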
if($pagemoved == 1) {
    print "Page is moved but we don't know where. Did you forget the ",
        "trailing slash?\n";
    exit;
}
open(WEBGET, "$htmlget $geturl|") ||
    die "Couldn't get web page for some reason";
#print length($in)."\n";

# $check =~ s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

@doc = `$linkcheck \"$check\"`;

# print "COMMAND: $linkcheck \"$check\"\n";
# print $doc[0]."\n";
if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
    $error = $1;

    if($head && ($error >= 500)) {
        # This server doesn't like HEAD!
        @doc = `$linkcheckfull \"$check\"`;
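        # For illustration: a first line of "HTTP/1.0 404 Not Found" puts 404
        # in $error; a 5XX reply to the HEAD attempt lands here and retries
        # with a full GET, since some servers mishandle HEAD requests.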
while($in =~ /[^<]*(<[^>]+>)/g ) {
    # we have a tag in $1
    $tag = $1;

    if($tag =~ /^<!--/) {
        # this is a comment tag, ignore it
    }
    elsif($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
        $url = $2;
        if($url =~ /^\"(.*)\"$/) {
            # this was a "string" now $1 has removed the quotes:
            $url = $1;
        }
        # cut off the #-part from the link before checking:
        $url =~ s/([^\#]*)\#.*/$1/g;
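        # (e.g. "page.html#intro" becomes "page.html", a bare "#intro" becomes "")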
        # if the link was nothing but a #-link it may now have
        # been emptied completely so then we skip the rest
        next if($url eq "");
        # if this url already is done, count it and do next
        if($done{$url}) { $done{$url}++; next; }

        $done{$url} = 1; # this is "done"
        if($tag =~ /< *([^ ]+)/) {
            $tagtype{$url} = $1; # remember which tag carried this url
        }
@links = &GetLinks($in);
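# GetLinks() hands back every src/href/background/archive value found in the
# page, quotes and #-fragments already stripped; %tagtype remembers which tag
# each url came from and %done counts how many times it was seen.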
if($url =~ /^([^:]+):/) {
    $prot = $1;
    # if($prot !~ /(http|ftp|gopher)/i) {
    if($prot !~ /http/i) {
        # this is an unsupported protocol, we ignore this
        next;
    }
if($url =~ /^\//) {
    # this is a link on the same server:
    $link = "$getprotocol://$getserver$url";
}
else {
    # from the scanned page's dir
    $nyurl = $url;
    if(length($getpath) &&
       ($getpath !~ /\/$/) &&
       ($nyurl !~ /^\//)) {
        # lacks ending slash, add one to the document part:
        $nyurl = "/$nyurl";
    }
    $link = "$getprotocol://$getserver/$getpath$nyurl";
}
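# Resolution sketch (hypothetical values): with $getserver "www.example.com"
# and $getpath "docs", a relative "img/logo.gif" becomes
# http://www.example.com/docs/img/logo.gif, while an absolute "/top.html"
# becomes http://www.example.com/top.html.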
#print "test $link\n";

$success = &LinkWorks($link);
$count = $done{$url};

print "$success $count <".$tagtype{$url}."> $link $url\n";
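# A sample report line (illustrative): a dead link used once in an <a> tag
# prints as:
#   BAD 1 <a> http://www.example.com/missing.html /missing.html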
# If bad and -l, present the line numbers of the usage
if("BAD" eq $success) {
    $badlinks++;
    if($linenumber) {
        # $line holds a line number where the link appears in the page source
        print " line $line\n";
    }
}
print "$allcount links were checked";
if($badlinks) {
    print ", $badlinks were found bad";
}
print "\n";