perl/getlinks.pl.in

   1 #!@PERL@
   2 #
   3 # getlinks.pl
   4 #
   5 # This script extracts all links from an HTML page, compares them to a pattern
   6 # entered on the command line and then downloads matching links into the
   7 # target dir (also specified on the command line).
   8 #
   9 # Written to use 'curl' for URL fetching, uses the source file names in the
  10 # target directory.
  11 #
  12 # Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
  13 # Version: 0.1 Oct 7, 1998
  14 #
  15 # HISTORY
  16 #
  17 # 0.1 - Created now!
  18 #
  19
  20 $in="";
  21
  22  argv:
  23 if($ARGV[0] eq "-v" ) {
  24     $verbose = 1;
  25     shift @ARGV;
  26     goto argv;
  27 }
  28 if($ARGV[0] eq "-d" ) {
  29     $display = 1;
  30     shift @ARGV;
  31     goto argv;
  32 }
  33 elsif($ARGV[0] eq "-h" ) {
  34     $help = 1;
  35     shift @ARGV;
  36     goto argv;
  37 }
  38
  39 $geturl = $ARGV[0];
  40 $getdir = $ARGV[1];
  41 $getregex = $ARGV[2];
  42
  43 if(($geturl eq "") ||
  44    (($getdir eq "") && !$display) ||
  45    $help) {
  46     print  "Usage: $0 [-hv] <full source URL> <target dir> [regex]\n",
  47     " Use a traling slash for directory URLs!\n",
  48     " Use \"quotes\" around the regex!\n",
  49     " -h  This help text\n",
  50     " -d  Display matches only instead of downloading\n",
  51     " -v  Verbose mode\n";
  52     exit;
  53 }
  54
  55 # change to target directory:
  56 chdir $getdir ||
  57     die "couldn't cd into $getdir";
  58
  59 # This is necessary from where I tried this:
  60 #$proxy =" -x 194.237.142.41:80";
  61
  62 # linkchecker, URL will be appended to the right of this command line
  63 # this is the one using HEAD:
  64 $linkcheck = "curl -s -m 20 -I$proxy";
  65
  66 # as a second attempt, this will be used. This is not using HEAD but will
  67 # get the whole frigging document!
  68 $linkcheckfull = "curl -s -m 20 -i$proxy";
  69
  70 # htmlget, URL will be appended to the right of this command line
  71 $htmlget = "curl -s$proxy";
  72
  73 # urlget, URL will be appended to the right of this command line
  74 # this stores the file with the remote file name in the current dir
  75 $urlget = "curl -O -s$proxy";
  76
  77 # Parse the input URL and split it into the relevant parts:
  78
  79 sub SplitURL {
  80     my $inurl = $_[0];
  81
  82     if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
  83         $getprotocol = $1;
  84         $getserver = $2;
  85         $getpath = $3;
  86         $getdocument = $4;
  87     }
  88     elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
  89         $getprotocol = $1;
  90         $getserver = $2;
  91         $getpath = $3;
  92         $getdocument = "";
  93
  94         if($getpath !~ /\//) {
  95             $getpath ="";
  96             $getdocument = $3;
  97         }
  98
  99     }
 100     elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
 101         $getprotocol = $1;
 102         $getserver = $2;
 103         $getpath = "";
 104         $getdocument = "";
 105     }
 106     else {
 107         print "Couldn't parse the specified URL, retry please!\n";
 108         exit;
 109     }
 110 }
 111
 112 &SplitURL($geturl);
 113
 114 #print "protocol = $getprotocol\n";
 115 #print "server = $getserver\n";
 116 #print "path = $getpath\n";
 117 #print "document = $getdocument\n";
 118 #exit;
 119
 120 if(!$usestdin) {
 121     open(HEADGET, "$linkcheck $geturl|") ||
 122         die "Couldn't get web page for some reason";
 123   headget:
 124     while(<HEADGET>) {
 125 #       print $_;
 126         if($_ =~ /HTTP\/.*3\d\d /) {
 127             $pagemoved=1;
 128         }
 129         elsif($pagemoved &&
 130                ($_ =~ /^Location: (.*)/)) {
 131             $geturl = $1;
 132
 133             &SplitURL($geturl);
 134
 135             $pagemoved++;
 136             last headget;
 137         }
 138     }
 139     close(HEADGET);
 140
 141     if($pagemoved == 1) {
 142         print "Page is moved but we don't know where. Did you forget the ",
 143         "traling slash?\n";
 144         exit;
 145     }
 146
 147     open(WEBGET, "$htmlget $geturl|") ||
 148         die "Couldn't get web page for some reason";
 149
 150     while(<WEBGET>) {
 151         $line = $_;
 152         push @indoc, $line;
 153         $line=~ s/\n//g;
 154         $line=~ s/\r//g;
 155 #    print $line."\n";
 156         $in=$in.$line;
 157     }
 158
 159     close(WEBGET);
 160 }
 161 else {
 162     while(<STDIN>) {
 163         $line = $_;
 164         push @indoc, $line;
 165         $line=~ s/\n//g;
 166         $line=~ s/\r//g;
 167         $in=$in.$line;
 168     }
 169 }
 170
 171 sub GetLinks {
 172     my $in = $_[0];
 173     my @result;
 174
 175   getlinkloop:
 176     while($in =~ /[^<]*(<[^>]+>)/g ) {
 177         # we have a tag in $1
 178         $tag = $1;
 179
 180         if($tag =~ /^<!--/) {
 181             # this is a comment tag, ignore it
 182         }
 183         else {
 184             if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
 185                 $url=$2;
 186                 if($url =~ /^\"(.*)\"$/) {
 187                     # this was a "string" now $1 has removed the quotes:
 188                     $url=$1;
 189                 }
 190
 191
 192                 $url =~ s/([^\#]*)\#.*/$1/g;
 193
 194                 if($url eq "") {
 195                     # if the link was nothing than a #-link it may now have
 196                     # been emptied completely so then we skip the rest
 197                     next getlinkloop;
 198                 }
 199
 200                 if($done{$url}) {
 201                     # if this url already is done, do next
 202                     $done{$url}++;
 203                     next getlinkloop;
 204                 }
 205
 206                 $done{$url} = 1; # this is "done"
 207
 208                 push @result, $url;
 209                 if($tag =~ /< *([^ ]+)/) {
 210 #                   print "TAG: $1\n";
 211                     $tagtype{$url}=$1;
 212                 }
 213             }
 214         }
 215     }
 216     return @result;
 217 }
 218
 219 @links = &GetLinks($in);
 220
 221  linkloop:
 222 for(@links) {
 223     $url = $_;
 224
 225     if($url =~ /^([^:]+):/) {
 226         $link = $url;
 227     }
 228     else {
 229         # this is an absolute link on the same server:
 230         if($url =~ /^\//) {
 231             # from root
 232             $link = "$getprotocol://$getserver$url";
 233         }
 234         else {
 235             # from the scanned page's dir
 236             $nyurl=$url;
 237
 238             if(length($getpath) &&
 239                ($getpath !~ /\/$/) &&
 240                ($nyurl !~ /^\//)) {
 241                 # lacks ending slash, add one to the document part:
 242                 $nyurl = "/".$nyurl;
 243             }
 244             $link = "$getprotocol://$getserver/$getpath$nyurl";
 245         }
 246     }
 247
 248     if($link =~ /$getregex/) {
 249         if($display) {
 250             print "$link\n";
 251         }
 252         else {
 253             if($verbose) {
 254                 print "Gets $link\n";
 255             }
 256             print `$urlget $link`;
 257         }
 258     }
 259
 260
 261 }