lwp-rget - Retrieve web documents recursively
 lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier] [--iis]
          [--keepext=mime/type[,mime/type]] [--limit=N] [--nospace]
          [--prefix=URL] [--referer=URL] [--sleep=N] [--tolower] <URL>
This program will retrieve a document and store it in a local file.  It
will follow any links found in the document and store these documents
as well, patching links so that they refer to these local copies.
This process continues until there are no more unvisited links, or
until the process is stopped by one or more of the limits which can be
controlled by the command line arguments.
This program is useful if you want to make a local copy of a
collection of documents, or want to do your web reading off-line.
All documents are stored as plain files in the current directory.  The
file names chosen are derived from the last component of URL paths.
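For example (using a hypothetical URL), the invocation

  lwp-rget http://www.sn.no/foo/bar.html

would store the retrieved document as C<bar.html> in the current
directory.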
=item --auth=USER:PASS

Set the authentication credentials to user "USER" and password "PASS" if
any restricted parts of the web site are hit.  If there are restricted
parts of the web site and authentication credentials are not available,
those pages will not be downloaded.
=item --depth=I<n>

Limit the depth of the recursion.  Embedded images are always loaded,
even if they fall outside the I<--depth>.  This means that one can use
I<--depth=0> in order to fetch a single document together with all
of its inline images.

The default depth is 5.
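For instance, the following (hypothetical) invocation fetches a single
document together with its inline images:

  lwp-rget --depth=0 http://www.sn.no/foo/bar.html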
=item --hier

Download files into a hierarchy that mimics the web site structure.
The default is to put all files in the current directory.
=item --referer=I<URI>

Set the value of the Referer header for the initial request.  The
special value C<"NONE"> can be used to suppress the Referer header in
all subsequent requests.  The Referer header will always be suppressed
in all normal C<http> requests if the referring page was transmitted
over C<https>, as recommended in RFC 2616.
=item --iis

Sends an "Accept: */*" header with all requests as a workaround for a
bug in IIS 2.0.  If no Accept MIME header is present, IIS 2.0 returns a
"406 No acceptable objects were found" error.  Also converts any back
slashes (\) in URLs to forward slashes (/).
=item --keepext=I<mime/type[,mime/type]>

Keeps the current extension for the listed MIME types.  Useful when
downloading text/plain documents that shouldn't all be translated to
C<.txt> files.
=item --limit=I<n>

Limit the number of documents to get.  The default limit is 50.
=item --nospace

Changes spaces in all URLs to underscore characters (_).  Useful when
downloading files from sites serving URLs with spaces in them.  Does not
remove spaces from fragments, e.g., "file.html#somewhere in here".
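For example, with I<--nospace> a hypothetical URL such as
C<http://www.sn.no/my file.html> is stored under the local name
C<my_file.html>.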
=item --prefix=I<url_prefix>

Limit the links to follow.  Only URLs that start with the prefix string
are followed.

The default prefix is set as the "directory" of the initial URL to
follow.  For instance if we start lwp-rget with the URL
C<http://www.sn.no/foo/bar.html>, then the prefix will be set to
C<http://www.sn.no/foo/>.
Use C<--prefix=''> if you don't want the fetching to be limited by any
prefix.
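For example, the following (hypothetical) invocation follows links
anywhere below the site root instead of only below C</foo/>:

  lwp-rget --prefix=http://www.sn.no/ http://www.sn.no/foo/bar.html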
=item --sleep=I<n>

Sleep I<n> seconds before retrieving each document.  This option allows
you to go slowly, so as not to put too much load on the server you are
visiting.
=item --tolower

Translates all links to lowercase.  Useful when downloading files from
IIS, since it does not serve files in a case-sensitive manner.
=item --verbose

Make more noise while running.

=item --quiet

Don't make any noise.

=item --version

Print program version number and quit.

=item --help

Print the usage message and quit.
Before the program exits, the name of the file in which the initial
URL is stored is printed on stdout.  All used filenames are also
printed on stderr as they are loaded.  This printing can be suppressed
with the I<--quiet> option.
L<lwp-request>, L<LWP>
Gisle Aas <aas@sn.no>
use Getopt::Long    qw(GetOptions);
use URI::URL        qw(url);
use LWP::MediaTypes qw(media_suffix);
use HTML::Entities  ();
use vars qw($VERSION);
use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $REFERER $VERBOSE $QUIET $SLEEP $HIER $AUTH $IIS $TOLOWER $NOSPACE %KEEPEXT);
$progname =~ s|.*/||;      # only basename left
$progname =~ s/\.\w*$//;   # strip extension, if any
#$Getopt::Long::debug = 1;
#$Getopt::Long::ignorecase = 0;
GetOptions('version'   => \&print_version,
           'help'      => \&usage,
           'depth=i'   => \$MAX_DEPTH,
           'limit=i'   => \$MAX_DOCS,
           'verbose!'  => \$VERBOSE,
           'quiet!'    => \$QUIET,
           'sleep=i'   => \$SLEEP,
           'prefix:s'  => \$PREFIX,
           'referer:s' => \$REFERER,
           'hier'      => \$HIER,
           'auth=s'    => \$AUTH,
           'iis'       => \$IIS,
           'tolower'   => \$TOLOWER,
           'nospace'   => \$NOSPACE,
           'keepext=s' => \$KEEPEXT{'OPT'},
          ) || usage();
my $DISTNAME = 'libwww-perl-' . LWP::Version();
print <<"EOT";
This is lwp-rget version $VERSION ($DISTNAME)

Copyright 1996-1998, Gisle Aas.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT
my $start_url = shift || usage();
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->agent("$progname/$VERSION ");
unless (defined $PREFIX) {
    $PREFIX = url($start_url);   # limit to URLs below this one
    $PREFIX->eparams(undef);
    $PREFIX->equery(undef);

    # trim the last path segment so the prefix becomes the
    # "directory" part of the start URL (see --prefix above)
    my $epath = $PREFIX->epath;
    $epath =~ s|[^/]+$||;
    $PREFIX->epath($epath);

    $PREFIX = $PREFIX->as_string;
}
%KEEPEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEPEXT{'OPT'} || ""));
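# For example (hypothetical option value):
#   --keepext=text/plain,image/gif  =>  %KEEPEXT = ('text/plain' => 1,
#                                                   'image/gif'  => 1)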
my $SUPPRESS_REFERER;
$SUPPRESS_REFERER++ if ($REFERER || "") eq "NONE";
print <<"" if $VERBOSE;
START     = $start_url
MAX_DEPTH = $MAX_DEPTH
MAX_DOCS  = $MAX_DOCS

my %seen = ();   # mapping from URL => local_file

my $filename = fetch($start_url, undef, $REFERER);
print "$filename\n" unless $QUIET;
sub fetch
{
    my($url, $type, $referer, $depth) = @_;
    # Fix http://sitename.com/../blah/blah.html to
    #     http://sitename.com/blah/blah.html
    $url = $url->as_string if (ref($url));
    while ($url =~ s#(https?://[^/]+/)\.\./#$1#) {}

    # Fix backslashes (\) in URL if $IIS defined
    $url = fix_backslashes($url) if (defined $IIS);
    $url = url($url) unless ref($url);
    $type  ||= 'a';
    $depth ||= 0;

    # Might be the background attribute
    $type = 'img' if ($type eq 'body' || $type eq 'td');
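    # (The "background" attribute of <body> and <td> refers to an image,
    # so such links are treated like <img> links: always fetched and not
    # subject to the --depth limit.)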
    # Print the URL before we start checking...
    my $out = (" " x $depth) . $url . " ";
    $out .= "." x (60 - length($out));
    print STDERR $out . " " if $VERBOSE;
    # Can't get mailto things
    if ($url->scheme eq 'mailto') {
        print STDERR "*skipping mailto*\n" if $VERBOSE;
        return $url->as_string;
    }

    # The $plain_url is a URL without the fragment part
    my $plain_url = $url->clone;
    $plain_url->frag(undef);
    # Check PREFIX, but not for <IMG ...> links
    if ($type ne 'img' and $url->as_string !~ /^\Q$PREFIX/o) {
        print STDERR "*outsider*\n" if $VERBOSE;
        return $url->as_string;
    }

    # Translate URL to lowercase if $TOLOWER defined
    $plain_url = to_lower($plain_url) if (defined $TOLOWER);
    # If we already have it, then there is nothing to be done
    my $seen = $seen{$plain_url->as_string};
    if ($seen) {
        my $frag = $url->frag;
        $seen .= "#$frag" if defined($frag);
        $seen = protect_frag_spaces($seen);
        print STDERR "$seen (again)\n" if $VERBOSE;
        return $seen;
    }
    # Too much or too deep
    if ($depth > $MAX_DEPTH and $type ne 'img') {
        print STDERR "*too deep*\n" if $VERBOSE;
        return $url;
    }
    if ($no_docs > $MAX_DOCS) {
        print STDERR "*too many*\n" if $VERBOSE;
        return $url;
    }

    # Fetch the document
    $no_docs++;
    sleep($SLEEP) if $SLEEP;
    my $req = HTTP::Request->new(GET => $url);
    # See: http://ftp.sunet.se/pub/NT/mirror-microsoft/kb/Q163/7/74.TXT
    $req->header('Accept', '*/*') if (defined $IIS);  # GIF/JPG from IIS 2.0
    $req->authorization_basic(split(/:/, $AUTH)) if (defined $AUTH);
    if ($referer && !$SUPPRESS_REFERER) {
        if ($req->url->scheme eq 'http') {
            # RFC 2616, section 15.1.3
            $referer = url($referer) unless ref($referer);
            undef $referer if ($referer->scheme || '') eq 'https';
        }
        $req->referer($referer) if $referer;
    }

    my $res = $ua->request($req);
    if ($res->is_success) {
        my $doc = $res->content;
        my $ct = $res->content_type;
        my $name = find_name($res->request->url, $ct);
        print STDERR "$name\n" unless $QUIET;
        $seen{$plain_url->as_string} = $name;
        # If the file is HTML, then we look for internal links
        if ($ct eq "text/html") {
            # Save an unprocessed version of the HTML document.  This
            # both reserves the name used, and it also ensures that we
            # don't lose everything if this program is killed before
            # we are done.
            save($name, $doc);
            my $base = $res->base;

            # Follow and substitute links...
            $doc =~
s/
  (
    <(img|a|body|area|frame|td)\b   # some interesting tag
    [^>]+                           # still inside tag (not strictly correct)
    \b(?:src|href|background)       # some link attribute
    \s*=\s*                         # attribute assignment
  )
  (?:                               # scope of OR-ing
     (")([^"]*)"  |                 # value in double quotes  OR
     (')([^']*)'  |                 # value in single quotes  OR
     ([^\s>]+)                      # quoteless value
  )
/
  new_link($1, lc($2), $3||$5, HTML::Entities::decode($4||$6||$7),
           $base, $name, "$url", $depth+1)
/giex;
            # The regular expression above is not strictly correct.
            # It is not really possible to parse HTML with a single
            # regular expression, but it is faster.  Tags that might
            # confuse us include:
            #    <a alt="href" href=link.html>
            #    <a alt=">" href="link.html">
        print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
        $seen{$plain_url->as_string} = $url->as_string;
        return $url->as_string;
sub new_link
{
    my($pre, $type, $quote, $url, $base, $localbase, $referer, $depth) = @_;

    $url = protect_frag_spaces($url);

    $url = fetch(url($url, $base)->abs, $type, $referer, $depth);
    $url = url("file:$url", "file:$localbase")->rel
        unless $url =~ /^[.+\-\w]+:/;
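    # Wrapping both names in "file:" URLs lets URI::URL compute a path
    # for the link that is relative to the local file containing it;
    # links that already carry a scheme (e.g. "mailto:") are left alone.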
    $url = unprotect_frag_spaces($url);

    return $pre . $quote . $url . $quote;
}
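# Spaces inside a #fragment must survive later URL handling, so they are
# temporarily encoded as %20 and decoded again afterwards.  For example
# (hypothetical):
#   "file.html#somewhere in here"  <=>  "file.html#somewhere%20in%20here"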
sub protect_frag_spaces
{
    my($url) = @_;
    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*#)(.+)$/) {
        my($base, $frag) = ($1, $2);
        $frag =~ s/ /%20/g;
        $url = $base . $frag;
    }
    return $url;
}
sub unprotect_frag_spaces
{
    my($url) = @_;
    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*#)(.+)$/) {
        my($base, $frag) = ($1, $2);
        $frag =~ s/%20/ /g;
        $url = $base . $frag;
    }
    return $url;
}
sub fix_backslashes
{
    my($url) = @_;
    $url = $url->as_string if (ref($url));

    my($base, $frag) = ($url =~ m/([^#]+)(#.*)/) ? ($1, $2) : ($url, "");

    $base =~ tr|\\|/|;        # translate literal back slashes
    $base =~ s/%5[cC]/\//g;   # URL-encoded back slash is %5C

    return $base . $frag;
}
sub to_lower
{
    my($url) = @_;
    my $was_object = 0;
    if (ref($url)) {
        $url = $url->as_string;
        $was_object = 1;
    }

    my($base, $frag) = ($url =~ m/([^#]+)(#.*)/) ? ($1, $2) : ($url, "");
    $base = lc($base);    # lowercase everything before any #fragment

    # hand back the same kind of value we were given
    return $was_object == 1 ? url($base . $frag) : $base . $frag;
}
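# translate_spaces implements --nospace: spaces in the URL proper become
# underscores while the #fragment keeps its spaces, e.g. (hypothetical):
#   "my file.html#in here"  =>  "my_file.html#in here"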
sub translate_spaces
{
    my($url) = @_;
    $url = $url->as_string if (ref($url));

    my($base, $frag) = ($url =~ m/([^#]+)(#.*)/) ? ($1, $2) : ($url, "");

    $base =~ s/^ *//;     # Remove initial spaces from base
    $base =~ s/ *$//;     # Remove trailing spaces from base

    $base =~ s/ /_/g;     # Translate remaining spaces to underscores
    $base =~ s/%20/_/g;   # URL-encoded space is %20

    return $base . $frag;
}
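# mkdirp creates a directory path one component at a time.  For example
# (hypothetical): mkdirp("a/b/c", 0775) makes "a", then "a/b", then
# "a/b/c", returning true if every component exists or was created.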
sub mkdirp
{
    my($directory, $mode) = @_;
    my @dirs = split(/\//, $directory);
    my $path = shift(@dirs);   # build it as we go
    my $result = 1;            # assume it will work

    $result &&= mkdir($path, $mode) unless -d $path;
    foreach (@dirs) {
        $path .= "/$_";
        $result &&= mkdir($path, $mode) unless -d $path;
    }
    return $result;
}
sub find_name
{
    my($url, $type) = @_;
    #print "find_name($url, $type)\n";
    # Translate spaces in URL to underscores (_) if $NOSPACE defined
    $url = translate_spaces($url) if (defined $NOSPACE);

    # Translate URL to lowercase if $TOLOWER defined
    $url = to_lower($url) if (defined $TOLOWER);

    $url = url($url) unless ref($url);
    my $path = $url->path;

    # trim path until only the basename is left
    elsif (! -d $dirname) {
        mkdirp($dirname, 0775);
    }
    my $extra = "";   # something to make the name unique
    my $suffix;
    if ($KEEPEXT{lc($type)}) {
        $suffix = ($path =~ m/\.(.*)/) ? $1 : "";
    }
    else {
        $suffix = media_suffix($type);
    }
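    # (media_suffix() from LWP::MediaTypes maps a MIME type to a file
    # extension; e.g. media_suffix("text/html") returns "html".)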
    $path =~ s|\..*||;   # trim suffix
    $path = "index" unless length $path;
    # Construct a new file name
    my $file = $dirname . $path . $extra;
    $file .= ".$suffix" if $suffix;
    # Check if it is unique
    return $file unless -f $file;
    # Try something extra
sub save
{
    my $name = shift;
    #print "save($name,...)\n";
    open(FILE, ">$name") || die "Can't save $name: $!";
    print FILE $_[0];
    close(FILE);
}
Usage: $progname [options] <URL>
Allowed options are:
  --auth=USER:PASS  Set authentication credentials for web site
  --depth=N         Maximum depth to traverse (default is $MAX_DEPTH)
  --hier            Download into hierarchy (not all files into cwd)
  --referer=URI     Set initial referer header (or "NONE")
  --iis             Workaround IIS 2.0 bug by sending "Accept: */*" MIME
                    header; translates backslashes (\\) to forward slashes (/)
  --keepext=type    Keep file extension for MIME types (comma-separated list)
  --limit=N         Limit the number of documents to get (default is $MAX_DOCS)
  --nospace         Translate spaces in URLs (not #fragments) to underscores (_)
  --version         Print version number and quit
  --verbose         More output
  --quiet           No output
  --sleep=SECS      Sleep between gets, i.e. go slowly
  --prefix=PREFIX   Limit URLs to follow to those which begin with PREFIX
  --tolower         Translate all URLs to lowercase (useful with IIS servers)