extractlinks.pl

   1 #!/usr/bin/perl -w
   2 ##
   3 ## Extracts links from a HTML page
   4 ## Copyright (c) 2005 by Berislav Kovacki (beca/AT/sezampro.yu)
   5 ## Copyright (c) 2005,2006 by Michal Nazarewicz (mina86/AT/mina86.com)
   6 ##
   7 ## This program is free software; you can redistribute it and/or modify
   8 ## it under the terms of the GNU General Public License as published by
   9 ## the Free Software Foundation; either version 3 of the License, or
  10 ## (at your option) any later version.
  11 ##
  12 ## This program is distributed in the hope that it will be useful,
  13 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 ## GNU General Public License for more details.
  16 ##
  17 ## You should have received a copy of the GNU General Public License
  18 ## along with this program; if not, see <http://www.gnu.org/licenses/>.
  19 ##
  20 ## This is part of Tiny Applications Collection
  21 ##   -> http://tinyapps.sourceforge.net/
  22 ##
  23
  24 #
  25 # Documentation at the end of file.
  26 #
  27
  28 use strict;
  29 use warnings;
  30
  31 use English;
  32 use Getopt::Long;
  33 use Pod::Usage;
  34
  35 use URI::URL;
  36 use URI::file;
  37 use LWP::UserAgent;
  38 use HTML::LinkExtor;
  39
  40
  41 my $VERSION = '1.07';
  42
  43
  44 my $url = '';
  45 my @tags = ();
  46 my $noabs = 0;
  47 my $base_uri;
  48
  49
  50 pod2usage() unless GetOptions(
  51     'tag|tags|t=s' => \@tags,
  52     'relative|r'   => \$noabs,
  53     'base|b=s'     => \$base_uri,
  54     'help|h|?'     => sub { pod2usage(-verbose => 1); });
  55 @tags = split(/,/, join ',', @tags);
  56
  57
  58 my $error = 0;
  59 my $parser = HTML::LinkExtor->new(\&linkcallback);
  60 my $ua = LWP::UserAgent->new;
  61 my @links;
  62
  63
  64 @ARGV = ('-') unless (@ARGV);
  65 while (@ARGV) {
  66   $url = shift(@ARGV);
  67   @links = ();
  68   my $base;
  69
  70   if ($url eq '-') {
  71     $parser->parse( sub { <STDIN> } );
  72     $base = defined $base_uri ? $base_uri : URI::file->cwd;
  73     @links = map { $_ = url($_, $base)->abs; } @links unless ($noabs);
  74
  75   } else {
  76     $url = URI->new($url)->abs(URI::file->cwd);
  77
  78     my $res = $ua->request(HTTP::Request->new(GET => $url),
  79                            sub { $parser->parse($_[0]) });
  80
  81     if (!$res->is_success) {
  82       print('Parse failed: ');
  83       print($res->status_line);
  84       print("\n");
  85       $error = 1;
  86     } elsif (!$noabs) {
  87       $base = defined $base_uri ? $base_uri : $res->base;
  88       @links = map { $_ = url($_, $base)->abs; } @links;
  89     }
  90   }
  91
  92   print(join("\n", @links), "\n") if (@links);
  93 }
  94
  95 exit($error);
  96
  97
  98 sub linkcallback {
  99   my ($tag, %links) = @_;
 100   push @links, values %links if (!@tags || grep { $_ eq $tag } @tags);
 101 }
 102
 103
 104 __END__
 105
 106 =head1 NAME
 107
 108 extractlinks - Extracts links from a HTML page
 109
 110 =head1 DESCRIPTION
 111
 112 The extractlinks utility shall search the HTML file specified by the
 113 url parameter and extract all contained links.  The url must be
 114 specified as absolute URL.
 115
 116 =head1 SYNOPSIS
 117
 118 extractlinks [ -t tags ] [ -r | -b base ] [ -- ] [ url ... ]
 119
 120 =head1 OPTIONS
 121
 122 =over 8
 123
 124 =item B<-h --help>
 125
 126 Displays help screen.
 127
 128 =item B<-r --relative>
 129
 130 Won't make links absolute.
 131
 132 =item B<-b --base=>I<base>
 133
 134 Uses I<base> as relative links base address instead of base address of
 135 input file.
 136
 137 =item B<-t --tag=>I<tags>
 138
 139 A comma separated list of HTML tags to search for links.  This option
 140 may be specified several times.  If not give, links from all HTML rags
 141 are extracted.
 142
 143 =item I<url>
 144
 145 Can be a URI (eg. I<http://mina86.com/>), a file name
 146 (eg. I<index.html>) or a minus sign which will cause the script to
 147 read HTML page from standard input.  If no URLs are given, - is
 148 assumed.  Beware that I<www.google.com> will cause script to open a
 149 B<file> of a given name and not a Google page.
 150
 151 =back
 152
 153 =head1 AUTHOR
 154
 155 Berislav Kovacki <beca@sezampro.yu>,
 156 Michal Nazarewicz <mina86@mina86.com>
 157
 158 =cut