[tpwd] Fix segfault when exactly one argument given
[tinyapps.git] / extractlinks.pl
blobe35d38b00300615d5eec7b97e30ed84aa64eff0b
1 #!/usr/bin/perl -w
2 ##
3 ## Extracts links from a HTML page
4 ## Copyright (c) 2005 by Berislav Kovacki (beca/AT/sezampro.yu)
5 ## Copyright (c) 2005,2006 by Michal Nazarewicz (mina86/AT/mina86.com)
6 ##
7 ## This program is free software; you can redistribute it and/or modify
8 ## it under the terms of the GNU General Public License as published by
9 ## the Free Software Foundation; either version 3 of the License, or
10 ## (at your option) any later version.
12 ## This program is distributed in the hope that it will be useful,
13 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ## GNU General Public License for more details.
17 ## You should have received a copy of the GNU General Public License
18 ## along with this program; if not, see <http://www.gnu.org/licenses/>.
20 ## This is part of Tiny Applications Collection
21 ## -> http://tinyapps.sourceforge.net/
25 # Documentation at the end of file.
28 use strict;
29 use warnings;
31 use English;
32 use Getopt::Long;
33 use Pod::Usage;
35 use URI::URL;
36 use URI::file;
37 use LWP::UserAgent;
38 use HTML::LinkExtor;
41 my $VERSION = '1.07';
44 my $url = '';
45 my @tags = ();
46 my $noabs = 0;
47 my $base_uri;
50 pod2usage() unless GetOptions(
51 'tag|tags|t=s' => \@tags,
52 'relative|r' => \$noabs,
53 'base|b=s' => \$base_uri,
54 'help|h|?' => sub { pod2usage(-verbose => 1); });
55 @tags = split(/,/, join ',', @tags);
58 my $error = 0;
59 my $parser = HTML::LinkExtor->new(\&linkcallback);
60 my $ua = LWP::UserAgent->new;
61 my @links;
64 @ARGV = ('-') unless (@ARGV);
65 while (@ARGV) {
66 $url = shift(@ARGV);
67 @links = ();
68 my $base;
70 if ($url eq '-') {
71 $parser->parse( sub { <STDIN> } );
72 $base = defined $base_uri ? $base_uri : URI::file->cwd;
73 @links = map { $_ = url($_, $base)->abs; } @links unless ($noabs);
75 } else {
76 $url = URI->new($url)->abs(URI::file->cwd);
78 my $res = $ua->request(HTTP::Request->new(GET => $url),
79 sub { $parser->parse($_[0]) });
81 if (!$res->is_success) {
82 print('Parse failed: ');
83 print($res->status_line);
84 print("\n");
85 $error = 1;
86 } elsif (!$noabs) {
87 $base = defined $base_uri ? $base_uri : $res->base;
88 @links = map { $_ = url($_, $base)->abs; } @links;
92 print(join("\n", @links), "\n") if (@links);
95 exit($error);
98 sub linkcallback {
99 my ($tag, %links) = @_;
100 push @links, values %links if (!@tags || grep { $_ eq $tag } @tags);
104 __END__
106 =head1 NAME
108 extractlinks - Extracts links from a HTML page
110 =head1 DESCRIPTION
112 The extractlinks utility shall search the HTML file specified by the
113 url parameter and extract all contained links. The url must be
114 specified as absolute URL.
116 =head1 SYNOPSIS
118 extractlinks [ -t tags ] [ -r | -b base ] [ -- ] [ url ... ]
120 =head1 OPTIONS
122 =over 8
124 =item B<-h --help>
126 Displays help screen.
128 =item B<-r --relative>
130 Won't make links absolute.
132 =item B<-b --base=>I<base>
134 Uses I<base> as relative links base address instead of base address of
135 input file.
137 =item B<-t --tag=>I<tags>
139 A comma separated list of HTML tags to search for links. This option
140 may be specified several times. If not give, links from all HTML rags
141 are extracted.
143 =item I<url>
145 Can be a URI (eg. I<http://mina86.com/>), a file name
146 (eg. I<index.html>) or a minus sign which will cause the script to
147 read HTML page from standard input. If no URLs are given, - is
148 assumed. Beware that I<www.google.com> will cause script to open a
149 B<file> of a given name and not a Google page.
151 =back
153 =head1 AUTHOR
155 Berislav Kovacki <beca@sezampro.yu>,
156 Michal Nazarewicz <mina86@mina86.com>
158 =cut