3 ## Extracts links from a HTML page
4 ## Copyright (c) 2005 by Berislav Kovacki (beca/AT/sezampro.yu)
5 ## Copyright (c) 2005,2006 by Michal Nazarewicz (mina86/AT/mina86.com)
7 ## This program is free software; you can redistribute it and/or modify
8 ## it under the terms of the GNU General Public License as published by
9 ## the Free Software Foundation; either version 3 of the License, or
10 ## (at your option) any later version.
12 ## This program is distributed in the hope that it will be useful,
13 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ## GNU General Public License for more details.
17 ## You should have received a copy of the GNU General Public License
18 ## along with this program; if not, see <http://www.gnu.org/licenses/>.
20 ## This is part of Tiny Applications Collection
21 ## -> http://tinyapps.sourceforge.net/
25 # Documentation at the end of file.
50 pod2usage
() unless GetOptions
(
51 'tag|tags|t=s' => \
@tags,
52 'relative|r' => \
$noabs,
53 'base|b=s' => \
$base_uri,
54 'help|h|?' => sub { pod2usage
(-verbose
=> 1); });
55 @tags = split(/,/, join ',', @tags);
59 my $parser = HTML
::LinkExtor
->new(\
&linkcallback
);
60 my $ua = LWP
::UserAgent
->new;
64 @ARGV = ('-') unless (@ARGV);
71 $parser->parse( sub { <STDIN
> } );
72 $base = defined $base_uri ?
$base_uri : URI
::file
->cwd;
73 @links = map { $_ = url
($_, $base)->abs; } @links unless ($noabs);
76 $url = URI
->new($url)->abs(URI
::file
->cwd);
78 my $res = $ua->request(HTTP
::Request
->new(GET
=> $url),
79 sub { $parser->parse($_[0]) });
81 if (!$res->is_success) {
82 print('Parse failed: ');
83 print($res->status_line);
87 $base = defined $base_uri ?
$base_uri : $res->base;
88 @links = map { $_ = url
($_, $base)->abs; } @links;
92 print(join("\n", @links), "\n") if (@links);
99 my ($tag, %links) = @_;
100 push @links, values %links if (!@tags || grep { $_ eq $tag } @tags);
108 extractlinks - Extracts links from a HTML page
112 The extractlinks utility shall search the HTML file specified by the
113 url parameter and extract all contained links. The url must be
114 specified as absolute URL.
118 extractlinks [ -t tags ] [ -r | -b base ] [ -- ] [ url ... ]
126 Displays help screen.
128 =item B<-r --relative>
130 Won't make links absolute.
132 =item B<-b --base=>I<base>
134 Uses I<base> as relative links base address instead of base address of
137 =item B<-t --tag=>I<tags>
139 A comma separated list of HTML tags to search for links. This option
140 may be specified several times. If not give, links from all HTML rags
145 Can be a URI (eg. I<http://mina86.com/>), a file name
146 (eg. I<index.html>) or a minus sign which will cause the script to
147 read HTML page from standard input. If no URLs are given, - is
148 assumed. Beware that I<www.google.com> will cause script to open a
149 B<file> of a given name and not a Google page.
155 Berislav Kovacki <beca@sezampro.yu>,
156 Michal Nazarewicz <mina86@mina86.com>