package CXGN::Tools::Wget;
use strict;
use warnings;
use Carp::Clan qr/^CXGN::Tools::Wget/;

use File::Copy;
use File::Spec;
use File::Temp qw/ tempfile /;
use File::Flock;
use Digest::MD5 qw/ md5_hex /;
use URI;

use CXGN::Tools::List qw/ all str_in /;
use CXGN::Tools::File qw/ is_filehandle /;
=head1 NAME

CXGN::Tools::Wget - contains functions for getting files via http
or ftp in ways that aren't directly supported by L<LWP>.

=head1 SYNOPSIS
  use CXGN::Tools::Wget qw/ wget_filter /;

  # get a gzipped file from a remote site, gunzipping it as it comes in
  # and putting it somewhere else
  wget_filter( 'http://example.com/myfile.gz' => '/tmp/somelocalfile.txt',
               { gunzip => 1 },
             );

  # get the same file, but transform each line as it comes in with the
  # given subroutine, because they really mean bonobos, not monkeys
  wget_filter( 'http://example.com/myfile.gz' => '/tmp/somelocalfile.txt',
               sub {
                 my $line = shift;
                 $line =~ s/\s+monkey/ bonobo/;
                 return $line;
               },
               { gunzip => 1 },
             );

  # get a cxgn-resource file defined in public.resource_file
  wget_filter( 'cxgn-resource://all_tomato_repeats' => 'myrepeats.seq' );

  my $temp_repeats_file = wget_filter( 'cxgn-resource://all_tomato_repeats' );
=head1 CXGN-RESOURCE URLS

Sometimes we need to make datasets out of several other datasets.
For example, say you wanted a combined set of sequences composed of
NCBI's NR dataset, SGN's ESTs, and some random thing from MIPS.  You
could define a resource file like:

  insert into public.resource_file (name,expression)
  values ('robs_composite_set','cat( gunzip(ftp://ftp.ncbi.nlm.nih.gov/nr.gz), ftp://ftp.sgn.cornell.edu/ests/Tomato_current.seq.gz, http://mips.gsf.de/some_random_set.fasta )');

Then, when you go

  my $file = wget_filter('cxgn-resource://robs_composite_set');

you will get the concatenation of the unzipped NR set, the SGN EST
set, and the MIPS set.  What actually happens behind the scenes is:
wget downloads each of the files, gunzips the NR file, concatenates
the three into another file and caches it, then copies the cached
copy into another tempfile, whose name it returns to you.

But you don't have to know any of that.  All you have to know is:
define a resource in the resource_file table, and wget_filter will
build it for you when you ask for it by wgetting a URL with the
cxgn-resource protocol.
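Expressions can nest, so a resource holding just the gunzipped NR set
could be defined and fetched like this (a sketch; the resource name
C<nr_unzipped> is hypothetical):

  insert into public.resource_file (name,expression)
  values ('nr_unzipped','gunzip( ftp://ftp.ncbi.nlm.nih.gov/nr.gz )');

  # fetched like any other resource; returns a tempfile path
  my $nr_file = wget_filter( 'cxgn-resource://nr_unzipped' );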
=head1 CXGN-WGET URLS

Just like cxgn-resource URLs, except the resource expression is right
in the URL.  Example:

  cxgn-wget://cat( http://google.com, http://example.com/file.txt )
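Fetching one works the same as fetching any other URL; for instance
(a sketch, using placeholder URLs):

  # concatenate two remote files into one local tempfile
  my $combined = wget_filter(
      'cxgn-wget://cat( http://example.com/a.txt, http://example.com/b.txt )'
  );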
=head1 FUNCTIONS

All functions are @EXPORT_OK.

=cut

BEGIN { our @EXPORT_OK = qw/ wget_filter clear_cache / }
our @EXPORT_OK;
use base 'Exporter';
=head2 wget_filter

  Usage: wget_filter( 'http://example.com/myfile.txt' => 'somelocalfile.txt' );
  Desc : get a remote file, optionally gunzipping it and/or running
         some subroutines on each line as it comes in.
  Ret  : filename where the output was written, which is either the
         destination file you provided, or a tempfile if you did not
         provide a destination file
  Args : (url of file,
          optional destination filename or filehandle,
          optional list of filters to run on each line,
          optional hashref of behavior options, as:
           { gunzip => 1,    #< gunzip the downloaded file before returning it. default false.
             cache  => 1,    #< enable/disable persistent caching. default enabled.
             max_age => 3*24*60*60 (3 days), #< if caching, maximum age of the cached copy in seconds
             unlink => 1,    #< enable/disable automatic deletion of the
                             #  temp file made and returned to you. only
                             #  relevant if no destination filename is
                             #  provided
             test_only => 0, #< if true, only download the first few
                             #  bytes of each of the components of the
                             #  resource, to check that everything looks OK
           },
         )
  Side Effects: dies on error
  Example:

    # get a file, transforming each line as it comes in with the given
    # subroutine, because they really mean bonobos, not monkeys
    wget_filter( 'http://example.com/myfile.gz' => '/tmp/somelocalfile.txt',
                 sub {
                   my $line = shift;
                   $line =~ s/\s+monkey/ bonobo/;
                   return $line;
                 },
                 { gunzip => 1 },
               );

    # get a composite resource file defined in the public.resource_file
    # table
    wget_filter( 'cxgn-resource://test' => '/tmp/mytestfile.html' );

=cut
use constant DEFAULT_CACHE_MAX_AGE => 3*24*60*60; #< 3 days, in seconds
sub wget_filter {
  my ($url,@args) = @_;

  # get our options hash if present
  my %options = (ref($args[-1]) eq 'HASH') ? %{pop @args} : ();

  $options{cache}  = 1 unless exists $options{cache};
  $options{unlink} = 1 unless exists $options{unlink};

  $options{cache} = 0 if $options{test_only};

  my $destfile = do {
    if( !$args[0] || ref $args[0] ) {
      my (undef,$f) = tempfile( File::Spec->catfile( __PACKAGE__->temp_root_dir(), 'cxgn-tools-wget-XXXXXX' ), UNLINK => $options{unlink} );
      # cluck "made tempfile $f\n";
      $f
    } else {
      shift @args
    }
  };

  # and the rest of the arguments must be our filters
  my @filters = @args;
  !@filters || all map ref $_ eq 'CODE', @filters
    or confess "all filters must be subroutine refs or anonymous subs (".join(',',@filters).")";
  my $do_actual_fetch = sub {
      _wget_filter({ filters  => \@filters,
                     destfile => $destfile,
                     url      => $url,
                     options  => \%options,
                   });
  };

  # if we are filtering, just do the fetch without caching
  return $do_actual_fetch->() if @filters || !$options{cache};
  # otherwise, we are caching, and we need to do locking

  # only do caching if we don't have any filters (we can't represent
  # these in a persistent way in a hash key, because the CODE(...)
  # will be different at every program run)
  my $cache_key      = $url.' WITH OPTIONS '.join('=>',%options);
  my $cache_filename = cache_filename( $cache_key );
  my $lock_filename  = "$cache_filename.lock";

  my $try_read_lock  = sub { File::Flock->new( $lock_filename, 'shared', 'nonblocking' ) };
  my $try_write_lock = sub { File::Flock->new( $lock_filename, undef,    'nonblocking' ) };
  my $cache_file_looks_valid = sub {
         -r $cache_filename
      && (time-(stat($cache_filename))[9]) <= ($options{max_age} || DEFAULT_CACHE_MAX_AGE)
  };

  my $copy_from_cache = sub {
      my $gunzip_error = system qq!gunzip -c '$cache_filename' > '$destfile'!;
      return 0 if $gunzip_error;
      return 1;
  };

  my $read_cache = sub {
      my $read_lock;
      sleep 1 until $read_lock = $try_read_lock->();
      return $destfile if $cache_file_looks_valid->() && $copy_from_cache->();
      return;
  };
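  # locking protocol: readers hold a shared lock on the .lock file
  # while copying out of the cache; whichever caller finds the cache
  # missing or stale takes the exclusive lock, does the real fetch,
  # and repopulates the cache while everyone else retries the read path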
  # OK, the cache file needs updating or is corrupt, so try to get an
  # exclusive write lock
  my $dest_from_cache;
  until ( $dest_from_cache = $read_cache->() ) {
      if( my $write_lock = $try_write_lock->() ) {
          $do_actual_fetch->();

          # write the destfile into the cache
          system qq!gzip -c '$destfile' > '$cache_filename'!
              and confess "$! writing downloaded file to CXGN::Tools::Wget persistent cache (gzip $destfile -> $cache_filename)";

          return $destfile;
      }
  }

  return $dest_from_cache;
}
# the get, minus caching
sub _wget_filter {
    my $args = shift;
    my %options  = %{ $args->{options} };
    my $url      = $args->{url};
    my $destfile = $args->{destfile} or confess 'destfile argument is required';
    my @filters  = @{ $args->{filters} };
    # properly form the gunzip command
    $options{gunzip} = $options{gunzip} ? ' gunzip -c |' : '';

    my $parsed_url = URI->new($url)
        or croak "could not parse uri '$url'";

    if ( str_in( $parsed_url->scheme, qw/ file http ftp / ) ) {

        # try to use ncftpget for fetching from ftp with no wildcards, since
        # wget suffers from some kind of bug with large ftp transfers.
        # use wget for everything else, since it's a little more flexible
        my $fetchcommand =
            $parsed_url->scheme eq 'file'                    ? 'cat'          :
            $parsed_url->scheme eq 'ftp' && $url !~ /[\*\?]/ ? 'ncftpget -cV' :
                                                               'wget -q -O -' ;

        $url = $parsed_url->path if $parsed_url->scheme eq 'file';
        # check that all of the given filters are code refs
        @filters = grep {$_} @filters; # just ignore false things in the filters
        foreach (@filters) {
            ref eq 'CODE' or croak "Invalid filter argument '$_', must be a code ref";
        }

        # open the output filehandle if our argument isn't already a filehandle
        my $out_fh;
        my $open_out = ! is_filehandle($destfile);
        if ($open_out) {
            open $out_fh, '>', $destfile
                or croak "Could not write to destination file $destfile: $!";
        } else {
            $out_fh = $destfile;
        }
        my $testhead = $options{test_only} ? 'head -c 30 |' : '';
        #warn "testhead is $testhead\n";

        # run the fetch command to download the file; the trailing |
        # makes this a read pipe
        open my $urlpipe, "cd /tmp; $fetchcommand '$url' |$options{gunzip}$testhead"
            or croak "Could not run $fetchcommand to fetch $url: $!";
        while (my $line = <$urlpipe>) {
            # if we were given filters, run them on it
            foreach my $filter (@filters) {
                $line = $filter->($line);
            }
            print $out_fh $line;
        }
        close $urlpipe;

        # close the output filehandle if it was us who opened it
        close $out_fh if $open_out;

        (stat($destfile))[7] > 0
            or croak "Could not download $url using command '$fetchcommand'";
        # print "done.\n";
    }
    ### cxgn-resource urls
    elsif ( $parsed_url->scheme eq 'cxgn-resource' ) {
        require CXGN::Tools::Wget::ResourceFile;

        confess 'filters do not work with cxgn-resource urls' if @filters;

        # look for a resource with that name
        my $resource_name = $parsed_url->authority;

        my ( $resource_file, $multiple_resources ) =
            CXGN::Tools::Wget::ResourceFile->search(
                name => $resource_name,
            );

        $resource_file or croak "no cxgn-resource found with name '$resource_name'";
        $multiple_resources and croak "multiple cxgn-resource entries found with name '$resource_name'";

        if ( $options{test_only} ) {
            #warn "test fetch\n";
            $resource_file->test_fetch();
        } else {
            $resource_file->fetch($destfile);
        }
    }
    elsif ( $parsed_url->scheme eq 'cxgn-wget' ) {
        require CXGN::Tools::Wget::ResourceExpression;

        confess 'filters do not work with cxgn-wget urls' if @filters;

        ( my $expression = "$url" ) =~ s!cxgn-wget://!!;

        CXGN::Tools::Wget::ResourceExpression::fetch_expression( $expression, $destfile );
    } else {
        croak "unable to handle URIs with scheme '".($parsed_url->scheme || '')."', URI is '$url'";
    }
}
# given a cache key string, return the full path to where the cached
# copy should be stored on the filesystem
sub cache_filename {
    my ($keystring) = @_;
    # md5sum the key to make a filename that is compact, does not need
    # escaping, and that is still unique
    my $name = md5_hex($keystring);
    return File::Spec->catfile( cache_root_dir(), "$name.gz" );
}
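# for example, cache_filename('http://example.com/f.txt WITH OPTIONS cache=>1')
# returns something like <cache_root_dir>/<32 hex digits of md5>.gz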
=head2 temp_root_dir

  Usage: CXGN::Tools::Wget->temp_root_dir( $new_dir );
  Desc : class method to get/set the class-wide temp directory where
         the wget file cache and temporarily fetched files are kept
  Args : (optional) new directory to set.
         defaults to /tmp/cxgn-tools-wget-<username>
  Ret  : root directory where wget will keep its cache files
  Side Effects: sets a piece of class data
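  Example:

    # keep wget temp and cache files on a bigger volume
    # (path is hypothetical)
    CXGN::Tools::Wget->temp_root_dir( '/data/shared/tmp' );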
=cut
my $cache_root;
sub temp_root_dir {
    my ($class,$new_root) = @_;
    $cache_root = $new_root if defined $new_root;
    return $cache_root ||= do {
        my $username = getpwuid $>;
        my $dir_name = File::Spec->catdir( File::Spec->tmpdir, "cxgn-tools-wget-$username" );
        system 'mkdir', -p => $dir_name;
        -w $dir_name or die "could not make and/or write to cache dir $dir_name\n";
        $dir_name
    };
}
sub cache_root_dir {
    my $c = File::Spec->catdir( temp_root_dir(), 'cxgn-tools-wget-cache' );
    -d $c or mkdir $c or die "$! making cache dir '$c'";
    return $c;
}
=head2 clear_cache

  Usage: CXGN::Tools::Wget::clear_cache();
  Desc : delete all the locally cached files managed by this module
  Args : none
  Ret  : nothing meaningful
  Side Effects: deletes files from the cache directory; dies on error
  Example:

    CXGN::Tools::Wget::clear_cache();

=cut
sub clear_cache {
    my ($class) = @_;
    my @delete_us   = glob cache_root_dir().'/*';
    my $num_deleted = unlink @delete_us;
    unless ($num_deleted == @delete_us) {
        croak "could not delete all files in the cache root directory (".cache_root_dir().") : $!";
    }
}
=head2 vacuum_cache

  Usage: CXGN::Tools::Wget::vacuum_cache(1200);
  Desc : delete all cached files that are older than N seconds old
  Args : number of seconds old a file must be to be deleted
  Ret  : nothing meaningful
  Side Effects: dies on error

=cut
sub vacuum_cache {
    my ($max_age) = @_;
    my @delete_us = grep { (stat $_)[9] < time - $max_age }
                    glob cache_root_dir().'/*';

    unlink( @delete_us ) == scalar( @delete_us )
        or croak "could not vacuum files in the cache root directory (".cache_root_dir().") : $!";
}

1;