package CXGN::Tools::Wget;
use strict;
use warnings;
use Carp::Clan qr/^CXGN::Tools::Wget/;
use File::Temp qw/tempfile/;
use Digest::MD5 qw/ md5_hex /;
use CXGN::Tools::List qw/ all str_in /;
use CXGN::Tools::File qw/ is_filehandle /;

use File::Spec;
use File::Flock;
use URI;

=head1 NAME

CXGN::Tools::Wget - functions for getting files via http or ftp in
ways that aren't directly supported by L<LWP>

=head1 SYNOPSIS

  use CXGN::Tools::Wget qw/wget_filter/;

  # get a gzipped file from a remote site, gunzipping it as it comes in
  # and putting it somewhere else
  wget_filter( 'http://example.com/myfile.gz' => '/tmp/somelocalfile.txt',
               { gunzip => 1 },
             );

  # get the same file, but transform each line as it comes in with the
  # given subroutine, because they really mean bonobos, not monkeys
  wget_filter( 'http://example.com/myfile.gz' => '/tmp/somelocalfile.txt',
               sub {
                 my $line = shift;
                 $line =~ s/\s+monkey/ bonobo/;
                 return $line;
               },
             );

  # get a cxgn-resource file defined in public.resource_file
  wget_filter( 'cxgn-resource://all_tomato_repeats' => 'myrepeats.seq' );

  # or have a temp file made for you
  my $temp_repeats_file = wget_filter( 'cxgn-resource://all_tomato_repeats' );

=head1 CXGN-RESOURCE URLS

Sometimes we have a need for making datasets out of several other
datasets.  For example, say you wanted a combined set of sequences
composed of NCBI's NR dataset, SGN's ESTs, and some random thing from
MIPS.  You could define a resource file like:

  insert into public.resource_file (name,expression)
  values ('robs_composite_set','cat( gunzip(ftp://ftp.ncbi.nlm.nih.gov/nr.gz), ftp://ftp.sgn.cornell.edu/ests/Tomato_current.seq.gz, http://mips.gsf.de/some_random_set.fasta )');

Then, if you fetch it with:

  my $file = wget_filter('cxgn-resource://robs_composite_set');

you will get the concatenation of the unzipped NR set, the SGN EST
set, and the MIPS set.  What actually happens behind the scenes is:
wget downloads each of the files, gunzips the NR file, concatenates
the three into another file and caches it, then copies the cached copy
into another tempfile, whose name it returns to you.
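
Concretely, the cached copy is a gzipped file in this module's cache
directory, named with the MD5 hex digest of the URL plus the fetch
options (the path below is only illustrative; see temp_root_dir and
cache_filename below):

  /tmp/cxgn-tools-wget-<username>/cxgn-tools-wget-cache/<md5_hex>.gz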

But you didn't have to know that.  All you have to know is: define a
resource in the resource_file table, and wget_filter will build it for
you when you ask for it by wgetting a URL with the cxgn-resource
scheme.

=head1 CXGN-WGET URLS

Just like cxgn-resource URLs, except the resource expression is right
there in the URL itself:

  cxgn-wget://cat( http://google.com,
                   http://example.com/some_other_page.html
                 )
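
Fetching one of these works like any other URL (a sketch; the URLs
here are only illustrative):

  my $combined = wget_filter( 'cxgn-wget://cat( http://google.com, http://example.com/some_other_page.html )' );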

=head1 FUNCTIONS

All functions are @EXPORT_OK.

=cut

use base qw/ Exporter /;

BEGIN { our @EXPORT_OK = qw/ wget_filter clear_cache / }

=head2 wget_filter

  Usage: wget_filter( 'http://example.com/myfile.txt' => 'somelocalfile.txt' );
  Desc : get a remote file, optionally gunzipping it
         and/or running some subroutines on each line as it
         comes in
  Ret  : filename where the output was written, which
         is either the destination file you provided,
         or a tempfile if you did not provide a destination
  Args : url of the file to fetch,
         optional destination filename or filehandle,
         optional list of filters to run on each line,
         optional hashref of behavior options, as:
         { gunzip => 1,    #< gunzip the downloaded file before returning it. default false.
           cache  => 1,    #< enable/disable persistent caching. default enabled.
           max_age => 3*24*60*60 (3 days), #< if caching, maximum age of the cached copy in seconds
           unlink => 1,    #< enable/disable automatic deletion of the
                           #  temp file made and returned to you. only
                           #  relevant if no destination filename is
                           #  provided. default enabled.
           test_only => 0, #< if true, only download the first few
                           #  bytes of each of the components of the
                           #  resource, to check that everything looks OK
         }
  Side Effects: dies on error

  Example:

    # get the same file, but transform each line as it comes in with the
    # given subroutine, because they really mean bonobos, not monkeys
    wget_filter( 'http://example.com/myfile.gz' => '/tmp/somelocalfile.txt',
                 sub {
                   my $line = shift;
                   $line =~ s/\s+monkey/ bonobo/;
                   return $line;
                 },
               );

    # get a composite resource file defined in the
    # public.resource_file table
    wget_filter( 'cxgn-resource://test' => '/tmp/mytestfile.html' );
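
    # a sketch of the options hashref (values here are only
    # illustrative): skip the persistent cache and just test-fetch
    # the first few bytes of each component
    wget_filter( 'cxgn-resource://test',
                 { cache => 0, test_only => 1 },
               );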

=cut

use constant DEFAULT_CACHE_MAX_AGE => 3*24*60*60; #< 3 days, in seconds

sub wget_filter {
  my ($url,@args) = @_;

  #get our options hash if present
  my %options = (ref($args[-1]) eq 'HASH') ? %{pop @args} : ();

  $options{cache}  = 1 unless exists $options{cache};
  $options{unlink} = 1 unless exists $options{unlink};

  $options{cache} = 0 if $options{test_only};
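  # (a test_only fetch downloads only the first few bytes of each
  # component, so it is never worth caching)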

  #use the destination file if one was given, otherwise make a
  #tempfile under our temp root
  my $destfile = do {
    if( !$args[0] || ref $args[0] ) {
      my (undef,$f) = tempfile( File::Spec->catfile( __PACKAGE__->temp_root_dir(), 'cxgn-tools-wget-XXXXXX' ),
                                UNLINK => $options{unlink},
                              );
      # cluck "made tempfile $f\n";
      $f
    }
    else {
      shift @args
    }
  };

  #and the rest of the arguments must be our filters
  my @filters = @args;
  !@filters || all map ref $_ eq 'CODE', @filters
    or confess "all filters must be subroutine refs or anonymous subs (".join(',',@filters).")";

  my $do_actual_fetch = sub {
    _wget_filter({ filters  => \@filters,
                   destfile => $destfile,
                   url      => $url,
                   options  => \%options,
                 });
  };

  # if we are filtering, or if caching is off, just do the fetch
  # without touching the cache
  return $do_actual_fetch->() if @filters || !$options{cache};

  # otherwise, we are caching, and we need to do locking

  # only do caching if we don't have any filters (we can't represent
  # these in a persistent way in a hash key, because the CODE(...)
  # will be different at every program run)
  my $cache_key = $url.' WITH OPTIONS '.join('=>',%options);
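  # (the options are part of the key, so e.g. gunzipped and plain
  # fetches of the same URL get separate cache entries)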
  my $cache_filename = cache_filename( $cache_key );
  my $lock_filename  = "$cache_filename.lock";

  my $try_read_lock  = sub { File::Flock->new( $lock_filename, 'shared', 'nonblocking' ) };
  my $try_write_lock = sub { File::Flock->new( $lock_filename, undef,    'nonblocking' ) };
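  # locking protocol: any number of readers hold the shared lock while
  # copying out of the cache; a single writer holds the exclusive lock
  # (the undef lock type) while it refreshes the cache file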

  my $cache_file_looks_valid = sub {
    -r $cache_filename
      && (time-(stat($cache_filename))[9]) <= ($options{max_age} || DEFAULT_CACHE_MAX_AGE)
  };

  my $copy_from_cache = sub {
    my $gunzip_error = system qq!gunzip -c '$cache_filename' > '$destfile'!;
    return 0 if $gunzip_error;
    return 1;
  };

  my $read_cache = sub {
    my $read_lock;
    sleep 1 until $read_lock = $try_read_lock->();
    return $destfile if $cache_file_looks_valid->() && $copy_from_cache->();
    return;
  };

  # read from the cache if we can; otherwise, the cache file needs
  # updating or is corrupt, so try to get an exclusive write lock
  # and rebuild it ourselves
  my $dest_from_cache;
  until ( $dest_from_cache = $read_cache->() ) {
    if( my $write_lock = $try_write_lock->() ) {
      $do_actual_fetch->();

      # write the destfile into the cache
      system qq!gzip -c '$destfile' > '$cache_filename'!
        and confess "$! writing downloaded file to CXGN::Tools::Wget persistent cache (gzip $destfile -> $cache_filename)";

      return $destfile;
    }
  }
  return $dest_from_cache;
}

# the get, minus caching
sub _wget_filter {
  my $args = shift;

  my %options  = %{ $args->{options} };
  my $url      = $args->{url};
  my $destfile = $args->{destfile} or die 'must pass a destfile';
  my @filters  = @{ $args->{filters} };

  #properly form the gunzip command
  $options{gunzip} = $options{gunzip} ? ' gunzip -c |' : '';
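  # (note the trailing '|': when gunzipping is on, this string becomes
  # an extra stage in the piped open below)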

  my $parsed_url = URI->new($url)
    or croak "could not parse uri '$url'";

  if ( str_in( $parsed_url->scheme, qw/ file http ftp / ) ) {

    #try to use ncftpget for fetching from ftp with no wildcards, since
    #wget suffers from some kind of bug with large ftp transfers.
    #use wget for everything else, since it's a little more flexible
    my $fetchcommand =
        $parsed_url->scheme eq 'file'                    ? 'cat'          :
        $parsed_url->scheme eq 'ftp' && $url !~ /[\*\?]/ ? 'ncftpget -cV' :
                                                           'wget -q -O -' ;

    $url = $parsed_url->path if $parsed_url->scheme eq 'file';
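    # ('cat' wants a bare filesystem path, not the full file:// url)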

    #check that all of the given filters are code refs
    @filters = grep {$_} @filters; #just ignore false things in the filters
    foreach (@filters) {
      ref eq 'CODE' or croak "Invalid filter argument '$_', must be a code ref";
    }

    #open the output filehandle if our argument isn't already a filehandle
    my $out_fh;
    my $open_out = ! is_filehandle($destfile);
    if( $open_out ) {
      open $out_fh, '>', $destfile
        or croak "Could not write to destination file $destfile: $!";
    }
    else {
      $out_fh = $destfile;
    }

    my $testhead = $options{test_only} ? 'head -c 30 |' : '';
    #warn "testhead is $testhead\n";

    #run the fetch command to download the file.  the command string
    #always ends in '|' (either the literal one below or the trailing
    #pipes in the gunzip/testhead fragments), making this a piped open
    #that reads the pipeline's stdout
    open my $urlpipe, "cd /tmp; $fetchcommand '$url' |$options{gunzip}$testhead"
      or croak "Could not run $fetchcommand to fetch $url: $!";
    while (my $line = <$urlpipe>) {
      #if we were given filters, run them on it
      foreach my $filter (@filters) {
        $line = $filter->($line);
      }
      #and write the (possibly transformed) line to the output
      print $out_fh $line;
    }
    close $urlpipe;

    #close the output filehandle if it was us who opened it
    close $out_fh if $open_out;

    (stat($destfile))[7] > 0
      or croak "Could not download $url using command '$fetchcommand'";
  }
  ### cxgn-resource urls
  elsif ( $parsed_url->scheme eq 'cxgn-resource' ) {
    require CXGN::Tools::Wget::ResourceFile;

    confess 'filters do not work with cxgn-resource urls' if @filters;

    #look for a resource with that name
    my $resource_name = $parsed_url->authority;

    my ( $resource_file, $multiple_resources ) =
        CXGN::Tools::Wget::ResourceFile->search(
            name => $resource_name,
        );

    $resource_file or croak "no cxgn-resource found with name '$resource_name'";
    $multiple_resources and croak "multiple cxgn-resource entries found with name '$resource_name'";

    if ( $options{test_only} ) {
      #warn "test fetch\n";
      $resource_file->test_fetch();
    }
    else {
      $resource_file->fetch($destfile);
    }
  }
  ### cxgn-wget urls
  elsif ( $parsed_url->scheme eq 'cxgn-wget' ) {
    require CXGN::Tools::Wget::ResourceExpression;

    confess 'filters do not work with cxgn-wget urls' if @filters;

    ( my $expression = "$url" ) =~ s!cxgn-wget://!!;

    CXGN::Tools::Wget::ResourceExpression::fetch_expression( $expression, $destfile );
  }
  else {
    croak "unable to handle URIs with scheme '".($parsed_url->scheme || '')."', URI is '$url'";
  }

  return $destfile;
}

#given a url (plus fetch options), return the full path where its
#cached copy should be stored on the filesystem
sub cache_filename {
  my ($keystring) = @_;
  # md5sum the key to make a filename that is compact, does not need
  # escaping, and that is still unique
  my $name = md5_hex($keystring);
  return File::Spec->catfile(cache_root_dir(), "$name.gz");
}
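# usage sketch (digest shown as a placeholder):
#   cache_filename( $cache_key )
#     # -> <cache root dir>/<32-hex-char-md5>.gz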

=head2 temp_root_dir

  Usage: CXGN::Tools::Wget->temp_root_dir( $new_dir );
  Desc : class method to get/set the class-wide temp directory where
         the wget file cache and temporarily fetched files are kept
  Args : (optional) new directory to set.
         defaults to /tmp/cxgn-tools-wget-<username>
  Ret  : root directory where wget will keep its cache files
  Side Effects: sets a piece of class data

=cut

{ my $cache_root;

  sub temp_root_dir {
    my ($class,$new_root) = @_;
    $cache_root = $new_root if defined $new_root;

    return $cache_root ||= do {
      my $username = getpwuid $>;
      my $dir_name = File::Spec->catdir( File::Spec->tmpdir, "cxgn-tools-wget-$username" );
      system 'mkdir', -p => $dir_name;
      -w $dir_name or die "could not make and/or write to cache dir $dir_name\n";
      $dir_name
    };
  }
}

sub cache_root_dir {
  my $c = File::Spec->catdir( temp_root_dir(), 'cxgn-tools-wget-cache' );
  -d $c or mkdir $c or die "$! making cache dir '$c'";
  return $c;
}

=head2 clear_cache

  Usage: CXGN::Tools::Wget::clear_cache();
  Desc : delete all the locally cached files managed by this module
  Args : none
  Ret  : nothing meaningful
  Side Effects: dies on error

=cut

sub clear_cache {
  my @delete_us = glob cache_root_dir().'/*';
  my $num_deleted = unlink @delete_us;
  unless( $num_deleted == @delete_us ) {
    croak "could not delete all files in the cache root directory (".cache_root_dir().") : $!";
  }
}

=head2 vacuum_cache

  Usage: CXGN::Tools::Wget::vacuum_cache(1200);
  Desc : delete all cached files that are older than N seconds old
  Args : number of seconds old a file must be to be deleted
  Ret  : nothing meaningful
  Side Effects: dies on error

=cut

sub vacuum_cache {
  my ($max_age) = @_;
  my @delete_us = grep { (stat $_)[9] < time-$max_age }
                  glob cache_root_dir().'/*';

  unlink( @delete_us ) == scalar( @delete_us )
    or croak "could not vacuum files in the cache root directory (".cache_root_dir().") : $!";
}

1;