1 package CXGN
::Tools
::Wget
;
5 use Carp
::Clan
qr/^CXGN::Tools::Wget/;
8 use File
::Temp qw
/tempfile/;
10 use Digest
::MD5 qw
/ md5_hex /;
13 use CXGN
::Tools
::List qw
/ all str_in /;
14 use CXGN
::Tools
::File qw
/ is_filehandle /;
18 CXGN::Tools::Wget - contains functions for getting files via http
19 or ftp in ways that aren't directly supported by L<LWP>.
23 use CXGN::Tools::Wget qw/wget_filter/;
25 #get a gzipped file from a remote site, gunzipping it as it comes in
26 #and putting in somewhere else
27 wget_filter( http://example.com/myfile.gz => '/tmp/somelocalfile.txt',
31 #get the same file, but transform each line as it comes in with the given
32 #subroutine, because they really mean bonobos, not monkeys
33 wget_filter( http://example.com/myfile.gz => '/tmp/somelocalfile.txt',
36 $line =~ s/\s+monkey/ bonobo/;
42 # get a cxgn-resource file defined in public.resource_file
43 wget_filter( cxgn-resource://all_tomato_repeats => 'myrepeats.seq');
45 my $temp_repeats_file = wget_filter( 'cxgn-resource://all_tomato_repeats' );
47 =head1 ABOUT CXGN-RESOURCE URLS
49 Sometimes we have a need for making datasets out of several other
50 datasets. For example, say you wanted a combined set of sequences
51 composed of NCBI's NR dataset, SGN's ESTs, and some random thing from
52 MIPS. You could define a resource file like:
54 insert into public.resource_file (name,expression)
55 values ('robs_composite_set','cat( gunzip(ftp://ftp.ncbi.nlm.nih.gov/nr.gz), ftp://ftp.sgn.cornell.edu/ests/Tomato_current.seq.gz, http://mips.gsf.de/some_random_set.fasta )');
59 my $file = wget_filter('cxgn-resource://robs_composite_set');
61 You will get the concatenation of the unzipped NR set, the SGN est
62 set, and the MIPs set. What actually happens behind the scenes is,
63 wget downloads each of the files, gunzips the nr file, then
64 concatenates the three into another file and caches it, then copies
65 the cached copy into another tempfile, whose name it returns to you.
67 But you didn't have to know that. All you have to know is, define a
68 resource in the resource_file table, and wget_filter will build it for
69 you when you ask for it by wgetting a URL with the cxgn-resource
74 All functions are @EXPORT_OK.
78 BEGIN { our @EXPORT_OK = qw
/ wget_filter clear_cache / }
87 Usage: wget_filter( http://example.com/myfile.txt => 'somelocalfile.txt');
88 Desc : get a remote file, optionally gunzipping it,
89 and/or running some subroutines on each line as it
91 Ret : filename where the output was written, which
92 is either the destination file you provided,
93 or a tempfile if you did not provide a destination
96 optional destination filename or filehandle,
97 optional list of filters to run on each line,
98 optional hashref of behavior options, as:
99 { gunzip => 1, #< gunzip the downloaded file before returning it. default false.
100 cache => 1, #< enable/disable persistent caching. default enabled
101 max_age => 3*24*60*60 (3 days), #< if caching maximum age of cached copy in seconds
102 unlink => 1, #< enable/disable automatic deletion of the
103 #temp file made and returned to you. only
104 #relevant if no destination filename is
106 test_only => 0, #if true, only download the first few
107 #bytes of each of the components of the
108 #resource, to check if everything looks OK
111 Side Effects: dies on error
113 #get the same file, but transform each line as it comes in with the given
114 #subroutine, because they really mean bonobos, not monkeys
115 wget_filter( http://example.com/myfile.gz => '/tmp/somelocalfile.txt',
118 $line =~ s/\s+monkey/ bonobo/;
123 # get a composite resource file defined in the public.resource_file
125 wget_filter( cxgn-resource://test => '/tmp/mytestfile.html' );
129 use constant DEFAULT_CACHE_MAX_AGE
=> 3*24*60*60; #< 3 days ago, in seconds
132 my ($url,@args) = @_;
134 #get our options hash if present
135 my %options = (ref($args[-1]) eq 'HASH') ?
%{pop @args} : ();
137 $options{cache
} = 1 unless exists $options{cache
};
138 $options{unlink} = 1 unless exists $options{unlink};
140 $options{cache
} = 0 if $options{test_only
};
143 if( !$args[0] || ref $args[0]) {
144 my (undef,$f) = tempfile
( File
::Spec
->catfile( __PACKAGE__
->temp_root_dir(), 'cxgn-tools-wget-XXXXXX'), UNLINK
=> $options{unlink});
145 # cluck "made tempfile $f\n";
152 #and the rest of the arguments must be our filters
154 !@filters || all
map ref $_ eq 'CODE', @filters
155 or confess
"all filters must be subroutine refs or anonymous subs (".join(',',@filters).")";
157 my $do_actual_fetch = sub {
158 _wget_filter
({ filters
=> \
@filters,
159 destfile
=> $destfile,
161 options
=> \
%options,
165 # if we are filtering, just do the fetch without caching
166 return $do_actual_fetch->() if @filters || !$options{cache
};
168 # otherwise, we are caching, and we need to do locking
170 # only do caching if we don't have any filters (we can't represent
171 # these in a persistent way in a hash key, because the CODE(...)
172 # will be different at every program run
173 my $cache_key = $url.' WITH OPTIONS '.join('=>',%options);
174 my $cache_filename = cache_filename
( $cache_key );
175 my $lock_filename = "$cache_filename.lock";
176 my $try_read_lock = sub { File
::Flock
->new( $lock_filename, 'shared', 'nonblocking' ) };
177 my $try_write_lock = sub { File
::Flock
->new( $lock_filename, undef, 'nonblocking' ) };
179 my $cache_file_looks_valid = sub {
181 && (time-(stat($cache_filename))[9]) <= ($options{max_age
} || DEFAULT_CACHE_MAX_AGE
)
184 my $copy_from_cache = sub {
185 my $gunzip_error = system qq!gunzip
-c
'$cache_filename' > '$destfile'!;
186 return 0 if $gunzip_error;
190 my $read_cache = sub {
192 sleep 1 until $read_lock = $try_read_lock->();
193 return $destfile if $cache_file_looks_valid->() && $copy_from_cache->();
197 # OK, the cache file needs updating or is corrupt, so try to get an
198 # exclusive write lock
200 until ( $dest_from_cache = $read_cache->() ) {
201 if( my $write_lock = $try_write_lock->() ) {
202 $do_actual_fetch->();
204 # write the destfile into the cache
205 system qq!gzip
-c
'$destfile' > '$cache_filename'!
206 and confess
"$! writing downloaded file to CXGN::Tools::Wget persistent cache (gzip $destfile -> $cache_filename.gz)";
213 return $dest_from_cache;
217 # the get, minus caching
220 my %options = %{$args->{options
}};
221 my $url = $args->{url
};
222 my $destfile = $args->{destfile
} or die 'pass destfile stupid';
223 my @filters = @
{ $args->{filters
} };
225 #properly form the gunzip command
226 $options{gunzip
} = $options{gunzip
} ?
' gunzip -c |' : '';
228 my $parsed_url = URI
->new($url)
229 or croak
"could not parse uri '$url'";
231 if ( str_in
( $parsed_url->scheme, qw
/ file http ftp / ) ) {
233 #try to use ncftpget for fetching from ftp with no wildcards, since
234 #wget suffers from some kind of bug with large ftp transfers.
235 #use wget for everything else, since it's a little more flexible
237 $parsed_url->scheme eq 'file' ?
'cat' :
238 $parsed_url->scheme eq 'ftp' && $url !~ /[\*\?]/ ?
"ncftpget -cV" :
242 $url = $parsed_url->path if $parsed_url->scheme eq 'file';
244 #check that all of the given filters are code refs
245 @filters = grep {$_} @filters; #just ignore false things in the filters
247 ref eq 'CODE' or croak
"Invalid filter argument '$_', must be a code ref";
250 #open the output filehandle if our argument isn't already a filehandle
252 my $open_out = ! is_filehandle
($destfile);
254 open $out_fh, '>', $destfile
255 or croak
"Could not write to destination file $destfile: $!";
260 my $testhead = $options{test_only
} ?
'head -c 30 |' : '';
261 #warn "testhead is $testhead\n";
263 #run wget to download the file
264 open my $urlpipe,"cd /tmp; $fetchcommand '$url' |$options{gunzip}$testhead"
265 or croak
"Could not use wget to fetch $url: $!";
266 while (my $line = <$urlpipe>) {
267 #if we were given filters, run them on it
268 foreach my $filter (@filters) {
269 $line = $filter->($line);
275 #close the output filehandle if it was us who opened it
276 close $out_fh if $open_out;
277 (stat($destfile))[7] > 0 || croak
"Could not download $url using command '$fetchcommand'";
280 ### cxgn-resource urls
281 elsif ( $parsed_url->scheme eq 'cxgn-resource' ) {
283 confess
'filters do not work with cxgn-resource urls' if @filters;
285 #look for a resource with that name
286 my $resource_name = $parsed_url->authority;
288 my ($resource_file,$multiple_resources) = CXGN
::Tools
::Wget
::ResourceFile
->search( name
=> $resource_name );
289 $resource_file or croak
"no cxgn-resource found with name '$resource_name'";
290 $multiple_resources and croak
"multiple cxgn-resource entries found with name '$resource_name'";
292 if ( $options{test_only
} ) {
293 #warn "test fetch\n";
294 $resource_file->test_fetch();
296 $resource_file->fetch($destfile);
299 croak
"unable to handle URIs with scheme '".($parsed_url->scheme || '')."', URI is '$url'";
304 #given a url, return the full path to where it should be stored on the
307 my ($keystring) = @_;
308 my $name = md5_hex
($keystring);
309 # md5sum the key to make a filename that is compact, does not need
310 # escaping, and that is still unique
311 return File
::Spec
->catfile(cache_root_dir
(), "$name.gz");
317 Usage: CXGN::Tools::Wget->temp_dir( $new_dir );
318 Desc : class method to get/set the class-wide temp directory where
319 the wget file cache and temporarily fetched files
321 Args : (optional) new directory to set.
322 defaults to /tmp/cxgn-tools-wget-<username>
323 Ret : root directory where wget will keep its cache files
324 Side Effects: sets a piece of class data
331 my ($class,$new_root) = @_;
332 $cache_root = $new_root if defined $new_root;
333 return $cache_root ||= do {
334 my $username = getpwuid $>;
335 my $dir_name = File
::Spec
->catdir( File
::Spec
->tmpdir, "cxgn-tools-wget-$username" );
336 system 'mkdir', -p
=> $dir_name;
337 -w
$dir_name or die "could not make and/or write to cache dir $dir_name\n";
342 my $c = File
::Spec
->catdir( temp_root_dir
(), 'cxgn-tools-wget-cache' );
343 -d
$c or mkdir $c or die "$! making cache dir '$c'";
350 Usage: CXGN::Tools::Wget::clear_cache
351 Desc : delete all the locally cached files managed by this module
361 my @delete_us = glob cache_root_dir
().'/*';
362 my $num_deleted = unlink @delete_us;
363 unless ($num_deleted == @delete_us) {
364 croak
"could not delete all files in the cache root directory (".cache_root_dir
().") : $!";
370 Usage: CXGN::Tools::Wget::vacuum_cache(1200)
371 Desc : delete all cached files that are older than N seconds old
372 Args : number of seconds old a file must be to be deleted
373 Ret : nothing meaningful
374 Side Effects: dies on error
380 my @delete_us = grep { (stat $_)[9] < time-$max_age }
381 glob cache_root_dir
().'/*';
383 unlink @delete_us == scalar @delete_us
384 or croak
"could not vacuum files in the cache root directory (".cache_root_dir
().") : $!";
388 package CXGN
::Tools
::Wget
::ResourceFile
;
389 use Carp qw
/ cluck confess croak / ;
390 use File
::Temp qw
/ tempfile /;
391 use CXGN
::Tools
::Run
;
392 use base
'CXGN::CDBI::Class::DBI';
393 __PACKAGE__
->table('resource_file');
394 __PACKAGE__
->columns(All
=> qw
/ resource_file_id name expression /);
396 # the SQL table definition is here just for reference
397 my $creation_statement = <<EOSQL;
399 create table resource_file (
400 resource_file_id serial primary key,
401 name varchar(40) not null unique,
402 expression text not null
405 comment on table resource_file is
406 'each row defines a composite dataset, downloadable at the url cxgn-resource://name, that is composed of other downloadable datasets, according to the expression column. See CXGN::Tools::Wget for the accompanying code'
413 Usage: $resourcefile->fetch('resource_name');
414 Desc : assemble this composite resource file from its components
415 Args : filename in which to store the complete
417 Ret : full path to the complete assembled file
422 my ($self,$destfile) = @_;
424 my $parse_tree = $self->_parsed_expression; #< dies on parse error
426 #now go depth-first down the tree and evaluate it
427 # note that if no dest file is provided, _evaluate()
428 # will make a temp file
429 return _evaluate
( $parse_tree, 0, $destfile);
434 Usage: $resourcefile->test_fetch()
435 Desc : just test this resource and its components, see
436 if they are all fetchable
438 Ret : true if successful, false if not
439 Side Effects: dies with an error if fetch was unsuccessful
446 my $parse_tree = $self->_parsed_expression; #< dies on parse error
448 #now go depth-first down the tree and evaluate it
449 # note that if no dest file is provided, _evaluate()
450 # will make a temp file
451 my $file = _evaluate
( $parse_tree, 1);
456 #recursively evaluate one of these little parse trees,
457 #converting the URLs and function calls into filenames
459 my ($tree,$testing,$destfile) = @_;
461 #if we haven't been given a destination file, make a temporary one
463 my (undef,$f) = tempfile
( File
::Spec
->catfile( CXGN
::Tools
::Wget
->temp_root_dir(), 'cxgn-tools-wget-resourcefile-XXXXXX' ), UNLINK
=> 0);
464 #cluck "made tempfile $f\n";
468 if( $tree->isa('call') ) {
469 # evaluate each argument, then call the function on it
470 # these evaluations are each going to make a temp file
471 my ($func,@args) = @
$tree;
472 @args = map _evaluate
($_,$testing), @args;
474 #now apply the function to each of these files and make a composite file
476 "op_$func"->($destfile,$testing,@args);
478 #delete each of the argument temp files
482 #fetch the URL pointed to
483 ref $tree and die "assertion failed, parse tree should only have one element here";
484 #warn "fetching $tree\n";
485 CXGN
::Tools
::Wget
::wget_filter
($tree,$destfile, {cache
=> 0, test_only
=> $testing});
491 # parse the expression and return a tree representation of it
492 sub _parsed_expression
{
494 my $exp = $self->expression;
497 #ignore all whitespace
500 #split the expression into symbols
501 @symbols = ($exp =~ /[^\(\),]+|./g);
503 my $parse_tree = _parse_expression
();
509 #recursively parse the expression
510 # _parse_expression and _parse_func make a simple recursive-descent parser
511 sub _parse_expression
{
512 #beginning of a tuple
513 if( $symbols[0] =~ /^\S{2,}$/ ) {
514 if( $symbols[1] && $symbols[1] eq '(' ) {
515 return _parse_func
();
517 return shift @symbols;
521 die "unexpected symbol '$symbols[0]'";
525 my $funcname = shift @symbols;
526 my $leftparen = shift @symbols;
528 or die "unexpected symbol '$leftparen'";
530 my @args = _parse_expression
;
532 while( $symbols[0] ne ')' ) {
533 if( $symbols[0] eq ',' ) {
535 push @args, _parse_expression
;
538 die "unexpected symbol '$symbols[0]'";
541 shift @symbols; #< shift off the )
543 #check that this is a valid function name
544 __PACKAGE__
->can("op_$funcname") or die "unknown resource file op '$funcname'";
546 return bless [ $funcname, @args ], 'call';
549 #### FILE OPERATION FUNCTIONS, ADD YOUR OWN BELOW HERE ########
551 # 1. each function takes a destination file name, then a list of
552 # filenames as arguments. It does its operation on the argument files,
553 # and writes to the destination file
555 # 2. functions are NOT allowed to modify any files except their
558 # 3. functions should die on error
561 my ($destfile,$testing,@files) = @_;
563 # warn "gunzip ".join(',',@files)."> $destfile\n";
565 my $gunzip = CXGN
::Tools
::Run
->run('gunzip', -c
=> @files,
566 { out_file
=> $destfile }
574 my ($destfile,$testing,@files) = @_;
575 # warn "cat ".join(',',@files)." > $destfile\n";
576 my $cat = CXGN
::Tools
::Run
->run('cat', @files,
577 { out_file
=> $destfile }