a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / Wget.pm
blob92ebe58792bb5065bfc936d82a8965485a0e3d72
1 package CXGN::Tools::Wget;
3 use strict;
4 use warnings;
5 use Carp::Clan qr/^CXGN::Tools::Wget/;
7 use File::Copy;
8 use File::Temp qw/tempfile/;
9 use File::Flock;
10 use Digest::MD5 qw/ md5_hex /;
11 use URI;
13 use CXGN::Tools::List qw/ all str_in /;
14 use CXGN::Tools::File qw/ is_filehandle /;
16 =head1 NAME
18 CXGN::Tools::Wget - contains functions for getting files via http
19 or ftp in ways that aren't directly supported by L<LWP>.
21 =head1 SYNOPSIS
23 use CXGN::Tools::Wget qw/wget_filter/;
25 #get a gzipped file from a remote site, gunzipping it as it comes in
26 #and putting in somewhere else
27 wget_filter( http://example.com/myfile.gz => '/tmp/somelocalfile.txt',
28 { gunzip => 1 },
31 #get the same file, but transform each line as it comes in with the given
32 #subroutine, because they really mean bonobos, not monkeys
33 wget_filter( http://example.com/myfile.gz => '/tmp/somelocalfile.txt',
34 sub {
35 my $line = shift;
36 $line =~ s/\s+monkey/ bonobo/;
37 return $line;
39 { gunzip => 1 },
42 # get a cxgn-resource file defined in public.resource_file
43 wget_filter( cxgn-resource://all_tomato_repeats => 'myrepeats.seq');
45 my $temp_repeats_file = wget_filter( 'cxgn-resource://all_tomato_repeats' );
47 =head1 ABOUT CXGN-RESOURCE URLS
49 Sometimes we have a need for making datasets out of several other
50 datasets. For example, say you wanted a combined set of sequences
51 composed of NCBI's NR dataset, SGN's ESTs, and some random thing from
52 MIPS. You could define a resource file like:
54 insert into public.resource_file (name,expression)
55 values ('robs_composite_set','cat( gunzip(ftp://ftp.ncbi.nlm.nih.gov/nr.gz), ftp://ftp.sgn.cornell.edu/ests/Tomato_current.seq.gz, http://mips.gsf.de/some_random_set.fasta )');
57 Then, when you go
59 my $file = wget_filter('cxgn-resource://robs_composite_set');
61 You will get the concatenation of the unzipped NR set, the SGN est
62 set, and the MIPs set. What actually happens behind the scenes is,
63 wget downloads each of the files, gunzips the nr file, then
64 concatenates the three into another file and caches it, then copies
65 the cached copy into another tempfile, whose name it returns to you.
67 But you didn't have to know that. All you have to know is, define a
68 resource in the resource_file table, and wget_filter will build it for
69 you when you ask for it by wgetting a URL with the cxgn-resource
70 protocol.
72 =head1 FUNCTIONS
74 All functions are @EXPORT_OK.
76 =cut
78 BEGIN { our @EXPORT_OK = qw/ wget_filter clear_cache / }
80 our @EXPORT_OK;
81 use base 'Exporter';
85 =head2 wget_filter
87 Usage: wget_filter( http://example.com/myfile.txt => 'somelocalfile.txt');
88 Desc : get a remote file, optionally gunzipping it,
89 and/or running some subroutines on each line as it
90 comes in.
91 Ret : filename where the output was written, which
92 is either the destination file you provided,
93 or a tempfile if you did not provide a destination
94 file
95 Args : (url of file,
96 optional destination filename or filehandle,
97 optional list of filters to run on each line,
98 optional hashref of behavior options, as:
99 { gunzip => 1, #< gunzip the downloaded file before returning it. default false.
100 cache => 1, #< enable/disable persistent caching. default enabled
101 max_age => 3*24*60*60 (3 days), #< if caching maximum age of cached copy in seconds
102 unlink => 1, #< enable/disable automatic deletion of the
103 #temp file made and returned to you. only
104 #relevant if no destination filename is
105 #provided
106 test_only => 0, #if true, only download the first few
107 #bytes of each of the components of the
108 #resource, to check if everything looks OK
111 Side Effects: dies on error
112 Example:
113 #get the same file, but transform each line as it comes in with the given
114 #subroutine, because they really mean bonobos, not monkeys
115 wget_filter( http://example.com/myfile.gz => '/tmp/somelocalfile.txt',
116 sub {
117 my $line = shift;
118 $line =~ s/\s+monkey/ bonobo/;
119 return $line;
121 { gunzip => 1 },
123 # get a composite resource file defined in the public.resource_file
124 # table
125 wget_filter( cxgn-resource://test => '/tmp/mytestfile.html' );
127 =cut
129 use constant DEFAULT_CACHE_MAX_AGE => 3*24*60*60; #< 3 days ago, in seconds
131 sub wget_filter {
132 my ($url,@args) = @_;
134 #get our options hash if present
135 my %options = (ref($args[-1]) eq 'HASH') ? %{pop @args} : ();
137 $options{cache} = 1 unless exists $options{cache};
138 $options{unlink} = 1 unless exists $options{unlink};
140 $options{cache} = 0 if $options{test_only};
142 my $destfile = do {
143 if( !$args[0] || ref $args[0]) {
144 my (undef,$f) = tempfile( File::Spec->catfile( __PACKAGE__->temp_root_dir(), 'cxgn-tools-wget-XXXXXX'), UNLINK => $options{unlink});
145 # cluck "made tempfile $f\n";
147 } else {
148 shift @args
152 #and the rest of the arguments must be our filters
153 my @filters = @args;
154 !@filters || all map ref $_ eq 'CODE', @filters
155 or confess "all filters must be subroutine refs or anonymous subs (".join(',',@filters).")";
157 my $do_actual_fetch = sub {
158 _wget_filter({ filters => \@filters,
159 destfile => $destfile,
160 url => $url,
161 options => \%options,
165 # if we are filtering, just do the fetch without caching
166 return $do_actual_fetch->() if @filters || !$options{cache};
168 # otherwise, we are caching, and we need to do locking
170 # only do caching if we don't have any filters (we can't represent
171 # these in a persistent way in a hash key, because the CODE(...)
172 # will be different at every program run
173 my $cache_key = $url.' WITH OPTIONS '.join('=>',%options);
174 my $cache_filename = cache_filename( $cache_key );
175 my $lock_filename = "$cache_filename.lock";
176 my $try_read_lock = sub { File::Flock->new( $lock_filename, 'shared', 'nonblocking' ) };
177 my $try_write_lock = sub { File::Flock->new( $lock_filename, undef, 'nonblocking' ) };
179 my $cache_file_looks_valid = sub {
180 -r $cache_filename
181 && (time-(stat($cache_filename))[9]) <= ($options{max_age} || DEFAULT_CACHE_MAX_AGE)
184 my $copy_from_cache = sub {
185 my $gunzip_error = system qq!gunzip -c '$cache_filename' > '$destfile'!;
186 return 0 if $gunzip_error;
187 return 1;
190 my $read_cache = sub {
191 my $read_lock;
192 sleep 1 until $read_lock = $try_read_lock->();
193 return $destfile if $cache_file_looks_valid->() && $copy_from_cache->();
194 return;
197 # OK, the cache file needs updating or is corrupt, so try to get an
198 # exclusive write lock
199 my $dest_from_cache;
200 until ( $dest_from_cache = $read_cache->() ) {
201 if( my $write_lock = $try_write_lock->() ) {
202 $do_actual_fetch->();
204 # write the destfile into the cache
205 system qq!gzip -c '$destfile' > '$cache_filename'!
206 and confess "$! writing downloaded file to CXGN::Tools::Wget persistent cache (gzip $destfile -> $cache_filename.gz)";
208 return $destfile;
213 return $dest_from_cache;
217 # the get, minus caching
218 sub _wget_filter {
219 my $args = shift;
220 my %options = %{$args->{options}};
221 my $url = $args->{url};
222 my $destfile = $args->{destfile} or die 'pass destfile stupid';
223 my @filters = @{ $args->{filters} };
225 #properly form the gunzip command
226 $options{gunzip} = $options{gunzip} ? ' gunzip -c |' : '';
228 my $parsed_url = URI->new($url)
229 or croak "could not parse uri '$url'";
231 if ( str_in( $parsed_url->scheme, qw/ file http ftp / ) ) {
233 #try to use ncftpget for fetching from ftp with no wildcards, since
234 #wget suffers from some kind of bug with large ftp transfers.
235 #use wget for everything else, since it's a little more flexible
236 my $fetchcommand =
237 $parsed_url->scheme eq 'file' ? 'cat' :
238 $parsed_url->scheme eq 'ftp' && $url !~ /[\*\?]/ ? "ncftpget -cV" :
239 "wget -q -O -" ;
242 $url = $parsed_url->path if $parsed_url->scheme eq 'file';
244 #check that all of the given filters are code refs
245 @filters = grep {$_} @filters; #just ignore false things in the filters
246 foreach (@filters) {
247 ref eq 'CODE' or croak "Invalid filter argument '$_', must be a code ref";
250 #open the output filehandle if our argument isn't already a filehandle
251 my $out_fh;
252 my $open_out = ! is_filehandle($destfile);
253 if ($open_out) {
254 open $out_fh, '>', $destfile
255 or croak "Could not write to destination file $destfile: $!";
256 } else {
257 $out_fh = $destfile;
260 my $testhead = $options{test_only} ? 'head -c 30 |' : '';
261 #warn "testhead is $testhead\n";
263 #run wget to download the file
264 open my $urlpipe,"cd /tmp; $fetchcommand '$url' |$options{gunzip}$testhead"
265 or croak "Could not use wget to fetch $url: $!";
266 while (my $line = <$urlpipe>) {
267 #if we were given filters, run them on it
268 foreach my $filter (@filters) {
269 $line = $filter->($line);
271 print $out_fh $line;
273 close $urlpipe;
275 #close the output filehandle if it was us who opened it
276 close $out_fh if $open_out;
277 (stat($destfile))[7] > 0 || croak "Could not download $url using command '$fetchcommand'";
278 # print "done.\n";
280 ### cxgn-resource urls
281 elsif ( $parsed_url->scheme eq 'cxgn-resource' ) {
283 confess 'filters do not work with cxgn-resource urls' if @filters;
285 #look for a resource with that name
286 my $resource_name = $parsed_url->authority;
288 my ($resource_file,$multiple_resources) = CXGN::Tools::Wget::ResourceFile->search( name => $resource_name );
289 $resource_file or croak "no cxgn-resource found with name '$resource_name'";
290 $multiple_resources and croak "multiple cxgn-resource entries found with name '$resource_name'";
292 if ( $options{test_only} ) {
293 #warn "test fetch\n";
294 $resource_file->test_fetch();
295 } else {
296 $resource_file->fetch($destfile);
298 } else {
299 croak "unable to handle URIs with scheme '".($parsed_url->scheme || '')."', URI is '$url'";
304 #given a url, return the full path to where it should be stored on the
305 #filesystem
306 sub cache_filename {
307 my ($keystring) = @_;
308 my $name = md5_hex($keystring);
309 # md5sum the key to make a filename that is compact, does not need
310 # escaping, and that is still unique
311 return File::Spec->catfile(cache_root_dir(), "$name.gz");
315 =head2 temp_root_dir
317 Usage: CXGN::Tools::Wget->temp_dir( $new_dir );
318 Desc : class method to get/set the class-wide temp directory where
319 the wget file cache and temporarily fetched files
320 are kept
321 Args : (optional) new directory to set.
322 defaults to /tmp/cxgn-tools-wget-<username>
323 Ret : root directory where wget will keep its cache files
324 Side Effects: sets a piece of class data
326 =cut
329 my $cache_root;
330 sub temp_root_dir {
331 my ($class,$new_root) = @_;
332 $cache_root = $new_root if defined $new_root;
333 return $cache_root ||= do {
334 my $username = getpwuid $>;
335 my $dir_name = File::Spec->catdir( File::Spec->tmpdir, "cxgn-tools-wget-$username" );
336 system 'mkdir', -p => $dir_name;
337 -w $dir_name or die "could not make and/or write to cache dir $dir_name\n";
338 $dir_name
341 sub cache_root_dir {
342 my $c = File::Spec->catdir( temp_root_dir(), 'cxgn-tools-wget-cache' );
343 -d $c or mkdir $c or die "$! making cache dir '$c'";
348 =head2 clear_cache
350 Usage: CXGN::Tools::Wget::clear_cache
351 Desc : delete all the locally cached files managed by this module
352 Args :
353 Ret :
354 Side Effects:
355 Example:
357 =cut
359 sub clear_cache {
360 my ($class) = @_;
361 my @delete_us = glob cache_root_dir().'/*';
362 my $num_deleted = unlink @delete_us;
363 unless ($num_deleted == @delete_us) {
364 croak "could not delete all files in the cache root directory (".cache_root_dir().") : $!";
368 =head2 vacuum_cache
370 Usage: CXGN::Tools::Wget::vacuum_cache(1200)
371 Desc : delete all cached files that are older than N seconds old
372 Args : number of seconds old a file must be to be deleted
373 Ret : nothing meaningful
374 Side Effects: dies on error
376 =cut
378 sub vacuum_cache {
379 my ($max_age) = @_;
380 my @delete_us = grep { (stat $_)[9] < time-$max_age }
381 glob cache_root_dir().'/*';
383 unlink @delete_us == scalar @delete_us
384 or croak "could not vacuum files in the cache root directory (".cache_root_dir().") : $!";
388 package CXGN::Tools::Wget::ResourceFile;
389 use Carp qw/ cluck confess croak / ;
390 use File::Temp qw/ tempfile /;
391 use CXGN::Tools::Run;
392 use base 'CXGN::CDBI::Class::DBI';
393 __PACKAGE__->table('resource_file');
394 __PACKAGE__->columns(All => qw/ resource_file_id name expression /);
396 # the SQL table definition is here just for reference
397 my $creation_statement = <<EOSQL;
399 create table resource_file (
400 resource_file_id serial primary key,
401 name varchar(40) not null unique,
402 expression text not null
405 comment on table resource_file is
406 'each row defines a composite dataset, downloadable at the url cxgn-resource://name, that is composed of other downloadable datasets, according to the expression column. See CXGN::Tools::Wget for the accompanying code'
409 EOSQL
411 =head2 fetch
413 Usage: $resourcefile->fetch('resource_name');
414 Desc : assemble this composite resource file from its components
415 Args : filename in which to store the complete
416 assembled file
417 Ret : full path to the complete assembled file
419 =cut
421 sub fetch {
422 my ($self,$destfile) = @_;
424 my $parse_tree = $self->_parsed_expression; #< dies on parse error
426 #now go depth-first down the tree and evaluate it
427 # note that if no dest file is provided, _evaluate()
428 # will make a temp file
429 return _evaluate( $parse_tree, 0, $destfile);
432 =head2 test_fetch()
434 Usage: $resourcefile->test_fetch()
435 Desc : just test this resource and its components, see
436 if they are all fetchable
437 Args : none
438 Ret : true if successful, false if not
439 Side Effects: dies with an error if fetch was unsuccessful
441 =cut
443 sub test_fetch {
444 my ( $self ) = @_;
446 my $parse_tree = $self->_parsed_expression; #< dies on parse error
448 #now go depth-first down the tree and evaluate it
449 # note that if no dest file is provided, _evaluate()
450 # will make a temp file
451 my $file = _evaluate( $parse_tree, 1);
452 unlink $file;
453 return 1;
456 #recursively evaluate one of these little parse trees,
457 #converting the URLs and function calls into filenames
458 sub _evaluate {
459 my ($tree,$testing,$destfile) = @_;
461 #if we haven't been given a destination file, make a temporary one
462 $destfile ||= do {
463 my (undef,$f) = tempfile( File::Spec->catfile( CXGN::Tools::Wget->temp_root_dir(), 'cxgn-tools-wget-resourcefile-XXXXXX' ), UNLINK => 0);
464 #cluck "made tempfile $f\n";
468 if( $tree->isa('call') ) {
469 # evaluate each argument, then call the function on it
470 # these evaluations are each going to make a temp file
471 my ($func,@args) = @$tree;
472 @args = map _evaluate($_,$testing), @args;
474 #now apply the function to each of these files and make a composite file
475 no strict 'refs';
476 "op_$func"->($destfile,$testing,@args);
478 #delete each of the argument temp files
479 unlink @args;
481 else {
482 #fetch the URL pointed to
483 ref $tree and die "assertion failed, parse tree should only have one element here";
484 #warn "fetching $tree\n";
485 CXGN::Tools::Wget::wget_filter($tree,$destfile, {cache => 0, test_only => $testing});
487 return $destfile;
490 our @symbols;
491 # parse the expression and return a tree representation of it
492 sub _parsed_expression {
493 my ($self) = @_;
494 my $exp = $self->expression;
496 # $exp = 'foo';
497 #ignore all whitespace
498 $exp =~ s/\s//g;
500 #split the expression into symbols
501 @symbols = ($exp =~ /[^\(\),]+|./g);
503 my $parse_tree = _parse_expression();
505 return $parse_tree;
509 #recursively parse the expression
510 # _parse_expression and _parse_func make a simple recursive-descent parser
511 sub _parse_expression {
512 #beginning of a tuple
513 if( $symbols[0] =~ /^\S{2,}$/ ) {
514 if( $symbols[1] && $symbols[1] eq '(' ) {
515 return _parse_func();
516 } else {
517 return shift @symbols;
520 else {
521 die "unexpected symbol '$symbols[0]'";
524 sub _parse_func {
525 my $funcname = shift @symbols;
526 my $leftparen = shift @symbols;
527 $leftparen eq '('
528 or die "unexpected symbol '$leftparen'";
530 my @args = _parse_expression;
532 while( $symbols[0] ne ')' ) {
533 if( $symbols[0] eq ',' ) {
534 shift @symbols;
535 push @args, _parse_expression;
537 else {
538 die "unexpected symbol '$symbols[0]'";
541 shift @symbols; #< shift off the )
543 #check that this is a valid function name
544 __PACKAGE__->can("op_$funcname") or die "unknown resource file op '$funcname'";
546 return bless [ $funcname, @args ], 'call';
549 #### FILE OPERATION FUNCTIONS, ADD YOUR OWN BELOW HERE ########
551 # 1. each function takes a destination file name, then a list of
552 # filenames as arguments. It does its operation on the argument files,
553 # and writes to the destination file
555 # 2. functions are NOT allowed to modify any files except their
556 # destination file
558 # 3. functions should die on error
560 sub op_gunzip {
561 my ($destfile,$testing,@files) = @_;
563 # warn "gunzip ".join(',',@files)."> $destfile\n";
564 unless( $testing ) {
565 my $gunzip = CXGN::Tools::Run->run('gunzip', -c => @files,
566 { out_file => $destfile }
568 } else {
569 `touch $destfile`;
573 sub op_cat {
574 my ($destfile,$testing,@files) = @_;
575 # warn "cat ".join(',',@files)." > $destfile\n";
576 my $cat = CXGN::Tools::Run->run('cat', @files,
577 { out_file => $destfile }
581 =head1 AUTHOR
583 Robert Buels
585 =cut
588 1;#do not remove