maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Tools / Run / Analysis.pm
blobe9851f3c5c19e58c8649635dff6163cb04172b49
2 # BioPerl module for Bio::Tools::Run::Analysis
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Martin Senger <martin.senger@gmail.com>
7 # For copyright and disclaimer see below.
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::Tools::Run::Analysis - Module representing any (remote or local)
15 analysis tool
17 =head1 SYNOPSIS
19 # run analysis 'seqret' using a default location and a default
20 # access method (which means using a Web Service at EBI)
21 use Bio::Tools::Run::Analysis;
22 print new Bio::Tools::Run::Analysis (-name => 'edit::seqret')
23 ->wait_for ({ sequence_direct_data => 'tatatacgtatacga',
24 osformat => 'embl'
26 ->result ('outseq');
28 # run a longer job without waiting for its completion
29 use Bio::Tools::Run::Analysis;
30 my $job = Bio::Tools::Run::Analysis->new(-name => 'edit::seqret')
31 ->run ({ sequence_direct_data => 'tatatacgtatacga',
32 osformat => 'embl'
33 });
34 # ...and after a while
35 $job->result ('outseq');
37 # get all results in the same invocation (as a hash reference
38 # with result names as keys) - let the module decide which
39 # results are binary (images in this examples) and save those
40 # in file (or files); it also shows how to tell that the module
41 # should read input data from a local file first
42 use Bio::Tools::Run::Analysis;
43 my $results =
44 Bio::Tools::Run::Analysis->new(-name => 'alignment_multiple::prettyplot')
45 ->wait_for ( { msf_direct_data => '@/home/testdata/my.seq' } )
46 ->results ('?');
47 use Data::Dumper;
48 print Dumper ($results);
50 # get names, types of all inputs and results,
51 # get short and detailed (in XML) service description
52 use Bio::Tools::Run::Analysis;
53 my $service = Bio::Tools::Run::Analysis->new(-name => 'edit::seqret');
54 my $hash1 = $service->input_spec;
55 my $hash2 = $service->result_spec;
56 my $hash3 = $service->analysis_spec;
57 my $xml = $service->describe;
59 # get current job status
60 use Bio::Tools::Run::Analysis;
61 print new Bio::Tools::Run::Analysis (-name => 'edit::seqret')
62 ->run ( { #...input data...
63 } )
64 ->status;
66 # run a job and print its job ID, keep the job un-destroyed
67 use Bio::Tools::Run::Analysis;
68 my $job =
69 Bio::Tools::Run::Analysis->new(-name => 'edit::seqret',
70 -destroy_on_exit => 0)
71 ->run ( { sequence_direct_data => '@/home/testdata/mzef.seq' } );
72 print $job->id . "\n";
73 # ...it prints (for example):
74 # edit::seqret/c8ef56:ef535489ac:-7ff4
76 # ...in another time, on another planet, you may say
77 use Bio::Tools::Run::Analysis;
78 my $job =
79 Bio::Tools::Run::Analysis::Job->new(-name => 'edit::seqret',
80 -id => 'edit::seqret/c8ef56:ef535489ac:-7ff4');
81 print join ("\n",
82 $job->status,
83 'Finished: ' . $job->ended (1), # (1) means 'formatted'
84 'Elapsed time: ' . $job->elapsed,
85 $job->last_event,
86 $job->result ('outseq')
89 # ...or you may achieve the same keeping module
90 # Bio::Tools::Run::Analysis::Job invisible
91 use Bio::Tools::Run::Analysis;
92 my $job =
93 Bio::Tools::Run::Analysis->new(-name => 'edit::seqret')
94 ->create_job ('edit::seqret/c8ef56:ef535489ac:-7ff4');
95 print join ("\n",
96 $job->status,
97 # ...
100 # ...and later you may free this job resources
101 $job->remove;
104 # --- See DESCRIPTION for using generator 'applmaker.pl':
108 =head1 DESCRIPTION
110 The module represents an access to the local and/or remote analysis
111 tools in a unified way that allows adding new access methods
112 (protocols) seamlessly.
114 At the moment of writing, there is available a I<SOAP> access to
115 almost all EMBOSS applications, running at the
116 European Bioinformatics Institute.
118 The documentation of all C<public> methods are to be found
119 in C<Bio::AnalysisI>. A tutorial (and examples how to call almost all
120 public methods) is in the script C<panalysis.PLS> (go to the C<scripts>
121 directory and type C<perldoc panalysis.PLS>).
123 The module C<Bio::Tools::Run::Analysis> uses general approach allowing to set
124 arbitrary input data and to retrieve results by naming them. However,
125 sometimes is more convenient to use a specific module, representing
126 one analysis tool, that already knows about available input and result
127 names. Such analyses-specific Perl modules can be generated by
128 C<papplmaker.PLS> generator. Its features and usage are documented in
129 the generator (go to the C<scripts> directory and type C<perldoc
130 papplmaker.PLS>).
132 # this will generate module Seqret.pm
133 perl papplmaker.PLS -n edit.seqret -m Seqret
135 # ...which can be used with data-specific methods
136 use Seqret;
137 my $outseq = new Seqret
138 ->sequence_direct_data ('@/home/testdata/my.seq')
139 ->osformat ('embl')
140 ->wait_for
141 ->outseq
143 print $outseq;
145 =head1 FEEDBACK
147 =head2 Mailing Lists
149 User feedback is an integral part of the evolution of this and other
150 Bioperl modules. Send your comments and suggestions preferably to
151 the Bioperl mailing list. Your participation is much appreciated.
153 bioperl-l@bioperl.org - General discussion
154 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
156 =head2 Support
158 Please direct usage questions or support issues to the mailing list:
160 I<bioperl-l@bioperl.org>
162 rather than to the module maintainer directly. Many experienced and
163 reponsive experts will be able look at the problem and quickly
164 address it. Please include a thorough description of the problem
165 with code and data examples if at all possible.
167 =head2 Reporting Bugs
169 Report bugs to the Bioperl bug tracking system to help us keep track
170 of the bugs and their resolution. Bug reports can be submitted via the
171 web:
173 http://redmine.open-bio.org/projects/bioperl/
175 =head1 AUTHOR
177 Martin Senger (martin.senger@gmail.com)
179 =head1 COPYRIGHT
181 Copyright (c) 2003, Martin Senger and EMBL-EBI.
182 All Rights Reserved.
184 This module is free software; you can redistribute it and/or modify
185 it under the same terms as Perl itself.
187 =head1 DISCLAIMER
189 This software is provided "as is" without warranty of any kind.
191 =head1 SEE ALSO
193 =over
195 =item *
197 http://www.ebi.ac.uk/soaplab/Perl_Client.html
199 =back
201 =head1 APPENDIX
203 Here is the rest of the object methods. Internal methods are preceded
204 with an underscore _.
206 =cut
209 # Let the code begin...
211 package Bio::Tools::Run::Analysis;
212 use vars qw(@ISA $Revision);
213 use strict;
215 use Bio::Root::Root;
216 use Bio::AnalysisI;
217 @ISA = qw(Bio::Root::Root Bio::AnalysisI);
219 BEGIN {
220 $Revision = q[$Id$];
223 # -----------------------------------------------------------------------------
225 =head2 new
227 Usage : my $tool =
228 Bio::Tools::Run::Analysis->new(-access => 'soap',
229 -name => 'edit.seqret',
232 Returns : a new Bio::Tools::Run::Analysis object representing the given tool
233 Args : There may be additional arguments which are specific
234 to the access method (see methods 'new' or '_initialize'
235 of the access-specific implementations (such as module
236 Bio::Tools::Run::Analysis::soap for a SOAP-based access).
238 The recognised and used arguments are:
239 -access
240 -location
241 -name
242 -httpproxy
243 -timeout
245 It builds, populates and returns a new C<Bio::Tools::Run::Analysis> object. This
246 is how it is seen from the outside. But in fact, it builds, populates
247 and returns a more specific lower-level object, for example
248 C<Bio::Tools::Run::Analysis::soap> object - which one it depends on the C<-access>
249 parameter.
251 =over
253 =item -access
255 It indicates what lower-level module to load. Default is 'soap'.
256 Other (but future) possibilities may be:
258 -access => 'novella'
259 -access => 'local'
261 =item -location
263 A location of the service. The contents is access-specific (see
264 details in the lower-level implementation modules).
266 Default is C<http://www.ebi.ac.uk/soaplab/services> ( services running
267 at European Bioinformatics Institute on top of most of EMBOSS
268 analyses, and on few others).
270 =item -name
272 A name of an analysis tool, or a name of its higher-level abstraction,
273 possibly including a category where the analysis belong to. There is
274 no default value (which usually means that this parameter is mandatory
275 unless your I<-location> parameter includes also the name (but it is
276 then access-dependent).
278 =item -destroy_on_exit =E<gt> '0'
280 Default value is '1' which means that all Bio::Tools::Run::Analysis::Job
281 objects - when being finalised - will send a request
282 to the remote site to forget the results of these jobs.
284 If you change it to '0' make sure that you know the job identification
285 - otherwise you will not be able to re-established connection with it
286 (later, when you use your program again). This can be done by calling
287 method C<id> on the job object (such object is returned by any of
288 these methods: C<create_job>, C<run>, C<wait_for>).
290 =item -httpproxy
292 In addition to the I<location> parameter, you may need to specify also
293 a location/URL of an HTTP proxy server (if your site requires
294 one). The expected format is C<http://server:port>. There is no
295 default value. It is also an access-specific parameter which may not
296 be used by all access methods.
298 =item -timeout
300 For long(er) running jobs the HTTP connection may be time-outed. In
301 order to avoid it (or, vice-versa, to call timeout sooner) you may
302 specify C<timeout> with the number of seconds the connection will be
303 kept alive. Zero means to keep it alive forever. The default value is
304 two minutes.
306 =back
308 =cut
310 sub new {
311 my ($caller,@args) = @_;
312 my $class = ref($caller) || $caller;
314 if ($class eq 'Bio::Tools::Run::Analysis') {
316 # this is called only the first time when somebody calls: 'new
317 # Bio::Tools::Run::Analysis (...)', and it actually loads a 'real-work-doing'
318 # module and call this new() method again (unless the loaded
319 # module has its own new() method)
321 my %param = @args;
322 @param { map { lc $_ } keys %param } = values %param; # lowercase keys
323 my $access =
324 $param {'-access'} || # use -access parameter
325 &Bio::Tools::Run::Analysis::Utils::_guess_access ( \%param ) || # or guess from other parameters
326 'soap'; # or use a default access method
327 $access = "\L$access"; # normalize capitalization to lower case
329 # remember the access method (putting it into @args means that the
330 # object - when created - will remember it)
331 push (@args, (-access => $access)) unless $param {'-access'};
333 # load module with the real implementation - as defined in $access
334 return undef unless (&Bio::Tools::Run::Analysis::Utils::_load_access_module ($access));
336 # this calls this same method new() - but now its object part
337 # (see the upper branche above) is called
338 return "Bio::Tools::Run::Analysis::$access"->new (@args);
340 } else {
342 # if $caller is an object, or if it is an underlying
343 # 'real-work-doing' class (e.g. Bio::Tools::Run::Analysis::soap) then
344 # we want to call SUPER to create and bless a new object
346 my ($self) = $class->SUPER::new (@args);
348 # now the $self is an empty object - we will populate it from
349 # the $caller - if $caller is an object (so we do cloning here)
351 if (ref ($caller)) {
352 %{ $self } = %{ $caller };
355 # and finally add values from '@args' into the newly created
356 # object (the values will overwrite the values copied above);
357 # this is done by calling '_initialize' of the 'real-work-doing'
358 # class (if there is no one there, there is always an empty one
359 # in Bio::Root::Root)
361 $self->_initialize (@args);
362 return $self;
368 # Create a hash with named inputs, all extracted
369 # from the given data.
371 sub _prepare_inputs {
372 my $self = shift;
373 my %inputs = (); # collect here input data
375 foreach my $input (@_) {
377 next unless defined $input;
379 # an element can be an array reference
380 # (with scalar elements: 'name = [[@]value]')
381 if (ref $input eq 'ARRAY') {
382 foreach my $elem (@$input) {
383 unless (ref $elem) { # taking only scalars
384 my ($name, $value) = split (/\s*=\s*/, $elem, 2);
385 next unless $name; # am I paranoid ?
386 $value = 1 unless defined $value;
387 $inputs{$name} = $value;
388 next;
393 # ...or an element can be a hash
394 # (name => [@]value)
395 elsif (ref $input eq 'HASH') {
396 foreach my $name (keys %$input) {
397 my $value = $$input{$name};
398 $inputs{$name} = $value;
402 # ...or an element can be a scalar (which means that it
403 # represents a name of a boolean parameter (an option)
404 elsif (ref \$input eq 'SCALAR') {
405 $input =~ s/^@/\\@/; # this cannot be a filename
406 $inputs{$input} = 1;
409 # everything else is ignored
410 else {
411 warn "Unrecognized input data type: $input\n";
415 # extracted inputs may be actually filenames and we want the
416 # contents of the files instead
417 # TBD: to support also filehandlers here?
418 foreach my $name (keys %inputs) {
419 $inputs{$name} = $self->_read_value ($inputs{$name});
421 return \%inputs;
424 # --- if a $value is a filename, read it and return its contents
425 # otherwise return the $value itself; if $value start with
426 # an escaped '@', change it to a normal '@'
427 sub _read_value {
428 my ($self, $value) = @_;
429 return unless defined $value;
430 if ($value =~ s/^\@//) {
431 my ($buf);
432 open (DATA, $value) || $self->throw ("Cannot read from '$value' ($!)");
433 binmode (DATA);
434 undef $value;
435 while (read (DATA, $buf, 8 * 2**10)) {
436 $value .= $buf;
438 close DATA;
439 } elsif ($value =~ s/^\\\@/@/) {
441 $value;
444 # --- save $value of result $name into file $filename + $seq;
445 # use some default filename if $filename not given
447 #$part = $self->_save_result (-value => $part,
448 # -name => $name,
449 # -filename => $filename,
450 # -template => $template,
451 # -seq => $seq++);
453 sub _save_result {
454 my ($self, %params) = @_;
455 my $name = $params{'-name'} || 'result';
457 # invent filename (if not given) from the given or default template
458 my $filename = $params{'-filename'};
459 unless ($filename) {
460 $filename = $params{'-template'};
461 $filename = "\$ANALYSIS_*_$name" unless $filename;
463 # replace $ANALYSIS and $RESULT in the filename
464 if ($filename =~ /\$\{?ANALYSIS\}?/) {
465 # (better to ask if we need it because getting
466 # the analysis name may require going to server)
467 my $analysis = $self->analysis_name;
468 $analysis =~ s/[:\/]/_/g; # would be troubles in filename
469 $filename =~ s/\$\{?ANALYSIS\}?/$analysis/ig;
471 $filename =~ s/\$\{?RESULT\}?/$name/ig;
474 # include the sequential number before file extension (if any)
475 my $seq = $params{'-seq'};
476 if ($seq) {
477 my $pos = rindex ($filename, '.');
478 if ($pos > -1) {
479 substr ($filename, $pos, 0) = ".$seq"; # insert $seq
480 } else {
481 $filename .= ".$seq"; # add $seq
485 # replace '*' in filename with a unique number
486 while ($filename =~ /\*/) {
487 my $unique_name;
488 my $number = 1;
489 while (1) {
490 ($unique_name = $filename) =~ s/\*/$number/;
491 last unless -e $unique_name;
492 $number++;
494 $filename = $unique_name;
497 # and finally write the file
498 open (DATA, ">$filename") ||
499 $self->throw ("Error by saving result '$name' into '$filename' ($!)");
500 binmode (DATA);
501 print (DATA $params{'-value'}) ||
502 $self->throw ("Error by writing result '$name' into '$filename' ($!)");
503 close DATA ||
504 $self->throw ("Error by closing result '$name' in '$filename' ($!)");
506 return $filename;
510 =head2 VERSION and Revision
512 Usage : print $Bio::Tools::Run::Analysis::VERSION;
513 print $Bio::Tools::Run::Analysis::Revision;
515 =cut
517 # -----------------------------------------------------------------------------
518 # Bio::Tools::Run::Analysis::Job
519 # A module representing an invocation (execution, job) of an analysis.
520 # -----------------------------------------------------------------------------
522 package Bio::Tools::Run::Analysis::Job;
524 =head1 Module Bio::Tools::Run::Analysis::Job
526 It represents a job, a single execution of an analysis tool. Usually
527 you do not instantiate these objects - they are returned by methods
528 C<create_job>, C<run>, and C<wait_for> of C<Bio::Tools::Run::Analysis> object.
530 However, if you wish to re-create a job you need to know its ID
531 (method C<id> gives it to you). The ID can be passed directly to the
532 C<new> method, or again you may use C<create_job> of a
533 C<Bio::Tools::Run::Analysis> object with the ID as parameter. See SYNOPSIS above
534 for an example.
536 Remember that all public methods of this module are described in
537 details in interface module C<Bio::AnalysisI> and in the tutorial in
538 the C<analysis.pl> script.
540 =cut
543 use vars qw(@ISA);
544 use strict;
546 use Bio::Root::Root;
547 @ISA = qw(Bio::Root::Root Bio::AnalysisI::JobI);
549 # -----------------------------------------------------------------------------
551 =head2 new
553 Usage : my $job = Bio::Tools::Run::Analysis::Job->new
554 (-access => 'soap',
555 -name => 'edit.seqret',
556 -id => 'xxxyyy111222333'
558 Returns : a re-created object representing a job
559 Args : The same arguments as for Bio::Tools::Run::Analysis object:
560 -access
561 -location
562 -name
563 -httpproxy
564 -timeout
565 (and perhaps others)
566 Additionally and specifically for this object:
568 -analysis
570 =over
572 =item -id
574 A job ID created some previous time and now used to re-create the same
575 job (in order to re-gain access to this job results, for example).
577 =item -analysis
579 A C<Bio::Tools::Run::Analysis> object whose properties (such as C<-access> and
580 C<-location> are used to re-create this job object.
582 =back
584 =cut
586 sub new {
587 my ($caller, @args) = @_;
588 my $class = ref($caller) || $caller;
590 if ($class eq 'Bio::Tools::Run::Analysis::Job') {
592 # this is called only the first time when somebody calls:
593 #'Bio::Tools::Run::Analysis::Job->new(...)'
595 my %param = @args;
596 @param { map { lc $_ } keys %param } = values %param; # lowercase keys
597 if ($param {'-analysis'}) {
599 # usually a new Job object is created from an existing
600 # Analysis object - which means that the Analysis already
601 # loaded a 'real-work-doing' Job object, so we need just
602 # to create a Job object (by calling its new() method,
603 # which calls actually this new() method again - but its
604 # 'object' part - see below
606 my $analysis = $param {'-analysis'};
607 return undef unless $analysis->{'_access'}; # TBD: error message here?
608 my $access = $analysis->{'_access'};
609 return "Bio::Tools::Run::Analysis::Job::$access"->new (@args);
611 } else {
613 # if a new Job object is created directly (by a user, not
614 # by a parent Analysis object) we need to create the
615 # Analysis object first (because it is the Analysis object
616 # who knows how to contact the underlying analysis tool),
617 # and only then let the Analysis create this Job object
618 # (which may be an empty Job - if there is no 'id' in @args)
620 return new Bio::Tools::Run::Analysis (@args)->create_job ($param {'-id'});
623 } else {
625 # if $caller is an object, or if it is an underlying
626 # 'real-work-doing' class (e.g. Bio::Tools::Run::Analysis::Job::soap) then
627 # we want to call SUPER to create and bless a new object
629 my ($self) = $class->SUPER::new (@args);
631 # now the $self is an empty object - we will populate it from
632 # the $caller - if $caller is an object (so we do cloning here)
634 if (ref ($caller)) {
635 %{ $self } = %{ $caller };
638 # and finally add values from '@args' into the newly created
639 # object (the values will overwrite the values copied above);
640 # this is done by calling '_initialize' of the 'real-work-doing'
641 # class (if there is no one there, there is always an empty one
642 # in Bio::Root::Root)
644 $self->_initialize (@args);
645 return $self;
650 sub id { shift->{'_id'}; }
652 # ---------------------------------------------------------------------
654 # A Utility module...
656 # ---------------------------------------------------------------------
658 package Bio::Tools::Run::Analysis::Utils;
660 =head1 Module Bio::Tools::Run::Analysis::Utils
662 It contains several general utilities. These are C<functions>, not
663 methods. Therefore call them like, for example:
665 &Bio::Tools::Run::Analysis::Utils::format_time (...);
667 =cut
669 # -----------------------------------------------------------------------------
671 =head2 format_time
673 Usage : Bio::Tools::Run::Analysis::Utils::format_time ($time);
674 Returns : Slightly formatted $time
675 Args : $time is number of seconds from the beginning of Epoch
677 It returns what C<localtime> returns which means that return value is
678 different in the array and scalar context (see localtime). If C<$time>
679 is ``-1'' it returns 'n/a' (in the scalar context) or an empty array
680 (in the array context). If C<$time> is too small to represent the
681 distance from the beginning of the Epoch, it returns it unchanged (the
682 same in any contex) - this is reasonable for C<$time> representing an
683 elapsed time.
685 The function is used to format times coming back from various job time
686 methods.
688 =cut
690 sub format_time {
691 my $time = shift;
692 return wantarray ? () : 'n/a' if "$time" eq '-1';
693 return $time if $time < 1000000000;
694 return localtime $time;
697 # -----------------------------------------------------------------------------
699 # It processes given result names which may be of various different
700 # types and returns a hash reference with result names as keys and
701 # values being result destinations (such as file names, or templates
702 # how to create filenames.
704 # Or, it returns a scalar ('@[template]' or '?[template]') if there
705 # were no real result names but only a global rule how to create
706 # result destinantions for all results.
708 # Or, it returns 'undef' if there were no result names at all.
710 sub normalize_names {
711 return undef unless @_;
712 my %names = ();
713 foreach (@_) {
714 if (ref $_ eq 'HASH') {
715 %names = (%names, %$_);
716 } elsif (not ref $_) {
717 my ($name, $dest) = split (/\s*=\s*/, $_, 2);
718 return $name if $name =~ /^@/; # special: it nullifies other rules
719 return $name if $name =~ /^\?/; # ditto
720 $names{$name} = $dest; # $dest may be undef
723 \%names;
726 # -----------------------------------------------------------------------------
728 =head2 _load_access_module
730 Usage : $class->_load_access_module ($access)
731 Returns : 1 on success, undef on failure
732 Args : 'access' should contain the last part of the
733 name of a module who does the real implementation
735 It does (in the run-time) a similar thing as
737 require Bio::Tools::Run::Analysis::$access
739 It prints an error on STDERR if it fails to find and load the module
740 (for example, because of the compilation errors in the module).
742 =cut
744 sub _load_access_module {
745 my ($access) = @_;
747 my $load = "Bio/Tools/Run/Analysis/$access.pm";
748 eval {
749 require $load;
752 if ( $@ ) {
753 Bio::Root::Root->throw (<<END);
754 $load: $access cannot be found or loaded
755 Exception $@
756 For more information about the Analysis system please see the Bio::Tools::Run::Analysis docs.
759 return;
761 return 1;
764 # -----------------------------------------------------------------------------
766 =head2 _guess_access
768 Usage : Bio::Tools::Run::Analysis::Utils::guess_access ($rh_params)
769 Returns : string with a guessed access protocol (e.g. 'soap'),
770 or undef if the guessing failed
771 Args : 'rh_params' is a hash reference containing parameters given
772 to the 'new' method.
774 It makes an expert guess what kind of access/transport protocol should
775 be used to access the underlying analysis. The guess is based on the
776 parameters in I<rh_params>. Rememeber that this method is called only
777 if there was no I<-access> parameter which could tell directly what
778 access method to use.
780 =cut
782 sub _guess_access {
783 my ($rh_params) = @_;
784 return undef;
790 __END__