Move HMMER related modules, tests, and programs to new distribution.
[bioperl-live.git] / Bio / SearchIO.pm
blobbf94084ac5309d33c4b1df8640ec0c36dcbe5d2d
2 # BioPerl module for Bio::SearchIO
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO - Driver for parsing Sequence Database Searches
17 (BLAST, FASTA, ...)
19 =head1 SYNOPSIS
21 use Bio::SearchIO;
22 # format can be 'fasta', 'blast', 'exonerate', ...
23 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
24 -file => 'blastout.xml' );
25 while ( my $result = $searchio->next_result() ) {
26 while( my $hit = $result->next_hit ) {
27 # process the Bio::Search::Hit::HitI object
28 while( my $hsp = $hit->next_hsp ) {
29 # process the Bio::Search::HSP::HSPI object
35 =head1 DESCRIPTION
37 This is a driver for instantiating a parser for report files from
38 sequence database searches. This object serves as a wrapper for the
39 format parsers in Bio::SearchIO::* - you should not need to ever
40 use those format parsers directly. (For people used to the SeqIO
41 system it, we are deliberately using the same pattern).
43 Once you get a SearchIO object, calling next_result() gives you back
44 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
45 represents one Blast/Fasta/HMMER whatever report.
47 A list of module names and formats is below:
49 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
50 fasta FASTA -m9 and -m0
51 blasttable BLAST -m9 or -m8 output (both NCBI and WUBLAST tabular)
52 megablast MEGABLAST
53 psl UCSC PSL format
54 waba WABA output
55 axt AXT format
56 sim4 Sim4
57 hmmer HMMER2 hmmpfam and hmmsearch or HMMER3 hmmscan and hmmsearch
58 exonerate Exonerate CIGAR and VULGAR format
59 blastxml NCBI BLAST XML
60 wise Genewise -genesf format
62 Also see the SearchIO HOWTO:
63 http://bioperl.org/howtos/SearchIO_HOWTO.html
65 =head1 FEEDBACK
67 =head2 Mailing Lists
69 User feedback is an integral part of the evolution of this and other
70 Bioperl modules. Send your comments and suggestions preferably to
71 the Bioperl mailing list. Your participation is much appreciated.
73 bioperl-l@bioperl.org - General discussion
74 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 =head2 Support
78 Please direct usage questions or support issues to the mailing list:
80 I<bioperl-l@bioperl.org>
82 rather than to the module maintainer directly. Many experienced and
83 reponsive experts will be able look at the problem and quickly
84 address it. Please include a thorough description of the problem
85 with code and data examples if at all possible.
87 =head2 Reporting Bugs
89 Report bugs to the Bioperl bug tracking system to help us keep track
90 of the bugs and their resolution. Bug reports can be submitted via the
91 web:
93 https://github.com/bioperl/bioperl-live/issues
95 =head1 AUTHOR - Jason Stajich & Steve Chervitz
97 Email jason-at-bioperl.org
98 Email sac-at-bioperl.org
100 =head1 APPENDIX
102 The rest of the documentation details each of the object methods.
103 Internal methods are usually preceded with a _
105 =cut
108 # Let the code begin...
111 package Bio::SearchIO;
112 use strict;
113 use warnings;
115 # Object preamble - inherits from Bio::Root::IO
117 use Bio::SearchIO::SearchResultEventBuilder;
119 # Special exception class for exceptions during parsing.
120 # End users should not ever see these.
121 # For an example of usage, see blast.pm.
122 @Bio::SearchIO::InternalParserError::ISA = qw(Bio::Root::Exception);
124 use Symbol;
126 use base qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
128 =head2 new
130 Title : new
131 Usage : my $obj = Bio::SearchIO->new();
132 Function: Builds a new Bio::SearchIO object
133 Returns : Bio::SearchIO initialized with the correct format
134 Args : -file => $filename
135 -format => format
136 -fh => filehandle to attach to
137 -result_factory => object implementing Bio::Factory::ObjectFactoryI
138 -hit_factory => object implementing Bio::Factory::ObjectFactoryI
139 -hsp_factory => object implementing Bio::Factory::ObjectFactoryI
140 -writer => object implementing Bio::SearchIO::SearchWriterI
141 -output_format => output format, which will dynamically load writer
142 -inclusion_threshold => e-value threshold for inclusion in the
143 PSI-BLAST score matrix model
144 -signif => float or scientific notation number to be used
145 as a P- or Expect value cutoff
146 -check_all_hits => boolean. Check all hits for significance against
147 significance criteria. Default = false.
148 If false, stops processing hits after the first
149 non-significant hit or the first hit that fails
150 the hit_filter call. This speeds parsing,
151 taking advantage of the fact that the hits are
152 processed in the order they appear in the report.
153 -min_query_len => integer to be used as a minimum for query sequence
154 length. Reports with query sequences below this
155 length will not be processed.
156 default = no minimum length.
157 -best => boolean. Only process the best hit of each report;
158 default = false.
160 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
162 Any factory objects in the arguments are passed along to the
163 SearchResultEventBuilder object which holds these factories and sets
164 default ones if none are supplied as arguments.
166 =cut
168 # TODO: The below don't seem to be implemented (e.g. in Bio::SearchIO::blast)
170 # -score => integer or scientific notation number to be used
171 # as a blast score value cutoff
172 # -bits => integer or scientific notation number to be used
173 # as a bit score value cutoff
174 # -overlap => integer. The amount of overlap to permit between
175 # adjacent HSPs when tiling HSPs. A reasonable value is 2.
176 # default = $Bio::SearchIO::blast::MAX_HSP_OVERLAP.
178 sub new {
179 my($caller,@args) = @_;
180 my $class = ref($caller) || $caller;
182 # or do we want to call SUPER on an object if $caller is an
183 # object?
184 if( $class =~ /Bio::SearchIO::(\S+)/ ) {
185 my ($self) = $class->SUPER::new(@args);
186 $self->_initialize(@args);
187 return $self;
188 } else {
189 my %param = @args;
190 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
191 my $format = $param{'-format'} ||
192 $class->_guess_format( $param{'-file'} || $ARGV[0] ) || 'blast';
194 my $output_format = $param{'-output_format'};
195 my $writer = undef;
197 if( defined $output_format ) {
198 if( defined $param{'-writer'} ) {
199 my $dummy = Bio::Root::Root->new();
200 $dummy->throw("Both writer and output format specified - not good");
203 if( $output_format =~ /^blast$/i ) {
204 $output_format = 'TextResultWriter';
206 my $output_module = "Bio::SearchIO::Writer::".$output_format;
207 $class->_load_module($output_module);
208 $writer = $output_module->new(@args);
209 push(@args,"-writer",$writer);
213 # normalize capitalization to lower case
214 $format = "\L$format";
216 return unless( $class->_load_format_module($format) );
217 return "Bio::SearchIO::${format}"->new(@args);
221 sub _initialize {
222 my($self, @args) = @_;
223 $self->{'_handler'} = undef;
224 # not really necessary unless we put more in RootI
225 #$self->SUPER::_initialize(@args);
227 # initialize the IO part
228 $self->_initialize_io(@args);
229 $self->attach_EventHandler(Bio::SearchIO::SearchResultEventBuilder->new(@args));
230 $self->{'_reporttype'} = '';
231 $self->{_notfirsttime} = 0;
232 my ($min_qlen, $check_all, $overlap, $best, $it, $writer ) =
233 $self->_rearrange([qw(
234 MIN_LENGTH
235 CHECK_ALL_HITS
236 OVERLAP
237 BEST
238 INCLUSION_THRESHOLD
239 WRITER)], @args); # note: $overlap isn't used for some reason
241 $writer && $self->writer( $writer );
242 defined $it && $self->inclusion_threshold($it);
243 defined $min_qlen && $self->min_query_length($min_qlen);
244 defined $best && $self->best_hit_only($best);
245 defined $check_all && $self->check_all_hits($check_all);
248 =head2 newFh
250 Title : newFh
251 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
252 -format=>'Format')
253 Function: does a new() followed by an fh()
254 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
255 -format=>'Format')
256 $result = <$fh>; # read a ResultI object
257 print $fh $result; # write a ResultI object
258 Returns : filehandle tied to the Bio::SearchIO::Fh class
259 Args :
261 =cut
263 sub newFh {
264 my $class = shift;
265 return unless my $self = $class->new(@_);
266 return $self->fh;
269 =head2 fh
271 Title : fh
272 Usage : $obj->fh
273 Function:
274 Example : $fh = $obj->fh; # make a tied filehandle
275 $result = <$fh>; # read a ResultI object
276 print $fh $result; # write a ResultI object
277 Returns : filehandle tied to the Bio::SearchIO::Fh class
278 Args :
280 =cut
283 sub fh {
284 my $self = shift;
285 my $class = ref($self) || $self;
286 my $s = Symbol::gensym;
287 tie $$s,$class,$self;
288 return $s;
292 =head2 format
294 Title : format
295 Usage : $format = $obj->format()
296 Function: Get the search format
297 Returns : search format
298 Args : none
300 =cut
302 # format() method inherited from Bio::Root::IO
305 =head2 attach_EventHandler
307 Title : attach_EventHandler
308 Usage : $parser->attatch_EventHandler($handler)
309 Function: Adds an event handler to listen for events
310 Returns : none
311 Args : Bio::SearchIO::EventHandlerI
313 See L<Bio::SearchIO::EventHandlerI>
315 =cut
317 sub attach_EventHandler{
318 my ($self,$handler) = @_;
319 return if( ! $handler );
320 if( ! $handler->isa('Bio::SearchIO::EventHandlerI') ) {
321 $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
323 $self->{'_handler'} = $handler;
324 return;
327 =head2 _eventHandler
329 Title : _eventHandler
330 Usage : private
331 Function: Get the EventHandler
332 Returns : Bio::SearchIO::EventHandlerI
333 Args : none
335 See L<Bio::SearchIO::EventHandlerI>
337 =cut
339 sub _eventHandler{
340 my ($self) = @_;
341 return $self->{'_handler'};
344 =head2 next_result
346 Title : next_result
347 Usage : $result = stream->next_result
348 Function: Reads the next ResultI object from the stream and returns it.
350 Certain driver modules may encounter entries in the stream that
351 are either misformatted or that use syntax not yet understood
352 by the driver. If such an incident is recoverable, e.g., by
353 dismissing a feature of a feature table or some other non-mandatory
354 part of an entry, the driver will issue a warning. In the case
355 of a non-recoverable situation an exception will be thrown.
356 Do not assume that you can resume parsing the same stream after
357 catching the exception. Note that you can always turn recoverable
358 errors into exceptions by calling $stream->verbose(2) (see
359 Bio::Root::RootI POD page).
360 Returns : A Bio::Search::Result::ResultI object
361 Args : n/a
363 See L<Bio::Root::RootI>
365 =cut
367 sub next_result {
368 my ($self) = @_;
369 $self->throw_not_implemented;
372 =head2 write_result
374 Title : write_result
375 Usage : $stream->write_result($result_result, @other_args)
376 Function: Writes data from the $result_result object into the stream.
377 : Delegates to the to_string() method of the associated
378 : WriterI object.
379 Returns : 1 for success and 0 for error
380 Args : Bio::Search:Result::ResultI object,
381 : plus any other arguments for the Writer
382 Throws : Bio::Root::Exception if a Writer has not been set.
384 See L<Bio::Root::Exception>
386 =cut
388 sub write_result {
389 my ($self, $result, @args) = @_;
391 if( not ref($self->{'_result_writer'}) ) {
392 $self->throw("ResultWriter not defined.");
394 @args = $self->{'_notfirsttime'} unless( @args );
396 my $str = $self->writer->to_string( $result, @args);
397 $self->{'_notfirsttime'} = 1;
398 $self->_print( "$str" ) if defined $str;
400 $self->flush if $self->_flush_on_write && defined $self->_fh;
401 return 1;
404 =head2 write_report
406 Title : write_report
407 Usage : $stream->write_report(SearchIO stream, @other_args)
408 Function: Writes data directly from the SearchIO stream object into the
409 : writer. This is mainly useful if one has multiple ResultI objects
410 : in a SearchIO stream and you don't want to reiterate header/footer
411 : between each call.
412 Returns : 1 for success and 0 for error
413 Args : Bio::SearchIO stream object,
414 : plus any other arguments for the Writer
415 Throws : Bio::Root::Exception if a Writer has not been set.
417 See L<Bio::Root::Exception>
419 =cut
421 sub write_report {
422 my ($self, $result, @args) = @_;
424 if( not ref($self->{'_result_writer'}) ) {
425 $self->throw("ResultWriter not defined.");
427 @args = $self->{'_notfirsttime'} unless( @args );
429 my $str = $self->writer->to_string( $result, @args);
430 $self->{'_notfirsttime'} = 1;
431 $self->_print( "$str" ) if defined $str;
433 $self->flush if $self->_flush_on_write && defined $self->_fh;
434 return 1;
437 =head2 writer
439 Title : writer
440 Usage : $writer = $stream->writer;
441 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
442 Returns : 1 for success and 0 for error
443 Args : Bio::SearchIO::SearchWriterI object (when setting)
444 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
445 is passed in.
447 =cut
449 sub writer {
450 my ($self, $writer) = @_;
451 if( ref($writer) and $writer->isa( 'Bio::SearchIO::SearchWriterI' )) {
452 $self->{'_result_writer'} = $writer;
454 elsif( defined $writer ) {
455 $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
457 return $self->{'_result_writer'};
460 =head2 result_count
462 Title : result_count
463 Usage : $num = $stream->result_count;
464 Function: Gets the number of Blast results that have been successfully parsed
465 at the point of the method call. This is not the total # of results
466 in the file.
467 Returns : integer
468 Args : none
469 Throws : none
471 =cut
473 sub result_count {
474 my $self = shift;
475 $self->throw_not_implemented;
478 =head2 inclusion_threshold
480 Title : inclusion_threshold
481 Usage : my $incl_thresh = $isreb->inclusion_threshold;
482 : $isreb->inclusion_threshold(1e-5);
483 Function: Get/Set the e-value threshold for inclusion in the PSI-BLAST
484 score matrix model (blastpgp) that was used for generating the reports
485 being parsed.
486 Returns : number (real)
487 Default value: $Bio::SearchIO::IteratedSearchResultEventBuilder::DEFAULT_INCLUSION_THRESHOLD
488 Args : number (real) (e.g., 0.0001 or 1e-4 )
490 =cut
492 # Delegates to the event handler.
493 sub inclusion_threshold {
494 shift->_eventHandler->inclusion_threshold(@_);
497 =head2 max_significance
499 Usage : $obj->max_significance();
500 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
501 This is the value of the -signif parameter supplied to new().
502 Hits with P or E-value above this are skipped.
503 Returns : Scientific notation number with this format: 1.0e-05.
504 Argument : Scientific notation number or float (when setting)
505 Comments : Screening of significant hits uses the data provided on the
506 : description line. For NCBI BLAST1 and WU-BLAST, this data
507 : is P-value. for NCBI BLAST2 it is an Expect value.
509 =cut
511 sub max_significance { shift->{'_handler_cache'}->max_significance(@_) }
513 =head2 signif
515 Synonym for L<max_significance()|max_significance>
517 =cut
519 sub signif { shift->max_significance(@_) }
521 =head2 min_score
523 Usage : $obj->min_score();
524 Purpose : Set/Get the Blast score used as screening cutoff.
525 This is the value of the -score parameter supplied to new().
526 Hits with scores below this are skipped.
527 Returns : Integer or scientific notation number.
528 Argument : Integer or scientific notation number (when setting)
529 Comments : Screening of significant hits uses the data provided on the
530 : description line.
532 =cut
534 sub min_score { shift->{'_handler_cache'}->min_score(@_) }
536 =head2 min_query_length
538 Usage : $obj->min_query_length();
539 Purpose : Gets the query sequence length used as screening criteria.
540 This is the value of the -min_query_len parameter supplied to new().
541 Hits with sequence length below this are skipped.
542 Returns : Integer
543 Argument : n/a
545 =cut
547 sub min_query_length {
548 my $self = shift;
549 if (@_) {
550 my $min_qlen = shift;
551 if ( $min_qlen =~ /\D/ or $min_qlen <= 0 ) {
552 $self->throw(
553 -class => 'Bio::Root::BadParameter',
554 -text => "Invalid minimum query length value: $min_qlen\n"
555 . "Value must be an integer > 0. Value not set.",
556 -value => $min_qlen
559 $self->{'_confirm_qlength'} = 1;
560 $self->{'_min_query_length'} = $min_qlen;
563 return $self->{'_min_query_length'};
566 =head2 best_hit_only
568 Title : best_hit_only
569 Usage : print "only getting best hit.\n" if $obj->best_hit_only;
570 Purpose : Set/Get the indicator for whether or not to process only
571 : the best BlastHit.
572 Returns : Boolean (1 | 0)
573 Argument : Boolean (1 | 0) (when setting)
575 =cut
577 sub best_hit_only {
578 my $self = shift;
579 if (@_) { $self->{'_best'} = shift; }
580 $self->{'_best'};
583 =head2 check_all_hits
585 Title : check_all_hits
586 Usage : print "checking all hits.\n" if $obj->check_all_hits;
587 Purpose : Set/Get the indicator for whether or not to process all hits.
588 : If false, the parser will stop processing hits after the
589 : the first non-significance hit or the first hit that fails
590 : any hit filter.
591 Returns : Boolean (1 | 0)
592 Argument : Boolean (1 | 0) (when setting)
594 =cut
596 sub check_all_hits {
597 my $self = shift;
598 if (@_) { $self->{'_check_all'} = shift; }
599 $self->{'_check_all'};
602 =head2 _load_format_module
604 Title : _load_format_module
605 Usage : *INTERNAL SearchIO stuff*
606 Function: Loads up (like use) a module at run time on demand
607 Example :
608 Returns :
609 Args :
611 =cut
613 sub _load_format_module {
614 my ($self,$format) = @_;
615 my $module = "Bio::SearchIO::" . $format;
616 my $ok;
618 eval {
619 $ok = $self->_load_module($module);
621 if ( $@ ) {
622 print STDERR <<END;
623 $self: $format cannot be found
624 Exception $@
625 For more information about the SearchIO system please see the SearchIO docs.
626 This includes ways of checking for formats at compile time, not run time
630 return $ok;
633 =head2 _get_seq_identifiers
635 Title : _get_seq_identifiers
636 Usage : my ($gi, $acc,$ver) = &_get_seq_identifiers($id)
637 Function: Private function to get the gi, accession, version data
638 for an ID (if it is in NCBI format)
639 Returns : 3-pule of gi, accession, version
640 Args : ID string to process (NCBI format)
643 =cut
645 sub _get_seq_identifiers {
646 my ($self, $id) = @_;
648 return unless defined $id;
649 my ($gi, $acc, $version );
650 if ( $id =~ /^gi\|(\d+)\|/ ) {
651 $gi = $1;
653 if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
654 ( $acc, $version ) = split /\./, $2;
656 elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
657 ( $acc, $version ) = split /\./, $3;
659 else {
661 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
662 #Database Name Identifier Syntax
663 #============================ ========================
664 #GenBank gb|accession|locus
665 #EMBL Data Library emb|accession|locus
666 #DDBJ, DNA Database of Japan dbj|accession|locus
667 #NBRF PIR pir||entry
668 #Protein Research Foundation prf||name
669 #SWISS-PROT sp|accession|entry name
670 #Brookhaven Protein Data Bank pdb|entry|chain
671 #Patents pat|country|number
672 #GenInfo Backbone Id bbs|number
673 #General database identifier gnl|database|identifier
674 #NCBI Reference Sequence ref|accession|locus
675 #Local Sequence identifier lcl|identifier
676 $acc = $id;
678 return ($gi, $acc, $version );
681 =head2 _guess_format
683 Title : _guess_format
684 Usage : $obj->_guess_format($filename)
685 Function:
686 Example :
687 Returns : guessed format of filename (lower case)
688 Args :
690 =cut
692 sub _guess_format {
693 my $class = shift;
694 return unless $_ = shift;
695 return 'blast' if (/\.(blast|t?bl\w)$/i );
696 return 'fasta' if (/\.
697 (?: t? fas (?:ta)? |
698 m\d+ |
699 (?: t? (?: fa | fx | fy | ff | fs ) ) |
700 (?: (?:ss | os | ps) (?:earch)? ))
701 $/ix );
702 return 'blastxml' if ( /\.(blast)?xml$/i);
703 return 'exonerate' if ( /\.exon(erate)?/i );
706 sub close {
707 my $self = shift;
709 if( $self->writer ) {
710 $self->_print($self->writer->end_report());
711 $self->{'_result_writer'}= undef;
713 $self->SUPER::close(@_);
716 sub DESTROY {
717 my $self = shift;
718 $self->close() if defined $self->_fh;
719 $self->SUPER::DESTROY;
722 sub TIEHANDLE {
723 my $class = shift;
724 return bless {processor => shift}, $class;
727 sub READLINE {
728 my $self = shift;
729 return $self->{'processor'}->next_result() || undef unless wantarray;
730 my (@list, $obj);
731 push @list, $obj while $obj = $self->{'processor'}->next_result();
732 return @list;
735 sub PRINT {
736 my $self = shift;
737 $self->{'processor'}->write_result(@_);
743 __END__