Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SearchIO.pm
blob2f6aaeccfc13736f889b41d457cb802cdd229dec
2 # BioPerl module for Bio::SearchIO
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO - Driver for parsing Sequence Database Searches
17 (BLAST, FASTA, ...)
19 =head1 SYNOPSIS
21 use Bio::SearchIO;
22 # format can be 'fasta', 'blast', 'exonerate', ...
23 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
24 -file => 'blastout.xml' );
25 while ( my $result = $searchio->next_result() ) {
26 while( my $hit = $result->next_hit ) {
27 # process the Bio::Search::Hit::HitI object
28 while( my $hsp = $hit->next_hsp ) {
29 # process the Bio::Search::HSP::HSPI object
35 =head1 DESCRIPTION
37 This is a driver for instantiating a parser for report files from
38 sequence database searches. This object serves as a wrapper for the
39 format parsers in Bio::SearchIO::* - you should not need to ever
40 use those format parsers directly. (For people used to the SeqIO
41 system it, we are deliberately using the same pattern).
43 Once you get a SearchIO object, calling next_result() gives you back
44 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
45 represents one Blast/Fasta/HMMER whatever report.
47 A list of module names and formats is below:
49 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
50 fasta FASTA -m9 and -m0
51 blasttable BLAST -m9 or -m8 output (both NCBI and WUBLAST tabular)
52 megablast MEGABLAST
53 psl UCSC PSL format
54 waba WABA output
55 axt AXT format
56 sim4 Sim4
57 hmmer HMMER2 hmmpfam and hmmsearch or HMMER3 hmmscan and hmmsearch
58 exonerate Exonerate CIGAR and VULGAR format
59 blastxml NCBI BLAST XML
60 wise Genewise -genesf format
62 Also see the SearchIO HOWTO:
63 http://bioperl.org/howtos/SearchIO_HOWTO.html
65 =head1 FEEDBACK
67 =head2 Mailing Lists
69 User feedback is an integral part of the evolution of this and other
70 Bioperl modules. Send your comments and suggestions preferably to
71 the Bioperl mailing list. Your participation is much appreciated.
73 bioperl-l@bioperl.org - General discussion
74 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 =head2 Support
78 Please direct usage questions or support issues to the mailing list:
80 I<bioperl-l@bioperl.org>
82 rather than to the module maintainer directly. Many experienced and
83 reponsive experts will be able look at the problem and quickly
84 address it. Please include a thorough description of the problem
85 with code and data examples if at all possible.
87 =head2 Reporting Bugs
89 Report bugs to the Bioperl bug tracking system to help us keep track
90 of the bugs and their resolution. Bug reports can be submitted via the
91 web:
93 https://github.com/bioperl/bioperl-live/issues
95 =head1 AUTHOR - Jason Stajich & Steve Chervitz
97 Email jason-at-bioperl.org
98 Email sac-at-bioperl.org
100 =head1 APPENDIX
102 The rest of the documentation details each of the object methods.
103 Internal methods are usually preceded with a _
105 =cut
108 # Let the code begin...
111 package Bio::SearchIO;
113 use strict;
114 use warnings;
116 # Object preamble - inherits from Bio::Root::IO
118 use Bio::SearchIO::SearchResultEventBuilder;
120 # Special exception class for exceptions during parsing.
121 # End users should not ever see these.
122 # For an example of usage, see blast.pm.
123 @Bio::SearchIO::InternalParserError::ISA = qw(Bio::Root::Exception);
125 use Symbol;
127 use base qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
129 =head2 new
131 Title : new
132 Usage : my $obj = Bio::SearchIO->new();
133 Function: Builds a new Bio::SearchIO object
134 Returns : Bio::SearchIO initialized with the correct format
135 Args : -file => $filename
136 -format => format
137 -fh => filehandle to attach to
138 -result_factory => object implementing Bio::Factory::ObjectFactoryI
139 -hit_factory => object implementing Bio::Factory::ObjectFactoryI
140 -hsp_factory => object implementing Bio::Factory::ObjectFactoryI
141 -writer => object implementing Bio::SearchIO::SearchWriterI
142 -output_format => output format, which will dynamically load writer
143 -inclusion_threshold => e-value threshold for inclusion in the
144 PSI-BLAST score matrix model
145 -signif => float or scientific notation number to be used
146 as a P- or Expect value cutoff
147 -check_all_hits => boolean. Check all hits for significance against
148 significance criteria. Default = false.
149 If false, stops processing hits after the first
150 non-significant hit or the first hit that fails
151 the hit_filter call. This speeds parsing,
152 taking advantage of the fact that the hits are
153 processed in the order they appear in the report.
154 -min_query_len => integer to be used as a minimum for query sequence
155 length. Reports with query sequences below this
156 length will not be processed.
157 default = no minimum length.
158 -best => boolean. Only process the best hit of each report;
159 default = false.
161 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
163 Any factory objects in the arguments are passed along to the
164 SearchResultEventBuilder object which holds these factories and sets
165 default ones if none are supplied as arguments.
167 =cut
169 # TODO: The below don't seem to be implemented (e.g. in Bio::SearchIO::blast)
171 # -score => integer or scientific notation number to be used
172 # as a blast score value cutoff
173 # -bits => integer or scientific notation number to be used
174 # as a bit score value cutoff
175 # -overlap => integer. The amount of overlap to permit between
176 # adjacent HSPs when tiling HSPs. A reasonable value is 2.
177 # default = $Bio::SearchIO::blast::MAX_HSP_OVERLAP.
179 sub new {
180 my($caller,@args) = @_;
181 my $class = ref($caller) || $caller;
183 # or do we want to call SUPER on an object if $caller is an
184 # object?
185 if( $class =~ /Bio::SearchIO::(\S+)/ ) {
186 my ($self) = $class->SUPER::new(@args);
187 $self->_initialize(@args);
188 return $self;
189 } else {
190 my %param = @args;
191 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
192 my $format = $param{'-format'} ||
193 $class->_guess_format( $param{'-file'} || $ARGV[0] ) || 'blast';
195 my $output_format = $param{'-output_format'};
196 my $writer = undef;
198 if( defined $output_format ) {
199 if( defined $param{'-writer'} ) {
200 my $dummy = Bio::Root::Root->new();
201 $dummy->throw("Both writer and output format specified - not good");
204 if( $output_format =~ /^blast$/i ) {
205 $output_format = 'TextResultWriter';
207 my $output_module = "Bio::SearchIO::Writer::".$output_format;
208 $class->_load_module($output_module);
209 $writer = $output_module->new(@args);
210 push(@args,"-writer",$writer);
214 # normalize capitalization to lower case
215 $format = "\L$format";
217 return unless( $class->_load_format_module($format) );
218 return "Bio::SearchIO::${format}"->new(@args);
222 sub _initialize {
223 my($self, @args) = @_;
224 $self->{'_handler'} = undef;
225 # not really necessary unless we put more in RootI
226 #$self->SUPER::_initialize(@args);
228 # initialize the IO part
229 $self->_initialize_io(@args);
230 $self->attach_EventHandler(Bio::SearchIO::SearchResultEventBuilder->new(@args));
231 $self->{'_reporttype'} = '';
232 $self->{_notfirsttime} = 0;
233 my ($min_qlen, $check_all, $overlap, $best, $it, $writer ) =
234 $self->_rearrange([qw(
235 MIN_LENGTH
236 CHECK_ALL_HITS
237 OVERLAP
238 BEST
239 INCLUSION_THRESHOLD
240 WRITER)], @args); # note: $overlap isn't used for some reason
242 $writer && $self->writer( $writer );
243 defined $it && $self->inclusion_threshold($it);
244 defined $min_qlen && $self->min_query_length($min_qlen);
245 defined $best && $self->best_hit_only($best);
246 defined $check_all && $self->check_all_hits($check_all);
249 =head2 newFh
251 Title : newFh
252 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
253 -format=>'Format')
254 Function: does a new() followed by an fh()
255 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
256 -format=>'Format')
257 $result = <$fh>; # read a ResultI object
258 print $fh $result; # write a ResultI object
259 Returns : filehandle tied to the Bio::SearchIO::Fh class
260 Args :
262 =cut
264 sub newFh {
265 my $class = shift;
266 return unless my $self = $class->new(@_);
267 return $self->fh;
270 =head2 fh
272 Title : fh
273 Usage : $obj->fh
274 Function:
275 Example : $fh = $obj->fh; # make a tied filehandle
276 $result = <$fh>; # read a ResultI object
277 print $fh $result; # write a ResultI object
278 Returns : filehandle tied to the Bio::SearchIO::Fh class
279 Args :
281 =cut
284 sub fh {
285 my $self = shift;
286 my $class = ref($self) || $self;
287 my $s = Symbol::gensym;
288 tie $$s,$class,$self;
289 return $s;
293 =head2 format
295 Title : format
296 Usage : $format = $obj->format()
297 Function: Get the search format
298 Returns : search format
299 Args : none
301 =cut
303 # format() method inherited from Bio::Root::IO
306 =head2 attach_EventHandler
308 Title : attach_EventHandler
309 Usage : $parser->attatch_EventHandler($handler)
310 Function: Adds an event handler to listen for events
311 Returns : none
312 Args : Bio::SearchIO::EventHandlerI
314 See L<Bio::SearchIO::EventHandlerI>
316 =cut
318 sub attach_EventHandler{
319 my ($self,$handler) = @_;
320 return if( ! $handler );
321 if( ! $handler->isa('Bio::SearchIO::EventHandlerI') ) {
322 $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
324 $self->{'_handler'} = $handler;
325 return;
328 =head2 _eventHandler
330 Title : _eventHandler
331 Usage : private
332 Function: Get the EventHandler
333 Returns : Bio::SearchIO::EventHandlerI
334 Args : none
336 See L<Bio::SearchIO::EventHandlerI>
338 =cut
340 sub _eventHandler{
341 my ($self) = @_;
342 return $self->{'_handler'};
345 =head2 next_result
347 Title : next_result
348 Usage : $result = stream->next_result
349 Function: Reads the next ResultI object from the stream and returns it.
351 Certain driver modules may encounter entries in the stream that
352 are either misformatted or that use syntax not yet understood
353 by the driver. If such an incident is recoverable, e.g., by
354 dismissing a feature of a feature table or some other non-mandatory
355 part of an entry, the driver will issue a warning. In the case
356 of a non-recoverable situation an exception will be thrown.
357 Do not assume that you can resume parsing the same stream after
358 catching the exception. Note that you can always turn recoverable
359 errors into exceptions by calling $stream->verbose(2) (see
360 Bio::Root::RootI POD page).
361 Returns : A Bio::Search::Result::ResultI object
362 Args : n/a
364 See L<Bio::Root::RootI>
366 =cut
368 sub next_result {
369 my ($self) = @_;
370 $self->throw_not_implemented;
373 =head2 write_result
375 Title : write_result
376 Usage : $stream->write_result($result_result, @other_args)
377 Function: Writes data from the $result_result object into the stream.
378 : Delegates to the to_string() method of the associated
379 : WriterI object.
380 Returns : 1 for success and 0 for error
381 Args : Bio::Search:Result::ResultI object,
382 : plus any other arguments for the Writer
383 Throws : Bio::Root::Exception if a Writer has not been set.
385 See L<Bio::Root::Exception>
387 =cut
389 sub write_result {
390 my ($self, $result, @args) = @_;
392 if( not ref($self->{'_result_writer'}) ) {
393 $self->throw("ResultWriter not defined.");
395 @args = $self->{'_notfirsttime'} unless( @args );
397 my $str = $self->writer->to_string( $result, @args);
398 $self->{'_notfirsttime'} = 1;
399 $self->_print( "$str" ) if defined $str;
401 $self->flush if $self->_flush_on_write && defined $self->_fh;
402 return 1;
405 =head2 write_report
407 Title : write_report
408 Usage : $stream->write_report(SearchIO stream, @other_args)
409 Function: Writes data directly from the SearchIO stream object into the
410 : writer. This is mainly useful if one has multiple ResultI objects
411 : in a SearchIO stream and you don't want to reiterate header/footer
412 : between each call.
413 Returns : 1 for success and 0 for error
414 Args : Bio::SearchIO stream object,
415 : plus any other arguments for the Writer
416 Throws : Bio::Root::Exception if a Writer has not been set.
418 See L<Bio::Root::Exception>
420 =cut
422 sub write_report {
423 my ($self, $result, @args) = @_;
425 if( not ref($self->{'_result_writer'}) ) {
426 $self->throw("ResultWriter not defined.");
428 @args = $self->{'_notfirsttime'} unless( @args );
430 my $str = $self->writer->to_string( $result, @args);
431 $self->{'_notfirsttime'} = 1;
432 $self->_print( "$str" ) if defined $str;
434 $self->flush if $self->_flush_on_write && defined $self->_fh;
435 return 1;
438 =head2 writer
440 Title : writer
441 Usage : $writer = $stream->writer;
442 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
443 Returns : 1 for success and 0 for error
444 Args : Bio::SearchIO::SearchWriterI object (when setting)
445 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
446 is passed in.
448 =cut
450 sub writer {
451 my ($self, $writer) = @_;
452 if( ref($writer) and $writer->isa( 'Bio::SearchIO::SearchWriterI' )) {
453 $self->{'_result_writer'} = $writer;
455 elsif( defined $writer ) {
456 $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
458 return $self->{'_result_writer'};
461 =head2 result_count
463 Title : result_count
464 Usage : $num = $stream->result_count;
465 Function: Gets the number of Blast results that have been successfully parsed
466 at the point of the method call. This is not the total # of results
467 in the file.
468 Returns : integer
469 Args : none
470 Throws : none
472 =cut
474 sub result_count {
475 my $self = shift;
476 $self->throw_not_implemented;
479 =head2 inclusion_threshold
481 Title : inclusion_threshold
482 Usage : my $incl_thresh = $isreb->inclusion_threshold;
483 : $isreb->inclusion_threshold(1e-5);
484 Function: Get/Set the e-value threshold for inclusion in the PSI-BLAST
485 score matrix model (blastpgp) that was used for generating the reports
486 being parsed.
487 Returns : number (real)
488 Default value: $Bio::SearchIO::IteratedSearchResultEventBuilder::DEFAULT_INCLUSION_THRESHOLD
489 Args : number (real) (e.g., 0.0001 or 1e-4 )
491 =cut
493 # Delegates to the event handler.
494 sub inclusion_threshold {
495 shift->_eventHandler->inclusion_threshold(@_);
498 =head2 max_significance
500 Usage : $obj->max_significance();
501 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
502 This is the value of the -signif parameter supplied to new().
503 Hits with P or E-value above this are skipped.
504 Returns : Scientific notation number with this format: 1.0e-05.
505 Argument : Scientific notation number or float (when setting)
506 Comments : Screening of significant hits uses the data provided on the
507 : description line. For NCBI BLAST1 and WU-BLAST, this data
508 : is P-value. for NCBI BLAST2 it is an Expect value.
510 =cut
512 sub max_significance { shift->{'_handler_cache'}->max_significance(@_) }
514 =head2 signif
516 Synonym for L<max_significance()|max_significance>
518 =cut
520 sub signif { shift->max_significance(@_) }
522 =head2 min_score
524 Usage : $obj->min_score();
525 Purpose : Set/Get the Blast score used as screening cutoff.
526 This is the value of the -score parameter supplied to new().
527 Hits with scores below this are skipped.
528 Returns : Integer or scientific notation number.
529 Argument : Integer or scientific notation number (when setting)
530 Comments : Screening of significant hits uses the data provided on the
531 : description line.
533 =cut
535 sub min_score { shift->{'_handler_cache'}->min_score(@_) }
537 =head2 min_query_length
539 Usage : $obj->min_query_length();
540 Purpose : Gets the query sequence length used as screening criteria.
541 This is the value of the -min_query_len parameter supplied to new().
542 Hits with sequence length below this are skipped.
543 Returns : Integer
544 Argument : n/a
546 =cut
548 sub min_query_length {
549 my $self = shift;
550 if (@_) {
551 my $min_qlen = shift;
552 if ( $min_qlen =~ /\D/ or $min_qlen <= 0 ) {
553 $self->throw(
554 -class => 'Bio::Root::BadParameter',
555 -text => "Invalid minimum query length value: $min_qlen\n"
556 . "Value must be an integer > 0. Value not set.",
557 -value => $min_qlen
560 $self->{'_confirm_qlength'} = 1;
561 $self->{'_min_query_length'} = $min_qlen;
564 return $self->{'_min_query_length'};
567 =head2 best_hit_only
569 Title : best_hit_only
570 Usage : print "only getting best hit.\n" if $obj->best_hit_only;
571 Purpose : Set/Get the indicator for whether or not to process only
572 : the best BlastHit.
573 Returns : Boolean (1 | 0)
574 Argument : Boolean (1 | 0) (when setting)
576 =cut
578 sub best_hit_only {
579 my $self = shift;
580 if (@_) { $self->{'_best'} = shift; }
581 $self->{'_best'};
584 =head2 check_all_hits
586 Title : check_all_hits
587 Usage : print "checking all hits.\n" if $obj->check_all_hits;
588 Purpose : Set/Get the indicator for whether or not to process all hits.
589 : If false, the parser will stop processing hits after the
590 : the first non-significance hit or the first hit that fails
591 : any hit filter.
592 Returns : Boolean (1 | 0)
593 Argument : Boolean (1 | 0) (when setting)
595 =cut
597 sub check_all_hits {
598 my $self = shift;
599 if (@_) { $self->{'_check_all'} = shift; }
600 $self->{'_check_all'};
603 =head2 _load_format_module
605 Title : _load_format_module
606 Usage : *INTERNAL SearchIO stuff*
607 Function: Loads up (like use) a module at run time on demand
608 Example :
609 Returns :
610 Args :
612 =cut
614 sub _load_format_module {
615 my ($self,$format) = @_;
616 my $module = "Bio::SearchIO::" . $format;
617 my $ok;
619 eval {
620 $ok = $self->_load_module($module);
622 if ( $@ ) {
623 print STDERR <<END;
624 $self: $format cannot be found
625 Exception $@
626 For more information about the SearchIO system please see the SearchIO docs.
627 This includes ways of checking for formats at compile time, not run time
631 return $ok;
634 =head2 _get_seq_identifiers
636 Title : _get_seq_identifiers
637 Usage : my ($gi, $acc,$ver) = &_get_seq_identifiers($id)
638 Function: Private function to get the gi, accession, version data
639 for an ID (if it is in NCBI format)
640 Returns : 3-pule of gi, accession, version
641 Args : ID string to process (NCBI format)
644 =cut
646 sub _get_seq_identifiers {
647 my ($self, $id) = @_;
649 return unless defined $id;
650 my ($gi, $acc, $version );
651 if ( $id =~ /^gi\|(\d+)\|/ ) {
652 $gi = $1;
654 if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
655 ( $acc, $version ) = split /\./, $2;
657 elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
658 ( $acc, $version ) = split /\./, $3;
660 else {
662 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
663 #Database Name Identifier Syntax
664 #============================ ========================
665 #GenBank gb|accession|locus
666 #EMBL Data Library emb|accession|locus
667 #DDBJ, DNA Database of Japan dbj|accession|locus
668 #NBRF PIR pir||entry
669 #Protein Research Foundation prf||name
670 #SWISS-PROT sp|accession|entry name
671 #Brookhaven Protein Data Bank pdb|entry|chain
672 #Patents pat|country|number
673 #GenInfo Backbone Id bbs|number
674 #General database identifier gnl|database|identifier
675 #NCBI Reference Sequence ref|accession|locus
676 #Local Sequence identifier lcl|identifier
677 $acc = $id;
679 return ($gi, $acc, $version );
682 =head2 _guess_format
684 Title : _guess_format
685 Usage : $obj->_guess_format($filename)
686 Function:
687 Example :
688 Returns : guessed format of filename (lower case)
689 Args :
691 =cut
693 sub _guess_format {
694 my $class = shift;
695 return unless $_ = shift;
696 return 'blast' if (/\.(blast|t?bl\w)$/i );
697 return 'fasta' if (/\.
698 (?: t? fas (?:ta)? |
699 m\d+ |
700 (?: t? (?: fa | fx | fy | ff | fs ) ) |
701 (?: (?:ss | os | ps) (?:earch)? ))
702 $/ix );
703 return 'blastxml' if ( /\.(blast)?xml$/i);
704 return 'exonerate' if ( /\.exon(erate)?/i );
707 sub close {
708 my $self = shift;
710 if( $self->writer ) {
711 $self->_print($self->writer->end_report());
712 $self->{'_result_writer'}= undef;
714 $self->SUPER::close(@_);
717 sub DESTROY {
718 my $self = shift;
719 $self->close() if defined $self->_fh;
720 $self->SUPER::DESTROY;
723 sub TIEHANDLE {
724 my $class = shift;
725 return bless {processor => shift}, $class;
728 sub READLINE {
729 my $self = shift;
730 return $self->{'processor'}->next_result() || undef unless wantarray;
731 my (@list, $obj);
732 push @list, $obj while $obj = $self->{'processor'}->next_result();
733 return @list;
736 sub PRINT {
737 my $self = shift;
738 $self->{'processor'}->write_result(@_);
744 __END__