2 # BioPerl module for Bio::SearchIO::SearchResultEventBuilder
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::SearchResultEventBuilder - Event Handler for SearchIO events.
20 # Do not use this object directly, this object is part of the SearchIO
21 # event based parsing system.
25 This object handles Search Events generated by the SearchIO classes
26 and builds the appropriate Bio::Search::* objects from them.
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 Please direct usage questions or support issues to the mailing list:
43 I<bioperl-l@bioperl.org>
45 rather than to the module maintainer directly. Many experienced and
46 responsive experts will be able to look at the problem and quickly
47 address it. Please include a thorough description of the problem
48 with code and data examples if at all possible.
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via the web:
56 https://github.com/bioperl/bioperl-live/issues
58 =head1 AUTHOR - Jason Stajich
60 Email jason-at-bioperl.org
64 Sendu Bala, bix@sendu.me.uk
68 The rest of the documentation details each of the object methods.
69 Internal methods are usually preceded with a _
74 # Let the code begin...
77 package Bio
::SearchIO
::SearchResultEventBuilder
;
81 use Bio
::Factory
::ObjectFactory
;
83 use base
qw(Bio::Root::Root Bio::SearchIO::EventHandlerI);
85 use vars
qw($DEFAULT_INCLUSION_THRESHOLD
89 # e-value threshold for inclusion in the PSI-BLAST score matrix model (blastpgp)
90 # NOTE: Executing `blastpgp -` incorrectly reports that the default is 0.005.
91 # (version 2.2.2 [Jan-08-2002])
92 $DEFAULT_INCLUSION_THRESHOLD = 0.001;
94 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
99 Usage : my $obj = Bio::SearchIO::SearchResultEventBuilder->new();
100 Function: Builds a new Bio::SearchIO::SearchResultEventBuilder object
101 Returns : Bio::SearchIO::SearchResultEventBuilder
102 Args : -hsp_factory => Bio::Factory::ObjectFactoryI
103 -hit_factory => Bio::Factory::ObjectFactoryI
104 -result_factory => Bio::Factory::ObjectFactoryI
105 -inclusion_threshold => e-value threshold for inclusion in the
106 PSI-BLAST score matrix model (blastpgp)
107 -signif => float or scientific notation number to be used
108 as a P- or Expect value cutoff
109 -score => integer or scientific notation number to be used
110 as a blast score value cutoff
111 -bits => integer or scientific notation number to be used
112 as a bit score value cutoff
113 -hit_filter => reference to a function to be used for
114 filtering hits based on arbitrary criteria.
116 See L<Bio::Factory::ObjectFactoryI> for more information
121 my ($class,@args) = @_;
122 my $self = $class->SUPER::new
(@args);
123 my ($resultF, $hitF, $hspF) =
124 $self->_rearrange([qw(RESULT_FACTORY
126 HSP_FACTORY)],@args);
127 $self->_init_parse_params(@args);
129 $self->register_factory('result', $resultF ||
130 Bio
::Factory
::ObjectFactory
->new(
131 -type
=> 'Bio::Search::Result::GenericResult',
132 -interface
=> 'Bio::Search::Result::ResultI'));
134 $self->register_factory('hit', $hitF ||
135 Bio
::Factory
::ObjectFactory
->new(
136 -type
=> 'Bio::Search::Hit::GenericHit',
137 -interface
=> 'Bio::Search::Hit::HitI'));
139 $self->register_factory('hsp', $hspF ||
140 Bio
::Factory
::ObjectFactory
->new(
141 -type
=> 'Bio::Search::HSP::GenericHSP',
142 -interface
=> 'Bio::Search::HSP::HSPI'));
# Initializes parameters used during parsing of reports.
sub _init_parse_params {
    # Args: the same named-parameter list that was handed to new().
    # Recognised keys: -INCLUSION_THRESHOLD, -SIGNIF, -SCORE, -BITS,
    # -HIT_FILTER and the legacy -FILT_FUNC. Each value that is supplied is
    # passed to the matching accessor, which performs its own validation.
    my ($self, @args) = @_;

    # -FILT_FUNC has been replaced by -HIT_FILTER.
    # Leaving -FILT_FUNC in place for backward compatibility
    my ($ithresh, $signif, $score, $bits, $hit_filter, $filt_func) =
        $self->_rearrange([qw(INCLUSION_THRESHOLD SIGNIF SCORE BITS
                              HIT_FILTER FILT_FUNC)], @args);

    # Fall back to the package default when no threshold was given.
    $self->inclusion_threshold(
        defined($ithresh) ? $ithresh : $DEFAULT_INCLUSION_THRESHOLD );

    # -HIT_FILTER wins when both spellings are supplied.
    my $hit_filt = $hit_filter || $filt_func;

    # Test the merged value, not $hit_filter alone: otherwise a filter passed
    # only through the legacy -FILT_FUNC key would silently be ignored.
    defined $hit_filt && $self->hit_filter($hit_filt);
    defined $signif   && $self->max_significance($signif);
    defined $score    && $self->min_score($score);
    defined $bits     && $self->min_bits($bits);
}
169 Usage : if( $handler->will_handle($event_type) ) { ... }
170 Function: Tests if this event builder knows how to process a specific event
172 Args : event type name
177 my ($self,$type) = @_;
178 # these are the events we recognize
179 return ( $type eq 'hsp' || $type eq 'hit' || $type eq 'result' );
189 Usage : $handler->start_result($resulttype)
190 Function: Begins a result event cycle
192 Args : Type of Report
197 my ($self,$type) = @_;
198 $self->{'_resulttype'} = $type;
199 $self->{'_hits'} = [];
200 $self->{'_hsps'} = [];
201 $self->{'_hitcount'} = 0;
208 Usage : my @results = $parser->end_result
209 Function: Finishes a result handler cycle
210 Returns : A Bio::Search::Result::ResultI
215 # this is overridden by IteratedSearchResultEventBuilder
216 # so keep that in mind when debugging
219 my ($self,$type,$data) = @_;
221 if( defined $data->{'runid'} &&
222 $data->{'runid'} !~ /^\s+$/ ) {
224 if( $data->{'runid'} !~ /^lcl\|/) {
225 $data->{"RESULT-query_name"} = $data->{'runid'};
227 ($data->{"RESULT-query_name"},
228 $data->{"RESULT-query_description"}) =
229 split(/\s+/,$data->{"RESULT-query_description"},2);
232 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
233 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
234 # this is for |123|gb|ABC1.1|
235 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
236 $data->{"RESULT-query_accession"}= $acc;
238 delete $data->{'runid'};
240 my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); }
241 grep { /^RESULT/ } keys %{$data};
243 $args{'-algorithm'} = uc( $args{'-algorithm_name'}
244 || $data->{'RESULT-algorithm_name'}
246 ($self->isa('Bio::SearchIO::IteratedSearchResultEventBuilder')) ?
247 ( $args{'-iterations'} = $self->{'_iterations'} )
248 : ( $args{'-hits'} = $self->{'_hits'} );
250 my $result = $self->factory('result')->create_object(%args);
251 $result->hit_factory($self->factory('hit'));
253 ($self->isa('Bio::SearchIO::IteratedSearchResultEventBuilder')) ?
254 ( $self->{'_iterations'} = [] )
255 : ( $self->{'_hits'} = [] );
263 Usage : $handler->start_hsp($name,$data)
264 Function: Begins processing a HSP event
266 Args : type of element
267 associated data (hashref)
272 my ($self,@args) = @_;
279 Usage : $handler->end_hsp()
280 Function: Finish processing a HSP event
282 Args : type of event and associated hashref
288 my ($self,$type,$data) = @_;
290 if( defined $data->{'runid'} &&
291 $data->{'runid'} !~ /^\s+$/ ) {
293 if( $data->{'runid'} !~ /^lcl\|/) {
294 $data->{"RESULT-query_name"}= $data->{'runid'};
296 ($data->{"RESULT-query_name"},
297 $data->{"RESULT-query_description"}) =
298 split(/\s+/,$data->{"RESULT-query_description"},2);
301 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
302 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
303 # this is for |123|gb|ABC1.1|
304 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
305 $data->{"RESULT-query_accession"}= $acc;
307 delete $data->{'runid'};
310 # this code is to deal with the fact that Blast XML data
311 # always has start < end and one has to infer strandedness
312 # from the frame which is a problem for the Search::HSP object
313 # which expects to be able to infer strand from the order
314 # of the begin/end of the query and hit coordinates
315 if( defined $data->{'HSP-query_frame'} && # this is here to protect from undefs
316 (( $data->{'HSP-query_frame'} < 0 &&
317 $data->{'HSP-query_start'} < $data->{'HSP-query_end'} ) ||
318 $data->{'HSP-query_frame'} > 0 &&
319 ( $data->{'HSP-query_start'} > $data->{'HSP-query_end'} ) )
323 ($data->{'HSP-query_start'},
324 $data->{'HSP-query_end'}) = ($data->{'HSP-query_end'},
325 $data->{'HSP-query_start'});
327 if( defined $data->{'HSP-hit_frame'} && # this is here to protect from undefs
328 ((defined $data->{'HSP-hit_frame'} && $data->{'HSP-hit_frame'} < 0 &&
329 $data->{'HSP-hit_start'} < $data->{'HSP-hit_end'} ) ||
330 defined $data->{'HSP-hit_frame'} && $data->{'HSP-hit_frame'} > 0 &&
331 ( $data->{'HSP-hit_start'} > $data->{'HSP-hit_end'} ) )
335 ($data->{'HSP-hit_start'},
336 $data->{'HSP-hit_end'}) = ($data->{'HSP-hit_end'},
337 $data->{'HSP-hit_start'});
339 $data->{'HSP-query_frame'} ||= 0;
340 $data->{'HSP-hit_frame'} ||= 0;
341 # handle Blast 2.1.2 which did not support data member: hsp_align-len
342 $data->{'HSP-query_length'} ||= $data->{'RESULT-query_length'};
343 $data->{'HSP-hit_length'} ||= $data->{'HIT-length'};
345 # If undefined lengths, calculate from alignment without gaps and separators
346 if (not defined $data->{'HSP-query_length'}) {
347 if (my $hsp_qry_seq = $data->{'HSP-query_seq'}) {
348 $hsp_qry_seq =~ s/[-\.]//g;
349 $data->{'HSP-query_length'} = length $hsp_qry_seq;
352 $data->{'HSP-query_length'} = 0;
355 if (not defined $data->{'HSP-hit_length'}) {
356 if (my $hsp_hit_seq = $data->{'HSP-hit_seq'}) {
357 $hsp_hit_seq =~ s/[-\.]//g;
358 $data->{'HSP-hit_length'} = length $hsp_hit_seq;
361 $data->{'HSP-hit_length'} = 0;
364 $data->{'HSP-hsp_length'} ||= length ($data->{'HSP-homology_seq'} || '');
366 my %args = map { my $v = $data->{$_}; s/HSP//; ($_ => $v) }
367 grep { /^HSP/ } keys %{$data};
369 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
370 $data->{'RESULT-algorithm_name'} || $type);
371 # copy this over from result
372 $args{'-query_name'} = $data->{'RESULT-query_name'};
373 $args{'-hit_name'} = $data->{'HIT-name'};
374 my ($rank) = scalar @
{$self->{'_hsps'} || []} + 1;
375 $args{'-rank'} = $rank;
377 $args{'-hit_desc'} = $data->{'HIT-description'};
378 $args{'-query_desc'} = $data->{'RESULT-query_description'};
380 my $bits = $args{'-bits'};
382 push @
{$self->{'_hsps'}}, $hsp;
390 Usage : $handler->start_hit()
391 Function: Starts a Hit event cycle
393 Args : type of event and associated hashref
398 my ($self,$type) = @_;
399 $self->{'_hsps'} = [];
406 Usage : $handler->end_hit()
407 Function: Ends a Hit event cycle
408 Returns : Bio::Search::Hit::HitI object
409 Args : type of event and associated hashref
414 my ($self,$type,$data) = @_;
416 # Skip processing unless there is HSP data or a hit significance (e.g. a bl2seq
417 # run with no similarity reports a hit for the subject, but shows a "no hits found"
418 # message instead of the alignment data and has no significance value).
419 # This way, we avoid false positives.
420 my @hsp_data = grep { /^HSP/ } keys %{$data};
421 return unless (scalar @hsp_data > 0 or exists $data->{'HIT-significance'});
423 my %args = map { my $v = $data->{$_}; s/HIT//; ($_ => $v); } grep { /^HIT/ } keys %{$data};
425 # I hate special cases, but this is here because NCBI BLAST XML
426 # doesn't play nice and is undergoing mutation -jason
427 if(exists $args{'-name'} && $args{'-name'} =~ /BL_ORD_ID/ ) {
428 ($args{'-name'}, $args{'-description'}) = split(/\s+/,$args{'-description'},2);
430 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
431 $data->{'RESULT-algorithm_name'} || $type);
432 $args{'-hsps'} = $self->{'_hsps'};
433 $args{'-query_len'} = $data->{'RESULT-query_length'};
434 $args{'-rank'} = $self->{'_hitcount'} + 1;
435 unless( defined $args{'-significance'} ) {
436 if( defined $args{'-hsps'} &&
437 $args{'-hsps'}->[0] ) {
438 # use pvalue if present (WU-BLAST), otherwise evalue (NCBI BLAST)
439 $args{'-significance'} = $args{'-hsps'}->[0]->{'-pvalue'} || $args{'-hsps'}->[0]->{'-evalue'};
443 $hit->{'-hsp_factory'} = $self->factory('hsp');
444 $self->_add_hit($hit);
445 $self->{'_hsps'} = [];
449 # Title : _add_hit (private function for internal use only)
450 # Purpose : Applies hit filtering and stores the hit if it passes filtering.
451 # Argument: Bio::Search::Hit::HitI object
454 my ($self, $hit) = @_;
455 my $hit_signif = $hit->{-significance
};
457 # Test significance using custom function (if supplied)
460 my $hit_filter = $self->{'_hit_filter'};
462 # since &hit_filter is out of our control and would expect a HitI object,
463 # we're forced to make one for it
464 $hit = $self->factory('hit')->create_object(%{$hit});
465 $add_hit = 0 unless &$hit_filter($hit);
468 if($self->{'_confirm_significance'}) {
469 $add_hit = 0 unless $hit_signif <= $self->{'_max_significance'};
471 if($self->{'_confirm_score'}) {
472 my $hit_score = $hit->{-score
} || $hit->{-hsps
}->[0]->{-score
};
473 $add_hit = 0 unless $hit_score >= $self->{'_min_score'};
475 if($self->{'_confirm_bits'}) {
476 my $hit_bits = $hit->{-bits
} || $hit->{-hsps
}->[0]->{-bits
} || 0;
477 $add_hit = 0 unless $hit_bits >= $self->{'_min_bits'};
481 $add_hit && push @
{$self->{'_hits'}}, $hit;;
482 $self->{'_hitcount'} = scalar @
{$self->{'_hits'}};
485 =head2 Factory methods
489 =head2 register_factory
491 Title : register_factory
492 Usage : $handler->register_factory('TYPE',$factory);
493 Function: Register a specific factory for an object type class
495 Args : string representing the class and
496 Bio::Factory::ObjectFactoryI
498 See L<Bio::Factory::ObjectFactoryI> for more information
sub register_factory {
    # Stores factory $f under the lower-cased $type name in
    # $self->{'_factories'}.
    #
    # Args   : $type - name of the object kind the factory builds
    #          $f    - factory object; must be a blessed reference that isa
    #                  Bio::Factory::ObjectFactoryI
    # Returns: the factory that was stored
    # Throws : via $self->throw when $f is undefined, is not an object, or is
    #          not a Bio::Factory::ObjectFactoryI
    my ($self, $type, $f) = @_;

    require Scalar::Util;

    # blessed() guards the isa() call: invoking a method on a plain
    # (unblessed) reference dies with a Perl-level error before the intended
    # exception below could be raised.
    unless ( defined $f
        && Scalar::Util::blessed($f)
        && $f->isa('Bio::Factory::ObjectFactoryI') )
    {
        # Avoid interpolating undef into the message.
        my $shown = defined $f ? $f : 'undef';
        $self->throw("Cannot set factory to value $shown" . ref($f) . "\n");
    }

    return $self->{'_factories'}->{ lc($type) } = $f;
}
514 Usage : my $f = $handler->factory('TYPE');
515 Function: Retrieves the associated factory for requested 'TYPE'
516 Returns : a Bio::Factory::ObjectFactoryI
517 Throws : Bio::Root::BadParameter if none registered for the supplied type
518 Args : name of factory class to retrieve
520 See L<Bio::Factory::ObjectFactoryI> for more information
525 my ($self,$type) = @_;
526 return $self->{'_factories'}->{lc($type)} ||
527 $self->throw(-class=>'Bio::Root::BadParameter',
528 -text
=>"No factory registered for $type");
531 =head2 inclusion_threshold
533 See L<Bio::SearchIO::blast::inclusion_threshold>.
537 sub inclusion_threshold
{
# Combined getter/setter for the value kept in
# $self->{'_inclusion_threshold'}.
#   With an argument   : stores it and returns the newly stored value.
#   Without an argument: returns the currently stored value.
# NOTE(review): the statement that unpacks $self from @_ is not visible in
# this excerpt -- confirm it precedes the first return.
539 return $self->{'_inclusion_threshold'} = shift if @_;
540 return $self->{'_inclusion_threshold'};
543 =head2 max_significance
545 Usage : $obj->max_significance();
546 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
547 This is the value of the -signif parameter supplied to new().
548 Hits with P or E-value at HIT level above this are skipped.
549 Returns : Scientific notation number with this format: 1.0e-05.
550 Argument : Number (sci notation, float, integer) (when setting)
551 Throws : Bio::Root::BadParameter exception if the supplied argument is
552 : not a valid number.
553 Comments : Screening of significant hits uses the data provided on the
554 : description line. For NCBI BLAST1 and WU-BLAST, this data
555 : is a P-value. For NCBI BLAST2 it is an Expect value.
559 sub max_significance
{
# Accessor for the significance cutoff. When a value ($sig) is being set it
# is validated first; on success the cutoff is stored in
# $self->{'_max_significance'} and $self->{'_confirm_significance'} is set
# to 1.
# NOTE(review): the lines that unpack $self and $sig are not visible in this
# excerpt -- presumably $sig is only read when an argument was passed; confirm.
#
# Validation: reject any value containing a character other than a digit,
# '.', 'e' or '-', and any value that is numerically <= 0.
# NOTE(review): the character class excludes 'E' and '+', so inputs such as
# "1E-5" or "1e+02" are reported as invalid -- confirm this is intended.
563 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
564 $self->throw(-class => 'Bio::Root::BadParameter',
565 -text
=> "Invalid significance value: $sig\n"
566 . "Must be a number greater than zero.",
569 $self->{'_confirm_significance'} = 1;
570 $self->{'_max_significance'} = $sig;
# The value produced here is the stored cutoff formatted in exponent
# notation with one digit after the decimal point (e.g. 1.0e-05).
572 sprintf "%.1e", $self->{'_max_significance'};
578 Synonym for L<max_significance()|max_significance>
sub signif {
    # Alias for max_significance(). Any arguments are forwarded so the alias
    # works as a setter as well as a getter; the return value is whatever
    # max_significance() returns.
    my ($self, @args) = @_;
    return $self->max_significance(@args);
}
586 Usage : $obj->min_score();
587 Purpose : Set/Get the Blast score used as screening cutoff.
588 This is the value of the -score parameter supplied to new().
589 Hits with scores at HIT level below this are skipped.
590 Returns : Integer (or undef if not set)
591 Argument : Integer (when setting)
592 Throws : Bio::Root::BadParameter exception if the supplied argument is
593 : not a valid number.
594 Comments : Screening of significant hits uses the data provided on the
603 if( $score =~ /[^\de+]/ or $score <= 0) {
604 $self->throw(-class => 'Bio::Root::BadParameter',
605 -text
=> "Invalid score value: $score\n"
606 . "Must be an integer greater than zero.",
609 $self->{'_confirm_score'} = 1;
610 $self->{'_min_score'} = $score;
612 return $self->{'_min_score'};
617 Usage : $obj->min_bits();
618 Purpose : Set/Get the Blast bit score used as screening cutoff.
619 This is the value of the -bits parameter supplied to new().
620 Hits with bits score at HIT level below this are skipped.
621 Returns : Integer (or undef if not set)
622 Argument : Integer (when setting)
623 Throws : Bio::Root::BadParameter exception if the supplied argument is
624 : not a valid number.
625 Comments : Screening of significant hits uses the data provided on the
634 if( $bits =~ /[^\de+]/ or $bits <= 0) {
635 $self->throw(-class => 'Bio::Root::BadParameter',
636 -text
=> "Invalid bits value: $bits\n"
637 . "Must be an integer greater than zero.",
640 $self->{'_confirm_bits'} = 1;
641 $self->{'_min_bits'} = $bits;
643 return $self->{'_min_bits'};
648 Usage : $obj->hit_filter();
649 Purpose : Set/Get a function reference used for filtering out hits.
650 This is the value of the -hit_filter parameter supplied to new().
651 Hits that fail to pass the filter at HIT level are skipped.
652 Returns : Function ref (or undef if not set)
653 Argument : Function ref (when setting)
654 Throws : Bio::Root::BadParameter exception if the supplied argument is
655 : not a function reference.
663 if(not ref $func eq 'CODE') {
664 $self->throw(-class => 'Bio::Root::BadParameter',
665 -text
=> "Not a function reference: $func\n"
666 . "The -hit_filter parameter must be function reference.",
669 $self->{'_hit_filter'} = $func;
671 return $self->{'_hit_filter'};