ide/bioperl-mode: into its own repository to be developed separately
[bioperl-live.git] / Bio / SearchIO / SearchResultEventBuilder.pm
bloba98f3206f130401ba08a27448f5b4f0196c4ae57
2 # BioPerl module for Bio::SearchIO::SearchResultEventBuilder
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::SearchResultEventBuilder - Event Handler for SearchIO events.
18 =head1 SYNOPSIS
20 # Do not use this object directly, this object is part of the SearchIO
21 # event based parsing system.
23 =head1 DESCRIPTION
25 This object handles Search Events generated by the SearchIO classes
26 and build appropriate Bio::Search::* objects from them.
28 =head1 FEEDBACK
30 =head2 Mailing Lists
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39 =head2 Support
41 Please direct usage questions or support issues to the mailing list:
43 I<bioperl-l@bioperl.org>
45 rather than to the module maintainer directly. Many experienced and
46 reponsive experts will be able look at the problem and quickly
47 address it. Please include a thorough description of the problem
48 with code and data examples if at all possible.
50 =head2 Reporting Bugs
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via the
54 web:
56 https://github.com/bioperl/bioperl-live/issues
58 =head1 AUTHOR - Jason Stajich
60 Email jason-at-bioperl.org
62 =head1 CONTRIBUTORS
64 Sendu Bala, bix@sendu.me.uk
66 =head1 APPENDIX
68 The rest of the documentation details each of the object methods.
69 Internal methods are usually preceded with a _
71 =cut
74 # Let the code begin...
77 package Bio::SearchIO::SearchResultEventBuilder;
79 use strict;
81 use Bio::Factory::ObjectFactory;
83 use base qw(Bio::Root::Root Bio::SearchIO::EventHandlerI);
85 use vars qw($DEFAULT_INCLUSION_THRESHOLD
86 $MAX_HSP_OVERLAP
89 # e-value threshold for inclusion in the PSI-BLAST score matrix model (blastpgp)
90 # NOTE: Executing `blastpgp -` incorrectly reports that the default is 0.005.
91 # (version 2.2.2 [Jan-08-2002])
92 $DEFAULT_INCLUSION_THRESHOLD = 0.001;
94 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
96 =head2 new
98 Title : new
99 Usage : my $obj = Bio::SearchIO::SearchResultEventBuilder->new();
100 Function: Builds a new Bio::SearchIO::SearchResultEventBuilder object
101 Returns : Bio::SearchIO::SearchResultEventBuilder
102 Args : -hsp_factory => Bio::Factory::ObjectFactoryI
103 -hit_factory => Bio::Factory::ObjectFactoryI
104 -result_factory => Bio::Factory::ObjectFactoryI
105 -inclusion_threshold => e-value threshold for inclusion in the
106 PSI-BLAST score matrix model (blastpgp)
107 -signif => float or scientific notation number to be used
108 as a P- or Expect value cutoff
109 -score => integer or scientific notation number to be used
110 as a blast score value cutoff
111 -bits => integer or scientific notation number to be used
112 as a bit score value cutoff
113 -hit_filter => reference to a function to be used for
114 filtering hits based on arbitrary criteria.
116 See L<Bio::Factory::ObjectFactoryI> for more information
118 =cut
120 sub new {
121 my ($class,@args) = @_;
122 my $self = $class->SUPER::new(@args);
123 my ($resultF, $hitF, $hspF) =
124 $self->_rearrange([qw(RESULT_FACTORY
125 HIT_FACTORY
126 HSP_FACTORY)],@args);
127 $self->_init_parse_params(@args);
129 $self->register_factory('result', $resultF ||
130 Bio::Factory::ObjectFactory->new(
131 -type => 'Bio::Search::Result::GenericResult',
132 -interface => 'Bio::Search::Result::ResultI'));
134 $self->register_factory('hit', $hitF ||
135 Bio::Factory::ObjectFactory->new(
136 -type => 'Bio::Search::Hit::GenericHit',
137 -interface => 'Bio::Search::Hit::HitI'));
139 $self->register_factory('hsp', $hspF ||
140 Bio::Factory::ObjectFactory->new(
141 -type => 'Bio::Search::HSP::GenericHSP',
142 -interface => 'Bio::Search::HSP::HSPI'));
144 return $self;
147 # Initializes parameters used during parsing of reports.
148 sub _init_parse_params {
150 my ($self, @args) = @_;
151 # -FILT_FUNC has been replaced by -HIT_FILTER.
152 # Leaving -FILT_FUNC in place for backward compatibility
153 my($ithresh, $signif, $score, $bits, $hit_filter, $filt_func) =
154 $self->_rearrange([qw(INCLUSION_THRESHOLD SIGNIF SCORE BITS
155 HIT_FILTER FILT_FUNC
156 )], @args);
158 $self->inclusion_threshold( defined($ithresh) ? $ithresh : $DEFAULT_INCLUSION_THRESHOLD);
159 my $hit_filt = $hit_filter || $filt_func;
160 defined $hit_filter && $self->hit_filter($hit_filt);
161 defined $signif && $self->max_significance($signif);
162 defined $score && $self->min_score($score);
163 defined $bits && $self->min_bits($bits);
166 =head2 will_handle
168 Title : will_handle
169 Usage : if( $handler->will_handle($event_type) ) { ... }
170 Function: Tests if this event builder knows how to process a specific event
171 Returns : boolean
172 Args : event type name
174 =cut
176 sub will_handle{
177 my ($self,$type) = @_;
178 # these are the events we recognize
179 return ( $type eq 'hsp' || $type eq 'hit' || $type eq 'result' );
182 =head2 SAX methods
184 =cut
186 =head2 start_result
188 Title : start_result
189 Usage : $handler->start_result($resulttype)
190 Function: Begins a result event cycle
191 Returns : none
192 Args : Type of Report
194 =cut
196 sub start_result {
197 my ($self,$type) = @_;
198 $self->{'_resulttype'} = $type;
199 $self->{'_hits'} = [];
200 $self->{'_hsps'} = [];
201 $self->{'_hitcount'} = 0;
202 return;
205 =head2 end_result
207 Title : end_result
208 Usage : my @results = $parser->end_result
209 Function: Finishes a result handler cycle
210 Returns : A Bio::Search::Result::ResultI
211 Args : none
213 =cut
215 # this is overridden by IteratedSearchResultEventBuilder
216 # so keep that in mind when debugging
218 sub end_result {
219 my ($self,$type,$data) = @_;
221 if( defined $data->{'runid'} &&
222 $data->{'runid'} !~ /^\s+$/ ) {
224 if( $data->{'runid'} !~ /^lcl\|/) {
225 $data->{"RESULT-query_name"} = $data->{'runid'};
226 } else {
227 ($data->{"RESULT-query_name"},
228 $data->{"RESULT-query_description"}) =
229 split(/\s+/,$data->{"RESULT-query_description"},2);
232 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
233 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
234 # this is for |123|gb|ABC1.1|
235 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
236 $data->{"RESULT-query_accession"}= $acc;
238 delete $data->{'runid'};
240 my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); }
241 grep { /^RESULT/ } keys %{$data};
243 $args{'-algorithm'} = uc( $args{'-algorithm_name'}
244 || $data->{'RESULT-algorithm_name'}
245 || $type);
246 ($self->isa('Bio::SearchIO::IteratedSearchResultEventBuilder')) ?
247 ( $args{'-iterations'} = $self->{'_iterations'} )
248 : ( $args{'-hits'} = $self->{'_hits'} );
250 my $result = $self->factory('result')->create_object(%args);
251 $result->hit_factory($self->factory('hit'));
253 ($self->isa('Bio::SearchIO::IteratedSearchResultEventBuilder')) ?
254 ( $self->{'_iterations'} = [] )
255 : ( $self->{'_hits'} = [] );
257 return $result;
260 =head2 start_hsp
262 Title : start_hsp
263 Usage : $handler->start_hsp($name,$data)
264 Function: Begins processing a HSP event
265 Returns : none
266 Args : type of element
267 associated data (hashref)
269 =cut
271 sub start_hsp {
272 my ($self,@args) = @_;
273 return;
276 =head2 end_hsp
278 Title : end_hsp
279 Usage : $handler->end_hsp()
280 Function: Finish processing a HSP event
281 Returns : none
282 Args : type of event and associated hashref
285 =cut
287 sub end_hsp {
288 my ($self,$type,$data) = @_;
290 if( defined $data->{'runid'} &&
291 $data->{'runid'} !~ /^\s+$/ ) {
293 if( $data->{'runid'} !~ /^lcl\|/) {
294 $data->{"RESULT-query_name"}= $data->{'runid'};
295 } else {
296 ($data->{"RESULT-query_name"},
297 $data->{"RESULT-query_description"}) =
298 split(/\s+/,$data->{"RESULT-query_description"},2);
301 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
302 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
303 # this is for |123|gb|ABC1.1|
304 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
305 $data->{"RESULT-query_accession"}= $acc;
307 delete $data->{'runid'};
310 # this code is to deal with the fact that Blast XML data
311 # always has start < end and one has to infer strandedness
312 # from the frame which is a problem for the Search::HSP object
313 # which expect to be able to infer strand from the order of
314 # of the begin/end of the query and hit coordinates
315 if( defined $data->{'HSP-query_frame'} && # this is here to protect from undefs
316 (( $data->{'HSP-query_frame'} < 0 &&
317 $data->{'HSP-query_start'} < $data->{'HSP-query_end'} ) ||
318 $data->{'HSP-query_frame'} > 0 &&
319 ( $data->{'HSP-query_start'} > $data->{'HSP-query_end'} ) )
322 # swap
323 ($data->{'HSP-query_start'},
324 $data->{'HSP-query_end'}) = ($data->{'HSP-query_end'},
325 $data->{'HSP-query_start'});
327 if( defined $data->{'HSP-hit_frame'} && # this is here to protect from undefs
328 ((defined $data->{'HSP-hit_frame'} && $data->{'HSP-hit_frame'} < 0 &&
329 $data->{'HSP-hit_start'} < $data->{'HSP-hit_end'} ) ||
330 defined $data->{'HSP-hit_frame'} && $data->{'HSP-hit_frame'} > 0 &&
331 ( $data->{'HSP-hit_start'} > $data->{'HSP-hit_end'} ) )
334 # swap
335 ($data->{'HSP-hit_start'},
336 $data->{'HSP-hit_end'}) = ($data->{'HSP-hit_end'},
337 $data->{'HSP-hit_start'});
339 $data->{'HSP-query_frame'} ||= 0;
340 $data->{'HSP-hit_frame'} ||= 0;
341 # handle Blast 2.1.2 which did not support data member: hsp_align-len
342 $data->{'HSP-query_length'} ||= $data->{'RESULT-query_length'};
343 $data->{'HSP-hit_length'} ||= $data->{'HIT-length'};
345 # If undefined lengths, calculate from alignment without gaps and separators
346 if (not defined $data->{'HSP-query_length'}) {
347 if (my $hsp_qry_seq = $data->{'HSP-query_seq'}) {
348 $hsp_qry_seq =~ s/[-\.]//g;
349 $data->{'HSP-query_length'} = length $hsp_qry_seq;
351 else {
352 $data->{'HSP-query_length'} = 0;
355 if (not defined $data->{'HSP-hit_length'}) {
356 if (my $hsp_hit_seq = $data->{'HSP-hit_seq'}) {
357 $hsp_hit_seq =~ s/[-\.]//g;
358 $data->{'HSP-hit_length'} = length $hsp_hit_seq;
360 else {
361 $data->{'HSP-hit_length'} = 0;
364 $data->{'HSP-hsp_length'} ||= length ($data->{'HSP-homology_seq'} || '');
366 my %args = map { my $v = $data->{$_}; s/HSP//; ($_ => $v) }
367 grep { /^HSP/ } keys %{$data};
369 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
370 $data->{'RESULT-algorithm_name'} || $type);
371 # copy this over from result
372 $args{'-query_name'} = $data->{'RESULT-query_name'};
373 $args{'-hit_name'} = $data->{'HIT-name'};
374 my ($rank) = scalar @{$self->{'_hsps'} || []} + 1;
375 $args{'-rank'} = $rank;
377 $args{'-hit_desc'} = $data->{'HIT-description'};
378 $args{'-query_desc'} = $data->{'RESULT-query_description'};
380 my $bits = $args{'-bits'};
381 my $hsp = \%args;
382 push @{$self->{'_hsps'}}, $hsp;
384 return $hsp;
387 =head2 start_hit
389 Title : start_hit
390 Usage : $handler->start_hit()
391 Function: Starts a Hit event cycle
392 Returns : none
393 Args : type of event and associated hashref
395 =cut
397 sub start_hit{
398 my ($self,$type) = @_;
399 $self->{'_hsps'} = [];
400 return;
403 =head2 end_hit
405 Title : end_hit
406 Usage : $handler->end_hit()
407 Function: Ends a Hit event cycle
408 Returns : Bio::Search::Hit::HitI object
409 Args : type of event and associated hashref
411 =cut
413 sub end_hit{
414 my ($self,$type,$data) = @_;
416 # Skip process unless there is HSP data or Hit Significance (e.g. a bl2seq with no similarity
417 # gives a hit with the subject, but shows a "no hits found" message instead
418 # of the alignment data and don't have a significance value).
419 # This way, we avoid false positives
420 my @hsp_data = grep { /^HSP/ } keys %{$data};
421 return unless (scalar @hsp_data > 0 or exists $data->{'HIT-significance'});
423 my %args = map { my $v = $data->{$_}; s/HIT//; ($_ => $v); } grep { /^HIT/ } keys %{$data};
425 # I hate special cases, but this is here because NCBI BLAST XML
426 # doesn't play nice and is undergoing mutation -jason
427 if(exists $args{'-name'} && $args{'-name'} =~ /BL_ORD_ID/ ) {
428 ($args{'-name'}, $args{'-description'}) = split(/\s+/,$args{'-description'},2);
430 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
431 $data->{'RESULT-algorithm_name'} || $type);
432 $args{'-hsps'} = $self->{'_hsps'};
433 $args{'-query_len'} = $data->{'RESULT-query_length'};
434 $args{'-rank'} = $self->{'_hitcount'} + 1;
435 unless( defined $args{'-significance'} ) {
436 if( defined $args{'-hsps'} &&
437 $args{'-hsps'}->[0] ) {
438 # use pvalue if present (WU-BLAST), otherwise evalue (NCBI BLAST)
439 $args{'-significance'} = $args{'-hsps'}->[0]->{'-pvalue'} || $args{'-hsps'}->[0]->{'-evalue'};
442 my $hit = \%args;
443 $hit->{'-hsp_factory'} = $self->factory('hsp');
444 $self->_add_hit($hit);
445 $self->{'_hsps'} = [];
446 return $hit;
449 # Title : _add_hit (private function for internal use only)
450 # Purpose : Applies hit filtering and store it if it passes filtering.
451 # Argument: Bio::Search::Hit::HitI object
453 sub _add_hit {
454 my ($self, $hit) = @_;
455 my $hit_signif = $hit->{-significance};
457 # Test significance using custom function (if supplied)
458 my $add_hit = 1;
460 my $hit_filter = $self->{'_hit_filter'};
461 if($hit_filter) {
462 # since &hit_filter is out of our control and would expect a HitI object,
463 # we're forced to make one for it
464 $hit = $self->factory('hit')->create_object(%{$hit});
465 $add_hit = 0 unless &$hit_filter($hit);
467 else {
468 if($self->{'_confirm_significance'}) {
469 $add_hit = 0 unless $hit_signif <= $self->{'_max_significance'};
471 if($self->{'_confirm_score'}) {
472 my $hit_score = $hit->{-score} || $hit->{-hsps}->[0]->{-score};
473 $add_hit = 0 unless $hit_score >= $self->{'_min_score'};
475 if($self->{'_confirm_bits'}) {
476 my $hit_bits = $hit->{-bits} || $hit->{-hsps}->[0]->{-bits} || 0;
477 $add_hit = 0 unless $hit_bits >= $self->{'_min_bits'};
481 $add_hit && push @{$self->{'_hits'}}, $hit;;
482 $self->{'_hitcount'} = scalar @{$self->{'_hits'}};
485 =head2 Factory methods
487 =cut
489 =head2 register_factory
491 Title : register_factory
492 Usage : $handler->register_factory('TYPE',$factory);
493 Function: Register a specific factory for a object type class
494 Returns : none
495 Args : string representing the class and
496 Bio::Factory::ObjectFactoryI
498 See L<Bio::Factory::ObjectFactoryI> for more information
500 =cut
502 sub register_factory{
503 my ($self, $type,$f) = @_;
504 if( ! defined $f || ! ref($f) ||
505 ! $f->isa('Bio::Factory::ObjectFactoryI') ) {
506 $self->throw("Cannot set factory to value $f".ref($f)."\n");
508 $self->{'_factories'}->{lc($type)} = $f;
511 =head2 factory
513 Title : factory
514 Usage : my $f = $handler->factory('TYPE');
515 Function: Retrieves the associated factory for requested 'TYPE'
516 Returns : a Bio::Factory::ObjectFactoryI
517 Throws : Bio::Root::BadParameter if none registered for the supplied type
518 Args : name of factory class to retrieve
520 See L<Bio::Factory::ObjectFactoryI> for more information
522 =cut
524 sub factory{
525 my ($self,$type) = @_;
526 return $self->{'_factories'}->{lc($type)} ||
527 $self->throw(-class=>'Bio::Root::BadParameter',
528 -text=>"No factory registered for $type");
531 =head2 inclusion_threshold
533 See L<Bio::SearchIO::blast::inclusion_threshold>.
535 =cut
537 sub inclusion_threshold {
538 my $self = shift;
539 return $self->{'_inclusion_threshold'} = shift if @_;
540 return $self->{'_inclusion_threshold'};
543 =head2 max_significance
545 Usage : $obj->max_significance();
546 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
547 This is the value of the -signif parameter supplied to new().
548 Hits with P or E-value at HIT level above this are skipped.
549 Returns : Scientific notation number with this format: 1.0e-05.
550 Argument : Number (sci notation, float, integer) (when setting)
551 Throws : Bio::Root::BadParameter exception if the supplied argument is
552 : not a valid number.
553 Comments : Screening of significant hits uses the data provided on the
554 : description line. For NCBI BLAST1 and WU-BLAST, this data
555 : is P-value. for NCBI BLAST2 it is an Expect value.
557 =cut
559 sub max_significance {
560 my $self = shift;
561 if (@_) {
562 my $sig = shift;
563 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
564 $self->throw(-class => 'Bio::Root::BadParameter',
565 -text => "Invalid significance value: $sig\n"
566 . "Must be a number greater than zero.",
567 -value => $sig);
569 $self->{'_confirm_significance'} = 1;
570 $self->{'_max_significance'} = $sig;
572 sprintf "%.1e", $self->{'_max_significance'};
576 =head2 signif
578 Synonym for L<max_significance()|max_significance>
580 =cut
582 sub signif { shift->max_significance }
584 =head2 min_score
586 Usage : $obj->min_score();
587 Purpose : Gets the Blast score used as screening cutoff.
588 This is the value of the -score parameter supplied to new().
589 Hits with scores at HIT level below this are skipped.
590 Returns : Integer (or undef if not set)
591 Argument : Integer (when setting)
592 Throws : Bio::Root::BadParameter exception if the supplied argument is
593 : not a valid number.
594 Comments : Screening of significant hits uses the data provided on the
595 : description line.
597 =cut
599 sub min_score {
600 my $self = shift;
601 if (@_) {
602 my $score = shift;
603 if( $score =~ /[^\de+]/ or $score <= 0) {
604 $self->throw(-class => 'Bio::Root::BadParameter',
605 -text => "Invalid score value: $score\n"
606 . "Must be an integer greater than zero.",
607 -value => $score);
609 $self->{'_confirm_score'} = 1;
610 $self->{'_min_score'} = $score;
612 return $self->{'_min_score'};
615 =head2 min_bits
617 Usage : $obj->min_bits();
618 Purpose : Gets the Blast bit score used as screening cutoff.
619 This is the value of the -bits parameter supplied to new().
620 Hits with bits score at HIT level below this are skipped.
621 Returns : Integer (or undef if not set)
622 Argument : Integer (when setting)
623 Throws : Bio::Root::BadParameter exception if the supplied argument is
624 : not a valid number.
625 Comments : Screening of significant hits uses the data provided on the
626 : description line.
628 =cut
630 sub min_bits {
631 my $self = shift;
632 if (@_) {
633 my $bits = shift;
634 if( $bits =~ /[^\de+]/ or $bits <= 0) {
635 $self->throw(-class => 'Bio::Root::BadParameter',
636 -text => "Invalid bits value: $bits\n"
637 . "Must be an integer greater than zero.",
638 -value => $bits);
640 $self->{'_confirm_bits'} = 1;
641 $self->{'_min_bits'} = $bits;
643 return $self->{'_min_bits'};
646 =head2 hit_filter
648 Usage : $obj->hit_filter();
649 Purpose : Set/Get a function reference used for filtering out hits.
650 This is the value of the -hit_filter parameter supplied to new().
651 Hits that fail to pass the filter at HIT level are skipped.
652 Returns : Function ref (or undef if not set)
653 Argument : Function ref (when setting)
654 Throws : Bio::Root::BadParameter exception if the supplied argument is
655 : not a function reference.
657 =cut
659 sub hit_filter {
660 my $self = shift;
661 if (@_) {
662 my $func = shift;
663 if(not ref $func eq 'CODE') {
664 $self->throw(-class => 'Bio::Root::BadParameter',
665 -text => "Not a function reference: $func\n"
666 . "The -hit_filter parameter must be function reference.",
667 -value => $func);
669 $self->{'_hit_filter'} = $func;
671 return $self->{'_hit_filter'};