t/SeqFeature/Generic.t: fix typo on required module for testing
[bioperl-live.git] / lib / Bio / Search / HSP / BlastPullHSP.pm
blob9eac7019eebf78a0fa91f4bde75e58b86c6e6ffb
2 # BioPerl module for Bio::Search::HSP::BlastPullHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sendu Bala <bix@sendu.me.uk>
8 # Copyright Sendu Bala
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::HSP::BlastPullHSP - A parser and HSP object for BlastN hsps
18 =head1 SYNOPSIS
20 # generally we use Bio::SearchIO to build these objects
21 use Bio::SearchIO;
22 my $in = Bio::SearchIO->new(-format => 'hmmer_pull',
23 -file => 'result.blast');
25 while (my $result = $in->next_result) {
26 while (my $hit = $result->next_hit) {
27 print $hit->name, "\n";
28 print $hit->score, "\n";
29 print $hit->significance, "\n";
31 while (my $hsp = $hit->next_hsp) {
32 # process HSPI objects
37 =head1 DESCRIPTION
39 This object implements a parser for BlastN hsp output.
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to
47 the Bioperl mailing list. Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52 =head2 Support
54 Please direct usage questions or support issues to the mailing list:
56 I<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
63 =head2 Reporting Bugs
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 of the bugs and their resolution. Bug reports can be submitted via the
67 web:
69 https://github.com/bioperl/bioperl-live/issues
71 =head1 AUTHOR - Sendu Bala
73 Email bix@sendu.me.uk
75 =head1 APPENDIX
77 The rest of the documentation details each of the object methods.
78 Internal methods are usually preceded with a _
80 =cut
82 # Let the code begin...
84 package Bio::Search::HSP::BlastPullHSP;
86 use strict;
87 use base qw(Bio::Search::HSP::PullHSPI);
89 =head2 new
91 Title : new
92 Usage : my $obj = Bio::Search::HSP::BlastNHSP->new();
93 Function: Builds a new Bio::Search::HSP::BlastNHSP object.
94 Returns : Bio::Search::HSP::BlastNHSP
95 Args : -chunk => [Bio::Root::IO, $start, $end] (required if no -parent)
96 -parent => Bio::PullParserI object (required if no -chunk)
98 where the array ref provided to -chunk contains an IO object
99 for a filehandle to something representing the raw data of the
100 hsp, and $start and $end define the tell() position within the
101 filehandle that the hsp data starts and ends (optional; defaults
102 to start and end of the entire thing described by the filehandle)
104 =cut
106 sub new {
107 my ($class, @args) = @_;
108 my $self = $class->SUPER::new(@args);
110 $self->_setup(@args);
112 my $fields = $self->_fields;
113 foreach my $field (qw( header alignment query_strand hit_strand )) {
114 $fields->{$field} = undef;
117 $self->_dependencies( { ( score => 'header',
118 bits => 'header',
119 evalue => 'header',
120 total_gaps => 'header',
121 query_strand => 'header',
122 hit_strand => 'header',
123 alignment => 'header',
124 query_string => 'alignment',
125 hit_string => 'alignment',
126 homology_string => 'alignment',
127 query_start => 'alignment',
128 query_end => 'alignment',
129 hit_start => 'alignment',
130 hit_end => 'alignment',
131 hit_identical_inds => 'seq_inds',
132 hit_conserved_inds => 'seq_inds',
133 hit_nomatch_inds => 'seq_inds',
134 hit_gap_inds => 'seq_inds',
135 query_identical_inds => 'seq_inds',
136 query_conserved_inds => 'seq_inds',
137 query_nomatch_inds => 'seq_inds',
138 query_gap_inds => 'seq_inds' ) } );
140 return $self;
144 # PullParserI discovery methods so we can answer all HitI questions
147 sub _discover_header {
148 my $self = shift;
149 $self->_chunk_seek(0);
150 my $header = $self->_get_chunk_by_end("\nQuery");
151 $self->{_after_header} = $self->_chunk_tell;
153 ($self->_fields->{bits}, $self->_fields->{score}, $self->_fields->{evalue},
154 $self->_fields->{total_gaps}, $self->_fields->{query_strand}, $self->_fields->{hit_strand})
155 = $header =~ /^\s*(\S+) bits \((\d+)\),\s+Expect = (\S+)(?:\s+.+Gaps = (\d+))?(?:.+Strand\s*=\s*(\w+)\s*\/\s*(\w+))?/sm;
157 if ($self->_fields->{query_strand}) {
158 # protein blasts don't have strand
159 for my $strand_type ('query_strand', 'hit_strand') {
160 $self->_fields->{$strand_type} = $self->_fields->{$strand_type} eq 'Plus' ? 1 : -1;
163 else {
164 $self->_fields->{query_strand} = 0;
165 $self->_fields->{hit_strand} = 0;
168 if ($self->_fields->{evalue} =~ /^e/) {
169 $self->_fields->{evalue} = '1'.$self->_fields->{evalue};
172 # query_gaps isn't always given
173 $self->_fields->{total_gaps} = '[unset]' unless $self->_fields->{total_gaps};
175 $self->_fields->{header} = 1;
178 sub _discover_alignment {
179 my $self = shift;
180 $self->_chunk_seek($self->{_after_header});
182 # work out various basic fields for the hsp
183 # (quicker to do this all at once instead of each method working on
184 # alignment itself)
185 my ($query_string, $hit_string, $homology_string, $q_start, $h_start, $q_end, $h_end);
186 while (my $strip = $self->_get_chunk_by_end("\nQuery") || $self->_get_chunk_by_nol(4)) {
187 $strip =~ /\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S.+)\nSbjct:?\s+(\d+)\s+(\S+)\s+(\d+)/gm || last;
188 my $q1 = $1;
189 $query_string .= $2;
190 my $q2 = $3;
191 my $hom = $4;
192 my $h1 = $5;
193 $hit_string .= $6;
194 my $h2 = $7;
196 $hom = ' 'x(length($6) - length($hom)).$hom;
197 $homology_string .= $hom;
199 for my $q ($q1, $q2) {
200 if (! defined $q_start || $q < $q_start) {
201 $q_start = $q;
203 if (! defined $q_end || $q > $q_end) {
204 $q_end = $q;
207 for my $h ($h1, $h2) {
208 if (! defined $h_start || $h < $h_start) {
209 $h_start = $h;
211 if (! defined $h_end || $h > $h_end) {
212 $h_end = $h;
217 $self->_fields->{query_string} = $query_string;
218 $self->_fields->{hit_string} = $hit_string;
219 $self->_fields->{homology_string} = $homology_string;
221 $self->_fields->{query_start} = $q_start;
222 $self->_fields->{query_end} = $q_end;
223 $self->_fields->{hit_start} = $h_start;
224 $self->_fields->{hit_end} = $h_end;
226 ($self->{_query_gaps}) = $query_string =~ tr/-//;
227 ($self->{_hit_gaps}) = $hit_string =~ tr/-//;
228 ($self->{_total_gaps}) = $self->{_query_gaps} + $self->{_hit_gaps};
230 $self->_fields->{alignment} = 1; # stop this method being called again
233 # seq_inds related methods, all just need seq_inds field to have been gotten
234 sub _discover_seq_inds {
235 my $self = shift;
236 my ($seqString, $qseq, $sseq) = ( $self->get_field('homology_string'),
237 $self->get_field('query_string'),
238 $self->get_field('hit_string') );
240 # (code largely lifted from GenericHSP)
242 # Using hashes to avoid saving duplicate residue numbers.
243 my %identicalList_query = ();
244 my %identicalList_sbjct = ();
245 my %conservedList_query = ();
246 my %conservedList_sbjct = ();
247 my @gapList_query = ();
248 my @gapList_sbjct = ();
249 my %nomatchList_query = ();
250 my %nomatchList_sbjct = ();
252 my $resCount_query = $self->get_field('query_end');
253 my $resCount_sbjct = $self->get_field('hit_end');
255 my ($mchar, $schar, $qchar);
256 while ($mchar = chop($seqString) ) {
257 ($qchar, $schar) = (chop($qseq), chop($sseq));
259 if ($mchar eq '+' || $mchar eq '.' || $mchar eq ':') {
260 $conservedList_query{ $resCount_query } = 1;
261 $conservedList_sbjct{ $resCount_sbjct } = 1;
263 elsif ($mchar eq ' ') {
264 $nomatchList_query{ $resCount_query } = 1;
265 $nomatchList_sbjct{ $resCount_sbjct } = 1;
267 else {
268 $identicalList_query{ $resCount_query } = 1;
269 $identicalList_sbjct{ $resCount_sbjct } = 1;
272 if ($qchar eq '-') {
273 push(@gapList_query, $resCount_query);
275 else {
276 $resCount_query -= 1;
278 if ($schar eq '-') {
279 push(@gapList_sbjct, $resCount_sbjct);
281 else {
282 $resCount_sbjct -= 1;
286 my $fields = $self->_fields;
287 $fields->{hit_identical_inds} = [ sort { $a <=> $b } keys %identicalList_sbjct ];
288 $fields->{hit_conserved_inds} = [ sort { $a <=> $b } keys %conservedList_sbjct ];
289 $fields->{hit_nomatch_inds} = [ sort { $a <=> $b } keys %nomatchList_sbjct ];
290 $fields->{hit_gap_inds} = [ reverse @gapList_sbjct ];
291 $fields->{query_identical_inds} = [ sort { $a <=> $b } keys %identicalList_query ];
292 $fields->{query_conserved_inds} = [ sort { $a <=> $b } keys %conservedList_query ];
293 $fields->{query_nomatch_inds} = [ sort { $a <=> $b } keys %nomatchList_query ];
294 $fields->{query_gap_inds} = [ reverse @gapList_query ];
296 $fields->{seq_inds} = 1;
299 =head2 query
301 Title : query
302 Usage : my $query = $hsp->query
303 Function: Returns a SeqFeature representing the query in the HSP
304 Returns : L<Bio::SeqFeature::Similarity>
305 Args : none
307 =cut
309 sub query {
310 my $self = shift;
311 unless ($self->{_created_query}) {
312 $self->SUPER::query( new Bio::SeqFeature::Similarity
313 ('-primary' => $self->primary_tag,
314 '-start' => $self->get_field('query_start'),
315 '-end' => $self->get_field('query_end'),
316 '-expect' => $self->get_field('evalue'),
317 '-score' => $self->get_field('score'),
318 '-strand' => $self->get_field('query_strand'),
319 '-seq_id' => $self->get_field('query_name'),
320 '-seqlength'=> $self->get_field('query_length'),
321 '-source' => $self->get_field('algorithm'),
322 '-seqdesc' => $self->get_field('query_description'),
323 '-frame' => 0 # not known?
324 ) );
325 $self->{_created_query} = 1;
327 return $self->SUPER::query(@_);
330 =head2 hit
332 Title : hit
333 Usage : my $hit = $hsp->hit
334 Function: Returns a SeqFeature representing the hit in the HSP
335 Returns : L<Bio::SeqFeature::Similarity>
336 Args : [optional] new value to set
338 =cut
340 sub hit {
341 my $self = shift;
342 unless ($self->{_created_hit}) {
343 $self->SUPER::hit( new Bio::SeqFeature::Similarity
344 ('-primary' => $self->primary_tag,
345 '-start' => $self->get_field('hit_start'),
346 '-end' => $self->get_field('hit_end'),
347 '-expect' => $self->get_field('evalue'),
348 '-score' => $self->get_field('score'),
349 '-strand' => $self->get_field('hit_strand'),
350 '-seq_id' => $self->get_field('name'),
351 '-seqlength'=> $self->get_field('length'),
352 '-source' => $self->get_field('algorithm'),
353 '-seqdesc' => $self->get_field('description'),
354 '-frame' => 0 # not known?
355 ) );
356 $self->{_created_hit} = 1;
358 return $self->SUPER::hit(@_);
361 =head2 gaps
363 Title : gaps
364 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
365 Function : Get the number of gap characters in the query, hit, or total alignment.
366 Returns : Integer, number of gap characters or 0 if none
367 Args : 'query' = num conserved / length of query seq (without gaps)
368 'hit' = num conserved / length of hit seq (without gaps)
369 'total' = num conserved / length of alignment (with gaps)
370 default = 'total'
372 =cut
374 sub gaps {
375 my ($self, $type) = @_;
377 $type = lc $type if defined $type;
378 $type = 'total' if (! defined $type || $type eq 'hsp' || $type !~ /query|hit|subject|sbjct|total/);
379 $type = 'hit' if $type =~ /sbjct|subject/;
381 if ($type eq 'total') {
382 my $answer = $self->get_field('total_gaps');
383 return $answer unless $answer eq '[unset]';
386 $self->get_field('alignment'); # make sure gaps have been calculated
388 return $self->{'_'.$type.'_gaps'};
391 =head2 strand
393 Title : strand
394 Usage : $hsp->strand('query')
395 Function: Retrieves the strand for the HSP component requested
396 Returns : +1 or -1 (0 if unknown)
397 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject
398 'query' to retrieve the query strand (default)
399 'list' or 'array' to retrieve both query and hit together
401 =cut
403 sub strand {
404 my $self = shift;
405 my $val = shift;
406 $val = 'query' unless defined $val;
407 $val =~ s/^\s+//;
409 if ($val =~ /^q/i) {
410 return $self->get_field('query_strand');
412 elsif ($val =~ /^hi|^s/i) {
413 return $self->get_field('hit_strand');
415 elsif ($val =~ /^list|array/i) {
416 return ($self->get_field('query_strand'), $self->get_field('hit_strand'));
418 else {
419 $self->warn("unrecognized component '$val' requested\n");
421 return 0;
424 =head2 start
426 Title : start
427 Usage : $hsp->start('query')
428 Function: Retrieves the start for the HSP component requested
429 Returns : integer, or list of two integers (query start and subject start) in
430 list context
431 Args : 'hit' or 'subject' or 'sbjct' to retrieve the start of the subject
432 'query' to retrieve the query start (default)
434 =cut
436 sub start {
437 my $self = shift;
438 my $val = shift;
439 $val = (wantarray ? 'list' : 'query') unless defined $val;
440 $val =~ s/^\s+//;
442 if ($val =~ /^q/i) {
443 return $self->get_field('query_start');
445 elsif ($val =~ /^(hi|s)/i) {
446 return $self->get_field('hit_start');
448 elsif ($val =~ /^list|array/i) {
449 return ($self->get_field('query_start'), $self->get_field('hit_start') );
451 else {
452 $self->warn("unrecognized component '$val' requested\n");
454 return 0;
457 =head2 end
459 Title : end
460 Usage : $hsp->end('query')
461 Function: Retrieves the end for the HSP component requested
462 Returns : integer, or list of two integers (query end and subject end) in
463 list context
464 Args : 'hit' or 'subject' or 'sbjct' to retrieve the end of the subject
465 'query' to retrieve the query end (default)
467 =cut
469 sub end {
470 my $self = shift;
471 my $val = shift;
472 $val = (wantarray ? 'list' : 'query') unless defined $val;
473 $val =~ s/^\s+//;
475 if ($val =~ /^q/i) {
476 return $self->get_field('query_end');
478 elsif ($val =~ /^(hi|s)/i) {
479 return $self->get_field('hit_end');
481 elsif ($val =~ /^list|array/i) {
482 return ($self->get_field('query_end'), $self->get_field('hit_end'));
484 else {
485 $self->warn("unrecognized end component '$val' requested\n");
487 return 0;
490 =head2 pvalue
492 Title : pvalue
493 Usage : my $pvalue = $hsp->pvalue();
494 Function: Returns the P-value for this HSP
495 Returns : undef (Hmmpfam reports do not have p-values)
496 Args : none
498 =cut
500 sub pvalue { }