2 # BioPerl module for Bio::Search::HSP::BlastPullHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sendu Bala <bix@sendu.me.uk>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Search::HSP::BlastPullHSP - A parser and HSP object for BlastN hsps
20 # generally we use Bio::SearchIO to build these objects
22 my $in = Bio::SearchIO->new(-format => 'hmmer_pull',
23 -file => 'result.blast');
25 while (my $result = $in->next_result) {
26 while (my $hit = $result->next_hit) {
27 print $hit->name, "\n";
28 print $hit->score, "\n";
29 print $hit->significance, "\n";
31 while (my $hsp = $hit->next_hsp) {
32 # process HSPI objects
39 This object implements a parser for BlastN hsp output.
45 User feedback is an integral part of the evolution of this and other
46 Bioperl modules. Send your comments and suggestions preferably to
47 the Bioperl mailing list. Your participation is much appreciated.
49 bioperl-l@bioperl.org - General discussion
50 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 Please direct usage questions or support issues to the mailing list:
56 I<bioperl-l@bioperl.org>
58 rather than to the module maintainer directly. Many experienced and
59 reponsive experts will be able look at the problem and quickly
60 address it. Please include a thorough description of the problem
61 with code and data examples if at all possible.
65 Report bugs to the Bioperl bug tracking system to help us keep track
66 of the bugs and their resolution. Bug reports can be submitted via the
69 https://github.com/bioperl/bioperl-live/issues
71 =head1 AUTHOR - Sendu Bala
77 The rest of the documentation details each of the object methods.
78 Internal methods are usually preceded with a _
82 # Let the code begin...
84 package Bio
::Search
::HSP
::BlastPullHSP
;
87 use base
qw(Bio::Search::HSP::PullHSPI);
92 Usage : my $obj = Bio::Search::HSP::BlastNHSP->new();
93 Function: Builds a new Bio::Search::HSP::BlastNHSP object.
94 Returns : Bio::Search::HSP::BlastNHSP
95 Args : -chunk => [Bio::Root::IO, $start, $end] (required if no -parent)
96 -parent => Bio::PullParserI object (required if no -chunk)
98 where the array ref provided to -chunk contains an IO object
99 for a filehandle to something representing the raw data of the
100 hsp, and $start and $end define the tell() position within the
101 filehandle that the hsp data starts and ends (optional; defaults
102 to start and end of the entire thing described by the filehandle)
107 my ($class, @args) = @_;
108 my $self = $class->SUPER::new
(@args);
110 $self->_setup(@args);
112 my $fields = $self->_fields;
113 foreach my $field (qw( header alignment query_strand hit_strand )) {
114 $fields->{$field} = undef;
117 $self->_dependencies( { ( score
=> 'header',
120 total_gaps
=> 'header',
121 query_strand
=> 'header',
122 hit_strand
=> 'header',
123 alignment
=> 'header',
124 query_string
=> 'alignment',
125 hit_string
=> 'alignment',
126 homology_string
=> 'alignment',
127 query_start
=> 'alignment',
128 query_end
=> 'alignment',
129 hit_start
=> 'alignment',
130 hit_end
=> 'alignment',
131 hit_identical_inds
=> 'seq_inds',
132 hit_conserved_inds
=> 'seq_inds',
133 hit_nomatch_inds
=> 'seq_inds',
134 hit_gap_inds
=> 'seq_inds',
135 query_identical_inds
=> 'seq_inds',
136 query_conserved_inds
=> 'seq_inds',
137 query_nomatch_inds
=> 'seq_inds',
138 query_gap_inds
=> 'seq_inds' ) } );
144 # PullParserI discovery methods so we can answer all HitI questions
147 sub _discover_header
{
149 $self->_chunk_seek(0);
150 my $header = $self->_get_chunk_by_end("\nQuery");
151 $self->{_after_header
} = $self->_chunk_tell;
153 ($self->_fields->{bits
}, $self->_fields->{score
}, $self->_fields->{evalue
},
154 $self->_fields->{total_gaps
}, $self->_fields->{query_strand
}, $self->_fields->{hit_strand
})
155 = $header =~ /^\s*(\S+) bits \((\d+)\),\s+Expect = (\S+)(?:\s+.+Gaps = (\d+))?(?:.+Strand\s*=\s*(\w+)\s*\/\s
*(\w
+))?
/sm
;
157 if ($self->_fields->{query_strand
}) {
158 # protein blasts don't have strand
159 for my $strand_type ('query_strand', 'hit_strand') {
160 $self->_fields->{$strand_type} = $self->_fields->{$strand_type} eq 'Plus' ?
1 : -1;
164 $self->_fields->{query_strand
} = 0;
165 $self->_fields->{hit_strand
} = 0;
168 if ($self->_fields->{evalue
} =~ /^e/) {
169 $self->_fields->{evalue
} = '1'.$self->_fields->{evalue
};
172 # query_gaps isn't always given
173 $self->_fields->{total_gaps
} = '[unset]' unless $self->_fields->{total_gaps
};
175 $self->_fields->{header
} = 1;
178 sub _discover_alignment
{
180 $self->_chunk_seek($self->{_after_header
});
182 # work out various basic fields for the hsp
183 # (quicker to do this all at once instead of each method working on
185 my ($query_string, $hit_string, $homology_string, $q_start, $h_start, $q_end, $h_end);
186 while (my $strip = $self->_get_chunk_by_end("\nQuery") || $self->_get_chunk_by_nol(4)) {
187 $strip =~ /\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S.+)\nSbjct:?\s+(\d+)\s+(\S+)\s+(\d+)/gm || last;
196 $hom = ' 'x
(length($6) - length($hom)).$hom;
197 $homology_string .= $hom;
199 for my $q ($q1, $q2) {
200 if (! defined $q_start || $q < $q_start) {
203 if (! defined $q_end || $q > $q_end) {
207 for my $h ($h1, $h2) {
208 if (! defined $h_start || $h < $h_start) {
211 if (! defined $h_end || $h > $h_end) {
217 $self->_fields->{query_string
} = $query_string;
218 $self->_fields->{hit_string
} = $hit_string;
219 $self->_fields->{homology_string
} = $homology_string;
221 $self->_fields->{query_start
} = $q_start;
222 $self->_fields->{query_end
} = $q_end;
223 $self->_fields->{hit_start
} = $h_start;
224 $self->_fields->{hit_end
} = $h_end;
226 ($self->{_query_gaps
}) = $query_string =~ tr/-//;
227 ($self->{_hit_gaps
}) = $hit_string =~ tr/-//;
228 ($self->{_total_gaps
}) = $self->{_query_gaps
} + $self->{_hit_gaps
};
230 $self->_fields->{alignment
} = 1; # stop this method being called again
233 # seq_inds related methods, all just need seq_inds field to have been gotten
234 sub _discover_seq_inds
{
236 my ($seqString, $qseq, $sseq) = ( $self->get_field('homology_string'),
237 $self->get_field('query_string'),
238 $self->get_field('hit_string') );
240 # (code largely lifted from GenericHSP)
242 # Using hashes to avoid saving duplicate residue numbers.
243 my %identicalList_query = ();
244 my %identicalList_sbjct = ();
245 my %conservedList_query = ();
246 my %conservedList_sbjct = ();
247 my @gapList_query = ();
248 my @gapList_sbjct = ();
249 my %nomatchList_query = ();
250 my %nomatchList_sbjct = ();
252 my $resCount_query = $self->get_field('query_end');
253 my $resCount_sbjct = $self->get_field('hit_end');
255 my ($mchar, $schar, $qchar);
256 while ($mchar = chop($seqString) ) {
257 ($qchar, $schar) = (chop($qseq), chop($sseq));
259 if ($mchar eq '+' || $mchar eq '.' || $mchar eq ':') {
260 $conservedList_query{ $resCount_query } = 1;
261 $conservedList_sbjct{ $resCount_sbjct } = 1;
263 elsif ($mchar eq ' ') {
264 $nomatchList_query{ $resCount_query } = 1;
265 $nomatchList_sbjct{ $resCount_sbjct } = 1;
268 $identicalList_query{ $resCount_query } = 1;
269 $identicalList_sbjct{ $resCount_sbjct } = 1;
273 push(@gapList_query, $resCount_query);
276 $resCount_query -= 1;
279 push(@gapList_sbjct, $resCount_sbjct);
282 $resCount_sbjct -= 1;
286 my $fields = $self->_fields;
287 $fields->{hit_identical_inds
} = [ sort { $a <=> $b } keys %identicalList_sbjct ];
288 $fields->{hit_conserved_inds
} = [ sort { $a <=> $b } keys %conservedList_sbjct ];
289 $fields->{hit_nomatch_inds
} = [ sort { $a <=> $b } keys %nomatchList_sbjct ];
290 $fields->{hit_gap_inds
} = [ reverse @gapList_sbjct ];
291 $fields->{query_identical_inds
} = [ sort { $a <=> $b } keys %identicalList_query ];
292 $fields->{query_conserved_inds
} = [ sort { $a <=> $b } keys %conservedList_query ];
293 $fields->{query_nomatch_inds
} = [ sort { $a <=> $b } keys %nomatchList_query ];
294 $fields->{query_gap_inds
} = [ reverse @gapList_query ];
296 $fields->{seq_inds
} = 1;
302 Usage : my $query = $hsp->query
303 Function: Returns a SeqFeature representing the query in the HSP
304 Returns : L<Bio::SeqFeature::Similarity>
311 unless ($self->{_created_query
}) {
312 $self->SUPER::query
( new Bio
::SeqFeature
::Similarity
313 ('-primary' => $self->primary_tag,
314 '-start' => $self->get_field('query_start'),
315 '-end' => $self->get_field('query_end'),
316 '-expect' => $self->get_field('evalue'),
317 '-score' => $self->get_field('score'),
318 '-strand' => $self->get_field('query_strand'),
319 '-seq_id' => $self->get_field('query_name'),
320 '-seqlength'=> $self->get_field('query_length'),
321 '-source' => $self->get_field('algorithm'),
322 '-seqdesc' => $self->get_field('query_description'),
323 '-frame' => 0 # not known?
325 $self->{_created_query
} = 1;
327 return $self->SUPER::query
(@_);
333 Usage : my $hit = $hsp->hit
334 Function: Returns a SeqFeature representing the hit in the HSP
335 Returns : L<Bio::SeqFeature::Similarity>
336 Args : [optional] new value to set
342 unless ($self->{_created_hit
}) {
343 $self->SUPER::hit
( new Bio
::SeqFeature
::Similarity
344 ('-primary' => $self->primary_tag,
345 '-start' => $self->get_field('hit_start'),
346 '-end' => $self->get_field('hit_end'),
347 '-expect' => $self->get_field('evalue'),
348 '-score' => $self->get_field('score'),
349 '-strand' => $self->get_field('hit_strand'),
350 '-seq_id' => $self->get_field('name'),
351 '-seqlength'=> $self->get_field('length'),
352 '-source' => $self->get_field('algorithm'),
353 '-seqdesc' => $self->get_field('description'),
354 '-frame' => 0 # not known?
356 $self->{_created_hit
} = 1;
358 return $self->SUPER::hit
(@_);
364 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
365 Function : Get the number of gap characters in the query, hit, or total alignment.
366 Returns : Integer, number of gap characters or 0 if none
367 Args : 'query' = num conserved / length of query seq (without gaps)
368 'hit' = num conserved / length of hit seq (without gaps)
369 'total' = num conserved / length of alignment (with gaps)
375 my ($self, $type) = @_;
377 $type = lc $type if defined $type;
378 $type = 'total' if (! defined $type || $type eq 'hsp' || $type !~ /query|hit|subject|sbjct|total/);
379 $type = 'hit' if $type =~ /sbjct|subject/;
381 if ($type eq 'total') {
382 my $answer = $self->get_field('total_gaps');
383 return $answer unless $answer eq '[unset]';
386 $self->get_field('alignment'); # make sure gaps have been calculated
388 return $self->{'_'.$type.'_gaps'};
394 Usage : $hsp->strand('query')
395 Function: Retrieves the strand for the HSP component requested
396 Returns : +1 or -1 (0 if unknown)
397 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject
398 'query' to retrieve the query strand (default)
399 'list' or 'array' to retrieve both query and hit together
406 $val = 'query' unless defined $val;
410 return $self->get_field('query_strand');
412 elsif ($val =~ /^hi|^s/i) {
413 return $self->get_field('hit_strand');
415 elsif ($val =~ /^list|array/i) {
416 return ($self->get_field('query_strand'), $self->get_field('hit_strand'));
419 $self->warn("unrecognized component '$val' requested\n");
427 Usage : $hsp->start('query')
428 Function: Retrieves the start for the HSP component requested
429 Returns : integer, or list of two integers (query start and subject start) in
431 Args : 'hit' or 'subject' or 'sbjct' to retrieve the start of the subject
432 'query' to retrieve the query start (default)
439 $val = (wantarray ?
'list' : 'query') unless defined $val;
443 return $self->get_field('query_start');
445 elsif ($val =~ /^(hi|s)/i) {
446 return $self->get_field('hit_start');
448 elsif ($val =~ /^list|array/i) {
449 return ($self->get_field('query_start'), $self->get_field('hit_start') );
452 $self->warn("unrecognized component '$val' requested\n");
460 Usage : $hsp->end('query')
461 Function: Retrieves the end for the HSP component requested
462 Returns : integer, or list of two integers (query end and subject end) in
464 Args : 'hit' or 'subject' or 'sbjct' to retrieve the end of the subject
465 'query' to retrieve the query end (default)
472 $val = (wantarray ?
'list' : 'query') unless defined $val;
476 return $self->get_field('query_end');
478 elsif ($val =~ /^(hi|s)/i) {
479 return $self->get_field('hit_end');
481 elsif ($val =~ /^list|array/i) {
482 return ($self->get_field('query_end'), $self->get_field('hit_end'));
485 $self->warn("unrecognized end component '$val' requested\n");
493 Usage : my $pvalue = $hsp->pvalue();
494 Function: Returns the P-value for this HSP
495 Returns : undef (Hmmpfam reports do not have p-values)