Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Search / Result / BlastPullResult.pm
blob27606e10ab242d4b585b6e47b51bb7ccec6a2104
2 # BioPerl module for Bio::Search::Result::BlastPullResult
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sendu Bala <bix@sendu.me.uk>
8 # Copyright Sendu Bala
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::Result::BlastPullResult - A parser and result object for BLASTN
17 results
19 =head1 SYNOPSIS
21 # generally we use Bio::SearchIO to build these objects
22 use Bio::SearchIO;
23 my $in = Bio::SearchIO->new(-format => 'blast_pull',
24 -file => 'result.blast');
26 while (my $result = $in->next_result) {
27 print $result->query_name, " ", $result->algorithm, " ", $result->num_hits(), " hits\n";
30 =head1 DESCRIPTION
32 This object implements a parser for NCBI BLASTN result output.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Support
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 reponsive experts will be able look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
56 =head2 Reporting Bugs
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 of the bugs and their resolution. Bug reports can be submitted via the
60 web:
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Sendu Bala
66 Email bix@sendu.me.uk
68 =head1 CONTRIBUTORS
70 Additional contributors names and emails here
72 =head1 APPENDIX
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
77 =cut
79 # Let the code begin...
81 package Bio::Search::Result::BlastPullResult;
83 use strict;
85 use Bio::Search::Hit::BlastPullHit;
87 use base qw(Bio::Root::Root Bio::Search::Result::PullResultI);
89 =head2 new
91 Title : new
92 Usage : my $obj = Bio::SearchIO::Result::hmmpfam->new();
93 Function: Builds a new Bio::SearchIO::Result::hmmpfam object
94 Returns : Bio::SearchIO::Result::hmmpfam
95 Args : -chunk => [Bio::Root::IO, $start, $end] (required if no -parent)
96 -parent => Bio::PullParserI object (required if no -chunk)
97 -parameters => hash ref of search parameters (key => value), optional
98 -statistics => hash ref of search statistics (key => value), optional
100 where the array ref provided to -chunk contains an IO object
101 for a filehandle to something representing the raw data of the
102 result, and $start and $end define the tell() position within the
103 filehandle that the result data starts and ends (optional; defaults
104 to start and end of the entire thing described by the filehandle)
106 =cut
108 sub new {
109 my ($class, @args) = @_;
110 my $self = $class->SUPER::new(@args);
112 $self->_setup(@args);
114 foreach my $field (qw( header hit_table hsp_table alignments next_model
115 models query_length stats_and_params)) {
116 $self->_fields->{$field} = undef;
119 $self->_dependencies( { ( query_name => 'header',
120 query_accession => 'header',
121 query_description => 'header',
122 query_length => 'header',
123 hit_table => 'header',
124 num_hits => 'hit_table',
125 no_hits_found => 'hit_table' ) } );
127 return $self;
131 # PullParserI discovery methods so we can answer all ResultI questions
134 sub _discover_header {
135 my $self = shift;
136 $self->_chunk_seek(0);
137 my $header = $self->_get_chunk_by_end("Value\n");
138 if (!$header) {
139 $header = $self->_get_chunk_by_end("***** No hits found ******\n");
140 $self->{_no_hits_found} = 1;
142 $self->throw("Invalid header returned") unless $header;
143 $self->{_after_header} = $self->_chunk_tell;
145 ($self->_fields->{query_name}) = $header =~ /^\s*(\S+)/;
146 $self->_fields->{query_accession} = '';
147 $self->_fields->{query_description} = '';
149 if ($header =~ /^Length=(\d+)/m) {
150 $self->_fields->{query_length} = $1;
152 elsif ($header =~ /\((\d+) letters\)/) {
153 # older form?
154 $self->_fields->{query_length} = $1;
156 if ($header =~ /^\s*\(\d+ letters/) {
157 # there wasn't a query sequence name
158 $self->_fields->{query_name} = '';
162 $self->_fields->{header} = 1;
165 sub _discover_hit_table {
166 my $self = shift;
167 $self->_chunk_seek($self->{_after_header});
169 my $table = $self->_get_chunk_by_end("\n>");
171 if (!$table && !$self->{_no_hits_found}) {
172 # no alignments, presumably; hit table comprises the remainder of this
173 # result
174 while (my $line = $self->_get_chunk_by_nol(1)) {
175 $table .= $line;
179 $table ||= '';
181 $self->{_after_hit_table} = $self->_chunk_tell;
183 my $evalue_cutoff = $self->get_field('evalue_cutoff');
184 undef $evalue_cutoff if $evalue_cutoff eq '[unset]';
185 my $score_cutoff = $self->get_field('score_cutoff');
186 undef $score_cutoff if $score_cutoff eq '[unset]';
188 my @table;
189 my $no_hit = 1;
190 while ($table =~ /^(\S+)\s+(\S.*?)?\s+(\S+)\s+([\de]\S*)\s*\n/gm) {
191 $no_hit = 0;
192 my ($name, $desc, $score, $evalue) = ($1, $2, $3, $4);
193 $desc ||= '';
194 if ($evalue =~ /^e/) {
195 $evalue = '1'.$evalue;
197 next if ($evalue_cutoff && $evalue > $evalue_cutoff);
198 next if ($score_cutoff && $score < $score_cutoff);
199 push(@table, [$name, $desc, $score, $evalue]);
201 $self->_fields->{hit_table} = \@table;
202 $self->{_next_hit_index} = @table > 0 ? 0 : -1;
204 $self->_fields->{no_hits_found} = $no_hit;
205 $self->_fields->{num_hits} = @table;
208 sub _discover_next_hit {
209 my $self = shift;
210 my $hit_table = $self->get_field('hit_table');
211 return if $self->{_next_hit_index} == -1;
212 my $hit_row = ${$hit_table}[$self->{_next_hit_index}];
214 $self->_chunk_seek($self->{_end_of_previous_hit} || $self->{_after_hit_table});
216 my ($start, $end) = $self->_find_chunk_by_end("\n>");
217 unless ($end) {
218 $start = $self->{_end_of_previous_hit} || $self->{_after_hit_table};
219 $end = $self->_chunk_true_end;
221 else {
222 $end += $self->_chunk_true_start;
224 $start += $self->_chunk_true_start;
226 $self->{_end_of_previous_hit} = $end - $self->_chunk_true_start;
228 #*** needs to inherit piped_behaviour, and we need to deal with _sequential
229 # ourselves
230 $self->_fields->{next_hit} = Bio::Search::Hit::BlastPullHit->new(-parent => $self,
231 -chunk => [$self->chunk, $start, $end],
232 -hit_data => $hit_row);
234 $self->{_next_hit_index}++;
236 if ($self->{_next_hit_index} > $#{$hit_table}) {
237 $self->{_next_hit_index} = -1;
241 sub _discover_stats_and_params {
242 my $self = shift;
243 $self->_chunk_seek(0);
244 my ($start, $end) = $self->_find_chunk_by_end("\n Database: ");
245 $self->_chunk_seek($end);
247 my $gapped = 0;
248 while ($self->_get_chunk_by_nol(1)) {
249 if (/Number of letters in database:\s+(\S+)/) {
250 my $stat = $1;
251 $stat =~ s/,//g;
252 $self->add_statistic('dbletters', $stat);
254 elsif (/Number of sequences in database:\s+(\S+)/) {
255 my $stat = $1;
256 $stat =~ s/,//g;
257 $self->add_statistic('dbentries', $stat);
259 elsif (/^Lambda/) {
260 my $line = $self->_get_chunk_by_nol(1);
261 $line =~ /\s+(\S+)\s+(\S+)\s+(\S+)/;
262 $self->add_statistic($gapped ? 'lambda_gapped' : 'lambda', $1);
263 $self->add_statistic($gapped ? 'kappa_gapped' : 'kappa', $2);
264 $self->add_statistic($gapped ? 'entropy_gapped' : 'entropy', $3);
265 $gapped = 1;
267 elsif (/^Matrix: (.+)\n/) {
268 $self->add_parameter('matrix', $1);
270 elsif (/^Gap Penalties: Existence: (\d+), Extension: (\d+)/) {
271 $self->add_parameter('gapopen', $1);
272 $self->add_parameter('gapext', $2);
274 elsif (/^Number of Hits to DB: (\d+)/) {
275 $self->add_statistic('Hits_to_DB', $1);
277 elsif (/^Number of extensions: (\d+)/) {
278 $self->add_statistic('num_extensions', $1);
280 elsif (/^Number of successful extensions: (\d+)/) {
281 $self->add_statistic('num_successful_extensions', $1);
283 elsif (/^Number of sequences better than (\S+):/) {
284 $self->add_parameter('expect', $1);
286 elsif (/^[Ll]ength of query: (\d+)/) {
287 $self->add_statistic('querylength', $1);
289 elsif (/^[Ee]ffective HSP length: (\d+)/) {
290 $self->add_statistic('effective_hsplength', $1);
292 elsif (/^[Ee]ffective length of database: (\d+)/) {
293 $self->add_statistic('effectivedblength', $1);
295 elsif (/^[Ee]ffective search space: (\d+)/) {
296 $self->add_statistic('effectivespace', $1);
298 elsif (/^[Ee]ffective search space used: (\d+)/) {
299 $self->add_statistic('effectivespaceused', $1);
301 elsif (/^([TAXS]\d?): (\d+)(?: \((\S+))?/) {
302 $self->add_statistic($1, $2);
303 $self->add_statistic($1.'_bits', $3) if $3;
307 $self->_fields->{stats_and_params} = 1;
310 =head2 next_hit
312 Title : next_hit
313 Usage : while( $hit = $result->next_hit()) { ... }
314 Function: Returns the next available Hit object, representing potential
315 matches between the query and various entities from the database.
316 Returns : a Bio::Search::Hit::HitI object or undef if there are no more.
317 Args : none
319 =cut
321 sub next_hit {
322 my $self = shift;
323 my $hit = $self->get_field('next_hit');
324 undef $self->_fields->{next_hit};
325 return $hit;
328 =head2 hits
330 Title : hits
331 Usage : my @hits = $result->hits
332 Function: Returns the HitI objects contained within this Result
333 Returns : Array of Bio::Search::Hit::HitI objects
334 Args : none
336 See Also: L<Bio::Search::Hit::HitI>
338 =cut
340 sub hits {
341 my $self = shift;
342 my $old = $self->{_next_hit_index} || 0;
343 $self->rewind;
344 my @hits;
345 while (defined(my $hit = $self->next_hit)) {
346 push(@hits, $hit);
348 $self->{_next_hit_index} = @hits > 0 ? $old : -1;
349 return @hits;
352 =head2 sort_hits
354 Title : sort_hits
355 Usage : $result->sort_hits('<score')
356 Function : Sorts the hits so that they come out in the desired order when
357 hits() or next_hit() is called.
358 Returns : n/a
359 Args : A coderef for the sort function. See the documentation on the Perl
360 sort() function for guidelines on writing sort functions.
361 By default the sort order is ascending significance value (ie.
362 most significant hits first).
363 *** example
365 =cut
367 sub sort_hits {
368 my ($self, $code_ref) = @_;
369 $self->throw("Not implemented yet");
372 =head2 rewind
374 Title : rewind
375 Usage : $result->rewind;
376 Function: Allow one to reset the Hit iterator to the beginning, so that
377 next_hit() will subsequently return the first hit and so on.
378 Returns : n/a
379 Args : none
381 =cut
383 sub rewind {
384 my $self = shift;
385 unless ($self->_fields->{hit_table}) {
386 $self->get_field('hit_table');
388 $self->{_next_hit_index} = @{$self->_fields->{hit_table}} > 0 ? 0 : -1;
391 =head2 get_statistic
393 Title : get_statistic
394 Usage : my $gap_ext = $result->get_statistic('kappa')
395 Function: Returns the value for a specific statistic available
396 from this result
397 Returns : string
398 Args : name of statistic (string)
400 =cut
402 sub get_statistic {
403 my $self = shift;
404 $self->get_field('stats_and_params');
405 return $self->SUPER::get_statistic(@_);
408 =head2 get_parameter
410 Title : get_parameter
411 Usage : my $gap_ext = $result->get_parameter('gapext')
412 Function: Returns the value for a specific parameter used
413 when running this result
414 Returns : string
415 Args : name of parameter (string)
417 =cut
419 sub get_parameter {
420 my $self = shift;
421 $self->get_field('stats_and_params');
422 return $self->SUPER::get_parameter(@_);