Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / SearchIO / megablast.pm
blobf47602190cf5d2f25bbc267aaaec3378ec250542
2 # BioPerl module for Bio::SearchIO::megablast
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::megablast - a driver module for Bio::SearchIO to parse
17 megablast reports (format 0)
19 =head1 SYNOPSIS
21 # do not use this module directly
23 use Bio::SearchIO;
24 # for default format output from megablast
25 my $in = Bio::SearchIO->new(-file => 'file.mbl',
26 -format => 'megablast',
27 -report_format => 0);
29 while( my $r = $in->next_result ) {
30 while( my $hit = $r->next_hit ) {
31 while( my $hsp = $hit->next_hsp ) {
36 =head1 DESCRIPTION
38 Beware!
40 Because of the way megablast report format 0 is coded, realize that score
41 means # gap characters + # mismatches for a HSP.
43 The docs from NCBI regarding FORMAT 0
44 # 0: Produce one-line output for each alignment, in the form
46 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
48 # Here subject(query)-id is a gi number, an accession or some other type of
49 # identifier found in the FASTA definition line of the respective sequence.
51 # + or - corresponds to same or different strand alignment.
53 # Score for non-affine gapping parameters means the total number of
54 # differences (mismatches + gap characters). For affine case it is the
55 # actual (raw) score of the alignment.
57 FORMAT 1 parsing has not been implemented
58 FORMAT 2 parsing should work with the SearchIO 'blast' parser
60 =head1 FEEDBACK
62 =head2 Mailing Lists
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to
66 the Bioperl mailing list. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71 =head2 Support
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
82 =head2 Reporting Bugs
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 of the bugs and their resolution. Bug reports can be submitted via the
86 web:
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Jason Stajich
92 Email jason-at-bioperl.org
94 =head1 APPENDIX
96 The rest of the documentation details each of the object methods.
97 Internal methods are usually preceded with a _
99 =cut
102 # Let the code begin...
105 package Bio::SearchIO::megablast;
106 use strict;
107 use vars qw(%MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
109 use base qw(Bio::SearchIO);
111 BEGIN {
112 # mapping of MegaBlast terms to Bioperl hash keys
113 %MODEMAP = ('MegaBlastOutput' => 'result',
114 'Hit' => 'hit',
115 'Hsp' => 'hsp'
118 # This should really be done more intelligently, like with
119 # XSLT
121 %MAPPING =
123 'Hsp_query-from' => 'HSP-query_start',
124 'Hsp_query-to' => 'HSP-query_end',
125 'Hsp_hit-from' => 'HSP-hit_start',
126 'Hsp_hit-to' => 'HSP-hit_end',
127 'Hit_score' => 'HIT-score',
128 'Hsp_score' => 'HSP-score',
130 'Hsp_identity' => 'HSP-identical',
131 'Hsp_positive' => 'HSP-conserved',
133 'Hit_id' => 'HIT-name',
135 'MegaBlastOutput_program' => 'RESULT-algorithm_name',
136 'MegaBlastOutput_query-def'=> 'RESULT-query_name',
140 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
144 =head2 new
146 Title : new
147 Usage : my $obj = Bio::SearchIO::blast->new();
148 Function: Builds a new Bio::SearchIO::blast object
149 Returns : Bio::SearchIO::blast
150 Args : -fh/-file => filehandle/filename to BLAST file
151 -format => 'blast'
153 =cut
155 sub _initialize {
156 my ($self,@args) = @_;
157 $self->SUPER::_initialize(@args);
158 my ($fmt) = $self->_rearrange([qw(REPORT_FORMAT)], @args);
160 $self->throw("Must provide a value for -report_format when initializing a megablast parser") unless defined $fmt ;
161 $self->report_format($fmt);
162 return 1;
165 =head2 next_result
167 Title : next_result
168 Usage : my $hit = $searchio->next_result;
169 Function: Returns the next Result from a search
170 Returns : Bio::Search::Result::ResultI object
171 Args : none
173 =cut
175 sub next_result{
176 my ($self) = @_;
178 local $/ = "\n";
179 local $_;
181 my $fmt = $self->report_format;
182 my ($lastquery,$lasthit);
183 while( defined($_ = $self->_readline) ) {
184 if( $fmt == 0 ) {
185 if( /^\'(\S+)\'\=\=\'(\+|\-)(\S+)\'\s+
186 \((\d+)\s+(\d+)\s+(\d+)\s+(\d+)\)\s+
187 (\d+)/ox )
189 my ($hit,$strand,$query,
190 $h_start,$q_start,$h_end,$q_end,
191 $score) = ($1,$2,$3,$4,$5,$6,$7,$8);
192 if( ! defined $lastquery ) {
193 $self->start_element({'Name' => 'MegaBlastOutput'});
194 $self->element({'Name' => 'MegaBlastOutput_program',
195 'Data' => 'MEGABLAST'});
196 $self->element({'Name' => 'MegaBlastOutput_query-def',
197 'Data' => $query});
198 } elsif( $lastquery ne $query ) {
199 $self->_pushback($_);
200 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
201 $self->end_element({ 'Name' => 'MegaBlastOutput'});
202 $lasthit = undef;
203 $lastquery = undef;
204 return $self->end_document();
207 if( ! defined $lasthit || $lasthit ne $hit ) {
208 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
209 $self->start_element({'Name' => 'Hit'});
210 $self->element({'Name' => 'Hit_id',
211 'Data' => $hit});
213 $self->start_element({'Name' => 'Hsp'});
214 $self->element({'Name' => 'Hsp_score',
215 'Data' => $score});
217 # flip flop start/end if strand is < 0
218 # since strandedness is inferred from the query
219 # because of the way it is coded all queries will
220 # be on the forward strand and hits will be either
221 # +/-
223 # also the NCBI docs state:
224 # 0: Produce one-line output for each alignment, in the form
226 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
228 # Here subject(query)-id is a gi number, an accession or some other type of
229 # identifier found in the FASTA definition line of the respective sequence.
231 # + or - corresponds to same or different strand alignment.
233 # Score for non-affine gapping parameters means the total number of
234 # differences (mismatches + gap characters). For affine case it is the
235 # actual (raw) score of the alignment.
237 # and yet when rev strand hits are made I see
238 # (MBL 2.2.4)
239 # 'Contig634'=='-503384' (1 7941 321 7620) 19
240 # so the query is on the rev strand and the
241 # subject is on the fwd strand
242 # so I am flip-flopping everything when I see a '-'
243 if( $strand eq '-' ) {
244 ($h_start,$h_end) = ( $h_end,$h_start);
245 ($q_start,$q_end) = ( $q_end,$q_start);
247 $self->element({'Name' => 'Hsp_hit-from',
248 'Data' => $h_start});
249 $self->element({'Name' => 'Hsp_hit-to',
250 'Data' => $h_end});
251 $self->element({'Name' => 'Hsp_query-from',
252 'Data' => $q_start});
253 $self->element({'Name' => 'Hsp_query-to',
254 'Data' => $q_end});
255 # might not be quite right -- need to know length of the HSP
256 my $numid = (abs($q_end - $q_start) - $score);
258 $self->element({'Name' => 'Hsp_identity',
259 'Data' => $numid});
260 $self->element({'Name' => 'Hsp_positive',
261 'Data' => $numid});
263 $self->end_element({'Name' => 'Hsp'});
264 $lasthit = $hit;
265 $lastquery = $query;
266 } else {
267 $self->debug("Unknown line in fmt0 parsing: $_");
271 if( defined $lastquery && $fmt == 0 ) {
272 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
273 $self->end_element({ 'Name' => 'MegaBlastOutput'});
274 return $self->end_document();
276 return;
279 =head2 report_format
281 Title : report_format
282 Usage : $obj->report_format($newval)
283 Function: Get/Set the report_format value
284 Returns : value of report_format (a scalar)
285 Args : on set, new value (a scalar or undef, optional)
288 =cut
290 sub report_format{
291 my $self = shift;
292 return $self->{'_report_format'} = shift if @_;
293 return $self->{'_report_format'};
297 =head2 start_element
299 Title : start_element
300 Usage : $eventgenerator->start_element
301 Function: Handles a start element event
302 Returns : none
303 Args : hashref with at least 2 keys 'Data' and 'Name'
306 =cut
308 sub start_element{
309 my ($self,$data) = @_;
310 # we currently do not care about attributes
311 my $nm = $data->{'Name'};
312 if( my $type = $MODEMAP{$nm} ) {
313 $self->_mode($type);
314 if( $self->_eventHandler->will_handle($type) ) {
315 my $func = sprintf("start_%s",lc $type);
316 $self->_eventHandler->$func($data->{'Attributes'});
318 unshift @{$self->{'_elements'}}, $type;
321 if($nm eq 'MegaBlastOutput') {
322 $self->{'_values'} = {};
323 $self->{'_result'}= undef;
324 $self->{'_mode'} = '';
329 =head2 end_element
331 Title : start_element
332 Usage : $eventgenerator->end_element
333 Function: Handles an end element event
334 Returns : none
335 Args : hashref with at least 2 keys 'Data' and 'Name'
338 =cut
340 sub end_element {
341 my ($self,$data) = @_;
342 my $nm = $data->{'Name'};
343 my $rc;
345 if( my $type = $MODEMAP{$nm} ) {
346 if( $self->_eventHandler->will_handle($type) ) {
347 my $func = sprintf("end_%s",lc $type);
348 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
349 $self->{'_values'});
351 shift @{$self->{'_elements'}};
353 } elsif( $MAPPING{$nm} ) {
354 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
355 my $key = (keys %{$MAPPING{$nm}})[0];
356 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
357 } else {
358 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
360 } else {
361 $self->warn( "unknown nm $nm ignoring\n");
363 $self->{'_last_data'} = ''; # remove read data if we are at
364 # end of an element
365 $self->{'_result'} = $rc if( $nm eq 'MegaBlastOutput' );
366 return $rc;
370 =head2 element
372 Title : element
373 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
374 Function: Convience method that calls start_element, characters, end_element
375 Returns : none
376 Args : Hash ref with the keys 'Name' and 'Data'
379 =cut
381 sub element{
382 my ($self,$data) = @_;
383 $self->start_element($data);
384 $self->characters($data);
385 $self->end_element($data);
389 =head2 characters
391 Title : characters
392 Usage : $eventgenerator->characters($str)
393 Function: Send a character events
394 Returns : none
395 Args : string
398 =cut
400 sub characters{
401 my ($self,$data) = @_;
402 return unless defined $data->{'Data'};
403 $self->{'_last_data'} = $data->{'Data'};
406 =head2 _mode
408 Title : _mode
409 Usage : $obj->_mode($newval)
410 Function:
411 Example :
412 Returns : value of _mode
413 Args : newvalue (optional)
416 =cut
418 sub _mode{
419 my ($self,$value) = @_;
420 if( defined $value) {
421 $self->{'_mode'} = $value;
423 return $self->{'_mode'};
426 =head2 within_element
428 Title : within_element
429 Usage : if( $eventgenerator->within_element($element) ) {}
430 Function: Test if we are within a particular element
431 This is different than 'in' because within can be tested
432 for a whole block.
433 Returns : boolean
434 Args : string element name
437 =cut
439 sub within_element{
440 my ($self,$name) = @_;
441 return 0 if ( ! defined $name &&
442 ! defined $self->{'_elements'} ||
443 scalar @{$self->{'_elements'}} == 0) ;
444 foreach ( @{$self->{'_elements'}} ) {
445 if( $_ eq $name ) {
446 return 1;
449 return 0;
452 =head2 in_element
454 Title : in_element
455 Usage : if( $eventgenerator->in_element($element) ) {}
456 Function: Test if we are in a particular element
457 This is different than 'in' because within can be tested
458 for a whole block.
459 Returns : boolean
460 Args : string element name
463 =cut
465 sub in_element{
466 my ($self,$name) = @_;
467 return 0 if ! defined $self->{'_elements'}->[0];
468 return ( $self->{'_elements'}->[0] eq $name)
472 =head2 start_document
474 Title : start_document
475 Usage : $eventgenerator->start_document
476 Function: Handles a start document event
477 Returns : none
478 Args : none
481 =cut
483 sub start_document{
484 my ($self) = @_;
485 $self->{'_lasttype'} = '';
486 $self->{'_values'} = {};
487 $self->{'_result'}= undef;
488 $self->{'_mode'} = '';
489 $self->{'_elements'} = [];
493 =head2 end_document
495 Title : end_document
496 Usage : $eventgenerator->end_document
497 Function: Handles an end document event
498 Returns : Bio::Search::Result::ResultI object
499 Args : none
502 =cut
504 sub end_document{
505 my ($self,@args) = @_;
506 return $self->{'_result'};
509 =head2 result_count
511 Title : result_count
512 Usage : my $count = $searchio->result_count
513 Function: Returns the number of results we have processed
514 Returns : integer
515 Args : none
518 =cut
520 sub result_count {
521 my $self = shift;
522 return $self->{'_result_count'};
525 sub report_count { shift->result_count }