2 # BioPerl module for Bio::SearchIO::megablast
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::megablast - a driver module for Bio::SearchIO to parse
17 megablast reports (format 0)
21 # do not use this module directly
24 # for default format output from megablast
25 my $in = Bio::SearchIO->new(-file => 'file.mbl',
26 -format => 'megablast',
29 while( my $r = $in->next_result ) {
30 while( my $hit = $r->next_hit ) {
31 while( my $hsp = $hit->next_hsp ) {
40 Because of the way megablast report format 0 is coded, realize that score
41 means # gap characters + # mismatches for a HSP.
43 The docs from NCBI regarding FORMAT 0
44 # 0: Produce one-line output for each alignment, in the form
46 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
48 # Here subject(query)-id is a gi number, an accession or some other type of
49 # identifier found in the FASTA definition line of the respective sequence.
51 # + or - corresponds to same or different strand alignment.
53 # Score for non-affine gapping parameters means the total number of
54 # differences (mismatches + gap characters). For affine case it is the
55 # actual (raw) score of the alignment.
57 FORMAT 1 parsing has not been implemented
58 FORMAT 2 parsing should work with the SearchIO 'blast' parser
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to
66 the Bioperl mailing list. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 of the bugs and their resolution. Bug reports can be submitted via the
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Jason Stajich
92 Email jason-at-bioperl.org
96 The rest of the documentation details each of the object methods.
97 Internal methods are usually preceded with a _
102 # Let the code begin...
105 package Bio
::SearchIO
::megablast
;
107 use vars
qw(%MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
109 use base qw(Bio::SearchIO);
112 # mapping of MegaBlast terms to Bioperl hash keys
113 %MODEMAP = ('MegaBlastOutput' => 'result',
118 # This should really be done more intelligently, like with
123 'Hsp_query-from' => 'HSP-query_start',
124 'Hsp_query-to' => 'HSP-query_end',
125 'Hsp_hit-from' => 'HSP-hit_start',
126 'Hsp_hit-to' => 'HSP-hit_end',
127 'Hit_score' => 'HIT-score',
128 'Hsp_score' => 'HSP-score',
130 'Hsp_identity' => 'HSP-identical',
131 'Hsp_positive' => 'HSP-conserved',
133 'Hit_id' => 'HIT-name',
135 'MegaBlastOutput_program' => 'RESULT-algorithm_name',
136 'MegaBlastOutput_query-def'=> 'RESULT-query_name',
140 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
147 Usage : my $obj = Bio::SearchIO::blast->new();
148 Function: Builds a new Bio::SearchIO::blast object
149 Returns : Bio::SearchIO::blast
150 Args : -fh/-file => filehandle/filename to BLAST file
156 my ($self,@args) = @_;
157 $self->SUPER::_initialize
(@args);
158 my ($fmt) = $self->_rearrange([qw(REPORT_FORMAT)], @args);
160 $self->throw("Must provide a value for -report_format when initializing a megablast parser") unless defined $fmt ;
161 $self->report_format($fmt);
168 Usage : my $hit = $searchio->next_result;
169 Function: Returns the next Result from a search
170 Returns : Bio::Search::Result::ResultI object
181 my $fmt = $self->report_format;
182 my ($lastquery,$lasthit);
183 while( defined($_ = $self->_readline) ) {
185 if( /^\'(\S
+)\'\
=\
=\'(\
+|\
-)(\S
+)\'\s
+
186 \
((\d
+)\s
+(\d
+)\s
+(\d
+)\s
+(\d
+)\
)\s
+
189 my ($hit,$strand,$query,
190 $h_start,$q_start,$h_end,$q_end,
191 $score) = ($1,$2,$3,$4,$5,$6,$7,$8);
192 if( ! defined $lastquery ) {
193 $self->start_element({'Name' => 'MegaBlastOutput'});
194 $self->element({'Name' => 'MegaBlastOutput_program',
195 'Data' => 'MEGABLAST'});
196 $self->element({'Name' => 'MegaBlastOutput_query-def',
198 } elsif( $lastquery ne $query ) {
199 $self->_pushback($_);
200 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
201 $self->end_element({ 'Name' => 'MegaBlastOutput'});
204 return $self->end_document();
207 if( ! defined $lasthit || $lasthit ne $hit ) {
208 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
209 $self->start_element({'Name' => 'Hit'});
210 $self->element({'Name' => 'Hit_id',
213 $self->start_element({'Name' => 'Hsp'});
214 $self->element({'Name' => 'Hsp_score',
217 # flip flop start/end if strand is < 0
218 # since strandedness is inferred from the query
219 # because of the way it is coded all queries will
220 # be on the forward strand and hits will be either
223 # also the NCBI docs state:
224 # 0: Produce one-line output for each alignment, in the form
226 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
228 # Here subject(query)-id is a gi number, an accession or some other type of
229 # identifier found in the FASTA definition line of the respective sequence.
231 # + or - corresponds to same or different strand alignment.
233 # Score for non-affine gapping parameters means the total number of
234 # differences (mismatches + gap characters). For affine case it is the
235 # actual (raw) score of the alignment.
237 # and yet when rev strand hits are made I see
239 # 'Contig634'=='-503384' (1 7941 321 7620) 19
240 # so the query is on the rev strand and the
241 # subject is on the fwd strand
242 # so I am flip-flopping everything when I see a '-'
243 if( $strand eq '-' ) {
244 ($h_start,$h_end) = ( $h_end,$h_start);
245 ($q_start,$q_end) = ( $q_end,$q_start);
247 $self->element({'Name' => 'Hsp_hit-from',
248 'Data' => $h_start});
249 $self->element({'Name' => 'Hsp_hit-to',
251 $self->element({'Name' => 'Hsp_query-from',
252 'Data' => $q_start});
253 $self->element({'Name' => 'Hsp_query-to',
255 # might not be quite right -- need to know length of the HSP
256 my $numid = (abs($q_end - $q_start) - $score);
258 $self->element({'Name' => 'Hsp_identity',
260 $self->element({'Name' => 'Hsp_positive',
263 $self->end_element({'Name' => 'Hsp'});
267 $self->debug("Unknown line in fmt0 parsing: $_");
271 if( defined $lastquery && $fmt == 0 ) {
272 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
273 $self->end_element({ 'Name' => 'MegaBlastOutput'});
274 return $self->end_document();
281 Title : report_format
282 Usage : $obj->report_format($newval)
283 Function: Get/Set the report_format value
284 Returns : value of report_format (a scalar)
285 Args : on set, new value (a scalar or undef, optional)
292 return $self->{'_report_format'} = shift if @_;
293 return $self->{'_report_format'};
299 Title : start_element
300 Usage : $eventgenerator->start_element
301 Function: Handles a start element event
303 Args : hashref with at least 2 keys 'Data' and 'Name'
309 my ($self,$data) = @_;
310 # we currently do not care about attributes
311 my $nm = $data->{'Name'};
312 if( my $type = $MODEMAP{$nm} ) {
314 if( $self->_eventHandler->will_handle($type) ) {
315 my $func = sprintf("start_%s",lc $type);
316 $self->_eventHandler->$func($data->{'Attributes'});
318 unshift @
{$self->{'_elements'}}, $type;
321 if($nm eq 'MegaBlastOutput') {
322 $self->{'_values'} = {};
323 $self->{'_result'}= undef;
324 $self->{'_mode'} = '';
331 Title : start_element
332 Usage : $eventgenerator->end_element
333 Function: Handles an end element event
335 Args : hashref with at least 2 keys 'Data' and 'Name'
341 my ($self,$data) = @_;
342 my $nm = $data->{'Name'};
345 if( my $type = $MODEMAP{$nm} ) {
346 if( $self->_eventHandler->will_handle($type) ) {
347 my $func = sprintf("end_%s",lc $type);
348 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
351 shift @
{$self->{'_elements'}};
353 } elsif( $MAPPING{$nm} ) {
354 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
355 my $key = (keys %{$MAPPING{$nm}})[0];
356 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
358 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
361 $self->warn( "unknown nm $nm ignoring\n");
363 $self->{'_last_data'} = ''; # remove read data if we are at
365 $self->{'_result'} = $rc if( $nm eq 'MegaBlastOutput' );
373 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
374 Function: Convience method that calls start_element, characters, end_element
376 Args : Hash ref with the keys 'Name' and 'Data'
382 my ($self,$data) = @_;
383 $self->start_element($data);
384 $self->characters($data);
385 $self->end_element($data);
392 Usage : $eventgenerator->characters($str)
393 Function: Send a character events
401 my ($self,$data) = @_;
402 return unless defined $data->{'Data'};
403 $self->{'_last_data'} = $data->{'Data'};
409 Usage : $obj->_mode($newval)
412 Returns : value of _mode
413 Args : newvalue (optional)
419 my ($self,$value) = @_;
420 if( defined $value) {
421 $self->{'_mode'} = $value;
423 return $self->{'_mode'};
426 =head2 within_element
428 Title : within_element
429 Usage : if( $eventgenerator->within_element($element) ) {}
430 Function: Test if we are within a particular element
431 This is different than 'in' because within can be tested
434 Args : string element name
440 my ($self,$name) = @_;
441 return 0 if ( ! defined $name &&
442 ! defined $self->{'_elements'} ||
443 scalar @
{$self->{'_elements'}} == 0) ;
444 foreach ( @
{$self->{'_elements'}} ) {
455 Usage : if( $eventgenerator->in_element($element) ) {}
456 Function: Test if we are in a particular element
457 This is different than 'in' because within can be tested
460 Args : string element name
466 my ($self,$name) = @_;
467 return 0 if ! defined $self->{'_elements'}->[0];
468 return ( $self->{'_elements'}->[0] eq $name)
472 =head2 start_document
474 Title : start_document
475 Usage : $eventgenerator->start_document
476 Function: Handles a start document event
485 $self->{'_lasttype'} = '';
486 $self->{'_values'} = {};
487 $self->{'_result'}= undef;
488 $self->{'_mode'} = '';
489 $self->{'_elements'} = [];
496 Usage : $eventgenerator->end_document
497 Function: Handles an end document event
498 Returns : Bio::Search::Result::ResultI object
505 my ($self,@args) = @_;
506 return $self->{'_result'};
512 Usage : my $count = $searchio->result_count
513 Function: Returns the number of results we have processed
522 return $self->{'_result_count'};
525 sub report_count
{ shift->result_count }