2 # BioPerl module for Bio::SearchIO::exonerate
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::exonerate - parser for Exonerate
20 # do not use this module directly, it is a driver for SearchIO
23 my $searchio = Bio::SearchIO->new(-file => 'file.exonerate',
24 -format => 'exonerate');
27 while( my $r = $searchio->next_result ) {
28 print $r->query_name, "\n";
33 This is a driver for the SearchIO system for parsing Exonerate (Guy
34 Slater) output. You can get Exonerate at
35 http://www.ebi.ac.uk/~guy/exonerate/
36 [until Guy puts up a Web reference,publication for it.]).
38 An optional parameter -min_intron is supported by the L<new>
39 initialization method. This is if you run Exonerate with a different
40 minimum intron length (default is 30) the parser will be able to
41 detect the difference between standard deletions and an intron. Still
42 some room to play with there that might cause this to get
43 misinterpreted that has not been fully tested or explored.
45 The VULGAR and CIGAR formats should be parsed okay now creating HSPs
46 where appropriate (so merging match states where appropriate rather
47 than breaking an HSP at each indel as it may have done in the past).
48 The GFF that comes from exonerate is still probably a better way to go
49 if you are doing protein2genome or est2genome mapping.
50 For example you can see this script:
52 ### TODO: Jason, this link is dead, do we have an updated one?
53 http://fungal.genome.duke.edu/~jes12/software/scripts/process_exonerate_gff3.perl.txt
55 If your report contains both CIGAR and VULGAR lines only the first one
56 will processed for a given Query/Target pair. If you preferentially
57 want to use VULGAR or CIGAR add one of these options when initializing
64 Or set them via these methods.
76 User feedback is an integral part of the evolution of this and other
77 Bioperl modules. Send your comments and suggestions preferably to
78 the Bioperl mailing list. Your participation is much appreciated.
80 bioperl-l@bioperl.org - General discussion
81 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85 Please direct usage questions or support issues to the mailing list:
87 I<bioperl-l@bioperl.org>
89 rather than to the module maintainer directly. Many experienced and
90 reponsive experts will be able look at the problem and quickly
91 address it. Please include a thorough description of the problem
92 with code and data examples if at all possible.
96 Report bugs to the Bioperl bug tracking system to help us keep track
97 of the bugs and their resolution. Bug reports can be submitted via the
100 https://github.com/bioperl/bioperl-live/issues
102 =head1 AUTHOR - Jason Stajich
104 Email jason-at-bioperl.org
108 The rest of the documentation details each of the object methods.
109 Internal methods are usually preceded with a _
114 # Let the code begin...
117 package Bio
::SearchIO
::exonerate
;
119 use vars
qw(@STATES %MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $MIN_INTRON);
121 use base qw(Bio::SearchIO);
123 %MODEMAP = ( 'ExonerateOutput' => 'result',
130 'Hsp_query-from'=> 'HSP-query_start',
131 'Hsp_query-to' => 'HSP-query_end',
132 'Hsp_hit-from' => 'HSP-hit_start',
133 'Hsp_hit-to' => 'HSP-hit_end',
134 'Hsp_qseq' => 'HSP-query_seq',
135 'Hsp_hseq' => 'HSP-hit_seq',
136 'Hsp_midline' => 'HSP-homology_seq',
137 'Hsp_score' => 'HSP-score',
138 'Hsp_qlength' => 'HSP-query_length',
139 'Hsp_hlength' => 'HSP-hit_length',
140 'Hsp_align-len' => 'HSP-hsp_length',
141 'Hsp_identity' => 'HSP-identical',
142 'Hsp_gaps' => 'HSP-hsp_gaps',
143 'Hsp_hitgaps' => 'HSP-hit_gaps',
144 'Hsp_querygaps' => 'HSP-query_gaps',
146 'Hit_id' => 'HIT-name',
147 'Hit_desc' => 'HIT-description',
148 'Hit_len' => 'HIT-length',
149 'Hit_score' => 'HIT-score',
151 'ExonerateOutput_program' => 'RESULT-algorithm_name',
152 'ExonerateOutput_query-def' => 'RESULT-query_name',
153 'ExonerateOutput_query-desc'=> 'RESULT-query_description',
154 'ExonerateOutput_query-len' => 'RESULT-query_length',
157 $DEFAULT_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
159 $MIN_INTRON=30; # This is the minimum intron size
164 Usage : my $obj = Bio::SearchIO::exonerate->new();
165 Function: Builds a new Bio::SearchIO::exonerate object
166 Returns : an instance of Bio::SearchIO::exonerate
167 Args : -min_intron => somewhat obsolete option, how to determine if a
168 an indel is an intron or a local gap. Use VULGAR
169 rather than CIGAR to avoid this heuristic,default 30.
170 -cigar => 1 set this to 1 if you want to parse
172 -vulgar => 1 set this to 1 if you want to parse VULGAR
173 exclusively, setting both to 1 will revert
174 to the default behavior of just parsing the
175 first line that it sees.
181 my $self = $class->SUPER::new
(@_);
183 my ($min_intron,$cigar,
184 $vulgar) = $self->_rearrange([qw(MIN_INTRON
188 $MIN_INTRON = $min_intron;
190 if( $cigar && $vulgar ) {
191 $self->warn("cannot get HSPs from both CIGAR and VULGAR lines, will just choose whichever comes first (same as if you had chosen neither");
192 $cigar = 0; $vulgar=0;
194 $self->cigar($cigar);
195 $self->vulgar($vulgar);
202 Usage : my $hit = $searchio->next_result;
203 Function: Returns the next Result from a search
204 Returns : Bio::Search::Result::ResultI object
214 $self->{'_last_data'} = '';
215 my ($reporttype,$seenquery,$reportline);
216 $self->start_document();
219 my (@q_ex, @m_ex, @h_ex); ## gc addition
220 while( defined($_ = $self->_readline) ) {
221 # warn( "Reading $_");
222 if( /^\s*Query:\s+(\S+)\s*(.+)?/ ) {
224 $self->end_element({'Name' => 'ExonerateOutput'});
225 $self->_pushback($_);
226 return $self->end_document();
229 my ($nm,$desc) = ($1,$2);
230 chomp($desc) if defined $desc;
231 $self->{'_result_count'}++;
232 $self->start_element({'Name' => 'ExonerateOutput'});
233 $self->element({'Name' => 'ExonerateOutput_query-def',
235 $self->element({'Name' => 'ExonerateOutput_query-desc',
237 $self->element({'Name' => 'ExonerateOutput_program',
238 'Data' => 'Exonerate' });
239 $self->{'_seencigar'} = 0;
240 $self->{'_vulgar'} = 0;
242 } elsif ( /^Target:\s+(\S+)\s*(.+)?/ ) {
243 my ($nm,$desc) = ($1,$2);
244 chomp($desc) if defined $desc;
245 $self->start_element({'Name' => 'Hit'});
246 $self->element({'Name' => 'Hit_id',
248 $self->element({'Name' => 'Hit_desc',
250 $self->{'_seencigar'} = 0;
251 $self->{'_vulgar'} = 0;
252 } elsif( s
/^vulgar
:\s
+(\S
+)\s
+ # query sequence id
253 (\d
+)\s
+(\d
+)\s
+([\
-\
+\
.])\s
+ # query start-end-strand
254 (\S
+)\s
+ # target sequence id
255 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # target start-end-strand
258 next if( $self->cigar || $self->{'_seencigar'});
259 $self->{'_vulgar'}++;
261 # Note from Ewan. This is ugly - copy and paste from
262 # cigar line parsing. Should unify somehow...
264 if( ! $self->within_element('result') ) {
265 $self->start_element({'Name' => 'ExonerateOutput'});
266 $self->element({'Name' => 'ExonerateOutput_query-def',
269 if( ! $self->within_element('hit') ) {
270 $self->start_element({'Name' => 'Hit'});
271 $self->element({'Name' => 'Hit_id',
276 ## $qe and $he are no longer used for calculating the ends,
277 ## just the $qs and $hs values and the alignment and insert lenghts
278 my ($qs,$qe,$qstrand) = ($2,$3,$4);
279 my ($hs,$he,$hstrand) = ($6,$7,$8);
281 # $self->element({'Name' => 'ExonerateOutput_query-len',
283 # $self->element({'Name' => 'Hit_len',
287 ## add one because these values are zero-based
288 ## this calculation was originally done lower in the code,
289 ## but it's clearer to do it just once at the start
291 my ($qbegin,$qend) = ('query-from', 'query-to');
293 if( $qstrand eq '-' ) {
294 $qstrand = -1; $qe++;
299 my ($hbegin,$hend) = ('hit-from', 'hit-to');
301 if( $hstrand eq '-' ) {
308 # okay let's do this right and generate a set of HSPs
309 # from the cigar line/home/bio1/jes12/bin/exonerate --model est2genome --bestn 1 t/data/exonerate_cdna.fa t/data/exonerate_genomic_rev.fa
311 my ($aln_len,$inserts,$deletes) = (0,0,0);
312 my ($laststate,@events,$gaps) =( '' );
313 while( @rest >= 3 ) {
314 my ($state,$len1,$len2) = (shift @rest, shift @rest, shift @rest);
316 # HSPs are only the Match cases; otherwise we just
317 # move the coordinates on by the correct amount
320 if( $state eq 'M' ) {
321 if( $laststate eq 'G' ) {
322 # merge gaps across Match states so the HSP
324 $events[-1]->{$qend} = $qs + $len1*$qstrand - $qstrand;
325 $events[-1]->{$hend} = $hs + $len2*$hstrand - $hstrand;
326 $events[-1]->{'gaps'} = $gaps;
330 'align-len' => $len1,
332 $qend => ($qs + $len1*$qstrand - $qstrand),
334 $hend => ($hs + $len2*$hstrand - $hstrand),
339 $gaps = $len1 + $len2 if $state eq 'G';
341 $qs += $len1*$qstrand;
342 $hs += $len2*$hstrand;
345 for my $event ( @events ) {
346 $self->start_element({'Name' => 'Hsp'});
347 while( my ($key,$val) = each %$event ) {
348 $self->element({'Name' => "Hsp_$key",
351 $self->element({'Name' => 'Hsp_identity',
353 $self->end_element({'Name' => 'Hsp'});
357 $self->element({'Name' => 'Hit_score',
360 $self->end_element({'Name' => 'Hit'});
361 $self->end_element({'Name' => 'ExonerateOutput'});
363 return $self->end_document();
365 } elsif( s
/^cigar
:\s
+(\S
+)\s
+ # query sequence id
366 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # query start-end-strand
367 (\S
+)\s
+ # target sequence id
368 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # target start-end-strand
371 next if( $self->vulgar || $self->{'_seenvulgar'});
374 if( ! $self->within_element('result') ) {
375 $self->start_element({'Name' => 'ExonerateOutput'});
376 $self->element({'Name' => 'ExonerateOutput_query-def',
379 if( ! $self->within_element('hit') ) {
380 $self->start_element({'Name' => 'Hit'});
381 $self->element({'Name' => 'Hit_id',
385 ## $qe and $he are no longer used for calculating the ends,
386 ## just the $qs and $hs values and the alignment and insert lenghts
387 my ($qs,$qe,$qstrand) = ($2,$3,$4);
388 my ($hs,$he,$hstrand) = ($6,$7,$8);
390 # $self->element({'Name' => 'ExonerateOutput_query-len',
392 # $self->element({'Name' => 'Hit_len',
396 if( $qstrand eq '-' ) {
398 ($qs,$qe) = ($qe,$qs); # flip-flop if we're on opp strand
400 } else { $qstrand = 1; }
401 if( $hstrand eq '-' ) {
403 ($hs,$he) = ($he,$hs); # flip-flop if we're on opp strand
405 } else { $hstrand = 1; }
406 # okay let's do this right and generate a set of HSPs
407 # from the cigar line
410 ## add one because these values are zero-based
411 ## this calculation was originally done lower in the code,
412 ## but it's clearer to do it just once at the start
415 my ($aln_len,$inserts,$deletes) = (0,0,0);
416 while( @rest >= 2 ) {
417 my ($state,$len) = (shift @rest, shift @rest);
418 if( $state eq 'I' ) {
420 } elsif( $state eq 'D' ) {
421 if( $len >= $MIN_INTRON ) {
422 $self->start_element({'Name' => 'Hsp'});
424 $self->element({'Name' => 'Hsp_score',
426 $self->element({'Name' => 'Hsp_align-len',
427 'Data' => $aln_len});
428 $self->element({'Name' => 'Hsp_identity',
430 ($inserts + $deletes)});
432 # HSP ends where the other begins
433 $self->element({'Name' => 'Hsp_query-from',
436 ## $qs is now the start of the next hsp
437 ## the end of this hsp is 1 before this position
438 ## (or 1 after in case of reverse strand)
439 $qs += $aln_len*$qstrand;
440 $self->element({'Name' => 'Hsp_query-to',
441 'Data' => $qs - ($qstrand*1)});
443 $hs += $deletes*$hstrand;
444 $self->element({'Name' => 'Hsp_hit-from',
446 $hs += $aln_len*$hstrand;
447 $self->element({'Name' => 'Hsp_hit-to',
448 'Data' => $hs-($hstrand*1)});
450 $self->element({'Name' => 'Hsp_align-len',
451 'Data' => $aln_len + $inserts
453 $self->element({'Name' => 'Hsp_identity',
454 'Data' => $aln_len });
456 $self->element({'Name' => 'Hsp_gaps',
457 'Data' => $inserts + $deletes});
458 $self->element({'Name' => 'Hsp_querygaps',
459 'Data' => $inserts});
460 $self->element({'Name' => 'Hsp_hitgaps',
461 'Data' => $deletes});
465 $self->element({'Name' => 'Hsp_qseq',
466 'Data' => shift @q_ex,
468 $self->element({'Name' => 'Hsp_hseq',
469 'Data' => shift @h_ex,
471 $self->element({'Name' => 'Hsp_midline',
472 'Data' => shift @m_ex,
475 $self->end_element({'Name' => 'Hsp'});
477 $aln_len = $inserts = $deletes = 0;
484 $self->start_element({'Name' => 'Hsp'});
488 $self->element({'Name' => 'Hsp_qseq',
489 'Data' => shift @q_ex,
491 $self->element({'Name' => 'Hsp_hseq',
492 'Data' => shift @h_ex,
494 $self->element({'Name' => 'Hsp_midline',
495 'Data' => shift @m_ex,
499 $self->element({'Name' => 'Hsp_score',
502 $self->element({'Name' => 'Hsp_query-from',
505 $qs += $aln_len*$qstrand;
506 $self->element({'Name' => 'Hsp_query-to',
507 'Data' => $qs - ($qstrand*1)});
509 $hs += $deletes*$hstrand;
510 $self->element({'Name' => 'Hsp_hit-from',
512 $hs += $aln_len*$hstrand;
513 $self->element({'Name' => 'Hsp_hit-to',
514 'Data' => $hs -($hstrand*1)});
516 $self->element({'Name' => 'Hsp_align-len',
517 'Data' => $aln_len});
519 $self->element({'Name' => 'Hsp_identity',
520 'Data' => $aln_len - ($inserts + $deletes)});
522 $self->element({'Name' => 'Hsp_gaps',
523 'Data' => $inserts + $deletes});
525 $self->element({'Name' => 'Hsp_querygaps',
526 'Data' => $inserts});
527 $self->element({'Name' => 'Hsp_hitgaps',
528 'Data' => $deletes});
529 $self->end_element({'Name' => 'Hsp'});
531 $self->element({'Name' => 'Hit_score',
534 $self->end_element({'Name' => 'Hit'});
535 $self->end_element({'Name' => 'ExonerateOutput'});
537 return $self->end_document();
542 return $self->end_document() if( $seentop );
547 Title : start_element
548 Usage : $eventgenerator->start_element
549 Function: Handles a start element event
551 Args : hashref with at least 2 keys 'Data' and 'Name'
557 my ($self,$data) = @_;
558 # we currently don't care about attributes
559 my $nm = $data->{'Name'};
560 my $type = $MODEMAP{$nm};
563 if( $self->_eventHandler->will_handle($type) ) {
564 my $func = sprintf("start_%s",lc $type);
565 $self->_eventHandler->$func($data->{'Attributes'});
567 unshift @
{$self->{'_elements'}}, $type;
568 if($type eq 'result') {
569 $self->{'_values'} = {};
570 $self->{'_result'}= undef;
578 Title : start_element
579 Usage : $eventgenerator->end_element
580 Function: Handles an end element event
582 Args : hashref with at least 2 keys 'Data' and 'Name'
588 my ($self,$data) = @_;
589 my $nm = $data->{'Name'};
590 my $type = $MODEMAP{$nm};
593 if( $type = $MODEMAP{$nm} ) {
594 if( $self->_eventHandler->will_handle($type) ) {
595 my $func = sprintf("end_%s",lc $type);
596 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
599 shift @
{$self->{'_elements'}};
601 } elsif( $MAPPING{$nm} ) {
603 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
604 my $key = (keys %{$MAPPING{$nm}})[0];
605 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
607 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
610 $self->debug( "unknown nm $nm, ignoring\n");
612 $self->{'_last_data'} = ''; # remove read data if we are at
614 $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
621 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
622 Function: Convience method that calls start_element, characters, end_element
624 Args : Hash ref with the keys 'Name' and 'Data'
630 my ($self,$data) = @_;
631 $self->start_element($data);
632 $self->characters($data);
633 $self->end_element($data);
639 Usage : $eventgenerator->characters($str)
640 Function: Send a character events
648 my ($self,$data) = @_;
650 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );
652 $self->{'_last_data'} = $data->{'Data'};
655 =head2 within_element
657 Title : within_element
658 Usage : if( $eventgenerator->within_element($element) ) {}
659 Function: Test if we are within a particular element
660 This is different than 'in' because within can be tested
663 Args : string element name
669 my ($self,$name) = @_;
670 return 0 if ( ! defined $name &&
671 ! defined $self->{'_elements'} ||
672 scalar @
{$self->{'_elements'}} == 0) ;
673 foreach ( @
{$self->{'_elements'}} ) {
685 Usage : if( $eventgenerator->in_element($element) ) {}
686 Function: Test if we are in a particular element
687 This is different than 'in' because within can be tested
690 Args : string element name
696 my ($self,$name) = @_;
697 return 0 if ! defined $self->{'_elements'}->[0];
698 return ( $self->{'_elements'}->[0] eq $name)
701 =head2 start_document
703 Title : start_document
704 Usage : $eventgenerator->start_document
705 Function: Handle a start document event
714 $self->{'_lasttype'} = '';
715 $self->{'_values'} = {};
716 $self->{'_result'}= undef;
717 $self->{'_elements'} = [];
718 $self->{'_reporttype'} = 'exonerate';
725 Usage : $eventgenerator->end_document
726 Function: Handles an end document event
727 Returns : Bio::Search::Result::ResultI object
734 my ($self,@args) = @_;
735 return $self->{'_result'};
740 my ($self, $blast, @args) = @_;
742 if( not defined($self->writer) ) {
743 $self->warn("Writer not defined. Using a $DEFAULT_WRITER_CLASS");
744 $self->writer( $DEFAULT_WRITER_CLASS->new() );
746 $self->SUPER::write_result
( $blast, @args );
751 return $self->{'_result_count'};
754 sub report_count
{ shift->result_count }
759 Usage : $obj->vulgar($newval)
760 Function: Get/Set flag, do you want to build HSPs from VULGAR string?
761 Returns : value of vulgar (a scalar)
762 Args : on set, new value (a scalar or undef, optional)
771 if( $_[0] && $self->{'_cigar'} ) {
772 $self->warn("Trying to set vulgar and cigar both to 1, must be either or");
773 $self->{'_cigar'} = 0;
774 return $self->{'_vulgar'} = 0;
777 return $self->{'_vulgar'};
783 Usage : $obj->cigar($newval)
784 Function: Get/Set boolean flag do you want to build HSPs from CIGAR strings?
785 Returns : value of cigar (a scalar)
786 Args : on set, new value (a scalar or undef, optional)
795 if( $_[0] && $self->{'_vulgar'} ) {
796 $self->warn("Trying to set vulgar and cigar both to 1, must be either or");
797 $self->{'_vulgar'} = 0;
798 return $self->{'_cigar'} = 0;
801 return $self->{'_cigar'};