2 # BioPerl module for Bio::SearchIO::infernal
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::infernal - SearchIO-based Infernal parser
20 my $parser = Bio::SearchIO->new(-format => 'infernal',
21 -file => 'purine.inf');
22 while( my $result = $parser->next_result ) {
23 # general result info, such as model used, Infernal version
24 while( my $hit = $result->next_hit ) {
25 while( my $hsp = $hit->next_hsp ) {
33 This is a SearchIO-based parser for Infernal output from the cmsearch program.
34 It currently parses cmsearch output for Infernal versions 0.7-1.1; older
35 versions may work but will not be supported.
37 The latest version of Infernal is 1.1. The output has changed substantially
38 relative to version 1.0. Versions 1.x are stable releases (and output has
39 stabilized) therefore it is highly recommended that users upgrade to using
40 the latest Infernal release. Support for the older pre-v.1 developer releases
41 will be dropped for future core 1.6 releases.
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via the
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Chris Fields
75 Email cjfields-at-uiuc-dot-edu
79 Jeffrey Barrick, Michigan State University
80 Paul Cantalupo, University of Pittsburgh
84 The rest of the documentation details each of the object methods.
85 Internal methods are usually preceded with a _
89 # Let the code begin...
91 package Bio
::SearchIO
::infernal
;
95 use base
qw(Bio::SearchIO);
104 'Hsp_bit-score' => 'HSP-bits',
105 'Hsp_score' => 'HSP-score',
106 'Hsp_evalue' => 'HSP-evalue', # evalues only in v0.81, optional
107 'Hsp_pvalue' => 'HSP-pvalue', # pvalues only in v0.81, optional
108 'Hsp_query-from' => 'HSP-query_start',
109 'Hsp_query-to' => 'HSP-query_end',
110 'Hsp_query-strand'=> 'HSP-query_strand',
111 'Hsp_hit-from' => 'HSP-hit_start',
112 'Hsp_hit-to' => 'HSP-hit_end',
113 'Hsp_hit-strand' => 'HSP-hit_strand',
114 'Hsp_gaps' => 'HSP-hsp_gaps',
115 'Hsp_hitgaps' => 'HSP-hit_gaps',
116 'Hsp_querygaps' => 'HSP-query_gaps',
117 'Hsp_qseq' => 'HSP-query_seq',
118 'Hsp_ncline' => 'HSP-nc_seq',
119 'Hsp_hseq' => 'HSP-hit_seq',
120 'Hsp_midline' => 'HSP-homology_seq',
121 'Hsp_pline' => 'HSP-pp_seq',
122 'Hsp_structure' => 'HSP-meta',
123 'Hsp_align-len' => 'HSP-hsp_length',
124 'Hsp_stranded' => 'HSP-stranded',
126 'Hit_id' => 'HIT-name',
127 'Hit_len' => 'HIT-length',
128 'Hit_gi' => 'HIT-ncbi_gi',
129 'Hit_accession' => 'HIT-accession',
130 'Hit_desc' => 'HIT-description',
131 'Hit_def' => 'HIT-description',
132 'Hit_signif' => 'HIT-significance', # evalues in v1.1 and v0.81, optional
133 'Hit_p' => 'HIT-p', # pvalues only in 1.0, optional
134 'Hit_score' => 'HIT-score', # best HSP bit score (in v1.1, the only HSP bit score)
135 'Hit_bits' => 'HIT-bits', # best HSP bit score (ditto)
137 'Infernal_program' => 'RESULT-algorithm_name', # get/set
138 'Infernal_version' => 'RESULT-algorithm_version', # get/set
139 'Infernal_query-def'=> 'RESULT-query_name', # get/set
140 'Infernal_query-len'=> 'RESULT-query_length',
141 'Infernal_query-acc'=> 'RESULT-query_accession', # get/set
142 'Infernal_querydesc'=> 'RESULT-query_description', # get/set
143 'Infernal_cm' => 'RESULT-cm_name',
144 'Infernal_db' => 'RESULT-database_name', # get/set
145 'Infernal_db-len' => 'RESULT-database_entries', # in v1.1 only
146 'Infernal_db-let' => 'RESULT-database_letters', # in v1.1 only
150 my $DEFAULT_ALGORITHM = 'cmsearch';
151 my $DEFAULT_VERSION = '1.1';
153 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown gap);
154 my %STRUCTURE_SYMBOLS = (
157 'single-strand' => ':',
165 Usage : my $obj = Bio::SearchIO::infernal->new();
166 Function: Builds a new Bio::SearchIO::infernal object
167 Returns : Bio::SearchIO::infernal
168 Args : -fh/-file => cmsearch (infernal) filename
169 -format => 'infernal'
170 -model => query model (Rfam ID) (default undef)
171 -database => database name (default undef)
172 -query_acc => query accession, eg. Rfam accession RF####
173 -query_desc => query description, eg. Rfam description
174 -hsp_minscore => minimum HSP score cutoff
175 -convert_meta => boolean, set to convert meta string to simple WUSS format
176 -symbols => hash ref of structure symbols to use
177 (default symbols in %STRUCTURE_SYMBOLS hash)
182 my ( $self, @args ) = @_;
183 $self->SUPER::_initialize
(@args);
184 my ($model, $database, $convert, $symbols, $cutoff,
185 $desc, $accession, $algorithm, $version) =
186 $self->_rearrange([qw(MODEL
195 my $handler = $self->_eventHandler;
196 $handler->register_factory(
198 Bio
::Factory
::ObjectFactory
->new(
199 -type
=> 'Bio::Search::Result::INFERNALResult',
200 -interface
=> 'Bio::Search::Result::ResultI',
201 -verbose
=> $self->verbose
205 $handler->register_factory(
207 Bio
::Factory
::ObjectFactory
->new(
208 -type
=> 'Bio::Search::Hit::ModelHit',
209 -interface
=> 'Bio::Search::Hit::HitI',
210 -verbose
=> $self->verbose
214 $handler->register_factory(
216 Bio
::Factory
::ObjectFactory
->new(
217 -type
=> 'Bio::Search::HSP::ModelHSP',
218 -interface
=> 'Bio::Search::HSP::HSPI',
219 -verbose
=> $self->verbose
223 defined $model && $self->model($model);
224 defined $database && $self->database($database);
225 defined $accession && $self->query_accession($accession);
226 defined $convert && $self->convert_meta($convert);
227 defined $desc && $self->query_description($desc);
229 $version ||= $DEFAULT_VERSION;
230 $self->version($version);
231 $symbols ||= \
%STRUCTURE_SYMBOLS;
232 $self->structure_symbols($symbols);
233 $cutoff ||= $MINSCORE;
234 $self->hsp_minscore($cutoff);
235 $algorithm ||= $DEFAULT_ALGORITHM;
236 $self->algorithm($algorithm);
242 Usage : my $hit = $searchio->next_result;
243 Function: Returns the next Result from a search
244 Returns : Bio::Search::Result::ResultI object
251 unless (exists $self->{'_handlerset'}) {
253 while ($line = $self->_readline) {
254 # advance to first line
255 next if $line =~ m{^\s*$};
256 # newer output starts with model name
257 if ($line =~ m{^\#\s+cmsearch\s}) {
258 my $secondline = $self->_readline;
259 if ($secondline =~ m{INFERNAL 1\.1}) {
260 $self->{'_handlerset'} = '1.1';
263 $self->{'_handlerset'} = 'latest'; # v1.0
265 $self->_pushback($secondline);
267 elsif ($line =~ m{^CM\s\d+:}) {
268 $self->{'_handlerset'} = 'pre-1.0';
271 $self->{'_handlerset'} ='old';
275 $self->_pushback($line);
276 #if ($self->{'_handlerset'} ne '1.0') {
278 # -message => "Parsing of Infernal pre-1.0 release is deprecated;\n".
279 # "upgrading to Infernal 1.0 or above is highly recommended",
280 # -version => 1.007);
283 return ($self->{'_handlerset'} eq '1.1') ?
$self->_parse_v1_1 :
284 ($self->{'_handlerset'} eq 'latest') ?
$self->_parse_latest :
285 ($self->{'_handlerset'} eq 'pre-1.0') ?
$self->_parse_pre :
294 my ($accession, $description) = ($self->query_accession, $self->query_description);
295 my ($buffer, $last, %modelcounter, @hit_list, %hitindex,
296 @hsp_list, %hspindex);
297 $self->start_document();
298 $buffer = $self->_readline;
299 if ( !defined $buffer || $buffer =~ m/^\[ok/ ) { # end of report
303 $self->_pushback($buffer);
306 PARSER
: # Parse each line of report
307 while ( defined( $buffer = $self->_readline ) ) {
309 my $lineorig = $buffer;
312 # INFERNAL program name
313 if ( $buffer =~ m/^\#\s(\S+)\s\:\:\s/ ) {
316 $self->start_element( { 'Name' => 'Result' } );
317 $self->element_hash( { 'Infernal_program' => uc($prog) } );
320 # INFERNAL version and release date
321 elsif ( $buffer =~ m/^\#\sINFERNAL\s+(\S+)\s+\((.+)\)/ ) {
323 my $versiondate = $2;
324 $self->{'_cmidline'} = $buffer;
325 $self->element_hash( { 'Infernal_version' => $version } );
329 elsif ( $buffer =~ /^\#\squery (?:\w+ )?file\:\s+(\S+)/ ) {
330 $self->{'_cmfileline'} = $lineorig;
331 $self->element_hash( { 'Infernal_cm' => $1 } );
335 elsif ( $buffer =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ) {
336 $self->{'_cmseqline'} = $lineorig;
337 $self->element_hash( { 'Infernal_db' => $1 } );
341 elsif ( $buffer =~ m/^Query:\s+(\S+)\s+\[CLEN=(\d+)\]$/ ) {
342 $self->element_hash( { 'Infernal_query-def' => $1,
343 'Infernal_query-len' => $2,
344 'Infernal_query-acc' => $accession,
345 'Infernal_querydesc' => $description,
349 # Get query accession
350 elsif ( $buffer =~ s/^Accession:\s+// && ! $accession) {
352 $self->element_hash( { 'Infernal_query-acc' => $buffer } );
355 # Get query description
356 elsif ( $buffer =~ s/^Description:\s+// && ! $description) {
358 $self->element_hash( { 'Infernal_querydesc' => $buffer } );
361 # Process hit table - including those below inclusion threshold
362 elsif ( $buffer =~ m/^Hit scores:/) {
363 @hit_list = (); # here is case there are multi-query reports
364 while ( defined( $buffer = $self->_readline ) ) {
365 if ( $buffer =~ m/^Hit alignments:/ ) {
366 $self->_pushback($buffer);
369 elsif ( $buffer =~ m/^\s+rank\s+E-value/
370 || $buffer =~ m/\-\-\-/
372 || $buffer =~ m/No hits detected/ ) {
378 my ($rank, $threshold, $eval, $score,
379 $bias, $hitid, $start, $end, $strand,
380 $mdl, $truc, $gc, @desc) = split( " ", $buffer );
381 my $desc = join " ", @desc;
382 $desc = '' if ( !defined($desc) );
384 push @hit_list, [ $hitid, $desc, $eval, $score ];
385 $hitindex{ $hitid.$hit_counter } = $#hit_list;
389 # Process hit alignments
390 elsif ( $buffer =~ /^Hit alignments:/ ) {
392 my $align_counter = 0;
393 while ( defined( $buffer = $self->_readline ) ) {
394 if ( $buffer =~ /^Internal CM pipeline statistics summary/ ) {
395 $self->_pushback($buffer);
398 if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) { # defline of hit
402 my $hitid_alignctr = $hitid.$align_counter;
403 $modelcounter{$hitid_alignctr} = 0;
405 # The Hit Description from the Hit table can be truncated if
406 # it is too long, so use the '>>' line description instead
407 $hit_list[ $hitindex{$hitid_alignctr} ][1] = $desc;
409 # Process hit information table
410 while ( defined( $buffer = $self->_readline ) ) {
411 if ( $buffer =~ m/^Internal CM pipeline statistics/
413 || $buffer =~ m/^\>\>/ ) {
414 $self->_pushback($buffer);
417 elsif ( $buffer =~ m/^\s+rank\s+E-value/
418 || $buffer =~ m/^\s----/
420 || $buffer =~ m/No hits detected/ ) {
424 # Get hsp data from table, push into @hsp;
425 my ( $rank, $threshold, $eval,
426 $score, $bias, $model,
427 $cm_start, $cm_stop, $cm_cov,
428 $seq_start, $seq_stop, $seq_strand, $seq_cov,
430 ) = split( " ", $buffer );
432 # Try to get the Hit Length from the alignment information.
433 # For cmsearch, if sequence coverage ends in ']' it means that the
434 # alignment runs full-length flush to the end of the target.
435 my $hitlength = ( $seq_cov =~ m/\]$/ ) ?
$seq_stop : 0;
437 my $tmphit = $hit_list[ $hitindex{$hitid_alignctr} ];
438 if ( !defined $tmphit ) {
439 $self->warn("Incomplete information: can't find HSP $hitid in list of hits\n");
443 push @hsp_list, [ $hitid,
445 $seq_start, $seq_stop,
448 $modelcounter{$hitid_alignctr}++;
449 my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr};
450 $hspindex{$hsp_key} = $#hsp_list;
453 elsif ( $buffer =~ m/NC$/ ) { # start of HSP
454 # need CS line to get number of spaces before structure data
455 my $csline = $self->_readline;
456 $csline =~ m/^(\s+)\S+ CS$/;
457 my $offset = length($1);
458 $self->_pushback($csline);
459 $self->_pushback($buffer); # set up for loop
461 my ($ct, $strln) = 0;
464 my %hspline = ('0' => 'nc', '1' => 'meta',
465 '2' => 'query', '3' => 'midline',
466 '4' => 'hit', '5' => 'pp');
468 while (defined ($buffer = $self->_readline)) {
470 if ( $buffer =~ /^>>\s/
471 || $buffer =~ /^Internal CM pipeline statistics/) {
472 $self->_pushback($buffer);
475 elsif ( $ct % 6 == 0 && $buffer =~ /^$/ ) {
478 my $iterator = $ct % 6;
479 # NC line ends with ' NC' so remove these from the strlen count
480 $strln = ( length($buffer) - 3 ) if $iterator == 0;
481 my $data = substr($buffer, $offset, $strln-$offset);
482 $hspdata->{ $hspline{$iterator} } .= $data;
488 # catch any insertions and add them into the actual length
489 while ($hspdata->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) {
492 # add on the actual residues
493 $strlen += $hspdata->{'query'} =~ tr{A-Za-z}{A-Za-z};
494 my $metastr = ($self->convert_meta) ?
($self->simple_meta($hspdata->{'meta'})) :
497 my $hitid_alignctr = $hitid . $align_counter;
498 my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr};
499 my $hsp = $hsp_list[ $hspindex{$hsp_key} ];
500 push (@
$hsp, $hspdata->{'nc'}, $metastr,
501 $hspdata->{'query'}, $hspdata->{'midline'},
502 $hspdata->{'hit'}, $hspdata->{'pp'});
505 } # end of 'Hit alignments:' section of report
507 # Process internal pipeline stats (end of report)
508 elsif ( $buffer =~ m/Internal CM pipeline statistics summary:/ ) {
509 while ( defined( $buffer = $self->_readline ) ) {
510 last if ( $buffer =~ m!^//! );
512 if ( $buffer =~ /^Target sequences:\s+(\d+)\s+\((\d+) residues/ ) {
513 $self->element_hash( { 'Infernal_db-len' => $1,
514 'Infernal_db-let' => $2, } );
517 last; # of the outer while defined $self->readline
522 #print STDERR "Missed line: $buffer\n";
523 $self->debug($buffer);
528 # Final processing of hits and hsps
530 foreach my $hit ( @hit_list ) {
532 my ($hit_name, $hit_desc, $hit_signif, $hit_score) = @
$hit;
533 my $num_hsp = $modelcounter{$hit_name . $hit_counter} || 0;
535 $self->start_element( { 'Name' => 'Hit' } );
536 $self->element_hash( {'Hit_id' => $hit_name,
537 'Hit_desc' => $hit_desc,
538 'Hit_signif'=> $hit_signif,
539 'Hit_score' => $hit_score,
540 'Hit_bits' => $hit_score, } );
541 for my $i ( 1 .. $num_hsp ) {
542 my $hsp_key = $hit_name . $hit_counter . "_" . $i;
543 my $hsp = $hsp_list[ $hspindex{$hsp_key} ];
544 if ( defined $hsp ) {
545 my $hspid = shift @
$hsp;
547 my ($cm_start, $cm_stop, $seq_start, $seq_stop,
548 $score, $eval, $hitlength, $ncline,
549 $csline, $qseq, $midline, $hseq, $pline) = @
$hsp;
550 if ( $hitlength != 0 ) {
552 { 'Name' => 'Hit_len', 'Data' => $hitlength }
556 $self->start_element( { 'Name' => 'Hsp' } );
557 $self->element_hash( { 'Hsp_stranded' => 'HIT',
558 'Hsp_query-from' => $cm_start,
559 'Hsp_query-to' => $cm_stop,
560 'Hsp_hit-from' => $seq_start,
561 'Hsp_hit-to' => $seq_stop,
562 'Hsp_score' => $score,
563 'Hsp_bit-score' => $score,
564 'Hsp_evalue' => $eval,
565 'Hsp_ncline' => $ncline,
566 'Hsp_structure' => $csline,
568 'Hsp_midline' => $midline,
570 'Hsp_pline' => $pline,
573 $self->end_element( { 'Name' => 'Hsp' } );
576 $self->end_element( { 'Name' => 'Hit' } );
579 $self->end_element( { 'Name' => 'Result' } );
580 my $result = $self->end_document();
587 Title : start_element
588 Usage : $eventgenerator->start_element
589 Function: Handles a start element event
591 Args : hashref with at least 2 keys 'Data' and 'Name'
597 my ( $self, $data ) = @_;
599 # we currently don't care about attributes
600 my $nm = $data->{'Name'};
601 my $type = $MODEMAP{$nm};
603 if ( $self->_eventHandler->will_handle($type) ) {
604 my $func = sprintf( "start_%s", lc $type );
605 $self->_eventHandler->$func( $data->{'Attributes'} );
607 unshift @
{ $self->{'_elements'} }, $type;
610 && $type eq 'result' )
612 $self->{'_values'} = {};
613 $self->{'_result'} = undef;
619 Title : start_element
620 Usage : $eventgenerator->end_element
621 Function: Handles an end element event
623 Args : hashref with at least 2 keys, 'Data' and 'Name'
628 my ( $self, $data ) = @_;
629 my $nm = $data->{'Name'};
630 my $type = $MODEMAP{$nm};
634 if ( $self->_eventHandler->will_handle($type) ) {
635 my $func = sprintf( "end_%s", lc $type );
636 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
637 $self->{'_values'} );
639 my $lastelem = shift @
{ $self->{'_elements'} };
641 # Infernal 1.1 allows one to know hit->length in some instances
642 # so remove it so it doesn't carry over to next hit. Tried flushing
643 # all 'type' values from {_values} buffer but it breaks legacy tests
644 if ($type eq 'hit' ) {
645 delete $self->{_values
}{'HIT-length'};
646 delete $self->{_values
}{'HSP-hit_length'};
649 elsif ( $MAPPING{$nm} ) {
650 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
651 my $key = ( keys %{ $MAPPING{$nm} } )[0];
652 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
653 $self->{'_last_data'};
656 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
660 $self->debug("unknown nm $nm, ignoring\n");
662 $self->{'_last_data'} = ''; # remove read data if we are at
664 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
671 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
672 Function: Convenience method that calls start_element, characters, end_element
674 Args : Hash ref with the keys 'Name' and 'Data'
679 my ( $self, $data ) = @_;
680 # simple data calls (%MAPPING) do not need start_element
681 $self->characters($data);
682 $self->end_element($data);
688 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
689 'Hsp_hit-to' => $end,
690 'Hsp_score' => $lastscore});
691 Function: Convenience method that takes multiple simple data elements and
692 maps to appropriate parameters
694 Args : Hash ref with the mapped key (in %MAPPING) and value
699 my ($self, $data) = @_;
700 $self->throw("Must provide data hash ref") if !$data || !ref($data);
701 for my $nm (sort keys %{$data}) {
702 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
703 if ( $MAPPING{$nm} ) {
704 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
705 my $key = ( keys %{ $MAPPING{$nm} } )[0];
706 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
710 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
719 Usage : $eventgenerator->characters($str)
720 Function: Send a character events
728 my ( $self, $data ) = @_;
729 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
730 $self->{'_last_data'} = $data->{'Data'};
733 =head2 within_element
735 Title : within_element
736 Usage : if( $eventgenerator->within_element($element) ) {}
737 Function: Test if we are within a particular element
738 This is different than 'in' because within can be tested
741 Args : string element name
746 my ( $self, $name ) = @_;
749 || !defined $self->{'_elements'}
750 || scalar @
{ $self->{'_elements'} } == 0 );
751 foreach ( @
{ $self->{'_elements'} } ) {
752 return 1 if ( $_ eq $name );
760 Usage : if( $eventgenerator->in_element($element) ) {}
761 Function: Test if we are in a particular element
762 This is different than 'within' because 'in' only
763 tests its immediate parent.
765 Args : string element name
770 my ( $self, $name ) = @_;
771 return 0 if !defined $self->{'_elements'}->[0];
772 return ( $self->{'_elements'}->[0] eq $name );
775 =head2 start_document
777 Title : start_document
778 Usage : $eventgenerator->start_document
779 Function: Handle a start document event
787 $self->{'_lasttype'} = '';
788 $self->{'_values'} = {};
789 $self->{'_result'} = undef;
790 $self->{'_elements'} = [];
796 Usage : $eventgenerator->end_document
797 Function: Handles an end document event
798 Returns : Bio::Search::Result::ResultI object
805 return $self->{'_result'};
811 Usage : my $count = $searchio->result_count
812 Function: Returns the number of results we have processed
820 return $self->{'_result_count'};
826 Usage : my $model = $parser->model();
827 Function: Get/Set model; Infernal currently does not output
828 the model name (Rfam ID)
829 Returns : String (name of model)
830 Args : [optional] String (name of model)
836 return $self->{'_model'} = shift if @_;
837 return $self->{'_model'};
843 Usage : my $database = $parser->database();
844 Function: Get/Set database; pre-v.1 versions of Infernal do not output
846 Returns : String (database name)
847 Args : [optional] String (database name)
853 return $self->{'_database'} = shift if @_;
854 return $self->{'_database'};
860 Usage : my $algorithm = $parser->algorithm();
861 Function: Get/Set algorithm; pre-v.1 versions of Infernal do not output
863 Returns : String (algorithm name)
864 Args : [optional] String (algorithm name)
870 return $self->{'_algorithm'} = shift if @_;
871 return $self->{'_algorithm'};
874 =head2 query_accession
876 Title : query_accession
877 Usage : my $acc = $parser->query_accession();
878 Function: Get/Set query (model) accession; pre-v1.1 Infernal does not output
879 the accession number (Rfam accession #)
880 Returns : String (accession)
881 Args : [optional] String (accession)
885 sub query_accession
{
887 return $self->{'_query_accession'} = shift if @_;
888 return $self->{'_query_accession'};
891 =head2 query_description
893 Title : query_description
894 Usage : my $acc = $parser->query_description();
895 Function: Get/Set query (model) description; pre-v1.1 Infernal does not output
897 Returns : String (description)
898 Args : [optional] String (description)
902 sub query_description
{
904 return $self->{'_query_description'} = shift if @_;
905 return $self->{'_query_description'};
911 Usage : my $cutoff = $parser->hsp_minscore();
912 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
913 Returns : score (number)
914 Args : [optional] score (number)
920 return $self->{'_hsp_minscore'} = shift if @_;
921 return $self->{'_hsp_minscore'};
927 Usage : $parser->convert_meta(1);
928 Function: Get/Set boolean flag for converting Infernal WUSS format
929 to a simple bracketed format (simple WUSS by default)
930 Returns : boolean flag (TRUE or FALSE)
931 Args : [optional] boolean (eval's to TRUE or FALSE)
937 return $self->{'_convert_meta'} = shift if @_;
938 return $self->{'_convert_meta'};
944 Usage : $parser->version();
945 Function: Set the Infernal cmsearch version
947 Args : [optional] version
953 return $self->{'_version'} = shift if @_;
954 return $self->{'_version'};
957 =head2 structure_symbols
959 Title : structure_symbols
960 Usage : my $hashref = $parser->structure_symbols();
961 Function: Get/Set RNA structure symbols
962 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
963 : default = < (5-prime)
968 Args : Hash ref of substitute delimiters, using above keys.
972 sub structure_symbols
{
973 my ($self, $delim) = @_;
975 if (ref($delim) =~ m{HASH}) {
976 my %data = %{ $delim };
977 for my $d (@VALID_SYMBOLS) {
978 if ( exists $data{$d} ) {
979 $self->{'_delimiter'}->{$d} = $data{$d};
983 $self->throw("Args to helix_delimiters() should be in a hash reference");
986 return $self->{'_delimiter'};
992 Usage : my $string = $parser->simple_meta($str);
993 Function: converts more complex WUSS meta format into simple bracket format
994 using symbols defined in structure_symbols()
995 Returns : converted string
996 Args : [required] string to convert
997 Note : This is a very simple conversion method to get simple bracketed
998 format from Infernal data. If the convert_meta() flag is set,
999 this is the method used to convert the strings.
1004 my ($self, $str) = @_;
1005 $self->throw("No string arg sent!") if !$str;
1006 my $structs = $self->structure_symbols();
1007 my ($ls, $rs, $ss, $unk, $gap) = ($structs->{'5-prime'}, $structs->{'3-prime'},
1008 $structs->{'single-strand'}, $structs->{'unknown'},
1010 $str =~ s{[\(\<\[\{]}{$ls}g;
1011 $str =~ s{[\)\>\]\}]}{$rs}g;
1012 $str =~ s{[:,_-]}{$ss}g;
1013 $str =~ s{\.}{$gap}g;
1014 # unknown not handled yet
1020 # this is a hack which guesses the format and sets the handler for parsing in
1021 # an instance; it'll be taken out when infernal 1.0 is released
1027 my ($accession, $description) = ($self->query_accession, $self->query_description);
1028 my ($maxscore, $mineval, $minpval);
1029 $self->start_document();
1030 my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend);
1032 while (my $line = $self->_readline) {
1033 next if $line =~ m{^\s+$};
1034 # stats aren't parsed yet...
1035 if ($line =~ m{^\#\s+cmsearch}xms) {
1037 $self->start_element({'Name' => 'Result'});
1038 $self->element_hash({
1039 'Infernal_program' => 'CMSEARCH'
1042 elsif ($line =~ m{^\#\sINFERNAL\s+(\d+\.\d+)}xms) {
1043 $self->element_hash({
1044 'Infernal_version' => $1,
1047 elsif ($line =~ m{^\#\scommand:.*?\s(\S+)$}xms) {
1048 $self->element_hash({
1049 'Infernal_db' => $1,
1052 elsif ($line =~ m{^\#\s+dbsize\(Mb\):\s+(\d+\.\d+)}xms) {
1053 # store absolute DB length
1054 $self->element_hash({
1055 'Infernal_db-let' => $1 * 1e6
1058 elsif ($line =~ m{^CM(?:\s(\d+))?:\s*(\S+)}xms) {
1059 # not sure, but it's possible single reports may contain multiple
1060 # models; if so, they should be rolled over into a new ResultI
1061 #print STDERR "ACC: $accession\nDESC: $description\n";
1062 $self->element_hash({
1063 'Infernal_query-def' => $2, # present in output now
1064 'Infernal_query-acc' => $accession,
1065 'Infernal_querydesc' => $description
1068 elsif ($line =~ m{^>\s*(\S+)} ){
1069 #$self->debug("Start Hit: Found hit:$1\n");
1070 if ($self->in_element('hit')) {
1071 $self->element_hash({'Hit_score' => $maxscore,
1072 'Hit_bits' => $maxscore});
1073 ($maxscore, $minpval, $mineval) = undef;
1074 $self->end_element({'Name' => 'Hit'});
1079 ^\sQuery\s
=\s\d
+\s
-\s\d
+,\s
# Query start/end
1080 Target\s
=\s
(\d
+)\s
-\s
(\d
+) # Target start/end
1082 # Query (model) start/end always the same, determined from
1084 ($laststart, $lastend) = ($1, $2);
1085 #$self->debug("Found hit coords:$laststart - $lastend\n");
1086 } elsif ($line =~ m
{
1087 ^\sScore\s
=\s
([\d\
.]+),\s
# Score = Bitscore (for now)
1088 (?
:E\s
=\s
([\d\
.e
-]+),\s
# E-val optional
1089 P\s
=\s
([\d\
.e
-]+),\s
)?
# P-val optional
1090 GC\s
= # GC not captured
1093 ($lastscore, $lasteval, $lastpval) = ($1, $2, $3);
1094 #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||''));
1095 $maxscore ||= $lastscore;
1096 if ($lasteval && $lastpval) {
1097 $mineval ||= $lasteval;
1098 $minpval ||= $lastpval;
1099 $mineval = ($mineval > $lasteval) ?
$lasteval :
1101 $minpval = ($minpval > $lastpval) ?
$lastpval :
1104 $maxscore = ($maxscore < $lastscore) ?
$lastscore :
1106 if (!$self->within_element('hit')) {
1107 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1108 $self->start_element({'Name' => 'Hit'});
1109 $self->element_hash({
1110 'Hit_id' => $lasthit,
1111 'Hit_accession' => $ver ?
"$acc.$ver" :
1112 $acc ?
$acc : $lasthit,
1116 if (!$self->in_element('hsp')) {
1117 $self->start_element({'Name' => 'Hsp'});
1120 # hsp is similar to older output
1121 } elsif ($line =~ m{^(\s+)[<>\{\}\
(\
)\
[\
]:_
,-\
.]+}xms
) { # start of HSP
1122 $self->_pushback($line); # set up for loop
1123 #$self->debug("Start HSP\n");
1124 # what is length of the gap to the structure data?
1125 my $offset = length($1);
1126 my ($ct, $strln) = 0;
1129 my %hsp_key = ('0' => 'meta',
1134 while (defined ($line = $self->_readline)) {
1136 next if (!$line); # toss empty lines
1137 # next if $line =~ m{^\s*$}; # toss empty lines
1138 # it is possible to have homology lines consisting
1139 # entirely of spaces if the subject has a large
1140 # insertion where nothing matches the model
1142 # exit loop if at end of file or upon next hit/HSP
1143 if ($line =~ m{^\s{0,2}\S
+}) {
1144 $self->_pushback($line);
1147 # iterate to keep track of each line (4 lines per hsp block)
1148 my $iterator = $ct % 4;
1149 # strlen set only with structure lines (proper length)
1150 $strln = length($line) if $iterator == 0;
1151 # only grab the data needed (hit start and stop in hit line above)
1153 my $data = substr($line, $offset, $strln-$offset);
1154 $hsp->{ $hsp_key{$iterator} } .= $data;
1159 # query start, end are from the actual query length (entire hit is
1160 # mapped to CM data, so all CM data is represented)
1162 if ($self->in_element('hsp')) {
1163 # In some cases with HSPs unaligned residues are present in
1164 # the hit or query (Ex: '*[ 8]*' is 8 unaligned residues).
1165 # This info needs to be passed on unmodifed to the HSP class
1166 # and handled there as it is subjectively changed based on
1170 # catch any insertions and add them into the actual length
1171 while ($hsp->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) {
1174 # add on the actual residues
1175 $strlen += $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1177 my $metastr = ($self->convert_meta) ?
($self->simple_meta($hsp->{'meta'})) :
1179 $self->element_hash(
1180 {'Hsp_stranded' => 'HIT',
1181 'Hsp_qseq' => $hsp->{'query'},
1182 'Hsp_hseq' => $hsp->{'hit'},
1183 'Hsp_midline' => $hsp->{'midline'},
1184 'Hsp_structure' => $metastr,
1185 'Hsp_query-from' => 1,
1186 'Infernal_query-len' => $strlen,
1187 'Hsp_query-to' => $strlen,
1188 'Hsp_hit-from' => $laststart,
1189 'Hsp_hit-to' => $lastend,
1190 'Hsp_score' => $lastscore,
1191 'Hsp_bit-score' => $lastscore,
1193 $self->element_hash(
1194 {'Hsp_evalue' => $lasteval,
1195 'Hsp_pvalue' => $lastpval,
1196 }) if ($lasteval && $lastpval);
1197 $self->end_element({'Name' => 'Hsp'});
1199 # result now ends with // and 'Fin'
1200 } elsif ($line =~ m{^//}xms ) {
1201 if ($self->within_element('result') && $seentop) {
1202 if ($self->in_element('hit')) {
1203 $self->element_hash({'Hit_score' => $maxscore,
1204 'Hit_bits' => $maxscore});
1205 # don't know where to put minpval yet
1206 $self->element_hash({'Hit_signif' => $mineval}) if $mineval;
1207 $self->element_hash({'Hit_p' => $minpval}) if $minpval;
1208 $self->end_element({'Name' => 'Hit'});
1214 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1215 $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1216 return $self->end_document();
1219 # cmsearch 0.81 (pre-1.0)
1224 my ($accession, $db, $algorithm, $description, $version) =
1225 ($self->query_accession, $self->database, $self->algorithm,
1226 $self->query_description, '0.81');
1227 my ($maxscore, $mineval, $minpval);
1228 $self->start_document();
1229 my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend);
1231 while (my $line = $self->_readline) {
1232 next if $line =~ m{^\s+$};
1233 # stats aren't parsed yet...
1234 if ($line =~ m{CM\s\d+:\s*(\S+)}xms) {
1235 #$self->debug("Start Result: Found model:$1\n");
1236 if (!$self->within_element('result')) {
1238 $self->start_element({'Name' => 'Result'});
1239 $self->element_hash({
1240 'Infernal_program' => $algorithm,
1241 'Infernal_query-def' => $1, # present in output now
1242 'Infernal_query-acc' => $accession,
1243 'Infernal_querydesc' => $description,
1244 'Infernal_db' => $db
1247 } elsif ($line =~ m{^>\s*(\S+)} ){
1248 #$self->debug("Start Hit: Found hit:$1\n");
1249 if ($self->in_element('hit')) {
1250 $self->element_hash({'Hit_score' => $maxscore,
1251 'Hit_bits' => $maxscore});
1252 ($maxscore, $minpval, $mineval) = undef;
1253 $self->end_element({'Name' => 'Hit'});
1258 ^\sQuery\s
=\s\d
+\s
-\s\d
+,\s
# Query start/end
1259 Target\s
=\s
(\d
+)\s
-\s
(\d
+) # Target start/end
1261 # Query (model) start/end always the same, determined from
1263 ($laststart, $lastend) = ($1, $2);
1264 #$self->debug("Found hit coords:$laststart - $lastend\n");
1265 } elsif ($line =~ m
{
1266 ^\sScore\s
=\s
([\d\
.]+),\s
# Score = Bitscore (for now)
1267 (?
:E\s
=\s
([\d\
.e
-]+),\s
# E-val optional
1268 P\s
=\s
([\d\
.e
-]+),\s
)?
# P-val optional
1269 GC\s
= # GC not captured
1272 ($lastscore, $lasteval, $lastpval) = ($1, $2, $3);
1273 #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||''));
1274 $maxscore ||= $lastscore;
1275 if ($lasteval && $lastpval) {
1276 $mineval ||= $lasteval;
1277 $minpval ||= $lastpval;
1278 $mineval = ($mineval > $lasteval) ?
$lasteval :
1280 $minpval = ($minpval > $lastpval) ?
$lastpval :
1283 $maxscore = ($maxscore < $lastscore) ?
$lastscore :
1285 if (!$self->within_element('hit')) {
1286 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1287 $self->start_element({'Name' => 'Hit'});
1288 $self->element_hash({
1289 'Hit_id' => $lasthit,
1290 'Hit_accession' => $ver ?
"$acc.$ver" :
1291 $acc ?
$acc : $lasthit,
1295 if (!$self->in_element('hsp')) {
1296 $self->start_element({'Name' => 'Hsp'});
1299 # hsp is similar to older output
1300 } elsif ($line =~ m{^(\s+)[<>\{\}\
(\
)\
[\
]:_
,-\
.]+}xms
) { # start of HSP
1301 $self->_pushback($line); # set up for loop
1302 #$self->debug("Start HSP\n");
1303 # what is length of the gap to the structure data?
1304 my $offset = length($1);
1305 my ($ct, $strln) = 0;
1308 my %hsp_key = ('0' => 'meta',
1313 while (defined ($line = $self->_readline)) {
1315 next if (!$line); # toss empty lines
1316 # next if $line =~ m{^\s*$}; # toss empty lines
1317 # it is possible to have homology lines consisting
1318 # entirely of spaces if the subject has a large
1319 # insertion where nothing matches the model
1321 # exit loop if at end of file or upon next hit/HSP
1322 if ($line =~ m{^\s{0,2}\S
+}) {
1323 $self->_pushback($line);
1326 # iterate to keep track of each line (4 lines per hsp block)
1327 my $iterator = $ct%4;
1328 # strlen set only with structure lines (proper length)
1329 $strln = length($line) if $iterator == 0;
1330 # only grab the data needed (hit start and stop in hit line above)
1332 my $data = substr($line, $offset, $strln-$offset);
1333 $hsp->{ $hsp_key{$iterator} } .= $data;
1338 # query start, end are from the actual query length (entire hit is
1339 # mapped to CM data, so all CM data is represented)
1341 if ($self->in_element('hsp')) {
1342 my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1345 $metastr = ($self->convert_meta) ?
($self->simple_meta($hsp->{'meta'})) :
1347 $self->element_hash(
1348 {'Hsp_stranded' => 'HIT',
1349 'Hsp_qseq' => $hsp->{'query'},
1350 'Hsp_hseq' => $hsp->{'hit'},
1351 'Hsp_midline' => $hsp->{'midline'},
1352 'Hsp_structure' => $metastr,
1353 'Hsp_query-from' => 1,
1354 'Infernal_query-len' => $strlen,
1355 'Hsp_query-to' => $strlen,
1356 'Hsp_hit-from' => $laststart,
1357 'Hsp_hit-to' => $lastend,
1358 'Hsp_score' => $lastscore,
1359 'Hsp_bit-score' => $lastscore,
1361 $self->element_hash(
1362 {'Hsp_evalue' => $lasteval,
1363 'Hsp_pvalue' => $lastpval,
1364 }) if ($lasteval && $lastpval);
1365 $self->end_element({'Name' => 'Hsp'});
1367 # result now ends with // and 'Fin'
1368 } elsif ($line =~ m{^//}xms ) {
1369 if ($self->within_element('result') && $seentop) {
1371 {'Name' => 'Infernal_version',
1374 if ($self->in_element('hit')) {
1375 $self->element_hash({'Hit_score' => $maxscore,
1376 'Hit_bits' => $maxscore});
1377 # don't know where to put minpval yet
1378 $self->element_hash({'Hit_signif' => $mineval}) if $mineval;
1379 $self->end_element({'Name' => 'Hit'});
1385 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1386 $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1387 return $self->end_document();
1390 # cmsearch 0.72 and below; will likely be dropped when Infernal 1.0 is released
1395 my ($accession, $db, $algorithm, $model, $description, $version) =
1396 ($self->query_accession, $self->database, $self->algorithm,
1397 $self->model, $self->query_description, $self->version);
1399 my $cutoff = $self->hsp_minscore;
1400 $self->start_document();
1403 my ($lasthit, $lastscore, $laststart, $lastend);
1406 while ( defined( $line = $self->_readline ) ) {
1407 next if $line =~ m{^\s+$};
1408 # bypass this for now...
1409 next if $line =~ m{^HMM\shit};
1411 if ($line =~ m{^sequence:\s+(\S+)} ){
1412 if (!$self->within_element('result')) {
1414 $self->start_element({'Name' => 'Result'});
1415 $self->element_hash({
1416 'Infernal_program' => $algorithm,
1417 'Infernal_query-def' => $model,
1418 'Infernal_query-acc' => $accession,
1419 'Infernal_querydesc' => $description,
1420 'Infernal_db' => $db
1423 if ($self->in_element('hit')) {
1424 $self->element_hash({'Hit_score' => $maxscore,
1425 'Hit_bits' => $maxscore});
1427 $self->end_element({'Name' => 'Hit'});
1430 } elsif ($line =~ m{^hit\s+\d+\s+:\s+(\d+)\s+(\d+)\s+(\d+\.\d+)\s+bits}xms) {
1431 ($laststart, $lastend, $lastscore) = ($1, $2, $3);
1432 $maxscore = $lastscore unless $maxscore;
1433 if ($lastscore > $cutoff) {
1434 if (!$self->within_element('hit')) {
1435 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1436 $self->start_element({'Name' => 'Hit'});
1437 $self->element_hash({
1438 'Hit_id' => $lasthit,
1439 'Hit_accession' => $ver ?
"$acc.$ver" :
1440 $acc ?
$acc : $lasthit,
1444 # necessary as infernal 0.71 has repeated hit line
1445 if (!$self->in_element('hsp')) {
1446 $self->start_element({'Name' => 'Hsp'});
1448 $maxscore = ($maxscore < $lastscore) ?
$lastscore :
1451 } elsif ($line =~ m{^(\s+)[<>\{\}\
(\
)\
[\
]:_
,-\
.]+}xms
) { # start of HSP
1452 $self->_pushback($line); # set up for loop
1453 # what is length of the gap to the structure data?
1454 my $offset = length($1);
1455 my ($ct, $strln) = 0;
1458 my %hsp_key = ('0' => 'meta',
1463 while ($line = $self->_readline) {
1464 next if $line =~ m{^\s*$}; # toss empty lines
1466 # exit loop if at end of file or upon next hit/HSP
1467 if (!defined($line) || $line =~ m{^\S+}) {
1468 $self->_pushback($line);
1471 # iterate to keep track of each line (4 lines per hsp block)
1472 my $iterator = $ct%4;
1473 # strlen set only with structure lines (proper length)
1474 $strln = length($line) if $iterator == 0;
1475 # only grab the data needed (hit start and stop in hit line above)
1477 my $data = substr($line, $offset, $strln-$offset);
1478 $hsp->{ $hsp_key{$iterator} } .= $data;
1481 # query start, end are from the actual query length (entire hit is
1482 # mapped to CM data, so all CM data is represented)
1484 if ($self->in_element('hsp')) {
1485 my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1488 # Ugh...these should be passed in a hash
1489 $metastr = ($self->convert_meta) ?
($self->simple_meta($hsp->{'meta'})) :
1491 $self->element_hash(
1492 {'Hsp_stranded' => 'HIT',
1493 'Hsp_qseq' => $hsp->{'query'},
1494 'Hsp_hseq' => $hsp->{'hit'},
1495 'Hsp_midline' => $hsp->{'midline'},
1496 'Hsp_structure' => $metastr,
1497 'Hsp_query-from' => 1,
1498 'Infernal_query-len' => $strlen,
1499 'Hsp_query-to' => $strlen,
1500 'Hsp_hit-from' => $laststart,
1501 'Hsp_hit-to' => $lastend,
1502 'Hsp_score' => $lastscore,
1503 'Hsp_bit-score' => $lastscore
1505 $self->end_element({'Name' => 'Hsp'});
1507 } elsif ($line =~ m{^memory}xms || $line =~ m{^CYK\smemory}xms ) {
1508 if ($self->within_element('result') && $seentop) {
1510 {'Name' => 'Infernal_version',
1513 if ($self->in_element('hit')) {
1514 $self->element_hash({'Hit_score' => $maxscore,
1515 'Hit_bits' => $maxscore});
1516 $self->end_element({'Name' => 'Hit'});
1522 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1523 $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1524 return $self->end_document();