2 # BioPerl module for Bio::SearchIO::rnamotif
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
20 # do not call this module directly. Use Bio::SearchIO.
24 This is a highly experimental SearchIO-based parser for output from the rnamotif
25 program (one of the programs in the RNAMotif suite). It currently parses only
26 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
27 but will not be supported. rmfmt output will not be supported at this time.
33 User feedback is an integral part of the evolution of this and other
34 Bioperl modules. Send your comments and suggestions preferably to
35 the Bioperl mailing list. Your participation is much appreciated.
37 bioperl-l@bioperl.org - General discussion
38 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42 Please direct usage questions or support issues to the mailing list:
44 I<bioperl-l@bioperl.org>
46 rather than to the module maintainer directly. Many experienced and
47 reponsive experts will be able look at the problem and quickly
48 address it. Please include a thorough description of the problem
49 with code and data examples if at all possible.
53 Report bugs to the Bioperl bug tracking system to help us keep track
54 of the bugs and their resolution. Bug reports can be submitted via the
57 https://github.com/bioperl/bioperl-live/issues
59 =head1 AUTHOR - Chris Fields
61 Email cjfields-at-uiuc-dot-edu
65 The rest of the documentation details each of the object methods.
66 Internal methods are usually preceded with a _
70 # Let the code begin...
72 package Bio
::SearchIO
::rnamotif
;
75 use base
qw(Bio::SearchIO);
84 # commented out tags have not been assigned
86 'Hsp_score' => 'HSP-score',
87 'Hsp_custom-data' => 'HSP-custom_score',
89 # rnamotif has no evalue
91 # descriptor has no start, end; same as hit start, end
92 'Hsp_query-from' => 'HSP-query_start',
93 'Hsp_query-to' => 'HSP-query_end',
94 'Hsp_hit-from' => 'HSP-hit_start',
95 'Hsp_hit-to' => 'HSP-hit_end',
97 # descriptor has no start, end
99 'Hsp_hseq' => 'HSP-hit_seq',
100 'Hsp_align-len' => 'HSP-hsp_length',
102 # build this from scratch, simple WUSS-format
103 'Hsp_structure' => 'HSP-meta',
104 'Hsp_stranded' => 'HSP-stranded',
106 # not supported for RNAMotif
108 'Hit_id' => 'HIT-name',
109 'Hit_accession' => 'HIT-accession',
110 'Hit_gi' => 'HIT-ncbi_gi',
111 'Hit_def' => 'HIT-description',
112 'Hit_score' => 'HIT-score', # best HSP score
114 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
115 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
116 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
117 # No length (query is a descriptor)
118 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
119 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
120 'RNAMotif_db' => 'RESULT-database_name', # get/set
123 # use structure_delimiters to set custom delimiters
125 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
126 my %STRUCTURE_SYMBOLS = (
129 'single-strand' => '.',
131 # may add more for quartets, triplets
134 my $DEFAULT_VERSION = '3.0.3';
139 Usage : my $obj = Bio::SearchIO->new();
140 Function: Builds a new Bio::SearchIO::rnamotif object
141 Returns : Bio::SearchIO::rnamotif parser
142 Args : -fh/-file => RNAMotif filename
143 -format => 'rnamotif'
144 -model => query model (or descriptor, in this case)
145 -database => database name (default undef)
146 -query_acc => query accession (default undef)
147 -hsp_minscore => minimum HSP score cutoff
148 -hsp_maxscore => maximum HSP score cutoff
149 -symbols => hash ref of structure symbols to use
150 (default symbols in %STRUCTURE_SYMBOLS hash)
155 my ( $self, @args ) = @_;
156 $self->SUPER::_initialize
(@args);
157 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
158 $accession, $symbols) =
159 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
160 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
161 my $handler = $self->_eventHandler;
162 $handler->register_factory(
164 Bio
::Factory
::ObjectFactory
->new(
165 -type
=> 'Bio::Search::Result::GenericResult',
166 -interface
=> 'Bio::Search::Result::ResultI',
167 -verbose
=> $self->verbose
171 $handler->register_factory(
173 Bio
::Factory
::ObjectFactory
->new(
174 -type
=> 'Bio::Search::Hit::ModelHit',
175 -interface
=> 'Bio::Search::Hit::HitI',
176 -verbose
=> $self->verbose
180 $handler->register_factory(
182 Bio
::Factory
::ObjectFactory
->new(
183 -type
=> 'Bio::Search::HSP::ModelHSP',
184 -interface
=> 'Bio::Search::HSP::HSPI',
185 -verbose
=> $self->verbose
188 $model && $self->model($model);
189 $database && $self->database($database);
190 $accession && $self->query_accession($accession);
191 $version ||= $DEFAULT_VERSION;
192 $self->algorithm_version($version);
193 $self->throw("Cannot define both a minimal and maximal cutoff")
194 if (defined($mincutoff) && defined($maxcutoff));
195 defined($mincutoff) && $self->hsp_minscore($mincutoff);
196 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
197 $symbols ||= \
%STRUCTURE_SYMBOLS;
198 $self->structure_symbols($symbols);
204 Usage : my $hit = $searchio->next_result;
205 Function: Returns the next Result from a search
206 Returns : Bio::Search::Result::ResultI object
216 my ($rm, $d, $descriptor, $file, $oktobuild);
217 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
220 # user-determined Result data
221 my ($accession, $db, $model) =
222 ($self->query_accession, $self->database, $self->model);
223 # HSP building options
224 my $hsp_min = $self->hsp_minscore;
225 my $hsp_max = $self->hsp_maxscore;
226 my $version = $self->algorithm_version;
229 my $verbose = $self->verbose; # cache for speed?
230 $self->start_document();
232 while ( defined( my $line = $self->_readline ) ) {
234 next if $line =~ m{^\s+$};
235 if (index($line,'#RM') == 0) {
236 if (index($line,'#RM scored') == 0 ) {
238 $self->_pushback($line);
241 $self->start_element({'Name' => 'Result'});
242 $self->element_hash({
243 'RNAMotif_program' => 'rnamotif',
244 'RNAMotif_version' => $version,
245 'RNAMotif_query-acc' => $accession,
249 #$self->debug("Start result\n");
250 } elsif (index($line,'#RM descr') == 0) {
251 ($rm, $d, $descriptor) = split ' ', $line, 3;
252 # toss $rm, $d; keep $descr
254 $self->{'_descriptor'} = $descriptor;
256 {'Name' => 'RNAMotif_querydesc',
257 'Data' => $descriptor}
259 } elsif(index($line,'#RM dfile') == 0) {
260 ($rm, $d, $file) = split ' ', $line, 3;
261 # toss $rm, $d; keep $file
264 {'Name' => 'RNAMotif_query-def',
268 $self->debug("Unrecognized line: $line");
270 } elsif ($line =~ s{^>}{}) {
272 ($hitid, $hitdesc) = split ' ',$line,2;
274 if ($self->within_element('hit') && ($hitid ne $lastid)) {
276 {'Name' => 'Hit_score',
277 'Data' => $lastscore}
279 $self->end_element({'Name' => 'Hit'});
280 $self->start_element({'Name' => 'Hit'});
281 } elsif (!$self->within_element('hit')) {
282 $self->start_element({'Name' => 'Hit'});
284 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
286 $self->element_hash({
289 'Hit_accession' => $ver ?
"$acc.$ver" :
290 $acc ?
$acc : $hitid,
291 'Hit_def' => $hitdesc}
294 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
297 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
298 $score *= 1; # implicitly cast any odd '0.000' to float
300 unless ($hitid eq $hspid) {
301 $self->throw("IDs do not match!");
303 # check score for possible sprintf data, mark as such, cache result
304 if (!defined($sprintf)) {
305 if ($score =~ m{[^0-9.-]+}gxms) {
306 if (defined $hsp_min || defined $hsp_max ) {
307 $self->warn("HSP data likely contains custom score; ".
308 "ignoring min/maxscore");
310 $sprintf = $oktobuild = 1;
317 if (($hsp_min && $score <= $hsp_min)
318 || ($hsp_max && ($score >= $hsp_max)) ) {
324 # store best hit score based on the hsp min/maxscore only
325 if (defined $hsp_min && $score > $hsp_min) {
326 $lastscore = $score if !$lastscore || $score > $lastscore;
327 } elsif (defined $hsp_max && $score < $hsp_max) {
328 $lastscore = $score if !$lastscore || $score < $lastscore;
336 # calculate start/end
338 $end = $start + $length -1;
340 $end = $start - $length + 1;
343 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
345 $self->start_element({'Name' => 'Hsp'});
346 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
347 $self->element_hash({
348 'Hsp_stranded' => 'HIT',
350 'Hsp_query-from' => 1,
351 'Hsp_query-to' =>length($rna),
352 'Hsp_hit-from' => $start,
353 'Hsp_hit-to' => $end,
354 'Hsp_structure' => $meta,
355 'Hsp_align-len' => length($rna),
356 'Hsp_score' => $sprintf ?
undef : $score,
357 'Hsp_custom-data' => $sprintf ?
$score : undef,
359 $self->end_element({'Name' => 'Hsp'});
360 $oktobuild = 0 if (!$sprintf);
364 if ($self->within_element('hit')) {
366 {'Name' => 'Hit_score',
367 'Data' => $lastscore}
369 $self->end_element( { 'Name' => 'Hit' } );
372 $self->end_element( { 'Name' => 'Result' } );
374 return $self->end_document();
379 Title : start_element
380 Usage : $eventgenerator->start_element
381 Function: Handles a start element event
383 Args : hashref with at least 2 keys 'Data' and 'Name'
389 my ( $self, $data ) = @_;
391 # we currently don't care about attributes
392 my $nm = $data->{'Name'};
393 my $type = $MODEMAP{$nm};
395 if ( $self->_eventHandler->will_handle($type) ) {
396 my $func = sprintf( "start_%s", lc $type );
397 $self->_eventHandler->$func( $data->{'Attributes'} );
399 unshift @
{ $self->{'_elements'} }, $type;
402 && $type eq 'result' )
404 $self->{'_values'} = {};
405 $self->{'_result'} = undef;
411 Title : start_element
412 Usage : $eventgenerator->end_element
413 Function: Handles an end element event
415 Args : hashref with at least 2 keys, 'Data' and 'Name'
421 my ( $self, $data ) = @_;
422 my $nm = $data->{'Name'};
423 my $type = $MODEMAP{$nm};
427 if ( $self->_eventHandler->will_handle($type) ) {
428 my $func = sprintf( "end_%s", lc $type );
429 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
430 $self->{'_values'} );
432 my $lastelem = shift @
{ $self->{'_elements'} };
434 elsif ( $MAPPING{$nm} ) {
435 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
436 my $key = ( keys %{ $MAPPING{$nm} } )[0];
437 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
438 $self->{'_last_data'};
441 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
445 $self->debug("unknown nm $nm, ignoring\n");
447 $self->{'_last_data'} = ''; # remove read data if we are at
449 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
456 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
457 Function: Convenience method that calls start_element, characters, end_element
459 Args : Hash ref with the keys 'Name' and 'Data'
465 my ( $self, $data ) = @_;
466 # simple data calls (%MAPPING) do not need start_element
467 $self->characters($data);
468 $self->end_element($data);
475 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
476 'Hsp_hit-to' => $end,
477 'Hsp_score' => $lastscore});
478 Function: Convenience method that takes multiple simple data elements and
479 maps to appropriate parameters
481 Args : Hash ref with the mapped key (in %MAPPING) and value
486 my ($self, $data) = @_;
487 $self->throw("Must provide data hash ref") if !$data || !ref($data);
488 for my $nm (sort keys %{$data}) {
489 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
490 if ( $MAPPING{$nm} ) {
491 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
492 my $key = ( keys %{ $MAPPING{$nm} } )[0];
493 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
497 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
506 Usage : $eventgenerator->characters($str)
507 Function: Send a character events
515 my ( $self, $data ) = @_;
516 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
517 $self->{'_last_data'} = $data->{'Data'};
520 =head2 within_element
522 Title : within_element
523 Usage : if( $eventgenerator->within_element($element) ) {}
524 Function: Test if we are within a particular element
525 This is different than 'in' because within can be tested
528 Args : string element name
533 my ( $self, $name ) = @_;
536 || !defined $self->{'_elements'}
537 || scalar @
{ $self->{'_elements'} } == 0 );
538 foreach ( @
{ $self->{'_elements'} } ) {
539 return 1 if ( $_ eq $name );
547 Usage : if( $eventgenerator->in_element($element) ) {}
548 Function: Test if we are in a particular element
549 This is different than 'within' because 'in' only
550 tests its immediate parent.
552 Args : string element name
557 my ( $self, $name ) = @_;
558 return 0 if !defined $self->{'_elements'}->[0];
559 return ( $self->{'_elements'}->[0] eq $name );
562 =head2 start_document
564 Title : start_document
565 Usage : $eventgenerator->start_document
566 Function: Handle a start document event
574 $self->{'_lasttype'} = '';
575 $self->{'_values'} = {};
576 $self->{'_result'} = undef;
577 $self->{'_elements'} = [];
583 Usage : $eventgenerator->end_document
584 Function: Handles an end document event
585 Returns : Bio::Search::Result::ResultI object
592 return $self->{'_result'};
598 Usage : my $count = $searchio->result_count
599 Function: Returns the number of results we have processed
607 return $self->{'_result_count'};
613 Usage : my $descr = $parser->descriptor();
614 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
615 descriptor name to the output. This also overrides any name found
617 Returns : String (name of model)
618 Args : [optional] String (name of model)
624 return $self->{'_descriptor'} = shift if @_;
625 return $self->{'_descriptor'};
631 Usage : my $model = $parser->model();
632 Function: Get/Set model; Infernal currently does not output
633 the model name (Rfam ID)
634 Returns : String (name of model)
635 Args : [optional] String (name of model)
636 Note : this is a synonym for descriptor()
640 sub model
{ shift->descriptor(@_) }
645 Usage : my $database = $parser->database();
646 Function: Get/Set database; Infernal currently does not output
648 Returns : String (database name)
649 Args : [optional] String (database name)
655 return $self->{'_database'} = shift if @_;
656 return $self->{'_database'};
659 =head2 query_accession
661 Title : query_accession
662 Usage : my $acc = $parser->query_accession();
663 Function: Get/Set query (model) accession; RNAMotif currently does not output
665 Returns : String (accession)
666 Args : [optional] String (accession)
670 sub query_accession
{
672 return $self->{'_query_accession'} = shift if @_;
673 return $self->{'_query_accession'};
676 =head2 algorithm_version
678 Title : algorithm_version
679 Usage : my $ver = $parser->algorithm_version();
680 Function: Get/Set algorithm version (not defined in RNAMotif output)
681 Returns : String (accession)
682 Args : [optional] String (accession)
686 sub algorithm_version
{
688 return $self->{'_algorithm'} = shift if @_;
689 return $self->{'_algorithm'};
695 Usage : my $cutoff = $parser->hsp_minscore();
696 Function: Get/Set min score cutoff (for generating Hits/HSPs).
697 Returns : score (number)
698 Args : [optional] score (number)
699 Note : Cannot be set along with hsp_maxscore()
704 my ($self, $score) = shift;
705 $self->throw('Minscore not set to a number') if
706 ($score && $score !~ m{[0-9.]+});
707 return $self->{'_hsp_minscore'} = shift if @_;
708 return $self->{'_hsp_minscore'};
714 Usage : my $cutoff = $parser->hsp_maxscore();
715 Function: Get/Set max score cutoff (for generating Hits/HSPs).
716 Returns : score (number)
717 Args : [optional] score (number)
718 Note : Cannot be set along with hsp_minscore()
723 my ($self, $score) = shift;
724 $self->throw('Maxscore not set to a number') if
725 ($score && $score !~ m{[0-9.]+});
726 return $self->{'_hsp_maxscore'} = shift if @_;
727 return $self->{'_hsp_maxscore'};
730 =head2 structure_symbols
732 Title : structure_symbols
733 Usage : my $hashref = $parser->structure_symbols();
734 Function: Get/Set RNA structure symbols
735 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
736 : default = < (5-prime)
740 Args : Hash ref of substitute delimiters, using above keys.
744 sub structure_symbols
{
745 my ($self, $delim) = @_;
747 if (ref($delim) =~ m{HASH}) {
748 my %data = %{ $delim };
749 for my $d (@VALID_SYMBOLS) {
750 if ( exists $data{$d} ) {
751 $self->{'_delimiter'}->{$d} = $data{$d};
755 $self->throw("Args to helix_delimiters() should be in a hash reference");
758 return $self->{'_delimiter'};
766 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
767 Function: Creates meta string from sequence and descriptor
768 Returns : array of sequence, meta strings
769 Args : Array of string data and descriptor data
771 Note: This is currently a quick and simple way of making simple
772 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
773 data in the output file. It does not currently work with pseudoknots,
774 triplets, G-quartets, or other more complex RNA structural motifs.
779 my ($self, $str, $descriptor) = @_;
781 my @desc_el = split ' ',$descriptor;
782 my @seq_el = split ' ',$str;
783 my $symbol = $self->structure_symbols();
784 if ($#desc_el != $#seq_el) {
785 $self->throw("Descriptor elements and seq elements do not match");
789 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
790 $struct = (index($motif,'h5') == 0) ?
$symbol->{'5-prime'} :
791 (index($motif,'h3') == 0) ?
$symbol->{'3-prime'} :
792 (index($motif,'ss') == 0) ?
$symbol->{'single-strand'} :
793 (index($motif,'ctx')== 0) ?
$symbol->{'single-strand'} :
794 $symbol->{'unknown'};
795 $meta .= $struct x
(length($seq));
798 return ($rna, $meta);