2 # BioPerl module for Bio::SearchIO::erpin
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO::erpin - SearchIO-based ERPIN parser
20 # do not call this module directly. Use Bio::SearchIO.
24 This is an experimental SearchIO-based parser for output from
25 the erpin program. It currently parses erpin output for ERPIN
26 versions 4.2.5 and above; older versions may work but will not be supported.
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 Please direct usage questions or support issues to the mailing list:
43 I<bioperl-l@bioperl.org>
45 rather than to the module maintainer directly. Many experienced and
46 reponsive experts will be able look at the problem and quickly
47 address it. Please include a thorough description of the problem
48 with code and data examples if at all possible.
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via the
56 https://github.com/bioperl/bioperl-live/issues
58 =head1 AUTHOR - Chris Fields
60 Email cjfields-at-uiuc-dot-edu
64 The rest of the documentation details each of the object methods.
65 Internal methods are usually preceded with a _
69 # Let the code begin...
71 package Bio
::SearchIO
::erpin
;
75 use base
qw(Bio::SearchIO);
84 'Hsp_bit-score' => 'HSP-bits',
85 'Hsp_score' => 'HSP-score',
86 'Hsp_evalue' => 'HSP-evalue', # no evalues yet
87 'Hsp_query-from' => 'HSP-query_start',
88 'Hsp_query-to' => 'HSP-query_end',
89 'Hsp_hit-from' => 'HSP-hit_start', #
90 'Hsp_hit-to' => 'HSP-hit_end', #
91 'Hsp_gaps' => 'HSP-hsp_gaps',
92 'Hsp_hitgaps' => 'HSP-hit_gaps',
93 'Hsp_querygaps' => 'HSP-query_gaps',
94 'Hsp_qseq' => 'HSP-query_seq',
95 'Hsp_hseq' => 'HSP-hit_seq',
96 'Hsp_midline' => 'HSP-homology_seq',
97 'Hsp_structure' => 'HSP-meta',
98 'Hsp_align-len' => 'HSP-hsp_length',
99 'Hsp_stranded' => 'HSP-stranded',
102 'Hsp_positive' => 'HSP-conserved',
103 'Hsp_identity' => 'HSP-identical',
105 'Hit_id' => 'HIT-name',
106 'Hit_len' => 'HIT-length',
107 'Hit_gi' => 'HIT-ncbi_gi',
108 'Hit_accession' => 'HIT-accession',
109 'Hit_def' => 'HIT-description',
110 'Hit_signif' => 'HIT-significance', # none yet
111 'Hit_score' => 'HIT-score', # best HSP bit score
112 'Hit_bits' => 'HIT-bits', # best HSP bit score
114 'ERPIN_program' => 'RESULT-algorithm_name', # get/set
115 'ERPIN_version' => 'RESULT-algorithm_version', # get/set
116 'ERPIN_query-def'=> 'RESULT-query_name', # get/set
117 'ERPIN_query-len'=> 'RESULT-query_length',
118 'ERPIN_query-acc'=> 'RESULT-query_accession', # get/set
119 'ERPIN_querydesc'=> 'RESULT-query_description', # get/set
120 'ERPIN_db' => 'RESULT-database_name', # get/set
121 'ERPIN_db-len' => 'RESULT-database_entries', # none yet
122 'ERPIN_db-let' => 'RESULT-database_letters', # none yet
124 'Parameters_cutoff' => { 'RESULT-parameters' => 'cutoff' },
125 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
127 'Parameters_include' => { 'RESULT-parameters' => 'include' },
128 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
129 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
130 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
131 'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
132 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
133 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
134 'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
135 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
139 my $DEFAULT_VERSION = '4.2.5';
140 my $DEFAULT_ALGORITHM = 'erpin';
145 Usage : my $obj = Bio::SearchIO::infernal->new();
146 Function: Builds a new Bio::SearchIO::infernal object
147 Returns : Bio::SearchIO::infernal
148 Args : -fh/-file => cmsearch (infernal) filename
150 -algorithm => algorithm (default 'Infernal')
151 -query_acc => query accession, eg. Rfam accession (default undef)
152 -hsp_minscore => minimum HSP score cutoff
153 -version => ERPIN version (not reported in output)
158 my ( $self, @args ) = @_;
159 $self->SUPER::_initialize
(@args);
160 my ($cutoff, $accession, $version) =
161 $self->_rearrange([qw(HSP_MINSCORE QUERY_ACC VERSION)],@args);
162 my $handler = $self->_eventHandler;
163 $handler->register_factory(
165 Bio
::Factory
::ObjectFactory
->new(
166 -type
=> 'Bio::Search::Result::GenericResult',
167 -interface
=> 'Bio::Search::Result::ResultI',
168 -verbose
=> $self->verbose()
172 $handler->register_factory(
174 Bio
::Factory
::ObjectFactory
->new(
175 -type
=> 'Bio::Search::Hit::ModelHit',
176 -interface
=> 'Bio::Search::Hit::HitI',
177 -verbose
=> $self->verbose()
181 $handler->register_factory(
183 Bio
::Factory
::ObjectFactory
->new(
184 -type
=> 'Bio::Search::HSP::ModelHSP',
185 -interface
=> 'Bio::Search::HSP::HSPI',
186 -verbose
=> $self->verbose()
189 $accession && $self->query_accession($accession);
190 $cutoff ||= $MINSCORE;
191 $self->hsp_minscore($cutoff);
192 $version ||= $DEFAULT_VERSION;
193 $self->algorithm_version($version);
199 Usage : my $hit = $searchio->next_result;
200 Function: Returns the next Result from a search
201 Returns : Bio::Search::Result::ResultI object
211 my $accession = $self->query_accession;
212 my $minscore = $self->hsp_minscore;
213 my $version = $self->algorithm_version;
214 my $verbose = $self->verbose; # cache for speed?
215 $self->start_document();
216 my ($lasthit, $lastscore, $lastlen, $lasteval);
219 while ( defined( my $line = $self->_readline ) ) {
220 next if $line =~ m{^\s*$};
221 if ($line =~ m{^Training\sset:\s+"(.*)"}xmso) {
223 $self->_pushback($line);
226 $self->start_element({'Name' => 'Result'});
227 $self->element_hash( {
228 'ERPIN_query-def' => $1,
229 'ERPIN_program' =>'erpin',
230 'ERPIN_version' => $version,
231 'ERPIN_query-acc' => $accession,
234 # parse rest of header here
236 while (defined ($line = $self->_readline) ) {
237 next if $line =~ m{^\s*$};
238 if (index($line, '>') == 0 ||
239 index($line, '-------- at level 1 --------') == 0) {
240 $self->_pushback($line);
243 if ($line =~ m{^\s+(\d+\ssequences\sof\slength\s\d+)}xmso) {
245 {'Name' => 'ERPIN_querydesc',
248 } elsif ($line =~ m{^Cutoff:\s+(\S+)}xmso) {
250 {'Name' => 'Parameters_cutoff',
253 } elsif ($line =~ m{^Database:\s+"(.*)"}xmso) {
255 {'Name' => 'ERPIN_db',
258 } elsif ($line =~ m{^\s+(\d+)\snucleotides\sto\sbe\sprocessed\sin\s(\d+)\ssequences}xmso) {
260 {'ERPIN_db-len' => $2,
261 'ERPIN_db-let' => $1}
263 } elsif ($line =~ m{^E-value\sat\scutoff\s\S+\sfor\s\S+\sdouble\sstrand\sdata:\s+(\S+)}xmso) {
265 {'Name' => 'Parameters_expect',
268 } elsif ($line =~ m{^\s+(ATGC\sratios:\s+(?:\S+\s+\S+\s+\S+\s+\S+))}) {
270 {'Name' => 'Statistics_db-let',
275 } elsif ($line =~ m{^>(\S+)\s+(.*)}xmso ) {
276 my ($id, $desc) = ($1, $2);
278 # desc line is repeated for each strand, so must check
279 # prior to starting a new hit
280 if (!$lasthit || $id ne $lasthit) {
281 if ($self->within_element('hit') ) {
282 $self->element_hash({
283 'Hit_signif' => $lasteval,
284 'Hit_score' => $lastscore,
285 'Hit_bits' => $lastscore
287 $self->end_element({'Name' => 'Hit'});
289 $self->start_element({'Name' => 'Hit'});
290 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($id);
292 $self->element_hash({
295 'Hit_accession' => $ver ?
"$acc.$ver" :
301 } elsif ( (index($line, 'FW') == 0) || (index($line, 'RC') == 0)) {
302 my ($str, $hn, $pos, $score, $eval) = split ' ', $line;
303 if ($minscore < $score) {
304 $self->start_element({'Name' => 'Hsp'});
306 my ($start, $end) = split m{\.\.}, $pos, 2;
307 ($start, $end) = ($end, $start) if ($str eq 'RC');
308 $line = $self->_readline;
310 $self->element_hash({
311 'Hsp_stranded' => 'HIT',
312 'Hsp_hit-from' => $start,
313 'Hsp_hit-to' => $end,
314 'Hsp_score' => $score,
315 'Hsp_bit-score' => $score,
316 'Hsp_evalue' => $eval,
317 'Hsp_query-from' => 1,
318 'Hsp_query-to' => length($line),
319 'Hsp_align-len' => length($line),
322 $self->end_element({'Name' => 'Hsp'});
323 $lastscore = $score if (!$lastscore || $lastscore < $score);
324 $lasteval = $eval if (!$lasteval || $lasteval > $eval);
327 #$self->debug("Dropped data: $line");
331 if ($self->within_element('hit')) {
332 $self->element_hash({
333 'Hit_signif' => $lasteval,
334 'Hit_score' => $lastscore,
335 'Hit_bits' => $lastscore
337 $self->end_element({'Name' => 'Hit'});
339 $self->end_element({'Name' => 'Result'});
341 return $self->end_document();
346 Title : start_element
347 Usage : $eventgenerator->start_element
348 Function: Handles a start element event
350 Args : hashref with at least 2 keys 'Data' and 'Name'
355 my ( $self, $data ) = @_;
357 # we currently don't care about attributes
358 my $nm = $data->{'Name'};
359 my $type = $MODEMAP{$nm};
361 if ( $self->_eventHandler->will_handle($type) ) {
362 my $func = sprintf( "start_%s", lc $type );
363 $self->_eventHandler->$func( $data->{'Attributes'} );
365 unshift @
{ $self->{'_elements'} }, $type;
368 && $type eq 'result' )
370 $self->{'_values'} = {};
371 $self->{'_result'} = undef;
377 Title : start_element
378 Usage : $eventgenerator->end_element
379 Function: Handles an end element event
381 Args : hashref with at least 2 keys, 'Data' and 'Name'
387 my ( $self, $data ) = @_;
388 my $nm = $data->{'Name'};
389 my $type = $MODEMAP{$nm};
393 if ( $self->_eventHandler->will_handle($type) ) {
394 my $func = sprintf( "end_%s", lc $type );
395 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
396 $self->{'_values'} );
398 my $lastelem = shift @
{ $self->{'_elements'} };
400 elsif ( $MAPPING{$nm} ) {
401 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
402 my $key = ( keys %{ $MAPPING{$nm} } )[0];
403 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
404 $self->{'_last_data'};
407 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
411 $self->debug("unknown nm $nm, ignoring\n");
413 $self->{'_last_data'} = ''; # remove read data if we are at
415 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
422 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
423 Function: Convenience method that calls start_element, characters, end_element
425 Args : Hash ref with the keys 'Name' and 'Data'
431 my ( $self, $data ) = @_;
432 # simple data calls (%MAPPING) do not need start_element
433 $self->characters($data);
434 $self->end_element($data);
440 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
441 'Hsp_hit-to' => $end,
442 'Hsp_score' => $lastscore});
443 Function: Convenience method that takes multiple simple data elements and
444 maps to appropriate parameters
446 Args : Hash ref with the mapped key (in %MAPPING) and value
451 my ($self, $data) = @_;
452 $self->throw("Must provide data hash ref") if !$data || !ref($data);
453 for my $nm (sort keys %{$data}) {
454 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
455 if ( $MAPPING{$nm} ) {
456 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
457 my $key = ( keys %{ $MAPPING{$nm} } )[0];
458 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
462 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
471 Usage : $eventgenerator->characters($str)
472 Function: Send a character events
480 my ( $self, $data ) = @_;
481 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
482 $self->{'_last_data'} = $data->{'Data'};
485 =head2 within_element
487 Title : within_element
488 Usage : if( $eventgenerator->within_element($element) ) {}
489 Function: Test if we are within a particular element
490 This is different than 'in' because within can be tested
493 Args : string element name
498 my ( $self, $name ) = @_;
501 || !defined $self->{'_elements'}
502 || scalar @
{ $self->{'_elements'} } == 0 );
503 foreach ( @
{ $self->{'_elements'} } ) {
504 return 1 if ( $_ eq $name );
512 Usage : if( $eventgenerator->in_element($element) ) {}
513 Function: Test if we are in a particular element
514 This is different than 'within' because 'in' only
515 tests its immediate parent.
517 Args : string element name
522 my ( $self, $name ) = @_;
523 return 0 if !defined $self->{'_elements'}->[0];
524 return ( $self->{'_elements'}->[0] eq $name );
527 =head2 start_document
529 Title : start_document
530 Usage : $eventgenerator->start_document
531 Function: Handle a start document event
539 $self->{'_lasttype'} = '';
540 $self->{'_values'} = {};
541 $self->{'_result'} = undef;
542 $self->{'_elements'} = [];
548 Usage : $eventgenerator->end_document
549 Function: Handles an end document event
550 Returns : Bio::Search::Result::ResultI object
557 return $self->{'_result'};
563 Usage : my $count = $searchio->result_count
564 Function: Returns the number of results we have processed
572 return $self->{'_result_count'};
575 =head2 query_accession
577 Title : query_accession
578 Usage : my $acc = $parser->query_accession();
579 Function: Get/Set query (model) accession; Infernal currently does not output
580 the accession number (Rfam accession #)
581 Returns : String (accession)
582 Args : [optional] String (accession)
586 sub query_accession
{
588 return $self->{'_query_accession'} = shift if @_;
589 return $self->{'_query_accession'};
595 Usage : my $cutoff = $parser->hsp_minscore();
596 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
597 Returns : score (number)
598 Args : [optional] score (number)
604 return $self->{'_hsp_minscore'} = shift if @_;
605 return $self->{'_hsp_minscore'};
608 =head2 algorithm_version
610 Title : algorithm_version
611 Usage : my $ver = $parser->algorithm_version();
612 Function: Get/Set algorithm version (not defined in RNAMotif output)
613 Returns : String (accession)
614 Args : [optional] String (accession)
618 sub algorithm_version
{
620 return $self->{'_algorithm'} = shift if @_;
621 return $self->{'_algorithm'};