Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / SearchIO / erpin.pm
blob94e21afceb9266196dfd5cd0c70686364e0a0680
2 # BioPerl module for Bio::SearchIO::erpin
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::erpin - SearchIO-based ERPIN parser
18 =head1 SYNOPSIS
20 # do not call this module directly. Use Bio::SearchIO.
22 =head1 DESCRIPTION
24 This is an experimental SearchIO-based parser for output from
25 the erpin program. It currently parses erpin output for ERPIN
26 versions 4.2.5 and above; older versions may work but will not be supported.
28 =head1 FEEDBACK
30 =head2 Mailing Lists
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39 =head2 Support
41 Please direct usage questions or support issues to the mailing list:
43 I<bioperl-l@bioperl.org>
45 rather than to the module maintainer directly. Many experienced and
46 reponsive experts will be able look at the problem and quickly
47 address it. Please include a thorough description of the problem
48 with code and data examples if at all possible.
50 =head2 Reporting Bugs
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via the
54 web:
56 https://github.com/bioperl/bioperl-live/issues
58 =head1 AUTHOR - Chris Fields
60 Email cjfields-at-uiuc-dot-edu
62 =head1 APPENDIX
64 The rest of the documentation details each of the object methods.
65 Internal methods are usually preceded with a _
67 =cut
69 # Let the code begin...
71 package Bio::SearchIO::erpin;
72 use strict;
74 use Data::Dumper;
75 use base qw(Bio::SearchIO);
77 my %MODEMAP = (
78 'Result' => 'result',
79 'Hit' => 'hit',
80 'Hsp' => 'hsp'
83 my %MAPPING = (
84 'Hsp_bit-score' => 'HSP-bits',
85 'Hsp_score' => 'HSP-score',
86 'Hsp_evalue' => 'HSP-evalue', # no evalues yet
87 'Hsp_query-from' => 'HSP-query_start',
88 'Hsp_query-to' => 'HSP-query_end',
89 'Hsp_hit-from' => 'HSP-hit_start', #
90 'Hsp_hit-to' => 'HSP-hit_end', #
91 'Hsp_gaps' => 'HSP-hsp_gaps',
92 'Hsp_hitgaps' => 'HSP-hit_gaps',
93 'Hsp_querygaps' => 'HSP-query_gaps',
94 'Hsp_qseq' => 'HSP-query_seq',
95 'Hsp_hseq' => 'HSP-hit_seq',
96 'Hsp_midline' => 'HSP-homology_seq',
97 'Hsp_structure' => 'HSP-meta',
98 'Hsp_align-len' => 'HSP-hsp_length',
99 'Hsp_stranded' => 'HSP-stranded',
101 # not supported yet
102 'Hsp_positive' => 'HSP-conserved',
103 'Hsp_identity' => 'HSP-identical',
105 'Hit_id' => 'HIT-name',
106 'Hit_len' => 'HIT-length',
107 'Hit_gi' => 'HIT-ncbi_gi',
108 'Hit_accession' => 'HIT-accession',
109 'Hit_def' => 'HIT-description',
110 'Hit_signif' => 'HIT-significance', # none yet
111 'Hit_score' => 'HIT-score', # best HSP bit score
112 'Hit_bits' => 'HIT-bits', # best HSP bit score
114 'ERPIN_program' => 'RESULT-algorithm_name', # get/set
115 'ERPIN_version' => 'RESULT-algorithm_version', # get/set
116 'ERPIN_query-def'=> 'RESULT-query_name', # get/set
117 'ERPIN_query-len'=> 'RESULT-query_length',
118 'ERPIN_query-acc'=> 'RESULT-query_accession', # get/set
119 'ERPIN_querydesc'=> 'RESULT-query_description', # get/set
120 'ERPIN_db' => 'RESULT-database_name', # get/set
121 'ERPIN_db-len' => 'RESULT-database_entries', # none yet
122 'ERPIN_db-let' => 'RESULT-database_letters', # none yet
124 'Parameters_cutoff' => { 'RESULT-parameters' => 'cutoff' },
125 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
127 'Parameters_include' => { 'RESULT-parameters' => 'include' },
128 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
129 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
130 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
131 'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
132 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
133 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
134 'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
135 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
138 my $MINSCORE = 0;
139 my $DEFAULT_VERSION = '4.2.5';
140 my $DEFAULT_ALGORITHM = 'erpin';
142 =head2 new
144 Title : new
145 Usage : my $obj = Bio::SearchIO::infernal->new();
146 Function: Builds a new Bio::SearchIO::infernal object
147 Returns : Bio::SearchIO::infernal
148 Args : -fh/-file => cmsearch (infernal) filename
149 -format => 'erpin'
150 -algorithm => algorithm (default 'Infernal')
151 -query_acc => query accession, eg. Rfam accession (default undef)
152 -hsp_minscore => minimum HSP score cutoff
153 -version => ERPIN version (not reported in output)
155 =cut
157 sub _initialize {
158 my ( $self, @args ) = @_;
159 $self->SUPER::_initialize(@args);
160 my ($cutoff, $accession, $version) =
161 $self->_rearrange([qw(HSP_MINSCORE QUERY_ACC VERSION)],@args);
162 my $handler = $self->_eventHandler;
163 $handler->register_factory(
164 'result',
165 Bio::Factory::ObjectFactory->new(
166 -type => 'Bio::Search::Result::GenericResult',
167 -interface => 'Bio::Search::Result::ResultI',
168 -verbose => $self->verbose()
172 $handler->register_factory(
173 'hit',
174 Bio::Factory::ObjectFactory->new(
175 -type => 'Bio::Search::Hit::ModelHit',
176 -interface => 'Bio::Search::Hit::HitI',
177 -verbose => $self->verbose()
181 $handler->register_factory(
182 'hsp',
183 Bio::Factory::ObjectFactory->new(
184 -type => 'Bio::Search::HSP::ModelHSP',
185 -interface => 'Bio::Search::HSP::HSPI',
186 -verbose => $self->verbose()
189 $accession && $self->query_accession($accession);
190 $cutoff ||= $MINSCORE;
191 $self->hsp_minscore($cutoff);
192 $version ||= $DEFAULT_VERSION;
193 $self->algorithm_version($version);
196 =head2 next_result
198 Title : next_result
199 Usage : my $hit = $searchio->next_result;
200 Function: Returns the next Result from a search
201 Returns : Bio::Search::Result::ResultI object
202 Args : none
204 =cut
206 sub next_result {
207 my ($self) = @_;
208 my $seentop = 0;
209 local $/ = "\n";
210 local $_;
211 my $accession = $self->query_accession;
212 my $minscore = $self->hsp_minscore;
213 my $version = $self->algorithm_version;
214 my $verbose = $self->verbose; # cache for speed?
215 $self->start_document();
216 my ($lasthit, $lastscore, $lastlen, $lasteval);
217 #my $hitline;
218 PARSER:
219 while ( defined( my $line = $self->_readline ) ) {
220 next if $line =~ m{^\s*$};
221 if ($line =~ m{^Training\sset:\s+"(.*)"}xmso) {
222 if ($seentop) {
223 $self->_pushback($line);
224 last PARSER;
226 $self->start_element({'Name' => 'Result'});
227 $self->element_hash( {
228 'ERPIN_query-def' => $1,
229 'ERPIN_program' =>'erpin',
230 'ERPIN_version' => $version,
231 'ERPIN_query-acc' => $accession,
233 $seentop = 1;
234 # parse rest of header here
235 HEADER:
236 while (defined ($line = $self->_readline) ) {
237 next if $line =~ m{^\s*$};
238 if (index($line, '>') == 0 ||
239 index($line, '-------- at level 1 --------') == 0) {
240 $self->_pushback($line);
241 last HEADER;
243 if ($line =~ m{^\s+(\d+\ssequences\sof\slength\s\d+)}xmso) {
244 $self->element(
245 {'Name' => 'ERPIN_querydesc',
246 'Data' => $1}
248 } elsif ($line =~ m{^Cutoff:\s+(\S+)}xmso) {
249 $self->element(
250 {'Name' => 'Parameters_cutoff',
251 'Data' => $1}
253 } elsif ($line =~ m{^Database:\s+"(.*)"}xmso) {
254 $self->element(
255 {'Name' => 'ERPIN_db',
256 'Data' => $1}
258 } elsif ($line =~ m{^\s+(\d+)\snucleotides\sto\sbe\sprocessed\sin\s(\d+)\ssequences}xmso) {
259 $self->element_hash(
260 {'ERPIN_db-len' => $2,
261 'ERPIN_db-let' => $1}
263 } elsif ($line =~ m{^E-value\sat\scutoff\s\S+\sfor\s\S+\sdouble\sstrand\sdata:\s+(\S+)}xmso) {
264 $self->element(
265 {'Name' => 'Parameters_expect',
266 'Data' => $1}
268 } elsif ($line =~ m{^\s+(ATGC\sratios:\s+(?:\S+\s+\S+\s+\S+\s+\S+))}) {
269 $self->element(
270 {'Name' => 'Statistics_db-let',
271 'Data' => $1}
275 } elsif ($line =~ m{^>(\S+)\s+(.*)}xmso ) {
276 my ($id, $desc) = ($1, $2);
277 chomp $desc;
278 # desc line is repeated for each strand, so must check
279 # prior to starting a new hit
280 if (!$lasthit || $id ne $lasthit) {
281 if ($self->within_element('hit') ) {
282 $self->element_hash({
283 'Hit_signif' => $lasteval,
284 'Hit_score' => $lastscore,
285 'Hit_bits' => $lastscore
287 $self->end_element({'Name' => 'Hit'});
289 $self->start_element({'Name' => 'Hit'});
290 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($id);
292 $self->element_hash({
293 'Hit_id' => $id,
294 'Hit_gi' => $gi,
295 'Hit_accession' => $ver ? "$acc.$ver" :
296 $acc ? $acc : $id,
297 'Hit_def' => $desc
300 $lasthit = $id;
301 } elsif ( (index($line, 'FW') == 0) || (index($line, 'RC') == 0)) {
302 my ($str, $hn, $pos, $score, $eval) = split ' ', $line;
303 if ($minscore < $score) {
304 $self->start_element({'Name' => 'Hsp'});
306 my ($start, $end) = split m{\.\.}, $pos, 2;
307 ($start, $end) = ($end, $start) if ($str eq 'RC');
308 $line = $self->_readline;
309 chomp $line;
310 $self->element_hash({
311 'Hsp_stranded' => 'HIT',
312 'Hsp_hit-from' => $start,
313 'Hsp_hit-to' => $end,
314 'Hsp_score' => $score,
315 'Hsp_bit-score' => $score,
316 'Hsp_evalue' => $eval,
317 'Hsp_query-from' => 1,
318 'Hsp_query-to' => length($line),
319 'Hsp_align-len' => length($line),
320 'Hsp_hseq' =>$line
322 $self->end_element({'Name' => 'Hsp'});
323 $lastscore = $score if (!$lastscore || $lastscore < $score);
324 $lasteval = $eval if (!$lasteval || $lasteval > $eval);
326 } else {
327 #$self->debug("Dropped data: $line");
330 if ($seentop) {
331 if ($self->within_element('hit')) {
332 $self->element_hash({
333 'Hit_signif' => $lasteval,
334 'Hit_score' => $lastscore,
335 'Hit_bits' => $lastscore
337 $self->end_element({'Name' => 'Hit'});
339 $self->end_element({'Name' => 'Result'});
341 return $self->end_document();
344 =head2 start_element
346 Title : start_element
347 Usage : $eventgenerator->start_element
348 Function: Handles a start element event
349 Returns : none
350 Args : hashref with at least 2 keys 'Data' and 'Name'
352 =cut
354 sub start_element {
355 my ( $self, $data ) = @_;
357 # we currently don't care about attributes
358 my $nm = $data->{'Name'};
359 my $type = $MODEMAP{$nm};
360 if ($type) {
361 if ( $self->_eventHandler->will_handle($type) ) {
362 my $func = sprintf( "start_%s", lc $type );
363 $self->_eventHandler->$func( $data->{'Attributes'} );
365 unshift @{ $self->{'_elements'} }, $type;
367 if ( defined $type
368 && $type eq 'result' )
370 $self->{'_values'} = {};
371 $self->{'_result'} = undef;
375 =head2 end_element
377 Title : start_element
378 Usage : $eventgenerator->end_element
379 Function: Handles an end element event
380 Returns : none
381 Args : hashref with at least 2 keys, 'Data' and 'Name'
384 =cut
386 sub end_element {
387 my ( $self, $data ) = @_;
388 my $nm = $data->{'Name'};
389 my $type = $MODEMAP{$nm};
390 my $rc;
392 if ($type) {
393 if ( $self->_eventHandler->will_handle($type) ) {
394 my $func = sprintf( "end_%s", lc $type );
395 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
396 $self->{'_values'} );
398 my $lastelem = shift @{ $self->{'_elements'} };
400 elsif ( $MAPPING{$nm} ) {
401 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
402 my $key = ( keys %{ $MAPPING{$nm} } )[0];
403 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
404 $self->{'_last_data'};
406 else {
407 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
410 else {
411 $self->debug("unknown nm $nm, ignoring\n");
413 $self->{'_last_data'} = ''; # remove read data if we are at
414 # end of an element
415 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
416 return $rc;
419 =head2 element
421 Title : element
422 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
423 Function: Convenience method that calls start_element, characters, end_element
424 Returns : none
425 Args : Hash ref with the keys 'Name' and 'Data'
428 =cut
430 sub element {
431 my ( $self, $data ) = @_;
432 # simple data calls (%MAPPING) do not need start_element
433 $self->characters($data);
434 $self->end_element($data);
437 =head2 element_hash
439 Title : element
440 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
441 'Hsp_hit-to' => $end,
442 'Hsp_score' => $lastscore});
443 Function: Convenience method that takes multiple simple data elements and
444 maps to appropriate parameters
445 Returns : none
446 Args : Hash ref with the mapped key (in %MAPPING) and value
448 =cut
450 sub element_hash {
451 my ($self, $data) = @_;
452 $self->throw("Must provide data hash ref") if !$data || !ref($data);
453 for my $nm (sort keys %{$data}) {
454 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
455 if ( $MAPPING{$nm} ) {
456 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
457 my $key = ( keys %{ $MAPPING{$nm} } )[0];
458 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
459 $data->{$nm};
461 else {
462 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
468 =head2 characters
470 Title : characters
471 Usage : $eventgenerator->characters($str)
472 Function: Send a character events
473 Returns : none
474 Args : string
477 =cut
479 sub characters {
480 my ( $self, $data ) = @_;
481 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
482 $self->{'_last_data'} = $data->{'Data'};
485 =head2 within_element
487 Title : within_element
488 Usage : if( $eventgenerator->within_element($element) ) {}
489 Function: Test if we are within a particular element
490 This is different than 'in' because within can be tested
491 for a whole block.
492 Returns : boolean
493 Args : string element name
495 =cut
497 sub within_element {
498 my ( $self, $name ) = @_;
499 return 0
500 if ( !defined $name
501 || !defined $self->{'_elements'}
502 || scalar @{ $self->{'_elements'} } == 0 );
503 foreach ( @{ $self->{'_elements'} } ) {
504 return 1 if ( $_ eq $name );
506 return 0;
509 =head2 in_element
511 Title : in_element
512 Usage : if( $eventgenerator->in_element($element) ) {}
513 Function: Test if we are in a particular element
514 This is different than 'within' because 'in' only
515 tests its immediate parent.
516 Returns : boolean
517 Args : string element name
519 =cut
521 sub in_element {
522 my ( $self, $name ) = @_;
523 return 0 if !defined $self->{'_elements'}->[0];
524 return ( $self->{'_elements'}->[0] eq $name );
527 =head2 start_document
529 Title : start_document
530 Usage : $eventgenerator->start_document
531 Function: Handle a start document event
532 Returns : none
533 Args : none
535 =cut
537 sub start_document {
538 my ($self) = @_;
539 $self->{'_lasttype'} = '';
540 $self->{'_values'} = {};
541 $self->{'_result'} = undef;
542 $self->{'_elements'} = [];
545 =head2 end_document
547 Title : end_document
548 Usage : $eventgenerator->end_document
549 Function: Handles an end document event
550 Returns : Bio::Search::Result::ResultI object
551 Args : none
553 =cut
555 sub end_document {
556 my ($self) = @_;
557 return $self->{'_result'};
560 =head2 result_count
562 Title : result_count
563 Usage : my $count = $searchio->result_count
564 Function: Returns the number of results we have processed
565 Returns : integer
566 Args : none
568 =cut
570 sub result_count {
571 my $self = shift;
572 return $self->{'_result_count'};
575 =head2 query_accession
577 Title : query_accession
578 Usage : my $acc = $parser->query_accession();
579 Function: Get/Set query (model) accession; Infernal currently does not output
580 the accession number (Rfam accession #)
581 Returns : String (accession)
582 Args : [optional] String (accession)
584 =cut
586 sub query_accession {
587 my $self = shift;
588 return $self->{'_query_accession'} = shift if @_;
589 return $self->{'_query_accession'};
592 =head2 hsp_minscore
594 Title : hsp_minscore
595 Usage : my $cutoff = $parser->hsp_minscore();
596 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
597 Returns : score (number)
598 Args : [optional] score (number)
600 =cut
602 sub hsp_minscore {
603 my $self = shift;
604 return $self->{'_hsp_minscore'} = shift if @_;
605 return $self->{'_hsp_minscore'};
608 =head2 algorithm_version
610 Title : algorithm_version
611 Usage : my $ver = $parser->algorithm_version();
612 Function: Get/Set algorithm version (not defined in RNAMotif output)
613 Returns : String (accession)
614 Args : [optional] String (accession)
616 =cut
618 sub algorithm_version {
619 my $self = shift;
620 return $self->{'_algorithm'} = shift if @_;
621 return $self->{'_algorithm'};