Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / Signalp / ExtendedSignalp.pm
blobac90ee29fa714caa3ae082c318707e93f8f3fe04
2 # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
8 # Copyright Emmanuel Quevillon
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
18 =head1 SYNOPSIS
20 use Bio::Tools::Signalp::ExtendedSignalp;
21 my $params = [qw(maxC maxY maxS meanS D)];
22 my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
23 -fh => $filehandle
24 -factors => $params
27 $parser->factors($params);
28 while( my $sp_feat = $parser->next_feature ) {
29 #do something
30 #eg
31 push @sp_feat, $sp_feat;
34 =head1 DESCRIPTION
36 # Please direct questions and support issues to I<bioperl-l@bioperl.org>
38 Parser module for Signalp.
40 Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
41 originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
42 Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
43 Informatics team (fuguteam@fugu-sg.org)
45 You may distribute this module under the same terms as perl itself
47 Compared to the original SignalP, this method allow the user to filter results
48 out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
49 method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
51 The user must be aware that the filters can only by applied on NN method.
52 Also, to ensure the compatibility with original Signalp parsing module, the user
53 must know that by default, if filters are empty, max Y and mean S filters are
54 automatically used to filter results.
56 If the used gives a list, then the parser will only report protein having 'YES'
57 for each factor.
59 This module supports parsing for full, summary and short output form signalp.
60 Actually, full and summary are equivalent in terms of filtering results.
62 =head1 FEEDBACK
64 =head2 Mailing Lists
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to
68 the Bioperl mailing list. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 =head2 Support
75 Please direct usage questions or support issues to the mailing list:
77 I<bioperl-l@bioperl.org>
79 rather than to the module maintainer directly. Many experienced and
80 reponsive experts will be able look at the problem and quickly
81 address it. Please include a thorough description of the problem
82 with code and data examples if at all possible.
84 =head2 Reporting Bugs
86 Report bugs to the Bioperl bug tracking system to help us keep track
87 of the bugs and their resolution. Bug reports can be submitted via
88 the web:
90 https://github.com/bioperl/bioperl-live/issues
92 =head1 AUTHOR
94 Based on the Bio::Tools::Signalp module
95 Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
97 =head1 APPENDIX
99 The rest of the documentation details each of the object methods.
100 Internal methods are usually preceded with a _
102 =cut
104 package Bio::Tools::Signalp::ExtendedSignalp;
106 use strict;
107 use Data::Dumper;
108 use Bio::SeqFeature::Generic;
109 # don't need Bio::Root::Root/IO (already in inheritance tree)
110 use base qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
112 #Supported arguments
113 my $FACTS = {
114 'maxC' => 1,
115 'maxS' => 1,
116 'maxY' => 1,
117 'meanS' => 1,
118 'D' => 1,
121 =head2 new
123 Title : new
124 Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
125 Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
126 Returns : Bio::Tools::Signalp::ExtendedSignalp
127 Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
130 =cut
132 sub new {
133 my($class,@args) = @_;
135 my $self = $class->SUPER::new(@args);
136 $self->_initialize_io(@args);
138 my $factors = $self->_rearrange([qw(FACTORS)], @args);
139 #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
140 if($factors && scalar(@$factors)){
141 $factors = $factors;
143 else{
144 $factors = [qw(maxY meanS)];
146 $factors && $self->factors($factors);
148 return $self;
151 =head2 next_feature
153 Title : next_feature
154 Usage : my $feat = $signalp->next_feature
155 Function: Get the next result feature from parser data
156 Returns : Bio::SeqFeature::Generic
157 Args : none
160 =cut
162 sub next_feature {
164 my ($self) = @_;
166 if(!$self->_parsed()){
167 $self->_parse();
170 return shift @{$self->{_features}} || undef;
174 =head2 _filterok
176 Title : _filterok
177 Usage : my $feat = $signalp->_filterok
178 Function: Check if the factors required by the user are all ok.
179 Returns : 1/0
180 Args : hash reference
183 =cut
185 sub _filterok {
187 my($self, $hash) = @_;
189 #We hope everything will be fine ;)
190 my $bool = 1;
192 #If the user did not give any filter, we keep eveything
193 return $bool unless keys %{$self->{_factors}};
195 #If only one of the factors parsed is equal to NO based on the user factors cutoff
196 #Then the filter is not ok.
197 foreach my $fact (keys %{$self->factors()}){
198 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
199 $bool = 0;
203 return $bool;
207 =head2 factors
209 Title : factors
210 Usage : my $feat = $signalp->factors
211 Function: Get/Set the filters required from the user
212 Returns : hash
213 Args : array reference
216 =cut
218 sub factors {
220 my($self, $array) = @_;
222 if($array){
223 $self->{_factors} = { };
224 foreach my $f (@$array){
225 if(exists($FACTS->{$f})){
226 $self->{_factors}->{$f} = 1;
228 else{
229 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
234 return $self->{_factors};
238 =head2 _parsed
240 Title : _parsed
241 Usage : obj->_parsed()
242 Function: Get/Set if the result is parsed or not
243 Returns : 1/0 scalar
244 Args : On set 1
247 =cut
249 sub _parsed {
251 my($self, $parsed) = @_;
253 if(defined($parsed)){
254 $self->{_parsed} = $parsed;
257 return $self->{_parsed};
261 =head2 _parse
263 Title : _parse
264 Usage : obj->_parse
265 Function: Parse the SignalP result
266 Returns :
267 Args :
270 =cut
272 sub _parse {
274 my($self) = @_;
276 #Let's read the file...
277 while (my $line = $self->_readline()) {
279 chomp $line;
280 #We want to be sure to catch the first non empty line to be ablte to determine
281 #which format we are working with...
282 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
284 if($line =~ /^>(\S+)/){
285 $self->_pushback($line);
286 $self->_parse_summary_format();
287 last;
289 elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
290 $self->_pushback($line);
291 $self->_parse_short_format();
292 last;
294 else{
295 $self->throw("Unable to determine the format type.");
299 return;
302 =head2 _parse_summary_format
304 Title : _parse_summary_format
305 Usage : $self->_parse_summary_format
306 Function: Method to parse summary/full format from signalp output
307 It automatically fills filtered features.
308 Returns :
309 Args :
311 =cut
313 sub _parse_summary_format {
315 my($self) = @_;
317 my $feature = undef;
318 my $ok = 0;
320 while(my $line = $self->_readline()){
322 if($line =~ /^SignalP-NN result:/){
323 $self->_pushback($line);
324 $feature = $self->_parse_nn_result($feature);
326 if($line =~ /^SignalP-HMM result:/){
327 $self->_pushback($line);
328 $feature = $self->_parse_hmm_result($feature);
331 if($line =~ /^---------/ && $feature){
332 my $new_feature = $self->create_feature($feature);
333 push @{$self->{_features}}, $new_feature if $new_feature;
334 $feature = undef;
338 return;
342 =head2 _parse_nn_result
344 Title : _parse_nn_result
345 Usage : obj->_parse_nn_result
346 Function: Parses the Neuronal Network (NN) part of the result
347 Returns : Hash reference
348 Args :
351 =cut
353 sub _parse_nn_result {
355 my($self, $feature) = @_;
357 my $ok = 0;
358 my %facts;
360 #SignalP-NN result:
361 #>MGG_11635.5 length = 100
362 ## Measure Position Value Cutoff signal peptide?
363 # max. C 37 0.087 0.32 NO
364 # max. Y 37 0.042 0.33 NO
365 # max. S 3 0.062 0.87 NO
366 # mean S 1-36 0.024 0.48 NO
367 # D 1-36 0.033 0.43 NO
369 while(my $line = $self->_readline()){
371 chomp $line;
373 if($line =~ /^SignalP-NN result:/){
374 $ok = 1;
375 next;
378 $self->throw("Wrong line for parsing NN results.") unless $ok;
380 if ($line=~/^\>(\S+)\s+length/) {
381 $self->seqname($1);
382 %facts = ();
383 next;
385 elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
386 $feature->{maxCprob} = $1;
387 $facts{maxC} = $2;
388 next;
390 elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
391 $feature->{maxYprob} = $1;
392 $facts{maxY} = $2;
393 next;
395 elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
396 $feature->{maxSprob} = $1;
397 $facts{maxS} = $2;
398 next;
400 elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
401 $feature->{meanSprob} = $1;
402 $facts{meanS} = $2;
403 next;
405 elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
406 $feature->{Dprob} = $1;
407 $facts{D} = $2;
408 next;
410 #If we don't have this line it means that all the factors cutoff are equal to 'NO'
411 elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
412 #if($self->_filterok(\%facts)){
413 #$feature->{name} = $self->seqname();
414 #$feature->{start} = 1;
415 $feature->{end} = $1 + 1; #To be consistent with end given in short format
417 #return $feature;
419 elsif($line =~ /^\s*$/){
420 last;
424 if($self->_filterok(\%facts)){
425 $feature->{name} = $self->seqname();
426 $feature->{start} = 1;
427 $feature->{nnPrediction} = 'signal-peptide';
430 return $feature;
434 =head2 _parse_hmm_result
436 Title : _parse_hmm_result
437 Usage : obj->_parse_hmm_result
438 Function: Parses the Hiden Markov Model (HMM) part of the result
439 Returns : Hash reference
440 Args :
442 =cut
444 sub _parse_hmm_result {
446 my ($self, $feature_hash) = @_;
448 my $ok = 0;
450 #SignalP-HMM result:
451 #>MGG_11635.5
452 #Prediction: Non-secretory protein
453 #Signal peptide probability: 0.000
454 #Signal anchor probability: 0.000
455 #Max cleavage site probability: 0.000 between pos. -1 and 0
457 while(my $line = $self->_readline()){
459 chomp $line;
460 next if $line =~ /^\s*$/o;
462 if($line =~ /^SignalP-HMM result:/){
463 $ok = 1;
464 next;
467 $self->throw("Wrong line for parsing HMM result.") unless $ok;
469 if($line =~ /^>(\S+)/){
470 #In case we already seen a name with NN results
471 $feature_hash->{name} = $1 unless $self->seqname();
473 elsif($line =~ /Prediction: (.+)$/){
474 $feature_hash->{hmmPrediction} = $1;
476 elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
477 $feature_hash->{peptideProb} = $1;
479 elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
480 $feature_hash->{anchorProb} = $1;
482 elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
483 $feature_hash->{cleavageSiteProb} = $1;
484 #Strange case, if we don't have an end value in NN result (no nn method launched)
485 #We try anyway to get an end value, unless this value is lower than 1 which is
486 #the start
487 $feature_hash->{end} = $2 if($2 > 1 && !$feature_hash->{end});
488 $feature_hash->{start} = 1 unless $feature_hash->{start};
489 last;
493 return $feature_hash;
496 =head2 _parse_short_format
498 Title : _parse_short_format
499 Usage : $self->_parse_short_format
500 Function: Method to parse short format from signalp output
501 It automatically fills filtered features.
502 Returns :
503 Args :
505 =cut
507 sub _parse_short_format {
509 my($self) = @_;
511 my $ok = 0;
512 my $method = undef;
513 $self->{_oformat} = 'short';
515 #Output example
516 # SignalP-NN euk predictions # SignalP-HMM euk predictions
517 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
518 #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
519 #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
520 #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
521 #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
523 while(my $line = $self->_readline()){
525 chomp $line;
526 next if $line =~ /^\s*$|^# name/;
528 if($line =~ /^#/){
529 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
530 'both' : $line =~ /SignalP-NN/ ?
531 'nn' : 'hmm';
532 next;
535 #$self->throw("It looks like the format is not 'short' format.") unless($ok);
537 my @data = split(/\s+/, $line);
538 $self->seqname($data[0]);
540 my $factors = { };
541 my $feature = { };
543 #NN results gives more fields than HMM
544 if($method eq 'both' || $method eq 'nn'){
546 $feature->{maxCprob} = $data[1];
547 $factors->{maxC} = $data[3];
548 $feature->{maxYprob} = $data[4];
549 $factors->{maxY} = $data[6];
550 $feature->{maxSprob} = $data[7];
551 $factors->{maxS} = $data[9];
552 $feature->{meanSprob}= $data[10];
553 $factors->{meanS} = $data[11];
554 $feature->{Dprob} = $data[12];
555 $factors->{D} = $data[13];
556 #It looks like the max Y position is reported as the most likely cleavage position
557 $feature->{end} = $data[5];
558 $feature->{nnPrediction} = 'signal-peptide';
560 if($method eq 'both'){
561 $feature->{hmmPrediction} = $data[15] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
562 $feature->{cleavageSiteProb} = $data[16];
563 $feature->{peptideProb} = $data[19];
566 elsif($method eq 'hmm'){
567 #In short output anchor probability is not given
568 $feature->{hmmPrediction} = $data[1] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
569 $feature->{cleavageSiteProb} = $data[2];
570 $feature->{peptideProb} = $data[5];
571 #It looks like the max cleavage probability position is given by the Cmax proability
572 $feature->{end} = $data[3];
575 #Unfortunately, we cannot parse the filters for hmm method.
576 if($self->_filterok($factors)){
577 $feature->{name} = $self->seqname();
578 $feature->{start} = 1;
579 $feature->{source} = 'Signalp';
580 $feature->{primary} = 'signal_peptide';
581 $feature->{program} = 'Signalp';
582 $feature->{logic_name} = 'signal_peptide';
584 my $new_feat = $self->create_feature($feature);
585 push @{$self->{_features}}, $new_feat if $new_feat;
589 return;
592 =head2 create_feature
594 Title : create_feature
595 Usage : obj->create_feature(\%feature)
596 Function: Internal(not to be used directly)
597 Returns :
598 Args :
601 =cut
603 sub create_feature {
605 my ($self, $feat) = @_;
607 #If we don't have neither start nor end, we return.
608 unless($feat->{name} && $feat->{start} && $feat->{end}){
609 return;
612 # create feature object
613 my $feature = Bio::SeqFeature::Generic->new(
614 -seq_id => $feat->{name},
615 -start => $feat->{start},
616 -end => $feat->{end},
617 -score => defined($feat->{peptideProb}) ? $feat->{peptideProb} : '',
618 -source => 'Signalp',
619 -primary => 'signal_peptide',
620 -logic_name => 'signal_peptide',
623 $feature->add_tag_value('peptideProb', $feat->{peptideProb});
624 $feature->add_tag_value('anchorProb', $feat->{anchorProb});
625 $feature->add_tag_value('evalue',$feat->{anchorProb});
626 $feature->add_tag_value('percent_id','NULL');
627 $feature->add_tag_value("hid",$feat->{primary});
628 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction});
629 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb}) if($feat->{cleavageSiteProb});
630 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction}) if($feat->{nnPrediction});
631 $feature->add_tag_value('maxCprob', $feat->{maxCprob}) if(defined($feat->{maxCprob}));
632 $feature->add_tag_value('maxSprob', $feat->{maxSprob}) if(defined($feat->{maxSprob}));
633 $feature->add_tag_value('maxYprob', $feat->{maxYprob}) if(defined($feat->{maxYprob}));
634 $feature->add_tag_value('meanSprob', $feat->{meanSprob}) if(defined($feat->{meanSprob}));
635 $feature->add_tag_value('Dprob', $feat->{Dprob}) if(defined($feat->{Dprob}));
637 return $feature;
641 =head2 seqname
643 Title : seqname
644 Usage : obj->seqname($name)
645 Function: Internal(not to be used directly)
646 Returns :
647 Args :
650 =cut
652 sub seqname{
653 my ($self,$seqname)=@_;
655 if (defined($seqname)){
656 $self->{'seqname'} = $seqname;
659 return $self->{'seqname'};