2 # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
8 # Copyright Emmanuel Quevillon
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
20 use Bio::Tools::Signalp::ExtendedSignalp;
21 my $params = [qw(maxC maxY maxS meanS D)];
22 my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
27 $parser->factors($params);
28 while( my $sp_feat = $parser->next_feature ) {
31 push @sp_feat, $sp_feat;
36 # Please direct questions and support issues to I<bioperl-l@bioperl.org>
38 Parser module for Signalp.
40 Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
41 originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
42 Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
43 Informatics team (fuguteam@fugu-sg.org)
45 You may distribute this module under the same terms as perl itself
47 Compared to the original SignalP, this method allow the user to filter results
48 out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
49 method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
51 The user must be aware that the filters can only by applied on NN method.
52 Also, to ensure the compatibility with original Signalp parsing module, the user
53 must know that by default, if filters are empty, max Y and mean S filters are
54 automatically used to filter results.
56 If the used gives a list, then the parser will only report protein having 'YES'
59 This module supports parsing for full, summary and short output form signalp.
60 Actually, full and summary are equivalent in terms of filtering results.
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to
68 the Bioperl mailing list. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 Please direct usage questions or support issues to the mailing list:
77 I<bioperl-l@bioperl.org>
79 rather than to the module maintainer directly. Many experienced and
80 reponsive experts will be able look at the problem and quickly
81 address it. Please include a thorough description of the problem
82 with code and data examples if at all possible.
86 Report bugs to the Bioperl bug tracking system to help us keep track
87 of the bugs and their resolution. Bug reports can be submitted via
90 https://github.com/bioperl/bioperl-live/issues
94 Based on the Bio::Tools::Signalp module
95 Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
99 The rest of the documentation details each of the object methods.
100 Internal methods are usually preceded with a _
104 package Bio
::Tools
::Signalp
::ExtendedSignalp
;
108 use Bio
::SeqFeature
::Generic
;
109 # don't need Bio::Root::Root/IO (already in inheritance tree)
110 use base
qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
124 Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
125 Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
126 Returns : Bio::Tools::Signalp::ExtendedSignalp
127 Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
133 my($class,@args) = @_;
135 my $self = $class->SUPER::new
(@args);
136 $self->_initialize_io(@args);
138 my $factors = $self->_rearrange([qw(FACTORS)], @args);
139 #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
140 if($factors && scalar(@
$factors)){
144 $factors = [qw(maxY meanS)];
146 $factors && $self->factors($factors);
154 Usage : my $feat = $signalp->next_feature
155 Function: Get the next result feature from parser data
156 Returns : Bio::SeqFeature::Generic
166 if(!$self->_parsed()){
170 return shift @
{$self->{_features
}} || undef;
177 Usage : my $feat = $signalp->_filterok
178 Function: Check if the factors required by the user are all ok.
180 Args : hash reference
187 my($self, $hash) = @_;
189 #We hope everything will be fine ;)
192 #If the user did not give any filter, we keep eveything
193 return $bool unless keys %{$self->{_factors
}};
195 #If only one of the factors parsed is equal to NO based on the user factors cutoff
196 #Then the filter is not ok.
197 foreach my $fact (keys %{$self->factors()}){
198 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
210 Usage : my $feat = $signalp->factors
211 Function: Get/Set the filters required from the user
213 Args : array reference
220 my($self, $array) = @_;
223 $self->{_factors
} = { };
224 foreach my $f (@
$array){
225 if(exists($FACTS->{$f})){
226 $self->{_factors
}->{$f} = 1;
229 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
234 return $self->{_factors
};
241 Usage : obj->_parsed()
242 Function: Get/Set if the result is parsed or not
251 my($self, $parsed) = @_;
253 if(defined($parsed)){
254 $self->{_parsed
} = $parsed;
257 return $self->{_parsed
};
265 Function: Parse the SignalP result
276 #Let's read the file...
277 while (my $line = $self->_readline()) {
280 #We want to be sure to catch the first non empty line to be ablte to determine
281 #which format we are working with...
282 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
284 if($line =~ /^>(\S+)/){
285 $self->_pushback($line);
286 $self->_parse_summary_format();
289 elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
290 $self->_pushback($line);
291 $self->_parse_short_format();
295 $self->throw("Unable to determine the format type.");
302 =head2 _parse_summary_format
304 Title : _parse_summary_format
305 Usage : $self->_parse_summary_format
306 Function: Method to parse summary/full format from signalp output
307 It automatically fills filtered features.
313 sub _parse_summary_format
{
320 while(my $line = $self->_readline()){
322 if($line =~ /^SignalP-NN result:/){
323 $self->_pushback($line);
324 $feature = $self->_parse_nn_result($feature);
326 if($line =~ /^SignalP-HMM result:/){
327 $self->_pushback($line);
328 $feature = $self->_parse_hmm_result($feature);
331 if($line =~ /^---------/ && $feature){
332 my $new_feature = $self->create_feature($feature);
333 push @
{$self->{_features
}}, $new_feature if $new_feature;
342 =head2 _parse_nn_result
344 Title : _parse_nn_result
345 Usage : obj->_parse_nn_result
346 Function: Parses the Neuronal Network (NN) part of the result
347 Returns : Hash reference
353 sub _parse_nn_result
{
355 my($self, $feature) = @_;
361 #>MGG_11635.5 length = 100
362 ## Measure Position Value Cutoff signal peptide?
363 # max. C 37 0.087 0.32 NO
364 # max. Y 37 0.042 0.33 NO
365 # max. S 3 0.062 0.87 NO
366 # mean S 1-36 0.024 0.48 NO
367 # D 1-36 0.033 0.43 NO
369 while(my $line = $self->_readline()){
373 if($line =~ /^SignalP-NN result:/){
378 $self->throw("Wrong line for parsing NN results.") unless $ok;
380 if ($line=~/^\>(\S+)\s+length/) {
385 elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
386 $feature->{maxCprob
} = $1;
390 elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
391 $feature->{maxYprob
} = $1;
395 elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
396 $feature->{maxSprob
} = $1;
400 elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
401 $feature->{meanSprob
} = $1;
405 elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
406 $feature->{Dprob
} = $1;
410 #If we don't have this line it means that all the factors cutoff are equal to 'NO'
411 elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
412 #if($self->_filterok(\%facts)){
413 #$feature->{name} = $self->seqname();
414 #$feature->{start} = 1;
415 $feature->{end
} = $1 + 1; #To be consistent with end given in short format
419 elsif($line =~ /^\s*$/){
424 if($self->_filterok(\
%facts)){
425 $feature->{name
} = $self->seqname();
426 $feature->{start
} = 1;
427 $feature->{nnPrediction
} = 'signal-peptide';
434 =head2 _parse_hmm_result
436 Title : _parse_hmm_result
437 Usage : obj->_parse_hmm_result
438 Function: Parses the Hiden Markov Model (HMM) part of the result
439 Returns : Hash reference
444 sub _parse_hmm_result
{
446 my ($self, $feature_hash) = @_;
452 #Prediction: Non-secretory protein
453 #Signal peptide probability: 0.000
454 #Signal anchor probability: 0.000
455 #Max cleavage site probability: 0.000 between pos. -1 and 0
457 while(my $line = $self->_readline()){
460 next if $line =~ /^\s*$/o;
462 if($line =~ /^SignalP-HMM result:/){
467 $self->throw("Wrong line for parsing HMM result.") unless $ok;
469 if($line =~ /^>(\S+)/){
470 #In case we already seen a name with NN results
471 $feature_hash->{name
} = $1 unless $self->seqname();
473 elsif($line =~ /Prediction: (.+)$/){
474 $feature_hash->{hmmPrediction
} = $1;
476 elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
477 $feature_hash->{peptideProb
} = $1;
479 elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
480 $feature_hash->{anchorProb
} = $1;
482 elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
483 $feature_hash->{cleavageSiteProb
} = $1;
484 #Strange case, if we don't have an end value in NN result (no nn method launched)
485 #We try anyway to get an end value, unless this value is lower than 1 which is
487 $feature_hash->{end
} = $2 if($2 > 1 && !$feature_hash->{end
});
488 $feature_hash->{start
} = 1 unless $feature_hash->{start
};
493 return $feature_hash;
496 =head2 _parse_short_format
498 Title : _parse_short_format
499 Usage : $self->_parse_short_format
500 Function: Method to parse short format from signalp output
501 It automatically fills filtered features.
507 sub _parse_short_format
{
513 $self->{_oformat
} = 'short';
516 # SignalP-NN euk predictions # SignalP-HMM euk predictions
517 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
518 #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
519 #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
520 #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
521 #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
523 while(my $line = $self->_readline()){
526 next if $line =~ /^\s*$|^# name/;
529 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
530 'both' : $line =~ /SignalP-NN/ ?
535 #$self->throw("It looks like the format is not 'short' format.") unless($ok);
537 my @data = split(/\s+/, $line);
538 $self->seqname($data[0]);
543 #NN results gives more fields than HMM
544 if($method eq 'both' || $method eq 'nn'){
546 $feature->{maxCprob
} = $data[1];
547 $factors->{maxC
} = $data[3];
548 $feature->{maxYprob
} = $data[4];
549 $factors->{maxY
} = $data[6];
550 $feature->{maxSprob
} = $data[7];
551 $factors->{maxS
} = $data[9];
552 $feature->{meanSprob
}= $data[10];
553 $factors->{meanS
} = $data[11];
554 $feature->{Dprob
} = $data[12];
555 $factors->{D
} = $data[13];
556 #It looks like the max Y position is reported as the most likely cleavage position
557 $feature->{end
} = $data[5];
558 $feature->{nnPrediction
} = 'signal-peptide';
560 if($method eq 'both'){
561 $feature->{hmmPrediction
} = $data[15] eq 'Q' ?
'Non-secretory protein' : 'Signal peptide';
562 $feature->{cleavageSiteProb
} = $data[16];
563 $feature->{peptideProb
} = $data[19];
566 elsif($method eq 'hmm'){
567 #In short output anchor probability is not given
568 $feature->{hmmPrediction
} = $data[1] eq 'Q' ?
'Non-secretory protein' : 'Signal peptide';
569 $feature->{cleavageSiteProb
} = $data[2];
570 $feature->{peptideProb
} = $data[5];
571 #It looks like the max cleavage probability position is given by the Cmax proability
572 $feature->{end
} = $data[3];
575 #Unfortunately, we cannot parse the filters for hmm method.
576 if($self->_filterok($factors)){
577 $feature->{name
} = $self->seqname();
578 $feature->{start
} = 1;
579 $feature->{source
} = 'Signalp';
580 $feature->{primary
} = 'signal_peptide';
581 $feature->{program
} = 'Signalp';
582 $feature->{logic_name
} = 'signal_peptide';
584 my $new_feat = $self->create_feature($feature);
585 push @
{$self->{_features
}}, $new_feat if $new_feat;
592 =head2 create_feature
594 Title : create_feature
595 Usage : obj->create_feature(\%feature)
596 Function: Internal(not to be used directly)
605 my ($self, $feat) = @_;
607 #If we don't have neither start nor end, we return.
608 unless($feat->{name
} && $feat->{start
} && $feat->{end
}){
612 # create feature object
613 my $feature = Bio
::SeqFeature
::Generic
->new(
614 -seq_id
=> $feat->{name
},
615 -start
=> $feat->{start
},
616 -end
=> $feat->{end
},
617 -score
=> defined($feat->{peptideProb
}) ?
$feat->{peptideProb
} : '',
618 -source
=> 'Signalp',
619 -primary
=> 'signal_peptide',
620 -logic_name
=> 'signal_peptide',
623 $feature->add_tag_value('peptideProb', $feat->{peptideProb
});
624 $feature->add_tag_value('anchorProb', $feat->{anchorProb
});
625 $feature->add_tag_value('evalue',$feat->{anchorProb
});
626 $feature->add_tag_value('percent_id','NULL');
627 $feature->add_tag_value("hid",$feat->{primary
});
628 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction
});
629 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb
}) if($feat->{cleavageSiteProb
});
630 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction
}) if($feat->{nnPrediction
});
631 $feature->add_tag_value('maxCprob', $feat->{maxCprob
}) if(defined($feat->{maxCprob
}));
632 $feature->add_tag_value('maxSprob', $feat->{maxSprob
}) if(defined($feat->{maxSprob
}));
633 $feature->add_tag_value('maxYprob', $feat->{maxYprob
}) if(defined($feat->{maxYprob
}));
634 $feature->add_tag_value('meanSprob', $feat->{meanSprob
}) if(defined($feat->{meanSprob
}));
635 $feature->add_tag_value('Dprob', $feat->{Dprob
}) if(defined($feat->{Dprob
}));
644 Usage : obj->seqname($name)
645 Function: Internal(not to be used directly)
653 my ($self,$seqname)=@_;
655 if (defined($seqname)){
656 $self->{'seqname'} = $seqname;
659 return $self->{'seqname'};