2 # BioPerl module for Bio::Tools::Analysis::Protein::Scansite
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Richard Adams <richard.adams@ed.ac.uk>
8 # Copyright Richard Adams
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
20 use Bio::Tools::Analysis::Protein::Scansite;
22 my $seq; # a Bio::PrimarySeqI object
24 my $tool = Bio::Tools::Analysis::Protein::Scansite->new
25 ( -seq => $seq->primary_seq );
27 # run Scansite prediction on a sequence
30 # alternatively you can say
31 $tool->seq($seq->primary_seq)->run;
33 die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
35 print $tool->result; # print raw prediction to STDOUT
37 foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
39 # do something to SeqFeature
41 print $feat->gff_string, "\n";
42 # or store within the sequence - if it is a Bio::RichSeqI
43 $seq->add_SeqFeature($feat);
49 This class is a wrapper around the Scansite 2.0 server which produces
50 predictions for serine, threonine and tyrosine phosphorylation sites
51 in eukaryotic proteins. At present this is a basic wrapper for the
52 "Scan protein by input sequence" functionality, which takes a sequence
53 and searches for motifs, with the option to select the search
54 stringency. At present, searches for specific phosphorylation
55 sites are not supported; all predicted sites are returned.
59 The Scansite results can be obtained in several formats:
67 my $res = $tool->result('');
69 $res holds a string of the predicted sites in tabular format.
75 my $data_ref = $tool->result('value')
77 $data_ref is a reference to an array of hashes. Each element in the
78 array represents a predicted phosphorylation site. The hash keys are
79 the names of the data fields, i.e.,
81 'motif' => 'Casn_Kin1' # name of kinase
82 'percentile' => 0.155 # see Scansite docs
83 'position' => 9 # position in protein
84 'protein' => 'A1' # protein id
85 'score' => 0.3696 # see Scansite docs
86 'sequence' => 'ASYFDTASYFSADAT' # sequence surrounding site
87 'site' => 'S9' # phosphorylated residue
88 'zscore' => '-3.110' # see Scansite docs
94 my @fts = $tool->result('Bio::SeqFeatureI');
96 which returns an array of L<Bio::SeqFeatureI> compliant objects with
97 primary tag value 'Site' and tag names of 'motif', 'score',
98 'sequence', 'zscore' as above.
102 See L<http://scansite.mit.edu/>.
104 This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
105 write wrappers on various services. This class uses a web resource and
106 therefore inherits from L<Bio::WebAgent>.
110 L<Bio::SimpleAnalysisI>,
117 User feedback is an integral part of the evolution of this and other
118 Bioperl modules. Send your comments and suggestions preferably to one
119 of the Bioperl mailing lists. Your participation is much appreciated.
121 bioperl-l@bioperl.org - General discussion
122 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
126 Please direct usage questions or support issues to the mailing list:
128 I<bioperl-l@bioperl.org>
130 rather than to the module maintainer directly. Many experienced and
131 responsive experts will be able to look at the problem and quickly
132 address it. Please include a thorough description of the problem
133 with code and data examples if at all possible.
135 =head2 Reporting Bugs
137 Report bugs to the Bioperl bug tracking system to help us keep track
138 of the bugs and their resolution. Bug reports can be submitted via the
141 https://github.com/bioperl/bioperl-live/issues
145 Richard Adams, Richard.Adams@ed.ac.uk,
149 The rest of the documentation details each of the object
150 methods. Internal methods are usually preceded with a _
155 # Let the code begin...
158 package Bio
::Tools
::Analysis
::Protein
::Scansite
;
159 use vars
qw($FLOAT @STRINGENCY);
163 use HTTP::Request::Common qw(POST);
164 use Bio
::SeqFeature
::Generic
;
166 use base
qw(Bio::Tools::Analysis::SimpleAnalysisBase);
168 $FLOAT = '[+-]?\d*\.\d*';
169 @STRINGENCY = qw(High Medium Low);
170 my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
175 'name' => 'Scansite',
178 'supplier' => 'Massachusetts Institute of Technology',
179 'description' => 'Prediction of serine, threonine and tyrosine
180 phosphorylation sites in eukaryotic proteins',
186 'mandatory' => 'true',
187 'type' => 'Bio::PrimarySeqI',
191 'mandatory' => 'false',
193 'name' => 'protein_id',
194 'default' => 'unnamed',
197 'mandatory' => 'false',
199 'name' => 'stringency',
206 '' => 'bulk', # same as undef
207 'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
208 'raw' => 'Array of {motif=>, percentile=>, position=>,
209 protein=>, score=>, site=>, zscore=>
218 Usage : $job->result (...)
219 Returns : a result created by running an analysis
220 Args : none (but an implementation may choose
221 to add arguments for instructions how to process
224 The method returns a scalar representing a result of an executed
225 job. If the job was terminated by an error, the result may contain
226 an error message instead of the real data.
228 This implementation returns differently processed data depending on
235 Returns the raw ASCII data stream but without HTML tags
237 =item 'Bio::SeqFeatureI'
239 The argument string defines the type of bioperl objects returned in an
240 array. The objects are L<Bio::SeqFeature::Generic>.
244 Returns a reference to an array of hashes containing the data of one
245 phosphorylation site prediction. Key values are:
247 motif, percentile, position, protein, score, site, zscore, sequence.
256 my ($self,$value) = @_;
257 if( !exists($self->{'_result'}) || $self->status ne 'COMPLETED'){
258 $self->throw("Cannot get results, analysis not run!");
263 if ($value eq 'Bio::SeqFeatureI') {
264 for my $hit (@
{$self->{'_parsed'}}) {
265 push @fts, Bio
::SeqFeature
::Generic
->new(
266 -start
=> $hit->{'position'},
267 -end
=> $hit->{'position'},
268 -primary_tag
=> 'Site',
269 -source
=> 'Scansite',
271 score
=> $hit->{'score'},
272 zscore
=> $hit->{'zscore'},
273 motif
=> $hit->{'motif'},
274 site
=> $hit->{'site'},
275 sequence
=> $hit->{'sequence'},
281 elsif ($value eq 'meta') {
282 $self->throw("No meta sequences available in this analysis!");
285 return $self->{'_parsed'};
288 return $self->{'_result'};
293 Usage : $job->stringency(...)
294 Returns : The significance stringency of a prediction
295 Args : None (retrieves value) or 'High', 'Medium' or 'Low'.
296 Purpose : Get/setter of the stringency to be submitted for analysis.
301 my ($self,$value) = @_;
303 if (! grep{$_=~ /$value/i}@STRINGENCY ) {
304 $self->throw("I need a stringency of [".
305 join " ", @STRINGENCY .
308 $self->{'_stringency'} = $value;
311 return $self->{'_stringency'} || $self->input_spec->[2]{'default'} ;
316 Usage : $job->protein_id(...)
317 Returns : The sequence id of the protein or 'unnamed' if not set.
319 Purpose : Getter of the seq_id. Returns the display_id of the sequence
326 return defined ($self->seq())?
$self->seq->display_id()
327 : $self->input_spec->[1]{'default'};
334 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
335 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
336 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
337 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
344 # format the sequence into fasta
346 # delay repeated calls by default by 3 sec, set delay() to change
349 $self->status('TERMINATED_BY_ERROR');
351 my $request = POST
$self->url,
352 Content
=> [sequence
=> $self->seq->seq(),
353 protein_id
=> $self->protein_id(),
354 motif_option
=> 'all',
357 stringency
=> $self->stringency(),
359 submit
=> "Submit Request",
362 my $content = $self->request($request);
363 my $text = $content->content;
365 ##access result data from tag in html
366 my @parsed_Results = ();
367 my @unwantedParams = qw(db source class);
368 my @results = split /sitestats\.phtml\?/, $text;
371 ##this module generates 'parsed' output directly from html,
372 ## avoids having to parse twice.
374 for my $hit (@results) {
375 ## get results string
376 my ($res) = $hit =~ /^(.+?)"/;
379 my %params = $res =~/(\w+)=([^&]+)/g;
381 ##remove unwanted data from hash
382 map{delete $params{$_}} @unwantedParams;
383 push @parsed_Results, \
%params;
386 ## now generate text output in table format
388 $out_Str .= $self->_make_header(\
@parsed_Results);
389 $out_Str .= $self->_add_data(\
@parsed_Results);
392 $self->{'_result'} = $out_Str;
393 $self->{'_parsed'} = \
@parsed_Results;
395 ## is successful if there are results, or if there are no results and
396 ## this is because there are no matches, not because of parsing errors etc.
397 $self->status('COMPLETED') if $text ne '' &&
398 (scalar @results > 0 ||
399 (scalar @results == 0 && $text =~/No sites found/));
400 if ($text =~ /server\s+error/i) {
401 $self->throw("Internal server error:\n\n $text");
# Validate the arguments of a Scansite job before it is submitted.
#
# The generic checks are delegated to the parent class's
# _process_arguments (per the original comment, this verifies that the
# mandatory input fields exist); afterwards the Scansite-specific
# requirements on the sequence are enforced.
#
#  Args    : $args - passed through unchanged to SUPER::_process_arguments
#  Returns : 1 when every check passes
#  Throws  : via $self->throw when the sequence is shorter than 15
#            residues or when its alphabet is not 'protein'
sub _process_arguments {
    my ($self, $args) = @_;

    # Use the base-class checking first.
    $self->SUPER::_process_arguments($args);

    # Scansite-specific requirements. The length check rejects only
    # sequences shorter than 15 residues, so the message now says
    # "at least 15" instead of the former, inaccurate "> 15".
    $self->throw("Sequence must be at least 15 amino acids long!")
        if $self->seq->length < 15;

    $self->throw("Sequence must be protein")
        unless $self->seq->alphabet() eq 'protein';

    # Same true value the original yielded implicitly from its last
    # statement when no exception was thrown.
    return 1;
}
422 my ($self, $res) = @_;
424 for my $k (sort keys %{$res->[0]} ){
425 next if $k eq 'sequence';
427 $header .= ' 'x
(12 -length($k));
429 $header .= "sequence\n\n";
434 my ($self, $res) = @_;
436 for my $hit (@
$res) {
437 for my $k (sort keys %$hit ){
438 next if $k eq 'sequence';
439 $outstr .= $hit->{$k};
440 $outstr .= ' 'x
(12 - length($hit->{$k}));
442 $outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};