maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Tools / Analysis / Protein / Scansite.pm
blob11f4bec61e1f14bc7b6cab670c3ab352aec0b732
2 # BioPerl module for Bio::Tools::Analysis::Protein::Scansite
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Richard Adams <richard.adams@ed.ac.uk>
8 # Copyright Richard Adams
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
18 =head1 SYNOPSIS
20 use Bio::Tools::Analysis::Protein::Scansite;
22 my $seq; # a Bio::PrimarySeqI object
24 my $tool = Bio::Tools::Analysis::Protein::Scansite->new
25 ( -seq => $seq->primary_seq );
27 # run Scansite prediction on a sequence
28 $tool->run();
30 # alternatively you can say
31 $tool->seq($seq->primary_seq)->run;
33 die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
35 print $tool->result; # print raw prediction to STDOUT
37 foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
39 # do something to SeqFeature
40 # e.g. print as GFF
41 print $feat->gff_string, "\n";
42 # or store within the sequence - if it is a Bio::RichSeqI
43 $seq->add_SeqFeature($feat);
47 =head1 DESCRIPTION
49 This class is a wrapper around the Scansite 2.0 server which produces
50 predictions for serine, threonine and tyrosine phosphorylation sites
51 in eukaryotic proteins. At present this is a basic wrapper for the
52 "Scan protein by input sequence" functionality, which takes a sequence
53 and searches for motifs, with the option to select the search
54 stringency. At present, searches for specific phosphorylation
55 sites are not supported; all predicted sites are returned.
57 =head2 Return formats
59 The Scansite results can be obtained in several formats:
61 =over 3
63 =item 1.
65 By calling
67 my $res = $tool->result('');
69 $res holds a string of the predicted sites in tabular format.
71 =item 2.
73 By calling
75 my $data_ref = $tool->result('value')
77 $data_ref is a reference to an array of hashes. Each element in the
78 array represents a predicted phosphorylation site. The hash keys are
79 the names of the data fields,i.e.,
81 'motif' => 'Casn_Kin1' # name of kinase
82 'percentile' => 0.155 # see Scansite docs
83 'position' => 9 # position in protein
84 'protein' => 'A1' # protein id
85 'score' => 0.3696 # see Scansite docs
86 'sequence' => 'ASYFDTASYFSADAT' # sequence surrounding site
87 'site' => 'S9' # phosphorylated residue
88 'zscore' => '-3.110' # see Scansite docs
90 =item 3.
92 By calling
94 my @fts = $tool->Result('Bio::SeqFeatureI');
96 which returns an array of L<Bio::SeqFeatureI> compliant objects with
97 primary tag value 'Site' and tag names of 'motif', 'score',
98 'sequence', 'zscore' as above.
100 =back
102 See L<http://scansite.mit.edu/>.
104 This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
105 write wrappers on various services. This class uses a web resource and
106 therefore inherits from L<Bio::WebAgent>.
108 =head1 SEE ALSO
110 L<Bio::SimpleAnalysisI>,
111 L<Bio::WebAgent>
113 =head1 FEEDBACK
115 =head2 Mailing Lists
117 User feedback is an integral part of the evolution of this and other
118 Bioperl modules. Send your comments and suggestions preferably to one
119 of the Bioperl mailing lists. Your participation is much appreciated.
121 bioperl-l@bioperl.org - General discussion
122 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
124 =head2 Support
126 Please direct usage questions or support issues to the mailing list:
128 I<bioperl-l@bioperl.org>
130 rather than to the module maintainer directly. Many experienced and
131 reponsive experts will be able look at the problem and quickly
132 address it. Please include a thorough description of the problem
133 with code and data examples if at all possible.
135 =head2 Reporting Bugs
137 Report bugs to the Bioperl bug tracking system to help us keep track
138 the bugs and their resolution. Bug reports can be submitted via the
139 web:
141 https://github.com/bioperl/bioperl-live/issues
143 =head1 AUTHORS
145 Richard Adams, Richard.Adams@ed.ac.uk,
147 =head1 APPENDIX
149 The rest of the documentation details each of the object
150 methods. Internal methods are usually preceded with a _
152 =cut
155 # Let the code begin...
158 package Bio::Tools::Analysis::Protein::Scansite;
159 use vars qw($FLOAT @STRINGENCY);
160 use strict;
161 use IO::String;
162 use Bio::SeqIO;
163 use HTTP::Request::Common qw(POST);
164 use Bio::SeqFeature::Generic;
166 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
168 $FLOAT = '[+-]?\d*\.\d*';
169 @STRINGENCY = qw(High Medium Low);
170 my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
173 my $ANALYSIS_SPEC =
175 'name' => 'Scansite',
176 'type' => 'Protein',
177 'version' => '2.0',
178 'supplier' => 'Massachusetts Institute of Technology',
179 'description' => 'Prediction of serine, threonine and tyrosine
180 phosphorylation sites in eukaryotic proteins',
183 my $INPUT_SPEC =
186 'mandatory' => 'true',
187 'type' => 'Bio::PrimarySeqI',
188 'name' => 'seq',
191 'mandatory' => 'false',
192 'type' => 'text',
193 'name' => 'protein_id',
194 'default' => 'unnamed',
197 'mandatory' => 'false',
198 'type' => 'text',
199 'name' => 'stringency',
200 'default' => 'High',
204 my $RESULT_SPEC =
206 '' => 'bulk', # same as undef
207 'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
208 'raw' => 'Array of {motif=>, percentile=>, position=>,
209 protein=>, score=>, site=>, zscore=>
210 sequence=>
215 =head2 result
217 Name : result
218 Usage : $job->result (...)
219 Returns : a result created by running an analysis
220 Args : none (but an implementation may choose
221 to add arguments for instructions how to process
222 the raw result)
224 The method returns a scalar representing a result of an executed
225 job. If the job was terminated by an error, the result may contain
226 an error message instead of the real data.
228 This implementation returns differently processed data depending on
229 argument:
231 =over 3
233 =item undef
235 Returns the raw ASCII data stream but without HTML tags
237 =item 'Bio::SeqFeatureI'
239 The argument string defined the type of bioperl objects returned in an
240 array. The objects are L<Bio::SeqFeature::Generic>.
242 =item 'parsed'
244 Returns a reference to an array of hashes containing the data of one
245 phosphorylation site prediction. Key values are:
247 motif, percentile, position, protein, score, site, zscore, sequence.
250 =back
253 =cut
255 sub result {
256 my ($self,$value) = @_;
257 if( !exists($self->{'_result'}) || $self->status ne 'COMPLETED'){
258 $self->throw("Cannot get results, analysis not run!");
260 my @fts;
262 if ($value ) {
263 if ($value eq 'Bio::SeqFeatureI') {
264 for my $hit (@{$self->{'_parsed'}}) {
265 push @fts, Bio::SeqFeature::Generic->new(
266 -start => $hit->{'position'},
267 -end => $hit->{'position'},
268 -primary_tag => 'Site',
269 -source => 'Scansite',
270 -tag => {
271 score => $hit->{'score'},
272 zscore => $hit->{'zscore'},
273 motif => $hit->{'motif'},
274 site => $hit->{'site'},
275 sequence => $hit->{'sequence'},
279 return @fts;
281 elsif ($value eq 'meta') {
282 $self->throw("No meta sequences available in this analysis!");
284 ## else get here
285 return $self->{'_parsed'};
288 return $self->{'_result'};
291 =head2 stringency
293 Usage : $job->stringency(...)
294 Returns : The significance stringency of a prediction
295 Args : None (retrieves value) or 'High', 'Medium' or 'Low'.
296 Purpose : Get/setter of the stringency to be sumitted for analysis.
298 =cut
300 sub stringency {
301 my ($self,$value) = @_;
302 if( $value) {
303 if (! grep{$_=~ /$value/i}@STRINGENCY ) {
304 $self->throw("I need a stringency of [".
305 join " ", @STRINGENCY .
306 "], not [$value]");
308 $self->{'_stringency'} = $value;
309 return $self;
311 return $self->{'_stringency'} || $self->input_spec->[2]{'default'} ;
314 =head2 protein_id
316 Usage : $job->protein_id(...)
317 Returns : The sequence id of the protein or 'unnamed' if not set.
318 Args : None
319 Purpose : Getter of the seq_id. Returns the display_id of the sequence
320 object.
322 =cut
324 sub protein_id {
325 my $self = shift;
326 return defined ($self->seq())? $self->seq->display_id()
327 : $self->input_spec->[1]{'default'};
330 sub _init
332 my $self = shift;
333 $self->url($URL);
334 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
335 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
336 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
337 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
338 return $self;
341 sub _run {
342 my $self = shift;
344 # format the sequence into fasta
345 $self->delay(1);
346 # delay repeated calls by default by 3 sec, set delay() to change
347 $self->sleep;
349 $self->status('TERMINATED_BY_ERROR');
351 my $request = POST $self->url,
352 Content => [sequence => $self->seq->seq(),
353 protein_id => $self->protein_id(),
354 motif_option => 'all',
355 motifs => '',
356 motif_groups => '',
357 stringency => $self->stringency(),
358 #domain_flag => '',
359 submit => "Submit Request",
361 ## raw html report,
362 my $content = $self->request($request);
363 my $text = $content->content;
365 ##access result data from tag in html
366 my @parsed_Results = ();
367 my @unwantedParams = qw(db source class);
368 my @results = split /sitestats\.phtml\?/, $text;
369 shift @results;
371 ##this module generates 'parsed' output directly from html,
372 ## avoids having toparse twice.
374 for my $hit (@results) {
375 ## get results string
376 my ($res) = $hit =~ /^(.+?)"/;
378 #get key value pairs
379 my %params = $res =~/(\w+)=([^&]+)/g;
381 ##remove unwanted data from hash
382 map{delete $params{$_}} @unwantedParams;
383 push @parsed_Results, \%params;
386 ## now generate text output in table format
387 my $out_Str = '';
388 $out_Str .= $self->_make_header(\@parsed_Results);
389 $out_Str .= $self->_add_data(\@parsed_Results);
392 $self->{'_result'} = $out_Str;
393 $self->{'_parsed'} = \@parsed_Results;
395 ## is successsful if there are results or if there are no results and
396 ## this beacuse there are no matches, not because of parsing errors etc.
397 $self->status('COMPLETED') if $text ne '' &&
398 (scalar @results > 0 ||
399 (scalar @results == 0 && $text =~/No sites found/));
400 if ($text =~ /server\s+error/i) {
401 $self->throw("Internal server error:\n\n $text");
402 return;
406 sub _process_arguments {
408 # extra checking for sequence length
409 # mitoprot specific argument testing
410 my ($self, $args) = @_;
411 #use base checking for existence of mandatory fields
412 $self->SUPER::_process_arguments($args);
414 # specific requirements
415 $self->throw("Sequence must be > 15 amino acids long!")
416 if $self->seq->length < 15;
417 $self->throw("Sequence must be protein")
418 unless $self->seq->alphabet() eq 'protein';
421 sub _make_header {
422 my ($self, $res) = @_;
423 my $header = '';
424 for my $k (sort keys %{$res->[0]} ){
425 next if $k eq 'sequence';
426 $header .= $k;
427 $header .= ' 'x(12 -length($k));
429 $header .= "sequence\n\n";
430 return $header;
433 sub _add_data {
434 my ($self, $res) = @_;
435 my $outstr = '';
436 for my $hit (@$res) {
437 for my $k (sort keys %$hit ){
438 next if $k eq 'sequence';
439 $outstr .= $hit->{$k};
440 $outstr .= ' 'x(12 - length($hit->{$k}));
442 $outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};
444 return $outstr;