added some POD.
[sgn.git] / cgi-bin / tools / sigpep_finder / sigpep_find.pl
blobcdedfb74fbde186529ea7e079d3413a514213931
1 #!/usr/bin/perl
2 # Signal peptide finder using a genentech-script-generated HMMER model
3 # modified by Evan 9 / 16 / 05
5 # arguments: sequences (string), filename (string), display_opt(filter/color),
6 # use_eval_cutoff(bool), use_bval_cutoff(bool), eval_cutoff(real), bval_cutoff(real)
8 use strict;
9 use English;
11 use CXGN::Page;
12 use Bio::SearchIO;
13 use Bio::SeqIO;
14 use IO::String;
15 use CXGN::VHost;
17 my $vhost_conf = CXGN::VHost->new();
18 my $ss_find_obj = SigpepFind->new($vhost_conf);
19 $ss_find_obj->process_seqs();
21 exit;
23 ################################################################################
24 package SigpepFind;
25 use strict;
26 use English;
28 #parameters: VHost obj
29 sub new {
30 my $classname = shift;
31 my $vhost_conf = shift;
32 my $obj = {}; #a reference to an anonymous hash
34 #all fields are listed here
35 $obj->{debug} = 0; #0 or 1
36 $obj->{sequences} = ""; #string with raw input from textbox
37 $obj->{file_contents} = ""; #string with raw input from file
38 $obj->{illegal_input_count} =
39 0; #number of illegally formatted sequences found
40 $obj->{seqarray} =
41 []; #arrayref of records like {seq => input sequence, id => descriptor}
42 $obj->{error} = 0; #set to 1 in check_input() if necessary
43 $obj->{output} = ""; #raw HMMER output
44 $obj->{tmpdir} =
45 $vhost_conf->get_conf('basepath')
46 . $vhost_conf->get_conf('tempfiles_subdir')
47 . "/sigpep_finder";
48 $obj->{hmmsearch_path} = $vhost_conf->get_conf('hmmsearch_location');
49 $obj->{page} = CXGN::Page->new( "Signal Peptide Finder", "Evan" );
50 $obj->{content} = ""; #the output HTML
52 bless( $obj, $classname ); #use $obj as an object of this type from now on
53 $obj->get_params(); #retrieve parameters
54 return $obj; #so it can be used like "$obj->func()"
57 #read parameters to script from the page object
58 #and read input from textbox and file into $self->{sequences} and $self->{file_contents}, respectively
59 sub get_params {
60 my $self = shift(@_);
61 ( $self->{sequences} ) =
62 $self->{page}->get_arguments('sequences')
63 ; #retrieve the seqlist in string form from the in-page textbox
64 ( $self->{display_opt} ) =
65 $self->{page}->get_arguments('display_opt')
66 ; #whether to filter sequences in output or to show them in green/red
67 ( $self->{truncate} ) = $self->{page}->get_arguments('truncate_seqs');
68 if ( $self->{truncate} ) {
69 ( $self->{truncate_type} ) =
70 $self->{page}->get_arguments('truncate_type');
73 #get an upload object to upload a file -- this and filling in the textbox are not mutually exclusive;
74 #each can hold either a list of fasta sequences or a single plaintext sequence
75 my $upload = $self->{page}->get_upload();
77 #check whether there's a filename in the filename text field
78 if ( defined $upload ) {
79 my $fh = $upload->fh();
80 my @fileLines = <$fh>
81 ; #need this line to put the file into an array context; can't go straight to the join()
82 $self->{file_contents} = join( '', @fileLines );
84 else {
85 $self->{file_contents} = '';
88 #check input format and fill $self->{seqarray}
89 $self->check_input();
91 #output options
92 ( $self->{use_eval_cutoff} ) =
93 $self->{page}->get_arguments('use_eval_cutoff');
94 if ( defined $self->{use_eval_cutoff} ) {
95 ( $self->{eval_cutoff} ) = $self->{page}->get_arguments("eval_cutoff");
98 ( $self->{use_bval_cutoff} ) =
99 $self->{page}->get_arguments('use_bval_cutoff');
100 if ( defined $self->{use_bval_cutoff} ) {
101 ( $self->{bval_cutoff} ) = $self->{page}->get_arguments("bval_cutoff");
105 #make sure contents of input fields are legal, and put them in a useful array form in {seqarray}
106 #no parameters; return 0 or 1
107 sub check_input {
108 my $self = shift(@_);
110 #process input from textbox and fill seqarray
111 my $error_msg =
112 $self->read_sequences( $self->{sequences}, 'sequence input box' )
113 ; #includes the period at the end of a sentence
114 if ( $error_msg ne '' ) {
115 $self->print_warning("$error_msg Skipping the rest of the input.");
118 #process input from uploaded file and fill seqarray
119 $error_msg =
120 $self->read_sequences( $self->{file_contents}, 'uploaded file' );
121 if ( $error_msg ne '' ) {
122 $self->print_warning("$error_msg Skipping the rest of the input.");
125 #print extracted sequence info
126 $self->debug( "seqarray: " . $self->{seqarray} );
127 $self->debug( "length of seqarray: " . scalar( @{ $self->{seqarray} } ) );
128 $self->debug(
129 "uploaded ids:\n"
130 . join( ",\n",
131 map( $_->{'id'} . ": " . $_->{'seq'}, @{ $self->{seqarray} } ) )
132 . "\n"
136 #add given data, which has not been checked for legality or format, to $self->{seqarray},
137 #which is an arrayref with contents being hashrefs with {id}, {seq} fields
138 #two parameters: a stringref containing the data to be added,
139 #a one-word description of where the input is coming from (for debugging)
140 #return a string that's empty if no error
141 sub read_sequences {
142 my ( $self, $data, $input_loc ) = @_;
144 return '' unless $data =~ /\S/; #< skip if no input seqs
146 open my $in_fh, "<", \$data
147 or die "could not open dataref";
148 my $inio = Bio::SeqIO->new(
149 "-fh" => $in_fh,
150 "-format" => $self->guess_input_format( \$data )
152 $inio->alphabet("protein");
153 $inio->verbose(2); #< make sure SeqIO will throw an exception
154 #whenever appropriate
156 my $seq_count = 0;
157 my $valid_seqs = 0;
158 eval {
159 while ( my $seq = $inio->next_seq )
161 $seq_count++;
162 my $temp_seq = uc $seq->seq();
163 if ( my $illegals = $self->contains_illegal_alphabet($temp_seq) ) {
164 $self->print_warning( "Sequence #$seq_count ("
165 . ( $seq->id || 'no name' )
166 . ") in the $input_loc contains illegal character(s) '$illegals', skipping."
168 next;
171 #add a record
172 push(
173 @{ $self->{seqarray} },
175 'id' => defined( $seq->id() ) ? $seq->id() : 'UnknownSeq',
176 'seq' => $temp_seq
179 $valid_seqs++;
182 if ($EVAL_ERROR) {
183 $self->print_error("Sequence processing failed, please check input");
186 unless ( $valid_seqs > 0 ) {
187 $self->print_error("Sequence processing failed, no valid sequences.");
189 return '';
192 #return our best guess as to what format the given input string is in,
193 #choosing from the following format strings accepted by Bioperl's SeqIO:
194 #raw fasta
195 sub guess_input_format {
196 my $self = shift(@_);
197 my $dataref = shift(@_);
198 if ( $$dataref =~ m/^\s*>/ ) {
199 return "fasta";
201 else {
203 #first remove all whitespace so we interpret it as a single sequence
204 $$dataref =~ s/\s*//g;
205 return "raw";
209 #return whether the given string contains any illegal alphabet characters
210 #one parameter: a string that should be just a sequence, with nothing but [A-Z] in it,
211 #and maybe a * to mark the end of the sequence
212 #(also requires that the actual sequence have length at least one)
213 sub contains_illegal_alphabet {
214 my ( $self, $data ) = @_;
215 if ( $data =~ m/([bxz])/i ) {
216 return $1;
218 return 0;
221 #process {seqarray} field; search with existing HMM
222 #no parameters
223 sub process_seqs {
224 my $self = shift;
226 # if we have an error, don't run the rest of it
227 if ( $self->{error} ) {
228 $self->write_page();
229 return;
232 #sequences are now in array form in {seqarray}
233 #hmmsearch input against may10_combined_lenSPEX.hmm (from 5 / 10 / 04; the best of the various hmms I created)
235 #output to temp file
236 my $outfilename = "$self->{tmpdir}/tmp.in.$$"; #$$ is this process' id
237 open( OUTPUT, ">$outfilename" )
238 or die "can't open '$outfilename' for writing: $!";
239 foreach my $s ( @{ $self->{seqarray} } ) {
240 print OUTPUT ( ">", $s->{'id'}, "\n", $s->{'seq'}, "\n" )
241 or die "can't output indata to tempfile";
243 close(OUTPUT);
245 #call hmmsearch
246 my $infilename = "$self->{tmpdir}/tmp.out.$$";
247 my $options = "-A0";
248 if ( $self->{display_opt} eq "filter" ) {
249 if ( $self->{use_eval_cutoff} ) {
250 $options .= " -E" . $self->{eval_cutoff};
252 if ( $self->{use_bval_cutoff} ) {
253 $options .= " -T" . $self->{bval_cutoff};
256 ################### make sure the call to hmmer is up to date ###################
257 my $hmmfile =
258 $self->{page}->path_to(
259 'cgi-bin/tools/sigpep_finder/hmmlib/may10_combined_lenSPEX.hmm');
260 -f $hmmfile or die "could not find hmmfile '$hmmfile'";
261 system
262 "$self->{hmmsearch_path} $options $hmmfile $outfilename > $infilename";
263 die "$! running hmmsearch executable '$self->{hmmsearch_path}'"
264 if $CHILD_ERROR;
266 #input from temp output file by means of a Bio::SearchIO
267 my $resultString = '';
269 my $inio = Bio::SearchIO->new( -format => 'hmmer', -file => $infilename );
270 while ( my $result = $inio->next_result() ) {
272 #there should only be one "result", since we only use one hmm library
274 #result is a Bio::Search::Result::HMMERResult object
275 my $qname = $result->query_name();
276 $resultString .= <<EOHTML;
277 <h2 style="text-align: center">Results for HMM $qname</h1>
278 <p>If you elected to "show all output", note that we can't show more output than HMMER produces; it doesn't always
279 report results for all input sequences. The sequence marked "least likely" to have a signal peptide is only
280 least likely <i>among those sequences for which output was reported</i>; sequences that don't show up in the
281 results table are even less likely to start with signals.</p>
282 <p>For a detailed explanation of the output, see <a href="/documents/hmmer_userguide.pdf">the HMMER user guide (pdf)</a>. Look for the "search the
283 sequence database with hmmsearch" section.</p>
284 <table style="margin: 0 auto" border="0" cellspacing="0" cellpadding="1">
285 <tr>
286 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Sequence&nbsp;&nbsp;|&nbsp;&nbsp;Domain&nbsp;&nbsp;&nbsp;</b>
287 <td align="center" bgcolor="#d0ccee"><b>&nbsp;&nbsp;&nbsp;Score&nbsp;&nbsp;|&nbsp;&nbsp;E-value&nbsp;&nbsp;&nbsp;</b>
288 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Signal Likelihood&nbsp;&nbsp;&nbsp;</b>
289 </tr>
290 EOHTML
291 my $lineNum = 0;
292 my ( $hit, $hsp, $color, $colStr1, $colStr2 );
293 my $next_hit = $result->next_hit();
294 if ( !$next_hit ) {
295 $resultString .=
296 qq|</table>\n<div style="margin: 1em auto; text-align: center; width: 20%; color: #ff0000">[ no hits ]</div>\n|;
298 else {
299 while ( $hit = $next_hit ) {
300 $hsp = $hit->next_hsp();
301 $next_hit = $result->next_hit();
302 if ( $self->{display_opt} eq "color" ) {
303 if (
304 $hit->raw_score() > (
305 defined( $self->{bval_cutoff} )
306 ? $self->{bval_cutoff}
307 : -1e6
309 && $hsp->evalue() < (
310 defined( $self->{eval_cutoff} )
311 ? $self->{eval_cutoff}
312 : 1e12
316 $color = "#559955";
318 else {
319 $color = "#994444";
321 $colStr1 = "<font color=\"$color\"><b>";
322 $colStr2 = "</b></font>";
324 else {
325 $colStr1 = "";
326 $colStr2 = "";
328 $resultString .= "
329 <tr>
330 <td align=\"left\" bgcolor=\"#e0ddff\">"
331 . $colStr1
332 . $hit->name()
333 . $colStr2
334 . "<td align=\"left\" bgcolor=\"#d0ccee\">&nbsp;"
335 . $colStr1
336 . $hit->raw_score()
337 . $colStr2
338 . "<td align=\"center\" bgcolor=\"#e0ddff\">"
339 . $colStr1
341 ( $lineNum == 0 )
342 ? ( $next_hit ? "Most likely" : "n/a" )
343 : ( !$next_hit ? "Least likely" : "&#8595;" )
345 . $colStr2 . "</tr>
346 <tr>
347 <td align=\"right\" bgcolor=\"#e0ddff\">1&nbsp;"
348 . "<td align=\"right\" bgcolor=\"#d0ccee\">"
349 . $hsp->evalue()
350 . "<td align=\"center\" bgcolor=\"#e0ddff\">" . "</tr>";
351 $lineNum++;
353 $resultString .= "</table>\n";
355 $resultString .= <<EOHTML;
356 <table border="0" cellspacing="10" cellpadding="0">
357 <tr>
358 <td colspan="1" align="center"><b>HMMER histogram</b>
359 </tr>
360 <tr>
361 <td colspan="1" align="left">
362 &nbsp;&nbsp;&nbsp;The histogram compares the observed (obs) sequence score distribution to that expected
363 (exp) from a file of the same size as the one you submitted but full of random sequences. The further down
364 the page the distribution of equal signs is with respect to that of asterisks, the more likely it is that
365 your sequences contain signal peptides. If the sequences you submitted all come from the same protein family,
366 the histogram suggests how likely it is that that family is secreted.
367 </tr>
368 <tr>
369 <td align="left">
370 <pre><code>
371 EOHTML
373 #reopen file to read and print histogram
374 open( INPUT, "<$infilename" );
375 my $line;
376 while ( $line = <INPUT> ) {
377 if ( $line =~
378 m/istogram/ ) #line ought to read "Histogram of all scores:"
380 while ( ( $line = <INPUT> ) !~ m/^\s*$/ ) {
381 $resultString .= $line;
385 close(INPUT);
387 $resultString .= <<EOH;
388 </code></pre>
389 </tr>
390 </table>
395 $self->print($resultString);
397 #remove temp files
398 # `rm $outfilename $infilename`;
400 $self->write_page();
403 #print to the content field
404 #one parameter: string to be displayed
405 sub print {
406 my $self = shift(@_);
407 $self->{content} .= shift(@_);
410 #print to the content field, depending on the debug flag, without setting the error flag
411 #one parameter: string to be displayed
412 sub debug {
413 my $self = shift(@_);
414 if ( $self->{debug} ) {
415 $self->print( "<p>" . shift(@_) . "</p>" );
419 #output an error to the content field, but continue to process output
420 #one parameter: error string (should be complete sentence(s) )
421 sub print_warning {
422 my $self = shift(@_);
423 $self->print( "<div><b>WARNING: " . shift(@_) . "</b></div>" );
426 #output an error to the content field and set a flag to bypass future processing
427 #one parameter: error string (should be complete sentence(s) )
428 sub print_error {
429 my $self = shift(@_);
430 $self->print( "<div><b>"
431 . shift(@_)
432 . "</b></div><div>Please press the back button and re-enter input.</div>"
434 $self->{error} = 1;
437 #output HMMER findings and/or error (whatever's in {content}) to HTML
438 #no parameters
439 sub write_page {
440 my $self = shift(@_);
441 $self->{page}->header("Signal Peptide Finder");
442 print $self->{content};
443 $self->{page}->footer();