Merge pull request #5230 from solgenomics/topic/open_pollinated
[sgn.git] / cgi-bin / tools / sigpep_finder / sigpep_find.pl
blob2e1b7ac32b529cbfbf934e3617e605c37e123c31
1 #!/usr/bin/perl
2 # Signal peptide finder using a genentech-script-generated HMMER model
3 # modified by Evan 9 / 16 / 05
5 # arguments: sequences (string), filename (string), display_opt(filter/color),
6 # use_eval_cutoff(bool), use_bval_cutoff(bool), eval_cutoff(real), bval_cutoff(real)
8 use strict;
9 use warnings;
11 use CXGN::Page;
12 use Bio::SearchIO;
13 use Bio::SeqIO;
14 use IO::String;
16 my $ss_find_obj = SigpepFind->new();
17 $ss_find_obj->process_seqs();
19 exit;
21 ################################################################################
22 package SigpepFind;
23 use strict;
24 use CatalystX::GlobalContext '$c';
25 use warnings;
26 use English;
28 sub new {
29 my $classname = shift;
30 my $obj = {}; #a reference to an anonymous hash
32 #all fields are listed here
33 $obj->{debug} = 0; #0 or 1
34 $obj->{sequences} = ""; #string with raw input from textbox
35 $obj->{file_contents} = ""; #string with raw input from file
36 $obj->{illegal_input_count} =
37 0; #number of illegally formatted sequences found
38 $obj->{seqarray} =
39 []; #arrayref of records like {seq => input sequence, id => descriptor}
40 $obj->{error} = 0; #set to 1 in check_input() if necessary
41 $obj->{output} = ""; #raw HMMER output
42 $obj->{tmpdir} =$c->path_to( $c->tempfiles_subdir( 'sigpep_finder' ) );
43 $obj->{hmmsearch_path} = $c->config->{'hmmsearch_location'};
44 $obj->{page} = CXGN::Page->new( "Signal Peptide Finder", "Evan" );
45 $obj->{content} = ""; #the output HTML
47 bless( $obj, $classname ); #use $obj as an object of this type from now on
48 $obj->get_params(); #retrieve parameters
49 return $obj; #so it can be used like "$obj->func()"
52 #read parameters to script from the page object
53 #and read input from textbox and file into $self->{sequences} and $self->{file_contents}, respectively
54 sub get_params {
55 my $self = shift(@_);
56 ( $self->{sequences} ) =
57 $self->{page}->get_arguments('sequences')
58 ; #retrieve the seqlist in string form from the in-page textbox
59 ( $self->{display_opt} ) =
60 $self->{page}->get_arguments('display_opt')
61 ; #whether to filter sequences in output or to show them in green/red
62 ( $self->{truncate} ) = $self->{page}->get_arguments('truncate_seqs');
63 if ( $self->{truncate} ) {
64 ( $self->{truncate_type} ) =
65 $self->{page}->get_arguments('truncate_type');
68 #get an upload object to upload a file -- this and filling in the textbox are not mutually exclusive;
69 #each can hold either a list of fasta sequences or a single plaintext sequence
70 my $upload = $self->{page}->get_upload();
72 #check whether there's a filename in the filename text field
73 if ( defined $upload ) {
74 my $fh = $upload->fh();
75 my @fileLines = <$fh>
76 ; #need this line to put the file into an array context; can't go straight to the join()
77 $self->{file_contents} = join( '', @fileLines );
79 else {
80 $self->{file_contents} = '';
83 #check input format and fill $self->{seqarray}
84 $self->check_input();
86 #output options
87 ( $self->{use_eval_cutoff} ) =
88 $self->{page}->get_arguments('use_eval_cutoff');
89 if ( defined $self->{use_eval_cutoff} ) {
90 ( $self->{eval_cutoff} ) = $self->{page}->get_arguments("eval_cutoff");
93 ( $self->{use_bval_cutoff} ) =
94 $self->{page}->get_arguments('use_bval_cutoff');
95 if ( defined $self->{use_bval_cutoff} ) {
96 ( $self->{bval_cutoff} ) = $self->{page}->get_arguments("bval_cutoff");
100 #make sure contents of input fields are legal, and put them in a useful array form in {seqarray}
101 #no parameters; return 0 or 1
102 sub check_input {
103 my $self = shift(@_);
105 #process input from textbox and fill seqarray
106 my $error_msg =
107 $self->read_sequences( $self->{sequences}, 'sequence input box' )
108 ; #includes the period at the end of a sentence
109 if ( $error_msg ne '' ) {
110 $self->print_warning("$error_msg Skipping the rest of the input.");
113 #process input from uploaded file and fill seqarray
114 $error_msg =
115 $self->read_sequences( $self->{file_contents}, 'uploaded file' );
116 if ( $error_msg ne '' ) {
117 $self->print_warning("$error_msg Skipping the rest of the input.");
120 #print extracted sequence info
121 $self->debug( "seqarray: " . $self->{seqarray} );
122 $self->debug( "length of seqarray: " . scalar( @{ $self->{seqarray} } ) );
123 $self->debug(
124 "uploaded ids:\n"
125 . join( ",\n",
126 map( $_->{'id'} . ": " . $_->{'seq'}, @{ $self->{seqarray} } ) )
127 . "\n"
131 #add given data, which has not been checked for legality or format, to $self->{seqarray},
132 #which is an arrayref with contents being hashrefs with {id}, {seq} fields
133 #two parameters: a stringref containing the data to be added,
134 #a one-word description of where the input is coming from (for debugging)
135 #return a string that's empty if no error
136 sub read_sequences {
137 my ( $self, $data, $input_loc ) = @_;
139 return '' unless $data =~ /\S/; #< skip if no input seqs
141 open my $in_fh, "<", \$data
142 or die "could not open dataref";
143 my $inio = Bio::SeqIO->new(
144 "-fh" => $in_fh,
145 "-format" => $self->guess_input_format( \$data )
147 $inio->alphabet("protein");
148 $inio->verbose(2); #< make sure SeqIO will throw an exception
149 #whenever appropriate
151 my $seq_count = 0;
152 my $valid_seqs = 0;
153 eval {
154 while ( my $seq = $inio->next_seq )
156 $seq_count++;
157 my $temp_seq = uc $seq->seq();
158 if ( my $illegals = $self->contains_illegal_alphabet($temp_seq) ) {
159 $self->print_warning( "Sequence #$seq_count ("
160 . ( $seq->id || 'no name' )
161 . ") in the $input_loc contains illegal character(s) '$illegals', skipping."
163 next;
166 #add a record
167 push(
168 @{ $self->{seqarray} },
170 'id' => defined( $seq->id() ) ? $seq->id() : 'UnknownSeq',
171 'seq' => $temp_seq
174 $valid_seqs++;
177 if ($EVAL_ERROR) {
178 $self->print_error("Sequence processing failed, please check input");
181 unless ( $valid_seqs > 0 ) {
182 $self->print_error("Sequence processing failed, no valid sequences.");
184 return '';
187 #return our best guess as to what format the given input string is in,
188 #choosing from the following format strings accepted by Bioperl's SeqIO:
189 #raw fasta
190 sub guess_input_format {
191 my $self = shift(@_);
192 my $dataref = shift(@_);
193 if ( $$dataref =~ m/^\s*>/ ) {
194 return "fasta";
196 else {
198 #first remove all whitespace so we interpret it as a single sequence
199 $$dataref =~ s/\s*//g;
200 return "raw";
204 #return whether the given string contains any illegal alphabet characters
205 #one parameter: a string that should be just a sequence, with nothing but [A-Z] in it,
206 #and maybe a * to mark the end of the sequence
207 #(also requires that the actual sequence have length at least one)
208 sub contains_illegal_alphabet {
209 my ( $self, $data ) = @_;
210 if ( $data =~ m/([bxz])/i ) {
211 return $1;
213 return 0;
216 #process {seqarray} field; search with existing HMM
217 #no parameters
218 sub process_seqs {
219 my $self = shift;
221 # if we have an error, don't run the rest of it
222 if ( $self->{error} ) {
223 $self->write_page();
224 return;
227 #sequences are now in array form in {seqarray}
228 #hmmsearch input against may10_combined_lenSPEX.hmm (from 5 / 10 / 04; the best of the various hmms I created)
230 #output to temp file
231 my $outfilename = "$self->{tmpdir}/tmp.in.$$"; #$$ is this process' id
232 open( OUTPUT, ">$outfilename" )
233 or die "can't open '$outfilename' for writing: $!";
234 foreach my $s ( @{ $self->{seqarray} } ) {
235 print OUTPUT ( ">", $s->{'id'}, "\n", $s->{'seq'}, "\n" )
236 or die "can't output indata to tempfile";
238 close(OUTPUT);
240 #call hmmsearch
241 my $infilename = "$self->{tmpdir}/tmp.out.$$";
242 my $options = "-A0";
243 if ( $self->{display_opt} eq "filter" ) {
244 if ( $self->{use_eval_cutoff} ) {
245 $options .= " -E" . $self->{eval_cutoff};
247 if ( $self->{use_bval_cutoff} ) {
248 $options .= " -T" . $self->{bval_cutoff};
251 ################### make sure the call to hmmer is up to date ###################
252 my $hmmfile =
253 $self->{page}->path_to(
254 'cgi-bin/tools/sigpep_finder/hmmlib/may10_combined_lenSPEX.hmm');
255 -f $hmmfile or die "could not find hmmfile '$hmmfile'";
256 system
257 "$self->{hmmsearch_path} $options $hmmfile $outfilename > $infilename";
258 die "$! running hmmsearch executable '$self->{hmmsearch_path}'"
259 if $CHILD_ERROR;
261 #input from temp output file by means of a Bio::SearchIO
262 my $resultString = '';
264 my $inio = Bio::SearchIO->new( -format => 'hmmer', -file => $infilename );
265 while ( my $result = $inio->next_result() ) {
267 #there should only be one "result", since we only use one hmm library
269 #result is a Bio::Search::Result::HMMERResult object
270 my $qname = $result->query_name();
271 $resultString .= <<EOHTML;
272 <h2 style="text-align: center">Results for HMM $qname</h1>
273 <p>If you elected to "show all output", note that we can't show more output than HMMER produces; it doesn't always
274 report results for all input sequences. The sequence marked "least likely" to have a signal peptide is only
275 least likely <i>among those sequences for which output was reported</i>; sequences that don't show up in the
276 results table are even less likely to start with signals.</p>
277 <p>For a detailed explanation of the output, see <a href="/documents/hmmer_userguide.pdf">the HMMER user guide (pdf)</a>. Look for the "search the
278 sequence database with hmmsearch" section.</p>
279 <table style="margin: 0 auto" border="0" cellspacing="0" cellpadding="1">
280 <tr>
281 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Sequence&nbsp;&nbsp;|&nbsp;&nbsp;Domain&nbsp;&nbsp;&nbsp;</b>
282 <td align="center" bgcolor="#d0ccee"><b>&nbsp;&nbsp;&nbsp;Score&nbsp;&nbsp;|&nbsp;&nbsp;E-value&nbsp;&nbsp;&nbsp;</b>
283 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Signal Likelihood&nbsp;&nbsp;&nbsp;</b>
284 </tr>
285 EOHTML
286 my $lineNum = 0;
287 my ( $hit, $hsp, $color, $colStr1, $colStr2 );
288 my $next_hit = $result->next_hit();
289 if ( !$next_hit ) {
290 $resultString .=
291 qq|</table>\n<div style="margin: 1em auto; text-align: center; width: 20%; color: #ff0000">[ no hits ]</div>\n|;
293 else {
294 while ( $hit = $next_hit ) {
295 $hsp = $hit->next_hsp();
296 $next_hit = $result->next_hit();
297 if ( $self->{display_opt} eq "color" ) {
298 if (
299 $hit->raw_score() > (
300 defined( $self->{bval_cutoff} )
301 ? $self->{bval_cutoff}
302 : -1e6
304 && $hsp->evalue() < (
305 defined( $self->{eval_cutoff} )
306 ? $self->{eval_cutoff}
307 : 1e12
311 $color = "#559955";
313 else {
314 $color = "#994444";
316 $colStr1 = "<font color=\"$color\"><b>";
317 $colStr2 = "</b></font>";
319 else {
320 $colStr1 = "";
321 $colStr2 = "";
323 $resultString .= "
324 <tr>
325 <td align=\"left\" bgcolor=\"#e0ddff\">"
326 . $colStr1
327 . $hit->name()
328 . $colStr2
329 . "<td align=\"left\" bgcolor=\"#d0ccee\">&nbsp;"
330 . $colStr1
331 . $hit->raw_score()
332 . $colStr2
333 . "<td align=\"center\" bgcolor=\"#e0ddff\">"
334 . $colStr1
336 ( $lineNum == 0 )
337 ? ( $next_hit ? "Most likely" : "n/a" )
338 : ( !$next_hit ? "Least likely" : "&#8595;" )
340 . $colStr2 . "</tr>
341 <tr>
342 <td align=\"right\" bgcolor=\"#e0ddff\">1&nbsp;"
343 . "<td align=\"right\" bgcolor=\"#d0ccee\">"
344 . $hsp->evalue()
345 . "<td align=\"center\" bgcolor=\"#e0ddff\">" . "</tr>";
346 $lineNum++;
348 $resultString .= "</table>\n";
350 $resultString .= <<EOHTML;
351 <table border="0" cellspacing="10" cellpadding="0">
352 <tr>
353 <td colspan="1" align="center"><b>HMMER histogram</b>
354 </tr>
355 <tr>
356 <td colspan="1" align="left">
357 &nbsp;&nbsp;&nbsp;The histogram compares the observed (obs) sequence score distribution to that expected
358 (exp) from a file of the same size as the one you submitted but full of random sequences. The further down
359 the page the distribution of equal signs is with respect to that of asterisks, the more likely it is that
360 your sequences contain signal peptides. If the sequences you submitted all come from the same protein family,
361 the histogram suggests how likely it is that that family is secreted.
362 </tr>
363 <tr>
364 <td align="left">
365 <pre><code>
366 EOHTML
368 #reopen file to read and print histogram
369 open( INPUT, "<$infilename" );
370 my $line;
371 while ( $line = <INPUT> ) {
372 if ( $line =~
373 m/istogram/ ) #line ought to read "Histogram of all scores:"
375 while ( ( $line = <INPUT> ) !~ m/^\s*$/ ) {
376 $resultString .= $line;
380 close(INPUT);
382 $resultString .= <<EOH;
383 </code></pre>
384 </tr>
385 </table>
390 $self->print($resultString);
392 #remove temp files
393 # `rm $outfilename $infilename`;
395 $self->write_page();
398 #print to the content field
399 #one parameter: string to be displayed
400 sub print {
401 my $self = shift(@_);
402 $self->{content} .= shift(@_);
405 #print to the content field, depending on the debug flag, without setting the error flag
406 #one parameter: string to be displayed
407 sub debug {
408 my $self = shift(@_);
409 if ( $self->{debug} ) {
410 $self->print( "<p>" . shift(@_) . "</p>" );
414 #output an error to the content field, but continue to process output
415 #one parameter: error string (should be complete sentence(s) )
416 sub print_warning {
417 my $self = shift(@_);
418 $self->print( "<div><b>WARNING: " . shift(@_) . "</b></div>" );
421 #output an error to the content field and set a flag to bypass future processing
422 #one parameter: error string (should be complete sentence(s) )
423 sub print_error {
424 my $self = shift(@_);
425 $self->print( "<div><b>"
426 . shift(@_)
427 . "</b></div><div>Please press the back button and re-enter input.</div>"
429 $self->{error} = 1;
432 #output HMMER findings and/or error (whatever's in {content}) to HTML
433 #no parameters
434 sub write_page {
435 my $self = shift(@_);
436 $self->{page}->header("Signal Peptide Finder");
437 print $self->{content};
438 $self->{page}->footer();