2 # Signal peptide finder using a genentech-script-generated HMMER model
3 # modified by Evan 9 / 16 / 05
5 # arguments: sequences (string), filename (string), display_opt(filter/color),
6 # use_eval_cutoff(bool), use_bval_cutoff(bool), eval_cutoff(real), bval_cutoff(real)
16 my $ss_find_obj = SigpepFind
->new();
17 $ss_find_obj->process_seqs();
21 ################################################################################
24 use CatalystX
::GlobalContext
'$c';
29 my $classname = shift;
30 my $obj = {}; #a reference to an anonymous hash
32 #all fields are listed here
33 $obj->{debug
} = 0; #0 or 1
34 $obj->{sequences
} = ""; #string with raw input from textbox
35 $obj->{file_contents
} = ""; #string with raw input from file
36 $obj->{illegal_input_count
} =
37 0; #number of illegally formatted sequences found
39 []; #arrayref of records like {seq => input sequence, id => descriptor}
40 $obj->{error
} = 0; #set to 1 in check_input() if necessary
41 $obj->{output
} = ""; #raw HMMER output
42 $obj->{tmpdir
} =$c->path_to( $c->tempfiles_subdir( 'sigpep_finder' ) );
43 $obj->{hmmsearch_path
} = $c->config->{'hmmsearch_location'};
44 $obj->{page
} = CXGN
::Page
->new( "Signal Peptide Finder", "Evan" );
45 $obj->{content
} = ""; #the output HTML
47 bless( $obj, $classname ); #use $obj as an object of this type from now on
48 $obj->get_params(); #retrieve parameters
49 return $obj; #so it can be used like "$obj->func()"
52 #read parameters to script from the page object
53 #and read input from textbox and file into $self->{sequences} and $self->{file_contents}, respectively
56 ( $self->{sequences
} ) =
57 $self->{page
}->get_arguments('sequences')
58 ; #retrieve the seqlist in string form from the in-page textbox
59 ( $self->{display_opt
} ) =
60 $self->{page
}->get_arguments('display_opt')
61 ; #whether to filter sequences in output or to show them in green/red
62 ( $self->{truncate} ) = $self->{page
}->get_arguments('truncate_seqs');
63 if ( $self->{truncate} ) {
64 ( $self->{truncate_type
} ) =
65 $self->{page
}->get_arguments('truncate_type');
68 #get an upload object to upload a file -- this and filling in the textbox are not mutually exclusive;
69 #each can hold either a list of fasta sequences or a single plaintext sequence
70 my $upload = $self->{page
}->get_upload();
72 #check whether there's a filename in the filename text field
73 if ( defined $upload ) {
74 my $fh = $upload->fh();
76 ; #need this line to put the file into an array context; can't go straight to the join()
77 $self->{file_contents
} = join( '', @fileLines );
80 $self->{file_contents
} = '';
83 #check input format and fill $self->{seqarray}
87 ( $self->{use_eval_cutoff
} ) =
88 $self->{page
}->get_arguments('use_eval_cutoff');
89 if ( defined $self->{use_eval_cutoff
} ) {
90 ( $self->{eval_cutoff
} ) = $self->{page
}->get_arguments("eval_cutoff");
93 ( $self->{use_bval_cutoff
} ) =
94 $self->{page
}->get_arguments('use_bval_cutoff');
95 if ( defined $self->{use_bval_cutoff
} ) {
96 ( $self->{bval_cutoff
} ) = $self->{page
}->get_arguments("bval_cutoff");
100 #make sure contents of input fields are legal, and put them in a useful array form in {seqarray}
101 #no parameters; return 0 or 1
103 my $self = shift(@_);
105 #process input from textbox and fill seqarray
107 $self->read_sequences( $self->{sequences
}, 'sequence input box' )
108 ; #includes the period at the end of a sentence
109 if ( $error_msg ne '' ) {
110 $self->print_warning("$error_msg Skipping the rest of the input.");
113 #process input from uploaded file and fill seqarray
115 $self->read_sequences( $self->{file_contents
}, 'uploaded file' );
116 if ( $error_msg ne '' ) {
117 $self->print_warning("$error_msg Skipping the rest of the input.");
120 #print extracted sequence info
121 $self->debug( "seqarray: " . $self->{seqarray
} );
122 $self->debug( "length of seqarray: " . scalar( @
{ $self->{seqarray
} } ) );
126 map( $_->{'id'} . ": " . $_->{'seq'}, @
{ $self->{seqarray
} } ) )
131 #add given data, which has not been checked for legality or format, to $self->{seqarray},
132 #which is an arrayref with contents being hashrefs with {id}, {seq} fields
133 #two parameters: a stringref containing the data to be added,
134 #a one-word description of where the input is coming from (for debugging)
135 #return a string that's empty if no error
137 my ( $self, $data, $input_loc ) = @_;
139 return '' unless $data =~ /\S/; #< skip if no input seqs
141 open my $in_fh, "<", \
$data
142 or die "could not open dataref";
143 my $inio = Bio
::SeqIO
->new(
145 "-format" => $self->guess_input_format( \
$data )
147 $inio->alphabet("protein");
148 $inio->verbose(2); #< make sure SeqIO will throw an exception
149 #whenever appropriate
154 while ( my $seq = $inio->next_seq )
157 my $temp_seq = uc $seq->seq();
158 if ( my $illegals = $self->contains_illegal_alphabet($temp_seq) ) {
159 $self->print_warning( "Sequence #$seq_count ("
160 . ( $seq->id || 'no name' )
161 . ") in the $input_loc contains illegal character(s) '$illegals', skipping."
168 @
{ $self->{seqarray
} },
170 'id' => defined( $seq->id() ) ?
$seq->id() : 'UnknownSeq',
178 $self->print_error("Sequence processing failed, please check input");
181 unless ( $valid_seqs > 0 ) {
182 $self->print_error("Sequence processing failed, no valid sequences.");
187 #return our best guess as to what format the given input string is in,
188 #choosing from the following format strings accepted by Bioperl's SeqIO:
190 sub guess_input_format
{
191 my $self = shift(@_);
192 my $dataref = shift(@_);
193 if ( $$dataref =~ m/^\s*>/ ) {
198 #first remove all whitespace so we interpret it as a single sequence
199 $$dataref =~ s/\s*//g;
204 #return whether the given string contains any illegal alphabet characters
205 #one parameter: a string that should be just a sequence, with nothing but [A-Z] in it,
206 #and maybe a * to mark the end of the sequence
207 #(also requires that the actual sequence have length at least one)
208 sub contains_illegal_alphabet
{
209 my ( $self, $data ) = @_;
210 if ( $data =~ m/([bxz])/i ) {
216 #process {seqarray} field; search with existing HMM
221 # if we have an error, don't run the rest of it
222 if ( $self->{error
} ) {
227 #sequences are now in array form in {seqarray}
228 #hmmsearch input against may10_combined_lenSPEX.hmm (from 5 / 10 / 04; the best of the various hmms I created)
231 my $outfilename = "$self->{tmpdir}/tmp.in.$$"; #$$ is this process' id
232 open( OUTPUT
, ">$outfilename" )
233 or die "can't open '$outfilename' for writing: $!";
234 foreach my $s ( @
{ $self->{seqarray
} } ) {
235 print OUTPUT
( ">", $s->{'id'}, "\n", $s->{'seq'}, "\n" )
236 or die "can't output indata to tempfile";
241 my $infilename = "$self->{tmpdir}/tmp.out.$$";
243 if ( $self->{display_opt
} eq "filter" ) {
244 if ( $self->{use_eval_cutoff
} ) {
245 $options .= " -E" . $self->{eval_cutoff
};
247 if ( $self->{use_bval_cutoff
} ) {
248 $options .= " -T" . $self->{bval_cutoff
};
251 ################### make sure the call to hmmer is up to date ###################
253 $self->{page
}->path_to(
254 'cgi-bin/tools/sigpep_finder/hmmlib/may10_combined_lenSPEX.hmm');
255 -f
$hmmfile or die "could not find hmmfile '$hmmfile'";
257 "$self->{hmmsearch_path} $options $hmmfile $outfilename > $infilename";
258 die "$! running hmmsearch executable '$self->{hmmsearch_path}'"
261 #input from temp output file by means of a Bio::SearchIO
262 my $resultString = '';
264 my $inio = Bio
::SearchIO
->new( -format
=> 'hmmer', -file
=> $infilename );
265 while ( my $result = $inio->next_result() ) {
267 #there should only be one "result", since we only use one hmm library
269 #result is a Bio::Search::Result::HMMERResult object
270 my $qname = $result->query_name();
271 $resultString .= <<EOHTML;
272 <h2 style="text-align: center">Results for HMM $qname</h1>
273 <p>If you elected to "show all output", note that we can't show more output than HMMER produces; it doesn't always
274 report results for all input sequences. The sequence marked "least likely" to have a signal peptide is only
275 least likely <i>among those sequences for which output was reported</i>; sequences that don't show up in the
276 results table are even less likely to start with signals.</p>
277 <p>For a detailed explanation of the output, see <a href="/documents/hmmer_userguide.pdf">the HMMER user guide (pdf)</a>. Look for the "search the
278 sequence database with hmmsearch" section.</p>
279 <table style="margin: 0 auto" border="0" cellspacing="0" cellpadding="1">
281 <td align="center" bgcolor="#e0ddff"><b> Sequence | Domain </b>
282 <td align="center" bgcolor="#d0ccee"><b> Score | E-value </b>
283 <td align="center" bgcolor="#e0ddff"><b> Signal Likelihood </b>
287 my ( $hit, $hsp, $color, $colStr1, $colStr2 );
288 my $next_hit = $result->next_hit();
291 qq|</table>\n<div style="margin: 1em auto; text-align: center; width: 20%; color: #ff0000">[ no hits ]</div
>\n|;
294 while ( $hit = $next_hit ) {
295 $hsp = $hit->next_hsp();
296 $next_hit = $result->next_hit();
297 if ( $self->{display_opt
} eq "color" ) {
299 $hit->raw_score() > (
300 defined( $self->{bval_cutoff
} )
301 ?
$self->{bval_cutoff
}
304 && $hsp->evalue() < (
305 defined( $self->{eval_cutoff
} )
306 ?
$self->{eval_cutoff
}
316 $colStr1 = "<font color=\"$color\"><b>";
317 $colStr2 = "</b></font>";
325 <td align=\"left\" bgcolor=\"#e0ddff\">"
329 . "<td align=\"left\" bgcolor=\"#d0ccee\"> "
333 . "<td align=\"center\" bgcolor=\"#e0ddff\">"
337 ?
( $next_hit ?
"Most likely" : "n/a" )
338 : ( !$next_hit ?
"Least likely" : "↓" )
342 <td align=\"right\" bgcolor=\"#e0ddff\">1 "
343 . "<td align=\"right\" bgcolor=\"#d0ccee\">"
345 . "<td align=\"center\" bgcolor=\"#e0ddff\">" . "</tr>";
348 $resultString .= "</table>\n";
350 $resultString .= <<EOHTML;
351 <table border="0" cellspacing="10" cellpadding="0">
353 <td colspan="1" align="center"><b>HMMER histogram</b>
356 <td colspan="1" align="left">
357 The histogram compares the observed (obs) sequence score distribution to that expected
358 (exp) from a file of the same size as the one you submitted but full of random sequences. The further down
359 the page the distribution of equal signs is with respect to that of asterisks, the more likely it is that
360 your sequences contain signal peptides. If the sequences you submitted all come from the same protein family,
361 the histogram suggests how likely it is that that family is secreted.
368 #reopen file to read and print histogram
369 open( INPUT
, "<$infilename" );
371 while ( $line = <INPUT
> ) {
373 m/istogram/ ) #line ought to read "Histogram of all scores:"
375 while ( ( $line = <INPUT
> ) !~ m/^\s*$/ ) {
376 $resultString .= $line;
382 $resultString .= <<EOH;
390 $self->print($resultString);
393 # `rm $outfilename $infilename`;
398 #print to the content field
399 #one parameter: string to be displayed
401 my $self = shift(@_);
402 $self->{content
} .= shift(@_);
405 #print to the content field, depending on the debug flag, without setting the error flag
406 #one parameter: string to be displayed
408 my $self = shift(@_);
409 if ( $self->{debug
} ) {
410 $self->print( "<p>" . shift(@_) . "</p>" );
414 #output an error to the content field, but continue to process output
415 #one parameter: error string (should be complete sentence(s) )
417 my $self = shift(@_);
418 $self->print( "<div><b>WARNING: " . shift(@_) . "</b></div>" );
421 #output an error to the content field and set a flag to bypass future processing
422 #one parameter: error string (should be complete sentence(s) )
424 my $self = shift(@_);
425 $self->print( "<div><b>"
427 . "</b></div><div>Please press the back button and re-enter input.</div>"
432 #output HMMER findings and/or error (whatever's in {content}) to HTML
435 my $self = shift(@_);
436 $self->{page
}->header("Signal Peptide Finder");
437 print $self->{content
};
438 $self->{page
}->footer();