2 # Signal peptide finder using a genentech-script-generated HMMER model
3 # modified by Evan 9 / 16 / 05
5 # arguments: sequences (string), filename (string), display_opt(filter/color),
6 # use_eval_cutoff(bool), use_bval_cutoff(bool), eval_cutoff(real), bval_cutoff(real)
17 my $vhost_conf = CXGN
::VHost
->new();
18 my $ss_find_obj = SigpepFind
->new($vhost_conf);
19 $ss_find_obj->process_seqs();
23 ################################################################################
28 #parameters: VHost obj
30 my $classname = shift;
31 my $vhost_conf = shift;
32 my $obj = {}; #a reference to an anonymous hash
34 #all fields are listed here
35 $obj->{debug
} = 0; #0 or 1
36 $obj->{sequences
} = ""; #string with raw input from textbox
37 $obj->{file_contents
} = ""; #string with raw input from file
38 $obj->{illegal_input_count
} =
39 0; #number of illegally formatted sequences found
41 []; #arrayref of records like {seq => input sequence, id => descriptor}
42 $obj->{error
} = 0; #set to 1 in check_input() if necessary
43 $obj->{output
} = ""; #raw HMMER output
45 $vhost_conf->get_conf('basepath')
46 . $vhost_conf->get_conf('tempfiles_subdir')
48 $obj->{hmmsearch_path
} = $vhost_conf->get_conf('hmmsearch_location');
49 $obj->{page
} = CXGN
::Page
->new( "Signal Peptide Finder", "Evan" );
50 $obj->{content
} = ""; #the output HTML
52 bless( $obj, $classname ); #use $obj as an object of this type from now on
53 $obj->get_params(); #retrieve parameters
54 return $obj; #so it can be used like "$obj->func()"
57 #read parameters to script from the page object
58 #and read input from textbox and file into $self->{sequences} and $self->{file_contents}, respectively
61 ( $self->{sequences
} ) =
62 $self->{page
}->get_arguments('sequences')
63 ; #retrieve the seqlist in string form from the in-page textbox
64 ( $self->{display_opt
} ) =
65 $self->{page
}->get_arguments('display_opt')
66 ; #whether to filter sequences in output or to show them in green/red
67 ( $self->{truncate} ) = $self->{page
}->get_arguments('truncate_seqs');
68 if ( $self->{truncate} ) {
69 ( $self->{truncate_type
} ) =
70 $self->{page
}->get_arguments('truncate_type');
73 #get an upload object to upload a file -- this and filling in the textbox are not mutually exclusive;
74 #each can hold either a list of fasta sequences or a single plaintext sequence
75 my $upload = $self->{page
}->get_upload();
77 #check whether there's a filename in the filename text field
78 if ( defined $upload ) {
79 my $fh = $upload->fh();
81 ; #need this line to put the file into an array context; can't go straight to the join()
82 $self->{file_contents
} = join( '', @fileLines );
85 $self->{file_contents
} = '';
88 #check input format and fill $self->{seqarray}
92 ( $self->{use_eval_cutoff
} ) =
93 $self->{page
}->get_arguments('use_eval_cutoff');
94 if ( defined $self->{use_eval_cutoff
} ) {
95 ( $self->{eval_cutoff
} ) = $self->{page
}->get_arguments("eval_cutoff");
98 ( $self->{use_bval_cutoff
} ) =
99 $self->{page
}->get_arguments('use_bval_cutoff');
100 if ( defined $self->{use_bval_cutoff
} ) {
101 ( $self->{bval_cutoff
} ) = $self->{page
}->get_arguments("bval_cutoff");
105 #make sure contents of input fields are legal, and put them in a useful array form in {seqarray}
106 #no parameters; return 0 or 1
108 my $self = shift(@_);
110 #process input from textbox and fill seqarray
112 $self->read_sequences( $self->{sequences
}, 'sequence input box' )
113 ; #includes the period at the end of a sentence
114 if ( $error_msg ne '' ) {
115 $self->print_warning("$error_msg Skipping the rest of the input.");
118 #process input from uploaded file and fill seqarray
120 $self->read_sequences( $self->{file_contents
}, 'uploaded file' );
121 if ( $error_msg ne '' ) {
122 $self->print_warning("$error_msg Skipping the rest of the input.");
125 #print extracted sequence info
126 $self->debug( "seqarray: " . $self->{seqarray
} );
127 $self->debug( "length of seqarray: " . scalar( @
{ $self->{seqarray
} } ) );
131 map( $_->{'id'} . ": " . $_->{'seq'}, @
{ $self->{seqarray
} } ) )
136 #add given data, which has not been checked for legality or format, to $self->{seqarray},
137 #which is an arrayref with contents being hashrefs with {id}, {seq} fields
138 #two parameters: a stringref containing the data to be added,
139 #a one-word description of where the input is coming from (for debugging)
140 #return a string that's empty if no error
142 my ( $self, $data, $input_loc ) = @_;
144 return '' unless $data =~ /\S/; #< skip if no input seqs
146 open my $in_fh, "<", \
$data
147 or die "could not open dataref";
148 my $inio = Bio
::SeqIO
->new(
150 "-format" => $self->guess_input_format( \
$data )
152 $inio->alphabet("protein");
153 $inio->verbose(2); #< make sure SeqIO will throw an exception
154 #whenever appropriate
159 while ( my $seq = $inio->next_seq )
162 my $temp_seq = uc $seq->seq();
163 if ( my $illegals = $self->contains_illegal_alphabet($temp_seq) ) {
164 $self->print_warning( "Sequence #$seq_count ("
165 . ( $seq->id || 'no name' )
166 . ") in the $input_loc contains illegal character(s) '$illegals', skipping."
173 @
{ $self->{seqarray
} },
175 'id' => defined( $seq->id() ) ?
$seq->id() : 'UnknownSeq',
183 $self->print_error("Sequence processing failed, please check input");
186 unless ( $valid_seqs > 0 ) {
187 $self->print_error("Sequence processing failed, no valid sequences.");
192 #return our best guess as to what format the given input string is in,
193 #choosing from the following format strings accepted by Bioperl's SeqIO:
195 sub guess_input_format
{
196 my $self = shift(@_);
197 my $dataref = shift(@_);
198 if ( $$dataref =~ m/^\s*>/ ) {
203 #first remove all whitespace so we interpret it as a single sequence
204 $$dataref =~ s/\s*//g;
209 #return whether the given string contains any illegal alphabet characters
210 #one parameter: a string that should be just a sequence, with nothing but [A-Z] in it,
211 #and maybe a * to mark the end of the sequence
212 #(also requires that the actual sequence have length at least one)
213 sub contains_illegal_alphabet
{
214 my ( $self, $data ) = @_;
215 if ( $data =~ m/([bxz])/i ) {
221 #process {seqarray} field; search with existing HMM
226 # if we have an error, don't run the rest of it
227 if ( $self->{error
} ) {
232 #sequences are now in array form in {seqarray}
233 #hmmsearch input against may10_combined_lenSPEX.hmm (from 5 / 10 / 04; the best of the various hmms I created)
236 my $outfilename = "$self->{tmpdir}/tmp.in.$$"; #$$ is this process' id
237 open( OUTPUT
, ">$outfilename" )
238 or die "can't open '$outfilename' for writing: $!";
239 foreach my $s ( @
{ $self->{seqarray
} } ) {
240 print OUTPUT
( ">", $s->{'id'}, "\n", $s->{'seq'}, "\n" )
241 or die "can't output indata to tempfile";
246 my $infilename = "$self->{tmpdir}/tmp.out.$$";
248 if ( $self->{display_opt
} eq "filter" ) {
249 if ( $self->{use_eval_cutoff
} ) {
250 $options .= " -E" . $self->{eval_cutoff
};
252 if ( $self->{use_bval_cutoff
} ) {
253 $options .= " -T" . $self->{bval_cutoff
};
256 ################### make sure the call to hmmer is up to date ###################
258 $self->{page
}->path_to(
259 'cgi-bin/tools/sigpep_finder/hmmlib/may10_combined_lenSPEX.hmm');
260 -f
$hmmfile or die "could not find hmmfile '$hmmfile'";
262 "$self->{hmmsearch_path} $options $hmmfile $outfilename > $infilename";
263 die "$! running hmmsearch executable '$self->{hmmsearch_path}'"
266 #input from temp output file by means of a Bio::SearchIO
267 my $resultString = '';
269 my $inio = Bio
::SearchIO
->new( -format
=> 'hmmer', -file
=> $infilename );
270 while ( my $result = $inio->next_result() ) {
272 #there should only be one "result", since we only use one hmm library
274 #result is a Bio::Search::Result::HMMERResult object
275 my $qname = $result->query_name();
276 $resultString .= <<EOHTML;
277 <h2 style="text-align: center">Results for HMM $qname</h1>
278 <p>If you elected to "show all output", note that we can't show more output than HMMER produces; it doesn't always
279 report results for all input sequences. The sequence marked "least likely" to have a signal peptide is only
280 least likely <i>among those sequences for which output was reported</i>; sequences that don't show up in the
281 results table are even less likely to start with signals.</p>
282 <p>For a detailed explanation of the output, see <a href="/documents/hmmer_userguide.pdf">the HMMER user guide (pdf)</a>. Look for the "search the
283 sequence database with hmmsearch" section.</p>
284 <table style="margin: 0 auto" border="0" cellspacing="0" cellpadding="1">
286 <td align="center" bgcolor="#e0ddff"><b> Sequence | Domain </b>
287 <td align="center" bgcolor="#d0ccee"><b> Score | E-value </b>
288 <td align="center" bgcolor="#e0ddff"><b> Signal Likelihood </b>
292 my ( $hit, $hsp, $color, $colStr1, $colStr2 );
293 my $next_hit = $result->next_hit();
296 qq|</table>\n<div style="margin: 1em auto; text-align: center; width: 20%; color: #ff0000">[ no hits ]</div
>\n|;
299 while ( $hit = $next_hit ) {
300 $hsp = $hit->next_hsp();
301 $next_hit = $result->next_hit();
302 if ( $self->{display_opt
} eq "color" ) {
304 $hit->raw_score() > (
305 defined( $self->{bval_cutoff
} )
306 ?
$self->{bval_cutoff
}
309 && $hsp->evalue() < (
310 defined( $self->{eval_cutoff
} )
311 ?
$self->{eval_cutoff
}
321 $colStr1 = "<font color=\"$color\"><b>";
322 $colStr2 = "</b></font>";
330 <td align=\"left\" bgcolor=\"#e0ddff\">"
334 . "<td align=\"left\" bgcolor=\"#d0ccee\"> "
338 . "<td align=\"center\" bgcolor=\"#e0ddff\">"
342 ?
( $next_hit ?
"Most likely" : "n/a" )
343 : ( !$next_hit ?
"Least likely" : "↓" )
347 <td align=\"right\" bgcolor=\"#e0ddff\">1 "
348 . "<td align=\"right\" bgcolor=\"#d0ccee\">"
350 . "<td align=\"center\" bgcolor=\"#e0ddff\">" . "</tr>";
353 $resultString .= "</table>\n";
355 $resultString .= <<EOHTML;
356 <table border="0" cellspacing="10" cellpadding="0">
358 <td colspan="1" align="center"><b>HMMER histogram</b>
361 <td colspan="1" align="left">
362 The histogram compares the observed (obs) sequence score distribution to that expected
363 (exp) from a file of the same size as the one you submitted but full of random sequences. The further down
364 the page the distribution of equal signs is with respect to that of asterisks, the more likely it is that
365 your sequences contain signal peptides. If the sequences you submitted all come from the same protein family,
366 the histogram suggests how likely it is that that family is secreted.
373 #reopen file to read and print histogram
374 open( INPUT
, "<$infilename" );
376 while ( $line = <INPUT
> ) {
378 m/istogram/ ) #line ought to read "Histogram of all scores:"
380 while ( ( $line = <INPUT
> ) !~ m/^\s*$/ ) {
381 $resultString .= $line;
387 $resultString .= <<EOH;
395 $self->print($resultString);
398 # `rm $outfilename $infilename`;
403 #print to the content field
404 #one parameter: string to be displayed
406 my $self = shift(@_);
407 $self->{content
} .= shift(@_);
410 #print to the content field, depending on the debug flag, without setting the error flag
411 #one parameter: string to be displayed
413 my $self = shift(@_);
414 if ( $self->{debug
} ) {
415 $self->print( "<p>" . shift(@_) . "</p>" );
419 #output an error to the content field, but continue to process output
420 #one parameter: error string (should be complete sentence(s) )
422 my $self = shift(@_);
423 $self->print( "<div><b>WARNING: " . shift(@_) . "</b></div>" );
426 #output an error to the content field and set a flag to bypass future processing
427 #one parameter: error string (should be complete sentence(s) )
429 my $self = shift(@_);
430 $self->print( "<div><b>"
432 . "</b></div><div>Please press the back button and re-enter input.</div>"
437 #output HMMER findings and/or error (whatever's in {content}) to HTML
440 my $self = shift(@_);
441 $self->{page
}->header("Signal Peptide Finder");
442 print $self->{content
};
443 $self->{page
}->footer();