cgi-bin/tools/sigpep_finder/sigpep_find.pl

   1 #!/usr/bin/perl
   2 # Signal peptide finder using a genentech-script-generated HMMER model
   3 # modified by Evan 9 / 16 / 05
   4 #
   5 # arguments: sequences (string), filename (string), display_opt(filter/color),
   6 #   use_eval_cutoff(bool), use_bval_cutoff(bool), eval_cutoff(real), bval_cutoff(real)
   7
   8 use strict;
   9 use English;
  10
  11 use CXGN::Page;
  12 use Bio::SearchIO;
  13 use Bio::SeqIO;
  14 use IO::String;
  15 use CXGN::VHost;
  16
  17 my $vhost_conf  = CXGN::VHost->new();
  18 my $ss_find_obj = SigpepFind->new($vhost_conf);
  19 $ss_find_obj->process_seqs();
  20
  21 exit;
  22
  23 ################################################################################
  24 package SigpepFind;
  25 use strict;
  26 use English;
  27
  28 #parameters: VHost obj
  29 sub new {
  30     my $classname  = shift;
  31     my $vhost_conf = shift;
  32     my $obj        = {};      #a reference to an anonymous hash
  33
  34     #all fields are listed here
  35     $obj->{debug}         = 0;     #0 or 1
  36     $obj->{sequences}     = "";    #string with raw input from textbox
  37     $obj->{file_contents} = "";    #string with raw input from file
  38     $obj->{illegal_input_count} =
  39       0;    #number of illegally formatted sequences found
  40     $obj->{seqarray} =
  41       [];    #arrayref of records like {seq => input sequence, id => descriptor}
  42     $obj->{error}  = 0;     #set to 1 in check_input() if necessary
  43     $obj->{output} = "";    #raw HMMER output
  44     $obj->{tmpdir} =
  45         $vhost_conf->get_conf('basepath')
  46       . $vhost_conf->get_conf('tempfiles_subdir')
  47       . "/sigpep_finder";
  48     $obj->{hmmsearch_path} = $vhost_conf->get_conf('hmmsearch_location');
  49     $obj->{page} = CXGN::Page->new( "Signal Peptide Finder", "Evan" );
  50     $obj->{content} = "";    #the output HTML
  51
  52     bless( $obj, $classname );   #use $obj as an object of this type from now on
  53     $obj->get_params();          #retrieve parameters
  54     return $obj;                 #so it can be used like "$obj->func()"
  55 }
  56
  57 #read parameters to script from the page object
  58 #and read input from textbox and file into $self->{sequences} and $self->{file_contents}, respectively
  59 sub get_params {
  60     my $self = shift(@_);
  61     ( $self->{sequences} ) =
  62       $self->{page}->get_arguments('sequences')
  63       ;    #retrieve the seqlist in string form from the in-page textbox
  64     ( $self->{display_opt} ) =
  65       $self->{page}->get_arguments('display_opt')
  66       ;    #whether to filter sequences in output or to show them in green/red
  67     ( $self->{truncate} ) = $self->{page}->get_arguments('truncate_seqs');
  68     if ( $self->{truncate} ) {
  69         ( $self->{truncate_type} ) =
  70           $self->{page}->get_arguments('truncate_type');
  71     }
  72
  73 #get an upload object to upload a file -- this and filling in the textbox are not mutually exclusive;
  74 #each can hold either a list of fasta sequences or a single plaintext sequence
  75     my $upload = $self->{page}->get_upload();
  76
  77     #check whether there's a filename in the filename text field
  78     if ( defined $upload ) {
  79         my $fh        = $upload->fh();
  80         my @fileLines = <$fh>
  81           ; #need this line to put the file into an array context; can't go straight to the join()
  82         $self->{file_contents} = join( '', @fileLines );
  83     }
  84     else {
  85         $self->{file_contents} = '';
  86     }
  87
  88     #check input format and fill $self->{seqarray}
  89     $self->check_input();
  90
  91     #output options
  92     ( $self->{use_eval_cutoff} ) =
  93       $self->{page}->get_arguments('use_eval_cutoff');
  94     if ( defined $self->{use_eval_cutoff} ) {
  95         ( $self->{eval_cutoff} ) = $self->{page}->get_arguments("eval_cutoff");
  96     }
  97
  98     ( $self->{use_bval_cutoff} ) =
  99       $self->{page}->get_arguments('use_bval_cutoff');
 100     if ( defined $self->{use_bval_cutoff} ) {
 101         ( $self->{bval_cutoff} ) = $self->{page}->get_arguments("bval_cutoff");
 102     }
 103 }
 104
 105 #make sure contents of input fields are legal, and put them in a useful array form in {seqarray}
 106 #no parameters; return 0 or 1
 107 sub check_input {
 108     my $self = shift(@_);
 109
 110     #process input from textbox and fill seqarray
 111     my $error_msg =
 112       $self->read_sequences( $self->{sequences}, 'sequence input box' )
 113       ;    #includes the period at the end of a sentence
 114     if ( $error_msg ne '' ) {
 115         $self->print_warning("$error_msg Skipping the rest of the input.");
 116     }
 117
 118     #process input from uploaded file and fill seqarray
 119     $error_msg =
 120       $self->read_sequences( $self->{file_contents}, 'uploaded file' );
 121     if ( $error_msg ne '' ) {
 122         $self->print_warning("$error_msg Skipping the rest of the input.");
 123     }
 124
 125     #print extracted sequence info
 126     $self->debug( "seqarray: " . $self->{seqarray} );
 127     $self->debug( "length of seqarray: " . scalar( @{ $self->{seqarray} } ) );
 128     $self->debug(
 129         "uploaded ids:\n"
 130           . join( ",\n",
 131             map( $_->{'id'} . ": " . $_->{'seq'}, @{ $self->{seqarray} } ) )
 132           . "\n"
 133     );
 134 }
 135
 136 #add given data, which has not been checked for legality or format, to $self->{seqarray},
 137 #which is an arrayref with contents being hashrefs with {id}, {seq} fields
 138 #two parameters: a stringref containing the data to be added,
 139 #a one-word description of where the input is coming from (for debugging)
 140 #return a string that's empty if no error
 141 sub read_sequences {
 142     my ( $self, $data, $input_loc ) = @_;
 143
 144     return '' unless $data =~ /\S/;    #< skip if no input seqs
 145
 146     open my $in_fh, "<", \$data
 147       or die "could not open dataref";
 148     my $inio = Bio::SeqIO->new(
 149         "-fh"     => $in_fh,
 150         "-format" => $self->guess_input_format( \$data )
 151     );
 152     $inio->alphabet("protein");
 153     $inio->verbose(2);    #< make sure SeqIO will throw an exception
 154                           #whenever appropriate
 155
 156     my $seq_count  = 0;
 157     my $valid_seqs = 0;
 158     eval {
 159         while ( my $seq = $inio->next_seq )
 160         {
 161             $seq_count++;
 162             my $temp_seq = uc $seq->seq();
 163             if ( my $illegals = $self->contains_illegal_alphabet($temp_seq) ) {
 164                 $self->print_warning( "Sequence #$seq_count ("
 165                       . ( $seq->id || 'no name' )
 166                       . ") in the $input_loc contains illegal character(s) '$illegals', skipping."
 167                 );
 168                 next;
 169             }
 170
 171             #add a record
 172             push(
 173                 @{ $self->{seqarray} },
 174                 {
 175                     'id' => defined( $seq->id() ) ? $seq->id() : 'UnknownSeq',
 176                     'seq' => $temp_seq
 177                 }
 178             );
 179             $valid_seqs++;
 180         }
 181     };
 182     if ($EVAL_ERROR) {
 183         $self->print_error("Sequence processing failed, please check input");
 184     }
 185
 186     unless ( $valid_seqs > 0 ) {
 187         $self->print_error("Sequence processing failed, no valid sequences.");
 188     }
 189     return '';
 190 }
 191
 192 #return our best guess as to what format the given input string is in,
 193 #choosing from the following format strings accepted by Bioperl's SeqIO:
 194 #raw fasta
 195 sub guess_input_format {
 196     my $self    = shift(@_);
 197     my $dataref = shift(@_);
 198     if ( $$dataref =~ m/^\s*>/ ) {
 199         return "fasta";
 200     }
 201     else {
 202
 203         #first remove all whitespace so we interpret it as a single sequence
 204         $$dataref =~ s/\s*//g;
 205         return "raw";
 206     }
 207 }
 208
 209 #return whether the given string contains any illegal alphabet characters
 210 #one parameter: a string that should be just a sequence, with nothing but [A-Z] in it,
 211 #and maybe a * to mark the end of the sequence
 212 #(also requires that the actual sequence have length at least one)
 213 sub contains_illegal_alphabet {
 214     my ( $self, $data ) = @_;
 215     if ( $data =~ m/([bxz])/i ) {
 216         return $1;
 217     }
 218     return 0;
 219 }
 220
 221 #process {seqarray} field; search with existing HMM
 222 #no parameters
 223 sub process_seqs {
 224     my $self = shift;
 225
 226     # if we have an error, don't run the rest of it
 227     if ( $self->{error} ) {
 228         $self->write_page();
 229         return;
 230     }
 231
 232 #sequences are now in array form in {seqarray}
 233 #hmmsearch input against may10_combined_lenSPEX.hmm (from 5 / 10 / 04; the best of the various hmms I created)
 234
 235     #output to temp file
 236     my $outfilename = "$self->{tmpdir}/tmp.in.$$";    #$$ is this process' id
 237     open( OUTPUT, ">$outfilename" )
 238       or die "can't open '$outfilename' for writing: $!";
 239     foreach my $s ( @{ $self->{seqarray} } ) {
 240         print OUTPUT ( ">", $s->{'id'}, "\n", $s->{'seq'}, "\n" )
 241           or die "can't output indata to tempfile";
 242     }
 243     close(OUTPUT);
 244
 245     #call hmmsearch
 246     my $infilename = "$self->{tmpdir}/tmp.out.$$";
 247     my $options    = "-A0";
 248     if ( $self->{display_opt} eq "filter" ) {
 249         if ( $self->{use_eval_cutoff} ) {
 250             $options .= " -E" . $self->{eval_cutoff};
 251         }
 252         if ( $self->{use_bval_cutoff} ) {
 253             $options .= " -T" . $self->{bval_cutoff};
 254         }
 255     }
 256     ################### make sure the call to hmmer is up to date ###################
 257     my $hmmfile =
 258       $self->{page}->path_to(
 259         'cgi-bin/tools/sigpep_finder/hmmlib/may10_combined_lenSPEX.hmm');
 260     -f $hmmfile or die "could not find hmmfile '$hmmfile'";
 261     system
 262       "$self->{hmmsearch_path} $options $hmmfile $outfilename > $infilename";
 263     die "$! running hmmsearch executable '$self->{hmmsearch_path}'"
 264       if $CHILD_ERROR;
 265
 266     #input from temp output file by means of a Bio::SearchIO
 267     my $resultString = '';
 268
 269     my $inio = Bio::SearchIO->new( -format => 'hmmer', -file => $infilename );
 270     while ( my $result = $inio->next_result() ) {
 271
 272         #there should only be one "result", since we only use one hmm library
 273
 274         #result is a Bio::Search::Result::HMMERResult object
 275         my $qname = $result->query_name();
 276         $resultString .= <<EOHTML;
 277                                 <h2 style="text-align: center">Results for HMM $qname</h1>
 278                                 <p>If you elected to "show all output", note that we can't show more output than HMMER produces; it doesn't always
 279                                 report results for all input sequences. The sequence marked "least likely" to have a signal peptide is only
 280                                 least likely <i>among those sequences for which output was reported</i>; sequences that don't show up in the
 281                                 results table are even less likely to start with signals.</p>
 282                                 <p>For a detailed explanation of the output, see <a href="/documents/hmmer_userguide.pdf">the HMMER user guide (pdf)</a>. Look for the "search the
 283                                 sequence database with hmmsearch" section.</p>
 284                                 <table style="margin: 0 auto" border="0" cellspacing="0" cellpadding="1">
 285                                         <tr>
 286                                                 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Sequence&nbsp;&nbsp;|&nbsp;&nbsp;Domain&nbsp;&nbsp;&nbsp;</b>
 287                                                 <td align="center" bgcolor="#d0ccee"><b>&nbsp;&nbsp;&nbsp;Score&nbsp;&nbsp;|&nbsp;&nbsp;E-value&nbsp;&nbsp;&nbsp;</b>
 288                                                 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Signal Likelihood&nbsp;&nbsp;&nbsp;</b>
 289                                         </tr>
 290 EOHTML
 291         my $lineNum = 0;
 292         my ( $hit, $hsp, $color, $colStr1, $colStr2 );
 293         my $next_hit = $result->next_hit();
 294         if ( !$next_hit ) {
 295             $resultString .=
 296               qq|</table>\n<div style="margin: 1em auto; text-align: center; width: 20%; color: #ff0000">[ no hits ]</div>\n|;
 297         }
 298         else {
 299             while ( $hit = $next_hit ) {
 300                 $hsp      = $hit->next_hsp();
 301                 $next_hit = $result->next_hit();
 302                 if ( $self->{display_opt} eq "color" ) {
 303                     if (
 304                         $hit->raw_score() > (
 305                             defined( $self->{bval_cutoff} )
 306                             ? $self->{bval_cutoff}
 307                             : -1e6
 308                         )
 309                         && $hsp->evalue() < (
 310                             defined( $self->{eval_cutoff} )
 311                             ? $self->{eval_cutoff}
 312                             : 1e12
 313                         )
 314                       )
 315                     {
 316                         $color = "#559955";
 317                     }
 318                     else {
 319                         $color = "#994444";
 320                     }
 321                     $colStr1 = "<font color=\"$color\"><b>";
 322                     $colStr2 = "</b></font>";
 323                 }
 324                 else {
 325                     $colStr1 = "";
 326                     $colStr2 = "";
 327                 }
 328                 $resultString .= "
 329                                                         <tr>
 330                                                                 <td align=\"left\" bgcolor=\"#e0ddff\">"
 331                   . $colStr1
 332                   . $hit->name()
 333                   . $colStr2
 334                   . "<td align=\"left\" bgcolor=\"#d0ccee\">&nbsp;"
 335                   . $colStr1
 336                   . $hit->raw_score()
 337                   . $colStr2
 338                   . "<td align=\"center\" bgcolor=\"#e0ddff\">"
 339                   . $colStr1
 340                   . (
 341                       ( $lineNum == 0 )
 342                     ? ( $next_hit ? "Most likely" : "n/a" )
 343                     : ( !$next_hit ? "Least likely" : "&#8595;" )
 344                   )
 345                   . $colStr2 . "</tr>
 346                                                         <tr>
 347                                                                 <td align=\"right\" bgcolor=\"#e0ddff\">1&nbsp;"
 348                   . "<td align=\"right\" bgcolor=\"#d0ccee\">"
 349                   . $hsp->evalue()
 350                   . "<td align=\"center\" bgcolor=\"#e0ddff\">" . "</tr>";
 351                 $lineNum++;
 352             }
 353             $resultString .= "</table>\n";
 354         }
 355         $resultString .= <<EOHTML;
 356                                         <table border="0" cellspacing="10" cellpadding="0">
 357                                                 <tr>
 358                                                         <td colspan="1" align="center"><b>HMMER histogram</b>
 359                                                 </tr>
 360                                                 <tr>
 361                                                         <td colspan="1" align="left">
 362                                                                 &nbsp;&nbsp;&nbsp;The histogram compares the observed (obs) sequence score distribution to that expected
 363                                                                 (exp) from a file of the same size as the one you submitted but full of random sequences. The further down
 364                                                                 the page the distribution of equal signs is with respect to that of asterisks, the more likely it is that
 365                                                                 your sequences contain signal peptides. If the sequences you submitted all come from the same protein family,
 366                                                                 the histogram suggests how likely it is that that family is secreted.
 367                                                 </tr>
 368                                                 <tr>
 369                                                         <td align="left">
 370                                                                 <pre><code>
 371 EOHTML
 372
 373         #reopen file to read and print histogram
 374         open( INPUT, "<$infilename" );
 375         my $line;
 376         while ( $line = <INPUT> ) {
 377             if ( $line =~
 378                 m/istogram/ )    #line ought to read "Histogram of all scores:"
 379             {
 380                 while ( ( $line = <INPUT> ) !~ m/^\s*$/ ) {
 381                     $resultString .= $line;
 382                 }
 383             }
 384         }
 385         close(INPUT);
 386
 387         $resultString .= <<EOH;
 388                                                                 </code></pre>
 389                                                 </tr>
 390                                         </table>
 391 EOH
 392
 393     }
 394
 395     $self->print($resultString);
 396
 397     #remove temp files
 398     #   `rm $outfilename $infilename`;
 399
 400     $self->write_page();
 401 }
 402
 403 #print to the content field
 404 #one parameter: string to be displayed
 405 sub print {
 406     my $self = shift(@_);
 407     $self->{content} .= shift(@_);
 408 }
 409
 410 #print to the content field, depending on the debug flag, without setting the error flag
 411 #one parameter: string to be displayed
 412 sub debug {
 413     my $self = shift(@_);
 414     if ( $self->{debug} ) {
 415         $self->print( "<p>" . shift(@_) . "</p>" );
 416     }
 417 }
 418
 419 #output an error to the content field, but continue to process output
 420 #one parameter: error string (should be complete sentence(s) )
 421 sub print_warning {
 422     my $self = shift(@_);
 423     $self->print( "<div><b>WARNING: " . shift(@_) . "</b></div>" );
 424 }
 425
 426 #output an error to the content field and set a flag to bypass future processing
 427 #one parameter: error string (should be complete sentence(s) )
 428 sub print_error {
 429     my $self = shift(@_);
 430     $self->print( "<div><b>"
 431           . shift(@_)
 432           . "</b></div><div>Please press the back button and re-enter input.</div>"
 433     );
 434     $self->{error} = 1;
 435 }
 436
 437 #output HMMER findings and/or error (whatever's in {content}) to HTML
 438 #no parameters
 439 sub write_page {
 440     my $self = shift(@_);
 441     $self->{page}->header("Signal Peptide Finder");
 442     print $self->{content};
 443     $self->{page}->footer();
 444 }