cgi-bin/tools/sigpep_finder/sigpep_find.pl

   1 #!/usr/bin/perl
   2 # Signal peptide finder using a genentech-script-generated HMMER model
   3 # modified by Evan 9 / 16 / 05
   4 #
   5 # arguments: sequences (string), filename (string), display_opt(filter/color),
   6 #   use_eval_cutoff(bool), use_bval_cutoff(bool), eval_cutoff(real), bval_cutoff(real)
   7
   8 use strict;
   9 use warnings;
  10
  11 use CXGN::Page;
  12 use Bio::SearchIO;
  13 use Bio::SeqIO;
  14 use IO::String;
  15
  16 my $ss_find_obj = SigpepFind->new();
  17 $ss_find_obj->process_seqs();
  18
  19 exit;
  20
  21 ################################################################################
  22 package SigpepFind;
  23 use strict;
  24 use CatalystX::GlobalContext '$c';
  25 use warnings;
  26 use English;
  27
  28 sub new {
  29     my $classname  = shift;
  30     my $obj        = {};      #a reference to an anonymous hash
  31
  32     #all fields are listed here
  33     $obj->{debug}         = 0;     #0 or 1
  34     $obj->{sequences}     = "";    #string with raw input from textbox
  35     $obj->{file_contents} = "";    #string with raw input from file
  36     $obj->{illegal_input_count} =
  37       0;    #number of illegally formatted sequences found
  38     $obj->{seqarray} =
  39       [];    #arrayref of records like {seq => input sequence, id => descriptor}
  40     $obj->{error}  = 0;     #set to 1 in check_input() if necessary
  41     $obj->{output} = "";    #raw HMMER output
  42     $obj->{tmpdir} =$c->path_to( $c->tempfiles_subdir( 'sigpep_finder' ) );
  43     $obj->{hmmsearch_path} = $c->config->{'hmmsearch_location'};
  44     $obj->{page} = CXGN::Page->new( "Signal Peptide Finder", "Evan" );
  45     $obj->{content} = "";    #the output HTML
  46
  47     bless( $obj, $classname );   #use $obj as an object of this type from now on
  48     $obj->get_params();          #retrieve parameters
  49     return $obj;                 #so it can be used like "$obj->func()"
  50 }
  51
  52 #read parameters to script from the page object
  53 #and read input from textbox and file into $self->{sequences} and $self->{file_contents}, respectively
  54 sub get_params {
  55     my $self = shift(@_);
  56     ( $self->{sequences} ) =
  57       $self->{page}->get_arguments('sequences')
  58       ;    #retrieve the seqlist in string form from the in-page textbox
  59     ( $self->{display_opt} ) =
  60       $self->{page}->get_arguments('display_opt')
  61       ;    #whether to filter sequences in output or to show them in green/red
  62     ( $self->{truncate} ) = $self->{page}->get_arguments('truncate_seqs');
  63     if ( $self->{truncate} ) {
  64         ( $self->{truncate_type} ) =
  65           $self->{page}->get_arguments('truncate_type');
  66     }
  67
  68 #get an upload object to upload a file -- this and filling in the textbox are not mutually exclusive;
  69 #each can hold either a list of fasta sequences or a single plaintext sequence
  70     my $upload = $self->{page}->get_upload();
  71
  72     #check whether there's a filename in the filename text field
  73     if ( defined $upload ) {
  74         my $fh        = $upload->fh();
  75         my @fileLines = <$fh>
  76           ; #need this line to put the file into an array context; can't go straight to the join()
  77         $self->{file_contents} = join( '', @fileLines );
  78     }
  79     else {
  80         $self->{file_contents} = '';
  81     }
  82
  83     #check input format and fill $self->{seqarray}
  84     $self->check_input();
  85
  86     #output options
  87     ( $self->{use_eval_cutoff} ) =
  88       $self->{page}->get_arguments('use_eval_cutoff');
  89     if ( defined $self->{use_eval_cutoff} ) {
  90         ( $self->{eval_cutoff} ) = $self->{page}->get_arguments("eval_cutoff");
  91     }
  92
  93     ( $self->{use_bval_cutoff} ) =
  94       $self->{page}->get_arguments('use_bval_cutoff');
  95     if ( defined $self->{use_bval_cutoff} ) {
  96         ( $self->{bval_cutoff} ) = $self->{page}->get_arguments("bval_cutoff");
  97     }
  98 }
  99
 100 #make sure contents of input fields are legal, and put them in a useful array form in {seqarray}
 101 #no parameters; return 0 or 1
 102 sub check_input {
 103     my $self = shift(@_);
 104
 105     #process input from textbox and fill seqarray
 106     my $error_msg =
 107       $self->read_sequences( $self->{sequences}, 'sequence input box' )
 108       ;    #includes the period at the end of a sentence
 109     if ( $error_msg ne '' ) {
 110         $self->print_warning("$error_msg Skipping the rest of the input.");
 111     }
 112
 113     #process input from uploaded file and fill seqarray
 114     $error_msg =
 115       $self->read_sequences( $self->{file_contents}, 'uploaded file' );
 116     if ( $error_msg ne '' ) {
 117         $self->print_warning("$error_msg Skipping the rest of the input.");
 118     }
 119
 120     #print extracted sequence info
 121     $self->debug( "seqarray: " . $self->{seqarray} );
 122     $self->debug( "length of seqarray: " . scalar( @{ $self->{seqarray} } ) );
 123     $self->debug(
 124         "uploaded ids:\n"
 125           . join( ",\n",
 126             map( $_->{'id'} . ": " . $_->{'seq'}, @{ $self->{seqarray} } ) )
 127           . "\n"
 128     );
 129 }
 130
 131 #add given data, which has not been checked for legality or format, to $self->{seqarray},
 132 #which is an arrayref with contents being hashrefs with {id}, {seq} fields
 133 #two parameters: a stringref containing the data to be added,
 134 #a one-word description of where the input is coming from (for debugging)
 135 #return a string that's empty if no error
 136 sub read_sequences {
 137     my ( $self, $data, $input_loc ) = @_;
 138
 139     return '' unless $data =~ /\S/;    #< skip if no input seqs
 140
 141     open my $in_fh, "<", \$data
 142       or die "could not open dataref";
 143     my $inio = Bio::SeqIO->new(
 144         "-fh"     => $in_fh,
 145         "-format" => $self->guess_input_format( \$data )
 146     );
 147     $inio->alphabet("protein");
 148     $inio->verbose(2);    #< make sure SeqIO will throw an exception
 149                           #whenever appropriate
 150
 151     my $seq_count  = 0;
 152     my $valid_seqs = 0;
 153     eval {
 154         while ( my $seq = $inio->next_seq )
 155         {
 156             $seq_count++;
 157             my $temp_seq = uc $seq->seq();
 158             if ( my $illegals = $self->contains_illegal_alphabet($temp_seq) ) {
 159                 $self->print_warning( "Sequence #$seq_count ("
 160                       . ( $seq->id || 'no name' )
 161                       . ") in the $input_loc contains illegal character(s) '$illegals', skipping."
 162                 );
 163                 next;
 164             }
 165
 166             #add a record
 167             push(
 168                 @{ $self->{seqarray} },
 169                 {
 170                     'id' => defined( $seq->id() ) ? $seq->id() : 'UnknownSeq',
 171                     'seq' => $temp_seq
 172                 }
 173             );
 174             $valid_seqs++;
 175         }
 176     };
 177     if ($EVAL_ERROR) {
 178         $self->print_error("Sequence processing failed, please check input");
 179     }
 180
 181     unless ( $valid_seqs > 0 ) {
 182         $self->print_error("Sequence processing failed, no valid sequences.");
 183     }
 184     return '';
 185 }
 186
 187 #return our best guess as to what format the given input string is in,
 188 #choosing from the following format strings accepted by Bioperl's SeqIO:
 189 #raw fasta
 190 sub guess_input_format {
 191     my $self    = shift(@_);
 192     my $dataref = shift(@_);
 193     if ( $$dataref =~ m/^\s*>/ ) {
 194         return "fasta";
 195     }
 196     else {
 197
 198         #first remove all whitespace so we interpret it as a single sequence
 199         $$dataref =~ s/\s*//g;
 200         return "raw";
 201     }
 202 }
 203
 204 #return whether the given string contains any illegal alphabet characters
 205 #one parameter: a string that should be just a sequence, with nothing but [A-Z] in it,
 206 #and maybe a * to mark the end of the sequence
 207 #(also requires that the actual sequence have length at least one)
 208 sub contains_illegal_alphabet {
 209     my ( $self, $data ) = @_;
 210     if ( $data =~ m/([bxz])/i ) {
 211         return $1;
 212     }
 213     return 0;
 214 }
 215
 216 #process {seqarray} field; search with existing HMM
 217 #no parameters
 218 sub process_seqs {
 219     my $self = shift;
 220
 221     # if we have an error, don't run the rest of it
 222     if ( $self->{error} ) {
 223         $self->write_page();
 224         return;
 225     }
 226
 227 #sequences are now in array form in {seqarray}
 228 #hmmsearch input against may10_combined_lenSPEX.hmm (from 5 / 10 / 04; the best of the various hmms I created)
 229
 230     #output to temp file
 231     my $outfilename = "$self->{tmpdir}/tmp.in.$$";    #$$ is this process' id
 232     open( OUTPUT, ">$outfilename" )
 233       or die "can't open '$outfilename' for writing: $!";
 234     foreach my $s ( @{ $self->{seqarray} } ) {
 235         print OUTPUT ( ">", $s->{'id'}, "\n", $s->{'seq'}, "\n" )
 236           or die "can't output indata to tempfile";
 237     }
 238     close(OUTPUT);
 239
 240     #call hmmsearch
 241     my $infilename = "$self->{tmpdir}/tmp.out.$$";
 242     my $options    = "-A0";
 243     if ( $self->{display_opt} eq "filter" ) {
 244         if ( $self->{use_eval_cutoff} ) {
 245             $options .= " -E" . $self->{eval_cutoff};
 246         }
 247         if ( $self->{use_bval_cutoff} ) {
 248             $options .= " -T" . $self->{bval_cutoff};
 249         }
 250     }
 251     ################### make sure the call to hmmer is up to date ###################
 252     my $hmmfile =
 253       $self->{page}->path_to(
 254         'cgi-bin/tools/sigpep_finder/hmmlib/may10_combined_lenSPEX.hmm');
 255     -f $hmmfile or die "could not find hmmfile '$hmmfile'";
 256     system
 257       "$self->{hmmsearch_path} $options $hmmfile $outfilename > $infilename";
 258     die "$! running hmmsearch executable '$self->{hmmsearch_path}'"
 259       if $CHILD_ERROR;
 260
 261     #input from temp output file by means of a Bio::SearchIO
 262     my $resultString = '';
 263
 264     my $inio = Bio::SearchIO->new( -format => 'hmmer', -file => $infilename );
 265     while ( my $result = $inio->next_result() ) {
 266
 267         #there should only be one "result", since we only use one hmm library
 268
 269         #result is a Bio::Search::Result::HMMERResult object
 270         my $qname = $result->query_name();
 271         $resultString .= <<EOHTML;
 272                                 <h2 style="text-align: center">Results for HMM $qname</h1>
 273                                 <p>If you elected to "show all output", note that we can't show more output than HMMER produces; it doesn't always
 274                                 report results for all input sequences. The sequence marked "least likely" to have a signal peptide is only
 275                                 least likely <i>among those sequences for which output was reported</i>; sequences that don't show up in the
 276                                 results table are even less likely to start with signals.</p>
 277                                 <p>For a detailed explanation of the output, see <a href="/documents/hmmer_userguide.pdf">the HMMER user guide (pdf)</a>. Look for the "search the
 278                                 sequence database with hmmsearch" section.</p>
 279                                 <table style="margin: 0 auto" border="0" cellspacing="0" cellpadding="1">
 280                                         <tr>
 281                                                 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Sequence&nbsp;&nbsp;|&nbsp;&nbsp;Domain&nbsp;&nbsp;&nbsp;</b>
 282                                                 <td align="center" bgcolor="#d0ccee"><b>&nbsp;&nbsp;&nbsp;Score&nbsp;&nbsp;|&nbsp;&nbsp;E-value&nbsp;&nbsp;&nbsp;</b>
 283                                                 <td align="center" bgcolor="#e0ddff"><b>&nbsp;&nbsp;&nbsp;Signal Likelihood&nbsp;&nbsp;&nbsp;</b>
 284                                         </tr>
 285 EOHTML
 286         my $lineNum = 0;
 287         my ( $hit, $hsp, $color, $colStr1, $colStr2 );
 288         my $next_hit = $result->next_hit();
 289         if ( !$next_hit ) {
 290             $resultString .=
 291               qq|</table>\n<div style="margin: 1em auto; text-align: center; width: 20%; color: #ff0000">[ no hits ]</div>\n|;
 292         }
 293         else {
 294             while ( $hit = $next_hit ) {
 295                 $hsp      = $hit->next_hsp();
 296                 $next_hit = $result->next_hit();
 297                 if ( $self->{display_opt} eq "color" ) {
 298                     if (
 299                         $hit->raw_score() > (
 300                             defined( $self->{bval_cutoff} )
 301                             ? $self->{bval_cutoff}
 302                             : -1e6
 303                         )
 304                         && $hsp->evalue() < (
 305                             defined( $self->{eval_cutoff} )
 306                             ? $self->{eval_cutoff}
 307                             : 1e12
 308                         )
 309                       )
 310                     {
 311                         $color = "#559955";
 312                     }
 313                     else {
 314                         $color = "#994444";
 315                     }
 316                     $colStr1 = "<font color=\"$color\"><b>";
 317                     $colStr2 = "</b></font>";
 318                 }
 319                 else {
 320                     $colStr1 = "";
 321                     $colStr2 = "";
 322                 }
 323                 $resultString .= "
 324                                                         <tr>
 325                                                                 <td align=\"left\" bgcolor=\"#e0ddff\">"
 326                   . $colStr1
 327                   . $hit->name()
 328                   . $colStr2
 329                   . "<td align=\"left\" bgcolor=\"#d0ccee\">&nbsp;"
 330                   . $colStr1
 331                   . $hit->raw_score()
 332                   . $colStr2
 333                   . "<td align=\"center\" bgcolor=\"#e0ddff\">"
 334                   . $colStr1
 335                   . (
 336                       ( $lineNum == 0 )
 337                     ? ( $next_hit ? "Most likely" : "n/a" )
 338                     : ( !$next_hit ? "Least likely" : "&#8595;" )
 339                   )
 340                   . $colStr2 . "</tr>
 341                                                         <tr>
 342                                                                 <td align=\"right\" bgcolor=\"#e0ddff\">1&nbsp;"
 343                   . "<td align=\"right\" bgcolor=\"#d0ccee\">"
 344                   . $hsp->evalue()
 345                   . "<td align=\"center\" bgcolor=\"#e0ddff\">" . "</tr>";
 346                 $lineNum++;
 347             }
 348             $resultString .= "</table>\n";
 349         }
 350         $resultString .= <<EOHTML;
 351                                         <table border="0" cellspacing="10" cellpadding="0">
 352                                                 <tr>
 353                                                         <td colspan="1" align="center"><b>HMMER histogram</b>
 354                                                 </tr>
 355                                                 <tr>
 356                                                         <td colspan="1" align="left">
 357                                                                 &nbsp;&nbsp;&nbsp;The histogram compares the observed (obs) sequence score distribution to that expected
 358                                                                 (exp) from a file of the same size as the one you submitted but full of random sequences. The further down
 359                                                                 the page the distribution of equal signs is with respect to that of asterisks, the more likely it is that
 360                                                                 your sequences contain signal peptides. If the sequences you submitted all come from the same protein family,
 361                                                                 the histogram suggests how likely it is that that family is secreted.
 362                                                 </tr>
 363                                                 <tr>
 364                                                         <td align="left">
 365                                                                 <pre><code>
 366 EOHTML
 367
 368         #reopen file to read and print histogram
 369         open( INPUT, "<$infilename" );
 370         my $line;
 371         while ( $line = <INPUT> ) {
 372             if ( $line =~
 373                 m/istogram/ )    #line ought to read "Histogram of all scores:"
 374             {
 375                 while ( ( $line = <INPUT> ) !~ m/^\s*$/ ) {
 376                     $resultString .= $line;
 377                 }
 378             }
 379         }
 380         close(INPUT);
 381
 382         $resultString .= <<EOH;
 383                                                                 </code></pre>
 384                                                 </tr>
 385                                         </table>
 386 EOH
 387
 388     }
 389
 390     $self->print($resultString);
 391
 392     #remove temp files
 393     #   `rm $outfilename $infilename`;
 394
 395     $self->write_page();
 396 }
 397
 398 #print to the content field
 399 #one parameter: string to be displayed
 400 sub print {
 401     my $self = shift(@_);
 402     $self->{content} .= shift(@_);
 403 }
 404
 405 #print to the content field, depending on the debug flag, without setting the error flag
 406 #one parameter: string to be displayed
 407 sub debug {
 408     my $self = shift(@_);
 409     if ( $self->{debug} ) {
 410         $self->print( "<p>" . shift(@_) . "</p>" );
 411     }
 412 }
 413
 414 #output an error to the content field, but continue to process output
 415 #one parameter: error string (should be complete sentence(s) )
 416 sub print_warning {
 417     my $self = shift(@_);
 418     $self->print( "<div><b>WARNING: " . shift(@_) . "</b></div>" );
 419 }
 420
 421 #output an error to the content field and set a flag to bypass future processing
 422 #one parameter: error string (should be complete sentence(s) )
 423 sub print_error {
 424     my $self = shift(@_);
 425     $self->print( "<div><b>"
 426           . shift(@_)
 427           . "</b></div><div>Please press the back button and re-enter input.</div>"
 428     );
 429     $self->{error} = 1;
 430 }
 431
 432 #output HMMER findings and/or error (whatever's in {content}) to HTML
 433 #no parameters
 434 sub write_page {
 435     my $self = shift(@_);
 436     $self->{page}->header("Signal Peptide Finder");
 437     print $self->{content};
 438     $self->{page}->footer();
 439 }