Merge branch 'master' of https://github.com/solgenomics/sgn
[sgn.git] / cgi-bin / tools / align_viewer / show_align.pl
blob38a35464292191c8ea6a3fa5771ee8a69a334ebe
2 =head1 NAME
4 show_align.pl - a cgi-based graphical alignment viewer
6 =head1 DESCRIPTION
8 Generates an alignment view from data submitted through the input page (index.pl in this directory). It is a web script that supports the following cgi parameters:
10 =over 5
12 =item format
14 format may be either 'clustalw', 'fasta', or 'fasta_unaligned', and describes the format of the uploaded sequences. If format is 'fasta_unaligned', the sequences will be aligned using t_coffee.
16 =item seq_data
18 seq_data is the actual data uploaded, in the format described by the format parameter.
20 =item temp_file
22 if no seq_data is given, a temp_file name is supplied that specifies a previously parsed input. The temp_file specifies a file name, without the directory information, which will be in the default directory location, as defined in the conf object.
24 =item start_value
26 denotes the first position of the section of the alignment that should be displayed.
28 =item end_value
30 denotes the end position of the section of the alignment that should be displayed.
32 =item title
34 the title of the alignment, shown on the top of the page.
36 =item type
38 =over 5
40 =item 'nt' for nucleotide
42 =item 'pep' for peptide
44 =back 5
46 Not sure about this one. Defaults to peptide as default.
48 =item hide_seqs
50 a parameter that specifies a list of sequence ids, separated by whitespace, that should not be shown in the alignment.
52 =item run
54 'local' means run on local host. 'cluster' means run job on cluster (if an alignment needs to be run).
56 =back
58 =head1 DEPENDENCIES
60 Depends on the CXGN code base, in particular heavily on CXGN::Alignment and others, and on GD for the graphics routines and File::Temp for the temp file handling.
62 =head1 AUTHOR(S)
64 Code by Chenwei Lin (cl295@cornell.edu). Documentation and minor modifications (ability to run T-coffee from the page) by Lukas Mueller (lam87@cornell.edu).
66 =cut
68 use strict;
69 use Storable qw /store/;
70 use File::Copy;
71 use CXGN::Page;
72 use CXGN::Page::FormattingHelpers qw/ page_title_html blue_section_html /;
73 use CXGN::Page::Widgets qw/ collapser /;
74 use File::Temp qw/tempfile/;
75 use File::Basename qw /basename/;
76 use CXGN::Phylo::Alignment;
77 use Bio::AlignIO;
78 use CXGN::DB::Connection;
79 use CXGN::Tools::Run;
80 use CXGN::Tools::Gene;
81 use CXGN::Tools::Identifiers qw/ identifier_namespace identifier_url /;
82 use CXGN::Tools::Entrez;
83 use HTML::Entities;
84 use CXGN::Tools::Param qw/ hash2param /;
85 use CatalystX::GlobalContext '$c';
87 our $ALIGN_SEQ_COUNT_LIMIT = 500;
88 our $ALIGN_SEQ_TOTAL_LENGTH_LIMIT = 500_000;
90 my $page = CXGN::Page->new( "SGN Alignment Analyzer", "Chenwei-Carpita" );
91 $page->jsan_use("CXGN.Effects");
93 our (
94 $format, $seq_data, $id_data, $start_value,
95 $end_value, $title, $type, $hide_seqs,
96 $run, $sv_similarity, $sv_shared, $sv_indel,
97 $show_sv, $show_breakdown, $maxiters, $force_gap_cds,
98 $show_domains
100 = $page->get_arguments(
101 "format", "seq_data", "id_data", "start_value",
102 "end_value", "title", "type", "hide_seqs",
103 "run", "sv_similarity", "sv_shared", "sv_indel",
104 "show_sv", "show_breakdown", "maxiters", "force_gap_cds",
105 "show_domains"
108 my ( $image_content, $sum_content, $seq_sum_content, $sv_content, $al_content,
109 $combined_content );
111 our ( $temp_file, $cds_temp_file ) =
112 $page->get_arguments( "temp_file", "cds_temp_file" );
114 our ( $family_nr, $i_value, $family_id ) =
115 $page->get_arguments( "family_nr", "i_value", "family_id" );
117 if ( $family_id && !( $family_nr || $i_value ) ) {
118 my $dbh = $c->dbc->dbh;
119 my $sth =
120 $dbh->prepare(
121 "SELECT family_nr, i_value FROM sgn.family LEFT JOIN sgn.family_build USING (family_build_id) WHERE family_id=?"
123 $sth->execute($family_id);
124 ( $family_nr, $i_value ) = $sth->fetchrow_array();
127 $show_breakdown = 1 if $show_breakdown;
128 $show_sv = 1 if $show_sv;
129 $maxiters = 1000 if ( $maxiters > 1000 );
130 $maxiters ||= 2;
132 $show_domains = 1 unless ( $show_domains eq '0' );
134 #Run locally by default, cluster will run if the computation
135 #is found to be intensive (later in script)
136 unless ( $run =~ /^(local)|(cluster)$/ ) { $run = "local"; }
138 #############################################################
139 #Check if the input sequence is empty
140 unless ( $seq_data
141 || $id_data
142 || $temp_file
143 || $family_id
144 || ( $family_nr && $i_value ) )
146 &input_err_page( $page, "No sequence data provided!\n" );
149 #############################################################
150 #If a $seq_data is passed to the page, write the content of $seq_data into a temp file
152 our $HTML_ROOT_PATH = $c->config->{'basepath'};
153 our $PATH = $page->path_to( $page->tempfiles_subdir('align_viewer') );
154 unless( -d $PATH ) {
155 mkdir $PATH
156 or die "temp dir '$PATH' does not exist, and could not create";
158 our $tmp_fh;
159 our $CLUSTER_SHARED_TEMPDIR = $c->config->{'cluster_shared_tempdir'};
160 our $NOCLUSTER = $c->config->{'nocluster'};
162 #You can send the temp_file as a filename and not a full path, good for security
163 unless ( $temp_file =~ /\// ) {
164 $temp_file = $PATH . "/" . $temp_file;
166 unless ( $cds_temp_file =~ /\// ) {
167 $cds_temp_file = $PATH . "/" . $cds_temp_file;
169 #If a temp file is set, we are definitely CDS
170 #Fixes imagemap problem:
171 $type = "cds" if ( -f $cds_temp_file );
174 if ( -f $cds_temp_file && $force_gap_cds ) {
176 #The CDS temp file needs to be gapped
177 $cds_temp_file = build_gapped_cds_file( $temp_file, $cds_temp_file );
180 # upload file, if requested... (Lukas, 2006/09/05)
182 my $upload = $page->get_upload("upload");
183 my $upload_fh;
185 # print STDERR "Uploading file $args{upload_file}...\n";
186 if ( defined $upload ) {
187 $upload_fh = $upload->fh();
188 while (<$upload_fh>) {
189 $seq_data .= $_;
193 my @good_ids = ();
194 if ( $seq_data =~ /\S+/ ) {
195 $tmp_fh = File::Temp->new(
196 DIR => $PATH,
197 UNLINK => 0,
200 $temp_file = $tmp_fh->filename;
201 print $tmp_fh "$seq_data";
203 close $tmp_fh;
205 # Convert the input file into fasta if it is clustal
206 if ( $format eq 'clustalw' ) {
207 my $temp_file_fasta = $temp_file . ".fasta";
208 convert_clustal_fasta( $temp_file, $temp_file_fasta );
210 # continue with the converted file...
211 $temp_file = $temp_file_fasta;
214 elsif ( $id_data =~ /\S+/ ) {
215 unless ( -f $temp_file ) {
216 my $tmp_fh = File::Temp->new(
217 DIR => $PATH,
218 UNLINK => 0,
220 $temp_file = $tmp_fh->filename;
221 $tmp_fh->close();
223 $format = "fasta_unaligned";
225 open( my $temp_fh, '>', $temp_file ) or die $!;
227 my @ids = split /\s+/, $id_data;
228 foreach my $id (@ids) {
229 if ( $id =~ /^gi:\d+$/i
230 || CXGN::Tools::Identifiers::is_genbank_accession($id) )
232 ($id) = $id =~ /^gi:(\d+)$/i if $id =~ /^gi:\d+$/;
233 my $database = "protein";
234 $database = "nucleotide" if $type ne "pep";
235 my $entrez = CXGN::Tools::Entrez->new( { db => $database } );
236 my $seq = $entrez->get_sequence($id);
237 $id = "gi:$id" if $id =~ /^\d+$/;
238 print $temp_fh ">$id\n$seq" if $seq;
239 push( @good_ids, $id ) if $seq;
241 else {
242 eval {
243 my $gene = CXGN::Tools::Gene->new($id, $c->dbc->dbh );
244 my $seqtype = "";
245 $seqtype = "protein" if $type eq "pep";
246 $seqtype = "cds" if $type eq "cds";
247 $seqtype = "genomic" if $type eq "nt";
248 my $seq = $gene->getSequence($seqtype);
249 print $temp_fh ">$id\n$seq\n" if $seq;
251 if( my $err = $@ ) {
252 $err =~ s/\n\s*Catalyst::.+//;
253 warn $err;
254 } else {
255 push @good_ids, $id;
260 $id_data = join " ", @good_ids;
261 close($temp_fh);
263 elsif ( $family_nr && $i_value ) {
264 my $family_basedir = "/data/prod/public/family/$i_value";
265 unless ( -d $family_basedir ) {
266 &input_err_page( $page,
267 "No current family build corresponds to an i_value of $i_value" );
269 unless ( -f "$family_basedir/pep_align/$family_nr.pep.aligned.fasta" ) {
270 &input_err_page( $page,
271 "Alignment was not calculated for this family" );
274 unless ( -f $temp_file ) {
275 my $tmp_fh = File::Temp->new(
276 DIR => $PATH,
277 UNLINK => 0,
279 $temp_file = $tmp_fh->filename;
280 $tmp_fh->close();
282 $format = "fasta_aligned";
283 $type = "cds";
285 #disable cds until kinks worked out
286 if ( 0 && -f "$family_basedir/cds_align/$family_nr.cds.align.fasta" ) {
287 $type = "cds";
288 system
289 "cp $family_basedir/cds_align/$family_nr.cds.align.fasta $temp_file";
291 else {
292 $type = "pep";
293 system
294 "cp $family_basedir/pep_align/$family_nr.pep.aligned.fasta $temp_file";
298 #Check that the input file is a good FASTA, throw error page otherwise
299 our ( $SEQ_COUNT, $MAX_LENGTH, $TOTAL_LENGTH ) = fasta_check( $temp_file, $page );
301 #Also, use seq_count, maxiters, and max_length parameters for Muscle to see if we
302 #should run it on the cluster, otherwise allow running on the web server
303 if ( !$NOCLUSTER && ( $maxiters * $SEQ_COUNT * $MAX_LENGTH > 30000 )
304 || !$maxiters )
306 $run = "cluster";
308 ##Convert cds to protein, but if cds_temp_file exists,
309 #the conversion was already done on the previous page (or before)
310 if ( $type eq "cds" && $temp_file && !( -f $cds_temp_file ) ) {
311 my $new_temp_fh = File::Temp->new(
312 DIR => $PATH,
313 UNLINK => 0,
315 my $new_temp = $new_temp_fh->filename;
316 my $instream = Bio::SeqIO->new( -file => $temp_file, -format => 'fasta' );
317 while ( my $in = $instream->next_seq() ) {
318 my $seq = $in->seq();
319 my $id = $in->id();
320 my $len = length $seq;
321 my $member;
322 eval {
323 $member = CXGN::Phylo::Alignment::Member->new(
324 id => $id,
325 seq => $seq,
326 type => "cds",
327 start_value => 1,
328 end_value => $len
330 $member->translate_cds();
332 if ($@) {
333 input_err_page( $page, $@ );
336 my $prot_seq = $member->{translated_protein_seq};
337 print $new_temp_fh ">$id\n$prot_seq\n";
339 close($new_temp_fh);
341 #Important transition happens here. The temp_file
342 #is now the translated protein sequence. Maybe it's
343 #gapped, maybe not, depends on the format parameter
344 #and if the CDS was gapped
345 $cds_temp_file = $temp_file;
346 $temp_file = $new_temp;
349 #############################################################
350 #Read the alignment sequence and store them in an align object
352 #First parse $hide_seqs
353 my %hidden_seqs = ();
354 my @ids = split /\s+/, $hide_seqs;
355 foreach (@ids) {
356 $hidden_seqs{$_} = 1;
359 # align sequences if they are in the unaligned fasta format
360 # (added by Lukas, 9/5/2006)
361 if ( $format eq "fasta_unaligned" ) {
363 if ( $SEQ_COUNT > $ALIGN_SEQ_COUNT_LIMIT ) {
364 my $message =
365 "<center><span style='font-size:1.2em;'>You submitted <b>$SEQ_COUNT</b> sequences. Due to limited computational resources, we only <br>allow <b>$ALIGN_SEQ_COUNT_LIMIT or fewer</b> sequences to be aligned through our web interface.</span>";
366 $message .=
367 "<br /><br /><span style='font-size:1.1em'>You can download and run your own copy of the <br />alignment software at <a href='http://www.drive5.com/muscle/'>www.drive5.com/muscle/</a></span></center>";
368 input_err_page( $page, $message );
370 elsif( $TOTAL_LENGTH > $ALIGN_SEQ_TOTAL_LENGTH_LIMIT ) {
371 input_err_page( $page, <<"" );
372 <center><span style='font-size:1.2em;'>You submitted <b>$TOTAL_LENGTH</b> bases of sequence. Due to limited computational resources, we only <br>allow <b>$ALIGN_SEQ_TOTAL_LENGTH_LIMIT or fewer</b> sequences to be aligned through our web interface.</span>
373 <br /><br /><span style='font-size:1.1em'>You can download and run your own copy of the <br />alignment software at <a href='http://www.drive5.com/muscle/'>www.drive5.com/muscle/</a></span></center>
377 run_muscle( $page, $run );
380 #die "\n$temp_file\n$cds_temp_file";
382 #Let's see the toolbar while we're just building the image
384 #######################################
385 ## Construct Alignment ################
386 #######################################
388 my $disp_type = $type;
389 $disp_type = "pep" if ( $disp_type eq "cds" );
391 my $alignment = CXGN::Phylo::Alignment->new(
392 name => $title,
393 width => 800,
394 height => 2000,
395 type => $disp_type
397 $alignment->{page} = $page;
398 my $instream;
399 my $len;
401 $sv_similarity = 95 unless ($sv_similarity);
402 $sv_indel = 4 unless ($sv_indel);
403 $sv_shared = 20 unless ($sv_shared);
405 $alignment->set_sv_criteria( $sv_shared, $sv_similarity, $sv_indel );
407 #die $temp_file;
408 $instream = Bio::SeqIO->new( -file => $temp_file, -format => 'fasta' );
409 while ( my $in = $instream->next_seq() ) {
410 my $seq = $in->seq();
411 my ( $id, $species ) = $in->id() =~ m/(.+)\/(.+)/;
412 $id = $in->id();
413 ( !$species ) and $species = ();
415 my $hidden = 0;
416 $hidden = 1
417 if ( exists $hidden_seqs{$id} )
418 ; #skip the sequence if it is in the hide_seq
420 chomp $seq;
421 $len = length $seq;
422 my $member;
423 eval {
425 $member = CXGN::Phylo::Alignment::Member->new(
426 start_value => 1,
427 end_value => $len,
428 id => $id,
429 seq => $seq,
430 hidden => $hidden,
431 species => $species,
432 type => $disp_type
434 $alignment->add_member($member);
436 $@ and input_err_page( $page, $@ );
438 my ( $gene, $annot, $sigpos, $cleavage );
439 eval {
440 $gene = CXGN::Tools::Gene->new($id);
441 $annot = $gene->getAnnotation;
442 $annot = HTML::Entities::encode($annot);
444 unless ($@) {
445 $member->set_tooltip($annot);
448 $member->show_remove(); #draw X for removal
449 $member->show_link();
452 #Set the start value and end value#######
453 ( !$start_value ) and $start_value = 1;
454 $alignment->set_start_value($start_value);
455 ( !$end_value ) and $end_value = $len;
456 ( $len < $end_value ) and $end_value = $len;
457 $alignment->set_end_value($end_value);
459 ## Domain Highlighting ########################
460 $alignment->highlight_domains() if $show_domains;
462 ### If cds_temp exists, create a $cds_alignment object, for producing CDS files
464 our $cds_alignment;
465 if ( -f $cds_temp_file && $type eq "cds" ) {
466 $cds_alignment = CXGN::Phylo::Alignment->new(
467 name => $title,
468 type => "cds"
470 my $instream =
471 Bio::SeqIO->new( -file => $cds_temp_file, -format => 'fasta' );
472 while ( my $in = $instream->next_seq() ) {
473 my $seq = $in->seq();
474 my ( $id, $species ) = $in->id() =~ m/(.+)\/(.+)/;
475 $id = $in->id();
476 ( !$species ) and $species = ();
478 my $hidden = 0;
479 $hidden = 1
480 if ( exists $hidden_seqs{$id} )
481 ; #skip the sequence if it is in the hide_seq
483 chomp $seq;
484 my $len = length $seq;
485 my $member;
486 eval {
487 $member = CXGN::Phylo::Alignment::Member->new(
488 start_value => 1,
489 end_value => $len,
490 id => $id,
491 seq => $seq,
492 hidden => $hidden,
493 species => $species,
494 type => $disp_type
496 $cds_alignment->add_member($member);
498 $@ and input_err_page( $page, $@ );
499 $member->show_remove(); #draw X for removal
500 $member->show_link();
502 $cds_alignment->set_start_value( $start_value * 3 - 2 );
503 $cds_alignment->set_end_value( $end_value * 3 );
506 ### Do page header AFTER all eval{} statements
507 $page->header;
509 ############################################################
510 #Draw alignment image, unless we are at the original parameters and we
511 #have already drawn the most computationally intensive image
513 my $tmp_image = File::Temp->new(
514 DIR => $PATH,
515 SUFFIX => '.png',
516 UNLINK => 0,
519 #Render image
520 $alignment->render_png_file( $tmp_image, 'c' );
521 close $tmp_image;
522 $tmp_image =~ s/$HTML_ROOT_PATH//;
524 #Important for Ruler Image-Map Generation:
525 our ($temp_file_name) = $temp_file =~ /([^\/]+)$/;
526 our ($cds_temp_file_name) = $cds_temp_file =~ /([^\/]+)$/;
528 #Parameter hash for imagemap and other links on the page
529 our %PARAM = (
530 format => $format,
531 start_value => $start_value,
532 end_value => $end_value,
533 title => $title,
534 type => $type,
535 hide_seqs => $hide_seqs,
536 sv_similarity => $sv_similarity,
537 sv_shared => $sv_shared,
538 sv_indel => $sv_indel,
539 show_sv => $show_sv,
540 show_breakdown => $show_breakdown,
541 maxiters => $maxiters,
542 force_gap_cds => $force_gap_cds,
543 temp_file => $temp_file_name,
544 cds_temp_file => $cds_temp_file_name,
545 show_domains => $show_domains
548 $alignment->set_fasta_temp_file($temp_file_name);
549 $alignment->set_cds_temp_file($cds_temp_file_name);
550 $alignment->set_param( \%PARAM );
552 $image_content = "<tr><td align=\"center\">";
554 $image_content .=
555 "<img border=0 src='$tmp_image' alt='Alignment Image' usemap='#align_image_map'/>";
556 $image_content .= $alignment->get_image_map( $show_sv, $show_breakdown );
557 $image_content .=
558 "<div style=\"width:100%;text-align:left;padding-left:40px;\">";
559 if ($show_domains) {
560 $image_content .=
561 "<a href=\"show_align.pl?"
562 . hash2param( \%PARAM, { show_domains => 0 } )
563 . "\">Hide Domain Information</a>";
565 else {
566 $image_content .=
567 "<a href=\"show_align.pl?"
568 . hash2param( \%PARAM, { show_domains => 1 } )
569 . "\">Show Domain Information</a>";
571 $image_content .= "</div>";
573 if ( $start_value == 1 && $end_value == $len ) {
574 $image_content .=
575 "<b>Click on a region of the image to zoom in.</b><br />You must submit the form below to change alignment parameters or remove members from analysis.";
577 else {
578 $image_content .= "<a href='show_align.pl?";
579 $image_content .=
580 hash2param( \%PARAM, { start_value => 1, end_value => "" } );
581 $image_content .= "'>&lt;&lt; See Whole Alignment &gt;&gt;</a>";
584 $image_content .= "</td></tr>";
586 my $term_help_content = <<HTML;
587 <table width="100%" summary="" cellpadding="3" cellspacing="3" border="0"><tr><td><p style='font-size:1.1em'>SGN Alignment Viewer analyzes and provides useful information that can be used to optimize the overlapping region of the alignment.</p></td></tr></table>
588 <table width="100%" summary="" cellpadding="3" cellspacing="0" border="0">
589 <tr style='background-color:#ddf'><th valign='top'>Coverage&nbsp;Range</th>
590 <td>The distance between the first and last element of the un-gapped sequence in the current alignment.</td></tr>
591 <tr><th valign='top'>Bases</th><td>Number of "real sequence characters" (non gap) in the sequence.</td></tr>
592 <tr style='background-color:#ddf'><th valign='top'>Gaps</th><td>Number of gaps in the aligment</td></tr>
593 <tr><th valign='top'>Medium</th><td>The middle position of all non-gap characters of the alignment sequence</td></tr>
594 <tr style='background-color:#ddf'><th valign='top'>Putative Splice Variant Pairs</th><td>Sequence pairs that are from the same species, that, by default <ol>
595 <li>Share at least 60 bases if they are nucleotides or 20 bases if they are peptides.
596 <li>Share at least 95% sequence similarity in the overlapping region.
597 <li>Have insertion/deletion of at least 4 amino acids or 12 nucleotides in their common region.</ol> </td></tr>
598 <tr><th valign='top'>Putative Alleles</th><td>Sequence pairs that are from the same species, that, by default
599 <ol>
600 <li>Share at least 60 bases if they are nucleotides or 20 bases if they are peptides.
601 <li>Share at least 95% sequence similarity in the overlapping region.
602 <li>Have insertion/deletion of <b>not more than </b> 4 amino acids or 12 nucleotides in their common region.
603 </ol>
604 </td></tr>
605 <tr style='background-color:#ddf'><th valign='top'>Overlap Score</th><td>An indication of how well the sequence overlaps with other members in the alignment. If a non-gap character of a sequence overlaps with a character in another alignment sequence, it gets a point.<br><br>Sometimes a few sequences share very little overlap with the rest, significantly reducing the overall overlapping sequence of the alignment. Usually these sequences are short and won't help with the understanding of overall alignment. We suggest the user leaves out these sequences before further analysis of the aligment sequence. </td></tr></table>
606 HTML
607 $term_help_content =
608 blue_section_html( "<em>Help With Terms</em>", $term_help_content );
610 my ( $help_link, $hidden_help_content ) = collapser(
612 id => "alignment_help_content",
613 alt_href => "/about/align_term.pl",
614 alt_target => "align_about",
615 linktext => "(-) Help me understand terms on this page",
616 hide_state_linktext => "(?) Help me understand terms on this page",
617 linkstyle => "font-size:1.1em",
618 collapsed => 1,
619 content => $term_help_content,
623 ##########################################################
624 #Write sequence output to temp files and create links to them
626 ###########################################################
627 #Get summary information
629 $combined_content = get_combined_content($alignment);
631 ############################################################
632 #Page printout: Nav Section
634 print
635 "<a href=\"index.pl?id_data=$id_data\" style=\"margin-left:10px;font-size:1.1em\">&lt;&lt; Start over</a>";
636 my $disp_title = "";
637 $disp_title = "View and Analyze Alignment of <em>$title</em>" if ($title);
638 $disp_title ||= "View and Analyze Alignment";
639 print page_title_html("$disp_title");
641 print "&nbsp;&nbsp;" . $help_link . "<br><br>";
642 print $hidden_help_content; #this is actually a blue_section_html, but the whole thing is hidden
644 my $image_title = "Alignment Image";
645 $image_title .= ":&nbsp;&nbsp; CDS Translated" if ( $type eq "cds" );
646 print blue_section_html( $image_title,
647 '<table width="100%" cellpadding="0" cellspacing="0" border="0">'
648 . $image_content
649 . '</table>' );
651 print blue_section_html(
652 'Summary, Files, Change Parameters',
653 '<table width="100%" cellpadding="5" cellspacing="0" border="0">'
654 . $combined_content
655 . '</table>'
658 if ($show_breakdown) {
659 $seq_sum_content = get_seq_sum_content($alignment);
661 else {
662 $seq_sum_content .= "Not Calculated &nbsp;&nbsp;( <a href=\"show_align.pl?";
663 $seq_sum_content .= hash2param( \%PARAM, { show_breakdown => 1 } );
664 $seq_sum_content .=
665 "\">showing this table</a> will increase page load time )";
668 ###########################################################
669 #Find splice variants
671 if ($show_sv) {
672 ( $sv_content, $al_content ) = get_sv_al_content($alignment);
674 else {
675 $al_content = "Not Calculated &nbsp;&nbsp;( <a href=\"show_align.pl?";
676 $al_content .= hash2param( \%PARAM, { show_sv => 1 } );
677 $al_content .= "\">showing this table</a> will increase page load time )";
678 $sv_content = "Not Calculated &nbsp;&nbsp;( <a href=\"show_align.pl?";
679 $sv_content .= hash2param( \%PARAM, { show_sv => 1 } );
680 $sv_content .= "\">showing this table</a> will increase page load time )";
683 ###########################################################
684 ##Page Printout: Breakdown and Alleles
685 print blue_section_html( 'Alignment Breakdown', $seq_sum_content );
687 print blue_section_html( 'Putative Splice Variants', $sv_content );
689 print blue_section_html( 'Putative Alleles', $al_content );
691 #print blue_section_html('Output Sequences','<table width="100%" cellpadding="5" cellspacing="0" border="0">'.$seq_output_content.'</table>');
693 $page->footer();
695 sub get_combined_content {
696 my $alignment = shift;
697 my $align_member_nr = $alignment->get_member_nr();
698 my $ol_nr = $alignment->get_overlap_num();
700 my $id_list_ref = $alignment->get_nonhidden_member_ids;
701 my $seq_ref = $alignment->get_nopad_seqs();
702 my $align_seq_ref = $alignment->get_seqs();
703 my $ol_seq_ref = $alignment->get_overlap_seqs();
704 my $sp_ref = $alignment->get_member_species();
706 my ( $cds_ids_ref, $cds_seq_ref, $cds_align_seq_ref, $cds_ol_seq_ref ) =
707 ( undef, undef, undef, undef );
708 if ($cds_alignment) {
709 $cds_seq_ref = $cds_alignment->get_nopad_seqs();
710 $cds_align_seq_ref = $cds_alignment->get_seqs();
711 $cds_ol_seq_ref = $cds_alignment->get_overlap_seqs();
714 my %sp = %$sp_ref;
716 my $file_links = "";
717 my $cds_file_links = "";
718 my $tmp_seq_file;
719 my ( $seq_file, $cds_file, $seq_align_file );
720 ## Protein/Genomic Files:
721 if ($seq_ref) {
722 my $link;
723 ( $seq_file, $link ) =
724 build_seq_file( $id_list_ref, $seq_ref, "All Sequences (No Gaps)" );
725 $file_links .= $link . "<br />";
727 else { $file_links .= "All Sequences (No Gaps)<br>" }
729 if ($align_seq_ref) {
730 my $link;
731 ( $seq_align_file, $link ) =
732 build_seq_file( $id_list_ref, $align_seq_ref,
733 "All Alignment Sequences (Gapped)" );
734 $file_links .= $link . "<br />";
736 else { $file_links .= "All Alignment Sequences (Gapped)<br/>"; }
738 if ( $ol_nr > 0 ) {
739 ($file_links) .=
740 build_seq_file( $id_list_ref, $ol_seq_ref, "Overlap of Sequences" );
742 else { $file_links .= "Overlap of Sequences" }
744 ## CDS Files, if the CDS alignment was made from cds_file:
745 if ($cds_seq_ref) {
746 my $link;
747 ( $cds_file, $link ) =
748 build_seq_file( $id_list_ref, $cds_seq_ref,
749 "Coding Sequences (No Gaps)" );
750 $cds_file_links .= $link . "<br />";
752 if ($cds_align_seq_ref) {
753 $cds_file_links .=
754 build_seq_file( $id_list_ref, $cds_align_seq_ref,
755 "Coding Aligned Sequences (Gapped)" )
756 . "<br />";
758 if ( $ol_nr > 0 && $cds_ol_seq_ref ) {
759 $cds_file_links .=
760 build_seq_file( $id_list_ref, $cds_ol_seq_ref,
761 "Overlap of Coding Sequences" );
763 elsif ($cds_ol_seq_ref) { $cds_file_links .= "Overlap of Coding Sequences" }
765 ############################################################
766 # Summary, Files, And Alignment Parameter Changes
768 my $combined_content = "<tr><td>";
770 $combined_content .= "<table style='width:100\%;border-spacing:0px'><tr>";
772 #Summary Content
773 $combined_content .=
774 "<td style='width:50%;vertical-align:middle;'><table width='100%'>";
775 $combined_content .= "<tr><th>Type</th><td>";
776 if ( $type eq 'nt' ) {
777 $combined_content .= "Nucleotide</td></tr>";
779 else {
780 $combined_content .= "Peptide</td></tr>";
782 $combined_content .=
783 "<tr><th>No. Alignment Members</th><td>$align_member_nr</td></tr>";
784 $combined_content .=
785 "<tr><th>Selected Range</th><td>$start_value - $end_value</td></tr>";
786 $combined_content .= "<tr><th>Overlap</th><td>$ol_nr";
787 if ( $type eq 'nt' ) {
788 $combined_content .= " bp</td></tr>";
790 else {
791 $combined_content .= " aa</td></tr>";
794 if ($cds_file_links) {
795 $combined_content .=
796 "<tr><th>Protein Files:</th><td>$file_links</td></tr>";
797 $combined_content .=
798 "<tr><th>CDS Files:</th><td>$cds_file_links</td></tr>";
800 else {
801 $combined_content .=
802 "<tr><th>Sequence Files:</th><td>$file_links</td></tr>";
805 my $rerun_file;
806 ($cds_file) ? ( $rerun_file = $cds_file ) : ( $rerun_file = $seq_file );
807 my ($tmp_seq_name) = $rerun_file =~ /([^\/]+)$/;
808 my ($tmp_seq_align_name) = $seq_align_file =~ /([^\/]+)$/;
809 my $target = "_SGN_TREE_" . int( rand(10000000) );
811 my $available_seq_count = 0;
812 while ( my ( $id, $seq ) = each %$seq_ref ) {
813 $available_seq_count++ if $seq =~ /\S/;
815 if ( $available_seq_count > 1 ) {
816 $combined_content .= <<HTML;
818 <tr><td colspan=2><br />
819 <a href='index.pl?temp_file=$tmp_seq_name&maxiters=$maxiters&type=$type&format=fasta_unaligned&title=$title'>&gt;&gt; Re-run Alignment On Selected Range ($available_seq_count)</a>
820 <br />This will allow you to specify greater accuracy by increasing the number of iterations performed by Muscle<br /><br />
821 <a target="$target" href="/tools/tree_browser/index.pl?align_temp_file=$tmp_seq_align_name&align_type=pep&tree_from_align=1">
822 &gt;&gt; Calculate Tree from Selected Range ($available_seq_count)</a><br />
823 Use a quick neighbor-joining algorithm (Kimura) to calculate a tree based on the protein alignment, and analyze both in the combined alignment/tree browser.
824 </td></tr>
825 HTML
827 else {
828 $combined_content .= <<HTML;
829 <tr><td colspan=2><br />
830 Re-alignment and Tree Calculation cannot be performed, since there are less than two sequences for which data is available at this range
831 </td></tr>
832 HTML
834 $combined_content .= "</table></td>";
836 ### Modify and submit alignment parameters
837 ##########################################
839 $combined_content .=
840 "<td><form method='post' action='show_align.pl' name='show_align'>";
842 #hidden parameters
843 $combined_content .= "<input type='hidden' name='title' value='$title' />";
844 $combined_content .=
845 "<input type='hidden' name='format' value='$format' />";
846 $combined_content .=
847 "<input type='hidden' name='temp_file' value='$temp_file' />";
848 $combined_content .=
849 "<input type='hidden' name='cds_temp_file' value='$cds_temp_file_name' />";
850 $combined_content .= "<input type='hidden' name='type' value='$type' />";
852 #start and end value
853 $combined_content .=
854 "<table style='border:0px;border-spacing:0px'><tr><th>Start Value</th><th>End Value</th></tr>";
855 $combined_content .=
856 "<tr><td><input type='text' name='start_value' value='$start_value' /></td>";
857 $combined_content .=
858 "<td><input type='text' name='end_value' value='$end_value' /></td></tr></table>";
860 #Sequences to leave out
861 $combined_content .=
862 "<b>IDs of Sequences to Leave Out</b> (separated by spaces)<br>";
863 $combined_content .=
864 "<textarea name='hide_seqs' id='hide_seqs' rows='2' cols='50'>$hide_seqs</textarea><br>";
866 my $show_breakdown_checked = "";
867 $show_breakdown_checked = "CHECKED" if ($show_breakdown);
868 $combined_content .=
869 "<b><input type='checkbox' name='show_breakdown' $show_breakdown_checked> Calculate Coverage and Overlap Details</b> (Time Intensive)<br />";
871 my $show_sv_checked = "";
872 $show_sv_checked = "CHECKED" if ($show_sv);
873 $combined_content .=
874 "<b><input type='checkbox' name='show_sv' $show_sv_checked> Calculate Splice Variants and Alleles</b> (Time Intensive)";
875 $combined_content .= "<table style='border:0px;text-align:center'>";
876 $combined_content .=
877 "<tr><th>% Similarity</th><th></th><th>Shared</th><th></th><th>InDel Limit</th></tr>";
878 $combined_content .=
879 "<tr><td><input type='text' size=10 name='sv_similarity' value='$sv_similarity'></td><td width=3>&nbsp;</td>";
880 $combined_content .=
881 "<td><input type='text' size=10 name='sv_shared' value='$sv_shared'></td><td width=3>&nbsp;</td>";
882 $combined_content .=
883 "<td><input type='text' size=10 name='sv_indel' value='$sv_indel'></td></tr></table>";
885 #submit
886 $combined_content .=
887 "<input type='submit' value='Change Alignment Details' /></form></td>";
889 $combined_content .= "</tr></table>";
890 $combined_content .= "</td></tr>";
892 return $combined_content;
895 sub get_sv_al_content {
896 my $alignment = shift;
897 my ( $ob_ref, $pi_ref, $sv_sp_ref ) = $alignment->get_sv_candidates();
898 my %ob = %$ob_ref;
899 my %pi = %$pi_ref;
900 my %sv_sp = %$sv_sp_ref;
901 my ( $sv_content, $al_content ) = ( "", "" );
902 if ( keys %ob ) {
903 $sv_content =
904 "<table width='100%' cellpadding='5' cellspacing='0' border='1'>";
905 $sv_content .= "<tr>";
906 $sv_content .= "<th>Species</th>"; # if $show_species;
907 $sv_content .=
908 "<th>Sequence id</th><th>Sequence id</th><th>Overlap Bases</th><th>\% Similarity</th></tr>";
909 foreach my $first_key ( keys %ob ) {
910 foreach my $second_key ( keys %{ $ob{$first_key} } ) {
911 $sv_content .= "<tr>";
912 $sv_content .=
913 "<td>$sv_sp{$first_key}</td>"; # if $show_species;
914 $sv_content .=
915 "<td>$first_key</td><td>$second_key</td><td>$ob{$first_key}{$second_key}</td><td>$pi{$first_key}{$second_key}</td></tr>";
918 $sv_content .= "</table>";
919 $sv_content .=
920 "&nbsp;Reduce page load time by <a href=\"show_align.pl?";
921 $sv_content .= hash2param( \%PARAM, { show_sv => 0 } );
922 $sv_content .= "\">hiding this table</a>";
925 else {
926 $sv_content =
927 "None were found. Reduce page load time by <a href=\"show_align.pl?";
928 $sv_content .= hash2param( \%PARAM, { show_sv => 0 } );
929 $sv_content .= "\">hiding this table</a>";
933 ###########################################################
934 #Find alleles
936 my ( $al_ob_ref, $al_pi_ref, $al_sp_ref ) =
937 $alignment->get_allele_candidates();
938 my %al_ob = %$al_ob_ref;
939 my %al_pi = %$al_pi_ref;
940 my %al_sp = %$al_sp_ref;
942 if ( keys %al_ob ) {
943 $al_content =
944 "<table width='100%' cellpadding='5' cellspacing='0' border='1'>";
945 $al_content .= "<tr>";
946 $al_content .= "<th>Species</th>"; # if $show_species;
947 $al_content .=
948 "<th>Sequence id</th><th>Sequence id</th><th>Overlap Bases</th><th>\% Similarity</th></tr>";
949 foreach my $first_key ( keys %al_ob ) {
950 foreach my $second_key ( keys %{ $al_ob{$first_key} } ) {
951 $al_content .= "<tr>";
952 $al_content .=
953 "<td>$al_sp{$first_key}</td>"; # if $show_species;
954 $al_content .=
955 "<td>$first_key</td><td>$second_key</td><td>$al_ob{$first_key}{$second_key}</td><td>$al_pi{$first_key}{$second_key}</td></tr>";
958 $al_content .= "</table>";
959 $al_content .=
960 "&nbsp;Reduce page load time by <a href=\"show_align.pl?";
961 $al_content .= hash2param( \%PARAM, { show_sv => 0 } );
962 $al_content .= "\">hiding this table</a>";
964 else {
965 $al_content =
966 "None were found. Reduce page load time by <a href=\"show_align.pl?";
967 $al_content .= hash2param( \%PARAM, { show_sv => 0 } );
968 $al_content .= "\">hiding this table</a>";
970 return ( $sv_content, $al_content );
973 sub get_seq_sum_content {
975 my $alignment = shift;
976 ###########################################################
977 #Analyze sequences and create analysis content
978 my $member_ids_ref = $alignment->get_member_ids();
979 my $ov_score_ref = $alignment->get_all_overlap_score();
980 my $medium_ref = $alignment->get_all_medium();
981 my ( $head_ref, $tail_ref ) = $alignment->get_all_range();
982 my $ng_ref = $alignment->get_all_nogap_length();
983 my $sp_ref = $alignment->get_member_species();
985 my @member_ids = @$member_ids_ref;
986 my %ov_score = %$ov_score_ref;
987 my %medium = %$medium_ref;
988 my %head = %$head_ref;
989 my %tail = %$tail_ref;
990 my %ng = %$ng_ref;
991 my %gap = ();
992 my %species = %$sp_ref;
994 foreach ( keys %ng ) {
995 $gap{$_} = $tail{$_} - $head{$_} + 1 - $ng{$_};
997 my $show_species = 0;
998 foreach ( keys %species ) {
999 $show_species = 1 if $species{$_};
1002 my $seq_sum_content = "";
1003 $seq_sum_content .=
1004 "<table width='100%' cellpadding='5' cellspacing='0' border='1'>";
1005 $seq_sum_content .= "<tr><th>Sequence ID</th>";
1006 $seq_sum_content .= "<th>Species</th>" if $show_species;
1007 $seq_sum_content .=
1008 "<th>Coverage Range</th><th>Bases</th><th>Gaps</th><th>Medium</th><th>Overlap Score</th></tr>";
1010 foreach (@member_ids)
1011 { ##Access the align_seqs members this way, instead of by the keys of the hashes, so that the sequences are grouped together
1012 $seq_sum_content .= "<tr><td>$_</td>";
1013 $seq_sum_content .= "<td>$species{$_}</td>" if $show_species;
1014 $seq_sum_content .=
1015 "<td>$head{$_} - $tail{$_}</td><td>$ng{$_}</td><td>$gap{$_}</td><td>$medium{$_}</td><td>$ov_score{$_}</td></tr>";
1017 $seq_sum_content .= "</table>";
1018 $seq_sum_content .=
1019 "&nbsp;Reduce page load time by <a href=\"show_align.pl?";
1020 $seq_sum_content .= hash2param( \%PARAM, { show_breakdown => 0 } );
1021 $seq_sum_content .= "\">hiding this table</a>";
1022 return $seq_sum_content;
1025 sub build_gapped_cds_file {
1026 my $prot_file = shift;
1027 my $cds_file = shift;
1028 my $instream = Bio::SeqIO->new( -file => $prot_file, -format => 'fasta' );
1029 my $cds_instream =
1030 Bio::SeqIO->new( -file => $cds_file, -format => 'fasta' );
1032 my %cds_seq;
1033 my %prot_seq;
1035 while ( my $in = $instream->next_seq ) {
1036 my $id = $in->id();
1037 $prot_seq{$id} = $in->seq();
1039 while ( my $in = $cds_instream->next_seq ) {
1040 my $id = $in->id();
1041 $cds_seq{$id} = $in->seq();
1043 my $temp_fh = File::Temp->new(
1044 DIR => $PATH,
1045 SUFFIX => '.txt',
1046 UNLINK => 0,
1048 my $new_cds_temp_file = $temp_fh->filename;
1049 my $i = 0;
1050 my $st = time();
1051 while ( my ( $id, $prot_seq ) = each %prot_seq ) {
1052 $i++;
1053 my $cds = $cds_seq{$id};
1055 my $member = CXGN::Phylo::Alignment::Member->new(
1056 id => $id,
1057 seq => $cds,
1058 type => "cds",
1059 cds_nocheck => 1
1061 $member->cds_insert_gaps($prot_seq);
1062 my $newseq = $member->get_seq();
1063 print $temp_fh ">$id\n$newseq\n";
1065 my $et = time();
1066 my $rate = ( $et - $st ) / $i;
1068 #die $rate ." seconds/member\n";
1070 $temp_fh->close();
1071 return $new_cds_temp_file;
1074 sub build_seq_file {
1075 my $id_list_ref = shift;
1076 my $seq_ref = shift;
1077 my $linkname = shift;
1078 my $temp_fh = File::Temp->new(
1079 DIR => $PATH,
1080 SUFFIX => '.txt',
1081 UNLINK => 0,
1083 my $outstream = Bio::SeqIO->new( -fh => $temp_fh, -format => 'fasta' );
1084 foreach (@$id_list_ref) {
1085 my $seq = Bio::Seq->new( -seq => $seq_ref->{$_}, -id => $_ );
1086 $outstream->write_seq($seq) if $seq_ref->{$_} =~ /\w/;
1088 my $tempname = $temp_fh->filename;
1089 close $temp_fh;
1090 $tempname =~ s/$HTML_ROOT_PATH//;
1091 return ( $tempname,
1092 "<a target=\"sgn_align_file\" href=\"$tempname\">$linkname</a>" );
1095 sub input_err_page {
1096 my $input_err_page = shift;
1097 my $err_message = shift;
1098 $input_err_page->header();
1099 my ($disp_err_message) = $err_message =~ /(.*) at \/usr\/local\//;
1100 $disp_err_message = $err_message unless $disp_err_message;
1101 print page_title_html("Combined Tree/Alignment Error");
1102 print "$disp_err_message<br><br><a href=\"index.pl\">Start over</a><br>";
1103 $input_err_page->footer();
1104 exit;
1107 sub fasta_check {
1108 my ( $file, $page, $n ) = @_;
1109 my ($filename) = $file =~ /([^\/]+)$/;
1110 my $count = 0;
1111 my $maxlen = 0;
1112 my $total_length = 0;
1113 my $instream = Bio::SeqIO->new( -file => $file, -format => 'fasta' );
1114 my $entry = $instream->next_seq();
1115 unless ( $entry && $entry->id && $entry->seq ) {
1116 input_err_page( $page, "FASTA needs IDs and Sequences [$filename]" );
1118 $count++;
1119 $maxlen = $entry->length;
1120 $total_length += $entry->length;
1122 $entry = $instream->next_seq;
1123 unless ( $entry && $entry->id && $entry->seq ) {
1124 input_err_page( $page, "FASTA must have at least two valid sequences" );
1126 $count++;
1127 $maxlen = $entry->length if $entry->length > $maxlen;
1128 $total_length += $entry->length;
1130 while ( $entry = $instream->next_seq() ) {
1131 unless ( $entry->id && $entry->seq ) {
1132 input_err_page( $page, "Every entry must have ID AND sequence" );
1134 $maxlen = $entry->length if $entry->length > $maxlen;
1135 $total_length += $entry->length;
1136 $count++;
1138 return ( $count, $maxlen, $total_length );
1141 sub run_muscle {
1143 my ( $page, $run ) = @_;
1144 my @local_run_output = "";
1145 my $command_line = "";
1146 my $old_temp_file = $temp_file;
1147 if ( $run eq "local" ) {
1148 my $wd = `pwd`;
1149 chomp $wd;
1150 my $result_file = $temp_file;
1151 $result_file .= ".aligned.fasta" unless $result_file =~ /\.aligned\.fasta$/;
1152 chdir $PATH;
1154 #Within a limit of less than 50 sequences, muscle should run within 3 seconds. Wawa-woo-a!
1155 $command_line =
1156 "muscle -in $temp_file -out $result_file -maxiters $maxiters";
1157 print STDERR "Running: $command_line\n";
1158 @local_run_output = `$command_line `;
1159 $temp_file = $result_file;
1160 if ( -f $cds_temp_file ) {
1162 #The CDS temp file needs to be gapped
1163 $cds_temp_file =
1164 build_gapped_cds_file( $temp_file, $cds_temp_file );
1166 chdir $wd;
1169 ## RUN ON CLUSTER NODES ###################################################
1171 if ( $run eq "cluster" ) {
1173 my ( undef, $filename ) = tempfile(
1174 TEMPLATE => "jobXXXXXX",
1175 DIR => $CLUSTER_SHARED_TEMPDIR
1178 copy( $temp_file, $filename )
1179 || die "Can't copy $temp_file to $filename";
1181 chdir $CLUSTER_SHARED_TEMPDIR;
1183 print STDERR "Running on cluster: $command_line\n";
1185 CXGN::Tools::Run->temp_base($CLUSTER_SHARED_TEMPDIR);
1187 # generate a .req file that will indicate to the wait.pl script
1188 # that such a request has been made
1190 system("touch $filename.req");
1192 my $old_wd = `pwd`;
1194 my $job = CXGN::Tools::Run->run_cluster(
1195 "muscle",
1196 -in => "$filename",
1197 -out => "$filename.aln",
1198 -maxiters => $maxiters,
1199 { #if the next line is uncommented, we get the weird STDERR problem I was talking about --ccarpita
1200 #out_file => "$filename.aln",
1201 err_file => $filename . ".STDERR",
1202 working_dir => $CLUSTER_SHARED_TEMPDIR,
1203 temp_base => $CLUSTER_SHARED_TEMPDIR,
1204 # don't block and wait if the cluster looks full
1205 max_cluster_jobs => 1_000_000_000,
1209 my ( undef, $job_file ) =
1210 tempfile( DIR => $PATH, TEMPLATE => "object_XXXXXX" );
1212 store( $job, $job_file )
1213 or $page->message_page(
1214 "An error occurred in the serializaton of the job object");
1216 my $job_file_base = File::Basename::basename($job_file);
1218 print STDERR "SUBMITTED JOB WITH JOBID: " . $job->job_id() . "\n";
1219 my $cds_temp_filename = File::Basename::basename($cds_temp_file);
1221 # url encode the destination pass_back page.
1222 my $pass_back =
1223 "./align_viewer/show_align.pl?&amp;title=$title&amp;type=$type&amp;force_gap_cds=1&amp;cds_temp_file=$cds_temp_filename&amp;temp_file=";
1224 $pass_back =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
1225 my $message =
1226 "Running Muscle v3.6 ($SEQ_COUNT sequences), please wait ...";
1228 chdir $old_wd;
1230 $page->client_redirect(
1231 "../wait.pl?tmp_app_dir=/align_viewer&amp;job_file=$job_file_base&amp;out_file_override=$filename.aln&amp;message=$message&amp;redirect=$pass_back"
1237 sub convert_clustal_fasta {
1238 my $clustal_file = shift;
1239 my $fasta_file = shift;
1241 my $in = Bio::AlignIO->new(
1242 -file => $clustal_file,
1243 -format => 'clustalw'
1245 my $cl_out = Bio::AlignIO->new(
1246 -file => ">$fasta_file",
1247 -format => 'fasta',
1250 while ( my $aln = $in->next_aln() ) {
1251 $aln->set_displayname_flat();
1252 $cl_out->write_aln($aln);
1254 $cl_out->close();
1255 $in->close();