4 show_align.pl - a cgi-based graphical alignment viewer
8 Generates an alignment view from data submitted through the input page (index.pl in this directory). It is a web script that supports the following cgi parameters:
14 format may be either 'clustalw', 'fasta', or 'fasta_unaligned', and describes the format of the uploaded sequences. If format is 'fasta_unaligned', the sequences will be aligned using t_coffee.
18 seq_data is the actual data uploaded, in the format described by the format parameter.
22 if no seq_data is given, a temp_file name is supplied that specifies a previously parsed input. The temp_file specifies a file name, without the directory information, which will be in the default directory location, as defined in the conf object.
26 denotes the first position of the section of the alignment that should be displayed.
30 denotes the end position of the section of the alignment that should be displayed.
34 the title of the alignment, shown on the top of the page.
40 =item 'nt' for nucleotide
42 =item 'pep' for peptide
46 Not sure about this one. Defaults to peptide as default.
50 a parameter that specifies a list of sequence ids, separated by whitespace, that should not be shown in the alignment.
54 'local' means run on local host. 'cluster' means run job on cluster (if an alignment needs to be run).
60 Depends on the CXGN code base, in particular heavily on CXGN::Alignment and others, and on GD for the graphics routines and File::Temp for the temp file handling.
64 Code by Chenwei Lin (cl295@cornell.edu). Documentation and minor modifications (ability to run T-coffee from the page) by Lukas Mueller (lam87@cornell.edu).
69 use Storable qw
/store/;
72 use CXGN
::Page
::FormattingHelpers qw
/ page_title_html blue_section_html /;
73 use CXGN
::Page
::Widgets qw
/ collapser /;
74 use File
::Temp qw
/tempfile/;
75 use File
::Basename qw
/basename/;
76 use CXGN
::Phylo
::Alignment
;
78 use CXGN
::DB
::Connection
;
80 use CXGN
::Tools
::Gene
;
81 use CXGN
::Tools
::Identifiers qw
/ identifier_namespace identifier_url /;
82 use CXGN
::Tools
::Entrez
;
84 use CXGN
::Tools
::Param qw
/ hash2param /;
85 use CatalystX
::GlobalContext
'$c';
87 our $ALIGN_SEQ_COUNT_LIMIT = 500;
88 our $ALIGN_SEQ_TOTAL_LENGTH_LIMIT = 500_000
;
90 my $page = CXGN
::Page
->new( "SGN Alignment Analyzer", "Chenwei-Carpita" );
91 $page->jsan_use("CXGN.Effects");
94 $format, $seq_data, $id_data, $start_value,
95 $end_value, $title, $type, $hide_seqs,
96 $run, $sv_similarity, $sv_shared, $sv_indel,
97 $show_sv, $show_breakdown, $maxiters, $force_gap_cds,
100 = $page->get_arguments(
101 "format", "seq_data", "id_data", "start_value",
102 "end_value", "title", "type", "hide_seqs",
103 "run", "sv_similarity", "sv_shared", "sv_indel",
104 "show_sv", "show_breakdown", "maxiters", "force_gap_cds",
108 my ( $image_content, $sum_content, $seq_sum_content, $sv_content, $al_content,
111 our ( $temp_file, $cds_temp_file ) =
112 $page->get_arguments( "temp_file", "cds_temp_file" );
114 our ( $family_nr, $i_value, $family_id ) =
115 $page->get_arguments( "family_nr", "i_value", "family_id" );
117 if ( $family_id && !( $family_nr || $i_value ) ) {
118 my $dbh = $c->dbc->dbh;
121 "SELECT family_nr, i_value FROM sgn.family LEFT JOIN sgn.family_build USING (family_build_id) WHERE family_id=?"
123 $sth->execute($family_id);
124 ( $family_nr, $i_value ) = $sth->fetchrow_array();
127 $show_breakdown = 1 if $show_breakdown;
128 $show_sv = 1 if $show_sv;
129 $maxiters = 1000 if ( $maxiters > 1000 );
132 $show_domains = 1 unless ( $show_domains eq '0' );
134 #Run locally by default, cluster will run if the computation
135 #is found to be intensive (later in script)
136 unless ( $run =~ /^(local)|(cluster)$/ ) { $run = "local"; }
138 #############################################################
139 #Check if the input sequence is empty
144 || ( $family_nr && $i_value ) )
146 &input_err_page
( $page, "No sequence data provided!\n" );
149 #############################################################
150 #If a $seq_data is passed to the page, write the content of $seq_data into a temp file
152 our $HTML_ROOT_PATH = $c->config->{'basepath'};
153 our $PATH = $page->path_to( $page->tempfiles_subdir('align_viewer') );
156 or die "temp dir '$PATH' does not exist, and could not create";
159 our $CLUSTER_SHARED_TEMPDIR = $c->config->{'cluster_shared_tempdir'};
160 our $NOCLUSTER = $c->config->{'nocluster'};
162 #You can send the temp_file as a filename and not a full path, good for security
163 unless ( $temp_file =~ /\// ) {
164 $temp_file = $PATH . "/" . $temp_file;
166 unless ( $cds_temp_file =~ /\// ) {
167 $cds_temp_file = $PATH . "/" . $cds_temp_file;
169 #If a temp file is set, we are definitely CDS
170 #Fixes imagemap problem:
171 $type = "cds" if ( -f
$cds_temp_file );
174 if ( -f
$cds_temp_file && $force_gap_cds ) {
176 #The CDS temp file needs to be gapped
177 $cds_temp_file = build_gapped_cds_file
( $temp_file, $cds_temp_file );
180 # upload file, if requested... (Lukas, 2006/09/05)
182 my $upload = $page->get_upload("upload");
185 # print STDERR "Uploading file $args{upload_file}...\n";
186 if ( defined $upload ) {
187 $upload_fh = $upload->fh();
188 while (<$upload_fh>) {
194 if ( $seq_data =~ /\S+/ ) {
195 $tmp_fh = File
::Temp
->new(
200 $temp_file = $tmp_fh->filename;
201 print $tmp_fh "$seq_data";
205 # Convert the input file into fasta if it is clustal
206 if ( $format eq 'clustalw' ) {
207 my $temp_file_fasta = $temp_file . ".fasta";
208 convert_clustal_fasta
( $temp_file, $temp_file_fasta );
210 # continue with the converted file...
211 $temp_file = $temp_file_fasta;
214 elsif ( $id_data =~ /\S+/ ) {
215 unless ( -f
$temp_file ) {
216 my $tmp_fh = File
::Temp
->new(
220 $temp_file = $tmp_fh->filename;
223 $format = "fasta_unaligned";
225 open( my $temp_fh, '>', $temp_file ) or die $!;
227 my @ids = split /\s+/, $id_data;
228 foreach my $id (@ids) {
229 if ( $id =~ /^gi:\d+$/i
230 || CXGN
::Tools
::Identifiers
::is_genbank_accession
($id) )
232 ($id) = $id =~ /^gi:(\d+)$/i if $id =~ /^gi:\d+$/;
233 my $database = "protein";
234 $database = "nucleotide" if $type ne "pep";
235 my $entrez = CXGN
::Tools
::Entrez
->new( { db
=> $database } );
236 my $seq = $entrez->get_sequence($id);
237 $id = "gi:$id" if $id =~ /^\d+$/;
238 print $temp_fh ">$id\n$seq" if $seq;
239 push( @good_ids, $id ) if $seq;
243 my $gene = CXGN
::Tools
::Gene
->new($id, $c->dbc->dbh );
245 $seqtype = "protein" if $type eq "pep";
246 $seqtype = "cds" if $type eq "cds";
247 $seqtype = "genomic" if $type eq "nt";
248 my $seq = $gene->getSequence($seqtype);
249 print $temp_fh ">$id\n$seq\n" if $seq;
252 $err =~ s/\n\s*Catalyst::.+//;
260 $id_data = join " ", @good_ids;
263 elsif ( $family_nr && $i_value ) {
264 my $family_basedir = "/data/prod/public/family/$i_value";
265 unless ( -d
$family_basedir ) {
266 &input_err_page
( $page,
267 "No current family build corresponds to an i_value of $i_value" );
269 unless ( -f
"$family_basedir/pep_align/$family_nr.pep.aligned.fasta" ) {
270 &input_err_page
( $page,
271 "Alignment was not calculated for this family" );
274 unless ( -f
$temp_file ) {
275 my $tmp_fh = File
::Temp
->new(
279 $temp_file = $tmp_fh->filename;
282 $format = "fasta_aligned";
285 #disable cds until kinks worked out
286 if ( 0 && -f
"$family_basedir/cds_align/$family_nr.cds.align.fasta" ) {
289 "cp $family_basedir/cds_align/$family_nr.cds.align.fasta $temp_file";
294 "cp $family_basedir/pep_align/$family_nr.pep.aligned.fasta $temp_file";
298 #Check that the input file is a good FASTA, throw error page otherwise
299 our ( $SEQ_COUNT, $MAX_LENGTH, $TOTAL_LENGTH ) = fasta_check
( $temp_file, $page );
301 #Also, use seq_count, maxiters, and max_length parameters for Muscle to see if we
302 #should run it on the cluster, otherwise allow running on the web server
303 if ( !$NOCLUSTER && ( $maxiters * $SEQ_COUNT * $MAX_LENGTH > 30000 )
308 ##Convert cds to protein, but if cds_temp_file exists,
309 #the conversion was already done on the previous page (or before)
310 if ( $type eq "cds" && $temp_file && !( -f
$cds_temp_file ) ) {
311 my $new_temp_fh = File
::Temp
->new(
315 my $new_temp = $new_temp_fh->filename;
316 my $instream = Bio
::SeqIO
->new( -file
=> $temp_file, -format
=> 'fasta' );
317 while ( my $in = $instream->next_seq() ) {
318 my $seq = $in->seq();
320 my $len = length $seq;
323 $member = CXGN
::Phylo
::Alignment
::Member
->new(
330 $member->translate_cds();
333 input_err_page
( $page, $@
);
336 my $prot_seq = $member->{translated_protein_seq
};
337 print $new_temp_fh ">$id\n$prot_seq\n";
341 #Important transition happens here. The temp_file
342 #is now the translated protein sequence. Maybe it's
343 #gapped, maybe not, depends on the format parameter
344 #and if the CDS was gapped
345 $cds_temp_file = $temp_file;
346 $temp_file = $new_temp;
349 #############################################################
350 #Read the alignment sequence and store them in an align object
352 #First parse $hide_seqs
353 my %hidden_seqs = ();
354 my @ids = split /\s+/, $hide_seqs;
356 $hidden_seqs{$_} = 1;
359 # align sequences if they are in the unaligned fasta format
360 # (added by Lukas, 9/5/2006)
361 if ( $format eq "fasta_unaligned" ) {
363 if ( $SEQ_COUNT > $ALIGN_SEQ_COUNT_LIMIT ) {
365 "<center><span style='font-size:1.2em;'>You submitted <b>$SEQ_COUNT</b> sequences. Due to limited computational resources, we only <br>allow <b>$ALIGN_SEQ_COUNT_LIMIT or fewer</b> sequences to be aligned through our web interface.</span>";
367 "<br /><br /><span style='font-size:1.1em'>You can download and run your own copy of the <br />alignment software at <a href='http://www.drive5.com/muscle/'>www.drive5.com/muscle/</a></span></center>";
368 input_err_page
( $page, $message );
370 elsif( $TOTAL_LENGTH > $ALIGN_SEQ_TOTAL_LENGTH_LIMIT ) {
371 input_err_page
( $page, <<"" );
372 <center
><span style
='font-size:1.2em;'>You submitted
<b
>$TOTAL_LENGTH</b> bases of sequence. Due to limited computational resources, we only <br>allow <b>$ALIGN_SEQ_TOTAL_LENGTH_LIMIT or fewer</b
> sequences to be aligned through
our web interface
.</span
>
373 <br
/><br /><span style
='font-size:1.1em'>You can download
and run your own copy of the
<br
/>alignment software at <a href='http://www.drive5.com/muscle
/'>www.drive5.com/muscle
/</a></span></center
>
377 run_muscle
( $page, $run );
380 #die "\n$temp_file\n$cds_temp_file";
382 #Let's see the toolbar while we're just building the image
384 #######################################
385 ## Construct Alignment ################
386 #######################################
388 my $disp_type = $type;
389 $disp_type = "pep" if ( $disp_type eq "cds" );
391 my $alignment = CXGN
::Phylo
::Alignment
->new(
397 $alignment->{page
} = $page;
401 $sv_similarity = 95 unless ($sv_similarity);
402 $sv_indel = 4 unless ($sv_indel);
403 $sv_shared = 20 unless ($sv_shared);
405 $alignment->set_sv_criteria( $sv_shared, $sv_similarity, $sv_indel );
408 $instream = Bio
::SeqIO
->new( -file
=> $temp_file, -format
=> 'fasta' );
409 while ( my $in = $instream->next_seq() ) {
410 my $seq = $in->seq();
411 my ( $id, $species ) = $in->id() =~ m/(.+)\/(.+)/;
413 ( !$species ) and $species = ();
417 if ( exists $hidden_seqs{$id} )
418 ; #skip the sequence if it is in the hide_seq
425 $member = CXGN
::Phylo
::Alignment
::Member
->new(
434 $alignment->add_member($member);
436 $@
and input_err_page
( $page, $@
);
438 my ( $gene, $annot, $sigpos, $cleavage );
440 $gene = CXGN
::Tools
::Gene
->new($id);
441 $annot = $gene->getAnnotation;
442 $annot = HTML
::Entities
::encode
($annot);
445 $member->set_tooltip($annot);
448 $member->show_remove(); #draw X for removal
449 $member->show_link();
452 #Set the start value and end value#######
453 ( !$start_value ) and $start_value = 1;
454 $alignment->set_start_value($start_value);
455 ( !$end_value ) and $end_value = $len;
456 ( $len < $end_value ) and $end_value = $len;
457 $alignment->set_end_value($end_value);
459 ## Domain Highlighting ########################
460 $alignment->highlight_domains() if $show_domains;
462 ### If cds_temp exists, create a $cds_alignment object, for producing CDS files
465 if ( -f
$cds_temp_file && $type eq "cds" ) {
466 $cds_alignment = CXGN
::Phylo
::Alignment
->new(
471 Bio
::SeqIO
->new( -file
=> $cds_temp_file, -format
=> 'fasta' );
472 while ( my $in = $instream->next_seq() ) {
473 my $seq = $in->seq();
474 my ( $id, $species ) = $in->id() =~ m/(.+)\/(.+)/;
476 ( !$species ) and $species = ();
480 if ( exists $hidden_seqs{$id} )
481 ; #skip the sequence if it is in the hide_seq
484 my $len = length $seq;
487 $member = CXGN
::Phylo
::Alignment
::Member
->new(
496 $cds_alignment->add_member($member);
498 $@
and input_err_page
( $page, $@
);
499 $member->show_remove(); #draw X for removal
500 $member->show_link();
502 $cds_alignment->set_start_value( $start_value * 3 - 2 );
503 $cds_alignment->set_end_value( $end_value * 3 );
506 ### Do page header AFTER all eval{} statements
509 ############################################################
510 #Draw alignment image, unless we are at the original parameters and we
511 #have already drawn the most computationally intensive image
513 my $tmp_image = File
::Temp
->new(
520 $alignment->render_png_file( $tmp_image, 'c' );
522 $tmp_image =~ s/$HTML_ROOT_PATH//;
524 #Important for Ruler Image-Map Generation:
525 our ($temp_file_name) = $temp_file =~ /([^\/]+)$/;
526 our ($cds_temp_file_name) = $cds_temp_file =~ /([^\/]+)$/;
528 #Parameter hash for imagemap and other links on the page
531 start_value
=> $start_value,
532 end_value
=> $end_value,
535 hide_seqs
=> $hide_seqs,
536 sv_similarity
=> $sv_similarity,
537 sv_shared
=> $sv_shared,
538 sv_indel
=> $sv_indel,
540 show_breakdown
=> $show_breakdown,
541 maxiters
=> $maxiters,
542 force_gap_cds
=> $force_gap_cds,
543 temp_file
=> $temp_file_name,
544 cds_temp_file
=> $cds_temp_file_name,
545 show_domains
=> $show_domains
548 $alignment->set_fasta_temp_file($temp_file_name);
549 $alignment->set_cds_temp_file($cds_temp_file_name);
550 $alignment->set_param( \
%PARAM );
552 $image_content = "<tr><td align=\"center\">";
555 "<img border=0 src='$tmp_image' alt='Alignment Image' usemap='#align_image_map'/>";
556 $image_content .= $alignment->get_image_map( $show_sv, $show_breakdown );
558 "<div style=\"width:100%;text-align:left;padding-left:40px;\">";
561 "<a href=\"show_align.pl?"
562 . hash2param
( \
%PARAM, { show_domains
=> 0 } )
563 . "\">Hide Domain Information</a>";
567 "<a href=\"show_align.pl?"
568 . hash2param
( \
%PARAM, { show_domains
=> 1 } )
569 . "\">Show Domain Information</a>";
571 $image_content .= "</div>";
573 if ( $start_value == 1 && $end_value == $len ) {
575 "<b>Click on a region of the image to zoom in.</b><br />You must submit the form below to change alignment parameters or remove members from analysis.";
578 $image_content .= "<a href='show_align.pl?";
580 hash2param
( \
%PARAM, { start_value
=> 1, end_value
=> "" } );
581 $image_content .= "'><< See Whole Alignment >></a>";
584 $image_content .= "</td></tr>";
586 my $term_help_content = <<HTML;
587 <table width="100%" summary="" cellpadding="3" cellspacing="3" border="0"><tr><td><p style='font-size:1.1em'>SGN Alignment Viewer analyzes and provides useful information that can be used to optimize the overlapping region of the alignment.</p></td></tr></table>
588 <table width="100%" summary="" cellpadding="3" cellspacing="0" border="0">
589 <tr style='background-color:#ddf'><th valign='top'>Coverage Range</th>
590 <td>The distance between the first and last element of the un-gapped sequence in the current alignment.</td></tr>
591 <tr><th valign='top'>Bases</th><td>Number of "real sequence characters" (non gap) in the sequence.</td></tr>
592 <tr style='background-color:#ddf'><th valign='top'>Gaps</th><td>Number of gaps in the aligment</td></tr>
593 <tr><th valign='top'>Medium</th><td>The middle position of all non-gap characters of the alignment sequence</td></tr>
594 <tr style='background-color:#ddf'><th valign='top'>Putative Splice Variant Pairs</th><td>Sequence pairs that are from the same species, that, by default <ol>
595 <li>Share at least 60 bases if they are nucleotides or 20 bases if they are peptides.
596 <li>Share at least 95% sequence similarity in the overlapping region.
597 <li>Have insertion/deletion of at least 4 amino acids or 12 nucleotides in their common region.</ol> </td></tr>
598 <tr><th valign='top'>Putative Alleles</th><td>Sequence pairs that are from the same species, that, by default
600 <li>Share at least 60 bases if they are nucleotides or 20 bases if they are peptides.
601 <li>Share at least 95% sequence similarity in the overlapping region.
602 <li>Have insertion/deletion of <b>not more than </b> 4 amino acids or 12 nucleotides in their common region.
605 <tr style='background-color:#ddf'><th valign='top'>Overlap Score</th><td>An indication of how well the sequence overlaps with other members in the alignment. If a non-gap character of a sequence overlaps with a character in another alignment sequence, it gets a point.<br><br>Sometimes a few sequences share very little overlap with the rest, significantly reducing the overall overlapping sequence of the alignment. Usually these sequences are short and will not help with the understanding of overall alignment. We suggest the user leaves out these sequences before further analysis of the aligment sequence. </td></tr></table>
608 blue_section_html
( "<em>Help With Terms</em>", $term_help_content );
610 my ( $help_link, $hidden_help_content ) = collapser
(
612 id
=> "alignment_help_content",
613 alt_href
=> "/about/align_term.pl",
614 alt_target
=> "align_about",
615 linktext
=> "(-) Help me understand terms on this page",
616 hide_state_linktext
=> "(?) Help me understand terms on this page",
617 linkstyle
=> "font-size:1.1em",
619 content
=> $term_help_content,
623 ##########################################################
624 #Write sequence output to temp files and create links to them
626 ###########################################################
627 #Get summary information
629 $combined_content = get_combined_content
($alignment);
631 ############################################################
632 #Page printout: Nav Section
635 "<a href=\"index.pl?id_data=$id_data\" style=\"margin-left:10px;font-size:1.1em\"><< Start over</a>";
637 $disp_title = "View and Analyze Alignment of <em>$title</em>" if ($title);
638 $disp_title ||= "View and Analyze Alignment";
639 print page_title_html
("$disp_title");
641 print " " . $help_link . "<br><br>";
642 print $hidden_help_content; #this is actually a blue_section_html, but the whole thing is hidden
644 my $image_title = "Alignment Image";
645 $image_title .= ": CDS Translated" if ( $type eq "cds" );
646 print blue_section_html
( $image_title,
647 '<table width="100%" cellpadding="0" cellspacing="0" border="0">'
651 print blue_section_html
(
652 'Summary, Files, Change Parameters',
653 '<table width="100%" cellpadding="5" cellspacing="0" border="0">'
658 if ($show_breakdown) {
659 $seq_sum_content = get_seq_sum_content
($alignment);
662 $seq_sum_content .= "Not Calculated ( <a href=\"show_align.pl?";
663 $seq_sum_content .= hash2param
( \
%PARAM, { show_breakdown
=> 1 } );
665 "\">showing this table</a> will increase page load time )";
668 ###########################################################
669 #Find splice variants
672 ( $sv_content, $al_content ) = get_sv_al_content
($alignment);
675 $al_content = "Not Calculated ( <a href=\"show_align.pl?";
676 $al_content .= hash2param
( \
%PARAM, { show_sv
=> 1 } );
677 $al_content .= "\">showing this table</a> will increase page load time )";
678 $sv_content = "Not Calculated ( <a href=\"show_align.pl?";
679 $sv_content .= hash2param
( \
%PARAM, { show_sv
=> 1 } );
680 $sv_content .= "\">showing this table</a> will increase page load time )";
683 ###########################################################
684 ##Page Printout: Breakdown and Alleles
685 print blue_section_html
( 'Alignment Breakdown', $seq_sum_content );
687 print blue_section_html
( 'Putative Splice Variants', $sv_content );
689 print blue_section_html
( 'Putative Alleles', $al_content );
691 #print blue_section_html('Output Sequences','<table width="100%" cellpadding="5" cellspacing="0" border="0">'.$seq_output_content.'</table>');
695 sub get_combined_content
{
696 my $alignment = shift;
697 my $align_member_nr = $alignment->get_member_nr();
698 my $ol_nr = $alignment->get_overlap_num();
700 my $id_list_ref = $alignment->get_nonhidden_member_ids;
701 my $seq_ref = $alignment->get_nopad_seqs();
702 my $align_seq_ref = $alignment->get_seqs();
703 my $ol_seq_ref = $alignment->get_overlap_seqs();
704 my $sp_ref = $alignment->get_member_species();
706 my ( $cds_ids_ref, $cds_seq_ref, $cds_align_seq_ref, $cds_ol_seq_ref ) =
707 ( undef, undef, undef, undef );
708 if ($cds_alignment) {
709 $cds_seq_ref = $cds_alignment->get_nopad_seqs();
710 $cds_align_seq_ref = $cds_alignment->get_seqs();
711 $cds_ol_seq_ref = $cds_alignment->get_overlap_seqs();
717 my $cds_file_links = "";
719 my ( $seq_file, $cds_file, $seq_align_file );
720 ## Protein/Genomic Files:
723 ( $seq_file, $link ) =
724 build_seq_file
( $id_list_ref, $seq_ref, "All Sequences (No Gaps)" );
725 $file_links .= $link . "<br />";
727 else { $file_links .= "All Sequences (No Gaps)<br>" }
729 if ($align_seq_ref) {
731 ( $seq_align_file, $link ) =
732 build_seq_file
( $id_list_ref, $align_seq_ref,
733 "All Alignment Sequences (Gapped)" );
734 $file_links .= $link . "<br />";
736 else { $file_links .= "All Alignment Sequences (Gapped)<br/>"; }
740 build_seq_file
( $id_list_ref, $ol_seq_ref, "Overlap of Sequences" );
742 else { $file_links .= "Overlap of Sequences" }
744 ## CDS Files, if the CDS alignment was made from cds_file:
747 ( $cds_file, $link ) =
748 build_seq_file
( $id_list_ref, $cds_seq_ref,
749 "Coding Sequences (No Gaps)" );
750 $cds_file_links .= $link . "<br />";
752 if ($cds_align_seq_ref) {
754 build_seq_file
( $id_list_ref, $cds_align_seq_ref,
755 "Coding Aligned Sequences (Gapped)" )
758 if ( $ol_nr > 0 && $cds_ol_seq_ref ) {
760 build_seq_file
( $id_list_ref, $cds_ol_seq_ref,
761 "Overlap of Coding Sequences" );
763 elsif ($cds_ol_seq_ref) { $cds_file_links .= "Overlap of Coding Sequences" }
765 ############################################################
766 # Summary, Files, And Alignment Parameter Changes
768 my $combined_content = "<tr><td>";
770 $combined_content .= "<table style='width:100\%;border-spacing:0px'><tr>";
774 "<td style='width:50%;vertical-align:middle;'><table width='100%'>";
775 $combined_content .= "<tr><th>Type</th><td>";
776 if ( $type eq 'nt' ) {
777 $combined_content .= "Nucleotide</td></tr>";
780 $combined_content .= "Peptide</td></tr>";
783 "<tr><th>No. Alignment Members</th><td>$align_member_nr</td></tr>";
785 "<tr><th>Selected Range</th><td>$start_value - $end_value</td></tr>";
786 $combined_content .= "<tr><th>Overlap</th><td>$ol_nr";
787 if ( $type eq 'nt' ) {
788 $combined_content .= " bp</td></tr>";
791 $combined_content .= " aa</td></tr>";
794 if ($cds_file_links) {
796 "<tr><th>Protein Files:</th><td>$file_links</td></tr>";
798 "<tr><th>CDS Files:</th><td>$cds_file_links</td></tr>";
802 "<tr><th>Sequence Files:</th><td>$file_links</td></tr>";
806 ($cds_file) ?
( $rerun_file = $cds_file ) : ( $rerun_file = $seq_file );
807 my ($tmp_seq_name) = $rerun_file =~ /([^\/]+)$/;
808 my ($tmp_seq_align_name) = $seq_align_file =~ /([^\/]+)$/;
809 my $target = "_SGN_TREE_" . int( rand(10000000) );
811 my $available_seq_count = 0;
812 while ( my ( $id, $seq ) = each %$seq_ref ) {
813 $available_seq_count++ if $seq =~ /\S/;
815 if ( $available_seq_count > 1 ) {
816 $combined_content .= <<HTML;
818 <tr><td colspan=2><br />
819 <a href='index.pl?temp_file=$tmp_seq_name&maxiters=$maxiters&type=$type&format=fasta_unaligned&title=$title'>>> Re-run Alignment On Selected Range ($available_seq_count)</a>
820 <br />This will allow you to specify greater accuracy by increasing the number of iterations performed by Muscle<br /><br />
821 <a target="$target" href="/tools/tree_browser/index.pl?align_temp_file=$tmp_seq_align_name&align_type=pep&tree_from_align=1">
822 >> Calculate Tree from Selected Range ($available_seq_count)</a><br />
823 Use a quick neighbor-joining algorithm (Kimura) to calculate a tree based on the protein alignment, and analyze both in the combined alignment/tree browser.
828 $combined_content .= <<HTML;
829 <tr><td colspan=2><br />
830 Re-alignment and Tree Calculation cannot be performed, since there are less than two sequences for which data is available at this range
834 $combined_content .= "</table></td>";
836 ### Modify and submit alignment parameters
837 ##########################################
840 "<td><form method='post' action='show_align.pl' name='show_align'>";
843 $combined_content .= "<input type='hidden' name='title' value='$title' />";
845 "<input type='hidden' name='format' value='$format' />";
847 "<input type='hidden' name='temp_file' value='$temp_file' />";
849 "<input type='hidden' name='cds_temp_file' value='$cds_temp_file_name' />";
850 $combined_content .= "<input type='hidden' name='type' value='$type' />";
854 "<table style='border:0px;border-spacing:0px'><tr><th>Start Value</th><th>End Value</th></tr>";
856 "<tr><td><input type='text' name='start_value' value='$start_value' /></td>";
858 "<td><input type='text' name='end_value' value='$end_value' /></td></tr></table>";
860 #Sequences to leave out
862 "<b>IDs of Sequences to Leave Out</b> (separated by spaces)<br>";
864 "<textarea name='hide_seqs' id='hide_seqs' rows='2' cols='50'>$hide_seqs</textarea><br>";
866 my $show_breakdown_checked = "";
867 $show_breakdown_checked = "CHECKED" if ($show_breakdown);
869 "<b><input type='checkbox' name='show_breakdown' $show_breakdown_checked> Calculate Coverage and Overlap Details</b> (Time Intensive)<br />";
871 my $show_sv_checked = "";
872 $show_sv_checked = "CHECKED" if ($show_sv);
874 "<b><input type='checkbox' name='show_sv' $show_sv_checked> Calculate Splice Variants and Alleles</b> (Time Intensive)";
875 $combined_content .= "<table style='border:0px;text-align:center'>";
877 "<tr><th>% Similarity</th><th></th><th>Shared</th><th></th><th>InDel Limit</th></tr>";
879 "<tr><td><input type='text' size=10 name='sv_similarity' value='$sv_similarity'></td><td width=3> </td>";
881 "<td><input type='text' size=10 name='sv_shared' value='$sv_shared'></td><td width=3> </td>";
883 "<td><input type='text' size=10 name='sv_indel' value='$sv_indel'></td></tr></table>";
887 "<input type='submit' value='Change Alignment Details' /></form></td>";
889 $combined_content .= "</tr></table>";
890 $combined_content .= "</td></tr>";
892 return $combined_content;
895 sub get_sv_al_content
{
896 my $alignment = shift;
897 my ( $ob_ref, $pi_ref, $sv_sp_ref ) = $alignment->get_sv_candidates();
900 my %sv_sp = %$sv_sp_ref;
901 my ( $sv_content, $al_content ) = ( "", "" );
904 "<table width='100%' cellpadding='5' cellspacing='0' border='1'>";
905 $sv_content .= "<tr>";
906 $sv_content .= "<th>Species</th>"; # if $show_species;
908 "<th>Sequence id</th><th>Sequence id</th><th>Overlap Bases</th><th>\% Similarity</th></tr>";
909 foreach my $first_key ( keys %ob ) {
910 foreach my $second_key ( keys %{ $ob{$first_key} } ) {
911 $sv_content .= "<tr>";
913 "<td>$sv_sp{$first_key}</td>"; # if $show_species;
915 "<td>$first_key</td><td>$second_key</td><td>$ob{$first_key}{$second_key}</td><td>$pi{$first_key}{$second_key}</td></tr>";
918 $sv_content .= "</table>";
920 " Reduce page load time by <a href=\"show_align.pl?";
921 $sv_content .= hash2param
( \
%PARAM, { show_sv
=> 0 } );
922 $sv_content .= "\">hiding this table</a>";
927 "None were found. Reduce page load time by <a href=\"show_align.pl?";
928 $sv_content .= hash2param
( \
%PARAM, { show_sv
=> 0 } );
929 $sv_content .= "\">hiding this table</a>";
933 ###########################################################
936 my ( $al_ob_ref, $al_pi_ref, $al_sp_ref ) =
937 $alignment->get_allele_candidates();
938 my %al_ob = %$al_ob_ref;
939 my %al_pi = %$al_pi_ref;
940 my %al_sp = %$al_sp_ref;
944 "<table width='100%' cellpadding='5' cellspacing='0' border='1'>";
945 $al_content .= "<tr>";
946 $al_content .= "<th>Species</th>"; # if $show_species;
948 "<th>Sequence id</th><th>Sequence id</th><th>Overlap Bases</th><th>\% Similarity</th></tr>";
949 foreach my $first_key ( keys %al_ob ) {
950 foreach my $second_key ( keys %{ $al_ob{$first_key} } ) {
951 $al_content .= "<tr>";
953 "<td>$al_sp{$first_key}</td>"; # if $show_species;
955 "<td>$first_key</td><td>$second_key</td><td>$al_ob{$first_key}{$second_key}</td><td>$al_pi{$first_key}{$second_key}</td></tr>";
958 $al_content .= "</table>";
960 " Reduce page load time by <a href=\"show_align.pl?";
961 $al_content .= hash2param
( \
%PARAM, { show_sv
=> 0 } );
962 $al_content .= "\">hiding this table</a>";
966 "None were found. Reduce page load time by <a href=\"show_align.pl?";
967 $al_content .= hash2param
( \
%PARAM, { show_sv
=> 0 } );
968 $al_content .= "\">hiding this table</a>";
970 return ( $sv_content, $al_content );
973 sub get_seq_sum_content
{
975 my $alignment = shift;
976 ###########################################################
977 #Analyze sequences and create analysis content
978 my $member_ids_ref = $alignment->get_member_ids();
979 my $ov_score_ref = $alignment->get_all_overlap_score();
980 my $medium_ref = $alignment->get_all_medium();
981 my ( $head_ref, $tail_ref ) = $alignment->get_all_range();
982 my $ng_ref = $alignment->get_all_nogap_length();
983 my $sp_ref = $alignment->get_member_species();
985 my @member_ids = @
$member_ids_ref;
986 my %ov_score = %$ov_score_ref;
987 my %medium = %$medium_ref;
988 my %head = %$head_ref;
989 my %tail = %$tail_ref;
992 my %species = %$sp_ref;
994 foreach ( keys %ng ) {
995 $gap{$_} = $tail{$_} - $head{$_} + 1 - $ng{$_};
997 my $show_species = 0;
998 foreach ( keys %species ) {
999 $show_species = 1 if $species{$_};
1002 my $seq_sum_content = "";
1004 "<table width='100%' cellpadding='5' cellspacing='0' border='1'>";
1005 $seq_sum_content .= "<tr><th>Sequence ID</th>";
1006 $seq_sum_content .= "<th>Species</th>" if $show_species;
1008 "<th>Coverage Range</th><th>Bases</th><th>Gaps</th><th>Medium</th><th>Overlap Score</th></tr>";
1010 foreach (@member_ids)
1011 { ##Access the align_seqs members this way, instead of by the keys of the hashes, so that the sequences are grouped together
1012 $seq_sum_content .= "<tr><td>$_</td>";
1013 $seq_sum_content .= "<td>$species{$_}</td>" if $show_species;
1015 "<td>$head{$_} - $tail{$_}</td><td>$ng{$_}</td><td>$gap{$_}</td><td>$medium{$_}</td><td>$ov_score{$_}</td></tr>";
1017 $seq_sum_content .= "</table>";
1019 " Reduce page load time by <a href=\"show_align.pl?";
1020 $seq_sum_content .= hash2param
( \
%PARAM, { show_breakdown
=> 0 } );
1021 $seq_sum_content .= "\">hiding this table</a>";
1022 return $seq_sum_content;
1025 sub build_gapped_cds_file
{
1026 my $prot_file = shift;
1027 my $cds_file = shift;
1028 my $instream = Bio
::SeqIO
->new( -file
=> $prot_file, -format
=> 'fasta' );
1030 Bio
::SeqIO
->new( -file
=> $cds_file, -format
=> 'fasta' );
1035 while ( my $in = $instream->next_seq ) {
1037 $prot_seq{$id} = $in->seq();
1039 while ( my $in = $cds_instream->next_seq ) {
1041 $cds_seq{$id} = $in->seq();
1043 my $temp_fh = File
::Temp
->new(
1048 my $new_cds_temp_file = $temp_fh->filename;
1051 while ( my ( $id, $prot_seq ) = each %prot_seq ) {
1053 my $cds = $cds_seq{$id};
1055 my $member = CXGN
::Phylo
::Alignment
::Member
->new(
1061 $member->cds_insert_gaps($prot_seq);
1062 my $newseq = $member->get_seq();
1063 print $temp_fh ">$id\n$newseq\n";
1066 my $rate = ( $et - $st ) / $i;
1068 #die $rate ." seconds/member\n";
1071 return $new_cds_temp_file;
1074 sub build_seq_file
{
1075 my $id_list_ref = shift;
1076 my $seq_ref = shift;
1077 my $linkname = shift;
1078 my $temp_fh = File
::Temp
->new(
1083 my $outstream = Bio
::SeqIO
->new( -fh
=> $temp_fh, -format
=> 'fasta' );
1084 foreach (@
$id_list_ref) {
1085 my $seq = Bio
::Seq
->new( -seq
=> $seq_ref->{$_}, -id
=> $_ );
1086 $outstream->write_seq($seq) if $seq_ref->{$_} =~ /\w/;
1088 my $tempname = $temp_fh->filename;
1090 $tempname =~ s/$HTML_ROOT_PATH//;
1092 "<a target=\"sgn_align_file\" href=\"$tempname\">$linkname</a>" );
1095 sub input_err_page
{
1096 my $input_err_page = shift;
1097 my $err_message = shift;
1098 $input_err_page->header();
1099 my ($disp_err_message) = $err_message =~ /(.*) at \/usr\
/local\//;
1100 $disp_err_message = $err_message unless $disp_err_message;
1101 print page_title_html
("Combined Tree/Alignment Error");
1102 print "$disp_err_message<br><br><a href=\"index.pl\">Start over</a><br>";
1103 $input_err_page->footer();
1108 my ( $file, $page, $n ) = @_;
1109 my ($filename) = $file =~ /([^\/]+)$/;
1112 my $total_length = 0;
1113 my $instream = Bio
::SeqIO
->new( -file
=> $file, -format
=> 'fasta' );
1114 my $entry = $instream->next_seq();
1115 unless ( $entry && $entry->id && $entry->seq ) {
1116 input_err_page
( $page, "FASTA needs IDs and Sequences [$filename]" );
1119 $maxlen = $entry->length;
1120 $total_length += $entry->length;
1122 $entry = $instream->next_seq;
1123 unless ( $entry && $entry->id && $entry->seq ) {
1124 input_err_page
( $page, "FASTA must have at least two valid sequences" );
1127 $maxlen = $entry->length if $entry->length > $maxlen;
1128 $total_length += $entry->length;
1130 while ( $entry = $instream->next_seq() ) {
1131 unless ( $entry->id && $entry->seq ) {
1132 input_err_page
( $page, "Every entry must have ID AND sequence" );
1134 $maxlen = $entry->length if $entry->length > $maxlen;
1135 $total_length += $entry->length;
1138 return ( $count, $maxlen, $total_length );
1142 my ( $page, $run ) = @_;
1143 my @local_run_output = "";
1144 my $command_line = "";
1145 my $old_temp_file = $temp_file;
1146 if ( $run eq "local" ) {
1149 my $result_file = $temp_file;
1150 $result_file .= ".aligned.fasta" unless $result_file =~ /\.aligned\.fasta$/;
1153 #Within a limit of less than 50 sequences, muscle should run within 3 seconds. Wawa-woo-a!
1155 "muscle -in $temp_file -out $result_file -maxiters $maxiters";
1156 print STDERR
"Running: $command_line\n";
1157 @local_run_output = `$command_line `;
1158 $temp_file = $result_file;
1159 if ( -f
$cds_temp_file ) {
1161 #The CDS temp file needs to be gapped
1163 build_gapped_cds_file
( $temp_file, $cds_temp_file );
1168 ## RUN ON CLUSTER NODES ###################################################
1170 if ( $run eq "cluster" ) {
1172 my ( undef, $filename ) = tempfile
(
1173 TEMPLATE
=> "jobXXXXXX",
1174 DIR
=> $CLUSTER_SHARED_TEMPDIR
1177 copy
( $temp_file, $filename )
1178 || die "Cannot copy $temp_file to $filename";
1180 chdir $CLUSTER_SHARED_TEMPDIR;
1182 print STDERR
"Running on cluster: $command_line\n";
1184 #CXGN::Tools::Run->temp_base($CLUSTER_SHARED_TEMPDIR);
1186 # generate a .req file that will indicate to the wait.pl script
1187 # that such a request has been made
1189 system("touch $filename.req");
1193 # my $job = CXGN::Tools::Run->run_cluster(
1195 # -in => "$filename",
1196 # -out => "$filename.aln",
1197 # -maxiters => $maxiters,
1198 # { #if the next line is uncommented, we get the weird STDERR problem I was talking about --ccarpita
1199 # #out_file => "$filename.aln",
1200 # err_file => $filename . ".STDERR",
1201 # working_dir => $CLUSTER_SHARED_TEMPDIR,
1202 # temp_base => $CLUSTER_SHARED_TEMPDIR,
1203 # # do not block and wait if the cluster looks full
1204 # max_cluster_jobs => 1_000_000_000,
1208 print STDERR
"Running muscle with $filename as input, $filename.aln as output\n";
1209 system("muscle", "-in", $filename, "-out", "$filename.aln", "-maxiters", $maxiters);
1211 print STDERR
"TEMPFILE = $temp_file\n";
1213 print STDERR
"Copying $filename.aln TO $CLUSTER_SHARED_TEMPDIR\n";
1214 copy
("$filename.aln", "$PATH");
1216 # my ( undef, $job_file ) =
1217 # tempfile( DIR => $PATH, TEMPLATE => "object_XXXXXX" );
1219 # store( $job, $job_file )
1220 # or $page->message_page(
1221 # "An error occurred in the serializaton of the job object");
1223 my $job_file_base = File
::Basename
::basename
("$filename.aln");
1225 # print STDERR "SUBMITTED JOB WITH JOBID: " . $job->job_id() . "\n";
1226 my $cds_temp_filename = File
::Basename
::basename
($cds_temp_file);
1228 # # url encode the destination pass_back page.
1230 "show_align.pl?title=$title&type=$type&force_gap_cds=1&cds_temp_file=$cds_temp_filename&temp_file=$job_file_base";
1231 #$pass_back =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
1233 # "Running Muscle v3.6 ($SEQ_COUNT sequences), please wait ...";
1237 print STDERR
"Now redirecting...\n";
1238 $page->client_redirect($pass_back);
1244 sub convert_clustal_fasta
{
1245 my $clustal_file = shift;
1246 my $fasta_file = shift;
1248 my $in = Bio
::AlignIO
->new(
1249 -file
=> $clustal_file,
1250 -format
=> 'clustalw'
1252 my $cl_out = Bio
::AlignIO
->new(
1253 -file
=> ">$fasta_file",
1257 while ( my $aln = $in->next_aln() ) {
1258 $aln->set_displayname_flat();
1259 $cl_out->write_aln($aln);