upload fieldbook from manage phenotyping
[sgn.git] / cgi-bin / search / family_align.pl
blobe7935689114eadfd6715bd851f09a14d473b6ce7
1 #!/usr/bin/perl -w
2 use strict;
3 use warnings;
4 use CXGN::Page;
5 use CXGN::Page::FormattingHelpers qw/ page_title_html
6 blue_section_html /;
7 use CXGN::DB::Connection;
8 use CXGN::Alignment;
9 use File::Temp;
10 use Bio::Seq;
11 use Bio::SeqIO;
12 use CatalystX::GlobalContext '$c';
14 our $page = CXGN::Page->new( "SGN Gene Family Alignment", "Chenwei Lin");
16 our $family_size_limit = 100;
18 my ($family_id) = $page->get_arguments("family_id");
19 if ($family_id eq ""){
20 empty_search($page);
23 ##Connect to database and define queries
25 my $dbh = CXGN::DB::Connection->new();
27 my $at_family_member_q = $dbh->prepare("select sequence_name from family_member where family_id = ? and database_name like 'Arabidopsis'");
29 my $sgn_family_member_q = $dbh->prepare("select unigene_id, family_member.cds_id, organism_group_id from family_member left join cds using (cds_id) where family_id = ? and (database_name like'SGN' or database_name like 'CGN')");
31 my $organism_group_q = $dbh->prepare("select comment from groups where group_id = ?");
33 my $sgn_align_seq_q = $dbh->prepare("select alignment_seq from family_member left join cds using (cds_id) where unigene_id = ? and family_id = ?");
35 my $at_align_seq_q = $dbh->prepare("select alignment_seq from family_member where sequence_name like ? and database_name like 'Arabidopsis' and family_id = ?");
37 my $family_tree_log_q = $dbh->prepare("select tree_taxa_number, tree_overlap_length, tree_log_file_location from family where family_id = ? and tree_taxa_number > 3");
39 my $family_tree_nw_q = $dbh->prepare("select tree_nr, newick_unigene from family_tree where family_id = ?");
41 ##Variables for each display sections. There are five sections: align_image, summary, splice variants, sequence_analyisis, sequence_output.
42 my ($sum_content, $align_content, $sv_content, $al_content, $seq_sum_content, $seq_output_content, $gene_tree_content);
46 ####################################################
47 #Get family member information.
49 my @at_member = ();
50 my %sgn_member = ();
51 my @all_sgn_member = ();
52 my %group_comment = ();
54 ##Retrieve Arabidoposis family members
56 $at_family_member_q->execute($family_id);
57 while( my ($member) = $at_family_member_q->fetchrow_array()){
58 my $locus = substr ($member, 0,9);
59 push @at_member, $member;
62 ##Retrieve SGN family members, group them by organisms
64 $sgn_family_member_q->execute($family_id);
65 while( my ($member, $cds_id, $organism_group_id) = $sgn_family_member_q->fetchrow_array()){
66 push @{$sgn_member{$organism_group_id}}, $member;
67 push @all_sgn_member, $cds_id;
70 foreach (keys %sgn_member){
71 $organism_group_q->execute($_);
72 if (my ($comment) = $organism_group_q->fetchrow_array()){
73 $group_comment{$_} = $comment;
77 ######################################################
78 #Retrieve alignment sequences and store them in a align object $family_align
79 my $family_align;
81 ##First check family size
82 my $total_member_nr = int (@all_sgn_member) + int (@at_member);
83 ($total_member_nr <2 ) and &not_applicable($page, $family_id, $total_member_nr);
84 ($total_member_nr >= $family_size_limit) and large_size($page, $family_id, $total_member_nr);
87 ##Retrieve alignment sequences, SGN and Arabidopsis seperately
88 my %align_seq = ();
89 foreach (keys %sgn_member) {
90 my $organism = $_;
91 foreach (@{$sgn_member{$organism}}) {
92 my $unigene_id = $_;
93 $sgn_align_seq_q->execute($unigene_id, $family_id);
94 if (my ($alignment_seq) = $sgn_align_seq_q->fetchrow_array()){
95 chomp $alignment_seq;
96 $align_seq{$unigene_id} = $alignment_seq;
101 foreach (@at_member) {
102 $at_align_seq_q->execute($_, $family_id);
103 if (my ($alignment_seq) = $at_align_seq_q->fetchrow_array()){
104 chomp $alignment_seq;
105 $align_seq{$_} = $alignment_seq;
108 #use Data::Dumper;
109 #warn Dumper \%align_seq;#for debug purpose
111 #############################
112 #Create a family_align object and save the data in it
114 $family_align = CXGN::Alignment::align->new(align_name=>$family_id,
115 width=>800,
116 height=>2000,
117 type=>'nt'
120 my ($my_align_seq, $len);
122 #Adding align_seqs members this way, instead of by going through keys of %align_seq, so that the align_seqs members are grouped by their species
123 foreach (keys %sgn_member) {
124 my $organism = $_;
125 foreach (@{$sgn_member{$organism}}) {
126 my $organism_name = $group_comment{$organism};
127 if (defined $align_seq{$_}) {
128 $len = length ($align_seq{$_});
129 $my_align_seq = CXGN::Alignment::align_seq->new(horizontal_offset=>0,
130 vertical_offset=>0,
131 length=>400,
132 height=>15,
133 start_value=>1,
134 end_value=>$len,
135 id=>$_,
136 seq=>$align_seq{$_},
137 species=>$organism_name
139 $my_align_seq->set_url("unigene.pl?unigene_id=$_");
141 $family_align->add_align_seq($my_align_seq);
146 foreach (@at_member) {
147 if (defined $align_seq{$_}) {
148 $len = length $align_seq{$_};
149 my $my_align_seq = CXGN::Alignment::align_seq->new(horizontal_offset=>0,
150 vertical_offset=>0,
151 length=>400,
152 height=>15,
153 start_value=>1,
154 end_value=>$len,
155 id=>$_,
156 seq=>$align_seq{$_},
157 species=>'Arabidopsis'
159 my $locus = substr ($_, 0,9);
160 $my_align_seq->set_url("http://www.arabidopsis.org/servlets/TairObject?type=locus&amp;name=$locus");
162 $family_align -> add_align_seq($my_align_seq);
167 #########################################################
168 #Retrieve selected range, overlap length, no hidden member and so on
169 #Store the numbers in $sum_content
171 my $range_start = $family_align -> get_start_value();
172 my $range_end = $family_align -> get_end_value();
174 my $overlap_len = $family_align -> get_overlap_num();
176 #this number might be different from $total_member_nr, since some sequences failed when being converted from peptide to nucleotide and were left out
177 my @all_align_members = $family_align -> get_nonhidden_member_ids();
178 my $total_align_member = int (@all_align_members);
180 #This number might be different from $total_align_member since some mebers might be hidden
181 my @nonhidden_align_members = $family_align->get_nonhidden_member_ids();
182 my $total_show_align_member = int (@nonhidden_align_members);
184 $sum_content = "<tr><td colspan=\"2\"><a target=\"blank\" href=\"/about/fam_align_analysis.pl\">For help with how we made the alignment, please click here.</a></td></tr>";
185 $sum_content .="<tr><th>Sequence Type</th><td>Nucleotide</td></tr>";
186 $sum_content .= "<tr><th>Total Family Members</th><td>$total_member_nr</td></tr>";
187 $sum_content .= "<tr><th>Available Member Sequences</th><td>$total_align_member</td></tr>";
188 $sum_content .= "<tr><th>Show Member Sequences</th><td>$total_show_align_member</td></tr>";
189 $sum_content .= "<tr><th>Range</th><td>$range_start - $range_end bp</td></tr>";
190 $sum_content .= "<tr><th>Overlap</th><td>$overlap_len bp</td></tr>";
192 $sum_content .= "<tr><td colspan=\"2\" align=\"center\"><a target=\"blank\" href=\"/about/align_term.pl\">For an explanation of the terms, plase click here.</a></td></tr>";
194 #############################################################
195 #Draw family_alignment image
197 #Generate temp file handles
198 my $html_root_path = $c->config->{'basepath'};
199 my $doc_path = $c->config->{'tempfiles_subdir'}.'/align_viewer';
200 my $path = $html_root_path . $doc_path;
202 my $tmp_image = new File::Temp(
203 DIR => $path,
204 SUFFIX => '.png',
205 UNLINK => 0,
208 ##Render image
209 $family_align -> render_png_file($tmp_image, 'c');
210 close $tmp_image;
211 $tmp_image =~ s/$html_root_path//;
213 $align_content = "<tr><td><center><img src=\"$tmp_image\" usemap=\"#align_image_map\" alt=\"\" /></center></td></tr>";
214 $align_content .= "<tr><td>To view details for a particular family member, click the member image or the identifier.</td></tr>";
216 ###Write image map
217 my $map_string = $family_align->write_image_map();
218 $align_content .= "<tr><td>$map_string</td></tr>";
220 ######################################################
221 #Find putative splice variants
222 my ($ob_ref, $pi_ref, $sp_ref) = $family_align -> get_sv_candidates();
223 my %ob = %$ob_ref;
224 my %pi = %$pi_ref;
225 my %species = %$sp_ref;
226 $sv_content = "<tr><th>Species</th><th>Sequence id</th><th>Sequence id</th><th>Overlap Bases</th><th>%Identical</th></tr>";
227 foreach my $first_key (keys %ob) {
228 foreach my $second_key (keys %{$ob{$first_key}}) {
229 $sv_content .= "<tr><td>$species{$first_key}</td><td>$first_key</td><td>$second_key</td><td>$ob{$first_key}{$second_key}</td><td>$pi{$first_key}{$second_key}</td></tr>";
234 ######################################################
235 #Find putative allele pairs
236 my ($al_ob_ref, $al_pi_ref, $al_sp_ref) = $family_align -> get_allele_candidates();
237 my %al_ob = %$al_ob_ref;
238 my %al_pi = %$al_pi_ref;
239 my %al_species = %$al_sp_ref;
240 $al_content = "<tr><th>Species</th><th>Sequence id</th><th>Sequence id</th><th>Overlap Bases</th><th>%Identical</th></tr>";
241 foreach my $first_key (keys %al_ob) {
242 foreach my $second_key (keys %{$al_ob{$first_key}}) {
243 $al_content .= "<tr><td>$al_species{$first_key}</td><td>$first_key</td><td>$second_key</td><td>$al_ob{$first_key}{$second_key}</td><td>$al_pi{$first_key}{$second_key}</td></tr>";
248 ######################################################
249 #Analyze sequences
250 my $family_member_ids_ref = $family_align->get_member_ids();
251 my $ov_score_ref = $family_align -> get_all_overlap_score();
252 my $medium_ref = $family_align -> get_all_medium();
253 my ($head_ref, $tail_ref) = $family_align -> get_all_range();
254 my $ng_ref = $family_align -> get_all_nogap_length();
255 $sp_ref = $family_align->get_member_species();
256 my $url_ref = $family_align->get_member_urls();
258 my @family_member_ids = @$family_member_ids_ref;
259 %species = %$sp_ref;
260 my %ov_score = %$ov_score_ref;
261 my %medium = %$medium_ref;
262 my %head = %$head_ref;
263 my %tail = %$tail_ref;
264 my %ng = %$ng_ref;
265 my %gap = ();
266 my %url = %$url_ref;
268 foreach (keys %ng) {
269 $gap{$_} = $tail{$_} - $head{$_} + 1 - $ng{$_};
273 $seq_sum_content = "<tr><th>Sequence id</th><th>Species</th><th>Cover Range</th><th>Bases</th><th>Gaps</th><th>Medium</th><th>Overlap Score</th></tr>";
275 foreach (@family_member_ids) { ##Access the align_seqs members this way, instead of by the keys of the hashes, so that the sequences are grouped together
276 $seq_sum_content .= "<tr><td><a target=\"blank\" href=\"$url{$)}\">$_</a></td><td>$species{$_}</td><td>$head{$_} - $tail{$_}</td><td>$ng{$_}</td><td>$gap{$_}</td><td>$medium{$_}</td><td>$ov_score{$_}</td></tr>";
279 #####################################################
280 #Write sequence output content
282 my $seq_ref = $family_align->get_nopad_seqs();
283 my $align_seq_ref = $family_align -> get_seqs();
284 my $ol_seq_ref = $family_align -> get_overlap_seqs();
286 if (defined $seq_ref) {
288 my $tmp_seq_file = new File::Temp(
289 DIR => $path,
290 SUFFIX => '.txt',
291 UNLINK => 0,
294 my %seqs = %$seq_ref;
296 my $seq_io_obj = Bio::SeqIO->new('-fh' => $tmp_seq_file,
297 '-format' => 'fasta'
299 foreach (@family_member_ids) {
300 my $comp_id = $_ . '-' . $species{$_};
301 $comp_id =~ s/\s/_/g;
302 my $seq_obj = Bio::Seq->new('-seq' => $seqs{$_},
303 '-id' => $comp_id
305 $seq_io_obj->write_seq($seq_obj);
307 close $tmp_seq_file;
309 $tmp_seq_file =~ s/$html_root_path//;
311 $seq_output_content = "<tr><td><a target=\"blank\" href=\"$tmp_seq_file\">All Sequences (No gap)</a></td></tr>";
315 if (defined $align_seq_ref) {
317 my $tmp_align_seq_file = new File::Temp(
318 DIR => $path,
319 SUFFIX => '.txt',
320 UNLINK => 0,
323 my %alignment_seqs = %$align_seq_ref;
324 my $seq_io_obj = Bio::SeqIO->new('-fh' => $tmp_align_seq_file,
325 '-format' => 'fasta');
326 foreach (@family_member_ids) {
327 my $comp_id = $_ . '-' . $species{$_};
328 $comp_id =~ s/\s/_/g;
329 my $seq_obj = Bio::Seq->new(
330 '-seq' => $alignment_seqs{$_},
331 '-id' => $comp_id
333 $seq_io_obj->write_seq($seq_obj);
335 close $tmp_align_seq_file;
337 $tmp_align_seq_file =~ s/$html_root_path//;
339 my $pass_tmp_align_seq_file = $html_root_path . $tmp_align_seq_file;
341 $seq_output_content .= "<tr><td><a target=\"blank\" href=\"$tmp_align_seq_file\">All Alignment Sequences (Padded with gap)</a></td>";
343 my $title = 'Family ' . $family_id;
344 $seq_output_content .= "<td><a target=\"blank\" href=\"/tools/align_viewer/show_align.pl?format='fasta'&amp;temp_file=$pass_tmp_align_seq_file&amp;type=nt&amp;start_value=1&amp;end_value=10000&amp;title=$title\">Analyze and Optimize with Alignment Viewer.</a></td></tr>";
347 if ($overlap_len > 0) {
348 my %ol_seqs = %$ol_seq_ref;
350 my $tmp_ol_seq_file = new File::Temp(
351 DIR => $path,
352 SUFFIX => '.txt',
353 UNLINK => 0,
356 my $ol_seq_io_obj = Bio::SeqIO->new(
357 '-fh' => $tmp_ol_seq_file,
358 '-format' => 'fasta'
361 foreach (@family_member_ids) {
362 my $comp_id = $_ . '_' . $species{$_};
363 my $seq_obj = Bio::Seq->new(
364 '-seq' => $ol_seqs{$_},
365 '-id' => $comp_id
367 $ol_seq_io_obj->write_seq($seq_obj);
369 close $tmp_ol_seq_file;
371 $tmp_ol_seq_file =~ s/$html_root_path//;
372 $seq_output_content .= "<tr><td><a target=\"blank\" href=$tmp_ol_seq_file>Overlap Sequences</a></td></tr>";
377 #####################################################
378 #Retrieve tree information
379 $family_tree_log_q->execute($family_id);
380 if (my ($taxa_num, $ol_len, $lf_loc) = $family_tree_log_q->fetchrow_array()) {
381 $gene_tree_content = "<tr><td colspan=\"2\" align=\"center\"><a target=\"blank\" href=\"/about/fam_tree_analysis.pl\">For help with gene tree construction, please click here.</a></td></tr>";
382 $gene_tree_content .= "<tr><th>No. of Taxa</th><td>$taxa_num</td></tr>";
383 $gene_tree_content .= "<tr><th>No. of Characters</th><td>$ol_len</td></tr>";
384 $gene_tree_content .= "<tr><th>Log File</th><td><a href=\"$lf_loc\">View</a></td></tr>";
386 $family_tree_nw_q->execute($family_id);
387 while (my ($tree_nr, $nw) = $family_tree_nw_q->fetchrow_array()) {
388 my $tree_title = 'Family ' . $family_id . ' Tree No. ' . $tree_nr;
389 $gene_tree_content .= "<tr><th>Tree No. $tree_nr</th><td>$nw<br><br><a target=\"blanc\" href=\"/tools/tree_browser/?tree_string=$nw&amp;title=$tree_title\">View in Tree Browser</a><br><br></td></tr>";
392 else {
393 $gene_tree_content = "<tr><td>No tree available</td></tr>";
397 ######################################################
398 #Page output
399 $page->header();
400 print page_title_html("Gene Family $family_id Alignment Details");
402 print blue_section_html('Family Sequence Alignment','<table width="100%" cellpadding="5" cellspacing="0" border="0">'.$align_content.'</table>');
404 print blue_section_html('Summary','<table width="80%" cellpadding="5" cellspacing="0" border="0">'.$sum_content.'</table>');
406 print blue_section_html('Putative Splice Variant Pairs','<table width="100%" cellpadding="5" cellspacing="0" border="0">'.$sv_content.'</table>');
408 print blue_section_html('Putative Allele Pairs','<table width="100%" cellpadding="5" cellspacing="0" border="0">'.$al_content.'</table>');
410 print blue_section_html('Aligment Analysis','<table width="100%" cellpadding="5" cellspacing="0" border="1">'.$seq_sum_content.'</table>');
412 print blue_section_html('Output Sequences','<table width="100%" cellpadding="5" cellspacing="0" border="0">'.$seq_output_content.'</table>');
414 print blue_section_html('Family Gene Tree','<table width="100%" cellpadding="5" cellspacing="0" border="1">'.$gene_tree_content.'</table>');
417 $page->footer();
419 sub empty_search {
420 my ($page, $family_id) = @_;
422 $page->header();
424 print <<EOF;
426 <b>No family id specified</b>
430 $page->footer();
431 exit 0;
434 sub not_applicable {
435 my ($page, $family_id, $num) = @_;
437 $page->header();
439 print <<EOF;
441 <b>The specified $family_id has only $num valid member so that alignment is not applicable.</b>
445 $page->footer();
446 exit 0;
449 sub large_size {
450 my ($page, $family_id, $num) = @_;
452 $page->header();
454 print <<EOF;
456 <b>The specified $family_id has $num valid members. Alignment is available for families having up to $family_size_limit members.</b>
460 $page->footer();
461 exit 0;