Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / Phylo / Alignment.pm
blob26352afd297dd4ad2ce548a41d9cdf6573be2f3a
1 package CXGN::Phylo::Alignment;
3 use strict;
4 use GD;
5 use File::Temp;
6 use Carp;
7 use Cwd;
8 use Digest::MD5 qw/md5_hex/;
9 use CXGN::Phylo::Alignment::Ruler;
10 use CXGN::Phylo::Alignment::Chart;
11 use CXGN::Phylo::Alignment::Member;
12 use CXGN::Phylo::Alignment::Legend;
13 use CXGN::Tools::Identifiers qw/identifier_url identifier_namespace/;
14 use CXGN::Tools::Param qw/ hash2param /;
15 use CXGN::Tools::Parse::Fasta;
16 use CXGN::Tools::Gene;
17 use CXGN::DB::Connection;
19 =head1 CXGN::Phylo::Alignment
21 Alignment -- packages for analyzing, optimizing and displaying sequence alignments
23 =head1 Author
25 Chenwei Lin (cl295@cornell.edu)
26 Refactoring and lingual adjustments provided by Chris Carpita <csc32@cornell.edu>
28 =head1 Packages
30 CXGN::Phylo::Alignment
31 CXGN::Phylo::Alignment::ImageObject
32 CXGN::Phylo::Alignment::Member
33 CXGN::Phylo::Alignment::Ruler
34 CXGN::Phylo::Alignment::Chart
36 Packages Member, Ruler and Chart inherit from ImageObject
38 =head1 Package CXGN::Phylo::Alignment
40 The basic element of the alignment object is an array of member.
41 Its attributes include: name, width (pixel), height(pixel),
42 image, members, ruler, chart, conserved_seq, sv_overlap, sv_identtity,
43 start_value and end_value
45 Its functionality includes:
46 1. Image display.
47 2. Calculation and output of pairwise similaity and putative splice variant pairs based on similarity.
48 3. Hide some alignment sequences so that they are not included in the analysis.
49 4. Select a range of sequences to be analyzed.
50 5. Asses how a set of members overlaps with another set.
51 6. Calculate the non-gap mid point of each alignment sequence and group the sequences according to their overlap.
53 =head2 Constructer new()
55 Create a new alignment, returns an alignment object
57 =head3
59 Synopsis: my $alignment = CXGN::Phylo::Alignment->new(
60 name=>$name,
61 width=>$width,
62 height=>$height,
63 type=>$type, #'nt' or 'pep'
66 Description: Upon constructing an alignment object, it sets name,
67 width and height using arguments. It also generates an image object of
68 {width} and {height} and sets the default value of sv_overlap,
69 sv_identity (the minimum number overlapping aa and percentage
70 identity for two members to be considered as putative splice variants),
71 sv_indel_limit (the min aa indel length for two sequences to be considered
72 as splice variants, instead of alleles) and start_value (the start value
73 of the ruler and member objects(s))
75 Returns: A CXGN::Phylo::Alignment object
77 =cut
79 sub new {
80 my $class = shift;
81 my %args = @_;
82 if ( ref( $_[0] ) eq "HASH" ) { #forward compatibility
83 %args = %{ $_[0] };
86 my $self = bless {}, $class;
87 ##################set attibutes from parameter
88 $self->{name} = $args{name};
90 ## Default constraints for all member ImageObjects, unless their display
91 ## properties are set explicitly. If you don't use set_image, this will
92 ## determine the height and width of the created image
93 $self->{width} = $args{width} || 600;
94 $self->{height} = $args{height} || 400;
95 $self->{left_margin} = 30;
96 $self->{top_margin} = 30;
97 $self->{right_margin} = 10;
99 $self->{type} = $args{type}; # 'pep' or 'nt' or 'cds'
101 $self->{display_type} = "complete";
103 #also: "alignment" (only), "stats" (chart and conserved sequence only).
104 #You can use 'c','a', and 's' as shorthand.
106 ##################set defaults
107 #splice variants criteria
108 $self->{sv_overlap} = 20; #default in case we are using peptides
109 $self->{indel_limit} = 4; #(ditto)
110 $self->{sv_identity} = 95;
111 $self->{start_value} =
112 1; #the end_value is initiated when the first member is added
113 $self->{end_value} = 1;
115 ## define some 'empty' attributes that will be asigned later
116 @{ $self->{members} } = ();
117 $self->{ruler} = undef;
118 $self->{chart} = undef $self->{image} = undef;
119 $self->{conserved_seq} = undef;
120 $self->{seq_length} = 0;
122 #generally don't need to set this, since it can be inferred based on content
123 $self->{unaligned} = 0;
124 $self->{muscle} = {}; #muscle parameters for running alignment
125 $self->{from_file} = undef;
127 foreach (qw/ from_file unaligned from_ids muscle tmp_dir /) {
128 $self->{$_} = $args{$_} if exists( $args{$_} );
131 if ( defined $self->{from_file} and -f $self->{from_file} ) {
132 $self->from_file();
134 elsif ( $self->{from_file} ) {
135 warn "Alignment file not found: " . $self->{from_file} . "\n";
138 if ( ref( $self->{from_ids} ) eq "ARRAY" ) {
139 my @ids = @{ $self->{from_ids} };
140 foreach my $id (@ids) {
141 my $seq = "";
142 my $g = undef;
143 eval {
144 $g = CXGN::Tools::Gene->new($id);
145 $g->fetch("seq");
146 $seq = $g->get_sequence('protein');
148 if ($seq) {
149 my $m = CXGN::Phylo::Alignment::Member->new(
151 id => $id,
152 seq => $seq
155 $self->add_member($m);
158 $self->run_muscle();
161 return $self;
164 # get the cols with <= $max_gaps gaps in them (non-hidden sequences only), make new alignment object
165 # with just these columns of the non-hidden sequences only.
167 sub remove_gappy_cols {
168 my $self = shift;
169 my $mxgps = shift;
170 $mxgps = 0 unless ($mxgps); #default is 0
171 my $oseqs = $self->get_overlap_seqs($mxgps)
172 ; # ref to hash with id/seq pairs. These are seqs with only the ungappy columns
173 my $cols_left;
174 if ( ( scalar keys %$oseqs ) ) {
175 $cols_left = length $oseqs->{ ( keys %$oseqs )[0] };
177 # print STDERR "After winnowing cols with max_gaps = $mxgps. Cols remaining: ", $cols_left, "\n";
179 else {
180 $cols_left = 0;
182 return undef if ( $cols_left == 0 ); # in case of empty sequences
183 my $alignment = CXGN::Phylo::Alignment->new();
184 foreach my $id ( keys %$oseqs ) {
185 my $seq = $oseqs->{$id};
186 my $member =
187 CXGN::Phylo::Alignment::Member->new( id => $id, seq => $seq );
188 $alignment->add_member($member);
190 return $alignment;
193 =head2 Setters and getters
195 get_name(), set_name()
197 get_image(), set_image()
199 get_width(), set_width()
201 get_height(), set_height()
203 get_seq_length()
205 get_sv_criteria(), set_sv_criteria()
207 get_start_value(), set_start_value(), check_start_value()
209 get_end_value(), set_end_value(), check_end_value()
211 get_left_margin(), set_left_margin(), get_top_margin(), set_top_margin()
213 Those for name, image, sv_overlap and sv_identity, height, width,
214 left_margin, top_margin are straightforward, while the
215 setters for start_value, end_value are not simple, since these attributes
216 are related to and/or restricted by other attributes. seq_length is
217 determined by the first member added and therefore can not be reset.
219 =cut
221 sub get_name {
222 my $self = shift;
223 return $self->{name};
226 sub set_name {
227 my $self = shift;
228 my $name = shift;
229 $self->{name} = $name;
232 sub get_image {
233 my $self = shift;
234 return $self->{image};
237 sub set_image {
238 my $self = shift;
239 $self->{image} = shift;
242 sub get_width {
243 my $self = shift;
244 return $self->{width};
247 sub get_height {
248 my $self = shift;
249 return $self->{height};
252 sub set_tmp_dir {
253 my $self = shift;
254 my $dir = shift;
255 if ( -d $dir ) {
256 $self->{tmp_dir} = $dir;
258 elsif ($dir) {
259 die "Directory does not exist: $dir\n";
261 else {
262 my $vhost = SGN::Context->new();
263 $self->{tmp_dir} =
264 $vhost->get_conf('basepath')
265 . $vhost->get_conf('tempfiles_subdir')
266 . '/align_viewer';
267 unless ( -d $self->{tmp_dir} ) {
268 warn "Temporary directory does not exist: "
269 . $self->{tmp_dir}
270 . "; setting to current directory";
271 $self->{tmp_dir} = ".";
274 return $self->{tmp_dir};
277 sub get_tmp_dir {
278 my $self = shift;
279 if ( !$self->{tmp_dir} ) {
280 $self->set_tmp_dir();
282 return $self->{tmp_dir};
285 =head3 set_width(), set_height()
287 Synopsis: set_width($x), set_height($x)
289 Description: sets the attributes {width} and {height}.
291 =cut
293 sub set_width {
294 my $self = shift;
295 $self->{width} = shift;
298 sub set_height {
299 my $self = shift;
300 $self->{height} = shift;
303 =head3 set_sv_criteria()
305 Synopsis: $alignment->set_sv_criteria($x, $y, $z), while $x is
306 the minimum overlap, $y is a percentage similarity and $z is the minimal
307 amino acid indel length to be considered as a splice variant
309 Description: Set the putative splice variants standard, the minimum
310 overlapping bases and percentage identity (sv_overlap and sv_identity).
311 The sub checks if the values are correct before setting the attributes
313 =cut
315 sub set_sv_criteria {
316 my $self = shift;
317 my ( $overlap, $identity, $indel ) = @_;
319 if ( $overlap <= 0 ) {
320 die "Overlap must be greater than 0!\n";
322 elsif ( ( $identity < 0 ) || ( $identity > 100 ) ) {
323 die "Percentage identity must be greater than 0 and less than 100\n";
325 elsif ( $indel < 0 ) {
326 die "Indel limit must be greater than 0!\n";
328 else {
329 $self->{sv_overlap} = $overlap;
330 $self->{sv_identity} = $identity;
331 $self->{indel_limit} = $indel;
335 sub get_sv_criteria {
336 my $self = shift;
337 return ( $self->{sv_overlap}, $self->{sv_identity}, $self->{indel_limit} );
340 sub set_left_margin {
341 my $self = shift;
342 $self->{left_margin} = shift;
345 sub get_left_margin {
346 my $self = shift;
347 return $self->{left_margin};
350 sub set_top_margin {
351 my $self = shift;
352 $self->{top_margin} = shift;
355 sub get_top_margin {
356 my $self = shift;
357 return $self->{top_margin};
360 sub get_right_margin {
361 my $self = shift;
362 return $self->{right_margin};
365 sub set_right_margin {
366 my $self = shift;
367 $self->{right_margin} = shift;
370 sub get_label_gap {
371 my $self = shift;
372 return $self->{label_gap};
375 sub set_label_gap {
376 my $self = shift;
377 $self->{label_gap} = shift;
380 sub _calculate_label_gap {
381 my $self = shift;
382 my $maxwidth = 0;
383 foreach my $m ( @{ $self->{members} } ) {
384 my $ls = $m->get_label_spacer();
385 $ls ||= 0;
386 my $labelwidth = $m->{font}->width() * length( $m->get_id() ) + $ls;
387 $maxwidth = $labelwidth if ( $labelwidth > $maxwidth );
389 if ( $self->{chart} ) {
390 my $labelwidth = $self->{chart}->get_longest_label_width();
391 $maxwidth = $labelwidth if ( $labelwidth > $maxwidth );
393 $self->{label_gap} = $maxwidth;
394 return $maxwidth;
397 sub get_start_value {
398 my $self = shift;
399 return $self->{start_value};
402 sub get_end_value {
403 my $self = shift;
404 return $self->{end_value};
407 sub check_start_value {
408 my $self = shift;
409 my $value = shift;
410 if ( $value < 0 ) {
411 $value = 0;
412 print "$self->{id} start_value less than 0, reset to 0\n";
414 return $value;
417 sub check_end_value {
418 my $self = shift;
419 my $value = shift;
420 if ( $value > length $self->{members}[0]->{seq} ) {
421 $value = length $self->{members}[0];
422 print "value greater than sequence length, reset to sequence length\n";
424 return $value;
428 sub get_seq_length {
429 my $self = shift;
430 return $self->{seq_length};
433 =head3 set_start_value(), set_end_value()
435 Synopsis: set_start_value($x), set_end_value($x)
437 Description: set the start_value and end_value. Check if the input value is correct before setting the attributes. Since the {start_value} and {end_value} attributes of the {ruler} and {members} must be the same as those of the alignment object, the set subs call the set_start_value and set_end_value of the {ruler} and all members of {members}
439 Returns:
441 =cut
443 sub set_start_value {
444 my $self = shift;
445 my $value = shift;
446 $value =
447 $self->check_start_value($value); #check if the start value is correct
448 $self->{start_value} = $value; #the ruler will use this start_value
450 foreach ( @{ $self->{members} } )
451 { #set the start_value of each included member
452 $_->set_start_value($value);
456 sub set_end_value {
457 my $self = shift;
458 my $value = shift;
460 #$value = $self->check_end_value($value);#check if the end value is correct
461 $self->{end_value} = $value;
463 foreach ( @{ $self->{members} } )
464 { #set the start_value of each included member
465 $_->set_end_value($value);
469 sub get_type {
470 my $self = shift;
471 return $self->{type};
474 sub set_type {
475 my $self = shift;
476 $self->{type} = shift; #'nt' or 'pep'
479 sub set_display_type {
480 my $self = shift;
481 my $display_type = shift;
482 die "Please set display type to c(omplete), a(lignment), or s(tats)"
483 unless $display_type =~ /^(a|c|s)/i;
484 $self->{display_type} = $display_type;
487 sub get_display_type {
488 my $self = shift;
489 return $self->{display_type};
492 sub show_label {
493 my $self = shift;
494 $self->{label_shown} = 1;
497 sub hide_label {
498 my $self = shift;
499 $self->{label_shown} = 0;
502 sub get_combined_member_height {
503 my $self = shift;
504 my $total_height = 0;
505 foreach my $m ( @{ $self->{members} } ) {
506 next if $m->is_hidden;
507 $total_height += $m->get_height;
509 return $total_height;
512 sub set_fasta_temp_file {
513 my $self = shift;
514 $self->{fasta_temp_file} = shift;
517 sub get_fasta_temp_file {
518 my $self = shift;
519 return $self->{fasta_temp_file};
522 sub set_cds_temp_file {
523 my $self = shift;
524 $self->{cds_temp_file} = shift;
527 sub get_cds_temp_file {
528 my $self = shift;
529 return $self->{cds_temp_file};
532 =head1 Subroutines to add ImageObjects to Alignment object
534 add_member(), _add_ruler(), _add_cvg_chart, _add_conserved_seq_obj()
536 =cut
538 sub from_file {
539 my $self = shift;
540 my $from_file = shift;
541 $from_file ||= $self->{from_file};
542 my $parser = CXGN::Tools::Parse::Fasta->new($from_file);
543 my $count = 0;
544 my $debug = "";
545 while ( my $entry = $parser->next() ) {
546 my ( $id, $seq, $species ) =
547 ( $entry->{id}, $entry->{seq}, $entry->{species} );
548 chomp $seq;
549 my $len = length($seq);
550 my $member = undef;
551 eval {
552 $member = CXGN::Phylo::Alignment::Member->new(
553 id => $id,
554 seq => $seq,
555 species => $species,
556 type => $self->{type}
558 my $r = $self->add_member($member);
559 $count++;
561 warn $@ if ($@);
565 =head3 add_member()
567 Synopsis: $alignment->add_member($member), while member is an member object
569 Description: Add member to alignment object. The member objects are
570 stored in an array members. Once the first member object is added,
571 only member objects of the same length can be further added. At the
572 same time, the end_value is set to be the same as the sequence length
573 of the first member added.
575 =cut
577 sub add_member {
578 my $self = shift;
579 my $member = shift;
580 ref $member && $member->isa( __PACKAGE__ . '::Member' )
581 or croak "invalid member '$member'";
583 if ( ! @{ $self->{members} } ) {
585 #if there are no members in @members, reset the end_value of overall
586 #alignment to the length of this sequence
587 $self->set_end_value( $member->get_end_value() );
588 $self->{seq_length} = length( $member->get_select_seq() );
590 #adjust the vertical and horizontal offsets and length
591 #add the member to @members
592 push @{ $self->{members} }, $member;
594 else {
595 my $len = length( $member->get_seq() );
597 #the length of the member must the the same as the first member in
598 #@members, otherwise it won't be added
599 if ( $len == $self->{seq_length} ) {
601 # Set the start_value and end_value of the member to the current
602 # values of the overall alignment
603 $member->set_start_value( $self->get_start_value() );
604 $member->set_end_value( $self->get_end_value() );
606 push @{ $self->{members} }, $member;
608 elsif ( !$self->{unaligned} ) {
609 my $id = $member->get_id();
610 my $seq = $member->get_select_seq();
611 unless ( $seq =~ /-/ ) {
612 warn
613 "Sequence w/ id $id does not have same length, assuming unaligned sequences are provided";
614 $self->{unaligned} = 1;
616 #don't need to set start/end values, since those are meaningless in the unaligned context
617 #we are assuming that muscle will be run via $self->run_muscle();
618 push( @{ $self->{members} }, $member );
620 else {
622 #if we thought we were well-aligned, then we're dead wrong
624 "Error in adding pre-aligned sequence $id, length ($len) not the same as overall alignment ("
625 . $self->{seq_length} . ")\n";
628 else {
630 #We know the family is yet-to-be aligned
631 push( @{ $self->{members} }, $member );
636 sub get_nongaps_hash
637 { # this returns a ref to a hash with ids as keys, numbers of nongaps as values
638 my $self = shift;
639 my %nongaps;
640 foreach ( @{ $self->{members} } ) {
641 next if ( $_->is_hidden() );
642 $nongaps{ $_->get_id() } = $_->get_nongaps()
643 ; # this get_nongaps is a method of Member, returns # of nongaps
645 return \%nongaps;
648 sub get_gaps_nongaps
649 { # this returns just the total number of nongap characters in the alignment's non-hidden members.
650 my $self = shift;
651 my $nongaps = 0;
652 foreach ( @{ $self->{members} } ) {
653 next if ( $_->is_hidden() );
654 $nongaps += $_->get_nongaps()
655 ; # this get_nongaps is a method of Member, returns # of nongaps
658 # print "in get_gaps_nongaps: ", $self->get_seq_length()*$self->get_visible_member_nr() - $nongaps, " ", $nongaps, "\n";
659 return (
660 $self->get_seq_length() * $self->get_visible_member_nr() - $nongaps,
661 $nongaps );
664 #sub get_gaps{ # get total gap characters in non-hidden members
665 # my $self = shift;
666 # my $n_cols = $self->get_seq_length();
667 # my $n_seq = $self->get_visible_member_nr();
668 # return $n_cols*$n_seq - $self->get_nongaps();
671 =head3 hide_gappy_members
673 Hide members (sequences) if they have fewer than the specified number of non-gap characters
674 Arg: Minimum number of non-gap characters in sequence for it to be kept.
675 Ret: Number of members hidden by this pass through subroutine, others may have been already hidden.
677 =cut
679 sub hide_gappy_members {
680 my $self = shift;
681 my $min_nongaps = shift;
682 my $n_hide = 0; # counts the number of sequences hidden by this call
683 foreach ( @{ $self->{members} } ) {
684 next if ( $_->is_hidden() ); #
685 # my ($gaps, $nongaps) = $_->get_nongaps();
686 if ( $_->get_nongaps() < $min_nongaps ) {
687 $_->hide_seq();
688 $n_hide++;
690 # print "hiding id: ", $_->get_id(), ". nongaps: ", $_->get_nongaps(), "\n";
693 return $n_hide;
696 =head3 get_members
698 Get the member objects of the alignment
699 Arg: none
700 Ret: List of members
702 =cut
704 sub get_members {
705 my $self = shift;
706 return @{ $self->{members} };
709 sub add_legend_item {
710 my $self = shift;
711 my ( $name, $color, $url, $tooltip ) = @_;
713 #color is array ref
714 $self->_add_legend() unless $self->{legend};
715 $self->{legend}->add_item( $name, $color, $url, $tooltip );
718 =head3 _add_ruler()
720 Synopsis: $alignment->_add_ruler($x,$y), where $x is the top margin
721 and $y is the height of the ruler.
723 Description: Add a ruler to the alignment object, the start_value and
724 end_value are set to the same as those of the alignment. If no member
725 has been added to the alignment object, the seq_length, start_value and
726 end_value of the alignment are not set (see sub add_member), then a ruler
727 can not be added.
729 =cut
731 sub _add_ruler {
732 my $self = shift;
734 my $ruler = CXGN::Phylo::Alignment::Ruler->new(
735 start_value => $self->{start_value},
736 end_value => $self->{end_value}
738 ( $self->{type} eq 'pep' ) and ( $ruler->set_unit('aa') );
739 ( $self->{type} eq 'nt' ) and ( $ruler->set_unit('bp') );
740 $self->{ruler} = $ruler;
743 =head3 _add_cvg_chart()
745 Synopsis: $alignment->_add_ruler($x,$y,$z), while $x is the vertical
746 offset, $y is the id and $z is a hash reference whose key is an integer
747 (a position) and value is a percentage
749 Description: Add a chart representing coverage by a member. The
750 start_value and end_value are set to the same as those of the alignment.
751 The coverage of each alignment postion is repreesnted by a hash reference
752 passed to the subroutine. The key of the hash is the alignment postion
753 and the values are percentage converage.
755 =cut
757 sub _add_cvg_chart {
758 my $self = shift;
759 my $title = shift;
760 my $similarity_hash = shift;
761 my $conservation_hash = shift;
762 my $type_hash = shift;
764 my $chart = CXGN::Phylo::Alignment::Chart->new(
765 start_value => $self->{start_value},
766 end_value => $self->{end_value},
767 id => $title,
768 type => $self->{type},
769 similarity_hash => $similarity_hash,
770 conservation_hash => $conservation_hash,
771 type_hash => $type_hash,
773 $self->{chart} = $chart;
776 sub _add_legend {
777 my $self = shift;
779 $self->{legend} = CXGN::Phylo::Alignment::Legend->new(
780 left_margin => $self->{left_margin},
781 top_margin => $self->{height} - 100,
785 =head3 _add_conserved_seq_obj()
787 Synopsis: $alignment->_add_conserved_seq_obj($x), while $x is the vertical offset.
789 Description: Add a member object representing the conserved sequence of
790 the @members. The seq of this object is generated by another subroutine
791 get_conserved_seq. If the sequence at a position is not conserved among
792 all present members, it is repreesnted by - in conserved_seq. This
793 object is NOT a member of @members.
795 =cut
797 sub _add_conserved_seq_obj {
798 my $self = shift;
799 my $title = shift;
800 $title ||= "Overall Conserved Sequence";
801 my $seq = $self->get_conserved_seq();
802 my $seq_obj = CXGN::Phylo::Alignment::Member->new(
803 start_value => $self->{start_value},
804 end_value => $self->{end_value},
805 id => $title,
806 seq => $seq,
807 species => ' ',
809 $self->{conserved_seq} = $seq_obj;
812 =head1 Subroutines to search and ouput ids of @members
814 is_id_member(), is_member(), id_to_member(), get_member_ids(),
815 get_nonhidden_member_ids, get_hidden_member_ids(), get_member_species(),
816 get_member_urls()
818 =cut
820 =head3 is_id_member()
822 Synopsis: is_id_member($id)
824 Description: Do any of the members have the same id as $id?
826 Returns: 1 for true and 0 for false
828 =cut
830 sub is_id_member {
831 my $self = shift;
832 my $id = shift;
833 foreach ( @{ $self->{members} } ) {
834 return 1 if $_->{id} eq $id;
836 return 0;
839 =head3 is_member()
841 Synopsis: is_member($member), while $member is an member object
843 Description: Is $member already a member?
845 Returns: 1 for true and 0 for false
847 =cut
849 sub is_member {
850 my $self = shift;
851 my $member = shift;
852 foreach ( @{ $self->{members} } ) {
853 return 1 if $member == $_;
855 return 0;
858 =head3 id_to_member()
860 Synopsis: $alignment->id_to_member($id);
862 Description: check if a alignment member has the id $id and return the alignment member
864 Returns: an alignment object
866 =cut
868 sub id_to_member {
869 my $self = shift;
870 my $id = shift;
871 foreach ( @{ $self->{members} } ) {
872 return $_ if $_->{id} eq $id;
876 =head3 get_member_ids()
878 Synopsis: $alignment->get_member_ids()
880 Description: Returns ids of all members
882 Returns: an array of ids of all non-hidden @members elements
884 =cut
886 sub get_member_ids {
887 my $self = shift;
889 my @member_ids = ();
890 push( @member_ids, $_->get_id ) foreach ( @{ $self->{members} } );
891 return \@member_ids;
894 sub get_member_nr {
895 my $self = shift;
897 my $number = int( @{ $self->{members} } );
898 return $number;
901 =head3 get_nonhidden_member_ids()
903 Synopsis: $alignment->get_nonhidden_member_ids()
905 Description: Returns ids of members that are not hidden
907 Returns: an array of ids of all non-hidden @members elements
909 =cut
911 sub get_nonhidden_member_ids {
912 my $self = shift;
914 my @members = ();
915 foreach ( @{ $self->{members} } ) {
916 unless ( $_->is_hidden ) {
917 my $id = $_->get_id();
918 push @members, $id;
921 return \@members;
924 sub get_nonhidden_member_nr {
925 my $self = shift;
926 my $number = 0;
927 foreach ( @{ $self->{members} } ) {
928 $number++ unless $_->is_hidden;
930 return $number;
933 =head3 get_hidden_member_ids()
935 Synopsis: $alignment->get_hidden_member_ids()
937 Description: Returns ids of members that are hidden
939 Returns: an array of ids of all hidden @members elements
941 =cut
943 sub get_hidden_member_ids {
944 my $self = shift;
945 my @members = ();
946 foreach ( @{ $self->{members} } ) {
947 push( @members, $_->get_id() ) if ( $_->is_hidden() );
949 return \@members;
952 sub get_hidden_member_nr {
953 my $self = shift;
954 my $number = 0;
955 foreach ( @{ $self->{members} } ) {
956 $number++ if $_->is_hidden;
958 return $number;
961 sub get_visible_member_nr {
962 my $self = shift;
963 my $hidden_number = 0;
964 foreach ( @{ $self->{members} } ) {
965 $hidden_number++ if $_->is_hidden;
967 return $self->get_member_nr - $hidden_number;
970 =head3 get_member_species()
972 Synopsis: $alignment->get_member_species()
974 Description: Return the species of each member of @members
976 Returns: A hash reference whose keys are ids and values are species
978 =cut
980 sub get_member_species {
981 my $self = shift;
983 my %member_species = ();
984 foreach ( @{ $self->{members} } ) {
985 my $id = $_->get_id;
986 my $species = $_->get_species();
987 $member_species{$id} = $species;
989 return \%member_species;
992 sub get_member_urls {
993 my $self = shift;
995 my %member_url = ();
996 foreach ( @{ $self->{members} } ) {
997 my $id = $_->get_id;
998 my $url = $_->get_url();
999 $member_url{$id} = $url;
1002 return \%member_url;
1005 =head2 Image processing subs of the package
1007 render(), render_png(), render_jpg(), render_png_file(), render_jpg_file(), get_image_map()
1009 =cut
1011 sub layout {
1012 my $self = shift;
1013 my ($option) = $self->{display_type} =~ /^(\w)/;
1014 $option = lc($option);
1015 $option ||= 'c';
1017 $self->_add_ruler() unless $self->{ruler};
1018 $self->_add_cvg_chart( "Coverage %", $self->get_ngap_pct )
1019 if ( $option =~ /^(c|s)/i && !$self->{chart} );
1021 #$self->_add_conserved_seq_obj() if ($option =~ /^(c|s)/i && !$self->{conserved_seq});
1023 my $label_gap = $self->_calculate_label_gap();
1025 #The ImageObjects: Chart, Ruler, Member must all be width-adjusted to account for
1026 #label spacing and alignment image margins. This is an example where ImageObjects
1027 #should be subclassed and rendered properly in the first place, taking label spacing
1028 #into account, but this is much easier for now:
1029 my $width_adjustment =
1030 $label_gap + $self->get_left_margin + $self->get_right_margin;
1031 $width_adjustment = $self->{width_adjustment}
1032 if exists $self->{width_adjustment};
1034 my $ruler = $self->{ruler};
1035 my $ruler_top_margin = 0;
1036 if ($ruler) {
1037 $ruler->layout_top_margin( $self->get_top_margin );
1038 $ruler->layout_left_margin( $self->get_left_margin );
1039 $ruler->layout_width( $self->get_width - $width_adjustment );
1040 $ruler->layout_label_spacer(20);
1041 $ruler->layout_color( 140, 140, 140 );
1042 $ruler->layout_label_color( 120, 120, 120 );
1044 # $ruler->layout_height($self->get_height);
1045 $ruler_top_margin = $ruler->get_top_margin;
1047 my $chart = $self->{chart};
1048 my $chart_height = 0;
1049 if ($chart) {
1050 $chart->layout_top_margin( $ruler_top_margin + 20 );
1051 $chart->layout_left_margin( $self->get_left_margin );
1052 $chart->layout_width( $self->get_width - $width_adjustment );
1053 $chart->layout_label_spacer(20);
1054 $chart->layout_color( 90, 90, 150 );
1055 $chart->layout_height(40);
1056 $chart_height = $chart->get_height;
1059 if ( $option eq 'c' || $option eq 'a' || $option eq 's' )
1060 { #alignment shown, default layout if member values not set
1061 my $align_v_offset;
1062 ( $option eq 'c' || $option eq 's' )
1063 and $align_v_offset = $ruler_top_margin + $chart_height + 40;
1064 ( $option eq 'a' ) and $align_v_offset = $ruler_top_margin + 20;
1065 foreach my $m ( @{ $self->{members} } ) {
1066 next if ( $m->is_hidden() );
1067 $m->layout_top_margin($align_v_offset);
1068 $m->set_left_margin( $self->get_left_margin() );
1069 $m->layout_width( $self->get_width() - $width_adjustment );
1070 $m->layout_label_spacer(20);
1071 $m->layout_height(15);
1072 $m->layout_color( 20, 20, 160 );
1073 $m->layout_label_color( 90, 90, 90 );
1074 $align_v_offset += $m->get_height;
1076 my $members_height = $align_v_offset;
1077 $self->set_height( $members_height + 20 );
1079 else {
1080 $self->set_height( $ruler_top_margin + $chart_height + 40 );
1083 $ruler->layout_height( $self->get_height - 20 ) if $ruler;
1085 my $legend = $self->{legend};
1086 if ($legend) {
1087 $legend->{width} = $self->{width} - $self->{left_margin} - $label_gap;
1088 $legend->layout();
1089 $self->{height} = ( $self->{height} + $legend->{height} + 10 );
1090 $legend->{top_margin} = ( $self->{height} - $legend->{height} - 20 );
1091 $self->{height} -= 5;
1092 $legend->{left_margin} += 5;
1097 =head3 render()
1099 Synopsis: $alignment->render($o) where $o represnts option, 'c' for complete, 'a' for alignment only and 's' for simple (only the ruler, coverage chart and conserved sequence, no individual members of @members)
1101 Description: it does the following
1102 1. Generage, set and render a ruler
1103 2. Generate, set and render a chart representing coverge
1104 3. Generate, set and render a member object representing conserved sequence
1105 4. Render all non-hidden members of the @aign_seqs
1108 Returns:
1110 =cut
1112 sub render {
1113 my $self = shift;
1115 $self->layout();
1117 #Generate a image object for the ImageObject (member, ruler and chart) to render
1118 unless ( $self->{image} ) {
1119 $self->{image} =
1120 GD::Image->new( $self->get_width(), $self->get_height() )
1121 or die "Can't generate image\n";
1124 # the first color located is the background color, white
1125 $self->{white} = $self->{image}->colorResolve( 255, 255, 255 );
1127 $self->{image}->filledRectangle( 0, 0, $self->{width}, $self->{height},
1128 $self->{white} );
1130 $self->{chart}->render( $self->{image} ) if ( $self->{chart} );
1132 $self->{ruler}->render( $self->{image}, $self ) if ( $self->{ruler} );
1133 $self->{conserved_seq}->render( $self->{image} )
1134 if ( $self->{conserved_seq} );
1136 my ($option) = $self->{display_type} =~ /^(\w)/;
1137 $option = lc($option);
1138 my $m;
1139 if ( $option eq 'a' || $option eq 'c' ) {
1140 foreach ( @{ $self->{members} } ) {
1141 $_->render( $self->{image} ) unless ( $_->is_hidden );
1142 $m = $_;
1146 #Send one member to legend renderer as sample for base_showing mode,
1147 #where colors must be lightened. Easiest if member has already been
1148 #rendered: otherwise, scaling factor and text width must be calculated
1149 #explicitly.
1150 $self->{legend}->render( $self->{image}, $self, $m )
1151 if ( $self->{legend} && ( $option eq 'c' || $option eq 's' ) );
1155 =head3 render_png(), render_jpg()
1157 Synopsis: $alignment->render_jpg(), $alignment->render_png
1159 Description: Render itself and convert print out png or jpg
1161 Returns:
1163 =cut
1165 sub render_separate_png_files {
1166 my $self = shift;
1167 my $background_filepath = shift;
1169 $self->{display_type} = "separate";
1170 $self->render();
1172 open( WF, ">", $background_filepath );
1173 print WF $self->{image}->png();
1174 close WF;
1176 my $mfp = $background_filepath;
1177 $mfp =~ s/\.[^\.]+$//;
1179 my @member_imgs = ();
1180 foreach ( @{ $self->{members} } ) {
1181 $_->set_left_margin( $self->get_left_margin() );
1182 $_->set_top_margin( $_->get_height() / 2 );
1184 my $id = $_->get_id();
1185 next unless $id;
1187 my $img_path = $mfp . "." . md5_hex($id) . ".png";
1188 push( @member_imgs, $img_path );
1189 open( WF, ">$img_path" );
1190 my $w =
1191 $_->get_width() + $self->get_label_gap() + $self->get_right_margin();
1192 +$self->get_left_margin();
1194 my $h = $_->get_height();
1196 my $image = GD::Image->new( $w, $h )
1197 or die "Can't generate image\n";
1198 my $white = $image->colorAllocate( 255, 255, 255 );
1199 $image->filledRectangle( 0, 0, $w, $h, $white );
1200 $image->transparent($white);
1202 $_->render($image);
1203 print WF $image->png();
1204 close WF;
1206 return @member_imgs;
1209 sub render_png {
1210 my $self = shift;
1211 $self->render();
1212 print $self->{image}->png();
1215 sub render_jpg {
1216 my $self = shift;
1217 $self->render();
1218 print $self->{image}->jpeg();
1221 =head3 render_png_file(), render_jpg_file()
1223 SYnopsis: $alignment->render_png_file($file_name, $option), $alignment->render_jpg_file($file_name, $option)
1225 Description: take a filename as arguments, render itself and output pgn or jpg image to the file.
1227 Returns:
1229 =cut
1231 sub render_png_file {
1232 my $self = shift;
1233 my $filename = shift;
1234 $self->render();
1235 open( F, ">$filename" )
1236 || die
1237 "Can't open $filename for writing!!! Check write permission in dest directory.";
1238 print F $self->{image}->png;
1239 close(F);
1242 sub render_jpg_file {
1243 my $self = shift;
1244 my $filename = shift;
1245 $self->render();
1246 open( F, ">$filename" )
1247 || die
1248 "Can't open $filename for writing!!! Check write permission in dest directory.";
1249 print F $self->{image}->jpeg;
1250 close(F);
1253 =head3 write_image_map()
1255 Synopsis: $alignment->write_image_map()
1257 Description: get the image map string of each non-hidden @members, concat them and return as a single string
1259 Returns: a string
1261 =cut
1263 sub get_param {
1264 my $self = shift;
1265 return $self->{param};
1268 sub set_param {
1269 my $self = shift;
1270 my $hashref = shift;
1271 die "Not hashref" unless ( ref $hashref eq "HASH" );
1272 $self->{param} = $hashref;
1275 sub get_image_map {
1276 my $self = shift;
1277 my ( $show_sv, $show_breakdown ) = @_;
1278 my $map_content;
1280 #XHTML 1.0+ requires id; name is for backward compatibility -- Evan, 1/8/07
1281 $map_content = "<map name='align_image_map' id='align_image_map'>\n";
1282 foreach ( @{ $self->{members} } ) {
1283 unless ( ref && $_->isa( __PACKAGE__ . '::Member' ) ) {
1284 require Data::Dumper;
1285 warn "members list is:\n"
1286 . Data::Dumper::Dumper( $self->{members} );
1287 confess "invalid member '$_'";
1289 next if ( $_->is_hidden );
1290 $_->set_url( identifier_url( $_->get_id ) );
1291 my $string = $_->get_imagemap_string();
1292 $map_content .= $string . "\n";
1294 if ( $self->{ruler} && $self->get_fasta_temp_file ) {
1295 my ( $sv_shared, $sv_similarity, $sv_indel ) = $self->get_sv_criteria;
1296 my $hide_seq_string = join( "%20", @{ $self->get_hidden_member_ids } );
1298 # my $extra_url_vars = hash2param($self->get_param);
1299 $map_content .=
1300 $self->{ruler}->get_imagemap_string( $self, $self->get_param );
1302 $map_content .= $self->{legend}->get_imagemap_string() if $self->{legend};
1303 $map_content .= "</map>\n";
1304 return $map_content;
1307 =head1 Subroutines to analyze sequences of @members and output result
1309 get_member_similarity(), get_sv_candidates(), get_allele_candidates(),
1310 get_overlap_score(), get_all_overlap_score(), get_all_medium(),
1311 get_all_range(), get_seqs(), get_nopad_seqs(), get_overlap_seqs(),
1312 get_overlap_nums(), get_ngap_pct(), get_all_ngap_length(),
1313 get_conserved_seq_obj()
1315 =cut
1317 =head3 get_member_similarity()
1319 Sysopsis: $alignment->get_member_similarity($al_sq) where $al_sq is an object of of algn_seq and member of @members
1321 Description: To output pair-wise similarities (overlap base, percentage indentity)of the member which is specified as argument between other members of @members.
1323 Returns: two hash references, one for overlap bases and the other for percentage indentity. The key of both hashes are the ids of other non hidden members of @members
1325 =cut
1327 sub get_member_similarity {
1328 my $self = shift;
1329 my $al_sq = shift;
1330 my %member_ol = ();
1331 my %member_pi = ();
1333 foreach ( @{ $self->{members} } ) {
1334 next if $_ == $al_sq;
1335 my ( $overlapping, $percent_ident ) = $al_sq->calculate_similarity($_);
1336 my $other_id = $_->get_id();
1337 $member_ol{$other_id} = $overlapping;
1338 $member_pi{$other_id} = $percent_ident;
1340 return \%member_ol, \%member_pi;
1343 =head3 get_sv_candidates()
1345 Synopsis: $alignment->get_sv_candidates()
1346 Description: make pairwise comparison between members of @members of the
1347 same species. If the pair have enough overlap, and the percentage
1348 indentity is high enough, and they have enough insertion-deletion (
1349 specified as parameter), they are considered as putative splice variant pair
1351 Returns: 3 hash references
1352 1. for overlap bases, a 2-D hash, the two keys are the ids of putative pslice variant pair.
1353 2. for indentity percentage, also 2-D
1354 3. for species, the key is the species of the putative splice variant pair.
1356 =cut
1358 sub get_sv_candidates {
1359 my $self = shift;
1361 my ( $indel, $overlap );
1362 ( $self->{type} eq 'pep' )
1363 ? ( $indel = '-' x $self->{indel_limit} )
1364 : ( $indel = '---' x $self->{indel_limit} );
1365 ( $self->{type} eq 'pep' )
1366 ? ( $overlap = $self->{sv_overlap} )
1367 : ( $overlap = $self->{sv_overlap} * 3 );
1369 my %sv_candidate_ob = ();
1370 my %sv_candidate_pi = ();
1371 my %sv_candidate_sp = ();
1373 foreach ( my $i = 0 ; $i < @{ $self->{members} } ; $i++ ) {
1374 foreach ( my $j = $i + 1 ; $j < @{ $self->{members} } ; $j++ ) {
1375 ( ( $self->{members}[$i]->get_species() ) ne
1376 ( $self->{members}[$j]->get_species() ) )
1377 and next;
1378 my ( $ol_seq1, $ol_seq2 ) =
1379 $self->{members}[$i]->get_clean_member( $self->{members}[$j] );
1381 ( !( ( $ol_seq1 =~ /$indel/ ) || ( $ol_seq2 =~ /$indel/ ) ) )
1382 and next;
1384 my ( $self_id, $other_id ) =
1385 ( $self->{members}[$i]->get_id(),
1386 $self->{members}[$j]->get_id() );
1387 my ( $ob, $pi ) =
1388 $self->{members}[$i]
1389 ->calculate_similarity( $self->{members}[$j] );
1390 if ( ( $ob >= $overlap ) && ( $pi >= $self->{sv_identity} ) ) {
1391 $sv_candidate_ob{$self_id}{$other_id} = $ob;
1392 $pi = sprintf( "%.2f", $pi )
1393 ; #truncate the number to two digits after the decimal point
1394 $sv_candidate_pi{$self_id}{$other_id} = $pi;
1395 $sv_candidate_sp{$self_id} = $self->{members}[$i]->get_species;
1399 return \%sv_candidate_ob, \%sv_candidate_pi, \%sv_candidate_sp;
1402 =head3 get_allele_candidates()
1404 Synopsis: $alignment->get_allele_candidates()
1406 Description: make pairwise comparison between members of @members of the
1407 same species. If the pair have enough overlap, and the percentage
1408 indentity is high enough, and they only have short insertion-deletion
1409 (specified as parameter), they are considered as putative allele pair
1411 Returns: 3 hash references
1412 1. for overlap bases, a 2-D hash, the two keys are the ids of putative alllele pair.
1413 2. for indentity percentage, also 2-D
1414 3. for species, the key is the species of the putative allele pair.
1416 =cut
1418 sub get_allele_candidates {
1419 my $self = shift;
1421 my ( $indel, $overlap );
1422 ( $self->{type} eq 'pep' )
1423 ? ( $indel = '-' x $self->{indel_limit} )
1424 : ( $indel = '---' x $self->{indel_limit} );
1425 ( $self->{type} eq 'pep' )
1426 ? ( $overlap = $self->{sv_overlap} )
1427 : ( $overlap = $self->{sv_overlap} * 3 );
1429 my %al_candidate_ob = ();
1430 my %al_candidate_pi = ();
1431 my %al_candidate_sp = ();
1432 foreach ( my $i = 0 ; $i < @{ $self->{members} } ; $i++ ) {
1433 foreach ( my $j = $i + 1 ; $j < @{ $self->{members} } ; $j++ ) {
1434 ( ( $self->{members}[$i]->get_species() ) ne
1435 ( $self->{members}[$j]->get_species() ) )
1436 and next;
1437 my ( $ol_seq1, $ol_seq2 ) =
1438 $self->{members}[$i]->get_clean_member( $self->{members}[$j] );
1440 ( ( $ol_seq1 =~ /$indel/ ) || ( $ol_seq2 =~ /$indel/ ) )
1441 and next; #skip if the aequence pair have long indel
1443 my ( $self_id, $other_id ) =
1444 ( $self->{members}[$i]->get_id(),
1445 $self->{members}[$j]->get_id() );
1446 my ( $ob, $pi ) =
1447 $self->{members}[$i]
1448 ->calculate_similarity( $self->{members}[$j] );
1449 if ( $ob >= $overlap && ( $pi >= $self->{sv_identity} ) ) {
1450 $al_candidate_ob{$self_id}{$other_id} = $ob;
1451 $pi = sprintf( "%.2f", $pi )
1452 ; #truncate the number to two digits after the decimal point
1453 $al_candidate_pi{$self_id}{$other_id} = $pi;
1454 $al_candidate_sp{$self_id} = $self->{members}[$i]->get_species;
1458 return \%al_candidate_ob, \%al_candidate_pi, \%al_candidate_sp;
1461 =head3 get_overlap_score();
1463 Synopsis: $alignment->get_overlap_score($alignment) where $member is an object of align
1465 Description: Calculate the overlap score of a member of @ members, which is specified as parameter. At a particular position of the target sequence(s1) has an overlap (not a gap) in another sequence (s2), s1 gets 1 point for alignment score. The total alignment score of s1 is the sum of all its non-gap positions.
1467 Returns: an integer
1469 =cut
1471 sub get_overlap_score {
1472 my $self = shift;
1473 my $al_sq = shift;
1475 return unless $self->is_member($al_sq);
1476 return if $al_sq->is_hidden;
1477 my $score = 0;
1478 foreach ( my $i = $self->{start_value} - 1 ;
1479 $i < $self->{end_value} - 1 ; $i++ )
1481 my $base = substr( $al_sq->get_seq(), $i, 1 );
1482 $base eq '-' and next; # if a gap, next
1484 # char in al_sq is not a gap
1485 foreach ( @{ $self->{members} } ) {
1486 $_ == $al_sq and next; # do all sequences except skip $al_sq
1487 next if $_->is_hidden;
1488 my $other_base = substr( $_->get_seq(), $i, 1 );
1489 if ( $other_base ne '-' ) {
1490 $score++;
1494 return $score;
1497 =head3 get_all_overlap_score()
1499 Synopsis: $alignment->get_all_overlap_score()
1501 Description: score of all the non-hiden members in @members
1503 Returns: A hash reference whose key is the id of a @members elements and the value is the overlap score
1505 =cut
1507 sub get_all_overlap_score {
1508 my $self = shift;
1510 my %member_score = ();
1511 foreach ( @{ $self->{members} } ) {
1512 next if $_->is_hidden;
1513 my $score = $self->get_overlap_score($_);
1514 my $id = $_->get_id();
1515 $member_score{$id} = $score;
1517 return \%member_score;
1520 =head3 get_all_medium
1522 Synopsis: $alignment->get_all_medium()
1524 Description: Returns the medium position of each alignment sequence, in hash form
1526 Returns: A hash reference whose key is the id of a @members elements and the value is the medium position
1528 =cut
1530 sub get_all_medium {
1531 my $self = shift;
1533 my %member_medium = ();
1534 foreach ( @{ $self->{members} } ) {
1535 unless ( $_->is_hidden ) {
1536 my $medium = $_->get_medium();
1537 my $id = $_->get_id();
1538 $member_medium{$id} = $medium;
1541 return \%member_medium;
1544 =head3 get_all_range
1546 Synopsis: $alignment->get_all_range()
1548 Description: Output the start and end of characters of each member sequence
1550 Returns: Two hash references whose keys are the id of a @members element
1551 and whose values are the start and end positions, respectively
1552 ($start_pos, $end_pos) = ($start_ref->{$id}, $end_ref->{$id})
1554 =cut
1556 sub get_all_range {
1557 my $self = shift;
1559 my %member_head = ();
1560 my %member_end = ();
1561 foreach ( @{ $self->{members} } ) {
1562 unless ( $_->is_hidden ) {
1563 my ( $head, $end ) = $_->get_range();
1564 my $id = $_->get_id();
1565 $member_head{$id} = $head;
1566 $member_end{$id} = $end;
1569 return \%member_head, \%member_end;
1572 =head3 get_seqs()
1574 Synopsis: $alignment->get_seqs()
1576 Description: Output the alignment sequences (padded with gaps) of @members which are are not hidden, in the range specified by start_value and end_value
1578 Returns: a hash reference whose key is the id of each member and the value is the alignment sequence
1580 =cut
1582 sub get_seqs {
1583 my $self = shift;
1584 ( ! @{ $self->{members} } ) and return;
1586 my %member_seqs = ();
1587 foreach ( @{ $self->{members} } ) {
1588 unless ( $_->is_hidden ) {
1589 my $id = $_->get_id();
1590 my $seq = $_->get_select_seq();
1591 $member_seqs{$id} = $seq;
1594 return \%member_seqs;
1597 =head3 get_nopad_seqs()
1599 Synopsis: $alignment->get_nopad_seqs()
1600 Description: Output the 'original' sequences (with gaps removed) of @members
1601 which are are not hidden, in the range specified by start_value and end_value
1602 Returns: a hash reference whose key is the id of each members and the value is the sequence
1604 =cut
1606 sub get_nopad_seqs {
1607 my $self = shift;
1608 ( ! @{ $self->{members} } ) and return;
1610 my %member_seqs = ();
1611 foreach ( @{ $self->{members} } ) {
1612 unless ( $_->is_hidden ) {
1613 my $id = $_->get_id();
1614 my $seq = $_->get_select_seq();
1615 $seq =~ s/-//g;
1616 $member_seqs{$id} = $seq;
1619 return \%member_seqs;
1622 =head3 get_overlap_seqs()
1624 Synopsis: $alignment-> get_overlap_seqs($max_gaps)
1625 Arguments: $max_gaps specifies a maximum number of gaps allowed in kept columns. Default is zero.
1626 Description: for each non-hidden @members, get the sequences that overlap with all
1627 the other non-hidden @members, in the range from start_value to end_value
1628 Returns: a hash reference whose key is the id of each member and the value is the overlap sequence
1630 =cut
1632 sub get_overlap_seqs {
1633 my $self = shift;
1634 my $max_gaps = shift;
1635 $max_gaps ||= 0;
1636 return unless ( @{ $self->{members} } );
1637 my $cols_removed = 0;
1638 my $cols_kept = 0;
1639 my %overlap_seqs;
1640 foreach (
1641 my $i = $self->{start_value} - 1 ;
1642 $i <= $self->{end_value} - 1 ;
1643 $i++
1646 my %single_base = ();
1647 my $gaps_in_col = 0;
1648 foreach ( @{ $self->{members} } ) {
1649 next if $_->is_hidden; # skip hidden ones
1650 my $id = $_->get_id();
1651 $overlap_seqs{$id} = ""
1652 unless ( $overlap_seqs{$id} )
1653 ; # make sure there is a key/val pair for each id (with val empty string).
1654 my $base = substr( $_->get_seq(), $i, 1 );
1655 $single_base{ $_->get_id() } = $base;
1656 if ( $base eq '-' ) {
1657 $gaps_in_col++;
1658 last if ( $gaps_in_col > $max_gaps );
1662 # $overlap_seqs{$_} = "" unless (defined $overlap_seqs{$_});
1663 if ( $gaps_in_col <= $max_gaps ) {
1664 $cols_kept++;
1665 foreach ( keys %single_base ) {
1667 # $overlap_seqs{$_} = "" unless ($overlap_seqs{$_});
1668 $overlap_seqs{$_} .= $single_base{$_};
1671 else {
1672 $cols_removed++;
1674 # $overlap_seqs{$_} = "" unless (defined $overlap_seqs{$_});
1678 # print "in get_overlap_seqs. cols kept, removed: ", $cols_kept, " ", $cols_removed, "\n";
1679 return \%overlap_seqs;
1682 =head3 get_overlap_num()
1684 Synopsis: $alignment->get_overlap_num()
1686 Description: count the number of bases that overlap between all the non-
1687 hidden members, in the range from start_value to end_value
1689 Returns: an integer
1691 =cut
1693 sub get_overlap_num {
1694 my $self = shift;
1695 if (0) { # old way. works
1696 my $overlap_count = 0;
1697 return unless @{ $self->{members} };
1699 #Get the first non-hidden member in @members as a basis for comparison
1700 my $select;
1701 foreach ( @{ $self->{members} } ) {
1702 next if $_->is_hidden;
1703 $select = $_;
1704 last;
1707 foreach (
1708 my $i = $self->{start_value} - 1 ;
1709 $i <= $self->{end_value} - 1 ;
1710 $i++
1713 my $base = substr( $select->get_seq(), $i, 1 );
1714 next if $base eq '-';
1715 my $pause = 0;
1716 foreach ( @{ $self->{members} } ) {
1717 next if $_ == $select; # skip $select
1718 next if $_->is_hidden;
1719 my $other_base = substr( $_->get_seq(), $i, 1 );
1720 if ( $other_base eq '-' ) {
1721 $pause = 1;
1722 last;
1725 $overlap_count++ if ( $pause == 0 );
1727 return $overlap_count;
1729 else
1730 { # new way. also works but uses get_overlap_cols, which is more general as can allow >0 gaps in a col.
1731 my $overlap_cols = $self->get_overlap_cols();
1732 return $overlap_cols;
1736 =head3 get_overlap_cols()
1738 Synopsis: $alignment->get_overlap_cols($max_gaps)
1740 Description: Like get_overlap_num, but can allow some number ($max_gaps) of gaps in a column (counting non-hidden members only).,
1741 instead of requiring this be strictly zero. ($max_gaps is 0 by default).
1743 Returns: The number of columns with no more than $max_gaps gaps in them.
1744 =cut
1746 sub get_overlap_cols {
1747 my $self = shift;
1748 my $max_gaps = shift
1749 ; # this is the max number of gaps allowed in a column which is kept in the overlap
1750 $max_gaps = 0 unless ( defined $max_gaps );
1751 return unless @{ $self->{members} }; # return if no sequences
1753 my $ncols = $self->{end_value} - $self->{start_value} + 1;
1754 my $gappy_cols = 0;
1755 foreach (
1756 my $i = $self->{start_value} - 1 ;
1757 $i <= $self->{end_value} - 1 ;
1758 $i++
1761 my $col_gaps = 0;
1762 foreach ( @{ $self->{members} } ) {
1763 next if $_->is_hidden;
1764 my $base = substr( $_->get_seq(), $i, 1 );
1765 if ( $base eq '-' ) {
1766 $col_gaps++;
1767 if ( $col_gaps > $max_gaps ) {
1768 $gappy_cols++;
1769 last;
1774 return $ncols - $gappy_cols;
1777 =head3 get_overlap_cols_nongapchars()
1779 Synopsis: $alignment->get_overlap_cols_nongapchars($max_gaps)
1781 Returns: A list of (The number of columns with no more than $max_gaps gaps in them, the number of non-gap characters in those columns)
1782 =cut
1784 sub get_overlap_cols_nongapchars{ # gets the number of columns with <= max_gaps, and also the number of nongap chars in those columns.
1785 my $self = shift;
1786 my $max_gaps = shift; # this is the max number of gaps allowed in a column which is kept in the overlap
1787 $max_gaps = 0 unless(defined $max_gaps);
1788 my $overlap_cols = 0;
1789 my $overlap_nongap_chars = 0;
1790 return unless @{$self->{members}}; # return if no sequences
1792 foreach (my $i = $self->{start_value} - 1; $i <= $self->{end_value} - 1; $i++) {
1793 my $col_gaps = 0;
1794 my $n_seq_nonhidden = 0;
1795 foreach (@{$self->{members}}) {
1796 next if $_->is_hidden;
1797 $n_seq_nonhidden++;
1798 my $base = substr($_->get_seq(), $i, 1);
1799 if ($base eq '-') {
1800 $col_gaps++;
1803 if ($col_gaps <= $max_gaps) {
1804 $overlap_cols++;
1805 $overlap_nongap_chars += $n_seq_nonhidden - $col_gaps;
1808 return ($overlap_cols, $overlap_nongap_chars);
1811 =head3 get_ngap_pct()
1813 Synopsis: $alignment->get_ngap_pct()
1815 Description: go from start_value to end_value, get the percentage
1816 coverage by @members. A position is covered by a member when it has a
1817 non gap at the position.
1819 Returns: a hash reference whose key is the position and values are the
1820 percentage coverage
1822 =cut
1824 sub get_ngap_pct {
1825 my $self = shift;
1826 my %value_hash = ();
1827 my %conservation_hash = ();
1828 my %type_hash = ();
1830 ( !@{ $self->{members} } ) and return;
1832 my $total_nhidden_member = 0;
1833 foreach ( @{ $self->{members} } ) {
1834 next if $_->is_hidden;
1835 $total_nhidden_member++;
1838 foreach ( my $i = $self->{start_value} - 1 ;
1839 $i < $self->{end_value} ; $i++ )
1841 my $ngap_count = 0;
1842 foreach ( @{ $self->{members} } ) {
1843 next if $_->is_hidden;
1844 my $seq = $_->get_seq();
1845 my $base = substr( $seq, $i, 1 );
1846 ( $base ne '-' ) and $ngap_count++;
1848 my $pct = $ngap_count / $total_nhidden_member * 100;
1849 $value_hash{$i} = sprintf( "%.2f", $pct );
1851 if ( $pct > 0 ) {
1852 my %na_count = ();
1853 my %type_count = ();
1854 my $base_count = 0;
1855 foreach ( @{ $self->{members} } ) {
1856 next if $_->is_hidden();
1857 my $base = substr( $_->get_seq(), $i, 1 );
1858 $base_count++;
1859 unless ( $base eq '-' ) {
1860 $na_count{$base}++;
1861 my $type = $self->_get_aa_type($base);
1862 $type_count{$type}++;
1866 my $max_ind = 0;
1867 while ( my ( $base, $count ) = each %na_count ) {
1868 $max_ind = $count if ( $count > $max_ind );
1870 my $max_type = 0;
1871 while ( my ( $type, $count ) = each %type_count ) {
1872 $max_type = $count if ( $count > $max_type );
1874 if ( $max_ind > 1 ) {
1875 $conservation_hash{$i} =
1876 sprintf( "%.2f", ( $max_ind / $base_count ) * 100 );
1878 if ( $max_type > 1 && $self->{type} eq 'pep' ) {
1879 $type_hash{$i} =
1880 sprintf( "%.2f", ( $max_type / $base_count ) * 100 );
1883 else {
1884 $conservation_hash{$i} = 0;
1885 $type_hash{$i} = 0;
1888 return \%value_hash, \%conservation_hash, \%type_hash;
1891 =head3 get_all_nogap_length()
1893 Synopsis: $alignment->get_all_nogap_length()
1895 Description: Go from start_value to end_value, get the sequence length
1896 without gap of @members...
1898 Returns: A hash reference whose key is the id and value is the length
1900 =cut
1902 sub get_all_nogap_length {
1903 my $self = shift;
1904 my %member_ng_len = ();
1905 foreach ( @{ $self->{members} } ) {
1906 next if $_->is_hidden();
1907 $member_ng_len{ $_->get_id() } = $_->get_nogap_length();
1909 return \%member_ng_len;
1912 =head3 get_conserved_seq()
1914 Synopsis: $alignment->get_conserved_seq()
1916 Description: go through each postion from start_value to end_value of
1917 non-hidden member of @members. If all members have the same seq at the
1918 position, get the seq, otherwise put a gap (-) in the position.
1920 Returns: s string of sequence
1922 =cut
1924 sub get_conserved_seq {
1925 my $self = shift;
1926 my $seq;
1928 return unless ( @{ $self->{members} } );
1930 my $total_nhidden_member = 0;
1931 foreach ( @{ $self->{members} } ) {
1932 next if $_->is_hidden;
1933 $total_nhidden_member++;
1935 return if ( $total_nhidden_member == 0 );
1937 foreach ( my $i = $self->{start_value} - 1 ;
1938 $i < $self->{end_value} ; $i++ )
1940 my %na_count = ();
1941 my $base_count = 0;
1942 my $conserved_base = '-';
1943 foreach ( @{ $self->{members} } ) {
1944 next if $_->is_hidden();
1945 my $base = substr( $_->get_seq(), $i, 1 );
1946 unless ( $base eq '-' ) {
1947 $base_count++;
1948 $na_count{$base}++;
1951 ($conserved_base) = ( keys %na_count )
1952 if ( ( int( keys %na_count ) == 1 ) && ( $base_count > 1 ) );
1953 $seq .= $conserved_base;
1956 #fill in the positions before the start value with gap characters
1957 $seq = '-' x ( $self->{start_value} - 1 ) . $seq;
1958 return $seq;
1961 sub _get_aa_type {
1962 my ( $self, $aa ) = @_;
1963 $aa = uc($aa);
1964 return unless ( length($aa) == 1 );
1965 return "acidic" if ( $aa =~ /D|E/ );
1966 return "basic" if ( $aa =~ /R|H|K/ );
1967 return "polar" if ( $aa =~ /N|C|Q|S|T/ );
1968 return "nonpolar" if ( $aa =~ /A|G|I|L|M|F|P|W|Y|V/ );
1971 sub highlight_domains {
1972 my $self = shift;
1974 #Retrieved from storable hash of gene objects, saving resources when possible:
1975 my $genes = shift;
1977 $genes ||= {};
1979 my ( $start_value, $end_value ) =
1980 ( $self->{start_value}, $self->{end_value} );
1981 return unless $self->{type} eq "pep";
1982 my %domains; #general domain information
1983 my %correct_length;
1984 my $sigpos_found = 0; #flag for displaying legend item
1986 my %member_domains; # member_id => array_ref of DB-derived domains
1988 foreach my $m ( @{ $self->{members} } ) {
1989 my $id = $m->get_id();
1990 my $gene = $genes->{$id};
1991 unless ($gene) {
1992 eval {
1993 $gene = CXGN::Tools::Gene->new($id);
1994 $gene->fetch("protein_length");
1995 $gene->fetch_sigp();
1996 $gene->fetch_dom();
1998 $self->{page}->log($@) if $@ && ref( $self->{page} );
1999 $genes->{$id} = $gene;
2003 #Test the input sequences against protein lengths in the database.
2004 #If they're not the same, the user put in a fragment, or incorrect
2005 #data. Thus, we won't display misleading domain information.
2006 foreach my $m ( @{ $self->{members} } ) {
2007 my $id = $m->get_id();
2008 my $db_seq_len = 0;
2009 my $gene = $genes->{$id};
2010 next unless $gene;
2011 $db_seq_len = length( $gene->get_sequence('protein') );
2012 my $prot_seq = $m->get_ungapped_seq;
2013 $prot_seq =~ s/X$//; #stop codon may be translated to X
2014 ( length($prot_seq) == $db_seq_len )
2015 ? ( $correct_length{$id} = 1 )
2016 : ( $correct_length{$id} = 0 );
2019 #Iterate through members, collecting domain information that will determine
2020 #the predominant domains for which to show colors. Also, determine signal
2021 #peptide and cleavage point.
2022 foreach my $member ( @{ $self->{members} } ) {
2023 next if $member->is_hidden();
2025 my $id = $member->{id};
2027 next unless $correct_length{$id};
2028 my $gene = $genes->{$id};
2029 next unless $gene;
2031 #Grab signal peptide
2032 my ( $sigpos, $cleavage );
2033 $sigpos = $gene->isSignalPositive();
2034 $cleavage = $gene->getCleavagePosition();
2035 my ( $lb, $rb ) =
2036 $member->translate_positions_to_seq( $start_value, $end_value );
2037 unless ( !$sigpos || $lb > ( $cleavage + 1 ) ) {
2038 my @pepcolor = ( 40, 180, 40 );
2039 $member->add_region( "Signal Peptide",
2040 1, $cleavage - 1, \@pepcolor )
2041 if $sigpos;
2042 $sigpos_found = 1;
2045 #Grab Domains
2046 my @domains = $gene->getDomains;
2047 $member_domains{$id} = \@domains;
2049 foreach (@domains) {
2050 my ( $lb, $rb ) =
2051 $member->translate_positions_to_seq( $start_value, $end_value );
2052 unless ( ( defined $rb && $_->{dom_start} > $rb )
2053 || ( $_->{dom_end} < $lb ) )
2055 $domains{ $_->{dom_desc} }->{count}++;
2056 $domains{ $_->{dom_desc} }->{length_tally} +=
2057 ( $_->{dom_end} - $_->{dom_start} );
2058 $domains{ $_->{dom_desc} }->{tooltip} = $_->{dom_full_desc}
2059 if $_->{dom_full_desc};
2060 $domains{ $_->{dom_desc} }->{url} =
2061 identifier_url( $_->{dom_interpro_id} );
2063 else {
2064 my $debug =
2065 "Dom-start: "
2066 . $_->{dom_start} . "\n"
2067 . "Dom-end: "
2068 . $_->{dom_end}
2069 . "\nRb: $rb\nLb: $lb";
2071 #die $debug;
2076 $self->add_legend_item( "Signal Peptide", [ 40, 180, 40 ] )
2077 if $sigpos_found;
2079 my @limited_domain_colors = (
2080 [ 130, 20, 120 ],
2081 [ 20, 10, 80 ],
2082 [ 30, 100, 30 ],
2083 [ 70, 90, 10 ],
2084 [ 130, 140, 0 ],
2085 [ 170, 80, 20 ],
2086 [ 170, 10, 10 ],
2087 [ 70, 0, 70 ],
2088 [ 120, 70, 30 ],
2089 [ 130, 0, 50 ]
2092 my $limited_count = scalar @limited_domain_colors;
2094 my $other_domain_color = [ 140, 140, 140 ]; #everything else
2095 my %domain_color;
2096 my $color_limit_reached = 0;
2097 foreach my $domain (
2098 sort { $domains{$b}->{length_tally} <=> $domains{$a}->{length_tally} }
2099 keys %domains )
2101 if (@limited_domain_colors) {
2102 $domain_color{$domain} = shift @limited_domain_colors;
2104 else {
2105 $color_limit_reached = 1;
2109 #Second domain iteration. This time, we know which domains are assigned to
2110 #which colors, so we can set the highlighting.
2111 foreach my $m ( @{ $self->{members} } ) {
2112 next if $m->is_hidden();
2113 my $id = $m->{id};
2114 next unless $correct_length{$id};
2116 my @domains = @{ $member_domains{$id} };
2117 foreach (@domains) {
2118 my $desc = $_->{dom_desc};
2119 my $color = $domain_color{$desc};
2120 $color ||= $other_domain_color;
2121 $m->add_region( $desc, $_->{dom_start}, $_->{dom_end}, $color );
2125 while ( my ( $desc, $color ) = each %domain_color ) {
2126 $self->add_legend_item(
2127 $desc, $color,
2128 $domains{$desc}->{url},
2129 $domains{$desc}->{tooltip}
2132 if ($color_limit_reached) {
2133 my $overage = ( ( scalar keys %domains ) - $limited_count );
2134 $self->add_legend_item( "Other ($overage)", $other_domain_color );
2136 return $genes; #return hashref to be stored by script, optionally
2139 sub print_to_file {
2140 my $self = shift;
2141 my $file = shift;
2142 open( WF, ">$file" ) or die("Can't open file '$file' for writing");
2143 print WF $self->to_fasta();
2146 sub print_to_stdout {
2147 my $self = shift;
2148 print $self->to_fasta();
2151 sub to_fasta {
2152 my $self = shift;
2153 my $string = "";
2154 foreach my $m ( @{ $self->{members} } ) {
2155 $string .= $m->to_fasta . "\n";
2157 return $string;
2161 sub run_muscle {
2162 my $self = shift;
2163 my @local_run_output = "";
2164 my $command_line = "";
2165 my $path = $self->get_tmp_dir();
2166 my $run = shift;
2167 $run ||= "local";
2169 my $maxiters = 2;
2170 $maxiters = $self->{muscle}->{maxiters} if ref $self->{muscle};
2171 $maxiters ||= 2;
2173 if ( $run eq "local" ) {
2174 my @t = `which muscle`;
2175 my $mt = $t[0];
2176 chomp $mt;
2177 unless ( -x $mt ) {
2178 warn
2179 "Program 'muscle' not available, alignment will not be performed.";
2180 return;
2182 my $wd = &Cwd::cwd();
2183 chomp $wd;
2184 my $temp_file = File::Temp->new(
2185 TEMPLATE => 'unaligned-XXXXXXX',
2186 UNLINK => 1,
2187 DIR => $path,
2189 $self->print_to_file( $temp_file->filename );
2190 my $result_file = $temp_file . ".aligned.fasta";
2191 chdir $path;
2192 $command_line =
2193 "muscle -in $temp_file -out $result_file -maxiters $maxiters";
2194 print STDERR "Running: $command_line\n";
2195 @local_run_output = `$command_line `;
2197 my $aligned = __PACKAGE__->new( { from_file => $result_file } );
2199 $self->{members} = $aligned->{members};
2200 $self->{seq_length} = $aligned->{seq_length};
2202 system "rm $result_file";
2203 chdir $wd;
2206 if ( $run eq "cluster" ) {
2207 die 'cluster runs no longer supported in this module';