1 package Bio
::SimpleAlign
;
8 use Bio
::LocatableSeq
; # uses Seq's as list
10 use Bio
::SeqFeature
::Generic
;
12 use parent
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
14 # BioPerl module for SimpleAlign
16 # Please direct questions and support issues to <bioperl-l@bioperl.org>
18 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
20 # Copyright Ewan Birney
22 # You may distribute this module under the same terms as perl itself
24 # POD documentation - main docs before the code
27 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
28 # May 2001 major rewrite - Heikki Lehvaslaiho
32 Bio::SimpleAlign - Multiple alignments held as a set of sequences
36 # Use Bio::AlignIO to read in the alignment
37 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
38 $aln = $str->next_aln();
42 print $aln->num_residues;
44 print $aln->num_sequences;
46 print $aln->percentage_identity;
47 print $aln->consensus_string(50);
49 # Find the position in the alignment for a sequence location
50 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
52 # Extract sequences and check values for the alignment column $pos
53 foreach $seq ($aln->each_seq) {
54 $res = $seq->subseq($pos, $pos);
57 foreach $res (keys %count) {
58 printf "Res: %s Count: %2d\n", $res, $count{$res};
62 $aln->remove_seq($seq);
63 $mini_aln = $aln->slice(20,30); # get a block of columns
64 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
65 $new_aln = $aln->remove_columns([20,30]); # remove by position
66 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
69 $str = $aln->consensus_string($threshold_percent);
70 $str = $aln->match_line();
71 $str = $aln->cigar_line();
72 $id = $aln->percentage_identity;
74 # See the module documentation for details and more methods.
78 SimpleAlign is an object that handles a multiple sequence alignment
79 (MSA). It is very permissive of types (it does not insist on sequences
80 being all same length, for example). Think of it as a set of sequences
81 with a whole series of built-in manipulations and methods for reading and
84 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
85 to store its sequences. These are subsequences with a start and end
86 positions in the parent reference sequence. Each sequence in the
87 SimpleAlign object is a Bio::LocatableSeq.
89 SimpleAlign expects the combination of name, start, and end for a
90 given sequence to be unique in the alignment, and this is the key for the
91 internal hashes (name, start, end are abbreviated C<nse> in the code).
92 However, in some cases people do not want the name/start-end to be displayed:
93 either multiple names in an alignment or names specific to the alignment
94 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
95 C<displayname>, and generally is what is used to print out the
96 alignment. They default to name/start-end.
98 The SimpleAlign Module is derived from the Align module by Ewan Birney.
104 User feedback is an integral part of the evolution of this and other
105 Bioperl modules. Send your comments and suggestions preferably to one
106 of the Bioperl mailing lists. Your participation is much appreciated.
108 bioperl-l@bioperl.org - General discussion
109 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
113 Please direct usage questions or support issues to the mailing list:
115 I<bioperl-l@bioperl.org>
117 rather than to the module maintainer directly. Many experienced and
118 reponsive experts will be able look at the problem and quickly
119 address it. Please include a thorough description of the problem
120 with code and data examples if at all possible.
122 =head2 Reporting Bugs
124 Report bugs to the Bioperl bug tracking system to help us keep track
125 the bugs and their resolution. Bug reports can be submitted via the
128 https://github.com/bioperl/bioperl-live/issues
132 Ewan Birney, birney@ebi.ac.uk
136 Allen Day, allenday-at-ucla.edu,
137 Richard Adams, Richard.Adams-at-ed.ac.uk,
138 David J. Evans, David.Evans-at-vir.gla.ac.uk,
139 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
140 Allen Smith, allens-at-cpan.org,
141 Jason Stajich, jason-at-bioperl.org,
142 Anthony Underwood, aunderwood-at-phls.org.uk,
143 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
144 Brian Osborne, bosborne at alum.mit.edu
145 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
146 Hongyu Zhang, forward at hongyu.org
147 Jay Hannah, jay at jays.net
148 Alexandr Bezginov, albezg at gmail.com
156 The rest of the documentation details each of the object
157 methods. Internal methods are usually preceded with a _
161 ## This data should probably be in a more centralized module...
162 ## it is taken from Clustalw documentation.
163 ## These are all the positively scoring groups that occur in the
164 ## Gonnet Pam250 matrix. The strong and weak groups are
165 ## defined as strong score >0.5 and weak score =<0.5 respectively.
166 our %CONSERVATION_GROUPS = (
167 'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
168 'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
175 Usage : my $aln = Bio::SimpleAlign->new();
176 Function : Creates a new simple align object
177 Returns : Bio::SimpleAlign
178 Args : -source => string representing the source program
179 where this alignment came from
180 -annotation => Bio::AnnotationCollectionI
181 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
182 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
183 -consensus => consensus string
184 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
190 my($class,@args) = @_;
192 my $self = $class->SUPER::new
(@args);
194 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
207 $src && $self->source($src);
208 defined $score && $self->score($score);
209 # we need to set up internal hashs first!
211 $self->{'_seq'} = {};
212 $self->{'_order'} = {};
213 $self->{'_start_end_lists'} = {};
214 $self->{'_dis_name'} = {};
215 $self->{'_id'} = 'NoName';
216 # maybe we should automatically read in from args. Hmmm...
217 $id && $self->id($id);
218 $acc && $self->accession($acc);
219 $desc && $self->description($desc);
220 $coll && $self->annotation($coll);
221 # sequence annotation is layered into a provided annotation collection (or dies)
223 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
224 "with a sequence annotation collection")
226 $coll->add_Annotation('seq_annotation', $sa);
228 if ($feats && ref $feats eq 'ARRAY') {
229 for my $feat (@
$feats) {
230 $self->add_SeqFeature($feat);
233 $con && $self->consensus($con);
234 $cmeta && $self->consensus_meta($cmeta);
235 # assumes these are in correct alignment order
236 if ($seqs && ref($seqs) eq 'ARRAY') {
237 for my $seq (@
$seqs) {
238 $self->add_seq($seq);
242 return $self; # success - we hope!
245 =head1 Modifier methods
247 These methods modify the MSA by adding, removing or shuffling complete
253 Usage : $myalign->add_seq($newseq);
254 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
255 Function : Adds another sequence to the alignment. *Does not* align
256 it - just adds it to the hashes.
257 If -ORDER is specified, the sequence is inserted at the
258 the position spec'd by -ORDER, and existing sequences
259 are pushed down the storage array.
261 Args : A Bio::LocatableSeq object
262 Positive integer for the sequence position (optional)
264 See L<Bio::LocatableSeq> for more information
270 Carp
::carp
("addSeq - deprecated method. Use add_seq() instead.");
277 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
278 my ($name,$id,$start,$end);
281 $self->throw("LocatableSeq argument required");
283 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
284 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
286 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
287 $order--; # jay's patch (user-specified order is 1-origin)
290 $self->throw("User-specified value for ORDER must be >= 1");
293 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
295 # build the symbol list for this sequence,
296 # will prune out the gap and missing/match chars
297 # when actually asked for the symbol list in the
299 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
301 $name = $seq->get_nse;
303 if( $self->{'_seq'}->{$name} ) {
304 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
307 $self->debug( "Assigning $name to $order\n");
309 my $ordh = $self->{'_order'};
310 if ($ordh->{$order}) {
311 # make space to insert
312 # $c->() returns (in reverse order) the first subsequence
313 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
314 # (3,2,1), and $c->(2,4,5) returns (2).
316 $c = sub { return (($_[1]-$_[0] == 1) ?
($c->(@_[1..$#_]),$_[0]) : $_[0]); };
318 $ordh->{$_+1} = $ordh->{$_}
319 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
322 $ordh->{$order} = $name;
324 unless( exists( $self->{'_start_end_lists'}->{$id})) {
325 $self->{'_start_end_lists'}->{$id} = [];
327 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
330 $self->{'_seq'}->{$name} = $seq;
338 Usage : $aln->remove_seq($seq);
339 Function : Removes a single sequence from an alignment
341 Argument : a Bio::LocatableSeq object
347 Carp
::carp
("removeSeq - deprecated method. Use remove_seq() instead.");
348 $self->remove_seq(@_);
356 $self->throw("Need Bio::Locatable seq argument ")
357 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
360 $name = $seq->get_nse;
362 if( !exists $self->{'_seq'}->{$name} ) {
363 $self->throw("Sequence $name does not exist in the alignment to remove!");
366 delete $self->{'_seq'}->{$name};
368 # we need to remove this seq from the start_end_lists hash
370 if (exists $self->{'_start_end_lists'}->{$id}) {
371 # we need to find the sequence in the array.
374 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
375 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
381 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
384 $self->throw("Could not find the sequence to remoce from the start-end list");
388 $self->throw("There is no seq list for the name $id");
390 # we need to shift order hash
391 my %rev_order = reverse %{$self->{'_order'}};
392 my $no = $rev_order{$name};
393 my $num_sequences = $self->num_sequences;
394 for (; $no < $num_sequences; $no++) {
395 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
397 delete $self->{'_order'}->{$no};
405 Usage : $aln->purge(0.7);
406 Function: Removes sequences above given sequence similarity
407 This function will grind on large alignments. Beware!
409 Returns : An array of the removed sequences
410 Args : float, threshold for similarity
415 my ($self,$perc) = @_;
416 my (%duplicate, @dups);
418 my @seqs = $self->each_seq();
420 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
423 #skip if already in duplicate hash
424 next if exists $duplicate{$seq->display_id} ;
425 my $one = $seq->seq();
427 my @one = split '', $one; #split to get 1aa per array element
429 for (my $j=$i+1;$j < @seqs;$j++) {
430 my $seq2 = $seqs[$j];
432 #skip if already in duplicate hash
433 next if exists $duplicate{$seq2->display_id} ;
435 my $two = $seq2->seq();
436 my @two = split '', $two;
440 for (my $k=0;$k<@one;$k++) {
441 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
442 $one[$k] eq $two[$k]) {
445 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
446 $two[$k] ne '.' && $two[$k] ne '-' ) {
452 $ratio = $count/$res unless $res == 0;
454 # if above threshold put in duplicate hash and push onto
455 # duplicate array for returning to get_unique
456 if ( $ratio > $perc ) {
457 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
458 $duplicate{$seq2->display_id} = 1;
463 foreach my $seq (@dups) {
464 $self->remove_seq($seq);
469 =head2 sort_alphabetically
471 Title : sort_alphabetically
472 Usage : $ali->sort_alphabetically
473 Function : Changes the order of the alignment to alphabetical on name
474 followed by numerical by number.
480 sub sort_alphabetically
{
482 my ($seq,$nse,@arr,%hash,$count);
484 foreach $seq ( $self->each_seq() ) {
485 $nse = $seq->get_nse;
491 %{$self->{'_order'}} = (); # reset the hash;
493 foreach $nse ( sort _alpha_startend
keys %hash) {
494 $self->{'_order'}->{$count} = $nse;
504 Usage : $aln_ordered=$aln->sort_by_list($list_file)
505 Function : Arbitrarily order sequences in an alignment
506 Returns : A new Bio::SimpleAlign object
507 Argument : a file listing sequence names in intended order (one name per line)
512 my ($self, $list) = @_;
513 my (@seq, @ids, %order);
515 foreach my $seq ( $self->each_seq() ) {
517 push @ids, $seq->display_id;
521 open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
525 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
530 # use the map-sort-map idiom:
531 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
532 my $aln = $self->new;
533 foreach (@sorted) { $aln->add_seq($_) }
537 =head2 set_new_reference
539 Title : set_new_reference
540 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
541 the sequence whoes name is "B31" (full, exact, and case-sensitive),
542 as the reference (1st) sequence
543 Function : Change/Set a new reference (i.e., the first) sequence
544 Returns : a new Bio::SimpleAlign object.
545 Throws an exception if designated sequence not found
546 Argument : a positive integer of sequence order, or a sequence name
547 in the original alignment
551 sub set_new_reference
{
552 my ($self, $seqid) = @_;
553 my $aln = $self->new;
554 my (@seq, @ids, @new_seq);
556 foreach my $seq ( $self->each_seq() ) {
558 push @ids, $seq->display_id;
561 if ($seqid =~ /^\d+$/) { # argument is seq position
563 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
564 } else { # argument is a seq name
565 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
568 for (my $i=0; $i<=$#seq; $i++) {
570 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
571 unshift @new_seq, $seq[$i];
573 push @new_seq, $seq[$i];
576 foreach (@new_seq) { $aln->add_seq($_); }
580 sub _in_aln
{ # check if input name exists in the alignment
581 my ($str, $ref) = @_;
583 return 1 if $str eq $_;
592 Usage : $aln->uniq_seq(): Remove identical sequences in
593 in the alignment. Ambiguous base ("N", "n") and
594 leading and ending gaps ("-") are NOT counted as
596 Function : Make a new alignment of unique sequence types (STs)
597 Returns : 1a. if called in a scalar context,
598 a new Bio::SimpleAlign object (all sequences renamed as "ST")
599 1b. if called in an array context,
600 a new Bio::SimpleAlign object, and a hashref whose keys
601 are sequence types, and whose values are arrayrefs to
602 lists of sequence ids within the corresponding sequence type
603 2. if $aln->verbose > 0, ST of each sequence is sent to
604 STDERR (in a tabular format)
610 my ($self, $seqid) = @_;
611 my $aln = $self->new;
612 my (%member, %order, @seq, @uniq_str, $st);
614 my $len = $self->length();
616 foreach my $seq ( $self->each_seq() ) {
617 my $str = $seq->seq();
619 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
620 # comparing two sequence strings
622 # 1st, convert "n", "N" to "?" (for DNA sequence only):
623 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
624 # 2nd, convert leading and ending gaps to "?":
625 $str = &_convert_leading_ending_gaps
($str, '-', '?');
626 # Note that '?' also can mean unknown residue.
627 # I don't like making global class member changes like this, too
628 # prone to errors... -- cjfields 08-11-18
629 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '-\?';
630 my $new = Bio
::LocatableSeq
->new(
632 -alphabet
=> $seq->alphabet,
634 -start
=> $seq->start,
640 foreach my $seq (@seq) {
641 my $str = $seq->seq();
642 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
643 if ($seen) { # seen before
644 my @memb = @
{$member{$key}};
646 $member{$key} = \
@memb;
648 push @uniq_str, $key;
650 $member{$key} = [ ($seq) ];
651 $order{$key} = $order;
655 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
656 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
657 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
658 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
659 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
661 my $end= CORE
::length($str2);
662 $end -= CORE
::length($1) while $str2 =~ m/($gap+)/g;
663 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
669 foreach (@
{$member{$str}}) {
670 push @
{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
671 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
674 return wantarray ?
($aln, $st) : $aln;
677 sub _check_uniq
{ # check if same seq exists in the alignment
678 my ($str1, $ref, $length) = @_;
679 my @char1=split //, $str1;
682 return (0, $str1) if @array==0; # not seen (1st sequence)
684 foreach my $str2 (@array) {
686 my @char2=split //, $str2;
687 for (my $i=0; $i<=$length-1; $i++) {
688 next if $char1[$i] eq '?';
689 next if $char2[$i] eq '?';
690 $diff++ if $char1[$i] ne $char2[$i];
692 return (1, $str2) if $diff == 0; # seen before
695 return (0, $str1); # not seen
698 sub _convert_leading_ending_gaps
{
702 my @array=split //, $s;
703 # convert leading char:
704 for (my $i=0; $i<=$#array; $i++) {
705 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
707 # convert ending char:
708 for (my $i = $#array; $i>= 0; $i--) {
709 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
711 my $s_new=join '', @array;
715 =head1 Sequence selection methods
717 Methods returning one or more sequences objects.
722 Usage : foreach $seq ( $align->each_seq() )
723 Function : Gets a Seq object from the alignment
731 Carp
::carp
("eachSeq - deprecated method. Use each_seq() instead.");
739 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
740 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
741 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
748 =head2 each_alphabetically
750 Title : each_alphabetically
751 Usage : foreach $seq ( $ali->each_alphabetically() )
752 Function : Returns a sequence object, but the objects are returned
753 in alphabetically sorted order.
754 Does not change the order of the alignment.
760 sub each_alphabetically
{
762 my ($seq,$nse,@arr,%hash,$count);
764 foreach $seq ( $self->each_seq() ) {
765 $nse = $seq->get_nse;
769 foreach $nse ( sort _alpha_startend
keys %hash) {
770 push(@arr,$hash{$nse});
775 sub _alpha_startend
{
776 my ($aname,$astart,$bname,$bstart);
777 ($aname,$astart) = split (/-/,$a);
778 ($bname,$bstart) = split (/-/,$b);
780 if( $aname eq $bname ) {
781 return $astart <=> $bstart;
784 return $aname cmp $bname;
788 =head2 each_seq_with_id
790 Title : each_seq_with_id
791 Usage : foreach $seq ( $align->each_seq_with_id() )
792 Function : Gets a Seq objects from the alignment, the contents
793 being those sequences with the given name (there may be
796 Argument : a seq name
802 Carp
::carp
("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
803 $self->each_seq_with_id(@_);
806 sub each_seq_with_id
{
810 $self->throw("Method each_seq_with_id needs a sequence name argument")
815 if (exists($self->{'_start_end_lists'}->{$id})) {
816 @arr = @
{$self->{'_start_end_lists'}->{$id}};
821 =head2 get_seq_by_pos
823 Title : get_seq_by_pos
824 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
825 Function : Gets a sequence based on its position in the alignment.
826 Numbering starts from 1. Sequence positions larger than
827 num_sequences() will throw an error.
828 Returns : a Bio::LocatableSeq object
829 Args : positive integer for the sequence position
838 $self->throw("Sequence position has to be a positive integer, not [$pos]")
839 unless $pos =~ /^\d+$/ and $pos > 0;
840 $self->throw("No sequence at position [$pos]")
841 unless $pos <= $self->num_sequences ;
843 my $nse = $self->{'_order'}->{--$pos};
844 return $self->{'_seq'}->{$nse};
849 Title : get_seq_by_id
850 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
851 Function : Gets a sequence based on its name.
852 Sequences that do not exist will warn and return undef
853 Returns : a Bio::LocatableSeq object
854 Args : string for sequence name
859 my ($self,$name) = @_;
860 unless( defined $name ) {
861 $self->warn("Must provide a sequence name");
864 for my $seq ( values %{$self->{'_seq'}} ) {
865 if ( $seq->id eq $name) {
872 =head2 seq_with_features
874 Title : seq_with_features
875 Usage : $seq = $aln->seq_with_features(-pos => 1,
878 sub { my $consensus = shift;
883 while($consensus =~ /[^?]$q[^?]/){
884 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
890 Function: produces a Bio::Seq object by first splicing gaps from -pos
891 (by means of a splice_by_seq_pos() call), then creating
892 features using non-? chars (by means of a consensus_string()
893 call with stringency -consensus).
894 Returns : a Bio::Seq object
895 Args : -pos : required. sequence from which to build the Bio::Seq
897 -consensus : optional, defaults to consensus_string()'s
899 -mask : optional, a coderef to apply to consensus_string()'s
900 output before building features. this may be useful for
901 closing gaps of 1 bp by masking over them with N, for
906 sub seq_with_features
{
907 my ($self,%arg) = @_;
909 #first do the preparatory splice
910 $self->throw("must provide a -pos argument") unless $arg{-pos};
911 $self->splice_by_seq_pos($arg{-pos});
913 my $consensus_string = $self->consensus_string($arg{-consensus
});
914 $consensus_string = $arg{-mask
}->($consensus_string)
915 if defined($arg{-mask
});
919 push @bs, 1 if $consensus_string =~ /^[^?]/;
921 while($consensus_string =~ /\?[^?]/g){
922 push @bs, pos($consensus_string);
924 while($consensus_string =~ /[^?]\?/g){
925 push @es, pos($consensus_string);
928 push @es, CORE
::length($consensus_string) if $consensus_string =~ /[^?]$/;
930 my $seq = Bio
::Seq
->new();
932 # my $rootfeature = Bio::SeqFeature::Generic->new(
933 # -source_tag => 'location',
934 # -start => $self->get_seq_by_pos($arg{-pos})->start,
935 # -end => $self->get_seq_by_pos($arg{-pos})->end,
937 # $seq->add_SeqFeature($rootfeature);
939 while(my $b = shift @bs){
941 $seq->add_SeqFeature(
942 Bio
::SeqFeature
::Generic
->new(
943 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
944 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
945 -source_tag
=> $self->source || 'MSA',
954 =head1 Create new alignments
956 The result of these methods are horizontal or vertical subsets of the
962 Usage : $aln2 = $aln->select(1, 3) # three first sequences
963 Function : Creates a new alignment from a continuous subset of
964 sequences. Numbering starts from 1. Sequence positions
965 larger than num_sequences() will throw an error.
966 Returns : a Bio::SimpleAlign object
967 Args : positive integer for the first sequence
968 positive integer for the last sequence to include (optional)
974 my ($start, $end) = @_;
976 $self->throw("Select start has to be a positive integer, not [$start]")
977 unless $start =~ /^\d+$/ and $start > 0;
978 $self->throw("Select end has to be a positive integer, not [$end]")
979 unless $end =~ /^\d+$/ and $end > 0;
980 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
981 unless $start <= $end;
983 my $aln = $self->new;
984 foreach my $pos ($start .. $end) {
985 $aln->add_seq($self->get_seq_by_pos($pos));
988 # fix for meta, sf, ann
992 =head2 select_noncont
994 Title : select_noncont
995 Usage : # 1st and 3rd sequences, sorted
996 $aln2 = $aln->select_noncont(1, 3)
998 # 1st and 3rd sequences, sorted (same as first)
999 $aln2 = $aln->select_noncont(3, 1)
1001 # 1st and 3rd sequences, unsorted
1002 $aln2 = $aln->select_noncont('nosort',3, 1)
1004 Function : Creates a new alignment from a subset of sequences. Numbering
1005 starts from 1. Sequence positions larger than num_sequences() will
1006 throw an error. Sorts the order added to new alignment by default,
1007 to prevent sorting pass 'nosort' as the first argument in the list.
1008 Returns : a Bio::SimpleAlign object
1009 Args : array of integers for the sequences. If the string 'nosort' is
1010 passed as the first argument, the sequences will not be sorted
1011 in the new alignment but will appear in the order listed.
1015 sub select_noncont
{
1019 if ($pos[0] !~ m{^\d+$}) {
1020 my $sortcmd = shift @pos;
1021 if ($sortcmd eq 'nosort') {
1024 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1028 my $end = $self->num_sequences;
1030 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1031 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1034 @pos = sort {$a <=> $b} @pos unless $nosort;
1036 my $aln = $self->new;
1037 foreach my $p (@pos) {
1038 $aln->add_seq($self->get_seq_by_pos($p));
1040 $aln->id($self->id);
1041 # fix for meta, sf, ann
1045 =head2 select_noncont_by_name
1047 Title : select_noncont_by_name
1048 Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1049 Function : Creates a new alignment from a subset of sequences which are
1050 selected by name (sequence ID).
1051 Returns : a Bio::SimpleAlign object
1052 Args : array of names (i.e., identifiers) for the sequences.
1056 sub select_noncont_by_name
{
1057 my ($self, @names) = @_;
1059 my $aln = $self->new;
1060 foreach my $name (@names) {
1061 $aln->add_seq($self->get_seq_by_id($name));
1063 $aln->id($self->id);
1071 Usage : $aln2 = $aln->slice(20,30)
1072 Function : Creates a slice from the alignment inclusive of start and
1073 end columns, and the first column in the alignment is denoted 1.
1074 Sequences with no residues in the slice are excluded from the
1075 new alignment and a warning is printed. Slice beyond the length of
1076 the sequence does not do padding.
1077 Returns : A Bio::SimpleAlign object
1078 Args : Positive integer for start column, positive integer for end column,
1079 optional boolean which if true will keep gap-only columns in the newly
1080 created slice. Example:
1082 $aln2 = $aln->slice(20,30,1)
1088 my ($start, $end, $keep_gap_only) = @_;
1090 $self->throw("Slice start has to be a positive integer, not [$start]")
1091 unless $start =~ /^\d+$/ and $start > 0;
1092 $self->throw("Slice end has to be a positive integer, not [$end]")
1093 unless $end =~ /^\d+$/ and $end > 0;
1094 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1095 unless $start <= $end;
1096 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1097 "[$start] is too big.") if $start > $self->length;
1098 my $cons_meta = $self->consensus_meta;
1099 my $aln = $self->new;
1100 $aln->id($self->id);
1101 foreach my $seq ( $self->each_seq() ) {
1102 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105 -alphabet
=> $seq->alphabet,
1106 -strand
=> $seq->strand,
1107 -verbose
=> $self->verbose) :
1108 Bio
::LocatableSeq
->new
1110 -alphabet
=> $seq->alphabet,
1111 -strand
=> $seq->strand,
1112 -verbose
=> $self->verbose);
1116 $seq_end = $seq->length if( $end > $seq->length );
1118 my $slice_seq = $seq->subseq($start, $seq_end);
1119 $new_seq->seq( $slice_seq );
1121 # Allowed extra characters in string
1122 my $allowed_chars = '';
1123 if (exists $self->{_mask_char
}) {
1124 $allowed_chars = $self->{_mask_char
};
1125 $allowed_chars = quotemeta $allowed_chars;
1127 $slice_seq =~ s/[^\w$allowed_chars]//g;
1130 my $pre_start_seq = $seq->subseq(1, $start - 1);
1131 $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1132 if (!defined($seq->strand)) {
1133 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1134 } elsif ($seq->strand < 0){
1135 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1137 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1140 if ((defined $seq->strand)&&($seq->strand < 0)){
1141 $new_seq->start( $seq->end - CORE
::length($slice_seq) + 1);
1143 $new_seq->start( $seq->start);
1146 if ($new_seq->isa('Bio::Seq::MetaI')) {
1147 for my $meta_name ($seq->meta_names) {
1148 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1151 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1153 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1154 $aln->add_seq($new_seq);
1156 if( $keep_gap_only ) {
1157 $aln->add_seq($new_seq);
1159 my $nse = $seq->get_nse();
1160 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1161 " Sequence excluded from the new alignment.");
1166 my $new = Bio
::Seq
::Meta
->new();
1167 for my $meta_name ($cons_meta->meta_names) {
1168 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1170 $aln->consensus_meta($new);
1172 $aln->annotation($self->annotation);
1173 # fix for meta, sf, ann
1177 =head2 remove_columns
1179 Title : remove_columns
1180 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1181 $aln2 = $aln->remove_columns([0,0],[6,8])
1182 Function : Creates an aligment with columns removed corresponding to
1183 the specified type or by specifying the columns by number.
1184 Returns : Bio::SimpleAlign object
1185 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1186 'all_gaps_columns') or array ref where the referenced array
1187 contains a pair of integers that specify a range.
1188 The first column is 0
1192 sub remove_columns
{
1193 my ($self,@args) = @_;
1194 @args || $self->throw("Must supply column ranges or column types");
1197 if ($args[0][0] =~ /^[a-z_]+$/i) {
1198 $aln = $self->_remove_columns_by_type($args[0]);
1199 } elsif ($args[0][0] =~ /^\d+$/) {
1200 $aln = $self->_remove_columns_by_num(\
@args);
1202 $self->throw("You must pass array references to remove_columns(), not @args");
1204 # fix for meta, sf, ann
1212 Usage : $aln2 = $aln->remove_gaps
1213 Function : Creates an aligment with gaps removed
1214 Returns : a Bio::SimpleAlign object
1215 Args : a gap character(optional) if none specified taken
1216 from $self->gap_char,
1217 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1218 indicates that only all-gaps columns should be deleted
1220 Used from method L<remove_columns> in most cases. Set gap character
1221 using L<gap_char()|gap_char>.
1226 my ($self,$gapchar,$all_gaps_columns) = @_;
1228 if ($all_gaps_columns) {
1229 $gap_line = $self->all_gap_line($gapchar);
1231 $gap_line = $self->gap_line($gapchar);
1233 my $aln = $self->new;
1237 my $del_char = $gapchar || $self->gap_char;
1238 # Do the matching to get the segments to remove
1239 while ($gap_line =~ m/[$del_char]/g) {
1240 my $start = pos($gap_line)-1;
1241 $gap_line =~ m/\G[$del_char]+/gc;
1242 my $end = pos($gap_line)-1;
1244 #have to offset the start and end for subsequent removes
1247 $length += ($end-$start+1);
1248 push @remove, [$start,$end];
1251 #remove the segments
1252 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1253 # fix for meta, sf, ann
1259 my ($self,$aln,$remove) = @_;
1262 my $gap = $self->gap_char;
1264 # splice out the segments and create new seq
1265 foreach my $seq($self->each_seq){
1266 my $new_seq = Bio
::LocatableSeq
->new(
1268 -alphabet
=> $seq->alphabet,
1269 -strand
=> $seq->strand,
1270 -verbose
=> $self->verbose);
1271 my $sequence = $seq->seq;
1272 foreach my $pair(@
{$remove}){
1273 my $start = $pair->[0];
1274 my $end = $pair->[1];
1275 $sequence = $seq->seq unless $sequence;
1276 my $orig = $sequence;
1277 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1278 my $tail = ($end + 1) >= CORE
::length($sequence) ?
'' : substr($sequence, $end + 1);
1279 $sequence = $head.$tail;
1281 unless (defined $new_seq->start) {
1283 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1284 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1287 my $start_adjust = $orig =~ /^$gap+/;
1288 if ($start_adjust) {
1289 $start_adjust = $+[0] == $start;
1291 $new_seq->start($seq->start + $start_adjust);
1295 if (($end + 1) >= CORE
::length($orig)) {
1296 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1297 $new_seq->end($seq->end - (CORE
::length($orig) - $start) + $end_adjust);
1300 $new_seq->end($seq->end);
1304 if ($new_seq->end < $new_seq->start) {
1305 # we removed all columns except for gaps: set to 0 to indicate no
1311 $new_seq->seq($sequence) if $sequence;
1312 push @new, $new_seq;
1314 # add the new seqs to the alignment
1315 foreach my $new(@new){
1316 $aln->add_seq($new);
1318 # fix for meta, sf, ann
1322 sub _remove_columns_by_type
{
1323 my ($self,$type) = @_;
1324 my $aln = $self->new;
1327 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1328 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1329 my %matchchars = ( 'match' => '\*',
1334 'all_gaps_columns' => ''
1336 # get the characters to delete against
1338 foreach my $type (@
{$type}){
1339 $del_char.= $matchchars{$type};
1343 my $match_line = $self->match_line;
1344 # do the matching to get the segments to remove
1346 while($match_line =~ m/[$del_char]/g ){
1347 my $start = pos($match_line)-1;
1348 $match_line=~/\G[$del_char]+/gc;
1349 my $end = pos($match_line)-1;
1351 #have to offset the start and end for subsequent removes
1354 $length += ($end-$start+1);
1355 push @remove, [$start,$end];
1359 # remove the segments
1360 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1361 $aln = $aln->remove_gaps() if $gap;
1362 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1363 # fix for meta, sf, ann
1368 sub _remove_columns_by_num
{
1369 my ($self,$positions) = @_;
1370 my $aln = $self->new;
1372 # sort the positions
1373 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1377 foreach my $pos (@
{$positions}) {
1378 my ($start, $end) = @
{$pos};
1380 #have to offset the start and end for subsequent removes
1383 $length += ($end-$start+1);
1384 push @remove, [$start,$end];
1387 #remove the segments
1388 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1389 # fix for meta, sf, ann
1394 =head1 Change sequences within the MSA
1396 These methods affect characters in all sequences without changing the
1399 =head2 splice_by_seq_pos
1401 Title : splice_by_seq_pos
1402 Usage : $status = splice_by_seq_pos(1);
1403 Function: splices all aligned sequences where the specified sequence
1406 Returns : 1 on success
1407 Args : position of sequence to splice by
1412 sub splice_by_seq_pos
{
1413 my ($self,$pos) = @_;
1415 my $guide = $self->get_seq_by_pos($pos);
1416 my $guide_seq = $guide->seq;
1418 $guide_seq =~ s/\./\-/g;
1422 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1423 unshift @gaps, $pos;
1427 foreach my $seq ($self->each_seq){
1428 my @bases = split '', $seq->seq;
1430 splice(@bases, $_, 1) foreach @gaps;
1431 $seq->seq(join('', @bases));
1440 Usage : $ali->map_chars('\.','-')
1441 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1444 Note that the first argument is interpreted as a regexp
1445 so be careful and escape any wild card characters (e.g.
1446 do $ali->map_chars('\.','-') to replace periods with dashes.
1447 Returns : 1 on success
1448 Argument : A regexp and a string
1458 $self->throw("Need two arguments: a regexp and a string")
1459 unless defined $from and defined $to;
1461 foreach $seq ( $self->each_seq() ) {
1462 $temp = $seq->seq();
1463 $temp =~ s/$from/$to/g;
1473 Usage : $ali->uppercase()
1474 Function : Sets all the sequences to uppercase
1475 Returns : 1 on success
1485 foreach $seq ( $self->each_seq() ) {
1486 $temp = $seq->seq();
1487 $temp =~ tr/[a-z]/[A-Z]/;
1496 Title : cigar_line()
1497 Usage : %cigars = $align->cigar_line()
1498 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1499 Report) line for each sequence in the alignment. Examples are
1500 "1,60" or "5,10:12,58", where the numbers refer to conserved
1501 positions within the alignment. The keys of the hash are the
1502 NSEs (name/start/end) assigned to each sequence.
1503 Args : threshold (optional, defaults to 100)
1504 Returns : Hash of strings (cigar lines)
1513 my @consensus = split "",($self->consensus_string($thr));
1514 my $len = $self->length;
1515 my $gapchar = $self->gap_char;
1517 # create a precursor, something like (1,4,5,6,7,33,45),
1518 # where each number corresponds to a conserved position
1519 foreach my $seq ( $self->each_seq ) {
1520 my @seq = split "", uc ($seq->seq);
1522 for (my $x = 0 ; $x < $len ; $x++ ) {
1523 if ($seq[$x] eq $consensus[$x]) {
1524 push @
{$cigars{$seq->get_nse}},$pos;
1526 } elsif ($seq[$x] ne $gapchar) {
1531 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1532 for my $name (keys %cigars) {
1533 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1534 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1535 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1536 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1537 ${$cigars{$name}}[$#{$cigars{$name}}] );
1538 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1539 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1540 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1541 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1545 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1546 for my $name (keys %cigars) {
1548 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1549 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1550 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1554 for my $pos (@remove) {
1555 splice @
{$cigars{$name}}, $pos, 1;
1558 # join and punctuate
1559 for my $name (keys %cigars) {
1560 my ($start,$end,$str) = "";
1561 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1562 $str .= ($start . "," . $end . ":");
1565 $cigars{$name} = $str;
1573 Title : match_line()
1574 Usage : $line = $align->match_line()
1575 Function : Generates a match line - much like consensus string
1576 except that a line indicating the '*' for a match.
1577 Args : (optional) Match line characters ('*' by default)
1578 (optional) Strong match char (':' by default)
1579 (optional) Weak match char ('.' by default)
1585 my ($self,$matchlinechar, $strong, $weak) = @_;
1586 my %matchchars = ('match' => $matchlinechar || '*',
1587 'weak' => $weak || '.',
1588 'strong' => $strong || ':',
1594 foreach my $seq ( $self->each_seq ) {
1595 push @seqchars, [ split(//, uc ($seq->seq)) ];
1596 $alphabet = $seq->alphabet unless defined $alphabet;
1598 my $refseq = shift @seqchars;
1599 # let's just march down the columns
1602 foreach my $pos ( 0..$self->length ) {
1603 my $refchar = $refseq->[$pos];
1604 my $char = $matchchars{'mismatch'};
1605 unless( defined $refchar ) {
1606 last if $pos == $self->length; # short circuit on last residue
1607 # this in place to handle jason's soon-to-be-committed
1608 # intron mapping code
1611 my %col = ($refchar => 1);
1612 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1613 foreach my $seq ( @seqchars ) {
1614 next if $pos >= scalar @
$seq;
1615 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1616 $seq->[$pos] eq ' ' );
1617 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1619 my @colresidues = sort keys %col;
1621 # if all the values are the same
1622 if( $dash ) { $char = $matchchars{'mismatch'} }
1623 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1624 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1625 # matches for protein seqs
1627 foreach my $type ( qw(strong weak) ) {
1628 # iterate through categories
1630 # iterate through each of the aa in the col
1631 # look to see which groups it is in
1632 foreach my $c ( @colresidues ) {
1633 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1634 push @
{$groups{$f}},$c;
1638 foreach my $cols ( values %groups ) {
1639 @
$cols = sort @
$cols;
1640 # now we are just testing to see if two arrays
1641 # are identical w/o changing either one
1642 # have to be same len
1643 next if( scalar @
$cols != scalar @colresidues );
1644 # walk down the length and check each slot
1645 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1646 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1648 $char = $matchchars{$type};
1654 $matchline .= $char;
1663 Usage : $line = $align->gap_line()
1664 Function : Generates a gap line - much like consensus string
1665 except that a line where '-' represents gap
1666 Args : (optional) gap line characters ('-' by default)
1672 my ($self,$gapchar) = @_;
1673 $gapchar = $gapchar || $self->gap_char;
1674 my %gap_hsh; # column gaps vector
1675 foreach my $seq ( $self->each_seq ) {
1677 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1678 map {[$i++, $_]} split(//, uc ($seq->seq));
1681 foreach my $pos ( 0..$self->length-1 ) {
1682 $gap_line .= (exists $gap_hsh{$pos}) ?
$self->gap_char:'.';
1689 Title : all_gap_line()
1690 Usage : $line = $align->all_gap_line()
1691 Function : Generates a gap line - much like consensus string
1692 except that a line where '-' represents all-gap column
1693 Args : (optional) gap line characters ('-' by default)
1699 my ($self,$gapchar) = @_;
1700 $gapchar = $gapchar || $self->gap_char;
1701 my %gap_hsh; # column gaps counter hash
1702 my @seqs = $self->each_seq;
1703 foreach my $seq ( @seqs ) {
1705 map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1706 map {[$i++, $_]} split(//, uc ($seq->seq));
1709 foreach my $pos ( 0..$self->length-1 ) {
1710 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1712 $gap_line .= $self->gap_char;
1720 =head2 gap_col_matrix
1722 Title : gap_col_matrix()
1723 Usage : my $cols = $align->gap_col_matrix()
1724 Function : Generates an array where each element in the array is a
1725 hash reference with a key of the sequence name and a
1726 value of 1 if the sequence has a gap at that column
1727 Returns : Reference to an array
1728 Args : Optional: gap line character ($aln->gap_char or '-' by default)
1732 sub gap_col_matrix
{
1733 my ( $self, $gapchar ) = @_;
1734 $gapchar = $gapchar || $self->gap_char;
1735 my %gap_hsh; # column gaps vector
1737 foreach my $seq ( $self->each_seq ) {
1739 my $str = $seq->seq;
1740 my $len = $seq->length;
1742 my $id = $seq->display_id;
1743 while ( $i < $len ) {
1744 $ch = substr( $str, $i, 1 );
1745 $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1754 Usage : $ali->match()
1755 Function : Goes through all columns and changes residues that are
1756 identical to residue in first sequence to match '.'
1757 character. Sets match_char.
1759 USE WITH CARE: Most MSA formats do not support match
1760 characters in sequences, so this is mostly for output
1761 only. NEXUS format (Bio::AlignIO::nexus) can handle
1763 Returns : 1 on success
1764 Argument : a match character, optional, defaults to '.'
1769 my ( $self, $match ) = @_;
1772 my ($matching_char) = $match;
1773 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/; #';
1774 $self->map_chars( $matching_char, '-' );
1776 my @seqs = $self->each_seq();
1777 return 1 unless scalar @seqs > 1;
1779 my $refseq = shift @seqs;
1780 my @refseq = split //, $refseq->seq;
1781 my $gapchar = $self->gap_char;
1783 foreach my $seq (@seqs) {
1784 my @varseq = split //, $seq->seq();
1785 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1786 $varseq[$i] = $match
1787 if defined $refseq[$i]
1788 && ( $refseq[$i] =~ /[A-Za-z\*]/
1789 || $refseq[$i] =~ /$gapchar/ )
1790 && $refseq[$i] eq $varseq[$i];
1792 $seq->seq( join '', @varseq );
1794 $self->match_char($match);
1803 Usage : $ali->unmatch()
1804 Function : Undoes the effect of method match. Unsets match_char.
1805 Returns : 1 on success
1806 Argument : a match character, optional, defaults to '.'
1808 See L<match> and L<match_char>
1813 my ( $self, $match ) = @_;
1817 my @seqs = $self->each_seq();
1818 return 1 unless scalar @seqs > 1;
1820 my $refseq = shift @seqs;
1821 my @refseq = split //, $refseq->seq;
1822 my $gapchar = $self->gap_char;
1823 foreach my $seq (@seqs) {
1824 my @varseq = split //, $seq->seq();
1825 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1826 $varseq[$i] = $refseq[$i]
1827 if defined $refseq[$i]
1828 && ( $refseq[$i] =~ /[A-Za-z\*]/
1829 || $refseq[$i] =~ /$gapchar/ )
1830 && $varseq[$i] eq $match;
1832 $seq->seq( join '', @varseq );
1834 $self->match_char('');
1839 =head1 MSA attributes
1841 Methods for setting and reading the MSA attributes.
1843 Note that the methods defining character semantics depend on the user
1844 to set them sensibly. They are needed only by certain input/output
1845 methods. Unset them by setting to an empty string ('').
1850 Usage : $myalign->id("Ig")
1851 Function : Gets/sets the id field of the alignment
1852 Returns : An id string
1853 Argument : An id string (optional)
1858 my ( $self, $name ) = @_;
1860 if ( defined($name) ) {
1861 $self->{'_id'} = $name;
1864 return $self->{'_id'};
1870 Usage : $myalign->accession("PF00244")
1871 Function : Gets/sets the accession field of the alignment
1872 Returns : An acc string
1873 Argument : An acc string (optional)
1878 my ( $self, $acc ) = @_;
1880 if ( defined($acc) ) {
1881 $self->{'_accession'} = $acc;
1884 return $self->{'_accession'};
1890 Usage : $myalign->description("14-3-3 proteins")
1891 Function : Gets/sets the description field of the alignment
1892 Returns : An description string
1893 Argument : An description string (optional)
1898 my ( $self, $name ) = @_;
1900 if ( defined($name) ) {
1901 $self->{'_description'} = $name;
1904 return $self->{'_description'};
1909 Title : missing_char
1910 Usage : $myalign->missing_char("?")
1911 Function : Gets/sets the missing_char attribute of the alignment
1912 It is generally recommended to set it to 'n' or 'N'
1913 for nucleotides and to 'X' for protein.
1914 Returns : An missing_char string,
1915 Argument : An missing_char string (optional)
1920 my ( $self, $char ) = @_;
1922 if ( defined $char ) {
1923 $self->throw("Single missing character, not [$char]!")
1924 if CORE
::length($char) > 1;
1925 $self->{'_missing_char'} = $char;
1928 return $self->{'_missing_char'};
1935 Usage : $myalign->match_char('.')
1936 Function : Gets/sets the match_char attribute of the alignment
1937 Returns : An match_char string,
1938 Argument : An match_char string (optional)
1943 my ( $self, $char ) = @_;
1945 if ( defined $char ) {
1946 $self->throw("Single match character, not [$char]!")
1947 if CORE
::length($char) > 1;
1948 $self->{'_match_char'} = $char;
1951 return $self->{'_match_char'};
1957 Usage : $myalign->gap_char('-')
1958 Function : Gets/sets the gap_char attribute of the alignment
1959 Returns : An gap_char string, defaults to '-'
1960 Argument : An gap_char string (optional)
1965 my ( $self, $char ) = @_;
1967 if ( defined $char || !defined $self->{'_gap_char'} ) {
1968 $char = '-' unless defined $char;
1969 $self->throw("Single gap character, not [$char]!")
1970 if CORE
::length($char) > 1;
1971 $self->{'_gap_char'} = $char;
1973 return $self->{'_gap_char'};
1979 Title : symbol_chars
1980 Usage : my @symbolchars = $aln->symbol_chars;
1981 Function: Returns all the seen symbols (other than gaps)
1982 Returns : array of characters that are the seen symbols
1983 Args : boolean to include the gap/missing/match characters
1988 my ($self,$includeextra) = @_;
1990 unless ($self->{'_symbols'}) {
1991 foreach my $seq ($self->each_seq) {
1992 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1995 my %copy = %{$self->{'_symbols'}};
1996 if( ! $includeextra ) {
1997 foreach my $char ( $self->gap_char, $self->match_char,
1998 $self->missing_char) {
1999 delete $copy{$char} if( defined $char );
2005 =head1 Alignment descriptors
2007 These read only methods describe the MSA in various ways.
2013 Usage : $str = $ali->score()
2014 Function : get/set a score of the alignment
2015 Returns : a score for the alignment
2016 Argument : an optional score to set
2022 $self->{score
} = shift if @_;
2023 return $self->{score
};
2026 =head2 consensus_string
2028 Title : consensus_string
2029 Usage : $str = $ali->consensus_string($threshold_percent)
2030 Function : Makes a strict consensus
2031 Returns : Consensus string
2032 Argument : Optional threshold ranging from 0 to 100.
2033 The consensus residue has to appear at least threshold %
2034 of the sequences at a given location, otherwise a '?'
2035 character will be placed at that location.
2036 (Default value = 0%)
2040 sub consensus_string
{
2042 my $threshold = shift;
2045 my $len = $self->length - 1;
2047 foreach ( 0 .. $len ) {
2048 $out .= $self->_consensus_aa( $_, $threshold );
2054 =head2 consensus_conservation
2056 Title : consensus_conservation
2057 Usage : @conservation = $ali->consensus_conservation();
2058 Function : Conservation (as a percent) of each position of alignment
2059 Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2064 sub consensus_conservation
{
2067 my $num_sequences = $self->num_sequences;
2068 foreach my $point (0..$self->length-1) {
2069 my %hash = $self->_consensus_counts($point);
2070 # max frequency of a non-gap letter
2071 my $max = (sort {$b<=>$a} values %hash )[0];
2072 push @cons, 100 * $max / $num_sequences;
2080 my $threshold_percent = shift || -1 ;
2081 my ($seq,%hash,$count,$letter,$key);
2082 my $gapchar = $self->gap_char;
2083 %hash = $self->_consensus_counts($point);
2084 my $number_of_sequences = $self->num_sequences();
2085 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2089 foreach $key ( sort keys %hash ) {
2090 # print "Now at $key $hash{$key}\n";
2091 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2093 $count = $hash{$key};
2099 # Frequency of each letter in one column
2100 sub _consensus_counts
{
2104 my $gapchar = $self->gap_char;
2105 foreach my $seq ( $self->each_seq() ) {
2106 my $letter = substr($seq->seq,$point,1);
2107 $self->throw("--$point-----------") if $letter eq '';
2108 ($letter eq $gapchar || $letter =~ /\./) && next;
2115 =head2 consensus_iupac
2117 Title : consensus_iupac
2118 Usage : $str = $ali->consensus_iupac()
2119 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2120 and RNA. The output is in upper case except when gaps in
2121 a column force output to be in lower case.
2123 Note that if your alignment sequences contain a lot of
2124 IUPAC ambiquity codes you often have to manually set
2125 alphabet. Bio::PrimarySeq::_guess_type thinks they
2126 indicate a protein sequence.
2127 Returns : consensus string
2129 Throws : on protein sequences
2133 sub consensus_iupac
{
2136 my $len = $self->length - 1;
2138 # only DNA and RNA sequences are valid
2139 foreach my $seq ( $self->each_seq() ) {
2140 $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2141 if $seq->alphabet eq 'protein';
2144 # loop over the alignment columns
2145 foreach my $count ( 0 .. $len ) {
2146 $out .= $self->_consensus_iupac($count);
2151 sub _consensus_iupac
{
2152 my ($self, $column) = @_;
2153 my ($string, $char, $rna);
2155 #determine all residues in a column
2156 foreach my $seq ( $self->each_seq() ) {
2157 $string .= substr($seq->seq, $column, 1);
2159 $string = uc $string;
2161 # quick exit if there's an N in the string
2162 if ($string =~ /N/) {
2163 $string =~ /\W/ ?
return 'n' : return 'N';
2165 # ... or if there are only gap characters
2166 return '-' if $string =~ /^\W+$/;
2168 # treat RNA as DNA in regexps
2169 if ($string =~ /U/) {
2174 # the following s///'s only need to be done to the _first_ ambiguity code
2175 # as we only need to see the _range_ of characters in $string
2177 if ($string =~ /[VDHB]/) {
2178 $string =~ s/V/AGC/;
2179 $string =~ s/D/AGT/;
2180 $string =~ s/H/ACT/;
2181 $string =~ s/B/CTG/;
2184 if ($string =~ /[SKYRWM]/) {
2193 # and now the guts of the thing
2195 if ($string =~ /A/) {
2197 if ($string =~ /G/) {
2198 $char = 'R'; # A and G (purines) R
2199 if ($string =~ /C/) {
2200 $char = 'V'; # A and G and C V
2201 if ($string =~ /T/) {
2202 $char = 'N'; # A and G and C and T N
2204 } elsif ($string =~ /T/) {
2205 $char = 'D'; # A and G and T D
2207 } elsif ($string =~ /C/) {
2208 $char = 'M'; # A and C M
2209 if ($string =~ /T/) {
2210 $char = 'H'; # A and C and T H
2212 } elsif ($string =~ /T/) {
2213 $char = 'W'; # A and T W
2215 } elsif ($string =~ /C/) {
2217 if ($string =~ /T/) {
2218 $char = 'Y'; # C and T (pyrimidines) Y
2219 if ($string =~ /G/) {
2220 $char = 'B'; # C and T and G B
2222 } elsif ($string =~ /G/) {
2223 $char = 'S'; # C and G S
2225 } elsif ($string =~ /G/) {
2227 if ($string =~ /C/) {
2228 $char = 'S'; # G and C S
2229 } elsif ($string =~ /T/) {
2230 $char = 'K'; # G and T K
2232 } elsif ($string =~ /T/) {
2236 $char = 'U' if $rna and $char eq 'T';
2237 $char = lc $char if $string =~ /\W/;
2243 =head2 consensus_meta
2245 Title : consensus_meta
2246 Usage : $seqmeta = $ali->consensus_meta()
2247 Function : Returns a Bio::Seq::Meta object containing the consensus
2248 strings derived from meta data analysis.
2249 Returns : Bio::Seq::Meta
2250 Argument : Bio::Seq::Meta
2251 Throws : non-MetaI object
2255 sub consensus_meta
{
2256 my ($self, $meta) = @_;
2257 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2258 $self->throw('Not a Bio::Seq::MetaI object');
2260 return $self->{'_aln_meta'} = $meta if $meta;
2261 return $self->{'_aln_meta'}
2267 Usage : if ( $ali->is_flush() )
2268 Function : Tells you whether the alignment
2269 : is flush, i.e. all of the same length
2276 my ( $self, $report ) = @_;
2281 foreach $seq ( $self->each_seq() ) {
2282 if ( $length == (-1) ) {
2283 $length = CORE
::length( $seq->seq() );
2287 $temp = CORE
::length( $seq->seq() );
2288 if ( $temp != $length ) {
2290 "expecting $length not $temp from " . $seq->display_id )
2293 "expecting $length not $temp from " . $seq->display_id );
2294 $self->debug( $seq->seq() . "\n" );
2306 Usage : $len = $ali->length()
2307 Function : Returns the maximum length of the alignment.
2308 To be sure the alignment is a block, use is_flush
2316 Carp
::carp
("length_aln - deprecated method. Use length() instead.");
2326 foreach $seq ( $self->each_seq() ) {
2327 $temp = $seq->length();
2328 if( $temp > $length ) {
2337 =head2 maxdisplayname_length
2339 Title : maxdisplayname_length
2340 Usage : $ali->maxdisplayname_length()
2341 Function : Gets the maximum length of the displayname in the
2342 alignment. Used in writing out various MSA formats.
2348 sub maxname_length
{
2350 Carp
::carp
("maxname_length - deprecated method."
2351 . " Use maxdisplayname_length() instead.");
2352 $self->maxdisplayname_length();
2357 Carp
::carp
("maxnse_length - deprecated method."
2358 . " Use maxnse_length() instead.");
2359 $self->maxdisplayname_length();
2362 sub maxdisplayname_length
{
2367 foreach $seq ( $self->each_seq() ) {
2368 $len = CORE
::length $self->displayname( $seq->get_nse() );
2370 if ( $len > $maxname ) {
2378 =head2 max_metaname_length
2380 Title : max_metaname_length
2381 Usage : $ali->max_metaname_length()
2382 Function : Gets the maximum length of the meta name tags in the
2383 alignment for the sequences and for the alignment.
2384 Used in writing out various MSA formats.
2390 sub max_metaname_length
{
2395 # check seq meta first
2396 for $seq ( $self->each_seq() ) {
2397 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2398 for my $mtag ($seq->meta_names) {
2399 $len = CORE
::length $mtag;
2400 if( $len > $maxname ) {
2407 for my $meta ($self->consensus_meta) {
2409 for my $name ($meta->meta_names) {
2410 $len = CORE
::length $name;
2411 if( $len > $maxname ) {
2422 Title : num_residues
2423 Usage : $no = $ali->num_residues
2424 Function : number of residues in total in the alignment
2427 Note : replaces no_residues()
2435 foreach my $seq ( $self->each_seq ) {
2436 my $str = $seq->seq();
2438 $count += ( $str =~ s/[A-Za-z]//g );
2444 =head2 num_sequences
2446 Title : num_sequences
2447 Usage : $depth = $ali->num_sequences
2448 Function : number of sequence in the sequence alignment
2451 Note : replaces no_sequences()
2457 return scalar($self->each_seq);
2460 =head2 average_percentage_identity
2462 Title : average_percentage_identity
2463 Usage : $id = $align->average_percentage_identity
2464 Function: The function uses a fast method to calculate the average
2465 percentage identity of the alignment
2466 Returns : The average percentage identity of the alignment
2468 Notes : This method implemented by Kevin Howe calculates a figure that is
2469 designed to be similar to the average pairwise identity of the
2470 alignment (identical in the absence of gaps), without having to
2471 explicitly calculate pairwise identities proposed by Richard Durbin.
2472 Validated by Ewan Birney ad Alex Bateman.
2476 sub average_percentage_identity
{
2477 my ($self,@args) = @_;
2479 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2480 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2482 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2484 if (! $self->is_flush()) {
2485 $self->throw("All sequences in the alignment must be the same length");
2488 @seqs = $self->each_seq();
2489 $len = $self->length();
2491 # load the each hash with correct keys for existence checks
2493 for( my $index=0; $index < $len; $index++) {
2494 foreach my $letter (@alphabet) {
2495 $countHashes[$index]->{$letter} = 0;
2498 foreach my $seq (@seqs) {
2499 my @seqChars = split //, $seq->seq();
2500 for( my $column=0; $column < @seqChars; $column++ ) {
2501 my $char = uc($seqChars[$column]);
2502 if (exists $countHashes[$column]->{$char}) {
2503 $countHashes[$column]->{$char}++;
2510 for(my $column =0; $column < $len; $column++) {
2511 my %hash = %{$countHashes[$column]};
2513 foreach my $res (keys %hash) {
2514 $total += $hash{$res}*($hash{$res} - 1);
2515 $subdivisor += $hash{$res};
2517 $divisor += $subdivisor * ($subdivisor - 1);
2519 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2522 =head2 percentage_identity
2524 Title : percentage_identity
2525 Usage : $id = $align->percentage_identity
2526 Function: The function calculates the average percentage identity
2527 (aliased to average_percentage_identity)
2528 Returns : The average percentage identity
2533 sub percentage_identity
{
2535 return $self->average_percentage_identity();
2538 =head2 overall_percentage_identity
2540 Title : overall_percentage_identity
2541 Usage : $id = $align->overall_percentage_identity
2542 $id = $align->overall_percentage_identity('short')
2543 Function: The function calculates the percentage identity of
2544 the conserved columns
2545 Returns : The percentage identity of the conserved columns
2546 Args : length value to use, optional defaults to alignment length
2547 possible values: 'align', 'short', 'long'
2549 The argument values 'short' and 'long' refer to shortest and longest
2550 sequence in the alignment. Method modification code by Hongyu Zhang.
2554 sub overall_percentage_identity
{
2555 my ($self, $length_measure) = @_;
2557 my %alphabet = map {$_ => undef} qw
(A C G T U B D E F H I J K L M N O P Q R S V W X Y Z
);
2559 my %enum = map {$_ => undef} qw
(align short long
);
2561 $self->throw("Unknown argument [$length_measure]")
2562 if $length_measure and not exists $enum{$length_measure};
2563 $length_measure ||= 'align';
2565 if (! $self->is_flush()) {
2566 $self->throw("All sequences in the alignment must be the same length");
2569 # Count the residues seen at each position
2571 my $total = 0; # number of positions with identical residues
2573 my @seqs = $self->each_seq;
2574 my $nof_seqs = scalar @seqs;
2575 my $aln_len = $self->length();
2576 for my $seq (@seqs) {
2577 my $seqstr = $seq->seq;
2579 # Count residues for given sequence
2580 for my $column (0 .. $aln_len-1) {
2581 my $char = uc( substr($seqstr, $column, 1) );
2582 if ( exists $alphabet{$char} ) {
2584 # This is a valid char
2585 if ( defined $countHashes[$column]->{$char} ) {
2586 $countHashes[$column]->{$char}++;
2588 $countHashes[$column]->{$char} = 1;
2591 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2592 # All sequences have this same residue
2600 if ($length_measure eq 'short' || $length_measure eq 'long') {
2601 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2602 if ($length_measure eq 'short') {
2603 if ( (not defined $len) || ($seq_len < $len) ) {
2606 } elsif ($length_measure eq 'long') {
2607 if ( (not defined $len) || ($seq_len > $len) ) {
2615 if ($length_measure eq 'align') {
2619 return ($total / $len ) * 100.0;
2624 =head1 Alignment positions
2626 Methods to map a sequence position into an alignment column and back.
2627 column_from_residue_number() does the former. The latter is really a
2628 property of the sequence object and can done using
2629 L<Bio::LocatableSeq::location_from_column>:
2631 # select somehow a sequence from the alignment, e.g.
2632 my $seq = $aln->get_seq_by_pos(1);
2633 #$loc is undef or Bio::LocationI object
2634 my $loc = $seq->location_from_column(5);
2636 =head2 column_from_residue_number
2638 Title : column_from_residue_number
2639 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2640 Function: This function gives the position in the alignment
2641 (i.e. column number) of the given residue number in the
2642 sequence with the given name. For example, for the
2645 Seq1/91-97 AC..DEF.GH.
2646 Seq2/24-30 ACGG.RTY...
2647 Seq3/43-51 AC.DDEF.GHI
2649 column_from_residue_number( "Seq1", 94 ) returns 6.
2650 column_from_residue_number( "Seq2", 25 ) returns 2.
2651 column_from_residue_number( "Seq3", 50 ) returns 10.
2653 An exception is thrown if the residue number would lie
2654 outside the length of the aligment
2655 (e.g. column_from_residue_number( "Seq2", 22 )
2657 Note: If the the parent sequence is represented by more than
2658 one alignment sequence and the residue number is present in
2659 them, this method finds only the first one.
2661 Returns : A column number for the position in the alignment of the
2662 given residue in the given sequence (1 = first column)
2663 Args : A sequence id/name (not a name/start-end)
2664 A residue number in the whole sequence (not just that
2665 segment of it in the alignment)
2669 sub column_from_residue_number
{
2670 my ( $self, $name, $resnumber ) = @_;
2672 $self->throw("No sequence with name [$name]")
2673 unless $self->{'_start_end_lists'}->{$name};
2674 $self->throw("Second argument residue number missing") unless $resnumber;
2676 foreach my $seq ( $self->each_seq_with_id($name) ) {
2678 eval { $col = $seq->column_from_residue_number($resnumber); };
2683 $self->throw( "Could not find a sequence segment in $name "
2684 . "containing residue number $resnumber" );
2688 =head1 Sequence names
2690 Methods to manipulate the display name. The default name based on the
2691 sequence id and subsequence positions can be overridden in various
2697 Usage : $myalign->displayname("Ig", "IgA")
2698 Function : Gets/sets the display name of a sequence in the alignment
2699 Returns : A display name string
2700 Argument : name of the sequence
2701 displayname of the sequence (optional)
2706 my ( $self, $name, $disname ) = @_;
2708 $self->throw("No sequence with name [$name]")
2709 unless defined $self->{'_seq'}->{$name};
2711 if ( $disname and $name ) {
2712 $self->{'_dis_name'}->{$name} = $disname;
2715 elsif ( defined $self->{'_dis_name'}->{$name} ) {
2716 return $self->{'_dis_name'}->{$name};
2723 sub get_displayname
{
2725 Carp
::carp
("get_displayname - deprecated method. Use displayname() instead.");
2726 $self->displayname(@_);
2729 sub set_displayname
{
2731 Carp
::carp
("set_displayname - deprecated method. Use displayname() instead.");
2732 $self->displayname(@_);
2736 =head2 set_displayname_count
2738 Title : set_displayname_count
2739 Usage : $ali->set_displayname_count
2740 Function : Sets the names to be name_# where # is the number of
2741 times this name has been used.
2742 Returns : 1, on success
2747 sub set_displayname_count
{
2749 my (@arr,$name,$seq,$count,$temp,$nse);
2751 foreach $seq ( $self->each_alphabetically() ) {
2752 $nse = $seq->get_nse();
2754 #name will be set when this is the second
2755 #time (or greater) is has been seen
2757 if( defined $name and $name eq ($seq->id()) ) {
2758 $temp = sprintf("%s_%s",$name,$count);
2759 $self->displayname($nse,$temp);
2764 $temp = sprintf("%s_%s",$name,$count);
2765 $self->displayname($nse,$temp);
2772 =head2 set_displayname_flat
2774 Title : set_displayname_flat
2775 Usage : $ali->set_displayname_flat()
2776 Function : Makes all the sequences be displayed as just their name,
2777 not name/start-end (NSE)
2783 sub set_displayname_flat
{
2787 foreach $seq ( $self->each_seq() ) {
2788 $nse = $seq->get_nse();
2789 $self->displayname( $nse, $seq->id() );
2795 =head2 set_displayname_normal
2797 Title : set_displayname_normal
2798 Usage : $ali->set_displayname_normal()
2799 Function : Makes all the sequences be displayed as name/start-end (NSE)
2800 Returns : 1, on success
2805 sub set_displayname_normal
{
2809 foreach $seq ( $self->each_seq() ) {
2810 $nse = $seq->get_nse();
2811 $self->displayname( $nse, $nse );
2819 Usage : $obj->source($newval)
2820 Function: sets the Alignment source program
2822 Returns : value of source
2823 Args : newvalue (optional)
2828 my ( $self, $value ) = @_;
2829 if ( defined $value ) {
2830 $self->{'_source'} = $value;
2832 return $self->{'_source'};
2836 =head2 set_displayname_safe
2838 Title : set_displayname_safe
2839 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2840 Function : Assign machine-generated serial names to sequences in input order.
2841 Designed to protect names during PHYLIP runs. Assign 10-char string
2842 in the form of "S000000001" to "S999999999". Restore the original
2843 names using "restore_displayname".
2844 Returns : 1. a new $aln with system names;
2845 2. a hash ref for restoring names
2846 Argument : Number for id length (default 10)
2850 sub set_displayname_safe
{
2852 my $idlength = shift || 10;
2853 my ( $seq, %phylip_name );
2855 my $new = Bio
::SimpleAlign
->new();
2856 foreach $seq ( $self->each_seq() ) {
2858 my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2859 $phylip_name{$pname} = $seq->id();
2860 my $new_seq = Bio
::LocatableSeq
->new(
2862 -seq
=> $seq->seq(),
2863 -alphabet
=> $seq->alphabet,
2864 -start
=> $seq->{_start
},
2865 -end
=> $seq->{_end
}
2867 $new->add_seq($new_seq);
2871 "$ct seq names changed. Restore names by using restore_displayname.");
2872 return ( $new, \
%phylip_name );
2876 =head2 restore_displayname
2878 Title : restore_displayname
2879 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2880 Function : Restore original sequence names (after running
2881 $ali->set_displayname_safe)
2882 Returns : a new $aln with names restored.
2883 Argument : a hash reference of names from "set_displayname_safe".
2887 sub restore_displayname
{
2891 my $new=Bio
::SimpleAlign
->new();
2892 foreach my $seq ( $self->each_seq() ) {
2893 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2894 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2895 -seq
=> $seq->seq(),
2896 -alphabet
=> $seq->alphabet,
2897 -start
=> $seq->{_start
},
2898 -end
=> $seq->{_end
}
2900 $new->add_seq($new_seq);
2905 =head2 sort_by_start
2907 Title : sort_by_start
2908 Usage : $ali->sort_by_start
2909 Function : Changes the order of the alignment to the start position of each
2911 Returns : 1 on success
2918 my ($seq,$nse,@arr,%hash,$count);
2919 foreach $seq ( $self->each_seq() ) {
2920 $nse = $seq->get_nse;
2924 %{$self->{'_order'}} = (); # reset the hash;
2925 foreach $nse ( sort _startend
keys %hash) {
2926 $self->{'_order'}->{$count} = $nse;
2933 my ($aname,$arange) = split (/[\/]/,$a);
2934 my ($bname,$brange) = split (/[\/]/,$b);
2935 my ($astart,$aend) = split(/\-/,$arange);
2936 my ($bstart,$bend) = split(/\-/,$brange);
2937 return $astart <=> $bstart;
2940 =head2 bracket_string
2942 Title : bracket_string
2943 Usage : my @params = (-refseq => 'testseq',
2944 -allele1 => 'allele1',
2945 -allele2 => 'allele2',
2946 -delimiters => '{}',
2948 $str = $aln->bracket_string(@params)
2950 Function : When supplied with a list of parameters (see below), returns a
2951 string in BIC format. This is used for allelic comparisons.
2952 Briefly, if either allele contains a base change when compared to
2953 the refseq, the base or gap for each allele is represented in
2954 brackets in the order present in the 'alleles' parameter.
2956 For the following data:
2965 the returned string with parameters 'refseq => testseq' and
2966 'alleles => [qw(allele1 allele2)]' would be:
2968 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2969 Returns : BIC-formatted string
2970 Argument : Required args
2971 refseq : string (ID) of the reference sequence used
2972 as basis for comparison
2973 allele1 : string (ID) of the first allele
2974 allele2 : string (ID) of the second allele
2976 delimiters: two symbol string of left and right delimiters.
2977 Only the first two symbols are used
2979 separator : string used as a separator. Only the first
2982 Throws : On no refseq/alleles, or invalid refseq/alleles.
2986 sub bracket_string
{
2987 my ($self, @args) = @_;
2988 my ($ref, $a1, $a2, $delim, $sep) =
2989 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2990 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2992 ($ld, $rd) = split('', $delim, 2) if $delim;
2996 my ($refseq, $allele1, $allele2) =
2997 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2998 if (!$refseq || !$allele1 || !$allele2) {
2999 $self->throw("One of your refseq/allele IDs is invalid!");
3001 my $len = $self->length-1;
3003 # loop over the alignment columns
3004 for my $column ( 0 .. $len ) {
3006 my ($compres, $res1, $res2) =
3007 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3008 # are any of the allele symbols different from the refseq?
3009 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
3010 $ld.$res1.$sep.$res2.$rd;
3017 =head2 methods implementing Bio::FeatureHolderI
3019 FeatureHolderI implementation to support labeled character sets like one
3020 would get from NEXUS represented data.
3022 =head2 get_SeqFeatures
3024 Usage : @features = $aln->get_SeqFeatures
3025 Function: Get the feature objects held by this feature holder.
3027 Returns : an array of Bio::SeqFeatureI implementing objects
3028 Args : optional filter coderef, taking a Bio::SeqFeatureI
3029 : as argument, returning TRUE if wanted, FALSE if
3034 sub get_SeqFeatures
{
3036 my $filter_cb = shift;
3037 $self->throw("Arg (filter callback) must be a coderef")
3038 unless !defined($filter_cb)
3039 or ref($filter_cb) eq 'CODE';
3040 if ( !defined $self->{'_as_feat'} ) {
3041 $self->{'_as_feat'} = [];
3044 return grep { $filter_cb->($_) } @
{ $self->{'_as_feat'} };
3046 return @
{ $self->{'_as_feat'} };
3050 =head2 add_SeqFeature
3052 Usage : $aln->add_SeqFeature($subfeat);
3053 Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3054 (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3056 Returns : 1 on success
3057 Args : a Bio::SeqFeatureI object
3061 sub add_SeqFeature
{
3062 my ($self, @feat) = @_;
3064 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3066 if (scalar @feat > 1) {
3067 Carp
::carp
('Providing an array of features to Bio::SimpleAlign'
3068 . 'add_SeqFeature() is deprecated and will be removed in a'
3069 . ' future version. Add a single feature at a time instead.');
3072 for my $feat ( @feat ) {
3074 next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3076 if( !$feat->isa("Bio::SeqFeatureI") ) {
3077 $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3080 push @
{$self->{'_as_feat'}}, $feat;
3086 =head2 remove_SeqFeatures
3088 Usage : $obj->remove_SeqFeatures
3089 Function: Removes all SeqFeatures. If you want to remove only a subset,
3090 remove that subset from the returned array, and add back the rest.
3091 Returns : The array of Bio::SeqFeatureI features that was
3092 deleted from this alignment.
3097 sub remove_SeqFeatures
{
3100 return () unless $self->{'_as_feat'};
3101 my @feats = @
{$self->{'_as_feat'}};
3102 $self->{'_as_feat'} = [];
3106 =head2 feature_count
3108 Title : feature_count
3109 Usage : $obj->feature_count()
3110 Function: Return the number of SeqFeatures attached to the alignment
3111 Returns : integer representing the number of SeqFeatures
3119 if (defined($self->{'_as_feat'})) {
3120 return ($#{$self->{'_as_feat'}} + 1);
3126 =head2 get_all_SeqFeatures
3128 Title : get_all_SeqFeatures
3130 Function: Get all SeqFeatures.
3132 Returns : an array of Bio::SeqFeatureI implementing objects
3134 Note : Falls through to Bio::FeatureHolderI implementation.
3138 =head2 methods for Bio::AnnotatableI
3140 AnnotatableI implementation to support sequence alignments which
3141 contain annotation (NEXUS, Stockholm).
3146 Usage : $ann = $aln->annotation or
3147 $aln->annotation($ann)
3148 Function: Gets or sets the annotation
3149 Returns : Bio::AnnotationCollectionI object
3150 Args : None or Bio::AnnotationCollectionI object
3152 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3153 for more information
3158 my ($obj,$value) = @_;
3159 if( defined $value ) {
3160 $obj->throw("object of class ".ref($value)." does not implement ".
3161 "Bio::AnnotationCollectionI. Too bad.")
3162 unless $value->isa("Bio::AnnotationCollectionI");
3163 $obj->{'_annotation'} = $value;
3164 } elsif( ! defined $obj->{'_annotation'}) {
3165 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3167 return $obj->{'_annotation'};
3170 =head1 Deprecated methods
3174 Title : mask_columns
3175 Usage : $aln2 = $aln->mask_columns(20,30)
3176 Function : Masks a slice of the alignment inclusive of start and
3177 end columns, and the first column in the alignment is denoted 1.
3178 Mask beyond the length of the sequence does not do padding.
3179 Returns : A Bio::SimpleAlign object
3180 Args : Positive integer for start column, positive integer for end column,
3181 optional string value use for the mask. Example:
3183 $aln2 = $aln->mask_columns(20,30,'?')
3184 Note : Masking must use a character that is not used for gaps or
3185 frameshifts. These can be adjusted using the relevant global
3186 variables, but be aware these may be (uncontrollably) modified
3187 elsewhere within BioPerl (see bug 2715)
3192 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3195 my $nonres = $Bio::LocatableSeq
::GAP_SYMBOLS
.
3196 $Bio::LocatableSeq
::FRAMESHIFT_SYMBOLS
;
3198 # coordinates are alignment-based, not sequence-based
3199 my ($start, $end, $mask_char) = @_;
3200 unless (defined $mask_char) { $mask_char = 'N' }
3202 $self->throw("Mask start has to be a positive integer and less than ".
3203 "alignment length, not [$start]")
3204 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3205 $self->throw("Mask end has to be a positive integer and less than ".
3206 "alignment length, not [$end]")
3207 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3208 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3209 "end [$end]") unless $start <= $end;
3210 $self->throw("Mask character $mask_char has to be a single character ".
3211 "and not a gap or frameshift symbol")
3212 unless CORE
::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3214 my $aln = $self->new;
3215 $aln->id($self->id);
3216 foreach my $seq ( $self->each_seq() ) {
3217 my $new_seq = Bio
::LocatableSeq
->new(-id
=> $seq->id,
3218 -alphabet
=> $seq->alphabet,
3219 -strand
=> $seq->strand,
3220 -verbose
=> $self->verbose);
3222 # convert from 1-based alignment coords!
3223 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3224 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3225 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3226 $new_seq->seq($new_dna_string);
3227 $aln->add_seq($new_seq);
3229 # Preserve chosen mask character, it may be need later (like in 'slice')
3230 $aln->{_mask_char
} = $mask_char;