Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SimpleAlign.pm
blobe3e2ad9b5b3b0962f9d885febc87f4b95613f413
1 package Bio::SimpleAlign;
3 use strict;
4 use warnings;
6 use Carp;
8 use Bio::LocatableSeq; # uses Seq's as list
9 use Bio::Seq;
10 use Bio::SeqFeature::Generic;
12 use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
14 # BioPerl module for SimpleAlign
16 # Please direct questions and support issues to <bioperl-l@bioperl.org>
18 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
20 # Copyright Ewan Birney
22 # You may distribute this module under the same terms as perl itself
24 # POD documentation - main docs before the code
26 # History:
27 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
28 # May 2001 major rewrite - Heikki Lehvaslaiho
30 =head1 NAME
32 Bio::SimpleAlign - Multiple alignments held as a set of sequences
34 =head1 SYNOPSIS
36 # Use Bio::AlignIO to read in the alignment
37 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
38 $aln = $str->next_aln();
40 # Describe
41 print $aln->length;
42 print $aln->num_residues;
43 print $aln->is_flush;
44 print $aln->num_sequences;
45 print $aln->score;
46 print $aln->percentage_identity;
47 print $aln->consensus_string(50);
49 # Find the position in the alignment for a sequence location
50 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
52 # Extract sequences and check values for the alignment column $pos
53 foreach $seq ($aln->each_seq) {
54 $res = $seq->subseq($pos, $pos);
55 $count{$res}++;
57 foreach $res (keys %count) {
58 printf "Res: %s Count: %2d\n", $res, $count{$res};
61 # Manipulate
62 $aln->remove_seq($seq);
63 $mini_aln = $aln->slice(20,30); # get a block of columns
64 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
65 $new_aln = $aln->remove_columns([20,30]); # remove by position
66 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
68 # Analyze
69 $str = $aln->consensus_string($threshold_percent);
70 $str = $aln->match_line();
71 $str = $aln->cigar_line();
72 $id = $aln->percentage_identity;
74 # See the module documentation for details and more methods.
76 =head1 DESCRIPTION
78 SimpleAlign is an object that handles a multiple sequence alignment
79 (MSA). It is very permissive of types (it does not insist on sequences
80 being all same length, for example). Think of it as a set of sequences
81 with a whole series of built-in manipulations and methods for reading and
82 writing alignments.
84 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
85 to store its sequences. These are subsequences with a start and end
86 positions in the parent reference sequence. Each sequence in the
87 SimpleAlign object is a Bio::LocatableSeq.
89 SimpleAlign expects the combination of name, start, and end for a
90 given sequence to be unique in the alignment, and this is the key for the
91 internal hashes (name, start, end are abbreviated C<nse> in the code).
92 However, in some cases people do not want the name/start-end to be displayed:
93 either multiple names in an alignment or names specific to the alignment
94 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
95 C<displayname>, and generally is what is used to print out the
96 alignment. They default to name/start-end.
98 The SimpleAlign Module is derived from the Align module by Ewan Birney.
100 =head1 FEEDBACK
102 =head2 Mailing Lists
104 User feedback is an integral part of the evolution of this and other
105 Bioperl modules. Send your comments and suggestions preferably to one
106 of the Bioperl mailing lists. Your participation is much appreciated.
108 bioperl-l@bioperl.org - General discussion
109 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
111 =head2 Support
113 Please direct usage questions or support issues to the mailing list:
115 I<bioperl-l@bioperl.org>
117 rather than to the module maintainer directly. Many experienced and
118 reponsive experts will be able look at the problem and quickly
119 address it. Please include a thorough description of the problem
120 with code and data examples if at all possible.
122 =head2 Reporting Bugs
124 Report bugs to the Bioperl bug tracking system to help us keep track
125 the bugs and their resolution. Bug reports can be submitted via the
126 web:
128 https://github.com/bioperl/bioperl-live/issues
130 =head1 AUTHOR
132 Ewan Birney, birney@ebi.ac.uk
134 =head1 CONTRIBUTORS
136 Allen Day, allenday-at-ucla.edu,
137 Richard Adams, Richard.Adams-at-ed.ac.uk,
138 David J. Evans, David.Evans-at-vir.gla.ac.uk,
139 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
140 Allen Smith, allens-at-cpan.org,
141 Jason Stajich, jason-at-bioperl.org,
142 Anthony Underwood, aunderwood-at-phls.org.uk,
143 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
144 Brian Osborne, bosborne at alum.mit.edu
145 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
146 Hongyu Zhang, forward at hongyu.org
147 Jay Hannah, jay at jays.net
148 Alexandr Bezginov, albezg at gmail.com
150 =head1 SEE ALSO
152 L<Bio::LocatableSeq>
154 =head1 APPENDIX
156 The rest of the documentation details each of the object
157 methods. Internal methods are usually preceded with a _
159 =cut
161 ## This data should probably be in a more centralized module...
162 ## it is taken from Clustalw documentation.
163 ## These are all the positively scoring groups that occur in the
164 ## Gonnet Pam250 matrix. The strong and weak groups are
165 ## defined as strong score >0.5 and weak score =<0.5 respectively.
166 our %CONSERVATION_GROUPS = (
167 'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
168 'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
172 =head2 new
174 Title : new
175 Usage : my $aln = Bio::SimpleAlign->new();
176 Function : Creates a new simple align object
177 Returns : Bio::SimpleAlign
178 Args : -source => string representing the source program
179 where this alignment came from
180 -annotation => Bio::AnnotationCollectionI
181 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
182 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
183 -consensus => consensus string
184 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
186 =cut
189 sub new {
190 my($class,@args) = @_;
192 my $self = $class->SUPER::new(@args);
194 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
195 SOURCE
196 SCORE
198 ACCESSION
199 DESCRIPTION
200 SEQS
201 FEATURES
202 ANNOTATION
203 SEQ_ANNOTATION
204 CONSENSUS
205 CONSENSUS_META
206 )], @args);
207 $src && $self->source($src);
208 defined $score && $self->score($score);
209 # we need to set up internal hashs first!
211 $self->{'_seq'} = {};
212 $self->{'_order'} = {};
213 $self->{'_start_end_lists'} = {};
214 $self->{'_dis_name'} = {};
215 $self->{'_id'} = 'NoName';
216 # maybe we should automatically read in from args. Hmmm...
217 $id && $self->id($id);
218 $acc && $self->accession($acc);
219 $desc && $self->description($desc);
220 $coll && $self->annotation($coll);
221 # sequence annotation is layered into a provided annotation collection (or dies)
222 if ($sa) {
223 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
224 "with a sequence annotation collection")
225 if !$coll;
226 $coll->add_Annotation('seq_annotation', $sa);
228 if ($feats && ref $feats eq 'ARRAY') {
229 for my $feat (@$feats) {
230 $self->add_SeqFeature($feat);
233 $con && $self->consensus($con);
234 $cmeta && $self->consensus_meta($cmeta);
235 # assumes these are in correct alignment order
236 if ($seqs && ref($seqs) eq 'ARRAY') {
237 for my $seq (@$seqs) {
238 $self->add_seq($seq);
242 return $self; # success - we hope!
245 =head1 Modifier methods
247 These methods modify the MSA by adding, removing or shuffling complete
248 sequences.
250 =head2 add_seq
252 Title : add_seq
253 Usage : $myalign->add_seq($newseq);
254 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
255 Function : Adds another sequence to the alignment. *Does not* align
256 it - just adds it to the hashes.
257 If -ORDER is specified, the sequence is inserted at the
258 the position spec'd by -ORDER, and existing sequences
259 are pushed down the storage array.
260 Returns : nothing
261 Args : A Bio::LocatableSeq object
262 Positive integer for the sequence position (optional)
264 See L<Bio::LocatableSeq> for more information
266 =cut
268 sub addSeq {
269 my $self = shift;
270 Carp::carp("addSeq - deprecated method. Use add_seq() instead.");
271 $self->add_seq(@_);
274 sub add_seq {
275 my $self = shift;
276 my @args = @_;
277 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
278 my ($name,$id,$start,$end);
280 unless ($seq) {
281 $self->throw("LocatableSeq argument required");
283 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
284 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
286 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
287 $order--; # jay's patch (user-specified order is 1-origin)
289 if ($order < 0) {
290 $self->throw("User-specified value for ORDER must be >= 1");
293 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
295 # build the symbol list for this sequence,
296 # will prune out the gap and missing/match chars
297 # when actually asked for the symbol list in the
298 # symbol_chars
299 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
301 $name = $seq->get_nse;
303 if( $self->{'_seq'}->{$name} ) {
304 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
306 else {
307 $self->debug( "Assigning $name to $order\n");
309 my $ordh = $self->{'_order'};
310 if ($ordh->{$order}) {
311 # make space to insert
312 # $c->() returns (in reverse order) the first subsequence
313 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
314 # (3,2,1), and $c->(2,4,5) returns (2).
315 my $c;
316 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
317 map {
318 $ordh->{$_+1} = $ordh->{$_}
319 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
322 $ordh->{$order} = $name;
324 unless( exists( $self->{'_start_end_lists'}->{$id})) {
325 $self->{'_start_end_lists'}->{$id} = [];
327 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
330 $self->{'_seq'}->{$name} = $seq;
335 =head2 remove_seq
337 Title : remove_seq
338 Usage : $aln->remove_seq($seq);
339 Function : Removes a single sequence from an alignment
340 Returns :
341 Argument : a Bio::LocatableSeq object
343 =cut
345 sub removeSeq {
346 my $self = shift;
347 Carp::carp("removeSeq - deprecated method. Use remove_seq() instead.");
348 $self->remove_seq(@_);
351 sub remove_seq {
352 my $self = shift;
353 my $seq = shift;
354 my ($name,$id);
356 $self->throw("Need Bio::Locatable seq argument ")
357 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
359 $id = $seq->id();
360 $name = $seq->get_nse;
362 if( !exists $self->{'_seq'}->{$name} ) {
363 $self->throw("Sequence $name does not exist in the alignment to remove!");
366 delete $self->{'_seq'}->{$name};
368 # we need to remove this seq from the start_end_lists hash
370 if (exists $self->{'_start_end_lists'}->{$id}) {
371 # we need to find the sequence in the array.
373 my ($i, $found);;
374 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
375 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
376 $found = 1;
377 last;
380 if ($found) {
381 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
383 else {
384 $self->throw("Could not find the sequence to remoce from the start-end list");
387 else {
388 $self->throw("There is no seq list for the name $id");
390 # we need to shift order hash
391 my %rev_order = reverse %{$self->{'_order'}};
392 my $no = $rev_order{$name};
393 my $num_sequences = $self->num_sequences;
394 for (; $no < $num_sequences; $no++) {
395 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
397 delete $self->{'_order'}->{$no};
398 return 1;
402 =head2 purge
404 Title : purge
405 Usage : $aln->purge(0.7);
406 Function: Removes sequences above given sequence similarity
407 This function will grind on large alignments. Beware!
408 Example :
409 Returns : An array of the removed sequences
410 Args : float, threshold for similarity
412 =cut
414 sub purge {
415 my ($self,$perc) = @_;
416 my (%duplicate, @dups);
418 my @seqs = $self->each_seq();
420 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
421 my $seq = $seqs[$i];
423 #skip if already in duplicate hash
424 next if exists $duplicate{$seq->display_id} ;
425 my $one = $seq->seq();
427 my @one = split '', $one; #split to get 1aa per array element
429 for (my $j=$i+1;$j < @seqs;$j++) {
430 my $seq2 = $seqs[$j];
432 #skip if already in duplicate hash
433 next if exists $duplicate{$seq2->display_id} ;
435 my $two = $seq2->seq();
436 my @two = split '', $two;
438 my $count = 0;
439 my $res = 0;
440 for (my $k=0;$k<@one;$k++) {
441 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
442 $one[$k] eq $two[$k]) {
443 $count++;
445 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
446 $two[$k] ne '.' && $two[$k] ne '-' ) {
447 $res++;
451 my $ratio = 0;
452 $ratio = $count/$res unless $res == 0;
454 # if above threshold put in duplicate hash and push onto
455 # duplicate array for returning to get_unique
456 if ( $ratio > $perc ) {
457 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
458 $duplicate{$seq2->display_id} = 1;
459 push @dups, $seq2;
463 foreach my $seq (@dups) {
464 $self->remove_seq($seq);
466 return @dups;
469 =head2 sort_alphabetically
471 Title : sort_alphabetically
472 Usage : $ali->sort_alphabetically
473 Function : Changes the order of the alignment to alphabetical on name
474 followed by numerical by number.
475 Returns :
476 Argument :
478 =cut
480 sub sort_alphabetically {
481 my $self = shift;
482 my ($seq,$nse,@arr,%hash,$count);
484 foreach $seq ( $self->each_seq() ) {
485 $nse = $seq->get_nse;
486 $hash{$nse} = $seq;
489 $count = 0;
491 %{$self->{'_order'}} = (); # reset the hash;
493 foreach $nse ( sort _alpha_startend keys %hash) {
494 $self->{'_order'}->{$count} = $nse;
496 $count++;
501 =head2 sort_by_list
503 Title : sort_by_list
504 Usage : $aln_ordered=$aln->sort_by_list($list_file)
505 Function : Arbitrarily order sequences in an alignment
506 Returns : A new Bio::SimpleAlign object
507 Argument : a file listing sequence names in intended order (one name per line)
509 =cut
511 sub sort_by_list {
512 my ($self, $list) = @_;
513 my (@seq, @ids, %order);
515 foreach my $seq ( $self->each_seq() ) {
516 push @seq, $seq;
517 push @ids, $seq->display_id;
520 my $ct=1;
521 open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
522 while (<$listfh>) {
523 chomp;
524 my $name=$_;
525 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
526 $order{$name}=$ct++;
528 close($listfh);
530 # use the map-sort-map idiom:
531 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
532 my $aln = $self->new;
533 foreach (@sorted) { $aln->add_seq($_) }
534 return $aln;
537 =head2 set_new_reference
539 Title : set_new_reference
540 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
541 the sequence whoes name is "B31" (full, exact, and case-sensitive),
542 as the reference (1st) sequence
543 Function : Change/Set a new reference (i.e., the first) sequence
544 Returns : a new Bio::SimpleAlign object.
545 Throws an exception if designated sequence not found
546 Argument : a positive integer of sequence order, or a sequence name
547 in the original alignment
549 =cut
551 sub set_new_reference {
552 my ($self, $seqid) = @_;
553 my $aln = $self->new;
554 my (@seq, @ids, @new_seq);
555 my $is_num=0;
556 foreach my $seq ( $self->each_seq() ) {
557 push @seq, $seq;
558 push @ids, $seq->display_id;
561 if ($seqid =~ /^\d+$/) { # argument is seq position
562 $is_num=1;
563 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
564 } else { # argument is a seq name
565 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
568 for (my $i=0; $i<=$#seq; $i++) {
569 my $pos=$i+1;
570 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
571 unshift @new_seq, $seq[$i];
572 } else {
573 push @new_seq, $seq[$i];
576 foreach (@new_seq) { $aln->add_seq($_); }
577 return $aln;
580 sub _in_aln { # check if input name exists in the alignment
581 my ($str, $ref) = @_;
582 foreach (@$ref) {
583 return 1 if $str eq $_;
585 return 0;
589 =head2 uniq_seq
591 Title : uniq_seq
592 Usage : $aln->uniq_seq(): Remove identical sequences in
593 in the alignment. Ambiguous base ("N", "n") and
594 leading and ending gaps ("-") are NOT counted as
595 differences.
596 Function : Make a new alignment of unique sequence types (STs)
597 Returns : 1a. if called in a scalar context,
598 a new Bio::SimpleAlign object (all sequences renamed as "ST")
599 1b. if called in an array context,
600 a new Bio::SimpleAlign object, and a hashref whose keys
601 are sequence types, and whose values are arrayrefs to
602 lists of sequence ids within the corresponding sequence type
603 2. if $aln->verbose > 0, ST of each sequence is sent to
604 STDERR (in a tabular format)
605 Argument : None
607 =cut
609 sub uniq_seq {
610 my ($self, $seqid) = @_;
611 my $aln = $self->new;
612 my (%member, %order, @seq, @uniq_str, $st);
613 my $order=0;
614 my $len = $self->length();
615 $st = {};
616 foreach my $seq ( $self->each_seq() ) {
617 my $str = $seq->seq();
619 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
620 # comparing two sequence strings
622 # 1st, convert "n", "N" to "?" (for DNA sequence only):
623 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
624 # 2nd, convert leading and ending gaps to "?":
625 $str = &_convert_leading_ending_gaps($str, '-', '?');
626 # Note that '?' also can mean unknown residue.
627 # I don't like making global class member changes like this, too
628 # prone to errors... -- cjfields 08-11-18
629 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
630 my $new = Bio::LocatableSeq->new(
631 -id => $seq->id(),
632 -alphabet=> $seq->alphabet,
633 -seq => $str,
634 -start => $seq->start,
635 -end => $seq->end
637 push @seq, $new;
640 foreach my $seq (@seq) {
641 my $str = $seq->seq();
642 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
643 if ($seen) { # seen before
644 my @memb = @{$member{$key}};
645 push @memb, $seq;
646 $member{$key} = \@memb;
647 } else { # not seen
648 push @uniq_str, $key;
649 $order++;
650 $member{$key} = [ ($seq) ];
651 $order{$key} = $order;
655 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
656 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
657 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
658 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
659 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
660 my $gap='-';
661 my $end= CORE::length($str2);
662 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
663 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
664 -seq =>$str2,
665 -start=>1,
666 -end =>$end
668 $aln->add_seq($new);
669 foreach (@{$member{$str}}) {
670 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
671 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
674 return wantarray ? ($aln, $st) : $aln;
677 sub _check_uniq { # check if same seq exists in the alignment
678 my ($str1, $ref, $length) = @_;
679 my @char1=split //, $str1;
680 my @array=@$ref;
682 return (0, $str1) if @array==0; # not seen (1st sequence)
684 foreach my $str2 (@array) {
685 my $diff=0;
686 my @char2=split //, $str2;
687 for (my $i=0; $i<=$length-1; $i++) {
688 next if $char1[$i] eq '?';
689 next if $char2[$i] eq '?';
690 $diff++ if $char1[$i] ne $char2[$i];
692 return (1, $str2) if $diff == 0; # seen before
695 return (0, $str1); # not seen
698 sub _convert_leading_ending_gaps {
699 my $s=shift;
700 my $sym1=shift;
701 my $sym2=shift;
702 my @array=split //, $s;
703 # convert leading char:
704 for (my $i=0; $i<=$#array; $i++) {
705 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
707 # convert ending char:
708 for (my $i = $#array; $i>= 0; $i--) {
709 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
711 my $s_new=join '', @array;
712 return $s_new;
715 =head1 Sequence selection methods
717 Methods returning one or more sequences objects.
719 =head2 each_seq
721 Title : each_seq
722 Usage : foreach $seq ( $align->each_seq() )
723 Function : Gets a Seq object from the alignment
724 Returns : Seq object
725 Argument :
727 =cut
729 sub eachSeq {
730 my $self = shift;
731 Carp::carp("eachSeq - deprecated method. Use each_seq() instead.");
732 $self->each_seq();
735 sub each_seq {
736 my $self = shift;
737 my (@arr,$order);
739 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
740 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
741 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
744 return @arr;
748 =head2 each_alphabetically
750 Title : each_alphabetically
751 Usage : foreach $seq ( $ali->each_alphabetically() )
752 Function : Returns a sequence object, but the objects are returned
753 in alphabetically sorted order.
754 Does not change the order of the alignment.
755 Returns : Seq object
756 Argument :
758 =cut
760 sub each_alphabetically {
761 my $self = shift;
762 my ($seq,$nse,@arr,%hash,$count);
764 foreach $seq ( $self->each_seq() ) {
765 $nse = $seq->get_nse;
766 $hash{$nse} = $seq;
769 foreach $nse ( sort _alpha_startend keys %hash) {
770 push(@arr,$hash{$nse});
772 return @arr;
775 sub _alpha_startend {
776 my ($aname,$astart,$bname,$bstart);
777 ($aname,$astart) = split (/-/,$a);
778 ($bname,$bstart) = split (/-/,$b);
780 if( $aname eq $bname ) {
781 return $astart <=> $bstart;
783 else {
784 return $aname cmp $bname;
788 =head2 each_seq_with_id
790 Title : each_seq_with_id
791 Usage : foreach $seq ( $align->each_seq_with_id() )
792 Function : Gets a Seq objects from the alignment, the contents
793 being those sequences with the given name (there may be
794 more than one)
795 Returns : Seq object
796 Argument : a seq name
798 =cut
800 sub eachSeqWithId {
801 my $self = shift;
802 Carp::carp("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
803 $self->each_seq_with_id(@_);
806 sub each_seq_with_id {
807 my $self = shift;
808 my $id = shift;
810 $self->throw("Method each_seq_with_id needs a sequence name argument")
811 unless defined $id;
813 my (@arr, $seq);
815 if (exists($self->{'_start_end_lists'}->{$id})) {
816 @arr = @{$self->{'_start_end_lists'}->{$id}};
818 return @arr;
821 =head2 get_seq_by_pos
823 Title : get_seq_by_pos
824 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
825 Function : Gets a sequence based on its position in the alignment.
826 Numbering starts from 1. Sequence positions larger than
827 num_sequences() will throw an error.
828 Returns : a Bio::LocatableSeq object
829 Args : positive integer for the sequence position
831 =cut
833 sub get_seq_by_pos {
835 my $self = shift;
836 my ($pos) = @_;
838 $self->throw("Sequence position has to be a positive integer, not [$pos]")
839 unless $pos =~ /^\d+$/ and $pos > 0;
840 $self->throw("No sequence at position [$pos]")
841 unless $pos <= $self->num_sequences ;
843 my $nse = $self->{'_order'}->{--$pos};
844 return $self->{'_seq'}->{$nse};
847 =head2 get_seq_by_id
849 Title : get_seq_by_id
850 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
851 Function : Gets a sequence based on its name.
852 Sequences that do not exist will warn and return undef
853 Returns : a Bio::LocatableSeq object
854 Args : string for sequence name
856 =cut
858 sub get_seq_by_id {
859 my ($self,$name) = @_;
860 unless( defined $name ) {
861 $self->warn("Must provide a sequence name");
862 return;
864 for my $seq ( values %{$self->{'_seq'}} ) {
865 if ( $seq->id eq $name) {
866 return $seq;
869 return;
872 =head2 seq_with_features
874 Title : seq_with_features
875 Usage : $seq = $aln->seq_with_features(-pos => 1,
876 -consensus => 60
877 -mask =>
878 sub { my $consensus = shift;
880 for my $i (1..5){
881 my $n = 'N' x $i;
882 my $q = '\?' x $i;
883 while($consensus =~ /[^?]$q[^?]/){
884 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
887 return $consensus;
890 Function: produces a Bio::Seq object by first splicing gaps from -pos
891 (by means of a splice_by_seq_pos() call), then creating
892 features using non-? chars (by means of a consensus_string()
893 call with stringency -consensus).
894 Returns : a Bio::Seq object
895 Args : -pos : required. sequence from which to build the Bio::Seq
896 object
897 -consensus : optional, defaults to consensus_string()'s
898 default cutoff value
899 -mask : optional, a coderef to apply to consensus_string()'s
900 output before building features. this may be useful for
901 closing gaps of 1 bp by masking over them with N, for
902 instance
904 =cut
906 sub seq_with_features{
907 my ($self,%arg) = @_;
909 #first do the preparatory splice
910 $self->throw("must provide a -pos argument") unless $arg{-pos};
911 $self->splice_by_seq_pos($arg{-pos});
913 my $consensus_string = $self->consensus_string($arg{-consensus});
914 $consensus_string = $arg{-mask}->($consensus_string)
915 if defined($arg{-mask});
917 my(@bs,@es);
919 push @bs, 1 if $consensus_string =~ /^[^?]/;
921 while($consensus_string =~ /\?[^?]/g){
922 push @bs, pos($consensus_string);
924 while($consensus_string =~ /[^?]\?/g){
925 push @es, pos($consensus_string);
928 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
930 my $seq = Bio::Seq->new();
932 # my $rootfeature = Bio::SeqFeature::Generic->new(
933 # -source_tag => 'location',
934 # -start => $self->get_seq_by_pos($arg{-pos})->start,
935 # -end => $self->get_seq_by_pos($arg{-pos})->end,
936 # );
937 # $seq->add_SeqFeature($rootfeature);
939 while(my $b = shift @bs){
940 my $e = shift @es;
941 $seq->add_SeqFeature(
942 Bio::SeqFeature::Generic->new(
943 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
944 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
945 -source_tag => $self->source || 'MSA',
950 return $seq;
954 =head1 Create new alignments
956 The result of these methods are horizontal or vertical subsets of the
957 current MSA.
959 =head2 select
961 Title : select
962 Usage : $aln2 = $aln->select(1, 3) # three first sequences
963 Function : Creates a new alignment from a continuous subset of
964 sequences. Numbering starts from 1. Sequence positions
965 larger than num_sequences() will throw an error.
966 Returns : a Bio::SimpleAlign object
967 Args : positive integer for the first sequence
968 positive integer for the last sequence to include (optional)
970 =cut
972 sub select {
973 my $self = shift;
974 my ($start, $end) = @_;
976 $self->throw("Select start has to be a positive integer, not [$start]")
977 unless $start =~ /^\d+$/ and $start > 0;
978 $self->throw("Select end has to be a positive integer, not [$end]")
979 unless $end =~ /^\d+$/ and $end > 0;
980 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
981 unless $start <= $end;
983 my $aln = $self->new;
984 foreach my $pos ($start .. $end) {
985 $aln->add_seq($self->get_seq_by_pos($pos));
987 $aln->id($self->id);
988 # fix for meta, sf, ann
989 return $aln;
992 =head2 select_noncont
994 Title : select_noncont
995 Usage : # 1st and 3rd sequences, sorted
996 $aln2 = $aln->select_noncont(1, 3)
998 # 1st and 3rd sequences, sorted (same as first)
999 $aln2 = $aln->select_noncont(3, 1)
1001 # 1st and 3rd sequences, unsorted
1002 $aln2 = $aln->select_noncont('nosort',3, 1)
1004 Function : Creates a new alignment from a subset of sequences. Numbering
1005 starts from 1. Sequence positions larger than num_sequences() will
1006 throw an error. Sorts the order added to new alignment by default,
1007 to prevent sorting pass 'nosort' as the first argument in the list.
1008 Returns : a Bio::SimpleAlign object
1009 Args : array of integers for the sequences. If the string 'nosort' is
1010 passed as the first argument, the sequences will not be sorted
1011 in the new alignment but will appear in the order listed.
1013 =cut
1015 sub select_noncont {
1016 my $self = shift;
1017 my $nosort = 0;
1018 my (@pos) = @_;
1019 if ($pos[0] !~ m{^\d+$}) {
1020 my $sortcmd = shift @pos;
1021 if ($sortcmd eq 'nosort') {
1022 $nosort = 1;
1023 } else {
1024 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1028 my $end = $self->num_sequences;
1029 foreach ( @pos ) {
1030 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1031 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1034 @pos = sort {$a <=> $b} @pos unless $nosort;
1036 my $aln = $self->new;
1037 foreach my $p (@pos) {
1038 $aln->add_seq($self->get_seq_by_pos($p));
1040 $aln->id($self->id);
1041 # fix for meta, sf, ann
1042 return $aln;
1045 =head2 select_noncont_by_name
1047 Title : select_noncont_by_name
1048 Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1049 Function : Creates a new alignment from a subset of sequences which are
1050 selected by name (sequence ID).
1051 Returns : a Bio::SimpleAlign object
1052 Args : array of names (i.e., identifiers) for the sequences.
1054 =cut
1056 sub select_noncont_by_name {
1057 my ($self, @names) = @_;
1059 my $aln = $self->new;
1060 foreach my $name (@names) {
1061 $aln->add_seq($self->get_seq_by_id($name));
1063 $aln->id($self->id);
1065 return $aln;
1068 =head2 slice
1070 Title : slice
1071 Usage : $aln2 = $aln->slice(20,30)
1072 Function : Creates a slice from the alignment inclusive of start and
1073 end columns, and the first column in the alignment is denoted 1.
1074 Sequences with no residues in the slice are excluded from the
1075 new alignment and a warning is printed. Slice beyond the length of
1076 the sequence does not do padding.
1077 Returns : A Bio::SimpleAlign object
1078 Args : Positive integer for start column, positive integer for end column,
1079 optional boolean which if true will keep gap-only columns in the newly
1080 created slice. Example:
1082 $aln2 = $aln->slice(20,30,1)
1084 =cut
1086 sub slice {
1087 my $self = shift;
1088 my ($start, $end, $keep_gap_only) = @_;
1090 $self->throw("Slice start has to be a positive integer, not [$start]")
1091 unless $start =~ /^\d+$/ and $start > 0;
1092 $self->throw("Slice end has to be a positive integer, not [$end]")
1093 unless $end =~ /^\d+$/ and $end > 0;
1094 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1095 unless $start <= $end;
1096 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1097 "[$start] is too big.") if $start > $self->length;
1098 my $cons_meta = $self->consensus_meta;
1099 my $aln = $self->new;
1100 $aln->id($self->id);
1101 foreach my $seq ( $self->each_seq() ) {
1102 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1103 Bio::Seq::Meta->new
1104 (-id => $seq->id,
1105 -alphabet => $seq->alphabet,
1106 -strand => $seq->strand,
1107 -verbose => $self->verbose) :
1108 Bio::LocatableSeq->new
1109 (-id => $seq->id,
1110 -alphabet => $seq->alphabet,
1111 -strand => $seq->strand,
1112 -verbose => $self->verbose);
1114 # seq
1115 my $seq_end = $end;
1116 $seq_end = $seq->length if( $end > $seq->length );
1118 my $slice_seq = $seq->subseq($start, $seq_end);
1119 $new_seq->seq( $slice_seq );
1121 # Allowed extra characters in string
1122 my $allowed_chars = '';
1123 if (exists $self->{_mask_char}) {
1124 $allowed_chars = $self->{_mask_char};
1125 $allowed_chars = quotemeta $allowed_chars;
1127 $slice_seq =~ s/[^\w$allowed_chars]//g;
1129 if ($start > 1) {
1130 my $pre_start_seq = $seq->subseq(1, $start - 1);
1131 $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1132 if (!defined($seq->strand)) {
1133 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134 } elsif ($seq->strand < 0){
1135 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1136 } else {
1137 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1139 } else {
1140 if ((defined $seq->strand)&&($seq->strand < 0)){
1141 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1142 } else {
1143 $new_seq->start( $seq->start);
1146 if ($new_seq->isa('Bio::Seq::MetaI')) {
1147 for my $meta_name ($seq->meta_names) {
1148 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1151 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1153 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1154 $aln->add_seq($new_seq);
1155 } else {
1156 if( $keep_gap_only ) {
1157 $aln->add_seq($new_seq);
1158 } else {
1159 my $nse = $seq->get_nse();
1160 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1161 " Sequence excluded from the new alignment.");
1165 if ($cons_meta) {
1166 my $new = Bio::Seq::Meta->new();
1167 for my $meta_name ($cons_meta->meta_names) {
1168 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1170 $aln->consensus_meta($new);
1172 $aln->annotation($self->annotation);
1173 # fix for meta, sf, ann
1174 return $aln;
1177 =head2 remove_columns
1179 Title : remove_columns
1180 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1181 $aln2 = $aln->remove_columns([0,0],[6,8])
1182 Function : Creates an aligment with columns removed corresponding to
1183 the specified type or by specifying the columns by number.
1184 Returns : Bio::SimpleAlign object
1185 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1186 'all_gaps_columns') or array ref where the referenced array
1187 contains a pair of integers that specify a range.
1188 The first column is 0
1190 =cut
1192 sub remove_columns {
1193 my ($self,@args) = @_;
1194 @args || $self->throw("Must supply column ranges or column types");
1195 my $aln;
1197 if ($args[0][0] =~ /^[a-z_]+$/i) {
1198 $aln = $self->_remove_columns_by_type($args[0]);
1199 } elsif ($args[0][0] =~ /^\d+$/) {
1200 $aln = $self->_remove_columns_by_num(\@args);
1201 } else {
1202 $self->throw("You must pass array references to remove_columns(), not @args");
1204 # fix for meta, sf, ann
1205 $aln;
1209 =head2 remove_gaps
1211 Title : remove_gaps
1212 Usage : $aln2 = $aln->remove_gaps
1213 Function : Creates an aligment with gaps removed
1214 Returns : a Bio::SimpleAlign object
1215 Args : a gap character(optional) if none specified taken
1216 from $self->gap_char,
1217 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1218 indicates that only all-gaps columns should be deleted
1220 Used from method L<remove_columns> in most cases. Set gap character
1221 using L<gap_char()|gap_char>.
1223 =cut
1225 sub remove_gaps {
1226 my ($self,$gapchar,$all_gaps_columns) = @_;
1227 my $gap_line;
1228 if ($all_gaps_columns) {
1229 $gap_line = $self->all_gap_line($gapchar);
1230 } else {
1231 $gap_line = $self->gap_line($gapchar);
1233 my $aln = $self->new;
1235 my @remove;
1236 my $length = 0;
1237 my $del_char = $gapchar || $self->gap_char;
1238 # Do the matching to get the segments to remove
1239 while ($gap_line =~ m/[$del_char]/g) {
1240 my $start = pos($gap_line)-1;
1241 $gap_line =~ m/\G[$del_char]+/gc;
1242 my $end = pos($gap_line)-1;
1244 #have to offset the start and end for subsequent removes
1245 $start-=$length;
1246 $end -=$length;
1247 $length += ($end-$start+1);
1248 push @remove, [$start,$end];
1251 #remove the segments
1252 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1253 # fix for meta, sf, ann
1254 return $aln;
1258 sub _remove_col {
1259 my ($self,$aln,$remove) = @_;
1260 my @new;
1262 my $gap = $self->gap_char;
1264 # splice out the segments and create new seq
1265 foreach my $seq($self->each_seq){
1266 my $new_seq = Bio::LocatableSeq->new(
1267 -id => $seq->id,
1268 -alphabet=> $seq->alphabet,
1269 -strand => $seq->strand,
1270 -verbose => $self->verbose);
1271 my $sequence = $seq->seq;
1272 foreach my $pair(@{$remove}){
1273 my $start = $pair->[0];
1274 my $end = $pair->[1];
1275 $sequence = $seq->seq unless $sequence;
1276 my $orig = $sequence;
1277 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1278 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1279 $sequence = $head.$tail;
1280 # start
1281 unless (defined $new_seq->start) {
1282 if ($start == 0) {
1283 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1284 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1286 else {
1287 my $start_adjust = $orig =~ /^$gap+/;
1288 if ($start_adjust) {
1289 $start_adjust = $+[0] == $start;
1291 $new_seq->start($seq->start + $start_adjust);
1294 # end
1295 if (($end + 1) >= CORE::length($orig)) {
1296 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1297 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1299 else {
1300 $new_seq->end($seq->end);
1304 if ($new_seq->end < $new_seq->start) {
1305 # we removed all columns except for gaps: set to 0 to indicate no
1306 # sequence
1307 $new_seq->start(0);
1308 $new_seq->end(0);
1311 $new_seq->seq($sequence) if $sequence;
1312 push @new, $new_seq;
1314 # add the new seqs to the alignment
1315 foreach my $new(@new){
1316 $aln->add_seq($new);
1318 # fix for meta, sf, ann
1319 return $aln;
1322 sub _remove_columns_by_type {
1323 my ($self,$type) = @_;
1324 my $aln = $self->new;
1325 my @remove;
1327 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1328 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1329 my %matchchars = ( 'match' => '\*',
1330 'weak' => '\.',
1331 'strong' => ':',
1332 'mismatch' => ' ',
1333 'gaps' => '',
1334 'all_gaps_columns' => ''
1336 # get the characters to delete against
1337 my $del_char;
1338 foreach my $type (@{$type}){
1339 $del_char.= $matchchars{$type};
1342 my $length = 0;
1343 my $match_line = $self->match_line;
1344 # do the matching to get the segments to remove
1345 if($del_char){
1346 while($match_line =~ m/[$del_char]/g ){
1347 my $start = pos($match_line)-1;
1348 $match_line=~/\G[$del_char]+/gc;
1349 my $end = pos($match_line)-1;
1351 #have to offset the start and end for subsequent removes
1352 $start-=$length;
1353 $end -=$length;
1354 $length += ($end-$start+1);
1355 push @remove, [$start,$end];
1359 # remove the segments
1360 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1361 $aln = $aln->remove_gaps() if $gap;
1362 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1363 # fix for meta, sf, ann
1364 $aln;
1368 sub _remove_columns_by_num {
1369 my ($self,$positions) = @_;
1370 my $aln = $self->new;
1372 # sort the positions
1373 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1375 my @remove;
1376 my $length = 0;
1377 foreach my $pos (@{$positions}) {
1378 my ($start, $end) = @{$pos};
1380 #have to offset the start and end for subsequent removes
1381 $start-=$length;
1382 $end -=$length;
1383 $length += ($end-$start+1);
1384 push @remove, [$start,$end];
1387 #remove the segments
1388 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1389 # fix for meta, sf, ann
1390 $aln;
1394 =head1 Change sequences within the MSA
1396 These methods affect characters in all sequences without changing the
1397 alignment.
1399 =head2 splice_by_seq_pos
1401 Title : splice_by_seq_pos
1402 Usage : $status = splice_by_seq_pos(1);
1403 Function: splices all aligned sequences where the specified sequence
1404 has gaps.
1405 Example :
1406 Returns : 1 on success
1407 Args : position of sequence to splice by
1410 =cut
1412 sub splice_by_seq_pos{
1413 my ($self,$pos) = @_;
1415 my $guide = $self->get_seq_by_pos($pos);
1416 my $guide_seq = $guide->seq;
1418 $guide_seq =~ s/\./\-/g;
1420 my @gaps = ();
1421 $pos = -1;
1422 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1423 unshift @gaps, $pos;
1424 $pos++;
1427 foreach my $seq ($self->each_seq){
1428 my @bases = split '', $seq->seq;
1430 splice(@bases, $_, 1) foreach @gaps;
1431 $seq->seq(join('', @bases));
1437 =head2 map_chars
1439 Title : map_chars
1440 Usage : $ali->map_chars('\.','-')
1441 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1442 characters.
1444 Note that the first argument is interpreted as a regexp
1445 so be careful and escape any wild card characters (e.g.
1446 do $ali->map_chars('\.','-') to replace periods with dashes.
1447 Returns : 1 on success
1448 Argument : A regexp and a string
1450 =cut
1452 sub map_chars {
1453 my $self = shift;
1454 my $from = shift;
1455 my $to = shift;
1456 my ( $seq, $temp );
1458 $self->throw("Need two arguments: a regexp and a string")
1459 unless defined $from and defined $to;
1461 foreach $seq ( $self->each_seq() ) {
1462 $temp = $seq->seq();
1463 $temp =~ s/$from/$to/g;
1464 $seq->seq($temp);
1466 return 1;
1470 =head2 uppercase
1472 Title : uppercase()
1473 Usage : $ali->uppercase()
1474 Function : Sets all the sequences to uppercase
1475 Returns : 1 on success
1476 Argument :
1478 =cut
1480 sub uppercase {
1481 my $self = shift;
1482 my $seq;
1483 my $temp;
1485 foreach $seq ( $self->each_seq() ) {
1486 $temp = $seq->seq();
1487 $temp =~ tr/[a-z]/[A-Z]/;
1489 $seq->seq($temp);
1491 return 1;
1494 =head2 cigar_line
1496 Title : cigar_line()
1497 Usage : %cigars = $align->cigar_line()
1498 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1499 Report) line for each sequence in the alignment. Examples are
1500 "1,60" or "5,10:12,58", where the numbers refer to conserved
1501 positions within the alignment. The keys of the hash are the
1502 NSEs (name/start/end) assigned to each sequence.
1503 Args : threshold (optional, defaults to 100)
1504 Returns : Hash of strings (cigar lines)
1506 =cut
1508 sub cigar_line {
1509 my $self = shift;
1510 my $thr=shift||100;
1511 my %cigars;
1513 my @consensus = split "",($self->consensus_string($thr));
1514 my $len = $self->length;
1515 my $gapchar = $self->gap_char;
1517 # create a precursor, something like (1,4,5,6,7,33,45),
1518 # where each number corresponds to a conserved position
1519 foreach my $seq ( $self->each_seq ) {
1520 my @seq = split "", uc ($seq->seq);
1521 my $pos = 1;
1522 for (my $x = 0 ; $x < $len ; $x++ ) {
1523 if ($seq[$x] eq $consensus[$x]) {
1524 push @{$cigars{$seq->get_nse}},$pos;
1525 $pos++;
1526 } elsif ($seq[$x] ne $gapchar) {
1527 $pos++;
1531 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1532 for my $name (keys %cigars) {
1533 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1534 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1535 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1536 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1537 ${$cigars{$name}}[$#{$cigars{$name}}] );
1538 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1539 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1540 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1541 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1545 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1546 for my $name (keys %cigars) {
1547 my @remove;
1548 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1549 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1550 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1551 unshift @remove,$x;
1554 for my $pos (@remove) {
1555 splice @{$cigars{$name}}, $pos, 1;
1558 # join and punctuate
1559 for my $name (keys %cigars) {
1560 my ($start,$end,$str) = "";
1561 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1562 $str .= ($start . "," . $end . ":");
1564 $str =~ s/:$//;
1565 $cigars{$name} = $str;
1567 %cigars;
1571 =head2 match_line
1573 Title : match_line()
1574 Usage : $line = $align->match_line()
1575 Function : Generates a match line - much like consensus string
1576 except that a line indicating the '*' for a match.
1577 Args : (optional) Match line characters ('*' by default)
1578 (optional) Strong match char (':' by default)
1579 (optional) Weak match char ('.' by default)
1580 Returns : String
1582 =cut
1584 sub match_line {
1585 my ($self,$matchlinechar, $strong, $weak) = @_;
1586 my %matchchars = ('match' => $matchlinechar || '*',
1587 'weak' => $weak || '.',
1588 'strong' => $strong || ':',
1589 'mismatch' => ' ',
1592 my @seqchars;
1593 my $alphabet;
1594 foreach my $seq ( $self->each_seq ) {
1595 push @seqchars, [ split(//, uc ($seq->seq)) ];
1596 $alphabet = $seq->alphabet unless defined $alphabet;
1598 my $refseq = shift @seqchars;
1599 # let's just march down the columns
1600 my $matchline;
1601 POS:
1602 foreach my $pos ( 0..$self->length ) {
1603 my $refchar = $refseq->[$pos];
1604 my $char = $matchchars{'mismatch'};
1605 unless( defined $refchar ) {
1606 last if $pos == $self->length; # short circuit on last residue
1607 # this in place to handle jason's soon-to-be-committed
1608 # intron mapping code
1609 goto bottom;
1611 my %col = ($refchar => 1);
1612 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1613 foreach my $seq ( @seqchars ) {
1614 next if $pos >= scalar @$seq;
1615 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1616 $seq->[$pos] eq ' ' );
1617 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1619 my @colresidues = sort keys %col;
1621 # if all the values are the same
1622 if( $dash ) { $char = $matchchars{'mismatch'} }
1623 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1624 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1625 # matches for protein seqs
1626 TYPE:
1627 foreach my $type ( qw(strong weak) ) {
1628 # iterate through categories
1629 my %groups;
1630 # iterate through each of the aa in the col
1631 # look to see which groups it is in
1632 foreach my $c ( @colresidues ) {
1633 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1634 push @{$groups{$f}},$c;
1637 GRP:
1638 foreach my $cols ( values %groups ) {
1639 @$cols = sort @$cols;
1640 # now we are just testing to see if two arrays
1641 # are identical w/o changing either one
1642 # have to be same len
1643 next if( scalar @$cols != scalar @colresidues );
1644 # walk down the length and check each slot
1645 for($_=0;$_ < (scalar @$cols);$_++ ) {
1646 next GRP if( $cols->[$_] ne $colresidues[$_] );
1648 $char = $matchchars{$type};
1649 last TYPE;
1653 bottom:
1654 $matchline .= $char;
1656 return $matchline;
1660 =head2 gap_line
1662 Title : gap_line()
1663 Usage : $line = $align->gap_line()
1664 Function : Generates a gap line - much like consensus string
1665 except that a line where '-' represents gap
1666 Args : (optional) gap line characters ('-' by default)
1667 Returns : string
1669 =cut
1671 sub gap_line {
1672 my ($self,$gapchar) = @_;
1673 $gapchar = $gapchar || $self->gap_char;
1674 my %gap_hsh; # column gaps vector
1675 foreach my $seq ( $self->each_seq ) {
1676 my $i = 0;
1677 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1678 map {[$i++, $_]} split(//, uc ($seq->seq));
1680 my $gap_line;
1681 foreach my $pos ( 0..$self->length-1 ) {
1682 $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1684 return $gap_line;
1687 =head2 all_gap_line
1689 Title : all_gap_line()
1690 Usage : $line = $align->all_gap_line()
1691 Function : Generates a gap line - much like consensus string
1692 except that a line where '-' represents all-gap column
1693 Args : (optional) gap line characters ('-' by default)
1694 Returns : string
1696 =cut
1698 sub all_gap_line {
1699 my ($self,$gapchar) = @_;
1700 $gapchar = $gapchar || $self->gap_char;
1701 my %gap_hsh; # column gaps counter hash
1702 my @seqs = $self->each_seq;
1703 foreach my $seq ( @seqs ) {
1704 my $i = 0;
1705 map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1706 map {[$i++, $_]} split(//, uc ($seq->seq));
1708 my $gap_line;
1709 foreach my $pos ( 0..$self->length-1 ) {
1710 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1711 # gaps column
1712 $gap_line .= $self->gap_char;
1713 } else {
1714 $gap_line .= '.';
1717 return $gap_line;
1720 =head2 gap_col_matrix
1722 Title : gap_col_matrix()
1723 Usage : my $cols = $align->gap_col_matrix()
1724 Function : Generates an array where each element in the array is a
1725 hash reference with a key of the sequence name and a
1726 value of 1 if the sequence has a gap at that column
1727 Returns : Reference to an array
1728 Args : Optional: gap line character ($aln->gap_char or '-' by default)
1730 =cut
1732 sub gap_col_matrix {
1733 my ( $self, $gapchar ) = @_;
1734 $gapchar = $gapchar || $self->gap_char;
1735 my %gap_hsh; # column gaps vector
1736 my @cols;
1737 foreach my $seq ( $self->each_seq ) {
1738 my $i = 0;
1739 my $str = $seq->seq;
1740 my $len = $seq->length;
1741 my $ch;
1742 my $id = $seq->display_id;
1743 while ( $i < $len ) {
1744 $ch = substr( $str, $i, 1 );
1745 $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1748 return \@cols;
1751 =head2 match
1753 Title : match()
1754 Usage : $ali->match()
1755 Function : Goes through all columns and changes residues that are
1756 identical to residue in first sequence to match '.'
1757 character. Sets match_char.
1759 USE WITH CARE: Most MSA formats do not support match
1760 characters in sequences, so this is mostly for output
1761 only. NEXUS format (Bio::AlignIO::nexus) can handle
1763 Returns : 1 on success
1764 Argument : a match character, optional, defaults to '.'
1766 =cut
1768 sub match {
1769 my ( $self, $match ) = @_;
1771 $match ||= '.';
1772 my ($matching_char) = $match;
1773 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/; #';
1774 $self->map_chars( $matching_char, '-' );
1776 my @seqs = $self->each_seq();
1777 return 1 unless scalar @seqs > 1;
1779 my $refseq = shift @seqs;
1780 my @refseq = split //, $refseq->seq;
1781 my $gapchar = $self->gap_char;
1783 foreach my $seq (@seqs) {
1784 my @varseq = split //, $seq->seq();
1785 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1786 $varseq[$i] = $match
1787 if defined $refseq[$i]
1788 && ( $refseq[$i] =~ /[A-Za-z\*]/
1789 || $refseq[$i] =~ /$gapchar/ )
1790 && $refseq[$i] eq $varseq[$i];
1792 $seq->seq( join '', @varseq );
1794 $self->match_char($match);
1795 return 1;
1800 =head2 unmatch
1802 Title : unmatch()
1803 Usage : $ali->unmatch()
1804 Function : Undoes the effect of method match. Unsets match_char.
1805 Returns : 1 on success
1806 Argument : a match character, optional, defaults to '.'
1808 See L<match> and L<match_char>
1810 =cut
1812 sub unmatch {
1813 my ( $self, $match ) = @_;
1815 $match ||= '.';
1817 my @seqs = $self->each_seq();
1818 return 1 unless scalar @seqs > 1;
1820 my $refseq = shift @seqs;
1821 my @refseq = split //, $refseq->seq;
1822 my $gapchar = $self->gap_char;
1823 foreach my $seq (@seqs) {
1824 my @varseq = split //, $seq->seq();
1825 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1826 $varseq[$i] = $refseq[$i]
1827 if defined $refseq[$i]
1828 && ( $refseq[$i] =~ /[A-Za-z\*]/
1829 || $refseq[$i] =~ /$gapchar/ )
1830 && $varseq[$i] eq $match;
1832 $seq->seq( join '', @varseq );
1834 $self->match_char('');
1835 return 1;
1839 =head1 MSA attributes
1841 Methods for setting and reading the MSA attributes.
1843 Note that the methods defining character semantics depend on the user
1844 to set them sensibly. They are needed only by certain input/output
1845 methods. Unset them by setting to an empty string ('').
1847 =head2 id
1849 Title : id
1850 Usage : $myalign->id("Ig")
1851 Function : Gets/sets the id field of the alignment
1852 Returns : An id string
1853 Argument : An id string (optional)
1855 =cut
1857 sub id {
1858 my ( $self, $name ) = @_;
1860 if ( defined($name) ) {
1861 $self->{'_id'} = $name;
1864 return $self->{'_id'};
1867 =head2 accession
1869 Title : accession
1870 Usage : $myalign->accession("PF00244")
1871 Function : Gets/sets the accession field of the alignment
1872 Returns : An acc string
1873 Argument : An acc string (optional)
1875 =cut
1877 sub accession {
1878 my ( $self, $acc ) = @_;
1880 if ( defined($acc) ) {
1881 $self->{'_accession'} = $acc;
1884 return $self->{'_accession'};
1887 =head2 description
1889 Title : description
1890 Usage : $myalign->description("14-3-3 proteins")
1891 Function : Gets/sets the description field of the alignment
1892 Returns : An description string
1893 Argument : An description string (optional)
1895 =cut
1897 sub description {
1898 my ( $self, $name ) = @_;
1900 if ( defined($name) ) {
1901 $self->{'_description'} = $name;
1904 return $self->{'_description'};
1907 =head2 missing_char
1909 Title : missing_char
1910 Usage : $myalign->missing_char("?")
1911 Function : Gets/sets the missing_char attribute of the alignment
1912 It is generally recommended to set it to 'n' or 'N'
1913 for nucleotides and to 'X' for protein.
1914 Returns : An missing_char string,
1915 Argument : An missing_char string (optional)
1917 =cut
1919 sub missing_char {
1920 my ( $self, $char ) = @_;
1922 if ( defined $char ) {
1923 $self->throw("Single missing character, not [$char]!")
1924 if CORE::length($char) > 1;
1925 $self->{'_missing_char'} = $char;
1928 return $self->{'_missing_char'};
1932 =head2 match_char
1934 Title : match_char
1935 Usage : $myalign->match_char('.')
1936 Function : Gets/sets the match_char attribute of the alignment
1937 Returns : An match_char string,
1938 Argument : An match_char string (optional)
1940 =cut
1942 sub match_char {
1943 my ( $self, $char ) = @_;
1945 if ( defined $char ) {
1946 $self->throw("Single match character, not [$char]!")
1947 if CORE::length($char) > 1;
1948 $self->{'_match_char'} = $char;
1951 return $self->{'_match_char'};
1954 =head2 gap_char
1956 Title : gap_char
1957 Usage : $myalign->gap_char('-')
1958 Function : Gets/sets the gap_char attribute of the alignment
1959 Returns : An gap_char string, defaults to '-'
1960 Argument : An gap_char string (optional)
1962 =cut
1964 sub gap_char {
1965 my ( $self, $char ) = @_;
1967 if ( defined $char || !defined $self->{'_gap_char'} ) {
1968 $char = '-' unless defined $char;
1969 $self->throw("Single gap character, not [$char]!")
1970 if CORE::length($char) > 1;
1971 $self->{'_gap_char'} = $char;
1973 return $self->{'_gap_char'};
1977 =head2 symbol_chars
1979 Title : symbol_chars
1980 Usage : my @symbolchars = $aln->symbol_chars;
1981 Function: Returns all the seen symbols (other than gaps)
1982 Returns : array of characters that are the seen symbols
1983 Args : boolean to include the gap/missing/match characters
1985 =cut
1987 sub symbol_chars{
1988 my ($self,$includeextra) = @_;
1990 unless ($self->{'_symbols'}) {
1991 foreach my $seq ($self->each_seq) {
1992 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1995 my %copy = %{$self->{'_symbols'}};
1996 if( ! $includeextra ) {
1997 foreach my $char ( $self->gap_char, $self->match_char,
1998 $self->missing_char) {
1999 delete $copy{$char} if( defined $char );
2002 return keys %copy;
2005 =head1 Alignment descriptors
2007 These read only methods describe the MSA in various ways.
2010 =head2 score
2012 Title : score
2013 Usage : $str = $ali->score()
2014 Function : get/set a score of the alignment
2015 Returns : a score for the alignment
2016 Argument : an optional score to set
2018 =cut
2020 sub score {
2021 my $self = shift;
2022 $self->{score} = shift if @_;
2023 return $self->{score};
2026 =head2 consensus_string
2028 Title : consensus_string
2029 Usage : $str = $ali->consensus_string($threshold_percent)
2030 Function : Makes a strict consensus
2031 Returns : Consensus string
2032 Argument : Optional threshold ranging from 0 to 100.
2033 The consensus residue has to appear at least threshold %
2034 of the sequences at a given location, otherwise a '?'
2035 character will be placed at that location.
2036 (Default value = 0%)
2038 =cut
2040 sub consensus_string {
2041 my $self = shift;
2042 my $threshold = shift;
2044 my $out = "";
2045 my $len = $self->length - 1;
2047 foreach ( 0 .. $len ) {
2048 $out .= $self->_consensus_aa( $_, $threshold );
2050 return $out;
2054 =head2 consensus_conservation
2056 Title : consensus_conservation
2057 Usage : @conservation = $ali->consensus_conservation();
2058 Function : Conservation (as a percent) of each position of alignment
2059 Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2060 Argument :
2062 =cut
2064 sub consensus_conservation {
2065 my $self = shift;
2066 my @cons;
2067 my $num_sequences = $self->num_sequences;
2068 foreach my $point (0..$self->length-1) {
2069 my %hash = $self->_consensus_counts($point);
2070 # max frequency of a non-gap letter
2071 my $max = (sort {$b<=>$a} values %hash )[0];
2072 push @cons, 100 * $max / $num_sequences;
2074 return @cons;
2077 sub _consensus_aa {
2078 my $self = shift;
2079 my $point = shift;
2080 my $threshold_percent = shift || -1 ;
2081 my ($seq,%hash,$count,$letter,$key);
2082 my $gapchar = $self->gap_char;
2083 %hash = $self->_consensus_counts($point);
2084 my $number_of_sequences = $self->num_sequences();
2085 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2086 $count = -1;
2087 $letter = '?';
2089 foreach $key ( sort keys %hash ) {
2090 # print "Now at $key $hash{$key}\n";
2091 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2092 $letter = $key;
2093 $count = $hash{$key};
2096 return $letter;
2099 # Frequency of each letter in one column
2100 sub _consensus_counts {
2101 my $self = shift;
2102 my $point = shift;
2103 my %hash;
2104 my $gapchar = $self->gap_char;
2105 foreach my $seq ( $self->each_seq() ) {
2106 my $letter = substr($seq->seq,$point,1);
2107 $self->throw("--$point-----------") if $letter eq '';
2108 ($letter eq $gapchar || $letter =~ /\./) && next;
2109 $hash{$letter}++;
2111 return %hash;
2115 =head2 consensus_iupac
2117 Title : consensus_iupac
2118 Usage : $str = $ali->consensus_iupac()
2119 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2120 and RNA. The output is in upper case except when gaps in
2121 a column force output to be in lower case.
2123 Note that if your alignment sequences contain a lot of
2124 IUPAC ambiquity codes you often have to manually set
2125 alphabet. Bio::PrimarySeq::_guess_type thinks they
2126 indicate a protein sequence.
2127 Returns : consensus string
2128 Argument : none
2129 Throws : on protein sequences
2131 =cut
2133 sub consensus_iupac {
2134 my $self = shift;
2135 my $out = "";
2136 my $len = $self->length - 1;
2138 # only DNA and RNA sequences are valid
2139 foreach my $seq ( $self->each_seq() ) {
2140 $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2141 if $seq->alphabet eq 'protein';
2144 # loop over the alignment columns
2145 foreach my $count ( 0 .. $len ) {
2146 $out .= $self->_consensus_iupac($count);
2148 return $out;
2151 sub _consensus_iupac {
2152 my ($self, $column) = @_;
2153 my ($string, $char, $rna);
2155 #determine all residues in a column
2156 foreach my $seq ( $self->each_seq() ) {
2157 $string .= substr($seq->seq, $column, 1);
2159 $string = uc $string;
2161 # quick exit if there's an N in the string
2162 if ($string =~ /N/) {
2163 $string =~ /\W/ ? return 'n' : return 'N';
2165 # ... or if there are only gap characters
2166 return '-' if $string =~ /^\W+$/;
2168 # treat RNA as DNA in regexps
2169 if ($string =~ /U/) {
2170 $string =~ s/U/T/;
2171 $rna = 1;
2174 # the following s///'s only need to be done to the _first_ ambiguity code
2175 # as we only need to see the _range_ of characters in $string
2177 if ($string =~ /[VDHB]/) {
2178 $string =~ s/V/AGC/;
2179 $string =~ s/D/AGT/;
2180 $string =~ s/H/ACT/;
2181 $string =~ s/B/CTG/;
2184 if ($string =~ /[SKYRWM]/) {
2185 $string =~ s/S/GC/;
2186 $string =~ s/K/GT/;
2187 $string =~ s/Y/CT/;
2188 $string =~ s/R/AG/;
2189 $string =~ s/W/AT/;
2190 $string =~ s/M/AC/;
2193 # and now the guts of the thing
2195 if ($string =~ /A/) {
2196 $char = 'A'; # A A
2197 if ($string =~ /G/) {
2198 $char = 'R'; # A and G (purines) R
2199 if ($string =~ /C/) {
2200 $char = 'V'; # A and G and C V
2201 if ($string =~ /T/) {
2202 $char = 'N'; # A and G and C and T N
2204 } elsif ($string =~ /T/) {
2205 $char = 'D'; # A and G and T D
2207 } elsif ($string =~ /C/) {
2208 $char = 'M'; # A and C M
2209 if ($string =~ /T/) {
2210 $char = 'H'; # A and C and T H
2212 } elsif ($string =~ /T/) {
2213 $char = 'W'; # A and T W
2215 } elsif ($string =~ /C/) {
2216 $char = 'C'; # C C
2217 if ($string =~ /T/) {
2218 $char = 'Y'; # C and T (pyrimidines) Y
2219 if ($string =~ /G/) {
2220 $char = 'B'; # C and T and G B
2222 } elsif ($string =~ /G/) {
2223 $char = 'S'; # C and G S
2225 } elsif ($string =~ /G/) {
2226 $char = 'G'; # G G
2227 if ($string =~ /C/) {
2228 $char = 'S'; # G and C S
2229 } elsif ($string =~ /T/) {
2230 $char = 'K'; # G and T K
2232 } elsif ($string =~ /T/) {
2233 $char = 'T'; # T T
2236 $char = 'U' if $rna and $char eq 'T';
2237 $char = lc $char if $string =~ /\W/;
2239 return $char;
2243 =head2 consensus_meta
2245 Title : consensus_meta
2246 Usage : $seqmeta = $ali->consensus_meta()
2247 Function : Returns a Bio::Seq::Meta object containing the consensus
2248 strings derived from meta data analysis.
2249 Returns : Bio::Seq::Meta
2250 Argument : Bio::Seq::Meta
2251 Throws : non-MetaI object
2253 =cut
2255 sub consensus_meta {
2256 my ($self, $meta) = @_;
2257 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2258 $self->throw('Not a Bio::Seq::MetaI object');
2260 return $self->{'_aln_meta'} = $meta if $meta;
2261 return $self->{'_aln_meta'}
2264 =head2 is_flush
2266 Title : is_flush
2267 Usage : if ( $ali->is_flush() )
2268 Function : Tells you whether the alignment
2269 : is flush, i.e. all of the same length
2270 Returns : 1 or 0
2271 Argument :
2273 =cut
2275 sub is_flush {
2276 my ( $self, $report ) = @_;
2277 my $seq;
2278 my $length = (-1);
2279 my $temp;
2281 foreach $seq ( $self->each_seq() ) {
2282 if ( $length == (-1) ) {
2283 $length = CORE::length( $seq->seq() );
2284 next;
2287 $temp = CORE::length( $seq->seq() );
2288 if ( $temp != $length ) {
2289 $self->warn(
2290 "expecting $length not $temp from " . $seq->display_id )
2291 if ($report);
2292 $self->debug(
2293 "expecting $length not $temp from " . $seq->display_id );
2294 $self->debug( $seq->seq() . "\n" );
2295 return 0;
2299 return 1;
2303 =head2 length
2305 Title : length()
2306 Usage : $len = $ali->length()
2307 Function : Returns the maximum length of the alignment.
2308 To be sure the alignment is a block, use is_flush
2309 Returns : Integer
2310 Argument :
2312 =cut
2314 sub length_aln {
2315 my $self = shift;
2316 Carp::carp("length_aln - deprecated method. Use length() instead.");
2317 $self->length(@_);
2320 sub length {
2321 my $self = shift;
2322 my $seq;
2323 my $length = -1;
2324 my $temp;
2326 foreach $seq ( $self->each_seq() ) {
2327 $temp = $seq->length();
2328 if( $temp > $length ) {
2329 $length = $temp;
2333 return $length;
2337 =head2 maxdisplayname_length
2339 Title : maxdisplayname_length
2340 Usage : $ali->maxdisplayname_length()
2341 Function : Gets the maximum length of the displayname in the
2342 alignment. Used in writing out various MSA formats.
2343 Returns : integer
2344 Argument :
2346 =cut
2348 sub maxname_length {
2349 my $self = shift;
2350 Carp::carp("maxname_length - deprecated method."
2351 . " Use maxdisplayname_length() instead.");
2352 $self->maxdisplayname_length();
2355 sub maxnse_length {
2356 my $self = shift;
2357 Carp::carp("maxnse_length - deprecated method."
2358 . " Use maxnse_length() instead.");
2359 $self->maxdisplayname_length();
2362 sub maxdisplayname_length {
2363 my $self = shift;
2364 my $maxname = (-1);
2365 my ( $seq, $len );
2367 foreach $seq ( $self->each_seq() ) {
2368 $len = CORE::length $self->displayname( $seq->get_nse() );
2370 if ( $len > $maxname ) {
2371 $maxname = $len;
2375 return $maxname;
2378 =head2 max_metaname_length
2380 Title : max_metaname_length
2381 Usage : $ali->max_metaname_length()
2382 Function : Gets the maximum length of the meta name tags in the
2383 alignment for the sequences and for the alignment.
2384 Used in writing out various MSA formats.
2385 Returns : integer
2386 Argument : None
2388 =cut
2390 sub max_metaname_length {
2391 my $self = shift;
2392 my $maxname = (-1);
2393 my ($seq,$len);
2395 # check seq meta first
2396 for $seq ( $self->each_seq() ) {
2397 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2398 for my $mtag ($seq->meta_names) {
2399 $len = CORE::length $mtag;
2400 if( $len > $maxname ) {
2401 $maxname = $len;
2406 # alignment meta
2407 for my $meta ($self->consensus_meta) {
2408 next unless $meta;
2409 for my $name ($meta->meta_names) {
2410 $len = CORE::length $name;
2411 if( $len > $maxname ) {
2412 $maxname = $len;
2417 return $maxname;
2420 =head2 num_residues
2422 Title : num_residues
2423 Usage : $no = $ali->num_residues
2424 Function : number of residues in total in the alignment
2425 Returns : integer
2426 Argument :
2427 Note : replaces no_residues()
2429 =cut
2431 sub num_residues {
2432 my $self = shift;
2433 my $count = 0;
2435 foreach my $seq ( $self->each_seq ) {
2436 my $str = $seq->seq();
2438 $count += ( $str =~ s/[A-Za-z]//g );
2441 return $count;
2444 =head2 num_sequences
2446 Title : num_sequences
2447 Usage : $depth = $ali->num_sequences
2448 Function : number of sequence in the sequence alignment
2449 Returns : integer
2450 Argument : none
2451 Note : replaces no_sequences()
2453 =cut
2455 sub num_sequences {
2456 my $self = shift;
2457 return scalar($self->each_seq);
2460 =head2 average_percentage_identity
2462 Title : average_percentage_identity
2463 Usage : $id = $align->average_percentage_identity
2464 Function: The function uses a fast method to calculate the average
2465 percentage identity of the alignment
2466 Returns : The average percentage identity of the alignment
2467 Args : None
2468 Notes : This method implemented by Kevin Howe calculates a figure that is
2469 designed to be similar to the average pairwise identity of the
2470 alignment (identical in the absence of gaps), without having to
2471 explicitly calculate pairwise identities proposed by Richard Durbin.
2472 Validated by Ewan Birney ad Alex Bateman.
2474 =cut
2476 sub average_percentage_identity{
2477 my ($self,@args) = @_;
2479 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2480 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2482 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2484 if (! $self->is_flush()) {
2485 $self->throw("All sequences in the alignment must be the same length");
2488 @seqs = $self->each_seq();
2489 $len = $self->length();
2491 # load the each hash with correct keys for existence checks
2493 for( my $index=0; $index < $len; $index++) {
2494 foreach my $letter (@alphabet) {
2495 $countHashes[$index]->{$letter} = 0;
2498 foreach my $seq (@seqs) {
2499 my @seqChars = split //, $seq->seq();
2500 for( my $column=0; $column < @seqChars; $column++ ) {
2501 my $char = uc($seqChars[$column]);
2502 if (exists $countHashes[$column]->{$char}) {
2503 $countHashes[$column]->{$char}++;
2508 $total = 0;
2509 $divisor = 0;
2510 for(my $column =0; $column < $len; $column++) {
2511 my %hash = %{$countHashes[$column]};
2512 $subdivisor = 0;
2513 foreach my $res (keys %hash) {
2514 $total += $hash{$res}*($hash{$res} - 1);
2515 $subdivisor += $hash{$res};
2517 $divisor += $subdivisor * ($subdivisor - 1);
2519 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2522 =head2 percentage_identity
2524 Title : percentage_identity
2525 Usage : $id = $align->percentage_identity
2526 Function: The function calculates the average percentage identity
2527 (aliased to average_percentage_identity)
2528 Returns : The average percentage identity
2529 Args : None
2531 =cut
2533 sub percentage_identity {
2534 my $self = shift;
2535 return $self->average_percentage_identity();
2538 =head2 overall_percentage_identity
2540 Title : overall_percentage_identity
2541 Usage : $id = $align->overall_percentage_identity
2542 $id = $align->overall_percentage_identity('short')
2543 Function: The function calculates the percentage identity of
2544 the conserved columns
2545 Returns : The percentage identity of the conserved columns
2546 Args : length value to use, optional defaults to alignment length
2547 possible values: 'align', 'short', 'long'
2549 The argument values 'short' and 'long' refer to shortest and longest
2550 sequence in the alignment. Method modification code by Hongyu Zhang.
2552 =cut
2554 sub overall_percentage_identity{
2555 my ($self, $length_measure) = @_;
2557 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2559 my %enum = map {$_ => undef} qw (align short long);
2561 $self->throw("Unknown argument [$length_measure]")
2562 if $length_measure and not exists $enum{$length_measure};
2563 $length_measure ||= 'align';
2565 if (! $self->is_flush()) {
2566 $self->throw("All sequences in the alignment must be the same length");
2569 # Count the residues seen at each position
2570 my $len;
2571 my $total = 0; # number of positions with identical residues
2572 my @countHashes;
2573 my @seqs = $self->each_seq;
2574 my $nof_seqs = scalar @seqs;
2575 my $aln_len = $self->length();
2576 for my $seq (@seqs) {
2577 my $seqstr = $seq->seq;
2579 # Count residues for given sequence
2580 for my $column (0 .. $aln_len-1) {
2581 my $char = uc( substr($seqstr, $column, 1) );
2582 if ( exists $alphabet{$char} ) {
2584 # This is a valid char
2585 if ( defined $countHashes[$column]->{$char} ) {
2586 $countHashes[$column]->{$char}++;
2587 } else {
2588 $countHashes[$column]->{$char} = 1;
2591 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2592 # All sequences have this same residue
2593 $total++;
2599 # Sequence length
2600 if ($length_measure eq 'short' || $length_measure eq 'long') {
2601 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2602 if ($length_measure eq 'short') {
2603 if ( (not defined $len) || ($seq_len < $len) ) {
2604 $len = $seq_len;
2606 } elsif ($length_measure eq 'long') {
2607 if ( (not defined $len) || ($seq_len > $len) ) {
2608 $len = $seq_len;
2615 if ($length_measure eq 'align') {
2616 $len = $aln_len;
2619 return ($total / $len ) * 100.0;
2624 =head1 Alignment positions
2626 Methods to map a sequence position into an alignment column and back.
2627 column_from_residue_number() does the former. The latter is really a
2628 property of the sequence object and can done using
2629 L<Bio::LocatableSeq::location_from_column>:
2631 # select somehow a sequence from the alignment, e.g.
2632 my $seq = $aln->get_seq_by_pos(1);
2633 #$loc is undef or Bio::LocationI object
2634 my $loc = $seq->location_from_column(5);
2636 =head2 column_from_residue_number
2638 Title : column_from_residue_number
2639 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2640 Function: This function gives the position in the alignment
2641 (i.e. column number) of the given residue number in the
2642 sequence with the given name. For example, for the
2643 alignment
2645 Seq1/91-97 AC..DEF.GH.
2646 Seq2/24-30 ACGG.RTY...
2647 Seq3/43-51 AC.DDEF.GHI
2649 column_from_residue_number( "Seq1", 94 ) returns 6.
2650 column_from_residue_number( "Seq2", 25 ) returns 2.
2651 column_from_residue_number( "Seq3", 50 ) returns 10.
2653 An exception is thrown if the residue number would lie
2654 outside the length of the aligment
2655 (e.g. column_from_residue_number( "Seq2", 22 )
2657 Note: If the the parent sequence is represented by more than
2658 one alignment sequence and the residue number is present in
2659 them, this method finds only the first one.
2661 Returns : A column number for the position in the alignment of the
2662 given residue in the given sequence (1 = first column)
2663 Args : A sequence id/name (not a name/start-end)
2664 A residue number in the whole sequence (not just that
2665 segment of it in the alignment)
2667 =cut
2669 sub column_from_residue_number {
2670 my ( $self, $name, $resnumber ) = @_;
2672 $self->throw("No sequence with name [$name]")
2673 unless $self->{'_start_end_lists'}->{$name};
2674 $self->throw("Second argument residue number missing") unless $resnumber;
2676 foreach my $seq ( $self->each_seq_with_id($name) ) {
2677 my $col;
2678 eval { $col = $seq->column_from_residue_number($resnumber); };
2679 next if $@;
2680 return $col;
2683 $self->throw( "Could not find a sequence segment in $name "
2684 . "containing residue number $resnumber" );
2688 =head1 Sequence names
2690 Methods to manipulate the display name. The default name based on the
2691 sequence id and subsequence positions can be overridden in various
2692 ways.
2694 =head2 displayname
2696 Title : displayname
2697 Usage : $myalign->displayname("Ig", "IgA")
2698 Function : Gets/sets the display name of a sequence in the alignment
2699 Returns : A display name string
2700 Argument : name of the sequence
2701 displayname of the sequence (optional)
2703 =cut
2705 sub displayname {
2706 my ( $self, $name, $disname ) = @_;
2708 $self->throw("No sequence with name [$name]")
2709 unless defined $self->{'_seq'}->{$name};
2711 if ( $disname and $name ) {
2712 $self->{'_dis_name'}->{$name} = $disname;
2713 return $disname;
2715 elsif ( defined $self->{'_dis_name'}->{$name} ) {
2716 return $self->{'_dis_name'}->{$name};
2718 else {
2719 return $name;
2723 sub get_displayname {
2724 my $self = shift;
2725 Carp::carp("get_displayname - deprecated method. Use displayname() instead.");
2726 $self->displayname(@_);
2729 sub set_displayname {
2730 my $self = shift;
2731 Carp::carp("set_displayname - deprecated method. Use displayname() instead.");
2732 $self->displayname(@_);
2736 =head2 set_displayname_count
2738 Title : set_displayname_count
2739 Usage : $ali->set_displayname_count
2740 Function : Sets the names to be name_# where # is the number of
2741 times this name has been used.
2742 Returns : 1, on success
2743 Argument :
2745 =cut
2747 sub set_displayname_count {
2748 my $self= shift;
2749 my (@arr,$name,$seq,$count,$temp,$nse);
2751 foreach $seq ( $self->each_alphabetically() ) {
2752 $nse = $seq->get_nse();
2754 #name will be set when this is the second
2755 #time (or greater) is has been seen
2757 if( defined $name and $name eq ($seq->id()) ) {
2758 $temp = sprintf("%s_%s",$name,$count);
2759 $self->displayname($nse,$temp);
2760 $count++;
2761 } else {
2762 $count = 1;
2763 $name = $seq->id();
2764 $temp = sprintf("%s_%s",$name,$count);
2765 $self->displayname($nse,$temp);
2766 $count++;
2769 return 1;
2772 =head2 set_displayname_flat
2774 Title : set_displayname_flat
2775 Usage : $ali->set_displayname_flat()
2776 Function : Makes all the sequences be displayed as just their name,
2777 not name/start-end (NSE)
2778 Returns : 1
2779 Argument :
2781 =cut
2783 sub set_displayname_flat {
2784 my $self = shift;
2785 my ( $nse, $seq );
2787 foreach $seq ( $self->each_seq() ) {
2788 $nse = $seq->get_nse();
2789 $self->displayname( $nse, $seq->id() );
2791 return 1;
2795 =head2 set_displayname_normal
2797 Title : set_displayname_normal
2798 Usage : $ali->set_displayname_normal()
2799 Function : Makes all the sequences be displayed as name/start-end (NSE)
2800 Returns : 1, on success
2801 Argument :
2803 =cut
2805 sub set_displayname_normal {
2806 my $self = shift;
2807 my ( $nse, $seq );
2809 foreach $seq ( $self->each_seq() ) {
2810 $nse = $seq->get_nse();
2811 $self->displayname( $nse, $nse );
2813 return 1;
2816 =head2 source
2818 Title : source
2819 Usage : $obj->source($newval)
2820 Function: sets the Alignment source program
2821 Example :
2822 Returns : value of source
2823 Args : newvalue (optional)
2825 =cut
2827 sub source {
2828 my ( $self, $value ) = @_;
2829 if ( defined $value ) {
2830 $self->{'_source'} = $value;
2832 return $self->{'_source'};
2836 =head2 set_displayname_safe
2838 Title : set_displayname_safe
2839 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2840 Function : Assign machine-generated serial names to sequences in input order.
2841 Designed to protect names during PHYLIP runs. Assign 10-char string
2842 in the form of "S000000001" to "S999999999". Restore the original
2843 names using "restore_displayname".
2844 Returns : 1. a new $aln with system names;
2845 2. a hash ref for restoring names
2846 Argument : Number for id length (default 10)
2848 =cut
2850 sub set_displayname_safe {
2851 my $self = shift;
2852 my $idlength = shift || 10;
2853 my ( $seq, %phylip_name );
2854 my $ct = 0;
2855 my $new = Bio::SimpleAlign->new();
2856 foreach $seq ( $self->each_seq() ) {
2857 $ct++;
2858 my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2859 $phylip_name{$pname} = $seq->id();
2860 my $new_seq = Bio::LocatableSeq->new(
2861 -id => $pname,
2862 -seq => $seq->seq(),
2863 -alphabet => $seq->alphabet,
2864 -start => $seq->{_start},
2865 -end => $seq->{_end}
2867 $new->add_seq($new_seq);
2870 $self->debug(
2871 "$ct seq names changed. Restore names by using restore_displayname.");
2872 return ( $new, \%phylip_name );
2876 =head2 restore_displayname
2878 Title : restore_displayname
2879 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2880 Function : Restore original sequence names (after running
2881 $ali->set_displayname_safe)
2882 Returns : a new $aln with names restored.
2883 Argument : a hash reference of names from "set_displayname_safe".
2885 =cut
2887 sub restore_displayname {
2888 my $self = shift;
2889 my $ref=shift;
2890 my %name=%$ref;
2891 my $new=Bio::SimpleAlign->new();
2892 foreach my $seq ( $self->each_seq() ) {
2893 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2894 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2895 -seq => $seq->seq(),
2896 -alphabet => $seq->alphabet,
2897 -start => $seq->{_start},
2898 -end => $seq->{_end}
2900 $new->add_seq($new_seq);
2902 return $new;
2905 =head2 sort_by_start
2907 Title : sort_by_start
2908 Usage : $ali->sort_by_start
2909 Function : Changes the order of the alignment to the start position of each
2910 subalignment
2911 Returns : 1 on success
2912 Argument :
2914 =cut
2916 sub sort_by_start {
2917 my $self = shift;
2918 my ($seq,$nse,@arr,%hash,$count);
2919 foreach $seq ( $self->each_seq() ) {
2920 $nse = $seq->get_nse;
2921 $hash{$nse} = $seq;
2923 $count = 0;
2924 %{$self->{'_order'}} = (); # reset the hash;
2925 foreach $nse ( sort _startend keys %hash) {
2926 $self->{'_order'}->{$count} = $nse;
2927 $count++;
2932 sub _startend {
2933 my ($aname,$arange) = split (/[\/]/,$a);
2934 my ($bname,$brange) = split (/[\/]/,$b);
2935 my ($astart,$aend) = split(/\-/,$arange);
2936 my ($bstart,$bend) = split(/\-/,$brange);
2937 return $astart <=> $bstart;
2940 =head2 bracket_string
2942 Title : bracket_string
2943 Usage : my @params = (-refseq => 'testseq',
2944 -allele1 => 'allele1',
2945 -allele2 => 'allele2',
2946 -delimiters => '{}',
2947 -separator => '/');
2948 $str = $aln->bracket_string(@params)
2950 Function : When supplied with a list of parameters (see below), returns a
2951 string in BIC format. This is used for allelic comparisons.
2952 Briefly, if either allele contains a base change when compared to
2953 the refseq, the base or gap for each allele is represented in
2954 brackets in the order present in the 'alleles' parameter.
2956 For the following data:
2958 >testseq
2959 GGATCCATTGCTACT
2960 >allele1
2961 GGATCCATTCCTACT
2962 >allele2
2963 GGAT--ATTCCTCCT
2965 the returned string with parameters 'refseq => testseq' and
2966 'alleles => [qw(allele1 allele2)]' would be:
2968 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2969 Returns : BIC-formatted string
2970 Argument : Required args
2971 refseq : string (ID) of the reference sequence used
2972 as basis for comparison
2973 allele1 : string (ID) of the first allele
2974 allele2 : string (ID) of the second allele
2975 Optional args
2976 delimiters: two symbol string of left and right delimiters.
2977 Only the first two symbols are used
2978 default = '[]'
2979 separator : string used as a separator. Only the first
2980 symbol is used
2981 default = '/'
2982 Throws : On no refseq/alleles, or invalid refseq/alleles.
2984 =cut
2986 sub bracket_string {
2987 my ($self, @args) = @_;
2988 my ($ref, $a1, $a2, $delim, $sep) =
2989 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2990 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2991 my ($ld, $rd);
2992 ($ld, $rd) = split('', $delim, 2) if $delim;
2993 $ld ||= '[';
2994 $rd ||= ']';
2995 $sep ||= '/';
2996 my ($refseq, $allele1, $allele2) =
2997 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2998 if (!$refseq || !$allele1 || !$allele2) {
2999 $self->throw("One of your refseq/allele IDs is invalid!");
3001 my $len = $self->length-1;
3002 my $bic = '';
3003 # loop over the alignment columns
3004 for my $column ( 0 .. $len ) {
3005 my $string;
3006 my ($compres, $res1, $res2) =
3007 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3008 # are any of the allele symbols different from the refseq?
3009 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3010 $ld.$res1.$sep.$res2.$rd;
3011 $bic .= $string;
3013 return $bic;
3017 =head2 methods implementing Bio::FeatureHolderI
3019 FeatureHolderI implementation to support labeled character sets like one
3020 would get from NEXUS represented data.
3022 =head2 get_SeqFeatures
3024 Usage : @features = $aln->get_SeqFeatures
3025 Function: Get the feature objects held by this feature holder.
3026 Example :
3027 Returns : an array of Bio::SeqFeatureI implementing objects
3028 Args : optional filter coderef, taking a Bio::SeqFeatureI
3029 : as argument, returning TRUE if wanted, FALSE if
3030 : unwanted
3032 =cut
3034 sub get_SeqFeatures {
3035 my $self = shift;
3036 my $filter_cb = shift;
3037 $self->throw("Arg (filter callback) must be a coderef")
3038 unless !defined($filter_cb)
3039 or ref($filter_cb) eq 'CODE';
3040 if ( !defined $self->{'_as_feat'} ) {
3041 $self->{'_as_feat'} = [];
3043 if ($filter_cb) {
3044 return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
3046 return @{ $self->{'_as_feat'} };
3050 =head2 add_SeqFeature
3052 Usage : $aln->add_SeqFeature($subfeat);
3053 Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3054 (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3055 Example :
3056 Returns : 1 on success
3057 Args : a Bio::SeqFeatureI object
3059 =cut
3061 sub add_SeqFeature {
3062 my ($self, @feat) = @_;
3064 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3066 if (scalar @feat > 1) {
3067 Carp::carp('Providing an array of features to Bio::SimpleAlign'
3068 . 'add_SeqFeature() is deprecated and will be removed in a'
3069 . ' future version. Add a single feature at a time instead.');
3072 for my $feat ( @feat ) {
3074 next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3076 if( !$feat->isa("Bio::SeqFeatureI") ) {
3077 $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3080 push @{$self->{'_as_feat'}}, $feat;
3082 return 1;
3086 =head2 remove_SeqFeatures
3088 Usage : $obj->remove_SeqFeatures
3089 Function: Removes all SeqFeatures. If you want to remove only a subset,
3090 remove that subset from the returned array, and add back the rest.
3091 Returns : The array of Bio::SeqFeatureI features that was
3092 deleted from this alignment.
3093 Args : none
3095 =cut
3097 sub remove_SeqFeatures {
3098 my $self = shift;
3100 return () unless $self->{'_as_feat'};
3101 my @feats = @{$self->{'_as_feat'}};
3102 $self->{'_as_feat'} = [];
3103 return @feats;
3106 =head2 feature_count
3108 Title : feature_count
3109 Usage : $obj->feature_count()
3110 Function: Return the number of SeqFeatures attached to the alignment
3111 Returns : integer representing the number of SeqFeatures
3112 Args : None
3114 =cut
3116 sub feature_count {
3117 my ($self) = @_;
3119 if (defined($self->{'_as_feat'})) {
3120 return ($#{$self->{'_as_feat'}} + 1);
3121 } else {
3122 return 0;
3126 =head2 get_all_SeqFeatures
3128 Title : get_all_SeqFeatures
3129 Usage :
3130 Function: Get all SeqFeatures.
3131 Example :
3132 Returns : an array of Bio::SeqFeatureI implementing objects
3133 Args : none
3134 Note : Falls through to Bio::FeatureHolderI implementation.
3136 =cut
3138 =head2 methods for Bio::AnnotatableI
3140 AnnotatableI implementation to support sequence alignments which
3141 contain annotation (NEXUS, Stockholm).
3143 =head2 annotation
3145 Title : annotation
3146 Usage : $ann = $aln->annotation or
3147 $aln->annotation($ann)
3148 Function: Gets or sets the annotation
3149 Returns : Bio::AnnotationCollectionI object
3150 Args : None or Bio::AnnotationCollectionI object
3152 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3153 for more information
3155 =cut
3157 sub annotation {
3158 my ($obj,$value) = @_;
3159 if( defined $value ) {
3160 $obj->throw("object of class ".ref($value)." does not implement ".
3161 "Bio::AnnotationCollectionI. Too bad.")
3162 unless $value->isa("Bio::AnnotationCollectionI");
3163 $obj->{'_annotation'} = $value;
3164 } elsif( ! defined $obj->{'_annotation'}) {
3165 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3167 return $obj->{'_annotation'};
3170 =head1 Deprecated methods
3172 =head2 mask_columns
3174 Title : mask_columns
3175 Usage : $aln2 = $aln->mask_columns(20,30)
3176 Function : Masks a slice of the alignment inclusive of start and
3177 end columns, and the first column in the alignment is denoted 1.
3178 Mask beyond the length of the sequence does not do padding.
3179 Returns : A Bio::SimpleAlign object
3180 Args : Positive integer for start column, positive integer for end column,
3181 optional string value use for the mask. Example:
3183 $aln2 = $aln->mask_columns(20,30,'?')
3184 Note : Masking must use a character that is not used for gaps or
3185 frameshifts. These can be adjusted using the relevant global
3186 variables, but be aware these may be (uncontrollably) modified
3187 elsewhere within BioPerl (see bug 2715)
3189 =cut
3191 sub mask_columns {
3192 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3193 my $self = shift;
3195 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3196 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3198 # coordinates are alignment-based, not sequence-based
3199 my ($start, $end, $mask_char) = @_;
3200 unless (defined $mask_char) { $mask_char = 'N' }
3202 $self->throw("Mask start has to be a positive integer and less than ".
3203 "alignment length, not [$start]")
3204 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3205 $self->throw("Mask end has to be a positive integer and less than ".
3206 "alignment length, not [$end]")
3207 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3208 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3209 "end [$end]") unless $start <= $end;
3210 $self->throw("Mask character $mask_char has to be a single character ".
3211 "and not a gap or frameshift symbol")
3212 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3214 my $aln = $self->new;
3215 $aln->id($self->id);
3216 foreach my $seq ( $self->each_seq() ) {
3217 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3218 -alphabet => $seq->alphabet,
3219 -strand => $seq->strand,
3220 -verbose => $self->verbose);
3222 # convert from 1-based alignment coords!
3223 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3224 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3225 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3226 $new_seq->seq($new_dna_string);
3227 $aln->add_seq($new_seq);
3229 # Preserve chosen mask character, it may be need later (like in 'slice')
3230 $aln->{_mask_char} = $mask_char;
3231 return $aln;