lib/Bio/SimpleAlign.pm

   1 package Bio::SimpleAlign;
   2
   3 use strict;
   4 use warnings;
   5
   6 use Carp;
   7
   8 use Bio::LocatableSeq;  # uses Seq's as list
   9 use Bio::Seq;
  10 use Bio::SeqFeature::Generic;
  11
  12 use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
  13
  14 # BioPerl module for SimpleAlign
  15 #
  16 # Please direct questions and support issues to <bioperl-l@bioperl.org>
  17 #
  18 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
  19 #
  20 # Copyright Ewan Birney
  21 #
  22 # You may distribute this module under the same terms as perl itself
  23
  24 # POD documentation - main docs before the code
  25 #
  26 #  History:
  27 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  28 #       May 2001 major rewrite - Heikki Lehvaslaiho
  29
  30 =head1 NAME
  31
  32 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  33
  34 =head1 SYNOPSIS
  35
  36   # Use Bio::AlignIO to read in the alignment
  37   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  38   $aln = $str->next_aln();
  39
  40   # Describe
  41   print $aln->length;
  42   print $aln->num_residues;
  43   print $aln->is_flush;
  44   print $aln->num_sequences;
  45   print $aln->score;
  46   print $aln->percentage_identity;
  47   print $aln->consensus_string(50);
  48
  49   # Find the position in the alignment for a sequence location
  50   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  51
  52   # Extract sequences and check values for the alignment column $pos
  53   foreach $seq ($aln->each_seq) {
  54       $res = $seq->subseq($pos, $pos);
  55       $count{$res}++;
  56   }
  57   foreach $res (keys %count) {
  58       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  59   }
  60
  61   # Manipulate
  62   $aln->remove_seq($seq);
  63   $mini_aln = $aln->slice(20,30);  # get a block of columns
  64   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  65   $new_aln = $aln->remove_columns([20,30]); # remove by position
  66   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  67
  68   # Analyze
  69   $str = $aln->consensus_string($threshold_percent);
  70   $str = $aln->match_line();
  71   $str = $aln->cigar_line();
  72   $id = $aln->percentage_identity;
  73
  74   # See the module documentation for details and more methods.
  75
  76 =head1 DESCRIPTION
  77
  78 SimpleAlign is an object that handles a multiple sequence alignment
  79 (MSA). It is very permissive of types (it does not insist on sequences
  80 being all same length, for example). Think of it as a set of sequences
  81 with a whole series of built-in manipulations and methods for reading and
  82 writing alignments.
  83
  84 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  85 to store its sequences. These are subsequences with a start and end
  86 positions in the parent reference sequence. Each sequence in the
  87 SimpleAlign object is a Bio::LocatableSeq.
  88
  89 SimpleAlign expects the combination of name, start, and end for a
  90 given sequence to be unique in the alignment, and this is the key for the
  91 internal hashes (name, start, end are abbreviated C<nse> in the code).
  92 However, in some cases people do not want the name/start-end to be displayed:
  93 either multiple names in an alignment or names specific to the alignment
  94 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  95 C<displayname>, and generally is what is used to print out the
  96 alignment. They default to name/start-end.
  97
  98 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  99
 100 =head1 FEEDBACK
 101
 102 =head2 Mailing Lists
 103
 104 User feedback is an integral part of the evolution of this and other
 105 Bioperl modules.  Send your comments and suggestions preferably to one
 106 of the Bioperl mailing lists.  Your participation is much appreciated.
 107
 108   bioperl-l@bioperl.org                  - General discussion
 109   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 110
 111 =head2 Support
 112
 113 Please direct usage questions or support issues to the mailing list:
 114
 115 I<bioperl-l@bioperl.org>
 116
 117 rather than to the module maintainer directly. Many experienced and
 118 reponsive experts will be able look at the problem and quickly
 119 address it. Please include a thorough description of the problem
 120 with code and data examples if at all possible.
 121
 122 =head2 Reporting Bugs
 123
 124 Report bugs to the Bioperl bug tracking system to help us keep track
 125 the bugs and their resolution. Bug reports can be submitted via the
 126 web:
 127
 128   https://github.com/bioperl/bioperl-live/issues
 129
 130 =head1 AUTHOR
 131
 132 Ewan Birney, birney@ebi.ac.uk
 133
 134 =head1 CONTRIBUTORS
 135
 136 Allen Day, allenday-at-ucla.edu,
 137 Richard Adams, Richard.Adams-at-ed.ac.uk,
 138 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 139 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 140 Allen Smith, allens-at-cpan.org,
 141 Jason Stajich, jason-at-bioperl.org,
 142 Anthony Underwood, aunderwood-at-phls.org.uk,
 143 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 144 Brian Osborne, bosborne at alum.mit.edu
 145 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 146 Hongyu Zhang, forward at hongyu.org
 147 Jay Hannah, jay at jays.net
 148 Alexandr Bezginov, albezg at gmail.com
 149
 150 =head1 SEE ALSO
 151
 152 L<Bio::LocatableSeq>
 153
 154 =head1 APPENDIX
 155
 156 The rest of the documentation details each of the object
 157 methods. Internal methods are usually preceded with a _
 158
 159 =cut
 160
 161 ## This data should probably be in a more centralized module...
 162 ## it is taken from Clustalw documentation.
 163 ## These are all the positively scoring groups that occur in the
 164 ## Gonnet Pam250 matrix. The strong and weak groups are
 165 ## defined as strong score >0.5 and weak score =<0.5 respectively.
 166 our %CONSERVATION_GROUPS = (
 167   'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
 168   'weak'   => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
 169 );
 170
 171
 172 =head2 new
 173
 174  Title     : new
 175  Usage     : my $aln = Bio::SimpleAlign->new();
 176  Function  : Creates a new simple align object
 177  Returns   : Bio::SimpleAlign
 178  Args      : -source     => string representing the source program
 179                             where this alignment came from
 180              -annotation => Bio::AnnotationCollectionI
 181              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 182              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 183              -consensus  => consensus string
 184              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 185
 186 =cut
 187
 188
 189 sub new {
 190   my($class,@args) = @_;
 191
 192   my $self = $class->SUPER::new(@args);
 193
 194   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 195                                             SOURCE
 196                                             SCORE
 197                                             ID
 198                                             ACCESSION
 199                                             DESCRIPTION
 200                                             SEQS
 201                                             FEATURES
 202                                             ANNOTATION
 203                                             SEQ_ANNOTATION
 204                                             CONSENSUS
 205                                             CONSENSUS_META
 206                                             )], @args);
 207   $src && $self->source($src);
 208   defined $score && $self->score($score);
 209   # we need to set up internal hashs first!
 210
 211   $self->{'_seq'} = {};
 212   $self->{'_order'} = {};
 213   $self->{'_start_end_lists'} = {};
 214   $self->{'_dis_name'} = {};
 215   $self->{'_id'} = 'NoName';
 216   # maybe we should automatically read in from args. Hmmm...
 217   $id  && $self->id($id);
 218   $acc && $self->accession($acc);
 219   $desc && $self->description($desc);
 220   $coll && $self->annotation($coll);
 221   # sequence annotation is layered into a provided annotation collection (or dies)
 222   if ($sa) {
 223     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 224                  "with a sequence annotation collection")
 225         if !$coll;
 226     $coll->add_Annotation('seq_annotation', $sa);
 227   }
 228   if ($feats && ref $feats eq 'ARRAY') {
 229     for my $feat (@$feats) {
 230         $self->add_SeqFeature($feat);
 231     }
 232   }
 233   $con && $self->consensus($con);
 234   $cmeta && $self->consensus_meta($cmeta);
 235   # assumes these are in correct alignment order
 236   if ($seqs && ref($seqs) eq 'ARRAY') {
 237     for my $seq (@$seqs) {
 238         $self->add_seq($seq);
 239     }
 240   }
 241
 242   return $self; # success - we hope!
 243 }
 244
 245 =head1 Modifier methods
 246
 247 These methods modify the MSA by adding, removing or shuffling complete
 248 sequences.
 249
 250 =head2 add_seq
 251
 252  Title     : add_seq
 253  Usage     : $myalign->add_seq($newseq);
 254              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 255  Function  : Adds another sequence to the alignment. *Does not* align
 256              it - just adds it to the hashes.
 257              If -ORDER is specified, the sequence is inserted at the
 258              the position spec'd by -ORDER, and existing sequences
 259              are pushed down the storage array.
 260  Returns   : nothing
 261  Args      : A Bio::LocatableSeq object
 262              Positive integer for the sequence position (optional)
 263
 264 See L<Bio::LocatableSeq> for more information
 265
 266 =cut
 267
 268 sub addSeq {
 269     my $self = shift;
 270     Carp::carp("addSeq - deprecated method. Use add_seq() instead.");
 271     $self->add_seq(@_);
 272 }
 273
 274 sub add_seq {
 275     my $self = shift;
 276     my @args = @_;
 277     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 278     my ($name,$id,$start,$end);
 279
 280     unless ($seq) {
 281         $self->throw("LocatableSeq argument required");
 282     }
 283     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 284         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 285     }
 286     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 287     $order--; # jay's patch (user-specified order is 1-origin)
 288
 289     if ($order < 0) {
 290         $self->throw("User-specified value for ORDER must be >= 1");
 291     }
 292
 293     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 294
 295     # build the symbol list for this sequence,
 296     # will prune out the gap and missing/match chars
 297     # when actually asked for the symbol list in the
 298     # symbol_chars
 299     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 300
 301     $name = $seq->get_nse;
 302
 303     if( $self->{'_seq'}->{$name} ) {
 304         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 305     }
 306     else {
 307         $self->debug( "Assigning $name to $order\n");
 308
 309     my $ordh = $self->{'_order'};
 310     if ($ordh->{$order}) {
 311         # make space to insert
 312         # $c->() returns (in reverse order) the first subsequence
 313         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 314         # (3,2,1), and $c->(2,4,5) returns (2).
 315         my $c;
 316         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 317         map {
 318      $ordh->{$_+1} = $ordh->{$_}
 319         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 320
 321     }
 322     $ordh->{$order} = $name;
 323
 324         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 325             $self->{'_start_end_lists'}->{$id} = [];
 326         }
 327         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 328     }
 329
 330     $self->{'_seq'}->{$name} = $seq;
 331
 332 }
 333
 334
 335 =head2 remove_seq
 336
 337  Title     : remove_seq
 338  Usage     : $aln->remove_seq($seq);
 339  Function  : Removes a single sequence from an alignment
 340  Returns   :
 341  Argument  : a Bio::LocatableSeq object
 342
 343 =cut
 344
 345 sub removeSeq {
 346     my $self = shift;
 347     Carp::carp("removeSeq - deprecated method. Use remove_seq() instead.");
 348     $self->remove_seq(@_);
 349 }
 350
 351 sub remove_seq {
 352     my $self = shift;
 353     my $seq = shift;
 354     my ($name,$id);
 355
 356     $self->throw("Need Bio::Locatable seq argument ")
 357         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 358
 359     $id = $seq->id();
 360     $name = $seq->get_nse;
 361
 362     if( !exists $self->{'_seq'}->{$name} ) {
 363         $self->throw("Sequence $name does not exist in the alignment to remove!");
 364     }
 365
 366     delete $self->{'_seq'}->{$name};
 367
 368     # we need to remove this seq from the start_end_lists hash
 369
 370     if (exists $self->{'_start_end_lists'}->{$id}) {
 371         # we need to find the sequence in the array.
 372
 373         my ($i, $found);;
 374         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 375             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 376                 $found = 1;
 377                 last;
 378             }
 379         }
 380         if ($found) {
 381             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 382         }
 383         else {
 384             $self->throw("Could not find the sequence to remoce from the start-end list");
 385         }
 386     }
 387     else {
 388         $self->throw("There is no seq list for the name $id");
 389     }
 390     # we need to shift order hash
 391     my %rev_order = reverse %{$self->{'_order'}};
 392     my $no = $rev_order{$name};
 393     my $num_sequences = $self->num_sequences;
 394     for (; $no < $num_sequences; $no++) {
 395        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 396     }
 397     delete $self->{'_order'}->{$no};
 398     return 1;
 399 }
 400
 401
 402 =head2 purge
 403
 404  Title   : purge
 405  Usage   : $aln->purge(0.7);
 406  Function: Removes sequences above given sequence similarity
 407            This function will grind on large alignments. Beware!
 408  Example :
 409  Returns : An array of the removed sequences
 410  Args    : float, threshold for similarity
 411
 412 =cut
 413
 414 sub purge {
 415         my ($self,$perc) = @_;
 416         my (%duplicate, @dups);
 417
 418         my @seqs = $self->each_seq();
 419
 420         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 421                 my $seq = $seqs[$i];
 422
 423                 #skip if already in duplicate hash
 424                 next if exists $duplicate{$seq->display_id} ;
 425                 my $one = $seq->seq();
 426
 427                 my @one = split '', $one;       #split to get 1aa per array element
 428
 429                 for (my $j=$i+1;$j < @seqs;$j++) {
 430                         my $seq2 = $seqs[$j];
 431
 432                         #skip if already in duplicate hash
 433                         next if exists $duplicate{$seq2->display_id} ;
 434
 435                         my $two = $seq2->seq();
 436                         my @two = split '', $two;
 437
 438                         my $count = 0;
 439                         my $res = 0;
 440                         for (my $k=0;$k<@one;$k++) {
 441                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 442                                           $one[$k] eq $two[$k]) {
 443                                         $count++;
 444                                 }
 445                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 446                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 447                                         $res++;
 448                                 }
 449                         }
 450
 451                         my $ratio = 0;
 452                         $ratio = $count/$res unless $res == 0;
 453
 454                         # if above threshold put in duplicate hash and push onto
 455                         # duplicate array for returning to get_unique
 456                         if ( $ratio > $perc ) {
 457                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 458                                 $duplicate{$seq2->display_id} = 1;
 459                                 push @dups, $seq2;
 460                         }
 461                 }
 462         }
 463         foreach my $seq (@dups) {
 464                 $self->remove_seq($seq);
 465         }
 466         return @dups;
 467 }
 468
 469 =head2 sort_alphabetically
 470
 471  Title     : sort_alphabetically
 472  Usage     : $ali->sort_alphabetically
 473  Function  : Changes the order of the alignment to alphabetical on name
 474              followed by numerical by number.
 475  Returns   :
 476  Argument  :
 477
 478 =cut
 479
 480 sub sort_alphabetically {
 481     my $self = shift;
 482     my ($seq,$nse,@arr,%hash,$count);
 483
 484     foreach $seq ( $self->each_seq() ) {
 485         $nse = $seq->get_nse;
 486         $hash{$nse} = $seq;
 487     }
 488
 489     $count = 0;
 490
 491     %{$self->{'_order'}} = (); # reset the hash;
 492
 493     foreach $nse ( sort _alpha_startend keys %hash) {
 494         $self->{'_order'}->{$count} = $nse;
 495
 496         $count++;
 497     }
 498     1;
 499 }
 500
 501 =head2 sort_by_list
 502
 503  Title     : sort_by_list
 504  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 505  Function  : Arbitrarily order sequences in an alignment
 506  Returns   : A new Bio::SimpleAlign object
 507  Argument  : a file listing sequence names in intended order (one name per line)
 508
 509 =cut
 510
 511 sub sort_by_list {
 512     my ($self, $list) = @_;
 513     my (@seq, @ids, %order);
 514
 515     foreach my $seq ( $self->each_seq() ) {
 516         push @seq, $seq;
 517         push @ids, $seq->display_id;
 518     }
 519
 520     my $ct=1;
 521     open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
 522     while (<$listfh>) {
 523       chomp;
 524       my $name=$_;
 525       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 526       $order{$name}=$ct++;
 527     }
 528     close($listfh);
 529
 530     # use the map-sort-map idiom:
 531     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 532     my $aln = $self->new;
 533     foreach (@sorted) { $aln->add_seq($_) }
 534     return $aln;
 535 }
 536
 537 =head2 set_new_reference
 538
 539  Title     : set_new_reference
 540  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 541              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 542              as the reference (1st) sequence
 543  Function  : Change/Set a new reference (i.e., the first) sequence
 544  Returns   : a new Bio::SimpleAlign object.
 545              Throws an exception if designated sequence not found
 546  Argument  : a positive integer of sequence order, or a sequence name
 547              in the original alignment
 548
 549 =cut
 550
 551 sub set_new_reference {
 552     my ($self, $seqid) = @_;
 553     my $aln = $self->new;
 554     my (@seq, @ids, @new_seq);
 555     my $is_num=0;
 556     foreach my $seq ( $self->each_seq() ) {
 557         push @seq, $seq;
 558         push @ids, $seq->display_id;
 559     }
 560
 561     if ($seqid =~ /^\d+$/) { # argument is seq position
 562         $is_num=1;
 563         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 564     } else { # argument is a seq name
 565         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 566     }
 567
 568     for (my $i=0; $i<=$#seq; $i++) {
 569         my $pos=$i+1;
 570         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 571             unshift @new_seq, $seq[$i];
 572         } else {
 573             push @new_seq, $seq[$i];
 574         }
 575     }
 576     foreach (@new_seq) { $aln->add_seq($_);  }
 577     return $aln;
 578 }
 579
 580 sub _in_aln {  # check if input name exists in the alignment
 581     my ($str, $ref) = @_;
 582     foreach (@$ref) {
 583         return 1 if $str eq $_;
 584     }
 585     return 0;
 586 }
 587
 588
 589 =head2 uniq_seq
 590
 591  Title     : uniq_seq
 592  Usage     : $aln->uniq_seq():  Remove identical sequences in
 593              in the alignment.  Ambiguous base ("N", "n") and
 594              leading and ending gaps ("-") are NOT counted as
 595              differences.
 596  Function  : Make a new alignment of unique sequence types (STs)
 597  Returns   : 1a. if called in a scalar context,
 598                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 599              1b. if called in an array context,
 600                 a new Bio::SimpleAlign object, and a hashref whose keys
 601                 are sequence types, and whose values are arrayrefs to
 602                 lists of sequence ids within the corresponding sequence type
 603              2. if $aln->verbose > 0, ST of each sequence is sent to
 604                 STDERR (in a tabular format)
 605  Argument  : None
 606
 607 =cut
 608
 609 sub uniq_seq {
 610     my ($self, $seqid) = @_;
 611     my $aln = $self->new;
 612     my (%member, %order, @seq, @uniq_str, $st);
 613     my $order=0;
 614     my $len = $self->length();
 615     $st = {};
 616     foreach my $seq ( $self->each_seq() ) {
 617         my $str = $seq->seq();
 618
 619 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 620 # comparing two sequence strings
 621
 622     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 623         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 624     # 2nd, convert leading and ending gaps to "?":
 625         $str = &_convert_leading_ending_gaps($str, '-', '?');
 626     # Note that '?' also can mean unknown residue.
 627     # I don't like making global class member changes like this, too
 628     # prone to errors... -- cjfields 08-11-18
 629     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 630         my $new = Bio::LocatableSeq->new(
 631                      -id      => $seq->id(),
 632                                          -alphabet=> $seq->alphabet,
 633                                          -seq     => $str,
 634                                          -start   => $seq->start,
 635                                          -end     => $seq->end
 636                                          );
 637         push @seq, $new;
 638     }
 639
 640     foreach my $seq (@seq) {
 641         my $str = $seq->seq();
 642         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 643         if ($seen) { # seen before
 644             my @memb = @{$member{$key}};
 645             push @memb, $seq;
 646             $member{$key} = \@memb;
 647         } else {  # not seen
 648             push @uniq_str, $key;
 649             $order++;
 650             $member{$key} = [ ($seq) ];
 651             $order{$key} = $order;
 652         }
 653     }
 654
 655     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 656 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 657         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 658 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 659         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 660         my $gap='-';
 661         my $end= CORE::length($str2);
 662         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 663         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 664                                          -seq  =>$str2,
 665                                          -start=>1,
 666                                          -end  =>$end
 667                                          );
 668         $aln->add_seq($new);
 669         foreach (@{$member{$str}}) {
 670             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 671         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 672         }
 673     }
 674     return wantarray ? ($aln, $st) : $aln;
 675 }
 676
 677 sub _check_uniq {  # check if same seq exists in the alignment
 678     my ($str1, $ref, $length) = @_;
 679     my @char1=split //, $str1;
 680     my @array=@$ref;
 681
 682     return (0, $str1) if @array==0; # not seen (1st sequence)
 683
 684     foreach my $str2 (@array) {
 685         my $diff=0;
 686         my @char2=split //, $str2;
 687         for (my $i=0; $i<=$length-1; $i++) {
 688             next if $char1[$i] eq '?';
 689             next if $char2[$i] eq '?';
 690             $diff++ if $char1[$i] ne $char2[$i];
 691         }
 692         return (1, $str2) if $diff == 0;  # seen before
 693     }
 694
 695     return (0, $str1); # not seen
 696 }
 697
 698 sub _convert_leading_ending_gaps {
 699     my $s=shift;
 700     my $sym1=shift;
 701     my $sym2=shift;
 702     my @array=split //, $s;
 703 # convert leading char:
 704     for (my $i=0; $i<=$#array; $i++) {
 705         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 706     }
 707 # convert ending char:
 708     for (my $i = $#array; $i>= 0; $i--) {
 709         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 710     }
 711     my $s_new=join '', @array;
 712     return $s_new;
 713 }
 714
 715 =head1 Sequence selection methods
 716
 717 Methods returning one or more sequences objects.
 718
 719 =head2 each_seq
 720
 721  Title     : each_seq
 722  Usage     : foreach $seq ( $align->each_seq() )
 723  Function  : Gets a Seq object from the alignment
 724  Returns   : Seq object
 725  Argument  :
 726
 727 =cut
 728
 729 sub eachSeq {
 730     my $self = shift;
 731     Carp::carp("eachSeq - deprecated method. Use each_seq() instead.");
 732     $self->each_seq();
 733 }
 734
 735 sub each_seq {
 736         my $self = shift;
 737         my (@arr,$order);
 738
 739         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 740                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 741                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 742                 }
 743         }
 744         return @arr;
 745 }
 746
 747
 748 =head2 each_alphabetically
 749
 750  Title     : each_alphabetically
 751  Usage     : foreach $seq ( $ali->each_alphabetically() )
 752  Function  : Returns a sequence object, but the objects are returned
 753              in alphabetically sorted order.
 754              Does not change the order of the alignment.
 755  Returns   : Seq object
 756  Argument  :
 757
 758 =cut
 759
 760 sub each_alphabetically {
 761         my $self = shift;
 762         my ($seq,$nse,@arr,%hash,$count);
 763
 764         foreach $seq ( $self->each_seq() ) {
 765                 $nse = $seq->get_nse;
 766                 $hash{$nse} = $seq;
 767         }
 768
 769         foreach $nse ( sort _alpha_startend keys %hash) {
 770                 push(@arr,$hash{$nse});
 771         }
 772         return @arr;
 773 }
 774
 775 sub _alpha_startend {
 776     my ($aname,$astart,$bname,$bstart);
 777     ($aname,$astart) = split (/-/,$a);
 778     ($bname,$bstart) = split (/-/,$b);
 779
 780     if( $aname eq $bname ) {
 781         return $astart <=> $bstart;
 782     }
 783     else {
 784         return $aname cmp $bname;
 785     }
 786 }
 787
 788 =head2 each_seq_with_id
 789
 790  Title     : each_seq_with_id
 791  Usage     : foreach $seq ( $align->each_seq_with_id() )
 792  Function  : Gets a Seq objects from the alignment, the contents
 793              being those sequences with the given name (there may be
 794              more than one)
 795  Returns   : Seq object
 796  Argument  : a seq name
 797
 798 =cut
 799
 800 sub eachSeqWithId {
 801     my $self = shift;
 802     Carp::carp("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 803     $self->each_seq_with_id(@_);
 804 }
 805
 806 sub each_seq_with_id {
 807     my $self = shift;
 808     my $id = shift;
 809
 810     $self->throw("Method each_seq_with_id needs a sequence name argument")
 811         unless defined $id;
 812
 813     my (@arr, $seq);
 814
 815     if (exists($self->{'_start_end_lists'}->{$id})) {
 816         @arr = @{$self->{'_start_end_lists'}->{$id}};
 817     }
 818     return @arr;
 819 }
 820
 821 =head2 get_seq_by_pos
 822
 823  Title     : get_seq_by_pos
 824  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 825  Function  : Gets a sequence based on its position in the alignment.
 826              Numbering starts from 1.  Sequence positions larger than
 827              num_sequences() will throw an error.
 828  Returns   : a Bio::LocatableSeq object
 829  Args      : positive integer for the sequence position
 830
 831 =cut
 832
 833 sub get_seq_by_pos {
 834
 835     my $self = shift;
 836     my ($pos) = @_;
 837
 838     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 839         unless $pos =~ /^\d+$/ and $pos > 0;
 840     $self->throw("No sequence at position [$pos]")
 841         unless $pos <= $self->num_sequences ;
 842
 843     my $nse = $self->{'_order'}->{--$pos};
 844     return $self->{'_seq'}->{$nse};
 845 }
 846
 847 =head2 get_seq_by_id
 848
 849  Title     : get_seq_by_id
 850  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 851  Function  : Gets a sequence based on its name.
 852              Sequences that do not exist will warn and return undef
 853  Returns   : a Bio::LocatableSeq object
 854  Args      : string for sequence name
 855
 856 =cut
 857
 858 sub get_seq_by_id {
 859     my ($self,$name) = @_;
 860     unless( defined $name ) {
 861       $self->warn("Must provide a sequence name");
 862       return;
 863     }
 864     for my $seq ( values %{$self->{'_seq'}} ) {
 865       if ( $seq->id eq $name) {
 866         return $seq;
 867       }
 868     }
 869     return;
 870 }
 871
 872 =head2 seq_with_features
 873
 874  Title   : seq_with_features
 875  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 876                                           -consensus => 60
 877                                           -mask =>
 878            sub { my $consensus = shift;
 879
 880                  for my $i (1..5){
 881                     my $n = 'N' x $i;
 882                     my $q = '\?' x $i;
 883                     while($consensus =~ /[^?]$q[^?]/){
 884                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 885                     }
 886                   }
 887                  return $consensus;
 888                }
 889                                          );
 890  Function: produces a Bio::Seq object by first splicing gaps from -pos
 891            (by means of a splice_by_seq_pos() call), then creating
 892            features using non-? chars (by means of a consensus_string()
 893            call with stringency -consensus).
 894  Returns : a Bio::Seq object
 895  Args    : -pos : required. sequence from which to build the Bio::Seq
 896              object
 897            -consensus : optional, defaults to consensus_string()'s
 898              default cutoff value
 899            -mask : optional, a coderef to apply to consensus_string()'s
 900              output before building features.  this may be useful for
 901              closing gaps of 1 bp by masking over them with N, for
 902              instance
 903
 904 =cut
 905
 906 sub seq_with_features{
 907    my ($self,%arg) = @_;
 908
 909    #first do the preparatory splice
 910    $self->throw("must provide a -pos argument") unless $arg{-pos};
 911    $self->splice_by_seq_pos($arg{-pos});
 912
 913    my $consensus_string = $self->consensus_string($arg{-consensus});
 914    $consensus_string = $arg{-mask}->($consensus_string)
 915          if defined($arg{-mask});
 916
 917    my(@bs,@es);
 918
 919    push @bs, 1 if $consensus_string =~ /^[^?]/;
 920
 921    while($consensus_string =~ /\?[^?]/g){
 922          push @bs, pos($consensus_string);
 923    }
 924    while($consensus_string =~ /[^?]\?/g){
 925          push @es, pos($consensus_string);
 926    }
 927
 928    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 929
 930    my $seq = Bio::Seq->new();
 931
 932 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 933 #                -source_tag => 'location',
 934 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 935 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 936 #                                                  );
 937 #   $seq->add_SeqFeature($rootfeature);
 938
 939    while(my $b = shift @bs){
 940          my $e = shift @es;
 941          $seq->add_SeqFeature(
 942        Bio::SeqFeature::Generic->new(
 943          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 944          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 945          -source_tag => $self->source || 'MSA',
 946        )
 947      );
 948    }
 949
 950    return $seq;
 951 }
 952
 953
 954 =head1 Create new alignments
 955
 956 The result of these methods are horizontal or vertical subsets of the
 957 current MSA.
 958
 959 =head2 select
 960
 961  Title     : select
 962  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 963  Function  : Creates a new alignment from a continuous subset of
 964              sequences.  Numbering starts from 1.  Sequence positions
 965              larger than num_sequences() will throw an error.
 966  Returns   : a Bio::SimpleAlign object
 967  Args      : positive integer for the first sequence
 968              positive integer for the last sequence to include (optional)
 969
 970 =cut
 971
 972 sub select {
 973     my $self = shift;
 974     my ($start, $end) = @_;
 975
 976     $self->throw("Select start has to be a positive integer, not [$start]")
 977         unless $start =~ /^\d+$/ and $start > 0;
 978     $self->throw("Select end has to be a positive integer, not [$end]")
 979         unless $end  =~ /^\d+$/ and $end > 0;
 980     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
 981         unless $start <= $end;
 982
 983     my $aln = $self->new;
 984     foreach my $pos ($start .. $end) {
 985         $aln->add_seq($self->get_seq_by_pos($pos));
 986     }
 987     $aln->id($self->id);
 988     # fix for meta, sf, ann
 989     return $aln;
 990 }
 991
 992 =head2 select_noncont
 993
 994  Title     : select_noncont
 995  Usage     : # 1st and 3rd sequences, sorted
 996              $aln2 = $aln->select_noncont(1, 3)
 997
 998              # 1st and 3rd sequences, sorted (same as first)
 999              $aln2 = $aln->select_noncont(3, 1)
1000
1001              # 1st and 3rd sequences, unsorted
1002              $aln2 = $aln->select_noncont('nosort',3, 1)
1003
1004  Function  : Creates a new alignment from a subset of sequences.  Numbering
1005              starts from 1.  Sequence positions larger than num_sequences() will
1006              throw an error.  Sorts the order added to new alignment by default,
1007              to prevent sorting pass 'nosort' as the first argument in the list.
1008  Returns   : a Bio::SimpleAlign object
1009  Args      : array of integers for the sequences.  If the string 'nosort' is
1010              passed as the first argument, the sequences will not be sorted
1011              in the new alignment but will appear in the order listed.
1012
1013 =cut
1014
1015 sub select_noncont {
1016         my $self = shift;
1017     my $nosort = 0;
1018         my (@pos) = @_;
1019     if ($pos[0] !~ m{^\d+$}) {
1020         my $sortcmd = shift @pos;
1021         if ($sortcmd eq 'nosort') {
1022             $nosort = 1;
1023         } else {
1024             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1025         }
1026     }
1027
1028     my $end = $self->num_sequences;
1029     foreach ( @pos ) {
1030                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1031                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1032         }
1033
1034         @pos = sort {$a <=> $b} @pos unless $nosort;
1035
1036         my $aln = $self->new;
1037         foreach my $p (@pos) {
1038                 $aln->add_seq($self->get_seq_by_pos($p));
1039         }
1040         $aln->id($self->id);
1041     # fix for meta, sf, ann
1042         return $aln;
1043 }
1044
1045 =head2 select_noncont_by_name
1046
1047  Title     : select_noncont_by_name
1048  Usage     : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1049  Function  : Creates a new alignment from a subset of sequences which are
1050              selected by name (sequence ID).
1051  Returns   : a Bio::SimpleAlign object
1052  Args      : array of names (i.e., identifiers) for the sequences.
1053
1054 =cut
1055
1056 sub select_noncont_by_name {
1057     my ($self, @names) = @_;
1058
1059     my $aln = $self->new;
1060     foreach my $name (@names) {
1061         $aln->add_seq($self->get_seq_by_id($name));
1062     }
1063     $aln->id($self->id);
1064
1065     return $aln;
1066 }
1067
1068 =head2 slice
1069
1070  Title     : slice
1071  Usage     : $aln2 = $aln->slice(20,30)
1072  Function  : Creates a slice from the alignment inclusive of start and
1073              end columns, and the first column in the alignment is denoted 1.
1074              Sequences with no residues in the slice are excluded from the
1075              new alignment and a warning is printed. Slice beyond the length of
1076              the sequence does not do padding.
1077  Returns   : A Bio::SimpleAlign object
1078  Args      : Positive integer for start column, positive integer for end column,
1079              optional boolean which if true will keep gap-only columns in the newly
1080              created slice. Example:
1081
1082              $aln2 = $aln->slice(20,30,1)
1083
1084 =cut
1085
1086 sub slice {
1087         my $self = shift;
1088         my ($start, $end, $keep_gap_only) = @_;
1089
1090         $self->throw("Slice start has to be a positive integer, not [$start]")
1091           unless $start =~ /^\d+$/ and $start > 0;
1092         $self->throw("Slice end has to be a positive integer, not [$end]")
1093           unless $end =~ /^\d+$/ and $end > 0;
1094         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1095           unless $start <= $end;
1096         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1097                                          "[$start] is too big.") if $start > $self->length;
1098     my $cons_meta = $self->consensus_meta;
1099         my $aln = $self->new;
1100         $aln->id($self->id);
1101         foreach my $seq ( $self->each_seq() ) {
1102             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1103             Bio::Seq::Meta->new
1104         (-id      => $seq->id,
1105                  -alphabet => $seq->alphabet,
1106                  -strand  => $seq->strand,
1107                  -verbose => $self->verbose) :
1108             Bio::LocatableSeq->new
1109         (-id      => $seq->id,
1110                  -alphabet => $seq->alphabet,
1111                  -strand  => $seq->strand,
1112                  -verbose => $self->verbose);
1113
1114             # seq
1115             my $seq_end = $end;
1116             $seq_end = $seq->length if( $end > $seq->length );
1117
1118             my $slice_seq = $seq->subseq($start, $seq_end);
1119             $new_seq->seq( $slice_seq );
1120
1121         # Allowed extra characters in string
1122         my $allowed_chars = '';
1123         if (exists $self->{_mask_char}) {
1124             $allowed_chars = $self->{_mask_char};
1125             $allowed_chars = quotemeta $allowed_chars;
1126         }
1127         $slice_seq =~ s/[^\w$allowed_chars]//g;
1128
1129         if ($start > 1) {
1130             my $pre_start_seq = $seq->subseq(1, $start - 1);
1131             $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1132             if (!defined($seq->strand)) {
1133                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134             } elsif ($seq->strand < 0){
1135                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1136             } else {
1137                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1138             }
1139             } else {
1140             if ((defined $seq->strand)&&($seq->strand < 0)){
1141                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1142             } else {
1143                $new_seq->start( $seq->start);
1144             }
1145             }
1146         if ($new_seq->isa('Bio::Seq::MetaI')) {
1147             for my $meta_name ($seq->meta_names) {
1148                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1149             }
1150         }
1151             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1152
1153             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1154             $aln->add_seq($new_seq);
1155             } else {
1156             if( $keep_gap_only ) {
1157                 $aln->add_seq($new_seq);
1158             } else {
1159                 my $nse = $seq->get_nse();
1160                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1161                     " Sequence excluded from the new alignment.");
1162             }
1163             }
1164         }
1165     if ($cons_meta) {
1166         my $new = Bio::Seq::Meta->new();
1167         for my $meta_name ($cons_meta->meta_names) {
1168             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1169         }
1170         $aln->consensus_meta($new);
1171     }
1172     $aln->annotation($self->annotation);
1173     # fix for meta, sf, ann
1174         return $aln;
1175 }
1176
1177 =head2 remove_columns
1178
1179  Title     : remove_columns
1180  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1181              $aln2 = $aln->remove_columns([0,0],[6,8])
1182  Function  : Creates an aligment with columns removed corresponding to
1183              the specified type or by specifying the columns by number.
1184  Returns   : Bio::SimpleAlign object
1185  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1186              'all_gaps_columns') or array ref where the referenced array
1187              contains a pair of integers that specify a range.
1188              The first column is 0
1189
1190 =cut
1191
1192 sub remove_columns {
1193     my ($self,@args) = @_;
1194     @args || $self->throw("Must supply column ranges or column types");
1195     my $aln;
1196
1197     if ($args[0][0] =~ /^[a-z_]+$/i) {
1198         $aln = $self->_remove_columns_by_type($args[0]);
1199     } elsif ($args[0][0] =~ /^\d+$/) {
1200         $aln = $self->_remove_columns_by_num(\@args);
1201     } else {
1202         $self->throw("You must pass array references to remove_columns(), not @args");
1203     }
1204     # fix for meta, sf, ann
1205     $aln;
1206 }
1207
1208
1209 =head2 remove_gaps
1210
1211  Title     : remove_gaps
1212  Usage     : $aln2 = $aln->remove_gaps
1213  Function  : Creates an aligment with gaps removed
1214  Returns   : a Bio::SimpleAlign object
1215  Args      : a gap character(optional) if none specified taken
1216                 from $self->gap_char,
1217              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1218                         indicates that only all-gaps columns should be deleted
1219
1220 Used from method L<remove_columns> in most cases. Set gap character
1221 using L<gap_char()|gap_char>.
1222
1223 =cut
1224
1225 sub remove_gaps {
1226     my ($self,$gapchar,$all_gaps_columns) = @_;
1227     my $gap_line;
1228     if ($all_gaps_columns) {
1229         $gap_line = $self->all_gap_line($gapchar);
1230     } else {
1231         $gap_line = $self->gap_line($gapchar);
1232     }
1233     my $aln = $self->new;
1234
1235     my @remove;
1236     my $length = 0;
1237     my $del_char = $gapchar || $self->gap_char;
1238     # Do the matching to get the segments to remove
1239     while ($gap_line =~ m/[$del_char]/g) {
1240         my $start = pos($gap_line)-1;
1241         $gap_line =~ m/\G[$del_char]+/gc;
1242         my $end = pos($gap_line)-1;
1243
1244         #have to offset the start and end for subsequent removes
1245         $start-=$length;
1246         $end  -=$length;
1247         $length += ($end-$start+1);
1248         push @remove, [$start,$end];
1249     }
1250
1251     #remove the segments
1252     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1253     # fix for meta, sf, ann
1254     return $aln;
1255 }
1256
1257
1258 sub _remove_col {
1259     my ($self,$aln,$remove) = @_;
1260     my @new;
1261
1262     my $gap = $self->gap_char;
1263
1264     # splice out the segments and create new seq
1265     foreach my $seq($self->each_seq){
1266         my $new_seq = Bio::LocatableSeq->new(
1267                                              -id      => $seq->id,
1268                                              -alphabet=> $seq->alphabet,
1269                                              -strand  => $seq->strand,
1270                                              -verbose => $self->verbose);
1271         my $sequence = $seq->seq;
1272         foreach my $pair(@{$remove}){
1273             my $start = $pair->[0];
1274             my $end   = $pair->[1];
1275             $sequence = $seq->seq unless $sequence;
1276             my $orig = $sequence;
1277             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1278             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1279             $sequence = $head.$tail;
1280             # start
1281             unless (defined $new_seq->start) {
1282                 if ($start == 0) {
1283                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1284                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1285                 }
1286                 else {
1287                     my $start_adjust = $orig =~ /^$gap+/;
1288                     if ($start_adjust) {
1289                         $start_adjust = $+[0] == $start;
1290                     }
1291                     $new_seq->start($seq->start + $start_adjust);
1292                 }
1293             }
1294             # end
1295             if (($end + 1) >= CORE::length($orig)) {
1296                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1297                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1298             }
1299             else {
1300                 $new_seq->end($seq->end);
1301             }
1302         }
1303
1304         if ($new_seq->end < $new_seq->start) {
1305             # we removed all columns except for gaps: set to 0 to indicate no
1306             # sequence
1307             $new_seq->start(0);
1308             $new_seq->end(0);
1309         }
1310
1311         $new_seq->seq($sequence) if $sequence;
1312                 push @new, $new_seq;
1313     }
1314     # add the new seqs to the alignment
1315     foreach my $new(@new){
1316         $aln->add_seq($new);
1317     }
1318     # fix for meta, sf, ann
1319     return $aln;
1320 }
1321
1322 sub _remove_columns_by_type {
1323         my ($self,$type) = @_;
1324         my $aln = $self->new;
1325         my @remove;
1326
1327         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1328         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1329         my %matchchars = ( 'match'           => '\*',
1330                        'weak'             => '\.',
1331                        'strong'           => ':',
1332                        'mismatch'         => ' ',
1333                        'gaps'             => '',
1334                        'all_gaps_columns' => ''
1335                      );
1336         # get the characters to delete against
1337         my $del_char;
1338         foreach my $type (@{$type}){
1339                 $del_char.= $matchchars{$type};
1340         }
1341
1342         my $length = 0;
1343         my $match_line = $self->match_line;
1344         # do the matching to get the segments to remove
1345         if($del_char){
1346                 while($match_line =~ m/[$del_char]/g ){
1347                         my $start = pos($match_line)-1;
1348                         $match_line=~/\G[$del_char]+/gc;
1349                         my $end = pos($match_line)-1;
1350
1351                         #have to offset the start and end for subsequent removes
1352                         $start-=$length;
1353                         $end  -=$length;
1354                         $length += ($end-$start+1);
1355                         push @remove, [$start,$end];
1356                 }
1357         }
1358
1359         # remove the segments
1360         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1361         $aln = $aln->remove_gaps() if $gap;
1362         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1363     # fix for meta, sf, ann
1364         $aln;
1365 }
1366
1367
1368 sub _remove_columns_by_num {
1369         my ($self,$positions) = @_;
1370         my $aln = $self->new;
1371
1372         # sort the positions
1373         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1374
1375     my @remove;
1376     my $length = 0;
1377     foreach my $pos (@{$positions}) {
1378         my ($start, $end) = @{$pos};
1379
1380         #have to offset the start and end for subsequent removes
1381         $start-=$length;
1382         $end  -=$length;
1383         $length += ($end-$start+1);
1384         push @remove, [$start,$end];
1385     }
1386
1387     #remove the segments
1388     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1389     # fix for meta, sf, ann
1390         $aln;
1391 }
1392
1393
1394 =head1 Change sequences within the MSA
1395
1396 These methods affect characters in all sequences without changing the
1397 alignment.
1398
1399 =head2 splice_by_seq_pos
1400
1401  Title   : splice_by_seq_pos
1402  Usage   : $status = splice_by_seq_pos(1);
1403  Function: splices all aligned sequences where the specified sequence
1404            has gaps.
1405  Example :
1406  Returns : 1 on success
1407  Args    : position of sequence to splice by
1408
1409
1410 =cut
1411
1412 sub splice_by_seq_pos{
1413   my ($self,$pos) = @_;
1414
1415   my $guide = $self->get_seq_by_pos($pos);
1416   my $guide_seq = $guide->seq;
1417
1418   $guide_seq =~ s/\./\-/g;
1419
1420   my @gaps = ();
1421   $pos = -1;
1422   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1423     unshift @gaps, $pos;
1424     $pos++;
1425   }
1426
1427   foreach my $seq ($self->each_seq){
1428     my @bases = split '', $seq->seq;
1429
1430     splice(@bases, $_, 1) foreach @gaps;
1431     $seq->seq(join('', @bases));
1432   }
1433
1434   1;
1435 }
1436
1437 =head2 map_chars
1438
1439  Title     : map_chars
1440  Usage     : $ali->map_chars('\.','-')
1441  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1442              characters.
1443
1444              Note that the first argument is interpreted as a regexp
1445              so be careful and escape any wild card characters (e.g.
1446              do $ali->map_chars('\.','-') to replace periods with dashes.
1447  Returns   : 1 on success
1448  Argument  : A regexp and a string
1449
1450 =cut
1451
1452 sub map_chars {
1453     my $self = shift;
1454     my $from = shift;
1455     my $to   = shift;
1456     my ( $seq, $temp );
1457
1458     $self->throw("Need two arguments: a regexp and a string")
1459       unless defined $from and defined $to;
1460
1461     foreach $seq ( $self->each_seq() ) {
1462         $temp = $seq->seq();
1463         $temp =~ s/$from/$to/g;
1464         $seq->seq($temp);
1465     }
1466     return 1;
1467 }
1468
1469
1470 =head2 uppercase
1471
1472  Title     : uppercase()
1473  Usage     : $ali->uppercase()
1474  Function  : Sets all the sequences to uppercase
1475  Returns   : 1 on success
1476  Argument  :
1477
1478 =cut
1479
1480 sub uppercase {
1481     my $self = shift;
1482     my $seq;
1483     my $temp;
1484
1485     foreach $seq ( $self->each_seq() ) {
1486       $temp = $seq->seq();
1487       $temp =~ tr/[a-z]/[A-Z]/;
1488
1489       $seq->seq($temp);
1490     }
1491     return 1;
1492 }
1493
1494 =head2 cigar_line
1495
1496  Title    : cigar_line()
1497  Usage    : %cigars = $align->cigar_line()
1498  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1499             Report) line for each sequence in the alignment. Examples are
1500             "1,60" or "5,10:12,58", where the numbers refer to conserved
1501             positions within the alignment. The keys of the hash are the
1502             NSEs (name/start/end) assigned to each sequence.
1503  Args     : threshold (optional, defaults to 100)
1504  Returns  : Hash of strings (cigar lines)
1505
1506 =cut
1507
1508 sub cigar_line {
1509         my $self = shift;
1510         my $thr=shift||100;
1511         my %cigars;
1512
1513         my @consensus = split "",($self->consensus_string($thr));
1514         my $len = $self->length;
1515         my $gapchar = $self->gap_char;
1516
1517         # create a precursor, something like (1,4,5,6,7,33,45),
1518         # where each number corresponds to a conserved position
1519         foreach my $seq ( $self->each_seq ) {
1520                 my @seq = split "", uc ($seq->seq);
1521                 my $pos = 1;
1522                 for (my $x = 0 ; $x < $len ; $x++ ) {
1523                         if ($seq[$x] eq $consensus[$x]) {
1524                                 push @{$cigars{$seq->get_nse}},$pos;
1525                                 $pos++;
1526                         } elsif ($seq[$x] ne $gapchar) {
1527                                 $pos++;
1528                         }
1529                 }
1530         }
1531         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1532         for my $name (keys %cigars) {
1533                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1534                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1535       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1536            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1537                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1538                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1539                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1540                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1541                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1542                         }
1543       }
1544         }
1545   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1546   for my $name (keys %cigars) {
1547           my @remove;
1548           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1549                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1550                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1551                       unshift @remove,$x;
1552               }
1553            }
1554       for my $pos (@remove) {
1555                         splice @{$cigars{$name}}, $pos, 1;
1556            }
1557    }
1558    # join and punctuate
1559    for my $name (keys %cigars) {
1560           my ($start,$end,$str) = "";
1561           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1562                   $str .= ($start . "," . $end . ":");
1563           }
1564           $str =~ s/:$//;
1565       $cigars{$name} = $str;
1566    }
1567    %cigars;
1568 }
1569
1570
1571 =head2 match_line
1572
1573  Title    : match_line()
1574  Usage    : $line = $align->match_line()
1575  Function : Generates a match line - much like consensus string
1576             except that a line indicating the '*' for a match.
1577  Args     : (optional) Match line characters ('*' by default)
1578             (optional) Strong match char (':' by default)
1579             (optional) Weak match char ('.' by default)
1580  Returns  : String
1581
1582 =cut
1583
1584 sub match_line {
1585         my ($self,$matchlinechar, $strong, $weak) = @_;
1586         my %matchchars = ('match'    => $matchlinechar || '*',
1587                                                           'weak'     => $weak          || '.',
1588                                                           'strong'   => $strong        || ':',
1589                                                           'mismatch' => ' ',
1590                                                   );
1591
1592         my @seqchars;
1593         my $alphabet;
1594         foreach my $seq ( $self->each_seq ) {
1595                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1596                 $alphabet = $seq->alphabet unless defined $alphabet;
1597         }
1598         my $refseq = shift @seqchars;
1599         # let's just march down the columns
1600         my $matchline;
1601  POS:
1602         foreach my $pos ( 0..$self->length ) {
1603                 my $refchar = $refseq->[$pos];
1604                 my $char = $matchchars{'mismatch'};
1605                 unless( defined $refchar ) {
1606                         last if $pos == $self->length; # short circuit on last residue
1607                         # this in place to handle jason's soon-to-be-committed
1608                         # intron mapping code
1609                         goto bottom;
1610                 }
1611                 my %col = ($refchar => 1);
1612                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1613                 foreach my $seq ( @seqchars ) {
1614                         next if $pos >= scalar @$seq;
1615                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1616                                                           $seq->[$pos] eq ' ' );
1617                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1618                 }
1619                 my @colresidues = sort keys %col;
1620
1621                 # if all the values are the same
1622                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1623                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1624                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1625                         # matches for protein seqs
1626             TYPE:
1627                         foreach my $type ( qw(strong weak) ) {
1628                                 # iterate through categories
1629                                 my %groups;
1630                                 # iterate through each of the aa in the col
1631                                 # look to see which groups it is in
1632                                 foreach my $c ( @colresidues ) {
1633                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1634                                                 push @{$groups{$f}},$c;
1635                                         }
1636                                 }
1637                          GRP:
1638                                 foreach my $cols ( values %groups ) {
1639                                         @$cols = sort @$cols;
1640                                         # now we are just testing to see if two arrays
1641                                         # are identical w/o changing either one
1642                                         # have to be same len
1643                                         next if( scalar @$cols != scalar @colresidues );
1644                                         # walk down the length and check each slot
1645                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1646                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1647                                         }
1648                                         $char = $matchchars{$type};
1649                                         last TYPE;
1650                                 }
1651                         }
1652                 }
1653          bottom:
1654                 $matchline .= $char;
1655         }
1656         return $matchline;
1657 }
1658
1659
1660 =head2 gap_line
1661
1662  Title    : gap_line()
1663  Usage    : $line = $align->gap_line()
1664  Function : Generates a gap line - much like consensus string
1665             except that a line where '-' represents gap
1666  Args     : (optional) gap line characters ('-' by default)
1667  Returns  : string
1668
1669 =cut
1670
1671 sub gap_line {
1672     my ($self,$gapchar) = @_;
1673     $gapchar = $gapchar || $self->gap_char;
1674     my %gap_hsh; # column gaps vector
1675     foreach my $seq ( $self->each_seq ) {
1676                 my $i = 0;
1677         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1678                   map {[$i++, $_]} split(//, uc ($seq->seq));
1679     }
1680     my $gap_line;
1681     foreach my $pos ( 0..$self->length-1 ) {
1682           $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1683     }
1684     return $gap_line;
1685 }
1686
1687 =head2 all_gap_line
1688
1689  Title    : all_gap_line()
1690  Usage    : $line = $align->all_gap_line()
1691  Function : Generates a gap line - much like consensus string
1692             except that a line where '-' represents all-gap column
1693  Args     : (optional) gap line characters ('-' by default)
1694  Returns  : string
1695
1696 =cut
1697
1698 sub all_gap_line {
1699     my ($self,$gapchar) = @_;
1700     $gapchar = $gapchar || $self->gap_char;
1701     my %gap_hsh;                # column gaps counter hash
1702     my @seqs = $self->each_seq;
1703     foreach my $seq ( @seqs ) {
1704         my $i = 0;
1705         map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1706         map {[$i++, $_]} split(//, uc ($seq->seq));
1707     }
1708     my $gap_line;
1709     foreach my $pos ( 0..$self->length-1 ) {
1710         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1711             # gaps column
1712             $gap_line .= $self->gap_char;
1713         } else {
1714             $gap_line .= '.';
1715         }
1716     }
1717     return $gap_line;
1718 }
1719
1720 =head2 gap_col_matrix
1721
1722  Title    : gap_col_matrix()
1723  Usage    : my $cols = $align->gap_col_matrix()
1724  Function : Generates an array where each element in the array is a
1725             hash reference with a key of the sequence name and a
1726             value of 1 if the sequence has a gap at that column
1727  Returns  : Reference to an array
1728  Args     : Optional: gap line character ($aln->gap_char or '-' by default)
1729
1730 =cut
1731
1732 sub gap_col_matrix {
1733     my ( $self, $gapchar ) = @_;
1734     $gapchar = $gapchar || $self->gap_char;
1735     my %gap_hsh;    # column gaps vector
1736     my @cols;
1737     foreach my $seq ( $self->each_seq ) {
1738         my $i   = 0;
1739         my $str = $seq->seq;
1740         my $len = $seq->length;
1741         my $ch;
1742         my $id = $seq->display_id;
1743         while ( $i < $len ) {
1744             $ch = substr( $str, $i, 1 );
1745             $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1746         }
1747     }
1748     return \@cols;
1749 }
1750
1751 =head2 match
1752
1753  Title     : match()
1754  Usage     : $ali->match()
1755  Function  : Goes through all columns and changes residues that are
1756              identical to residue in first sequence to match '.'
1757              character. Sets match_char.
1758
1759              USE WITH CARE: Most MSA formats do not support match
1760              characters in sequences, so this is mostly for output
1761              only. NEXUS format (Bio::AlignIO::nexus) can handle
1762              it.
1763  Returns   : 1 on success
1764  Argument  : a match character, optional, defaults to '.'
1765
1766 =cut
1767
1768 sub match {
1769     my ( $self, $match ) = @_;
1770
1771     $match ||= '.';
1772     my ($matching_char) = $match;
1773     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/;    #';
1774     $self->map_chars( $matching_char, '-' );
1775
1776     my @seqs = $self->each_seq();
1777     return 1 unless scalar @seqs > 1;
1778
1779     my $refseq  = shift @seqs;
1780     my @refseq  = split //, $refseq->seq;
1781     my $gapchar = $self->gap_char;
1782
1783     foreach my $seq (@seqs) {
1784         my @varseq = split //, $seq->seq();
1785         for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1786             $varseq[$i] = $match
1787                 if defined $refseq[$i]
1788                 && ( $refseq[$i] =~ /[A-Za-z\*]/
1789                 || $refseq[$i] =~ /$gapchar/ )
1790                 && $refseq[$i] eq $varseq[$i];
1791         }
1792         $seq->seq( join '', @varseq );
1793     }
1794     $self->match_char($match);
1795     return 1;
1796 }
1797
1798
1799
1800 =head2 unmatch
1801
1802  Title     : unmatch()
1803  Usage     : $ali->unmatch()
1804  Function  : Undoes the effect of method match. Unsets match_char.
1805  Returns   : 1 on success
1806  Argument  : a match character, optional, defaults to '.'
1807
1808 See L<match> and L<match_char>
1809
1810 =cut
1811
1812 sub unmatch {
1813     my ( $self, $match ) = @_;
1814
1815     $match ||= '.';
1816
1817     my @seqs = $self->each_seq();
1818     return 1 unless scalar @seqs > 1;
1819
1820     my $refseq  = shift @seqs;
1821     my @refseq  = split //, $refseq->seq;
1822     my $gapchar = $self->gap_char;
1823     foreach my $seq (@seqs) {
1824         my @varseq = split //, $seq->seq();
1825         for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1826             $varseq[$i] = $refseq[$i]
1827                 if defined $refseq[$i]
1828                 && ( $refseq[$i] =~ /[A-Za-z\*]/
1829                 || $refseq[$i] =~ /$gapchar/ )
1830                 && $varseq[$i] eq $match;
1831         }
1832         $seq->seq( join '', @varseq );
1833     }
1834     $self->match_char('');
1835     return 1;
1836 }
1837
1838
1839 =head1 MSA attributes
1840
1841 Methods for setting and reading the MSA attributes.
1842
1843 Note that the methods defining character semantics depend on the user
1844 to set them sensibly.  They are needed only by certain input/output
1845 methods. Unset them by setting to an empty string ('').
1846
1847 =head2 id
1848
1849  Title     : id
1850  Usage     : $myalign->id("Ig")
1851  Function  : Gets/sets the id field of the alignment
1852  Returns   : An id string
1853  Argument  : An id string (optional)
1854
1855 =cut
1856
1857 sub id {
1858     my ( $self, $name ) = @_;
1859
1860     if ( defined($name) ) {
1861         $self->{'_id'} = $name;
1862     }
1863
1864     return $self->{'_id'};
1865 }
1866
1867 =head2 accession
1868
1869  Title     : accession
1870  Usage     : $myalign->accession("PF00244")
1871  Function  : Gets/sets the accession field of the alignment
1872  Returns   : An acc string
1873  Argument  : An acc string (optional)
1874
1875 =cut
1876
1877 sub accession {
1878     my ( $self, $acc ) = @_;
1879
1880     if ( defined($acc) ) {
1881         $self->{'_accession'} = $acc;
1882     }
1883
1884     return $self->{'_accession'};
1885 }
1886
1887 =head2 description
1888
1889  Title     : description
1890  Usage     : $myalign->description("14-3-3 proteins")
1891  Function  : Gets/sets the description field of the alignment
1892  Returns   : An description string
1893  Argument  : An description string (optional)
1894
1895 =cut
1896
1897 sub description {
1898     my ( $self, $name ) = @_;
1899
1900     if ( defined($name) ) {
1901         $self->{'_description'} = $name;
1902     }
1903
1904     return $self->{'_description'};
1905 }
1906
1907 =head2 missing_char
1908
1909  Title     : missing_char
1910  Usage     : $myalign->missing_char("?")
1911  Function  : Gets/sets the missing_char attribute of the alignment
1912              It is generally recommended to set it to 'n' or 'N'
1913              for nucleotides and to 'X' for protein.
1914  Returns   : An missing_char string,
1915  Argument  : An missing_char string (optional)
1916
1917 =cut
1918
1919 sub missing_char {
1920     my ( $self, $char ) = @_;
1921
1922     if ( defined $char ) {
1923         $self->throw("Single missing character, not [$char]!")
1924             if CORE::length($char) > 1;
1925         $self->{'_missing_char'} = $char;
1926     }
1927
1928     return $self->{'_missing_char'};
1929 }
1930
1931
1932 =head2 match_char
1933
1934  Title     : match_char
1935  Usage     : $myalign->match_char('.')
1936  Function  : Gets/sets the match_char attribute of the alignment
1937  Returns   : An match_char string,
1938  Argument  : An match_char string (optional)
1939
1940 =cut
1941
1942 sub match_char {
1943     my ( $self, $char ) = @_;
1944
1945     if ( defined $char ) {
1946         $self->throw("Single match character, not [$char]!")
1947             if CORE::length($char) > 1;
1948         $self->{'_match_char'} = $char;
1949     }
1950
1951     return $self->{'_match_char'};
1952 }
1953
1954 =head2 gap_char
1955
1956  Title     : gap_char
1957  Usage     : $myalign->gap_char('-')
1958  Function  : Gets/sets the gap_char attribute of the alignment
1959  Returns   : An gap_char string, defaults to '-'
1960  Argument  : An gap_char string (optional)
1961
1962 =cut
1963
1964 sub gap_char {
1965     my ( $self, $char ) = @_;
1966
1967     if ( defined $char || !defined $self->{'_gap_char'} ) {
1968         $char = '-' unless defined $char;
1969         $self->throw("Single gap character, not [$char]!")
1970             if CORE::length($char) > 1;
1971         $self->{'_gap_char'} = $char;
1972     }
1973     return $self->{'_gap_char'};
1974 }
1975
1976
1977 =head2 symbol_chars
1978
1979  Title   : symbol_chars
1980  Usage   : my @symbolchars = $aln->symbol_chars;
1981  Function: Returns all the seen symbols (other than gaps)
1982  Returns : array of characters that are the seen symbols
1983  Args    : boolean to include the gap/missing/match characters
1984
1985 =cut
1986
1987 sub symbol_chars{
1988    my ($self,$includeextra) = @_;
1989
1990    unless ($self->{'_symbols'}) {
1991        foreach my $seq ($self->each_seq) {
1992            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1993        }
1994    }
1995    my %copy = %{$self->{'_symbols'}};
1996    if( ! $includeextra ) {
1997        foreach my $char ( $self->gap_char, $self->match_char,
1998                           $self->missing_char) {
1999            delete $copy{$char} if( defined $char );
2000        }
2001    }
2002    return keys %copy;
2003 }
2004
2005 =head1 Alignment descriptors
2006
2007 These read only methods describe the MSA in various ways.
2008
2009
2010 =head2 score
2011
2012  Title     : score
2013  Usage     : $str = $ali->score()
2014  Function  : get/set a score of the alignment
2015  Returns   : a score for the alignment
2016  Argument  : an optional score to set
2017
2018 =cut
2019
2020 sub score {
2021   my $self = shift;
2022   $self->{score} = shift if @_;
2023   return $self->{score};
2024 }
2025
2026 =head2 consensus_string
2027
2028  Title     : consensus_string
2029  Usage     : $str = $ali->consensus_string($threshold_percent)
2030  Function  : Makes a strict consensus
2031  Returns   : Consensus string
2032  Argument  : Optional threshold ranging from 0 to 100.
2033              The consensus residue has to appear at least threshold %
2034              of the sequences at a given location, otherwise a '?'
2035              character will be placed at that location.
2036              (Default value = 0%)
2037
2038 =cut
2039
2040 sub consensus_string {
2041     my $self      = shift;
2042     my $threshold = shift;
2043
2044     my $out = "";
2045     my $len = $self->length - 1;
2046
2047     foreach ( 0 .. $len ) {
2048         $out .= $self->_consensus_aa( $_, $threshold );
2049     }
2050     return $out;
2051 }
2052
2053
2054 =head2 consensus_conservation
2055
2056  Title     : consensus_conservation
2057  Usage     : @conservation = $ali->consensus_conservation();
2058  Function  : Conservation (as a percent) of each position of alignment
2059  Returns   : Array of percentages [0-100]. Gap columns are 0% conserved.
2060  Argument  :
2061
2062 =cut
2063
2064 sub consensus_conservation {
2065     my $self = shift;
2066     my @cons;
2067     my $num_sequences = $self->num_sequences;
2068     foreach my $point (0..$self->length-1) {
2069         my %hash = $self->_consensus_counts($point);
2070         # max frequency of a non-gap letter
2071         my $max = (sort {$b<=>$a} values %hash )[0];
2072         push @cons, 100 * $max / $num_sequences;
2073     }
2074     return @cons;
2075 }
2076
2077 sub _consensus_aa {
2078     my $self = shift;
2079     my $point = shift;
2080     my $threshold_percent = shift || -1 ;
2081     my ($seq,%hash,$count,$letter,$key);
2082     my $gapchar = $self->gap_char;
2083     %hash = $self->_consensus_counts($point);
2084     my $number_of_sequences = $self->num_sequences();
2085     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2086     $count = -1;
2087     $letter = '?';
2088
2089     foreach $key ( sort keys %hash ) {
2090         # print "Now at $key $hash{$key}\n";
2091         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2092             $letter = $key;
2093             $count = $hash{$key};
2094         }
2095     }
2096     return $letter;
2097 }
2098
2099 # Frequency of each letter in one column
2100 sub _consensus_counts {
2101     my $self = shift;
2102     my $point = shift;
2103     my %hash;
2104     my $gapchar = $self->gap_char;
2105     foreach my $seq ( $self->each_seq() ) {
2106         my $letter = substr($seq->seq,$point,1);
2107         $self->throw("--$point-----------") if $letter eq '';
2108         ($letter eq $gapchar || $letter =~ /\./) && next;
2109         $hash{$letter}++;
2110     }
2111     return %hash;
2112 }
2113
2114
2115 =head2 consensus_iupac
2116
2117  Title     : consensus_iupac
2118  Usage     : $str = $ali->consensus_iupac()
2119  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2120              and RNA. The output is in upper case except when gaps in
2121              a column force output to be in lower case.
2122
2123              Note that if your alignment sequences contain a lot of
2124              IUPAC ambiquity codes you often have to manually set
2125              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2126              indicate a protein sequence.
2127  Returns   : consensus string
2128  Argument  : none
2129  Throws    : on protein sequences
2130
2131 =cut
2132
2133 sub consensus_iupac {
2134     my $self = shift;
2135     my $out  = "";
2136     my $len  = $self->length - 1;
2137
2138     # only DNA and RNA sequences are valid
2139     foreach my $seq ( $self->each_seq() ) {
2140         $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2141             if $seq->alphabet eq 'protein';
2142     }
2143
2144     # loop over the alignment columns
2145     foreach my $count ( 0 .. $len ) {
2146         $out .= $self->_consensus_iupac($count);
2147     }
2148     return $out;
2149 }
2150
2151 sub _consensus_iupac {
2152     my ($self, $column) = @_;
2153     my ($string, $char, $rna);
2154
2155     #determine all residues in a column
2156     foreach my $seq ( $self->each_seq() ) {
2157         $string .= substr($seq->seq, $column, 1);
2158     }
2159     $string = uc $string;
2160
2161     # quick exit if there's an N in the string
2162     if ($string =~ /N/) {
2163         $string =~ /\W/ ? return 'n' : return 'N';
2164     }
2165     # ... or if there are only gap characters
2166     return '-' if $string =~ /^\W+$/;
2167
2168     # treat RNA as DNA in regexps
2169     if ($string =~ /U/) {
2170         $string =~ s/U/T/;
2171         $rna = 1;
2172     }
2173
2174     # the following s///'s only need to be done to the _first_ ambiguity code
2175     # as we only need to see the _range_ of characters in $string
2176
2177     if ($string =~ /[VDHB]/) {
2178         $string =~ s/V/AGC/;
2179         $string =~ s/D/AGT/;
2180         $string =~ s/H/ACT/;
2181         $string =~ s/B/CTG/;
2182     }
2183
2184     if ($string =~ /[SKYRWM]/) {
2185         $string =~ s/S/GC/;
2186         $string =~ s/K/GT/;
2187         $string =~ s/Y/CT/;
2188         $string =~ s/R/AG/;
2189         $string =~ s/W/AT/;
2190         $string =~ s/M/AC/;
2191     }
2192
2193     # and now the guts of the thing
2194
2195     if ($string =~ /A/) {
2196         $char = 'A';                     # A                      A
2197         if ($string =~ /G/) {
2198             $char = 'R';                 # A and G (purines)      R
2199             if ($string =~ /C/) {
2200                 $char = 'V';             # A and G and C          V
2201                 if ($string =~ /T/) {
2202                     $char = 'N';         # A and G and C and T    N
2203                 }
2204             } elsif ($string =~ /T/) {
2205                 $char = 'D';             # A and G and T          D
2206             }
2207         } elsif ($string =~ /C/) {
2208             $char = 'M';                 # A and C                M
2209             if ($string =~ /T/) {
2210                 $char = 'H';             # A and C and T          H
2211             }
2212         } elsif ($string =~ /T/) {
2213             $char = 'W';                 # A and T                W
2214         }
2215     } elsif ($string =~ /C/) {
2216         $char = 'C';                     # C                      C
2217         if ($string =~ /T/) {
2218             $char = 'Y';                 # C and T (pyrimidines)  Y
2219             if ($string =~ /G/) {
2220                 $char = 'B';             # C and T and G          B
2221             }
2222         } elsif ($string =~ /G/) {
2223             $char = 'S';                 # C and G                S
2224         }
2225     } elsif ($string =~ /G/) {
2226         $char = 'G';                     # G                      G
2227         if ($string =~ /C/) {
2228             $char = 'S';                 # G and C                S
2229         } elsif ($string =~ /T/) {
2230             $char = 'K';                 # G and T                K
2231         }
2232     } elsif ($string =~ /T/) {
2233         $char = 'T';                     # T                      T
2234     }
2235
2236     $char = 'U' if $rna and $char eq 'T';
2237     $char = lc $char if $string =~ /\W/;
2238
2239     return $char;
2240 }
2241
2242
2243 =head2 consensus_meta
2244
2245  Title     : consensus_meta
2246  Usage     : $seqmeta = $ali->consensus_meta()
2247  Function  : Returns a Bio::Seq::Meta object containing the consensus
2248              strings derived from meta data analysis.
2249  Returns   : Bio::Seq::Meta
2250  Argument  : Bio::Seq::Meta
2251  Throws    : non-MetaI object
2252
2253 =cut
2254
2255 sub consensus_meta {
2256     my ($self, $meta) = @_;
2257     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2258         $self->throw('Not a Bio::Seq::MetaI object');
2259     }
2260     return $self->{'_aln_meta'} = $meta if $meta;
2261     return $self->{'_aln_meta'}
2262 }
2263
2264 =head2 is_flush
2265
2266  Title     : is_flush
2267  Usage     : if ( $ali->is_flush() )
2268  Function  : Tells you whether the alignment
2269            : is flush, i.e. all of the same length
2270  Returns   : 1 or 0
2271  Argument  :
2272
2273 =cut
2274
2275 sub is_flush {
2276     my ( $self, $report ) = @_;
2277     my $seq;
2278     my $length = (-1);
2279     my $temp;
2280
2281     foreach $seq ( $self->each_seq() ) {
2282         if ( $length == (-1) ) {
2283             $length = CORE::length( $seq->seq() );
2284             next;
2285         }
2286
2287         $temp = CORE::length( $seq->seq() );
2288         if ( $temp != $length ) {
2289             $self->warn(
2290                 "expecting $length not $temp from " . $seq->display_id )
2291                 if ($report);
2292             $self->debug(
2293                 "expecting $length not $temp from " . $seq->display_id );
2294             $self->debug( $seq->seq() . "\n" );
2295             return 0;
2296         }
2297     }
2298
2299     return 1;
2300 }
2301
2302
2303 =head2 length
2304
2305  Title     : length()
2306  Usage     : $len = $ali->length()
2307  Function  : Returns the maximum length of the alignment.
2308              To be sure the alignment is a block, use is_flush
2309  Returns   : Integer
2310  Argument  :
2311
2312 =cut
2313
2314 sub length_aln {
2315     my $self = shift;
2316     Carp::carp("length_aln - deprecated method. Use length() instead.");
2317     $self->length(@_);
2318 }
2319
2320 sub length {
2321     my $self = shift;
2322     my $seq;
2323     my $length = -1;
2324     my $temp;
2325
2326     foreach $seq ( $self->each_seq() ) {
2327         $temp = $seq->length();
2328         if( $temp > $length ) {
2329             $length = $temp;
2330         }
2331     }
2332
2333     return $length;
2334 }
2335
2336
2337 =head2 maxdisplayname_length
2338
2339  Title     : maxdisplayname_length
2340  Usage     : $ali->maxdisplayname_length()
2341  Function  : Gets the maximum length of the displayname in the
2342              alignment. Used in writing out various MSA formats.
2343  Returns   : integer
2344  Argument  :
2345
2346 =cut
2347
2348 sub maxname_length {
2349     my $self = shift;
2350     Carp::carp("maxname_length - deprecated method."
2351                . " Use maxdisplayname_length() instead.");
2352     $self->maxdisplayname_length();
2353 }
2354
2355 sub maxnse_length {
2356     my $self = shift;
2357     Carp::carp("maxnse_length - deprecated method."
2358                . " Use maxnse_length() instead.");
2359     $self->maxdisplayname_length();
2360 }
2361
2362 sub maxdisplayname_length {
2363     my $self    = shift;
2364     my $maxname = (-1);
2365     my ( $seq, $len );
2366
2367     foreach $seq ( $self->each_seq() ) {
2368         $len = CORE::length $self->displayname( $seq->get_nse() );
2369
2370         if ( $len > $maxname ) {
2371             $maxname = $len;
2372         }
2373     }
2374
2375     return $maxname;
2376 }
2377
2378 =head2 max_metaname_length
2379
2380  Title     : max_metaname_length
2381  Usage     : $ali->max_metaname_length()
2382  Function  : Gets the maximum length of the meta name tags in the
2383              alignment for the sequences and for the alignment.
2384              Used in writing out various MSA formats.
2385  Returns   : integer
2386  Argument  : None
2387
2388 =cut
2389
2390 sub max_metaname_length {
2391     my $self = shift;
2392     my $maxname = (-1);
2393     my ($seq,$len);
2394
2395     # check seq meta first
2396     for $seq ( $self->each_seq() ) {
2397         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2398         for my $mtag ($seq->meta_names) {
2399             $len = CORE::length $mtag;
2400             if( $len > $maxname ) {
2401                 $maxname = $len;
2402             }
2403         }
2404     }
2405
2406     # alignment meta
2407     for my $meta ($self->consensus_meta) {
2408         next unless $meta;
2409         for my $name ($meta->meta_names) {
2410             $len = CORE::length $name;
2411             if( $len > $maxname ) {
2412                 $maxname = $len;
2413             }
2414         }
2415     }
2416
2417     return $maxname;
2418 }
2419
2420 =head2 num_residues
2421
2422  Title     : num_residues
2423  Usage     : $no = $ali->num_residues
2424  Function  : number of residues in total in the alignment
2425  Returns   : integer
2426  Argument  :
2427  Note      : replaces no_residues()
2428
2429 =cut
2430
2431 sub num_residues {
2432     my $self  = shift;
2433     my $count = 0;
2434
2435     foreach my $seq ( $self->each_seq ) {
2436         my $str = $seq->seq();
2437
2438         $count += ( $str =~ s/[A-Za-z]//g );
2439     }
2440
2441     return $count;
2442 }
2443
2444 =head2 num_sequences
2445
2446  Title     : num_sequences
2447  Usage     : $depth = $ali->num_sequences
2448  Function  : number of sequence in the sequence alignment
2449  Returns   : integer
2450  Argument  : none
2451  Note      : replaces no_sequences()
2452
2453 =cut
2454
2455 sub num_sequences {
2456     my $self = shift;
2457     return scalar($self->each_seq);
2458 }
2459
2460 =head2 average_percentage_identity
2461
2462  Title   : average_percentage_identity
2463  Usage   : $id = $align->average_percentage_identity
2464  Function: The function uses a fast method to calculate the average
2465            percentage identity of the alignment
2466  Returns : The average percentage identity of the alignment
2467  Args    : None
2468  Notes   : This method implemented by Kevin Howe calculates a figure that is
2469            designed to be similar to the average pairwise identity of the
2470            alignment (identical in the absence of gaps), without having to
2471            explicitly calculate pairwise identities proposed by Richard Durbin.
2472            Validated by Ewan Birney ad Alex Bateman.
2473
2474 =cut
2475
2476 sub average_percentage_identity{
2477    my ($self,@args) = @_;
2478
2479    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2480                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2481
2482    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2483
2484    if (! $self->is_flush()) {
2485        $self->throw("All sequences in the alignment must be the same length");
2486    }
2487
2488    @seqs = $self->each_seq();
2489    $len = $self->length();
2490
2491    # load the each hash with correct keys for existence checks
2492
2493    for( my $index=0; $index < $len; $index++) {
2494        foreach my $letter (@alphabet) {
2495            $countHashes[$index]->{$letter} = 0;
2496        }
2497    }
2498    foreach my $seq (@seqs)  {
2499        my @seqChars = split //, $seq->seq();
2500        for( my $column=0; $column < @seqChars; $column++ ) {
2501            my $char = uc($seqChars[$column]);
2502            if (exists $countHashes[$column]->{$char}) {
2503                $countHashes[$column]->{$char}++;
2504            }
2505        }
2506    }
2507
2508    $total = 0;
2509    $divisor = 0;
2510    for(my $column =0; $column < $len; $column++) {
2511        my %hash = %{$countHashes[$column]};
2512        $subdivisor = 0;
2513        foreach my $res (keys %hash) {
2514            $total += $hash{$res}*($hash{$res} - 1);
2515            $subdivisor += $hash{$res};
2516        }
2517        $divisor += $subdivisor * ($subdivisor - 1);
2518    }
2519    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2520 }
2521
2522 =head2 percentage_identity
2523
2524  Title   : percentage_identity
2525  Usage   : $id = $align->percentage_identity
2526  Function: The function calculates the average percentage identity
2527            (aliased to average_percentage_identity)
2528  Returns : The average percentage identity
2529  Args    : None
2530
2531 =cut
2532
2533 sub percentage_identity {
2534     my $self = shift;
2535     return $self->average_percentage_identity();
2536 }
2537
2538 =head2 overall_percentage_identity
2539
2540  Title   : overall_percentage_identity
2541  Usage   : $id = $align->overall_percentage_identity
2542            $id = $align->overall_percentage_identity('short')
2543  Function: The function calculates the percentage identity of
2544            the conserved columns
2545  Returns : The percentage identity of the conserved columns
2546  Args    : length value to use, optional defaults to alignment length
2547                  possible values: 'align', 'short', 'long'
2548
2549 The argument values 'short' and 'long' refer to shortest and longest
2550 sequence in the alignment. Method modification code by Hongyu Zhang.
2551
2552 =cut
2553
2554 sub overall_percentage_identity{
2555    my ($self, $length_measure) = @_;
2556
2557    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2558
2559    my %enum = map {$_ => undef} qw (align short long);
2560
2561    $self->throw("Unknown argument [$length_measure]")
2562        if $length_measure and not exists $enum{$length_measure};
2563    $length_measure ||= 'align';
2564
2565    if (! $self->is_flush()) {
2566        $self->throw("All sequences in the alignment must be the same length");
2567    }
2568
2569    # Count the residues seen at each position
2570    my $len;
2571    my $total = 0; # number of positions with identical residues
2572    my @countHashes;
2573    my @seqs = $self->each_seq;
2574    my $nof_seqs = scalar @seqs;
2575    my $aln_len = $self->length();
2576    for my $seq (@seqs)  {
2577        my $seqstr = $seq->seq;
2578
2579        # Count residues for given sequence
2580        for my $column (0 .. $aln_len-1) {
2581            my $char = uc( substr($seqstr, $column, 1) );
2582            if ( exists $alphabet{$char} ) {
2583
2584                # This is a valid char
2585                if ( defined $countHashes[$column]->{$char} ) {
2586                  $countHashes[$column]->{$char}++;
2587                } else {
2588                  $countHashes[$column]->{$char} = 1;
2589                }
2590
2591                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2592                    # All sequences have this same residue
2593                    $total++;
2594                }
2595
2596            }
2597        }
2598
2599        # Sequence length
2600        if ($length_measure eq 'short' || $length_measure eq 'long') {
2601            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2602            if ($length_measure eq 'short') {
2603                if ( (not defined $len) || ($seq_len < $len) ) {
2604                    $len = $seq_len;
2605                }
2606            } elsif ($length_measure eq 'long') {
2607                if ( (not defined $len) || ($seq_len > $len) ) {
2608                    $len = $seq_len;
2609                }
2610            }
2611        }
2612
2613    }
2614
2615    if ($length_measure eq 'align') {
2616        $len = $aln_len;
2617    }
2618
2619    return ($total / $len ) * 100.0;
2620 }
2621
2622
2623
2624 =head1 Alignment positions
2625
2626 Methods to map a sequence position into an alignment column and back.
2627 column_from_residue_number() does the former. The latter is really a
2628 property of the sequence object and can done using
2629 L<Bio::LocatableSeq::location_from_column>:
2630
2631     # select somehow a sequence from the alignment, e.g.
2632     my $seq = $aln->get_seq_by_pos(1);
2633     #$loc is undef or Bio::LocationI object
2634     my $loc = $seq->location_from_column(5);
2635
2636 =head2 column_from_residue_number
2637
2638  Title   : column_from_residue_number
2639  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2640  Function: This function gives the position in the alignment
2641            (i.e. column number) of the given residue number in the
2642            sequence with the given name. For example, for the
2643            alignment
2644
2645              Seq1/91-97 AC..DEF.GH.
2646              Seq2/24-30 ACGG.RTY...
2647                 Seq3/43-51 AC.DDEF.GHI
2648
2649            column_from_residue_number( "Seq1", 94 ) returns 6.
2650            column_from_residue_number( "Seq2", 25 ) returns 2.
2651            column_from_residue_number( "Seq3", 50 ) returns 10.
2652
2653            An exception is thrown if the residue number would lie
2654            outside the length of the aligment
2655            (e.g. column_from_residue_number( "Seq2", 22 )
2656
2657           Note: If the the parent sequence is represented by more than
2658                 one alignment sequence and the residue number is present in
2659                 them, this method finds only the first one.
2660
2661  Returns : A column number for the position in the alignment of the
2662            given residue in the given sequence (1 = first column)
2663  Args    : A sequence id/name (not a name/start-end)
2664            A residue number in the whole sequence (not just that
2665            segment of it in the alignment)
2666
2667 =cut
2668
2669 sub column_from_residue_number {
2670     my ( $self, $name, $resnumber ) = @_;
2671
2672     $self->throw("No sequence with name [$name]")
2673         unless $self->{'_start_end_lists'}->{$name};
2674     $self->throw("Second argument residue number missing") unless $resnumber;
2675
2676     foreach my $seq ( $self->each_seq_with_id($name) ) {
2677         my $col;
2678         eval { $col = $seq->column_from_residue_number($resnumber); };
2679         next if $@;
2680         return $col;
2681     }
2682
2683     $self->throw( "Could not find a sequence segment in $name "
2684             . "containing residue number $resnumber" );
2685
2686 }
2687
2688 =head1 Sequence names
2689
2690 Methods to manipulate the display name. The default name based on the
2691 sequence id and subsequence positions can be overridden in various
2692 ways.
2693
2694 =head2 displayname
2695
2696  Title     : displayname
2697  Usage     : $myalign->displayname("Ig", "IgA")
2698  Function  : Gets/sets the display name of a sequence in the alignment
2699  Returns   : A display name string
2700  Argument  : name of the sequence
2701              displayname of the sequence (optional)
2702
2703 =cut
2704
2705 sub displayname {
2706     my ( $self, $name, $disname ) = @_;
2707
2708     $self->throw("No sequence with name [$name]")
2709         unless defined $self->{'_seq'}->{$name};
2710
2711     if ( $disname and $name ) {
2712         $self->{'_dis_name'}->{$name} = $disname;
2713         return $disname;
2714     }
2715     elsif ( defined $self->{'_dis_name'}->{$name} ) {
2716         return $self->{'_dis_name'}->{$name};
2717     }
2718     else {
2719         return $name;
2720     }
2721 }
2722
2723 sub get_displayname {
2724     my $self = shift;
2725     Carp::carp("get_displayname - deprecated method. Use displayname() instead.");
2726     $self->displayname(@_);
2727 }
2728
2729 sub set_displayname {
2730     my $self = shift;
2731     Carp::carp("set_displayname - deprecated method. Use displayname() instead.");
2732     $self->displayname(@_);
2733 }
2734
2735
2736 =head2 set_displayname_count
2737
2738  Title     : set_displayname_count
2739  Usage     : $ali->set_displayname_count
2740  Function  : Sets the names to be name_# where # is the number of
2741              times this name has been used.
2742  Returns   : 1, on success
2743  Argument  :
2744
2745 =cut
2746
2747 sub set_displayname_count {
2748     my $self= shift;
2749     my (@arr,$name,$seq,$count,$temp,$nse);
2750
2751     foreach $seq ( $self->each_alphabetically() ) {
2752         $nse = $seq->get_nse();
2753
2754         #name will be set when this is the second
2755         #time (or greater) is has been seen
2756
2757         if( defined $name and $name eq ($seq->id()) ) {
2758             $temp = sprintf("%s_%s",$name,$count);
2759             $self->displayname($nse,$temp);
2760             $count++;
2761         } else {
2762             $count = 1;
2763             $name = $seq->id();
2764             $temp = sprintf("%s_%s",$name,$count);
2765             $self->displayname($nse,$temp);
2766             $count++;
2767         }
2768     }
2769     return 1;
2770 }
2771
2772 =head2 set_displayname_flat
2773
2774  Title     : set_displayname_flat
2775  Usage     : $ali->set_displayname_flat()
2776  Function  : Makes all the sequences be displayed as just their name,
2777              not name/start-end (NSE)
2778  Returns   : 1
2779  Argument  :
2780
2781 =cut
2782
2783 sub set_displayname_flat {
2784     my $self = shift;
2785     my ( $nse, $seq );
2786
2787     foreach $seq ( $self->each_seq() ) {
2788         $nse = $seq->get_nse();
2789         $self->displayname( $nse, $seq->id() );
2790     }
2791     return 1;
2792 }
2793
2794
2795 =head2 set_displayname_normal
2796
2797  Title     : set_displayname_normal
2798  Usage     : $ali->set_displayname_normal()
2799  Function  : Makes all the sequences be displayed as name/start-end (NSE)
2800  Returns   : 1, on success
2801  Argument  :
2802
2803 =cut
2804
2805 sub set_displayname_normal {
2806     my $self = shift;
2807     my ( $nse, $seq );
2808
2809     foreach $seq ( $self->each_seq() ) {
2810         $nse = $seq->get_nse();
2811         $self->displayname( $nse, $nse );
2812     }
2813     return 1;
2814 }
2815
2816 =head2 source
2817
2818  Title   : source
2819  Usage   : $obj->source($newval)
2820  Function: sets the Alignment source program
2821  Example :
2822  Returns : value of source
2823  Args    : newvalue (optional)
2824
2825 =cut
2826
2827 sub source {
2828     my ( $self, $value ) = @_;
2829     if ( defined $value ) {
2830         $self->{'_source'} = $value;
2831     }
2832     return $self->{'_source'};
2833 }
2834
2835
2836 =head2 set_displayname_safe
2837
2838  Title     : set_displayname_safe
2839  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2840  Function  : Assign machine-generated serial names to sequences in input order.
2841              Designed to protect names during PHYLIP runs. Assign 10-char string
2842              in the form of "S000000001" to "S999999999". Restore the original
2843              names using "restore_displayname".
2844  Returns   : 1. a new $aln with system names;
2845              2. a hash ref for restoring names
2846  Argument  : Number for id length (default 10)
2847
2848 =cut
2849
2850 sub set_displayname_safe {
2851     my $self = shift;
2852     my $idlength = shift || 10;
2853     my ( $seq, %phylip_name );
2854     my $ct  = 0;
2855     my $new = Bio::SimpleAlign->new();
2856     foreach $seq ( $self->each_seq() ) {
2857         $ct++;
2858         my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2859         $phylip_name{$pname} = $seq->id();
2860         my $new_seq = Bio::LocatableSeq->new(
2861             -id       => $pname,
2862             -seq      => $seq->seq(),
2863             -alphabet => $seq->alphabet,
2864             -start    => $seq->{_start},
2865             -end      => $seq->{_end}
2866         );
2867         $new->add_seq($new_seq);
2868     }
2869
2870     $self->debug(
2871         "$ct seq names changed. Restore names by using restore_displayname.");
2872     return ( $new, \%phylip_name );
2873 }
2874
2875
2876 =head2 restore_displayname
2877
2878  Title     : restore_displayname
2879  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2880  Function  : Restore original sequence names (after running
2881              $ali->set_displayname_safe)
2882  Returns   : a new $aln with names restored.
2883  Argument  : a hash reference of names from "set_displayname_safe".
2884
2885 =cut
2886
2887 sub restore_displayname {
2888     my $self = shift;
2889     my $ref=shift;
2890     my %name=%$ref;
2891     my $new=Bio::SimpleAlign->new();
2892     foreach my $seq ( $self->each_seq() ) {
2893       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2894       my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2895                                           -seq      => $seq->seq(),
2896                                           -alphabet => $seq->alphabet,
2897                                           -start    => $seq->{_start},
2898                                           -end      => $seq->{_end}
2899                                           );
2900       $new->add_seq($new_seq);
2901     }
2902     return $new;
2903 }
2904
2905 =head2 sort_by_start
2906
2907  Title     : sort_by_start
2908  Usage     : $ali->sort_by_start
2909  Function  : Changes the order of the alignment to the start position of each
2910              subalignment
2911  Returns   : 1 on success
2912  Argument  :
2913
2914 =cut
2915
2916 sub sort_by_start {
2917     my $self = shift;
2918     my ($seq,$nse,@arr,%hash,$count);
2919     foreach $seq ( $self->each_seq() ) {
2920         $nse = $seq->get_nse;
2921         $hash{$nse} = $seq;
2922     }
2923     $count = 0;
2924     %{$self->{'_order'}} = (); # reset the hash;
2925     foreach $nse ( sort _startend keys %hash) {
2926         $self->{'_order'}->{$count} = $nse;
2927         $count++;
2928     }
2929     1;
2930 }
2931
2932 sub _startend {
2933     my ($aname,$arange) = split (/[\/]/,$a);
2934     my ($bname,$brange) = split (/[\/]/,$b);
2935     my ($astart,$aend) = split(/\-/,$arange);
2936     my ($bstart,$bend) = split(/\-/,$brange);
2937     return $astart <=> $bstart;
2938 }
2939
2940 =head2 bracket_string
2941
2942  Title     : bracket_string
2943  Usage     : my @params = (-refseq     => 'testseq',
2944                            -allele1    => 'allele1',
2945                            -allele2    => 'allele2',
2946                            -delimiters => '{}',
2947                            -separator  => '/');
2948              $str = $aln->bracket_string(@params)
2949
2950  Function :  When supplied with a list of parameters (see below), returns a
2951              string in BIC format. This is used for allelic comparisons.
2952              Briefly, if either allele contains a base change when compared to
2953              the refseq, the base or gap for each allele is represented in
2954              brackets in the order present in the 'alleles' parameter.
2955
2956              For the following data:
2957
2958              >testseq
2959              GGATCCATTGCTACT
2960              >allele1
2961              GGATCCATTCCTACT
2962              >allele2
2963              GGAT--ATTCCTCCT
2964
2965              the returned string with parameters 'refseq => testseq' and
2966              'alleles => [qw(allele1 allele2)]' would be:
2967
2968              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2969  Returns   : BIC-formatted string
2970  Argument  : Required args
2971                 refseq    : string (ID) of the reference sequence used
2972                             as basis for comparison
2973                 allele1   : string (ID) of the first allele
2974                 allele2   : string (ID) of the second allele
2975              Optional args
2976                 delimiters: two symbol string of left and right delimiters.
2977                             Only the first two symbols are used
2978                             default = '[]'
2979                 separator : string used as a separator.  Only the first
2980                             symbol is used
2981                             default = '/'
2982  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2983
2984 =cut
2985
2986 sub bracket_string {
2987     my ($self, @args) = @_;
2988     my ($ref, $a1, $a2, $delim, $sep) =
2989         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2990     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2991     my ($ld, $rd);
2992     ($ld, $rd) = split('', $delim, 2) if $delim;
2993     $ld ||= '[';
2994     $rd ||= ']';
2995     $sep ||= '/';
2996     my ($refseq, $allele1, $allele2) =
2997         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2998     if (!$refseq || !$allele1 || !$allele2) {
2999         $self->throw("One of your refseq/allele IDs is invalid!");
3000     }
3001     my $len = $self->length-1;
3002     my $bic = '';
3003     # loop over the alignment columns
3004     for my $column ( 0 .. $len ) {
3005         my $string;
3006         my ($compres, $res1, $res2) =
3007             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3008         # are any of the allele symbols different from the refseq?
3009         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3010                 $ld.$res1.$sep.$res2.$rd;
3011         $bic .= $string;
3012     }
3013     return $bic;
3014 }
3015
3016
3017 =head2 methods implementing Bio::FeatureHolderI
3018
3019 FeatureHolderI implementation to support labeled character sets like one
3020 would get from NEXUS represented data.
3021
3022 =head2 get_SeqFeatures
3023
3024  Usage   : @features = $aln->get_SeqFeatures
3025  Function: Get the feature objects held by this feature holder.
3026  Example :
3027  Returns : an array of Bio::SeqFeatureI implementing objects
3028  Args    : optional filter coderef, taking a Bio::SeqFeatureI
3029          : as argument, returning TRUE if wanted, FALSE if
3030          : unwanted
3031
3032 =cut
3033
3034 sub get_SeqFeatures {
3035     my $self      = shift;
3036     my $filter_cb = shift;
3037     $self->throw("Arg (filter callback) must be a coderef")
3038         unless !defined($filter_cb)
3039         or ref($filter_cb) eq 'CODE';
3040     if ( !defined $self->{'_as_feat'} ) {
3041         $self->{'_as_feat'} = [];
3042     }
3043     if ($filter_cb) {
3044         return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
3045     }
3046     return @{ $self->{'_as_feat'} };
3047 }
3048
3049
3050 =head2 add_SeqFeature
3051
3052  Usage   : $aln->add_SeqFeature($subfeat);
3053  Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3054            (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3055  Example :
3056  Returns : 1 on success
3057  Args    : a Bio::SeqFeatureI object
3058
3059 =cut
3060
3061 sub add_SeqFeature {
3062    my ($self, @feat) = @_;
3063
3064    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3065
3066    if (scalar @feat > 1) {
3067       Carp::carp('Providing an array of features to Bio::SimpleAlign'
3068                  . 'add_SeqFeature() is deprecated and will be removed in a'
3069                  . ' future version.  Add a single feature at a time instead.');
3070    }
3071
3072    for my $feat ( @feat ) {
3073
3074        next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3075
3076        if( !$feat->isa("Bio::SeqFeatureI") ) {
3077            $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3078        }
3079
3080        push @{$self->{'_as_feat'}}, $feat;
3081    }
3082    return 1;
3083 }
3084
3085
3086 =head2 remove_SeqFeatures
3087
3088  Usage   : $obj->remove_SeqFeatures
3089  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3090            remove that subset from the returned array, and add back the rest.
3091  Returns : The array of Bio::SeqFeatureI features that was
3092            deleted from this alignment.
3093  Args    : none
3094
3095 =cut
3096
3097 sub remove_SeqFeatures {
3098     my $self = shift;
3099
3100     return () unless $self->{'_as_feat'};
3101     my @feats = @{$self->{'_as_feat'}};
3102     $self->{'_as_feat'} = [];
3103     return @feats;
3104 }
3105
3106 =head2 feature_count
3107
3108  Title   : feature_count
3109  Usage   : $obj->feature_count()
3110  Function: Return the number of SeqFeatures attached to the alignment
3111  Returns : integer representing the number of SeqFeatures
3112  Args    : None
3113
3114 =cut
3115
3116 sub feature_count {
3117     my ($self) = @_;
3118
3119     if (defined($self->{'_as_feat'})) {
3120         return ($#{$self->{'_as_feat'}} + 1);
3121     } else {
3122         return 0;
3123     }
3124 }
3125
3126 =head2 get_all_SeqFeatures
3127
3128  Title   : get_all_SeqFeatures
3129  Usage   :
3130  Function: Get all SeqFeatures.
3131  Example :
3132  Returns : an array of Bio::SeqFeatureI implementing objects
3133  Args    : none
3134  Note    : Falls through to Bio::FeatureHolderI implementation.
3135
3136 =cut
3137
3138 =head2 methods for Bio::AnnotatableI
3139
3140 AnnotatableI implementation to support sequence alignments which
3141 contain annotation (NEXUS, Stockholm).
3142
3143 =head2 annotation
3144
3145  Title   : annotation
3146  Usage   : $ann = $aln->annotation or
3147            $aln->annotation($ann)
3148  Function: Gets or sets the annotation
3149  Returns : Bio::AnnotationCollectionI object
3150  Args    : None or Bio::AnnotationCollectionI object
3151
3152 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3153 for more information
3154
3155 =cut
3156
3157 sub annotation {
3158     my ($obj,$value) = @_;
3159     if( defined $value ) {
3160         $obj->throw("object of class ".ref($value)." does not implement ".
3161                 "Bio::AnnotationCollectionI. Too bad.")
3162             unless $value->isa("Bio::AnnotationCollectionI");
3163         $obj->{'_annotation'} = $value;
3164     } elsif( ! defined $obj->{'_annotation'}) {
3165         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3166     }
3167     return $obj->{'_annotation'};
3168 }
3169
3170 =head1 Deprecated methods
3171
3172 =head2 mask_columns
3173
3174  Title     : mask_columns
3175  Usage     : $aln2 = $aln->mask_columns(20,30)
3176  Function  : Masks a slice of the alignment inclusive of start and
3177              end columns, and the first column in the alignment is denoted 1.
3178              Mask beyond the length of the sequence does not do padding.
3179  Returns   : A Bio::SimpleAlign object
3180  Args      : Positive integer for start column, positive integer for end column,
3181              optional string value use for the mask. Example:
3182
3183              $aln2 = $aln->mask_columns(20,30,'?')
3184  Note      : Masking must use a character that is not used for gaps or
3185              frameshifts.  These can be adjusted using the relevant global
3186              variables, but be aware these may be (uncontrollably) modified
3187              elsewhere within BioPerl (see bug 2715)
3188
3189 =cut
3190
3191 sub mask_columns {
3192     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3193     my $self = shift;
3194
3195     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3196              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3197
3198     # coordinates are alignment-based, not sequence-based
3199     my ($start, $end, $mask_char) = @_;
3200     unless (defined $mask_char) { $mask_char = 'N' }
3201
3202     $self->throw("Mask start has to be a positive integer and less than ".
3203                  "alignment length, not [$start]")
3204       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3205     $self->throw("Mask end has to be a positive integer and less than ".
3206                  "alignment length, not [$end]")
3207       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3208     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3209                  "end [$end]") unless $start <= $end;
3210     $self->throw("Mask character $mask_char has to be a single character ".
3211                  "and not a gap or frameshift symbol")
3212       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3213
3214     my $aln = $self->new;
3215     $aln->id($self->id);
3216     foreach my $seq ( $self->each_seq() ) {
3217         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3218          -alphabet => $seq->alphabet,
3219          -strand  => $seq->strand,
3220          -verbose => $self->verbose);
3221
3222         # convert from 1-based alignment coords!
3223         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3224         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3225         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3226         $new_seq->seq($new_dna_string);
3227         $aln->add_seq($new_seq);
3228     }
3229     # Preserve chosen mask character, it may be need later (like in 'slice')
3230     $aln->{_mask_char} = $mask_char;
3231     return $aln;
3232 }
3233
3234 1;