1 package Bio
::Align
::Utilities
;
5 use Bio
::Root
::Version
;
12 bootstrap_replicates_codons
16 our %EXPORT_TAGS = (all
=> \
@EXPORT_OK);
19 # BioPerl module for Bio::Align::Utilities
21 # Please direct questions and support issues to <bioperl-l@bioperl.org>
23 # Cared for by Jason Stajich <jason-at-bioperl.org> and Brian Osborne
25 # Copyright Jason Stajich
27 # You may distribute this module under the same terms as perl itself
29 # POD documentation - main docs before the code
33 Bio::Align::Utilities - A collection of utilities regarding converting
34 and manipulating alignment objects
38 use Bio::Align::Utilities qw(:all);
40 # Even if the protein alignments are local make sure the start/end
41 # stored in the LocatableSeq objects are to the full length protein.
42 # The coding sequence that is passed in should still be the full
43 # length CDS as the nt alignment will be generated.
44 # %dnaseqs is a hash of CDS sequences (spliced)
45 my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
47 # The reverse, which is simpler. The input alignment has to be
48 # translate-able, with gap lengths and an overall length divisible by 3
49 my $aa_aln = dna_to_aa_aln($dna_al);
52 my $replicates = bootstrap_replicates($aln,$count);
56 This module contains utility methods for manipulating sequence
57 alignments (L<Bio::Align::AlignI>) objects.
59 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
60 program by Bill Pearson available at
61 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this
62 is a pure-Perl implementation, but just to mention that if anything
63 seems odd you can check the alignments generated against Bill's
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to
72 the Bioperl mailing list. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 Please direct usage questions or support issues to the mailing list:
81 I<bioperl-l@bioperl.org>
83 rather than to the module maintainer directly. Many experienced and
84 reponsive experts will be able look at the problem and quickly
85 address it. Please include a thorough description of the problem
86 with code and data examples if at all possible.
90 Report bugs to the Bioperl bug tracking system to help us keep track
91 of the bugs and their resolution. Bug reports can be submitted via the
94 https://github.com/bioperl/bioperl-live/issues
96 =head1 AUTHOR - Jason Stajich
98 Email jason-at-bioperl-dot-org
102 The rest of the documentation details each of the object methods.
103 Internal methods are usually preceded with a _
107 use constant CODONSIZE
=> 3;
109 our $CODONGAP = $GAP x CODONSIZE
;
113 Title : aa_to_dna_aln
114 Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
115 Function: Will convert an AA alignment to DNA space given the
116 corresponding DNA sequences. Note that this method expects
117 the DNA sequences to be in frame +1 (GFF frame 0) as it will
118 start to project into coordinates starting at the first base of
119 the DNA sequence, if this alignment represents a different
120 frame for the cDNA you will need to edit the DNA sequences
121 to remove the 1st or 2nd bases (and revcom if things should be).
122 Returns : Bio::Align::AlignI object
123 Args : 2 arguments, the alignment and a hashref.
124 Alignment is a Bio::Align::AlignI of amino acid sequences.
125 The hash reference should have keys which are
126 the display_ids for the aa
127 sequences in the alignment and the values are a
128 Bio::PrimarySeqI object for the corresponding
129 spliced cDNA sequence.
131 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
136 my ( $aln, $dnaseqs ) = @_;
137 unless ( defined $aln
139 && $aln->isa('Bio::Align::AlignI') )
142 'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
145 my $alnlen = $aln->length;
146 my $dnaalign = Bio
::SimpleAlign
->new();
147 $aln->map_chars( '\.', $GAP );
149 foreach my $seq ( $aln->each_seq ) {
150 my $aa_seqstr = $seq->seq();
151 my $pepid = $seq->display_id;
152 my $dnaseq = $dnaseqs->{$pepid} || $aln->throw( "cannot find " . $seq->display_id );
153 my $start_offset = ( $seq->start - 1 ) * CODONSIZE
;
154 $dnaseq = $dnaseq->seq();
155 my $dnalen = $dnaseqs->{$pepid}->length;
156 my $dnaid = $dnaseqs->{$pepid}->display_id || $pepid; # try to use DNAseq obj ID (issue #137)
159 for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
160 my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
161 if ( $char eq $GAP || $j >= $dnalen ) {
162 $nt_seqstr .= $CODONGAP;
165 $nt_seqstr .= substr( $dnaseq, $j, CODONSIZE
);
169 $nt_seqstr .= $GAP x
( ( $alnlen * 3 ) - length($nt_seqstr) );
171 my $newdna = Bio
::LocatableSeq
->new(
172 -display_id
=> $dnaid,
174 -start
=> $start_offset + 1,
175 -end
=> ( $seq->end * CODONSIZE
),
179 $dnaalign->add_seq($newdna);
186 Title : dna_to_aa_aln
187 Usage : my $aa_aln = dna_to_aa_aln($dna_aln);
188 Function: Convert a DNA alignment to an amino acid alignment where
189 the length of all alignment strings and the lengths of any
190 gaps must be divisible by 3
191 Returns : Bio::Align::AlignI object
192 Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences
194 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
200 unless ( defined $dna_aln
202 && $dna_aln->isa('Bio::Align::AlignI') ) {
204 'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
207 my $codon_table = Bio
::Tools
::CodonTable
->new;
208 my $aa_aln = Bio
::SimpleAlign
->new;
210 for my $seq ( $dna_aln->each_seq ) {
211 my ($aa_str, $aa_len);
212 my @aln_str = split '', $seq->seq;
213 croak
("All lines in the alignment must have lengths divisible by 3")
214 if ( scalar(@aln_str) % CODONSIZE
);
217 my $triplet = join '', (splice( @aln_str, 0, CODONSIZE
));
219 if ( $triplet =~ /^[GATC]+$/i ) {
220 $aa_str .= $codon_table->translate($triplet);
223 elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
227 croak
("The triplet '$triplet' is neither a valid codon nor all gaps");
230 my $new_aa = Bio
::LocatableSeq
->new(
231 -display_id
=> $seq->display_id,
232 -alphabet
=> 'protein',
239 $aa_aln->add_seq($new_aa);
245 =head2 bootstrap_replicates
247 Title : bootstrap_replicates
248 Usage : my $alns = &bootstrap_replicates($aln,100);
249 Function: Generate a pseudo-replicate of the data by randomly
250 sampling, with replacement, the columns from an alignment for
251 the non-parametric bootstrap.
252 Returns : Arrayref of L<Bio::SimpleAlign> objects
253 Args : L<Bio::SimpleAlign> object
254 Number of replicates to generate
258 sub bootstrap_replicates
{
259 my ( $aln, $count ) = @_;
261 my $alen = $aln->length;
263 $aln->set_displayname_flat(1);
264 for my $s ( $aln->each_seq ) {
265 push @seqs, $s->seq();
269 while ( $count-- > 0 ) {
271 for ( $i = 0 ; $i < $alen ; $i++ ) {
272 my $index = int( rand($alen) );
275 $newseqs[ $c++ ] .= substr( $_, $index, 1 );
278 my $newaln = Bio
::SimpleAlign
->new();
280 for my $s (@newseqs) {
281 ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
283 Bio
::LocatableSeq
->new(
285 -end
=> length($tmp),
286 -display_id
=> $nm[ $i++ ],
296 =head2 bootstrap_replicates_codons
298 Title : bootstrap_replicates_codons
299 Usage : my $alns = &bootstrap_replicates_codons($aln,100);
300 Function: Generate a pseudo-replicate of the data by randomly
301 sampling, with replacement, the columns from a codon alignment for
302 the non-parametric bootstrap. The alignment is assumed to start on
303 the first position of a codon.
304 Returns : Arrayref of L<Bio::SimpleAlign> objects
305 Args : L<Bio::SimpleAlign> object
306 Number of replicates to generate
310 sub bootstrap_replicates_codons
{
311 my ( $aln, $count ) = @_;
313 my $alen = $aln->length;
314 my $ncodon = int( $alen / 3 );
316 $aln->set_displayname_flat(1);
317 for my $s ( $aln->each_seq ) {
318 push @seqs, $s->seq();
322 while ( $count-- > 0 ) {
324 for ( $i = 0 ; $i < $ncodon ; $i++ ) {
325 my $index = int( rand($ncodon) );
326 my $seqpos = $index * 3;
329 $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
332 my $newaln = Bio
::SimpleAlign
->new();
334 for my $s (@newseqs) {
335 ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
337 Bio
::LocatableSeq
->new(
339 -end
=> length($tmp),
340 -display_id
=> $nm[ $i++ ],
353 Usage : $aln123 = cat($aln1, $aln2, $aln3)
354 Function : Concatenates alignment objects. Sequences are identified by id.
355 An error will be thrown if the sequence ids are not unique in the
356 first alignment. If any ids are not present or not unique in any
357 of the additional alignments then those sequences are omitted from
358 the concatenated alignment, and a warning is issued. An error will
359 be thrown if any of the alignments are not flush, since
360 concatenating such alignments is unlikely to make biological
362 Returns : A new Bio::SimpleAlign object
363 Args : A list of Bio::SimpleAlign objects
368 my ( $self, @aln ) = @_;
369 $self->throw("cat method called with no arguments") unless $self;
370 for ( $self, @aln ) {
371 $self->throw( $_->id . " is not a Bio::Align::AlignI object" )
372 unless $_->isa('Bio::Align::AlignI');
373 $self->throw( $_->id . " is not flush" ) unless $_->is_flush;
375 my $aln = $self->new;
376 $aln->id( $self->id );
377 $aln->annotation( $self->annotation );
379 SEQ
: foreach my $seq ( $self->each_seq() ) {
380 throw
( "ID: ", $seq->id, " is not unique in initial alignment." )
381 if exists $unique{ $seq->id };
382 $unique{ $seq->id } = 1;
384 # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
385 my $new_seq = $seq->new(
387 -strand
=> $seq->strand,
388 -verbose
=> $self->verbose
390 $new_seq->seq( $seq->seq );
391 $new_seq->start( $seq->start );
392 $new_seq->end( $seq->end );
393 if ( $new_seq->isa('Bio::Seq::MetaI') ) {
394 for my $meta_name ( $seq->meta_names ) {
395 $new_seq->named_submeta( $meta_name, $new_seq->start,
396 $new_seq->end, $seq->named_meta($meta_name) );
399 for my $cat_aln (@aln) {
400 my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
401 if ( @cat_seq == 0 ) {
402 $self->warn( $seq->id
403 . " not found in alignment "
405 . ", skipping this sequence." );
408 if ( @cat_seq > 1 ) {
409 $self->warn( $seq->id
410 . " found multiple times in alignment "
412 . ", skipping this sequence." );
415 my $cat_seq = $cat_seq[0];
416 my $old_end = $new_seq->end;
417 $new_seq->seq( $new_seq->seq . $cat_seq->seq );
419 # Not sure if this is a sensible way to deal with end coordinates
421 $new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
423 if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
424 unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
425 my $meta_seq = Bio
::Seq
::Meta
::Array
->new;
426 $meta_seq->seq( $new_seq->seq );
427 $meta_seq->start( $new_seq->start );
428 $meta_seq->end( $new_seq->end );
429 if ( $new_seq->isa('Bio::Seq::Meta') ) {
430 for my $meta_name ( $new_seq->meta_names ) {
431 $meta_seq->named_submeta(
437 //, $new_seq->named_meta($meta_name)
443 $new_seq = $meta_seq;
445 for my $meta_name ( $cat_seq->meta_names ) {
446 $new_seq->named_submeta( $meta_name, $old_end + 1,
447 $new_seq->end, $cat_seq->named_meta($meta_name) );
450 elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
451 if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
452 for my $meta_name ( $cat_seq->meta_names ) {
453 $new_seq->named_submeta( $meta_name, $old_end + 1,
455 [ split( //, $cat_seq->named_meta($meta_name) ) ] );
459 unless ( $new_seq->isa('Bio::Seq::Meta') ) {
460 my $meta_seq = Bio
::Seq
::Meta
::Array
->new;
461 $meta_seq->seq( $new_seq->seq );
462 $meta_seq->start( $new_seq->start );
463 $meta_seq->end( $new_seq->end );
464 $new_seq = $meta_seq;
466 for my $meta_name ( $cat_seq->meta_names ) {
467 $new_seq->named_submeta( $meta_name, $old_end + 1,
468 $new_seq->end, $cat_seq->named_meta($meta_name) );
473 $aln->add_seq($new_seq);
475 my $cons_meta = $self->consensus_meta;
478 $new_cons_meta = Bio
::Seq
::Meta
->new();
479 for my $meta_name ( $cons_meta->meta_names ) {
480 $new_cons_meta->named_submeta( $meta_name, 1, $self->length,
481 $cons_meta->$meta_name );
484 my $end = $self->length;
485 for my $cat_aln (@aln) {
486 my $cat_cons_meta = $cat_aln->consensus_meta;
487 if ($cat_cons_meta) {
488 $new_cons_meta = Bio
::Seq
::Meta
->new() if !$new_cons_meta;
489 for my $meta_name ( $cat_cons_meta->meta_names ) {
490 $new_cons_meta->named_submeta(
491 $meta_name, $end + 1,
492 $end + $cat_aln->length,
493 $cat_cons_meta->$meta_name
497 $end += $cat_aln->length;
499 $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
504 =head2 most_common_sequences
506 Title : most_common_sequences
507 Usage : @common = most_common_sequences ($align, $case_sensitivity)
508 Function : Returns an array of the sequences that appear most often in the
509 alignment (although this probably makes more sense when there is
510 only a single most common sequence). Sequences are compared after
511 removing any "-" (gap characters), and ambiguous units (e.g., R
512 for purines) are only compared to themselves. The returned
513 sequence is also missing the "-" since they don't actually make
514 part of the sequence.
515 Returns : Array of text strings.
516 Arguments : Optional argument defining whether the comparison between sequences
517 to find the most common should be case sensitive. Defaults to
518 false, i.e, not case sensitive.
522 sub most_common_sequences
{
524 or croak
("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences");
525 my $case_sensitive = shift; # defaults to false (we get undef if nothing)
527 ## We keep track of the max on this loop. Saves us having to
528 ## transverse the hash table later to find the maximum value.
531 foreach ($align->each_seq) {
532 (my $seq = $_->seq) =~ tr/-//d;
533 $seq = uc ($seq) unless $case_sensitive;
534 $max++ if (++$counts{$seq} > $max);
536 my @common = grep ($counts{$_} == $max, keys %counts);