ide/bioperl-mode: into its own repository to be developed separately
[bioperl-live.git] / Bio / Align / AlignI.pm
blob3c79e4fa1153ac9f70f8a45e29cec1b2f0a2ef63
2 # BioPerl module for Bio::Align::AlignI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Align::AlignI - An interface for describing sequence alignments.
18 =head1 SYNOPSIS
20 # get a Bio::Align::AlignI somehow - typically using Bio::AlignIO system
21 # some descriptors
22 print $aln->length, "\n";
23 print $aln->num_residues, "\n";
24 print $aln->is_flush, "\n";
25 print $aln->num_sequences, "\n";
26 print $aln->percentage_identity, "\n";
27 print $aln->consensus_string(50), "\n";
29 # find the position in the alignment for a sequence location
30 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
32 # extract sequences and check values for the alignment column $pos
33 foreach $seq ($aln->each_seq) {
34 $res = $seq->subseq($pos, $pos);
35 $count{$res}++;
37 foreach $res (keys %count) {
38 printf "Res: %s Count: %2d\n", $res, $count{$res};
41 =head1 DESCRIPTION
43 This interface describes the basis for alignment objects.
45 =head1 FEEDBACK
47 =head2 Mailing Lists
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to
51 the Bioperl mailing list. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 =head2 Support
58 Please direct usage questions or support issues to the mailing list:
60 I<bioperl-l@bioperl.org>
62 rather than to the module maintainer directly. Many experienced and
63 responsive experts will be able to look at the problem and quickly
64 address it. Please include a thorough description of the problem
65 with code and data examples if at all possible.
67 =head2 Reporting Bugs
69 Report bugs to the Bioperl bug tracking system to help us keep track
70 of the bugs and their resolution. Bug reports can be submitted via the
71 web:
73 https://github.com/bioperl/bioperl-live/issues
75 =head1 AUTHOR - Jason Stajich
77 Email jason@bioperl.org
79 =head1 CONTRIBUTORS
81 Ewan Birney, birney@ebi.ac.uk
82 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
84 =head1 APPENDIX
86 The rest of the documentation details each of the object methods.
87 Internal methods are usually preceded with a _
89 =cut
92 # Let the code begin...
95 package Bio::Align::AlignI;
96 use strict;
99 use base qw(Bio::Root::RootI);
101 =head1 Modifier methods
103 These methods modify the MSE by adding, removing or shuffling complete
104 sequences.
106 =head2 add_seq
108 Title : add_seq
109 Usage : $myalign->add_seq($newseq);
110 Function : Adds another sequence to the alignment. *Does not* align
111 it - just adds it to the hashes.
112 Returns : None
113 Argument : a Bio::LocatableSeq object
114 order (optional)
116 See L<Bio::LocatableSeq> for more information.
118 =cut
120 sub add_seq {
121 my ($self) = @_;
122 $self->throw_not_implemented();
125 =head2 remove_seq
127 Title : remove_seq
128 Usage : $aln->remove_seq($seq);
129 Function : Removes a single sequence from an alignment
130 Returns :
131 Argument : a Bio::LocatableSeq object
133 =cut
135 sub remove_seq {
136 my ($self) = @_;
137 $self->throw_not_implemented();
140 =head2 purge
142 Title : purge
143 Usage : $aln->purge(0.7);
144 Function:
146 Removes sequences above whatever %id.
148 This function will grind on large alignments. Beware!
149 (perhaps not ideally implemented)
151 Example :
152 Returns : An array of the removed sequences
153 Argument:
156 =cut
158 sub purge {
159 my ($self) = @_;
160 $self->throw_not_implemented();
163 =head2 sort_alphabetically
165 Title : sort_alphabetically
166 Usage : $ali->sort_alphabetically
167 Function :
169 Changes the order of the alignment to alphabetical on name
170 followed by numerical by number.
172 Returns : an array
173 Argument :
175 =cut
177 sub sort_alphabetically {
178 my ($self) = @_;
179 $self->throw_not_implemented();
182 =head1 Sequence selection methods
184 Methods returning one or more sequence objects.
186 =head2 each_seq
188 Title : each_seq
189 Usage : foreach $seq ( $align->each_seq() )
190 Function : Gets an array of Seq objects from the alignment
191 Returns : an array
192 Argument :
194 =cut
196 sub each_seq {
197 my ($self) = @_;
198 $self->throw_not_implemented();
201 =head2 each_alphabetically
203 Title : each_alphabetically
204 Usage : foreach $seq ( $ali->each_alphabetically() )
205 Function :
207 Returns an array of sequence objects sorted alphabetically
208 by name and then by start point.
209 Does not change the order of the alignment
211 Returns :
212 Argument :
214 =cut
216 sub each_alphabetically {
217 my($self) = @_;
218 $self->throw_not_implemented();
221 =head2 each_seq_with_id
223 Title : each_seq_with_id
224 Usage : foreach $seq ( $align->each_seq_with_id() )
225 Function :
227 Gets an array of Seq objects from the
228 alignment, the contents being those sequences
229 with the given name (there may be more than one)
231 Returns : an array
232 Argument : a seq name
234 =cut
236 sub each_seq_with_id {
237 my ($self) = @_;
238 $self->throw_not_implemented();
241 =head2 get_seq_by_pos
243 Title : get_seq_by_pos
244 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
245 Function :
247 Gets a sequence based on its position in the alignment.
248 Numbering starts from 1. Sequence positions larger than
249 num_sequences() will throw an error.
251 Returns : a Bio::LocatableSeq object
252 Argument : positive integer for the sequence position
254 =cut
256 sub get_seq_by_pos {
257 my ($self) = @_;
258 $self->throw_not_implemented();
261 =head1 Create new alignments
263 The results of these methods are horizontal or vertical subsets of the
264 current MSE.
266 =head2 select
268 Title : select
269 Usage : $aln2 = $aln->select(1, 3) # three first sequences
270 Function :
272 Creates a new alignment from a continuous subset of
273 sequences. Numbering starts from 1. Sequence positions
274 larger than num_sequences() will throw an error.
276 Returns : a Bio::SimpleAlign object
277 Argument : positive integer for the first sequence
278 positive integer for the last sequence to include (optional)
280 =cut
282 sub select {
283 my ($self) = @_;
284 $self->throw_not_implemented();
288 =head2 select_noncont
290 Title : select_noncont
291 Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences
292 Function :
294 Creates a new alignment from a subset of
295 sequences. Numbering starts from 1. Sequence positions
296 larger than num_sequences() will throw an error.
298 Returns : a Bio::SimpleAlign object
299 Args : array of integers for the sequences
301 =cut
303 sub select_noncont {
304 my ($self) = @_;
305 $self->throw_not_implemented();
308 =head2 slice
310 Title : slice
311 Usage : $aln2 = $aln->slice(20, 30)
312 Function :
314 Creates a slice from the alignment inclusive of start and
315 end columns. Sequences with no residues in the slice are
316 excluded from the new alignment and a warning is printed.
317 Slice beyond the length of the sequence does not do
318 padding.
320 Returns : a Bio::SimpleAlign object
321 Argument : positive integer for start column
322 positive integer for end column
324 =cut
326 sub slice {
327 my ($self) = @_;
328 $self->throw_not_implemented();
331 =head1 Change sequences within the MSE
333 These methods affect characters in all sequences without changing the
334 alignment.
337 =head2 map_chars
339 Title : map_chars
340 Usage : $ali->map_chars('\.','-')
341 Function :
343 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
344 characters
346 Notice that the "from" (arg1) is interpreted as a regex,
347 so be careful about quoting meta characters (e.g.
348 $ali->map_chars('.', '-') won't do what you want)
350 Returns : None
351 Argument : 'from' regexp
352 'to' string
354 =cut
356 sub map_chars {
357 my ($self) = @_;
358 $self->throw_not_implemented();
361 =head2 uppercase
363 Title : uppercase()
364 Usage : $ali->uppercase()
365 Function : Sets all the sequences to uppercase
366 Returns :
367 Argument :
369 =cut
371 sub uppercase {
372 my ($self) = @_;
373 $self->throw_not_implemented();
376 =head2 match_line
378 Title : match_line()
379 Usage : $align->match_line()
380 Function : Generates a match line - much like a consensus string,
381 except that the line uses '*' to indicate a match.
382 Argument : (optional) Match line characters ('*' by default)
383 (optional) Strong match char (':' by default)
384 (optional) Weak match char ('.' by default)
386 =cut
388 sub match_line {
389 my ($self) = @_;
390 $self->throw_not_implemented();
393 =head2 match
395 Title : match()
396 Usage : $ali->match()
397 Function :
399 Goes through all columns and changes residues that are
400 identical to residue in first sequence to match '.'
401 character. Sets match_char.
403 USE WITH CARE: Most MSE formats do not support match
404 characters in sequences, so this is mostly for output
405 only. NEXUS format (Bio::AlignIO::nexus) can handle it.
408 Returns : 1
409 Argument : a match character, optional, defaults to '.'
411 =cut
413 sub match {
414 my ($self) = @_;
415 $self->throw_not_implemented();
418 =head2 unmatch
420 Title : unmatch()
421 Usage : $ali->unmatch()
422 Function :
424 Undoes the effect of method match. Unsets match_char.
426 Returns : 1
427 Argument : a match character, optional, defaults to '.'
429 =cut
431 sub unmatch {
432 my ($self) = @_;
433 $self->throw_not_implemented();
437 =head1 MSE attributes
439 Methods for setting and reading the MSE attributes.
441 Note that the methods defining character semantics depend on the user
442 to set them sensibly. They are needed only by certain input/output
443 methods. Unset them by setting to an empty string ('').
445 =head2 id
447 Title : id
448 Usage : $myalign->id("Ig")
449 Function : Gets/sets the id field of the alignment
450 Returns : An id string
451 Argument : An id string (optional)
453 =cut
455 sub id {
456 my ($self) = @_;
457 $self->throw_not_implemented();
460 =head2 missing_char
462 Title : missing_char
463 Usage : $myalign->missing_char("?")
464 Function : Gets/sets the missing_char attribute of the alignment
465 It is generally recommended to set it to 'n' or 'N'
466 for nucleotides and to 'X' for protein.
467 Returns : A missing_char string
468 Argument : A missing_char string (optional)
470 =cut
472 sub missing_char {
473 my ($self) = @_;
474 $self->throw_not_implemented();
477 =head2 match_char
479 Title : match_char
480 Usage : $myalign->match_char('.')
481 Function : Gets/sets the match_char attribute of the alignment
482 Returns : A match_char string
483 Argument : A match_char string (optional)
485 =cut
487 sub match_char {
488 my ($self) = @_;
489 $self->throw_not_implemented();
492 =head2 gap_char
494 Title : gap_char
495 Usage : $myalign->gap_char('-')
496 Function : Gets/sets the gap_char attribute of the alignment
497 Returns : A gap_char string, defaults to '-'
498 Argument : A gap_char string (optional)
500 =cut
502 sub gap_char {
503 my ($self) = @_;
504 $self->throw_not_implemented();
507 =head2 symbol_chars
509 Title : symbol_chars
510 Usage : my @symbolchars = $aln->symbol_chars;
511 Function: Returns all the seen symbols (other than gaps)
512 Returns : array of characters that are the seen symbols
513 Argument: boolean to include the gap/missing/match characters
515 =cut
517 sub symbol_chars{
518 my ($self) = @_;
519 $self->throw_not_implemented();
522 =head1 Alignment descriptors
524 These read only methods describe the MSE in various ways.
527 =head2 consensus_string
529 Title : consensus_string
530 Usage : $str = $ali->consensus_string($threshold_percent)
531 Function : Makes a strict consensus
532 Returns : consensus string
533 Argument : Optional threshold ranging from 0 to 100.
534 The consensus residue has to appear at least threshold %
535 of the sequences at a given location, otherwise a '?'
536 character will be placed at that location.
537 (Default value = 0%)
539 =cut
541 sub consensus_string {
542 my ($self) = @_;
543 $self->throw_not_implemented();
546 =head2 consensus_iupac
548 Title : consensus_iupac
549 Usage : $str = $ali->consensus_iupac()
550 Function :
552 Makes a consensus using IUPAC ambiguity codes from DNA
553 and RNA. The output is in upper case except when gaps in
554 a column force output to be in lower case.
556 Note that if your alignment sequences contain a lot of
557 IUPAC ambiguity codes you often have to manually set
558 alphabet. Bio::PrimarySeq::_guess_type thinks they
559 indicate a protein sequence.
561 Returns : consensus string
562 Argument : none
563 Throws : on protein sequences
566 =cut
568 sub consensus_iupac {
569 my ($self) = @_;
570 $self->throw_not_implemented();
573 =head2 is_flush
575 Title : is_flush
576 Usage : if( $ali->is_flush() )
579 Function : Tells you whether the alignment
580 : is flush, ie all of the same length
583 Returns : 1 or 0
584 Argument :
586 =cut
588 sub is_flush {
589 my ($self) = @_;
590 $self->throw_not_implemented();
593 =head2 length
595 Title : length()
596 Usage : $len = $ali->length()
597 Function : Returns the maximum length of the alignment.
598 To be sure the alignment is a block, use is_flush
599 Returns : integer
600 Argument :
602 =cut
604 sub length {
605 my ($self) = @_;
606 $self->throw_not_implemented();
609 =head2 maxname_length
611 Title : maxname_length
612 Usage : $ali->maxname_length()
613 Function :
615 Gets the maximum length of the displayname in the
616 alignment. Used in writing out various MSE formats.
618 Returns : integer
619 Argument :
621 =cut
623 sub maxname_length {
624 my ($self) = @_;
625 $self->throw_not_implemented();
628 =head2 num_residues
630 Title : num_residues
631 Usage : $no = $ali->num_residues
632 Function : number of residues in total in the alignment
633 Returns : integer
634 Argument :
635 Note : replaces no_residues
637 =cut
639 sub num_residues {
640 my ($self) = @_;
641 $self->throw_not_implemented();
644 =head2 num_sequences
646 Title : num_sequences
647 Usage : $depth = $ali->num_sequences
648 Function : number of sequences in the sequence alignment
649 Returns : integer
650 Argument : None
651 Note : replaces no_sequences
653 =cut
655 sub num_sequences {
656 my ($self) = @_;
657 $self->throw_not_implemented();
660 =head2 percentage_identity
662 Title : percentage_identity
663 Usage : $id = $align->percentage_identity
664 Function: The function calculates the percentage identity of the alignment
665 Returns : The percentage identity of the alignment (as defined by the
666 implementation)
667 Argument: None
669 =cut
671 sub percentage_identity{
672 my ($self) = @_;
673 $self->throw_not_implemented();
676 =head2 overall_percentage_identity
678 Title : overall_percentage_identity
679 Usage : $id = $align->overall_percentage_identity
680 Function: The function calculates the percentage identity of
681 the conserved columns
682 Returns : The percentage identity of the conserved columns
683 Args : None
685 =cut
687 sub overall_percentage_identity{
688 my ($self) = @_;
689 $self->throw_not_implemented();
693 =head2 average_percentage_identity
695 Title : average_percentage_identity
696 Usage : $id = $align->average_percentage_identity
697 Function: The function uses a fast method to calculate the average
698 percentage identity of the alignment
699 Returns : The average percentage identity of the alignment
700 Args : None
702 =cut
704 sub average_percentage_identity{
705 my ($self) = @_;
706 $self->throw_not_implemented();
709 =head1 Alignment positions
711 Methods to map a sequence position into an alignment column and back.
712 column_from_residue_number() does the former. The latter is really a
713 property of the sequence object and can be done using
714 L<Bio::LocatableSeq::location_from_column>:
716 # select somehow a sequence from the alignment, e.g.
717 my $seq = $aln->get_seq_by_pos(1);
718 #$loc is undef or Bio::LocationI object
719 my $loc = $seq->location_from_column(5);
722 =head2 column_from_residue_number
724 Title : column_from_residue_number
725 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
726 Function:
728 This function gives the position in the alignment
729 (i.e. column number) of the given residue number in the
730 sequence with the given name. For example, for the
731 alignment
733 Seq1/91-97 AC..DEF.GH
734 Seq2/24-30 ACGG.RTY..
735 Seq3/43-51 AC.DDEFGHI
737 column_from_residue_number( "Seq1", 94 ) returns 6.
738 column_from_residue_number( "Seq2", 25 ) returns 2.
739 column_from_residue_number( "Seq3", 50 ) returns 9.
741 An exception is thrown if the residue number would lie
742 outside the length of the alignment
743 (e.g. column_from_residue_number( "Seq2", 22 )).
745 Note: If the parent sequence is represented by more than one
746 alignment sequence and the residue number is present in
747 them, this method finds only the first one.
749 Returns : A column number for the position in the alignment of the
750 given residue in the given sequence (1 = first column)
751 Args : A sequence id/name (not a name/start-end)
752 A residue number in the whole sequence (not just that
753 segment of it in the alignment)
755 =cut
757 sub column_from_residue_number {
758 my ($self) = @_;
759 $self->throw_not_implemented();
762 =head1 Sequence names
764 Methods to manipulate the display name. The default name based on the
765 sequence id and subsequence positions can be overridden in various
766 ways.
768 =head2 displayname
770 Title : displayname
771 Usage : $myalign->displayname("Ig", "IgA")
772 Function : Gets/sets the display name of a sequence in the alignment
774 Returns : A display name string
775 Argument : name of the sequence
776 displayname of the sequence (optional)
778 =cut
780 sub displayname {
781 my ($self) = @_;
782 $self->throw_not_implemented();
785 =head2 set_displayname_count
787 Title : set_displayname_count
788 Usage : $ali->set_displayname_count
789 Function :
791 Sets the names to be name_# where # is the number of
792 times this name has been used.
794 Returns : None
795 Argument : None
797 =cut
799 sub set_displayname_count {
800 my ($self) = @_;
801 $self->throw_not_implemented();
804 =head2 set_displayname_flat
806 Title : set_displayname_flat
807 Usage : $ali->set_displayname_flat()
808 Function : Makes all the sequences be displayed as just their name,
809 not name/start-end
810 Returns : 1
811 Argument : None
813 =cut
815 sub set_displayname_flat {
816 my ($self) = @_;
817 $self->throw_not_implemented();
820 =head2 set_displayname_normal
822 Title : set_displayname_normal
823 Usage : $ali->set_displayname_normal()
824 Function : Makes all the sequences be displayed as name/start-end
825 Returns : None
826 Argument : None
828 =cut
830 sub set_displayname_normal {
831 my ($self) = @_;
832 $self->throw_not_implemented();
835 =head1 Deprecated methods
837 =head2 no_residues
839 Title : no_residues
840 Usage : $no = $ali->no_residues
841 Function : number of residues in total in the alignment
842 Returns : integer
843 Argument :
844 Note : deprecated in favor of num_residues()
846 =cut
848 sub no_residues {
849 # immediate deprecation
850 shift->deprecated();
853 =head2 no_sequences
855 Title : no_sequences
856 Usage : $depth = $ali->no_sequences
857 Function : number of sequences in the sequence alignment
858 Returns : integer
859 Argument : None
860 Note : deprecated in favor of num_sequences()
862 =cut
864 sub no_sequences {
865 # immediate deprecation
866 shift->deprecated();