maint: remove Travis stuff which has been replaced with Github actions (#325)
[bioperl-live.git] / lib / Bio / Align / ProteinStatistics.pm
blob226f320802ef0cda1cace2196ba648e943474880
2 # BioPerl module for Bio::Align::ProteinStatistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Align::ProteinStatistics - Calculate Protein Alignment statistics (mostly distances)
18 =head1 SYNOPSIS
20 use Bio::Align::ProteinStatistics;
21 use Bio::AlignIO;
22 my $in = Bio::AlignIO->new(-format => 'fasta',
23 -file => 'pep-104.fasaln');
24 my $aln = $in->next_aln;
26 my $pepstats = Bio::Align::ProteinStatistics->new();
27 $kimura = $protstats->distance(-align => $aln,
28 -method => 'Kimura');
29 print $kimura->print_matrix;
32 =head1 DESCRIPTION
34 This object is for generating various statistics from a protein
35 alignment. Mostly it is where pairwise protein distances can be
36 calculated.
38 =head1 REFERENCES
40 D_Kimura - Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP,
41 Cambridge.
43 =head1 FEEDBACK
45 =head2 Mailing Lists
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via the
69 web:
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Jason Stajich
75 Email jason-at-bioperl.org
77 =head1 APPENDIX
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
82 =cut
85 # Let the code begin...
88 package Bio::Align::ProteinStatistics;
90 use vars qw(%DistanceMethods $Precision $DefaultGapPenalty);
91 use strict;
93 use Bio::Align::PairwiseStatistics;
94 use Bio::Matrix::PhylipDist;
96 %DistanceMethods = ('kimura|k' => 'Kimura',
98 $Precision = 5;
99 $DefaultGapPenalty = 0;
101 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
103 =head2 new
105 Title : new
106 Usage : my $obj = Bio::Align::ProteinStatistics->new();
107 Function: Builds a new Bio::Align::ProteinStatistics object
108 Returns : an instance of Bio::Align::ProteinStatistics
109 Args :
112 =cut
114 sub new {
115 my($class,@args) = @_;
117 my $self = $class->SUPER::new(@args);
118 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
120 return $self;
123 =head2 distance
125 Title : distance
126 Usage : my $distance_mat = $stats->distance(-align => $aln,
127 -method => $method);
128 Function: Calculates a distance matrix for all pairwise distances of
129 sequences in an alignment.
130 Returns : L<Bio::Matrix::PhylipDist> object
131 Args : -align => Bio::Align::AlignI object
132 -method => String specifying specific distance method
133 (implementing class may assume a default)
135 =cut
137 sub distance{
138 my ($self,@args) = @_;
139 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
140 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
141 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
143 $method ||= 'Kimura';
144 foreach my $m ( keys %DistanceMethods ) {
145 if(defined $m && $method =~ /$m/i ) {
146 my $mtd = "D_$DistanceMethods{$m}";
147 return $self->$mtd($aln);
150 $self->warn("Unrecognized distance method $method must be one of [".
151 join(',',$self->available_distance_methods())."]");
152 return;
155 =head2 available_distance_methods
157 Title : available_distance_methods
158 Usage : my @methods = $stats->available_distance_methods();
159 Function: Enumerates the possible distance methods
160 Returns : Array of strings
161 Args : none
164 =cut
166 sub available_distance_methods{
167 my ($self,@args) = @_;
168 return values %DistanceMethods;
171 =head2 D - distance methods
174 =cut
177 =head2 D_Kimura
179 Title : D_Kimura
180 Usage : my $matrix = $pepstats->D_Kimura($aln);
181 Function: Calculate Kimura protein distance (Kimura 1983) which
182 approximates PAM distance
183 D = -ln ( 1 - p - 0.2 * p^2 )
184 Returns : L<Bio::Matrix::PhylipDist>
185 Args : L<Bio::Align::AlignI>
188 =cut
190 # Kimura, M. 1983. The Neutral Theory of Molecular Evolution. CUP, Cambridge.
192 sub D_Kimura{
193 my ($self,$aln) = @_;
194 return 0 unless $self->_check_arg($aln);
195 # ambiguities ignored at this point
196 my (@seqs,@names,@values,%dist);
197 my $seqct = 0;
198 foreach my $seq ( $aln->each_seq) {
199 push @names, $seq->display_id;
200 push @seqs, uc $seq->seq();
201 $seqct++;
203 my $len = $aln->length;
204 my $precisionstr = "%.$Precision"."f";
206 for( my $i = 0; $i < $seqct-1; $i++ ) {
207 # (diagonals) distance is 0 for same sequence
208 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
209 $values[$i][$i] = sprintf($precisionstr,0);
210 for( my $j = $i+1; $j < $seqct; $j++ ) {
211 my ($scored,$match) = (0,0);
212 for( my $k=0; $k < $len; $k++ ) {
213 my $m1 = substr($seqs[$i],$k,1);
214 my $m2 = substr($seqs[$j],$k,1);
215 if( $m1 ne '-' && $m2 ne '-' ) {
216 # score is number of scored bases (alignable bases)
217 # it could have also come from
218 # my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
219 # match is number of matches weighting ambiguity bases
220 # as well
221 $match += _check_ambiguity_protein($m1,$m2);
222 $scored++;
225 # From Felsenstein's PHYLIP documentation:
226 # This is very quick to do but has some obvious
227 # limitations. It does not take into account which amino
228 # acids differ or to what amino acids they change, so some
229 # information is lost. The units of the distance measure
230 # are fraction of amino acids differing, as also in the
231 # case of the PAM distance. If the fraction of amino acids
232 # differing gets larger than 0.8541 the distance becomes
233 # infinite.
235 my $D = 1 - ( $match / $scored );
236 if( $D < 0.8541 ) {
237 $D = - log ( 1 - $D - (0.2 * ($D ** 2)));
238 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$D);
239 } else {
240 $values[$j][$i] = $values[$i][$j] = ' NaN';
242 # fwd and rev lookup
243 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
244 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
246 # (diagonals) distance is 0 for same sequence
247 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
248 $values[$j][$j] = sprintf($precisionstr,0);
252 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_PEPstats',
253 -matrix => \%dist,
254 -names => \@names,
255 -values => \@values);
259 # some methods from EMBOSS distmat
260 sub _check_ambiguity_protein
262 my ($t1,$t2) = @_;
263 my $n = 0;
265 if( $t1 ne 'X' && $t1 eq $t2 ) {
266 $n = 1.0;
267 } elsif( (($t1 eq 'B' && $t2 =~ /[DN]/ ) ||
268 ($t2 eq 'B' && $t1 =~ /[DN]/ )) ||
270 (($t1 eq 'Z' && $t2 =~ /[EQ]/) ||
271 ($t2 eq 'Z' && $t1 =~ /[EQ]/ ))) {
272 $n = 0.5;
273 } elsif ( $t1 eq 'X' && $t2 eq 'X' ) {
274 $n = 0.0025;
275 } elsif( $t1 eq 'X' || $t2 eq 'X' ) {
276 $n = 0.05;
278 return $n;
281 =head2 Data Methods
283 =cut
285 =head2 pairwise_stats
287 Title : pairwise_stats
288 Usage : $obj->pairwise_stats($newval)
289 Function:
290 Returns : value of pairwise_stats
291 Args : newvalue (optional)
294 =cut
296 sub pairwise_stats{
297 my ($self,$value) = @_;
298 if( defined $value) {
299 $self->{'_pairwise_stats'} = $value;
301 return $self->{'_pairwise_stats'};
305 sub _check_arg {
306 my($self,$aln ) = @_;
307 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
308 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
309 return 0;
310 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'protein' ) {
311 $self->warn("Must provide a protein alignment to Bio::Align::ProteinStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
312 return 0;
314 return 1;