t/SeqFeature/Generic.t: fix typo on required module for testing
[bioperl-live.git] / lib / Bio / Align / PairwiseStatistics.pm
blob9600527c3f648f3071ed8d5025f16b98b824e948
2 # BioPerl module for Bio::Align::PairwiseStatistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Align::PairwiseStatistics - Base statistic object for Pairwise Alignments
18 =head1 SYNOPSIS
20 use strict;
21 my $stats = Bio::Align::PairwiseStatistics->new();
23 # get alignment object of two sequences somehow
24 my $pwaln;
25 print $stats->number_of_comparable_bases($pwaln);
26 my $score = $stats->score_nuc($pwaln);
29 =head1 DESCRIPTION
31 Calculate pairwise statistics.
33 =head1 FEEDBACK
35 =head2 Mailing Lists
37 User feedback is an integral part of the evolution of this and other
38 Bioperl modules. Send your comments and suggestions preferably to
39 the Bioperl mailing list. Your participation is much appreciated.
41 bioperl-l@bioperl.org - General discussion
42 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
44 =head2 Support
46 Please direct usage questions or support issues to the mailing list:
48 I<bioperl-l@bioperl.org>
50 rather than to the module maintainer directly. Many experienced and
51 reponsive experts will be able look at the problem and quickly
52 address it. Please include a thorough description of the problem
53 with code and data examples if at all possible.
55 =head2 Reporting Bugs
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via the
59 web:
61 https://github.com/bioperl/bioperl-live/issues
63 =head1 AUTHOR - Jason Stajich
65 Email jason-at-bioperl-dot-org
67 =head1 APPENDIX
69 The rest of the documentation details each of the object methods.
70 Internal methods are usually preceded with a _
72 =cut
75 # Let the code begin...
78 package Bio::Align::PairwiseStatistics;
79 use vars qw($GapChars);
80 use strict;
83 BEGIN { $GapChars = '(\.|\-)'; }
85 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
87 =head2 number_of_comparable_bases
89 Title : number_of_comparable_bases
90 Usage : my $bases = $stat->number_of_comparable_bases($aln);
91 Function: Returns the count of the number of bases that can be
92 compared (L) in this alignment ( length - gaps)
93 Returns : integer
94 Args : L<Bio::Align::AlignI>
97 =cut
99 sub number_of_comparable_bases{
100 my ($self,$aln) = @_;
101 if ( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
102 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
103 "Bio::Align::PairwiseStatistics");
104 return 0;
105 } elsif ( $aln->num_sequences != 2 ) {
106 $self->throw("Only pairwise calculations supported. Found ".
107 $aln->num_sequences." sequences in alignment\n");
109 my $L = $aln->length - $self->number_of_gaps($aln);
110 return $L;
113 =head2 number_of_differences
115 Title : number_of_differences
116 Usage : my $nd = $stat->number_of_distances($aln);
117 Function: Returns the number of differences between two sequences
118 Returns : integer
119 Args : L<Bio::Align::AlignI>
122 =cut
124 sub number_of_differences{
125 my ($self,$aln) = @_;
126 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
127 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
128 "Bio::Align::PairwiseStatistics");
129 } elsif ( $aln->num_sequences != 2 ) {
130 $self->throw("Only pairwise calculations supported. Found ".
131 $aln->num_sequences." sequences in alignment\n");
133 my (@seqs);
134 foreach my $seq ( $aln->each_seq ) {
135 push @seqs, [ split(//,$seq->seq())];
137 my $firstseq = shift @seqs;
138 #my $secondseq = shift @seqs;
139 my $diffcount = 0;
140 for (my $i = 0;$i<$aln->length; $i++ ) {
141 next if ( $firstseq->[$i] =~ /^$GapChars$/ );
142 foreach my $seq ( @seqs ) {
143 next if ( $seq->[$i] =~ /^$GapChars$/ );
144 if( $firstseq->[$i] ne $seq->[$i] ) {
145 $diffcount++;
149 return $diffcount;
152 =head2 number_of_gaps
154 Title : number_of_gaps
155 Usage : my $nd = $stat->number_of_gaps($aln);
156 Function: Returns the number of gapped positions among sequences in alignment
157 Returns : integer
158 Args : L<Bio::Align::AlignI>
161 =cut
163 sub number_of_gaps{
164 my ($self,$aln) = @_;
165 if ( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
166 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
167 "Bio::Align::PairwiseStatistics");
168 } elsif ( $aln->num_sequences != 2 ) {
169 $self->throw("Only pairwise calculations supported. Found ".
170 $aln->num_sequences." sequences in alignment\n");
172 my $gapline = $aln->gap_line;
173 # this will count the number of '-' characters
174 return $gapline =~ tr/-/-/;
178 =head2 score_nuc
180 Title : score_nuc
181 Usage : my $score = $stat->score_nuc($aln);
183 my $score = $stat->score_nuc(
184 -aln =>$aln,
185 -match => 1,
186 -mismatch => -1,
187 -gap_open => -1,
188 -gap_ext => -1
190 Function: Calculate the score of an alignment of 2 nucleic acid sequences. The
191 scoring parameters can be specified. Otherwise the blastn default
192 parameters are used: match = 2, mismatch = -3, gap opening = -5, gap
193 extension = -2
194 Returns : alignment score (number)
195 Args : L<Bio::Align::AlignI>
196 match score [optional]
197 mismatch score [optional]
198 gap opening score [optional]
199 gap extension score [optional]
201 =cut
203 sub score_nuc {
204 my ($self, @args) = @_;
205 my ( $aln, $match, $mismatch, $gap_open, $gap_ext) = $self->_rearrange( [qw(
206 ALN MATCH MISMATCH GAP_OPEN GAP_EXT)], @args );
207 if ( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
208 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
209 "Bio::Align::PairwiseStatistics");
210 } elsif ( $aln->num_sequences != 2 ) {
211 $self->throw("Only pairwise calculations supported. Found ".
212 $aln->num_sequences." sequences in alignment\n");
214 my $seq1 = $aln->get_seq_by_pos(1);
215 my $seq2 = $aln->get_seq_by_pos(2);
216 if (! ( ($seq1->alphabet eq 'dna' || $seq1->alphabet eq 'rna') &&
217 ($seq2->alphabet eq 'dna' || $seq2->alphabet eq 'rna') )) {
218 $self->throw("Can only score nucleic acid alignments");
220 $match ||= 2; # Blastn scoring defaults
221 $mismatch ||= -3;
222 $gap_open ||= -5;
223 $gap_ext ||= -2;
224 my $score = 0;
225 my $prevres1 = '-';
226 my $prevres2 = '-';
227 for (my $pos = 1 ; $pos <= $aln->length ; $pos++) {
228 my $res1 = $seq1->subseq($pos, $pos);
229 my $res2 = $seq2->subseq($pos, $pos);
230 if (!($res1 eq '-' || $res2 eq '-')) { # no gap
231 if ($res1 eq $res2) { # same residue
232 $score += $match;
233 } else { # other residue
234 $score += $mismatch;
236 } else { # open or ext gap?
237 my $open = 0;
238 if (!($res1 eq '-' && $res2 eq '-')) { # exactly one gap
239 my $prevres = $prevres1;
240 $prevres = $prevres2 if $res2 eq '-';
241 $open = 1 unless $prevres eq '-';
242 } else { # 2 gaps
243 $open = 1 unless $prevres1 eq '-' && $prevres2 eq '-';
245 if ($open) {
246 $score += $gap_open; # gap opening
247 } else {
248 $score += $gap_ext; # gap extension
251 $prevres1 = $res1;
252 $prevres2 = $res2;
254 return $score;