Merge pull request #311 from solgenomics/topic/accession_list_trial_search
[sgn.git] / lib / CXGN / SNPs.pm
blobf9fb860cc463cb73fa59f9018487361e2568de38
2 =head1 NAME
4 CXGN::SNPs - deal with collections of SNPs
6 =head1 SYNOPSYS
8 my $snps = CXGN::SNPs->new( { file => $file });
9 my $allele_feq = $snps->allele_freq();
10 my $dosage = $snps->calculate_snp_dosage($snp); # provide a CXGN::Genotype::SNP object
11 # etc
13 =head1 AUTHOR
15 Lukas Mueller <lam87@cornell.edu>
17 =head1 METHODS
19 =cut
21 package CXGN::SNPs;
23 use Moose;
25 use Data::Dumper;
26 use Math::BigInt;
28 =head2 accessors id()
30 The id of the SNP.
32 =cut
34 has 'id' => ( isa => 'Str',
35 is => 'rw',
38 =head2 accessors accessions()
40 All the accessions parsed in the SNP
42 =cut
44 has 'accessions' => ( isa => 'ArrayRef',
45 is => 'rw',
48 =head2 accessors depth()
50 The total depth of the SNP (usually from VCF file)
52 =cut
54 has 'depth' => ( isa => 'Int',
55 is => 'rw',
58 =head2 accessors ignore_accessions()
60 A hashref with accessions that have not passed some quality check and that should be ignored.
62 =cut
64 has 'ignore_accessions' => (isa => 'HashRef',
65 is => 'rw',
68 =head2 accessors valid_accessions()
70 A ArrayRef of valid accessions (the complement set to ignore_accessions)
72 =cut
74 has 'valid_accessions' => (isa => 'ArrayRef',
75 is => 'rw',
79 has 'scores' => ( isa => 'HashRef',
80 is => 'rw',
83 #has 'bad_clones' => ( isa => 'ArrayRef',
84 # is => 'rw',
85 # );
87 =head2 accessors snps()
89 An HashRef of CXGN::Genotype::SNP objects that represent the SNP calls, with genotype ids as the hash key.
91 =cut
93 has 'snps' => ( isa => 'HashRef',
94 is => 'rw',
97 =head2 accessors maf()
99 The minor allele frequency of the SNP set.
101 =cut
103 has 'maf' => ( isa => 'Num',
104 is => 'rw',
105 default => sub { 0.999 },
108 =head2 accessors allele_freq()
110 The allele frequency of the SNP set.
112 =cut
114 has 'allele_freq' => ( isa => 'Num',
115 is => 'rw',
120 has 'chr' => ( isa => 'Str',
121 is => 'rw',
124 has 'position' => (isa => 'Int',
125 is => 'rw',
128 =head2 accessors ref_allele()
130 reference allele nucleotide
132 =cut
134 has 'ref_allele' => ( isa => 'Str',
135 is => 'rw',
138 =head2 accessors alt_allele()
140 alternative allele nucleotide
142 =cut
144 has 'alt_allele' => ( isa => 'Str',
145 is => 'rw',
150 has 'qual' => ( isa => 'Str',
151 is => 'rw',
154 has 'filter' => ( isa => 'Str',
155 is => 'rw',
158 has 'info' => ( isa => 'Str',
159 is => 'rw',
162 has 'format' => ( isa => 'Str',
163 is => 'rw',
166 has 'pAA' => ( isa => 'Num',
167 is => 'rw',
170 has 'pAB' => ( isa => 'Num',
171 is => 'rw',
174 has 'pBB' => ( isa => 'Num',
175 is => 'rw',
178 has 'raw' => ( isa => 'Str',
179 is => 'rw',
182 has 'error_probability' => ( isa => 'Num',
183 is => 'rw',
184 default => 0.001,
187 =head2 get_score
189 Usage: $ms->get_score('XYZ');
190 Desc: gets the marker score associated with XYZ
191 Ret:
192 Args:
193 Side Effects:
194 Example:
196 =cut
198 sub get_score {
199 my $self = shift;
200 my $accession = shift;
202 if (exists($self->scores()->{$accession})) {
203 return $self->scores->{$accession};
205 else {
206 warn "accession $accession has no associated score.\n";
207 return undef;
211 sub snp_stats {
212 my $self = shift;
214 my $good_snps = 0;
215 my $invalid_snps = 0;
217 #print STDERR Dumper($self->snps());
218 foreach my $a (@{$self->valid_accessions}) {
219 my $snp = $self->snps()->{$a};
220 if (!$snp) { print "Warning! No snp found for accession $a!!!!\n"; }
221 if ($snp && $snp->good_call()) {
222 $good_snps++;
224 else {
225 $invalid_snps++;
228 return ($good_snps, $invalid_snps);
231 sub calculate_allele_frequency_using_counts {
232 my $self = shift;
234 my $total_c1 = 0;
235 my $total_c2 = 0;
237 my $missing = 0;
238 foreach my $k (@{$self->valid_accessions()}) {
239 my $s = $self->snps->{$k};
240 if (!$s) {
241 print STDERR "Accession $k has no valid SNPs associated with it. Skipping.\n";
242 $missing++;
243 next();
245 $total_c1 += $s->ref_count();
246 $total_c2 += $s->alt_count();
249 if ($total_c1 + $total_c2 == 0) {
250 return undef;
253 my $allele_freq = $total_c1 / ($total_c1 + $total_c2);
255 my $pAA = $allele_freq **2;
256 my $pAB = $allele_freq * (1 - $allele_freq) * 2 ;
257 my $pBB = (1 - $allele_freq) **2;
259 $self->allele_freq($allele_freq);
260 $self->pAA($pAA);
261 $self->pAB($pAB);
262 $self->pBB($pBB);
263 #print STDERR "$missing accessions had no valid snps associated with them for snp ".$self->id()."\n";
264 return $allele_freq;
267 sub calculate_dosages {
268 my $self = shift;
270 foreach my $k (keys %{$self->snps()}) {
271 my $s = $self->snps()->{$k};
272 $self->calculate_snp_dosage($s);
273 my $dosage = $s->dosage() || "no dosage";
274 #print STDERR "Dosage for SNP ".$s->id()."=$dosage for accession ".$s->accession()."\n";
276 #print STDERR Dumper($self->snps());
279 =head2 function calculate_snp_dosage()
281 $snps->calculate_snp_dosage($snp)
282 Calculates the SNP dosage of SNP $snp
283 $snp is a CXGN::Genotype::SNP object
285 =cut
287 sub calculate_snp_dosage {
288 my $self = shift;
289 my $snp = shift;
290 my $strict_dosage_filter = shift || 0;
292 my $error_probability = $self->error_probability();
294 #print STDERR "Working with strict dosage filter ($strict_dosage_filter) and error probability $error_probability\n";
295 my $c1 = $snp->ref_count();
296 my $c2 = $snp->alt_count();
298 #print STDERR "counts: $c1, $c2\n";
300 my $n = $c1 + $c2;
302 my $N1 = Math::BigInt->new($n);
303 my $N2 = Math::BigInt->new($n);
305 # print STDERR "$N1 bnok $c1 is: ". $N1->bnok($c1)."\n";
307 my $Nbnokc1 = $N1->bnok($c1)->numify();
308 my $Nbnokc2 = $N2->bnok($c2)->numify();
310 # print STDERR "NBnokc1: $Nbnokc1, NBnokc2 $Nbnokc2\n";
312 my $pDAA = $Nbnokc1 * ((1-$error_probability) ** $c1) * ($error_probability ** $c2);
313 my $pDAB = $Nbnokc1 * (0.5 ** $c1) * (0.5 ** $c2);
314 my $pDBB = $Nbnokc2 * ((1-$error_probability) ** $c2) * ($error_probability ** $c1);
316 # print STDERR "pDAA: $pDAA pDAB $pDAB, pDBB $pDBB\n";
318 my $pSAA = $pDAA * ($self->pAA || 0);
319 my $pSAB = $pDAB * ($self->pAB || 0);
320 my $pSBB = $pDBB * ($self->pBB || 0);
322 if ($pSAA + $pSAB + $pSBB == 0) {
323 return "NA";
326 my $x = 1 / ($pSAA + $pSAB + $pSBB);
328 my $dosage = ($pSAB + 2 * $pSBB) * $x;
330 if ($strict_dosage_filter) {
331 if ( ($dosage > 0.1) && ($dosage < 0.9) ) {
332 $dosage = "NA";
334 if ( ($dosage > 1.1) && ($dosage < 1.9) ) {
335 $dosage = "NA";
339 $snp->dosage($dosage);
341 return $dosage;
344 =head2 function hardy_weinberg_filter
346 $snps->hardy_weinberg_filter();
347 returns a hash with the following keys:
348 monomorphic - 1 if the SNP is monomorphic
349 allele_freq - the allele frequency derived from counts
350 chi - the chi square value for hardy weinberg distribution
351 scored_marker_fraction - the fraction of markers that were successfully scored
352 heterozygote_count - the number of heterozygote SNPs in the set
354 =cut
357 sub hardy_weinberg_filter {
358 my $self = shift;
359 my $dosages = shift; # ignored clones already removed
361 my %classes = ( AA => 0, AB => 0, BB => 0, NA => 0);
363 foreach my $d (@$dosages) {
364 if (! defined($d) || $d eq "NA") {
365 $classes{NA}++;
367 elsif ( ($d >= 0) && ($d <= 0.1) ) {
368 $classes{AA}++;
370 elsif ( ($d >=0.9) && ($d <= 1.1) ) {
371 $classes{AB}++;
374 elsif (($d >=1.9) && ($d <= 2.0)) {
375 $classes{BB}++;
377 else {
378 #print STDERR "Dosage outlier: $d\n";
383 print STDERR "Class counts: AA: $classes{AA}, BB: $classes{BB}, AB: $classes{AB}, NA: $classes{NA}\n";
385 if ( ( ($classes{AA} ==0) && ($classes{AB} ==0)) ||
386 ( ($classes{BB} == 0) && ($classes{AB} ==0)) ) {
387 return ( monomorphic => 1);
390 my $total = $classes{AA} + $classes{AB} + $classes{BB};
392 my %score = ();
394 $score{total} = $total;
396 $score{scored_marker_fraction} = $total / (@$dosages);
398 #print STDERR "AA $classes{AA}, AB $classes{AB}, BB $classes{BB} Total: $total\n";
399 my $allele_freq = (2 * $classes{AA} + $classes{AB}) / (2 * $total);
401 $score{heterozygote_count} = $classes{AB};
403 $score{allele_freq} = $allele_freq;
405 my $expected = $allele_freq * (1-$allele_freq) * 2 * $total;
407 #print STDERR "TOTAL: $total\n";
408 my $x = ($classes{AB} - $expected)**2 / $expected;
410 # only do the chi square test if the number of heterozygotes is larger than expected
412 if ($classes{AB} > $expected) {
413 $score{chi} = $x;
415 else {
416 $score{chi} = 0;
419 return %score;
423 __PACKAGE__->meta->make_immutable;