4 CXGN::SNPs - deal with collections of SNPs
8 my $snps = CXGN::SNPs->new( { file => $file });
9 my $allele_feq = $snps->allele_freq();
10 my $dosage = $snps->calculate_snp_dosage($snp); # provide a CXGN::Genotype::SNP object
15 Lukas Mueller <lam87@cornell.edu>
34 has
'id' => ( isa
=> 'Str',
38 =head2 accessors accessions()
40 All the accessions parsed in the SNP
44 has
'accessions' => ( isa
=> 'ArrayRef',
48 =head2 accessors depth()
50 The total depth of the SNP (usually from VCF file)
54 has
'depth' => ( isa
=> 'Int',
58 =head2 accessors ignore_accessions()
60 A hashref with accessions that have not passed some quality check and that should be ignored.
64 has
'ignore_accessions' => (isa
=> 'HashRef',
68 =head2 accessors valid_accessions()
70 A ArrayRef of valid accessions (the complement set to ignore_accessions)
74 has
'valid_accessions' => (isa
=> 'ArrayRef',
79 has
'scores' => ( isa
=> 'HashRef',
83 #has 'bad_clones' => ( isa => 'ArrayRef',
87 =head2 accessors snps()
89 An HashRef of CXGN::Genotype::SNP objects that represent the SNP calls, with genotype ids as the hash key.
93 has
'snps' => ( isa
=> 'HashRef',
97 =head2 accessors maf()
99 The minor allele frequency of the SNP set.
103 has
'maf' => ( isa
=> 'Num',
105 default => sub { 0.999 },
108 =head2 accessors allele_freq()
110 The allele frequency of the SNP set.
114 has
'allele_freq' => ( isa
=> 'Num',
120 has
'chr' => ( isa
=> 'Str',
124 has
'position' => (isa
=> 'Int',
128 =head2 accessors ref_allele()
130 reference allele nucleotide
134 has
'ref_allele' => ( isa
=> 'Str',
138 =head2 accessors alt_allele()
140 alternative allele nucleotide
144 has
'alt_allele' => ( isa
=> 'Str',
150 has
'qual' => ( isa
=> 'Str',
154 has
'filter' => ( isa
=> 'Str',
158 has
'info' => ( isa
=> 'Str',
162 has
'format' => ( isa
=> 'Str',
166 has
'pAA' => ( isa
=> 'Num',
170 has
'pAB' => ( isa
=> 'Num',
174 has
'pBB' => ( isa
=> 'Num',
178 has
'raw' => ( isa
=> 'Str',
182 has
'error_probability' => ( isa
=> 'Num',
189 Usage: $ms->get_score('XYZ');
190 Desc: gets the marker score associated with XYZ
200 my $accession = shift;
202 if (exists($self->scores()->{$accession})) {
203 return $self->scores->{$accession};
206 warn "accession $accession has no associated score.\n";
215 my $invalid_snps = 0;
217 #print STDERR Dumper($self->snps());
218 foreach my $a (@
{$self->valid_accessions}) {
219 my $snp = $self->snps()->{$a};
220 if (!$snp) { print "Warning! No snp found for accession $a!!!!\n"; }
221 if ($snp && $snp->good_call()) {
228 return ($good_snps, $invalid_snps);
231 sub calculate_allele_frequency_using_counts
{
238 foreach my $k (@
{$self->valid_accessions()}) {
239 my $s = $self->snps->{$k};
241 print STDERR
"Accession $k has no valid SNPs associated with it. Skipping.\n";
245 $total_c1 += $s->ref_count();
246 $total_c2 += $s->alt_count();
249 if ($total_c1 + $total_c2 == 0) {
253 my $allele_freq = $total_c1 / ($total_c1 + $total_c2);
255 my $pAA = $allele_freq **2;
256 my $pAB = $allele_freq * (1 - $allele_freq) * 2 ;
257 my $pBB = (1 - $allele_freq) **2;
259 $self->allele_freq($allele_freq);
263 #print STDERR "$missing accessions had no valid snps associated with them for snp ".$self->id()."\n";
267 sub calculate_dosages
{
270 foreach my $k (keys %{$self->snps()}) {
271 my $s = $self->snps()->{$k};
272 $self->calculate_snp_dosage($s);
273 my $dosage = $s->dosage() || "no dosage";
274 #print STDERR "Dosage for SNP ".$s->id()."=$dosage for accession ".$s->accession()."\n";
276 #print STDERR Dumper($self->snps());
279 =head2 function calculate_snp_dosage()
281 $snps->calculate_snp_dosage($snp)
282 Calculates the SNP dosage of SNP $snp
283 $snp is a CXGN::Genotype::SNP object
287 sub calculate_snp_dosage
{
290 my $strict_dosage_filter = shift || 0;
292 my $error_probability = $self->error_probability();
294 #print STDERR "Working with strict dosage filter ($strict_dosage_filter) and error probability $error_probability\n";
295 my $c1 = $snp->ref_count();
296 my $c2 = $snp->alt_count();
298 #print STDERR "counts: $c1, $c2\n";
302 my $N1 = Math
::BigInt
->new($n);
303 my $N2 = Math
::BigInt
->new($n);
305 # print STDERR "$N1 bnok $c1 is: ". $N1->bnok($c1)."\n";
307 my $Nbnokc1 = $N1->bnok($c1)->numify();
308 my $Nbnokc2 = $N2->bnok($c2)->numify();
310 # print STDERR "NBnokc1: $Nbnokc1, NBnokc2 $Nbnokc2\n";
312 my $pDAA = $Nbnokc1 * ((1-$error_probability) ** $c1) * ($error_probability ** $c2);
313 my $pDAB = $Nbnokc1 * (0.5 ** $c1) * (0.5 ** $c2);
314 my $pDBB = $Nbnokc2 * ((1-$error_probability) ** $c2) * ($error_probability ** $c1);
316 # print STDERR "pDAA: $pDAA pDAB $pDAB, pDBB $pDBB\n";
318 my $pSAA = $pDAA * ($self->pAA || 0);
319 my $pSAB = $pDAB * ($self->pAB || 0);
320 my $pSBB = $pDBB * ($self->pBB || 0);
322 if ($pSAA + $pSAB + $pSBB == 0) {
326 my $x = 1 / ($pSAA + $pSAB + $pSBB);
328 my $dosage = ($pSAB + 2 * $pSBB) * $x;
330 if ($strict_dosage_filter) {
331 if ( ($dosage > 0.1) && ($dosage < 0.9) ) {
334 if ( ($dosage > 1.1) && ($dosage < 1.9) ) {
339 $snp->dosage($dosage);
344 =head2 function hardy_weinberg_filter
346 $snps->hardy_weinberg_filter();
347 returns a hash with the following keys:
348 monomorphic - 1 if the SNP is monomorphic
349 allele_freq - the allele frequency derived from counts
350 chi - the chi square value for hardy weinberg distribution
351 scored_marker_fraction - the fraction of markers that were successfully scored
352 heterozygote_count - the number of heterozygote SNPs in the set
357 sub hardy_weinberg_filter
{
359 my $dosages = shift; # ignored clones already removed
361 my %classes = ( AA
=> 0, AB
=> 0, BB
=> 0, NA
=> 0);
363 foreach my $d (@
$dosages) {
364 if (! defined($d) || $d eq "NA") {
367 elsif ( ($d >= 0) && ($d <= 0.1) ) {
370 elsif ( ($d >=0.9) && ($d <= 1.1) ) {
374 elsif (($d >=1.9) && ($d <= 2.0)) {
378 #print STDERR "Dosage outlier: $d\n";
383 print STDERR
"Class counts: AA: $classes{AA}, BB: $classes{BB}, AB: $classes{AB}, NA: $classes{NA}\n";
385 if ( ( ($classes{AA
} ==0) && ($classes{AB
} ==0)) ||
386 ( ($classes{BB
} == 0) && ($classes{AB
} ==0)) ) {
387 return ( monomorphic
=> 1);
390 my $total = $classes{AA
} + $classes{AB
} + $classes{BB
};
394 $score{total
} = $total;
396 $score{scored_marker_fraction
} = $total / (@
$dosages);
398 #print STDERR "AA $classes{AA}, AB $classes{AB}, BB $classes{BB} Total: $total\n";
399 my $allele_freq = (2 * $classes{AA
} + $classes{AB
}) / (2 * $total);
401 $score{heterozygote_count
} = $classes{AB
};
403 $score{allele_freq
} = $allele_freq;
405 my $expected = $allele_freq * (1-$allele_freq) * 2 * $total;
407 #print STDERR "TOTAL: $total\n";
408 my $x = ($classes{AB
} - $expected)**2 / $expected;
410 # only do the chi square test if the number of heterozygotes is larger than expected
412 if ($classes{AB
} > $expected) {
423 __PACKAGE__
->meta->make_immutable;