new file: makefile
[GalaxyCodeBases.git] / python / mixture / vcf2gt.pl
bloba96a8444d59630f916092570d33abc114328079b
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
5 use Data::Dump qw(ddx);
7 my $Usage = "Usage: $0 <vcf.gz> <out prefix>\n";
8 die $Usage if @ARGV < 1;
10 my ($filename,$outp) = @ARGV;
11 my $cmd = "bcftools view -U -m2 -i '%QUAL>=40 & MIN(FMT/GQ)>20' -v snps $filename |bcftools view -e 'FMT/DP=\".\"'| bcftools query -f '%CHROM\t%POS\t%REF,%ALT\t%QUAL[\t%TGT:%AD:%GQ]\n'";
12 my @SampleIDs;
13 open(SID,'-|',"bcftools query -l $filename") or die "Error opening [$filename]: $!\n";
14 while(<SID>) {
15 chomp;
16 push @SampleIDs,$_;
18 close SID;
19 ddx \@SampleIDs;
21 my $theKiller = 'T3C';
22 my $theVictim = 'T10C';
23 my $Mixture = 'mixed';
25 sub mergeGT($) {
26 my $hashref = $_[0];
27 my @GTs = sort keys %{$hashref};
28 s/\./0/ for @GTs;
29 if (scalar @GTs ==1) {
30 push @GTs,@GTs;
32 return join(' ',@GTs);
34 my $HetR = 0.2;
35 sub doCal($$) {
36 my ($mix,$vit) = @_;
37 my @mixGT = keys %{$mix};
38 my @vitGT = keys %{$vit};
39 my %mixGT = map { $_ => 1 } @mixGT;
40 my ($sameGT,$vitGT);
41 for (@vitGT) {
42 if (exists $mixGT{$_}) {
43 delete $mixGT{$_};
46 my @extraBase = keys %mixGT;
47 #ddx ($mix,$vit);
48 if (scalar @extraBase > 1) {
49 warn "[@extraBase]";
50 return -1;
52 my $Extradepth = $mix->{$extraBase[0]};
53 my $Totaldepth = 0;
54 for (values %{$mix}) {
55 $Totaldepth += $_;
57 my $depth0 = $Extradepth / $Totaldepth;
58 my $ret1 = $depth0 / (1-0.5*$HetR);
59 my $ret2 = $depth0 * (2/3) / 0.6;
60 #ddx [$ret1,$ret2,$Extradepth,$Totaldepth];
61 return ($ret1,$Extradepth,$Totaldepth);
64 open O,'>',"$outp.tsv" or die "[$outp.tsv]:$!\n";
65 my (%Calcu,%Results);
66 open(IN,"-|",$cmd) or die "Error opening [$filename]: $!\n";
67 while (<IN>) {
68 chomp;
69 my ($Chrom,$Pos,$RefAlt,$Qual,@GTs) = split /\t/,$_;
70 next if $Chrom =~ /_/;
71 #print "$_\n";
72 my (%GTs,%GTstr,%GTped);
73 @GTstr{@SampleIDs} = @GTs;
74 #ddx \%GTstr;
75 for my $k (keys %GTstr) {
76 my ($sGT,$sAD,$sGQ) = split /:/,$GTstr{$k};
77 my @aGT = split /[|\/]/,$sGT;
78 my @aAD = split /\,/,$sAD;
79 my %tmp;
80 @tmp{@aGT} = @aAD;
81 $GTs{$k} = \%tmp;
83 #ddx \%GTs;
84 my %Killer = %{$GTs{$theKiller}};
85 my %Victim = %{$GTs{$theVictim}};
86 my %Mixture = %{$GTs{$Mixture}};
87 next if scalar(keys %Mixture) != 2;
88 next if scalar(keys %Victim) != 1;
89 my $gt1 = mergeGT(\%Mixture);
90 next if $gt1 =~ /0/;
91 next if length($gt1)>3;
92 my $gt2 = mergeGT(\%Victim);
93 next if $gt2 =~ /0/;
94 next if length($gt2)>3;
95 #ddx [\%Killer,\%Victim,\%Mixture,$gt1,$gt2];
96 my ($ret,$Extradepth,$Totaldepth) = doCal(\%Mixture,\%Victim);
97 #my $ret = doCal(\%Mixture,\%Killer);
98 next if $ret == -1;
99 ++$Results{int($ret*100)/100};
100 $Calcu{'S'} += $ret;
101 $Calcu{'SS'} += $ret*$ret;
102 ++$Calcu{'N'};
103 #ddx \%Results,\%Calcu;
104 print O join("\t",$Chrom,$Pos,$Qual,$gt1,$gt2,$ret,$Extradepth,$Totaldepth),"\n";
107 my $mean = $Calcu{'S'} / $Calcu{'N'};
108 my $std = sqrt($Calcu{'SS'}/$Calcu{'N'} - $mean*$mean);
109 my $var = $std/$mean;
111 print O "# $mean ± $std, $var\n";
112 close O;
113 print "$mean ± $std, $var\n";
115 __END__
116 my $theKiller = 'T3C';
117 my $theVictim = 'T10C';
119 T3C: 85,982,846,136 bp
120 T10C:18,277,246,581 bp
122 K% = 82%
124 # { N => 3859, S => 1342.27179667765, SS => 676.187587538486 },
126 0.347828918548237 ± 0.232891755122676, 0.669558345219586
128 # { N => 370586, S => 175277.892782687, SS => 98608.7057863698 },
130 0.472974944500566 ± 0.205872025106518, 0.43527046728428