modified: nfig1.py
[GalaxyCodeBases.git] / python / mixture / parse_vcf.pl
blobefa90fac4f7d361b07085e58d99f6cfe93e885d2
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
5 use Data::Dump qw(ddx);
7 my $Usage = "Usage: $0 <vcf.gz>\n";
8 die $Usage if @ARGV < 1;
10 my ($filename) = @ARGV;
11 #my $cmd = "bcftools view -m3 -v snps $filename | bcftools query -f '%CHROM\t%POS\t%REF,%ALT\t%QUAL[\t%TGT:%AD:%GQ]\n'";
12 my $cmd = "bcftools view -U -m2 -i '%QUAL>=40 & MIN(FMT/GQ)>20 & GT[2]=\"het\"' -v snps $filename |bcftools view -e 'FMT/DP=\".\"'| bcftools query -f '%CHROM\t%POS\t%REF,%ALT\t%QUAL[\t%TGT:%AD:%GQ]\n'";
13 =pod
14 chr1 102951 C,T 1575.58 C/C/C/T:23,6:25 C/C/C/T:88,27:67 C/C/C/T:111,33:93
15 chr1 633963 C,T 6936.96 T/T/T/T:5,242:99 C/C/C/C:250,0:99 C/C/C/T:172,77:75
16 chr1 889636 A,G,T 1356.65 A/A/G/T:14,3,8:21 A/A/G/T:36,20,10:48 A/A/G/T:50,23,18:75
17 chr1 1099396 G,A,C 214.66 G/G/G/G:22,1,0:24 G/G/A/C:67,5,9:58 G/G/A/C:89,6,9:24
18 =cut
19 my @SampleIDs;
20 open(SID,'-|',"bcftools query -l $filename") or die "Error opening [$filename]: $!\n";
21 while(<SID>) {
22 chomp;
23 push @SampleIDs,$_;
25 close SID;
26 ddx \@SampleIDs;
27 # ["T10C", "T3C", "mixed"] => v,k,m
29 sub mergeGT($) {
30 my $hashref = $_[0];
31 my @GTs = sort keys %{$hashref};
32 return join('',@GTs);
35 my %COUNTING;
36 sub doCnt($) {
37 my $s = $_[0];
38 ++$COUNTING{$s};
39 print " $s";
42 open(IN,"-|",$cmd) or die "Error opening [$filename]: $!\n";
43 while (<IN>) {
44 chomp;
45 my ($Chrom,$Pos,$RefAlt,$Qual,@GTs) = split /\t/,$_;
46 next if $Chrom =~ /_/;
47 print "$_\n";
48 my %GTs;
49 for my $i (0 .. $#SampleIDs) {
50 my ($sGT,$sAD,$sGQ) = split /:/,$GTs[$i];
51 my @aGT = split /[|\/]/,$sGT;
52 my @aAD = split /\,/,$sAD;
53 my %counter;
54 =pod
55 if ($SampleIDs[$i] eq 'mixed') {
56 ++$counter{$_} for @aGT;
57 } else {
58 ++$counter{$_} for @aGT[0,1];
60 =cut
61 for (@aGT) {
62 ++$counter{$_};
64 #my @result = sort(keys %counter);
65 $GTs{$SampleIDs[$i]} = \%counter;
67 $GTs{'__R'} = {%{$GTs{'mixed'}}};
68 for (keys %{$GTs{'T10C'}}) {
69 if (exists $GTs{'__R'}{$_}) {
70 delete $GTs{'__R'}{$_};
73 $GTs{'_V'} = mergeGT($GTs{'T10C'});
74 $GTs{'_K'} = mergeGT($GTs{'T3C'});
75 $GTs{'_M'} = mergeGT($GTs{'mixed'});
76 $GTs{'_R'} = mergeGT($GTs{'__R'});
77 ddx \%GTs;
78 my $str1 = $GTs{'_R'};
79 my $str2 = join('.+',split(//,$str1));
80 ++$COUNTING{'_All'};
81 if (length($GTs{'_M'})==1) {
82 doCnt('0M1');
83 } elsif (length($GTs{'_M'})==2) {
84 doCnt('0M2');
85 if ($GTs{'_R'} eq '') {
86 doCnt('1R0');
87 if ($GTs{'_M'} eq $GTs{'_V'}) {
88 doCnt('1R0e'); # M == V, het
89 } else {
90 doCnt('1R0nx');
92 } elsif (length($GTs{'_R'})==1) {
93 doCnt('1R1');
94 if (length($GTs{'_V'})==1) {
95 doCnt('1R1m');
96 if ($GTs{'_R'} eq $GTs{'_K'}) {
97 doCnt('2EQ'); # M-V=K
98 } elsif ($GTs{'_K'} =~ /$GTs{'_R'}/) {
99 doCnt('2LK'); # M-V ~= K
100 } else {
101 doCnt('2x'); # V=K, M:het
103 } else {
104 doCnt('1R1hx');
105 # V有第三碱基
107 } else {
108 doCnt('1Rx');
109 # V='.'
111 } else {
112 doCnt('0Mx');
114 print "\n";
115 =pod
116 if ($GTs{'_R'} eq '') {
117 ++$COUNTING{'0noR'};
118 } elsif ($GTs{'_R'} eq $GTs{'_K'}) {
119 ++$COUNTING{'1fulEQ'};
120 } elsif (length($GTs{'_R'})==1 and $GTs{'_K'} =~ /$GTs{'_R'}/) {
121 ++$COUNTING{'2inc'};
122 } elsif ($GTs{'_K'} =~ /$str2/) {
123 ++$COUNTING{'3incT'};
124 } else {
125 ++$COUNTING{'4ne'};
126 print STDERR "$_\n";
128 =cut
130 ddx \%COUNTING;
131 close IN;