modified: nfig1.py
[GalaxyCodeBases.git] / python / mixture / vcf2tped.pl
blobcd85b6140e8bf58a5eebb229db7663b7604db0df
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
5 use Data::Dump qw(ddx);
7 my $Usage = "Usage: $0 <vcf.gz> <out prefix>\n";
8 die $Usage if @ARGV < 1;
10 my ($filename,$outp) = @ARGV;
11 my $cmd = "bcftools view -U -m2 -i '%QUAL>=40 & MIN(FMT/GQ)>20 & GT[2]=\"het\"' -v snps $filename |bcftools view -e 'FMT/DP=\".\"'| bcftools query -f '%CHROM\t%POS\t%REF,%ALT\t%QUAL[\t%TGT:%AD:%GQ]\n'";
12 my @SampleIDs;
13 open(SID,'-|',"bcftools query -l $filename") or die "Error opening [$filename]: $!\n";
14 while(<SID>) {
15 chomp;
16 push @SampleIDs,$_;
18 close SID;
19 ddx \@SampleIDs;
21 my $theKiller = 'T3C';
22 my $theVictim = 'T10C';
23 my @addedSID = qw(Rsin);
24 my @AllSID = (@SampleIDs,@addedSID);
26 open O,'>',"$outp.tfam" or die "[$outp.tfam]:$!\n";
27 for (@AllSID) {
28 my $FID = 'F';
29 my $IID = $_;
30 print O join(' ',$FID,$IID,0,0,0,0),"\n";
31 # https://www.cog-genomics.org/plink/1.9/formats#fam
33 close O;
35 my %COUNTING;
36 sub doCnt($) {
37 my $s = $_[0];
38 ++$COUNTING{$s};
39 #print " $s";
41 sub mergeGT($) {
42 my $hashref = $_[0];
43 my @GTs = sort keys %{$hashref};
44 s/\./0/ for @GTs;
45 if (scalar @GTs ==1) {
46 push @GTs,@GTs;
48 return join(' ',@GTs);
50 my $vCounter=0;
51 open O,'>',"$outp.tped" or die "[$outp.tped]:$!\n";
52 open(IN,"-|",$cmd) or die "Error opening [$filename]: $!\n";
53 while (<IN>) {
54 chomp;
55 my ($Chrom,$Pos,$RefAlt,$Qual,@GTs) = split /\t/,$_;
56 next if $Chrom =~ /_/;
57 #print "$_\n";
58 my (%GTs,%GTstr,%GTped);
59 @GTstr{@SampleIDs} = @GTs;
60 #ddx \%GTstr;
61 for my $k (keys %GTstr) {
62 my ($sGT,$sAD,$sGQ) = split /:/,$GTstr{$k};
63 my @aGT = split /[|\/]/,$sGT;
64 my @aAD = split /\,/,$sAD;
65 my %counter;
66 for (@aGT) {
67 ++$counter{$_};
69 #my @result = sort(keys %counter);
70 $GTs{$k} = \%counter;
72 $GTs{$addedSID[0]} = {%{$GTs{'mixed'}}};
73 #$GTs{$addedSID[1]} = {%{$GTs{'mixed'}}};
74 for (keys %{$GTs{$theVictim}}) {
75 if (exists $GTs{$addedSID[0]}{$_}) {
76 delete $GTs{$addedSID[0]}{$_};
79 for my $k (keys %GTs) {
80 $GTped{$k} = mergeGT($GTs{$k});
82 if (length($GTped{$addedSID[0]})==0) {
83 doCnt('0same');
84 next;
86 if (length($RefAlt)>3) {
87 doCnt('1triallelic');
88 next;
90 #ddx \%GTs;
91 #ddx \%GTped;
92 my $ChrCode = $Chrom;
93 $ChrCode =~ s/^chr//i;
94 ++$vCounter;
95 my $VarID = 'p'.$vCounter.$RefAlt;
96 $VarID =~ s/\,//g;
97 print O join("\t",$ChrCode,$VarID,0,$Pos);
98 for my $k (@AllSID) {
99 print O "\t",$GTped{$k};
101 print O "\n";
103 close IN;
104 # https://www.cog-genomics.org/plink/1.9/formats#tped
105 # https://www.cog-genomics.org/plink/1.9/formats#map
106 # Chromosome code. PLINK 1.9 also permits contig names here, but most older programs do not.
107 # Variant identifier
108 # Position in morgans or centimorgans (optional; also safe to use dummy value of '0')
109 # Base-pair coordinate
110 close O;
111 ddx \%COUNTING;
113 print "plink --noweb --tfile $outp --make-bed --out xxx\n";