modified: Makefile
[GalaxyCodeBases.git] / perl / etc / radseq / bcf2ped.pl
blob4cb8fd1095b6a81bb940335cf396d3ad3cf2d484
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 Purpose: Read bcf, get tped for p-link
6 Notes: rad2marker is deprecated.
7 =cut
8 use strict;
9 use warnings;
10 use Data::Dump qw(ddx);
11 use Galaxy::IO;
12 use Galaxy::SeqTools;
13 use Data::Dumper;
15 die "Usage: $0 <tfam file> <bcgv bcf> <min Sample Count> <Dominant/Recessive> <out> [sample list]\n" if @ARGV<5;
16 my $tfamfs=shift;
17 my $bcfs=shift;
18 my $minSampleCnt=shift; # 16 for 16 samples in paper
19 my $DomRec=shift;
20 my $outfs=shift;
21 my $sampleList=shift;
23 $DomRec='D' if $DomRec =~ /^D/i;
24 $DomRec='R' if $DomRec =~ /^R/i;
26 my (%Stat,$t);
27 open OP,'>',$outfs.'.tped' or die "Error opening $outfs.tped : $!\n";
28 open OPA,'>',$outfs.'.case.tped' or die $!;
29 open OPO,'>',$outfs.'.control.tped' or die $!;
31 open OM,'>',$outfs.'.MinorAllele' or die "Error opening $outfs.MinorAllele : $!\n";
32 open OD,'>',$outfs.'.dict' or die "Error opening $outfs.dict : $!\n";
33 if ($tfamfs ne $outfs.'.tfam') {
34 open OF,'>',$outfs.'.tfam' or die "Error opening $outfs.tfam : $!\n";
35 open OFA,'>',$outfs.'.case.tfam' or die $!;
36 open OFO,'>',$outfs.'.control.tfam' or die $!;
37 } else {die;}
38 open O,'>',$outfs.'.bcf2pedlog' or die "Error opening $outfs.bcf2pedlog : $!\n";
39 $t = "# In:[$bcfs], Out:[$outfs]. minSampleCnt=$minSampleCnt Dominant/Recessive:[$DomRec]\n";
40 print O $t;
41 print $t;
43 #open OV,'>',$outfs.'.vcf' or die "Error opening $outfs.vcf : $!\n";
44 my %selectedSamples;
45 if (defined $sampleList) {
46 open L,'<',$sampleList or die;
47 while (<L>) {
48 chomp;
49 ++$selectedSamples{$_};
53 my (%Pheno,@tfamSamples,%tfamDat,%tfamSampleFlag,%inFamily,%ISinFamily);
54 open L,'<',$tfamfs or die;
55 while (<L>) {
56 next if /^(#|$)/;
57 #print OF $_;
58 chomp;
59 my ($family,$ind,$P,$M,$sex,$pho) = split /\t/;
60 if (defined $sampleList) {
61 next unless exists $selectedSamples{$ind};
63 $tfamDat{$ind} = $_."\n";
64 if ($ind =~ s/^~//) {
65 $tfamSampleFlag{$ind} = 0;
66 } elsif ($pho == 1 or $pho == 2) {
67 $tfamSampleFlag{$ind} = $pho;
68 } else { die; } # $pho can only be 1 or 2
69 # $tfamSampleFlag{$ind} = 3 if $ind !~ /^GZXJ/;
70 push @tfamSamples,$ind;
71 push @{$inFamily{$family}},$ind;
72 $Pheno{$ind} = $pho; # disease phenotype (1=unaff/ctl, 2=aff/case, 0=miss)
74 for my $ind (@tfamSamples) {
75 if ($tfamSampleFlag{$ind} == 0) {
76 next;
77 } elsif ($tfamSampleFlag{$ind}) {
78 if ($tfamSampleFlag{$ind} & 1) {
79 print OFO $tfamDat{$ind};
81 if ($tfamSampleFlag{$ind} & 2) {
82 print OFA $tfamDat{$ind};
84 } else { die; }
85 print OF $tfamDat{$ind};
87 close L;
88 close OF;
89 close OFA;
90 close OFO;
91 for my $family (keys %inFamily) {
92 if (scalar @{$inFamily{$family}} == 1) {
93 delete $inFamily{$family};
94 } else {
95 $ISinFamily{$_}=$family for @{$inFamily{$family}};
98 ddx \%inFamily; ddx \%ISinFamily;
100 my $cmd = 'bcftools view --types snps -m2 -M2';
101 if (defined $sampleList) {
102 $cmd .= " -S $sampleList";
104 print O "Open:[$cmd] [$bcfs]\n";
105 print "Open:[$cmd] [$bcfs]\n";;
106 my $th = openpipe($cmd,$bcfs);
107 my (@Samples,@Parents);
108 while (<$th>) {
109 #print OV $_;
110 next if /^##/;
111 chomp;
112 my @data = split /\t/;
113 if ($data[0] eq '#CHROM') {
114 @Samples = map {
115 if (/^(\w+)_all\.bam$/) {
116 $_ = $1;
117 } elsif (/\.\/bam0\//) {
118 my $t=(split /_/)[-1]; $t=(split /\./,$t)[0]; $_=$t;
119 } else {
120 #my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);
121 s/_2$//;
122 my $t = $_;
124 } splice @data,9;
125 # ../5.bam_0000210210_merged/d1_4_merged.D4.JHH001.XTU.sort.rmdup.bam
126 #@Parents = grep(!/^GZXJ/,@Samples);
127 @Parents=();
128 last;
132 die "Sample Count in tfam and bcf not match !\n" if @tfamSamples != @Samples;
133 my @res;
134 for (my $i = 0; $i < @Samples; $i++) {
135 push @res,join(',',$tfamSamples[$i],$tfamSampleFlag{$Samples[$i]});
136 die "Samples in tfam and bcf not match ! [$tfamSamples[$i] <> $Samples[$i]]\n" if $tfamSamples[$i] ne $Samples[$i];
137 } # http://stackoverflow.com/questions/2591747/how-can-i-compare-arrays-in-perl
138 print O "# Samples: [",join('],[',@res),"]\t# 1=control, 2=case, 0=drop\n# Parents: [",join('],[',@Parents),"]\n";
139 warn "Samples: (1=control, 2=case, 0=drop)\n[",join("]\n[",@res),"]\nParents: [",join('],[',@Parents),"]\n";
141 my (%ChrID2Num);
142 sub genChrNumber($) {
143 my $id = $_[0];
144 if (exists $ChrID2Num{$id}) {
145 return $ChrID2Num{$id};
146 } else {
147 my $lastCnt = scalar keys %ChrID2Num;
148 $ChrID2Num{$id} = $lastCnt+1;
149 return $ChrID2Num{$id};
153 while (<$th>) {
154 next if /^#/;
155 chomp;
156 my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, @data) = split /\t/;
157 ++$Stat{'VCF_In'}; # also as rs#
158 my @Bases = split /\s*,\s*/,"$REF, $ALT";
159 my @groups = split(/\s*;\s*/, $INFO);
160 my (%INFO,$name);
161 # if ($groups[0] eq 'INDEL') {
162 # $INFO{'Type'} = 'INDEL';
163 # shift @groups;
164 # } else {
165 # $INFO{'Type'} = 'SNP';
167 for my $group (@groups) {
168 my ($tag,$value) = split /=/,$group;
169 #warn "- $group -> $tag,$value\n";
170 my @values = split /,/,$value;
171 if (@values == 1) {
172 $INFO{$tag}=$values[0];
173 } else {
174 $INFO{$tag}=\@values;
177 my (%GT,%GTcnt);
178 my @FMT = split /:/,$FORMAT;
179 for my $s (@Samples) {
180 my $dat = shift @data or die "bcf file error.";
181 my @dat = split /:/,$dat;
182 for my $i (@FMT) {
183 $GT{$s}{$i} = shift @dat;
186 my $SPcnt = 0;
187 my (%GTitemCnt,$Mut,@plinkGT);
188 for (@Samples) {
189 if ($GT{$_}{'DP'} > 0 and $GT{$_}{'GQ'} >= 20) {
190 my $gt = $GT{$_}{'GT'};
191 ++$GTcnt{$gt};
192 my @GT = split /[\/|]/,$gt;
193 ++$SPcnt;
194 if ($DomRec eq 'R') {
195 if ($Pheno{$_} == 2) {
196 ++$GTitemCnt{$_} for @GT;
198 } elsif ($DomRec eq 'D') {
199 if (exists $ISinFamily{$_}) {
200 ++$GTitemCnt{$_} for @GT;
202 } else {die;}
203 $GT{$_}{'GTp'} = join ' ',map($Bases[$_], @GT);
204 #$GT{$_}{'O_K'} = 1;
205 } else {
206 $GT{$_}{'GTp'} = '0 0';
207 #$GT{$_}{'O_K'} = 0;
209 push @plinkGT,$GT{$_}{'GTp'};
211 if ($DomRec eq 'R') {
212 ($Mut) = sort { $GTitemCnt{$b} <=> $GTitemCnt{$a} } keys %GTitemCnt; # Desc
213 } elsif ($DomRec eq 'D') {
214 ($Mut) = sort { $GTitemCnt{$a} <=> $GTitemCnt{$b} } keys %GTitemCnt; # Asc
215 } else {die;}
216 unless (defined $Mut) {
217 ++$Stat{'VCF_noAffInd_Skipped'};
218 next;
219 #$Mut = -1;
221 unless ($Mut) {
222 ++$Stat{'VCF_noMUT_Count'};
223 #next;
225 #$Mut = $Bases[$Mut];
226 =pod
227 ++$Stat{'GTcnt'}{$INFO{'FQ'} <=> 0}{scalar(keys %GTcnt)};
228 ddx $Stat{'GTcnt'};
229 ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%INFO,\%GT if scalar(keys %GTcnt) > 1 and $INFO{'FQ'} < 0 and $SPcnt>2;
230 # rad2marker.pl:135: {
231 # -1 => { 1 => 2850, 2 => 526, 3 => 37 },
232 # 1 => { 1 => 8, 2 => 2507, 3 => 1792 },
234 =cut
235 #warn "$CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO\n";
236 #ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%GTcnt,\%INFO,\%GT,\%GTitemCnt,$Mut;
237 if ($QUAL<20 or scalar(keys %GTcnt)<2 or $SPcnt < $minSampleCnt) { # No 'FQ' in 'INFO' for v1.1
238 ++$Stat{'VCF_Skipped'};
239 next;
241 $Mut = $Bases[$Mut];
242 my $SNPid = "r".$Stat{'VCF_In'};
243 my $ChrID = genChrNumber($CHROM);
244 #print OP join("\t",$1,$SNPid,0,$POS,@plinkGT),"\n";
245 print OM join("\t",$SNPid,$Mut),"\n";
246 print OD join("\t",${CHROM},${POS},$SNPid),"\n";
247 my (@GTall,@GTcase,@GTcontrol);
248 for my $i (0 .. $#tfamSamples) {
249 my $ind = $tfamSamples[$i];
250 if ($tfamSampleFlag{$ind} == 0) {
251 next;
252 } else {
253 push @GTall,$plinkGT[$i];
254 if ($tfamSampleFlag{$ind}) {
255 # if ($tfamSampleFlag{$ind} == 3) {
256 # $plinkGT[$i] = '0 0';
258 if ($tfamSampleFlag{$ind} & 1) {
259 push @GTcontrol,$plinkGT[$i];
261 if ($tfamSampleFlag{$ind} & 2) {
262 push @GTcase,$plinkGT[$i];
264 } else { die; }
267 print OP join("\t",$ChrID,$SNPid,0,$POS,@GTall),"\n";
268 print OPA join("\t",$ChrID,$SNPid,0,$POS,@GTcase),"\n";
269 print OPO join("\t",$ChrID,$SNPid,0,$POS,@GTcontrol),"\n";
270 ++$Stat{'Marker_Out'};
271 #print OV $_,"\n";
273 close $th;
275 close OP;
276 close OPA;
277 close OPO;
278 close OM;
279 close OD;
280 #close OV;
282 print O Dumper(\%ChrID2Num);
283 print O Dumper(\%Stat);
284 close O;
286 ddx \%Stat;
288 print "[Prepare $outfs.phe.] And then:\np-link --tfile $outfs --reference-allele $outfs.MinorAllele --fisher --out ${outfs}P --model --cell 0 --allow-no-sex\n--pheno $outfs.phe --all-pheno [screen log will be saved by p-link itselt]\n";
289 __END__
291 bcftools view -s^FCAP114 -m2 mpileup_20150321HKT071334.vcf.gz \
292 | bcftools norm -Df ../ref/Felis_catus80_chr.fa -c e -m+both \
293 | bcftools filter -sLowQual -e'%QUAL<10' \
294 | bcftools filter -m+ -sDepthHigh -e'DP>650' \
295 | bcftools filter -m+ -sDepthLow -e'DP<2' \
296 | bcftools filter -m+ -sBadSites -e'%QUAL<10 && RPB<0.1' \
297 | tee >(bcftools view -Oz -o filter_20150321HKT071334.vcf.gz) \
298 | bcftools view -f .,PASS -Oz -o filtered.vcf.gz &
300 bcftools query -f '%CHROM,%POS\t%REF|%ALT|%QUAL\t%DP[\t%SAMPLE=%GT,%DP]\n' filtered.vcf.gz |les
302 ./bcf2ped.pl ~/work/catbtail/merged/kinkcats.tfam ~/work/catbtail/merged/mpileup_20150321HKT071334.vcf.gz 18 D xxxxx 2>xxxxx
304 ./bcf2ped.pl kinkcats.tfam mpileup_20150403.vcf.filtered.gz 14 D outA14 sample.a
306 ./dojoin.pl outA14.dict 3 outA14P.model.DOM 3 tmp.DOM
308 awk '{print $3" "$1" "$2" "$4" "$6" "$7" "$8" "$9" "$10" "$11}' tmp.DOM.out > tmp.DOM.out.j
310 cat tmp.DOM.out.j|perl -lane '@a=split /\//,"$F[7]/$F[8]";$sum=$a[0]+$a[1]+$a[2]+$a[3];
311 print join("\t",@F,$sum)' |sort -k2,2 -k3,3n > out14P.DOM.npa
313 === Old ===
314 grep -hv \# radseq.gt > radseq.tfam
316 ./bcf2bed.pl radseq.tfam radseq.bcgv.bcf radseq 2>&1 | tee radseq.pedlog
318 grep REC radseq.p.snow.model > radseq.p.snow.model.REC
319 grep DOM radseq.p.snow.model > radseq.p.snow.model.DOM
320 sort -nk8 radseq.p.snow.model.REC > radseq.p.snow.model.REC.sortnk8 &
321 sort -nk8 radseq.p.snow.model.DOM > radseq.p.snow.model.DOM.sortnk8 &
322 bcftools view -I radseq.bcgv.bcf |grep -v \# |cat -n > radseq.bcgv.bcf.rs &
324 bcftools view tigers.bcgv.bcf|grep -v \## > tigers.bcgv.vcf &
326 join -1 3 -2 2 <(sort -k3 radall.dict) <(sort -k2 radallP.model) > tmp &
327 #sed -n '75,380 p' tmp.s.REC.nk3.scaffold75
329 cat tmp.s.REC|perl -lane '@a=split /\//,"$F[7]/$F[8]";$sum=$a[0]+$a[1]+$a[2]+$a[3];
330 $theta=($a[1]+$a[2])/($sum*2);
331 $p=((1-$theta)**(2*$sum-($a[1]+$a[2])))*($theta**($a[1]+$a[2]))/(0.5**(2*$sum));
332 $LOD=int(0.5+log($p)*1000/log(10))/1000;
333 print join("\t",@F,$sum,int(0.5+$theta*100000)/100000,int($LOD),$LOD)' > rec.pa
335 sort -nk13 -k2 -nk3 rec.pa > rec.pas
337 grep -P "scaffold75\t" rec.pas > scaffold75.pas
338 sort -nk3 scaffold75.pas > scaffold75.pas.nk3
339 perl -lane 'print if $F[10]>16' scaffold75.pas.nk3 > scaffold75.pas.nk3.16
341 grep -P "scaffold1458\t" rec.pas > scaffold1458.pas
342 sort -nk3 scaffold1458.pas > scaffold1458.pas.nk3
343 perl -lane 'print if $F[10]>16' scaffold1458.pas.nk3 > scaffold1458.pas.nk3.16
345 -------------------
347 ./cat/dojoin.pl radall.dict 3 radallP.model.REC 3 tmp.REC
349 awk '{print $3" "$1" "$2" "$4" "$6" "$7" "$8" "$9" "$10" "$11}' tmp.REC.out > tmp.REC.out.j
351 cat tmp.REC.out.j|perl -lane '@a=split /\//,"$F[7]/$F[8]";$sum=$a[0]+$a[1]+$a[2]+$a[3];
352 $theta=($a[1]+$a[2])/($sum*2);
353 $p=((1-$theta)**(2*$sum-($a[1]+$a[2])))*($theta**($a[1]+$a[2]))/(0.5**(2*$sum));
354 $LOD=int(0.5+log($p)*1000/log(10))/1000;
355 print join("\t",@F,$sum,int(0.5+$theta*100000)/100000,int($LOD),$LOD)' |sort -k2,2 -k3,3n > rec.npa
357 sort -k13,13n -k2,2 -k3,3n rec.npa > rec.npas
359 Tue Aug 13 14:44:52 CST 2013
361 find *.model|while read a;do grep REC $a > $a.REC.0;done
362 find *.model|while read a;do perl -lane 'if (@F<8) {splice @F,3,0,".";} die if @F<8; print join("\t",@F)' $a.REC.0 > $a.REC;done
363 find *.model|while read a;do sort -k 8,8n -k 2,2 $a.REC > $a.REC.nk8;done
364 find *.dict|perl -lane 's/\.dict//;system "./dojoin.pl $_.dict 3 ${_}P.model.REC.nk8 2 ${_}R.REC"'
365 find *.dict|perl -lane 's/\.dict//;print "awk ",chr(39),"BEGIN {OFS=\"\\t\"} {print \$3,\$1,\$2,\$4,\$6,\$7,\$8,\$9,\$10,\$11}",chr(39)," ${_}R.REC.out > ${_}R.REC.out.j" '