3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 Purpose: Read bcf, get tped for p-link
6 Notes: rad2marker is deprecated.
10 use Data
::Dump
qw(ddx);
15 die "Usage: $0 <tfam file> <bcgv bcf> <min Sample Count> <Dominant/Recessive> <out> [sample list]\n" if @ARGV<5;
18 my $minSampleCnt=shift; # 16 for 16 samples in paper
23 $DomRec='D' if $DomRec =~ /^D/i;
24 $DomRec='R' if $DomRec =~ /^R/i;
27 open OP
,'>',$outfs.'.tped' or die "Error opening $outfs.tped : $!\n";
28 open OPA
,'>',$outfs.'.case.tped' or die $!;
29 open OPO
,'>',$outfs.'.control.tped' or die $!;
31 open OM
,'>',$outfs.'.MinorAllele' or die "Error opening $outfs.MinorAllele : $!\n";
32 open OD
,'>',$outfs.'.dict' or die "Error opening $outfs.dict : $!\n";
33 if ($tfamfs ne $outfs.'.tfam') {
34 open OF
,'>',$outfs.'.tfam' or die "Error opening $outfs.tfam : $!\n";
35 open OFA
,'>',$outfs.'.case.tfam' or die $!;
36 open OFO
,'>',$outfs.'.control.tfam' or die $!;
38 open O
,'>',$outfs.'.bcf2pedlog' or die "Error opening $outfs.bcf2pedlog : $!\n";
39 $t = "# In:[$bcfs], Out:[$outfs]. minSampleCnt=$minSampleCnt Dominant/Recessive:[$DomRec]\n";
43 #open OV,'>',$outfs.'.vcf' or die "Error opening $outfs.vcf : $!\n";
45 if (defined $sampleList) {
46 open L
,'<',$sampleList or die;
49 ++$selectedSamples{$_};
53 my (%Pheno,@tfamSamples,%tfamDat,%tfamSampleFlag,%inFamily,%ISinFamily);
54 open L
,'<',$tfamfs or die;
59 my ($family,$ind,$P,$M,$sex,$pho) = split /\t/;
60 if (defined $sampleList) {
61 next unless exists $selectedSamples{$ind};
63 $tfamDat{$ind} = $_."\n";
65 $tfamSampleFlag{$ind} = 0;
66 } elsif ($pho == 1 or $pho == 2) {
67 $tfamSampleFlag{$ind} = $pho;
68 } else { die; } # $pho can only be 1 or 2
69 # $tfamSampleFlag{$ind} = 3 if $ind !~ /^GZXJ/;
70 push @tfamSamples,$ind;
71 push @
{$inFamily{$family}},$ind;
72 $Pheno{$ind} = $pho; # disease phenotype (1=unaff/ctl, 2=aff/case, 0=miss)
74 for my $ind (@tfamSamples) {
75 if ($tfamSampleFlag{$ind} == 0) {
77 } elsif ($tfamSampleFlag{$ind}) {
78 if ($tfamSampleFlag{$ind} & 1) {
79 print OFO
$tfamDat{$ind};
81 if ($tfamSampleFlag{$ind} & 2) {
82 print OFA
$tfamDat{$ind};
85 print OF
$tfamDat{$ind};
91 for my $family (keys %inFamily) {
92 if (scalar @
{$inFamily{$family}} == 1) {
93 delete $inFamily{$family};
95 $ISinFamily{$_}=$family for @
{$inFamily{$family}};
98 ddx \
%inFamily; ddx \
%ISinFamily;
100 my $cmd = 'bcftools view --types snps -m2 -M2';
101 if (defined $sampleList) {
102 $cmd .= " -S $sampleList";
104 print O
"Open:[$cmd] [$bcfs]\n";
105 print "Open:[$cmd] [$bcfs]\n";;
106 my $th = openpipe
($cmd,$bcfs);
107 my (@Samples,@Parents);
112 my @data = split /\t/;
113 if ($data[0] eq '#CHROM') {
115 if (/^(\w+)_all\.bam$/) {
117 } elsif (/\.\/bam0\
//) {
118 my $t=(split /_/)[-1]; $t=(split /\./,$t)[0]; $_=$t;
120 #my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);
125 # ../5.bam_0000210210_merged/d1_4_merged.D4.JHH001.XTU.sort.rmdup.bam
126 #@Parents = grep(!/^GZXJ/,@Samples);
132 die "Sample Count in tfam and bcf not match !\n" if @tfamSamples != @Samples;
134 for (my $i = 0; $i < @Samples; $i++) {
135 push @res,join(',',$tfamSamples[$i],$tfamSampleFlag{$Samples[$i]});
136 die "Samples in tfam and bcf not match ! [$tfamSamples[$i] <> $Samples[$i]]\n" if $tfamSamples[$i] ne $Samples[$i];
137 } # http://stackoverflow.com/questions/2591747/how-can-i-compare-arrays-in-perl
138 print O
"# Samples: [",join('],[',@res),"]\t# 1=control, 2=case, 0=drop\n# Parents: [",join('],[',@Parents),"]\n";
139 warn "Samples: (1=control, 2=case, 0=drop)\n[",join("]\n[",@res),"]\nParents: [",join('],[',@Parents),"]\n";
142 sub genChrNumber
($) {
144 if (exists $ChrID2Num{$id}) {
145 return $ChrID2Num{$id};
147 my $lastCnt = scalar keys %ChrID2Num;
148 $ChrID2Num{$id} = $lastCnt+1;
149 return $ChrID2Num{$id};
156 my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, @data) = split /\t/;
157 ++$Stat{'VCF_In'}; # also as rs#
158 my @Bases = split /\s*,\s*/,"$REF, $ALT";
159 my @groups = split(/\s*;\s*/, $INFO);
161 # if ($groups[0] eq 'INDEL') {
162 # $INFO{'Type'} = 'INDEL';
165 # $INFO{'Type'} = 'SNP';
167 for my $group (@groups) {
168 my ($tag,$value) = split /=/,$group;
169 #warn "- $group -> $tag,$value\n";
170 my @values = split /,/,$value;
172 $INFO{$tag}=$values[0];
174 $INFO{$tag}=\
@values;
178 my @FMT = split /:/,$FORMAT;
179 for my $s (@Samples) {
180 my $dat = shift @data or die "bcf file error.";
181 my @dat = split /:/,$dat;
183 $GT{$s}{$i} = shift @dat;
187 my (%GTitemCnt,$Mut,@plinkGT);
189 if ($GT{$_}{'DP'} > 0 and $GT{$_}{'GQ'} >= 20) {
190 my $gt = $GT{$_}{'GT'};
192 my @GT = split /[\/|]/,$gt;
194 if ($DomRec eq 'R') {
195 if ($Pheno{$_} == 2) {
196 ++$GTitemCnt{$_} for @GT;
198 } elsif ($DomRec eq 'D') {
199 if (exists $ISinFamily{$_}) {
200 ++$GTitemCnt{$_} for @GT;
203 $GT{$_}{'GTp'} = join ' ',map($Bases[$_], @GT);
206 $GT{$_}{'GTp'} = '0 0';
209 push @plinkGT,$GT{$_}{'GTp'};
211 if ($DomRec eq 'R') {
212 ($Mut) = sort { $GTitemCnt{$b} <=> $GTitemCnt{$a} } keys %GTitemCnt; # Desc
213 } elsif ($DomRec eq 'D') {
214 ($Mut) = sort { $GTitemCnt{$a} <=> $GTitemCnt{$b} } keys %GTitemCnt; # Asc
216 unless (defined $Mut) {
217 ++$Stat{'VCF_noAffInd_Skipped'};
222 ++$Stat{'VCF_noMUT_Count'};
225 #$Mut = $Bases[$Mut];
227 ++$Stat{'GTcnt'}{$INFO{'FQ'} <=> 0}{scalar(keys %GTcnt)};
229 ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%INFO,\%GT if scalar(keys %GTcnt) > 1 and $INFO{'FQ'} < 0 and $SPcnt>2;
230 # rad2marker.pl:135: {
231 # -1 => { 1 => 2850, 2 => 526, 3 => 37 },
232 # 1 => { 1 => 8, 2 => 2507, 3 => 1792 },
235 #warn "$CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO\n";
236 #ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%GTcnt,\%INFO,\%GT,\%GTitemCnt,$Mut;
237 if ($QUAL<20 or scalar(keys %GTcnt)<2 or $SPcnt < $minSampleCnt) { # No 'FQ' in 'INFO' for v1.1
238 ++$Stat{'VCF_Skipped'};
242 my $SNPid = "r".$Stat{'VCF_In'};
243 my $ChrID = genChrNumber
($CHROM);
244 #print OP join("\t",$1,$SNPid,0,$POS,@plinkGT),"\n";
245 print OM
join("\t",$SNPid,$Mut),"\n";
246 print OD
join("\t",${CHROM
},${POS
},$SNPid),"\n";
247 my (@GTall,@GTcase,@GTcontrol);
248 for my $i (0 .. $#tfamSamples) {
249 my $ind = $tfamSamples[$i];
250 if ($tfamSampleFlag{$ind} == 0) {
253 push @GTall,$plinkGT[$i];
254 if ($tfamSampleFlag{$ind}) {
255 # if ($tfamSampleFlag{$ind} == 3) {
256 # $plinkGT[$i] = '0 0';
258 if ($tfamSampleFlag{$ind} & 1) {
259 push @GTcontrol,$plinkGT[$i];
261 if ($tfamSampleFlag{$ind} & 2) {
262 push @GTcase,$plinkGT[$i];
267 print OP
join("\t",$ChrID,$SNPid,0,$POS,@GTall),"\n";
268 print OPA
join("\t",$ChrID,$SNPid,0,$POS,@GTcase),"\n";
269 print OPO
join("\t",$ChrID,$SNPid,0,$POS,@GTcontrol),"\n";
270 ++$Stat{'Marker_Out'};
282 print O Dumper
(\
%ChrID2Num);
283 print O Dumper
(\
%Stat);
288 print "[Prepare $outfs.phe.] And then:\np-link --tfile $outfs --reference-allele $outfs.MinorAllele --fisher --out ${outfs}P --model --cell 0 --allow-no-sex\n--pheno $outfs.phe --all-pheno [screen log will be saved by p-link itselt]\n";
291 bcftools view
-s
^FCAP114
-m2 mpileup_20150321HKT071334
.vcf
.gz \
292 | bcftools norm
-Df
../ref/Felis_catus80_chr
.fa
-c e
-m
+both \
293 | bcftools filter
-sLowQual
-e
'%QUAL<10' \
294 | bcftools filter
-m
+ -sDepthHigh
-e
'DP>650' \
295 | bcftools filter
-m
+ -sDepthLow
-e
'DP<2' \
296 | bcftools filter
-m
+ -sBadSites
-e
'%QUAL<10 && RPB<0.1' \
297 | tee
>(bcftools view
-Oz
-o filter_20150321HKT071334
.vcf
.gz
) \
298 | bcftools view
-f
.,PASS
-Oz
-o filtered
.vcf
.gz
&
300 bcftools query
-f
'%CHROM,%POS\t%REF|%ALT|%QUAL\t%DP[\t%SAMPLE=%GT,%DP]\n' filtered
.vcf
.gz
|les
302 ./bcf2ped.pl ~/work
/catbtail/merged
/kinkcats.tfam ~/work
/catbtail/merged
/mpileup_20150321HKT071334
.vcf
.gz
18 D xxxxx
2>xxxxx
304 ./bcf2ped
.pl kinkcats
.tfam mpileup_20150403
.vcf
.filtered
.gz
14 D outA14 sample
.a
306 ./dojoin
.pl outA14
.dict
3 outA14P
.model
.DOM
3 tmp
.DOM
308 awk
'{print $3" "$1" "$2" "$4" "$6" "$7" "$8" "$9" "$10" "$11}' tmp
.DOM
.out
> tmp
.DOM
.out
.j
310 cat tmp
.DOM
.out
.j
|perl
-lane
'@a=split /\//,"$F[7]/$F[8]";$sum=$a[0]+$a[1]+$a[2]+$a[3];
311 print join("\t",@F,$sum)' |sort -k2
,2 -k3
,3n
> out14P
.DOM
.npa
314 grep -hv \# radseq
.gt > radseq
.tfam
316 ./bcf2bed
.pl radseq
.tfam radseq
.bcgv
.bcf radseq
2>&1 | tee radseq
.pedlog
318 grep REC radseq
.p
.snow
.model
> radseq
.p
.snow
.model
.REC
319 grep DOM radseq
.p
.snow
.model
> radseq
.p
.snow
.model
.DOM
320 sort -nk8 radseq
.p
.snow
.model
.REC
> radseq
.p
.snow
.model
.REC
.sortnk8
&
321 sort -nk8 radseq
.p
.snow
.model
.DOM
> radseq
.p
.snow
.model
.DOM
.sortnk8
&
322 bcftools view
-I radseq
.bcgv
.bcf
|grep -v \#
|cat
-n
> radseq
.bcgv
.bcf
.rs
&
324 bcftools view tigers
.bcgv
.bcf
|grep -v \#
# > tigers.bcgv.vcf &
326 join -1 3 -2 2 <(sort -k3 radall
.dict
) <(sort -k2 radallP
.model
) > tmp
&
327 #sed -n '75,380 p' tmp.s.REC.nk3.scaffold75
329 cat tmp
.s
.REC
|perl
-lane
'@a=split /\//,"$F[7]/$F[8]";$sum=$a[0]+$a[1]+$a[2]+$a[3];
330 $theta=($a[1]+$a[2])/($sum*2);
331 $p=((1-$theta)**(2*$sum-($a[1]+$a[2])))*($theta**($a[1]+$a[2]))/(0.5**(2*$sum));
332 $LOD=int(0.5+log($p)*1000/log(10))/1000;
333 print join("\t",@F,$sum,int(0.5+$theta*100000)/100000,int($LOD),$LOD)' > rec
.pa
335 sort -nk13
-k2
-nk3 rec
.pa
> rec
.pas
337 grep -P
"scaffold75\t" rec
.pas
> scaffold75
.pas
338 sort -nk3 scaffold75
.pas
> scaffold75
.pas
.nk3
339 perl
-lane
'print if $F[10]>16' scaffold75
.pas
.nk3
> scaffold75
.pas
.nk3
.16
341 grep -P
"scaffold1458\t" rec
.pas
> scaffold1458
.pas
342 sort -nk3 scaffold1458
.pas
> scaffold1458
.pas
.nk3
343 perl
-lane
'print if $F[10]>16' scaffold1458
.pas
.nk3
> scaffold1458
.pas
.nk3
.16
347 ./cat/dojoin
.pl radall
.dict
3 radallP
.model
.REC
3 tmp
.REC
349 awk
'{print $3" "$1" "$2" "$4" "$6" "$7" "$8" "$9" "$10" "$11}' tmp
.REC
.out
> tmp
.REC
.out
.j
351 cat tmp
.REC
.out
.j
|perl
-lane
'@a=split /\//,"$F[7]/$F[8]";$sum=$a[0]+$a[1]+$a[2]+$a[3];
352 $theta=($a[1]+$a[2])/($sum*2);
353 $p=((1-$theta)**(2*$sum-($a[1]+$a[2])))*($theta**($a[1]+$a[2]))/(0.5**(2*$sum));
354 $LOD=int(0.5+log($p)*1000/log(10))/1000;
355 print join("\t",@F,$sum,int(0.5+$theta*100000)/100000,int($LOD),$LOD)' |sort -k2
,2 -k3
,3n
> rec
.npa
357 sort -k13
,13n
-k2
,2 -k3
,3n rec
.npa
> rec
.npas
359 Tue Aug
13 14:44:52 CST
2013
361 find
*.model
|while read a
;do grep REC
$a > $a.REC
.0;done
362 find
*.model
|while read a
;do perl
-lane
'if (@F<8) {splice @F,3,0,".";} die if @F<8; print join("\t",@F)' $a.REC
.0 > $a.REC
;done
363 find
*.model
|while read a
;do sort -k
8,8n
-k
2,2 $a.REC
> $a.REC
.nk8
;done
364 find
*.dict
|perl
-lane
's/\.dict//;system "./dojoin.pl $_.dict 3 ${_}P.model.REC.nk8 2 ${_}R.REC"'
365 find
*.dict
|perl
-lane
's/\.dict//;print "awk ",chr(39),"BEGIN {OFS=\"\\t\"} {print \$3,\$1,\$2,\$4,\$6,\$7,\$8,\$9,\$10,\$11}",chr(39)," ${_}R.REC.out > ${_}R.REC.out.j" '