modified: makefile
[GalaxyCodeBases.git] / tools / genotyping / pl / genotyping.pl
blob3179af6fb76810730c566ffdbf7a1f2625ba3c02
1 #!/bin/env perl
2 use lib '/share/raid010/resequencing/soft/lib';
3 use lib 'E:/BGI/toGit/perlib/etc';
4 use strict;
5 use warnings;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use File::Basename;
8 use Galaxy::ShowHelp;
9 use FindBin qw($RealBin);
11 $main::VERSION=0.0.1;
12 my $SCRIPTS="$RealBin/../scripts";
14 our $opts='a:i:p:o:m:z:c:v:g:q:e:f:n:bd';
15 our($opt_a, $opt_i, $opt_p, $opt_o, $opt_m, $opt_z, $opt_c, $opt_v, $opt_g, $opt_q, $opt_e, $opt_f, $opt_n, $opt_b, $opt_d);
17 #our $desc='';
18 our $help=<<EOH;
19 \t-i RIL pSNP list (ril.lst) in format: /^ChrID\\tpath toadd_ref\$/
20 \t-p RIL pSNP list (snp.lst) in format: /^ChrID\\tpath to_final.snp\$/
21 \t-m Parent A list (a.lst) in format: /^ChrID\\tpath to CNS.bz2\$/
22 \t-z Parent B list (b.lst)
23 \t-c ChrID to parse
24 \t-o Raw Genotype Output prefix (ril).ChrID.{rgt,intercross}
25 \t-g Dump Ref. Genotype to (ref).ChrID.gt
26 \t-a MinMACount to accept a popSNP site (17)
27 \t-q MinQual (15)
28 \t-n MaxCopyNum (1)
29 \t-e MinDepth (2)
30 \t-f MaxDepth (100)
31 \t-v Verbose level (undef=0)
32 \t-b No pause for batch runs
33 \t-d Debug Mode, for test only
34 EOH
36 ShowHelp();
38 $opt_i='ril.lst' if ! $opt_i;
39 $opt_p='snp.lst' if ! $opt_p;
40 $opt_m='a.lst' if ! $opt_m;
41 $opt_z='b.lst' if ! $opt_z;
42 $opt_g='ref' if ! $opt_g;
43 $opt_o='ril' if ! $opt_o;
44 $opt_a=17 if ! $opt_a;
45 $opt_q=15 if ! $opt_q;
46 $opt_e=2 if ! $opt_e;
47 $opt_f=100 if ! $opt_f;
48 $opt_n=1 if ! $opt_n;
49 die "[x]Must specify -c ChrID !\n" unless defined $opt_c;
51 no warnings;
52 $opt_v=int $opt_v;
53 $opt_a=int $opt_a;
54 $opt_q=int $opt_q;
55 $opt_e=int $opt_e;
56 $opt_f=int $opt_f;
57 use warnings;
58 die "[x]-i $opt_i not exists !\n" unless -f $opt_i;
59 die "[x]-m $opt_m not exists !\n" unless -f $opt_m;
60 die "[x]-z $opt_z not exists !\n" unless -f $opt_z;
61 my ($fileM,$fileZ,$fileRIL,$filePSNP);
63 print STDERR "From [$opt_i][$opt_p][$opt_c] with [$opt_m][$opt_z],\n [minQ:$opt_q,Depth:$opt_e,$opt_f,maxCopyNum:$opt_n; MinMACount:$opt_a]\n To [$opt_g].$opt_c.gt0, [$opt_o].$opt_c.rgt\n";
64 print STDERR "DEBUG Mode on !\n" if $opt_d;
65 print STDERR "Verbose Mode [$opt_v] !\n" if $opt_v;
66 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
68 my $start_time = [gettimeofday];
69 #BEGIN
70 # A1 T2 C4 G8
71 our %bIUB = (
72 A => 1,
73 C => 4,
74 G => 8,
75 T => 2,
76 M => 5,#[qw(A C)],
77 R => 9,#[qw(A G)],
78 W => 3,#[qw(A T)],
79 S => 12,#[qw(C G)],
80 Y => 6,#[qw(C T)],
81 K => 10,#[qw(G T)],
82 V => 13,#[qw(A C G)],
83 H => 7,#[qw(A C T)],
84 D => 11,#[qw(A G T)],
85 B => 14,#[qw(C G T)],
86 X => 15,#[qw(G A T C)],
87 N => 15,#[qw(G A T C)]
89 our %REV_IUB = (
90 1 => 'A',
91 2 => 'T',
92 4 => 'C',
93 8 => 'G',
94 5 => 'M,AC',
95 9 => 'R,AG',
96 3 => 'W,AT',
97 12 => 'S,CG',
98 6 => 'Y,CT',
99 10 => 'K,GT',
100 13 => 'V,ACG',
101 7 => 'H,ACT',
102 11 => 'D,AGT',
103 14 => 'B,CGT',
104 15 => 'N',
106 our %lb = (
107 0 => 0,
108 1 => 1,
109 4 => 1,
110 8 => 1,
111 2 => 1,
112 5 => 2,#[qw(A C)],
113 9 => 2,#[qw(A G)],
114 3 => 2,#[qw(A T)],
115 12 => 2,#[qw(C G)],
116 6 => 2,#[qw(C T)],
117 10 => 2,#[qw(G T)],
118 13 => 3,#[qw(A C G)],
119 7 => 3,#[qw(A C T)],
120 11 => 3,#[qw(A G T)],
121 14 => 3,#[qw(C G T)],
122 15 => 4,#[qw(G A T C)],
124 =pod
125 #http://doc.bioperl.org/bioperl-live/Bio/Tools/IUPAC.html#BEGIN1
126 our %IUB = ( A => [qw(A)],
127 C => [qw(C)],
128 G => [qw(G)],
129 T => [qw(T)],
130 U => [qw(U)],
131 M => [qw(A C)],
132 R => [qw(A G)],
133 W => [qw(A T)],
134 S => [qw(C G)],
135 Y => [qw(C T)],
136 K => [qw(G T)],
137 V => [qw(A C G)],
138 H => [qw(A C T)],
139 D => [qw(A G T)],
140 B => [qw(C G T)],
141 X => [qw(G A T C)],
142 N => [qw(G A T C)]
144 my %x = ( A => 1,
145 C => 4,
146 G => 8,
147 T => 2,);
148 for my $k (sort keys %IUB) {
149 next if $k eq 'U';
150 my $r=0;
151 $r += $x{$_} for (@{$IUB{$k}});
152 print join("\t",$k,$r,$bIUB{$k}),"\n";
153 warn "!" if $r != $bIUB{$k};
155 =cut
157 =pod
158 SoapSNP¸ñʽ£º
159 ²Î¼û£ºhttp://soap.genomics.org.cn/soapsnp.html#output2
160 chromosome_10 1226 A M 36 C 28 4 5 A 33 3 6 11 0.0285714 1.54545 0 166
161 1) Chromosome name
162 2) Position of locus
163 3) Nucleotide at corresponding locus of reference sequence
164 4) Genotype of sequencing sample
165 5) Quality value
166 6) nucleotide with the highest probability(first nucleotide)
167 7) Quality value of the nucleotide with the highest probability
168 8) Number of supported reads that can only be aligned to this locus
169 9) Number of all supported reads that can be aligned to this locus
170 10) Nucleotide with higher probability
171 11) Quality value of nucleotide with higher probability
172 12) Number of supported reads that can only be aligned to this locus
173 13) Number of all supported reads that can be aligned to this locus
174 14) Total number of reads that can be aligned to this locus
175 15) RST p-value
176 16) Estimated copy number for this locus
177 17) Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence
178 18) The distance between this locus and another closest SNP
179 =cut
180 open M,'<',$opt_m or die "[x]Error opening $opt_m: $!\n";
181 while (<M>) {
182 chomp;
183 my ($ChrID,$file)=split /\t/;
184 next if $ChrID ne $opt_c;
185 $fileM=$file and last;
187 close M;
188 open RIL,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
189 while (<RIL>) {
190 chomp;
191 my ($ChrID,$file)=split /\t/;
192 next if $ChrID ne $opt_c;
193 $fileRIL=$fileZ=$file and last;
195 close RIL;
196 open RIL,'<',$opt_p or die "[x]Error opening $opt_p: $!\n";
197 while (<RIL>) {
198 chomp;
199 my ($ChrID,$file)=split /\t/;
200 next if $ChrID ne $opt_c;
201 $filePSNP=$file and last;
203 close RIL;
204 open M,'<',$opt_z or die "[x]Error opening $opt_z: $!\n";
205 while (<M>) {
206 chomp;
207 my ($ChrID,$file)=split /\t/;
208 next if $ChrID ne $opt_c;
209 $fileZ=$file and last;
211 close M;
213 warn "[!]Using [$fileRIL]<-[$fileM][$fileZ].\n";
215 my ($ChrLen,$InZonea,$InZoneb,%SNPRefContent,%HeteroHomoContent,%DatS,$t,$NumType,$NumTypez,@OutTmp)=(0,0,0);
216 %SNPRefContent=%HeteroHomoContent=(1=>0, 2=>0, 3=>0);
217 open G,'>',$opt_g.".$opt_c.gt0" or die "[x]Error opening $opt_g.$opt_c.gt0: $!\n";
218 print G "# A1 T2 C4 G8
219 # minQ:$opt_q, Depth:$opt_e,$opt_f, maxCopyNum:$opt_n
220 # ChrID=$opt_c
221 # A=$fileM
222 # B=$fileZ
223 ",join("\t",'Pos','A','B','a','b'),"\n";
224 #open M,'<',$fileM or die "[x]Error opening $fileM: $!\n";
225 #open Z,'<',$fileZ or die "[x]Error opening $fileZ: $!\n";
226 open M,'-|',"bzip2 -dc $fileM" or warn "[x]Error opening [$fileM] with bzip2: $!\n";
227 open Z,'-|',"bzip2 -dc $fileZ" or warn "[x]Error opening [$fileZ] with bzip2: $!\n";
228 while (<M>) {
229 #chomp;
230 my ($ChrID,$Pos,$Ref,$Type,$Q,$Dep1,$Dep2,$DepA,$CN)=(split /\t/)[0,1,2,3,4,8,12,13,15];
231 $t=<Z>;
232 my ($ChrIDz,$Posz,$Refz,$Typez,$Qz,$Dep1z,$Dep2z,$DepAz,$CNz)=(split /\t/,$t)[0,1,2,3,4,8,12,13,15];
233 ++$ChrLen;
234 print STDERR "=>\b" unless $ChrLen % 1_000_000;
235 next if ($DepA < $opt_e or $DepA > $opt_f) or ($DepAz < $opt_e or $DepAz > $opt_f);
236 die "[x]CNS Files NOT match !\n" if $ChrID ne $ChrIDz or $Pos ne $Posz or $Ref ne $Refz;
237 my ($flag,$het)=(3,0);
238 if ( $Q < $opt_q or $CN > $opt_n or ($Type !~ /[ATCG]/i and ($het |= 1) and ($Dep1<$opt_e or $Dep2<$opt_e)) ) {
239 $Type = $Ref;
240 $flag &= 2;
241 } else { ++$InZonea; }
242 if ( $Qz < $opt_q or $CNz > $opt_n or ($Typez !~ /[ATCG]/i and ($het |= 2) and ($Dep1z<$opt_e or $Dep2z<$opt_e)) ) {
243 $Typez = $Refz;
244 $flag &= 1;
245 } else { ++$InZoneb; }
246 next if $Type eq $Typez;
247 $NumType=$bIUB{$Type}; $NumTypez=$bIUB{$Typez};
248 #next if $NumType & $NumTypez; # keep only A ^ B = NULL
249 next if $NumType == $NumTypez;
250 $DatS{$Pos}=[$NumType,$NumTypez,15^($NumType | $NumTypez)];
251 print G join("\t",$Pos,$Type,$Typez,$NumType,$NumTypez),"\n";
252 ++$SNPRefContent{$flag};
253 ++$HeteroHomoContent{$het};
255 @OutTmp=();
256 $t=0;
257 for (sort keys %SNPRefContent) {
258 push @OutTmp,"$_:$SNPRefContent{$_}";
259 $t += $SNPRefContent{$_};
261 print G "# ChrLen:$ChrLen\n# InZone: $InZonea, $InZoneb\n# SNP-Ref: ",join(', ',@OutTmp,"All:$t"),' -> ~',int(.5+$ChrLen/$t),' bp';
262 print STDERR "|\n[!] ChrLen:$ChrLen\n[!] InZone: $InZonea, $InZoneb\n[!] SNP-Ref: ",join(', ',@OutTmp,"All:$t"),' -> ~',int(.5+$ChrLen/$t),' bp';
263 @OutTmp=();
264 my $t2=0;
265 for (sort keys %HeteroHomoContent) {
266 push @OutTmp,"$_:$HeteroHomoContent{$_}";
267 $t2 += $HeteroHomoContent{$_} if $_;
269 print G "\n# Hetero-Homo: ",join(', ',@OutTmp,"All_but_0:$t2"),' -> ',100*$t2/$t," %\n";
270 print STDERR "\n[!] Hetero-Homo: ",join(', ',@OutTmp,"All_but_0:$t2"),' -> ',100*$t2/$t," %\n";
271 close Z;
272 close M;
273 #close G;
274 $t2=join('','-' x 24,'+','-' x 24,'|','-' x 24,'+---',"\n");
275 warn "[!]Parents' SNP done !\n",$t2;
277 sub GetGT($$) {
278 my ($Pos,$binBase)=@_;
279 my $Result=0;
280 #return 0 unless exists $DatS{$Pos};
281 my ($a,$b,$c)=@{$DatS{$Pos}} or return 0;
282 if ($binBase & $a) {
283 $Result |=1;
284 $Result |=16 unless $binBase & $b;
285 } # A
286 if ($binBase & $b) {
287 $Result |=2;
288 $Result |=32 unless $binBase & $a;
289 } # B
290 $Result |=4 if $binBase & $c; # Extra GenoType
291 #$Result = 0 if $Result == 3 and ($a & $b);
292 return $Result;
293 } # %DatS
295 my ($C_PSNP,$C_iSNP,$C_Pos,%GT)=(0,0,0);
296 open IN,'<',$fileRIL or die "[x]Error opening $fileRIL: $!\n";
297 open P,'<',$filePSNP or die "[x]Error opening $filePSNP: $!\n";
298 chomp($t=<IN>);
299 $t=(split /\t/,$t)[3];
300 my @Samples=split / /,$t;
301 warn '[!]Sample Order: ',(scalar @Samples),':[',join('] [',@Samples),']',"\n";
302 my ($chr,$pos,$ref,$tail,$pchr,$ppos,$Base1,$Base2,$Count1,$Count2);
303 while (<IN>) {
304 chomp;
305 ($chr,$pos,$ref,$tail)=split /\t/;
306 $t=<P>;
307 ($pchr,$ppos,$Base1,$Base2,$Count1,$Count2)=(split /\t/,$t)[0,1,5,6,7,8];
308 die "[x].\n" if $pos != $ppos;
309 next if $chr ne $opt_c;
310 ++$C_PSNP;
311 next unless $DatS{$pos};
312 ($Count1,$Count2) = sort {$b <=> $a} ($Count1,$Count2); # Desc
313 ++$C_Pos;
314 if ($Count2 >= $opt_a) {
315 my ($ap,$bp,$cp)=@{$DatS{$pos}}; # $DatS{$Pos}=[$NumType,$NumTypez,15^($NumType | $NumTypez)];
316 my $abp=$ap & $bp;
317 if ($lb{$abp} == 1) {
318 $abp = 15 ^ $abp;
319 my ($a,$b);
320 $a=$ap & $abp;
321 $b=$bp & $abp;
322 if ($a) {$ap=$a;}
323 elsif ($b) {$bp=$b;}
324 else {die "[x]Error.";}
325 warn "[2]@ $pos: ${$DatS{$pos}}[0],${$DatS{$pos}}[1],$cp -> $ap,$bp\n" if $opt_v > 1;
326 print G "$pos\t${$DatS{$pos}}[0],${$DatS{$pos}}[1],$cp\t->\t$ap,$bp\n";
327 $DatS{$pos} = [$ap,$bp,$cp];
330 $t=0;
331 my @indSNP=split / /,$tail; # /[ACGTRYMKSWHBVDNX-]/
332 #die join "\t",$chr,$pos,$ref,$tail,scalar @indSNP if @indSNP < 135;
333 for my $s (@Samples) {
334 unless ($tail=shift @indSNP) {
335 warn "[!].add_ref Error at ($chr,$pos: $ref) !\n" unless $t;
336 $t=1;
337 next;
339 if ($tail ne '-') {
340 $GT{$pos}{$s}=&GetGT($pos,$bIUB{$tail}); # if $DatBoth{$pos};
341 ++$C_iSNP;
342 } else {
343 next;
347 close P;
348 close IN;
349 close G;
350 warn "[!]GenoTyping done as ${C_PSNP}->$C_Pos,$C_iSNP,",$C_iSNP/$C_Pos,"\n";
352 my $Deleted=0;
353 unless ($opt_d) {
354 for my $Pos (keys %GT) {
355 my $flag=0;
356 for my $s (keys %{$GT{$Pos}}) {
357 if ($GT{$Pos}{$s} & 4) {
358 delete $GT{$Pos}{$s};
359 ++$Deleted;
363 warn "[!]GenoType filtered out $Deleted\n";
366 my ($PosC,$TypeC,%C_GT)=(0,0);
367 open O,'>',$opt_o.".$opt_c.rgt" or die "[x]Error opening $opt_o.$opt_c.rgt: $!\n";
368 open IC,'>',$opt_o.".$opt_c.intercross" or die "[x]Error opening $opt_o.$opt_c.intercross: $!\n";
369 my @PosList=sort {$a<=>$b} keys %GT;
370 $PosC = scalar @PosList;
372 my %FormatGT=(17=>'A', 34=>'B', 35=>'C', 19=>'D' ,3=>'H', 0=>'-'); # 0 for M(1), 1 for Z(2), 0.5 for mixture(3)
373 $fileM=`readlink -nf $fileM`;
374 $fileZ=`readlink -nf $fileZ`;
375 $fileRIL=`readlink -nf $fileRIL`;
376 print O "#A=A, $fileM\n#B=B, $fileZ\n#RIL $fileRIL\n#C for BH, D for AH\n",join("\t",@Samples),"\n";
377 print IC "data type f2 intercross\n",scalar @Samples,' ',scalar keys %GT,"\n";
378 for my $pos (@PosList) {
379 print O $pos;
380 print IC 'M',$pos;
381 for my $s (@Samples) {
382 if (exists $GT{$pos}{$s}) {
383 if (exists $FormatGT{$GT{$pos}{$s}}) {
384 $t=$FormatGT{$GT{$pos}{$s}};
385 print O "\t",$t;
386 print IC "\t",$t;
387 ++$TypeC;
388 ++$C_GT{$t};
389 } else {
390 print O "\t",'-';
391 print IC "\t",'-';
392 ++$C_GT{'Un'};
394 } else {
395 print O "\t",'-';
396 print IC "\t",'-';
397 ++$C_GT{'Err'};
400 print O "\n";
401 print IC "\n";
403 =pod
404 print O 'PosList',"\t",join(',',@PosList),"\n";
405 for my $s (@Samples) {
406 print O $s;
407 for my $pos (@PosList) {
408 if (exists $GT{$pos}{$s}) {
409 print O "\t",$GT{$pos}{$s};
410 ++$TypeC;
411 ++$C_GT{$GT{$pos}{$s}};
412 } else {
413 print O "\t",'-';
414 ++$C_GT{'-'};
416 #print O "\t",exists($GT{$pos}{$s})?$GT{$pos}{$s}:'-'; # exists is faster than defined ?
418 print O "\n";
420 =cut
421 warn "[!]GenoType written $PosC,$TypeC,",$TypeC/$PosC,"\n\n[!]GenoType Counts:\n";
422 for (sort keys %C_GT) {
423 warn "\t$_: $C_GT{$_}\n";
424 print O "# $_: $C_GT{$_}\n";
426 print O "#Filtered out: $Deleted\n#GenoType written $PosC,$TypeC -> ",$TypeC/$PosC,"\n";
427 close O;
429 #END
430 my $stop_time = [gettimeofday];
432 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
433 __END__
434 cat ./9311/chrorder |xargs -n1 ./genotyping.pl -bz lpa64cns.lst -m l9311cns.lst -o ./20100916/ril -g ./20100916/ref -c
436 > binom.test(8,270,0.01)
438 Exact binomial test
440 data: 8 and 270
441 number of successes = 8, number of trials = 270, p-value = 0.006323
442 alternative hypothesis: true probability of success is not equal to 0.01
444 binom.test(13,270,0.02)
445 number of successes = 13, number of trials = 270, p-value = 0.003461
447 binom.test(17,270,0.03)
448 number of successes = 17, number of trials = 270, p-value = 0.003889