2 use lib
'/share/raid010/resequencing/soft/lib';
3 use lib
'E:/BGI/toGit/perlib/etc';
6 use Time
::HiRes qw
( gettimeofday tv_interval
);
9 use FindBin
qw($RealBin);
12 my $SCRIPTS="$RealBin/../scripts";
14 our $opts='a:i:p:o:m:z:c:v:g:q:e:f:n:bd';
15 our($opt_a, $opt_i, $opt_p, $opt_o, $opt_m, $opt_z, $opt_c, $opt_v, $opt_g, $opt_q, $opt_e, $opt_f, $opt_n, $opt_b, $opt_d);
19 \t-i RIL pSNP list (ril.lst) in format: /^ChrID\\tpath toadd_ref\$/
20 \t-p RIL pSNP list (snp.lst) in format: /^ChrID\\tpath to_final.snp\$/
21 \t-m Parent A list (a.lst) in format: /^ChrID\\tpath to CNS.bz2\$/
22 \t-z Parent B list (b.lst)
24 \t-o Raw Genotype Output prefix (ril).ChrID.{rgt,intercross}
25 \t-g Dump Ref. Genotype to (ref).ChrID.gt
26 \t-a MinMACount to accept a popSNP site (17)
31 \t-v Verbose level (undef=0)
32 \t-b No pause for batch runs
33 \t-d Debug Mode, for test only
38 $opt_i='ril.lst' if ! $opt_i;
39 $opt_p='snp.lst' if ! $opt_p;
40 $opt_m='a.lst' if ! $opt_m;
41 $opt_z='b.lst' if ! $opt_z;
42 $opt_g='ref' if ! $opt_g;
43 $opt_o='ril' if ! $opt_o;
44 $opt_a=17 if ! $opt_a;
45 $opt_q=15 if ! $opt_q;
47 $opt_f=100 if ! $opt_f;
49 die "[x]Must specify -c ChrID !\n" unless defined $opt_c;
58 die "[x]-i $opt_i not exists !\n" unless -f
$opt_i;
59 die "[x]-m $opt_m not exists !\n" unless -f
$opt_m;
60 die "[x]-z $opt_z not exists !\n" unless -f
$opt_z;
61 my ($fileM,$fileZ,$fileRIL,$filePSNP);
63 print STDERR
"From [$opt_i][$opt_p][$opt_c] with [$opt_m][$opt_z],\n [minQ:$opt_q,Depth:$opt_e,$opt_f,maxCopyNum:$opt_n; MinMACount:$opt_a]\n To [$opt_g].$opt_c.gt0, [$opt_o].$opt_c.rgt\n";
64 print STDERR
"DEBUG Mode on !\n" if $opt_d;
65 print STDERR
"Verbose Mode [$opt_v] !\n" if $opt_v;
66 unless ($opt_b) {print STDERR
'press [Enter] to continue...'; <>;}
68 my $start_time = [gettimeofday
];
86 X
=> 15,#[qw(G A T C)],
87 N
=> 15,#[qw(G A T C)]
118 13 => 3,#[qw(A C G)],
120 11 => 3,#[qw(A G T)],
121 14 => 3,#[qw(C G T)],
122 15 => 4,#[qw(G A T C)],
125 #http://doc.bioperl.org/bioperl-live/Bio/Tools/IUPAC.html#BEGIN1
126 our %IUB = ( A => [qw(A)],
148 for my $k (sort keys %IUB) {
151 $r += $x{$_} for (@{$IUB{$k}});
152 print join("\t",$k,$r,$bIUB{$k}),"\n";
153 warn "!" if $r != $bIUB{$k};
159 ²Î¼û£ºhttp://soap.genomics.org.cn/soapsnp.html#output2
160 chromosome_10 1226 A M 36 C 28 4 5 A 33 3 6 11 0.0285714 1.54545 0 166
163 3) Nucleotide at corresponding locus of reference sequence
164 4) Genotype of sequencing sample
166 6) nucleotide with the highest probability(first nucleotide)
167 7) Quality value of the nucleotide with the highest probability
168 8) Number of supported reads that can only be aligned to this locus
169 9) Number of all supported reads that can be aligned to this locus
170 10) Nucleotide with higher probability
171 11) Quality value of nucleotide with higher probability
172 12) Number of supported reads that can only be aligned to this locus
173 13) Number of all supported reads that can be aligned to this locus
174 14) Total number of reads that can be aligned to this locus
176 16) Estimated copy number for this locus
177 17) Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence
178 18) The distance between this locus and another closest SNP
180 open M
,'<',$opt_m or die "[x]Error opening $opt_m: $!\n";
183 my ($ChrID,$file)=split /\t/;
184 next if $ChrID ne $opt_c;
185 $fileM=$file and last;
188 open RIL
,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
191 my ($ChrID,$file)=split /\t/;
192 next if $ChrID ne $opt_c;
193 $fileRIL=$fileZ=$file and last;
196 open RIL
,'<',$opt_p or die "[x]Error opening $opt_p: $!\n";
199 my ($ChrID,$file)=split /\t/;
200 next if $ChrID ne $opt_c;
201 $filePSNP=$file and last;
204 open M
,'<',$opt_z or die "[x]Error opening $opt_z: $!\n";
207 my ($ChrID,$file)=split /\t/;
208 next if $ChrID ne $opt_c;
209 $fileZ=$file and last;
213 warn "[!]Using [$fileRIL]<-[$fileM][$fileZ].\n";
215 my ($ChrLen,$InZonea,$InZoneb,%SNPRefContent,%HeteroHomoContent,%DatS,$t,$NumType,$NumTypez,@OutTmp)=(0,0,0);
216 %SNPRefContent=%HeteroHomoContent=(1=>0, 2=>0, 3=>0);
217 open G
,'>',$opt_g.".$opt_c.gt0" or die "[x]Error opening $opt_g.$opt_c.gt0: $!\n";
218 print G
"# A1 T2 C4 G8
219 # minQ:$opt_q, Depth:$opt_e,$opt_f, maxCopyNum:$opt_n
223 ",join("\t",'Pos','A','B','a','b'),"\n";
224 #open M,'<',$fileM or die "[x]Error opening $fileM: $!\n";
225 #open Z,'<',$fileZ or die "[x]Error opening $fileZ: $!\n";
226 open M
,'-|',"bzip2 -dc $fileM" or warn "[x]Error opening [$fileM] with bzip2: $!\n";
227 open Z
,'-|',"bzip2 -dc $fileZ" or warn "[x]Error opening [$fileZ] with bzip2: $!\n";
230 my ($ChrID,$Pos,$Ref,$Type,$Q,$Dep1,$Dep2,$DepA,$CN)=(split /\t/)[0,1,2,3,4,8,12,13,15];
232 my ($ChrIDz,$Posz,$Refz,$Typez,$Qz,$Dep1z,$Dep2z,$DepAz,$CNz)=(split /\t/,$t)[0,1,2,3,4,8,12,13,15];
234 print STDERR
"=>\b" unless $ChrLen % 1_000_000
;
235 next if ($DepA < $opt_e or $DepA > $opt_f) or ($DepAz < $opt_e or $DepAz > $opt_f);
236 die "[x]CNS Files NOT match !\n" if $ChrID ne $ChrIDz or $Pos ne $Posz or $Ref ne $Refz;
237 my ($flag,$het)=(3,0);
238 if ( $Q < $opt_q or $CN > $opt_n or ($Type !~ /[ATCG]/i and ($het |= 1) and ($Dep1<$opt_e or $Dep2<$opt_e)) ) {
241 } else { ++$InZonea; }
242 if ( $Qz < $opt_q or $CNz > $opt_n or ($Typez !~ /[ATCG]/i and ($het |= 2) and ($Dep1z<$opt_e or $Dep2z<$opt_e)) ) {
245 } else { ++$InZoneb; }
246 next if $Type eq $Typez;
247 $NumType=$bIUB{$Type}; $NumTypez=$bIUB{$Typez};
248 #next if $NumType & $NumTypez; # keep only A ^ B = NULL
249 next if $NumType == $NumTypez;
250 $DatS{$Pos}=[$NumType,$NumTypez,15^($NumType | $NumTypez)];
251 print G
join("\t",$Pos,$Type,$Typez,$NumType,$NumTypez),"\n";
252 ++$SNPRefContent{$flag};
253 ++$HeteroHomoContent{$het};
257 for (sort keys %SNPRefContent) {
258 push @OutTmp,"$_:$SNPRefContent{$_}";
259 $t += $SNPRefContent{$_};
261 print G
"# ChrLen:$ChrLen\n# InZone: $InZonea, $InZoneb\n# SNP-Ref: ",join(', ',@OutTmp,"All:$t"),' -> ~',int(.5+$ChrLen/$t),' bp';
262 print STDERR
"|\n[!] ChrLen:$ChrLen\n[!] InZone: $InZonea, $InZoneb\n[!] SNP-Ref: ",join(', ',@OutTmp,"All:$t"),' -> ~',int(.5+$ChrLen/$t),' bp';
265 for (sort keys %HeteroHomoContent) {
266 push @OutTmp,"$_:$HeteroHomoContent{$_}";
267 $t2 += $HeteroHomoContent{$_} if $_;
269 print G
"\n# Hetero-Homo: ",join(', ',@OutTmp,"All_but_0:$t2"),' -> ',100*$t2/$t," %\n";
270 print STDERR
"\n[!] Hetero-Homo: ",join(', ',@OutTmp,"All_but_0:$t2"),' -> ',100*$t2/$t," %\n";
274 $t2=join('','-' x
24,'+','-' x
24,'|','-' x
24,'+---',"\n");
275 warn "[!]Parents' SNP done !\n",$t2;
278 my ($Pos,$binBase)=@_;
280 #return 0 unless exists $DatS{$Pos};
281 my ($a,$b,$c)=@
{$DatS{$Pos}} or return 0;
284 $Result |=16 unless $binBase & $b;
288 $Result |=32 unless $binBase & $a;
290 $Result |=4 if $binBase & $c; # Extra GenoType
291 #$Result = 0 if $Result == 3 and ($a & $b);
295 my ($C_PSNP,$C_iSNP,$C_Pos,%GT)=(0,0,0);
296 open IN
,'<',$fileRIL or die "[x]Error opening $fileRIL: $!\n";
297 open P
,'<',$filePSNP or die "[x]Error opening $filePSNP: $!\n";
299 $t=(split /\t/,$t)[3];
300 my @Samples=split / /,$t;
301 warn '[!]Sample Order: ',(scalar @Samples),':[',join('] [',@Samples),']',"\n";
302 my ($chr,$pos,$ref,$tail,$pchr,$ppos,$Base1,$Base2,$Count1,$Count2);
305 ($chr,$pos,$ref,$tail)=split /\t/;
307 ($pchr,$ppos,$Base1,$Base2,$Count1,$Count2)=(split /\t/,$t)[0,1,5,6,7,8];
308 die "[x].\n" if $pos != $ppos;
309 next if $chr ne $opt_c;
311 next unless $DatS{$pos};
312 ($Count1,$Count2) = sort {$b <=> $a} ($Count1,$Count2); # Desc
314 if ($Count2 >= $opt_a) {
315 my ($ap,$bp,$cp)=@
{$DatS{$pos}}; # $DatS{$Pos}=[$NumType,$NumTypez,15^($NumType | $NumTypez)];
317 if ($lb{$abp} == 1) {
324 else {die "[x]Error.";}
325 warn "[2]@ $pos: ${$DatS{$pos}}[0],${$DatS{$pos}}[1],$cp -> $ap,$bp\n" if $opt_v > 1;
326 print G
"$pos\t${$DatS{$pos}}[0],${$DatS{$pos}}[1],$cp\t->\t$ap,$bp\n";
327 $DatS{$pos} = [$ap,$bp,$cp];
331 my @indSNP=split / /,$tail; # /[ACGTRYMKSWHBVDNX-]/
332 #die join "\t",$chr,$pos,$ref,$tail,scalar @indSNP if @indSNP < 135;
333 for my $s (@Samples) {
334 unless ($tail=shift @indSNP) {
335 warn "[!].add_ref Error at ($chr,$pos: $ref) !\n" unless $t;
340 $GT{$pos}{$s}=&GetGT
($pos,$bIUB{$tail}); # if $DatBoth{$pos};
350 warn "[!]GenoTyping done as ${C_PSNP}->$C_Pos,$C_iSNP,",$C_iSNP/$C_Pos,"\n";
354 for my $Pos (keys %GT) {
356 for my $s (keys %{$GT{$Pos}}) {
357 if ($GT{$Pos}{$s} & 4) {
358 delete $GT{$Pos}{$s};
363 warn "[!]GenoType filtered out $Deleted\n";
366 my ($PosC,$TypeC,%C_GT)=(0,0);
367 open O
,'>',$opt_o.".$opt_c.rgt" or die "[x]Error opening $opt_o.$opt_c.rgt: $!\n";
368 open IC
,'>',$opt_o.".$opt_c.intercross" or die "[x]Error opening $opt_o.$opt_c.intercross: $!\n";
369 my @PosList=sort {$a<=>$b} keys %GT;
370 $PosC = scalar @PosList;
372 my %FormatGT=(17=>'A', 34=>'B', 35=>'C', 19=>'D' ,3=>'H', 0=>'-'); # 0 for M(1), 1 for Z(2), 0.5 for mixture(3)
373 $fileM=`readlink -nf $fileM`;
374 $fileZ=`readlink -nf $fileZ`;
375 $fileRIL=`readlink -nf $fileRIL`;
376 print O
"#A=A, $fileM\n#B=B, $fileZ\n#RIL $fileRIL\n#C for BH, D for AH\n",join("\t",@Samples),"\n";
377 print IC
"data type f2 intercross\n",scalar @Samples,' ',scalar keys %GT,"\n";
378 for my $pos (@PosList) {
381 for my $s (@Samples) {
382 if (exists $GT{$pos}{$s}) {
383 if (exists $FormatGT{$GT{$pos}{$s}}) {
384 $t=$FormatGT{$GT{$pos}{$s}};
404 print O 'PosList',"\t",join(',',@PosList),"\n";
405 for my $s (@Samples) {
407 for my $pos (@PosList) {
408 if (exists $GT{$pos}{$s}) {
409 print O "\t",$GT{$pos}{$s};
411 ++$C_GT{$GT{$pos}{$s}};
416 #print O "\t",exists($GT{$pos}{$s})?$GT{$pos}{$s}:'-'; # exists is faster than defined ?
421 warn "[!]GenoType written $PosC,$TypeC,",$TypeC/$PosC,"\n\n[!]GenoType Counts:\n";
422 for (sort keys %C_GT) {
423 warn "\t$_: $C_GT{$_}\n";
424 print O
"# $_: $C_GT{$_}\n";
426 print O
"#Filtered out: $Deleted\n#GenoType written $PosC,$TypeC -> ",$TypeC/$PosC,"\n";
430 my $stop_time = [gettimeofday
];
432 print STDERR
"\nTime Elapsed:\t",tv_interval
( $start_time, $stop_time )," second(s).\n";
434 cat
./9311/chrorder
|xargs
-n1
./genotyping.pl -bz lpa64cns.lst -m l9311cns.lst -o ./20100916/ril -g ./20100916/ref -c
436 > binom
.test
(8,270,0.01)
441 number of successes
= 8, number of trials
= 270, p
-value
= 0.006323
442 alternative hypothesis
: true probability of success is
not equal to
0.01
444 binom
.test
(13,270,0.02)
445 number of successes
= 13, number of trials
= 270, p
-value
= 0.003461
447 binom
.test
(17,270,0.03)
448 number of successes
= 17, number of trials
= 270, p
-value
= 0.003889