2 use lib
'/share/raid010/resequencing/soft/lib';
3 use lib
'E:/BGI/toGit/perlib/etc';
6 use Time
::HiRes qw
( gettimeofday tv_interval
);
11 our $opts='i:c:w:l:o:vb';
12 our ($opt_i, $opt_o, $opt_c, $opt_v, $opt_b, $opt_l, $opt_w);
15 \t-i Population SNP list (./psnp.lst) for *.individual.finalSNPs
16 \t-c Chr Info file (./chr.nfo) in format [ChrID\\tLen]
17 \t-w window size (40000) bp
18 \t-l step length (20000) bp
19 \t-o Output Prefix (./Hp_result)
20 \t Output Files are {./Hp_result}.dat \& {./Hp_result}.stat
21 \t-v show verbose info to STDOUT
22 \t-b No pause for batch runs
27 $opt_o='./Hp_result' if ! defined $opt_o;
28 $opt_i='./psnp.lst' if ! $opt_i;
29 $opt_c='./chr.nfo' if ! $opt_c;
30 $opt_w=40000 if ! $opt_w;
31 $opt_l=20000 if ! $opt_l;
33 print STDERR
"From [$opt_i] to [$opt_o].{dat,stat}, with [$opt_c][$opt_l][$opt_w]\n";
34 if (! $opt_b) {print STDERR
'press [Enter] to continue...'; <>;}
36 #http://doc.bioperl.org/bioperl-live/Bio/Tools/IUPAC.html#BEGIN1
37 our %IUB = ( A
=> [qw(A A)],
56 my $start_time = [gettimeofday
];
58 system('mkdir','-p',$opt_o);
59 system('rmdir',$opt_o); # if $opt_o =~ m#/[\s.]*[^/\s.]+[^/]*$#;
62 print STDERR
"[!]Load ChrNFO:\t";
63 open C
,'<',$opt_c or die "[x]Error opening $opt_c: $!\n";
66 my ($chr,$len)=split /\t/;
69 print STDERR
"$chr,$len\t"
76 print STDERR
"[!]Parsing PSNP: .";
77 open P
,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
78 while (my $file=<P
>) {
80 open SNP
,'<',$file or (warn "\n[!]Error opening $file: $!\n" and next);
82 my ($chr,$pos,$ref,$tail,$i,@SNPCounts);
84 ($chr,$pos,$ref,$tail)=split /\t/;
86 for (split / /,$tail) {
87 next unless /[ACGTRYMKSWHBVDNX]/;
88 ++$SNPcount{$_} for @
{$IUB{$_}};
90 @SNPCounts=sort {$a <=> $b} values %SNPcount;
91 next if $#SNPCounts < 1; # at least 2-1=1
92 #next if $SNPCounts[0] == $SNPCounts[-1]; # min != max (needed ?)
93 $SNP{$chr}{$pos}=[$SNPCounts[0],$SNPCounts[-1]]; # [min,max]
101 my $file=$opt_o.'.dat.tmp';
102 open O
,'>',$file or die "[x]Error opening $file: $!\n";
104 my ($POSa,$N,%Hpc,%Hpa,$Hpr,$sX,$sXX)=(1,0);
105 print STDERR
"[!]Caltulating Hp ";
106 for my $chr (@ChrID) {
108 my ($Cmax,$st,$ed,$pos)=($ChrLen{$chr},1,0);
109 my ($Smin,$Smax,$Hp,$sum,$snpcount);
110 while ($st <= $Cmax) {
111 $ed = $st + $win; # [$st,$ed]
112 $ed = $Cmax if $ed > $Cmax;
113 ($Smin,$Smax,$Hp,$sum,$snpcount)=(0,0,0,0,0);
114 for $pos ($st .. $ed) {
115 next unless exists $SNP{$chr}{$pos};
116 $Smin += $SNP{$chr}{$pos}->[0];
117 $Smax += $SNP{$chr}{$pos}->[1];
122 $Hp=2*$Smin*$Smax/($sum*$sum);
123 $Hpr=int($Hp*1000)/1000;
124 } else {$Hp=$Hpr='Inf';}
130 print O
"$chr\t$POSa\t$st\t$ed\t$Hp\t$Smin,$Smax,$sum\t$snpcount\n";
140 # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
141 #my $Std=sqrt($sXX/$N-$Avg*$Avg);
142 my $Std=sqrt(($sXX-$Avg*$sX)/($N-1));
144 $file=$opt_o.'.dat.tmp';
145 open I
,'<',$file or die "[x]Error opening $file: $!\n";
147 open O
,'>',$file or die "[x]Error opening $file: $!\n";
149 my ($chr, $POSa, $st, $ed, $Hp, $Smms, $snpcount)=split /\t/;
150 my $ZHp=($Hp-$Avg)/$Std;
151 print O
join("\t",$chr, $POSa, $st, $ed, $ZHp, $Hp, $Smms, $snpcount);
156 unlink $opt_o.'.dat.tmp';
158 print STDERR
"[!]Stating Hp ";
159 $file=$opt_o.'.stat';
160 open S
,'>',$file or die "[x]Error opening $file: $!\n";
161 print S
"# Avg:\t$Avg\n# Std:\t$Std\n# N: $N, sX: $sX, sXX: $sXX\n";
163 if ($_[0] eq 'Inf') { return 1; }
164 elsif ($_[1] eq 'Inf') { return -1; }
165 else { return $_[0] <=> $_[1]; }
168 print S
"ALL\t$_\t$Hpa{$_}\n" for sort(sortInf
keys %Hpa);
170 for my $chr (@ChrID) {
171 print S
"$chr\t$_\t$Hpc{$chr}{$_}\n" for sort(sortInf
keys %{$Hpc{$chr}});
174 warn "\n[!] Done !\n";
176 my $stop_time = [gettimeofday
];
177 print STDERR
"\nTime Elapsed:\t",tv_interval
( $start_time, $stop_time )," second(s).\n";
180 a
=read.delim
('v4Hp_40k_20k.dat',F
,comment
.char
='#')
182 out
<- function
(c
) {
183 png
(paste
("E:\\BGI\\toGit\\popuSNP\\new\\",c
,'.png',sep
=''),2048,768)
184 plot
(a
$V3[a
$V1==c
],a
$V5[a
$V1==c
],type
='p',xlab
=c
,ylab
='Hp')
190 # pdf("E:\\BGI\\toGit\\popuSNP\\new\\wm_a_w40k_s20k.pdf",12,9,fonts='Arial',title="Watermelon Hp density Plot")
191 plot
(density
(a
$V5),main
='Watermelon Hp density Plot',sub='Window:40k bp, Slip:20k bp')
194 plot
(a
$V2[a
$V5<=0],a
$V5[a
$V5<=0],type
='p',xlab
=c
,ylab
='Hp',cex
=.4)