new file: cell2loc.py
[GalaxyCodeBases.git] / perl / popuSNP / new / single_hp.pl
blob29d82300d1d8053752d6ed5d2fa5cf813f2e08ab
1 #!/bin/env perl
2 use lib '/share/raid010/resequencing/soft/lib';
3 use lib 'E:/BGI/toGit/perlib/etc';
4 use strict;
5 use warnings;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Galaxy::ShowHelp;
9 $main::VERSION=0.0.3;
11 our $opts='i:c:w:l:o:vb';
12 our ($opt_i, $opt_o, $opt_c, $opt_v, $opt_b, $opt_l, $opt_w);
14 our $help=<<EOH;
15 \t-i Population SNP list (./psnp.lst) for *.individual.finalSNPs
16 \t-c Chr Info file (./chr.nfo) in format [ChrID\\tLen]
17 \t-w window size (40000) bp
18 \t-l step length (20000) bp
19 \t-o Output Prefix (./Hp_result)
20 \t Output Files are {./Hp_result}.dat \& {./Hp_result}.stat
21 \t-v show verbose info to STDOUT
22 \t-b No pause for batch runs
23 EOH
25 ShowHelp();
27 $opt_o='./Hp_result' if ! defined $opt_o;
28 $opt_i='./psnp.lst' if ! $opt_i;
29 $opt_c='./chr.nfo' if ! $opt_c;
30 $opt_w=40000 if ! $opt_w;
31 $opt_l=20000 if ! $opt_l;
33 print STDERR "From [$opt_i] to [$opt_o].{dat,stat}, with [$opt_c][$opt_l][$opt_w]\n";
34 if (! $opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
36 #http://doc.bioperl.org/bioperl-live/Bio/Tools/IUPAC.html#BEGIN1
37 our %IUB = ( A => [qw(A A)],
38 C => [qw(C C)],
39 G => [qw(G G)],
40 T => [qw(T T)],
41 U => [qw(U U)],
42 M => [qw(A C)],
43 R => [qw(A G)],
44 W => [qw(A T)],
45 S => [qw(C G)],
46 Y => [qw(C T)],
47 K => [qw(G T)],
48 V => [qw(A C G)],
49 H => [qw(A C T)],
50 D => [qw(A G T)],
51 B => [qw(C G T)],
52 X => [qw(G A T C)],
53 N => [qw(G A T C)]
56 my $start_time = [gettimeofday];
58 system('mkdir','-p',$opt_o);
59 system('rmdir',$opt_o); # if $opt_o =~ m#/[\s.]*[^/\s.]+[^/]*$#;
61 my (%ChrLen,@ChrID);
62 print STDERR "[!]Load ChrNFO:\t";
63 open C,'<',$opt_c or die "[x]Error opening $opt_c: $!\n";
64 while (<C>) {
65 chomp;
66 my ($chr,$len)=split /\t/;
67 $ChrLen{$chr}=$len;
68 push @ChrID,$chr;
69 print STDERR "$chr,$len\t"
71 warn "\n";
73 my $win=$opt_w-1;
75 my %SNP;
76 print STDERR "[!]Parsing PSNP: .";
77 open P,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
78 while (my $file=<P>) {
79 chomp $file;
80 open SNP,'<',$file or (warn "\n[!]Error opening $file: $!\n" and next);
81 print STDERR ".\b";
82 my ($chr,$pos,$ref,$tail,$i,@SNPCounts);
83 while (<SNP>) {
84 ($chr,$pos,$ref,$tail)=split /\t/;
85 my %SNPcount;
86 for (split / /,$tail) {
87 next unless /[ACGTRYMKSWHBVDNX]/;
88 ++$SNPcount{$_} for @{$IUB{$_}};
90 @SNPCounts=sort {$a <=> $b} values %SNPcount;
91 next if $#SNPCounts < 1; # at least 2-1=1
92 #next if $SNPCounts[0] == $SNPCounts[-1]; # min != max (needed ?)
93 $SNP{$chr}{$pos}=[$SNPCounts[0],$SNPCounts[-1]]; # [min,max]
95 print STDERR "-";
96 close SNP;
98 close P;
99 warn ".\n";
101 my $file=$opt_o.'.dat.tmp';
102 open O,'>',$file or die "[x]Error opening $file: $!\n";
104 my ($POSa,$N,%Hpc,%Hpa,$Hpr,$sX,$sXX)=(1,0);
105 print STDERR "[!]Caltulating Hp ";
106 for my $chr (@ChrID) {
107 print STDERR ".\b";
108 my ($Cmax,$st,$ed,$pos)=($ChrLen{$chr},1,0);
109 my ($Smin,$Smax,$Hp,$sum,$snpcount);
110 while ($st <= $Cmax) {
111 $ed = $st + $win; # [$st,$ed]
112 $ed = $Cmax if $ed > $Cmax;
113 ($Smin,$Smax,$Hp,$sum,$snpcount)=(0,0,0,0,0);
114 for $pos ($st .. $ed) {
115 next unless exists $SNP{$chr}{$pos};
116 $Smin += $SNP{$chr}{$pos}->[0];
117 $Smax += $SNP{$chr}{$pos}->[1];
118 ++$snpcount;
120 $sum=$Smin+$Smax;
121 if ($sum>0) {
122 $Hp=2*$Smin*$Smax/($sum*$sum);
123 $Hpr=int($Hp*1000)/1000;
124 } else {$Hp=$Hpr='Inf';}
125 ++$Hpc{$chr}{$Hpr};
126 ++$Hpa{$Hpr};
127 ++$N;
128 $sX += $Hp;
129 $sXX += $Hp*$Hp;
130 print O "$chr\t$POSa\t$st\t$ed\t$Hp\t$Smin,$Smax,$sum\t$snpcount\n";
131 $st += $opt_l;
132 $POSa += $opt_l;
134 print STDERR '-';
136 close O;
137 print STDERR "|\b";
139 my $Avg=$sX/$N;
140 # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
141 #my $Std=sqrt($sXX/$N-$Avg*$Avg);
142 my $Std=sqrt(($sXX-$Avg*$sX)/($N-1));
144 $file=$opt_o.'.dat.tmp';
145 open I,'<',$file or die "[x]Error opening $file: $!\n";
146 $file=$opt_o.'.dat';
147 open O,'>',$file or die "[x]Error opening $file: $!\n";
148 while (<I>) {
149 my ($chr, $POSa, $st, $ed, $Hp, $Smms, $snpcount)=split /\t/;
150 my $ZHp=($Hp-$Avg)/$Std;
151 print O join("\t",$chr, $POSa, $st, $ed, $ZHp, $Hp, $Smms, $snpcount);
153 close O;
154 warn ";\n";
155 close I;
156 unlink $opt_o.'.dat.tmp';
158 print STDERR "[!]Stating Hp ";
159 $file=$opt_o.'.stat';
160 open S,'>',$file or die "[x]Error opening $file: $!\n";
161 print S "# Avg:\t$Avg\n# Std:\t$Std\n# N: $N, sX: $sX, sXX: $sXX\n";
162 sub sortInf($$) {
163 if ($_[0] eq 'Inf') { return 1; }
164 elsif ($_[1] eq 'Inf') { return -1; }
165 else { return $_[0] <=> $_[1]; }
168 print S "ALL\t$_\t$Hpa{$_}\n" for sort(sortInf keys %Hpa);
170 for my $chr (@ChrID) {
171 print S "$chr\t$_\t$Hpc{$chr}{$_}\n" for sort(sortInf keys %{$Hpc{$chr}});
173 close S;
174 warn "\n[!] Done !\n";
176 my $stop_time = [gettimeofday];
177 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
179 __END__
180 a=read.delim('v4Hp_40k_20k.dat',F,comment.char='#')
182 out <- function (c) {
183 png(paste("E:\\BGI\\toGit\\popuSNP\\new\\",c,'.png',sep=''),2048,768)
184 plot(a$V3[a$V1==c],a$V5[a$V1==c],type='p',xlab=c,ylab='Hp')
185 dev.off()
188 out('seg09')
190 # pdf("E:\\BGI\\toGit\\popuSNP\\new\\wm_a_w40k_s20k.pdf",12,9,fonts='Arial',title="Watermelon Hp density Plot")
191 plot(density(a$V5),main='Watermelon Hp density Plot',sub='Window:40k bp, Slip:20k bp')
192 dev.off()
194 plot(a$V2[a$V5<=0],a$V5[a$V5<=0],type='p',xlab=c,ylab='Hp',cex=.4)