4 use lib
'/nas/RD_09C/resequencing/soft/lib';
6 use Time
::HiRes qw
( gettimeofday tv_interval
);
7 use Statistics
::Regression
;
8 use Data
::Dump
qw(ddx);
12 our $opts='i:o:n:g:v:r:c:l:h:bqd';
13 our($opt_i, $opt_n, $opt_g, $opt_r, $opt_c, $opt_l, $opt_o, $opt_v, $opt_b, $opt_q, $opt_d, $opt_h);
16 \t-i Blast filted list (./markerblastf.lst)
17 \t-l linkage map list (./linkagemap.lst)
18 \t-n N zone file (Pa64.Nzone)
20 \t-v Verbose level (undef=0)
21 \t-b No pause for batch runs
22 \t-d Debug Mode, for test only
27 $opt_i='./markerblastf.lst' if ! $opt_i;
28 $opt_l='./linkagemap.lst' if ! $opt_l;
29 $opt_n='Pa64.Nzone' if ! $opt_n;
32 print STDERR
"From [$opt_i] to [$opt_o] refer to [$opt_n][$opt_l]\n";
33 print STDERR
"DEBUG Mode on !\n" if $opt_d;
34 print STDERR
"Verbose Mode [$opt_v] !\n" if $opt_v;
35 unless ($opt_b) {print STDERR
'press [Enter] to continue...'; <>;}
37 my $start_time = [gettimeofday
];
40 my (%LinkageMap,%cMcluster,%Marker);
42 open L
,'<',$opt_l or die "Error: $!\n";
45 my ($ChrID,$File)=split /\t/;
46 open LM
,'<',$File or die "Error: $!\n";
49 my ($Chr,$Pos,$cM)=split /\t/;
50 die "[x][$ChrID] not match [$File] !\n" if $Chr ne $ChrID;
51 $LinkageMap{$Chr}{$Pos}=$cM;
52 #$Marker{$Chr}{$cM}={$Pos};
57 sub splitMarkerid
($) {
59 my ($mChr,$mPos,$mSiderLen)=split /[_m]/,$MarkerID,3;
60 return [$mChr,$mPos,$mSiderLen];
63 my ($Qid,$Qs,$Qe,$Ss,$Se,$BTOP)=@_;
64 my ($mChr,$mPos,$mSiderLen)=@
{&splitMarkerid
($Qid)};
65 my $cM=$LinkageMap{$mChr}{$mPos};
66 unless (defined $cM) {
67 warn "[!]Cannot find Marker @ $mChr,$mPos !\n";
70 my $LeftBPalongQ=$mSiderLen-$Qs+1;
71 my $WalkingOn=$LeftBPalongQ;
72 my @btop0=split /(\D+)/,$BTOP;
73 # $ perl -le '$a="45YT9-Ac-c-c-11TC4";@b=split /(\D+)/,$a;print "[",join("|",@b),"]"'
74 # [45|YT|9|-Ac-c-c-|11|TC|4]
80 my @bin=split /([\w-]{2})/;
81 # $ perl -le '$a="-Ac-c-c-";@b=split /([\w-]{2})/,$a,0;print "[",join("|",@b),"]"'
91 if ($WalkingOn <= 0) {
92 print STDERR
"$Qid [$_] " if $opt_v and $WalkingOn == 0;
95 print STDERR
"-$Qid [$_]-" if $opt_v>1;
102 --$WalkingOn if $op[1] eq '-' and $op[0] ne '-';
103 ++$LeftBPalongS if $op[0] eq '-' and $op[1] ne '-';
109 print STDERR
"-$WalkingOn $LeftBPalongS\n" if $opt_v>1;
120 my $Spos=$Ss + $strand*$LeftBPalongS;
121 warn "$mChr,$Spos,$cM\n" if $opt_v;
122 return [$mChr,$Spos,$cM,$strand];
124 open L
,'<',$opt_i or die "Error: $!\n";
127 my ($ChrID,$File)=split /\t/;
128 open LM
,'<',$File or die "Error: $!\n";
133 my ($Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP,$Hit)=@Dat;
134 next if $Sid !~ /^chr/i;
135 my ($mChr,$mPos,$mSiderLen)=@
{&splitMarkerid
($Qid)};
136 next if $mChr ne $Sid; # Well, since we caltuate cM in such a way ... So, no duplication on same cM
137 next unless exists $LinkageMap{$mChr}{$mPos};
138 my $cM=$LinkageMap{$mChr}{$mPos};
139 if ($lastcM != $cM) {
141 my $weight=-log($E)/$Hit; # in case order problem, we will have to choose by weight
142 my ($chr,$pos,$cM,$strand)=@
{&getRel
($Qid,$Qs,$Qe,$Ss,$Se,$BTOP)};
143 #push @{$cMcluster{$Sid}{$cM}},[$pos,$strand,$weight];
144 $cMcluster{$Sid}{$cM}=[$pos,$strand,$weight,$Qid];
145 #warn "$Qid\t$Sid\t$pos,$strand,$weight\n" if scalar @{$cMcluster{$Sid}{$cM}} > 1;
152 for my $Sid (sort keys %cMcluster) {
153 my $reg = Statistics::Regression->new( "BP", [ "const", "cM" ] );
154 for my $cM (keys %{$cMcluster{$Sid}}) {
155 #for (@{$cMcluster{$Sid}{$cM}}) {
156 my ($pos,$strand,$weight)=@{$cMcluster{$Sid}{$cM}};
157 $weight /= 30 if $strand == -1; # Well, ...
158 $reg->include( $pos, [ 1.0, $cM ], $weight );
161 my ($b,$k) = $reg->theta;
164 for my $cM (keys %{$cMcluster{$Sid}}) {
165 my ($pos,$strand,$weight,@t)=@{$cMcluster{$Sid}{$cM}};
166 my $nPos=int(0.5+10*($k*$cM+$b))/10;
167 $cMcluster{$Sid}{$cM}=[$cM,$pos,$strand,$weight,@t,$nPos];
172 ddx \
%cMcluster if $opt_v>2;
173 for my $Sid (sort keys %cMcluster) {
175 for my $cM (sort {$a<=>$b} keys %{$cMcluster{$Sid}}) {
176 push @cMs,[$cM,@
{$cMcluster{$Sid}{$cM}}];
179 my $lastpos=$cMs[0][1];
180 print STDERR
"$Sid:\t" if $opt_v;
182 my ($cM,$pos,$strand,$weight)=@
{$cMs[$i]};
183 print STDERR
" $i|$cM" if $opt_v;
184 if ($pos >= $lastpos) {
188 my ($min,$max)=($i-5,$i+4); # 10 is a big local range, toka.
190 $max=$#cMs if $max>$#cMs;
191 my $reg = Statistics
::Regression
->new( "BP", [ "const", "cM" ] );
192 my ($N,$sX,$sXX,%t)=(0);
193 for my $j ($min..$max) {
194 my ($cM,$pos,$strand,$weight)=@
{$cMs[$j]};
196 #$weight /= 100 if $strand == -1; # Well, ...
197 next if $strand == -1; # Well, again, ...
198 $weight /= 5 if $j == $i or $j == $i-1;
199 $reg->include( $pos, [ 1.0, $cM ], $weight );
206 my ($B,$k) = $reg->theta;
207 for my $j ($min..$max) {
208 my ($cM,$pos,$strand,$weight)=@
{$cMs[$j]};
211 my $diff = $nPos- $pos;
214 $nPos=int(0.5+10*$nPos)/10;
215 $t{$j}=[$cM,$pos,$strand,$weight,$nPos,$diff];
218 # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
219 #my $Std=sqrt($sXX/$N-$Avg*$Avg);
220 my $Std=sqrt(($sXX-$Avg*$sX)/($N-1));
222 for (my $j=$max;$j>=$min;$j--) { # from high to low, so that we can splice it.
223 next unless $j == $i or $j == $i-1;
224 next unless exists $t{$j};
225 my ($cM,$pos,$strand,$weight,$nPos,$diff)=@
{$t{$j}};
227 $limit=1 if $strand == -1;
228 if (abs($diff) > $limit*$Std) {
232 delete $cMcluster{$Sid}{$cM};
233 print STDERR
"\n>$i|$cM" if $opt_v;
240 for my $j ($min..$i) {
241 my ($cM,$pos,$strand,$weight)=@
{$cMs[$j]};
244 my $diff = $nPos- $pos;
247 $nPos=int(0.5+10*$nPos)/10;
248 $t{$j}=[$cM,$pos,$strand,$weight,$nPos,$diff];
251 my @keys=sort { $t{$b}[5] <=> $t{$a}[5] } keys %t;
253 for my $j ($min..$i) {
254 next unless exists $t{$j};
255 my ($cM,$pos,$strand,$weight,$nPos,$diff)=@
{$t{$j}};
256 ++$NaboveT if $diff > $Avg;
259 if (2*$NaboveT > $N) { # the extra one is the small one.
265 my ($cM,$pos,$strand,$weight)=@
{$cMs[$key]};
269 delete $cMcluster{$Sid}{$cM};
270 print STDERR
"\n<$key|$cM" if $opt_v;
276 } # Still buggy when more than 1 point out of order ...
277 print STDERR
"\n\n" if $opt_v;
279 ddx \
%cMcluster if $opt_v>2;
281 open O
,'>',$opt_o or die "Error: $!\n";
282 print O
"#Marker\tChr\tcM\tPos\tStrand\tWeight\n";
283 for my $Sid (sort keys %cMcluster) {
284 for my $cM (sort {$a<=>$b} keys %{$cMcluster{$Sid}}) {
285 my ($pos,$strand,$weight,$Qid)=@
{$cMcluster{$Sid}{$cM}};
286 print O
join("\t",$Qid,$Sid,$cM,$pos,$strand,$weight),"\n"; # $pos,$strand,$weight
291 ./mrelocate
.pl
-o markerpospa64
.dat
.0