tools/linkage/mrelocate.pl

   1 #!/bin/env perl
   2 use strict;
   3 use warnings;
   4 use lib '/nas/RD_09C/resequencing/soft/lib';
   5 use Galaxy::ShowHelp;
   6 use Time::HiRes qw ( gettimeofday tv_interval );
   7 use Statistics::Regression;
   8 use Data::Dump qw(ddx);
   9
  10 $main::VERSION=0.0.1;
  11
  12 our $opts='i:o:n:g:v:r:c:l:h:bqd';
  13 our($opt_i, $opt_n, $opt_g, $opt_r, $opt_c, $opt_l, $opt_o, $opt_v, $opt_b, $opt_q, $opt_d, $opt_h);
  14
  15 our $help=<<EOH;
  16 \t-i Blast filted list (./markerblastf.lst)
  17 \t-l linkage map list (./linkagemap.lst)
  18 \t-n N zone file (Pa64.Nzone)
  19 \t-o Output file
  20 \t-v Verbose level (undef=0)
  21 \t-b No pause for batch runs
  22 \t-d Debug Mode, for test only
  23 EOH
  24
  25 ShowHelp();
  26
  27 $opt_i='./markerblastf.lst' if ! $opt_i;
  28 $opt_l='./linkagemap.lst' if ! $opt_l;
  29 $opt_n='Pa64.Nzone' if ! $opt_n;
  30 $opt_v=0 if ! $opt_v;
  31
  32 print STDERR "From [$opt_i] to [$opt_o] refer to [$opt_n][$opt_l]\n";
  33 print STDERR "DEBUG Mode on !\n" if $opt_d;
  34 print STDERR "Verbose Mode [$opt_v] !\n" if $opt_v;
  35 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
  36
  37 my $start_time = [gettimeofday];
  38 #BEGIN
  39
  40 my (%LinkageMap,%cMcluster,%Marker);
  41
  42 open L,'<',$opt_l or die "Error: $!\n";
  43 while (<L>) {
  44         chomp;
  45         my ($ChrID,$File)=split /\t/;
  46         open LM,'<',$File or die "Error: $!\n";
  47         while (<LM>) {
  48                 next if /^#/;
  49                 my ($Chr,$Pos,$cM)=split /\t/;
  50                 die "[x][$ChrID] not match [$File] !\n" if $Chr ne $ChrID;
  51                 $LinkageMap{$Chr}{$Pos}=$cM;
  52                 #$Marker{$Chr}{$cM}={$Pos};
  53         }
  54 }
  55 close L;
  56
  57 sub splitMarkerid($) {
  58         my $MarkerID=$_[0];
  59         my ($mChr,$mPos,$mSiderLen)=split /[_m]/,$MarkerID,3;
  60         return [$mChr,$mPos,$mSiderLen];
  61 }
  62 sub getRel($$$$$) {
  63         my ($Qid,$Qs,$Qe,$Ss,$Se,$BTOP)=@_;
  64         my ($mChr,$mPos,$mSiderLen)=@{&splitMarkerid($Qid)};
  65         my $cM=$LinkageMap{$mChr}{$mPos};
  66         unless (defined $cM) {
  67                 warn "[!]Cannot find Marker @ $mChr,$mPos !\n";
  68                 $cM=0;
  69         }
  70         my $LeftBPalongQ=$mSiderLen-$Qs+1;
  71         my $WalkingOn=$LeftBPalongQ;
  72         my @btop0=split /(\D+)/,$BTOP;
  73         # $ perl -le '$a="45YT9-Ac-c-c-11TC4";@b=split /(\D+)/,$a;print "[",join("|",@b),"]"'
  74         # [45|YT|9|-Ac-c-c-|11|TC|4]
  75         my @btop=();
  76         for (@btop0) {
  77                 if (/\d+/) {
  78                         push @btop,$_;
  79                 } else {
  80                         my @bin=split /([\w-]{2})/;
  81                         # $ perl -le '$a="-Ac-c-c-";@b=split /([\w-]{2})/,$a,0;print "[",join("|",@b),"]"'
  82                         # [|-A||c-||c-||c-]
  83                         for (@bin) {
  84                                 next unless $_;
  85                                 push @btop,$_;
  86                         }
  87                 }
  88         }
  89         my $LeftBPalongS=0;
  90         for (@btop) {
  91                 if ($WalkingOn <= 0) {
  92                         print STDERR "$Qid [$_] " if $opt_v and $WalkingOn == 0;
  93                         last;
  94                 }
  95                 print STDERR "-$Qid [$_]-" if $opt_v>1;
  96                 if (/\d/) {
  97                         $LeftBPalongS += $_;
  98                         $WalkingOn -= $_;
  99                 } else {
 100                         my @op=split //;
 101                         if (/-/) {
 102                                 --$WalkingOn if $op[1] eq '-' and $op[0] ne '-';
 103                                 ++$LeftBPalongS if $op[0] eq '-' and $op[1] ne '-';
 104                         } else {
 105                                 --$WalkingOn;
 106                                 ++$LeftBPalongS;
 107                         }
 108                 }
 109                 print STDERR "-$WalkingOn $LeftBPalongS\n" if $opt_v>1;
 110                 my $NOP;
 111         }
 112         my $strand;
 113         if ($Ss < $Se) {
 114                 $strand=1;
 115                 #$mChr .= "\t+";
 116         } else {
 117                 $strand=-1;
 118                 #$mChr .= "\t-";
 119         }
 120         my $Spos=$Ss + $strand*$LeftBPalongS;
 121         warn "$mChr,$Spos,$cM\n" if $opt_v;
 122         return [$mChr,$Spos,$cM,$strand];
 123 }
 124 open L,'<',$opt_i or die "Error: $!\n";
 125 while (<L>) {
 126         chomp;
 127         my ($ChrID,$File)=split /\t/;
 128         open LM,'<',$File or die "Error: $!\n";
 129         my $lastcM=-1;
 130         while (<LM>) {
 131                 chomp;
 132                 my @Dat=split /\t/;
 133                 my ($Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP,$Hit)=@Dat;
 134                 next if $Sid !~ /^chr/i;
 135                 my ($mChr,$mPos,$mSiderLen)=@{&splitMarkerid($Qid)};
 136                 next if $mChr ne $Sid;  # Well, since we caltuate cM in such a way ... So, no duplication on same cM
 137                 next unless exists $LinkageMap{$mChr}{$mPos};
 138                 my $cM=$LinkageMap{$mChr}{$mPos};
 139                 if ($lastcM != $cM) {
 140                         $lastcM=$cM;
 141                         my $weight=-log($E)/$Hit;       # in case order problem, we will have to choose by weight
 142                         my ($chr,$pos,$cM,$strand)=@{&getRel($Qid,$Qs,$Qe,$Ss,$Se,$BTOP)};
 143                         #push @{$cMcluster{$Sid}{$cM}},[$pos,$strand,$weight];
 144                         $cMcluster{$Sid}{$cM}=[$pos,$strand,$weight,$Qid];
 145                         #warn "$Qid\t$Sid\t$pos,$strand,$weight\n" if scalar @{$cMcluster{$Sid}{$cM}} > 1;
 146                 }
 147         }
 148 }
 149 close L;
 150 #ddx \%LinkageMap;
 151 =pod
 152 for my $Sid (sort keys %cMcluster) {
 153         my $reg = Statistics::Regression->new( "BP", [ "const", "cM" ] );
 154         for my $cM (keys %{$cMcluster{$Sid}}) {
 155                 #for (@{$cMcluster{$Sid}{$cM}}) {
 156                         my ($pos,$strand,$weight)=@{$cMcluster{$Sid}{$cM}};
 157                         $weight /= 30 if $strand == -1; # Well, ...
 158                         $reg->include( $pos, [ 1.0, $cM ], $weight );
 159                 #}
 160         }
 161         my ($b,$k) = $reg->theta;
 162         warn "$Sid,$k,$b\n";
 163         $reg->print();
 164         for my $cM (keys %{$cMcluster{$Sid}}) {
 165                 my ($pos,$strand,$weight,@t)=@{$cMcluster{$Sid}{$cM}};
 166                 my $nPos=int(0.5+10*($k*$cM+$b))/10;
 167                 $cMcluster{$Sid}{$cM}=[$cM,$pos,$strand,$weight,@t,$nPos];
 168         }
 169 }
 170 =cut
 171 #__END__
 172 ddx \%cMcluster if $opt_v>2;
 173 for my $Sid (sort keys %cMcluster) {
 174         my @cMs;
 175         for my $cM (sort {$a<=>$b} keys %{$cMcluster{$Sid}}) {
 176                 push @cMs,[$cM,@{$cMcluster{$Sid}{$cM}}];
 177         }
 178         my $i=1;
 179         my $lastpos=$cMs[0][1];
 180         print STDERR "$Sid:\t" if $opt_v;
 181         while ($i<=$#cMs) {
 182                 my ($cM,$pos,$strand,$weight)=@{$cMs[$i]};
 183                 print STDERR "   $i|$cM" if $opt_v;
 184                 if ($pos >= $lastpos) {
 185                         $lastpos=$pos;
 186                         ++$i;
 187                 } else {
 188                         my ($min,$max)=($i-5,$i+4);     # 10 is a big local range, toka.
 189                         $min=0 if $min<0;
 190                         $max=$#cMs if $max>$#cMs;
 191                         my $reg = Statistics::Regression->new( "BP", [ "const", "cM" ] );
 192                         my ($N,$sX,$sXX,%t)=(0);
 193                         for my $j ($min..$max) {
 194                                 my ($cM,$pos,$strand,$weight)=@{$cMs[$j]};
 195                                 next unless $cM;
 196                                 #$weight /= 100 if $strand == -1;       # Well, ...
 197                                 next if $strand == -1;  # Well, again, ...
 198                                 $weight /= 5 if $j == $i or $j == $i-1;
 199                                 $reg->include( $pos, [ 1.0, $cM ], $weight );
 200                                 ++$N;
 201                         }
 202                         if ($N<4) {
 203                                 ++$i;
 204                                 next;
 205                         }
 206                         my ($B,$k) = $reg->theta;
 207                         for my $j ($min..$max) {
 208                                 my ($cM,$pos,$strand,$weight)=@{$cMs[$j]};
 209                                 next unless $cM;
 210                                 my $nPos=$k*$cM+$B;
 211                                 my $diff = $nPos- $pos;
 212                                 $sX += $diff;
 213                                 $sXX += $diff*$diff;
 214                                 $nPos=int(0.5+10*$nPos)/10;
 215                                 $t{$j}=[$cM,$pos,$strand,$weight,$nPos,$diff];
 216                         }
 217                         my $Avg=$sX/$N;
 218                         # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 219                         #my $Std=sqrt($sXX/$N-$Avg*$Avg);
 220                         my $Std=sqrt(($sXX-$Avg*$sX)/($N-1));
 221                         my $N_rm=0;
 222                         for (my $j=$max;$j>=$min;$j--) {        # from high to low, so that we can splice it.
 223                                 next unless $j == $i or $j == $i-1;
 224                                 next unless exists $t{$j};
 225                                 my ($cM,$pos,$strand,$weight,$nPos,$diff)=@{$t{$j}};
 226                                 my $limit=2.9;
 227                                 $limit=1 if $strand == -1;
 228                                 if (abs($diff) > $limit*$Std) {
 229                                         splice @cMs,$j,1;
 230                                         --$i;
 231                                         $lastpos=0;
 232                                         delete $cMcluster{$Sid}{$cM};
 233                                         print STDERR "\n>$i|$cM" if $opt_v;
 234                                         ++$N_rm;
 235                                 }
 236                         }
 237                         if ($N_rm < 1) {
 238                                 %t=();
 239                                 $sX=$N=0;
 240                                 for my $j ($min..$i) {
 241                                         my ($cM,$pos,$strand,$weight)=@{$cMs[$j]};
 242                                         next unless $cM;
 243                                         my $nPos=$k*$cM+$B;
 244                                         my $diff = $nPos- $pos;
 245                                         $sX += $diff;
 246                                         ++$N;
 247                                         $nPos=int(0.5+10*$nPos)/10;
 248                                         $t{$j}=[$cM,$pos,$strand,$weight,$nPos,$diff];
 249                                 }
 250                                 $Avg=$sX/$N;
 251                                 my @keys=sort { $t{$b}[5] <=> $t{$a}[5] } keys %t;
 252                                 my $NaboveT=0;
 253                                 for my $j ($min..$i) {
 254                                         next unless exists $t{$j};
 255                                         my ($cM,$pos,$strand,$weight,$nPos,$diff)=@{$t{$j}};
 256                                         ++$NaboveT if $diff > $Avg;
 257                                 }
 258                                 my $key;
 259                                 if (2*$NaboveT > $N) {  # the extra one is the small one.
 260                                         $key=$keys[-1];
 261                                 } else {
 262                                         $key=$keys[0];
 263                                 }
 264                                 {
 265                                         my ($cM,$pos,$strand,$weight)=@{$cMs[$key]};
 266                                         splice @cMs,$key,1;
 267                                         --$i;
 268                                         $lastpos=0;
 269                                         delete $cMcluster{$Sid}{$cM};
 270                                         print STDERR "\n<$key|$cM" if $opt_v;
 271                                         ++$N_rm;
 272                                 }
 273                         }
 274                         ++$i;
 275                 }
 276         }       # Still buggy when more than 1 point out of order ...
 277         print STDERR "\n\n" if $opt_v;
 278 }
 279 ddx \%cMcluster if $opt_v>2;
 280
 281 open O,'>',$opt_o or die "Error: $!\n";
 282 print O "#Marker\tChr\tcM\tPos\tStrand\tWeight\n";
 283 for my $Sid (sort keys %cMcluster) {
 284         for my $cM (sort {$a<=>$b} keys %{$cMcluster{$Sid}}) {
 285                 my ($pos,$strand,$weight,$Qid)=@{$cMcluster{$Sid}{$cM}};
 286                 print O join("\t",$Qid,$Sid,$cM,$pos,$strand,$weight),"\n";     # $pos,$strand,$weight
 287         }
 288 }
 289 close O;
 290 __END__
 291 ./mrelocate.pl -o markerpospa64.dat.0