limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / tools / linkage / pluginScaff.pl
blob8500b284b254c8e34ca7d1570f23e3a93c6a9714
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use lib '/nas/RD_09C/resequencing/soft/lib';
5 use Galaxy::ShowHelp;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Data::Dump qw(ddx);
9 $main::VERSION=0.0.1;
11 our $opts='i:o:n:g:v:r:c:l:p:bqd';
12 our($opt_i, $opt_n, $opt_g, $opt_r, $opt_c, $opt_l, $opt_o, $opt_v, $opt_b, $opt_q, $opt_d, $opt_h, $opt_p);
14 our $help=<<EOH;
15 \t-i Genome FASTA (./Pa64_genome.fa)
16 \t-p Anchor Pos file (./f3545ChrScaff.pos.n3)
17 \t-n N zone file (Pa64.Nzone)
18 \t-o Output prefix
19 \t-v Verbose level (undef=0)
20 \t-b No pause for batch runs
21 \t-d Debug Mode, for test only
22 EOH
24 ShowHelp();
26 $opt_i='./Pa64_genome.fa' if ! $opt_i;
27 $opt_p='./f3545ChrScaff.pos.n3' if ! $opt_p;
28 $opt_n='Pa64.Nzone' if ! $opt_n;
29 $opt_v=0 if ! $opt_v;
31 print STDERR "From [$opt_i] to [$opt_o].{fa,stat} refer to [$opt_n][$opt_p]\n";
32 print STDERR "DEBUG Mode on !\n" if $opt_d;
33 print STDERR "Verbose Mode [$opt_v] !\n" if $opt_v;
34 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
36 my $start_time = [gettimeofday];
37 #BEGIN
38 my (%Nzone,%SacffPos,%Genome,%GenomPos);
39 open I,'<',$opt_n or die $!;
40 while (<I>) {
41 #chomp;
42 my ($chr,$s,$e,$len)=split /\t/;
43 push @{$Nzone{$chr}},[$s,$e];
45 close I;
46 open I,'<',$opt_p or die $!;
47 while (<I>) {
48 next if /^#/;
49 #chomp;
50 my ($ScaffID,$sPosRel,$chr,$strand,$scM,$pos,$err)=split /\t/;
51 push @{$SacffPos{$chr}},[$ScaffID,$strand,$sPosRel,$pos,$err];
53 close I;
54 open I,'<',$opt_i or die $!;
56 local $/=">";
57 $_=<I>;
58 die "[x]Not a FASTA file ! [$opt_i]\n" unless /^\s*>/;
60 while (<I>) {
61 chomp;
62 my ($id,$desc)=split / /,$_,2;
63 $desc='' unless $desc;
64 $/=">";
65 my $seq=<I>;
66 chomp $seq;
67 $seq =~ s/\s//g;
68 $/="\n";
69 print STDERR ">$id,\t[$desc]: ",length $seq,"\n";
70 $Genome{$id}=$seq;
71 $GenomPos{$id}=0;
73 close I;
75 sub getChrSeq($$) { # Well, no more N triming.
76 my ($id,$end)=@_;
77 my $last=$GenomPos{$id};
78 $GenomPos{$id}=$end;
79 my $str;
80 if ($end != -1) {
81 $str=substr $Genome{$id},$last,$end-$last;
82 } else {
83 $str=substr $Genome{$id},$last;
85 if ($str) {
86 $str =~ s/^NN+/N/;
87 $str .= "\n";
88 warn "[!]$id\t$last\t",$end-$last,"\n";
90 return \$str;
92 sub searchtoPlug($$$$) {
93 my ($chr,$pos,$left,$right)=@_;
94 my $theZone;
95 for (@{$Nzone{$chr}}) {
96 my ($s,$e)=@$_;
97 next if $e < $pos;
98 $theZone=$_;
99 last if $s > $pos;
101 unless ($theZone) {
102 $theZone=[1,1];
103 for (@{$Nzone{$chr}}) {
104 my ($s,$e)=@$_;
105 next if $e < $left;
106 $theZone=$_;
107 last if $s > $right;
110 my ($thePos,$cutL,$cutR);
111 if ($pos>$theZone->[0] and $pos<$theZone->[1]) { # in zone
112 $thePos=$pos;
113 ($cutL,$cutR)=($thePos-$theZone->[0]-1,$theZone->[1]-$thePos-1);
114 } else {
115 $thePos=$theZone->[0];
116 $cutL=0;
117 $cutR=$theZone->[1]-$theZone->[0]-1;
119 return [$thePos,$cutL,$cutR];
121 sub revcom($) {
122 my $str = $_[0];
123 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
124 my $rev = reverse $str;
125 $rev =~ tr/[](){}<>/][)(}{></;
126 return $rev;
129 open S,'>',$opt_o.'.stat' or die $!;
130 open O,'>',$opt_o.'.fa' or die $!;
131 for my $chr (sort keys %SacffPos) {
132 my $lastPos=1;
133 print O ">$chr\n";
134 print S "[$chr]\n";
135 warn ">$chr\n";
136 for my $ref (sort {$a->[3] <=> $b->[3]} @{$SacffPos{$chr}}) {
137 my ($ScaffID,$strand,$sPosRel,$pos,$err)=@$ref;
138 my $ScaffLen=length $Genome{$ScaffID};
139 my ($ScaffLeft,$ScaffRight);
140 my $str;
141 if ($strand eq '+') {
142 $ScaffLeft=1-$sPosRel;
143 $ScaffRight=$ScaffLen-$sPosRel;
144 $str=$Genome{$ScaffID};
145 } else {
146 $ScaffLeft=$sPosRel-$ScaffLen;
147 $ScaffRight=$sPosRel-1;
148 $str=revcom($Genome{$ScaffID});
150 my ($left,$right)=($pos+$ScaffLeft-$err,$pos+$ScaffRight+$err); # this should be wrong but should be right.
151 my ($thePos,$cutL,$cutR)=@{&searchtoPlug($chr,$pos,$left,$right)};
152 print S "$ScaffID\t",length($Genome{$ScaffID}),"\t$strand\t$thePos\t$cutL,$cutR\t$thePos\n";
153 print O ${&getChrSeq($chr,$thePos-1)},"x",$str,"x\n";
154 #$lastPos=$thePos+$cutR;
155 delete $Genome{$ScaffID};
157 print O ${&getChrSeq($chr,-1)},"\n";
158 print S "\n";
160 my $SCount=0;
161 for my $id (keys %Genome) {
162 next if $id =~ /^chr/i;
163 ++$SCount;
164 print O '>',$id,"\n",$Genome{$id},"\n";
166 close O;
167 close S;
168 warn "[!]$SCount Scaffold(s) remains.\n";
170 __END__
171 ./pluginScaff.pl -o PA64new.fa