new file: cell2loc.py
[GalaxyCodeBases.git] / perl / popuSNP / conasmS.pl
blob7e78df2aae76e5c3aada70168994d67fddfbbda1
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 #use DBI;
5 use lib 'E:/BGI/toGit/perlib/etc';
6 use lib '/nas/RD_09C/resequencing/soft/lib';
7 use Galaxy::ShowHelp;
9 $main::VERSION=0.1.4;
11 our $opts='i:o:c:m:p:sbvne';
12 our($opt_i, $opt_o, $opt_c, $opt_v, $opt_b, $opt_n, $opt_e, $opt_m, $opt_p, $opt_s);
14 our $desc='Genome creater for Single analyse';
15 our $help=<<EOH;
16 \t-c Consensus file with refbase
17 \t-p final SNP file (undef for directly from CNS)
18 \t-i filtered Indel file
19 \t-m merge.list for dividing mixed Chrs if any
20 \t-o Output file prefix for the sample, directories must exist
21 \t-n mask noncoverage bases to "n", otherwise little cased
22 \t-s change hete SNP to homo by subtracting Ref base
23 \t-e skip hete indels, otherwise both homo and hete(little cased) used
24 \t-v show verbose info to STDOUT
25 \t-b No pause for batch runs
26 EOH
28 ShowHelp();
30 our %IUB = ( A => [qw(A)],
31 C => [qw(C)],
32 G => [qw(G)],
33 T => [qw(T)],
34 U => [qw(U)],
35 M => [qw(A C)],
36 R => [qw(A G)],
37 W => [qw(A T)],
38 S => [qw(C G)],
39 Y => [qw(C T)],
40 K => [qw(G T)],
41 V => [qw(A C G)],
42 H => [qw(A C T)],
43 D => [qw(A G T)],
44 B => [qw(C G T)],
45 X => [qw(G A T C)],
46 N => [qw(G A T C)]
49 warn "[x]Must specify -c xx.consensus.txt !\n" unless $opt_c;
50 warn "[x]Must specify -i xx.indel.txt.filter !\n" unless $opt_i;
51 warn "[x]Must specify -o ./xx !\n" unless $opt_o;
52 exit 1 unless $opt_i and $opt_o and $opt_c;
53 if ($opt_m) {
54 die "[x]Invalid merge.list: [$opt_m] !\n" unless (-s $opt_m);
56 if ($opt_p) {
57 die "[x]Invalid SNP file: [$opt_p] !\n" unless (-s $opt_p);
60 print STDERR "From [$opt_c][$opt_i] to [$opt_o]\n ";
61 print STDERR "merge.list is [$opt_m]\n " if $opt_m;
62 print STDERR "SNP file is [$opt_p]\n " if $opt_p;
63 print STDERR "noncoverage mask to n. " if $opt_n;
64 print STDERR "hete indels will be skiped." if $opt_e;
65 warn "\n";
66 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
68 my (%Genome,%Depth,%Merged,%SNP,$snpChr);
69 #my %attr = (
70 # RaiseError => 0,
71 # PrintError => 1,
72 # AutoCommit => 0
73 #);
74 #my $dbh = DBI->connect('dbi:SQLite:dbname=:memory:','','',\%attr) or die $DBI::errstr;
75 #my $sql=q/
76 #CREATE TABLE IF NOT EXISTS merge
77 #( chrid TEXT,
78 # scaffold TEXT,
79 # start INTEGER,
80 # end INTEGER );
81 #/;
82 #$dbh->do($sql) or die $dbh->errstr;
83 #$dbh->commit;
84 #my $sthi = $dbh->prepare( 'INSERT INTO merge ( chrid,scaffold,start,end ) VALUES ( ?,?,?,? )' );
85 #my $stho = $dbh->prepare( 'SELECT DISTINCT scaffold,start,end FROM merge WHERE chrid=? AND ? BETWEEN start AND end' );
87 if ($opt_m) {
88 open M,'<',$opt_m or die "Error opening $opt_m: $!\n";
89 while (<M>) {
90 chomp;
91 my ($chrid,$scaffold,$start,$end)=split /\t/;
92 #$sthi->execute($chrid,$scaffold,$start,$end);
93 $Merged{$chrid}{$start}=[$scaffold,$end];
95 close M;
96 warn "\n[!]merge.list done.\n";
98 if ($opt_p) {
99 open M,'<',$opt_p or die "Error opening $opt_p: $!\n";
100 while (<M>) {
101 #chomp;
102 my ($chrid,$pos,$ref,$cons)=split /\t/;
103 #$sthi->execute($chrid,$scaffold,$start,$end);
104 =pod
105 print "SNP:$chrid,$pos,$ref,$cons -> " if $opt_v;
106 if ($opt_s) {
107 my @bases=@{$IUB{$cons}};
108 for (@bases) {
109 if ($_ ne $ref) {
110 $cons=$_;
111 last;
115 print "$cons\n" if $opt_v;
116 =cut
117 $SNP{$chrid}{$pos}=$cons;
119 close M;
120 warn "\n[!]SNP file done.\n";
123 sub prograss_bar($$) {
124 my ($i,$total) = ($_[0],$_[1]);
125 local $| = 1;
126 my $pos=int(($i/$total)*50);
127 print "\r [" , '=' x $pos, ($pos<50)?'>':'=', ' ' x (49 - $pos), '] ';
128 printf("%2.1f %% ",$i/$total*100);
129 local $| = 0;
133 #open CNS,'<',$opt_c or die "Error opening $opt_c: $!\n";
134 if ($opt_c =~ /\.gz$/) {
135 open( CNS,'-|',"gzip -dc $opt_c") or die "[x]Error opening $opt_c: $!\n";
136 } elsif ($opt_c =~ /\.bz2$/) {
137 open( CNS,'-|',"bzip2 -dc $opt_c") or die "[x]Error opening $opt_c: $!\n";
138 } else {open( CNS,'<',$opt_c) or die "[x]Error opening $opt_c: $!\n";}
140 my $FL = -s $opt_c;
141 my $count=0;
142 print "[!]CNS:\n";
143 my ($chr,$pos,$best,$depth,$ref,$indel,$bases,$homhet,$strand,$t,@bases);
144 while (<CNS>) {
145 chomp;
146 ($chr,$pos,$ref,$best,undef,undef,undef,undef,undef,undef,undef,undef,undef,$depth) = split /\t/;
147 unless ($ref) { # if file not completed.
148 print STDERR '.';
149 next;
151 ++$count;
152 &prograss_bar((tell CNS),$FL) if $count%1000 == 0;
153 if ($opt_s and $best !~ /[ATCG]/) {
154 @bases=@{$IUB{$best}};
155 for (@bases) {
156 if ($_ ne $ref) {
157 $best=$_;
158 @bases=();
159 last;
163 if ($depth==0) {
164 unless ($opt_n) {$best=lc $ref;}
165 else {$best=$ref='n';}
166 } else {$Depth{$chr}->{$pos}=chr $depth;} # string is smaller than double
167 # WARING: Something might be wrong if $depth > 255 !
168 # However, people should not have such much money.
169 #$best = $ref; # DEBUG ONLY !!!
170 if ($opt_p) {
171 if ($SNP{$chr}{$pos}) {$Genome{$chr}->[$pos]=$best;}
172 else {$Genome{$chr}->[$pos]=$ref;}
173 #warn '.' if $best ne $SNP{$chr}{$pos};
174 } else { $Genome{$chr}->[$pos]=$best; }
175 #$Depth{$chr}->[$pos]=$depth;
177 print "\n";
178 close CNS;
179 warn "[!]CNS done.\n";
181 open INDEL,'<',$opt_i or die "Error opening $opt_i: $!\n";
182 while (<INDEL>) {
183 ($chr,$pos,$indel,$bases,$strand,$homhet,undef,$depth,$t) = split /\t/;
184 unless ($t) { # if file not completed.
185 print STDERR '\'';
186 next;
188 next unless exists $Genome{$chr};
189 if ($homhet eq 'hete') {
190 unless ($opt_e) {$bases = lc $bases;}
191 else {next;}
193 if ($indel =~ /^([ID])(\d+)$/) {
194 if ($1 eq 'I') {
195 $Genome{$chr}->[$pos] .= $bases; # if length($Genome{$chr}->[$t]) == 1
196 $Depth{$chr}{$pos} .= chr($depth) x $2;
197 } else {
198 my $i=$2-1;
199 #++$pos; # deletion including it.
200 if ($opt_v) {
201 print "[D]$indel:${1}_${2}@","$pos $strand\t";
202 #print $Genome{$chr}->[$_] for ($pos..$pos+$i);
203 print(($t=$Genome{$chr}->[$_])?$t:'.') for ($pos-5..$pos-1);
204 print '<';
205 print(($t=$Genome{$chr}->[$_])?$t:'.',' ') for ($pos..$pos+$i);
206 print '>';
207 print(($t=$Genome{$chr}->[$_])?$t:'.') for ($pos+$i+1..$pos+$i+5);
208 print ' - ',$bases,"\n";
210 for $t ($pos..$pos+$i) {
211 $Genome{$chr}->[$t]='';
212 delete $Depth{$chr}{$pos};# if exists $Depth{$chr}{$pos};
215 } else {print STDERR '|';next;}
217 close INDEL;
218 warn "[!]INDEL done.\n";
220 my ($i,$dep,$scaf,$name);
221 for $chr (keys %Genome) {
222 $Genome{$chr}->[0]=''; # no longer undef
223 $t=0;
224 $i=-1;
225 unless (exists $Merged{$chr}) { # exists should faster than defined ?
226 $name=$opt_o.'.'.$chr;
227 open OUT,'>',$name.'.fa' or die "Error opening ${name}.fa: $!\n";
228 print OUT ">$chr\n";
229 open DEP,'>',$name.'.dep' or die "Error opening ${name}.dep: $!\n";
230 print DEP ">$chr\n";
231 #$t=0;
232 #$i=-1;
233 for (@{$Genome{$chr}}) {
234 ++$i;
235 next if $_ eq '';
236 ++$t;
237 print OUT $_;
238 $dep=$Depth{$chr}{$i} or $dep="\0";
239 $dep = join(' ',map ord,split //,$dep);
240 #print DEP '[',$dep,']';
241 if ($t > 79) {
242 $t=0;
243 print OUT "\n";
244 print DEP "$dep\n";
245 } else {print DEP $dep,' ';}
247 print OUT "\n";
248 print DEP "\n";
249 close OUT;
250 close DEP;
251 } else {
252 $t=$opt_o.'.'.$chr;
253 system('mkdir','-p',$t);
254 my ($end,$scafname,$toPrint)=(-1,'',0); # -1 to fix 'print() on unopened filehandle OUT at ./conasm.pl line 178 & 179 of elsif ($i == $end).'
255 $scaf=$Merged{$chr};
256 for (@{$Genome{$chr}}) {
257 ++$i;
258 if (exists $$scaf{$i}) { # Start point
259 ($scafname,$end)=@{$$scaf{$i}};
260 $name=$opt_o.'.'.$chr.'/'.$scafname;
261 open OUT,'>',$name.'.fa' or die "Error opening ${name}.fa: $!\n";
262 print OUT ">$scafname\n";
263 open DEP,'>',$name.'.dep' or die "Error opening ${name}.dep: $!\n";
264 print DEP ">$scafname\n";
265 $toPrint=1; # when set to 0, no output for manmade junctions.
267 if ($i != $end) {
268 next if ($_ eq '' or $toPrint==0);
269 ++$t;
270 print OUT $_;
271 $dep=$Depth{$chr}{$i} or $dep="\0";
272 $dep = join(' ',map ord,split //,$dep);
273 #print DEP '[',$dep,']';
274 if ($t > 79) {
275 $t=0;
276 print OUT "\n";
277 print DEP "$dep\n";
278 } else {print DEP $dep,' ';}
279 } else { # End point
280 if ($_ ne '') {
281 print OUT $_,"\n";
282 $dep=$Depth{$chr}{$i} or $dep="\0";
283 $dep = join(' ',map ord,split //,$dep);
284 print DEP "$dep\n";
285 } else {
286 print OUT "\n";
287 print DEP "\n";
289 close OUT;
290 close DEP;
291 $t=$toPrint=0;
292 next;
297 warn "[!]Output done.\n";
299 __END__
300 vf=11.9g for 1.3G Feb 20 07:23 consensus/QRS9.Gm03.txt, 47,781,076
302 find ./consensus/ -name '*.txt'|perl -lane '$m="";$vf="14g";$a=(split /\//)[-1];$b=(split /\./,$a)[0];open O,">./sh/$a.sh";if ($a=~/SGm/) {$m="-m soybean.merge.list ";$vf="4.7g"};print O "#\$ -N \"C_$a\"\n#\$ -cwd -r y -l vf=$vf,p=1\n#\$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH\n#\$ -o /dev/null -e /dev/null\n./conasm.pl ${m}-i ./Indel/${b}.indel.txt.filter -c $_ -bno ./out/$b 2>./log/$a.log";close O'
304 $ cat sh/QRS21.Gm13.txt.sh
305 #$ -N "C_QRS21.Gm13.txt"
306 #$ -cwd -r y -l vf=14g,s_core=1
307 #$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH
308 #$ -o /dev/null -e /dev/null
309 ./conasm.pl -i ./Indel/QRS21.indel.txt.filter -c ./consensus/QRS21.Gm13.txt -bo ./out/QRS21 2>./log/QRS21.Gm13.txt.log
311 ./conasmS.pl -c P142cns/P142.chromosome01.cns -p P142snp/P142.chromosome01.snp -i P142indel.list.filter -ne -o oP142j/m
313 cat chrorderNip | perl -lane '$p="142";$a=$_;open O,">./sh/do$p$a.sh";print O "#\$ -N \"C_$a\"\n#\$ -cwd -r y -l vf=12g,p=1\n#\$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH\n#\$ -o /dev/null -e /dev/null\n./conasmS.pl -c P${p}cns/P${p}.$a.cns -p P${p}snp/P${p}.$a.snp -i P${p}indel.list.filter -nbe -o oP${p}j/m 2>./log/${p}_$a.log";close O'
315 cat chrorder9311 | perl -lane '$p="143";$a=$_;open O,">./sh/do$p$a.sh";print O "#\$ -N \"C_$a\"\n#\$ -cwd -r y -l vf=12g,p=1\n#\$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH\n#\$ -o /dev/null -e /dev/null\n./conasmS.pl -c P${p}cns/P${p}.$a.cns -p P${p}snp/P${p}.$a.snp -i P${p}indel.list.filter -nbe -o oP${p}i/m 2>./log/${p}_$a.log";close O'
317 ./conasm.pl -m soybean.merge.list -i ./Indel/QRS29.indel.txt.filter -c ./consensus/QRS29.SGm1.txt -bno ./out/QRS29 2>./log/QRS29.SGm1.txt.log
319 cat ../9311/chrorder|while read a; do n="9308"; echo -e "#$ -N c$a\n#$ -cwd -r y -l vf=10.5g\n#$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH\n#$ -o ./fa20100926/${n}_${a}.log -e ./fa20100926/${n}_${a}.err\nperl ./conasmS.pl -bi ./indel_f/$n.indel-result.filter -c ./3GLF/$a/${n}_${a}.cns -o ./fa20100926/$n\n" > ./sh/con$n$a.sh; done