modified: myjupyterlab.sh
[GalaxyCodeBases.git] / perl / popuSNP / snpindelmix.pl
blob6ed16ffd790b860efe50a067fcde55605b38eb77
1 #!/bin/env perl
2 use lib '/nas/RD_09C/resequencing/soft/lib';
3 use lib 'E:/BGI/toGit/perlib/etc';
4 use strict;
5 use warnings;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Galaxy::ShowHelp;
9 $main::VERSION=0.0.3;
11 our $opts='i:o:s:f:d:bv';
12 our ($opt_i, $opt_o, $opt_s, $opt_v, $opt_b, $opt_f, $opt_d);
14 our $help=<<EOH;
15 \t-i PSNP list (./psnp.lst) for chrid.individual.finalSNPs
16 \t-d Indel list (./indel.lst), in format [^SampleID\\tindel-result.filter\\n\$]
17 \t-f fabyChr path (./faByChr) for [chrid].fa
18 \t-s GLF list (./glf.lst), will use \$1 of (([^/]+)/[^/]+\$) for sample names
19 \t-o Output Prefix (./indGenomes/ig_)
20 \t-v show verbose info to STDOUT
21 \t-b No pause for batch runs
22 EOH
24 ShowHelp();
26 $opt_o='./indGenomes/ig_' if ! defined $opt_o;
27 $opt_i='./psnp.lst' if ! $opt_i;
28 $opt_s='./glf.lst' if ! $opt_s;
29 $opt_f='./faByChr' if ! $opt_f;
30 $opt_d='./indel.lst' if ! $opt_d;
32 $opt_i =~ s/\/$//;
33 $opt_f =~ s/\/$//;
34 print STDERR "From [$opt_i] to [$opt_o], with [$opt_s][$opt_d][$opt_f]/\n";
35 if (! $opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
37 my $start_time = [gettimeofday];
39 system('mkdir','-p',$opt_o);
40 system('rmdir',$opt_o) if $opt_o =~ m#/[\s.]*[^/\s.]+[^/]*$#;
42 my (@Samples,%SampleToIndelf);
43 open L,'<',$opt_s or die "[x]Error opening $opt_s: $!\n";
44 print STDERR "[!]Sample Order: ";
45 while (<L>) {
46 m#([^/]+)/[^/]+$#;
47 push @Samples,$1;
48 print STDERR (scalar @Samples),':[',$1,"] ";
50 print STDERR "\n";
52 open L,'<',$opt_d or die "[x]Error opening $opt_d: $!\n";
53 print STDERR "[!]Indel files:\n";
54 while (<L>) {
55 chomp;
56 my ($sample,$file)=split /\t/;
57 $SampleToIndelf{$sample}=$file;
58 print STDERR "\t$sample -> $file\n";
60 print STDERR "\n";
62 sub GetNextInDel($$) {
63 my ($Aref,$theChr)=@_;
64 my ($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq)=@$Aref;
65 my $filePos = tell $FH; # maybe useful in future ?
66 $curChr=undef; # flag
67 $curSt=$curEd=-1;
68 while(<$FH>) {
69 my ($chr,$pos,$indel,$bases,$strand,$homhet,undef,$depth,$t) = split /\t/;
70 unless ($t) { # if file not completed.
71 print STDERR '\'';
72 next;
74 next if $chr ne $theChr;
75 $bases = lc $bases if $homhet eq 'hete';
76 if ($indel =~ /^([ID])(\d+)$/) {
77 $curSt=$pos;
78 $curIDseq=$bases;
79 if ($1 eq 'I') {
80 $curInDe = $2;
81 $curEd=$pos;
82 } else {
83 $curInDe = -$2;
84 $curEd = $pos-1 + $2;
86 } else {print STDERR '|';next;}
87 $curChr=$chr;
88 @$Aref=($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq);
89 if (defined $curChr) { return $filePos; }
90 else { return -1; }
92 @$Aref=($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq); # also set $curSt=-1
93 return -2; # Empty file
96 sub QueryInDel($$$$) {
97 my ($Aref,$theChr,$pos,$theBase)=@_;
98 my ($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq)=@$Aref;
99 return $theBase if $pos < $curSt; # Since we Recursion below, these two if is a must.
100 if ($pos >= $curSt and $pos <= $curEd) {
101 #return '<D>' if $curInDe<0;
102 return '' if $curInDe<0;
103 # Now, must be Insertion:
104 #print STDERR 'X' if $curSt ne $curEd; # just to recheck, safe to remove this line.
105 #return $theBase.'<I>'.$curIDseq.'</I>';
106 return $theBase.$curIDseq;
107 } else {
108 my $r=&GetNextInDel($Aref,$theChr);
109 if ($r < 0) {
110 return $theBase;
111 } else { return &QueryInDel($Aref,$theChr,$pos,$theBase); } # should be safe to Recursion on normal indel results.
113 print STDERR 'x'; # should never be here.
114 return '.';
117 print STDERR "[!]Parsing SNP ";
118 open P,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
119 while (my $file=<P>) {
120 chomp $file;
121 open SNP,'<',$file or (warn "\n[!]Error opening $file: $!\n" and next);
122 print STDERR ".\b";
123 my (%SNP,$chr,$pos,$ref,$tail,$i);
124 while (<SNP>) {
125 chomp;
126 ($chr,$pos,$ref,$tail)=split /\t/;
127 my @indSNP;
128 for (split / /,$tail) {
129 next unless /[ACGTRYMKSWHBVDNX-]/;
130 s/-/n/;
131 push @indSNP,$_;
133 $SNP{$pos}=[$ref,\@indSNP];
135 print STDERR "+\b";
137 my (@FH,@IndelH);
138 for my $sample (@Samples) { # !!! Cannot use $_ , as $_ will be the last read from $IndelH[0] from GetNextInDel() after the 1st psnp is finished.
139 $file=$opt_o.$sample.'.'.$chr.'.fa';
140 my ($fh,$fhIDl);
141 open $fh,'>',$file or die "[x]Error opening $file: $!\n";
142 print $fh ">${sample}---$chr\n";
143 push @FH,$fh;
144 open $fhIDl,'<',$SampleToIndelf{$sample} or die "[x]Error opening $SampleToIndelf{$sample}: $!\n";
145 push @IndelH,[$fhIDl];
146 &GetNextInDel($IndelH[-1],$chr) >=0 || warn "[!]No InDel for ${chr} @ ${sample}\n";
148 warn '[!]PSNP:[',1+$#{${$SNP{$pos}}[1]},'] != File:[',(scalar @FH),"].\n" if $#FH != $#{${$SNP{$pos}}[1]};
150 $file=$opt_f.'/'.$chr.'.fa';
151 open FA,'<',$file or (warn "\n[!]Error opening $file: $!\n" and next);
152 $ref=<FA>;
153 $ref =~ />(\S+)/;
154 warn "[!]Different ChrID, [>$1] in [$file] !\n" if $1 ne $chr;
155 $i=1;
156 while ($ref=getc(FA)) {
157 unless ($ref =~ /[ACGTRYMKSWHBVDNX]/i) {
158 last if $ref eq '>';
159 next;
161 unless ($i%80) {
162 print $_ "\n" for @FH;
164 unless ($SNP{$i}) { # Normal or InDel, which cannot start at '-' as not covered or SNP as filtering
165 my $t=0;
166 for my $fh (@FH) {
167 my $str;
168 #my ($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq)=@$Aref;
169 if ($i < $IndelH[$t]->[2] or $IndelH[$t]->[2] == -1) { # add -1 so that no more calling after the last indel item
170 $str = $ref;
171 } elsif ($i <= $IndelH[$t]->[3]) {
172 if ($IndelH[$t]->[4]<0) { $str = ''; }
173 else { $str = $ref . $IndelH[$t]->[5]; }
174 } else { $str = QueryInDel($IndelH[$t],$chr,$i,$ref); } # Well, the above is for speed
175 print $fh $str;
177 ++$t;
179 #print $_ $ref for @FH;
180 } else { # SNP
181 my ($refbase,$indSNPr)=@{$SNP{$i}};
182 warn "[!]RefBase differ, SNP:[$refbase] ne FASTA:[$ref].\n" if $refbase ne uc($ref);
183 my $t=0;
184 for (@$indSNPr) {
185 $tail=$FH[$t];
186 print $tail $_;
187 ++$t;
190 ++$i;
192 print STDERR '-';
193 close $$_[0] for @IndelH;
194 close $_ for @FH;
195 @IndelH=@FH=();
198 my $stop_time = [gettimeofday];
200 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";