2 use lib
'/nas/RD_09C/resequencing/soft/lib';
3 use lib
'E:/BGI/toGit/perlib/etc';
6 use Time
::HiRes qw
( gettimeofday tv_interval
);
11 our $opts='i:o:s:f:d:bv';
12 our ($opt_i, $opt_o, $opt_s, $opt_v, $opt_b, $opt_f, $opt_d);
15 \t-i PSNP list (./psnp.lst) for chrid.individual.finalSNPs
16 \t-d Indel list (./indel.lst), in format [^SampleID\\tindel-result.filter\\n\$]
17 \t-f fabyChr path (./faByChr) for [chrid].fa
18 \t-s GLF list (./glf.lst), will use \$1 of (([^/]+)/[^/]+\$) for sample names
19 \t-o Output Prefix (./indGenomes/ig_)
20 \t-v show verbose info to STDOUT
21 \t-b No pause for batch runs
26 $opt_o='./indGenomes/ig_' if ! defined $opt_o;
27 $opt_i='./psnp.lst' if ! $opt_i;
28 $opt_s='./glf.lst' if ! $opt_s;
29 $opt_f='./faByChr' if ! $opt_f;
30 $opt_d='./indel.lst' if ! $opt_d;
34 print STDERR
"From [$opt_i] to [$opt_o], with [$opt_s][$opt_d][$opt_f]/\n";
35 if (! $opt_b) {print STDERR
'press [Enter] to continue...'; <>;}
37 my $start_time = [gettimeofday
];
39 system('mkdir','-p',$opt_o);
40 system('rmdir',$opt_o) if $opt_o =~ m
#/[\s.]*[^/\s.]+[^/]*$#;
42 my (@Samples,%SampleToIndelf);
43 open L
,'<',$opt_s or die "[x]Error opening $opt_s: $!\n";
44 print STDERR
"[!]Sample Order: ";
48 print STDERR
(scalar @Samples),':[',$1,"] ";
52 open L
,'<',$opt_d or die "[x]Error opening $opt_d: $!\n";
53 print STDERR
"[!]Indel files:\n";
56 my ($sample,$file)=split /\t/;
57 $SampleToIndelf{$sample}=$file;
58 print STDERR
"\t$sample -> $file\n";
62 sub GetNextInDel
($$) {
63 my ($Aref,$theChr)=@_;
64 my ($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq)=@
$Aref;
65 my $filePos = tell $FH; # maybe useful in future ?
69 my ($chr,$pos,$indel,$bases,$strand,$homhet,undef,$depth,$t) = split /\t/;
70 unless ($t) { # if file not completed.
74 next if $chr ne $theChr;
75 $bases = lc $bases if $homhet eq 'hete';
76 if ($indel =~ /^([ID])(\d+)$/) {
86 } else {print STDERR
'|';next;}
88 @
$Aref=($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq);
89 if (defined $curChr) { return $filePos; }
92 @
$Aref=($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq); # also set $curSt=-1
93 return -2; # Empty file
96 sub QueryInDel
($$$$) {
97 my ($Aref,$theChr,$pos,$theBase)=@_;
98 my ($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq)=@
$Aref;
99 return $theBase if $pos < $curSt; # Since we Recursion below, these two if is a must.
100 if ($pos >= $curSt and $pos <= $curEd) {
101 #return '<D>' if $curInDe<0;
102 return '' if $curInDe<0;
103 # Now, must be Insertion:
104 #print STDERR 'X' if $curSt ne $curEd; # just to recheck, safe to remove this line.
105 #return $theBase.'<I>'.$curIDseq.'</I>';
106 return $theBase.$curIDseq;
108 my $r=&GetNextInDel
($Aref,$theChr);
111 } else { return &QueryInDel
($Aref,$theChr,$pos,$theBase); } # should be safe to Recursion on normal indel results.
113 print STDERR
'x'; # should never be here.
117 print STDERR
"[!]Parsing SNP ";
118 open P
,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
119 while (my $file=<P
>) {
121 open SNP
,'<',$file or (warn "\n[!]Error opening $file: $!\n" and next);
123 my (%SNP,$chr,$pos,$ref,$tail,$i);
126 ($chr,$pos,$ref,$tail)=split /\t/;
128 for (split / /,$tail) {
129 next unless /[ACGTRYMKSWHBVDNX-]/;
133 $SNP{$pos}=[$ref,\
@indSNP];
138 for my $sample (@Samples) { # !!! Cannot use $_ , as $_ will be the last read from $IndelH[0] from GetNextInDel() after the 1st psnp is finished.
139 $file=$opt_o.$sample.'.'.$chr.'.fa';
141 open $fh,'>',$file or die "[x]Error opening $file: $!\n";
142 print $fh ">${sample}---$chr\n";
144 open $fhIDl,'<',$SampleToIndelf{$sample} or die "[x]Error opening $SampleToIndelf{$sample}: $!\n";
145 push @IndelH,[$fhIDl];
146 &GetNextInDel
($IndelH[-1],$chr) >=0 || warn "[!]No InDel for ${chr} @ ${sample}\n";
148 warn '[!]PSNP:[',1+$#{${$SNP{$pos}}[1]},'] != File:[',(scalar @FH),"].\n" if $#FH != $#{${$SNP{$pos}}[1]};
150 $file=$opt_f.'/'.$chr.'.fa';
151 open FA
,'<',$file or (warn "\n[!]Error opening $file: $!\n" and next);
154 warn "[!]Different ChrID, [>$1] in [$file] !\n" if $1 ne $chr;
156 while ($ref=getc(FA
)) {
157 unless ($ref =~ /[ACGTRYMKSWHBVDNX]/i) {
162 print $_ "\n" for @FH;
164 unless ($SNP{$i}) { # Normal or InDel, which cannot start at '-' as not covered or SNP as filtering
168 #my ($FH,$curChr,$curSt,$curEd,$curInDe,$curIDseq)=@$Aref;
169 if ($i < $IndelH[$t]->[2] or $IndelH[$t]->[2] == -1) { # add -1 so that no more calling after the last indel item
171 } elsif ($i <= $IndelH[$t]->[3]) {
172 if ($IndelH[$t]->[4]<0) { $str = ''; }
173 else { $str = $ref . $IndelH[$t]->[5]; }
174 } else { $str = QueryInDel
($IndelH[$t],$chr,$i,$ref); } # Well, the above is for speed
179 #print $_ $ref for @FH;
181 my ($refbase,$indSNPr)=@
{$SNP{$i}};
182 warn "[!]RefBase differ, SNP:[$refbase] ne FASTA:[$ref].\n" if $refbase ne uc($ref);
193 close $$_[0] for @IndelH;
198 my $stop_time = [gettimeofday
];
200 print STDERR
"\nTime Elapsed:\t",tv_interval
( $start_time, $stop_time )," second(s).\n";