2 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
5 use Time
::HiRes qw
( gettimeofday tv_interval
);
9 our $opts='o:r:w:m:n:b';
10 our($opt_o, $opt_r, $opt_w, $opt_m, $opt_n, $opt_b);
14 \t-w window sizes (100,200,500,1000)
15 \t-r ref fasta file (./ref/human.fa)
16 \t-o output prefix (./gcdepdat).{mcount,mratio}
17 \t-m min win non-N-base count (30)
18 \t-n max N ratio in one win (0.75)
19 \t-b No pause for batch runs
21 our $ARG_DESC='coverage_fa_files{,.gz,.bz2}';
24 $opt_w='100,200,500,1000' if ! $opt_w;
25 $opt_r='./ref/human.fa' if ! $opt_r;
26 $opt_m=30 if ! $opt_m;
27 $opt_n=0.75 if ! $opt_n;
28 $opt_o='./gcdepdat' if ! $opt_o;
29 die "[x]No input files found !\n" unless @ARGV;
30 die "[!]Max 252 files supported.\n" if @ARGV>252;
32 my @wins=grep {$_>=50} map {int $_} split /,/,$opt_w;
33 die "[x]Window Size must >= 50.\n" unless @wins;
35 print STDERR
"From [@ARGV] with [$opt_r][$opt_m][$opt_n] to [$opt_o] of [",join(',',@wins),"]\n";
36 unless ($opt_b) {print STDERR
'press [Enter] to continue...'; <STDIN
>;}
38 my $start_time = [gettimeofday
];
41 while($_=shift @ARGV) {
44 open( $infile,"-|","bzip2 -dc $_") or die "Error opening $_: $!\n";
46 open( $infile,"-|","gzip -dc $_") or die "Error opening $_: $!\n";
47 } else {open( $infile,"<",$_) or die "Error opening $_: $!\n";}
50 warn '[!]depth files opened: ',scalar @FH,"\n[!]Reading Reference Genome:\n";
52 my (%Genome,%EffChrLen);
53 open GENOME
,'<',$opt_r or die "Error: $!\n";
58 print STDERR
" >$seqname ...";
64 $Genome{$seqname}=$genome;
65 my $n=($genome=~s/[^ATCG]/A/ig);
66 $EffChrLen{$seqname}=length($Genome{$seqname})-$n;
67 print STDERR
"\b\b\b \t",length $Genome{$seqname},".\n";
72 my (%Result,%Stat,%CVG,%Depth,%DepCnt);
74 warn "[!]Reading Depth Files:\n";
75 my $firstFH=shift @FH;
80 print STDERR
" \@$SeqName ...";
82 my $genome=<$firstFH>;
84 #print STDERR length $genome,"|\t";
86 while($genome=~/(\d+)/g) {
92 print STDERR
"\b\b\b \t",scalar @DepDatChr,': .';
98 die "[x]Depth file in different order ($SeqName ne $1) !\n" if $SeqName ne $1;
103 while($genome=~/(\d+)/gc) {
104 $DepDatChr[$i]+=$1 if ($1!=65536);
114 # $_=65535 if $_>65535;
118 for my $win (@wins) {
119 my $chrlenOK=length($Genome{$SeqName})-$win;
122 while($start<$chrlenOK) { # the last win is canceled ...
124 my $seq=substr $Genome{$SeqName},$start,$win;
125 my $gc=($seq=~tr/GC//);
126 my $n=($seq=~tr/RYMKSWHBVDNX//);
128 next if $size<$opt_m or ($n/$win)>$opt_n;
131 #$sum+=$DepDatChr[$_] for ($start..$start+$win);
132 $sum+=shift @tmp for (1..$win);
133 my $value=$sum/$size;
134 #push @{$Result{$win}{$gc}},$value;
135 $Result{$win}{$gc}->[0]=$value; # TEMPARY ONLY
136 $Stat{$win}[$gc][0] += $value;
137 ++$Stat{$win}[$gc][1];
148 $Depth{$SeqName}+=$_;
149 $Depth{'__ALL__'}+=$_;
150 ++$DepCnt{$_}{$SeqName};
151 ++$DepCnt{$_}{'__ALL__'};
156 close $_ for (@FH,$firstFH);
158 my @Chr=sort keys %Genome;
160 open ODEP
,'>',$opt_o.'_stat.tsv' or die "Error: $!\n";
161 print ODEP
"#ChrID\tDepth\tCovered\tCVGratio\tEffChrLen\tNzone\tChrLen\n";
162 for my $chr ('__ALL__',@Chr) {
163 print ODEP
join("\t",$chr,$Depth{$chr}/$EffChrLen{$chr},$CVG{$chr},$CVG{$chr}/$EffChrLen{$chr},$EffChrLen{$chr},length($Genome{$chr})-$EffChrLen{$chr},length($Genome{$chr})),"\n";
167 open OHST
,'>',$opt_o.'_hist.tsv' or die "Error: $!\n";
168 print OHST
"#Depth\t__ALL__";
169 print OHST
"\t$_" for @Chr;
171 for my $dep (sort {$a<=>$b} keys %DepCnt) {
173 for my $chr ('__ALL__',@Chr) {
175 $v=$DepCnt{$dep}{$chr} if exists $DepCnt{$dep}{$chr};
182 for my $win (@wins) {
183 open O
,'>',$opt_o."_gcdep.$win.tsv" or die "Error: $!\n";
184 print O
"#WinSize=$win\tWinCount=$Stat{$win}[1]
185 #GC%\tRefCnt\tMean\tSmall\tQ1\tMiddle\tQ3\tBig\tmin\tmax\trefcntcal\n";
186 for my $gc (sort {$a<=>$b} keys %{$Result{$win}}) {
187 print O
$gc,'.5',"\t",$Stat{$win}[$gc][1],"\t",$Stat{$win}[$gc][0]/$Stat{$win}[$gc][1],"\t";
195 my $stop_time = [gettimeofday
];
197 print STDERR
"\nTime Elapsed:\t",tv_interval
( $start_time, $stop_time )," second(s).\n";