modified: myjupyterlab.sh
[GalaxyCodeBases.git] / tools / gc_dep_plot / gcdepdat.pl
blobfc900936a221a62681afa0d7a6d9920d22865565
1 #!/bin/env perl
2 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
3 use strict;
4 use warnings;
5 use Time::HiRes qw ( gettimeofday tv_interval );
6 use Galaxy::ShowHelp;
8 $main::VERSION=0.0.1;
9 our $opts='o:r:w:m:n:b';
10 our($opt_o, $opt_r, $opt_w, $opt_m, $opt_n, $opt_b);
12 #our $desc='';
13 our $help=<<EOH;
14 \t-w window sizes (100,200,500,1000)
15 \t-r ref fasta file (./ref/human.fa)
16 \t-o output prefix (./gcdepdat).{mcount,mratio}
17 \t-m min win non-N-base count (30)
18 \t-n max N ratio in one win (0.75)
19 \t-b No pause for batch runs
20 EOH
21 our $ARG_DESC='coverage_fa_files{,.gz,.bz2}';
23 ShowHelp();
24 $opt_w='100,200,500,1000' if ! $opt_w;
25 $opt_r='./ref/human.fa' if ! $opt_r;
26 $opt_m=30 if ! $opt_m;
27 $opt_n=0.75 if ! $opt_n;
28 $opt_o='./gcdepdat' if ! $opt_o;
29 die "[x]No input files found !\n" unless @ARGV;
30 die "[!]Max 252 files supported.\n" if @ARGV>252;
32 my @wins=grep {$_>=50} map {int $_} split /,/,$opt_w;
33 die "[x]Window Size must >= 50.\n" unless @wins;
35 print STDERR "From [@ARGV] with [$opt_r][$opt_m][$opt_n] to [$opt_o] of [",join(',',@wins),"]\n";
36 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <STDIN>;}
38 my $start_time = [gettimeofday];
39 #BEGIN
40 my @FH;
41 while($_=shift @ARGV) {
42 my $infile;
43 if (/.bz2$/) {
44 open( $infile,"-|","bzip2 -dc $_") or die "Error opening $_: $!\n";
45 } elsif (/.gz$/) {
46 open( $infile,"-|","gzip -dc $_") or die "Error opening $_: $!\n";
47 } else {open( $infile,"<",$_) or die "Error opening $_: $!\n";}
48 push @FH,$infile;
50 warn '[!]depth files opened: ',scalar @FH,"\n[!]Reading Reference Genome:\n";
52 my (%Genome,%EffChrLen);
53 open GENOME,'<',$opt_r or die "Error: $!\n";
54 while (<GENOME>) {
55 s/^>//;
56 /^(\S+)/ or next;
57 my $seqname = $1;
58 print STDERR " >$seqname ...";
59 $/=">";
60 my $genome=<GENOME>;
61 chomp $genome;
62 $genome=~s/\s//g;
63 $/="\n";
64 $Genome{$seqname}=$genome;
65 my $n=($genome=~s/[^ATCG]/A/ig);
66 $EffChrLen{$seqname}=length($Genome{$seqname})-$n;
67 print STDERR "\b\b\b \t",length $Genome{$seqname},".\n";
68 $genome='';
70 close GENOME;
72 my (%Result,%Stat,%CVG,%Depth,%DepCnt);
74 warn "[!]Reading Depth Files:\n";
75 my $firstFH=shift @FH;
76 while(<$firstFH>) {
77 s/^>//;
78 /^(\S+)/ or next;
79 my $SeqName = $1;
80 print STDERR " \@$SeqName ...";
81 $/=">";
82 my $genome=<$firstFH>;
83 $/="\n";
84 #print STDERR length $genome,"|\t";
85 my @DepDatChr=();
86 while($genome=~/(\d+)/g) {
87 use integer;
88 my $v=$1+0;
89 $v=0 if $v==65536;
90 push @DepDatChr,$v;
92 print STDERR "\b\b\b \t",scalar @DepDatChr,': .';
93 $genome='';
94 for my $fh (@FH) {
95 while(<$fh>) {
96 s/^>//;
97 /^(\S+)/ or next;
98 die "[x]Depth file in different order ($SeqName ne $1) !\n" if $SeqName ne $1;
99 $/=">";
100 $genome=<$fh>;
101 $/="\n";
102 my $i=0;
103 while($genome=~/(\d+)/gc) {
104 $DepDatChr[$i]+=$1 if ($1!=65536);
105 ++$i;
107 $genome='';
108 last;
110 print STDERR '.';
112 print STDERR 'r';
113 # for (@DepDatChr) {
114 # $_=65535 if $_>65535;
116 print STDERR 'R';
117 # one chr done
118 for my $win (@wins) {
119 my $chrlenOK=length($Genome{$SeqName})-$win;
120 my $start=0;
121 my @tmp=@DepDatChr;
122 while($start<$chrlenOK) { # the last win is canceled ...
123 print STDERR '-';
124 my $seq=substr $Genome{$SeqName},$start,$win;
125 my $gc=($seq=~tr/GC//);
126 my $n=($seq=~tr/RYMKSWHBVDNX//);
127 my $size=$win-$n;
128 next if $size<$opt_m or ($n/$win)>$opt_n;
129 $gc=int($gc/$size);
130 my $sum=0;
131 #$sum+=$DepDatChr[$_] for ($start..$start+$win);
132 $sum+=shift @tmp for (1..$win);
133 my $value=$sum/$size;
134 #push @{$Result{$win}{$gc}},$value;
135 $Result{$win}{$gc}->[0]=$value; # TEMPARY ONLY
136 $Stat{$win}[$gc][0] += $value;
137 ++$Stat{$win}[$gc][1];
138 $start += $win;
139 print STDERR ',';
141 print STDERR 'w';
143 print STDERR 'C';
144 for (@DepDatChr) {
145 if ($_) {
146 ++$CVG{$SeqName};
147 ++$CVG{'__ALL__'};
148 $Depth{$SeqName}+=$_;
149 $Depth{'__ALL__'}+=$_;
150 ++$DepCnt{$_}{$SeqName};
151 ++$DepCnt{$_}{'__ALL__'};
154 print STDERR "\n";
156 close $_ for (@FH,$firstFH);
157 @FH=();
158 my @Chr=sort keys %Genome;
160 open ODEP,'>',$opt_o.'_stat.tsv' or die "Error: $!\n";
161 print ODEP "#ChrID\tDepth\tCovered\tCVGratio\tEffChrLen\tNzone\tChrLen\n";
162 for my $chr ('__ALL__',@Chr) {
163 print ODEP join("\t",$chr,$Depth{$chr}/$EffChrLen{$chr},$CVG{$chr},$CVG{$chr}/$EffChrLen{$chr},$EffChrLen{$chr},length($Genome{$chr})-$EffChrLen{$chr},length($Genome{$chr})),"\n";
165 close ODEP;
167 open OHST,'>',$opt_o.'_hist.tsv' or die "Error: $!\n";
168 print OHST "#Depth\t__ALL__";
169 print OHST "\t$_" for @Chr;
170 print OHST "\n";
171 for my $dep (sort {$a<=>$b} keys %DepCnt) {
172 print OHST $dep;
173 for my $chr ('__ALL__',@Chr) {
174 my $v=0;
175 $v=$DepCnt{$dep}{$chr} if exists $DepCnt{$dep}{$chr};
176 print OHST "\t",$v;
178 print OHST "\n";
180 close OHST;
182 for my $win (@wins) {
183 open O,'>',$opt_o."_gcdep.$win.tsv" or die "Error: $!\n";
184 print O "#WinSize=$win\tWinCount=$Stat{$win}[1]
185 #GC%\tRefCnt\tMean\tSmall\tQ1\tMiddle\tQ3\tBig\tmin\tmax\trefcntcal\n";
186 for my $gc (sort {$a<=>$b} keys %{$Result{$win}}) {
187 print O $gc,'.5',"\t",$Stat{$win}[$gc][1],"\t",$Stat{$win}[$gc][0]/$Stat{$win}[$gc][1],"\t";
188 ### &calboxvalues();
189 print O "\n";
194 #END
195 my $stop_time = [gettimeofday];
197 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
198 __END__