modified: src1/input.c
[GalaxyCodeBases.git] / tools / fastafastq / getchrinfo.pl
blob000e1c92416c68bc0fd07129e271aa3c3135014b
1 #!/bin/env perl
2 use lib '/nas/RD_09C/resequencing/soft/lib';
3 use lib 'E:/BGI/toGit/perlib/etc';
4 use strict;
5 use warnings;
6 use File::Basename;
7 use Time::HiRes qw ( gettimeofday tv_interval );
8 use Galaxy::ShowHelp;
10 $main::VERSION=0.0.1;
13 our $opts='i:fb';
14 our($opt_i, $opt_f, $opt_b);
16 our $help=<<EOH;
17 \t-i Input FASTA file
18 \t-f Also Split to [fabychr/]
19 \t-b No pause for batch runs
20 EOH
22 ShowHelp();
24 die "[x]Must specify FASTA file !\n" if ! $opt_i;
25 die "[x]-i $opt_i not exists !\n" unless -f $opt_i;
27 my $FASTAwidth=75;
29 my ($filename, $outpath) = fileparse($opt_i);
30 unless (-w $outpath) {
31 warn "[!]Cannot write to [$outpath]. Change to [./].\n";
32 $outpath='./';
35 my $cutfa;
36 unless ($opt_f) {
37 $cutfa=", Stat. Only.";
38 } else {
39 $opt_f = 'fabychr';
40 $cutfa=" to [$outpath$opt_f/]";
43 print STDERR "From [$opt_i]$cutfa\n";
44 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <STDIN>;}
46 my $start_time = [gettimeofday];
47 if ($opt_f) {
48 mkdir $outpath.$opt_f,0755; # its $! is so unpredictable ...
49 die "[x]Error creating [$outpath$opt_f]. $!\n" unless -w $outpath.$opt_f;
52 #BEGIN
53 my $infile;
54 if ($opt_i =~ /.bz2$/) {
55 open( $infile,"-|","bzip2 -dc $opt_i") or die "Error: $!\n";
56 } elsif ($opt_i =~ /.gz$/) {
57 open( $infile,"-|","gzip -dc $opt_i") or die "Error: $!\n";
58 } else {open( $infile,"<",$opt_i) or die "Error: $!\n";}
61 local $/=">";
62 $_=<$infile>;
63 die "[x]Not a FASTA file !\n" unless /^\s*>/;
66 open CHRO,'>',$outpath.'chrorder' or die "Error: $!\n";
67 open STAT,'>',$outpath.'chr.nfo' or die "Error: $!\n";
68 print STAT '#',join("\t",qw/ChrID Len EffLen GCratio N n Xx lc GC ChrDesc/),"\n";
69 my ($tlen,$tN,$tn,$tX,$tlc,$tGC)=(0,0,0,0,0,0);
70 while (<$infile>) {
71 chomp;
72 my $Head;
73 my ($id,$desc)=split / /,$_,2;
74 if ($desc && $desc !~ /^\s*$/) {
75 $desc=~s/\t/_/g;
76 $Head="$id $desc";
77 } else { $desc='.';$Head=$id; }
78 $/=">";
79 my $seq=<$infile>;
80 chomp $seq;
81 $seq =~ s/\s//g;
82 $/="\n";
83 print STDERR ">$id,\t[$desc]:\n";
84 print CHRO "$id\n";
85 my $len=length($seq);
86 if ($opt_f) {
87 open O,'>',"$outpath$opt_f/$id.fa" or die "Error: $!\n";
88 print O ">$Head\n";
89 for (my $i=0; $i<$len; $i+=$FASTAwidth) {
90 print O substr($seq,$i,$FASTAwidth)."\n";
92 close O;
94 my $N = $seq=~tr/N//; # stand for gap or low quality
95 my $n = $seq=~tr/n//;
96 my $lc = $seq=~tr/agct/AGCT/; # may stand for masked region
97 my $GC = $seq=~tr/GCgc//;
98 my $X = $seq=~tr/Xx//; # stand for masked region
99 my $Efflen=$len-$N-$n-$X;
100 my $GCratio=sprintf('%.3f',int(0.5+1000*$GC/$len)/1000);
101 print STAT join("\t",$id,$len,$Efflen,$GCratio,$N,$n,$X,$lc,$GC,$desc),"\n";
102 print STDERR " Len:$len, Effictive_Len:$Efflen, GC_Ratio:$GCratio\n";
103 $tlen += $len;
104 $tN += $N;
105 $tn += $n;
106 $tX += $X;
107 $tlc += $lc;
108 $tGC += $GC;
110 my $Efflen=$tlen-$tN-$tn-$tX;
111 my $GCratio=sprintf('%.3f',int(0.5+1000*$tGC/$tlen)/1000);
112 print STAT join("\t",'#Total',$tlen,$Efflen,$GCratio,$tN,$tn,$tX,$tlc,$tGC,'Summary'),"\n";
113 print STDERR "\nTotal:\nLen:$tlen, Effictive_Len:$Efflen, GC_Ratio:$GCratio\n";
115 close CHRO;
116 close STAT;
118 #END
119 my $stop_time = [gettimeofday];
121 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
122 __END__