new file: bilibili_4106a8cfc8d21f0fe15fba3ba825c31d.html
[GalaxyBlog.git] / assets / wp-uploads / 2012 / calengc.pl
blobddb804960c8b844dd171378964b0598aee62c920
1 #!/usr/bin/perl -w
2 use strict;
3 use warnings;
4 use Getopt::Long;
5 my %opts;
6 my ($List,$Len_th,$GC_th,$Help);
7 GetOptions(\%opts,"list:s"=>\$List,"len_theshold:s"=>\$Len_th,"gc_theshold:s"=>\$GC_th,"help"=>\$Help);
8 if ((defined $Help)||(@ARGV==0))
10 die "perl calengc.pl <IN:fasta|fastq> [-list calculate-contig-list] [len_theshold: length-theshold] [gc_theshold: GC-content-theshold] [-help]
11 Author: BENM <binxiaofeng\@gmail.com> Version: 1.2 alpha 2009-10-08\n
12 Example: perl calgc.pl contigs.fasta -list contigs.list -len_theshold 100 -gc_theshold 0.2-0.6\n
13 Note:
14 calculate contig list example:
16 >Contigs1
17 >Contigs2
21 \@Contigs1
22 \@Contigs2
24 which must the same as the FASTA/FASTQ contigs/reads name\n
25 len_theshold can be set as \"num\", for example: 500, or \"min-max\": 500-1000
26 gc_theshold can be set as \"num\", for example: 0.3, or \"min-max\": 0.1-0.5
29 my ($Len_min,$Len_max)=split/\-/,$Len_th if (defined $Len_th);
30 my ($GC_min,$GC_max)=split/-/,$GC_th if (defined $GC_th);
31 $Len_min ||= 0;
32 $GC_min ||= -1;
33 my ($chr,$gc,$len,$tgc,$tlen,$n)=("",0,0,0,0,0);
34 my @x=();
35 my @y=();
36 my $len_add_square=0;
37 my $gc_add_square=0;
38 my $gc_total=0;
39 my %hash=readlist($List) if (defined $List);
40 open (IN,$ARGV[0]) || die $!;
41 while(<IN>)
43 s/\s*$//;
44 next if ($_ eq "");
45 if ($_=~/^[\>\@](.*)/)
47 if (($chr ne "")&&($chr ne $_)
48 &&($len>0)&&($len>=$Len_min)&&($gc>=$GC_min)
49 &&((!defined $Len_max)||($len<=$Len_max))&&((!defined $GC_max)||($gc<=$GC_max))
50 &&((!defined $List)||(exists $hash{$chr})))
52 $tgc+=$gc;
53 $tlen+=$len;
54 my $gc_rate=int($gc*10000/$len+0.5)/100;
55 push @x,$len;
56 push @y,$gc/$len;
57 $gc_total+=$gc/$len;
58 $len_add_square+=$len*$len;
59 $gc_add_square+=($gc*$gc)/($len*$len);
60 print "$chr\tLength: $len\tGC_Content: $gc_rate%\n";
61 $n++;
63 $gc=0;
64 $len=0;
65 $chr=$1;
67 else
69 if (($_=~/^\+$/)||($_=~/^\+$chr/))
71 <IN>;next;
73 my ($len1,$len2)=cal_GC($_) unless ($_ =~ /\d+/);
74 $gc+=$len1;
75 $len+=$len2;
78 close IN;
79 if (($len>0)&&($len>=$Len_min)&&($gc>=$GC_min)
80 &&((!defined $Len_max)||($len<=$Len_max))&&((!defined $GC_max)||($gc<=$GC_max))
81 &&((!defined $List)||(exists $hash{$chr})))
83 my $last_gc=int($gc*10000/$len+0.5)/100;
84 $tgc+=$gc;
85 $tlen+=$len;
86 push @x,$len;
87 $len_add_square+=$len*$len;
88 push @y,$gc/$len;
89 $gc_add_square+=($gc*$gc)/($len*$len);
90 print "$chr\tLength: $len\tGC_Content: $last_gc%\n";
91 $n++;
94 my $GC=int($tgc*10000/$tlen+0.5)/100;
95 print "\n$n sequences Total Length: $tlen\tGC Content: $GC%\n";
97 @x=sort{$b<=>$a}@x;
98 my $count=0;
99 my ($n50,$n80,$n90);
100 my $len_mean=$tlen/$n;
101 foreach my $i(@x)
103 $count+=$i;
104 $n50=$i if (($count>=$tlen*0.5)&&(!defined $n50));
105 $n80=$i if (($count>=$tlen*0.8)&&(!defined $n80));
106 $n90=$i if (($count>=$tlen*0.9)&&(!defined $n90));
108 my $len_sd=sqrt($len_add_square-$n*$len_mean*$len_mean)/$n;
109 my $j=($n<=1)?int($n/2):int($n/2+0.5);
110 print "\n--- Length Statistics ---\n\n";
111 print " total number:\t$n\n";
112 print " total value:\t$tlen\n";
113 print " mean value:\t$len_mean\n";
114 print " median value:\t$x[$j]\n";
115 print " SD value:\t$len_sd\n";
116 print " max value:\t$x[0]\n";
117 print " min value:\t$x[-1]\n";
118 print " N50 value:\t$n50\n";
119 print " N80 value:\t$n80\n";
120 print " N90 value:\t$n90\n";
122 @y=sort{$b<=>$a}@y;
123 $count=0;
124 my ($g50,$g80,$g90);
125 my $gc_mean=$gc_total/$n;
126 foreach my $k(@y)
128 $count+=$k;
129 $g50=$k if (($count>=$gc_total/100*0.5)&&(!defined $g50));
130 $g80=$k if (($count>=$gc_total/100*0.8)&&(!defined $g80));
131 $g90=$k if (($count>=$gc_total/100*0.9)&&(!defined $g90));
133 my $gc_sd=sqrt($gc_add_square-$n*$gc_mean*$gc_mean)/$n;
134 print "\n--- GC content Statistics ---\n\n";
135 print " total number:\t$n\n";
136 print " total value:\t$gc_total\n";
137 print " mean value:\t$gc_mean\n";
138 print " median value:\t$y[$j]\n";
139 print " SD value:\t$gc_sd\n";
140 print " max value:\t$y[0]\n";
141 print " min value:\t$y[-1]\n";
142 print " N50 value:\t$g50\n";
143 print " N80 value:\t$g80\n";
144 print " N90 value:\t$g90\n";
147 sub cal_GC {
148 my $seq=shift;
149 $seq=~ s/[NX-]//gi;
150 my $length=length($seq);
151 $seq=~ s/[AT]//gi;
152 my $length_GC=length($seq);
153 return ($length_GC,$length);
156 sub readlist {
157 my $file =shift;
158 my %hash;
159 open (IN,$file) || die $!;
160 while (<IN>)
162 s/\s+$//;
163 s/^\s+//;
164 my $key = ($_ =~ /[\>\@](.*)/) ? $1 : $_;
165 $hash{$key}=1;
167 close IN;
168 return %hash;