6 my ($List,$Len_th,$GC_th,$Help);
7 GetOptions
(\
%opts,"list:s"=>\
$List,"len_theshold:s"=>\
$Len_th,"gc_theshold:s"=>\
$GC_th,"help"=>\
$Help);
8 if ((defined $Help)||(@ARGV==0))
10 die "perl calengc.pl <IN:fasta|fastq> [-list calculate-contig-list] [len_theshold: length-theshold] [gc_theshold: GC-content-theshold] [-help]
11 Author: BENM <binxiaofeng\@gmail.com> Version: 1.2 alpha 2009-10-08\n
12 Example: perl calgc.pl contigs.fasta -list contigs.list -len_theshold 100 -gc_theshold 0.2-0.6\n
14 calculate contig list example:
24 which must the same as the FASTA/FASTQ contigs/reads name\n
25 len_theshold can be set as \"num\", for example: 500, or \"min-max\": 500-1000
26 gc_theshold can be set as \"num\", for example: 0.3, or \"min-max\": 0.1-0.5
29 my ($Len_min,$Len_max)=split/\-/,$Len_th if (defined $Len_th);
30 my ($GC_min,$GC_max)=split/-/,$GC_th if (defined $GC_th);
33 my ($chr,$gc,$len,$tgc,$tlen,$n)=("",0,0,0,0,0);
39 my %hash=readlist
($List) if (defined $List);
40 open (IN
,$ARGV[0]) || die $!;
45 if ($_=~/^[\>\@](.*)/)
47 if (($chr ne "")&&($chr ne $_)
48 &&($len>0)&&($len>=$Len_min)&&($gc>=$GC_min)
49 &&((!defined $Len_max)||($len<=$Len_max))&&((!defined $GC_max)||($gc<=$GC_max))
50 &&((!defined $List)||(exists $hash{$chr})))
54 my $gc_rate=int($gc*10000/$len+0.5)/100;
58 $len_add_square+=$len*$len;
59 $gc_add_square+=($gc*$gc)/($len*$len);
60 print "$chr\tLength: $len\tGC_Content: $gc_rate%\n";
69 if (($_=~/^\+$/)||($_=~/^\+$chr/))
73 my ($len1,$len2)=cal_GC
($_) unless ($_ =~ /\d+/);
79 if (($len>0)&&($len>=$Len_min)&&($gc>=$GC_min)
80 &&((!defined $Len_max)||($len<=$Len_max))&&((!defined $GC_max)||($gc<=$GC_max))
81 &&((!defined $List)||(exists $hash{$chr})))
83 my $last_gc=int($gc*10000/$len+0.5)/100;
87 $len_add_square+=$len*$len;
89 $gc_add_square+=($gc*$gc)/($len*$len);
90 print "$chr\tLength: $len\tGC_Content: $last_gc%\n";
94 my $GC=int($tgc*10000/$tlen+0.5)/100;
95 print "\n$n sequences Total Length: $tlen\tGC Content: $GC%\n";
100 my $len_mean=$tlen/$n;
104 $n50=$i if (($count>=$tlen*0.5)&&(!defined $n50));
105 $n80=$i if (($count>=$tlen*0.8)&&(!defined $n80));
106 $n90=$i if (($count>=$tlen*0.9)&&(!defined $n90));
108 my $len_sd=sqrt($len_add_square-$n*$len_mean*$len_mean)/$n;
109 my $j=($n<=1)?
int($n/2):int($n/2+0.5);
110 print "\n--- Length Statistics ---\n\n";
111 print " total number:\t$n\n";
112 print " total value:\t$tlen\n";
113 print " mean value:\t$len_mean\n";
114 print " median value:\t$x[$j]\n";
115 print " SD value:\t$len_sd\n";
116 print " max value:\t$x[0]\n";
117 print " min value:\t$x[-1]\n";
118 print " N50 value:\t$n50\n";
119 print " N80 value:\t$n80\n";
120 print " N90 value:\t$n90\n";
125 my $gc_mean=$gc_total/$n;
129 $g50=$k if (($count>=$gc_total/100*0.5)&&(!defined $g50));
130 $g80=$k if (($count>=$gc_total/100*0.8)&&(!defined $g80));
131 $g90=$k if (($count>=$gc_total/100*0.9)&&(!defined $g90));
133 my $gc_sd=sqrt($gc_add_square-$n*$gc_mean*$gc_mean)/$n;
134 print "\n--- GC content Statistics ---\n\n";
135 print " total number:\t$n\n";
136 print " total value:\t$gc_total\n";
137 print " mean value:\t$gc_mean\n";
138 print " median value:\t$y[$j]\n";
139 print " SD value:\t$gc_sd\n";
140 print " max value:\t$y[0]\n";
141 print " min value:\t$y[-1]\n";
142 print " N50 value:\t$g50\n";
143 print " N80 value:\t$g80\n";
144 print " N90 value:\t$g90\n";
150 my $length=length($seq);
152 my $length_GC=length($seq);
153 return ($length_GC,$length);
159 open (IN
,$file) || die $!;
164 my $key = ($_ =~ /[\>\@](.*)/) ?
$1 : $_;