modified: _posts/2015-11-17-my-oaths.md
[GalaxyBlog.git] / _posts / 2012-03-26-cal-n50-n90.md
blob88bba5cd68065d915f7888fc4ab2068da6949893
1 ---
2 date: '2012-03-26 23:53:31'
3 layout: post
4 slug: cal-n50-n90
5 title: "计算序列的N50、N90等"
6 description: ""
7 category: BioInformatics
8 tags: [Biology, Galaxy, Programming, Analyse]
9 wordpress_id: '1146'
10 ---
11 {% include JB/setup %}
13 组装方面的老问题了,就是从大到小排序然后累加,第一次超过比例后输出当前值。<br>
14 关于N50,可以看<a href="http://en.wikipedia.org/wiki/N50_statistic">wiki</a>和<a href="http://www.sciencedirect.com/science/article/pii/S0888754310000492">文章 (Assembly algorithms for next-generation sequencing data) </a>。
16 <blockquote>
17 Assemblies are measured by the size and accuracy of their contigs and scaffolds. Assembly size is usually given by statistics including maximum length, average length, combined total length, and N50. <b>The contig N50 is the length of the smallest contig in the set that contains the fewest (largest) contigs whose combined length represents at least 50% of the assembly.</b> <i>The N50 statistics for different assemblies are not comparable unless each is calculated using the same combined length value.</i> Assembly accuracy is difficult to measure. Some inherent measure of accuracy is provided by the degrees of mate-constraint satisfaction and violation <a href="http://www.sciencedirect.com/science/article/pii/S0888754310000492#bib17">[17]</a>. Alignment to reference sequences is useful whenever trusted references exist.
18 </blockquote>
20 代码可以看<a href="http://seqanswers.com/forums/showthread.php?t=2766">SEQanswers论坛的帖子</a>,第二页的附件也是。<br>
21 关键的就是这堆:
23         @x=sort{$b<=>$a}@x;
24         my $count=0;
25         my ($n50,$n80,$n90);
26         my $len_mean=$tlen/$n;
27         foreach my $i(@x)
28         {
29                 $count+=$i;
30                 $n50=$i if (($count>=$tlen*0.5)&&(!defined $n50));
31                 $n80=$i if (($count>=$tlen*0.8)&&(!defined $n80));
32                 $n90=$i if (($count>=$tlen*0.9)&&(!defined $n90));
33         }
35 * * *
37         #/usr/bin/perl -w
38         use strict;
39         my ($len,$total)=(0,0);
40         my @x;
41         while(<>){
42                 if(/^[\>\@]/){
43                         if($len>0){
44                                 $total+=$len;
45                                 push @x,$len;
46                         }
47                         $len=0;
48                 }
49                 else{
50                         s/\s//g;
51                         $len+=length($_);
52                 }
53         }
54         if ($len>0){
55                 $total+=$len;
56                 push @x,$len;
57         }
58         @x=sort{$b<=>$a} @x; 
59         my ($count,$half)=(0,0);
60         for (my $j=0;$j<@x;$j++){
61                 $count+=$x[$j];
62                 if (($count>=$total/2)&&($half==0)){
63                         print "N50: $x[$j]\n";
64                         $half=$x[$j]
65                 }elsif ($count>=$total*0.9){
66                         print "N90: $x[$j]\n";
67                         exit;
68                 }
69         }
71 or run this command as before:<br>
72 Code:
74         perl -e 'my ($len,$total)=(0,0);my @x;while(<>){if(/^[\>\@]/){if($len>0){$total+=$len;push@x,$len;};$len=0;}else{s/\s//g;$len+=length($_);}}if ($len>0){$total+=$len;push @x,$len;}@x=sort{$b<=>$a}@x; my ($count,$half)=(0,0);for (my $j=0;$j<@x;$j++){$count+=$x[$j];if(($count>=$total/2)&&($half==0)){print "N50: $x[$j]\n";$half=$x[$j]}elsif($count>=$total*0.9){print "N90: $x[$j]\n";exit;}}' contigs.fa
76 论坛附件:<a href="/assets/wp-uploads/2012/calengc.pl">calengc.pl</a>