new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / simcontigstat.pl
blobc171bfa548b2f874a2349d2bf52b8eca109e4ea6
1 #!/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 1.0.0 @ 20120309
5 =cut
6 use strict;
7 use warnings;
8 use Time::HiRes qw ( gettimeofday tv_interval );
10 my $SAMTOOLSBIN="samtools";
11 #$SAMTOOLSBIN="/ifs1/ST_ASMB/USER/yuanjy/huxuesong/tmp/group/rev/test/samtools";
12 my $SAMFILTER="-f 3 -F 1536";
13 $SAMFILTER="-F 1536";
15 my $MINCONTIFLEN=100;
17 die "Usage: $0 <single_sam_bam_file> <soap.coverage depthsingle> <output_prefix> <min_overlap> <min_depth>\n" if @ARGV<2;
18 my $name=shift;
19 my $cvg=shift;
20 my $out=shift;
21 my $overlap=shift;
22 my $mindepth=shift;
23 my (%LENGTH,%Stat,%N5090);
24 if ($name =~ /\.bam$/) {
25 open IN,'-|',"$SAMTOOLSBIN view $SAMFILTER $name" or die "Error opening $name : $!\n";
26 } elsif ($name =~ /\.sam\.gz$/) {
27 open IN,'-|',"$SAMTOOLSBIN view $SAMFILTER -S $name" or die "Error opening $name : $!\n";
28 } elsif ($name =~ /\.sam$/) {
29 open IN,'-|',"$SAMTOOLSBIN view $SAMFILTER -S $name" or die "Error opening $name : $!\n";
30 } else {
31 die "[x]Unsupport file type.";
34 sub openfile($) {
35 my ($filename)=@_;
36 my $infile;
37 if ($filename=~/.xz$/) {
38 open( $infile,"-|","xz -dc $filename") or die "Error opening $filename: $!\n";
39 } elsif ($filename=~/.gz$/) {
40 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
41 } elsif ($filename=~/.bz2$/) {
42 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
43 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
44 return $infile;
46 my $fh=openfile($cvg);
47 my ($genome,%DepDatChr);
48 while(<$fh>) {
49 s/^>//;
50 /^(\S+)/ or next;
51 my $SeqName = $1;
52 $/=">";
53 $genome=<$fh>;
54 $/="\n";
55 my $i=0;
56 while($genome=~/(\d+)/gc) {
57 $DepDatChr{$SeqName}->[$i]+=$1 if ($1!=65536);
58 ++$i;
60 $genome='';
61 last;
63 close $fh;
65 sub don5090($$$) {
66 my ($chr,$Len,$href)=@_;
67 my $n50 = -1;
68 my $n90 = -1;
69 my $s = 0;
70 my $count=0;
71 my @Length = sort {$b <=> $a} keys %{$href};
72 for my $l (@Length){
73 $s += $l*$$href{$l};
74 $count += $$href{$l};
75 if($n50 == -1 and $s >= $Len * 0.5){ $n50 = $l; }
76 if($n90 == -1 and $s >= $Len * 0.9){ $n90 = $l; last; }
78 $N5090{$chr}=[$n50,$n90,$count,$Length[0],$Length[-1]];
79 return " [$count,$Length[0],$Length[-1]] $n50,$n90 ";
83 my ($lastB,$lastE)=(0,0);
84 while (<IN>) {
85 next if /^@\w\w\t\w\w:/;
86 chomp;
87 my @read1=split /\t/;
88 my $cigar=$read1[5];
89 next unless $cigar =~ /M/;
90 #next unless ($read1[1] & 3) == 3; # paired + mapped in a proper pair
91 next if $read1[1] >= 256; # not primary || QC failure || optical or PCR duplicate
92 #next unless $read1[5] =~ /^(\d+)M$/;
93 # http://davetang.org/muse/2011/01/28/perl-and-sam/
94 my $position = '0';
95 while ($cigar !~ /^$/){
96 if ($cigar =~ /^([0-9]+[MIDSH])/){
97 my $cigar_part = $1;
98 if ($cigar_part =~ /(\d+)M/){
99 $position += $1;
100 } elsif ($cigar_part =~ /(\d+)[IH]/){
101 #$position += $1;
102 } elsif ($cigar_part =~ /(\d+)D/){
103 $position += $1;
104 } elsif ($cigar_part =~ /(\d+)S/){
105 die "[!]Not ready for this!\n";
106 #my $insertion = 'x' x $1;
107 #substr($new_ref,$position,0,$insertion);
108 #$position += $1;
110 $cigar =~ s/$cigar_part//;
111 } else {
112 die "Unexpected cigar: $cigar\n";
115 my $end=$read1[3]+$position;
117 my ($GapFlag,@Gaps)=(0);
118 for my $p ($read1[3] .. $end) {
119 if ($DepDatChr{$read1[2]}->[$p] < $mindepth) {
120 push @Gaps,[$p,-1] unless $GapFlag;
121 $GapFlag=1;
123 if ($GapFlag and $DepDatChr{$read1[2]}->[$p] >= $mindepth) {
124 $Gaps[-1]->[1]=$p;
125 $GapFlag=0;
128 if ($GapFlag) {
129 $Gaps[-1]->[1]=$end+1;
131 if ($lastB==0) { # just the 1st line.
132 ($lastB,$lastE)=($read1[3],$end);
133 if (@Gaps>0) {
134 $lastB=$Gaps[0]->[1];
136 next;
138 if (($lastE-$read1[3] +1 > $overlap) and ($end > $lastE)) {
142 close IN;