new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / radseq / ecutpstat.pl
blobc231463ad11397bb8d1da7b8db352b38c9cea533
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120330
5 =cut
6 use strict;
7 use warnings;
8 #use Data::Dump qw(ddx);
10 die "Usage: $0 <reference> <out>\n" if @ARGV<2;
11 my $fa=shift;
12 my $out=shift;
14 my $Eseq="CTGCAG";
15 my $EcutAt=5;
17 open O,'>',$out or die "Error opening $out : $!\n";
18 print O "# Ref: [$fa]\n#Enzyme: [$Eseq], Cut after $EcutAt\n\n#ChrID\tCut\n";
19 my ($CountAll,%Count,%Len);
21 open I,'<',$fa or die "Error opening $fa : $!\n";
22 while (<I>) {
23 s/^>//;
24 /^(\S+)/ or next;
25 my $seqname = $1;
26 #print STDERR " >$seqname ...";
27 $/=">";
28 my $genome=<I>;
29 chomp $genome;
30 $genome=~s/\s//g;
31 $/="\n";
32 #print STDERR "\b\b\b \t",length $genome,".\n";
34 $Count{$seqname} = 0;
35 $Len{$seqname} = length $genome;
36 while ( $genome =~ m/$Eseq/g ) {
37 # push @ret, [ $-[0], $+[0] ];
38 print O join("\t",$seqname,$-[0]+$EcutAt),"\n"; # starts from 0, so OK to use directly with $EcutAt for "cut after".
39 ++$CountAll;
40 ++$Count{$seqname};
43 $genome='';
45 close I;
47 my @notCut;
48 print O "\n#Cut stat:\n#ChrID\tLen\tCut\tAvg_distance\n";
49 for (sort keys %Count) {
50 print O '# ',join("\t",$_,$Len{$_},$Count{$_},$Count{$_}?int(0.5 + $Len{$_}/(1+$Count{$_})):'.'),"\n";
51 push @notCut,$_ unless $Count{$_};
53 print O "#Total: $CountAll\n#notCut: ",scalar @notCut,"\n";
54 print O "#notCut:[",join(',',@notCut),"]\n" if @notCut;
55 close O;