new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / sperm / getkmhist.pl
blobc245046d1a234d77d7e2674b7b90460d862f5442
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use IO::Unread qw(unread);
5 use Data::Dump qw(ddx);
7 die "Usage: $0 <input>\n" if @ARGV < 1;
8 my ($inf)=@ARGV;
10 sub openfile($) {
11 my ($filename)=@_;
12 my $infile;
13 if ($filename=~/.xz$/) {
14 open( $infile,"-|","xz -dc $filename") or die "Error opening $filename: $!\n";
15 } elsif ($filename=~/.gz$/) {
16 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
17 } elsif ($filename=~/.lz4$/) {
18 open( $infile,"-|","/opt/bin/lz4c -dy $filename /dev/stdout") or die "Error opening $filename: $!\n";
19 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
20 return $infile;
23 my $IN = openfile($inf);
24 my $tmp = <$IN>;
25 unread $IN, $tmp;
26 my $str = (split /\t/,$tmp)[0];
27 my $len = length $str;
28 my $allcnt = 4 ** $len;
30 my ($Sum,$kMax,$kMin,$cMax,$cMin,%Hist)=(0,0,999999999,0,999999999);
31 my ($KmerSum,$K90,$K75,$K50,$K25,$K10,$Kadding)=(0,0,0,0,0,0,0);
33 while (<$IN>) {
34 chomp;
35 my ($str,$cnt) = split /\t/;
36 #print "$str\t[$cnt]\n";
37 ++$Hist{$cnt};
38 ++$Sum;
39 $KmerSum += $cnt;
40 $kMax = $cnt if $cnt > $kMax;
41 $kMin = $cnt if $cnt < $kMin;
43 close $IN;
45 for (values %Hist) {
46 $cMax = $_ if $cMax < $_;
47 $cMin = $_ if $cMin > $_;
50 open OUT,'>',"$inf.hist" or die;
51 print OUT "# K\t$len\t$allcnt
52 # KmerFreq_Min-Max\t$kMin\t$kMax
53 # Cnt_Min-Max\t$cMin\t$cMax
54 # CntOfFoundKmer\t$Sum\t",$Sum/$allcnt,"\n";
56 my @order = sort {$a <=> $b} keys %Hist;
57 for (@order) {
58 $Kadding += $_ * $Hist{$_};
59 if ( 10 * $Kadding < $KmerSum ) {
60 $K90 = $_;
61 } elsif ( 4 * $Kadding < $KmerSum ) {
62 $K75 = $_;
63 } elsif ( 2 * $Kadding < $KmerSum ) {
64 $K50 = $_;
65 } elsif ( 4 * $Kadding < $KmerSum * 3 ) {
66 $K25 = $_;
67 } elsif ( 10 * $Kadding < $KmerSum * 9 ) {
68 $K10 = $_;
71 print OUT "# KmerSum: $KmerSum\n# K90: $K90, K75: $K75, K50: $K50, K25: $K25, K10: $K10\n";
72 my @t;
73 push @t, $_/$KmerSum for ($K90,$K75,$K50,$K25,$K10);
74 print OUT '# Kratio: ',join(', ',@t),"\n";
76 print OUT "# KmerFreq\tCntOfThisFreq\tKmerSumRatio\tCumulativeKmerSumRatio\n";
77 my $cumulative = 0;
78 for (@order) {
79 $cumulative += $_*$Hist{$_};
80 print OUT join("\t",$_,$Hist{$_},$_*$Hist{$_}/$KmerSum,$cumulative/$KmerSum),"\n";
82 close OUT;
84 __END__
85 cat kmerfreq.lst|while read a;do perl getkmhist.pl $a; done &
87 cat kmerfreq.lst|while read a;do echo perl "getkmhist.pl $a &"; done > t.sh