new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / radseq / cmpsepe.pl
blob9dce3269116ff3f43c6bdabed5a65fa1f540db08
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ PKU <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120530
5 =cut
6 use strict;
7 use warnings;
8 #use Data::Dump qw(ddx);
10 die "Usage: $0 <sam1> <sam2> <out prefix>\n" if @ARGV<2;
11 my $in1=shift;
12 my $in2=shift;
13 my $outp=shift;
15 sub openfile($) {
16 my ($filename)=@_;
17 my ($infile,$lastline);
18 if ($filename=~/.bz2$/) {
19 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
20 } elsif ($filename=~/.gz$/) {
21 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
22 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
23 my $line;
24 while (defined($line=<$infile>)) {
25 if ($line =~ /^@\w\w\t\w\w:/) {
26 next;
28 chomp $line;
29 my @items = split /\t/,$line;
30 #warn "[",join(" | ",@items),"]\n";
31 if ($items[1] & 128) { # second read
32 next;
34 return [$infile,\@items];
36 die "File Error.";
39 my $samin1 = openfile($in1);
40 my $samin2 = openfile($in2);
41 open O,'|-',"gzip -9c >$outp.cmpsam.gz" or die "Error opening $outp.cmpsam.gz with gzip: $!\n";
42 open L,'>',$outp.'.log' or die "Error opening $outp.log: $!\n";
43 select(L);
44 $|=1;
45 select(STDOUT);
46 print L "From [$in1,$in2] to [$outp.cmpsam.gz]\n";
47 my ($Total,$Out,$notOut)=(0,0,0);
49 sub getNextRead($) {
50 my $FHa=$_[0];
51 my $t=$FHa->[1];
52 $FHa->[1] = undef;
53 my $line;
54 while (defined($line = readline($$FHa[0]))) {
55 chomp $line;
56 my @items = split /\t/,$line;
57 if ($items[1] & 128) { # second read
58 next;
60 $FHa->[1] = \@items;
61 return $t;
65 my ($StatSum,$arr1,$arr2,%Stat,$XT1,$XT2)=(0);
66 while (1) {
67 $arr1 = getNextRead($samin1) or last;
68 $arr2 = getNextRead($samin2) or last;
69 die "$$arr1[0] ne $$arr2[0]" if $$arr1[0] ne $$arr2[0];
70 my $diff=0;
71 $diff |= 1 if (($$arr1[1] & 16) != ($$arr2[1] & 16)); # strand of the query
72 $diff |= 2 if ($$arr1[2] ne $$arr2[2]); # Reference sequence NAME
73 $diff |= 4 if ($$arr1[3] ne $$arr2[3]); # POS
74 $diff |= 8 if (($$arr1[4]>0) != ($$arr2[4]>0)); # MAPQ == 0 or not.
75 $diff |= 16 if ($$arr1[5] ne $$arr2[5]); # CIAGR
76 $XT1 = $XT2 = '*';
77 if ($$arr1[2] eq '*') {
78 $diff |= 256;
79 } else {
80 $XT1 = (grep(/^XT:/,@$arr1))[0];
81 if (defined $XT1) {
82 $XT1 = (split(':',$XT1))[2];
83 $diff |= 64 if $XT1 eq 'U';
84 } else {
85 #warn "---[",join(" | ",@$arr1),"]\n";
86 $diff |= 1024;
89 if ($$arr2[2] eq '*') {
90 $diff |= 512;
91 } else {
92 $XT2 = (grep(/^XT:/,@$arr2))[0];
93 if (defined $XT2) {
94 $XT2 = (split(':',$XT2))[2];
95 $diff |= 128 if $XT2 eq 'U';
96 } else {
97 #warn "---[",join(" | ",@$arr2),"]\n";
98 $diff |= 2048;
101 $diff |= 32 if ( $XT1 and $XT2 and ($XT1 ne $XT2) );
102 print O "[$#$arr1] [$#$arr2] $$arr1[0] - $diff\n";
103 ++$Stat{$diff};
104 ++$StatSum;
106 print L '# ',join(',',reverse qw/Strand Ref Pos MapQ>0 CIAGR XT 1=U 2=U 1unmap 2unmap 1noXT 2noXT/),"\n";
107 close $samin1->[0];
108 close $samin2->[0];
109 close O;
111 print L "Stat of [$StatSum] Read_1s:\n";
112 for (sort {$Stat{$b}<=>$Stat{$a}} keys %Stat) {
113 print L "$_\t$Stat{$_}\t";
114 printf L "%.6f\t%#x\t%#014b\n",$Stat{$_}/$StatSum,($_) x 2;
116 print L "\n";
117 close L;
119 __END__
120 grep -P '0b[01]{5}1[01]{6}' PD2M-LSJ-BHX011.log|awk 'BEGIN{sum=0}{sum+=$2}END{print sum}'