modified: myjupyterlab.sh
[GalaxyCodeBases.git] / released / pIRS.old / bwasam / dupbamstat.pl
blob053a327869a136bdb36a25c5b3d9f55c09a46b04
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 #use Data::Dump qw(dump ddx);
6 chomp(my $user=`id -nru`);
7 my ($SAMTOOLS);
8 if ($user eq 'galaxy') {
9 $SAMTOOLS='samtools';
10 } else {
11 $SAMTOOLS='/ifs1/ST_ASMB/USER/huxuesong/group/illumia/_reads/samtools';
14 unless (@ARGV){
15 print "perl $0 <maxPEpairs(0=Inf.)> <outfile> <sorted bam files>\n"; # soaps.nfo can store the file size of soap. Too
16 exit;
19 my ($STAGELENGTH,$stageborder)=(2000,2000);
21 my $maxpairs = 2*(shift @ARGV);
22 my $statout = shift @ARGV;
23 my ($DupSE,$DupPE,$ReadsStat,%SEDup,%PEDup)=(0,0,0);
24 my $FilesStr='[' . join('], [',@ARGV) . "]";
26 while(my $bamfile=shift @ARGV) {
27 next unless -f $bamfile;
28 open SAM,'-|',"$SAMTOOLS view $bamfile" or (warn "[!]Error opening $bamfile: $!\n" and next);
29 my ($READLEN,$i)=(0,0);
30 while (defined($_=<SAM>) and $i<2048) {
31 my @read1=split /\t/;
32 next unless $read1[5] =~ /^(\d+)M$/;
33 $READLEN = $1 if $READLEN < $1;
34 ++$i;
36 close SAM;
37 print STDERR "Read [$bamfile] $READLEN\n";
39 my ($lastID,$ReadID,$lastL,$lastR,$readL,$readR,$isDupSE,$isDupPE)=('','',0,0,0,0,0,0);
40 open SAM,'-|',"$SAMTOOLS view $bamfile" or (warn "[!]Error opening $bamfile: $!\n" and next);
41 while (<SAM>) {
42 #next if /^(#|@)/;
43 #chomp;
44 my @read1=split /\t/;
45 next if $read1[2] eq '*'; # unmap
46 next if $read1[5] eq '*'; # Single
47 next unless $read1[1] & 3; # paired + mapped in a proper pair
48 next if $read1[1] >= 256; # not primary || QC failure || optical or PCR duplicate
49 next unless $read1[5] =~ /^(\d+)M$/;
50 next unless $1 == $READLEN;
51 next if $read1[11] eq 'XT:A:R'; # Type: Unique/Repeat/N/Mate-sw, N not found.
52 #print "$_";
53 ++$ReadsStat;
54 ($ReadID,$readL,$readR)=@read1[0,3,7];
55 my $dupcnt;
56 ++$SEDup{$readL}{$ReadID} if $read1[1] & 0x40; # only Read 1
57 my ($reada,$readb) = sort {$a<=>$b} ($readL,$readR);
58 ++$PEDup{$readb}{$reada}{$ReadID};
59 if ($reada>$stageborder) {
60 for (sort {$a<=>$b} keys %SEDup) {
61 $dupcnt = scalar keys %{$SEDup{$_}};
62 $DupSE += $dupcnt if $dupcnt>=2;
64 %SEDup=();
65 for my $Rb (sort {$a<=>$b} keys %PEDup) {
66 if ($Rb<=$stageborder) {
67 for my $Ra (sort {$a<=>$b} keys %{$PEDup{$Rb}}) {
68 $dupcnt = scalar keys %{$PEDup{$Rb}{$Ra}};
69 $DupPE += $dupcnt if $dupcnt>=2;
71 delete $PEDup{$Rb};
74 $stageborder += $STAGELENGTH;
76 last if $maxpairs && $ReadsStat>=$maxpairs;
77 =pod
78 if ($readL == $lastL) {
79 $isDupSE=1;
80 ++$SEDup{$ReadID};
81 if ($readR == $lastR) {
82 $isDupPE=1;
83 ++$PEDup{$ReadID};
85 } else {
86 if ($isDupSE) {
87 ++$DupSE;
88 ++$PEDup{$lastID};
90 if ($isDupPE) {
91 ++$DupPE;
92 ++$SEDup{$lastID};
94 ($lastID,$lastL,$lastR)=($ReadID,$readL,$readR);
95 ($isDupSE,$isDupPE)=(0,0);
97 ++$ReadsStat;436748205075761`
98 next;
99 =cut
100 =pod
101 ┌────┬───────┬──────────────────────────────────────────────────────────┐
102 │Col │ Field │ Description │
103 ├────┼───────┼──────────────────────────────────────────────────────────┤
104 │ 1 │ QNAME │ Query (pair) NAME │
105 │ 2 │ FLAG │ bitwise FLAG │
106 │ 3 │ RNAME │ Reference sequence NAME │
107 │ 4 │ POS │ 1-based leftmost POSition/coordinate of clipped sequence │
108 │ 5 │ MAPQ │ MAPping Quality (Phred-scaled) │
109 │ 6 │ CIAGR │ extended CIGAR string │
110 │ 7 │ MRNM │ Mate Reference sequence NaMe (`=' if same as RNAME) │
111 │ 8 │ MPOS │ 1-based Mate POSistion │
112 │ 9 │ ISIZE │ Inferred insert SIZE │
113 │10 │ SEQ │ query SEQuence on the same strand as the reference │
114 │11 │ QUAL │ query QUALity (ASCII-33 gives the Phred base quality) │
115 │12 │ OPT │ variable OPTional fields in the format TAG:VTYPE:VALUE │
116 └────┴───────┴──────────────────────────────────────────────────────────┘
117 =cut
119 close SAM;
120 #ddx \%NFO;
123 $ReadsStat /= 2;
124 open OA,'>',$statout or die "Error: $!\n";
125 print OA "Input Files: $FilesStr\nTotal Read Pairs: $ReadsStat\nSE Dup: ",$DupSE,"\nPE dup: ",$DupPE,"\n",'-'x80,"\nPE dup ratio: ",$DupPE/$ReadsStat,"\nSE dup ratio: ",$DupSE/$ReadsStat,"\n";
126 close OA;
128 __END__