4 #use Data::Dump qw(dump ddx);
6 chomp(my $user=`id -nru`);
8 if ($user eq 'galaxy') {
11 $SAMTOOLS='/ifs1/ST_ASMB/USER/huxuesong/group/illumia/_reads/samtools';
15 print "perl $0 <maxPEpairs(0=Inf.)> <outfile> <sorted bam files>\n"; # soaps.nfo can store the file size of soap. Too
19 my ($STAGELENGTH,$stageborder)=(2000,2000);
21 my $maxpairs = 2*(shift @ARGV);
22 my $statout = shift @ARGV;
23 my ($DupSE,$DupPE,$ReadsStat,%SEDup,%PEDup)=(0,0,0);
24 my $FilesStr='[' . join('], [',@ARGV) . "]";
26 while(my $bamfile=shift @ARGV) {
27 next unless -f
$bamfile;
28 open SAM
,'-|',"$SAMTOOLS view $bamfile" or (warn "[!]Error opening $bamfile: $!\n" and next);
29 my ($READLEN,$i)=(0,0);
30 while (defined($_=<SAM
>) and $i<2048) {
32 next unless $read1[5] =~ /^(\d+)M$/;
33 $READLEN = $1 if $READLEN < $1;
37 print STDERR
"Read [$bamfile] $READLEN\n";
39 my ($lastID,$ReadID,$lastL,$lastR,$readL,$readR,$isDupSE,$isDupPE)=('','',0,0,0,0,0,0);
40 open SAM
,'-|',"$SAMTOOLS view $bamfile" or (warn "[!]Error opening $bamfile: $!\n" and next);
45 next if $read1[2] eq '*'; # unmap
46 next if $read1[5] eq '*'; # Single
47 next unless $read1[1] & 3; # paired + mapped in a proper pair
48 next if $read1[1] >= 256; # not primary || QC failure || optical or PCR duplicate
49 next unless $read1[5] =~ /^(\d+)M$/;
50 next unless $1 == $READLEN;
51 next if $read1[11] eq 'XT:A:R'; # Type: Unique/Repeat/N/Mate-sw, N not found.
54 ($ReadID,$readL,$readR)=@read1[0,3,7];
56 ++$SEDup{$readL}{$ReadID} if $read1[1] & 0x40; # only Read 1
57 my ($reada,$readb) = sort {$a<=>$b} ($readL,$readR);
58 ++$PEDup{$readb}{$reada}{$ReadID};
59 if ($reada>$stageborder) {
60 for (sort {$a<=>$b} keys %SEDup) {
61 $dupcnt = scalar keys %{$SEDup{$_}};
62 $DupSE += $dupcnt if $dupcnt>=2;
65 for my $Rb (sort {$a<=>$b} keys %PEDup) {
66 if ($Rb<=$stageborder) {
67 for my $Ra (sort {$a<=>$b} keys %{$PEDup{$Rb}}) {
68 $dupcnt = scalar keys %{$PEDup{$Rb}{$Ra}};
69 $DupPE += $dupcnt if $dupcnt>=2;
74 $stageborder += $STAGELENGTH;
76 last if $maxpairs && $ReadsStat>=$maxpairs;
78 if ($readL == $lastL) {
81 if ($readR == $lastR) {
94 ($lastID,$lastL,$lastR)=($ReadID,$readL,$readR);
95 ($isDupSE,$isDupPE)=(0,0);
97 ++$ReadsStat;436748205075761`
101 ┌────┬───────┬──────────────────────────────────────────────────────────┐
102 │Col │ Field │ Description │
103 ├────┼───────┼──────────────────────────────────────────────────────────┤
104 │ 1 │ QNAME │ Query (pair) NAME │
105 │ 2 │ FLAG │ bitwise FLAG │
106 │ 3 │ RNAME │ Reference sequence NAME │
107 │ 4 │ POS │ 1-based leftmost POSition/coordinate of clipped sequence │
108 │ 5 │ MAPQ │ MAPping Quality (Phred-scaled) │
109 │ 6 │ CIAGR │ extended CIGAR string │
110 │ 7 │ MRNM │ Mate Reference sequence NaMe (`=' if same as RNAME) │
111 │ 8 │ MPOS │ 1-based Mate POSistion │
112 │ 9 │ ISIZE │ Inferred insert SIZE │
113 │10 │ SEQ │ query SEQuence on the same strand as the reference │
114 │11 │ QUAL │ query QUALity (ASCII-33 gives the Phred base quality) │
115 │12 │ OPT │ variable OPTional fields in the format TAG:VTYPE:VALUE │
116 └────┴───────┴──────────────────────────────────────────────────────────┘
124 open OA
,'>',$statout or die "Error: $!\n";
125 print OA
"Input Files: $FilesStr\nTotal Read Pairs: $ReadsStat\nSE Dup: ",$DupSE,"\nPE dup: ",$DupPE,"\n",'-'x80
,"\nPE dup ratio: ",$DupPE/$ReadsStat,"\nSE dup ratio: ",$DupSE/$ReadsStat,"\n";