3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 0.0.1 @ 20110803
6 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
7 #use lib '/export/data0/gentoo/tmp';
10 use Time
::HiRes qw
( gettimeofday tv_interval
);
12 use Data
::Dump
qw(dump);
20 \t-o output prefix (./stat).{info,insD}
21 \t-b No pause for batch runs
22 For SOAP, both soap/single files and STDERR dump are needed.
23 For BWA , only insert size stat. is done currently.
25 our $ARG_DESC='soap/sam files{,.gz,.bz2}';
28 $opt_o='./stat' if ! $opt_o;
29 die "[x]No input files found !\n" unless @ARGV;
30 #die "[!]Max 252 files supported.\n" if @ARGV>252;
32 print STDERR
"From [@ARGV] to [$opt_o]\n";
33 unless ($opt_b) {print STDERR
'press [Enter] to continue...'; <STDIN
>;}
35 my $start_time = [gettimeofday
];
39 if ($href and %$href) {
41 $m = (sort {$a<=>$b} keys %$href)[-1];
44 push @str,join(':',$_,$$href{$_}||0);
46 return \
join(',',@str);
52 if ($href and %$href) {
54 for (sort {$a<=>$b} keys %$href) {
55 push @str,join(':',$_,$$href{$_});
57 return \join(',',@str);
61 sub getRealpos
($$$$) {
62 my ($len,$strand,$realpos,$trim)=@_;
63 if ($strand eq '-') { # Negative
64 $realpos += $len; # should be $len-1. So, starting 0. (+ & -, never meets.)
65 if ($trim =~ /(\d+)S$/) {
66 $realpos += $1; # $1 only reset after next /()/
68 } elsif ($strand eq '+') { # Positive
69 if ($trim =~ /^(\d+)S/) {
78 case 'R': opt->FR = 0;
80 alnSeq[0] = mseqs->seqList+i;
81 alnSeq[1] = mseqs->seqList+i+1;
82 pe_aux.len = alnSeq[0]->len;
84 fprintf(stderr,"%c\n", "+-"[strain]);
86 int strain1 = (p->info >> 24)&1;
87 int strain2 = (q->info >> 24)&1;
89 if(!strain1 && q->pos-p->pos + o->len >= o->min_ins && q->pos-p->pos + o->len <= o->max_ins) return TRUE;
90 else if(strain1 && p->pos-q->pos+o->len >= o->min_ins && p->pos-q->pos+o->len <= o->max_ins) return TRUE;
93 else if(!o->FR){ # RF, (-R)
94 if(strain1 && q->pos - p->pos >= o->min_ins && q->pos - p->pos + o->len <= o->max_ins) return TRUE;
95 else if(!strain1 && p->pos-q->pos >= o->min_ins && p->pos-q->pos+o->len <= o->max_ins) return TRUE;
99 Well, InsertSize should always be defined from the 2 terminals of a sequencing fragument.
100 Thus, for FR, the same as what from the gel; for RF, should be less than the gel result by the length of what actually been sequenced.
106 if ($filename=~/.bz2$/) {
107 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
108 } elsif ($filename=~/.gz$/) {
109 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
110 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
113 sub checkfiletype
($) {
117 if ($head =~ /^\[bwa_sai2sam_pe_core\]/) {
119 } elsif ($head =~ /^\@SQ/) {
121 } elsif ($head =~ /[ATCG]/) {
123 } elsif ($head =~ /^$/) {
125 00000000 0a 42 65 67 69 6e 20 50 72 6f 67 72 61 6d 20 53 |.Begin Program S|
126 00000010 4f 41 50 61 6c 69 67 6e 65 72 2f 73 6f 61 70 32 |OAPaligner/soap2|
127 00000020 0a 46 72 69 20 4a 75 6c 20 31 35 20 30 39 3a 32 |.Fri Jul 15 09:2|
128 00000030 39 3a 34 32 20 32 30 31 31 0a 52 65 66 65 72 65 |9:42 2011.Refere|
129 00000040 6e 63 65 3a 20 2e 2f 72 65 66 2f 68 75 6d 61 6e |nce: ./ref/human|
130 00000050 2e 66 61 2e 69 6e 64 65 78 0a 51 75 65 72 79 20 |.fa.index.Query |
131 00000060 46 69 6c 65 20 61 3a 20 2e 2f 30 6e 6f 6d 61 73 |File a: ./0nomas|
138 sub sumsoapdata
($$) {
139 my ($hit,$len,$chr,$types,$trim,$mistr)=@
{$_[0]};
141 my ($BPOut,$ReadsOut,$MisSum,$TrimedBP,$TrimedReads,%Hit9r,%Hit9bp,%misMatch,%Indel)=(0,0,0,0,0);
142 #my (%chrBPOut,%chrReadsOut,%chrMisSum,%chrTrimedBP,%chrTrimedReads,%chrHit9r,%chrHit9bp,%chrmisMatch,%chrIndel);
144 $BPOut += $len;# $chrBPOut{$chr} += $len;
145 ++$ReadsOut;# ++$chrReadsOut{$chr};
146 $missed=$mistr=~tr/ATCGatcg//;
147 ++$misMatch{$missed};
148 #++$chrmisMatch{$chr}{$missed};
150 #$chrMisSum{$chr} += $missed;
152 $hit=4 if $hit>4; # max to count 3, then >=4. Ancient Chinese wisdom, and a bit more ...
154 #++$chrHit9r{$chr}{$hit};
155 $Hit9bp{$hit} += $len;
156 #$chrHit9bp{$chr}{$hit} += $len;
159 #++$misMatch{$types};
160 #++$chrmisMatch{$chr}{$types};
161 } elsif ($types < 200) { # '3S33M9D39M', '32M1D14M29S' exists ?
162 ++$Indel{$types-100};
163 ++$chrIndel{$chr}{$types-100};
165 ++$Indel{200-$types};
166 ++$chrIndel{$chr}{200-$types};
170 ++$Indel{200-$types};
171 #++$chrIndel{$chr}{200-$types};
172 } elsif ($types > 100) {
173 ++$Indel{$types-100};
174 #++$chrIndel{$chr}{$types-100};
177 @trims = $trim =~ /(\d+)S/;
180 #++$chrTrimedReads{$chr};
183 #$chrTrimedBP{$chr} += $_;
186 $dathref->{'BPOut'} += $BPOut;
187 $dathref->{'ReadsOut'} += $ReadsOut;
188 $dathref->{'MisSum'} += $MisSum;
189 $dathref->{'TrimedReads'} += $TrimedReads;
190 $dathref->{'TrimedBP'} += $TrimedBP;
191 $dathref->{'misMatch'}{$_} += $misMatch{$_} for keys %misMatch;
192 $dathref->{'Indel'}{$_} += $Indel{$_} for keys %Indel;
193 $dathref->{'Hit9r'}{$_} += $Hit9r{$_} for keys %Hit9r;
194 $dathref->{'Hit9bp'}{$_} += $Hit9bp{$_} for keys %Hit9bp;
199 $datsum{'PEuniqPairs'}=0;
200 my ($BadLines,$PESE,$pairs,$lastpos,$line1,$line2,$pp,$pn,$calins,%insD)=(0,'PE',0);
201 while ($line1=<$fh>) {
203 my ($id1, $n1, $len1, $f1, $chr1, $x1, $types1, $m1, $mistr1)
204 = (split "\t", $line1)[0,3,5,6,7,8,9,-2,-1];
205 unless (defined $types1) { # soap2 output always more than 10 columes.
209 &sumsoapdata
([$n1, $len1, $chr1, $types1, $m1, $mistr1],\
%datsum);
212 my ($id2, $n2, $len2, $f2, $chr2, $x2, $types2, $m2, $mistr2)
213 = (split "\t", $line2)[0,3,5,6,7,8,9,-2,-1];
214 unless (defined $types2) { # soap2 output always more than 10 columes.
218 #($soapid,$hit,$len,$strand,$chr,$pos,$trim) = (split(/\t/))[0,3,5,6,7,8,-2];
219 # ($hit,$len, $chr,$types,$trim,$mistr) = @lines[3,5,7,9,-2,-1];
222 &sumsoapdata
([$n2, $len2, $chr2, $types2, $m2, $mistr2],\
%datsum);
223 if (($PESE eq 'SE') or ($id1 ne $id2)){ # single
227 next if $n1+$n2>2 or $chr1 ne $chr2;
234 if ($ins > 1500) { # FR => +.pos < -.pos; RF => -.pos < +.pos
236 } else { next if $pp > $pn; }
239 $line1=&getRealpos
($len1, $f1, $x1, $m1); # $len,$strand,$realpos,$trim
240 $line2=&getRealpos
($len2, $f2, $x2, $m2); # Well, $line{1,2} is recycled.
241 $calins=abs($line1-$line2); # -.starting=0
244 $datsum{'BadLines'}=$BadLines;
246 $datsum{'PEuniqPairs'}=$pairs;
248 return [$PESE,[0,0,0],\
%datsum,\
%insD];
252 my ($Pairs,$Paired,$Singled,$Reads,$Alignment)=(0,0,0,0,0);
254 $Pairs = (split)[-2] if /^Total Pairs:/;
255 $Paired = (split)[1] if /^Paired:/;
256 $Singled = (split)[1] if /^Singled:/;
257 $Reads = (split)[-1] if /^Total Reads/;
258 $Alignment = (split)[1] if /^Alignment:/;
262 @RET=('SE',[$Reads,0,$Alignment]);
264 @RET=('PE',[$Pairs*2,$Paired*2,$Singled]);
268 Total Pairs: 34776407 PE
269 Paired: 17719335 (50.95%) PE
270 Singled: 30467634 (43.81%) SE
272 IN: 34776407 reads x 2 fq file = 69552814 reads from _1 & _2.fq
273 -o: 17719335 pairs = 17719335 x 2 =35438670 lines in .soap
274 -2: 30467634 lines in .single
276 17719335/34776407 = 0.50952172833726037310294878939046
277 30467634/69552814 = 0.43805034257851882168275750856033
281 Alignment: 22 (88.00%)
286 #my ($QNAME,$FLAG,$RNAME,$POS,$MAPQ,$CIAGR,$MRNM,$MPOS,$ISIZE,$SEQ,$QUAL,$OPT)=@read1;
287 # 0 1 2 3 4 5 6 7 8 9 10 11
288 my ($FLAG,$RNAME,$POS,$CIAGR,$MRNM,$MPOS,$ISIZE)=@
{$_[0]}[1,2,3,5,6,7,8];
290 my ($BPOut,$ReadsOut,$MisSum,$TrimedBP,$TrimedReads,%Hit9r,%Hit9bp,%misMatch,%Indel)=(0,0,0,0,0);
291 my (@trims,@matches,@inserts,@deletes,$missed);
292 my ($isPaired,$isSingled)=(0,0);
293 unless ($CIAGR eq '*') {
295 } else { return [0,0]; }
298 } else { #($FLAG & 8) and ($FLAG & 4 == 0)
300 #print '[',join('|',@{$_[0]}),']' unless $FLAG & 8;
301 #print $FLAG & 8,'|';
303 @matches = $CIAGR =~ /(\d+)M/;
307 my ($Insertion,$Deletion)=(0,0);
308 @inserts = $CIAGR =~ /(\d+)I/;
313 @deletes = $CIAGR =~ /(\d+)D/;
317 if ($Insertion and $Deletion) {
318 $Indel{$Insertion} += 0.5;
319 $Indel{$Deletion} += 0.5;
320 } elsif ($Insertion) {
321 ++$Indel{$Insertion};
325 my $Alternativehits='';
327 $Alternativehits = $1 if /^XA:Z:([\w,+-;]+)$/; #XA:Z:chrX,+1144912,100M,0;
328 next unless /^XM:i:(\d+)$/;
330 #print '[',join('|',@{$_[0]}),']',"\t[$1,$MisSum]\n";
332 my @excigar=split(';',$Alternativehits);
333 my $hit=1+scalar @excigar;
334 $hit=4 if $hit>4; # max to count 3, then >=4. Ancient Chinese wisdom, and a bit more ...
336 $Hit9bp{$hit} += $BPOut;
337 ++$misMatch{$MisSum};
338 @trims = $CIAGR =~ /(\d+)S/;
341 #++$chrTrimedReads{$chr};
344 #$chrTrimedBP{$chr} += $_;
347 $dathref->{'BPOut'} += $BPOut;
348 $dathref->{'ReadsOut'} += $ReadsOut;
349 $dathref->{'MisSum'} += $MisSum;
350 $dathref->{'TrimedReads'} += $TrimedReads;
351 $dathref->{'TrimedBP'} += $TrimedBP;
352 $dathref->{'misMatch'}{$_} += $misMatch{$_} for keys %misMatch;
353 $dathref->{'Indel'}{$_} += $Indel{$_} for keys %Indel;
354 $dathref->{'Hit9r'}{$_} += $Hit9r{$_} for keys %Hit9r;
355 $dathref->{'Hit9bp'}{$_} += $Hit9bp{$_} for keys %Hit9bp;
356 return [$isPaired,$isSingled];
360 my ($PESE,$Reads,$toPaired,$toSingled,$BadLines,%datsum,%insD)=('PE',0,0,0,0);
361 my ($Paired,$Singled)=(0,0);
362 my ($line1,$line2,$calins,$sumret);
364 │Col │ Field │ Description │
365 ├────┼───────┼──────────────────────────────────────────────────────────┤
366 │ 1 │ QNAME │ Query (pair) NAME │
367 │ 2 │ FLAG │ bitwise FLAG │
368 │ 3 │ RNAME │ Reference sequence NAME │
369 │ 4 │ POS │ 1-based leftmost POSition/coordinate of clipped sequence │
370 │ 5 │ MAPQ │ MAPping Quality (Phred-scaled) │
371 │ 6 │ CIAGR │ extended CIGAR string │
372 │ 7 │ MRNM │ Mate Reference sequence NaMe (`=' if same as RNAME) │
373 │ 8 │ MPOS │ 1-based Mate POSistion │
374 │ 9 │ ISIZE │ Inferred insert SIZE │
375 │10 │ SEQ │ query SEQuence on the same strand as the reference │
376 │11 │ QUAL │ query QUALity (ASCII-33 gives the Phred base quality) │
377 │12 │ OPT │ variable OPTional fields in the format TAG:VTYPE:VALUE │
378 #my ($QNAME,$FLAG,$RNAME,$POS,$MAPQ,$CIAGR,$MRNM,$MPOS,$ISIZE,$SEQ,$QUAL,$OPT)=@read1;
379 # 0 1 2 3 4 5 6 7 8 9 10 11
381 while ($line1=<$fh>) {
382 next if $line1=~/^@\w\w\t\w\w:/;
383 my @read1=split /\t/, $line1;
384 #print "[",join('|',@read1),"]\n";
385 #print scalar @read1,"|\n";
386 if (scalar @read1 < 11) {
391 $sumret=&sumsamdata
(\
@read1,\
%datsum);
392 $Paired += $sumret->[0];
393 $Singled += $sumret->[1];
396 my @read2=split /\t/, $line2;
397 if (scalar @read2 < 11) {
402 $sumret=&sumsamdata
(\
@read2,\
%datsum);
403 $Paired += $sumret->[0];
404 $Singled += $sumret->[1];
405 if (($PESE eq 'SE') or ($read1[0] ne $read2[0])){ # single
409 $calins=abs($read2[8]);
410 ++$insD{$calins} if $sumret->[0] and $calins < 1500;
412 $datsum{'BadLines'}=$BadLines;
413 return [$PESE,[$Reads,$Paired,$Singled],\
%datsum,\
%insD];
417 'soap' => \
&statsoap
,
418 # 'samlog' => sub {},
419 'soaplog' => \
&statsoaplog
,
422 sub sumintohash
($$) {
423 my ($inhref,$intohref)=@_;
424 for my $key (keys %{$inhref}) {
425 if (ref($$inhref{$key}) eq 'HASH') {
426 $$intohref{$key}={} unless exists $$intohref{$key};
427 &sumintohash
($$inhref{$key},$$intohref{$key});
429 $$intohref{$key} += $$inhref{$key};
433 print STDERR '-' x 80,"\n";
435 print STDERR 'to',"\n";
437 print STDERR '-' x 80,"\n";
442 my ($withPE,$InReads,$mapPair,$mapSingle,%DatSum,%InsD)=(0,0,0,0);
443 while($_=shift @ARGV) {
447 $infile=openfile
($_);
448 my $type=checkfiletype
($infile);
450 print STDERR
"$files\t[$type] $_ ...";
451 $infile=openfile
($_);
452 if (exists $dostat{$type}) {
453 my $ret=&{$dostat{$type}}($infile); # [$PESE,[$Reads,$Paired*2,$Singled],\%datsum,\%insD]
454 $InReads += $ret->[1]->[0];
455 $mapPair += $ret->[1]->[1];
456 $mapSingle += $ret->[1]->[2];
457 if ($ret->[0] eq 'PE') {
459 &sumintohash
($ret->[3],\
%InsD) if $ret->[3];
461 &sumintohash
($ret->[2],\
%DatSum) if $ret->[2];
462 #print "\n[";dump($ret);print "]\n";
464 print STDERR
"\b\b\bskipped."
469 warn "[!]Files Read: $files\n";
471 my ($n,$v,$avg,$std,$Lsd,$Rsd,$max_y,$max_x,$sum,$sum2)=(0);
473 open O
,'>',"$opt_o.insD" or die "[x]Error opening $opt_o.insD: $!\n";
474 for my $k (keys %InsD) {
482 $std = sqrt($sum2/$n-$avg*$avg);
483 print O
"# $avg ± $std\n";
484 ($max_y,$max_x)=(-1,0);
485 for my $k ($avg-$std .. $avg+$std) { # 68.27%
486 next unless $v=$InsD{$k};
492 my $cutoff = $max_y / 1000;
493 $cutoff = 3 if $cutoff<3;
494 my ($diff, $Lc, $Rc);
495 for my $k (keys %InsD) {
497 next if $v < $cutoff;
500 $Lsd += $v * $diff * $diff;
504 $Rsd += $v * $diff * $diff;
510 $Lsd = sqrt($Lsd/$Lc);
511 $Rsd = sqrt($Rsd/$Rc);
512 print O
"# +$Lsd -$Rsd\n#InsertSize\tCount\n";
513 for my $k (sort {$a <=> $b} keys %InsD) {
520 open NFO
,'>',"$opt_o.info" or die "[x]Error opening $opt_o.info: $!\n";
522 my $p=sprintf "%.2f",100*$max_y/$n;
523 $avg=int($avg*10+.5)/10;
524 $Lsd=int($Lsd*100+.5)/100;
525 $Rsd=int($Rsd*100+.5)/100;
526 print NFO
"#fmtS\tTotalReads\ttoPaired\ttoSingled\tModeIns(p%),Lsd,Rsd,InsAvg,STD\n";
527 print NFO
"Summary\t",join("\t",$InReads,$mapPair,$mapSingle,"$max_x($p %),$Lsd,$Rsd,$avg,$std"),"\n";
529 print NFO
"#fmtS\tTotalReads\tAlignment\n";
530 print NFO
"Summary\t",join("\t",$InReads,$mapSingle),"\n";
532 print NFO
"\n#fmtC\tReadsOut\tBPOut\tMisSum\tTrimedReads\tTrimedBP\tmisMatchReads\tReads\@Hit\tBP\@Hit\tIndelReads\tBadLines\n";
533 print NFO
join("\t",'ALL',$DatSum{'ReadsOut'},$DatSum{'BPOut'},$DatSum{'MisSum'},$DatSum{'TrimedReads'},$DatSum{'TrimedBP'},
534 ${&combineC
($DatSum{'misMatch'})},${&combineJ
($DatSum{'Hit9r'})},${&combineJ
($DatSum{'Hit9bp'})},${&combineJ
($DatSum{'Indel'})},$DatSum{'BadLines'}),"\n\n";# if exists $DatSum{'Hit9r'};
535 #print NFO "#fmtP\tReadsOut\tBPOut\tMisSum\tTrimedReads\tTrimedBP\tmisMatchReads\tReads\@Hit\tBP\@Hit\tIndelReads\n";
536 #print NFO join("\t",";$_",$chrReadsOut{$_},$chrBPOut{$_},$chrMisSum{$_}||0,$chrTrimedReads{$_}||0,$chrTrimedBP{$_}||0,${&combineC(\%{$chrmisMatch{$_}})},${&combineJ(\%{$chrHit9r{$_}})},${&combineJ(\%{$chrHit9bp{$_}})},${&combineJ(\%{$chrIndel{$_}})}),"\n" for sort keys %chrReadsOut;
539 #print "\n[";dump([$InReads,$mapPair,$mapSingle,\%DatSum,\%InsD]);print "]\n";
542 [$InReads,$mapPair,$mapSingle,\
%DatSum,\
%InsD]
550 Hit9bp
=> { 1 => 323074, 2 => 7900, 3 => 3063, 4 => 62358 },
551 Hit9r
=> { 1 => 3266, 2 => 79, 3 => 31, 4 => 624 },