7 #include <boost/progress.hpp>
8 #include <boost/progress.hpp>
10 #include "self_util.h"
11 #include "stat_soap_coverage.h"
14 stat_soap_coverage::stat_soap_coverage(string str_ref_file_name
,
15 string str_output_prefix
, vector
<string
> vec_soap_file_name
,
16 vector
<string
> vec_width
, bool b_gcdump
, bool b_depwindump
)
18 this->str_ref_file_name
= str_ref_file_name
;
19 this->str_output_prefix
= str_output_prefix
;
20 this->vec_soap_file_name
= vec_soap_file_name
;
21 this->vec_width
= vec_width
;
22 this->b_gcdump
= b_gcdump
;
23 this->b_depwindump
= b_depwindump
;
28 stat_soap_coverage::~stat_soap_coverage()
33 void stat_soap_coverage::DealReference()
35 boost::progress_timer timer
;
36 igzstream
in(str_ref_file_name
.c_str());
40 //uint64_t countLine = 0;
43 while(getline(in
, line
))
49 if(sequence
.length() != 0)
51 map_reference_base
[keyname
] = sequence
;
52 cerr
<< "map_reference_base:" << keyname
<< ":" << sequence
.length() << endl
;
53 vec_chr_keyname
.push_back(keyname
);
57 if(((index
= line
.find(" ")) != string::npos
) || ((index
= line
.find("\t")) != string::npos
) || ((index
= line
.find("\n")) != string::npos
))
59 keyname
= line
.substr(1, index
-1);
63 keyname
= line
.substr(1);
73 if(sequence
.length() != 0)
75 map_reference_base
[keyname
] = sequence
;
76 vec_chr_keyname
.push_back(keyname
);
80 cout
<< "deal reference time: " ;
83 void stat_soap_coverage::DealSoapCoverage()
86 boost::progress_timer timer
;
87 boost::progress_display
display(vec_soap_file_name
.size());
89 for(vector
<string
>::iterator it
= vec_soap_file_name
.begin(); it
!= vec_soap_file_name
.end(); ++it
)
91 igzstream
in((*it
).c_str());
95 vector
<unsigned int> chr_soap_coverage
;
96 //uint64_t countLine = 0;
101 while(getline(in
, line
))
108 if(chr_soap_coverage
.size() == 0)
111 if(((index
= line
.find(" ")) != string::npos
) || ((index
= line
. find("\t")) != string::npos
) || ((index
= line
.find("\n")) != string::npos
))
113 keyname
= line
.substr(1, index
-1);
117 keyname
= line
.substr(1);
119 for(unsigned int i
=0; i
<map_reference_base
[keyname
].size(); ++i
)
127 chr_soap_coverage
.push_back((unsigned int)in_temp
);
129 if(map_soap_coverage
.count(keyname
) == 0)
131 map_soap_coverage
[keyname
] = chr_soap_coverage
;
135 for(unsigned int i
=0; i
<map_soap_coverage
[keyname
].size(); ++i
)
137 if ( uint64_t(map_soap_coverage
[keyname
][i
]) + chr_soap_coverage
[i
] > uint64_t(UINT32_MAX
) ) {
138 map_soap_coverage
[keyname
][i
] = UINT32_MAX
;
140 map_soap_coverage
[keyname
][i
] += chr_soap_coverage
[i
];
143 uint64_t temp = map_soap_coverage[keyname][i] + chr_soap_coverage[i];
146 map_soap_coverage[keyname][i] = 65534;
150 map_soap_coverage[keyname][i] += chr_soap_coverage[i];
160 chr_soap_coverage
.clear();
168 cout
<< "deal soapcoverage time: ";
171 void stat_soap_coverage::StatGC()
173 boost::progress_timer timer
;
174 boost::progress_display
display(vec_width
.size());
176 for(unsigned int i
=0; i
<vec_width
.size(); i
++)
178 unsigned int width
= toInt(vec_width
[i
]);
179 map
<double, vector
<double> > map_soap_gc_depth
;
180 vector
<double> gc_keyname
;
181 map
<double, uint64_t> map_temp_wincount
;
183 ogzstream gzgcdump
, gzdepwindump
;
184 winCountN
[width
] = 0;
187 gzgcdump
.open((str_output_prefix
+"_"+toStr(width
)+".refgc.gz").c_str());
192 gzdepwindump
.open((str_output_prefix
+"_"+toStr(width
)+".windep.gz").c_str());
196 for(map
<string
, string
>::iterator it
= map_reference_base
.begin(); it
!= map_reference_base
.end(); ++it
)
199 string keyname
= it
->first
;
202 gzgcdump
<< ">" << keyname
<<endl
;
207 gzdepwindump
<< ">" << keyname
<< endl
;
209 uint64_t countPos
= 0;
212 uint64_t sumBase
= 0;
214 for(unsigned int j
=0; j
<it
->second
.length(); ++j
)
220 if(map_stat_depth
.count(map_soap_coverage
[keyname
][j
]) == 0)
222 map
<string
, uint64_t> temp_depth
;
223 for(unsigned int chr_key
= 0; chr_key
< vec_chr_keyname
.size(); ++chr_key
)
225 temp_depth
[vec_chr_keyname
[chr_key
]] = 0;
227 temp_depth
[keyname
] = 1;
228 map_stat_depth
[map_soap_coverage
[keyname
][j
]] = temp_depth
;
232 map_stat_depth
[map_soap_coverage
[keyname
][j
]][keyname
] += 1;
240 if((it
->second
[j
] == 'N') || (it
->second
[j
] == 'n'))
246 sumBase
+= map_soap_coverage
[keyname
][j
];
249 if((it
->second
[j
] == 'G') || (it
->second
[j
] == 'C') || (it
->second
[j
] == 'g') || (it
->second
[j
] == 'c'))
256 if(map_sumwincount
.count(width
) == 0)
258 map_sumwincount
[width
] = 1;
262 map_sumwincount
[width
] += 1;
265 if((width
-countN
) != 0)
267 double rate
= (double)countGC
/(width
-countN
) * 100;
268 double key
= int(rate
) + 0.5;
269 if(map_temp_wincount
.count(key
) == 0)
271 map_temp_wincount
[key
] = 1;
275 map_temp_wincount
[key
] += 1;
280 winCountN
[width
] += 1;
283 if(((double)countN
/width
>= 0.75) || ((width
- countN
) <= 30))
290 gzgcdump
<< -1 << endl
;
295 gzdepwindump
<< -1 << endl
;
301 double gc_rate
= (double)countGC
/(width
-countN
) * 100;
304 gzgcdump
<< gc_rate
<< endl
;
307 double gc_key
= int(gc_rate
) + 0.5;
309 if(map_sumdepthcount
.count(width
) == 0)
311 map_sumdepthcount
[width
] = 1;
315 map_sumdepthcount
[width
] += 1;
318 if(map_soap_gc_depth
.count(gc_key
) == 0)
321 temp
.push_back((double)sumBase
/(width
-countN
));
322 map_soap_gc_depth
[gc_key
] = temp
;
323 gc_keyname
.push_back(gc_key
);
327 map_soap_gc_depth
[gc_key
].push_back((double)sumBase
/(width
-countN
));
332 gzdepwindump
<< ((double)sumBase
/(width
-countN
)) << endl
;
340 if((it
->second
[j
] == 'N') || (it
->second
[j
] == 'n'))
346 sumBase
+= map_soap_coverage
[keyname
][j
];
349 if((it
->second
[j
] == 'G') || (it
->second
[j
] == 'C') || (it
->second
[j
] == 'g') || (it
->second
[j
] == 'c'))
365 gzdepwindump
.close();
368 map_wincount
[width
] = map_temp_wincount
;
370 if(map_width_soap_gc_depth
.count(width
) == 0)
372 map_width_soap_gc_depth
[width
] = map_soap_gc_depth
;
373 map_gc_keyname
[width
] = gc_keyname
;
377 cerr
<< "error !" << __FILE__
<< ", " << __LINE__
<<endl
;
383 cout
<< "stat GC time: ";
386 void stat_soap_coverage::StatCoverage()
388 boost::progress_timer timer
;
389 boost::progress_display
display(map_reference_base
.size());
390 uint64_t all_countN
= 0;
391 uint64_t all_coverageNum
= 0;
392 uint64_t all_sumBase
= 0;
393 uint64_t all_sum
= 0;
394 for(map
<string
, string
>::iterator it
= map_reference_base
.begin(); it
!= map_reference_base
.end(); ++it
)
396 string keyname
= it
->first
;
398 uint64_t coverageNum
= 0;
399 uint64_t sumBase
= 0;
401 for(unsigned int i
=0; i
<it
->second
.length(); ++i
)
404 if((it
->second
[i
] == 'N') || (it
->second
[i
] == 'n'))
411 if(map_soap_coverage
[keyname
][i
] != 0)
415 sumBase
+= map_soap_coverage
[keyname
][i
];
416 all_sumBase
+= map_soap_coverage
[keyname
][i
];
420 if(map_stat_coverage
.count(keyname
) == 0)
423 temp
.push_back((double)sumBase
/(it
->second
.length()-countN
));
424 temp
.push_back((double)coverageNum
);
425 temp
.push_back((double)coverageNum
/(it
->second
.length()-countN
));
426 temp
.push_back((double)(it
->second
.length()-countN
));
427 temp
.push_back((double)countN
);
428 temp
.push_back((double)it
->second
.length());
429 map_stat_coverage
[keyname
] = temp
;
433 cerr
<< "error !" << __FILE__
<< ", " << __LINE__
<< endl
;
439 if(map_stat_coverage
.count("_All_") == 0)
442 temp
.push_back((double)all_sumBase
/(all_sum
- all_countN
));
443 temp
.push_back((double)all_coverageNum
);
444 temp
.push_back((double)all_coverageNum
/(all_sum
- all_countN
));
445 temp
.push_back((double)(all_sum
- all_countN
));
446 temp
.push_back((double)all_countN
);
447 temp
.push_back((double)all_sum
);
448 map_stat_coverage
["_All_"] = temp
;
450 cout
<< "stat coverage time: ";
453 void stat_soap_coverage::DealStat()
455 cout
<< "statGC begin!" << endl
;
457 cout
<< "statGC end!" << endl
;
458 cout
<< "statCoverage begin!" << endl
;
460 cout
<< "statCoverage end!" << endl
;
462 boost::progress_timer timer
;
463 boost::progress_display
display(map_width_soap_gc_depth
.size());
464 for(map
<int, map
<double, vector
<double> > >::iterator it
= map_width_soap_gc_depth
.begin(); it
!= map_width_soap_gc_depth
.end(); ++it
)
466 int width
= it
->first
;
467 ofstream
out((str_output_prefix
+ "_" + toStr(width
) + ".dat").c_str());
471 cerr
<< "can't open the file " << (str_output_prefix
+ "_" + toStr(width
) + ".dat") << endl
;
474 vector
<double> gc_keyname
;
475 gc_keyname
= map_gc_keyname
[width
];
476 sort(gc_keyname
.begin(), gc_keyname
.end());
477 map
<double, vector
<double> > temp_gc_output
;
479 for(unsigned int i
=0; i
<gc_keyname
.size(); ++i
)
481 vector
<double> temp
= map_width_soap_gc_depth
[width
][gc_keyname
[i
]];
482 //double gc_rate = gc_keyname[i];
483 uint64_t ref_count
= map_width_soap_gc_depth
[width
][gc_keyname
[i
]].size();
484 double sum_coverage
= 0;
486 for(unsigned int j
=0; j
<temp
.size(); ++j
)
488 sum_coverage
+= temp
[j
];
491 double avg_value
= sum_coverage
/temp
.size();
492 double min_value
= *min_element(temp
.begin(), temp
.end());
493 double max_value
= *max_element(temp
.begin(), temp
.end());
494 sort(temp
.begin(), temp
.end());
496 if(temp
.size() % 2 == 0)
506 Q1
= temp
[int((temp
.size()+1)/4)-1] + (temp
[int((temp
.size()+1)/4)] - temp
[int((temp
.size()+1)/4)-1]) * (((double)(temp
.size()+1)/4)-(int((temp
.size()+1)/4)));
507 Q2
= temp
[int((temp
.size()+1)/2) - 1] + (temp
[int((temp
.size()+1)/2)] - temp
[int((temp
.size()+1)/2) - 1]) * (((double)(temp
.size()+1)/2)-(int((temp
.size()+1)/2)));
508 Q3
= temp
[int((temp
.size()+1)/4*3) - 1] + (temp
[int((temp
.size()+1)/4*3)] - temp
[int((temp
.size()+1)/4*3)]) * (((double)(temp
.size()+1)/4*3)-(int((temp
.size()+1)/4*3)));
522 Q1
= temp
[int(temp
.size() + 1)/4 - 1];
523 Q2
= temp
[int(temp
.size() + 1)/2 - 1];
524 Q3
= temp
[int(temp
.size() + 1)/4 * 3 - 1];
527 double small_value
, big_value
;
528 double small_temp_value
= Q1
-1.5*(Q3
-Q1
);
529 double big_temp_value
= Q3
+1.5*(Q3
-Q1
);
531 for(unsigned int small_i
= 0; small_i
< temp
.size(); small_i
++)
533 if(small_temp_value
< temp
[small_i
])
535 small_value
= temp
[small_i
];
538 small_value
= temp
[small_i
];
541 for(unsigned int big_i
=0; big_i
< temp
.size();big_i
++)
543 if(big_temp_value
< temp
[big_i
])
546 big_value
= temp
[big_i
];
548 big_value
= temp
[big_i
- 1];
551 big_value
= temp
[big_i
];
554 vector
<double> temp_output
;
555 temp_output
.push_back(double(ref_count
));
556 temp_output
.push_back(double(avg_value
));
557 temp_output
.push_back(small_value
);
558 temp_output
.push_back(double(Q1
));
559 temp_output
.push_back(double(Q2
));
560 temp_output
.push_back(double(Q3
));
561 temp_output
.push_back(big_value
);
562 temp_output
.push_back(double(min_value
));
563 temp_output
.push_back(double(max_value
));
565 temp_gc_output
[gc_keyname
[i
]] = temp_output
;
569 double sum_ref_count
= 0;
570 for(unsigned int i
=0; i
<gc_keyname
.size(); ++i
)
572 sum_avg
+= temp_gc_output
[gc_keyname
[i
]][1];
573 sum_ref_count
+= temp_gc_output
[gc_keyname
[i
]][0];
576 double k
= sum_avg
/sum_ref_count
;
577 out
<< "#WinSize=" << width
<< "\tWinCount=" << map_sumwincount
[width
] << "\tDepthCount=" << map_sumdepthcount
[width
] << endl
578 << "#All-N windows count: " << winCountN
[width
] << endl
579 << "#GC%\tRefCnt\tDepthCnt\tMean\tSmall\tQ1\tMid\tQ3\tBig\tMin\tMax\tRefcntcal"
581 for(unsigned int i
=0; i
<gc_keyname
.size(); ++i
)
583 out
<< gc_keyname
[i
] << "\t" << map_wincount
[width
][gc_keyname
[i
]]
584 << "\t" << uint64_t(temp_gc_output
[gc_keyname
[i
]][0])
585 << "\t" << temp_gc_output
[gc_keyname
[i
]][1]
586 << "\t" << temp_gc_output
[gc_keyname
[i
]][2]
587 << "\t" << temp_gc_output
[gc_keyname
[i
]][3]
588 << "\t" << temp_gc_output
[gc_keyname
[i
]][4]
589 << "\t" << temp_gc_output
[gc_keyname
[i
]][5]
590 << "\t" << temp_gc_output
[gc_keyname
[i
]][6]
591 << "\t" << temp_gc_output
[gc_keyname
[i
]][7]
592 << "\t" << temp_gc_output
[gc_keyname
[i
]][8]
593 << "\t" << temp_gc_output
[gc_keyname
[i
]][0]*k
602 ofstream
log((str_output_prefix
+ "_stat" + ".dat").c_str());
605 cerr
<< "can't open file " << (str_output_prefix
+ "_stat" + ".dat") << __FILE__
<< ", " << __LINE__
<< endl
;
609 log
<< "#chrid\tdepth\tcovered\tcvgratio\tchrlen_no_N\tNzone\tchrlen" << endl
;
610 for(map
<string
, vector
<double> >::iterator it
=map_stat_coverage
.begin(); it
!=map_stat_coverage
.end(); ++it
)
612 log
<< it
->first
<< "\t" << it
->second
[0] << "\t" << uint64_t(it
->second
[1]) << "\t" << it
->second
[2] << "\t" << uint64_t(it
->second
[3]) << "\t" << uint64_t(it
->second
[4]) << "\t" << uint64_t(it
->second
[5]) << endl
;
616 cout
<< "deal stat time: ";
619 void stat_soap_coverage::Run()
621 cout
<< "dealReference begin!" << endl
;
623 cout
<< "dealReference end!" << endl
;
624 cout
<< "dealDealSoapCoverage begin!" << endl
;
626 cout
<< "dealDealSoapCoverage end!" << endl
;
632 void stat_soap_coverage::statDepth()
634 cout
<< "stat depth time: ";
635 boost::progress_timer timer
;
636 ofstream
out((str_output_prefix
+ "_" + "stat.depth").c_str());
638 out
<< "#Depth\t_All_";
639 for(unsigned int j
=0; j
<vec_chr_keyname
.size(); j
++)
641 out
<< "\t" << vec_chr_keyname
[j
];
646 for(map
<double, map
<string
, uint64_t> >::iterator it2
= map_stat_depth
.begin(); it2
!= map_stat_depth
.end(); it2
++)
648 temp
.push_back(it2
->first
);
650 sort(temp
.begin(), temp
.end());
651 for(unsigned int i
=0; i
<temp
.size(); ++i
)
654 for(unsigned int j
=0; j
<vec_chr_keyname
.size(); j
++)
656 sum
+= map_stat_depth
[temp
[i
]][vec_chr_keyname
[j
]];
659 out
<< temp
[i
] << "\t" << sum
<< "\t";
660 for(unsigned int j
=0; j
<vec_chr_keyname
.size(); j
++)
662 out
<< "\t" << uint64_t(map_stat_depth
[temp
[i
]][vec_chr_keyname
[j
]]);