modified: myjupyterlab.sh
[GalaxyCodeBases.git] / released / pIRS.old / stat_illumina_reads / gcvgstater / stat_soap_coverage.cpp
blob0dffafb6f86c9a48d06373bdc46a64d7d8231901
1 #include <iostream>
2 #include <map>
3 #include <string>
4 #include <cstdlib>
5 #include <algorithm>
6 #include <vector>
7 #include <boost/progress.hpp>
8 #include <boost/progress.hpp>
9 #include "gzstream.h"
10 #include "self_util.h"
11 #include "stat_soap_coverage.h"
12 using namespace std;
14 stat_soap_coverage::stat_soap_coverage(string str_ref_file_name,
15 string str_output_prefix, vector<string> vec_soap_file_name,
16 vector<string> vec_width, bool b_gcdump, bool b_depwindump)
18 this->str_ref_file_name = str_ref_file_name;
19 this->str_output_prefix = str_output_prefix;
20 this->vec_soap_file_name = vec_soap_file_name;
21 this->vec_width = vec_width;
22 this->b_gcdump = b_gcdump;
23 this->b_depwindump = b_depwindump;
25 Run();
28 stat_soap_coverage::~stat_soap_coverage()
33 void stat_soap_coverage::DealReference()
35 boost::progress_timer timer;
36 igzstream in(str_ref_file_name.c_str());
37 string line;
38 string keyname = "";
39 string sequence = "";
40 //uint64_t countLine = 0;
43 while(getline(in, line))
45 TrimLeft(line);
46 TrimRight(line);
47 if(line[0] == '>')
49 if(sequence.length() != 0)
51 map_reference_base[keyname] = sequence;
52 cerr << "map_reference_base:" << keyname << ":" << sequence.length() << endl;
53 vec_chr_keyname.push_back(keyname);
56 int index;
57 if(((index = line.find(" ")) != string::npos) || ((index = line.find("\t")) != string::npos) || ((index = line.find("\n")) != string::npos))
59 keyname = line.substr(1, index-1);
61 else
63 keyname = line.substr(1);
66 sequence.clear();
67 continue;
70 sequence += line;
73 if(sequence.length() != 0)
75 map_reference_base[keyname] = sequence;
76 vec_chr_keyname.push_back(keyname);
79 in.close();
80 cout << "deal reference time: " ;
83 void stat_soap_coverage::DealSoapCoverage()
86 boost::progress_timer timer;
87 boost::progress_display display(vec_soap_file_name.size());
89 for(vector<string>::iterator it = vec_soap_file_name.begin(); it != vec_soap_file_name.end(); ++it)
91 igzstream in((*it).c_str());
92 string line;
93 string keyname;
94 string strTemp = "";
95 vector<unsigned int> chr_soap_coverage;
96 //uint64_t countLine = 0;
97 int in_temp;
101 while(getline(in, line))
103 TrimLeft(line);
104 TrimRight(line);
106 if(line[0] == '>')
108 if(chr_soap_coverage.size() == 0)
110 int index;
111 if(((index = line.find(" ")) != string::npos) || ((index = line. find("\t")) != string::npos) || ((index = line.find("\n")) != string::npos))
113 keyname = line.substr(1, index-1);
115 else
117 keyname = line.substr(1);
119 for(unsigned int i=0; i<map_reference_base[keyname].size(); ++i)
122 in >> in_temp;
123 if(in_temp == 65535)
125 in_temp = 0;
127 chr_soap_coverage.push_back((unsigned int)in_temp);
129 if(map_soap_coverage.count(keyname) == 0)
131 map_soap_coverage[keyname] = chr_soap_coverage;
133 else
135 for(unsigned int i=0; i<map_soap_coverage[keyname].size(); ++i)
137 if ( uint64_t(map_soap_coverage[keyname][i]) + chr_soap_coverage[i] > uint64_t(UINT32_MAX) ) {
138 map_soap_coverage[keyname][i] = UINT32_MAX;
139 } else {
140 map_soap_coverage[keyname][i] += chr_soap_coverage[i];
143 uint64_t temp = map_soap_coverage[keyname][i] + chr_soap_coverage[i];
144 if(temp > 65534)
146 map_soap_coverage[keyname][i] = 65534;
148 else
150 map_soap_coverage[keyname][i] += chr_soap_coverage[i];
159 keyname = "";
160 chr_soap_coverage.clear();
165 in.close();
166 ++display;
168 cout << "deal soapcoverage time: ";
171 void stat_soap_coverage::StatGC()
173 boost::progress_timer timer;
174 boost::progress_display display(vec_width.size());
175 bool b_depth = true;
176 for(unsigned int i=0; i<vec_width.size(); i++)
178 unsigned int width = toInt(vec_width[i]);
179 map<double, vector<double> > map_soap_gc_depth;
180 vector<double> gc_keyname;
181 map<double, uint64_t> map_temp_wincount;
183 ogzstream gzgcdump, gzdepwindump;
184 winCountN[width] = 0;
185 if(b_gcdump)
187 gzgcdump.open((str_output_prefix+"_"+toStr(width)+".refgc.gz").c_str());
190 if(b_depwindump)
192 gzdepwindump.open((str_output_prefix+"_"+toStr(width)+".windep.gz").c_str());
196 for(map<string, string>::iterator it = map_reference_base.begin(); it != map_reference_base.end(); ++it)
199 string keyname = it->first;
200 if(b_gcdump)
202 gzgcdump << ">" << keyname <<endl;
205 if(b_depwindump)
207 gzdepwindump << ">" << keyname << endl;
209 uint64_t countPos = 0;
210 int countGC = 0;
211 int countN = 0;
212 uint64_t sumBase = 0;
214 for(unsigned int j=0; j<it->second.length(); ++j)
216 countPos++;
218 if(b_depth)
220 if(map_stat_depth.count(map_soap_coverage[keyname][j]) == 0)
222 map<string, uint64_t> temp_depth;
223 for(unsigned int chr_key = 0; chr_key < vec_chr_keyname.size(); ++chr_key)
225 temp_depth[vec_chr_keyname[chr_key]] = 0;
227 temp_depth[keyname] = 1;
228 map_stat_depth[map_soap_coverage[keyname][j]] = temp_depth;
230 else
232 map_stat_depth[map_soap_coverage[keyname][j]][keyname] += 1;
238 if(countPos < width)
240 if((it->second[j] == 'N') || (it->second[j] == 'n'))
242 countN++;
244 else
246 sumBase += map_soap_coverage[keyname][j];
249 if((it->second[j] == 'G') || (it->second[j] == 'C') || (it->second[j] == 'g') || (it->second[j] == 'c'))
251 countGC++;
254 else
256 if(map_sumwincount.count(width) == 0)
258 map_sumwincount[width] = 1;
260 else
262 map_sumwincount[width] += 1;
265 if((width-countN) != 0)
267 double rate = (double)countGC/(width-countN) * 100;
268 double key = int(rate) + 0.5;
269 if(map_temp_wincount.count(key) == 0)
271 map_temp_wincount[key] = 1;
273 else
275 map_temp_wincount[key] += 1;
278 else
280 winCountN[width] += 1;
283 if(((double)countN/width >= 0.75) || ((width - countN) <= 30))
285 countGC = 0;
286 sumBase = 0;
287 countN = 0;
288 if(b_gcdump)
290 gzgcdump << -1 << endl;
293 if(b_depwindump)
295 gzdepwindump << -1 << endl;
299 else
301 double gc_rate = (double)countGC/(width-countN) * 100;
302 if(b_gcdump)
304 gzgcdump << gc_rate << endl;
307 double gc_key = int(gc_rate) + 0.5;
309 if(map_sumdepthcount.count(width) == 0)
311 map_sumdepthcount[width] = 1;
313 else
315 map_sumdepthcount[width] += 1;
318 if(map_soap_gc_depth.count(gc_key) == 0)
320 vector<double> temp;
321 temp.push_back((double)sumBase/(width-countN));
322 map_soap_gc_depth[gc_key] = temp;
323 gc_keyname.push_back(gc_key);
325 else
327 map_soap_gc_depth[gc_key].push_back((double)sumBase/(width-countN));
330 if(b_depwindump)
332 gzdepwindump << ((double)sumBase/(width-countN)) << endl;
335 countGC = 0;
336 sumBase = 0;
337 countN = 0;
340 if((it->second[j] == 'N') || (it->second[j] == 'n'))
342 countN++;
344 else
346 sumBase += map_soap_coverage[keyname][j];
349 if((it->second[j] == 'G') || (it->second[j] == 'C') || (it->second[j] == 'g') || (it->second[j] == 'c'))
351 countGC++;
353 countPos = 1;
358 if(b_gcdump)
360 gzgcdump.close();
363 if(b_depwindump)
365 gzdepwindump.close();
368 map_wincount[width] = map_temp_wincount;
369 b_depth = false;
370 if(map_width_soap_gc_depth.count(width) == 0)
372 map_width_soap_gc_depth[width] = map_soap_gc_depth;
373 map_gc_keyname[width] = gc_keyname;
375 else
377 cerr << "error !" << __FILE__ << ", " << __LINE__ <<endl;
378 exit(EXIT_FAILURE);
381 ++display;
383 cout << "stat GC time: ";
386 void stat_soap_coverage::StatCoverage()
388 boost::progress_timer timer;
389 boost::progress_display display(map_reference_base.size());
390 uint64_t all_countN = 0;
391 uint64_t all_coverageNum = 0;
392 uint64_t all_sumBase = 0;
393 uint64_t all_sum = 0;
394 for(map<string, string>::iterator it = map_reference_base.begin(); it != map_reference_base.end(); ++it)
396 string keyname = it->first;
397 uint64_t countN = 0;
398 uint64_t coverageNum = 0;
399 uint64_t sumBase = 0;
401 for(unsigned int i=0; i<it->second.length(); ++i)
403 all_sum++;
404 if((it->second[i] == 'N') || (it->second[i] == 'n'))
406 all_countN++;
407 countN++;
408 continue;
411 if(map_soap_coverage[keyname][i] != 0)
413 all_coverageNum++;
414 coverageNum++;
415 sumBase += map_soap_coverage[keyname][i];
416 all_sumBase += map_soap_coverage[keyname][i];
420 if(map_stat_coverage.count(keyname) == 0)
422 vector<double> temp;
423 temp.push_back((double)sumBase/(it->second.length()-countN));
424 temp.push_back((double)coverageNum);
425 temp.push_back((double)coverageNum/(it->second.length()-countN));
426 temp.push_back((double)(it->second.length()-countN));
427 temp.push_back((double)countN);
428 temp.push_back((double)it->second.length());
429 map_stat_coverage[keyname] = temp;
431 else
433 cerr << "error !" << __FILE__ << ", " << __LINE__<< endl;
434 exit(EXIT_FAILURE);
436 ++display;
439 if(map_stat_coverage.count("_All_") == 0)
441 vector<double> temp;
442 temp.push_back((double)all_sumBase/(all_sum - all_countN));
443 temp.push_back((double)all_coverageNum);
444 temp.push_back((double)all_coverageNum/(all_sum - all_countN));
445 temp.push_back((double)(all_sum - all_countN));
446 temp.push_back((double)all_countN);
447 temp.push_back((double)all_sum);
448 map_stat_coverage["_All_"] = temp;
450 cout << "stat coverage time: ";
453 void stat_soap_coverage::DealStat()
455 cout << "statGC begin!" << endl;
456 StatGC();
457 cout << "statGC end!" << endl;
458 cout << "statCoverage begin!" << endl;
459 StatCoverage();
460 cout << "statCoverage end!" << endl;
462 boost::progress_timer timer;
463 boost::progress_display display(map_width_soap_gc_depth.size());
464 for(map<int, map<double, vector<double> > >::iterator it = map_width_soap_gc_depth.begin(); it != map_width_soap_gc_depth.end(); ++it)
466 int width = it->first;
467 ofstream out((str_output_prefix + "_" + toStr(width) + ".dat").c_str());
469 if(!out)
471 cerr << "can't open the file " << (str_output_prefix + "_" + toStr(width) + ".dat") << endl;
472 exit(EXIT_FAILURE);
474 vector<double> gc_keyname;
475 gc_keyname = map_gc_keyname[width];
476 sort(gc_keyname.begin(), gc_keyname.end());
477 map<double, vector<double> > temp_gc_output;
479 for(unsigned int i=0; i<gc_keyname.size(); ++i)
481 vector<double> temp = map_width_soap_gc_depth[width][gc_keyname[i]];
482 //double gc_rate = gc_keyname[i];
483 uint64_t ref_count = map_width_soap_gc_depth[width][gc_keyname[i]].size();
484 double sum_coverage = 0;
486 for(unsigned int j=0; j<temp.size(); ++j)
488 sum_coverage += temp[j];
491 double avg_value = sum_coverage/temp.size();
492 double min_value = *min_element(temp.begin(), temp.end());
493 double max_value = *max_element(temp.begin(), temp.end());
494 sort(temp.begin(), temp.end());
495 double Q1, Q2, Q3;
496 if(temp.size() % 2 == 0)
498 if(temp.size() < 4)
500 Q1 = 0;
501 Q2 = 0;
502 Q3 = 0;
504 else
506 Q1 = temp[int((temp.size()+1)/4)-1] + (temp[int((temp.size()+1)/4)] - temp[int((temp.size()+1)/4)-1]) * (((double)(temp.size()+1)/4)-(int((temp.size()+1)/4)));
507 Q2 = temp[int((temp.size()+1)/2) - 1] + (temp[int((temp.size()+1)/2)] - temp[int((temp.size()+1)/2) - 1]) * (((double)(temp.size()+1)/2)-(int((temp.size()+1)/2)));
508 Q3 = temp[int((temp.size()+1)/4*3) - 1] + (temp[int((temp.size()+1)/4*3)] - temp[int((temp.size()+1)/4*3)]) * (((double)(temp.size()+1)/4*3)-(int((temp.size()+1)/4*3)));
512 else
514 if(temp.size() < 3)
516 Q1 = 0;
517 Q2 = 0;
518 Q3 = 0;
520 else
522 Q1 = temp[int(temp.size() + 1)/4 - 1];
523 Q2 = temp[int(temp.size() + 1)/2 - 1];
524 Q3 = temp[int(temp.size() + 1)/4 * 3 - 1];
527 double small_value, big_value;
528 double small_temp_value = Q1-1.5*(Q3-Q1);
529 double big_temp_value = Q3+1.5*(Q3-Q1);
531 for(unsigned int small_i = 0; small_i < temp.size(); small_i++)
533 if(small_temp_value < temp[small_i])
535 small_value = temp[small_i];
536 break;
538 small_value = temp[small_i];
541 for(unsigned int big_i=0; big_i < temp.size();big_i++)
543 if(big_temp_value < temp[big_i])
545 if(big_i == 0)
546 big_value = temp[big_i];
547 else
548 big_value = temp[big_i - 1];
549 break;
551 big_value = temp[big_i];
554 vector<double> temp_output;
555 temp_output.push_back(double(ref_count));
556 temp_output.push_back(double(avg_value));
557 temp_output.push_back(small_value);
558 temp_output.push_back(double(Q1));
559 temp_output.push_back(double(Q2));
560 temp_output.push_back(double(Q3));
561 temp_output.push_back(big_value);
562 temp_output.push_back(double(min_value));
563 temp_output.push_back(double(max_value));
565 temp_gc_output[gc_keyname[i]] = temp_output;
568 double sum_avg = 0;
569 double sum_ref_count = 0;
570 for(unsigned int i=0; i<gc_keyname.size(); ++i)
572 sum_avg += temp_gc_output[gc_keyname[i]][1];
573 sum_ref_count += temp_gc_output[gc_keyname[i]][0];
576 double k = sum_avg/sum_ref_count;
577 out << "#WinSize=" << width << "\tWinCount=" << map_sumwincount[width] << "\tDepthCount=" << map_sumdepthcount[width] << endl
578 << "#All-N windows count: " << winCountN[width] << endl
579 << "#GC%\tRefCnt\tDepthCnt\tMean\tSmall\tQ1\tMid\tQ3\tBig\tMin\tMax\tRefcntcal"
580 << endl;
581 for(unsigned int i=0; i<gc_keyname.size(); ++i)
583 out << gc_keyname[i] << "\t" << map_wincount[width][gc_keyname[i]]
584 << "\t" << uint64_t(temp_gc_output[gc_keyname[i]][0])
585 << "\t" << temp_gc_output[gc_keyname[i]][1]
586 << "\t" << temp_gc_output[gc_keyname[i]][2]
587 << "\t" << temp_gc_output[gc_keyname[i]][3]
588 << "\t" << temp_gc_output[gc_keyname[i]][4]
589 << "\t" << temp_gc_output[gc_keyname[i]][5]
590 << "\t" << temp_gc_output[gc_keyname[i]][6]
591 << "\t" << temp_gc_output[gc_keyname[i]][7]
592 << "\t" << temp_gc_output[gc_keyname[i]][8]
593 << "\t" << temp_gc_output[gc_keyname[i]][0]*k
595 << endl;
598 out.close();
599 ++display;
602 ofstream log((str_output_prefix + "_stat" + ".dat").c_str());
603 if(!log)
605 cerr << "can't open file " << (str_output_prefix + "_stat" + ".dat") << __FILE__ << ", " << __LINE__ << endl;
606 exit(EXIT_FAILURE);
609 log << "#chrid\tdepth\tcovered\tcvgratio\tchrlen_no_N\tNzone\tchrlen" << endl;
610 for(map<string, vector<double> >::iterator it=map_stat_coverage.begin(); it!=map_stat_coverage.end(); ++it)
612 log << it->first << "\t" << it->second[0] << "\t" << uint64_t(it->second[1]) << "\t" << it->second[2] << "\t" << uint64_t(it->second[3]) << "\t" << uint64_t(it->second[4]) << "\t" << uint64_t(it->second[5]) << endl;
615 log.close();
616 cout << "deal stat time: ";
619 void stat_soap_coverage::Run()
621 cout << "dealReference begin!" << endl;
622 DealReference();
623 cout << "dealReference end!" << endl;
624 cout << "dealDealSoapCoverage begin!" << endl;
625 DealSoapCoverage();
626 cout << "dealDealSoapCoverage end!" << endl;
627 DealStat();
628 statDepth();
632 void stat_soap_coverage::statDepth()
634 cout << "stat depth time: ";
635 boost::progress_timer timer;
636 ofstream out((str_output_prefix + "_" + "stat.depth").c_str());
638 out << "#Depth\t_All_";
639 for(unsigned int j=0; j<vec_chr_keyname.size(); j++)
641 out << "\t" << vec_chr_keyname[j];
644 out << endl;
645 vector<double> temp;
646 for(map<double, map<string, uint64_t> >::iterator it2 = map_stat_depth.begin(); it2 != map_stat_depth.end(); it2++)
648 temp.push_back(it2->first);
650 sort(temp.begin(), temp.end());
651 for(unsigned int i=0; i<temp.size(); ++i)
653 uint64_t sum = 0;
654 for(unsigned int j=0; j<vec_chr_keyname.size(); j++)
656 sum += map_stat_depth[temp[i]][vec_chr_keyname[j]];
659 out << temp[i] << "\t" << sum << "\t";
660 for(unsigned int j=0; j<vec_chr_keyname.size(); j++)
662 out << "\t" << uint64_t(map_stat_depth[temp[i]][vec_chr_keyname[j]]);
664 out << endl;
666 out.close();