new file: cell2loc.py
[GalaxyCodeBases.git] / perl / soap / rmdup / junli / rmdSM.cpp
blobe738af1df71f6514735b3cc216b57b756916eabf
1 #include <iostream>
2 #include <fstream>
3 #include <string>
4 #include <map>
5 #include <vector>
6 #include <algorithm>
7 #include <iterator>
8 #include <sstream>
9 #include <functional>
11 #include <unistd.h>
12 #include <cstring>
13 #define DEF_ONE 10000000
14 using namespace std;
16 #define USAGE \
17 "--------------------------------------------------------------------------------------------------------\n"\
18 " Program: rmdSM remove duplication form soap result,then sort and merge by chr \n"\
19 " Vision: v1.0.0 low memory use, and correct the mismatch numbers of soap2 \n"\
20 " Contact: lijun3@genomics.org.cn\n\n"\
21 " Usage:[option]\n\n"\
22 " -in_list <str> input list of soap result files, support read gzip file \"*.soap,*.single\"\n"\
23 " only reads pairs in *.soap files used to remove duplication\n"\
24 " list file Format: soap_result_file_name [lane_id] \n"\
25 " if only give the soap_result_file_name, do not change the soap reasult,\n"\
26 " just sort,merge and remove duplication reads \n"\
27 " -chr <str> input list of chr name \n"\
28 " -out_dir <str> the directory of reslut \n"\
29 " -dp out put the duplication reads which have been removed in *.dp file\n"\
30 " -prefix <str> prefix of output file,such as \"sample1\"\n\n"\
31 " memory prediction: 1. (reads number(75b,of all files in list) / chr number)*0.35 G/M\n"\
32 " 2. 1M reads(75b,of the best covered chromosome) needs 0.35g\n"\
33 "---------------------------------------------------------------------------------------------------------\n"\
36 typedef unsigned long long _uint64;
38 typedef vector<string> Vstring;
40 typedef struct PosInfo
42 _uint64 rOne_pos;
43 _uint64 rTwo_pos;
44 int total_len;
45 int used;
46 string chrname;
47 _uint64 readsIndex;
49 }posInfo;
51 typedef struct Reads
53 _uint64 pos;
54 int len;
55 string new_id;
56 bool stat;
57 string info;
59 }reads;
61 typedef struct pe{
62 string id;
63 string ser;
64 int length;
65 string chrname;
66 _uint64 pos;
67 int hit_type;
69 }PE;
71 typedef struct ReadsCount{
72 _uint64 reads;
73 _uint64 bases;
74 }readsCount;
79 bool less_read(const reads & m1, const reads & m2) {
80 return m1.pos < m2.pos;
84 bool less_two(const posInfo & m1, const posInfo & m2) {
85 return (m1.rOne_pos < m2.rOne_pos)|| ((m1.rOne_pos == m2.rOne_pos) && (m1.rTwo_pos < m2.rTwo_pos));
88 ////////////
90 string int2str(int i)
92 string s;
93 stringstream ss(s);
94 ss << i;
95 return ss.str();
98 string uint_642str(_uint64 i)
100 string s;
101 stringstream ss(s);
102 ss << i;
103 return ss.str();
107 /////////////////
110 typedef struct LIST
112 string file;
113 int nu;
115 }list;
116 typedef vector<list> Vlist;
118 typedef vector<posInfo> Vposinfo;
120 typedef map<string, Vposinfo> MVposinfo;
122 typedef map<string,string> MCMD;
124 typedef vector<string> Vchr;
125 typedef map<string,_uint64> MCHRSIZE;
128 typedef vector<reads> VChrReads;
130 MCMD mCmd;
131 MCHRSIZE mChrSize;
133 string outDir;
134 string LibName;
135 bool control = true;
136 Vchr vChr;
137 string suff = "";
139 bool outdp = false;
142 /////////////////////////////////////////////////
144 bool readList(Vlist &vs,string listFile)
147 ifstream ifs(listFile.c_str(),ifstream::in);
149 if(!ifs.good())
151 cerr << "list of soap files is wrong,please check : " << listFile<< endl;
152 return false;
154 cerr << "soap result files: " << endl;
155 while(!ifs.eof())
157 string line;
158 getline(ifs,line);
160 if(line.length() > 0)
163 istringstream isone (line,istringstream::in);
165 string file;
166 int nu=-1;
167 list ls;
169 isone >> file >> nu;
170 ls.file = file;
171 ls.nu = nu;
173 vs.push_back(ls);
175 cerr << ls.file << " " << ls.nu << endl;
180 ifs.close();
182 return true;
189 bool readChr(Vstring &vs,string listFile)
192 ifstream ifs(listFile.c_str(),ifstream::in);
194 if(!ifs.good())
196 cerr << "list of chr is wrong,please check : " << listFile<< endl;
197 return false;
201 while(!ifs.eof())
203 string line;
204 ifs >> line;
206 if(line.length() > 0)
208 vs.push_back(line);
210 cerr << "chr : " << line << endl;
216 ifs.close();
218 return true;
223 _uint64 readpeNew(readsCount &sum,string file , MVposinfo & mvpi,string chr,VChrReads &vChrReads,_uint64 &Index,bool filter,string nu,bool isrmd)
226 ifstream ifs(file.c_str(),ifstream::in);
227 cerr << "read: "<<file << " IS rm duplication : " << isrmd << endl;
228 if(!ifs.good())
230 cerr << "open soap file: " << file << " error!" << endl;
231 return false;
235 _uint64 totalreads=0;
236 _uint64 totalbases=0;
237 PE beforReads;
238 beforReads.id="first";
239 reads line;
241 while(!ifs.eof())
243 string readsOne;
245 getline(ifs,readsOne);
247 string id,ser,quality,ab,fr,chrname,end;
248 _uint64 pos,s_pos;
249 int hit_type,length;
250 int nhits;
252 istringstream isone (readsOne,istringstream::in);
254 isone >>id >> ser >> quality >> nhits >> ab >> length >> fr >> chrname >> pos >> hit_type >> end;
255 int trim= 0;
256 string s_end = end;
258 if(chrname == chr)
262 size_t found = readsOne.find_last_of("\t");
264 if(found == string::npos)
267 // error line ,not split by table !
270 string last_seq= readsOne.substr(found+1);
271 size_t last_pos = last_seq.find_first_of("ATGC");
272 int mismatch =0;
274 while (last_pos!=string::npos) // count mismatch number
277 mismatch++;
279 last_pos =last_seq.find_first_of("ATGC",last_pos+1);
282 string setion;
285 s_pos = pos;
287 line.pos = pos;
288 line.stat= false;
289 line.len = length;
291 string OneId = id.substr(0,id.find_last_of("/"));
292 string TwoId = beforReads.id.substr(0,beforReads.id.find_last_of("/"));
295 if(s_end.find('S') != string::npos) { // to define is trim or not
296 trim =1;
299 size_t m = s_end.find_first_of('M');
301 if(fr == "+") { // get reads position in 5' before trim
303 size_t s = s_end.find_first_of('S');
305 if(m != string::npos && s != string::npos&&s < m) { // trim 5'
307 string dis = s_end.substr(0,s);
308 int d = atoi(dis.c_str());
309 s_pos = pos -d;
315 else { // get reads position in 5' before trim
317 size_t s = s_end.find_last_of('S');
319 if(m != string::npos && s != string::npos && s>m) // trim 5'
322 string dis = s_end.substr(m+1,s-m-1);
324 int d = atoi(dis.c_str());
325 s_pos = pos + length + d -1;
328 else { // trim 3'
330 s_pos = pos + length - 1;
336 if(filter)
339 end = int2str(trim);
341 string new_id = nu;
342 new_id +="-";
343 new_id +=OneId;
345 line.new_id = new_id;
346 line.new_id.reserve(new_id.size());
348 setion += ser; setion += "\t";
349 setion += quality; setion += "\t";
350 setion += int2str(nhits); setion += "\t";
351 setion += ab; setion += "\t";
352 setion += int2str(length); setion += "\t";
353 setion += fr; setion += "\t";
354 setion += chrname; setion += "\t";
356 setion += uint_642str(pos);
357 setion += "\t";
358 setion += int2str(mismatch);
359 setion += "\t";
360 setion += end;
362 line.info = setion;
364 line.info.reserve(setion.size());
367 else
370 line.new_id = id;
371 line.new_id.reserve(id.size());
373 string setion2 = readsOne.substr(readsOne.find_first_of("\t")+1);
375 line.info = setion2;
376 line.info.reserve(setion.size());
381 vChrReads.push_back(line);
383 if((isrmd && beforReads.chrname == chrname && OneId == TwoId ))
385 posInfo pinfo;
386 pinfo.chrname=chrname;
387 pinfo.chrname.reserve(chrname.size());
389 if(s_pos < beforReads.pos)
391 pinfo.rOne_pos = s_pos;
392 pinfo.rTwo_pos = beforReads.pos;
394 pinfo.total_len = length + beforReads.length;
398 else
400 pinfo.rOne_pos = beforReads.pos;
401 pinfo.rTwo_pos = s_pos;
403 pinfo.total_len = length + beforReads.length;
406 pinfo.used = 1;
407 pinfo.readsIndex = Index;
409 MVposinfo::iterator it = mvpi.find(pinfo.chrname);
411 if(mvpi.empty() || it == mvpi.end())
413 Vposinfo vpi;
414 vpi.push_back(pinfo);
415 mvpi.insert(pair<string,Vposinfo>(pinfo.chrname,vpi));
416 vChr.push_back(pinfo.chrname);
419 else
422 it->second.push_back(pinfo);
428 Index++;
429 totalreads++;
430 totalbases += length;
431 beforReads.id = id;
432 beforReads.pos = s_pos;
433 beforReads.length = length;
434 beforReads.chrname = chrname;
440 ifs.close();
441 cerr << "index: " << Index << " " << totalreads << endl;
442 sum.reads = totalreads;
444 sum.bases = totalbases;
445 return totalreads;
449 ////////////////////////////////////////////////////////////////////////////////////////////////////
450 /// out put result
452 bool outPutMerge(string file,VChrReads &vchr)
455 string outfile = file;
456 string outdbfile = file;
457 outdbfile += ".dp";
459 _uint64 num_reads = 0;
460 _uint64 num_bases = 0;
461 _uint64 nu_duplication=0;
463 cerr << "Output: " << outfile << endl;
465 ofstream out_fs(outfile.c_str());
466 ofstream out_dp;
468 if(outdp) {
470 out_dp.open(outdbfile.c_str());
471 if(!out_dp.good()) {
472 cerr << " open outdpput file error ! " << outfile << endl;
473 out_dp.close();
474 return false;
479 if(!out_fs.good()) {
480 cerr << " open output file error ! " << outfile << endl;
482 out_fs.close();
483 return false;
487 sort(vchr.begin(),vchr.end(),less_read);
488 cerr << "Size " << vchr.size() << endl;
490 for(int i=0;i < vchr.size();i++)
492 if(! vchr.at(i).stat)
494 out_fs <<vchr.at(i).new_id << "\t" << vchr.at(i).info << endl;
496 num_reads++;
497 num_bases += vchr.at(i).len;
500 else {
502 out_dp << vchr.at(i).new_id << "\t" << vchr.at(i).info << endl;
508 cout << num_reads << "\t" << num_bases << endl;
509 out_fs.close();
511 if(outdp) {
512 out_dp.close();
514 return true;
518 ////////////////////////////////////////////////////////////////////////////////////////////////////
519 //// delete duplication
521 _uint64 reset(MVposinfo& vpi,VChrReads &vChrReads)
525 _uint64 totalline = 0;
527 for(MVposinfo::iterator it =vpi.begin();it!=vpi.end();++it)
530 cerr << "size " << it->first << "\t"<< it->second.size() << "\t" << it->second.capacity() << "\t" << sizeof(it->second.at(1)) << endl;
531 cerr << "vector" << "\t" << vChrReads.size() << "\t" << vChrReads.capacity() << "\t" << sizeof(vChrReads.at(1)) << endl;
534 sort(it->second.begin(),it->second.end(),less_two);
536 _uint64 rOpos = 0;
537 _uint64 rTpos = 0;
540 totalline += it->second.size();
542 long long index =0;
544 for(unsigned long long i = 1; i<it->second.size(); ++i)
548 if(it->second.at(i).rOne_pos == it->second.at(i-1).rOne_pos && it->second.at(i).rTwo_pos == it->second.at(i-1).rTwo_pos)
552 if(it->second.at(i).total_len <= it->second.at(index).total_len)
554 it->second.at(i).used =0 ;
555 vChrReads.at(it->second.at(i).readsIndex).stat=true;
556 vChrReads.at(it->second.at(i).readsIndex-1).stat=true;
558 else
560 it->second.at(index).used = 0;
561 vChrReads.at(it->second.at(i).readsIndex).stat=true;
562 vChrReads.at(it->second.at(i).readsIndex-1).stat=true;
563 index = i;
569 else
571 index = i;
580 return totalline;
585 void doMerge(Vlist& soapList,string chr,string outdir,bool control)
589 MVposinfo vpi;
590 VChrReads vReads;
592 string outFile = outdir;
593 if(suff.length()!=0)
596 outFile += suff;
597 outFile += ".";
600 outFile += chr;
602 _uint64 Index =0;
603 _uint64 t_reads= 0;
604 _uint64 t_bases = 0;
606 _uint64 lastreads = 0;
608 cerr << "do chromosome " << chr << endl;
610 for(int i=0; i < soapList.size();i++)
614 bool filt =true;
615 readsCount sum;
616 bool isrmd = true;
619 string nu = int2str(soapList.at(i).nu);
620 if(soapList.at(i).file.find("single") != string::npos )
622 isrmd= false;
626 if(soapList.at(i).nu == -1 )
629 nu = int2str(i);
630 filt= false;
631 cerr << "IS filter: " << "no" << endl;
633 else
636 cerr << "IS filter: " << "yes " << endl;
639 readpeNew(sum,soapList.at(i).file,vpi,chr,vReads,Index,filt,nu,isrmd);
641 if(sum.reads == 0) {
642 cerr << " may donot have chromosome " << chr << " in file: " << soapList.at(i).file << endl;
645 t_reads += sum.reads;
646 t_bases += sum.bases;
650 reset(vpi,vReads);
652 cout << chr << "\t" << t_reads << "\t" << t_bases << "\t";
655 outPutMerge(outFile,vReads);
664 //////////////////////////////////////////////
667 int main(int argc ,char** argv)
670 Vstring chrList;
671 Vlist soapList;
672 string fn;
674 string soapListFile,chrListFile,cn;
675 string cmd;
677 --argc ; ++argv ;
678 if(argc ==0) {
679 cerr << USAGE;
680 exit(0);
684 while(argc) {
686 if( !strcmp(argv[0], "-in_list") && argc >=2) {
687 soapListFile= argv[1];
688 argc -=2; argv +=2;
690 cerr << "soap_results: "<< soapListFile << endl;
693 else if( !strcmp(argv[0],"-chr") && argc >=2) {
694 chrListFile = argv[1];
695 argc -=2; argv +=2;
696 cerr << "chr: " << chrListFile << endl;
699 else if( !strcmp(argv[0],"-out_dir") && argc >=2) {
700 outDir = argv[1];
701 argc -=2; argv +=2;
702 cerr << "Out direcotry: " << outDir << endl;
706 else if(!strcmp(argv[0],"-prefix") && argc >=2) {
707 suff = argv[1];
708 argc -=2; argv +=2;
710 cerr << "prefix: " << suff << endl;
712 else if(!strcmp(argv[0],"-dp") && argc >=1) {
714 outdp = true;
715 argc -=1; argv +=1;
718 else {
720 cerr << USAGE;
721 exit(0);
727 if(!readList(soapList,soapListFile))
729 cerr << "read list error !" << endl;
730 exit(0);
732 if(!readChr(chrList,chrListFile))
734 cerr << "read chr error !" << endl;
735 exit(0);
738 outDir += "/";
739 control = false;
741 for(Vstring::iterator it = chrList.begin();it!=chrList.end();it++)
744 doMerge(soapList,*it,outDir,control);