13 #define DEF_ONE 10000000
17 "--------------------------------------------------------------------------------------------------------\n"\
18 " Program: rmdSM remove duplication form soap result,then sort and merge by chr \n"\
19 " Vision: v1.0.0 low memory use, and correct the mismatch numbers of soap2 \n"\
20 " Contact: lijun3@genomics.org.cn\n\n"\
21 " Usage:[option]\n\n"\
22 " -in_list <str> input list of soap result files, support read gzip file \"*.soap,*.single\"\n"\
23 " only reads pairs in *.soap files used to remove duplication\n"\
24 " list file Format: soap_result_file_name [lane_id] \n"\
25 " if only give the soap_result_file_name, do not change the soap reasult,\n"\
26 " just sort,merge and remove duplication reads \n"\
27 " -chr <str> input list of chr name \n"\
28 " -out_dir <str> the directory of reslut \n"\
29 " -dp out put the duplication reads which have been removed in *.dp file\n"\
30 " -prefix <str> prefix of output file,such as \"sample1\"\n\n"\
31 " memory prediction: 1. (reads number(75b,of all files in list) / chr number)*0.35 G/M\n"\
32 " 2. 1M reads(75b,of the best covered chromosome) needs 0.35g\n"\
33 "---------------------------------------------------------------------------------------------------------\n"\
36 typedef unsigned long long _uint64;
38 typedef vector
<string
> Vstring
;
40 typedef struct PosInfo
71 typedef struct ReadsCount
{
79 bool less_read(const reads
& m1
, const reads
& m2
) {
80 return m1
.pos
< m2
.pos
;
84 bool less_two(const posInfo
& m1
, const posInfo
& m2
) {
85 return (m1
.rOne_pos
< m2
.rOne_pos
)|| ((m1
.rOne_pos
== m2
.rOne_pos
) && (m1
.rTwo_pos
< m2
.rTwo_pos
));
98 string
uint_642str(_uint64 i
)
116 typedef vector
<list
> Vlist
;
118 typedef vector
<posInfo
> Vposinfo
;
120 typedef map
<string
, Vposinfo
> MVposinfo
;
122 typedef map
<string
,string
> MCMD
;
124 typedef vector
<string
> Vchr
;
125 typedef map
<string
,_uint64
> MCHRSIZE
;
128 typedef vector
<reads
> VChrReads
;
142 /////////////////////////////////////////////////
144 bool readList(Vlist
&vs
,string listFile
)
147 ifstream
ifs(listFile
.c_str(),ifstream::in
);
151 cerr
<< "list of soap files is wrong,please check : " << listFile
<< endl
;
154 cerr
<< "soap result files: " << endl
;
160 if(line
.length() > 0)
163 istringstream
isone (line
,istringstream::in
);
175 cerr
<< ls
.file
<< " " << ls
.nu
<< endl
;
189 bool readChr(Vstring
&vs
,string listFile
)
192 ifstream
ifs(listFile
.c_str(),ifstream::in
);
196 cerr
<< "list of chr is wrong,please check : " << listFile
<< endl
;
206 if(line
.length() > 0)
210 cerr
<< "chr : " << line
<< endl
;
223 _uint64
readpeNew(readsCount
&sum
,string file
, MVposinfo
& mvpi
,string chr
,VChrReads
&vChrReads
,_uint64
&Index
,bool filter
,string nu
,bool isrmd
)
226 ifstream
ifs(file
.c_str(),ifstream::in
);
227 cerr
<< "read: "<<file
<< " IS rm duplication : " << isrmd
<< endl
;
230 cerr
<< "open soap file: " << file
<< " error!" << endl
;
235 _uint64 totalreads
=0;
236 _uint64 totalbases
=0;
238 beforReads
.id
="first";
245 getline(ifs
,readsOne
);
247 string id
,ser
,quality
,ab
,fr
,chrname
,end
;
252 istringstream
isone (readsOne
,istringstream::in
);
254 isone
>>id
>> ser
>> quality
>> nhits
>> ab
>> length
>> fr
>> chrname
>> pos
>> hit_type
>> end
;
262 size_t found
= readsOne
.find_last_of("\t");
264 if(found
== string::npos
)
267 // error line ,not split by table !
270 string last_seq
= readsOne
.substr(found
+1);
271 size_t last_pos
= last_seq
.find_first_of("ATGC");
274 while (last_pos
!=string::npos
) // count mismatch number
279 last_pos
=last_seq
.find_first_of("ATGC",last_pos
+1);
291 string OneId
= id
.substr(0,id
.find_last_of("/"));
292 string TwoId
= beforReads
.id
.substr(0,beforReads
.id
.find_last_of("/"));
295 if(s_end
.find('S') != string::npos
) { // to define is trim or not
299 size_t m
= s_end
.find_first_of('M');
301 if(fr
== "+") { // get reads position in 5' before trim
303 size_t s
= s_end
.find_first_of('S');
305 if(m
!= string::npos
&& s
!= string::npos
&&s
< m
) { // trim 5'
307 string dis
= s_end
.substr(0,s
);
308 int d
= atoi(dis
.c_str());
315 else { // get reads position in 5' before trim
317 size_t s
= s_end
.find_last_of('S');
319 if(m
!= string::npos
&& s
!= string::npos
&& s
>m
) // trim 5'
322 string dis
= s_end
.substr(m
+1,s
-m
-1);
324 int d
= atoi(dis
.c_str());
325 s_pos
= pos
+ length
+ d
-1;
330 s_pos
= pos
+ length
- 1;
345 line
.new_id
= new_id
;
346 line
.new_id
.reserve(new_id
.size());
348 setion
+= ser
; setion
+= "\t";
349 setion
+= quality
; setion
+= "\t";
350 setion
+= int2str(nhits
); setion
+= "\t";
351 setion
+= ab
; setion
+= "\t";
352 setion
+= int2str(length
); setion
+= "\t";
353 setion
+= fr
; setion
+= "\t";
354 setion
+= chrname
; setion
+= "\t";
356 setion
+= uint_642str(pos
);
358 setion
+= int2str(mismatch
);
364 line
.info
.reserve(setion
.size());
371 line
.new_id
.reserve(id
.size());
373 string setion2
= readsOne
.substr(readsOne
.find_first_of("\t")+1);
376 line
.info
.reserve(setion
.size());
381 vChrReads
.push_back(line
);
383 if((isrmd
&& beforReads
.chrname
== chrname
&& OneId
== TwoId
))
386 pinfo
.chrname
=chrname
;
387 pinfo
.chrname
.reserve(chrname
.size());
389 if(s_pos
< beforReads
.pos
)
391 pinfo
.rOne_pos
= s_pos
;
392 pinfo
.rTwo_pos
= beforReads
.pos
;
394 pinfo
.total_len
= length
+ beforReads
.length
;
400 pinfo
.rOne_pos
= beforReads
.pos
;
401 pinfo
.rTwo_pos
= s_pos
;
403 pinfo
.total_len
= length
+ beforReads
.length
;
407 pinfo
.readsIndex
= Index
;
409 MVposinfo::iterator it
= mvpi
.find(pinfo
.chrname
);
411 if(mvpi
.empty() || it
== mvpi
.end())
414 vpi
.push_back(pinfo
);
415 mvpi
.insert(pair
<string
,Vposinfo
>(pinfo
.chrname
,vpi
));
416 vChr
.push_back(pinfo
.chrname
);
422 it
->second
.push_back(pinfo
);
430 totalbases
+= length
;
432 beforReads
.pos
= s_pos
;
433 beforReads
.length
= length
;
434 beforReads
.chrname
= chrname
;
441 cerr
<< "index: " << Index
<< " " << totalreads
<< endl
;
442 sum
.reads
= totalreads
;
444 sum
.bases
= totalbases
;
449 ////////////////////////////////////////////////////////////////////////////////////////////////////
452 bool outPutMerge(string file
,VChrReads
&vchr
)
455 string outfile
= file
;
456 string outdbfile
= file
;
459 _uint64 num_reads
= 0;
460 _uint64 num_bases
= 0;
461 _uint64 nu_duplication
=0;
463 cerr
<< "Output: " << outfile
<< endl
;
465 ofstream
out_fs(outfile
.c_str());
470 out_dp
.open(outdbfile
.c_str());
472 cerr
<< " open outdpput file error ! " << outfile
<< endl
;
480 cerr
<< " open output file error ! " << outfile
<< endl
;
487 sort(vchr
.begin(),vchr
.end(),less_read
);
488 cerr
<< "Size " << vchr
.size() << endl
;
490 for(int i
=0;i
< vchr
.size();i
++)
492 if(! vchr
.at(i
).stat
)
494 out_fs
<<vchr
.at(i
).new_id
<< "\t" << vchr
.at(i
).info
<< endl
;
497 num_bases
+= vchr
.at(i
).len
;
502 out_dp
<< vchr
.at(i
).new_id
<< "\t" << vchr
.at(i
).info
<< endl
;
508 cout
<< num_reads
<< "\t" << num_bases
<< endl
;
518 ////////////////////////////////////////////////////////////////////////////////////////////////////
519 //// delete duplication
521 _uint64
reset(MVposinfo
& vpi
,VChrReads
&vChrReads
)
525 _uint64 totalline
= 0;
527 for(MVposinfo::iterator it
=vpi
.begin();it
!=vpi
.end();++it
)
530 cerr
<< "size " << it
->first
<< "\t"<< it
->second
.size() << "\t" << it
->second
.capacity() << "\t" << sizeof(it
->second
.at(1)) << endl
;
531 cerr
<< "vector" << "\t" << vChrReads
.size() << "\t" << vChrReads
.capacity() << "\t" << sizeof(vChrReads
.at(1)) << endl
;
534 sort(it
->second
.begin(),it
->second
.end(),less_two
);
540 totalline
+= it
->second
.size();
544 for(unsigned long long i
= 1; i
<it
->second
.size(); ++i
)
548 if(it
->second
.at(i
).rOne_pos
== it
->second
.at(i
-1).rOne_pos
&& it
->second
.at(i
).rTwo_pos
== it
->second
.at(i
-1).rTwo_pos
)
552 if(it
->second
.at(i
).total_len
<= it
->second
.at(index
).total_len
)
554 it
->second
.at(i
).used
=0 ;
555 vChrReads
.at(it
->second
.at(i
).readsIndex
).stat
=true;
556 vChrReads
.at(it
->second
.at(i
).readsIndex
-1).stat
=true;
560 it
->second
.at(index
).used
= 0;
561 vChrReads
.at(it
->second
.at(i
).readsIndex
).stat
=true;
562 vChrReads
.at(it
->second
.at(i
).readsIndex
-1).stat
=true;
585 void doMerge(Vlist
& soapList
,string chr
,string outdir
,bool control
)
592 string outFile
= outdir
;
606 _uint64 lastreads
= 0;
608 cerr
<< "do chromosome " << chr
<< endl
;
610 for(int i
=0; i
< soapList
.size();i
++)
619 string nu
= int2str(soapList
.at(i
).nu
);
620 if(soapList
.at(i
).file
.find("single") != string::npos
)
626 if(soapList
.at(i
).nu
== -1 )
631 cerr
<< "IS filter: " << "no" << endl
;
636 cerr
<< "IS filter: " << "yes " << endl
;
639 readpeNew(sum
,soapList
.at(i
).file
,vpi
,chr
,vReads
,Index
,filt
,nu
,isrmd
);
642 cerr
<< " may donot have chromosome " << chr
<< " in file: " << soapList
.at(i
).file
<< endl
;
645 t_reads
+= sum
.reads
;
646 t_bases
+= sum
.bases
;
652 cout
<< chr
<< "\t" << t_reads
<< "\t" << t_bases
<< "\t";
655 outPutMerge(outFile
,vReads
);
664 //////////////////////////////////////////////
667 int main(int argc
,char** argv
)
674 string soapListFile
,chrListFile
,cn
;
686 if( !strcmp(argv
[0], "-in_list") && argc
>=2) {
687 soapListFile
= argv
[1];
690 cerr
<< "soap_results: "<< soapListFile
<< endl
;
693 else if( !strcmp(argv
[0],"-chr") && argc
>=2) {
694 chrListFile
= argv
[1];
696 cerr
<< "chr: " << chrListFile
<< endl
;
699 else if( !strcmp(argv
[0],"-out_dir") && argc
>=2) {
702 cerr
<< "Out direcotry: " << outDir
<< endl
;
706 else if(!strcmp(argv
[0],"-prefix") && argc
>=2) {
710 cerr
<< "prefix: " << suff
<< endl
;
712 else if(!strcmp(argv
[0],"-dp") && argc
>=1) {
727 if(!readList(soapList
,soapListFile
))
729 cerr
<< "read list error !" << endl
;
732 if(!readChr(chrList
,chrListFile
))
734 cerr
<< "read chr error !" << endl
;
741 for(Vstring::iterator it
= chrList
.begin();it
!=chrList
.end();it
++)
744 doMerge(soapList
,*it
,outDir
,control
);