7 #include <boost/algorithm/string.hpp>
12 using namespace boost
;
15 double hetersnp_rate
=0.001;
16 double heterindel_rate
=0.0001;
17 double big_SV_rate
=0.000001; // structural variation rate
18 double snp_transition_by_transvertion_rate
= 2; //transition_number £ºtransvertion_rate_number = 2
22 string output_prefix
= "ref_sequence";
23 double snp_bias
[3]; //label transition and transvertion ratio
25 //get raw genome sequence.
26 void Get_raw_genome(igzstream
&inf
, ofstream
&snp
, ofstream
&indel
, ofstream
&invertion
);
28 //add snp and indel in raw seqence, and output result sequence.
29 void simulate_snp_indel_seq(string id_line
,string id
,string
&sequ
, ofstream
&snp
,ofstream
&indel
, ofstream
&invertion
);
31 const char *VERSION
="1.0";
32 const char *AUTHOR
="BGI-Shenzhen";
33 const char *CONTACT
="yuanjianying@genomics.org.cn";
36 cout
<<"\nDescription:"<<endl
;
38 cout
<<"\tIt is a program for simulating heterozygosis in diploid, such as SNP, small InDel and structural variation (large insertion, deletion and inversion). ";
39 cout
<<"When simulate SNP, we can set the total SNP rate with option -s, and set the value of transition divided by transvertion with option -a. ";
40 cout
<<"When simulate small InDel: insertion and deletion share 1/2 of the total rate respectively, 1~6bp bases as the InDel number ,and the ";
41 cout
<<"rate distribution of each type as below: 1bp-64.82%, 2bp-17.17%, 3bp-7.20%, 4bp-7.29%, 5bp-2.18%, 6bp-1.34%, which is the empirical distribution from panda re-sequencing data, we can ";
42 cout
<<"set the total small InDel rate with option -d. When simulate SV(structural variation): large insertion, deletion and invertion, each of them share 1/3 of total rate, here we use this rate ";
43 cout
<<"distribution: 100bp-70%, 200bp-20%, 500bp-7%, 1000bp-2%, 2000bp-1%, we can set the total SV rate with option -v. ";
44 cout
<<"Input sequence must be set ,because there is no default value."<<endl
;
45 cout
<<endl
<<"Program: simulate_diploid_genome"<<endl
;
46 cout
<<"\tAuthor: "<<AUTHOR
<<endl
;
47 cout
<<"\tVersion: "<<VERSION
<<endl
;
48 cout
<<"\tContact: "<<CONTACT
<<endl
;
49 cout
<<endl
<<"Usage:\tsimulate_diploid_genome [options]"<<endl
;
50 cout
<<"\t-i <string> input,input reference genome sequence *.fa/*.fa.gz"<<endl
;
51 cout
<<"\t-s <double> set the heterozygous SNP rate of the diploid genome,default:"<<hetersnp_rate
<<endl
;
52 cout
<<"\t-a <double> set the value of transition divided by transvertion for heterSNP,default:"<<snp_transition_by_transvertion_rate
<<endl
;
53 cout
<<"\t-d <double> set the small InDel rate of the diploid genome,default:"<<heterindel_rate
<<endl
;
54 cout
<<"\t-v <double> set the structural variation rate(large insertion,deletion,invertion) of the diploid genome,default:"<<big_SV_rate
<<endl
;
55 cout
<<"\t-c <int> set ouput file type, 0:text, 1:*.gz, default:"<< output_type
<<endl
;
56 cout
<<"\t-o <string> set output file prefix default:"<<output_prefix
<<endl
;
57 cout
<<"\t-h output help infomation"<<endl
;
58 cout
<<endl
<<"Example:"<<endl
;
59 cout
<<"\t1. ./simulate_diploid_genome -i ref_sequence.fa -s 0.001 -d 0.0001 -v 0.000001 -o ref_sequence >simulate_snp_indel.out 2>simulate_snp_indel.err"<<endl
;
63 void Getopt(int argc
,char *argv
[]){
65 while ((c
=getopt(argc
,argv
,"i:s:d:o:a:v:c:h"))!=-1)
68 case 'i': input
=optarg
;break;
69 case 's': hetersnp_rate
=strtod(optarg
,NULL
);break;
70 case 'd': heterindel_rate
=strtod(optarg
,NULL
);break;
71 case 'a': snp_transition_by_transvertion_rate
=strtod(optarg
,NULL
);break;
72 case 'v': big_SV_rate
=strtod(optarg
,NULL
);break;
73 case 'c': output_type
=atoi(optarg
);break;
74 case 'o': output_prefix
=optarg
;break;
75 case 'h': Usage();break;
81 int main(int argc
, char *argv
[])
83 time_t time_start
, time_end
;
84 time_start
= time(NULL
);
85 srand((unsigned)time(NULL
));
93 if(hetersnp_rate
< 0 || hetersnp_rate
> 1){cerr
<<"Error: hetersnp_rate should be set in 0~1 "<<endl
;exit(-1);}
94 if(snp_transition_by_transvertion_rate
< 0){cerr
<<"Error: the rate of transition/transvertion should be set greater than 0 "<<endl
;exit(-1);}
95 if(heterindel_rate
< 0 || heterindel_rate
> 1){cerr
<<"Error: heterindel_rate should be set in 0~1 "<<endl
;exit(-1);}
96 if(big_SV_rate
< 0 || big_SV_rate
> 1){cerr
<<"Error: large SV rate should be set in 0~1 "<<endl
;exit(-1);}
97 if(hetersnp_rate
== 0 && heterindel_rate
== 0 && big_SV_rate
== 0){cerr
<<"No variation for reference, please input at least one of parameter greater than 0!"<<endl
; exit(-1);}
98 if(output_type
!= 0 && output_type
!= 1){cerr
<<"Error: -c (output_type) should be set 0 or 1!"<<endl
; exit(-1);}
102 if(hetersnp_rate
> 0){
103 double snp_sum
= snp_transition_by_transvertion_rate
+ 1.0;
104 double transition_rate
= double(snp_transition_by_transvertion_rate
)/snp_sum
;
105 double transvertion_rate
= 1.0/snp_sum
;
106 snp_bias
[0] = transition_rate
;
107 snp_bias
[1] = transition_rate
+transvertion_rate
/2.0;
111 //set output file name
113 infile
.open(input
.c_str());
116 cerr
<<"Error:unable to open input file:"<<input
<<endl
;
119 string snp_output
= output_prefix
+"_snp.lst";
120 string indel_output
= output_prefix
+"_indel.lst";
121 string invertion_output
= output_prefix
+"_invertion.lst";
122 if(hetersnp_rate
!= 0){output_prefix
= output_prefix
+".snp";}
123 if(heterindel_rate
!= 0){output_prefix
= output_prefix
+".indel";}
124 if(big_SV_rate
!=0){output_prefix
= output_prefix
+".invertion";}
126 string output_ref_file
;
128 output_ref_file
= output_prefix
+ ".fa";
130 output_ref_file
= output_prefix
+ ".fa.gz";
134 outfile
.open(output_ref_file
.c_str());
137 cerr
<<"Error:unable to open output file:"<<output_ref_file
<<endl
;
141 gz_outfile
.open(output_ref_file
.c_str());
144 cerr
<<"Error:unable to open output file:"<<output_ref_file
<<endl
;
149 ofstream SNP_File
,Indel_File
,Ivertion_File
;
152 SNP_File
.open(snp_output
.c_str());
155 cerr
<<"Error:unable to open output file:"<<snp_output
<<endl
;
160 if(heterindel_rate
>0){
161 Indel_File
.open(indel_output
.c_str());
164 cerr
<<"Error:unable to open output file:"<<indel_output
<<endl
;
170 Ivertion_File
.open(invertion_output
.c_str());
173 cerr
<<"Error:unable to open output file:"<<invertion_output
<<endl
;
178 Get_raw_genome(infile
,SNP_File
,Indel_File
,Ivertion_File
);
180 time_end
= time(NULL
);
181 cerr
<<"All done! Run time: "<<time_end
-time_start
<<"s."<<endl
;
186 //get raw genome sequence.
187 void Get_raw_genome(igzstream
&inf
, ofstream
&snp_file
, ofstream
&indel_file
, ofstream
&invertion_file
)
189 string line
,id
,id_line
,seq
;
190 while (getline(inf
,line
,'\n'))
196 cerr
<<"Have finished reading scaffold "<<id
<<endl
;
197 //start to simulate one scaffold
198 simulate_snp_indel_seq(id_line
,id
,seq
,snp_file
,indel_file
,invertion_file
);
204 int pos
=line
.find(" ");
205 line
=line
.substr(0,pos
);
211 cerr
<<"Have finished reading scaffold "<<id
<<endl
;
212 //start to simulate one scaffold
213 simulate_snp_indel_seq(id_line
,id
,seq
,snp_file
,indel_file
, invertion_file
);
216 //add snp and indel in raw seqence, and output result sequence.
217 void simulate_snp_indel_seq(string id_line
,string id
,string
&sequence
, ofstream
&snp_file
,ofstream
&indel_file
, ofstream
&invertion_file
)
219 //convert lower case to upper case
222 if (hetersnp_rate
>0 || heterindel_rate
>0 || big_SV_rate
>0) //heterozygous SNP and heterozygous indel exists in diploid
226 cerr
<<"Begin to simulate snp"<<endl
;
227 sequence
=Get_snp(sequence
,snp_file
,id
,hetersnp_rate
,snp_bias
);
228 cerr
<<"Have finished simulating snp"<<endl
;
232 cerr
<<"Begin to simulate invertion"<<endl
;
233 sequence
=Get_invertion(sequence
,invertion_file
,id
,big_SV_rate
);
235 if (heterindel_rate
>0 || big_SV_rate
>0)
237 cerr
<<"Begin to simulate indel"<<endl
;
238 sequence
=Get_indel(sequence
,indel_file
,id
,heterindel_rate
,big_SV_rate
);
239 cerr
<<"Have finished simulating indel"<<endl
;
243 outfile
<<id_line
<<endl
;
244 uint64_t seq_len
= sequence
.size();
247 uint64_t counter
= 0;
250 outfile
<<sequence
.substr(counter
*50,50)<<endl
;
254 outfile
<<sequence
.substr(counter
*50)<<endl
;
256 gz_outfile
<<id_line
<<endl
;
257 uint64_t seq_len
= sequence
.size();
259 uint64_t counter
= 0;
262 gz_outfile
<<sequence
.substr(counter
*50,50)<<endl
;
266 gz_outfile
<<sequence
.substr(counter
*50)<<endl
;
269 cerr
<<"Have finished simulating "<<id_line
<<endl
;