modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / pIRS_simulator / simulate_diploid_genome.cpp
blobd1ab285e28636090a7c55d6e16ca17ae83ad5301
1 #include <iostream>
2 #include <vector>
3 #include <string>
4 #include <fstream>
5 #include <math.h>
6 #include <map>
7 #include <boost/algorithm/string.hpp>
8 #include "simulate.h"
9 #include "gzstream.h"
11 using namespace std;
12 using namespace boost;
14 string input;
15 double hetersnp_rate=0.001;
16 double heterindel_rate=0.0001;
17 double big_SV_rate=0.000001; // structural variation rate
18 double snp_transition_by_transvertion_rate = 2; //transition_number £ºtransvertion_rate_number = 2
19 int output_type = 1;
20 ofstream outfile;
21 ogzstream gz_outfile;
22 string output_prefix = "ref_sequence";
23 double snp_bias[3]; //label transition and transvertion ratio
25 //get raw genome sequence.
26 void Get_raw_genome(igzstream &inf, ofstream &snp, ofstream &indel, ofstream &invertion);
28 //add snp and indel in raw seqence, and output result sequence.
29 void simulate_snp_indel_seq(string id_line,string id,string &sequ, ofstream &snp,ofstream &indel, ofstream &invertion);
31 const char *VERSION="1.0";
32 const char *AUTHOR="BGI-Shenzhen";
33 const char *CONTACT="yuanjianying@genomics.org.cn";
35 void Usage(){
36 cout<<"\nDescription:"<<endl;
37 cout<<endl;
38 cout<<"\tIt is a program for simulating heterozygosis in diploid, such as SNP, small InDel and structural variation (large insertion, deletion and inversion). ";
39 cout<<"When simulate SNP, we can set the total SNP rate with option -s, and set the value of transition divided by transvertion with option -a. ";
40 cout<<"When simulate small InDel: insertion and deletion share 1/2 of the total rate respectively, 1~6bp bases as the InDel number ,and the ";
41 cout<<"rate distribution of each type as below: 1bp-64.82%, 2bp-17.17%, 3bp-7.20%, 4bp-7.29%, 5bp-2.18%, 6bp-1.34%, which is the empirical distribution from panda re-sequencing data, we can ";
42 cout<<"set the total small InDel rate with option -d. When simulate SV(structural variation): large insertion, deletion and invertion, each of them share 1/3 of total rate, here we use this rate ";
43 cout<<"distribution: 100bp-70%, 200bp-20%, 500bp-7%, 1000bp-2%, 2000bp-1%, we can set the total SV rate with option -v. ";
44 cout<<"Input sequence must be set ,because there is no default value."<<endl;
45 cout<<endl<<"Program: simulate_diploid_genome"<<endl;
46 cout<<"\tAuthor: "<<AUTHOR<<endl;
47 cout<<"\tVersion: "<<VERSION<<endl;
48 cout<<"\tContact: "<<CONTACT<<endl;
49 cout<<endl<<"Usage:\tsimulate_diploid_genome [options]"<<endl;
50 cout<<"\t-i <string> input,input reference genome sequence *.fa/*.fa.gz"<<endl;
51 cout<<"\t-s <double> set the heterozygous SNP rate of the diploid genome,default:"<<hetersnp_rate<<endl;
52 cout<<"\t-a <double> set the value of transition divided by transvertion for heterSNP,default:"<<snp_transition_by_transvertion_rate<<endl;
53 cout<<"\t-d <double> set the small InDel rate of the diploid genome,default:"<<heterindel_rate<<endl;
54 cout<<"\t-v <double> set the structural variation rate(large insertion,deletion,invertion) of the diploid genome,default:"<<big_SV_rate<<endl;
55 cout<<"\t-c <int> set ouput file type, 0:text, 1:*.gz, default:"<< output_type <<endl;
56 cout<<"\t-o <string> set output file prefix default:"<<output_prefix<<endl;
57 cout<<"\t-h output help infomation"<<endl;
58 cout<<endl<<"Example:"<<endl;
59 cout<<"\t1. ./simulate_diploid_genome -i ref_sequence.fa -s 0.001 -d 0.0001 -v 0.000001 -o ref_sequence >simulate_snp_indel.out 2>simulate_snp_indel.err"<<endl;
60 exit(-1);
63 void Getopt(int argc,char *argv[]){
64 int c;
65 while ((c=getopt(argc,argv,"i:s:d:o:a:v:c:h"))!=-1)
67 switch(c){
68 case 'i': input=optarg;break;
69 case 's': hetersnp_rate=strtod(optarg,NULL);break;
70 case 'd': heterindel_rate=strtod(optarg,NULL);break;
71 case 'a': snp_transition_by_transvertion_rate=strtod(optarg,NULL);break;
72 case 'v': big_SV_rate=strtod(optarg,NULL);break;
73 case 'c': output_type=atoi(optarg);break;
74 case 'o': output_prefix=optarg;break;
75 case 'h': Usage();break;
76 default: Usage();
81 int main(int argc, char *argv[])
83 time_t time_start, time_end;
84 time_start = time(NULL);
85 srand((unsigned)time(NULL));
87 if (argc==1)
89 Usage();
91 Getopt(argc,argv);
93 if(hetersnp_rate < 0 || hetersnp_rate > 1){cerr<<"Error: hetersnp_rate should be set in 0~1 "<<endl;exit(-1);}
94 if(snp_transition_by_transvertion_rate < 0){cerr<<"Error: the rate of transition/transvertion should be set greater than 0 "<<endl;exit(-1);}
95 if(heterindel_rate < 0 || heterindel_rate > 1){cerr<<"Error: heterindel_rate should be set in 0~1 "<<endl;exit(-1);}
96 if(big_SV_rate < 0 || big_SV_rate > 1){cerr<<"Error: large SV rate should be set in 0~1 "<<endl;exit(-1);}
97 if(hetersnp_rate == 0 && heterindel_rate == 0 && big_SV_rate == 0){cerr<<"No variation for reference, please input at least one of parameter greater than 0!"<<endl; exit(-1);}
98 if(output_type != 0 && output_type != 1){cerr <<"Error: -c (output_type) should be set 0 or 1!"<<endl; exit(-1);}
101 //get snp bias
102 if(hetersnp_rate > 0){
103 double snp_sum = snp_transition_by_transvertion_rate + 1.0;
104 double transition_rate = double(snp_transition_by_transvertion_rate)/snp_sum;
105 double transvertion_rate = 1.0/snp_sum;
106 snp_bias[0] = transition_rate;
107 snp_bias[1] = transition_rate+transvertion_rate/2.0;
108 snp_bias[2] = 1;
111 //set output file name
112 igzstream infile;
113 infile.open(input.c_str());
114 if (!infile)
116 cerr<<"Error:unable to open input file:"<<input<<endl;
117 exit(-1);
119 string snp_output = output_prefix+"_snp.lst";
120 string indel_output = output_prefix+"_indel.lst";
121 string invertion_output = output_prefix+"_invertion.lst";
122 if(hetersnp_rate != 0){output_prefix = output_prefix+".snp";}
123 if(heterindel_rate != 0){output_prefix = output_prefix+".indel";}
124 if(big_SV_rate !=0){output_prefix = output_prefix+".invertion";}
126 string output_ref_file;
127 if(!output_type){
128 output_ref_file = output_prefix + ".fa";
129 }else{
130 output_ref_file = output_prefix + ".fa.gz";
133 if(!output_type){
134 outfile.open(output_ref_file.c_str());
135 if (!outfile)
137 cerr<<"Error:unable to open output file:"<<output_ref_file<<endl;
138 exit(-1);
140 }else{
141 gz_outfile.open(output_ref_file.c_str());
142 if (!gz_outfile)
144 cerr<<"Error:unable to open output file:"<<output_ref_file<<endl;
145 exit(-1);
149 ofstream SNP_File,Indel_File,Ivertion_File;
151 if(hetersnp_rate>0){
152 SNP_File.open(snp_output.c_str());
153 if(!SNP_File)
155 cerr<<"Error:unable to open output file:"<<snp_output<<endl;
156 exit(-1);
160 if(heterindel_rate>0){
161 Indel_File.open(indel_output.c_str());
162 if(!Indel_File)
164 cerr<<"Error:unable to open output file:"<<indel_output<<endl;
165 exit(-1);
169 if(big_SV_rate>0){
170 Ivertion_File.open(invertion_output.c_str());
171 if(!Ivertion_File)
173 cerr<<"Error:unable to open output file:"<<invertion_output<<endl;
174 exit(-1);
178 Get_raw_genome(infile,SNP_File,Indel_File,Ivertion_File);
180 time_end = time(NULL);
181 cerr<<"All done! Run time: "<<time_end-time_start<<"s."<<endl;
183 return 0;
186 //get raw genome sequence.
187 void Get_raw_genome(igzstream &inf, ofstream &snp_file, ofstream &indel_file, ofstream &invertion_file )
189 string line,id,id_line,seq;
190 while (getline(inf,line,'\n'))
192 if (line[0]=='>')
194 if (seq!="")
196 cerr<<"Have finished reading scaffold "<<id<<endl;
197 //start to simulate one scaffold
198 simulate_snp_indel_seq(id_line,id,seq,snp_file,indel_file,invertion_file);
199 seq="";
201 id_line = line;
202 line.erase(0,1);
203 // id=line;
204 int pos=line.find(" ");
205 line=line.substr(0,pos);
206 id=line;
207 }else{
208 seq+=line;
211 cerr<<"Have finished reading scaffold "<<id<<endl;
212 //start to simulate one scaffold
213 simulate_snp_indel_seq(id_line,id,seq,snp_file,indel_file, invertion_file);
216 //add snp and indel in raw seqence, and output result sequence.
217 void simulate_snp_indel_seq(string id_line,string id,string &sequence, ofstream &snp_file,ofstream &indel_file, ofstream &invertion_file)
219 //convert lower case to upper case
220 to_upper(sequence);
222 if (hetersnp_rate>0 || heterindel_rate>0 || big_SV_rate>0) //heterozygous SNP and heterozygous indel exists in diploid
224 if (hetersnp_rate>0)
226 cerr<<"Begin to simulate snp"<<endl;
227 sequence=Get_snp(sequence,snp_file,id,hetersnp_rate,snp_bias);
228 cerr<<"Have finished simulating snp"<<endl;
230 if (big_SV_rate>0)
232 cerr<<"Begin to simulate invertion"<<endl;
233 sequence=Get_invertion(sequence,invertion_file,id,big_SV_rate);
235 if (heterindel_rate>0 || big_SV_rate>0)
237 cerr<<"Begin to simulate indel"<<endl;
238 sequence=Get_indel(sequence,indel_file,id,heterindel_rate,big_SV_rate);
239 cerr<<"Have finished simulating indel"<<endl;
242 if(!output_type){
243 outfile<<id_line<<endl;
244 uint64_t seq_len = sequence.size();
246 //50bp base per line
247 uint64_t counter = 0;
248 while(seq_len > 50)
250 outfile<<sequence.substr(counter*50,50)<<endl;
251 seq_len -= 50;
252 counter++;
254 outfile<<sequence.substr(counter*50)<<endl;
255 }else{
256 gz_outfile<<id_line<<endl;
257 uint64_t seq_len = sequence.size();
259 uint64_t counter = 0;
260 while(seq_len > 50)
262 gz_outfile<<sequence.substr(counter*50,50)<<endl;
263 seq_len -= 50;
264 counter++;
266 gz_outfile<<sequence.substr(counter*50)<<endl;
269 cerr<<"Have finished simulating "<<id_line<<endl;