modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / pIRS_simulator / simulate_Illumina_reads.cpp
blob12359d37db6e9e36cc35dd6d160b504ec7187a6f
1 #include <iostream>
2 #include <vector>
3 #include <string>
4 #include <fstream>
5 #include <math.h>
6 #include <map>
7 #include <boost/lexical_cast.hpp>
8 #include <boost/algorithm/string.hpp>
9 #include "gzstream.h"
10 #include "simulate.h"
11 #include "load_file.h"
12 #include "global.h"
14 using namespace std;
15 using namespace boost;
17 //parameter variable
18 PARAMETER InputParameter ={100,500,-1,0,1,1,64,1,40,-1,"","","","","Illumina"};
20 int Ref_Base_num = 0; //ATCG: 4
21 int Statistical_Cycle_num = 0; //the cycle number in Base-calling profile
22 int Seq_Base_num = 0; //ATCG: 4
23 int Quality_num = 0; //the quality score number in Base-calling profile
24 int Simulate_Cycle_num = 0; //the cycle number of simulation
25 double Statistical_average_error_rate = 0; //the average error rate in Base-calling profile
27 ogzstream Gz_outfile1;
28 ogzstream Gz_outfile2;
29 ofstream Outfile1;
30 ofstream Outfile2;
32 double*** Simulation_matrix = NULL; //for call base and quality simulation
33 double* GC_bias_abundance = NULL; //for GC bias simulation
34 map<int,uint64_t> InsertSize_distr; //record the insert size distribution
35 uint64_t* Error_pos_distr = NULL; //record the error position distribution
36 double* Q_to_Erate_distr = NULL; //record quality score to error rate distribution
37 uint64_t Total_read_pair = 0;
39 //get genome sequence and start to simulate read
40 void Get_genome(igzstream &inf,igzstream &inf2);
41 //contral the quantity of simulate reads for each chromosome one by one
42 uint64_t contral_reads_quantity(string id_line,string id,string &sequ,string &sequ2,uint64_t read_genome);
43 //simulate fastq reads
44 uint64_t simulate_fq_reads(string &seq,uint64_t seqlen, uint64_t rd_pair, string id_seq, uint64_t reads_all);
45 //simulate fasta reads
46 uint64_t simulate_fa_reads(string &seq,uint64_t seqlen, uint64_t rd_pair, string id_seq, uint64_t reads_all);
48 const char *VERSION="1.0";
49 const char *AUTHOR="BGI-Shenzhen";
50 const char *CONTACT1="yuanjianying@genomics.org.cn";
51 const char *CONTACT2="shiyujian@genomics.org.cn";
53 void Usage(){
54 cout<<"\nDescription:"<<endl;
55 cout<<" It is a program for simulating Illumina PE reads, with a series of characters generate by illumina ";
56 cout<<"sequencing machine, such as insertsize distribution, sequencing error, quality score and GC bias. ";
57 cout<<"User should set the value of insertsize_mean and insertsize_sd, they are the mean value and standard ";
58 cout<<"deviation of the normal distribution that used as the model function when simulating insertsize ";
59 cout<<"distribution, usually we set the insertsize_sd to be 1/20 of the insertsize_mean. The normal distribution ";
60 cout<<"function model we used in this program is f(x)=1/σ/sqrt(2*pi)*exp((x-μ)**2 / (2*σ**2)), and the insertsize ";
61 cout<<"distribution is simulated by Box-muller method. ";
62 cout<<"This program simulates Illumina sequencing error, quality score and GC bias according to the empirical distribution profile. ";
63 cout<<"User can set the path of profile or using the default file in this program package, which is generated by large real sequencing data. ";
64 cout<<"If you want to simulate diploid(heterozygosis SNP, heterozygosis Indel and structural variation) reads, you should input two ";
65 cout<<"reference genome sequence, you can get the another diploid genome sequence by the program \"simulate_diploid_genome\", but ";
66 cout<<"remember that heterozygosis SNP rate and heterozygosis Indel rate is only exists in diploid. ";
67 cout<<"At last, you should set another several parameters, read length, coverage of reads, input sequence, ";
68 cout<<"output prefix and so on, the option -i must be set, because there is no default value."<<endl;
69 cout<<endl<<"Program: simulate_Illumina_reads"<<endl;
70 cout<<"\tVersion: "<<VERSION<<endl;
71 cout<<"\tAuthor: "<<AUTHOR<<endl;
72 cout<<"\tContact: "<<CONTACT1<<" "<<CONTACT2<<endl;
73 cout<<endl<<"Usage:\tsimulate_Illumina_reads [options]"<<endl;
74 cout<<"\t-i <string> input_ref1,input reference genome sequence *.fa/*.fa.gz, no default vaule"<<endl;
75 cout<<"\t-I <string> input_ref2,for diploid genome, input another reference genome sequence which was generated by program \"simulate_snp_indel_seq\""<<endl;
76 cout<<"\t-s <string> Base-calling profile,input Base-calling profile for simulating sequencing error and quality score,default: (exe_path)/Profiles/Base-calling_profile/hum20110701.bwanosnp.count.matrix"<<endl;
77 cout<<"\t-d <string> GC-depth profile,input GC-depth file for simulating GC bias, the default GC bias profiles are determined based on the twice of read length"<<endl;
78 cout<<"\t-l <int> read_len,set length of read,read1 and read2 have the same length,default:"<<InputParameter.Read_length<<endl;
79 cout<<"\t-x <double> coverage,set the sequencing coverage(sometimes called depth),default:"<<InputParameter.Coverage<<endl;
80 cout<<"\t-m <int> insertsize_mean,set the average value of insert size,default:"<<InputParameter.Insertsize_mean<<endl;
81 cout<<"\t-v <int> insertsize_sd,set the standard deviation of insert sizes, default:insertsize_mean/20"<<endl;
82 cout<<"\t-e <double> error_rate,set the average error rate over all cycles,default=average error rate of Base-calling profile"<<endl;
83 cout<<"\t-g <int> whether simulate GC bias, 0:no, 1:yes, default:"<<InputParameter.Is_simulate_GC_bias<<endl;
84 cout<<"\t-q <int> whether simulate quality value, 0:no, 1:yes, default:"<<InputParameter.Is_simulate_quality<<endl;
85 cout<<"\t-Q <int> Quality value ascii shift, generally 64 or 33 for Illumina data, default:"<<InputParameter.Q_shift<<endl;
86 cout<<"\t-f <int> whether cyclize insert sequence(influence on PE-reads direction) 0: read1-forward read2-reverse, 1: read1-reverse read2-forward, default:"<<InputParameter.Is_cyclization<<endl;
87 cout<<"\t-c <int> set output file type, 0:text, 1:*.gz, default:"<<InputParameter.Output_type<<endl;
88 cout<<"\t-o <string> output,output file prefix default:"<<InputParameter.Output_prefix<<endl;
89 cout<<"\t-h output help infomation."<<endl;
90 cout<<endl<<"Example:"<<endl;
91 cout<<"\t1. ./simulate_Illumina_reads -i ref_sequence.fa"<<endl;
92 cout<<"\t Every parameter use the default value."<<endl;
93 cout<<"\t2. ./simulate_Illumina_reads -i ref_sequence.fa -l 100 -x 20 -o human_500_100"<<endl;
94 cout<<"\t Just set read length and coverage you needed."<<endl;
95 cout<<"\t3. ./simulate_Illumina_reads -i ref_sequence.fa -o human -m 600 -v 30 -e 0.01"<<endl;
96 cout<<"\t Set insertsize distribution and error rate."<<endl;
97 cout<<"\t4. ./simulate_Illumina_reads -i ref_sequence.fa -I ref_seq.snp.indel.invertion.fa.gz -o human "<<endl;
98 cout<<"\t The genome is diploid and you want to produce heterozygosis SNPs heterozygosis Indels in reads, "<<endl;
99 cout<<"\t the -I input file was generated by program \"simulate_diploid_genome\"."<<endl;
100 cout<<"\t5. ./simulate_Illumina_reads -i ref_sequence.fa -g 0 -m 50000 -f 1 -c 0 -o human "<<endl;
101 cout<<"\t Set simulate no GCbias and cyclize the large insert-size library, output file is text format."<<endl;
102 cout<<"\t6. ./simulate_Illumina_reads -i ref_sequence.fa -q 0 -o human "<<endl;
103 cout<<"\t Set simulate quality value, the ouput is in *.fa format."<<endl;
104 exit(-1);
107 void Getopt(int argc,char *argv[]){
108 int c;
109 while ((c=getopt(argc,argv,"i:I:s:d:l:x:m:v:e:f:g:q:Q:c:o:h"))!=-1)
111 switch(c){
112 case 'i': InputParameter.Input_ref1=optarg;break;
113 case 'I': InputParameter.Input_ref2=optarg;break;
114 case 's': InputParameter.BaseCalling_profile=optarg;break;
115 case 'd': InputParameter.GC_depth_profile=optarg;break;
116 case 'l': InputParameter.Read_length=atoi(optarg);break;
117 case 'x': InputParameter.Coverage=strtod(optarg,NULL);break;
118 case 'm': InputParameter.Insertsize_mean=atoi(optarg);break;
119 case 'v': InputParameter.Insertsize_sd=atoi(optarg);break;
120 case 'e': InputParameter.Error_rate=strtod(optarg,NULL);break;
121 case 'f': InputParameter.Is_cyclization=atoi(optarg);break;
122 case 'g': InputParameter.Is_simulate_GC_bias=atoi(optarg);break;
123 case 'q': InputParameter.Is_simulate_quality=atoi(optarg);break;
124 case 'Q': InputParameter.Q_shift=atoi(optarg);break;
125 case 'c': InputParameter.Output_type=atoi(optarg);break;
126 case 'o': InputParameter.Output_prefix=optarg;break;
127 case 'h': Usage();break;
128 default: Usage();
133 int main(int argc, char *argv[])
135 time_t time_start, time_end;
136 time_start = time(NULL);
137 srand((unsigned)time(NULL));
139 if (argc==1)
141 Usage();
144 Getopt(argc,argv);
146 //set insertsize sd default value
147 if(InputParameter.Insertsize_sd == -1){InputParameter.Insertsize_sd = int(InputParameter.Insertsize_mean/20);}
148 //check parameter
149 if(InputParameter.Input_ref1 == ""){cerr<<"Error: there is not default value with option -i, please input reference sequence!"<<endl;exit(-1);}
150 if(InputParameter.Read_length <= 0){cerr<<"Error: read length should be set bigger than 0, please check option -l !"<<endl;exit(-1);}
151 if(InputParameter.Coverage <= 0){cerr<<"Error: coverage should be set bigger than 0, please check option -x !"<<endl;exit(-1);}
152 if(InputParameter.Insertsize_mean < InputParameter.Read_length){cerr<<"Error: insertize mean should be set bigger than read_length, please check option -m !"<<endl;exit(-1);}
153 if(InputParameter.Insertsize_sd < 0){cerr<<"Error: insertsize_sd should be set bigger than 0, please check option -v !"<<endl;exit(-1);}
154 if(InputParameter.Error_rate != -1 && InputParameter.Error_rate < 0 || InputParameter.Error_rate >= 1){cerr<<"Error: error_rate should be set between 0 and 1, or set -1 to simulate default error rate according with error profile, please check option -e !"<<endl;exit(-1);}
155 if(InputParameter.Is_cyclization != 0 && InputParameter.Is_cyclization != 1){cerr<<"Error: Is_cyclization should be set 0 or 1, please check option -f !"<<endl;exit(-1);}
156 if(InputParameter.Is_simulate_GC_bias != 0 && InputParameter.Is_simulate_GC_bias != 1){cerr<<"Error: Is_simulate_GC_bias should be set 0 or 1, please check option -g !"<<endl;exit(-1);}
157 if(InputParameter.Is_simulate_quality != 0 && InputParameter.Is_simulate_quality != 1){cerr<<"Error: Is_simulate_quality should be set 0 or 1, please check option -q !"<<endl;exit(-1);}
158 if(InputParameter.Output_type != 0 && InputParameter.Output_type != 1){cerr<<"Error: output_type should be set 0 or 1, please check option -c !"<<endl;exit(-1);}
160 //set the simulate cycle number
161 Simulate_Cycle_num = InputParameter.Read_length*2;
162 //initialize error position distribution table
163 Error_pos_distr = new uint64_t[Simulate_Cycle_num+1];
164 Q_to_Erate_distr = new double[Simulate_Cycle_num+1];
165 for(int i=0; i<=Simulate_Cycle_num; i++)
167 Error_pos_distr[i] = 0;
168 Q_to_Erate_distr[i] = 0.0;
171 //input file
172 igzstream infile;
173 igzstream infile2;
174 //output file
175 ofstream insert_log;
176 ofstream error_log;
177 //check and open file
178 set_and_check_file( InputParameter, infile, infile2, Outfile1, Outfile2, Gz_outfile1, Gz_outfile2, insert_log, error_log);
181 ////////////////////Load error profile///////////////////////
183 string exe_path = argv[0]; //program package path
185 cerr<<"Start to preview error profile..."<<endl;
186 //get dimensions of error profile
187 preview_BaseCalling_profile (InputParameter, exe_path, Ref_Base_num, Statistical_Cycle_num, Seq_Base_num, Quality_num, Statistical_average_error_rate);
188 cerr << "Dimensions of error profile:\n";
189 cerr << " Ref_Base_num: " << Ref_Base_num << endl;
190 cerr << " Statistical_Cycle_num: " << Statistical_Cycle_num << endl;
191 cerr << " Seq_Base_num: " << Seq_Base_num << endl;
192 cerr << " Quality_num: " << Quality_num << endl;
193 cerr << " "<<InputParameter.Read_length<<"bp reads total average error rate: "<< Statistical_average_error_rate <<endl;
195 //initialize simulation matrix
196 if(InputParameter.Is_simulate_quality){
197 //initialize matrix table Ref_Base_num*Simulate_Cycle_num*Seq_Base_num*Quality_num
198 Simulation_matrix = new double**[Ref_Base_num];
199 for(int i=0; i<Ref_Base_num; i++)
201 Simulation_matrix[i] = new double*[Simulate_Cycle_num];
202 for(int j=0; j<Simulate_Cycle_num; j++)
204 Simulation_matrix[i][j] = new double[Seq_Base_num*Quality_num];
205 for(int k=0; k<Seq_Base_num*Quality_num; k++)
207 Simulation_matrix[i][j][k] = 0;
211 }else{ //simulate fa read matrix
212 //initialize matrix table Ref_Base_num*Simulate_Cycle_num*Seq_Base_num
213 Simulation_matrix = new double**[Ref_Base_num];
214 for(int i=0; i<Ref_Base_num; i++)
216 Simulation_matrix[i] = new double*[Simulate_Cycle_num];
217 for(int j=0; j<Simulate_Cycle_num; j++)
219 Simulation_matrix[i][j] = new double[Seq_Base_num];
220 for(int k=0; k<Seq_Base_num; k++)
222 Simulation_matrix[i][j][k] = 0;
228 //get the simulation matrix
229 load_BaseCalling_profile(InputParameter, exe_path, Statistical_Cycle_num, Seq_Base_num, Quality_num, Statistical_average_error_rate, Simulation_matrix);
232 ///////////////////////////load GC bias profile///////////////////////////////
234 //get GC abundance for simulate GC bias
235 if(InputParameter.Is_simulate_GC_bias){
236 GC_bias_abundance = new double[101];
237 for(int i = 0; i <= 100; i++)
239 GC_bias_abundance[i] = 0.0;
241 load_GC_depth_profile (InputParameter, exe_path, GC_bias_abundance);
245 ///////////////////////////get genome seq and start to simulate reads/////////////////////////////
247 //start simulation
248 Get_genome(infile,infile2);
251 /////////////////////////output error distribution and insert size distribution////////////////////////////
253 //ouput error position distribution
254 error_log<<"**********Error rate distribution**********"<<endl;
255 if(InputParameter.Is_simulate_quality){
256 error_log<<"Cycle\tReal_error_rate\tQuality_to_error_rate"<<endl;
257 for(int i=1; i<=Simulate_Cycle_num; i++){
258 error_log<<i<<"\t"<<double(Error_pos_distr[i])/double(Total_read_pair)<<"\t"<<Q_to_Erate_distr[i]/double(Total_read_pair)<<endl;
259 if(i== InputParameter.Read_length){error_log<<endl;}
261 }else{
262 error_log<<"Cycle\tReal_error_rate"<<endl;
263 for(int i=1; i<=Simulate_Cycle_num; i++){
264 error_log<<i<<"\t"<<double(Error_pos_distr[i])/double(Total_read_pair)<<endl;
265 if(i== InputParameter.Read_length){error_log<<endl;}
268 delete[] Error_pos_distr;
269 delete[] Q_to_Erate_distr;
271 //output insert size distribution
272 insert_log<<"**********Insert size distribution************"<<endl
273 <<"insert_size_len"<<"\t"<<"number"<<endl;
274 map<int, uint64_t>::const_iterator map_it = InsertSize_distr.begin();
275 while (map_it != InsertSize_distr.end())
277 insert_log<<map_it->first<<"\t"<<map_it->second<<endl;
278 //cout<<map_it->second<<endl;
279 map_it++;
282 if(InputParameter.Is_simulate_GC_bias)
284 delete[] GC_bias_abundance;
287 for(int i=0; i<Ref_Base_num; i++)
289 for(int j=0; j<Simulate_Cycle_num; j++)
291 delete[] Simulation_matrix[i][j];
293 delete[] Simulation_matrix[i];
295 delete[] Simulation_matrix;
297 time_end = time(NULL);
298 cerr<<"All done! Run time: "<<time_end-time_start<<"s."<<endl;
300 return 0;
303 //get genome sequence and start to simulate read
304 void Get_genome(igzstream &inf,igzstream &inf2){
305 string line,line2,id,id_line,seq,seq2;
306 uint64_t readINgenome=0;
308 while (getline(inf,line,'\n'))
310 if (line[0]=='>')
312 if (seq!="")
314 //another diploid seq
315 if(InputParameter.Input_ref2 != ""){
316 while(getline(inf2,line2,'\n'))
318 if(line2[0] == '>')
320 if(seq2!="")
322 //cerr <<seq2<<endl;
323 cerr<<"Have finished reading scaffold "<<id<<endl;
324 readINgenome+=contral_reads_quantity(id_line,id,seq,seq2,readINgenome);
325 seq="";
326 seq2="";
327 break;
329 }else{
330 seq2+=line2;
333 }else{
334 cerr<<"Have finished reading scaffold "<<id<<endl;
336 readINgenome+=contral_reads_quantity(id_line,id,seq,seq2,readINgenome);
337 seq="";
340 id_line = line;
341 line.erase(0,1);
342 int pos=line.find(" ");
343 line=line.substr(0,pos);
344 id=line;
345 }else{
346 seq+=line;
349 cerr<<"Have finished reading scaffold "<<id<<endl;
350 if(InputParameter.Input_ref2 != ""){
351 while(getline(inf2,line2,'\n'))
353 if(line2[0] == '>'){continue;}
354 seq2+=line2;
356 //cerr<<seq2<<endl;
358 readINgenome+=contral_reads_quantity(id_line,id,seq,seq2,readINgenome);
361 //contral the quantity of simulate reads for each chromosome one by one
362 uint64_t contral_reads_quantity(string id_line,string id,string &sequence,string &sequence2,uint64_t read_genome)
364 uint64_t readonchr=0;
365 if (sequence.size()<InputParameter.Insertsize_mean)
367 return 0;
369 //convert lower case to upper case
370 to_upper(sequence);
372 uint64_t sequence_length=sequence.size();
373 uint64_t reads_pair_num=0;
375 if (InputParameter.Input_ref2 != "")
377 reads_pair_num=(uint64_t)(sequence_length*InputParameter.Coverage/InputParameter.Read_length/2/2);
378 }else{
379 reads_pair_num=(uint64_t)(sequence_length*InputParameter.Coverage/InputParameter.Read_length/2);
381 Total_read_pair = Total_read_pair + (uint64_t)(sequence_length*InputParameter.Coverage/InputParameter.Read_length/2);
383 string id_1 = id+" 1";
384 if(InputParameter.Is_simulate_quality){
385 readonchr=simulate_fq_reads(sequence,sequence_length,reads_pair_num,
386 id_1,read_genome);
387 }else{
388 readonchr=simulate_fa_reads(sequence,sequence_length,reads_pair_num,
389 id_1,read_genome);
392 //simulate reads of another diploid genome
393 if (InputParameter.Input_ref2 != "")
395 sequence_length=sequence2.size();
396 if (sequence_length<InputParameter.Insertsize_mean)
398 return 0;
400 reads_pair_num=(uint64_t)(sequence_length*InputParameter.Coverage/InputParameter.Read_length/2/2);
402 uint64_t readonchr2=read_genome+readonchr;
403 string id_2 = id+" 2";
404 if(InputParameter.Is_simulate_quality){
405 readonchr+=simulate_fq_reads(sequence2,sequence_length,reads_pair_num,
406 id_2,readonchr2);
407 }else{
408 readonchr+=simulate_fa_reads(sequence2,sequence_length,reads_pair_num,
409 id_2,readonchr2);
412 return readonchr;
416 uint64_t simulate_fq_reads(string &seq,uint64_t seqlen, uint64_t rd_pair, string id_seq, uint64_t reads_all)
418 int whether_check_seq = 0;
419 if(!check_seq(seq)){
420 whether_check_seq = 1;
423 uint64_t reads_count=0;
424 cerr<<"Begin to output reads"<<endl;
425 for(uint64_t pair_count=0; pair_count < rd_pair; pair_count++)
427 //simulate insertsize
428 int insertsize = simulate_insertsize(InputParameter.Insertsize_mean,InputParameter.Insertsize_sd);
429 if (insertsize<InputParameter.Read_length){pair_count--;continue;}
430 if (seqlen<insertsize){return reads_count;}
431 uint64_t N = 1000000000000;
432 uint64_t pos = (uint64_t)(((uint64_t)((double)rand() / double(RAND_MAX) * N)) % seqlen);
433 if (pos+insertsize>seqlen){pair_count--;continue;}
435 //get insert seq
436 string sub_str=seq.substr(pos,insertsize);
438 //get read1 and read2
439 string read1=sub_str.substr(0,InputParameter.Read_length);
440 string read2=sub_str.substr(insertsize-InputParameter.Read_length,InputParameter.Read_length);
442 //check whether contain 'N' or nonbase char
443 if(whether_check_seq && (!check_seq(read1) || !check_seq(read2))){
444 pair_count--;
445 continue;
448 //simulate GC bias
449 if(InputParameter.Is_simulate_GC_bias){
450 string check_seq = read1+read2;
451 if(simulate_GC_bias(check_seq,GC_bias_abundance)){pair_count--;continue;}
454 //insertsize statistics
455 if(InsertSize_distr[insertsize]>0)
457 InsertSize_distr[insertsize]++;
458 }else{
459 InsertSize_distr[insertsize] = 1;
462 reads_count++;
464 if ( InputParameter.Is_cyclization == 1 )
466 read1=reversecomplementary(read1);
467 }else if (InputParameter.Insertsize_mean>0)
469 read2=reversecomplementary(read2);
470 }else{
471 cout<<"Error:insertsize mean is smaller than 0"<<endl;
472 exit(-1);
475 int selection=int(rand()%2); //0 or 1, for selecting output file randomly
477 string output_read1, output_read2, output_quality_seq1, output_quality_seq2;
479 //simulate read1
480 vector<int> error_pos1;
481 vector<char> raw_base1;
482 for(int i = 0; i < InputParameter.Read_length; i++)
484 double num=double(rand())/double(RAND_MAX);
485 char ref_base = read1[i];
486 int cycle;
487 if(selection == 0){
488 cycle = i;
489 }else{
490 cycle = i+Simulate_Cycle_num/2;
492 int location = search_location(Simulation_matrix[alphabet2[ref_base]][cycle], Seq_Base_num*Quality_num, num);
493 int call_base_num = int(location/Quality_num);
494 char call_base = Bases[call_base_num];
495 if(ref_base != call_base){
496 if(selection == 0){
497 Error_pos_distr[i+1]++;
498 }else{
499 Error_pos_distr[i+1+InputParameter.Read_length]++;
501 error_pos1.push_back(i);
502 raw_base1.push_back(ref_base);
504 output_read1.push_back(call_base);
505 int Qscore = location%Quality_num;
506 char quality_value = Qscore + InputParameter.Q_shift;
507 output_quality_seq1.push_back(quality_value);
508 if(selection == 0){
509 Q_to_Erate_distr[i+1]+= pow(10,double(Qscore)/double(-10));
510 }else{
511 Q_to_Erate_distr[i+1+InputParameter.Read_length]+= pow(10,double(Qscore)/double(-10));
515 //simulate read2
516 vector<int> error_pos2;
517 vector<char> raw_base2;
518 for(int i = 0; i < InputParameter.Read_length; i++)
520 double num=double(rand())/double(RAND_MAX);
521 char ref_base = read2[i];
522 int cycle;
523 if(selection == 0){
524 cycle = i+Simulate_Cycle_num/2;
525 }else{
526 cycle = i;
529 int location = search_location(Simulation_matrix[alphabet2[ref_base]][cycle], Seq_Base_num*Quality_num, num);
530 int call_base_num = int(location/Quality_num);
531 char call_base = Bases[call_base_num];
532 if(ref_base != call_base){
533 if(selection == 0){
534 Error_pos_distr[i+1+InputParameter.Read_length]++;
535 }else{
536 Error_pos_distr[i+1]++;
538 error_pos2.push_back(i);
539 raw_base2.push_back(ref_base);
541 output_read2.push_back(call_base);
542 int Qscore = location%Quality_num;
543 char quality_value = Qscore + InputParameter.Q_shift;
544 output_quality_seq2.push_back(quality_value);
545 if(selection == 0){
546 Q_to_Erate_distr[i+1+InputParameter.Read_length]+= pow(10,double(Qscore)/double(-10));
547 }else{
548 Q_to_Erate_distr[i+1]+= pow(10,double(Qscore)/double(-10));
552 //output simulate reads
553 if (reads_count%10000==0)
555 cerr<<"Output "<<reads_count<<" pair reads"<<endl;
558 //output read file
559 if(selection == 0){
560 //output read file1
561 //text output
562 if(!InputParameter.Output_type){
563 Outfile1<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
564 for(int i = 0; i < error_pos1.size(); i++)
566 Outfile1<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
568 Outfile1<<endl<<output_read1<<endl
569 <<"+"<<endl
570 <<output_quality_seq1<<endl;
572 //output read file2
573 Outfile2<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
574 for(int i = 0; i < error_pos2.size(); i++)
576 Outfile2<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
578 Outfile2<<endl<<output_read2<<endl
579 <<"+"<<endl
580 <<output_quality_seq2<<endl;
581 }else{
582 //*.gz output
583 //output read file1
584 Gz_outfile1<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
585 for(int i = 0; i < error_pos1.size(); i++)
587 Gz_outfile1<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
589 Gz_outfile1<<endl<<output_read1<<endl
590 <<"+"<<endl
591 <<output_quality_seq1<<endl;
593 //output read file2
594 Gz_outfile2<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
595 for(int i = 0; i < error_pos2.size(); i++)
597 Gz_outfile2<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
599 Gz_outfile2<<endl<<output_read2<<endl
600 <<"+"<<endl
601 <<output_quality_seq2<<endl;
604 }else{
606 //output read file2
607 //text output
608 if(!InputParameter.Output_type){
609 Outfile2<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
610 for(int i = 0; i < error_pos1.size(); i++)
612 Outfile2<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
614 Outfile2<<endl<<output_read1<<endl
615 <<"+"<<endl
616 <<output_quality_seq1<<endl;
618 //output read file1
619 Outfile1<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
620 for(int i = 0; i < error_pos2.size(); i++)
622 Outfile1<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
624 Outfile1<<endl<<output_read2<<endl
625 <<"+"<<endl
626 <<output_quality_seq2<<endl;
627 }else{
628 //*.gz output
629 //output read file2
630 Gz_outfile2<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
631 for(int i = 0; i < error_pos1.size(); i++)
633 Gz_outfile2<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
635 Gz_outfile2<<endl<<output_read1<<endl
636 <<"+"<<endl
637 <<output_quality_seq1<<endl;
639 //output read file1
640 Gz_outfile1<<"@read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
641 for(int i = 0; i < error_pos2.size(); i++)
643 Gz_outfile1<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
645 Gz_outfile1<<endl<<output_read2<<endl
646 <<"+"<<endl
647 <<output_quality_seq2<<endl;
651 cerr<<"Finish output reads"<<endl;
652 return reads_count;
656 uint64_t simulate_fa_reads(string &seq,uint64_t seqlen, uint64_t rd_pair, string id_seq, uint64_t reads_all)
658 int whether_check_seq = 0;
659 if(!check_seq(seq)){
660 whether_check_seq = 1;
663 uint64_t reads_count=0;
664 cerr<<"Begin to output reads"<<endl;
665 for(uint64_t pair_count=0; pair_count < rd_pair; pair_count++)
667 //simulate insertsize
668 int insertsize = simulate_insertsize(InputParameter.Insertsize_mean,InputParameter.Insertsize_sd);
669 if (insertsize<InputParameter.Read_length){pair_count--;continue;}
670 if (seqlen<insertsize){return reads_count;}
671 uint64_t N = 1000000000000;
672 uint64_t pos = (uint64_t)(((uint64_t)((double)rand() / double(RAND_MAX) * N)) % seqlen);
673 if (pos+insertsize>seqlen){pair_count--;continue;}
675 //get insert seq
676 string sub_str=seq.substr(pos,insertsize);
678 //get read1 and read2
679 string read1=sub_str.substr(0,InputParameter.Read_length);
680 string read2=sub_str.substr(insertsize-InputParameter.Read_length,InputParameter.Read_length);
682 //check whether contain 'N' or nonbase char
683 if(whether_check_seq && (!check_seq(read1) || !check_seq(read2))){
684 pair_count--;
685 continue;
688 //simulate GC bias
689 if(InputParameter.Is_simulate_GC_bias){
690 string check_seq = read1+read2;
691 if(simulate_GC_bias(check_seq,GC_bias_abundance)){pair_count--;continue;}
694 //insertsize statistics
695 if(InsertSize_distr[insertsize]>0)
697 InsertSize_distr[insertsize]++;
698 }else{
699 InsertSize_distr[insertsize] = 1;
702 reads_count++;
704 if ( InputParameter.Is_cyclization == 1 )
706 read1=reversecomplementary(read1);
707 }else if (InputParameter.Insertsize_mean>0)
709 read2=reversecomplementary(read2);
710 }else{
711 cout<<"Error:insertsize mean is smaller than 0"<<endl;
712 exit(-1);
715 int selection=int(rand()%2); //0 or 1, for selecting output file randomly
717 string output_read1, output_read2;
719 //simulate read1
720 vector<int> error_pos1;
721 vector<char> raw_base1;
722 for(int i = 0; i < InputParameter.Read_length; i++)
724 double num=double(rand())/double(RAND_MAX);
725 char ref_base = read1[i];
726 int cycle;
727 if(selection == 0){
728 cycle = i;
729 }else{
730 cycle = i+Simulate_Cycle_num/2;
733 int location = search_location(Simulation_matrix[alphabet2[ref_base]][cycle], Seq_Base_num, num);
734 char call_base = Bases[location];
735 if(ref_base != call_base){
736 if(selection == 0){
737 Error_pos_distr[i+1]++;
738 }else{
739 Error_pos_distr[i+1+InputParameter.Read_length]++;
741 error_pos1.push_back(i);
742 raw_base1.push_back(ref_base);
744 output_read1.push_back(call_base);
747 //simulate read2
748 vector<int> error_pos2;
749 vector<char> raw_base2;
750 for(int i = 0; i < InputParameter.Read_length; i++)
752 double num=double(rand())/double(RAND_MAX);
753 char ref_base = read2[i];
754 int cycle;
755 if(selection == 0){
756 cycle = i+Simulate_Cycle_num/2;
757 }else{
758 cycle = i;
760 int location = search_location(Simulation_matrix[alphabet2[ref_base]][cycle], Seq_Base_num, num);
761 char call_base = Bases[location];
762 if(ref_base != call_base){
763 if(selection == 0){
764 Error_pos_distr[i+1+InputParameter.Read_length]++;
765 }else{
766 Error_pos_distr[i+1]++;
768 error_pos2.push_back(i);
769 raw_base2.push_back(ref_base);
771 output_read2.push_back(call_base);
774 //output simulate reads
775 if (reads_count%10000==0)
777 cerr<<"Output "<<reads_count<<" pair reads"<<endl;
780 //output read file
781 if(selection == 0){
782 //output read file1
783 //text output
784 if(!InputParameter.Output_type){
786 Outfile1<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
787 for(int i = 0; i < error_pos1.size(); i++)
789 Outfile1<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
791 Outfile1<<endl<<output_read1<<endl;
793 //output read file2
794 Outfile2<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
795 for(int i = 0; i < error_pos2.size(); i++)
797 Outfile2<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
799 Outfile2<<endl<<output_read2<<endl;
800 }else{//*.gz output
801 Gz_outfile1<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
802 for(int i = 0; i < error_pos1.size(); i++)
804 Gz_outfile1<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
806 Gz_outfile1<<endl<<output_read1<<endl;
808 //output read file2
809 Gz_outfile2<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
810 for(int i = 0; i < error_pos2.size(); i++)
812 Gz_outfile2<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
814 Gz_outfile2<<endl<<output_read2<<endl;
817 }else{
818 //output read file2
819 //text output
820 if(!InputParameter.Output_type){
821 Outfile2<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
822 for(int i = 0; i < error_pos1.size(); i++)
824 Outfile2<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
826 Outfile2<<endl<<output_read1<<endl;
828 //output read file1
829 Outfile1<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
830 for(int i = 0; i < error_pos2.size(); i++)
832 Outfile1<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
834 Outfile1<<endl<<output_read2<<endl;
836 }else{//*.gz output
837 Gz_outfile2<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<2<<" "<<id_seq<<" "<<pos+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
838 for(int i = 0; i < error_pos1.size(); i++)
840 Gz_outfile2<<error_pos1[i]+1<<","<<raw_base1[i]<<";";
842 Gz_outfile2<<endl<<output_read1<<endl;
844 //output read file1
845 Gz_outfile1<<">read_"<<InputParameter.Insertsize_mean<<"_"<<reads_count+reads_all<<"/"<<1<<" "<<id_seq<<" "<<pos+insertsize-InputParameter.Read_length+1<<" "<<InputParameter.Read_length<<" "<<insertsize<<" ";
846 for(int i = 0; i < error_pos2.size(); i++)
848 Gz_outfile1<<error_pos2[i]+1<<","<<raw_base2[i]<<";";
850 Gz_outfile1<<endl<<output_read2<<endl;
855 cerr<<"Finish output reads"<<endl;
856 return reads_count;