7 #include <boost/lexical_cast.hpp>
8 #include <boost/algorithm/string.hpp>
11 #include "load_file.h"
15 using namespace boost
;
18 PARAMETER InputParameter
={100,500,-1,0,1,1,64,1,40,-1,"","","","","Illumina"};
20 int Ref_Base_num
= 0; //ATCG: 4
21 int Statistical_Cycle_num
= 0; //the cycle number in Base-calling profile
22 int Seq_Base_num
= 0; //ATCG: 4
23 int Quality_num
= 0; //the quality score number in Base-calling profile
24 int Simulate_Cycle_num
= 0; //the cycle number of simulation
25 double Statistical_average_error_rate
= 0; //the average error rate in Base-calling profile
27 ogzstream Gz_outfile1
;
28 ogzstream Gz_outfile2
;
32 double*** Simulation_matrix
= NULL
; //for call base and quality simulation
33 double* GC_bias_abundance
= NULL
; //for GC bias simulation
34 map
<int,uint64_t> InsertSize_distr
; //record the insert size distribution
35 uint64_t* Error_pos_distr
= NULL
; //record the error position distribution
36 double* Q_to_Erate_distr
= NULL
; //record quality score to error rate distribution
37 uint64_t Total_read_pair
= 0;
39 //get genome sequence and start to simulate read
40 void Get_genome(igzstream
&inf
,igzstream
&inf2
);
41 //contral the quantity of simulate reads for each chromosome one by one
42 uint64_t contral_reads_quantity(string id_line
,string id
,string
&sequ
,string
&sequ2
,uint64_t read_genome
);
43 //simulate fastq reads
44 uint64_t simulate_fq_reads(string
&seq
,uint64_t seqlen
, uint64_t rd_pair
, string id_seq
, uint64_t reads_all
);
45 //simulate fasta reads
46 uint64_t simulate_fa_reads(string
&seq
,uint64_t seqlen
, uint64_t rd_pair
, string id_seq
, uint64_t reads_all
);
48 const char *VERSION
="1.0";
49 const char *AUTHOR
="BGI-Shenzhen";
50 const char *CONTACT1
="yuanjianying@genomics.org.cn";
51 const char *CONTACT2
="shiyujian@genomics.org.cn";
54 cout
<<"\nDescription:"<<endl
;
55 cout
<<" It is a program for simulating Illumina PE reads, with a series of characters generate by illumina ";
56 cout
<<"sequencing machine, such as insertsize distribution, sequencing error, quality score and GC bias. ";
57 cout
<<"User should set the value of insertsize_mean and insertsize_sd, they are the mean value and standard ";
58 cout
<<"deviation of the normal distribution that used as the model function when simulating insertsize ";
59 cout
<<"distribution, usually we set the insertsize_sd to be 1/20 of the insertsize_mean. The normal distribution ";
60 cout
<<"function model we used in this program is f(x)=1/σ/sqrt(2*pi)*exp((x-μ)**2 / (2*σ**2)), and the insertsize ";
61 cout
<<"distribution is simulated by Box-muller method. ";
62 cout
<<"This program simulates Illumina sequencing error, quality score and GC bias according to the empirical distribution profile. ";
63 cout
<<"User can set the path of profile or using the default file in this program package, which is generated by large real sequencing data. ";
64 cout
<<"If you want to simulate diploid(heterozygosis SNP, heterozygosis Indel and structural variation) reads, you should input two ";
65 cout
<<"reference genome sequence, you can get the another diploid genome sequence by the program \"simulate_diploid_genome\", but ";
66 cout
<<"remember that heterozygosis SNP rate and heterozygosis Indel rate is only exists in diploid. ";
67 cout
<<"At last, you should set another several parameters, read length, coverage of reads, input sequence, ";
68 cout
<<"output prefix and so on, the option -i must be set, because there is no default value."<<endl
;
69 cout
<<endl
<<"Program: simulate_Illumina_reads"<<endl
;
70 cout
<<"\tVersion: "<<VERSION
<<endl
;
71 cout
<<"\tAuthor: "<<AUTHOR
<<endl
;
72 cout
<<"\tContact: "<<CONTACT1
<<" "<<CONTACT2
<<endl
;
73 cout
<<endl
<<"Usage:\tsimulate_Illumina_reads [options]"<<endl
;
74 cout
<<"\t-i <string> input_ref1,input reference genome sequence *.fa/*.fa.gz, no default vaule"<<endl
;
75 cout
<<"\t-I <string> input_ref2,for diploid genome, input another reference genome sequence which was generated by program \"simulate_snp_indel_seq\""<<endl
;
76 cout
<<"\t-s <string> Base-calling profile,input Base-calling profile for simulating sequencing error and quality score,default: (exe_path)/Profiles/Base-calling_profile/hum20110701.bwanosnp.count.matrix"<<endl
;
77 cout
<<"\t-d <string> GC-depth profile,input GC-depth file for simulating GC bias, the default GC bias profiles are determined based on the twice of read length"<<endl
;
78 cout
<<"\t-l <int> read_len,set length of read,read1 and read2 have the same length,default:"<<InputParameter
.Read_length
<<endl
;
79 cout
<<"\t-x <double> coverage,set the sequencing coverage(sometimes called depth),default:"<<InputParameter
.Coverage
<<endl
;
80 cout
<<"\t-m <int> insertsize_mean,set the average value of insert size,default:"<<InputParameter
.Insertsize_mean
<<endl
;
81 cout
<<"\t-v <int> insertsize_sd,set the standard deviation of insert sizes, default:insertsize_mean/20"<<endl
;
82 cout
<<"\t-e <double> error_rate,set the average error rate over all cycles,default=average error rate of Base-calling profile"<<endl
;
83 cout
<<"\t-g <int> whether simulate GC bias, 0:no, 1:yes, default:"<<InputParameter
.Is_simulate_GC_bias
<<endl
;
84 cout
<<"\t-q <int> whether simulate quality value, 0:no, 1:yes, default:"<<InputParameter
.Is_simulate_quality
<<endl
;
85 cout
<<"\t-Q <int> Quality value ascii shift, generally 64 or 33 for Illumina data, default:"<<InputParameter
.Q_shift
<<endl
;
86 cout
<<"\t-f <int> whether cyclize insert sequence(influence on PE-reads direction) 0: read1-forward read2-reverse, 1: read1-reverse read2-forward, default:"<<InputParameter
.Is_cyclization
<<endl
;
87 cout
<<"\t-c <int> set output file type, 0:text, 1:*.gz, default:"<<InputParameter
.Output_type
<<endl
;
88 cout
<<"\t-o <string> output,output file prefix default:"<<InputParameter
.Output_prefix
<<endl
;
89 cout
<<"\t-h output help infomation."<<endl
;
90 cout
<<endl
<<"Example:"<<endl
;
91 cout
<<"\t1. ./simulate_Illumina_reads -i ref_sequence.fa"<<endl
;
92 cout
<<"\t Every parameter use the default value."<<endl
;
93 cout
<<"\t2. ./simulate_Illumina_reads -i ref_sequence.fa -l 100 -x 20 -o human_500_100"<<endl
;
94 cout
<<"\t Just set read length and coverage you needed."<<endl
;
95 cout
<<"\t3. ./simulate_Illumina_reads -i ref_sequence.fa -o human -m 600 -v 30 -e 0.01"<<endl
;
96 cout
<<"\t Set insertsize distribution and error rate."<<endl
;
97 cout
<<"\t4. ./simulate_Illumina_reads -i ref_sequence.fa -I ref_seq.snp.indel.invertion.fa.gz -o human "<<endl
;
98 cout
<<"\t The genome is diploid and you want to produce heterozygosis SNPs heterozygosis Indels in reads, "<<endl
;
99 cout
<<"\t the -I input file was generated by program \"simulate_diploid_genome\"."<<endl
;
100 cout
<<"\t5. ./simulate_Illumina_reads -i ref_sequence.fa -g 0 -m 50000 -f 1 -c 0 -o human "<<endl
;
101 cout
<<"\t Set simulate no GCbias and cyclize the large insert-size library, output file is text format."<<endl
;
102 cout
<<"\t6. ./simulate_Illumina_reads -i ref_sequence.fa -q 0 -o human "<<endl
;
103 cout
<<"\t Set simulate quality value, the ouput is in *.fa format."<<endl
;
107 void Getopt(int argc
,char *argv
[]){
109 while ((c
=getopt(argc
,argv
,"i:I:s:d:l:x:m:v:e:f:g:q:Q:c:o:h"))!=-1)
112 case 'i': InputParameter
.Input_ref1
=optarg
;break;
113 case 'I': InputParameter
.Input_ref2
=optarg
;break;
114 case 's': InputParameter
.BaseCalling_profile
=optarg
;break;
115 case 'd': InputParameter
.GC_depth_profile
=optarg
;break;
116 case 'l': InputParameter
.Read_length
=atoi(optarg
);break;
117 case 'x': InputParameter
.Coverage
=strtod(optarg
,NULL
);break;
118 case 'm': InputParameter
.Insertsize_mean
=atoi(optarg
);break;
119 case 'v': InputParameter
.Insertsize_sd
=atoi(optarg
);break;
120 case 'e': InputParameter
.Error_rate
=strtod(optarg
,NULL
);break;
121 case 'f': InputParameter
.Is_cyclization
=atoi(optarg
);break;
122 case 'g': InputParameter
.Is_simulate_GC_bias
=atoi(optarg
);break;
123 case 'q': InputParameter
.Is_simulate_quality
=atoi(optarg
);break;
124 case 'Q': InputParameter
.Q_shift
=atoi(optarg
);break;
125 case 'c': InputParameter
.Output_type
=atoi(optarg
);break;
126 case 'o': InputParameter
.Output_prefix
=optarg
;break;
127 case 'h': Usage();break;
133 int main(int argc
, char *argv
[])
135 time_t time_start
, time_end
;
136 time_start
= time(NULL
);
137 srand((unsigned)time(NULL
));
146 //set insertsize sd default value
147 if(InputParameter
.Insertsize_sd
== -1){InputParameter
.Insertsize_sd
= int(InputParameter
.Insertsize_mean
/20);}
149 if(InputParameter
.Input_ref1
== ""){cerr
<<"Error: there is not default value with option -i, please input reference sequence!"<<endl
;exit(-1);}
150 if(InputParameter
.Read_length
<= 0){cerr
<<"Error: read length should be set bigger than 0, please check option -l !"<<endl
;exit(-1);}
151 if(InputParameter
.Coverage
<= 0){cerr
<<"Error: coverage should be set bigger than 0, please check option -x !"<<endl
;exit(-1);}
152 if(InputParameter
.Insertsize_mean
< InputParameter
.Read_length
){cerr
<<"Error: insertize mean should be set bigger than read_length, please check option -m !"<<endl
;exit(-1);}
153 if(InputParameter
.Insertsize_sd
< 0){cerr
<<"Error: insertsize_sd should be set bigger than 0, please check option -v !"<<endl
;exit(-1);}
154 if(InputParameter
.Error_rate
!= -1 && InputParameter
.Error_rate
< 0 || InputParameter
.Error_rate
>= 1){cerr
<<"Error: error_rate should be set between 0 and 1, or set -1 to simulate default error rate according with error profile, please check option -e !"<<endl
;exit(-1);}
155 if(InputParameter
.Is_cyclization
!= 0 && InputParameter
.Is_cyclization
!= 1){cerr
<<"Error: Is_cyclization should be set 0 or 1, please check option -f !"<<endl
;exit(-1);}
156 if(InputParameter
.Is_simulate_GC_bias
!= 0 && InputParameter
.Is_simulate_GC_bias
!= 1){cerr
<<"Error: Is_simulate_GC_bias should be set 0 or 1, please check option -g !"<<endl
;exit(-1);}
157 if(InputParameter
.Is_simulate_quality
!= 0 && InputParameter
.Is_simulate_quality
!= 1){cerr
<<"Error: Is_simulate_quality should be set 0 or 1, please check option -q !"<<endl
;exit(-1);}
158 if(InputParameter
.Output_type
!= 0 && InputParameter
.Output_type
!= 1){cerr
<<"Error: output_type should be set 0 or 1, please check option -c !"<<endl
;exit(-1);}
160 //set the simulate cycle number
161 Simulate_Cycle_num
= InputParameter
.Read_length
*2;
162 //initialize error position distribution table
163 Error_pos_distr
= new uint64_t[Simulate_Cycle_num
+1];
164 Q_to_Erate_distr
= new double[Simulate_Cycle_num
+1];
165 for(int i
=0; i
<=Simulate_Cycle_num
; i
++)
167 Error_pos_distr
[i
] = 0;
168 Q_to_Erate_distr
[i
] = 0.0;
177 //check and open file
178 set_and_check_file( InputParameter
, infile
, infile2
, Outfile1
, Outfile2
, Gz_outfile1
, Gz_outfile2
, insert_log
, error_log
);
181 ////////////////////Load error profile///////////////////////
183 string exe_path
= argv
[0]; //program package path
185 cerr
<<"Start to preview error profile..."<<endl
;
186 //get dimensions of error profile
187 preview_BaseCalling_profile (InputParameter
, exe_path
, Ref_Base_num
, Statistical_Cycle_num
, Seq_Base_num
, Quality_num
, Statistical_average_error_rate
);
188 cerr
<< "Dimensions of error profile:\n";
189 cerr
<< " Ref_Base_num: " << Ref_Base_num
<< endl
;
190 cerr
<< " Statistical_Cycle_num: " << Statistical_Cycle_num
<< endl
;
191 cerr
<< " Seq_Base_num: " << Seq_Base_num
<< endl
;
192 cerr
<< " Quality_num: " << Quality_num
<< endl
;
193 cerr
<< " "<<InputParameter
.Read_length
<<"bp reads total average error rate: "<< Statistical_average_error_rate
<<endl
;
195 //initialize simulation matrix
196 if(InputParameter
.Is_simulate_quality
){
197 //initialize matrix table Ref_Base_num*Simulate_Cycle_num*Seq_Base_num*Quality_num
198 Simulation_matrix
= new double**[Ref_Base_num
];
199 for(int i
=0; i
<Ref_Base_num
; i
++)
201 Simulation_matrix
[i
] = new double*[Simulate_Cycle_num
];
202 for(int j
=0; j
<Simulate_Cycle_num
; j
++)
204 Simulation_matrix
[i
][j
] = new double[Seq_Base_num
*Quality_num
];
205 for(int k
=0; k
<Seq_Base_num
*Quality_num
; k
++)
207 Simulation_matrix
[i
][j
][k
] = 0;
211 }else{ //simulate fa read matrix
212 //initialize matrix table Ref_Base_num*Simulate_Cycle_num*Seq_Base_num
213 Simulation_matrix
= new double**[Ref_Base_num
];
214 for(int i
=0; i
<Ref_Base_num
; i
++)
216 Simulation_matrix
[i
] = new double*[Simulate_Cycle_num
];
217 for(int j
=0; j
<Simulate_Cycle_num
; j
++)
219 Simulation_matrix
[i
][j
] = new double[Seq_Base_num
];
220 for(int k
=0; k
<Seq_Base_num
; k
++)
222 Simulation_matrix
[i
][j
][k
] = 0;
228 //get the simulation matrix
229 load_BaseCalling_profile(InputParameter
, exe_path
, Statistical_Cycle_num
, Seq_Base_num
, Quality_num
, Statistical_average_error_rate
, Simulation_matrix
);
232 ///////////////////////////load GC bias profile///////////////////////////////
234 //get GC abundance for simulate GC bias
235 if(InputParameter
.Is_simulate_GC_bias
){
236 GC_bias_abundance
= new double[101];
237 for(int i
= 0; i
<= 100; i
++)
239 GC_bias_abundance
[i
] = 0.0;
241 load_GC_depth_profile (InputParameter
, exe_path
, GC_bias_abundance
);
245 ///////////////////////////get genome seq and start to simulate reads/////////////////////////////
248 Get_genome(infile
,infile2
);
251 /////////////////////////output error distribution and insert size distribution////////////////////////////
253 //ouput error position distribution
254 error_log
<<"**********Error rate distribution**********"<<endl
;
255 if(InputParameter
.Is_simulate_quality
){
256 error_log
<<"Cycle\tReal_error_rate\tQuality_to_error_rate"<<endl
;
257 for(int i
=1; i
<=Simulate_Cycle_num
; i
++){
258 error_log
<<i
<<"\t"<<double(Error_pos_distr
[i
])/double(Total_read_pair
)<<"\t"<<Q_to_Erate_distr
[i
]/double(Total_read_pair
)<<endl
;
259 if(i
== InputParameter
.Read_length
){error_log
<<endl
;}
262 error_log
<<"Cycle\tReal_error_rate"<<endl
;
263 for(int i
=1; i
<=Simulate_Cycle_num
; i
++){
264 error_log
<<i
<<"\t"<<double(Error_pos_distr
[i
])/double(Total_read_pair
)<<endl
;
265 if(i
== InputParameter
.Read_length
){error_log
<<endl
;}
268 delete[] Error_pos_distr
;
269 delete[] Q_to_Erate_distr
;
271 //output insert size distribution
272 insert_log
<<"**********Insert size distribution************"<<endl
273 <<"insert_size_len"<<"\t"<<"number"<<endl
;
274 map
<int, uint64_t>::const_iterator map_it
= InsertSize_distr
.begin();
275 while (map_it
!= InsertSize_distr
.end())
277 insert_log
<<map_it
->first
<<"\t"<<map_it
->second
<<endl
;
278 //cout<<map_it->second<<endl;
282 if(InputParameter
.Is_simulate_GC_bias
)
284 delete[] GC_bias_abundance
;
287 for(int i
=0; i
<Ref_Base_num
; i
++)
289 for(int j
=0; j
<Simulate_Cycle_num
; j
++)
291 delete[] Simulation_matrix
[i
][j
];
293 delete[] Simulation_matrix
[i
];
295 delete[] Simulation_matrix
;
297 time_end
= time(NULL
);
298 cerr
<<"All done! Run time: "<<time_end
-time_start
<<"s."<<endl
;
303 //get genome sequence and start to simulate read
304 void Get_genome(igzstream
&inf
,igzstream
&inf2
){
305 string line
,line2
,id
,id_line
,seq
,seq2
;
306 uint64_t readINgenome
=0;
308 while (getline(inf
,line
,'\n'))
314 //another diploid seq
315 if(InputParameter
.Input_ref2
!= ""){
316 while(getline(inf2
,line2
,'\n'))
323 cerr
<<"Have finished reading scaffold "<<id
<<endl
;
324 readINgenome
+=contral_reads_quantity(id_line
,id
,seq
,seq2
,readINgenome
);
334 cerr
<<"Have finished reading scaffold "<<id
<<endl
;
336 readINgenome
+=contral_reads_quantity(id_line
,id
,seq
,seq2
,readINgenome
);
342 int pos
=line
.find(" ");
343 line
=line
.substr(0,pos
);
349 cerr
<<"Have finished reading scaffold "<<id
<<endl
;
350 if(InputParameter
.Input_ref2
!= ""){
351 while(getline(inf2
,line2
,'\n'))
353 if(line2
[0] == '>'){continue;}
358 readINgenome
+=contral_reads_quantity(id_line
,id
,seq
,seq2
,readINgenome
);
361 //contral the quantity of simulate reads for each chromosome one by one
362 uint64_t contral_reads_quantity(string id_line
,string id
,string
&sequence
,string
&sequence2
,uint64_t read_genome
)
364 uint64_t readonchr
=0;
365 if (sequence
.size()<InputParameter
.Insertsize_mean
)
369 //convert lower case to upper case
372 uint64_t sequence_length
=sequence
.size();
373 uint64_t reads_pair_num
=0;
375 if (InputParameter
.Input_ref2
!= "")
377 reads_pair_num
=(uint64_t)(sequence_length
*InputParameter
.Coverage
/InputParameter
.Read_length
/2/2);
379 reads_pair_num
=(uint64_t)(sequence_length
*InputParameter
.Coverage
/InputParameter
.Read_length
/2);
381 Total_read_pair
= Total_read_pair
+ (uint64_t)(sequence_length
*InputParameter
.Coverage
/InputParameter
.Read_length
/2);
383 string id_1
= id
+" 1";
384 if(InputParameter
.Is_simulate_quality
){
385 readonchr
=simulate_fq_reads(sequence
,sequence_length
,reads_pair_num
,
388 readonchr
=simulate_fa_reads(sequence
,sequence_length
,reads_pair_num
,
392 //simulate reads of another diploid genome
393 if (InputParameter
.Input_ref2
!= "")
395 sequence_length
=sequence2
.size();
396 if (sequence_length
<InputParameter
.Insertsize_mean
)
400 reads_pair_num
=(uint64_t)(sequence_length
*InputParameter
.Coverage
/InputParameter
.Read_length
/2/2);
402 uint64_t readonchr2
=read_genome
+readonchr
;
403 string id_2
= id
+" 2";
404 if(InputParameter
.Is_simulate_quality
){
405 readonchr
+=simulate_fq_reads(sequence2
,sequence_length
,reads_pair_num
,
408 readonchr
+=simulate_fa_reads(sequence2
,sequence_length
,reads_pair_num
,
416 uint64_t simulate_fq_reads(string
&seq
,uint64_t seqlen
, uint64_t rd_pair
, string id_seq
, uint64_t reads_all
)
418 int whether_check_seq
= 0;
420 whether_check_seq
= 1;
423 uint64_t reads_count
=0;
424 cerr
<<"Begin to output reads"<<endl
;
425 for(uint64_t pair_count
=0; pair_count
< rd_pair
; pair_count
++)
427 //simulate insertsize
428 int insertsize
= simulate_insertsize(InputParameter
.Insertsize_mean
,InputParameter
.Insertsize_sd
);
429 if (insertsize
<InputParameter
.Read_length
){pair_count
--;continue;}
430 if (seqlen
<insertsize
){return reads_count
;}
431 uint64_t N
= 1000000000000;
432 uint64_t pos
= (uint64_t)(((uint64_t)((double)rand() / double(RAND_MAX
) * N
)) % seqlen
);
433 if (pos
+insertsize
>seqlen
){pair_count
--;continue;}
436 string sub_str
=seq
.substr(pos
,insertsize
);
438 //get read1 and read2
439 string read1
=sub_str
.substr(0,InputParameter
.Read_length
);
440 string read2
=sub_str
.substr(insertsize
-InputParameter
.Read_length
,InputParameter
.Read_length
);
442 //check whether contain 'N' or nonbase char
443 if(whether_check_seq
&& (!check_seq(read1
) || !check_seq(read2
))){
449 if(InputParameter
.Is_simulate_GC_bias
){
450 string check_seq
= read1
+read2
;
451 if(simulate_GC_bias(check_seq
,GC_bias_abundance
)){pair_count
--;continue;}
454 //insertsize statistics
455 if(InsertSize_distr
[insertsize
]>0)
457 InsertSize_distr
[insertsize
]++;
459 InsertSize_distr
[insertsize
] = 1;
464 if ( InputParameter
.Is_cyclization
== 1 )
466 read1
=reversecomplementary(read1
);
467 }else if (InputParameter
.Insertsize_mean
>0)
469 read2
=reversecomplementary(read2
);
471 cout
<<"Error:insertsize mean is smaller than 0"<<endl
;
475 int selection
=int(rand()%2); //0 or 1, for selecting output file randomly
477 string output_read1
, output_read2
, output_quality_seq1
, output_quality_seq2
;
480 vector
<int> error_pos1
;
481 vector
<char> raw_base1
;
482 for(int i
= 0; i
< InputParameter
.Read_length
; i
++)
484 double num
=double(rand())/double(RAND_MAX
);
485 char ref_base
= read1
[i
];
490 cycle
= i
+Simulate_Cycle_num
/2;
492 int location
= search_location(Simulation_matrix
[alphabet2
[ref_base
]][cycle
], Seq_Base_num
*Quality_num
, num
);
493 int call_base_num
= int(location
/Quality_num
);
494 char call_base
= Bases
[call_base_num
];
495 if(ref_base
!= call_base
){
497 Error_pos_distr
[i
+1]++;
499 Error_pos_distr
[i
+1+InputParameter
.Read_length
]++;
501 error_pos1
.push_back(i
);
502 raw_base1
.push_back(ref_base
);
504 output_read1
.push_back(call_base
);
505 int Qscore
= location
%Quality_num
;
506 char quality_value
= Qscore
+ InputParameter
.Q_shift
;
507 output_quality_seq1
.push_back(quality_value
);
509 Q_to_Erate_distr
[i
+1]+= pow(10,double(Qscore
)/double(-10));
511 Q_to_Erate_distr
[i
+1+InputParameter
.Read_length
]+= pow(10,double(Qscore
)/double(-10));
516 vector
<int> error_pos2
;
517 vector
<char> raw_base2
;
518 for(int i
= 0; i
< InputParameter
.Read_length
; i
++)
520 double num
=double(rand())/double(RAND_MAX
);
521 char ref_base
= read2
[i
];
524 cycle
= i
+Simulate_Cycle_num
/2;
529 int location
= search_location(Simulation_matrix
[alphabet2
[ref_base
]][cycle
], Seq_Base_num
*Quality_num
, num
);
530 int call_base_num
= int(location
/Quality_num
);
531 char call_base
= Bases
[call_base_num
];
532 if(ref_base
!= call_base
){
534 Error_pos_distr
[i
+1+InputParameter
.Read_length
]++;
536 Error_pos_distr
[i
+1]++;
538 error_pos2
.push_back(i
);
539 raw_base2
.push_back(ref_base
);
541 output_read2
.push_back(call_base
);
542 int Qscore
= location
%Quality_num
;
543 char quality_value
= Qscore
+ InputParameter
.Q_shift
;
544 output_quality_seq2
.push_back(quality_value
);
546 Q_to_Erate_distr
[i
+1+InputParameter
.Read_length
]+= pow(10,double(Qscore
)/double(-10));
548 Q_to_Erate_distr
[i
+1]+= pow(10,double(Qscore
)/double(-10));
552 //output simulate reads
553 if (reads_count
%10000==0)
555 cerr
<<"Output "<<reads_count
<<" pair reads"<<endl
;
562 if(!InputParameter
.Output_type
){
563 Outfile1
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
564 for(int i
= 0; i
< error_pos1
.size(); i
++)
566 Outfile1
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
568 Outfile1
<<endl
<<output_read1
<<endl
570 <<output_quality_seq1
<<endl
;
573 Outfile2
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
574 for(int i
= 0; i
< error_pos2
.size(); i
++)
576 Outfile2
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
578 Outfile2
<<endl
<<output_read2
<<endl
580 <<output_quality_seq2
<<endl
;
584 Gz_outfile1
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
585 for(int i
= 0; i
< error_pos1
.size(); i
++)
587 Gz_outfile1
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
589 Gz_outfile1
<<endl
<<output_read1
<<endl
591 <<output_quality_seq1
<<endl
;
594 Gz_outfile2
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
595 for(int i
= 0; i
< error_pos2
.size(); i
++)
597 Gz_outfile2
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
599 Gz_outfile2
<<endl
<<output_read2
<<endl
601 <<output_quality_seq2
<<endl
;
608 if(!InputParameter
.Output_type
){
609 Outfile2
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
610 for(int i
= 0; i
< error_pos1
.size(); i
++)
612 Outfile2
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
614 Outfile2
<<endl
<<output_read1
<<endl
616 <<output_quality_seq1
<<endl
;
619 Outfile1
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
620 for(int i
= 0; i
< error_pos2
.size(); i
++)
622 Outfile1
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
624 Outfile1
<<endl
<<output_read2
<<endl
626 <<output_quality_seq2
<<endl
;
630 Gz_outfile2
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
631 for(int i
= 0; i
< error_pos1
.size(); i
++)
633 Gz_outfile2
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
635 Gz_outfile2
<<endl
<<output_read1
<<endl
637 <<output_quality_seq1
<<endl
;
640 Gz_outfile1
<<"@read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
641 for(int i
= 0; i
< error_pos2
.size(); i
++)
643 Gz_outfile1
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
645 Gz_outfile1
<<endl
<<output_read2
<<endl
647 <<output_quality_seq2
<<endl
;
651 cerr
<<"Finish output reads"<<endl
;
656 uint64_t simulate_fa_reads(string
&seq
,uint64_t seqlen
, uint64_t rd_pair
, string id_seq
, uint64_t reads_all
)
658 int whether_check_seq
= 0;
660 whether_check_seq
= 1;
663 uint64_t reads_count
=0;
664 cerr
<<"Begin to output reads"<<endl
;
665 for(uint64_t pair_count
=0; pair_count
< rd_pair
; pair_count
++)
667 //simulate insertsize
668 int insertsize
= simulate_insertsize(InputParameter
.Insertsize_mean
,InputParameter
.Insertsize_sd
);
669 if (insertsize
<InputParameter
.Read_length
){pair_count
--;continue;}
670 if (seqlen
<insertsize
){return reads_count
;}
671 uint64_t N
= 1000000000000;
672 uint64_t pos
= (uint64_t)(((uint64_t)((double)rand() / double(RAND_MAX
) * N
)) % seqlen
);
673 if (pos
+insertsize
>seqlen
){pair_count
--;continue;}
676 string sub_str
=seq
.substr(pos
,insertsize
);
678 //get read1 and read2
679 string read1
=sub_str
.substr(0,InputParameter
.Read_length
);
680 string read2
=sub_str
.substr(insertsize
-InputParameter
.Read_length
,InputParameter
.Read_length
);
682 //check whether contain 'N' or nonbase char
683 if(whether_check_seq
&& (!check_seq(read1
) || !check_seq(read2
))){
689 if(InputParameter
.Is_simulate_GC_bias
){
690 string check_seq
= read1
+read2
;
691 if(simulate_GC_bias(check_seq
,GC_bias_abundance
)){pair_count
--;continue;}
694 //insertsize statistics
695 if(InsertSize_distr
[insertsize
]>0)
697 InsertSize_distr
[insertsize
]++;
699 InsertSize_distr
[insertsize
] = 1;
704 if ( InputParameter
.Is_cyclization
== 1 )
706 read1
=reversecomplementary(read1
);
707 }else if (InputParameter
.Insertsize_mean
>0)
709 read2
=reversecomplementary(read2
);
711 cout
<<"Error:insertsize mean is smaller than 0"<<endl
;
715 int selection
=int(rand()%2); //0 or 1, for selecting output file randomly
717 string output_read1
, output_read2
;
720 vector
<int> error_pos1
;
721 vector
<char> raw_base1
;
722 for(int i
= 0; i
< InputParameter
.Read_length
; i
++)
724 double num
=double(rand())/double(RAND_MAX
);
725 char ref_base
= read1
[i
];
730 cycle
= i
+Simulate_Cycle_num
/2;
733 int location
= search_location(Simulation_matrix
[alphabet2
[ref_base
]][cycle
], Seq_Base_num
, num
);
734 char call_base
= Bases
[location
];
735 if(ref_base
!= call_base
){
737 Error_pos_distr
[i
+1]++;
739 Error_pos_distr
[i
+1+InputParameter
.Read_length
]++;
741 error_pos1
.push_back(i
);
742 raw_base1
.push_back(ref_base
);
744 output_read1
.push_back(call_base
);
748 vector
<int> error_pos2
;
749 vector
<char> raw_base2
;
750 for(int i
= 0; i
< InputParameter
.Read_length
; i
++)
752 double num
=double(rand())/double(RAND_MAX
);
753 char ref_base
= read2
[i
];
756 cycle
= i
+Simulate_Cycle_num
/2;
760 int location
= search_location(Simulation_matrix
[alphabet2
[ref_base
]][cycle
], Seq_Base_num
, num
);
761 char call_base
= Bases
[location
];
762 if(ref_base
!= call_base
){
764 Error_pos_distr
[i
+1+InputParameter
.Read_length
]++;
766 Error_pos_distr
[i
+1]++;
768 error_pos2
.push_back(i
);
769 raw_base2
.push_back(ref_base
);
771 output_read2
.push_back(call_base
);
774 //output simulate reads
775 if (reads_count
%10000==0)
777 cerr
<<"Output "<<reads_count
<<" pair reads"<<endl
;
784 if(!InputParameter
.Output_type
){
786 Outfile1
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
787 for(int i
= 0; i
< error_pos1
.size(); i
++)
789 Outfile1
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
791 Outfile1
<<endl
<<output_read1
<<endl
;
794 Outfile2
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
795 for(int i
= 0; i
< error_pos2
.size(); i
++)
797 Outfile2
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
799 Outfile2
<<endl
<<output_read2
<<endl
;
801 Gz_outfile1
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
802 for(int i
= 0; i
< error_pos1
.size(); i
++)
804 Gz_outfile1
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
806 Gz_outfile1
<<endl
<<output_read1
<<endl
;
809 Gz_outfile2
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
810 for(int i
= 0; i
< error_pos2
.size(); i
++)
812 Gz_outfile2
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
814 Gz_outfile2
<<endl
<<output_read2
<<endl
;
820 if(!InputParameter
.Output_type
){
821 Outfile2
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
822 for(int i
= 0; i
< error_pos1
.size(); i
++)
824 Outfile2
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
826 Outfile2
<<endl
<<output_read1
<<endl
;
829 Outfile1
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
830 for(int i
= 0; i
< error_pos2
.size(); i
++)
832 Outfile1
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
834 Outfile1
<<endl
<<output_read2
<<endl
;
837 Gz_outfile2
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<2<<" "<<id_seq
<<" "<<pos
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
838 for(int i
= 0; i
< error_pos1
.size(); i
++)
840 Gz_outfile2
<<error_pos1
[i
]+1<<","<<raw_base1
[i
]<<";";
842 Gz_outfile2
<<endl
<<output_read1
<<endl
;
845 Gz_outfile1
<<">read_"<<InputParameter
.Insertsize_mean
<<"_"<<reads_count
+reads_all
<<"/"<<1<<" "<<id_seq
<<" "<<pos
+insertsize
-InputParameter
.Read_length
+1<<" "<<InputParameter
.Read_length
<<" "<<insertsize
<<" ";
846 for(int i
= 0; i
< error_pos2
.size(); i
++)
848 Gz_outfile1
<<error_pos2
[i
]+1<<","<<raw_base2
[i
]<<";";
850 Gz_outfile1
<<endl
<<output_read2
<<endl
;
855 cerr
<<"Finish output reads"<<endl
;