7 #include <boost/lexical_cast.hpp>
8 #include <boost/algorithm/string.hpp>
11 #include "load_file.h"
13 //from ASCII of A C G T to 0 1 2 3, auto dealing with upper or lower case.
14 //8bit char type, A=a=0, C=c=1, G=g=2, T=t=3, others as 4.
17 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
18 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
19 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
20 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
21 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
22 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
23 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
24 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
27 //check and open the outfile file
28 void set_and_check_file(PARAMETER InputParameter
, igzstream
&infile
, igzstream
&infile2
, ofstream
&outfile1
, ofstream
&outfile2
,
29 ogzstream
&gz_outfile1
, ogzstream
&gz_outfile2
, ofstream
&insert_log
, ofstream
&error_log
)
31 //set and open output file
32 infile
.open(InputParameter
.Input_ref1
.c_str());
35 cerr
<<"Error:unable to open input file:"<<InputParameter
.Input_ref1
<<endl
;
39 if(InputParameter
.Input_ref2
!="")
41 infile2
.open(InputParameter
.Input_ref2
.c_str());
44 cerr
<<"Error:unable to open input file:"<<InputParameter
.Input_ref2
<<endl
;
49 string output_file1
, output_file2
, output_insert_distr
, output_error_distr
;
50 string infix_name
= "_"+boost::lexical_cast
<std::string
>(InputParameter
.Read_length
)+"_"+boost::lexical_cast
<std::string
>(InputParameter
.Insertsize_mean
);
52 if(InputParameter
.Is_simulate_quality
){
53 output_file1
= InputParameter
.Output_prefix
+infix_name
+"_1.fq";
55 output_file2
= InputParameter
.Output_prefix
+infix_name
+"_2.fq";
57 output_file1
= InputParameter
.Output_prefix
+infix_name
+"_1.fa";
59 output_file2
= InputParameter
.Output_prefix
+infix_name
+"_2.fa";
62 if(InputParameter
.Output_type
== 1){
67 //insert size distribution file
68 output_insert_distr
= InputParameter
.Output_prefix
+infix_name
+".insertsize.distr";
69 //error rate distribution file
70 output_error_distr
= InputParameter
.Output_prefix
+infix_name
+".error_rate.distr";
72 if(!InputParameter
.Output_type
){
73 outfile1
.open(output_file1
.c_str());
74 outfile2
.open(output_file2
.c_str());
76 gz_outfile1
.open(output_file1
.c_str());
77 gz_outfile2
.open(output_file2
.c_str());
80 insert_log
.open(output_insert_distr
.c_str());
81 error_log
.open(output_error_distr
.c_str());
84 if(!InputParameter
.Output_type
)
86 if (!outfile1
|| !outfile2
)
88 cerr
<<"Error:unable to open output file."<<endl
;
92 if (!gz_outfile1
|| !gz_outfile2
)
94 cerr
<<"Error:unable to open output file."<<endl
;
99 if(!insert_log
){cerr
<<"Error:unalbe to open output insertsize distribution file."<<endl
;exit(1);}
100 if(!error_log
){cerr
<<"Error:unalbe to open output error rate distribution file."<<endl
;exit(1);}
103 //get the attribute of Base-calling profile
104 void preview_BaseCalling_profile (PARAMETER InputParameter
, string exe_path
, int &ref_Base_num
,
105 int &statistical_Cycle_num
, int &seq_Base_num
, int &quality_num
, double &statistical_average_error_rate
)
109 if(InputParameter
.BaseCalling_profile
== ""){
110 int index
= exe_path
.find_last_of('/');
112 cerr
<<"Error: program path wrong!"<<endl
;
115 string directory_path
= exe_path
.substr(0,index
);
116 matrix_file
= directory_path
+ "/Profiles/Base-calling_profile/hum20110701.bwanosnp.count.matrix";
119 matrix_file
= InputParameter
.BaseCalling_profile
;
123 infile
.open(matrix_file
.c_str());
126 cerr
<< "fail to open input file" << matrix_file
<< endl
;
128 uint64_t total_count_sum
= 0;
129 uint64_t total_error_sum
= 0;
130 uint64_t total_correct_sum
= 0;
133 while (getline( infile
, lineStr
, '\n' ))
134 { if (lineStr
[0] == '#')
135 { if (lineStr
[1] == 'D' && lineStr
[2] == 'i' && lineStr
[3] == 'm')
136 { vector
<string
> lineVec
;
137 boost::split(lineVec
,lineStr
, boost::is_any_of(":,; \t\n"), boost::token_compress_on
);
138 ref_Base_num
= atoi(lineVec
[2].c_str());
139 statistical_Cycle_num
= atoi(lineVec
[4].c_str());
141 if(InputParameter
.Read_length
> statistical_Cycle_num
/2){cerr
<<"Error: according to the Base-calling profile, program can be simulate "<< statistical_Cycle_num
/2 <<"bp read length at most, please set read length again!"<<endl
;exit(-1);}
142 seq_Base_num
= atoi(lineVec
[6].c_str());
143 quality_num
= atoi(lineVec
[8].c_str());
152 if(lineStr
== ""){continue;}
153 vector
<string
> lineVec
;
154 boost::split(lineVec
,lineStr
, boost::is_any_of(" \t\n"), boost::token_compress_on
);
156 char ref_base
= lineVec
[0][0];
157 int current_cycle
= atoi(lineVec
[1].c_str());
159 if(current_cycle
<= InputParameter
.Read_length
|| (current_cycle
>= statistical_Cycle_num
/2 && current_cycle
<= statistical_Cycle_num
/2+InputParameter
.Read_length
)){
160 for(int i
= 0; i
< seq_Base_num
; i
++)
162 for(int j
= 0; j
< quality_num
; j
++)
164 string num
= lineVec
[i
* quality_num
+ j
+ 2];
165 uint64_t freqnum
= boost::lexical_cast
<uint64_t>(num
);
167 total_count_sum
+=freqnum
;
168 if( i
== alphabet3
[ref_base
]) // ACGT
170 total_correct_sum
+=freqnum
;
172 total_error_sum
+=freqnum
;
180 //get BaseCalling profile average error rate
181 statistical_average_error_rate
= double(total_error_sum
)/double(total_count_sum
);
185 //read in quality distribution file and get the quality distribution table.
186 void load_BaseCalling_profile(PARAMETER InputParameter
, string exe_path
, int statistical_Cycle_num
, int seq_Base_num
,
187 int quality_num
, double statistical_average_error_rate
, double*** simulation_matrix
)
190 if(InputParameter
.BaseCalling_profile
== ""){
191 int index
= exe_path
.find_last_of('/');
193 cerr
<<"Error: program path wrong!"<<endl
;
196 string directory_path
= exe_path
.substr(0,index
);
197 matrix_file
= directory_path
+ "/Profiles/Base-calling_profile/hum20110701.bwanosnp.count.matrix";
200 matrix_file
= InputParameter
.BaseCalling_profile
;
203 igzstream MatrixFile
;
204 MatrixFile
.open(matrix_file
.c_str());
206 { cerr
<< "fail to open input file " << matrix_file
<<", make sure statistics file place in program directory!"<< endl
;
211 cerr
<<"Start to construct simulation matrix..."<<endl
212 <<"Loading file: "<<matrix_file
<<endl
;
215 //user_error_rate/profile_error_rate
217 if(InputParameter
.Error_rate
== -1){
220 E_ratio
= InputParameter
.Error_rate
/statistical_average_error_rate
;
223 int simulate_cycle
= 0;
224 while(getline(MatrixFile
, str_line
, '\n'))
226 if(str_line
[0] == '#' || str_line
== ""){continue;}
228 vector
<string
> str_line_tokens
;
229 boost::split(str_line_tokens
,str_line
, boost::is_any_of(" \t\n"), boost::token_compress_on
);
230 char ref_base
= str_line_tokens
[0][0];
231 int current_cycle
= atoi(str_line_tokens
[1].c_str());
233 if(current_cycle
== 1){simulate_cycle
= 0;}
234 if(current_cycle
<= InputParameter
.Read_length
|| (current_cycle
> statistical_Cycle_num
/2 && current_cycle
<= statistical_Cycle_num
/2+InputParameter
.Read_length
))
237 uint64_t error_sum
= 0;
238 uint64_t count_sum
= 0;
239 uint64_t correct_sum
= 0;
240 //get the error rate of current cycle
241 for(int i
= 0; i
< seq_Base_num
; i
++)
243 for(int j
= 0; j
< quality_num
; j
++)
245 string num
= str_line_tokens
[i
* quality_num
+ j
+ 2];
246 uint64_t current_num
= boost::lexical_cast
<uint64_t>(num
);
247 count_sum
+=current_num
;
248 if( i
== alphabet3
[ref_base
]) // ACGT
250 correct_sum
+=current_num
;
252 error_sum
+=current_num
;
256 double this_cycle_error_rate
= double(error_sum
)/double(count_sum
);
258 if(InputParameter
.Is_simulate_quality
) //for simulating fastq
260 //convert profile error matrix to simulation error matrix
261 for(int i
= 0; i
< seq_Base_num
; i
++)
263 for(int j
= 0; j
< quality_num
; j
++)
265 string num
= str_line_tokens
[i
* quality_num
+ j
+ 2];
266 uint64_t current_num
= boost::lexical_cast
<uint64_t>(num
);
267 count_sum
+=current_num
;
268 if( i
== alphabet3
[ref_base
]) // ACGT
270 //the correct call base number : current_num*(1-this_cycle_error_rate*E/e)/(1-this_cycle_error_rate)
271 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
*quality_num
+j
] = current_num
*(1-this_cycle_error_rate
*E_ratio
)/(1-this_cycle_error_rate
);
272 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
274 //the error call base number: current_num*E_ratio
275 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
*quality_num
+j
] = current_num
*E_ratio
;
276 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
282 //the cumulative rate
284 for(int i
= 0; i
< seq_Base_num
*quality_num
; i
++)
286 sum
+= simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
];
290 for(int i
= 0; i
< seq_Base_num
*quality_num
; i
++)
292 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
] = 0;
295 double accumulate_value
= 0;
296 for(int i
= 0; i
< seq_Base_num
*quality_num
; i
++)
298 accumulate_value
+= simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
];
299 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
] = accumulate_value
/sum
;
300 // cerr << simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] <<" ";
304 //for simulating fasta
305 //convert profile error matrix to simulation error matrix
306 for(int i
= 0; i
< seq_Base_num
; i
++)
308 for(int j
= 0; j
< quality_num
; j
++)
310 string num
= str_line_tokens
[i
* quality_num
+ j
+ 2];
311 uint64_t current_num
= boost::lexical_cast
<uint64_t>(num
);
312 count_sum
+=current_num
;
313 if( i
== alphabet3
[ref_base
]) // ACGT
315 //the correct call base number : current_num*(1-this_cycle_error_rate*E/e)/(1-this_cycle_error_rate)
316 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
] += current_num
*(1-this_cycle_error_rate
*E_ratio
)/(1-this_cycle_error_rate
);
317 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
319 //the error call base number: current_num*E_ratio
320 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
] += current_num
*E_ratio
;
321 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
327 //the cumulative rate
329 for(int i
= 0; i
< seq_Base_num
; i
++)
331 sum
+= simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
];
335 for(int i
= 0; i
< seq_Base_num
; i
++)
337 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
] = 0;
340 double accumulate_value
= 0;
341 for(int i
= 0; i
< seq_Base_num
; i
++)
343 accumulate_value
+= simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
];
344 simulation_matrix
[alphabet3
[ref_base
]][simulate_cycle
-1][i
] = accumulate_value
/sum
;
345 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i]<<" ";
354 cerr
<<"Have finished constructing Base-calling simulation matrix"<<endl
;
357 void load_GC_depth_profile (PARAMETER InputParameter
, string exe_path
, double* GC_bias_abundance
)
361 if(InputParameter
.GC_depth_profile
== ""){
362 int index
= exe_path
.find_last_of('/');
364 cerr
<<"Error: program path wrong!"<<endl
;
367 //The default GC bias profile are determined based on the twice length of read
368 string GC_depth_profile_name
;
369 int window_size
= InputParameter
.Read_length
* 2;
370 if(window_size
< 125){
371 GC_depth_profile_name
= "gcdep100.txt";
373 if(window_size
< 175){
374 GC_depth_profile_name
= "gcdep150.txt";
376 GC_depth_profile_name
= "gcdep200.txt";
379 string directory_path
= exe_path
.substr(0,index
);
380 depth_file
= directory_path
+ "/Profiles/GC-depth_profile/"+GC_depth_profile_name
;
383 depth_file
= InputParameter
.GC_depth_profile
;
387 infile
.open(depth_file
.c_str());
390 cerr
<< "fail to open input file" << depth_file
<< endl
;
392 cerr
<< "Loading the GC bias profile: "<< depth_file
<<endl
;
394 uint64_t total_count_sum
= 0;
395 uint64_t total_error_sum
= 0;
398 vector
<double> GC_ratio_vec
;
399 vector
<double> depth_vec
;
400 while (getline( infile
, lineStr
, '\n' ))
401 { if (lineStr
[0] == '#' || lineStr
== "" || lineStr
[0] == ' ')
407 vector
<string
> lineVec
;
408 //#GC% RefCnt DepthCnt Mean Small Q1 Mid Q3 Big Min Max Refcntcal
409 //0.5 2285822 1371 0.334179 0 0 0 0.2 0.49 0 23.48 0.0945074
411 boost::split(lineVec
,lineStr
, boost::is_any_of(" \t\n"), boost::token_compress_on
);
412 GC_ratio_vec
.push_back(boost::lexical_cast
<double>(lineVec
[0])); //GC%
413 depth_vec
.push_back(boost::lexical_cast
<double>(lineVec
[3])); //Mean
418 double max_depth
= 0;
419 for(int i
= 0; i
< depth_vec
.size(); i
++)
421 if(depth_vec
[i
] > max_depth
){
422 max_depth
= depth_vec
[i
];
426 //convert to GC abundance
427 for(int i
= 1; i
< GC_ratio_vec
.size(); i
++)
429 for(int j
= int(GC_ratio_vec
[i
-1]); j
< int(GC_ratio_vec
[i
]); j
++)
431 GC_bias_abundance
[j
] = depth_vec
[i
-1]/max_depth
;
434 for(int i
= int(GC_ratio_vec
[GC_ratio_vec
.size()-1]); i
<=100; i
++)
436 GC_bias_abundance
[i
] = depth_vec
[depth_vec
.size()-1]/max_depth
;
440 for(int i
= 0; i
<=100; i
++)
442 cerr
<<"GC%:"<<i
<<"\t"<<GC_bias_abundance
[i
]<<endl
;
446 cerr
<<"Have finished constructing GC bias simulation matrix"<<endl
;