modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / pIRS_simulator / load_file.cpp
blob6eed5ad1c81a88358f6267fac843a9241e5b0d96
1 #include <iostream>
2 #include <vector>
3 #include <string>
4 #include <fstream>
5 #include <math.h>
6 #include <map>
7 #include <boost/lexical_cast.hpp>
8 #include <boost/algorithm/string.hpp>
9 #include "gzstream.h"
10 #include "global.h"
11 #include "load_file.h"
13 //from ASCII of A C G T to 0 1 2 3, auto dealing with upper or lower case.
14 //8bit char type, A=a=0, C=c=1, G=g=2, T=t=3, others as 4.
15 char alphabet3[128] =
17 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
18 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
19 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
20 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
21 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
22 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
23 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
24 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
27 //check and open the outfile file
28 void set_and_check_file(PARAMETER InputParameter, igzstream &infile, igzstream &infile2, ofstream &outfile1, ofstream &outfile2,
29 ogzstream &gz_outfile1, ogzstream &gz_outfile2, ofstream &insert_log, ofstream &error_log)
31 //set and open output file
32 infile.open(InputParameter.Input_ref1.c_str());
33 if (!infile)
35 cerr<<"Error:unable to open input file:"<<InputParameter.Input_ref1<<endl;
36 exit(-1);
39 if(InputParameter.Input_ref2!="")
41 infile2.open(InputParameter.Input_ref2.c_str());
42 if (!infile2)
44 cerr<<"Error:unable to open input file:"<<InputParameter.Input_ref2<<endl;
45 exit(-1);
49 string output_file1, output_file2, output_insert_distr, output_error_distr;
50 string infix_name = "_"+boost::lexical_cast <std::string>(InputParameter.Read_length)+"_"+boost::lexical_cast <std::string>(InputParameter.Insertsize_mean);
51 //read file1
52 if(InputParameter.Is_simulate_quality){
53 output_file1 = InputParameter.Output_prefix+infix_name+"_1.fq";
54 //read file2
55 output_file2 = InputParameter.Output_prefix+infix_name+"_2.fq";
56 }else{
57 output_file1 = InputParameter.Output_prefix+infix_name+"_1.fa";
58 //read file2
59 output_file2 = InputParameter.Output_prefix+infix_name+"_2.fa";
61 //compress ouput
62 if(InputParameter.Output_type == 1){
63 output_file1+=".gz";
64 output_file2+=".gz";
67 //insert size distribution file
68 output_insert_distr = InputParameter.Output_prefix+infix_name+".insertsize.distr";
69 //error rate distribution file
70 output_error_distr = InputParameter.Output_prefix+infix_name+".error_rate.distr";
71 //open file
72 if(!InputParameter.Output_type){
73 outfile1.open(output_file1.c_str());
74 outfile2.open(output_file2.c_str());
75 }else{
76 gz_outfile1.open(output_file1.c_str());
77 gz_outfile2.open(output_file2.c_str());
80 insert_log.open(output_insert_distr.c_str());
81 error_log.open(output_error_distr.c_str());
83 //check file
84 if(!InputParameter.Output_type)
86 if (!outfile1 || !outfile2)
88 cerr<<"Error:unable to open output file."<<endl;
89 exit(1);
91 }else{
92 if (!gz_outfile1 || !gz_outfile2)
94 cerr<<"Error:unable to open output file."<<endl;
95 exit(1);
99 if(!insert_log){cerr<<"Error:unalbe to open output insertsize distribution file."<<endl;exit(1);}
100 if(!error_log){cerr<<"Error:unalbe to open output error rate distribution file."<<endl;exit(1);}
103 //get the attribute of Base-calling profile
104 void preview_BaseCalling_profile (PARAMETER InputParameter, string exe_path, int &ref_Base_num,
105 int &statistical_Cycle_num, int &seq_Base_num, int &quality_num, double &statistical_average_error_rate)
107 string matrix_file;
109 if(InputParameter.BaseCalling_profile == ""){
110 int index = exe_path.find_last_of('/');
111 if(index == -1){
112 cerr<<"Error: program path wrong!"<<endl;
114 else{
115 string directory_path = exe_path.substr(0,index);
116 matrix_file = directory_path + "/Profiles/Base-calling_profile/hum20110701.bwanosnp.count.matrix";
118 }else{
119 matrix_file = InputParameter.BaseCalling_profile;
122 igzstream infile;
123 infile.open(matrix_file.c_str());
124 if ( ! infile )
126 cerr << "fail to open input file" << matrix_file << endl;
128 uint64_t total_count_sum = 0;
129 uint64_t total_error_sum = 0;
130 uint64_t total_correct_sum = 0;
132 string lineStr;
133 while (getline( infile, lineStr, '\n' ))
134 { if (lineStr[0] == '#')
135 { if (lineStr[1] == 'D' && lineStr[2] == 'i' && lineStr[3] == 'm')
136 { vector<string> lineVec;
137 boost::split(lineVec,lineStr, boost::is_any_of(":,; \t\n"), boost::token_compress_on);
138 ref_Base_num = atoi(lineVec[2].c_str());
139 statistical_Cycle_num = atoi(lineVec[4].c_str());
141 if(InputParameter.Read_length > statistical_Cycle_num/2){cerr<<"Error: according to the Base-calling profile, program can be simulate "<< statistical_Cycle_num/2 <<"bp read length at most, please set read length again!"<<endl;exit(-1);}
142 seq_Base_num = atoi(lineVec[6].c_str());
143 quality_num = atoi(lineVec[8].c_str());
145 else
147 continue;
150 else
152 if(lineStr == ""){continue;}
153 vector<string> lineVec;
154 boost::split(lineVec,lineStr, boost::is_any_of(" \t\n"), boost::token_compress_on);
156 char ref_base = lineVec[0][0];
157 int current_cycle = atoi(lineVec[1].c_str());
159 if(current_cycle <= InputParameter.Read_length || (current_cycle >= statistical_Cycle_num/2 && current_cycle <= statistical_Cycle_num/2+InputParameter.Read_length)){
160 for(int i = 0; i < seq_Base_num; i++)
162 for(int j = 0; j < quality_num; j++)
164 string num = lineVec[i * quality_num + j + 2];
165 uint64_t freqnum = boost::lexical_cast<uint64_t>(num);
167 total_count_sum+=freqnum;
168 if( i == alphabet3[ref_base]) // ACGT
170 total_correct_sum+=freqnum;
171 }else{
172 total_error_sum+=freqnum;
180 //get BaseCalling profile average error rate
181 statistical_average_error_rate = double(total_error_sum)/double(total_count_sum);
185 //read in quality distribution file and get the quality distribution table.
186 void load_BaseCalling_profile(PARAMETER InputParameter, string exe_path, int statistical_Cycle_num, int seq_Base_num,
187 int quality_num, double statistical_average_error_rate, double*** simulation_matrix)
189 string matrix_file;
190 if(InputParameter.BaseCalling_profile == ""){
191 int index = exe_path.find_last_of('/');
192 if(index == -1){
193 cerr<<"Error: program path wrong!"<<endl;
195 else{
196 string directory_path = exe_path.substr(0,index);
197 matrix_file = directory_path + "/Profiles/Base-calling_profile/hum20110701.bwanosnp.count.matrix";
199 }else{
200 matrix_file = InputParameter.BaseCalling_profile;
203 igzstream MatrixFile;
204 MatrixFile.open(matrix_file.c_str());
205 if ( ! MatrixFile )
206 { cerr << "fail to open input file " << matrix_file <<", make sure statistics file place in program directory!"<< endl;
207 exit(-1);
210 string str_line;
211 cerr <<"Start to construct simulation matrix..."<<endl
212 <<"Loading file: "<<matrix_file<<endl;
215 //user_error_rate/profile_error_rate
216 double E_ratio = 0;
217 if(InputParameter.Error_rate == -1){
218 E_ratio = 1;
219 }else{
220 E_ratio = InputParameter.Error_rate/statistical_average_error_rate;
223 int simulate_cycle = 0;
224 while(getline(MatrixFile, str_line, '\n'))
226 if(str_line[0] == '#' || str_line == ""){continue;}
228 vector<string> str_line_tokens;
229 boost::split(str_line_tokens,str_line, boost::is_any_of(" \t\n"), boost::token_compress_on);
230 char ref_base = str_line_tokens[0][0];
231 int current_cycle = atoi(str_line_tokens[1].c_str());
233 if(current_cycle == 1){simulate_cycle = 0;}
234 if(current_cycle <= InputParameter.Read_length || (current_cycle > statistical_Cycle_num/2 && current_cycle <= statistical_Cycle_num/2+InputParameter.Read_length))
236 simulate_cycle++;
237 uint64_t error_sum = 0;
238 uint64_t count_sum = 0;
239 uint64_t correct_sum = 0;
240 //get the error rate of current cycle
241 for(int i = 0; i < seq_Base_num; i++)
243 for(int j = 0; j < quality_num; j++)
245 string num = str_line_tokens[i * quality_num + j + 2];
246 uint64_t current_num = boost::lexical_cast<uint64_t>(num);
247 count_sum+=current_num;
248 if( i == alphabet3[ref_base]) // ACGT
250 correct_sum+=current_num;
251 }else{
252 error_sum+=current_num;
256 double this_cycle_error_rate = double(error_sum)/double(count_sum);
258 if(InputParameter.Is_simulate_quality) //for simulating fastq
260 //convert profile error matrix to simulation error matrix
261 for(int i = 0; i < seq_Base_num; i++)
263 for(int j = 0; j < quality_num; j++)
265 string num = str_line_tokens[i * quality_num + j + 2];
266 uint64_t current_num = boost::lexical_cast<uint64_t>(num);
267 count_sum+=current_num;
268 if( i == alphabet3[ref_base]) // ACGT
270 //the correct call base number : current_num*(1-this_cycle_error_rate*E/e)/(1-this_cycle_error_rate)
271 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j] = current_num*(1-this_cycle_error_rate*E_ratio)/(1-this_cycle_error_rate);
272 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
273 }else{
274 //the error call base number: current_num*E_ratio
275 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j] = current_num*E_ratio;
276 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
280 // cerr << endl;
282 //the cumulative rate
283 double sum = 0;
284 for(int i = 0; i < seq_Base_num*quality_num; i++)
286 sum += simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i];
288 //cerr<<sum<<endl;
289 if(sum == 0){
290 for(int i = 0; i < seq_Base_num*quality_num; i++)
292 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] = 0;
294 }else{
295 double accumulate_value = 0;
296 for(int i = 0; i < seq_Base_num*quality_num; i++)
298 accumulate_value += simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i];
299 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] = accumulate_value/sum;
300 // cerr << simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] <<" ";
303 }else{
304 //for simulating fasta
305 //convert profile error matrix to simulation error matrix
306 for(int i = 0; i < seq_Base_num; i++)
308 for(int j = 0; j < quality_num; j++)
310 string num = str_line_tokens[i * quality_num + j + 2];
311 uint64_t current_num = boost::lexical_cast<uint64_t>(num);
312 count_sum+=current_num;
313 if( i == alphabet3[ref_base]) // ACGT
315 //the correct call base number : current_num*(1-this_cycle_error_rate*E/e)/(1-this_cycle_error_rate)
316 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] += current_num*(1-this_cycle_error_rate*E_ratio)/(1-this_cycle_error_rate);
317 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
318 }else{
319 //the error call base number: current_num*E_ratio
320 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] += current_num*E_ratio;
321 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i*quality_num+j]<<" ";
325 //cerr << endl;
327 //the cumulative rate
328 double sum = 0;
329 for(int i = 0; i < seq_Base_num; i++)
331 sum += simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i];
333 //cerr<<sum<<endl;
334 if(sum == 0){
335 for(int i = 0; i < seq_Base_num; i++)
337 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] = 0;
339 }else{
340 double accumulate_value = 0;
341 for(int i = 0; i < seq_Base_num; i++)
343 accumulate_value += simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i];
344 simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i] = accumulate_value/sum;
345 //cerr<<simulation_matrix[alphabet3[ref_base]][simulate_cycle-1][i]<<" ";
350 //cerr << endl;
354 cerr <<"Have finished constructing Base-calling simulation matrix"<<endl;
357 void load_GC_depth_profile (PARAMETER InputParameter, string exe_path, double* GC_bias_abundance)
359 string depth_file;
361 if(InputParameter.GC_depth_profile == ""){
362 int index = exe_path.find_last_of('/');
363 if(index == -1){
364 cerr<<"Error: program path wrong!"<<endl;
366 else{
367 //The default GC bias profile are determined based on the twice length of read
368 string GC_depth_profile_name;
369 int window_size = InputParameter.Read_length * 2;
370 if(window_size < 125){
371 GC_depth_profile_name = "gcdep100.txt";
372 }else{
373 if(window_size < 175){
374 GC_depth_profile_name = "gcdep150.txt";
375 }else{
376 GC_depth_profile_name = "gcdep200.txt";
379 string directory_path = exe_path.substr(0,index);
380 depth_file = directory_path + "/Profiles/GC-depth_profile/"+GC_depth_profile_name;
382 }else{
383 depth_file = InputParameter.GC_depth_profile;
386 igzstream infile;
387 infile.open(depth_file.c_str());
388 if ( ! infile )
390 cerr << "fail to open input file" << depth_file << endl;
391 }else{
392 cerr << "Loading the GC bias profile: "<< depth_file <<endl;
394 uint64_t total_count_sum = 0;
395 uint64_t total_error_sum = 0;
397 string lineStr;
398 vector<double> GC_ratio_vec;
399 vector<double> depth_vec;
400 while (getline( infile, lineStr, '\n' ))
401 { if (lineStr[0] == '#' || lineStr == "" || lineStr[0] == ' ')
403 continue;
405 else
407 vector<string> lineVec;
408 //#GC% RefCnt DepthCnt Mean Small Q1 Mid Q3 Big Min Max Refcntcal
409 //0.5 2285822 1371 0.334179 0 0 0 0.2 0.49 0 23.48 0.0945074
410 //............
411 boost::split(lineVec,lineStr, boost::is_any_of(" \t\n"), boost::token_compress_on);
412 GC_ratio_vec.push_back(boost::lexical_cast<double>(lineVec[0])); //GC%
413 depth_vec.push_back(boost::lexical_cast<double>(lineVec[3])); //Mean
417 //find max depth
418 double max_depth = 0;
419 for(int i = 0; i < depth_vec.size(); i++)
421 if(depth_vec[i] > max_depth){
422 max_depth = depth_vec[i];
426 //convert to GC abundance
427 for(int i = 1; i < GC_ratio_vec.size(); i++)
429 for(int j = int(GC_ratio_vec[i-1]); j < int(GC_ratio_vec[i]); j++)
431 GC_bias_abundance[j] = depth_vec[i-1]/max_depth;
434 for(int i = int(GC_ratio_vec[GC_ratio_vec.size()-1]); i <=100; i++)
436 GC_bias_abundance[i] = depth_vec[depth_vec.size()-1]/max_depth;
440 for(int i = 0; i <=100; i++)
442 cerr<<"GC%:"<<i<<"\t"<<GC_bias_abundance[i]<<endl;
446 cerr <<"Have finished constructing GC bias simulation matrix"<<endl;