modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / pIRS_simulator / simulate.cpp
blob3e653cccfb4354aff2387ed2c383291a2121e7c8
1 #include <iostream>
2 #include <vector>
3 #include <string>
4 #include <fstream>
5 #include <math.h>
6 #include <map>
7 #include <boost/lexical_cast.hpp>
8 #include <boost/algorithm/string.hpp>
9 #include "simulate.h"
11 //from ASCII of A(N) C G T to 0 1 2 3, auto dealing with upper or lower case.
12 //8bit char type, A=a=N=n=0, C=c=1, G=g=2, T=t=3, others as 4.
13 char alphabet[128] =
15 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
16 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
17 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
18 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
19 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 0, 4,
20 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
21 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 0, 4,
22 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
26 //from ASCII of A C G T to 0 1 2 3, auto dealing with upper or lower case.
27 //8bit char type, A=a=0, C=c=1, G=g=2, T=t=3, others as 4.
28 char alphabet2[128] =
30 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
31 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
32 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
33 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
34 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
35 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
36 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
37 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
40 //from 0 1 2 3 4 to A C G T N
41 char Bases[5] ={
42 'A', 'C', 'G', 'T', 'N'
45 //from 0 1 2 3 4 to T G C A N
46 char c_bases[5] ={
47 'T', 'G', 'C', 'A', 'N'
50 //check whether a sequence contain non base characters, such as "N"
51 int check_seq (string &seq)
53 int is_good = 1;
54 for (int i = 0; i < seq.size(); i++)
55 { if (alphabet2[seq[i]] == 4)
56 { is_good = 0;
57 break;
60 return is_good;
64 //Rrealization of snp
65 char get_snp_match(char base, double snp_bias[]){
66 char bases[4][3]={{'G','T','C'}, //A->G transition A->C/T transvertion
67 {'C','G','A'}, //T->C transition T->G/A transvertion
68 {'T','A','G'}, //C->T transition C->A/T transvertion
69 {'A','T','C'}}; //G->A transition G->T/C transvertion
71 int a;
72 char n='N';
73 switch (base)
75 case 'A': a=0;break;
76 case 'T': a=1;break;
77 case 'C': a=2;break;
78 case 'G': a=3;break;
79 default: return n;
82 double num = double(rand())/double(RAND_MAX);
83 int i = 0;
84 for(i=0; i<3; i++)
86 double p = snp_bias[i];
87 if(num <= p){break;}
89 return bases[a][i];
93 //Produce heterozygous SNPs in multiploid
94 string Get_snp(string &seq,ofstream &snp,string id, double hetersnp_rate, double snp_bias[]){
95 map<uint64_t,string> snp_lst;
96 uint64_t seq_len=seq.size();
97 uint64_t N = 1000000000000;//the max random value
98 uint64_t snp_num = 0;
99 for(uint64_t seq_index = 0; seq_index < seq_len; seq_index++)
101 if(seq[seq_index] == 'N'){continue;}
102 double random_num = (double)rand() / double(RAND_MAX);
103 if(random_num <= hetersnp_rate)
105 string ss = boost::lexical_cast <std::string>(seq[seq_index]) + "\t";
106 //get snp base
107 seq[seq_index]=get_snp_match(seq[seq_index], snp_bias);
108 ss += boost::lexical_cast <std::string>(seq[seq_index]);
109 //put into list file
110 snp_lst[seq_index+1] = ss;
111 snp_num++;
115 // snp<<id<<" total SNP number: "<<snp_num<<endl;
116 map<uint64_t, string>::const_iterator map_it = snp_lst.begin();
117 while (map_it != snp_lst.end())
119 snp<<id<<"\t"<<map_it->first<<"\t"<<map_it->second<<endl;
120 map_it++;
123 return seq;
126 string Get_invertion(string &seq, ofstream &invertion_file, string id, double SV_rate)
128 uint64_t N = 1000000000000; //the max random value
129 uint64_t seq_len=seq.size();
130 double invertion_rate = SV_rate/3;
131 // cerr<< invertion_rate <<endl;
132 int invertion_len[5] = {100, 200, 500, 1000, 2000};
133 double array[5] = {0.70, 0.90, 0.97, 0.99, 1.0};
134 map<uint64_t,int> invertion_lst;
135 uint64_t invertion_num = 0;
136 for(uint64_t seq_index = 0; seq_index < seq_len; seq_index++)
138 double random_num = (double)rand() / double(RAND_MAX);
139 if(random_num <= invertion_rate)
141 // cerr << random_num<<endl;
142 double random_num2 = (double)rand() / double(RAND_MAX);
143 // cerr << random_num2<<endl;
144 int j = 0;
145 for(j=0; j<5; j++)
147 if(random_num2 <= array[j])
149 break;
152 if(seq_index+invertion_len[j]>seq_len){continue;}
153 string sub_seq = seq.substr(seq_index,invertion_len[j]);
154 if(!check_seq(sub_seq)){continue;} //contain 'N' or other nonbases char
155 string rc_sub_seq = reversecomplementary(sub_seq);
157 for(int i = 0; i < rc_sub_seq.size(); i++)
159 seq[seq_index+i] = rc_sub_seq[i];
162 invertion_lst[seq_index] = invertion_len[j];
163 invertion_num++;
167 // invertion_file<<id<<" total invertion number: "<<invertion_num<<endl;
168 map<uint64_t, int>::const_iterator map_it = invertion_lst.begin();
169 while (map_it != invertion_lst.end())
171 invertion_file<<id<<"\t"<<map_it->first<<"\t"<<map_it->second<<endl;
172 map_it++;
175 return seq;
178 //Getting insert sequence when heterozygous indel rate is bigger than 0
179 string get_insertion(int num){
180 char base[]={'A','T','G','C'};
181 string s;
182 for (int a=0;a<num;a++)
184 int index=int(rand()%4);
185 s.push_back(base[index]);
187 return s;
190 //Produce heterozygous indels in multiploid
191 string Get_indel(string &seq,ofstream &indel,string id1,double heterindel_rate,double SV_rate){
192 uint64_t seq_len=seq.size();
193 uint64_t N = 1000000000000; //the max random value
195 double small_insertion_rate = heterindel_rate/2;
196 double samll_deletion_rate = heterindel_rate/2;
197 //use the empirical distribution(1~6) from panda re-sequencing data
198 int small_indel_len[6] = {1,2,3,4,5,6};
199 // double array1[6] = {0.6482, 0.1717, 0.0720, 0.0729, 0.0218, 0.0134};
200 double array1[6] = {0.6482, 0.8199, 0.8919, 0.9648, 0.9866, 1};
202 double large_insertion_rate = SV_rate/3;
203 double large_deletion_rate = SV_rate/3;
204 //insertion,deletion and inversion share one third of total SV respectively
205 //SV-length 100(70%),200(20%),500(7%),1000(2%),2000(1%)
206 int large_indel_len[5] = {100, 200, 500, 1000, 2000};
207 // double array2[5] = {0.70, 0.20, 0.07, 0.02, 0.01};
208 double array2[5] = {0.70, 0.90, 0.97, 0.99, 1};
210 map<uint64_t,string> indel_lst;
211 //simulate small deletion
212 uint64_t small_deletion_count = 0;
213 for(uint64_t seq_index = 0; seq_index < seq_len; seq_index++)
215 double random_num = (double)rand() / double(RAND_MAX);
216 if(random_num <= samll_deletion_rate)
218 double random_num2 = (double)rand() / double(RAND_MAX);
219 int i = 0;
220 for(i = 0; i < 6; i++)
222 if(random_num2 <= array1[i])
224 break;
227 if(seq_index+small_indel_len[i]>seq_len){continue;}
228 string sub_str = seq.substr(seq_index, small_indel_len[i]);
229 if(!check_seq(sub_str)){continue;}
230 small_deletion_count++;
231 string ss = "-\t" + boost::lexical_cast <std::string>(small_indel_len[i]) + "\t";
233 for (int k=0;k<small_indel_len[i];k++)
235 ss += seq[seq_index+k];
236 // indel<<seq[seq_index+k];
237 seq[seq_index+k]='D';
239 indel_lst[seq_index] = ss;
243 //simulate large deletion
244 uint64_t large_deletion_count = 0;
245 for(uint64_t seq_index = 0; seq_index < seq_len; seq_index++)
247 double random_num = (double)rand() / double(RAND_MAX);
248 if(random_num <= large_deletion_rate)
250 double random_num2 = (double)rand() / double(RAND_MAX);
251 int i = 0;
252 for(i = 0; i < 5; i++)
254 if(random_num2 <= array2[i])
256 break;
259 if(seq_index+large_indel_len[i]>seq_len){continue;}
260 string sub_str = seq.substr(seq_index, large_indel_len[i]);
261 if(!check_seq(sub_str)){continue;}
262 large_deletion_count++;
263 string ss = "-\t" + boost::lexical_cast <std::string>(large_indel_len[i]) + "\t";
265 for (int k=0;k<large_indel_len[i];k++)
267 ss += seq[seq_index+k];
268 // indel<<seq[seq_index+k];
269 seq[seq_index+k]='D';
271 indel_lst[seq_index] = ss;
275 vector<short> insert(seq_len);
276 string s;
277 //simulate small insertion
278 uint64_t small_insertion_count = 0;
279 for(uint64_t seq_index = 0; seq_index < seq_len; seq_index++)
281 double random_num = (double)rand() / double(RAND_MAX);
282 if(random_num <= small_insertion_rate)
284 double random_num2 = (double)rand() / double(RAND_MAX);
285 int i = 0;
286 for(i = 0; i < 6; i++)
288 if(random_num2 <= array1[i]){break;}
290 insert[seq_index] = small_indel_len[i];
291 small_insertion_count++;
294 //simulate large insertion
295 uint64_t large_insertion_count = 0;
296 for(uint64_t seq_index = 0; seq_index < seq_len; seq_index++)
298 double random_num = (double)rand() / double(RAND_MAX);
299 if(random_num <= large_insertion_rate)
301 double random_num2 = (double)rand() / double(RAND_MAX);
302 int i = 0;
303 for(i = 0; i < 5; i++)
305 if(random_num2 <= array2[i]){break;}
307 insert[seq_index] = large_indel_len[i];
308 large_insertion_count++;
311 //modify the seq
312 uint64_t total_insertion_count = 0;
313 for (uint64_t i=0;i<seq_len;i++)
315 if (insert[i]>=1)
317 total_insertion_count++;
318 string temp;
319 temp=get_insertion(insert[i]);
320 s+=temp;
321 string ss = "+\t" + boost::lexical_cast <std::string>(int(insert[i])) + "\t" + temp;
323 //判断该位点是否有deletion
324 if(indel_lst[i][0] == 0)
326 indel_lst[i] = ss;
327 }else{
328 indel_lst[i] += "\n" + id1 + "\t" + boost::lexical_cast <std::string>(i) + "\t" + ss;
330 // indel<<id1<<"\t"<<i+1<<"\t+\t"<<int(insert[i])<<"\t"<<temp<<endl;
331 if (seq[i]!='D')
333 s+=seq[i];
335 }else{
336 if (seq[i]!='D')
338 s+=seq[i];
343 // indel <<"#small deletion times: "<<small_deletion_count << "; large deletion times: "<<large_deletion_count<<"; small insertion times: "<<small_insertion_count<<"; large insertion times: "<<large_insertion_count<<endl;
344 map<uint64_t, string>::const_iterator map_it = indel_lst.begin();
345 while (map_it != indel_lst.end())
347 indel<<id1<<"\t"<<map_it->first<<"\t"<<map_it->second<<endl;
348 map_it++;
351 return s;
355 //Binary search the random number location
356 int search_location(double *Arr, uint64_t ArrNum, double random_num){
358 uint64_t left = 0;
359 uint64_t right = ArrNum;
360 uint64_t middle = (left+right)/2;
362 if(random_num < Arr[0]){return 0;}
363 if(random_num > Arr[ArrNum-1]){ return ArrNum-1;}
365 //return the first location of bigger than 0
366 if(random_num == 0){
367 for(uint64_t i = 0; i < ArrNum; i++)
369 if(Arr[i] > 0){return i;}
373 while(!(random_num > Arr[middle-1] && random_num <= Arr[middle]))
375 if (left == middle || right == middle){
376 cerr <<"left == middle || right == middle"<<endl;
377 return middle;
379 if(random_num > Arr[middle]){
380 left = middle;
381 }else if(random_num < Arr[middle-1]){
382 right = middle;
383 }else if(random_num == Arr[middle-1]){ //return the first region that upper boundary equal to random_num
384 for(uint64_t i = middle-1; i>0; i--)
386 if(Arr[i-1] != Arr[i]){return i;}
389 middle = (left+right)/2;
392 return middle;
395 //get the reverse and complement sequence
396 string reversecomplementary (string read)
398 string rc_read;
399 for (int64_t i=read.size()-1; i>=0; i--)
401 rc_read.push_back(c_bases[alphabet[read[i]]]);
403 return rc_read;
406 //simulate normal distribution insertsize by Box-muller method
407 int simulate_insertsize(int mean, int sd)
409 int insertsize = 0;
410 double x1,x2,radius_pow2,y1 = 0.0;
414 x1 = 2.0*double(rand())/double(RAND_MAX)-1.0;
415 x2 = 2.0*double(rand())/double(RAND_MAX)-1.0;
416 radius_pow2 = x1*x1+x2*x2;
417 }while(radius_pow2>=1.0 || radius_pow2 == 0);
419 radius_pow2 = sqrt(-2.0*log(radius_pow2)/radius_pow2);
420 y1 = x1*radius_pow2;
421 insertsize = int(mean+y1*sd);
423 return insertsize;
427 //simulate GC bias
428 int simulate_GC_bias(string insert_str, double *GC_bias_abundance){
429 int is_ignore = 0;
430 int GC_num = 0;
431 int insert_size = insert_str.size();
432 //get GC rate
433 for (int i=0; i<insert_size; i++)
435 if(insert_str[i] == 'G' || insert_str[i] == 'C'){GC_num++;}
438 double GC_rate = double(GC_num)/double(insert_size);
440 //get relative abundance
441 double bias_abund = GC_bias_abundance[int(GC_rate*100)];
443 //decide whether ignore this insert string.
444 double num = double(rand())/double(RAND_MAX);
445 if(num > bias_abund){is_ignore = 1;}
447 return is_ignore;