7 #include <boost/lexical_cast.hpp>
8 #include <boost/algorithm/string.hpp>
11 //from ASCII of A(N) C G T to 0 1 2 3, auto dealing with upper or lower case.
12 //8bit char type, A=a=N=n=0, C=c=1, G=g=2, T=t=3, others as 4.
15 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
16 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
17 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
18 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
19 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 0, 4,
20 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
21 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 0, 4,
22 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
26 //from ASCII of A C G T to 0 1 2 3, auto dealing with upper or lower case.
27 //8bit char type, A=a=0, C=c=1, G=g=2, T=t=3, others as 4.
30 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
31 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
32 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
33 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
34 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
35 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
36 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
37 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
40 //from 0 1 2 3 4 to A C G T N
42 'A', 'C', 'G', 'T', 'N'
45 //from 0 1 2 3 4 to T G C A N
47 'T', 'G', 'C', 'A', 'N'
50 //check whether a sequence contain non base characters, such as "N"
51 int check_seq (string
&seq
)
54 for (int i
= 0; i
< seq
.size(); i
++)
55 { if (alphabet2
[seq
[i
]] == 4)
65 char get_snp_match(char base
, double snp_bias
[]){
66 char bases
[4][3]={{'G','T','C'}, //A->G transition A->C/T transvertion
67 {'C','G','A'}, //T->C transition T->G/A transvertion
68 {'T','A','G'}, //C->T transition C->A/T transvertion
69 {'A','T','C'}}; //G->A transition G->T/C transvertion
82 double num
= double(rand())/double(RAND_MAX
);
86 double p
= snp_bias
[i
];
93 //Produce heterozygous SNPs in multiploid
94 string
Get_snp(string
&seq
,ofstream
&snp
,string id
, double hetersnp_rate
, double snp_bias
[]){
95 map
<uint64_t,string
> snp_lst
;
96 uint64_t seq_len
=seq
.size();
97 uint64_t N
= 1000000000000;//the max random value
99 for(uint64_t seq_index
= 0; seq_index
< seq_len
; seq_index
++)
101 if(seq
[seq_index
] == 'N'){continue;}
102 double random_num
= (double)rand() / double(RAND_MAX
);
103 if(random_num
<= hetersnp_rate
)
105 string ss
= boost::lexical_cast
<std::string
>(seq
[seq_index
]) + "\t";
107 seq
[seq_index
]=get_snp_match(seq
[seq_index
], snp_bias
);
108 ss
+= boost::lexical_cast
<std::string
>(seq
[seq_index
]);
110 snp_lst
[seq_index
+1] = ss
;
115 // snp<<id<<" total SNP number: "<<snp_num<<endl;
116 map
<uint64_t, string
>::const_iterator map_it
= snp_lst
.begin();
117 while (map_it
!= snp_lst
.end())
119 snp
<<id
<<"\t"<<map_it
->first
<<"\t"<<map_it
->second
<<endl
;
126 string
Get_invertion(string
&seq
, ofstream
&invertion_file
, string id
, double SV_rate
)
128 uint64_t N
= 1000000000000; //the max random value
129 uint64_t seq_len
=seq
.size();
130 double invertion_rate
= SV_rate
/3;
131 // cerr<< invertion_rate <<endl;
132 int invertion_len
[5] = {100, 200, 500, 1000, 2000};
133 double array
[5] = {0.70, 0.90, 0.97, 0.99, 1.0};
134 map
<uint64_t,int> invertion_lst
;
135 uint64_t invertion_num
= 0;
136 for(uint64_t seq_index
= 0; seq_index
< seq_len
; seq_index
++)
138 double random_num
= (double)rand() / double(RAND_MAX
);
139 if(random_num
<= invertion_rate
)
141 // cerr << random_num<<endl;
142 double random_num2
= (double)rand() / double(RAND_MAX
);
143 // cerr << random_num2<<endl;
147 if(random_num2
<= array
[j
])
152 if(seq_index
+invertion_len
[j
]>seq_len
){continue;}
153 string sub_seq
= seq
.substr(seq_index
,invertion_len
[j
]);
154 if(!check_seq(sub_seq
)){continue;} //contain 'N' or other nonbases char
155 string rc_sub_seq
= reversecomplementary(sub_seq
);
157 for(int i
= 0; i
< rc_sub_seq
.size(); i
++)
159 seq
[seq_index
+i
] = rc_sub_seq
[i
];
162 invertion_lst
[seq_index
] = invertion_len
[j
];
167 // invertion_file<<id<<" total invertion number: "<<invertion_num<<endl;
168 map
<uint64_t, int>::const_iterator map_it
= invertion_lst
.begin();
169 while (map_it
!= invertion_lst
.end())
171 invertion_file
<<id
<<"\t"<<map_it
->first
<<"\t"<<map_it
->second
<<endl
;
178 //Getting insert sequence when heterozygous indel rate is bigger than 0
179 string
get_insertion(int num
){
180 char base
[]={'A','T','G','C'};
182 for (int a
=0;a
<num
;a
++)
184 int index
=int(rand()%4);
185 s
.push_back(base
[index
]);
190 //Produce heterozygous indels in multiploid
191 string
Get_indel(string
&seq
,ofstream
&indel
,string id1
,double heterindel_rate
,double SV_rate
){
192 uint64_t seq_len
=seq
.size();
193 uint64_t N
= 1000000000000; //the max random value
195 double small_insertion_rate
= heterindel_rate
/2;
196 double samll_deletion_rate
= heterindel_rate
/2;
197 //use the empirical distribution(1~6) from panda re-sequencing data
198 int small_indel_len
[6] = {1,2,3,4,5,6};
199 // double array1[6] = {0.6482, 0.1717, 0.0720, 0.0729, 0.0218, 0.0134};
200 double array1
[6] = {0.6482, 0.8199, 0.8919, 0.9648, 0.9866, 1};
202 double large_insertion_rate
= SV_rate
/3;
203 double large_deletion_rate
= SV_rate
/3;
204 //insertion,deletion and inversion share one third of total SV respectively
205 //SV-length 100(70%),200(20%),500(7%),1000(2%),2000(1%)
206 int large_indel_len
[5] = {100, 200, 500, 1000, 2000};
207 // double array2[5] = {0.70, 0.20, 0.07, 0.02, 0.01};
208 double array2
[5] = {0.70, 0.90, 0.97, 0.99, 1};
210 map
<uint64_t,string
> indel_lst
;
211 //simulate small deletion
212 uint64_t small_deletion_count
= 0;
213 for(uint64_t seq_index
= 0; seq_index
< seq_len
; seq_index
++)
215 double random_num
= (double)rand() / double(RAND_MAX
);
216 if(random_num
<= samll_deletion_rate
)
218 double random_num2
= (double)rand() / double(RAND_MAX
);
220 for(i
= 0; i
< 6; i
++)
222 if(random_num2
<= array1
[i
])
227 if(seq_index
+small_indel_len
[i
]>seq_len
){continue;}
228 string sub_str
= seq
.substr(seq_index
, small_indel_len
[i
]);
229 if(!check_seq(sub_str
)){continue;}
230 small_deletion_count
++;
231 string ss
= "-\t" + boost::lexical_cast
<std::string
>(small_indel_len
[i
]) + "\t";
233 for (int k
=0;k
<small_indel_len
[i
];k
++)
235 ss
+= seq
[seq_index
+k
];
236 // indel<<seq[seq_index+k];
237 seq
[seq_index
+k
]='D';
239 indel_lst
[seq_index
] = ss
;
243 //simulate large deletion
244 uint64_t large_deletion_count
= 0;
245 for(uint64_t seq_index
= 0; seq_index
< seq_len
; seq_index
++)
247 double random_num
= (double)rand() / double(RAND_MAX
);
248 if(random_num
<= large_deletion_rate
)
250 double random_num2
= (double)rand() / double(RAND_MAX
);
252 for(i
= 0; i
< 5; i
++)
254 if(random_num2
<= array2
[i
])
259 if(seq_index
+large_indel_len
[i
]>seq_len
){continue;}
260 string sub_str
= seq
.substr(seq_index
, large_indel_len
[i
]);
261 if(!check_seq(sub_str
)){continue;}
262 large_deletion_count
++;
263 string ss
= "-\t" + boost::lexical_cast
<std::string
>(large_indel_len
[i
]) + "\t";
265 for (int k
=0;k
<large_indel_len
[i
];k
++)
267 ss
+= seq
[seq_index
+k
];
268 // indel<<seq[seq_index+k];
269 seq
[seq_index
+k
]='D';
271 indel_lst
[seq_index
] = ss
;
275 vector
<short> insert(seq_len
);
277 //simulate small insertion
278 uint64_t small_insertion_count
= 0;
279 for(uint64_t seq_index
= 0; seq_index
< seq_len
; seq_index
++)
281 double random_num
= (double)rand() / double(RAND_MAX
);
282 if(random_num
<= small_insertion_rate
)
284 double random_num2
= (double)rand() / double(RAND_MAX
);
286 for(i
= 0; i
< 6; i
++)
288 if(random_num2
<= array1
[i
]){break;}
290 insert
[seq_index
] = small_indel_len
[i
];
291 small_insertion_count
++;
294 //simulate large insertion
295 uint64_t large_insertion_count
= 0;
296 for(uint64_t seq_index
= 0; seq_index
< seq_len
; seq_index
++)
298 double random_num
= (double)rand() / double(RAND_MAX
);
299 if(random_num
<= large_insertion_rate
)
301 double random_num2
= (double)rand() / double(RAND_MAX
);
303 for(i
= 0; i
< 5; i
++)
305 if(random_num2
<= array2
[i
]){break;}
307 insert
[seq_index
] = large_indel_len
[i
];
308 large_insertion_count
++;
312 uint64_t total_insertion_count
= 0;
313 for (uint64_t i
=0;i
<seq_len
;i
++)
317 total_insertion_count
++;
319 temp
=get_insertion(insert
[i
]);
321 string ss
= "+\t" + boost::lexical_cast
<std::string
>(int(insert
[i
])) + "\t" + temp
;
324 if(indel_lst
[i
][0] == 0)
328 indel_lst
[i
] += "\n" + id1
+ "\t" + boost::lexical_cast
<std::string
>(i
) + "\t" + ss
;
330 // indel<<id1<<"\t"<<i+1<<"\t+\t"<<int(insert[i])<<"\t"<<temp<<endl;
343 // indel <<"#small deletion times: "<<small_deletion_count << "; large deletion times: "<<large_deletion_count<<"; small insertion times: "<<small_insertion_count<<"; large insertion times: "<<large_insertion_count<<endl;
344 map
<uint64_t, string
>::const_iterator map_it
= indel_lst
.begin();
345 while (map_it
!= indel_lst
.end())
347 indel
<<id1
<<"\t"<<map_it
->first
<<"\t"<<map_it
->second
<<endl
;
355 //Binary search the random number location
356 int search_location(double *Arr
, uint64_t ArrNum
, double random_num
){
359 uint64_t right
= ArrNum
;
360 uint64_t middle
= (left
+right
)/2;
362 if(random_num
< Arr
[0]){return 0;}
363 if(random_num
> Arr
[ArrNum
-1]){ return ArrNum
-1;}
365 //return the first location of bigger than 0
367 for(uint64_t i
= 0; i
< ArrNum
; i
++)
369 if(Arr
[i
] > 0){return i
;}
373 while(!(random_num
> Arr
[middle
-1] && random_num
<= Arr
[middle
]))
375 if (left
== middle
|| right
== middle
){
376 cerr
<<"left == middle || right == middle"<<endl
;
379 if(random_num
> Arr
[middle
]){
381 }else if(random_num
< Arr
[middle
-1]){
383 }else if(random_num
== Arr
[middle
-1]){ //return the first region that upper boundary equal to random_num
384 for(uint64_t i
= middle
-1; i
>0; i
--)
386 if(Arr
[i
-1] != Arr
[i
]){return i
;}
389 middle
= (left
+right
)/2;
395 //get the reverse and complement sequence
396 string
reversecomplementary (string read
)
399 for (int64_t i
=read
.size()-1; i
>=0; i
--)
401 rc_read
.push_back(c_bases
[alphabet
[read
[i
]]]);
406 //simulate normal distribution insertsize by Box-muller method
407 int simulate_insertsize(int mean
, int sd
)
410 double x1
,x2
,radius_pow2
,y1
= 0.0;
414 x1
= 2.0*double(rand())/double(RAND_MAX
)-1.0;
415 x2
= 2.0*double(rand())/double(RAND_MAX
)-1.0;
416 radius_pow2
= x1
*x1
+x2
*x2
;
417 }while(radius_pow2
>=1.0 || radius_pow2
== 0);
419 radius_pow2
= sqrt(-2.0*log(radius_pow2
)/radius_pow2
);
421 insertsize
= int(mean
+y1
*sd
);
428 int simulate_GC_bias(string insert_str
, double *GC_bias_abundance
){
431 int insert_size
= insert_str
.size();
433 for (int i
=0; i
<insert_size
; i
++)
435 if(insert_str
[i
] == 'G' || insert_str
[i
] == 'C'){GC_num
++;}
438 double GC_rate
= double(GC_num
)/double(insert_size
);
440 //get relative abundance
441 double bias_abund
= GC_bias_abundance
[int(GC_rate
*100)];
443 //decide whether ignore this insert string.
444 double num
= double(rand())/double(RAND_MAX
);
445 if(num
> bias_abund
){is_ignore
= 1;}