8 It is a program for simulating solexa reads,with a series of problems generate by solexa
9 sequencing machine in though,such as insertsize distribution,error rate,heterozygosis SNP
10 and heterozygosis Indel in diploid. User should set the value of insertsize_mean and
11 insertsize_sd ,they are the mean value and standard deviation of the normal distribution
12 that used as the model function when simulating insertsize distribution.The normal
13 distribution function model we used in this program is
14 f(x)=1/σ/sqrt(2*pi)/exp((x-μ)**2 / (2*σ**2)) ,and the insertsize distribution
15 range is limited in (μ-5σ,μ+5σ), which will cover almost all the data.To simulate
16 illumina error rates on different cycles,we use the function f(x)=0.00001*x**3 as a model
17 function,because the error rate of a number of bases at the end is much larger than
18 other area in a read. User should set the heterozygosis SNP rate and heterozygosis Indel
19 rate as also ,if you want to use those function,but remember that heterozygosis SNP rate
20 and heterozygosis Indel rate is only exists in diploid. At last ,you should set another
21 several parameters ,read length ,coverage of reads ,input sequence and output prefix,the
22 input sequence and output prefix must be set ,because there are not default value.
24 =head1 Contact and Version
26 Contact: huxuesong@genomics.cn lujianliang@genomics.org.cn yuezhen@genomics.org.cn
27 Version: 1.1 Data: 2012-4-13
31 -input <string> input reference genome sequence
32 -read_len <int> set read length, read1 and read2 have the same length,default:100
33 -coverage <int> set the sequencing coverage(sometimes called depth),default:40
34 -insertsize_mean <int> set the average value of insert size,default:500
35 -insertsize_sd <int> set the standard deviation of insert sizes, default:25
36 -error_rate <float> set the average error rate over all cycles,default:0.01
37 -heterSNP_rate <float> set the heterozygous SNP rate of the diploid genome,default:0
38 -heterIndel_rate <float> set the heterozygous indel rate of the diploid genome,default:0
39 -output <string> output file prefix
40 -fq output to fastaq file
41 -b No pause for batch runs
42 -help output help infomation
46 1. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen
47 erery parameter use the default one .
48 2. perl simulate_solexa_reads.pl -input ref_sequence.fa -read_len 150 -coverage 20 -o humen
49 just set read length and coverage you needed.
50 3. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen -insertsize_mean 600 -insertsize_sd 30
52 set insertsize distribution and error rate.
53 4. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen -heterSNP_rate 0.001 -heterIndel_rate 0.001
54 the genome is diploid and you want to produce heterozygosis SNPs heterozygosis Indels in reads .
57 ################################################################################################
58 my ($inputref,$read_len,$coverage,$insertsize_mean,$insertsize_sd,$error_rate,$heterSNP_rate);
59 my ($heterIndel_rate,$ref_ploid,$n_ploid,$o,$help,$fq,$fqmode,$opt_b,$ext);
62 "input:s"=>\
$inputref,
63 "read_len:i"=>\
$read_len,
64 "coverage:i"=>\
$coverage,
65 "insertsize_mean:i"=>\
$insertsize_mean,
66 "insertsize_sd:i"=>\
$insertsize_sd,
67 "error_rate:f"=>\
$error_rate,
69 "heterSNP_rate:f"=>\
$heterSNP_rate,
70 "heterIndel_rate:f"=>\
$heterIndel_rate,
75 die `pod2text $0` if ($help);
76 die `pod2text $0` if (!$inputref || !$o);
77 #############################Set default value#############################################
78 $read_len=100 if (!defined $read_len);
79 $coverage=40 if (!defined $coverage);
80 $insertsize_mean=500 if (!defined $insertsize_mean);
81 $insertsize_sd=25 if (!defined $insertsize_sd);
82 $error_rate=0.01 if (!defined $error_rate);
86 } else {$fqmode='FASTA';$ext='fa';}
87 $heterSNP_rate=0 if (!defined $heterSNP_rate);
88 $heterIndel_rate=0 if (!defined $heterIndel_rate);
89 #############################Main Program##################################################
90 warn "[!]$fqmode mode.
91 From [$inputref] to [$o]_${read_len}_${insertsize_mean}_{1,2}.$ext
92 Read Len:[$read_len] Error Rate:[$error_rate] Coverage:[$coverage] Insertsize:[${insertsize_mean}±$insertsize_sd]
93 HeterSNP Rate:$heterSNP_rate HeterIndel Rate:$heterIndel_rate\n";
94 if (! $opt_b) {print STDERR
'press [Enter] to continue...'; <>;}
97 open REF
,'<',$inputref or die "[x]Can't open file $inputref\n";
98 system("mkdir -p $o && rmdir $o");
99 open $O1,">${o}-l${read_len}-i${insertsize_mean}_1.$ext" || die "[x]Can't open file ${o}-l${read_len}-i${insertsize_mean}_1.$ext\n";
100 open $O2,">${o}-l${read_len}-i${insertsize_mean}_2.$ext" || die "[x]Can't open file ${o}-l${read_len}-i${insertsize_mean}_1.$ext\n";
101 my (%insertsize,%read_info,$Qc);
108 my $id=(split (/\s+/,$line))[0];
109 my $refseq=(split (/\n/,$line,2))[1];
111 $refseq=~s/[BDEFHIJKLMNOPQRSUVWXYZ]+//g;
112 my $length=length $refseq;
113 print STDERR
">$id:$length ...";
115 if ($heterSNP_rate>0 || $heterIndel_rate>0) {
116 $read_num_pair=int ($length*$coverage/(2*2*$read_len));
118 $read_num_pair=int ($length*$coverage/(2*$read_len));
120 &insertsize_distribution
($insertsize_mean,$insertsize_sd,$read_num_pair,\
%insertsize);
123 if ($error_rate>0.0001){
124 &error_distribution
($length,$read_len,$error_rate,$read_num_pair,\
%read_info,\
$Qc);
126 print "The error rate is smaller than basic error rate 0.001\n";
129 &getreads
($refseq,\
%insertsize,\
%read_info);
130 foreach my $del (sort {$a<=>$b} keys %insertsize) {
131 delete $insertsize{$del};
133 foreach my $del (sort {$a<=>$b} keys %read_info) {
134 delete $read_info{$del};
137 if ($heterSNP_rate>0 || $heterIndel_rate>0) { ###heterozygous SNP and heterozygous indel exists is diploid
138 my $refseqsnp=&heter_SNP_Indel
($refseq,$heterSNP_rate,$heterIndel_rate,$id);
139 $length=length $refseqsnp;
140 $read_num_pair=int ($length*$coverage/(2*2*$read_len));
141 &insertsize_distribution
($insertsize_mean,$insertsize_sd,$read_num_pair,\
%insertsize);
143 if ($error_rate>0.0001) {
144 &error_distribution
($length,$read_len,$error_rate,$read_num_pair,\
%read_info,\
$Qc);
145 } else { print "The error rate is smaller than basic error rate 0.001\n"; }
147 &getreads
($refseqsnp,\
%insertsize,\
%read_info);
148 foreach my $del (sort {$a<=>$b} keys %insertsize) {
149 delete $insertsize{$del};
151 foreach my $del (sort {$a<=>$b} keys %read_info) {
152 delete $read_info{$del};
161 #############################Subroutines###################################################
162 ########simulate the insertsize distribution with the model of normal distribution function
163 ########The insertsize range is limited in (μ-5σ,μ+5σ), which covers almost all the data.
164 sub insertsize_distribution
{
165 (my $mean,my $sd,my $rd_num,my $isize)=@_;
168 for (my $i=$mean-5*$sd;$i<=$mean+5*$sd;$i++) {
169 $isize->{$i}=1/$sd/sqrt
(2*$pi)/exp(($i-$mean)**2 / (2*$sd**2));
170 $total+=$isize->{$i};
172 for (my $i=$mean-5*$sd;$i<=$mean+5*$sd;$i++) {
173 $isize->{$i}=0.5+$isize->{$i}/$total*$rd_num;
174 $isize->{$i}=int $isize->{$i};
175 $total1+=$isize->{$i};
177 $isize->{$mean}+=$rd_num-$total1 if ($rd_num>$total1);
180 #######Simulate illumina error distribution on different cycles,
181 #######with the model function f(x)=0.00001*x**3
182 sub error_distribution
{
183 my ($seq_len, $rd_len, $er_rate, $rd_num, $rd_info, $Qcr)=@_;
184 my (%hash,$total,$t,$q);
185 my $basic_err=0.0001;
186 my $total_err=($er_rate-$basic_err)*$rd_len;
187 for (my $i=1;$i<=$rd_len;$i++) {
188 $hash{$i}=0.00001*$i**3;
192 print STDERR
"[!]% & Q: ";
193 for (my $circle=1;$circle<=$rd_len;$circle++) {
194 $t = $basic_err+$hash{$circle}/$total*$total_err;
196 $q=int(-10*log($t)/log(10));
198 printf STDERR
"%u:%.1f,%u ",$circle,$t,$q;
200 $hash{$circle}=$hash{$circle}*$rd_num*2;
201 $hash{$circle}=int $hash{$circle};
203 print STDERR
"\n[!]Qstr:[$Qc]\n";
205 for(my $circle=1;$circle<=$rd_len;$circle++) {
206 while ($hash{$circle}>0) {
207 my $j=int (rand $rd_num);
208 push @
{$rd_info->{$j}},$circle;
213 ####Rrealization of error sequencing
216 (my $seq,my $position,my $r_l)=@_;
217 $position=int $position;
228 my $s1=substr $seq,0,$position-1;
229 my $s2=substr $seq,$position-1,1;
230 my $s3=substr $seq,-($r_l-$position),$r_l-$position;
231 my $k=int ((rand 100)%3);
233 $s2=$get_match{A
}[$k];
235 $s2=$get_match{T
}[$k];
237 $s2=$get_match{G
}[$k];
239 $s2=$get_match{C
}[$k];
241 $s2=$get_match{a
}[$k];
243 $s2=$get_match{t
}[$k];
245 $s2=$get_match{g
}[$k];
247 $s2=$get_match{c
}[$k];
252 }elsif($position==$r_l){
260 #####Getting insert sequence when heterozygous indel rate is bigger than 0
264 my @baseA=("A","T","G","C");
265 for (my $i=1;$i<=$num;$i++) {
266 my $j=int ((rand 100)%4);
271 #####Produce heterozygous SNPs and indels in multiploid
273 (my $sequence,my $hsnp_rate,my $hindel_rate,my $ID)=@_;
274 # print "It's begin to get snp sequence !\n";
285 my $seqlen=length $sequence;
287 @list=split /\s*/,$sequence;
290 my $snp_num=$seqlen*$hsnp_rate;
291 $snp_num=int $snp_num;
292 for (my $i=1;$i<=$snp_num;$i++) {
294 my $j=int (rand $seqlen);
296 # print "$q\t$list[$j]\t";
298 $list[$j]= $get_match{$list[$j]}[$k];
299 # print "$list[$j]\t$snp_num\t$seqlen\n";
301 if (($hsnp_rate>0) && ($hindel_rate<=0)){
302 foreach my $g (@list){
308 if ($hindel_rate>0) {
310 my $indel=$seqlen*$hindel_rate;
311 my $insertion=$indel/2;
312 my $deletion=$indel/2;
315 foreach my $k (@list1) {
316 for (my $i=1;$i<=$insertion/$k;$i++) {
317 my $j=int (rand $seqlen);
318 my $insert=&getinsertion
($p);
319 $insertion{$j}=$insert;
324 foreach my $i (@list1) {
325 for (my $j=1;$j<=$deletion/$i;$j++) {
326 my $k=int (rand $seqlen);
327 redo if ($k+$p>$seqlen);
328 for (my $t=$k;$t<$k+$p;$t++){
335 foreach my $ce (sort {$a<=>$b} keys %insertion){
336 for (my $i=$cs;$i<=$ce;$i++){
337 $sequence.=$list[$i];
340 $sequence.=$insertion{$ce};
343 for (my $i=$cs;$i<$seqlen;$i++){
344 $sequence.=$list[$i];
347 foreach my $i (keys %insertion){
348 delete $insertion{$i};
350 # print "Get deletion has been finished!\n";
352 # print "Finish Get snp and indel sequence\n";
355 #########Print to File
356 sub write2files
($$$) {
357 my ($idref,$stref,$Qcr)=@_; # reads_$read_count\_$read_len\_1\n$a
359 #my ($ra,$rb)=@$Qarref;
360 #push @Qa,chr(64+$_) for (@$ra);
361 print $O1 "\@$${idref}/1\n$$stref[0]\n+\n",join('',$$Qcr),"\n";
362 print $O2 "\@$${idref}/2\n$$stref[1]\n+\n",join('',$$Qcr),"\n";
364 print $O1 ">$${idref}_1\n$$stref[0]\n";
365 print $O2 ">$${idref}_2\n$$stref[1]\n";
368 #########Produce solexa reads
370 my ($refseq,$ISize,$Rd_Info)=@_;
371 my $length=length $refseq;
373 for (my $i=$insertsize_mean-5*$insertsize_sd;$i<=$insertsize_mean+5*$insertsize_sd;$i++) {
374 while ($ISize->{$i}>0) {
375 my $pos=rand $length;
377 my $substr=substr $refseq,$pos,$i;
378 redo if (length $substr<$i);
379 my $a=substr $substr,0,$read_len;
380 my $b=substr $substr,-$read_len,$read_len;
381 if ($insertsize_mean <=1000) {
383 $b=~tr/ATGCatgc/TACGtacg/;
384 } elsif ($insertsize_mean >1000) {
386 $a=~tr/ATGCatgc/TACGtacg/;
388 print "Insertsize can't be smaller than 0\n";
391 if (exists $Rd_Info->{$read_count}) {
392 foreach my $j (@
{$Rd_Info->{$read_count}}) {
393 my $k=int ((rand 10)%2);
395 $a=&match
($a,$j,$read_len);
397 $b=&match
($b,$j,$read_len);
401 my $d=int ((rand 10)%2);
405 #print $O1 ">reads_$read_count\_$read_len\_1\n$a\n";
406 #print $O2 ">reads_$read_count\_$read_len\_2\n$b\n";
409 #print $O1 ">reads_$read_count\_$read_len\_1\n$b\n";
410 #print $O2 ">reads_$read_count\_$read_len\_2\n$a\n";
413 $id="FC00XX:${read_count}:${pos}:${i}:$read_len";
414 &write2files
(\
$id,\
@strs,\
$Qc);
416 $id="reads_${read_count}_${pos}_${i}_$read_len";
417 &write2files
(\
$id,\
@strs,undef);
425 find
./out/ -name
'*.fa' | while read a
; do b
=`basename $a`; echo
"#$ -N \"sim_${b}\"" >./shell/sim_
${b
}.sh
;echo
"#$ -cwd -r y -l vf=1G,p=1 -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH -o ./out/fq/sim_${b}.log -e ./out/fq/sim_${b}.err -hold_jid \"*_si\" " >> ./shell/sim_
${b
}.sh
; echo
./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out
/fq/${b
}.sim1
>> ./shell/sim_
${b
}.sh
; echo
./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out
/fq/${b
}.sim2
>> ./shell/sim_
${b
}.sh
; echo
./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out
/fq/${b
}.sim3
>> ./shell/sim_
${b
}.sh
; echo
./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out
/fq/${b
}.sim4
>> ./shell/sim_
${b
}.sh
; done