modified: myjupyterlab.sh
[GalaxyCodeBases.git] / tools / simulators / popudepth / simsolexa_reads.pl
blobd37a1c7ce0799273530cdb8428a3705993a9beb7
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
6 =head1 Description
8 It is a program for simulating solexa reads,with a series of problems generate by solexa
9 sequencing machine in though,such as insertsize distribution,error rate,heterozygosis SNP
10 and heterozygosis Indel in diploid. User should set the value of insertsize_mean and
11 insertsize_sd ,they are the mean value and standard deviation of the normal distribution
12 that used as the model function when simulating insertsize distribution.The normal
13 distribution function model we used in this program is
14 f(x)=1/σ/sqrt(2*pi)/exp((x-μ)**2 / (2*σ**2)) ,and the insertsize distribution
15 range is limited in (μ-5σ,μ+5σ), which will cover almost all the data.To simulate
16 illumina error rates on different cycles,we use the function f(x)=0.00001*x**3 as a model
17 function,because the error rate of a number of bases at the end is much larger than
18 other area in a read. User should set the heterozygosis SNP rate and heterozygosis Indel
19 rate as also ,if you want to use those function,but remember that heterozygosis SNP rate
20 and heterozygosis Indel rate is only exists in diploid. At last ,you should set another
21 several parameters ,read length ,coverage of reads ,input sequence and output prefix,the
22 input sequence and output prefix must be set ,because there are not default value.
24 =head1 Contact and Version
26 Contact: huxuesong@genomics.cn lujianliang@genomics.org.cn yuezhen@genomics.org.cn
27 Version: 1.1 Data: 2012-4-13
29 =head1 Option
31 -input <string> input reference genome sequence
32 -read_len <int> set read length, read1 and read2 have the same length,default:100
33 -coverage <int> set the sequencing coverage(sometimes called depth),default:40
34 -insertsize_mean <int> set the average value of insert size,default:500
35 -insertsize_sd <int> set the standard deviation of insert sizes, default:25
36 -error_rate <float> set the average error rate over all cycles,default:0.01
37 -heterSNP_rate <float> set the heterozygous SNP rate of the diploid genome,default:0
38 -heterIndel_rate <float> set the heterozygous indel rate of the diploid genome,default:0
39 -output <string> output file prefix
40 -fq output to fastaq file
41 -b No pause for batch runs
42 -help output help infomation
44 =head1 Example
46 1. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen
47 erery parameter use the default one .
48 2. perl simulate_solexa_reads.pl -input ref_sequence.fa -read_len 150 -coverage 20 -o humen
49 just set read length and coverage you needed.
50 3. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen -insertsize_mean 600 -insertsize_sd 30
51 -error_rate 0.01
52 set insertsize distribution and error rate.
53 4. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen -heterSNP_rate 0.001 -heterIndel_rate 0.001
54 the genome is diploid and you want to produce heterozygosis SNPs heterozygosis Indels in reads .
56 =cut
57 ################################################################################################
58 my ($inputref,$read_len,$coverage,$insertsize_mean,$insertsize_sd,$error_rate,$heterSNP_rate);
59 my ($heterIndel_rate,$ref_ploid,$n_ploid,$o,$help,$fq,$fqmode,$opt_b,$ext);
61 GetOptions(
62 "input:s"=>\$inputref,
63 "read_len:i"=>\$read_len,
64 "coverage:i"=>\$coverage,
65 "insertsize_mean:i"=>\$insertsize_mean,
66 "insertsize_sd:i"=>\$insertsize_sd,
67 "error_rate:f"=>\$error_rate,
68 "fq"=>\$fq,
69 "heterSNP_rate:f"=>\$heterSNP_rate,
70 "heterIndel_rate:f"=>\$heterIndel_rate,
71 "output:s"=>\$o,
72 "b"=>\$opt_b,
73 "help"=>\$help
75 die `pod2text $0` if ($help);
76 die `pod2text $0` if (!$inputref || !$o);
77 #############################Set default value#############################################
78 $read_len=100 if (!defined $read_len);
79 $coverage=40 if (!defined $coverage);
80 $insertsize_mean=500 if (!defined $insertsize_mean);
81 $insertsize_sd=25 if (!defined $insertsize_sd);
82 $error_rate=0.01 if (!defined $error_rate);
83 if (defined $fq) {
84 $fqmode='FastaQ';
85 $ext='fq';
86 } else {$fqmode='FASTA';$ext='fa';}
87 $heterSNP_rate=0 if (!defined $heterSNP_rate);
88 $heterIndel_rate=0 if (!defined $heterIndel_rate);
89 #############################Main Program##################################################
90 warn "[!]$fqmode mode.
91 From [$inputref] to [$o]_${read_len}_${insertsize_mean}_{1,2}.$ext
92 Read Len:[$read_len] Error Rate:[$error_rate] Coverage:[$coverage] Insertsize:[${insertsize_mean}±$insertsize_sd]
93 HeterSNP Rate:$heterSNP_rate HeterIndel Rate:$heterIndel_rate\n";
94 if (! $opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
96 my ($O1,$O2);
97 open REF,'<',$inputref or die "[x]Can't open file $inputref\n";
98 system("mkdir -p $o && rmdir $o");
99 open $O1,">${o}-l${read_len}-i${insertsize_mean}_1.$ext" || die "[x]Can't open file ${o}-l${read_len}-i${insertsize_mean}_1.$ext\n";
100 open $O2,">${o}-l${read_len}-i${insertsize_mean}_2.$ext" || die "[x]Can't open file ${o}-l${read_len}-i${insertsize_mean}_1.$ext\n";
101 my (%insertsize,%read_info,$Qc);
102 $/=">";
104 while (<REF>) {
105 chomp;
106 next if ($_ eq "");
107 my $line=$_;
108 my $id=(split (/\s+/,$line))[0];
109 my $refseq=(split (/\n/,$line,2))[1];
110 $refseq=~s/\n//g;
111 $refseq=~s/[BDEFHIJKLMNOPQRSUVWXYZ]+//g;
112 my $length=length $refseq;
113 print STDERR ">$id:$length ...";
114 my $read_num_pair;
115 if ($heterSNP_rate>0 || $heterIndel_rate>0) {
116 $read_num_pair=int ($length*$coverage/(2*2*$read_len));
117 }else{
118 $read_num_pair=int ($length*$coverage/(2*$read_len));
120 &insertsize_distribution($insertsize_mean,$insertsize_sd,$read_num_pair,\%insertsize);
121 print STDERR "\n";
122 if ($error_rate>0){
123 if ($error_rate>0.0001){
124 &error_distribution($length,$read_len,$error_rate,$read_num_pair,\%read_info,\$Qc);
125 }else{
126 print "The error rate is smaller than basic error rate 0.001\n";
129 &getreads($refseq,\%insertsize,\%read_info);
130 foreach my $del (sort {$a<=>$b} keys %insertsize) {
131 delete $insertsize{$del};
133 foreach my $del (sort {$a<=>$b} keys %read_info) {
134 delete $read_info{$del};
137 if ($heterSNP_rate>0 || $heterIndel_rate>0) { ###heterozygous SNP and heterozygous indel exists is diploid
138 my $refseqsnp=&heter_SNP_Indel($refseq,$heterSNP_rate,$heterIndel_rate,$id);
139 $length=length $refseqsnp;
140 $read_num_pair=int ($length*$coverage/(2*2*$read_len));
141 &insertsize_distribution($insertsize_mean,$insertsize_sd,$read_num_pair,\%insertsize);
142 if ($error_rate>0){
143 if ($error_rate>0.0001) {
144 &error_distribution($length,$read_len,$error_rate,$read_num_pair,\%read_info,\$Qc);
145 } else { print "The error rate is smaller than basic error rate 0.001\n"; }
147 &getreads($refseqsnp,\%insertsize,\%read_info);
148 foreach my $del (sort {$a<=>$b} keys %insertsize) {
149 delete $insertsize{$del};
151 foreach my $del (sort {$a<=>$b} keys %read_info) {
152 delete $read_info{$del};
156 $/="\n";
157 close REF;
158 close $O1;
159 close $O2;
160 warn "[!] Done !\n";
161 #############################Subroutines###################################################
162 ########simulate the insertsize distribution with the model of normal distribution function
163 ########The insertsize range is limited in (μ-5σ,μ+5σ), which covers almost all the data.
164 sub insertsize_distribution{
165 (my $mean,my $sd,my $rd_num,my $isize)=@_;
166 my ($total,$total1);
167 my $pi=3.1415926535;
168 for (my $i=$mean-5*$sd;$i<=$mean+5*$sd;$i++) {
169 $isize->{$i}=1/$sd/sqrt(2*$pi)/exp(($i-$mean)**2 / (2*$sd**2));
170 $total+=$isize->{$i};
172 for (my $i=$mean-5*$sd;$i<=$mean+5*$sd;$i++) {
173 $isize->{$i}=0.5+$isize->{$i}/$total*$rd_num;
174 $isize->{$i}=int $isize->{$i};
175 $total1+=$isize->{$i};
177 $isize->{$mean}+=$rd_num-$total1 if ($rd_num>$total1);
180 #######Simulate illumina error distribution on different cycles,
181 #######with the model function f(x)=0.00001*x**3
182 sub error_distribution{
183 my ($seq_len, $rd_len, $er_rate, $rd_num, $rd_info, $Qcr)=@_;
184 my (%hash,$total,$t,$q);
185 my $basic_err=0.0001;
186 my $total_err=($er_rate-$basic_err)*$rd_len;
187 for (my $i=1;$i<=$rd_len;$i++) {
188 $hash{$i}=0.00001*$i**3;
189 $total+=$hash{$i};
191 $$Qcr='';
192 print STDERR "[!]% & Q: ";
193 for (my $circle=1;$circle<=$rd_len;$circle++) {
194 $t = $basic_err+$hash{$circle}/$total*$total_err;
195 $hash{$circle}=$t;
196 $q=int(-10*log($t)/log(10));
197 $t *= 100;
198 printf STDERR "%u:%.1f,%u ",$circle,$t,$q;
199 $$Qcr .= chr(64+$q);
200 $hash{$circle}=$hash{$circle}*$rd_num*2;
201 $hash{$circle}=int $hash{$circle};
203 print STDERR "\n[!]Qstr:[$Qc]\n";
205 for(my $circle=1;$circle<=$rd_len;$circle++) {
206 while ($hash{$circle}>0) {
207 my $j=int (rand $rd_num);
208 push @{$rd_info->{$j}},$circle;
209 $hash{$circle}--;
213 ####Rrealization of error sequencing
215 sub match{
216 (my $seq,my $position,my $r_l)=@_;
217 $position=int $position;
218 my %get_match=(
219 "A"=>["T","G","C"],
220 "T"=>["A","G","C"],
221 "G"=>["A","T","C"],
222 "C"=>["A","T","G"],
223 "a"=>["t","g","c"],
224 "t"=>["a","g","c"],
225 "g"=>["a","t","c"],
226 "c"=>["a","t","g"],
228 my $s1=substr $seq,0,$position-1;
229 my $s2=substr $seq,$position-1,1;
230 my $s3=substr $seq,-($r_l-$position),$r_l-$position;
231 my $k=int ((rand 100)%3);
232 if ($s2 eq "A") {
233 $s2=$get_match{A}[$k];
234 }elsif($s2 eq "T"){
235 $s2=$get_match{T}[$k];
236 }elsif($s2 eq "G"){
237 $s2=$get_match{G}[$k];
238 }elsif($s2 eq "C"){
239 $s2=$get_match{C}[$k];
240 }elsif($s2 eq "a"){
241 $s2=$get_match{a}[$k];
242 }elsif($s2 eq "t"){
243 $s2=$get_match{t}[$k];
244 }elsif($s2 eq "g"){
245 $s2=$get_match{g}[$k];
246 }else{
247 $s2=$get_match{c}[$k];
249 my $s;
250 if ($position==1) {
251 $s=$s2.$s3;
252 }elsif($position==$r_l){
253 $s=$s1.$s2;
254 }else{
255 $s=$s1.$s2.$s3;
257 return $s;
260 #####Getting insert sequence when heterozygous indel rate is bigger than 0
261 sub getinsertion{
262 my $num=shift @_;
263 my $base;
264 my @baseA=("A","T","G","C");
265 for (my $i=1;$i<=$num;$i++) {
266 my $j=int ((rand 100)%4);
267 $base.=$baseA[$j];
269 return $base;
271 #####Produce heterozygous SNPs and indels in multiploid
272 sub heter_SNP_Indel{
273 (my $sequence,my $hsnp_rate,my $hindel_rate,my $ID)=@_;
274 # print "It's begin to get snp sequence !\n";
275 my %get_match=(
276 "A"=>["T","G","C"],
277 "T"=>["A","G","C"],
278 "G"=>["A","T","C"],
279 "C"=>["A","T","G"],
280 "a"=>["t","g","c"],
281 "t"=>["a","g","c"],
282 "g"=>["a","t","c"],
283 "c"=>["a","t","g"],
285 my $seqlen=length $sequence;
286 my @list;
287 @list=split /\s*/,$sequence;
288 $sequence="";
289 if ($hsnp_rate>0) {
290 my $snp_num=$seqlen*$hsnp_rate;
291 $snp_num=int $snp_num;
292 for (my $i=1;$i<=$snp_num;$i++) {
293 # print "$ID\t";
294 my $j=int (rand $seqlen);
295 # my $q=$j+1;
296 # print "$q\t$list[$j]\t";
297 my $k=$j%3;
298 $list[$j]= $get_match{$list[$j]}[$k];
299 # print "$list[$j]\t$snp_num\t$seqlen\n";
301 if (($hsnp_rate>0) && ($hindel_rate<=0)){
302 foreach my $g (@list){
303 $sequence.=$g;
308 if ($hindel_rate>0) {
309 my %insertion;
310 my $indel=$seqlen*$hindel_rate;
311 my $insertion=$indel/2;
312 my $deletion=$indel/2;
313 my @list1=(2,3,6);
314 my $p=1;
315 foreach my $k (@list1) {
316 for (my $i=1;$i<=$insertion/$k;$i++) {
317 my $j=int (rand $seqlen);
318 my $insert=&getinsertion($p);
319 $insertion{$j}=$insert;
321 $p++;
323 $p=1;
324 foreach my $i (@list1) {
325 for (my $j=1;$j<=$deletion/$i;$j++) {
326 my $k=int (rand $seqlen);
327 redo if ($k+$p>$seqlen);
328 for (my $t=$k;$t<$k+$p;$t++){
329 $list[$t]="";
332 $p++;
334 my $cs=0;
335 foreach my $ce (sort {$a<=>$b} keys %insertion){
336 for (my $i=$cs;$i<=$ce;$i++){
337 $sequence.=$list[$i];
340 $sequence.=$insertion{$ce};
341 $cs=$ce+1;
343 for (my $i=$cs;$i<$seqlen;$i++){
344 $sequence.=$list[$i];
347 foreach my $i (keys %insertion){
348 delete $insertion{$i};
350 # print "Get deletion has been finished!\n";
352 # print "Finish Get snp and indel sequence\n";
353 return $sequence;
355 #########Print to File
356 sub write2files($$$) {
357 my ($idref,$stref,$Qcr)=@_; # reads_$read_count\_$read_len\_1\n$a
358 if ($Qcr) {
359 #my ($ra,$rb)=@$Qarref;
360 #push @Qa,chr(64+$_) for (@$ra);
361 print $O1 "\@$${idref}/1\n$$stref[0]\n+\n",join('',$$Qcr),"\n";
362 print $O2 "\@$${idref}/2\n$$stref[1]\n+\n",join('',$$Qcr),"\n";
363 } else {
364 print $O1 ">$${idref}_1\n$$stref[0]\n";
365 print $O2 ">$${idref}_2\n$$stref[1]\n";
368 #########Produce solexa reads
369 sub getreads{
370 my ($refseq,$ISize,$Rd_Info)=@_;
371 my $length=length $refseq;
372 my $read_count=0;
373 for (my $i=$insertsize_mean-5*$insertsize_sd;$i<=$insertsize_mean+5*$insertsize_sd;$i++) {
374 while ($ISize->{$i}>0) {
375 my $pos=rand $length;
376 $pos=int $pos;
377 my $substr=substr $refseq,$pos,$i;
378 redo if (length $substr<$i);
379 my $a=substr $substr,0,$read_len;
380 my $b=substr $substr,-$read_len,$read_len;
381 if ($insertsize_mean <=1000) {
382 $b=reverse $b;
383 $b=~tr/ATGCatgc/TACGtacg/;
384 } elsif ($insertsize_mean >1000) {
385 $a=reverse $a;
386 $a=~tr/ATGCatgc/TACGtacg/;
387 } else {
388 print "Insertsize can't be smaller than 0\n";
390 $read_count++;
391 if (exists $Rd_Info->{$read_count}) {
392 foreach my $j (@{$Rd_Info->{$read_count}}) {
393 my $k=int ((rand 10)%2);
394 if ($k) {
395 $a=&match($a,$j,$read_len);
396 } else {
397 $b=&match($b,$j,$read_len);
401 my $d=int ((rand 10)%2);
402 my ($id,@strs);
403 if ($d) {
404 @strs=($a,$b);
405 #print $O1 ">reads_$read_count\_$read_len\_1\n$a\n";
406 #print $O2 ">reads_$read_count\_$read_len\_2\n$b\n";
407 } else {
408 @strs=($b,$a);
409 #print $O1 ">reads_$read_count\_$read_len\_1\n$b\n";
410 #print $O2 ">reads_$read_count\_$read_len\_2\n$a\n";
412 if ($fq) {
413 $id="FC00XX:${read_count}:${pos}:${i}:$read_len";
414 &write2files(\$id,\@strs,\$Qc);
415 } else {
416 $id="reads_${read_count}_${pos}_${i}_$read_len";
417 &write2files(\$id,\@strs,undef);
419 $ISize->{$i}--;
424 __END__
425 find ./out/ -name '*.fa' | while read a; do b=`basename $a`; echo "#$ -N \"sim_${b}\"" >./shell/sim_${b}.sh;echo "#$ -cwd -r y -l vf=1G,p=1 -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH -o ./out/fq/sim_${b}.log -e ./out/fq/sim_${b}.err -hold_jid \"*_si\" " >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim1 >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim2 >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim3 >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim4 >> ./shell/sim_${b}.sh; done