tools/simulators/popudepth/simsolexa_reads.pl

   1 #!/usr/bin/perl
   2 use strict;
   3 use warnings;
   4 use Getopt::Long;
   5
   6 =head1 Description
   7
   8         It is a program for simulating solexa reads,with a series of problems generate by solexa
   9         sequencing machine in though,such as insertsize distribution,error rate,heterozygosis SNP
  10         and heterozygosis Indel in diploid. User should set the value of insertsize_mean and
  11         insertsize_sd ,they are the mean value and standard deviation of the normal distribution
  12         that used as the model function when simulating insertsize distribution.The normal
  13         distribution function model we used in this program is
  14         f(x)=1/σ/sqrt(2*pi)/exp((x-μ)**2 / (2*σ**2)) ,and the insertsize distribution
  15         range is limited in (μ-5σ，μ+5σ), which will cover almost all the data.To simulate
  16         illumina error rates on different cycles,we use the function f(x)=0.00001*x**3 as a model
  17         function,because the error rate of a number of bases at the end is much larger than
  18         other area in a read. User should set the heterozygosis SNP rate and heterozygosis Indel
  19         rate as also ,if you want to use those function,but remember that heterozygosis SNP rate
  20         and heterozygosis Indel rate is only exists in diploid. At last ,you should set another
  21         several parameters ,read length ,coverage of reads ,input sequence and output prefix,the
  22         input sequence and output prefix must be set ,because there are not default value.
  23
  24 =head1 Contact and Version
  25
  26         Contact:        huxuesong@genomics.cn lujianliang@genomics.org.cn yuezhen@genomics.org.cn
  27         Version:        1.1             Data:   2012-4-13
  28
  29 =head1 Option
  30
  31         -input                  <string>        input reference genome sequence
  32         -read_len               <int>           set read length, read1 and read2 have the same length,default:100
  33         -coverage               <int>           set the sequencing coverage(sometimes called depth),default:40
  34         -insertsize_mean        <int>           set the average value of insert size,default:500
  35         -insertsize_sd          <int>           set the standard deviation of insert sizes, default:25
  36         -error_rate             <float>         set the average error rate over all cycles,default:0.01
  37         -heterSNP_rate          <float>         set the heterozygous SNP rate of the diploid genome,default:0
  38         -heterIndel_rate        <float>         set the heterozygous indel rate of the diploid genome,default:0
  39         -output                 <string>        output file prefix
  40         -fq                                     output to fastaq file
  41         -b                                      No pause for batch runs
  42         -help                                   output help infomation
  43
  44 =head1 Example
  45
  46         1. perl simulate_solexa_reads.pl -input ref_sequence.fa  -o humen
  47         erery parameter use the default one .
  48         2. perl simulate_solexa_reads.pl -input ref_sequence.fa -read_len 150 -coverage 20 -o humen
  49         just set read length and coverage you needed.
  50         3. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen -insertsize_mean 600 -insertsize_sd 30
  51            -error_rate 0.01
  52         set insertsize distribution and error rate.
  53         4. perl simulate_solexa_reads.pl -input ref_sequence.fa -o humen -heterSNP_rate 0.001 -heterIndel_rate 0.001
  54         the genome is diploid and you want to produce heterozygosis SNPs  heterozygosis Indels in reads .
  55
  56 =cut
  57 ################################################################################################
  58 my ($inputref,$read_len,$coverage,$insertsize_mean,$insertsize_sd,$error_rate,$heterSNP_rate);
  59 my ($heterIndel_rate,$ref_ploid,$n_ploid,$o,$help,$fq,$fqmode,$opt_b,$ext);
  60
  61 GetOptions(
  62         "input:s"=>\$inputref,
  63         "read_len:i"=>\$read_len,
  64         "coverage:i"=>\$coverage,
  65         "insertsize_mean:i"=>\$insertsize_mean,
  66         "insertsize_sd:i"=>\$insertsize_sd,
  67         "error_rate:f"=>\$error_rate,
  68         "fq"=>\$fq,
  69         "heterSNP_rate:f"=>\$heterSNP_rate,
  70         "heterIndel_rate:f"=>\$heterIndel_rate,
  71         "output:s"=>\$o,
  72         "b"=>\$opt_b,
  73         "help"=>\$help
  74 );
  75 die `pod2text $0` if ($help);
  76 die `pod2text $0` if (!$inputref || !$o);
  77 #############################Set default value#############################################
  78 $read_len=100 if (!defined $read_len);
  79 $coverage=40 if (!defined $coverage);
  80 $insertsize_mean=500 if (!defined $insertsize_mean);
  81 $insertsize_sd=25 if (!defined $insertsize_sd);
  82 $error_rate=0.01 if (!defined $error_rate);
  83 if (defined $fq) {
  84         $fqmode='FastaQ';
  85         $ext='fq';
  86 } else {$fqmode='FASTA';$ext='fa';}
  87 $heterSNP_rate=0 if (!defined $heterSNP_rate);
  88 $heterIndel_rate=0 if (!defined $heterIndel_rate);
  89 #############################Main Program##################################################
  90 warn "[!]$fqmode mode.
  91 From [$inputref] to [$o]_${read_len}_${insertsize_mean}_{1,2}.$ext
  92 Read Len:[$read_len] Error Rate:[$error_rate] Coverage:[$coverage] Insertsize:[${insertsize_mean}±$insertsize_sd]
  93 HeterSNP Rate:$heterSNP_rate HeterIndel Rate:$heterIndel_rate\n";
  94 if (! $opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
  95
  96 my ($O1,$O2);
  97 open REF,'<',$inputref or die "[x]Can't open file $inputref\n";
  98 system("mkdir -p $o && rmdir $o");
  99 open $O1,">${o}-l${read_len}-i${insertsize_mean}_1.$ext" || die "[x]Can't open file ${o}-l${read_len}-i${insertsize_mean}_1.$ext\n";
 100 open $O2,">${o}-l${read_len}-i${insertsize_mean}_2.$ext" || die "[x]Can't open file ${o}-l${read_len}-i${insertsize_mean}_1.$ext\n";
 101 my (%insertsize,%read_info,$Qc);
 102 $/=">";
 103
 104 while (<REF>) {
 105         chomp;
 106         next if ($_ eq "");
 107         my $line=$_;
 108         my $id=(split (/\s+/,$line))[0];
 109         my $refseq=(split (/\n/,$line,2))[1];
 110         $refseq=~s/\n//g;
 111         $refseq=~s/[BDEFHIJKLMNOPQRSUVWXYZ]+//g;
 112         my $length=length $refseq;
 113         print STDERR ">$id:$length ...";
 114         my $read_num_pair;
 115         if ($heterSNP_rate>0 || $heterIndel_rate>0) {
 116                 $read_num_pair=int ($length*$coverage/(2*2*$read_len));
 117         }else{
 118                 $read_num_pair=int ($length*$coverage/(2*$read_len));
 119         }
 120         &insertsize_distribution($insertsize_mean,$insertsize_sd,$read_num_pair,\%insertsize);
 121         print STDERR "\n";
 122         if ($error_rate>0){
 123                 if ($error_rate>0.0001){
 124                         &error_distribution($length,$read_len,$error_rate,$read_num_pair,\%read_info,\$Qc);
 125                 }else{
 126                         print "The error rate is smaller than basic error rate 0.001\n";
 127                 }
 128         }
 129         &getreads($refseq,\%insertsize,\%read_info);
 130         foreach my $del (sort {$a<=>$b} keys %insertsize) {
 131                 delete $insertsize{$del};
 132         }
 133         foreach my $del (sort {$a<=>$b} keys %read_info) {
 134                 delete $read_info{$del};
 135         }
 136
 137         if ($heterSNP_rate>0 || $heterIndel_rate>0) {   ###heterozygous SNP and heterozygous indel exists is diploid
 138                 my $refseqsnp=&heter_SNP_Indel($refseq,$heterSNP_rate,$heterIndel_rate,$id);
 139                 $length=length $refseqsnp;
 140                 $read_num_pair=int ($length*$coverage/(2*2*$read_len));
 141                 &insertsize_distribution($insertsize_mean,$insertsize_sd,$read_num_pair,\%insertsize);
 142                 if ($error_rate>0){
 143                         if ($error_rate>0.0001) {
 144                                 &error_distribution($length,$read_len,$error_rate,$read_num_pair,\%read_info,\$Qc);
 145                         } else { print "The error rate is smaller than basic error rate 0.001\n"; }
 146                 }
 147                 &getreads($refseqsnp,\%insertsize,\%read_info);
 148                 foreach my $del (sort {$a<=>$b} keys %insertsize) {
 149                         delete $insertsize{$del};
 150                 }
 151                 foreach my $del (sort {$a<=>$b} keys %read_info) {
 152                         delete $read_info{$del};
 153                 }
 154         }
 155 }
 156 $/="\n";
 157 close REF;
 158 close $O1;
 159 close $O2;
 160 warn "[!] Done !\n";
 161 #############################Subroutines###################################################
 162 ########simulate the insertsize distribution with the model of normal distribution function
 163 ########The insertsize range is limited in (μ-5σ，μ+5σ), which covers almost all the data.
 164 sub insertsize_distribution{
 165         (my $mean,my $sd,my $rd_num,my $isize)=@_;
 166         my ($total,$total1);
 167         my $pi=3.1415926535;
 168         for (my $i=$mean-5*$sd;$i<=$mean+5*$sd;$i++) {
 169                 $isize->{$i}=1/$sd/sqrt(2*$pi)/exp(($i-$mean)**2 / (2*$sd**2));
 170                 $total+=$isize->{$i};
 171         }
 172         for (my $i=$mean-5*$sd;$i<=$mean+5*$sd;$i++) {
 173                 $isize->{$i}=0.5+$isize->{$i}/$total*$rd_num;
 174                 $isize->{$i}=int $isize->{$i};
 175                 $total1+=$isize->{$i};
 176         }
 177         $isize->{$mean}+=$rd_num-$total1 if ($rd_num>$total1);
 178 }
 179
 180 #######Simulate illumina error distribution on different cycles,
 181 #######with the model function f(x)=0.00001*x**3
 182 sub error_distribution{
 183         my ($seq_len, $rd_len, $er_rate, $rd_num, $rd_info, $Qcr)=@_;
 184         my (%hash,$total,$t,$q);
 185         my $basic_err=0.0001;
 186         my $total_err=($er_rate-$basic_err)*$rd_len;
 187         for (my $i=1;$i<=$rd_len;$i++) {
 188                 $hash{$i}=0.00001*$i**3;
 189                 $total+=$hash{$i};
 190         }
 191         $$Qcr='';
 192         print STDERR "[!]% & Q: ";
 193         for (my $circle=1;$circle<=$rd_len;$circle++) {
 194                 $t = $basic_err+$hash{$circle}/$total*$total_err;
 195                 $hash{$circle}=$t;
 196                 $q=int(-10*log($t)/log(10));
 197                 $t *= 100;
 198                 printf STDERR "%u:%.1f,%u ",$circle,$t,$q;
 199                 $$Qcr .= chr(64+$q);
 200                 $hash{$circle}=$hash{$circle}*$rd_num*2;
 201                 $hash{$circle}=int $hash{$circle};
 202         }
 203         print STDERR "\n[!]Qstr:[$Qc]\n";
 204
 205         for(my $circle=1;$circle<=$rd_len;$circle++) {
 206                 while ($hash{$circle}>0) {
 207                         my $j=int (rand $rd_num);
 208                         push @{$rd_info->{$j}},$circle;
 209                         $hash{$circle}--;
 210                 }
 211         }
 212 }
 213 ####Rrealization of error sequencing
 214
 215 sub match{
 216         (my $seq,my $position,my $r_l)=@_;
 217         $position=int $position;
 218         my %get_match=(
 219                 "A"=>["T","G","C"],
 220                 "T"=>["A","G","C"],
 221                 "G"=>["A","T","C"],
 222                 "C"=>["A","T","G"],
 223                 "a"=>["t","g","c"],
 224                 "t"=>["a","g","c"],
 225                 "g"=>["a","t","c"],
 226                 "c"=>["a","t","g"],
 227         );
 228         my $s1=substr $seq,0,$position-1;
 229         my $s2=substr $seq,$position-1,1;
 230         my $s3=substr $seq,-($r_l-$position),$r_l-$position;
 231         my $k=int ((rand 100)%3);
 232         if ($s2 eq "A") {
 233                 $s2=$get_match{A}[$k];
 234         }elsif($s2 eq "T"){
 235                 $s2=$get_match{T}[$k];
 236         }elsif($s2 eq "G"){
 237                 $s2=$get_match{G}[$k];
 238         }elsif($s2 eq "C"){
 239                 $s2=$get_match{C}[$k];
 240         }elsif($s2 eq "a"){
 241                 $s2=$get_match{a}[$k];
 242         }elsif($s2 eq "t"){
 243                 $s2=$get_match{t}[$k];
 244         }elsif($s2 eq "g"){
 245                 $s2=$get_match{g}[$k];
 246         }else{
 247                 $s2=$get_match{c}[$k];
 248         }
 249         my $s;
 250         if ($position==1) {
 251                 $s=$s2.$s3;
 252         }elsif($position==$r_l){
 253                 $s=$s1.$s2;
 254         }else{
 255                 $s=$s1.$s2.$s3;
 256         }
 257         return $s;
 258 }
 259
 260 #####Getting insert sequence when heterozygous indel rate is bigger than 0
 261 sub getinsertion{
 262         my $num=shift @_;
 263         my $base;
 264         my @baseA=("A","T","G","C");
 265         for (my $i=1;$i<=$num;$i++) {
 266                 my $j=int ((rand 100)%4);
 267                 $base.=$baseA[$j];
 268         }
 269         return $base;
 270 }
 271 #####Produce heterozygous SNPs and indels in multiploid
 272 sub heter_SNP_Indel{
 273         (my $sequence,my $hsnp_rate,my $hindel_rate,my $ID)=@_;
 274 #       print "It's begin to get snp sequence !\n";
 275         my %get_match=(
 276                 "A"=>["T","G","C"],
 277                 "T"=>["A","G","C"],
 278                 "G"=>["A","T","C"],
 279                 "C"=>["A","T","G"],
 280                 "a"=>["t","g","c"],
 281                 "t"=>["a","g","c"],
 282                 "g"=>["a","t","c"],
 283                 "c"=>["a","t","g"],
 284         );
 285         my $seqlen=length $sequence;
 286         my @list;
 287         @list=split /\s*/,$sequence;
 288         $sequence="";
 289         if ($hsnp_rate>0) {
 290                 my $snp_num=$seqlen*$hsnp_rate;
 291                 $snp_num=int $snp_num;
 292                 for (my $i=1;$i<=$snp_num;$i++) {
 293 #                       print "$ID\t";
 294                         my $j=int (rand $seqlen);
 295 #                       my $q=$j+1;
 296 #                       print "$q\t$list[$j]\t";
 297                         my $k=$j%3;
 298                         $list[$j]= $get_match{$list[$j]}[$k];
 299 #                       print "$list[$j]\t$snp_num\t$seqlen\n";
 300                 }
 301                 if (($hsnp_rate>0) && ($hindel_rate<=0)){
 302                         foreach my $g (@list){
 303                                 $sequence.=$g;
 304                         }
 305                 }
 306         }
 307
 308         if ($hindel_rate>0) {
 309                 my %insertion;
 310                 my $indel=$seqlen*$hindel_rate;
 311                 my $insertion=$indel/2;
 312                 my $deletion=$indel/2;
 313                 my @list1=(2,3,6);
 314                 my $p=1;
 315                 foreach my $k (@list1) {
 316                         for (my $i=1;$i<=$insertion/$k;$i++) {
 317                                 my $j=int (rand $seqlen);
 318                                 my $insert=&getinsertion($p);
 319                                 $insertion{$j}=$insert;
 320                         }
 321                         $p++;
 322                 }
 323                 $p=1;
 324                 foreach my $i (@list1) {
 325                         for (my $j=1;$j<=$deletion/$i;$j++) {
 326                                 my $k=int (rand $seqlen);
 327                                 redo if ($k+$p>$seqlen);
 328                                 for (my $t=$k;$t<$k+$p;$t++){
 329                                         $list[$t]="";
 330                                 }
 331                         }
 332                         $p++;
 333                 }
 334                 my $cs=0;
 335                 foreach my $ce (sort {$a<=>$b} keys %insertion){
 336                         for (my $i=$cs;$i<=$ce;$i++){
 337                                $sequence.=$list[$i];
 338
 339                         }
 340                         $sequence.=$insertion{$ce};
 341                         $cs=$ce+1;
 342                 }
 343                 for (my $i=$cs;$i<$seqlen;$i++){
 344                         $sequence.=$list[$i];
 345
 346                 }
 347                 foreach my $i (keys %insertion){
 348                         delete $insertion{$i};
 349                 }
 350 #       print "Get deletion has been finished!\n";
 351         }
 352 #       print "Finish Get snp and indel sequence\n";
 353         return $sequence;
 354 }
 355 #########Print to File
 356 sub write2files($$$) {
 357         my ($idref,$stref,$Qcr)=@_;     # reads_$read_count\_$read_len\_1\n$a
 358         if ($Qcr) {
 359                 #my ($ra,$rb)=@$Qarref;
 360                 #push @Qa,chr(64+$_) for (@$ra);
 361                 print $O1 "\@$${idref}/1\n$$stref[0]\n+\n",join('',$$Qcr),"\n";
 362                 print $O2 "\@$${idref}/2\n$$stref[1]\n+\n",join('',$$Qcr),"\n";
 363         } else {
 364                 print $O1 ">$${idref}_1\n$$stref[0]\n";
 365                 print $O2 ">$${idref}_2\n$$stref[1]\n";
 366         }
 367 }
 368 #########Produce solexa reads
 369 sub getreads{
 370         my ($refseq,$ISize,$Rd_Info)=@_;
 371         my $length=length $refseq;
 372         my $read_count=0;
 373         for (my $i=$insertsize_mean-5*$insertsize_sd;$i<=$insertsize_mean+5*$insertsize_sd;$i++) {
 374                 while ($ISize->{$i}>0) {
 375                         my $pos=rand $length;
 376                         $pos=int $pos;
 377                         my $substr=substr $refseq,$pos,$i;
 378                         redo if (length $substr<$i);
 379                         my $a=substr $substr,0,$read_len;
 380                         my $b=substr $substr,-$read_len,$read_len;
 381                         if ($insertsize_mean <=1000) {
 382                                 $b=reverse $b;
 383                                 $b=~tr/ATGCatgc/TACGtacg/;
 384                         } elsif ($insertsize_mean >1000) {
 385                                 $a=reverse $a;
 386                                 $a=~tr/ATGCatgc/TACGtacg/;
 387                         } else {
 388                                 print "Insertsize can't be smaller than 0\n";
 389                         }
 390                         $read_count++;
 391                         if (exists $Rd_Info->{$read_count}) {
 392                                 foreach my $j (@{$Rd_Info->{$read_count}}) {
 393                                         my $k=int ((rand 10)%2);
 394                                         if ($k) {
 395                                                 $a=&match($a,$j,$read_len);
 396                                         } else {
 397                                                 $b=&match($b,$j,$read_len);
 398                                         }
 399                                 }
 400                         }
 401                         my $d=int ((rand 10)%2);
 402                         my ($id,@strs);
 403                         if ($d) {
 404                                 @strs=($a,$b);
 405                                 #print $O1 ">reads_$read_count\_$read_len\_1\n$a\n";
 406                                 #print $O2 ">reads_$read_count\_$read_len\_2\n$b\n";
 407                         } else {
 408                                 @strs=($b,$a);
 409                                 #print $O1 ">reads_$read_count\_$read_len\_1\n$b\n";
 410                                 #print $O2 ">reads_$read_count\_$read_len\_2\n$a\n";
 411                         }
 412                         if ($fq) {
 413                                 $id="FC00XX:${read_count}:${pos}:${i}:$read_len";
 414                                 &write2files(\$id,\@strs,\$Qc);
 415                         } else {
 416                                 $id="reads_${read_count}_${pos}_${i}_$read_len";
 417                                 &write2files(\$id,\@strs,undef);
 418                         }
 419                         $ISize->{$i}--;
 420                 }
 421         }
 422 }
 423
 424 __END__
 425 find ./out/ -name '*.fa' | while read a; do b=`basename $a`; echo "#$ -N \"sim_${b}\"" >./shell/sim_${b}.sh;echo "#$ -cwd -r y -l vf=1G,p=1 -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH -o ./out/fq/sim_${b}.log -e ./out/fq/sim_${b}.err -hold_jid \"*_si\" " >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim1 >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim2 >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim3 >> ./shell/sim_${b}.sh; echo ./simsolexa_reads.pl -fq -b -input $a -read_len 76 -coverage 5 -output ./out/fq/${b}.sim4 >> ./shell/sim_${b}.sh; done