modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / pIRS_simulator / extra / run_simulate_reads.pl
blob3b7999022563ad36f36216dfe5fcf12c24b0d29f
1 #! /usr/bin/perl
2 use strict;
3 use warnings;
4 use FindBin qw($Bin);
5 use Getopt::Long;
7 =head1
8 Description
10 It is a program for simulating illumina PE reads, with a series of problems generate
11 by illumina sequencing machine, such as insertsize distribution, sequencing error and
12 quality score, GC bias.
13 User should set the value of insertsize_mean and insertsize_sd, they are the mean value
14 and standard deviation of the normal distribution that used as the model function when
15 simulating insertsize distribution, usually we set the insertsize_sd to be 1/20 of the
16 insertsize_mean. The normal distribution function model we used in this program is
17 f(x)=1/¦Ò/sqrt(2*pi)*exp((x-¦Ì)**2/(2*¦Ò**2)), and the insertsize distribution is simulated
18 by Box-muller method.
19 This program simulate illumina sequencing error, quality score and GC bias according to
20 the statistical profile, user can set path of profile or using the default file in this
21 program package, which is generated by large real sequencing data.
22 If you want to simulate diploid reads, you should set the rate of heterozygosis SNP,
23 heterozygosis Indel and big structure variation, but remember that these are only exists
24 in diploid.
25 At last, you should set another several parameters: whether simulate GC bias and quality
26 value, in addition, set the read length, coverage of reads, input sequence, max number of
27 running jobs, running mode, memory requirements and output prefix, the input sequence must
28 be set, because there is not default value.
30 =head1 History and Contributions
32 Version 1.0: Lu jianliang and Yue zhen. 2010-03-01
33 Version 2.0: Fan Wei, Shi Yujian, Hu Xuesong, Yuan Jianying. 2011-07-07
34 Version 3.0: Fan Wei, Shi Yujian, Hu Xuesong, Yuan Jianying. 2011-08-03
36 =head1 Contact and Version
38 Contact: yuanjianying@genomics.org.cn shiyujian@genomics.org.cn
39 Version: 3.0 Data: 2011-08-03
41 =head1 Usage
43 perl run_simulate_reads.pl [Option] <lib.lst>
45 =head1 Option
47 -i <string> input reference genome sequence, *.fa/*.fa.gz, no default value
48 -f <string> input error profile, default: $Bin/statistical_file_package/hum20110701.bwanosnp.count.matrix
49 -F <string> input GC depth profile for simulate GC bias, default: $Bin/statistical_file_package/stat_100.dat.gc
50 -s <double> set the heterozygous SNP rate of the diploid genome, reference value: 0.001, default:0
51 -a <double> set the value of transition divided by transvertion for SNP, default:2
52 -d <double> set the heterozygous indel rate of the diploid genome, reference value: 0.0001, default:0
53 -v <double> set the big structure variation rate of the diploid genome, reference value: 0.000001, default:0
54 -e <double> set the average error rate over all cycles, should be set between 0 and 1, or set -1 as default value: error profile average error ratio
55 -g <int> set whether simulate GC bias, 0:no, 1:yes, default:1
56 -q <int> set whether simulate quality value, 0:no,*.fa, 1:yes,*.fq, default:1
57 -j <int> set the max job, default: 1
58 -o <string> set output file prefix default:illumina
59 -c <int> set output file type, 0:text, 1:*.gz, default:1.
60 -r <int> set program running mode, 1:run directly(for testing node), 0:submit job (qsub), default:0
61 -m <int> set the memory requirements while running mode is set 0(submit job), unit of memory:G, default:0.5
62 -h output help infomation
64 =head1 <lib.lst>
66 In <lib.lst>, you must set parameter below:
68 insertsize_mean <int> set the average value of insert size
69 read_length <int> set read length, read1 and read2 have the same length
70 coverage <double> set the sequencing coverage(sometimes called depth)
71 insertsize_sd <int> set the standard deviation of insert sizes,usually set to be 1/20 of the insertsize_mean
72 is_cyclization <int> set whether cyclize insert sequence,(influence on PE-reads direction), 0: read1-forward read2-reverse, 1: read1-reverse read2-forward, default:0.
74 when you creat the lib.lst file, you need to set parameter orderly as follow:
76 insertsize_mean/read_length/coverage/insertsize_sd/is_cyclization
78 for example:
79 170 100 20 10 0
80 500 100 20 20 0
81 800 100 10 40 0
82 2500 100 5 100 1
84 =head1 Example
86 1. perl run_simulate_reads.pl -i ref_sequence.fa lib.lst
87 every parameter use the default one.
88 2. perl run_simulate_reads.pl -i ref_sequence.fa -g 1 -q 1 lib.lst
89 set simulate GC bias and quality value (*.fq)
90 3. perl run_simulate_reads.pl -i ref_sequence.fa -e 0 -c 0 lib.lst
91 set the average error rate over all cycles and output file is text type.
92 4. perl run_simulate_reads.pl -i ref_sequence.fa -j 2 -o test lib.lst
93 set the max number job and the output file prefix.
94 5. perl run_simulate_reads.pl -i ref_sequence.fa -r 0 -m 0.8 lib.lst
95 set the running mode and the memory size.
96 6. perl run_simulate_reads.pl -i ref_sequence.fa -s 0.001 -d 0.0001 lib.lst
97 the genome is diploid and you want to produce heterozygosis SNPs heterozygosis Indels in reads.
99 =cut
100 ################################################################################################
102 my ($inputref,$error_profile,$GC_profile,$heterSNP_rate,$snp_transition_by_transvertion_rate,$heterIndel_rate,$SV_rate,$Is_simulate_GC_bias,$Is_simulate_quality,$error_rate,$max_job,$output_type,$output_prefix,$run_mode,$memory_size,$help);
104 GetOptions(
105 "i:s"=>\$inputref,
106 "f:s"=>\$error_profile,
107 "F:s"=>\$GC_profile,
108 "s:f"=>\$heterSNP_rate,
109 "a:f"=>\$snp_transition_by_transvertion_rate,
110 "d:f"=>\$heterIndel_rate,
111 "v:f"=>\$SV_rate,
112 "e:f"=>\$error_rate,
113 "g:i"=>\$Is_simulate_GC_bias,
114 "q:i"=>\$Is_simulate_quality,
115 "j:i"=>\$max_job,
116 "c:i"=>\$output_type,
117 "o:s"=>\$output_prefix,
118 "r:i"=>\$run_mode,
119 "m:f"=>\$memory_size,
120 "h"=>\$help
122 die `pod2text $0` if ($help);
123 die `pod2text $0` if (!$inputref);
124 my $lib_lst = shift;
126 #############################Set default value#############################################
127 $error_profile = "$Bin/statistical_file_package/error_profile/hum20110701.bwanosnp.count.matrix" if (!$error_profile);
128 $GC_profile = "$Bin/statistical_file_package/GC_depth_profile/stat_100.dat.gc" if (!$GC_profile);
129 $heterSNP_rate=0 if (!defined $heterSNP_rate);
130 $snp_transition_by_transvertion_rate=2 if (!defined $snp_transition_by_transvertion_rate);
131 $heterIndel_rate=0 if (!defined $heterIndel_rate);
132 $SV_rate=0 if (!defined $SV_rate);
133 $Is_simulate_GC_bias=1 if (!defined $Is_simulate_GC_bias);
134 $Is_simulate_quality=1 if (!defined $Is_simulate_quality);
135 $error_rate=-1 if (!defined $error_rate);
136 $max_job=1 if (!defined $max_job);
137 $output_type=1 if (!defined $output_type);
138 $output_prefix="illumina" if (!defined $output_prefix);
139 $run_mode=0 if (!defined $run_mode);
140 $memory_size=0.5 if (!defined $memory_size);
142 die ("the -s(heterSNP_rate) should be set between 0 and 1\n") if ($heterSNP_rate<0 || $heterSNP_rate>=1);
143 die ("the -a(transition/transvertion rate) should be set greater than 0\n") if ($snp_transition_by_transvertion_rate<0);
144 die ("the -d(heterIndel_rate) should be set between 0 and 1\n") if ($heterIndel_rate<0 || $heterIndel_rate>=1);
145 die ("the -v(SV_rate) should be set between 0 and 1\n") if ($SV_rate<0 || $SV_rate>=1);
146 die ("the -g(Is_simulate_GC_bias) should be set 0 or 1\n")if ($Is_simulate_GC_bias!=0 && $Is_simulate_GC_bias!=1);
147 die ("the -q(Is_simulate_quality) should be set 0 or 1\n")if ($Is_simulate_quality!=0 && $Is_simulate_quality!=1);
148 die ("the -e(error_rate) should be set between 0 and 1, or set -1 for default error rate\n") if ($error_rate>=1 || $error_rate != -1 && $error_rate < 0);
149 die ("the -j(max number job) should be set greater than 1\n") if ($max_job<1);
150 die ("the -c(ouput_type) should be set 0 or 1\n") if ($output_type!=0 && $output_type!=1);
151 die ("the -r(run_mode) should be set 0 or 1\n") if ($run_mode!=0 && $run_mode!=1);
153 #############################Generate Run Script#############################################
154 if($heterSNP_rate != 0 || $heterIndel_rate != 0 || $SV_rate!=0)
156 open OUT1, ">simulate_snp_indel_seq.sh" or die "$!";
157 print OUT1 "$Bin/simulate_snp_indel_seq -i $inputref -s $heterSNP_rate -v $SV_rate -a $snp_transition_by_transvertion_rate -d $heterIndel_rate -c $output_type -o ./$output_prefix && echo OK \n";
158 print STDERR "Start simulate snp or indel of reference ... \n";
159 if($run_mode){
160 `sh simulate_snp_indel_seq.sh`;
161 }else{
162 # `qsub -l vf=0.3G -cwd simulate_snp_indel_seq.sh`;
163 my $memory = $memory_size."G";
164 `perl $Bin/qsub-sge.pl -verbose -maxjob 1 -resource vf=$memory -reqsub simulate_snp_indel_seq.sh`;
166 print STDERR "Finished. \n";
167 close OUT1;
171 open IN, $lib_lst or die "$!";
172 open OUT2,">simulate_illumina_reads.sh" or die "$!";
173 my $sleep_time = 0;
174 while(<IN>){
175 chomp;
176 my ($insertsize_mean,$read_length,$coverage,$insertsize_sd,$is_cyclization) = split(/\s+/);
177 if(!defined $insertsize_mean){last;}
178 die ("the mean insertsize should be set greater than 0\n") if ($insertsize_mean<=0);
179 die ("reads length should be set greater than 0\n") if ($read_length<=0);
180 die ("coverage should be set greater than 0\n") if ($coverage<=0);
181 die ("the insertsize_sd should be set greater than 0\n") if ($insertsize_sd<0);
182 die ("the is_cyclization should be set 0 or 1\n") if ($is_cyclization!=0 && $is_cyclization!=1);
184 my $ouput_dir = $output_prefix."_".$insertsize_mean."_".$read_length."_".$coverage."_".$insertsize_sd."_".$is_cyclization;
185 `mkdir $ouput_dir`;
187 my $ref_snp_indel_seq = $output_prefix;
188 if($heterSNP_rate != 0)
190 $ref_snp_indel_seq = $ref_snp_indel_seq.".snp";
192 if($heterIndel_rate != 0)
194 $ref_snp_indel_seq = $ref_snp_indel_seq.".indel";
196 if($SV_rate != 0)
198 $ref_snp_indel_seq = $ref_snp_indel_seq.".invertion";
200 if(!$output_type)
202 $ref_snp_indel_seq = $ref_snp_indel_seq.".fa";
203 }else{
204 $ref_snp_indel_seq = $ref_snp_indel_seq.".fa.gz";
207 $sleep_time++;
209 if($heterSNP_rate != 0 || $heterIndel_rate != 0)
211 if($run_mode == 1){
212 print OUT2 "sleep $sleep_time && $Bin/simulate_illumina_reads -i $inputref -I $ref_snp_indel_seq -s $error_profile -d $GC_profile -m $insertsize_mean -l $read_length -x $coverage -e $error_rate -v $insertsize_sd -g $Is_simulate_GC_bias -q $Is_simulate_quality -f $is_cyclization -c $output_type -o ./$ouput_dir/$output_prefix >./$ouput_dir/simulate_$insertsize_mean.o 2>./$ouput_dir/simulate_$insertsize_mean.e && echo OK \n";
213 }else{
214 print OUT2 "sleep $sleep_time && $Bin/simulate_illumina_reads -i $inputref -I $ref_snp_indel_seq -s $error_profile -d $GC_profile -m $insertsize_mean -l $read_length -x $coverage -e $error_rate -v $insertsize_sd -g $Is_simulate_GC_bias -q $Is_simulate_quality -f $is_cyclization -c $output_type -o ./$ouput_dir/$output_prefix && echo OK \n";
216 }else{
217 if($run_mode == 1){
218 print OUT2 "sleep $sleep_time && $Bin/simulate_illumina_reads -i $inputref -s $error_profile -d $GC_profile -m $insertsize_mean -l $read_length -x $coverage -e $error_rate -v $insertsize_sd -g $Is_simulate_GC_bias -q $Is_simulate_quality -f $is_cyclization -c $output_type -o ./$ouput_dir/$output_prefix >./$ouput_dir/simulate_$insertsize_mean.o 2>./$ouput_dir/simulate_$insertsize_mean.e && echo OK \n";
219 }else{
220 print OUT2 "sleep $sleep_time && $Bin/simulate_illumina_reads -i $inputref -s $error_profile -d $GC_profile -m $insertsize_mean -l $read_length -x $coverage -e $error_rate -v $insertsize_sd -g $Is_simulate_GC_bias -q $Is_simulate_quality -f $is_cyclization -c $output_type -o ./$ouput_dir/$output_prefix && echo OK \n";
224 close IN;
225 close OUT2;
227 #############################Contral Jobs Running#############################################
228 print STDERR "Start simulate reads ... \n";
229 if($run_mode){
230 open IN2, "simulate_illumina_reads.sh" or die "$!";
231 open OUT3, ">job_contral.stat" or die "$!";
232 close OUT3;
233 my $max_job_tem = $max_job;
234 my $job_num = 0;
235 while(<IN2>){
236 chomp;
237 $job_num++;
238 # print STDERR "$job_num $max_job_tem \n";
239 # print STDERR "Run: $max_job $job_num ... \n";
240 if($job_num < $max_job_tem){
241 my $run_script = $_." & ";
242 # print STDERR "Run: $run_script... \n";
243 system("nohup $_ && echo $job_num >>job_contral.stat &");
244 }else{
245 if($job_num == $max_job_tem){system("nohup $_ && echo $job_num >>job_contral.stat &"); }
246 ##contral process##
247 while(1){
248 sleep 10;
249 open IN3, "job_contral.stat" or die "$!";
250 my $line_num = 0;
251 while(<IN3>){
252 $line_num++;
254 close IN3;
255 if($job_num - $line_num >= 0 && $job_num - $line_num < $max_job){ $max_job_tem = $max_job_tem + ($max_job - ($job_num - $line_num));last;}
259 ##wait for all the jobs finished##
260 while(1){
261 sleep 10;
262 open IN3, "job_contral.stat" or die "$!";
263 my $line_num2 = 0;
264 while(<IN3>){
265 $line_num2++;
267 close IN3;
268 # print STDERR "$job_num $line_num2 \n";
269 if($job_num == $line_num2){last;}
272 `rm job_contral.stat`;
274 # print STDERR "BEGIN!\n";
275 # `sh simulate_reads.sh &`;
276 }else{
277 my $memory = $memory_size."G";
278 `perl $Bin/qsub-sge.pl -verbose -maxjob $max_job -resource vf=$memory -reqsub simulate_illumina_reads.sh`;
281 close IN2;
282 print STDERR "All finished! \n";