new file: cell2loc.py
[GalaxyCodeBases.git] / perl / bsim / simVirusInserts.pl
blobacb6b271fbd95b9bb9db8f9d6b24501ef86f8a5a
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Data::Dump qw(ddx);
5 use Cwd qw(abs_path getcwd);
7 use FindBin 1.51 qw($RealBin);
8 use lib "$RealBin";
9 use simVirusInserts; # 同时输出甲基化与非甲基化的结果。
10 #use simVirusInsertsOld; # 只输出100%甲基化的reads(四种碱基的)
12 my $RefNratioMax = 0.01; # /Nn/
13 our $RefMratioMax = 0.02; # masked as lower case in *.mfa.gz
14 ($RefNratioMax,$RefMratioMax)=(0,0); # 现在需要更好的结果,です!
16 die "Simulate Directional libraries of Bisulfite-Sequencing data.\nUsage: $0 <Host> <Virus> <Outprefix> [ReadLen=90 [Ticks.ini]]\nInvoke as: mkdir sim90 && cd sim90 && $0 && cd ..\n" if @ARGV <3;
18 my ($Reff,$Virf,$outp,$ReadLen,$TicksINI)=@ARGV;
19 $ReadLen = 90 unless defined $ReadLen;
21 #$Reff='hs_ref_GRCh38.p2_chr18.mfa.gz';
22 #$Virf='HBV.AJ507799.2.fa.gz';
24 my $Refh = openfile($Reff);
25 #my $Refstr = getRefChr1st($Refh);
26 #my $RefLen = length $Refstr;
27 my ($pRefHash,$RefLen,$Refstr) = getRefChrHash($Refh);
28 close $Refh;
29 my $Virfh = openfile($Virf);
30 #my $Virstr = getRefChr1st($Virfh);
31 #my $VirLen = length $Virstr;
32 my ($pVirHash,$VirLen,$Virstr) = getRefChrHash($Virfh);
33 close $Virfh;
34 # Now $Virstr and $VirLen are from the shortest seq.
35 $Virstr .= $Virstr; # circle
36 for my $k (keys %{$pVirHash}) {
37 $pVirHash->{$k} .= $pVirHash->{$k}; # circle
40 open INI,'>',$outp.'.ini' or die $!;
41 my $Refabsf = abs_path($Reff);
42 my $Virabsf = abs_path($Virf);
43 my $Cwdabs = abs_path(getcwd()."/../run");
44 print INI <<"CONTENT";
45 [RefFiles]
46 HostRef=$Refabsf
47 VirusRef=$Virabsf
49 [Output]
50 WorkDir=$Cwdabs
51 ProjectID=$outp
53 [Parameters]
54 Aligner=bwa-meth
55 # `bwa-meth` or `BSseeker2`
57 CONTENT
59 my %fqFiles;
61 my $ShortLen = int(0.5 + $ReadLen/2);
62 my $TinyLen1 = 5;
63 my $TinyLen2 = 10;
64 my $TinyLen3 = 20;
65 my $LongLen = int(0.5 + $ReadLen*4/3);
66 my @PEins = (100,150,200,420);
67 my $maxPEins = 750; # [77,96]
68 if ($ReadLen > 96) {
69 @PEins = (150,220,350,530);
70 $maxPEins = 500;
72 if ($ReadLen < 77) {
73 @PEins = (60,80,120,250);
74 $maxPEins = 250;
76 warn "ReadLen:$ReadLen, PE_ins:@PEins, maxPEins:$maxPEins.\n";
78 my %Para = (
79 PEinsertLen => $PEins[2],
80 SeqReadLen => $ReadLen,
81 RefNratioMax => $RefNratioMax,
82 RefLen => $RefLen,
83 VirLen => $VirLen,
84 VirFrag => 2 * $ReadLen,
85 OutPrefix => $outp . '_m13FG',
88 my ($pRefticks,$pVirticks);
89 if (defined $TicksINI and -f $TicksINI) {
90 print STDERR "Loading Ticks from [$TicksINI]: ";
91 my $inih = openfile($TicksINI);
92 while (<$inih>) {
93 chomp;
94 if (/^Refticks=\d/) {
95 my @dat = split /=/,$_;
96 @dat = split /,/,$dat[1];
97 $pRefticks = \@dat;
98 print STDERR "Ref:",scalar @dat,', ';
99 } elsif (/^Virticks=\d/) {
100 my @dat = split /=/,$_;
101 @dat = split /,/,$dat[1];
102 $pVirticks = \@dat;
103 print STDERR "Vir:",scalar @dat,', ';
106 my $virtickmax = $#$pVirticks;
107 if ($virtickmax < $#$pRefticks) {
108 for my $i ( (1+$virtickmax) .. $#$pRefticks ) {
109 my $j = int($i - 0.8*$virtickmax); # must use `$virtickmax` instead of `$#$pVirticks` !
110 while ( abs($j)>$virtickmax ) {
111 $j = abs($j) - $virtickmax;
113 $pVirticks->[$i] = $pVirticks->[$j];
115 print STDERR "Vir padded. ";
117 print STDERR "\b\b.\n";
118 } else {
119 my $SeqReadLen = $Para{SeqReadLen};
120 my $RefBorder = $maxPEins + 1000;
121 $pRefticks = getticks($RefBorder,$Refstr,$RefLen,$maxPEins,$RefNratioMax);
122 $pVirticks = getticks($Para{VirFrag},$Virstr,$VirLen,int(0.9+ 0.5*$Para{VirFrag}),$RefNratioMax);
124 $Para{pRefticks} = $pRefticks;
125 $Para{pVirticks} = $pVirticks;
127 dosim($pRefHash,$pVirHash,\%Para);
128 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
130 $Para{PEinsertLen} = $PEins[2];
131 $Para{VirFrag} = $ShortLen;
132 $Para{OutPrefix} = $outp . '_m2D';
133 dosim($pRefHash,$pVirHash,\%Para);
134 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
136 $Para{PEinsertLen} = $PEins[2];
137 $Para{VirFrag} = $TinyLen1;
138 $Para{OutPrefix} = $outp . '_m2Dty1';
139 dosim($pRefHash,$pVirHash,\%Para);
140 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
142 $Para{PEinsertLen} = $PEins[2];
143 $Para{VirFrag} = $TinyLen2;
144 $Para{OutPrefix} = $outp . '_m2Dty2';
145 dosim($pRefHash,$pVirHash,\%Para);
146 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
148 $Para{PEinsertLen} = $PEins[2];
149 $Para{VirFrag} = $TinyLen3;
150 $Para{OutPrefix} = $outp . '_m2Dty3';
151 dosim($pRefHash,$pVirHash,\%Para);
152 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
154 $Para{PEinsertLen} = $PEins[3];
155 $Para{VirFrag} = $LongLen;
156 $Para{OutPrefix} = $outp . '_m458AE';
157 dosim($pRefHash,$pVirHash,\%Para);
158 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
160 $Para{PEinsertLen} = $PEins[2];
161 $Para{VirFrag} = $LongLen;
162 $Para{OutPrefix} = $outp . '_m9';
163 dosim($pRefHash,$pVirHash,\%Para);
164 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
166 $Para{PEinsertLen} = $PEins[1];
167 $Para{VirFrag} = $LongLen;
168 $Para{OutPrefix} = $outp . '_m6';
169 dosim($pRefHash,$pVirHash,\%Para);
170 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
172 $Para{PEinsertLen} = $PEins[1];
173 $Para{VirFrag} = $ShortLen;
174 $Para{OutPrefix} = $outp . '_m7C';
175 dosim($pRefHash,$pVirHash,\%Para);
176 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
178 $Para{PEinsertLen} = $PEins[0];
179 $Para{VirFrag} = $ShortLen;
180 $Para{OutPrefix} = $outp . '_mB';
181 dosim($pRefHash,$pVirHash,\%Para);
182 $fqFiles{$Para{OutPrefix}} = [$Para{PEinsertLen},$Para{VirFrag}];
184 my @fps = sort keys %fqFiles;
185 print INI "[DataFiles]\n";
186 for my $i (0 .. $#fps) {
187 print INI "F$fps[$i].1=",abs_path($fps[$i].'.b1.fq.gz'),"\n";
188 print INI "F$fps[$i].2=",abs_path($fps[$i].'.b2.fq.gz'),"\n";
190 print INI "[Base4Files]\n";
191 for my $i (0 .. $#fps) {
192 print INI "F$fps[$i].1=",abs_path($fps[$i].'.r1.fq.gz'),"\n";
193 print INI "F$fps[$i].2=",abs_path($fps[$i].'.r2.fq.gz'),"\n";
195 print INI "\n[InsertSizes]\n";
196 for my $i (0 .. $#fps) {
197 print INI "F$fps[$i]=",$fqFiles{$fps[$i]}->[0],"\nF$fps[$i].SD=",$i+1,"\n";
199 print INI "\n[Simed]\n";
200 for my $i (0 .. $#fps) {
201 print INI "F$fps[$i].VirFrag=",$fqFiles{$fps[$i]}->[1],"\n";
203 print INI 'Refticks=',join(',',@$pRefticks),"\n";
204 print INI 'Virticks=',join(',',@$pVirticks),"\n";
205 close INI;
207 __END__
208 ./simVirusInserts.pl hs_ref_GRCh38.p2_chr18.mfa.gz X04615.fa.gz simout
210 grep -h \> *_m*.Ref.fa | sed 's/^simout_m//'|sed 's/.Ref.fa:>/\t/'|sed 's/Ref_/Ref:/g'|sed 's/Vir_/Vir:/'|sed 's/R_/R:/'|sed 's/_/ /g'|cat -n >simed.lst
212 ~/git/toGit/perl/bsim/simVirusInserts.pl ~/nas/Ref/GRCh38_no_alt_analysis_set.fna.gz ~/nas/Ref/HBV.X04615.fa.gz s150 150
214 grep -h \> ../../sim90/*.Ref.fa |sed 's/>Ref_/Ref:/g'|sed 's/Vir_/Vir:/'|sed 's/R_/R:/'|sed 's/_/ /g'|cat -n