4 use Data
::Dump
qw(ddx);
5 use Cwd
qw(abs_path getcwd);
7 use FindBin
1.51 qw($RealBin);
9 use simVirusInserts; # 同时输出甲基化与非甲基化的结果。
10 #use simVirusInsertsOld; # 只输出100%甲基化的reads(四种碱基的)
12 my $RefNratioMax = 0.01; # /Nn/
13 our $RefMratioMax = 0.02; # masked as lower case in *.mfa.gz
14 ($RefNratioMax,$RefMratioMax)=(0,0); # 现在需要更好的结果,です!
16 die "Simulate Directional libraries of Bisulfite-Sequencing data.\nUsage: $0 <Host> <Virus> <Outprefix> [ReadLen=90 [Ticks.ini]]\nInvoke as: mkdir sim90 && cd sim90 && $0 && cd ..\n" if @ARGV <3;
18 my ($Reff,$Virf,$outp,$ReadLen,$TicksINI)=@ARGV;
19 $ReadLen = 90 unless defined $ReadLen;
21 #$Reff='hs_ref_GRCh38.p2_chr18.mfa.gz';
22 #$Virf='HBV.AJ507799.2.fa.gz';
24 my $Refh = openfile($Reff);
25 #my $Refstr = getRefChr1st($Refh);
26 #my $RefLen = length $Refstr;
27 my ($pRefHash,$RefLen,$Refstr) = getRefChrHash($Refh);
29 my $Virfh = openfile($Virf);
30 #my $Virstr = getRefChr1st($Virfh);
31 #my $VirLen = length $Virstr;
32 my ($pVirHash,$VirLen,$Virstr) = getRefChrHash($Virfh);
34 # Now $Virstr and $VirLen are from the shortest seq.
35 $Virstr .= $Virstr; # circle
36 for my $k (keys %{$pVirHash}) {
37 $pVirHash->{$k} .= $pVirHash->{$k}; # circle
40 open INI,'>',$outp.'.ini' or die $!;
41 my $Refabsf = abs_path($Reff);
42 my $Virabsf = abs_path($Virf);
43 my $Cwdabs = abs_path(getcwd()."/../run");
44 print INI <<"CONTENT";
55 # `bwa-meth` or `BSseeker2`
61 my $ShortLen = int(0.5 + $ReadLen/2);
65 my $LongLen = int(0.5 + $ReadLen*4/3);
66 my @PEins = (100,150,200,420);
67 my $maxPEins = 750; # [77,96]
69 @PEins = (150,220,350,530);
73 @PEins = (60,80,120,250);
76 warn "ReadLen:$ReadLen, PE_ins:@PEins, maxPEins:$maxPEins.\n";
79 PEinsertLen
=> $PEins[2],
80 SeqReadLen
=> $ReadLen,
81 RefNratioMax
=> $RefNratioMax,
84 VirFrag
=> 2 * $ReadLen,
85 OutPrefix
=> $outp . '_m13FG',
88 my ($pRefticks,$pVirticks);
89 if (defined $TicksINI and -f
$TicksINI) {
90 print STDERR
"Loading Ticks from [$TicksINI]: ";
91 my $inih = openfile
($TicksINI);
95 my @dat = split /=/,$_;
96 @dat = split /,/,$dat[1];
98 print STDERR
"Ref:",scalar @dat,', ';
99 } elsif (/^Virticks=\d/) {
100 my @dat = split /=/,$_;
101 @dat = split /,/,$dat[1];
103 print STDERR
"Vir:",scalar @dat,', ';
106 my $virtickmax = $#$pVirticks;
107 if ($virtickmax < $#$pRefticks) {
108 for my $i ( (1+$virtickmax) .. $#$pRefticks ) {
109 my $j = int($i - 0.8*$virtickmax); # must use `$virtickmax` instead of `$#$pVirticks` !
110 while ( abs($j)>$virtickmax ) {
111 $j = abs($j) - $virtickmax;
113 $pVirticks->[$i] = $pVirticks->[$j];
115 print STDERR
"Vir padded. ";
117 print STDERR
"\b\b.\n";
119 my $SeqReadLen = $Para{SeqReadLen
};
120 my $RefBorder = $maxPEins + 1000;
121 $pRefticks = getticks
($RefBorder,$Refstr,$RefLen,$maxPEins,$RefNratioMax);
122 $pVirticks = getticks
($Para{VirFrag
},$Virstr,$VirLen,int(0.9+ 0.5*$Para{VirFrag
}),$RefNratioMax);
124 $Para{pRefticks
} = $pRefticks;
125 $Para{pVirticks
} = $pVirticks;
127 dosim
($pRefHash,$pVirHash,\
%Para);
128 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
130 $Para{PEinsertLen
} = $PEins[2];
131 $Para{VirFrag
} = $ShortLen;
132 $Para{OutPrefix
} = $outp . '_m2D';
133 dosim
($pRefHash,$pVirHash,\
%Para);
134 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
136 $Para{PEinsertLen
} = $PEins[2];
137 $Para{VirFrag
} = $TinyLen1;
138 $Para{OutPrefix
} = $outp . '_m2Dty1';
139 dosim
($pRefHash,$pVirHash,\
%Para);
140 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
142 $Para{PEinsertLen
} = $PEins[2];
143 $Para{VirFrag
} = $TinyLen2;
144 $Para{OutPrefix
} = $outp . '_m2Dty2';
145 dosim
($pRefHash,$pVirHash,\
%Para);
146 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
148 $Para{PEinsertLen
} = $PEins[2];
149 $Para{VirFrag
} = $TinyLen3;
150 $Para{OutPrefix
} = $outp . '_m2Dty3';
151 dosim
($pRefHash,$pVirHash,\
%Para);
152 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
154 $Para{PEinsertLen
} = $PEins[3];
155 $Para{VirFrag
} = $LongLen;
156 $Para{OutPrefix
} = $outp . '_m458AE';
157 dosim
($pRefHash,$pVirHash,\
%Para);
158 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
160 $Para{PEinsertLen
} = $PEins[2];
161 $Para{VirFrag
} = $LongLen;
162 $Para{OutPrefix
} = $outp . '_m9';
163 dosim
($pRefHash,$pVirHash,\
%Para);
164 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
166 $Para{PEinsertLen
} = $PEins[1];
167 $Para{VirFrag
} = $LongLen;
168 $Para{OutPrefix
} = $outp . '_m6';
169 dosim
($pRefHash,$pVirHash,\
%Para);
170 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
172 $Para{PEinsertLen
} = $PEins[1];
173 $Para{VirFrag
} = $ShortLen;
174 $Para{OutPrefix
} = $outp . '_m7C';
175 dosim
($pRefHash,$pVirHash,\
%Para);
176 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
178 $Para{PEinsertLen
} = $PEins[0];
179 $Para{VirFrag
} = $ShortLen;
180 $Para{OutPrefix
} = $outp . '_mB';
181 dosim
($pRefHash,$pVirHash,\
%Para);
182 $fqFiles{$Para{OutPrefix
}} = [$Para{PEinsertLen
},$Para{VirFrag
}];
184 my @fps = sort keys %fqFiles;
185 print INI
"[DataFiles]\n";
186 for my $i (0 .. $#fps) {
187 print INI
"F$fps[$i].1=",abs_path
($fps[$i].'.b1.fq.gz'),"\n";
188 print INI
"F$fps[$i].2=",abs_path
($fps[$i].'.b2.fq.gz'),"\n";
190 print INI
"[Base4Files]\n";
191 for my $i (0 .. $#fps) {
192 print INI
"F$fps[$i].1=",abs_path
($fps[$i].'.r1.fq.gz'),"\n";
193 print INI
"F$fps[$i].2=",abs_path
($fps[$i].'.r2.fq.gz'),"\n";
195 print INI
"\n[InsertSizes]\n";
196 for my $i (0 .. $#fps) {
197 print INI
"F$fps[$i]=",$fqFiles{$fps[$i]}->[0],"\nF$fps[$i].SD=",$i+1,"\n";
199 print INI
"\n[Simed]\n";
200 for my $i (0 .. $#fps) {
201 print INI
"F$fps[$i].VirFrag=",$fqFiles{$fps[$i]}->[1],"\n";
203 print INI
'Refticks=',join(',',@
$pRefticks),"\n";
204 print INI
'Virticks=',join(',',@
$pVirticks),"\n";
208 ./simVirusInserts
.pl hs_ref_GRCh38
.p2_chr18
.mfa
.gz X04615
.fa
.gz simout
210 grep -h \
> *_m
*.Ref
.fa
| sed
's/^simout_m//'|sed
's/.Ref.fa:>/\t/'|sed
's/Ref_/Ref:/g'|sed
's/Vir_/Vir:/'|sed
's/R_/R:/'|sed
's/_/ /g'|cat
-n
>simed
.lst
212 ~/git/toGit
/perl/bsim
/simVirusInserts
.pl
~/nas
/Ref
/GRCh38_no_alt_analysis_set
.fna
.gz
~/nas
/Ref/HBV
.X04615
.fa
.gz s150
150
214 grep -h \
> ../../sim90/*.Ref.fa |sed 's/>Ref_
/Ref:/g'|sed 's/Vir_/Vir:/'|sed 's/R_/R:/'|sed 's/_/ /g'|cat -n