4 use Data
::Dump
qw(ddx);
9 my ($minRefLen,$maxRefLen) = (40,220);
13 open I
,'<','hg19chr17.bed.frag' or die;
14 open Z
,'<','zone.lst' or die;
15 open S
,'<','snp.lst' or die;
19 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
20 my $rev = reverse $str;
21 $rev =~ tr/[](){}<>/][)(}{></;
29 my (undef,$s,$e,$ra,$rb,$rHa,$rHb) = split /\t/;
30 $MetUnchgRate{$s}=[$e,$ra,$rb,$rHa,$rHb];
39 my ($HomHet,$chr,$s,undef,undef,undef,undef,$PosP1,undef,undef,$newseq) = split /\t/;
40 $SNPList{"$HomHet\t$s"} = $newseq;
41 $usedPos{$chr}{$s}{$PosP1-1} = $HomHet;
48 my ($chr,$s,$seq,$type) = @_;
49 my $key = "$type\t$s";
51 $newseq = $SNPList{$key} if exists $SNPList{$key};
58 while ($seq =~ /CG/gi) {
59 #print join("\t",'CG',$-[0], $+[0],"$` ($&) $'"),"\n";
64 sub getC
($) { # C but not CpG
67 while ($seq =~ /C(?!G)/gi) {
68 #print join("\t",'C',$-[0], $+[0],"$` ($&) $'"),"\n";
84 sub simPlusMinus
($$) { #甲基化只处理fq1所在的单链
85 my ($seq,$unchgRate) = @_;
86 my @CGpos = getCpG
($seq);
87 my @Cpos = getC
($seq);
90 substr $newseq,$_,1,'T';
93 $flag = 'CpGisC' if @CGpos;
94 unless (doCoin
($unchgRate)) {
96 substr $newseq,$_,1,'T';
100 return ($flag,$newseq);
103 my ($seq,$readlen) = @_;
104 my $seqlen = length $seq;
105 #die if $seqlen < $readlen;
106 if ($seqlen < $readlen) {
109 my $r1 = substr $seq,0,$readlen;
110 my $r2s = substr $seq,-$readlen,$readlen;
111 my $r2 = revcom
($r2s);
112 #print "$seqlen $r1,$r2s\t$seq\t$r2\n";
113 return ($r1,$r2,$readlen);
116 sub realdosim
($$$$$$$$$$) {
117 my ($homhet,$Het2MetR,$theseq,$thedepth,$fhref,$unchgRate,$chr,$s,$e,$MetStatRef) = @_;
119 my ($flag,$newseq,$str,$seqname,$r1,$r2,$realReadlen);
120 for ( 1 .. $thedepth ) {
122 my $realMetRate = $unchgRate;
123 if (defined $usedPos{$chr}{$s}) {
124 my @t = values %{$usedPos{$chr}{$s}};
125 my %t; @t{@t} = 0 .. $#t;
126 if (exists $t{'Het'}) {
127 $outputhomhet = 'Het';
129 $outputhomhet = 'Hom';
132 if ($$Het2MetR[1] > 0) {
133 $outputhomhet .= " i$homhet:HetMut=>Met";
134 if ($homhet eq 'Hom') {
135 $realMetRate = $$Het2MetR[0];
137 $realMetRate = $$Het2MetR[1];
139 } elsif ($$Het2MetR[1] < 0) {
140 $outputhomhet .= " i$homhet:Ref=>Met";
141 if ($homhet eq 'Het') {
142 $realMetRate = -1 * $$Het2MetR[0];
144 $realMetRate = -1 * $$Het2MetR[1];
147 $outputhomhet .= " i$homhet:RandomMet";
150 $outputhomhet = 'NoSNP';
152 #print STDERR "$outputhomhet $unchgRate\t$chr,$s,$e\n";
156 $str = "->$realMetRate $chr:$s $e Waston o$outputhomhet";
157 ($flag,$newseq) = simPlusMinus
($theseq,$realMetRate);
158 ++$$MetStatRef{$flag};
159 #print "$flag $unchgRate $str, $seq1, $newseq\n";
160 ($r1,$r2,$realReadlen) = getPE
($newseq,$ReadLen);
161 $seqname = "\@$outCnt $flag $unchgRate$str";
163 #print "$_ $seqname\n";
164 print {$FH[0]} join("\n",$seqname.'/1',$r1,'+',$QUAL x
$realReadlen),"\n";
165 print {$FH[1]} join("\n",$seqname.'/2',$r2,'+',$QUAL x
$realReadlen),"\n";
169 $str = "->$realMetRate $chr:$s $e Crick o$outputhomhet";
170 my $revtheseq = revcom
($theseq);
171 ($flag,$newseq) = simPlusMinus
($revtheseq,$realMetRate);
172 ++$$MetStatRef{$flag};
173 #print "$flag $unchgRate $str, $seq1, $newseq\n";
174 ($r1,$r2,$realReadlen) = getPE
($newseq,$ReadLen);
175 $seqname = "\@$outCnt $flag $unchgRate$str";
177 print {$FH[2]} join("\n",$seqname.'/1',$r1,'+',$QUAL x
$realReadlen),"\n";
178 print {$FH[3]} join("\n",$seqname.'/2',$r2,'+',$QUAL x
$realReadlen),"\n";
182 sub dosim
($$$$$$$$) {
183 my ($fhref,$unchgRate,$Het2MetR,$chr,$s,$e,$seq1,$seq2) = @_;
184 my $depth1 = int(0.9 + $eachDepth/2);
185 my $depth2 = $eachDepth - $depth1;
187 'NoCpG' => 0, 'CpGisC' => 0, 'CpGtoT' => 0
189 unless (defined $unchgRate) {
194 if ($unchgRate >= 0.5) {
195 $Het2MetR = [$Het2MetR*(2*$unchgRate-1),$Het2MetR];
197 $Het2MetR = [0,$Het2MetR*2*$unchgRate]; # 绝对值小的排前面。
199 realdosim
('Hom',$Het2MetR,$seq1,$depth1,$fhref,$unchgRate,$chr,$s,$e,\
%MetStat);
200 realdosim
('Het',$Het2MetR,$seq2,$depth2,$fhref,$unchgRate,$chr,$s,$e,\
%MetStat) if $depth2>0;
201 #print M join("\t",$chr,$s,$e,$e-$s+1,$MetStat{'CpGtoT'}),"\n";
205 open $fhC[0],'|-',"gzip -9c > Cwaston.1.fq.gz" or die;
206 open $fhC[1],'|-',"gzip -9c > Cwaston.2.fq.gz" or die;
207 open $fhC[2],'|-',"gzip -9c > Ccrick.1.fq.gz" or die;
208 open $fhC[3],'|-',"gzip -9c > Ccrick.2.fq.gz" or die;
209 open $fhN[0],'|-',"gzip -9c > Nwaston.1.fq.gz" or die;
210 open $fhN[1],'|-',"gzip -9c > Nwaston.2.fq.gz" or die;
211 open $fhN[2],'|-',"gzip -9c > Ncrick.1.fq.gz" or die;
212 open $fhN[3],'|-',"gzip -9c > Ncrick.2.fq.gz" or die;
214 print STDERR
"[!]Run `metsim0.pl` to refresh zone.lst if see any cross below:\n";
217 my ($chr,$s,$e,$len,$seq) = split /\t/;
218 next if $len < $minRefLen or $len > $maxRefLen;
220 if (exists $MetUnchgRate{$s}) {
221 @Paras = @
{ $MetUnchgRate{$s} };
224 my $seq1 = addSNP
($chr,$s,$seq,'Hom');
225 my $seq2 = addSNP
($chr,$s,$seq1,'Het');
227 #my ($a,$b)=($seq1,$seq2);
228 dosim
(\
@fhC,$Paras[1],$Paras[3],$chr,$s,$e,$seq1,$seq2); # now 'C' comes 1st.
229 #die if $a ne $seq1; die if $b ne $seq2;
230 dosim
(\
@fhN,$Paras[2],$Paras[4],$chr,$s,$e,$seq1,$seq2);
231 #die if $a ne $seq1; die if $b ne $seq2;
235 close $_ for (@fhC,@fhN);
236 warn "\n[!]Done !\n";
237 system("gzip -fd *.fq.gz");