new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / tmp / metsim1.pl
blob45dc12ee168706910b7f0b227a5caca5fcaa8cad
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Data::Dump qw(ddx);
6 my $eachDepth = 10;
7 my $ReadLen = 50;
8 my $QUAL = 'e';
9 my ($minRefLen,$maxRefLen) = (40,220);
11 my $outCnt = 1;
13 open I,'<','hg19chr17.bed.frag' or die;
14 open Z,'<','zone.lst' or die;
15 open S,'<','snp.lst' or die;
17 sub revcom($) {
18 my $str = $_[0];
19 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
20 my $rev = reverse $str;
21 $rev =~ tr/[](){}<>/][)(}{></;
22 return $rev;
25 my %MetUnchgRate;
26 while (<Z>) {
27 chomp;
28 next if /^#/;
29 my (undef,$s,$e,$ra,$rb,$rHa,$rHb) = split /\t/;
30 $MetUnchgRate{$s}=[$e,$ra,$rb,$rHa,$rHb];
32 close Z;
33 ddx \%MetUnchgRate;
35 my %SNPList;
36 my %usedPos;
37 while (<S>) {
38 chomp;
39 my ($HomHet,$chr,$s,undef,undef,undef,undef,$PosP1,undef,undef,$newseq) = split /\t/;
40 $SNPList{"$HomHet\t$s"} = $newseq;
41 $usedPos{$chr}{$s}{$PosP1-1} = $HomHet;
43 close S;
44 #ddx \%usedPos;
46 my $SNPremained = 0;
47 sub addSNP($$$$) {
48 my ($chr,$s,$seq,$type) = @_;
49 my $key = "$type\t$s";
50 my $newseq = $seq;
51 $newseq = $SNPList{$key} if exists $SNPList{$key};
52 return $newseq;
55 sub getCpG($) {
56 my ($seq) = @_;
57 my @ret;
58 while ($seq =~ /CG/gi) {
59 #print join("\t",'CG',$-[0], $+[0],"$` ($&) $'"),"\n";
60 push @ret, $-[0];
62 return @ret;
64 sub getC($) { # C but not CpG
65 my ($seq) = @_;
66 my @ret;
67 while ($seq =~ /C(?!G)/gi) {
68 #print join("\t",'C',$-[0], $+[0],"$` ($&) $'"),"\n";
69 push @ret, $-[0];
71 return @ret;
74 sub doCoin($) {
75 my $rate = $_[0];
76 my $coin = rand(1);
77 if ($coin > $rate) {
78 return 0;
79 } else {
80 return 1;
84 sub simPlusMinus($$) { #甲基化只处理fq1所在的单链
85 my ($seq,$unchgRate) = @_;
86 my @CGpos = getCpG($seq);
87 my @Cpos = getC($seq);
88 my $newseq = $seq;
89 for (@Cpos) {
90 substr $newseq,$_,1,'T';
92 my $flag = 'NoCpG';
93 $flag = 'CpGisC' if @CGpos;
94 unless (doCoin($unchgRate)) {
95 for (@CGpos) {
96 substr $newseq,$_,1,'T';
98 $flag = 'CpGtoT';
100 return ($flag,$newseq);
102 sub getPE ($$) {
103 my ($seq,$readlen) = @_;
104 my $seqlen = length $seq;
105 #die if $seqlen < $readlen;
106 if ($seqlen < $readlen) {
107 $readlen = $seqlen;
109 my $r1 = substr $seq,0,$readlen;
110 my $r2s = substr $seq,-$readlen,$readlen;
111 my $r2 = revcom($r2s);
112 #print "$seqlen $r1,$r2s\t$seq\t$r2\n";
113 return ($r1,$r2,$readlen);
116 sub realdosim($$$$$$$$$$) {
117 my ($homhet,$Het2MetR,$theseq,$thedepth,$fhref,$unchgRate,$chr,$s,$e,$MetStatRef) = @_;
118 my @FH = @$fhref;
119 my ($flag,$newseq,$str,$seqname,$r1,$r2,$realReadlen);
120 for ( 1 .. $thedepth ) {
121 my $outputhomhet;
122 my $realMetRate = $unchgRate;
123 if (defined $usedPos{$chr}{$s}) {
124 my @t = values %{$usedPos{$chr}{$s}};
125 my %t; @t{@t} = 0 .. $#t;
126 if (exists $t{'Het'}) {
127 $outputhomhet = 'Het';
128 } else {
129 $outputhomhet = 'Hom';
132 if ($$Het2MetR[1] > 0) {
133 $outputhomhet .= " i$homhet:HetMut=>Met";
134 if ($homhet eq 'Hom') {
135 $realMetRate = $$Het2MetR[0];
136 } else {
137 $realMetRate = $$Het2MetR[1];
139 } elsif ($$Het2MetR[1] < 0) {
140 $outputhomhet .= " i$homhet:Ref=>Met";
141 if ($homhet eq 'Het') {
142 $realMetRate = -1 * $$Het2MetR[0];
143 } else {
144 $realMetRate = -1 * $$Het2MetR[1];
146 } else {
147 $outputhomhet .= " i$homhet:RandomMet";
149 } else {
150 $outputhomhet = 'NoSNP';
152 #print STDERR "$outputhomhet $unchgRate\t$chr,$s,$e\n";
154 # Hom -> seq1
155 # fq1 is Plus
156 $str = "->$realMetRate $chr:$s $e Waston o$outputhomhet";
157 ($flag,$newseq) = simPlusMinus($theseq,$realMetRate);
158 ++$$MetStatRef{$flag};
159 #print "$flag $unchgRate $str, $seq1, $newseq\n";
160 ($r1,$r2,$realReadlen) = getPE($newseq,$ReadLen);
161 $seqname = "\@$outCnt $flag $unchgRate$str";
162 $seqname =~ s/ /#/g;
163 #print "$_ $seqname\n";
164 print {$FH[0]} join("\n",$seqname.'/1',$r1,'+',$QUAL x $realReadlen),"\n";
165 print {$FH[1]} join("\n",$seqname.'/2',$r2,'+',$QUAL x $realReadlen),"\n";
166 ++$outCnt;
168 # fq1 in Minus
169 $str = "->$realMetRate $chr:$s $e Crick o$outputhomhet";
170 my $revtheseq = revcom($theseq);
171 ($flag,$newseq) = simPlusMinus($revtheseq,$realMetRate);
172 ++$$MetStatRef{$flag};
173 #print "$flag $unchgRate $str, $seq1, $newseq\n";
174 ($r1,$r2,$realReadlen) = getPE($newseq,$ReadLen);
175 $seqname = "\@$outCnt $flag $unchgRate$str";
176 $seqname =~ s/ /#/g;
177 print {$FH[2]} join("\n",$seqname.'/1',$r1,'+',$QUAL x $realReadlen),"\n";
178 print {$FH[3]} join("\n",$seqname.'/2',$r2,'+',$QUAL x $realReadlen),"\n";
179 ++$outCnt;
182 sub dosim($$$$$$$$) {
183 my ($fhref,$unchgRate,$Het2MetR,$chr,$s,$e,$seq1,$seq2) = @_;
184 my $depth1 = int(0.9 + $eachDepth/2);
185 my $depth2 = $eachDepth - $depth1;
186 my %MetStat = (
187 'NoCpG' => 0, 'CpGisC' => 0, 'CpGtoT' => 0
189 unless (defined $unchgRate) {
190 $unchgRate = 1;
191 $Het2MetR = 0;
192 print STDERR '+';
194 if ($unchgRate >= 0.5) {
195 $Het2MetR = [$Het2MetR*(2*$unchgRate-1),$Het2MetR];
196 } else {
197 $Het2MetR = [0,$Het2MetR*2*$unchgRate]; # 绝对值小的排前面。
199 realdosim('Hom',$Het2MetR,$seq1,$depth1,$fhref,$unchgRate,$chr,$s,$e,\%MetStat);
200 realdosim('Het',$Het2MetR,$seq2,$depth2,$fhref,$unchgRate,$chr,$s,$e,\%MetStat) if $depth2>0;
201 #print M join("\t",$chr,$s,$e,$e-$s+1,$MetStat{'CpGtoT'}),"\n";
204 my (@fhC,@fhN);
205 open $fhC[0],'|-',"gzip -9c > Cwaston.1.fq.gz" or die;
206 open $fhC[1],'|-',"gzip -9c > Cwaston.2.fq.gz" or die;
207 open $fhC[2],'|-',"gzip -9c > Ccrick.1.fq.gz" or die;
208 open $fhC[3],'|-',"gzip -9c > Ccrick.2.fq.gz" or die;
209 open $fhN[0],'|-',"gzip -9c > Nwaston.1.fq.gz" or die;
210 open $fhN[1],'|-',"gzip -9c > Nwaston.2.fq.gz" or die;
211 open $fhN[2],'|-',"gzip -9c > Ncrick.1.fq.gz" or die;
212 open $fhN[3],'|-',"gzip -9c > Ncrick.2.fq.gz" or die;
213 my @Paras;
214 print STDERR "[!]Run `metsim0.pl` to refresh zone.lst if see any cross below:\n";
215 while(<I>) {
216 chomp;
217 my ($chr,$s,$e,$len,$seq) = split /\t/;
218 next if $len < $minRefLen or $len > $maxRefLen;
219 #print "$s\n";
220 if (exists $MetUnchgRate{$s}) {
221 @Paras = @{ $MetUnchgRate{$s} };
222 ddx \@Paras;
224 my $seq1 = addSNP($chr,$s,$seq,'Hom');
225 my $seq2 = addSNP($chr,$s,$seq1,'Het');
226 #ddx \@Paras; die;
227 #my ($a,$b)=($seq1,$seq2);
228 dosim(\@fhC,$Paras[1],$Paras[3],$chr,$s,$e,$seq1,$seq2); # now 'C' comes 1st.
229 #die if $a ne $seq1; die if $b ne $seq2;
230 dosim(\@fhN,$Paras[2],$Paras[4],$chr,$s,$e,$seq1,$seq2);
231 #die if $a ne $seq1; die if $b ne $seq2;
233 close S;
234 #close M;
235 close $_ for (@fhC,@fhN);
236 warn "\n[!]Done !\n";
237 system("gzip -fd *.fq.gz");
239 __END__