modified: Makefile
[GalaxyCodeBases.git] / perl / etc / bs / filter.pl
blobeb7257d776a53a728a7fe21c6953d7450f44ffa3
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Data::Dump qw(ddx);
6 die "Usage: $0 <snp file> <Cancer_Watson,Cancer_Crick,Normal_Watson,Normal_Crick> >out.txt\n" if @ARGV < 2;
7 my @maxDepth = split /,/,$ARGV[1];
8 warn "[!]MaxDepth:[",join(',',@maxDepth),"]\n";
10 no warnings 'qw';
11 my @thePOS = qw(chrom position);
12 my @LastEight = qw(Number_of_watson[A,T,C,G]_Normal Number_of_crick[A,T,C,G]_Normal Mean_Quality_of_Watson[A,T,C,G]_Normal Mean_Quality_of_Crick[A,T,C,G]_Normall Number_of_watson[A,T,C,G]_Cancer Number_of_crick[A,T,C,G]_Cancer Mean_Quality_of_Watson[A,T,C,G]_Cancer Mean_Quality_of_Crick[A,T,C,G]_Cancer);
13 my @SELECTED = (qw(ref var),@LastEight[4,5,0,1],'somatic_status',@LastEight[6,7,2,3],'tumor_gt','normal_gt');
14 my @Bases = qw(A T C G);
15 my $i = 0;
16 my %L = map { $_ => $i++ } @Bases;
17 my (%gOrder,%bsBase,%cmpBase,%bscmpBase);
18 @gOrder{@Bases} = qw(0 1 2 3); # A T C G
19 @bsBase{@Bases} = qw(3 2 1 0); # G C T A # bsPaired
20 @cmpBase{@Bases} = qw(1 0 3 2); # T A G C
21 @bscmpBase{@Bases} = qw(2 3 0 1); # C G A T
23 sub readnext($) {
24 my $in = $_[0];
25 if ( ! eof($in->[0]) ) {
26 my $record = readline($in->[0]);
27 die "readline failed: $!" unless defined $record;
28 chomp($record);
29 my %hash;
30 @hash{@{$in->[1]}} = split /\t/, $record;
31 @{$in}[2,3] = @hash{@thePOS};
32 $in->[4] = [@hash{@SELECTED}];
33 $in->[5] = $record;
34 #ddx \%hash;
35 return 1;
36 } else {
37 @{$in}[2,3] = qw(_EOF_ 0);
38 $in->[4] = [qw(0 0 NA NA 0 0 NA NA NA NN NN)];
39 $in->[5] = '';
40 return 0;
44 sub initfile($) {
45 my ($filename)=@_;
46 my $infile;
47 if ($filename=~/.zst$/) {
48 open( $infile,"-|","zstd -qdc $filename") or die "Error opening $filename: $!\n";
49 } elsif ($filename=~/.gz$/) {
50 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
51 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
52 chomp(my $t = <$infile>);
53 #print "$t\n";
54 my @tt = split("\t",$t);
55 map { s/_read(\d)$/_reads$1/ } @tt;
56 push @tt,qw(eW eC eA eB);
57 #print join("|",@tt),"\n";
58 @tt[-8 .. -1] = @LastEight;
59 print join("\t",@tt),"\n";
60 my $ret = [$infile,\@tt,undef,-1,[],[]];
61 #readnext($ret) or die "[x]File [$filename] is empty. $!\n";
62 return $ret;
65 my $FH = initfile($ARGV[0]);
66 #ddx $FH;
68 while($FH->[3]) {
69 readnext($FH) or last;
70 next unless defined $FH->[3]; # skip empty lines
71 if (length($FH->[4][1])>1) {
72 print $FH->[5],"\n";
73 next;
75 my $tumor_gt = $FH->[4][11];
76 my $normal_gt = $FH->[4][12];
77 my @tumorGT = sort {$L{$a} <=> $L{$b} || $a cmp $b} split('',$tumor_gt);
78 my @normalGT = sort {$L{$a} <=> $L{$b} || $a cmp $b} split('',$normal_gt);
79 my $tumGT = join('',@tumorGT);
80 my $norGT = join('',@normalGT);
81 my $t1 = $gOrder{$FH->[4][0]}; # ref
82 my $t2 = $gOrder{$FH->[4][1]}; # var
83 my @pGTs = sort {$L{$a} <=> $L{$b} || $a cmp $b} ($FH->[4][0],$FH->[4][1]);
84 my $pGT = join('',@pGTs);
85 next if $pGT eq 'AG' or $pGT eq 'TC';
86 my @m1 = ($gOrder{$tumorGT[0]},$bsBase{$tumorGT[0]},$cmpBase{$tumorGT[0]},$bscmpBase{$tumorGT[0]});
87 my @m2 = ($gOrder{$tumorGT[1]},$bsBase{$tumorGT[1]},$cmpBase{$tumorGT[1]},$bscmpBase{$tumorGT[1]});
88 my @m3 = ($gOrder{$normalGT[0]},$bsBase{$normalGT[0]},$cmpBase{$normalGT[0]},$bscmpBase{$normalGT[0]});
89 my @m4 = ($gOrder{$normalGT[1]},$bsBase{$normalGT[1]},$cmpBase{$normalGT[1]},$bscmpBase{$normalGT[1]});
90 my @d1 = (split(',',$FH->[4][2]))[0..3,$t1,$t2]; # Watson_Cancer, $d1[0]是A,$d1[1]是T,$d1[2]是C,$d1[3]是G, $d1[4]是Ref,$d1[5]是Var
91 my @d2 = (split(',',$FH->[4][3]))[0..3,$t1,$t2]; # Crick_Cancer
92 my @d3 = (split(',',$FH->[4][4]))[0..3,$t1,$t2]; # Watson_Normal
93 my @d4 = (split(',',$FH->[4][5]))[0..3,$t1,$t2]; # Crick_Normal
94 my @ds = ([@d1[@m1,@m2]],[@d2[@m1,@m2]],[@d3[@m3,@m4]],[@d4[@m3,@m4]]);
95 my ($g3) = (split(',',$FH->[4][4]))[$t1];
96 my ($g4) = (split(',',$FH->[4][5]))[$t1];
97 my $sa1 = $d1[4] + $d1[5];
98 my $sa2 = $d2[4] + $d2[5];
99 my $sa = $sa1+$sa2;
100 my $sb1 = $d3[4]+$d3[5];
101 my $sb2 = $d4[4]+$d4[5];
102 my $sb = $sb1+$sb2;
103 my $sb1a = $d3[0]+$d3[1]+$d3[2]+$d3[3];
104 my $sb2a = $d4[0]+$d4[1]+$d4[2]+$d4[3];
105 my $sba = $sb1a+$sb2a;
106 my @q1 = (split(',',$FH->[4][7]))[0..3,$t1,$t2]; # Watson_Cancer, $q1[0]是A,$q1[1]是T,$q1[2]是C,$q1[3]是G, $q1[4]是Ref,$q1[5]是Var
107 my @q2 = (split(',',$FH->[4][8]))[0..3,$t1,$t2]; # Crick_Cancer
108 my @q3 = (split(',',$FH->[4][9]))[0..3,$t1,$t2]; # Watson_Normal
109 my @q4 = (split(',',$FH->[4][10]))[0..3,$t1,$t2]; # Crick_Normal
110 my @qs = ([@q1[@m1,@m2]],[@q2[@m1,@m2]],[@q3[@m3,@m4]],[@q4[@m3,@m4]]);
111 my ($x,$y)=(); # GenoType bs dualize strand
112 if ($tumGT =~ /C/) {
113 ($x,$y)=(0,1);
114 } elsif ($tumGT =~ /G/) {
115 ($x,$y)=(1,0);
117 if ($FH->[4][6] eq 'Germline') {
118 next;
119 } elsif ($FH->[4][6] eq 'Somatic') {
120 next if $sa < 15;
121 if ($sa1>$maxDepth[0] or $sa2>$maxDepth[1] or $sb1>$maxDepth[2] or $sb2>$maxDepth[3]) {
122 next;
124 if ($tumGT eq 'AA' or $tumGT eq 'TT') { # AGTCx2,TCAGx2
125 next if $ds[0][0] < 5;
126 next if $ds[1][0] < 5;
127 next if $ds[0][1]+$ds[0][2]+$ds[0][3] + $ds[1][1]+$ds[1][2]+$ds[1][3] > 0;
128 #next if $qs[0][0] < 20;
129 #next if $qs[1][0] < 20;
130 } elsif ($tumGT eq 'AT') { # AGTCTCAG
131 next if $d1[5]<2 or $d2[5]<2;
132 next if (($d1[5]+$d2[5])/$sa)<0.2;
133 #next if ($q1[5]+$q2[5])<40;
134 next if $d1[2]+$d1[3] + $d2[2]+$d2[3] > 0;
135 } elsif ($tumGT eq 'CC' or $tumGT eq 'GG') { # CTGAx2,GACTx2
136 next if $ds[$x][0] + $ds[$x][1] < 5; # 正链(C+T)>5,G=0,A=0
137 next if $ds[$x][2] + $ds[$x][3] > 0;
138 next if $ds[$y][0] < 5; # 负链C>5,A=0,T=0,G=0
139 next if $ds[$y][1] + $ds[$y][2] + $ds[$y][3] > 0;
140 } elsif ($tumGT eq 'TC' or $tumGT eq 'AG') { # TCAGCTGA,AGTCGACT
141 next if $ds[$x][4] + $ds[$x][5] < 5; # 正链(C+T)>5,G=0,A=0
142 next if $ds[$x][6] + $ds[$x][7] > 0;
143 next if $ds[$y][4] < 2 or $ds[$y][5] < 2; # C>2,T>2,其他都是0
144 next if $ds[$y][6] + $ds[$y][7] > 0;
145 next if $ds[$y][0]/$ds[$y][1] <0.25;
146 next if $ds[$y][1]/$ds[$y][0] <0.25;
147 } elsif ($tumGT eq 'AC' or $tumGT eq 'TG') { # AGTCCTGA,TCAGGACT
148 next if $ds[$x][0] < 2;
149 next if $ds[$x][4] + $ds[$x][5] < 2;
150 next if $ds[$x][1] > 0;
151 next if $ds[$x][0]/($ds[$x][4] + $ds[$x][5]) <0.25;
152 next if ($ds[$x][4] + $ds[$x][5])/$ds[$x][0] <0.25;
153 next if $ds[$y][0] < 2;
154 next if $ds[$y][4] < 2;
155 next if $ds[$y][1] + $ds[$y][5] > 0;
156 next if $ds[$y][0]/$ds[$y][4] <0.25;
157 next if $ds[$y][4]/$ds[$y][0] <0.25;
158 } elsif ($tumGT eq 'CG') { # CTGAGACT | ATCG
159 next if $d1[2] + $d1[1] < 2;
160 next if $d1[3]<2;
161 next if $d1[0]>0;
162 next if $d1[3]/($d1[1]+$d1[2]) <0.25;
163 next if ($d1[1]+$d1[2])/$d1[3] <0.25;
164 next if $d2[0] + $d2[3] < 2;
165 next if $d2[2]<2;
166 next if $d2[1]>0;
167 next if $d2[2]/($d2[0]+$d2[3]) <0.25;
168 next if ($d2[0]+$d2[3])/$d2[2] <0.25;
169 } else {die;}
170 if ($t1 == 0 or $t1 == 1) { # A,T; ATCG
171 next if $sba > $d3[4]+$d4[4]; # 除了ref碱基,其他都是0.
172 next if $d3[4]+$d4[4] < 15; # ref总深度大于15
173 next if $d3[4] < 5;
174 next if $d4[4] < 5; # ref碱基正负链都大于5
175 } elsif ($t1 == 2) { # C; ATCG
176 next if $sba > ($d3[2]+$d4[2]+$d3[1]); # 除了C碱基和正链的T碱基,其他碱基都是0
177 next if $d4[2] < 5;
178 next if ($d3[2]+$d3[1]) < 5;
179 } elsif ($t1 == 3) { # G; ATCG
180 next if $sba > ($d3[3]+$d4[3]+$d4[0]); # 除了G碱基和负链A碱基,其他碱基个数都是0
181 next if $d3[3] < 5;
182 next if ($d4[0]+$d4[3]) < 5;
184 } elsif ($FH->[4][6] eq 'LOH') {
185 next;
186 } else {
187 next;
189 print $FH->[5],"\n";
190 #ddx $FH;
193 __END__
194 ./filter.pl obsmutect.snp.zst 100,100,100,100
195 ./filter.pl SNP.bsmap.final.h.zst 100,100,100,100 >bs.tsv
197 Germline 的都不要。
199 Ref+Alt >15
201 Genotype: # GT是纯合基因型,bsGT是亚硫酸盐处理后与GT结果一样的另一碱基。C链在GT含C时是正链,在GT含G时是负链,G链与C链相反。
202 AA, TT
203 Cancer:支持GT正负链都大于5,其他都是0
205 Cancer:突变碱基正负链大等于2
206 Cancer:突变频率大于0.2,正负链合计。
207 Cancer:除了AT,其他都是0,正负链合计。
208 CC, GG
209 Cancer:C链(GT+bsGT)>5,其他都是0
210 Cancer:G链GT>5,其他都是0
211 TC, AG
212 Cancer:C链(GT+bsGT)>5,其他都是0
213 Cancer:G链GT>2,bsGT>2,其他都是0
214 Cancer:G链GT/bsGT>1/4,bsGT/GT>1/4
215 AC, TG
216 Cancer:C链左碱基>2
217 Cancer:C链(bs左碱基+右碱基)>2
218 Cancer:C链【G,C】=0
219 Cancer:G链左碱基>2,右碱基>2,其他都是0
220 Cancer:G链左碱基/右碱基>1/4,右碱基/左碱基>1/4
221 Cancer:C链左碱基/(右碱基+互补左碱基)>1/4,(右碱基+互补左碱基)/左碱基>1/4
223 Cancer:正链:(C+T)>2; G>2, A=0
224 Cancer:正链:G/(T+C)>1/4, (T+C)/G>1/4
225 Cancer:负链:C>2,(G+A)>2, T=0,
226 Cancer:负链:C/(A+G)>1/4, (A+G)/C>1/4
228 A-,T-
229 Normal:除了ref碱基,其他碱基都是0.
230 Normal:ref碱基正负链都大于5
231 Normal:ref总深度大于15
233 Normal:除了C碱基和正链的T碱基,其他碱基都是0
234 Normal:负链C碱基个数大于5,正链(C+T大于5
236 Normal:除了G碱基和负链A碱基,其他碱基个数都是0
237 Normal:正链G碱基个数大于5,负链(A+G大于5个)
238 =============
239 先分类型:
240 第一, germline过滤条件:(原则:原则尽量宽松)
241 只考虑两个文件都是正负链都有支持突变类型。就通过。
243 第二,somatic过滤条件(原则:非常严格)
244 (一)癌症组(EA,EB,EC,ED),
245 1. 如果genotype是AT,AA,TT
246 1.1 突变碱基正负链大等于2
247 1.2 突变频率大于0.2,深度大于15
248 1.3 突变碱基平均质量值大于20
250 1. 如果genotype是AT,
251 1.1 突变碱基正负链大等于2
252 1.2 突变频率大于0.2,深度大于15
253 1.3 突变碱基平均质量值大于20
254 1.4 除了AT,其他大都是0
256 2. 如果是AA,
257 2.1 支持A正负链都大于5
258 2.2 总深度大于15
259 2.3 碱基质量值大于20
260 2.4 除了A,其他都是0
262 3. 如果是TT
263 3.1 支持T正负链都大于5
264 3.2 总深度大于15
265 3.3 碱基质量值大于20
266 3.4 除了T,其他都是0
270 3. genotype是CC
271 正链(C+T)>5,G=0,A=0
272 负链C>5,A=0,T=0,G=0
273 depth>15
275 6. GG
276 正链:G>5, 其他都是0
277 负链:G+A>5,其他都是0
278 depth >15
280 4. genotype是CT
281 正链:C+T>5,其他都是0
282 负链:C>2,T>2,其他都是0,C/(C+T)>0.2, T/(C+T)>0.2
283 detph>15
285 7. AG
286 正链:A>2,G>2,其他都是0, G/(G+A)>0.2, A/(G+A)>0.2
287 负链:A+G>5,其他都是0.
288 detph>15
290 2. 如果genotype是AC,
291 总depth>15
292 正链A>2,(C+T)>2,G=0
293 负链A>2,C>2.T=0,G=0
294 全部A 质量值>30
295 负链C 质量值>30
296 # MAF>0.2 =>
297 (C+T)/total >0.2 and A/total >0.2
299 5,genotype是TG,
300 正链:T>2, G>2, 其他都是0, T/(T+G)> 0.2, G/(T+G)>0.2
301 负链:T>2,(G+A)>2,C=0, T/(T+G+A)>0.2, (G+A)/(T+G+A) > 0.2
302 depth >15
305 8. CG
306 正链:(C+T)>2; G>2, A=0, G/(T+G+C)> 0.2, (C+T)/(C+T+G)>0.2
307 负链:C>2,(G+A)>2, T=0, C/(C+A+G)>0.2; (G+A)/(C+A+G)>0.2
308 depth >15
311 (二)正常组( Number_of_watson[A,T,C,G] Number_of_crick[A,T,C,G] Mean_Quality_of_Watson[A,T,C,G] Mean_Quality_of_Crick[A,T,C,G]
312 1. 如果ref是A或者T,
313 1.1 除了ref碱基,其他碱基都是0.
314 1.2 ref碱基正负链都大于5
315 1.3 ref总深度大于15
317 2.如果ref是C
318 2.1 除了C碱基和正链的T碱基,其他碱基都是0
319 2.2 负链C碱基质量值大于30,并且个数大于5,正链(C+T大于5
321 3. 如果ref是G
322 3.1 除了G碱基和负链A碱基,其他碱基个数都是0
323 3.2 正链G碱基质量值大于30,并且个数大于5,负链(A+G大于5个)
325 第三,LOH的过滤条件(这个条件更麻烦,后面单独讨论)