4 use Data
::Dump
qw(ddx);
6 die "Usage: $0 <snp file> <Cancer_Watson,Cancer_Crick,Normal_Watson,Normal_Crick> >out.txt\n" if @ARGV < 2;
7 my @maxDepth = split /,/,$ARGV[1];
8 warn "[!]MaxDepth:[",join(',',@maxDepth),"]\n";
11 my @thePOS = qw(chrom position);
12 my @LastEight = qw(Number_of_watson[A,T,C,G]_Normal Number_of_crick[A,T,C,G]_Normal Mean_Quality_of_Watson[A,T,C,G]_Normal Mean_Quality_of_Crick[A,T,C,G]_Normall Number_of_watson[A,T,C,G]_Cancer Number_of_crick[A,T,C,G]_Cancer Mean_Quality_of_Watson[A,T,C,G]_Cancer Mean_Quality_of_Crick[A,T,C,G]_Cancer);
13 my @SELECTED = (qw(ref var),@LastEight[4,5,0,1],'somatic_status',@LastEight[6,7,2,3],'tumor_gt','normal_gt');
14 my @Bases = qw(A T C G);
16 my %L = map { $_ => $i++ } @Bases;
17 my (%gOrder,%bsBase,%cmpBase,%bscmpBase);
18 @gOrder{@Bases} = qw(0 1 2 3); # A T C G
19 @bsBase{@Bases} = qw(3 2 1 0); # G C T A # bsPaired
20 @cmpBase{@Bases} = qw(1 0 3 2); # T A G C
21 @bscmpBase{@Bases} = qw(2 3 0 1); # C G A T
25 if ( ! eof($in->[0]) ) {
26 my $record = readline($in->[0]);
27 die "readline failed: $!" unless defined $record;
30 @hash{@
{$in->[1]}} = split /\t/, $record;
31 @
{$in}[2,3] = @hash{@thePOS};
32 $in->[4] = [@hash{@SELECTED}];
37 @
{$in}[2,3] = qw(_EOF_ 0);
38 $in->[4] = [qw(0 0 NA NA 0 0 NA NA NA NN NN)];
47 if ($filename=~/.zst$/) {
48 open( $infile,"-|","zstd -qdc $filename") or die "Error opening $filename: $!\n";
49 } elsif ($filename=~/.gz$/) {
50 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
51 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
52 chomp(my $t = <$infile>);
54 my @tt = split("\t",$t);
55 map { s/_read(\d)$/_reads$1/ } @tt;
56 push @tt,qw(eW eC eA eB);
57 #print join("|",@tt),"\n";
58 @tt[-8 .. -1] = @LastEight;
59 print join("\t",@tt),"\n";
60 my $ret = [$infile,\
@tt,undef,-1,[],[]];
61 #readnext($ret) or die "[x]File [$filename] is empty. $!\n";
65 my $FH = initfile
($ARGV[0]);
69 readnext
($FH) or last;
70 next unless defined $FH->[3]; # skip empty lines
71 if (length($FH->[4][1])>1) {
75 my $tumor_gt = $FH->[4][11];
76 my $normal_gt = $FH->[4][12];
77 my @tumorGT = sort {$L{$a} <=> $L{$b} || $a cmp $b} split('',$tumor_gt);
78 my @normalGT = sort {$L{$a} <=> $L{$b} || $a cmp $b} split('',$normal_gt);
79 my $tumGT = join('',@tumorGT);
80 my $norGT = join('',@normalGT);
81 my $t1 = $gOrder{$FH->[4][0]}; # ref
82 my $t2 = $gOrder{$FH->[4][1]}; # var
83 my @pGTs = sort {$L{$a} <=> $L{$b} || $a cmp $b} ($FH->[4][0],$FH->[4][1]);
84 my $pGT = join('',@pGTs);
85 next if $pGT eq 'AG' or $pGT eq 'TC';
86 my @m1 = ($gOrder{$tumorGT[0]},$bsBase{$tumorGT[0]},$cmpBase{$tumorGT[0]},$bscmpBase{$tumorGT[0]});
87 my @m2 = ($gOrder{$tumorGT[1]},$bsBase{$tumorGT[1]},$cmpBase{$tumorGT[1]},$bscmpBase{$tumorGT[1]});
88 my @m3 = ($gOrder{$normalGT[0]},$bsBase{$normalGT[0]},$cmpBase{$normalGT[0]},$bscmpBase{$normalGT[0]});
89 my @m4 = ($gOrder{$normalGT[1]},$bsBase{$normalGT[1]},$cmpBase{$normalGT[1]},$bscmpBase{$normalGT[1]});
90 my @d1 = (split(',',$FH->[4][2]))[0..3,$t1,$t2]; # Watson_Cancer, $d1[0]是A,$d1[1]是T,$d1[2]是C,$d1[3]是G, $d1[4]是Ref,$d1[5]是Var
91 my @d2 = (split(',',$FH->[4][3]))[0..3,$t1,$t2]; # Crick_Cancer
92 my @d3 = (split(',',$FH->[4][4]))[0..3,$t1,$t2]; # Watson_Normal
93 my @d4 = (split(',',$FH->[4][5]))[0..3,$t1,$t2]; # Crick_Normal
94 my @ds = ([@d1[@m1,@m2]],[@d2[@m1,@m2]],[@d3[@m3,@m4]],[@d4[@m3,@m4]]);
95 my ($g3) = (split(',',$FH->[4][4]))[$t1];
96 my ($g4) = (split(',',$FH->[4][5]))[$t1];
97 my $sa1 = $d1[4] + $d1[5];
98 my $sa2 = $d2[4] + $d2[5];
100 my $sb1 = $d3[4]+$d3[5];
101 my $sb2 = $d4[4]+$d4[5];
103 my $sb1a = $d3[0]+$d3[1]+$d3[2]+$d3[3];
104 my $sb2a = $d4[0]+$d4[1]+$d4[2]+$d4[3];
105 my $sba = $sb1a+$sb2a;
106 my @q1 = (split(',',$FH->[4][7]))[0..3,$t1,$t2]; # Watson_Cancer, $q1[0]是A,$q1[1]是T,$q1[2]是C,$q1[3]是G, $q1[4]是Ref,$q1[5]是Var
107 my @q2 = (split(',',$FH->[4][8]))[0..3,$t1,$t2]; # Crick_Cancer
108 my @q3 = (split(',',$FH->[4][9]))[0..3,$t1,$t2]; # Watson_Normal
109 my @q4 = (split(',',$FH->[4][10]))[0..3,$t1,$t2]; # Crick_Normal
110 my @qs = ([@q1[@m1,@m2]],[@q2[@m1,@m2]],[@q3[@m3,@m4]],[@q4[@m3,@m4]]);
111 my ($x,$y)=(); # GenoType bs dualize strand
114 } elsif ($tumGT =~ /G/) {
117 if ($FH->[4][6] eq 'Germline') {
119 } elsif ($FH->[4][6] eq 'Somatic') {
121 if ($sa1>$maxDepth[0] or $sa2>$maxDepth[1] or $sb1>$maxDepth[2] or $sb2>$maxDepth[3]) {
124 if ($tumGT eq 'AA' or $tumGT eq 'TT') { # AGTCx2,TCAGx2
125 next if $ds[0][0] < 5;
126 next if $ds[1][0] < 5;
127 next if $ds[0][1]+$ds[0][2]+$ds[0][3] + $ds[1][1]+$ds[1][2]+$ds[1][3] > 0;
128 #next if $qs[0][0] < 20;
129 #next if $qs[1][0] < 20;
130 } elsif ($tumGT eq 'AT') { # AGTCTCAG
131 next if $d1[5]<2 or $d2[5]<2;
132 next if (($d1[5]+$d2[5])/$sa)<0.2;
133 #next if ($q1[5]+$q2[5])<40;
134 next if $d1[2]+$d1[3] + $d2[2]+$d2[3] > 0;
135 } elsif ($tumGT eq 'CC' or $tumGT eq 'GG') { # CTGAx2,GACTx2
136 next if $ds[$x][0] + $ds[$x][1] < 5; # 正链(C+T)>5,G=0,A=0
137 next if $ds[$x][2] + $ds[$x][3] > 0;
138 next if $ds[$y][0] < 5; # 负链C>5,A=0,T=0,G=0
139 next if $ds[$y][1] + $ds[$y][2] + $ds[$y][3] > 0;
140 } elsif ($tumGT eq 'TC' or $tumGT eq 'AG') { # TCAGCTGA,AGTCGACT
141 next if $ds[$x][4] + $ds[$x][5] < 5; # 正链(C+T)>5,G=0,A=0
142 next if $ds[$x][6] + $ds[$x][7] > 0;
143 next if $ds[$y][4] < 2 or $ds[$y][5] < 2; # C>2,T>2,其他都是0
144 next if $ds[$y][6] + $ds[$y][7] > 0;
145 next if $ds[$y][0]/$ds[$y][1] <0.25;
146 next if $ds[$y][1]/$ds[$y][0] <0.25;
147 } elsif ($tumGT eq 'AC' or $tumGT eq 'TG') { # AGTCCTGA,TCAGGACT
148 next if $ds[$x][0] < 2;
149 next if $ds[$x][4] + $ds[$x][5] < 2;
150 next if $ds[$x][1] > 0;
151 next if $ds[$x][0]/($ds[$x][4] + $ds[$x][5]) <0.25;
152 next if ($ds[$x][4] + $ds[$x][5])/$ds[$x][0] <0.25;
153 next if $ds[$y][0] < 2;
154 next if $ds[$y][4] < 2;
155 next if $ds[$y][1] + $ds[$y][5] > 0;
156 next if $ds[$y][0]/$ds[$y][4] <0.25;
157 next if $ds[$y][4]/$ds[$y][0] <0.25;
158 } elsif ($tumGT eq 'CG') { # CTGAGACT | ATCG
159 next if $d1[2] + $d1[1] < 2;
162 next if $d1[3]/($d1[1]+$d1[2]) <0.25;
163 next if ($d1[1]+$d1[2])/$d1[3] <0.25;
164 next if $d2[0] + $d2[3] < 2;
167 next if $d2[2]/($d2[0]+$d2[3]) <0.25;
168 next if ($d2[0]+$d2[3])/$d2[2] <0.25;
170 if ($t1 == 0 or $t1 == 1) { # A,T; ATCG
171 next if $sba > $d3[4]+$d4[4]; # 除了ref碱基,其他都是0.
172 next if $d3[4]+$d4[4] < 15; # ref总深度大于15
174 next if $d4[4] < 5; # ref碱基正负链都大于5
175 } elsif ($t1 == 2) { # C; ATCG
176 next if $sba > ($d3[2]+$d4[2]+$d3[1]); # 除了C碱基和正链的T碱基,其他碱基都是0
178 next if ($d3[2]+$d3[1]) < 5;
179 } elsif ($t1 == 3) { # G; ATCG
180 next if $sba > ($d3[3]+$d4[3]+$d4[0]); # 除了G碱基和负链A碱基,其他碱基个数都是0
182 next if ($d4[0]+$d4[3]) < 5;
184 } elsif ($FH->[4][6] eq 'LOH') {
194 ./filter
.pl obsmutect
.snp
.zst
100,100,100,100
195 ./filter
.pl SNP
.bsmap
.final
.h
.zst
100,100,100,100 >bs
.tsv
201 Genotype
: # GT是纯合基因型,bsGT是亚硫酸盐处理后与GT结果一样的另一碱基。C链在GT含C时是正链,在GT含G时是负链,G链与C链相反。
203 Cancer
:支持GT正负链都大于
5,其他都是
0。
206 Cancer
:突变频率大于
0.2,正负链合计。
207 Cancer
:除了AT,其他都是
0,正负链合计。
209 Cancer
:C链
(GT
+bsGT
)>5,其他都是
0。
212 Cancer
:C链
(GT
+bsGT
)>5,其他都是
0。
213 Cancer
:G链GT
>2,bsGT
>2,其他都是
0。
214 Cancer
:G链GT
/bsGT>1/4,bsGT
/GT>1/4。
217 Cancer
:C链
(bs左碱基
+右碱基
)>2
219 Cancer
:G链左碱基
>2,右碱基
>2,其他都是
0。
220 Cancer
:G链左碱基
/右碱基>1/4,右碱基
/左碱基>1/4
221 Cancer
:C链左碱基
/(右碱基+互补左碱基)>1/4,(右碱基
+互补左碱基
)/左碱基>1/4
223 Cancer
:正链:
(C
+T
)>2; G
>2, A
=0
224 Cancer
:正链:G
/(T+C)>1/4, (T
+C
)/G>1/4
225 Cancer
:负链:C
>2,(G
+A
)>2, T
=0,
226 Cancer
:负链:C
/(A+G)>1/4, (A
+G
)/C>1/4
229 Normal
:除了
ref碱基,其他碱基都是
0.
233 Normal
:除了C碱基和正链的T碱基,其他碱基都是
0
234 Normal
:负链C碱基个数大于
5,正链(C
+T大于
5)
236 Normal
:除了G碱基和负链A碱基,其他碱基个数都是
0
237 Normal
:正链G碱基个数大于
5,负链(A
+G大于
5个)
240 第一, germline过滤条件:(原则:原则尽量宽松)
241 只考虑两个文件都是正负链都有支持突变类型。就通过。
243 第二,somatic过滤条件(原则:非常严格)
245 1. 如果genotype是AT,AA,TT
247 1.2 突变频率大于
0.2,深度大于
15。
252 1.2 突变频率大于
0.2,深度大于
15。
282 负链:C
>2,T
>2,其他都是
0,C
/(C+T)>0.2, T/(C
+T
)>0.2
286 正链:A
>2,G
>2,其他都是
0, G
/(G+A)>0.2, A/(G
+A
)>0.2
297 (C
+T
)/total >0.2 and A/total
>0.2
300 正链:T
>2, G
>2, 其他都是
0, T
/(T+G)> 0.2, G/(T
+G
)>0.2
301 负链:T
>2,(G
+A
)>2,C
=0, T
/(T+G+A)>0.2, (G+A)/(T
+G
+A
) > 0.2
306 正链:(C
+T)
>2; G
>2, A
=0, G
/(T+G+C)> 0.2, (C+T)/(C
+T
+G
)>0.2
307 负链:C
>2,(G
+A
)>2, T
=0, C
/(C+A+G)>0.2; (G+A)/(C
+A
+G
)>0.2
311 (二)正常组( Number_of_watson
[A
,T
,C
,G
] Number_of_crick
[A
,T
,C
,G
] Mean_Quality_of_Watson
[A
,T
,C
,G
] Mean_Quality_of_Crick
[A
,T
,C
,G
])
318 2.1 除了C碱基和正链的T碱基,其他碱基都是
0
319 2.2 负链C碱基质量值大于
30,并且个数大于
5,正链(C
+T大于
5)
322 3.1 除了G碱基和负链A碱基,其他碱基个数都是
0
323 3.2 正链G碱基质量值大于
30,并且个数大于
5,负链(A
+G大于
5个)
325 第三,LOH的过滤条件(这个条件更麻烦,后面单独讨论)