3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
10 use Data
::Dump
qw(ddx);
11 use Galaxy
::IO
::FASTAQ
;
12 use Galaxy
::SeqTools
qw(translate revcom);
14 die "Usage: $0 <vcf.gz with tabix indexed> <out> <regions> (eg.: scaffold75:924209-5441687)\n" if @ARGV<2;
15 my $fafs='/share/users/huxs/work/tiger/paper/parents75n1458.fa';
16 my $gtfs = '/share/users/huxs/work/tiger/paper/parents75n1458.gtf';
21 warn "From [$vcfs][$gtfs][$fafs] to [$outfs]\n";
22 warn "Regions:[$regions]\n" if $regions;
25 AAs = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
26 Starts = ---M---------------M---------------M----------------------------
27 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
28 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
29 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
31 my @t=split /\n/,$code;
32 my (%t,%gen_code,%start_codes);
33 map {s/\s//g;@_=split /=/;$t{$_[0]}=[split //,$_[1]]} @t;
38 # map {my $t=shift @{$t{$_}};push @bases,$t} qw/Base1 Base2 Base3/;
39 # my $base=join('',@bases);
40 my $base=(shift @
{$t{Base1
}}).(shift @
{$t{Base2
}}).(shift @
{$t{Base3
}});
42 my $start=shift @
{$t{Starts
}};
43 ++$start_codes{$base} if $start eq 'M';
47 #ddx [\@t,\%gen_code,\%start_codes];
48 print "Codon Table:\n$code\n";
50 my (%GeneDat,%Annoted);
51 open GTF
,'<',$gtfs or dir
$!;
52 while(my $in_line = <GTF
>){
53 next if ($in_line =~ /^\s*\#/);
55 #remove leading whitespace and get comments
57 while($in_line =~ /^(.*)\#(.*)$/){
59 $comments = $2.$comments;
61 if($in_line =~ /^\s+(.*)$/){
64 my @data = split /\s+/,$in_line;
65 #verify line is correct length
67 my %feature = map {s/\s*;$//; s/\"//g; $_;} splice @data,8;
68 #ddx [\@data,\%feature];
69 if ($data[2] eq 'transcript') {
70 die "[$feature{gene_type}]" if $feature{gene_type
} ne 'protein_coding';
71 $GeneDat{$data[0]}{$feature{gene_id
}}=[$feature{gene_name
},$data[6],[],$data[3]];
72 } elsif ($data[2] eq 'CDS' or $data[2] eq 'stop_codon') {
73 push @
{$GeneDat{$data[0]}{$feature{gene_id
}}->[2]},[$data[3],$data[4]];
74 push @
{$Annoted{$data[0]}},[$feature{gene_id
},$data[3],$data[4]];
78 for my $chr (keys %GeneDat) {
79 $Annoted{$chr} = [ sort { $a->[1] <=> $b->[1] } @
{$Annoted{$chr}} ];
80 for my $id (keys %{$GeneDat{$chr}}) {
81 my ($gene,$strand,$cdsA) = @
{$GeneDat{$chr}{$id}};
83 $cdsA = [ sort {$a->[0] <=> $b->[0]} @
$cdsA ];
84 } elsif ($strand eq '-') {
85 $cdsA = [ sort {$b->[0] <=> $a->[0]} @
$cdsA ];
87 $GeneDat{$chr}{$id}->[2] = $cdsA;
96 open $fafh,'<',$fafs or die "$!";
98 my ($name, $comment, $seq, $ret);
99 my ($n, $len) = (0, 0);
100 while ( $ret = &readfq
($fafh, \
@aux) ) {
101 ($name, $comment, $seq) = @
$ret;
104 print "$n: ",join("\t", $name, $comment, $len), "\n";
105 $RefSeq{$name} = $seq;
110 open OUTCDNA
,'>',"$outfs.cDNA.fa" or die $!;
111 open OUTPT
,'>',"$outfs.protein.fa" or die $!;
112 for my $chr (sort keys %GeneDat) {
113 for my $id (sort { $GeneDat{$chr}{$a}->[3] <=> $GeneDat{$chr}{$b}->[3] } keys %{$GeneDat{$chr}}) {
114 my ($gene,$strand,$cdsA,$st) = @
{$GeneDat{$chr}{$id}};
115 print OUTCDNA
">${gene}_$id $chr:$strand:$st";
116 print OUTPT
">${gene}_$id $chr:$strand:$st";
117 my ($len,$seq,$tmpseq,$AA)=(0);
120 print OUTCDNA
",$s-$e";
122 $tmpseq = substr $RefSeq{$chr},$s-1,$e-$s+1;
123 $tmpseq = revcom
($tmpseq) if $strand eq '-';
124 $seq .= $tmpseq."\n";
126 print OUTCDNA
" =$len ",int($len*10/3)/10,"\n$seq\n";
128 $AA = translate
($seq,\
%gen_code);
129 print OUTPT
" $len ",int($len*10/3)/10,"\n$AA\n";
137 my $vcf = Vcf
->new(file
=>$vcfs,region
=>$regions);
138 $vcf->parse_header();
139 my (@samples) = $vcf->get_samples();
142 my (%mutGenes,%mutSNPs);
143 while (my $x=$vcf->next_data_hash()) {
144 die unless exists $Annoted{$$x{CHROM
}};
145 #ddx $x if exists $$x{INFO}{INDEL};
146 next if $$x{QUAL
} < 20;
147 my (%GTok,%GTcnt,%GTp1,%GTp2);
148 my %GTs = %{$$x{gtypes
}};
150 for my $sample (keys %GTs) {
151 next if $GTs{$sample}{DP
}<=0 or $GTs{$sample}{GQ
}<20;
153 $GTok{$sample} = $GTs{$sample}{GT
};
154 ++$GTcnt{ $GTs{$sample}{GT
} };
155 ++$GTp1{ $GTs{$sample}{GT
} } if $sample =~ /^B(HX|XH)01/; # bb
156 ++$GTp2{ $GTs{$sample}{GT
} } if $sample =~ /^JHH001/; # Bb
158 next unless keys %GTok;
159 next if (keys %GTcnt) != 2;
160 next unless keys(%GTp1)==1 and keys(%GTp2)==1;
161 my $GT1 = (keys %GTp1)[0];
162 my $GT2 = (keys %GTp2)[0];
163 my ($a1,$a2,$a3) = $vcf->split_gt($GT1);
167 ($a1,$a2,$a3) = $vcf->split_gt($GT2);
171 } elsif ($a2 eq $GT1) {
175 #print "B:$GT2, b:$GT1 [$sampleCNT]\n";
176 my @Bases = ($$x{REF
},@
{$$x{ALT
}});
177 my $GT2Base = $Bases[$GT2];
178 my $GT1Base = $Bases[$GT1];
179 my ($theGID,$theS,$theE) = @
{ChechRange
( $$x{CHROM
},$$x{POS
} )};
181 die if exists $$x{INFO
}{INDEL
}; # No need to do INDEL as there is none.
183 unless (exists $mutGenes{$$x{CHROM
}}{$theGID}) {
184 # if ($GeneDat{$$x{CHROM}}{$theGID}->[1] eq '+') {
185 # $mutGenes{$$x{CHROM}}{$theGID} = [$cDNA{$theGID},$cDNA{$theGID}];
186 # } elsif ($GeneDat{$$x{CHROM}}{$theGID}->[1] eq '-') {
187 # my $tmpstr = revcom( $cDNA{$theGID} );
188 # $mutGenes{$$x{CHROM}}{$theGID} = [$tmpstr,$tmpstr];
190 $mutGenes{$$x{CHROM
}}{$theGID} = [$cDNA{$theGID},$cDNA{$theGID}];
192 push @
{ $mutSNPs{$$x{CHROM
}}{$theGID} },[$$x{POS
}, $$x{REF
}, $GT2Base, $GT1Base];
194 print join(",",$$x{CHROM
},$$x{POS
},$GeneDat{$$x{CHROM
}}{$theGID}->[0]." \tB:$GT2,$GT2Base b:$GT1,$GT1Base [$sampleCNT] ",
195 $theGID,$GeneDat{$$x{CHROM
}}{$theGID}->[1],$theS,$theE),"\n";
196 # for my $gt (keys %GTs) {
197 # my ($a1,$a2,$a3) = $vcf->split_gt($gt);
198 # if ($a3 or ($a1 != $a2)) {
206 # CHROM => "scaffold75",
208 # FORMAT => ["GT", "PL", "DP", "SP", "GQ"],
210 # "BHX011.bam" => { DP => 19, GQ => 61, GT => "0/0", PL => "0,57,255", SP => 0 },
211 # "BHX019.bam" => { DP => 26, GQ => 82, GT => "0/0", PL => "0,78,255", SP => 0 },
212 # "JHH001.bam" => { DP => 28, GQ => 99, GT => "0/1", PL => "244,0,255", SP => 9 },
219 # DP4 => "34,26,4,9",
222 # PV4 => "0.13,1,0.12,1",
229 #print $vcf->format_line($x);
234 open OUTDNA
,'>',"$outfs.candidateDNA.fa" or die $!;
235 open OUTAA
,'>',"$outfs.candidateAA.fa" or die $!;
237 for my $chr (keys %mutSNPs) {
238 for my $gid (sort keys %{$mutSNPs{$chr}}) {
239 for ( @
{$mutSNPs{$chr}{$gid}} ) {
240 my ($pos, $ref, $GT2Base, $GT1base) = @
$_;
241 $mutGenes{$chr}{$gid}->[0] = mutpoint
($chr,$pos,$ref,$GT2Base,$gid,$mutGenes{$chr}{$gid}->[0]);
242 $mutGenes{$chr}{$gid}->[1] = mutpoint
($chr,$pos,$ref,$GT1base,$gid,$mutGenes{$chr}{$gid}->[1]);
244 my $id = $gid.$GeneDat{$chr}{$gid}->[1].$GeneDat{$chr}{$gid}->[0];
245 my @diff = cmpstr
($mutGenes{$chr}{$gid}->[0],$mutGenes{$chr}{$gid}->[1]);
247 my $AA0 = translate
($mutGenes{$chr}{$gid}->[0],\
%gen_code);
248 my $AA1 = translate
($mutGenes{$chr}{$gid}->[1],\
%gen_code);
249 my @diffAA = cmpstr
($AA0,$AA1);
251 for (@diff,@diffAA) {
252 $_ = join('',@
$_[1,0,2]);
254 print OUTDNA
">${id}_B $chr\n",$mutGenes{$chr}{$gid}->[0],"\n",
255 ">${id}_b $chr ",join(',',scalar(@diff),@diff),"\n",$mutGenes{$chr}{$gid}->[1],"\n\n";
256 print OUTAA
">${id}_B $chr\n",$AA0,"\n",
257 ">${id}_b $chr ",join(',',scalar(@diffAA),@diffAA),"\n",$AA1,"\n\n";
266 if (exists $Annoted{$chr}) {
267 my $datA = $Annoted{$chr};
268 if ($pos > $$datA[-1][2] or $pos < $$datA[0][1]) {
272 my ($gid,$s,$t) = @
$_;
276 push @
{$gidat{$gid}},[$s,$t];
278 if ( (keys %gidcnt) == 1 ) {
279 my $id=(keys %gidcnt)[0];
280 return [$id,@
{$gidat{$id}->[0]}];
282 die "$chr,$pos" if (keys %gidcnt) > 1;
285 #push @{$Annoted{$data[0]}},[$feature{gene_id},$data[3],$data[4]];
288 sub mutpoint
($$$$$$) {
289 my ($chr,$pos,$ref,$gt,$gid,$seq) = @_;
290 # $GeneDat{$chr}{$gene_id}=[$gene_name,$strand,[[s1,e1],[s2,e2]],$start];
291 # $cDNA{$gid} = $seq;
292 # $Protein{$gid} = $AA;
293 my ($gname,$strand,$cdsA) = @
{$GeneDat{$chr}{$gid}};
294 my ($met,$len,$tmp) = (0,0);
296 my $REFseq = $cDNA{$gid};
297 $gt =~ tr/ATCG/TAGC/ if $strand eq '-';
302 if ($pos >= $s and $pos <= $e) {
304 #$len += $pos - $s +1;
306 if ($strand eq '+') {
307 $len += $pos - $s +1 -1;
308 } elsif ($strand eq '-') {
309 $len += $e - $pos +1 -1;
311 substr($seq,$len,1) = $gt;
312 #$seq = substr($cDNA{$gid},0,$len) . $gt . substr($cDNA{$gid},$len+1)
313 my $test = substr($REFseq,$len,1);
314 $test =~ tr/ATCG/TAGC/ if $strand eq '-';
315 warn "$gid($gname),$strand: $test ne $ref" if $test ne $ref;
326 while ($c =~ /[^\0]/g) {
328 push @ret,[$p,substr($a,$p-1,1),substr($b,$p-1,1)];
335 ./vcf2cdsN
.pl parents
.vcf
.gz n3f
> n3f
.lg
337 diff
./vcf2cds.pl ./vcf2cdsN
.pl
339 < my $gtfs = '/share/users/huxs/work/tiger/paper/parents75n1458.gtf';
341 > my $gtfs = '/share/users/huxs/work/tiger/paper/new.tcs.gtf';