modified: Makefile
[GalaxyCodeBases.git] / projects / radseq / vcf-consensus
blob5360548e47279e37b4152da9d62346502ebf2be4
1 #!/usr/bin/env perl
3 use strict;
4 use warnings;
5 use Carp;
6 use Vcf;
8 our %IUP=(A => 'A',
9 T => 'T',
10 C => 'C',
11 G => 'G',
12 AC => 'M',
13 AG => 'R',
14 AT => 'W',
15 CG => 'S',
16 CT => 'Y',
17 'GT' => 'K',
18 ACG => 'V',
19 ACT => 'H',
20 AGT => 'D',
21 CGT => 'B',
22 ACGT=> 'N',
23 N => 'N'
26 my $opts = parse_params();
27 do_consensus($opts);
29 exit;
31 #--------------------------------
33 sub error
35 my (@msg) = @_;
36 if ( scalar @msg )
38 croak @msg;
40 die
41 "Usage: cat ref.fa | vcf-consensus [OPTIONS] in.vcf.gz > out.fa\n",
42 "Options:\n",
43 " -h, -?, --help This help message.\n",
44 " -H, --haplotype <int> Apply only variants for the given haplotype (1,2)\n",
45 " -i, --iupac-codes Apply variants in the form of IUPAC ambiguity codes\n",
46 " -s, --sample <name> If not given, all variants are applied\n",
47 "Examples:\n",
48 " # Get the consensus for one region. The fasta header lines are then expected\n",
49 " # in the form \">chr:from-to\".\n",
50 " samtools faidx ref.fa 8:11870-11890 | vcf-consensus in.vcf.gz > out.fa\n",
51 "\n";
55 sub parse_params
57 my $opts = { };
58 while (my $arg=shift(@ARGV))
60 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
61 if ( $arg eq '-s' || $arg eq '--sample' ) { $$opts{sample}=shift(@ARGV); next; }
62 if ( $arg eq '-i' || $arg eq '--iupac-codes' ) { $$opts{iupac}=1; next; }
63 if ( $arg eq '-H' || $arg eq '--haplotype' ) { $$opts{haplotype}=shift(@ARGV); next; }
64 if ( -e $arg && !exists($$opts{vcf_file}) ) { $$opts{vcf_file}=$arg; next; }
65 error("Unknown parameter \"$arg\". Run -h for help.\n");
68 if ( $$opts{iupac} )
70 delete($$opts{iupac});
71 $$opts{iupac}{CT} = 'Y';
72 $$opts{iupac}{TC} = 'Y';
73 $$opts{iupac}{AG} = 'R';
74 $$opts{iupac}{GA} = 'R';
75 $$opts{iupac}{AT} = 'W';
76 $$opts{iupac}{TA} = 'W';
77 $$opts{iupac}{GC} = 'S';
78 $$opts{iupac}{CG} = 'S';
79 $$opts{iupac}{TG} = 'K';
80 $$opts{iupac}{GT} = 'K';
81 $$opts{iupac}{CA} = 'M';
82 $$opts{iupac}{AC} = 'M';
83 $$opts{iupac}{AA} = 'A';
84 $$opts{iupac}{CC} = 'C';
85 $$opts{iupac}{GG} = 'G';
86 $$opts{iupac}{TT} = 'T';
89 if ( exists($$opts{haplotype}) && !exists($$opts{sample}) ) { error("Expected -s option with -H.\n"); }
90 return $opts;
93 sub do_consensus
95 my ($opts) = @_;
97 my $vcf = Vcf->new(file=>$$opts{vcf_file});
98 $vcf->parse_header;
99 if ( exists($$opts{sample}) )
101 if ( !exists($$vcf{has_column}{$$opts{sample}}) ) { error("No such sample: $$opts{sample}"); }
102 $$opts{vcf} = $vcf;
103 $$opts{sample_col} = $$vcf{has_column}{$$opts{sample}};
105 my $chrs = $vcf->get_chromosomes();
106 my %chrs = map { $_=>0 } @$chrs;
108 my ($chr,$vcf_pos,$warned,$vcf_line);
109 while (my $line=<STDIN>)
111 if ( $line=~/^>([^:\s]+)/ )
113 $chr = $1;
115 for my $line (@{$$vcf{buffer}}) { apply_variant($opts,$line); }
116 flush_fa_buffer($opts,0);
118 my $rest = $';
119 if ( $rest=~/^:(\d+)-\d+$/ )
121 print STDERR "Looks as fasta file snippet, the sequence $chr starts at position $1\n";
122 $$opts{fa_pos} = $1;
124 else
126 $$opts{fa_pos} = 1;
128 $$opts{fa_idx} = 0;
129 $$opts{fa_frz} = 0;
130 if ( exists($chrs{$chr}) ) { $chrs{$chr}=1; }
131 my $region = $$opts{fa_pos} > 1 ? "$chr:$$opts{fa_pos}" : $chr;
132 $vcf->open(region=>$region);
133 print $line;
134 next;
137 chomp($line);
138 if ( !$$opts{case_known} )
140 if ( uc($line) eq $line ) { $$opts{case_known} = 'u'; }
141 elsif ( lc($line) eq $line ) { $$opts{case_known} = 'l'; }
142 else { $$opts{case_known} = 'u'; }
144 $$opts{fa_buf} .= $line;
145 $$opts{fa_len} += length($line);
147 while ( defined($vcf_line = $vcf->next_data_array()) )
149 # can the beginning of the buffer be printed?
150 if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1] )
152 $vcf->_unread_line($vcf_line);
153 flush_fa_buffer($opts,60);
154 last;
156 # is the buffer long enough?
157 if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1]+length($$vcf_line[3]) )
159 $vcf->_unread_line($vcf_line);
160 last;
162 apply_variant($opts,$vcf_line);
165 if ( !defined $vcf_line ) { flush_fa_buffer($opts,60); }
167 flush_fa_buffer($opts,0);
169 for my $chr (keys %chrs)
171 if ( !$chrs{$chr} ) { warn("The sequence \"$chr\" not found in the fasta file.\n"); }
175 sub flush_fa_buffer
177 my ($opts,$len) = @_;
178 while ( $$opts{fa_len} && $$opts{fa_len}>=60 )
180 print substr($$opts{fa_buf},0,60,''), "\n";
181 $$opts{fa_len} -= 60;
182 $$opts{fa_pos} += 60 - $$opts{fa_idx};
183 $$opts{fa_idx} = 0;
185 if ( $len or !$$opts{fa_len} ) { return; }
186 print $$opts{fa_buf},"\n";
187 $$opts{fa_pos} += $$opts{fa_len}-$$opts{fa_idx};
188 $$opts{fa_len} = 0;
189 $$opts{fa_buf} = '';
190 $$opts{fa_idx} = 0;
193 sub apply_variant
195 my ($opts,$vline) = @_;
197 if ( $$vline[4] eq '.' ) { return; }
199 my $hap = exists($$opts{haplotype}) ? $$opts{haplotype} : 0;
200 my $alt;
201 if ( !exists($$opts{sample_col}) )
203 # No sample requested, applying all sites, first ALT
204 my $idx;
205 $alt = ($idx=index($$vline[4],','))==-1 ? $$vline[4] : substr($$vline[4],0,$idx);
206 if ( exists($$opts{iupac}) && length($$vline[3])==1 && length($alt)==1 )
208 $alt = uc($$vline[3].$alt);
209 if ( !exists($$opts{iupac}{$alt}) ) { error("No IUPAC code for \"$alt\"\n"); }
210 $alt = $$opts{iupac}{$alt};
213 else
215 my $igt = $$opts{vcf}->get_tag_index($$vline[8],'GT',':');
216 if ( $igt==-1 ) { return; }
217 my $gt = $$opts{vcf}->get_field($$vline[$$opts{sample_col}-1],$igt);
218 my @als = $$opts{vcf}->split_gt($gt);
219 if ( $hap )
221 # Note: we are not checking the phase or phase blocks, assuming the VCF is perfect
222 if ( $hap <= @als && $als[$hap-1] ne '0' )
224 $alt = $$opts{vcf}->get_field($$vline[4],$als[$hap-1]-1,',');
227 else
229 if ( exists($$opts{iupac}) && length($$vline[3])==1 ) # only for SNPs and with -i
231 my @alts;
232 my %GTs;
233 for my $al (@als) {
234 if ( $al eq '.' ) {
235 $alt = '.';
236 } elsif ( $al eq '0' ) {
237 $alt = uc($$vline[3]);
238 } else {
239 $alt = $$opts{vcf}->get_field($$vline[4],$al-1,',');
240 if ( length($alt)!=1 ) { last; }
242 ++$GTs{$alt}; # if $alt ne '.'
243 =pod
244 if ( $al eq '.' ) { last; }
245 if ( $al eq '0' ) { push @alts,uc($$vline[3]); }
246 else
248 $alt = $$opts{vcf}->get_field($$vline[4],$al-1,',');
249 push @alts, uc($alt);
250 if ( length($alt)!=1 ) { last; }
252 =cut
254 my $dGT = join('',(sort keys %GTs));
255 if ($dGT eq '') {
256 $alt = undef;
257 } elsif ( $dGT=~ /^\./ ) {
258 $alt = 'N';
259 } else {
260 $alt = $IUP{$dGT};
261 die "$dGT" unless exists $IUP{$dGT};
262 #warn "$alt,$dGT";
264 =pod
265 if ( @alts==2 )
267 if ( !exists($$opts{iupac}{$alts[0].$alts[1]}) ) { error("No IUPAC code for \"$alts[0]/$alts[1]\"\n"); }
268 $alt = $$opts{iupac}{$alts[0].$alts[1]};
270 elsif ( length($alts[0])==1 )
272 if ( !exists($$opts{iupac}{$alts[0].$alts[0]}) ) { error("No IUPAC code for \"$alts[0]/$alts[0]\"\n"); }
273 $alt = $$opts{iupac}{$alts[0].$alts[0]};
275 =cut
277 else
279 for my $al (@als)
281 if ( $al eq '0' or $al eq '.' ) { next; }
282 $alt = $$opts{vcf}->get_field($$vline[4],$al-1,',');
283 last;
287 if ( !defined $alt or $alt eq $$vline[3] ) { return; }
290 if ( $$vline[1] <= $$opts{fa_frz} )
292 print STDERR "Note: Conflicting variants at (or near) $$vline[0]:$$vline[1], cannot apply both.\n";
293 return;
296 my $pos = $$vline[1] - $$opts{fa_pos} + $$opts{fa_idx};
297 if ( $pos<0 or $pos>=$$opts{fa_len} ) { error("FIXME: $$vline[0]:$$vline[1] .. $$opts{fa_pos},$pos,$$opts{fa_len},$$opts{fa_frz}\n"); }
299 # Sanity check
300 my $ref_len = length($$vline[3]);
301 if ( $$vline[3] ne uc(substr($$opts{fa_buf},$pos,$ref_len)) )
303 error(sprintf "The fasta sequence does not match the REF at $$vline[0]:$$vline[1]. %s(%s) in .fa, %s in .vcf, frz=%d\n",
304 substr($$opts{fa_buf},$pos,$ref_len),
305 substr($$opts{fa_buf},$pos+1,$ref_len+5),
306 $$vline[3], $$opts{fa_frz}?$$opts{fa_frz}:0);
308 if ( $$opts{case_known} eq 'l' ) { $alt = lc($alt); }
310 my $alt_len = length($alt);
311 substr($$opts{fa_buf},$pos,$ref_len,$alt);
312 $$opts{fa_len} += $alt_len - $ref_len;
313 $$opts{fa_pos} += $ref_len; # position with respect to the original reference sequence
314 $$opts{fa_idx} += $alt_len; # position in the modified sequence
315 $$opts{fa_frz} = $$vline[1] + $ref_len - 1; # freeze changes until this position