Merge pull request #3987 from solgenomics/topic/description_to_text
[sgn.git] / bin / jbrowse_vcf_tools / finish_indiv.pl
blobd21ce2a007b7c7f7670ef4a4ad54347b58b829a9
1 #!/usr/bin/perl
3 use warnings;
4 use strict;
5 use Getopt::Std;
7 our ($opt_v, $opt_d, $opt_f);
9 getopts('v:d:f:');
11 my $raw_vcf_file = $opt_v;
12 my $dosage_file = $opt_d;
13 my $starting_file = $opt_f;
14 my $imputed_file;
15 my $filtered_file;
16 my @genotypes;
17 my $placeholder;
18 my $num_seen = 1;
19 my $this_rep = $starting_file;
20 $this_rep =~ s/^.+rep(\d+).vcf.+$/$1/;
21 print STDERR "this rep = $this_rep \n";
22 if (length($this_rep) != 1) {
23 $this_rep = 1;
25 print STDERR "this rep = $this_rep \n";
27 #-----------------------------------------------------------------------
28 # finish indiv vcf file
29 #-----------------------------------------------------------------------
31 my $vcf_file = $starting_file;
32 $vcf_file =~ s/^(.+)_(\d+)$/$1/;
33 print STDERR "finished filename = $vcf_file \n";
35 my $accession_name = $starting_file;
36 $accession_name =~ s:^output/(.+)_2015_V6.+:$1:;
37 print STDERR "accession name = $accession_name \n";
39 my $column_number = $starting_file;
40 $column_number =~ s/^(.+)_(\d+)$/$2/;
41 print STDERR "column number = $column_number \n";
43 system ("cut -f 1-9,$column_number $raw_vcf_file | tail -n +15761 >> $starting_file");
44 system ("mv $starting_file $vcf_file");
45 print "$vcf_file finished!\n";
47 #-----------------------------------------------------------------------
48 # extract imputed SNP data pertaining to this accession from dosage file
49 #-----------------------------------------------------------------------
51 open (IMPS, "<", $dosage_file) || die "Can't open $dosage_file!\n";
52 while (<IMPS>) {
53 chomp;
54 ($placeholder, @genotypes) = split /\t/;
55 unless ($placeholder eq $accession_name) {
56 next;
58 if ($this_rep > 1) {
59 if ($num_seen < $this_rep) {
60 $num_seen++;
61 next;
64 open (VCF, "<", $vcf_file) || die "Can't open $vcf_file!\n";
65 $imputed_file = $vcf_file;
66 $imputed_file =~ s/.vcf/_imputed.vcf/;
67 $filtered_file = $vcf_file;
68 $filtered_file =~ s/.vcf/_filtered.vcf/;
70 open (OUTFILE, ">", $imputed_file) || die "Can't open $imputed_file!\n";
71 open (OUTFILE2, ">", $filtered_file) || die "Can't open $filtered_file!\n";
73 LINE: while (<VCF>) {
74 if (m/^#/) {
75 print OUTFILE $_;
76 print OUTFILE2 $_;
77 } else {
78 chomp;
79 my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, $DATA) = split /\t/;
80 if (length($ALT) > 1) {
81 shift @genotypes;
82 next LINE;
83 } else {
84 $_ = $DATA;
85 if (m/(^0|^1)/) {
86 print OUTFILE join "\t", $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, $DATA;
87 print OUTFILE "\n";
88 print OUTFILE2 join "\t", $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, $DATA;
89 print OUTFILE2 "\n";
90 shift @genotypes;
91 } else {
92 $_ = shift @genotypes;
93 if ($_ <= 0.10) {
94 $DATA =~ s/\.\/\./0\/0/;
95 } elsif ($_ >= 0.90 && $_ <= 1.10) {
96 $DATA =~ s/\.\/\./0\/1/;
97 } elsif ($_ >= 1.90) {
98 $DATA =~ s/\.\/\./1\/1/;
99 } else {
100 next LINE;
102 print OUTFILE join "\t", $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, $DATA;
103 print OUTFILE "\n";
107 next;
109 last;
111 close VCF;
112 close OUTFILE;
113 close OUTFILE2;
115 #----------------------------------------------------
116 # Report a completed accession and compress files
117 #----------------------------------------------------
119 print STDERR "$vcf_file filtered and imputed files finished\n";
120 system ( "bgzip $vcf_file" );
121 system ( "bgzip $imputed_file" );
122 system ( "bgzip $filtered_file" );
123 system ( "tabix -p vcf $vcf_file.gz" );
124 system ( "tabix -p vcf $imputed_file.gz" );
125 system ( "tabix -p vcf $filtered_file.gz" );