make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / convert_vcf_to_hmp.pl
blob82eecbc51a040e1eeab21359e12b0f08976931c1
use strict;
use warnings;
use Data::Dumper;

# Convert a VCF genotype file into a HapMap-style file with single-letter
# IUPAC genotype codes.
#
# Usage:  convert_vcf_to_hmp.pl <vcf_file>
#
# Reads <vcf_file> and writes the result to <vcf_file>.hmp.  Progress and
# diagnostics go to STDERR.  Dies if the input cannot be read, the output
# cannot be written, or the VCF column header is missing or incomplete.

my $vcf_file = shift;
defined $vcf_file or die "Usage: $0 <vcf_file>\n";

#print STDERR Dumper($vcf_file);

open(my $F, "<", $vcf_file) or die "Can't find file $vcf_file: $!";
open(my $G, ">", $vcf_file.".hmp") or die "Can't open $vcf_file.hmp: $!";

# Skip the '##' meta-information lines.  The first line that is not a '##'
# line is taken to be the column header.  The header is stored in a variable
# declared OUTSIDE the loop so that it is still visible once the loop ends.
my $header;
while (my $line = <$F>) {
    if ($line =~ m/^\#\#/) {
        print STDERR "SKIPPING ## lines...\n";
        next;
    }
    $header = $line;
    last;
}
defined $header or die "No header line found in $vcf_file\n";

print STDERR $header;
$header =~ s/\r?\n\z//;    # tolerate DOS line endings as well as Unix ones

my @keys = split /\t/, $header;
#print STDERR Dumper($keys[1]);

# The first header field is normally '#CHROM'; drop the '#' so the column
# can be looked up by its plain name.
$keys[0] =~ s/^\#// if @keys;

# Strip a trailing '|CO_...' suffix from every column name.
for my $key (@keys) {
    $key =~ s/\|CO\_.*//;
}

# Map column name -> column index (the first occurrence wins), and make sure
# every column this script relies on is actually present.
my %col_of;
for my $n (0 .. $#keys) {
    $col_of{ $keys[$n] } = $n unless exists $col_of{ $keys[$n] };
}

my $min_fields = 0;
for my $required (qw(CHROM POS ID REF ALT FORMAT)) {
    exists $col_of{$required}
        or die "Header of $vcf_file has no $required column\n";
    $min_fields = $col_of{$required} + 1
        if $col_of{$required} >= $min_fields;
}

# Every column after FORMAT is treated as a sample column.
my @sample_cols = ($col_of{FORMAT} + 1 .. $#keys);

# Two-base genotype -> single-letter IUPAC code.
my %nuconv = (
    'A/A' => 'A',
    'G/G' => 'G',
    'T/T' => 'T',
    'C/C' => 'C',
    'A/G' => 'R',
    'G/A' => 'R',
    'C/T' => 'Y',
    'T/C' => 'Y',
    'G/C' => 'S',
    'C/G' => 'S',
    'A/T' => 'W',
    'T/A' => 'W',
    'G/T' => 'K',
    'T/G' => 'K',
    'A/C' => 'M',
    'C/A' => 'M',
    './.' => 'N',
);

# NOTE(review): the eleven leading columns below follow the usual HapMap
# layout as read by TASSEL; confirm against the downstream consumer.
my @hmp_fixed = (
    'rs#',       'alleles',   'chrom',     'pos',
    'strand',    'assembly#', 'center',    'protLSID',
    'assayLSID', 'panelLSID', 'QCcode',
);
print $G join("\t", @hmp_fixed, @keys[@sample_cols]), "\n";

# Convert one variant row at a time; nothing needs to be held in memory.
while (my $row = <$F>) {
    $row =~ s/\r?\n\z//;
    next if $row !~ /\S/;

    my @fields = split /\t/, $row, -1;
    if (@fields < $min_fields) {
        warn "Skipping line $. of $vcf_file: too few columns\n";
        next;
    }

    my ($chrom, $pos, $id, $ref, $alt)
        = @fields[ @col_of{qw(CHROM POS ID REF ALT)} ];

    # Position of the GT sub-field within each sample column.  Left
    # undefined when FORMAT has no GT entry, in which case every sample
    # is reported as 'N'.
    my @formats = split /\:/, $fields[ $col_of{FORMAT} ];
    my $gtindex;
    for my $n (0 .. $#formats) {
        if ($formats[$n] eq 'GT') {
            $gtindex = $n;
            last;
        }
    }

    # Allele 0 is REF; alleles 1..n are the comma-separated ALT entries.
    my @alleles = ($ref, split(/,/, $alt));

    my @calls;
    for my $col (@sample_cols) {
        my $call = 'N';
        if (defined $gtindex && defined $fields[$col]) {
            my @scores = split /\:/, $fields[$col];
            $call = genotype_to_iupac($scores[$gtindex], \@alleles);
        }
        push @calls, $call;
    }

    # Fall back to a chrom_pos marker name when the VCF ID is missing.
    my $marker = ($id eq '' || $id eq '.') ? $chrom . "_" . $pos : $id;

    print $G join("\t",
        $marker, join('/', @alleles), $chrom, $pos, '+', ('NA') x 6,
        @calls), "\n";
}

close($F);
close($G) or die "Can't finish writing $vcf_file.hmp: $!";

# genotype_to_iupac($gt, \@alleles)
#
# Translate a diploid VCF GT value (e.g. '0/1' or '1|0') into a
# single-letter code using %nuconv.  $alleles is the list (REF, ALT1,
# ALT2, ...) that the numeric GT indices refer to.  Returns 'N' when the
# call is missing, is not diploid, refers to an allele that does not
# exist, or yields a base pair that has no entry in %nuconv.
sub genotype_to_iupac {
    my ($gt, $alleles) = @_;

    return 'N' unless defined $gt;

    my @indices = split /[\/|]/, $gt;
    return 'N' unless @indices == 2;

    my @bases;
    for my $index (@indices) {
        return 'N' unless $index =~ /^\d+$/ && $index < @$alleles;
        push @bases, $alleles->[$index];
    }

    my $code = $nuconv{ join('/', @bases) };
    return defined $code ? $code : 'N';
}