make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / remove_duplicate_samples_vcf.pl
blob0872acf74163aedb970df13fe07d84e8a5a93c72
1 #!/usr/bin/perl
4 =head1 NAME
6 remove_duplicate_samples_vcf.pl - renames and removes duplicate samples in a vcf file.
7 keeps one copy of the samples.
9 =head1 DESCRIPTION
11 perl remove_duplicate_samples_vcf.pl -i [vcf_input_file] -o [vcf_output_file]
13 requires vcftools. Generates 3 files: a text file with the duplicated samples to remove,
14 a vcf files with the duplicates renamed and a vcf file with out the duplicates.
16 =head1 AUTHOR
18 Isaak Y Tecle <iyt2@cornell.edu>
20 =cut
22 use strict;
23 use Getopt::Std;
24 use File::Slurp qw /read_file write_file/;
27 our($opt_i, $opt_o);
29 getopts('i:o:');
31 open(my $V, "<", $opt_i) || die "Can't open vcf_file: $opt_i\n";
33 my $lines;
34 my @dupl_samples;
36 while (<$V>) {
37 chomp();
39 if ($_ =~ m/^\#CHROM/) {
40 print STDERR "Parsing ids in vcf file...\n";
41 my @orig_fields = split /\t/;
42 my @modified_fields;
43 for (my $i=0; $i <= $#orig_fields; $i++) {
44 my $field = $orig_fields[$i];
45 if ($i < 9) {
46 print "\nkeeping the first 9 columns: $field\n";
47 push @modified_fields, $field;
48 } else {
49 if (grep{$field eq $_} @modified_fields) {
50 $field = "${field}_dupl_${i}";
51 print "$orig_fields[$i] at col $i is a duplicate -- modified its name to $field\n";
52 push @dupl_samples, $field;
54 } else {
55 print STDERR "\n$field at col $i is a unique sample\n";
58 push @modified_fields, $field;
62 my $line = join("\t", @modified_fields);
63 $lines .= $line . "\n";
65 else {
66 $lines .= $_ ."\n";
71 my $dupl_samples = join("\n", @dupl_samples);
72 my $out_file = $opt_o =~ s/\.vcf//r;
73 my $remove_samples_file = "${out_file}_dupl_samples.txt";
74 my $removed_vcf = "${out_file}_removed.vcf";
75 my $renamed_vcf = "${out_file}_renamed.vcf";
77 print STDERR "Now writing to $remove_samples_file duplicate samples:\n$dupl_samples";
78 write_file($remove_samples_file, $dupl_samples);
80 print STDERR "Now writing to $renamed_vcf duplicate samples:\n$dupl_samples";
81 write_file($renamed_vcf, $lines);
83 print STDERR "Now removing duplicate samples:\n$dupl_samples";
84 `vcftools --remove $remove_samples_file --vcf $renamed_vcf --recode --out $removed_vcf`;
85 my $recode_file = "${removed_vcf}.recode.vcf";
87 print STDERR "Renaming $recode_file to $removed_vcf\n";
88 `mv $recode_file $removed_vcf`;
89 print STDERR"\nCleaned vcf file without the duplicates is $removed_vcf\n";
91 print STDERR "\nDone.";