remove_duplicate_samples_vcf.pl - renames and removes duplicate samples in a vcf file,
keeping one copy of each sample.
11 perl remove_duplicate_samples_vcf.pl -i [vcf_input_file] -o [vcf_output_file]
Requires vcftools. Generates 3 files: a text file listing the duplicated samples to remove,
a vcf file with the duplicates renamed, and a vcf file without the duplicates.
18 Isaak Y Tecle <iyt2@cornell.edu>
use strict;
use warnings;

use File::Slurp qw/read_file write_file/;
use Getopt::Std;

# Number of fixed, non-sample columns at the start of a VCF "#CHROM" header
# line; every column after these is treated as a sample id.
use constant FIXED_COLUMNS => 9;

# Command-line options: -i <input vcf>, -o <output vcf>.
our ($opt_i, $opt_o);
my $usage = "Usage: perl remove_duplicate_samples_vcf.pl "
          . "-i [vcf_input_file] -o [vcf_output_file]\n";
getopts('i:o:') or die $usage;
die $usage unless defined $opt_i && defined $opt_o;

open(my $V, "<", $opt_i) or die "Can't open vcf_file: $opt_i: $!\n";

my @dupl_samples;        # renamed ids of the duplicate samples (to be removed)
my $lines       = '';    # full text of the vcf with duplicates renamed
my $header_seen = 0;

while (my $row = <$V>) {
    if ($row =~ m/^\#CHROM/) {
        print STDERR "Parsing ids in vcf file...\n";

        # Strip the line ending first, otherwise the last sample id carries a
        # newline and can never compare equal to an earlier copy of itself.
        $row =~ s/\r?\n\z//;

        my ($modified_fields, $renamed) =
            _rename_duplicate_samples(split /\t/, $row, -1);

        push @dupl_samples, @{$renamed};
        $lines .= join("\t", @{$modified_fields}) . "\n";
        $header_seen = 1;
    }
    else {
        # Meta-information and genotype lines are copied through unchanged.
        $lines .= $row;
    }
}
close $V;

die "No #CHROM header line found in $opt_i\n" unless $header_seen;

my $dupl_samples = join("\n", @dupl_samples);

# Anchor the match so only a trailing ".vcf" extension is stripped, not a
# ".vcf" that happens to occur earlier in the path.
my $out_file            = $opt_o =~ s/\.vcf\z//r;
my $remove_samples_file = "${out_file}_dupl_samples.txt";
my $removed_vcf         = "${out_file}_removed.vcf";
my $renamed_vcf         = "${out_file}_renamed.vcf";

print STDERR "Now writing to $remove_samples_file duplicate samples:\n$dupl_samples\n";
write_file($remove_samples_file, $dupl_samples);

print STDERR "Now writing vcf with renamed duplicate samples to $renamed_vcf\n";
write_file($renamed_vcf, $lines);

print STDERR "Now removing duplicate samples:\n$dupl_samples\n";

# List-form system() bypasses the shell, so file names containing spaces or
# shell metacharacters are passed to vcftools intact.
my @vcftools_cmd = (
    'vcftools',
    '--remove', $remove_samples_file,
    '--vcf',    $renamed_vcf,
    '--recode',
    '--out',    $removed_vcf,
);
system(@vcftools_cmd) == 0
    or die "vcftools failed ("
         . ($? == -1 ? "could not run: $!" : "exit status " . ($? >> 8))
         . "): @vcftools_cmd\n";

# vcftools appends ".recode.vcf" to the --out prefix; move the result onto
# the intended file name.
my $recode_file = "${removed_vcf}.recode.vcf";

print STDERR "Renaming $recode_file to $removed_vcf\n";
rename($recode_file, $removed_vcf)
    or die "Can't rename $recode_file to $removed_vcf: $!\n";

print STDERR "\nCleaned vcf file without the duplicates is $removed_vcf\n";

print STDERR "\nDone.\n";

# Renames repeated sample ids among the fields of a "#CHROM" header line.
#
# Takes the header fields as a list. The first FIXED_COLUMNS fields are kept
# as they are; any later field equal to a field already kept is renamed to
# "<id>_dupl_<column index>". Progress is reported per field.
#
# Returns two array refs: the complete list of fields after renaming, and the
# new names given to the duplicates (in column order).
sub _rename_duplicate_samples {
    my @orig_fields = @_;

    my @modified_fields;
    my @renamed;
    my %seen;    # every field name kept so far (fixed columns included)

    for my $i (0 .. $#orig_fields) {
        my $field = $orig_fields[$i];

        if ($i < FIXED_COLUMNS) {
            print "\nkeeping the first 9 columns: $field\n";
        }
        elsif ($seen{$field}) {
            $field = "${field}_dupl_${i}";
            print "$orig_fields[$i] at col $i is a duplicate -- modified its name to $field\n";
            push @renamed, $field;
        }
        else {
            print STDERR "\n$field at col $i is a unique sample\n";
        }

        $seen{$field} = 1;
        push @modified_fields, $field;
    }

    return (\@modified_fields, \@renamed);
}