perl/etc/WoodyMiaoLin/IndivID/IndivID.pl

   1 #!/usr/bin/perl
   2 use strict;
   3 use warnings;
   4
   5 die("
   6 Program: IndivID
   7 Version: 1.0
   8 Release: Mar. 6, 2013\n
   9 Auther: Woody
  10 Consultant: Galaxy\n
  11 Usage: $0 <input> <output>\n
  12 The input file should look like:\n
  13 #ID     Locus1          Locus2          Locus3          ...
  14 Sample1 160     162     166     166     0       0       ...
  15 Sample2 0       0       174     176     178     178     ...
  16 Sample3 180     182     186     186     188     188     ...
  17 ...     ...     ...     ...     ...     ...     ...     ...
  18 \nMissing data must be \"0\".
  19 \n") if (@ARGV<2);
  20
  21 my $in = shift;
  22 my $out = shift;
  23 open I, "<", "$in";
  24 open O, ">", "$out";
  25
  26 # Load input file into @gt and %gt.
  27 my $head = <I>;
  28 my (@gt, %gt);
  29 while (<I>) {
  30         s/\r//;
  31         chomp;
  32         my @a = split /\t/;
  33         push @gt, [@a];
  34         my $b = shift @a;
  35         $gt{$b} = \@a;
  36 }
  37 close I;
  38
  39 # Compare every two samples, and load compatible pairs into @line.
  40 my $ns = @gt; # number of sample
  41 my $nl = (@{$gt[0]}-1)/2; # number of loci
  42 my @line; # In the array @line of arrays, every array contains two compatible samples.
  43 foreach my $a (0 .. $ns-2) {
  44         foreach my $b ($a+1 .. $ns-1) {
  45                 my $d;
  46                 foreach my $c (1 .. $nl) {
  47                         my %na; # number of allele
  48                         $na{$gt[$a][2*$c-1]} = 1;
  49                         $na{$gt[$a][2*$c]} = 1;
  50                         $na{$gt[$b][2*$c-1]} = 1;
  51                         $na{$gt[$b][2*$c]} = 1;
  52                         if (!defined $na{0} and (keys %na) > 2) { # If there are no "0" alleles and number of allele > 2, it is not a compatible locus.
  53                                 $d = 1;
  54                                 last;
  55                         }
  56                 }
  57                 push @line, [$gt[$a][0], $gt[$b][0]] unless defined $d;
  58         }
  59 }
  60
  61 # Push certain individual group arrays into @ss, uncertain group arrays into @sn.
  62 my @ss; # samples sure
  63 my @sn; # samples not sure
  64 foreach (@line) {
  65         if (!defined ${$_}[2]) {
  66                 ${$_}[2] = 1; # Mark considered sample pair.
  67                 push my @a, (${$_}[0], ${$_}[1]);
  68                 my @s = @a;
  69                 while (@a) { # The while loop will recursively find $_'s all intercompatible samples, and push them into @s.
  70                         my @c;
  71                         foreach my $v (@a) {
  72                                 my @w;
  73                                 foreach (@line) { # The foreach loop will find $_'s all compatible samples, and push them into @w.
  74                                         if (!defined ${$_}[2]) {
  75                                                 if (${$_}[0] eq $v or ${$_}[1] eq $v) {
  76                                                         ${$_}[2] = 1;
  77                                                         push @w, (${$_}[0], ${$_}[1]);
  78                                                 }
  79                                         }
  80                                 }
  81                                 push @c, @w;
  82                         }
  83                         @a = @c;
  84                         push @s, @a;
  85                 }
  86                 my %c; # Array @s contain intercompatible samples. The occurrence number of one sample equals the number of its compatible samples.
  87                 foreach (@s) {
  88                         ++$c{$_};
  89                 }
  90                 my %d;
  91                 ++$d{$c{$_}} foreach keys %c;
  92                 push @ss, \@s if keys %d == 1; # If the occurrence numbers of all samples are equal, the samples should be one individual.
  93                 push @sn, \@s if keys %d >= 2; # If the numbers are not equal, the samples are uncertain.
  94         }
  95 }
  96
  97 # Merge samples in @ss into indivuals, and push both indivuals and samples into array @ig.
  98 my @ig; # individual genotype
  99 my $e = 1;
 100 foreach (@ss) {
 101         my %s;
 102         foreach (@{$_}) {
 103                 $s{$_} = $gt{$_};
 104         }
 105         my @u; # individual genotype
 106         foreach my $c (0 .. $nl-1) {
 107                 my %t;
 108                 foreach my $b (keys %s) {
 109                         $t{$s{$b}[2*$c]} = 1;
 110                         $t{$s{$b}[2*$c+1]} = 1;
 111                 }
 112                 my @d;# locus genotype
 113                 if (keys %t == 3) { # If there are 3 alleles, push two nonzero alleles into @d.
 114                         foreach (sort {$a <=> $b} keys %t) {
 115                                 push @d, $_ if $_ != 0;
 116                         }
 117                 } elsif (keys %t == 2) { # If there are two alleles, ...
 118                         if (defined $t{0}) { # If one of the two is zero, push the nonzero one twice into @d.
 119                                 foreach (sort {$a <=> $b} keys %t) {
 120                                         push @d, ($_, $_) if $_ != 0;
 121                                 }
 122                         } else { # If there is no zero allele, push the two alleles into @d.
 123                                 push @d, $_ foreach sort {$a <=> $b} keys %t;
 124                         }
 125                 } elsif (keys %t == 1) { # If there is only one allele, push it twice into @d.
 126                         push @d, (keys %t, keys %t);
 127                 } else {
 128                         warn keys %t;
 129                 }
 130                 push @u, @d;
 131         }
 132         unshift @u, sprintf("Indiv%03d", $e);
 133         ++$e;
 134         my @v;
 135         push @v, \@u;
 136         foreach (sort keys %s) {
 137                 my @f;
 138                 push @f, $_;
 139                 push @f, @{$s{$_}};
 140                 push @v, \@f;
 141         }
 142         push @ig, \@v;
 143 }
 144
 145 # Print individual groups.
 146 print O "$head\n";
 147 foreach (@ig) {
 148         foreach (@{$_}) {
 149                 my $a = join "\t", @{$_};
 150                 print O "$a\n";
 151         }
 152         print O "\n";
 153 }
 154
 155 # Print uncertain groups.
 156 foreach (@sn) {
 157         my %s;
 158         foreach (@{$_}) {
 159                 $s{$_} = $gt{$_};
 160         }
 161         print O "Uncertain $e\n";
 162         ++$e;
 163         foreach (sort keys %s) {
 164                 print O "$_\t";
 165                 print O join "\t", @{$s{$_}};
 166                 print O "\n";
 167         }
 168         print O "\n";
 169 }
 170
 171 # Print individuals with only one sample.
 172 foreach (@line) {
 173         $gt{${$_}[0]} = 0;
 174         $gt{${$_}[1]} = 0;
 175 }
 176 print O "Individuals With Only One Sample\n";
 177 foreach (sort keys %gt) {
 178         if ($gt{$_}) {
 179                 print O "$_\t";
 180                 print O join "\t", @{$gt{$_}};
 181                 print O "\n";
 182         }
 183 }
 184 close O;
 185