modified: Makefile
[GalaxyCodeBases.git] / perl / etc / WoodyMiaoLin / IndivID / IndivID.pl
blob970362e94b556f5b84a8fbb135a6bc0ebacf6c12
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
5 die("
6 Program: IndivID
7 Version: 1.0
8 Release: Mar. 6, 2013\n
9 Auther: Woody
10 Consultant: Galaxy\n
11 Usage: $0 <input> <output>\n
12 The input file should look like:\n
13 #ID Locus1 Locus2 Locus3 ...
14 Sample1 160 162 166 166 0 0 ...
15 Sample2 0 0 174 176 178 178 ...
16 Sample3 180 182 186 186 188 188 ...
17 ... ... ... ... ... ... ... ...
18 \nMissing data must be \"0\".
19 \n") if (@ARGV<2);
21 my $in = shift;
22 my $out = shift;
23 open I, "<", "$in";
24 open O, ">", "$out";
26 # Load input file into @gt and %gt.
27 my $head = <I>;
28 my (@gt, %gt);
29 while (<I>) {
30 s/\r//;
31 chomp;
32 my @a = split /\t/;
33 push @gt, [@a];
34 my $b = shift @a;
35 $gt{$b} = \@a;
37 close I;
39 # Compare every two samples, and load compatible pairs into @line.
40 my $ns = @gt; # number of sample
41 my $nl = (@{$gt[0]}-1)/2; # number of loci
42 my @line; # In the array @line of arrays, every array contains two compatible samples.
43 foreach my $a (0 .. $ns-2) {
44 foreach my $b ($a+1 .. $ns-1) {
45 my $d;
46 foreach my $c (1 .. $nl) {
47 my %na; # number of allele
48 $na{$gt[$a][2*$c-1]} = 1;
49 $na{$gt[$a][2*$c]} = 1;
50 $na{$gt[$b][2*$c-1]} = 1;
51 $na{$gt[$b][2*$c]} = 1;
52 if (!defined $na{0} and (keys %na) > 2) { # If there are no "0" alleles and number of allele > 2, it is not a compatible locus.
53 $d = 1;
54 last;
57 push @line, [$gt[$a][0], $gt[$b][0]] unless defined $d;
61 # Push certain individual group arrays into @ss, uncertain group arrays into @sn.
62 my @ss; # samples sure
63 my @sn; # samples not sure
64 foreach (@line) {
65 if (!defined ${$_}[2]) {
66 ${$_}[2] = 1; # Mark considered sample pair.
67 push my @a, (${$_}[0], ${$_}[1]);
68 my @s = @a;
69 while (@a) { # The while loop will recursively find $_'s all intercompatible samples, and push them into @s.
70 my @c;
71 foreach my $v (@a) {
72 my @w;
73 foreach (@line) { # The foreach loop will find $_'s all compatible samples, and push them into @w.
74 if (!defined ${$_}[2]) {
75 if (${$_}[0] eq $v or ${$_}[1] eq $v) {
76 ${$_}[2] = 1;
77 push @w, (${$_}[0], ${$_}[1]);
81 push @c, @w;
83 @a = @c;
84 push @s, @a;
86 my %c; # Array @s contain intercompatible samples. The occurrence number of one sample equals the number of its compatible samples.
87 foreach (@s) {
88 ++$c{$_};
90 my %d;
91 ++$d{$c{$_}} foreach keys %c;
92 push @ss, \@s if keys %d == 1; # If the occurrence numbers of all samples are equal, the samples should be one individual.
93 push @sn, \@s if keys %d >= 2; # If the numbers are not equal, the samples are uncertain.
97 # Merge samples in @ss into indivuals, and push both indivuals and samples into array @ig.
98 my @ig; # individual genotype
99 my $e = 1;
100 foreach (@ss) {
101 my %s;
102 foreach (@{$_}) {
103 $s{$_} = $gt{$_};
105 my @u; # individual genotype
106 foreach my $c (0 .. $nl-1) {
107 my %t;
108 foreach my $b (keys %s) {
109 $t{$s{$b}[2*$c]} = 1;
110 $t{$s{$b}[2*$c+1]} = 1;
112 my @d;# locus genotype
113 if (keys %t == 3) { # If there are 3 alleles, push two nonzero alleles into @d.
114 foreach (sort {$a <=> $b} keys %t) {
115 push @d, $_ if $_ != 0;
117 } elsif (keys %t == 2) { # If there are two alleles, ...
118 if (defined $t{0}) { # If one of the two is zero, push the nonzero one twice into @d.
119 foreach (sort {$a <=> $b} keys %t) {
120 push @d, ($_, $_) if $_ != 0;
122 } else { # If there is no zero allele, push the two alleles into @d.
123 push @d, $_ foreach sort {$a <=> $b} keys %t;
125 } elsif (keys %t == 1) { # If there is only one allele, push it twice into @d.
126 push @d, (keys %t, keys %t);
127 } else {
128 warn keys %t;
130 push @u, @d;
132 unshift @u, sprintf("Indiv%03d", $e);
133 ++$e;
134 my @v;
135 push @v, \@u;
136 foreach (sort keys %s) {
137 my @f;
138 push @f, $_;
139 push @f, @{$s{$_}};
140 push @v, \@f;
142 push @ig, \@v;
145 # Print individual groups.
146 print O "$head\n";
147 foreach (@ig) {
148 foreach (@{$_}) {
149 my $a = join "\t", @{$_};
150 print O "$a\n";
152 print O "\n";
155 # Print uncertain groups.
156 foreach (@sn) {
157 my %s;
158 foreach (@{$_}) {
159 $s{$_} = $gt{$_};
161 print O "Uncertain $e\n";
162 ++$e;
163 foreach (sort keys %s) {
164 print O "$_\t";
165 print O join "\t", @{$s{$_}};
166 print O "\n";
168 print O "\n";
171 # Print individuals with only one sample.
172 foreach (@line) {
173 $gt{${$_}[0]} = 0;
174 $gt{${$_}[1]} = 0;
176 print O "Individuals With Only One Sample\n";
177 foreach (sort keys %gt) {
178 if ($gt{$_}) {
179 print O "$_\t";
180 print O join "\t", @{$gt{$_}};
181 print O "\n";
184 close O;