limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / perl / etc / WoodyMiaoLin / HaploID / HaploID.pl
blob96495b0dcc263e67e31ccb03c4e94ceb3187b61b
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
5 die("
6 Program: HaploID
7 Version: 1.0
8 Release: Sep. 1, 2013\n
9 Auther: Woody
10 Consultant: Galaxy\n
11 Usage: $0 <input> <output>\n
12 Both input and output files are fasta format.
13 Use \"N\" for unknown basepair, \"-\" for gap, and \"?\" for missing data.
14 \n") if (@ARGV<2);
16 my $in = shift;
17 my $out = shift;
18 open I, "<", "$in";
20 # Read input file.
21 my %input; # input sequences
22 my $nchar; # number of characters
23 my $nin; # number of input sequences
24 while (<I>) {
25 $/ = ">";
26 my $seq = <I>;
27 chomp $seq;
28 $seq =~ s/\s//g;
29 $/ = "\n";
30 $nchar = length $seq unless $nchar;
31 s/>//;
32 /^(\S+)\s/;
33 push @{$input{$1}}, $seq;
34 ++$nin;
36 close I;
37 warn "Read input complete!\n";
38 warn "No. sequences = $nin\n";
40 # Make sample consensus sequence.
41 my %samseq; # sample consensus sequences
42 my @samid; # sample IDs
43 foreach my $a (sort keys %input) {
44 push @samid, $a;
45 my $con = $input{$a}[0];
46 my $n = @{$input{$a}} - 1;
47 if ($n == 0) {
48 $samseq{$a} = $con;
49 } else {
50 foreach my $b (1 .. $n) {
51 for (my $i = 0; $i < $nchar; $i++) {
52 my $j = $i + 1;
53 if (substr($input{$a}[$b],$i,1) eq "?") {
54 next;
55 } elsif (substr($input{$a}[$b],$i,1) eq "-") {
56 if (substr($con,$i,1) eq "?") {
57 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
58 } elsif (substr($con,$i,1) eq "-") {
59 next;
60 } else {
61 warn "$a conflicts at $j bp!\n";
63 } elsif (substr($input{$a}[$b],$i,1) eq "N") {
64 if (substr($con,$i,1) eq "?") {
65 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
66 } elsif (substr($con,$i,1) eq "-") {
67 warn "$a conflicts at $j bp!\n";
68 } else {
69 next;
71 } else {
72 if ((substr($con,$i,1) eq "?") or (substr($con,$i,1) eq "N")) {
73 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
74 } elsif (substr($con,$i,1) eq "-") {
75 warn "$a conflicts at $j bp!\n";
76 } else {
77 if (substr($con,$i,1) eq substr($input{$a}[$b],$i,1)) {
78 next;
79 } else {
80 warn "$a conflicts at $j bp!\n";
86 $samseq{$a} = $con;
89 #open O1, ">", "${out}_sample.txt";
90 #print O1 ">$_\n$samseq{$_}\n" foreach sort keys %samseq;
91 #close O1;
92 #warn "Make sample consensus complete and print to ${out}_sample.txt!\n";
93 my $aa1 = keys %samseq;
94 warn "No. samples = $aa1\n";
96 # Compare every two samples, and load compatible pairs into @line.
97 my @line; # compatible sample pairs
98 my $nsam = @samid; # total number of samples
99 foreach my $a (0 .. $nsam-2) {
100 foreach my $b ($a+1 .. $nsam-1) {
101 my $error;
102 for (my $i = 0; $i < $nchar; $i++) {
103 if (substr($samseq{$samid[$a]},$i,1) eq "?") {
104 next;
105 } elsif (substr($samseq{$samid[$a]},$i,1) eq "-") {
106 if ((substr($samseq{$samid[$b]},$i,1) eq "?") or (substr($samseq{$samid[$b]},$i,1) eq "-")) {
107 next;
108 } else {
109 $error = 1;
110 last;
112 } elsif (substr($samseq{$samid[$a]},$i,1) eq "N") {
113 if (substr($samseq{$samid[$b]},$i,1) eq "-") {
114 $error = 1;
115 last;
116 } else {
117 next;
119 } else {
120 if ((substr($samseq{$samid[$b]},$i,1) eq "?") or (substr($samseq{$samid[$b]},$i,1) eq "N")) {
121 next;
122 } elsif (substr($samseq{$samid[$b]},$i,1) eq "-") {
123 $error = 1;
124 last;
125 } else {
126 if (substr($samseq{$samid[$a]},$i,1) eq substr($samseq{$samid[$b]},$i,1)) {
127 next;
128 } else {
129 $error = 1;
130 last;
135 push @line, [$samid[$a], $samid[$b]] unless $error;
138 warn "Compare every two samples complete!\n";
139 my $aa2 = @line;
140 warn "No. compatible pairs = $aa2\n";
142 # Push certain haplotype sample groups into @ss, but one sample many times.
143 my @unsam; # uncertain sample IDs;
144 my @ss; # samples sure
145 my @sn; # samples not sure
146 $sn[0] = 1;
147 while (defined $sn[0]) {
148 undef @ss;
149 undef @sn;
150 ${$_}[2] = 0 foreach @line;
151 foreach (@line) {
152 unless (${$_}[2]) {
153 ${$_}[2] = 1; # Mark considered sample pair.
154 push my @a, (${$_}[0], ${$_}[1]);
155 my @s = @a;
156 while (@a) { # The while loop will recursively find $_'s all intercompatible samples, and push them into @s.
157 my @c;
158 foreach my $v (@a) {
159 my @w;
160 foreach (@line) { # The foreach loop will find $_'s all compatible samples, and push them into @w.
161 unless (${$_}[2]) {
162 if (${$_}[0] eq $v or ${$_}[1] eq $v) {
163 ${$_}[2] = 1;
164 push @w, (${$_}[0], ${$_}[1]);
168 push @c, @w;
170 @a = @c;
171 push @s, @a;
173 my %c; # Array @s contain intercompatible samples. The occurrence number of one sample equals the number of its compatible samples.
174 ++$c{$_} foreach @s;
175 my %d;
176 ++$d{$c{$_}} foreach keys %c;
177 push @ss, \@s if keys %d == 1; # If the occurrence numbers of all samples are equal, the samples should be one haplotype.
178 push @sn, \@s if keys %d >= 2; # If the numbers are not equal, the samples are uncertain.
181 my $bb1 = @ss;
182 my $bb2 = @sn;
183 warn "No. certain haplotypes = $bb1\n";
184 warn "No. uncertain groups = $bb2\n";
185 last unless $bb2;
186 foreach (@sn) {
187 my %s; # uncertain haplotype sample IDs
188 my %sq; # $sq{sample ID} = "?|N" number
189 my %nq; # $nl{"?|N" number} = 1
190 $s{$_} = 1 foreach @{$_};
191 foreach (keys %s) {
192 my $a = @{[$samseq{$_} =~ /N|\?/g]};
193 $sq{$_} = $a;
194 $nq{$a} = 1;
196 my $c = ${[sort {$b <=> $a} keys %nq]}[0]; # most "?|N" number in one sample sequence
197 foreach my $d (keys %s) {
198 if ($sq{$d} == $c) {
199 push @unsam, $d;
200 foreach my $e (0 .. @line-1) {
201 if (defined $line[$e]) {
202 if (($line[$e][0] eq $d) or ($line[$e][1] eq $d)) {
203 splice @line, $e, 1;
204 redo;
206 } else {
207 last;
212 warn "Remove ambiguous samples which have $c \"N|?\".\n";
214 my $bb3 = @line;
215 warn "No. compatible pairs = $bb3\n";
217 warn "Find haplotype complete!\n";
219 # Make haplotype consensus.
220 my @hapseq; # haplotype sequence, @{$hapseq[n][0]} sample IDs, $hapseq[n][1] sequence.
221 foreach (@ss) {
222 my %s; # sample IDs
223 foreach (@{$_}) {
224 $s{$_} = 1;
226 my $con; # consensus sequence;
227 my @s; # samples IDs;
228 foreach my $a (sort keys %s) {
229 $con = $samseq{$a} unless $con;
230 for (my $i = 0; $i < $nchar; $i++) {
231 if (substr($samseq{$a},$i,1) eq "?") {
232 next;
233 } elsif ((substr($samseq{$a},$i,1) eq "-") or (substr($samseq{$a},$i,1) eq "N")) {
234 if (substr($con,$i,1) eq "?") {
235 substr($con,$i,1) = substr($samseq{$a},$i,1);
236 } else {
237 next;
239 } else {
240 if ((substr($con,$i,1) eq "?") or (substr($con,$i,1) eq "N")) {
241 substr($con,$i,1) = substr($samseq{$a},$i,1);
242 } else {
243 next;
247 push @s, $a;
249 push @hapseq, [\@s, $con];
251 warn "Make haplotype consensus complete!\n";
253 # Print haplotypes with many samples.
254 open O2, ">", $out;
255 my $nh = 1; # index of haplotype
256 foreach (@hapseq) {
257 print O2 sprintf(">Haplo%03d\t", $nh), join(" ", @{${$_}[0]}), "\n${$_}[1]\n";
258 ++$nh;
261 # Print haplotypes which have only one sample and print ambiguous samples.
262 foreach (@line) {
263 delete $samseq{${$_}[0]};
264 delete $samseq{${$_}[1]};
266 my $nas = 1; # index of ambiguous sample
267 my $pas; # print ambiguous samples
268 foreach (sort @unsam) {
269 $pas .= sprintf(">Ambig%03d\t$_\n$samseq{$_}\n", $nas);
270 ++$nas;
271 delete $samseq{$_};
273 foreach (sort keys %samseq) {
274 print O2 sprintf(">Haplo%03d\t$_\n$samseq{$_}\n", $nh);
275 ++$nh;
277 print O2 $pas if $pas;
278 close O2;
279 warn "Done and print result to ${out}_haplotype.txt!\n";