8 Release: Sep. 1, 2013\n
11 Usage: $0 <input> <output>\n
12 Both input and output files are fasta format.
13 Use \"N\" for unknown basepair, \"-\" for gap, and \"?\" for missing data.
21 my %input; # input sequences
22 my $nchar; # number of characters
23 my $nin; # number of input sequences
30 $nchar = length $seq unless $nchar;
33 push @
{$input{$1}}, $seq;
37 warn "Read input complete!\n";
38 warn "No. sequences = $nin\n";
40 # Make sample consensus sequence.
41 my %samseq; # sample consensus sequences
42 my @samid; # sample IDs
43 foreach my $a (sort keys %input) {
45 my $con = $input{$a}[0];
46 my $n = @
{$input{$a}} - 1;
50 foreach my $b (1 .. $n) {
51 for (my $i = 0; $i < $nchar; $i++) {
53 if (substr($input{$a}[$b],$i,1) eq "?") {
55 } elsif (substr($input{$a}[$b],$i,1) eq "-") {
56 if (substr($con,$i,1) eq "?") {
57 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
58 } elsif (substr($con,$i,1) eq "-") {
61 warn "$a conflicts at $j bp!\n";
63 } elsif (substr($input{$a}[$b],$i,1) eq "N") {
64 if (substr($con,$i,1) eq "?") {
65 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
66 } elsif (substr($con,$i,1) eq "-") {
67 warn "$a conflicts at $j bp!\n";
72 if ((substr($con,$i,1) eq "?") or (substr($con,$i,1) eq "N")) {
73 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
74 } elsif (substr($con,$i,1) eq "-") {
75 warn "$a conflicts at $j bp!\n";
77 if (substr($con,$i,1) eq substr($input{$a}[$b],$i,1)) {
80 warn "$a conflicts at $j bp!\n";
89 #open O1, ">", "${out}_sample.txt";
90 #print O1 ">$_\n$samseq{$_}\n" foreach sort keys %samseq;
92 #warn "Make sample consensus complete and print to ${out}_sample.txt!\n";
93 my $aa1 = keys %samseq;
94 warn "No. samples = $aa1\n";
96 # Compare every two samples, and load compatible pairs into @line.
97 my @line; # compatible sample pairs
98 my $nsam = @samid; # total number of samples
99 foreach my $a (0 .. $nsam-2) {
100 foreach my $b ($a+1 .. $nsam-1) {
102 for (my $i = 0; $i < $nchar; $i++) {
103 if (substr($samseq{$samid[$a]},$i,1) eq "?") {
105 } elsif (substr($samseq{$samid[$a]},$i,1) eq "-") {
106 if ((substr($samseq{$samid[$b]},$i,1) eq "?") or (substr($samseq{$samid[$b]},$i,1) eq "-")) {
112 } elsif (substr($samseq{$samid[$a]},$i,1) eq "N") {
113 if (substr($samseq{$samid[$b]},$i,1) eq "-") {
120 if ((substr($samseq{$samid[$b]},$i,1) eq "?") or (substr($samseq{$samid[$b]},$i,1) eq "N")) {
122 } elsif (substr($samseq{$samid[$b]},$i,1) eq "-") {
126 if (substr($samseq{$samid[$a]},$i,1) eq substr($samseq{$samid[$b]},$i,1)) {
135 push @line, [$samid[$a], $samid[$b]] unless $error;
138 warn "Compare every two samples complete!\n";
140 warn "No. compatible pairs = $aa2\n";
142 # Push certain haplotype sample groups into @ss, but one sample many times.
143 my @unsam; # uncertain sample IDs;
144 my @ss; # samples sure
145 my @sn; # samples not sure
147 while (defined $sn[0]) {
150 ${$_}[2] = 0 foreach @line;
153 ${$_}[2] = 1; # Mark considered sample pair.
154 push my @a, (${$_}[0], ${$_}[1]);
156 while (@a) { # The while loop will recursively find $_'s all intercompatible samples, and push them into @s.
160 foreach (@line) { # The foreach loop will find $_'s all compatible samples, and push them into @w.
162 if (${$_}[0] eq $v or ${$_}[1] eq $v) {
164 push @w, (${$_}[0], ${$_}[1]);
173 my %c; # Array @s contain intercompatible samples. The occurrence number of one sample equals the number of its compatible samples.
176 ++$d{$c{$_}} foreach keys %c;
177 push @ss, \
@s if keys %d == 1; # If the occurrence numbers of all samples are equal, the samples should be one haplotype.
178 push @sn, \
@s if keys %d >= 2; # If the numbers are not equal, the samples are uncertain.
183 warn "No. certain haplotypes = $bb1\n";
184 warn "No. uncertain groups = $bb2\n";
187 my %s; # uncertain haplotype sample IDs
188 my %sq; # $sq{sample ID} = "?|N" number
189 my %nq; # $nl{"?|N" number} = 1
190 $s{$_} = 1 foreach @
{$_};
192 my $a = @
{[$samseq{$_} =~ /N|\?/g]};
196 my $c = ${[sort {$b <=> $a} keys %nq]}[0]; # most "?|N" number in one sample sequence
197 foreach my $d (keys %s) {
200 foreach my $e (0 .. @line-1) {
201 if (defined $line[$e]) {
202 if (($line[$e][0] eq $d) or ($line[$e][1] eq $d)) {
212 warn "Remove ambiguous samples which have $c \"N|?\".\n";
215 warn "No. compatible pairs = $bb3\n";
217 warn "Find haplotype complete!\n";
219 # Make haplotype consensus.
220 my @hapseq; # haplotype sequence, @{$hapseq[n][0]} sample IDs, $hapseq[n][1] sequence.
226 my $con; # consensus sequence;
227 my @s; # samples IDs;
228 foreach my $a (sort keys %s) {
229 $con = $samseq{$a} unless $con;
230 for (my $i = 0; $i < $nchar; $i++) {
231 if (substr($samseq{$a},$i,1) eq "?") {
233 } elsif ((substr($samseq{$a},$i,1) eq "-") or (substr($samseq{$a},$i,1) eq "N")) {
234 if (substr($con,$i,1) eq "?") {
235 substr($con,$i,1) = substr($samseq{$a},$i,1);
240 if ((substr($con,$i,1) eq "?") or (substr($con,$i,1) eq "N")) {
241 substr($con,$i,1) = substr($samseq{$a},$i,1);
249 push @hapseq, [\
@s, $con];
251 warn "Make haplotype consensus complete!\n";
253 # Print haplotypes with many samples.
255 my $nh = 1; # index of haplotype
257 print O2
sprintf(">Haplo%03d\t", $nh), join(" ", @
{${$_}[0]}), "\n${$_}[1]\n";
261 # Print haplotypes which have only one sample and print ambiguous samples.
263 delete $samseq{${$_}[0]};
264 delete $samseq{${$_}[1]};
266 my $nas = 1; # index of ambiguous sample
267 my $pas; # print ambiguous samples
268 foreach (sort @unsam) {
269 $pas .= sprintf(">Ambig%03d\t$_\n$samseq{$_}\n", $nas);
273 foreach (sort keys %samseq) {
274 print O2
sprintf(">Haplo%03d\t$_\n$samseq{$_}\n", $nh);
277 print O2
$pas if $pas;
279 warn "Done and print result to ${out}_haplotype.txt!\n";