perl/etc/WoodyMiaoLin/HaploID/HaploID.pl

   1 #!/usr/bin/perl
   2 use strict;
   3 use warnings;
   4
   5 die("
   6 Program: HaploID
   7 Version: 1.0
   8 Release: Sep. 1, 2013\n
   9 Auther: Woody
  10 Consultant: Galaxy\n
  11 Usage: $0 <input> <output>\n
  12 Both input and output files are fasta format.
  13 Use \"N\" for unknown basepair, \"-\" for gap, and \"?\" for missing data.
  14 \n") if (@ARGV<2);
  15
  16 my $in = shift;
  17 my $out = shift;
  18 open I, "<", "$in";
  19
  20 # Read input file.
  21 my %input; # input sequences
  22 my $nchar; # number of characters
  23 my $nin; # number of input sequences
  24 while (<I>) {
  25         $/ = ">";
  26         my $seq = <I>;
  27         chomp $seq;
  28         $seq =~ s/\s//g;
  29         $/ = "\n";
  30         $nchar = length $seq unless $nchar;
  31         s/>//;
  32         /^(\S+)\s/;
  33         push @{$input{$1}}, $seq;
  34         ++$nin;
  35 }
  36 close I;
  37 warn "Read input complete!\n";
  38 warn "No. sequences = $nin\n";
  39
  40 # Make sample consensus sequence.
  41 my %samseq; # sample consensus sequences
  42 my @samid; # sample IDs
  43 foreach my $a (sort keys %input) {
  44         push @samid, $a;
  45         my $con = $input{$a}[0];
  46         my $n = @{$input{$a}} - 1;
  47         if ($n == 0) {
  48                 $samseq{$a} = $con;
  49         } else {
  50                 foreach my $b (1 .. $n) {
  51                         for (my $i = 0; $i < $nchar; $i++) {
  52                                 my $j = $i + 1;
  53                                 if (substr($input{$a}[$b],$i,1) eq "?") {
  54                                         next;
  55                                 } elsif (substr($input{$a}[$b],$i,1) eq "-") {
  56                                         if (substr($con,$i,1) eq "?") {
  57                                                 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
  58                                         } elsif (substr($con,$i,1) eq "-") {
  59                                                 next;
  60                                         } else {
  61                                                 warn "$a conflicts at $j bp!\n";
  62                                         }
  63                                 } elsif (substr($input{$a}[$b],$i,1) eq "N") {
  64                                         if (substr($con,$i,1) eq "?") {
  65                                                 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
  66                                         } elsif (substr($con,$i,1) eq "-") {
  67                                                 warn "$a conflicts at $j bp!\n";
  68                                         } else {
  69                                                 next;
  70                                         }
  71                                 } else {
  72                                         if ((substr($con,$i,1) eq "?") or (substr($con,$i,1) eq "N")) {
  73                                                 substr($con,$i,1) = substr($input{$a}[$b],$i,1);
  74                                         } elsif (substr($con,$i,1) eq "-") {
  75                                                 warn "$a conflicts at $j bp!\n";
  76                                         } else {
  77                                                 if (substr($con,$i,1) eq substr($input{$a}[$b],$i,1)) {
  78                                                         next;
  79                                                 } else {
  80                                                         warn "$a conflicts at $j bp!\n";
  81                                                 }
  82                                         }
  83                                 }
  84                         }
  85                 }
  86                 $samseq{$a} = $con;
  87         }
  88 }
  89 #open O1, ">", "${out}_sample.txt";
  90 #print O1 ">$_\n$samseq{$_}\n" foreach sort keys %samseq;
  91 #close O1;
  92 #warn "Make sample consensus complete and print to ${out}_sample.txt!\n";
  93 my $aa1 = keys %samseq;
  94 warn "No. samples = $aa1\n";
  95
  96 # Compare every two samples, and load compatible pairs into @line.
  97 my @line; # compatible sample pairs
  98 my $nsam = @samid; # total number of samples
  99 foreach my $a (0 .. $nsam-2) {
 100         foreach my $b ($a+1 .. $nsam-1) {
 101                 my $error;
 102                 for (my $i = 0; $i < $nchar; $i++) {
 103                         if (substr($samseq{$samid[$a]},$i,1) eq "?") {
 104                                 next;
 105                         } elsif (substr($samseq{$samid[$a]},$i,1) eq "-") {
 106                                 if ((substr($samseq{$samid[$b]},$i,1) eq "?") or (substr($samseq{$samid[$b]},$i,1) eq "-")) {
 107                                         next;
 108                                 } else {
 109                                         $error = 1;
 110                                         last;
 111                                 }
 112                         } elsif (substr($samseq{$samid[$a]},$i,1) eq "N") {
 113                                 if (substr($samseq{$samid[$b]},$i,1) eq "-") {
 114                                         $error = 1;
 115                                         last;
 116                                 } else {
 117                                         next;
 118                                 }
 119                         } else {
 120                                 if ((substr($samseq{$samid[$b]},$i,1) eq "?") or (substr($samseq{$samid[$b]},$i,1) eq "N")) {
 121                                         next;
 122                                 } elsif (substr($samseq{$samid[$b]},$i,1) eq "-") {
 123                                         $error = 1;
 124                                         last;
 125                                 } else {
 126                                         if (substr($samseq{$samid[$a]},$i,1) eq substr($samseq{$samid[$b]},$i,1)) {
 127                                                 next;
 128                                         } else {
 129                                                 $error = 1;
 130                                                 last;
 131                                         }
 132                                 }
 133                         }
 134                 }
 135                 push @line, [$samid[$a], $samid[$b]] unless $error;
 136         }
 137 }
 138 warn "Compare every two samples complete!\n";
 139 my $aa2 = @line;
 140 warn "No. compatible pairs = $aa2\n";
 141
 142 # Push certain haplotype sample groups into @ss, but one sample many times.
 143 my @unsam; # uncertain sample IDs;
 144 my @ss; # samples sure
 145 my @sn; # samples not sure
 146 $sn[0] = 1;
 147 while (defined $sn[0]) {
 148         undef @ss;
 149         undef @sn;
 150         ${$_}[2] = 0 foreach @line;
 151         foreach (@line) {
 152                 unless (${$_}[2]) {
 153                         ${$_}[2] = 1; # Mark considered sample pair.
 154                         push my @a, (${$_}[0], ${$_}[1]);
 155                         my @s = @a;
 156                         while (@a) { # The while loop will recursively find $_'s all intercompatible samples, and push them into @s.
 157                                 my @c;
 158                                 foreach my $v (@a) {
 159                                         my @w;
 160                                         foreach (@line) { # The foreach loop will find $_'s all compatible samples, and push them into @w.
 161                                                 unless (${$_}[2]) {
 162                                                         if (${$_}[0] eq $v or ${$_}[1] eq $v) {
 163                                                                 ${$_}[2] = 1;
 164                                                                 push @w, (${$_}[0], ${$_}[1]);
 165                                                         }
 166                                                 }
 167                                         }
 168                                         push @c, @w;
 169                                 }
 170                                 @a = @c;
 171                                 push @s, @a;
 172                         }
 173                         my %c; # Array @s contain intercompatible samples. The occurrence number of one sample equals the number of its compatible samples.
 174                         ++$c{$_} foreach @s;
 175                         my %d;
 176                         ++$d{$c{$_}} foreach keys %c;
 177                         push @ss, \@s if keys %d == 1; # If the occurrence numbers of all samples are equal, the samples should be one haplotype.
 178                         push @sn, \@s if keys %d >= 2; # If the numbers are not equal, the samples are uncertain.
 179                 }
 180         }
 181         my $bb1 = @ss;
 182         my $bb2 = @sn;
 183         warn "No. certain haplotypes = $bb1\n";
 184         warn "No. uncertain groups  = $bb2\n";
 185         last unless $bb2;
 186         foreach (@sn) {
 187                 my %s; # uncertain haplotype sample IDs
 188                 my %sq; # $sq{sample ID} = "?|N" number
 189                 my %nq; # $nl{"?|N" number} = 1
 190                 $s{$_} = 1 foreach @{$_};
 191                 foreach (keys %s) {
 192                         my $a = @{[$samseq{$_} =~ /N|\?/g]};
 193                         $sq{$_} = $a;
 194                         $nq{$a} = 1;
 195                 }
 196                 my $c = ${[sort {$b <=> $a} keys %nq]}[0]; # most "?|N" number in one sample sequence
 197                 foreach my $d (keys %s) {
 198                         if ($sq{$d} == $c) {
 199                                 push @unsam, $d;
 200                                 foreach my $e (0 .. @line-1) {
 201                                         if (defined $line[$e]) {
 202                                                 if (($line[$e][0] eq $d) or ($line[$e][1] eq $d)) {
 203                                                         splice @line, $e, 1;
 204                                                         redo;
 205                                                 }
 206                                         } else {
 207                                                 last;
 208                                         }
 209                                 }
 210                         }
 211                 }
 212                 warn "Remove ambiguous samples which have $c \"N|?\".\n";
 213         }
 214         my $bb3 = @line;
 215         warn "No. compatible pairs = $bb3\n";
 216 }
 217 warn "Find haplotype complete!\n";
 218
 219 # Make haplotype consensus.
 220 my @hapseq; # haplotype sequence, @{$hapseq[n][0]} sample IDs, $hapseq[n][1] sequence.
 221 foreach (@ss) {
 222         my %s; # sample IDs
 223         foreach (@{$_}) {
 224                 $s{$_} = 1;
 225         }
 226         my $con; # consensus sequence;
 227         my @s; # samples IDs;
 228         foreach my $a (sort keys %s) {
 229                 $con = $samseq{$a} unless $con;
 230                 for (my $i = 0; $i < $nchar; $i++) {
 231                         if (substr($samseq{$a},$i,1) eq "?") {
 232                                 next;
 233                         } elsif ((substr($samseq{$a},$i,1) eq "-") or (substr($samseq{$a},$i,1) eq "N")) {
 234                                 if (substr($con,$i,1) eq "?") {
 235                                         substr($con,$i,1) = substr($samseq{$a},$i,1);
 236                                 } else {
 237                                         next;
 238                                 }
 239                         } else {
 240                                 if ((substr($con,$i,1) eq "?") or (substr($con,$i,1) eq "N")) {
 241                                         substr($con,$i,1) = substr($samseq{$a},$i,1);
 242                                 } else {
 243                                         next;
 244                                 }
 245                         }
 246                 }
 247                 push @s, $a;
 248         }
 249         push @hapseq, [\@s, $con];
 250 }
 251 warn "Make haplotype consensus complete!\n";
 252
 253 # Print haplotypes with many samples.
 254 open O2, ">", $out;
 255 my $nh = 1; # index of haplotype
 256 foreach (@hapseq) {
 257         print O2 sprintf(">Haplo%03d\t", $nh), join(" ", @{${$_}[0]}), "\n${$_}[1]\n";
 258         ++$nh;
 259 }
 260
 261 # Print haplotypes which have only one sample and print ambiguous samples.
 262 foreach (@line) {
 263         delete $samseq{${$_}[0]};
 264         delete $samseq{${$_}[1]};
 265 }
 266 my $nas = 1; # index of ambiguous sample
 267 my $pas; # print ambiguous samples
 268 foreach (sort @unsam) {
 269         $pas .= sprintf(">Ambig%03d\t$_\n$samseq{$_}\n", $nas);
 270         ++$nas;
 271         delete $samseq{$_};
 272 }
 273 foreach (sort keys %samseq) {
 274         print O2 sprintf(">Haplo%03d\t$_\n$samseq{$_}\n", $nh);
 275         ++$nh;
 276 }
 277 print O2 $pas if $pas;
 278 close O2;
 279 warn "Done and print result to ${out}_haplotype.txt!\n";