modified: n.fq
[GalaxyCodeBases.git] / tools / annot / indelmix.pl
blob9168626380e1ff02c77379ee23fb1532e1a8aedc
1 #!/bin/env perl
2 use lib '/share/raid010/resequencing/resequencing/tmp/bin/annotation/glfsqlite';
3 use strict;
4 use warnings;
5 use Time::HiRes qw ( gettimeofday tv_interval );
6 use Galaxy::ShowHelp;
8 $main::VERSION=0.0.3;
10 our $opts='i:o:s:bvl:f:c:';
11 our ($opt_i, $opt_o, $opt_s, $opt_v, $opt_b, $opt_d, $opt_f, $opt_l, $opt_c);
13 our $help=<<EOH;
14 \t-i Indel path (./Indel/) for [sample].indel.txt.filter
15 \t-f fabyChr path (./fabychr/) for [chr].fa
16 \t-c consensus path (./consensus/) for [sample].[chr].txt
17 \t-s Samples list (./samples.list) [sample\\s+]
18 \t-l Length of nearby sequence (200)bp
19 \t-o Output txt file (./indels.pop)
20 \t-v show verbose info to STDOUT
21 \t-b No pause for batch runs
22 EOH
24 ShowHelp();
26 $opt_o='./indels.pop' if ! defined $opt_o;
27 $opt_i='./Indel/' if ! $opt_i;
28 $opt_s='./samples.list' if ! $opt_s;
29 $opt_f='./fabychr/' if ! $opt_f;
30 $opt_c='./consensus/' if ! $opt_c;
31 $opt_l=200 if ! $opt_l;
33 $opt_i =~ s/\/$//;
34 $opt_f =~ s/\/$//;
35 print STDERR "From [$opt_i]/ to [$opt_o], with [$opt_s][$opt_f][$opt_l]/\n";
36 if (! $opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
38 my $start_time = [gettimeofday];
40 my (@Samples,%Indels);
42 open( SAMP,'<',$opt_s) or die "Error: $!\n";
43 while (<SAMP>) {
44 chomp;
45 my ($sample)=split /\s+/;
46 push @Samples,$sample;
48 close SAMP;
49 if ($opt_v) {print "Samples List:\n";print "[$_]\t" for @Samples;print "\n";}
51 for my $Samp (@Samples) {
52 my $name=$opt_i.'/'.$Samp.'.indel.txt.filter';
53 open IN,'<',$name or die "Error: [$name] $!\n";
54 warn "Indel:[$Samp]\t...\n";
55 while (<IN>) {
56 my ($chr,$pos,$nid,$seq,undef,$homhet,undef,$pairs)=split /\t/;
57 if ($nid =~ /^I(\d+)$/) {$nid=$1;}
58 elsif ($nid =~ /^D(\d+)$/) {$nid=-$1;}
59 else {warn "File error @ uid !"; next;}
60 if ($homhet eq 'homo') {$seq = uc $seq;}
61 elsif ($homhet eq 'hete') {$seq = lc $seq;}
62 else {warn "File error @ homhet !"; next;}
63 print "$Samp\t$chr,$pos,$nid,$seq,$homhet\n" if $opt_v;
64 $Indels{$chr}{$pos}{$Samp}=[$nid,$seq,$pairs]; # $pairs just for test
66 close IN;
68 warn "Indels all loaded.\n\nLoading Depth:\n";
70 my (%Depth,$i,$j);
71 for my $Samp (@Samples) {
72 for my $chr (keys %Indels) {
73 print STDERR " ${Samp}_$chr\t";
74 my $name=$opt_c.'/'.$Samp.'.'.$chr.'.txt';
75 $i=$j=0;
76 open IN,'<',$name or (warn "Error: [$name] $!\n" and next);
77 while (<IN>) {
78 ++$i;
79 my ($chr,$pos,undef,undef,undef,undef,$depth)=split /\t/;
80 next unless exists $Indels{$chr}{$pos};
81 ++$j;
82 $Depth{$chr}{$pos}{$Samp}=$depth;
84 close IN;
85 warn "$j/$i\n";
88 warn "Depth all loaded.\n\n";
90 open( OUT,'>',$opt_o) or die "Error: $!\n";
91 for my $chr (sort keys %Indels) {
92 open FA,'<',$opt_f.'/'.$chr.'.fa' or die "Error: $!\n";
93 $_=<FA>;
94 chomp;
95 /^>(\S+)$/;
96 die "[x]FASTA file wrong for $chr & $1" if $1 ne $chr;
97 $/=">";
98 my $seq = <FA>;
99 chomp $seq;
100 $seq=~s/\s//g;
101 $seq = uc($seq); ## all upper case
102 $/="\n";
103 my $ref_len = length($seq);
104 warn "$chr: $ref_len bp\n";
105 close FA;
106 for my $pos (sort {$a <=> $b} keys %{$Indels{$chr}}) {
107 my ($posU,$lenU,$up);
108 if ($pos>=$opt_l+1) {$posU=$pos-$opt_l-1;$lenU=$opt_l;}
109 else {$posU=0;$lenU=$pos-1;}
110 if ($lenU>0) {$up=substr($seq,$posU,$lenU);}
111 else {$up='.';}
112 my $down=substr($seq,$pos,$opt_l); # First character is at offset 0
113 print OUT "$chr\t$pos\t$up\t$down\t";
114 warn "$chr,$pos\t$posU,$lenU\t$pos,$opt_l\n" if $opt_v;
115 for my $Samp (@Samples) {
116 my $item=$Indels{$chr}{$pos};
117 if (defined $$item{$Samp}) {
118 if (@{$$item{$Samp}}==3) { # if Indel, use depth of indel_file
119 $item=join '|',@{$$item{$Samp}}
120 } else {
121 $item=join '|',(@{$$item{$Samp}},$Depth{$chr}{$pos}{$Samp});
123 } else {$item='.|.|'.$Depth{$chr}{$pos}{$Samp};}
124 print OUT $item,'^';
126 print OUT "\n";
129 close OUT;
131 my $stop_time = [gettimeofday];
133 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";