modified: src1/common.h
[GalaxyCodeBases.git] / tools / GFF / ano_db_from_read.pl
blobf40894fa2e79cd9a161d0cabad60205f246f5fcf
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Getopt::Long;
9 #use GTF;
10 use Galaxy::IO;
11 use Data::Dump qw(ddx);
13 my ($strCDS, $strmRNA) = qw(CDS mRNA);
14 GetOptions ('CDS=s' => \$strCDS, 'mRNA=s' => \$strmRNA);
16 die "Usage: $0 [--cds=CDS] [--mrna=mRNA] <gene_file> <db_file>\n" if @ARGV<2;
17 my $infs = shift;
18 my $outfs = shift;
20 warn "From [$infs] to [$outfs] with [$strCDS] in [$strmRNA]\n";
22 open I,'<',$infs or die;
23 open O,'>',$outfs or die;
24 print O "# From [$infs] to [$outfs]\n";
26 my $secname = ']';
27 my (%GeneDat,%Gene2Chr,$strand,$seqname, $primary, $start, $end, $frame, $LN, $groups);
28 while (<I>) {
29 next if /^(#|((\s)*$))/;
30 #next if /^#/;
31 #print $_;
32 if (/^\[([^]]*)\] ([+-])$/) {
33 #ddx $Gene2Chr{$secname};
34 $secname = $1;
35 $strand = $2;
36 #print "[$secname $strand]\n";
37 die if length $secname == 0;
38 } else {
39 chomp;
40 ($seqname, $primary, $start, $end, $frame, $LN, $groups) = split /\t/,$_;
41 if ( $primary eq $strCDS or $primary eq $strmRNA ) {
42 push @{$GeneDat{"$secname\n$strand"}{$primary}},[$start, $end, $frame];
43 ++$Gene2Chr{$secname}{$seqname};
47 close I;
49 my (%Err,%Total);
50 for (sort keys %GeneDat) {
51 my ($secname,$strand) = split /\n/,$_;
52 my @ref = keys %{$Gene2Chr{$secname}};
53 if (@ref == 1) {
54 #print O "\n[$secname] $strand $ref[0]\n";
55 my $flag = 'PhaseOK';
56 my $CDSarray = $GeneDat{$_}{$strCDS};
57 my $mRNAarray = $GeneDat{$_}{$strmRNA};
58 my @CDS;
59 if ($strand eq '+') {
60 @CDS = sort { $a->[0] <=> $b->[0] } @{$CDSarray};
61 } elsif ($strand eq '-') {
62 @CDS = sort { $b->[0] <=> $a->[0] } @{$CDSarray};
64 my $CDSLen = $CDS[0][1] - $CDS[0][0] + 1;
65 $CDS[0][3] = $CDS[0][2];
66 if ( $CDS[0][2] != 0) {
67 $CDS[0][2] = 0;
68 ++$Err{'CDS0'};
69 $flag = 'PhaseERR0';
71 ++$Total{'CDS0'};
72 for (1 .. $#CDS) {
73 my $phase = -$CDSLen % 3;
74 $CDS[$_][3] = $CDS[$_][2];
75 if ( $CDS[$_][2] != $phase) {
76 $CDS[$_][2] = $phase;
77 ++$Err{$strand};
78 $flag = 'PhaseERR1';
79 ++$Err{'Reverse'} if $CDS[$_][2] + $CDS[$_][3] == 3;
81 ++$Total{$strand};
82 $CDSLen += $CDS[$_][1] - $CDS[$_][0] + 1;
84 $$mRNAarray[0][3] = $CDSLen;
85 #$CDSarray = \@CDS;
86 #print "$secname,$strand\n";
87 #ddx \@CDS;
88 #ddx $GeneDat{$_};
89 if ($CDSLen % 3) {
90 $flag .= ' InCompCDS';
91 ++$Err{'InCompCDS'};
93 print O "\n[$secname] $strand $ref[0] $flag\n";
94 print O join("\t",'mRNA',@{$$mRNAarray[0]}),"\n";
95 for (@CDS) {
96 print O join("\t",'CDS',@$_),"\n";
98 } else {
99 die "[x] MultiRef(",scalar @ref,") in $secname($strand): [@ref].\n";
102 close O;
104 #ddx \%Gene2Chr;
105 ddx \%Err;
106 ddx \%Total;
108 __END__
109 perl ano_db_from_read.pl Dasnov3.gene1 Dasnov3.db