modified: src1/common.h
[GalaxyCodeBases.git] / tools / GFF / tmpgff.pl
blob2a701f98d748b95ec619a301af7eb1c734305fde
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use Data::Dump qw(ddx);
6 my ($in,$out)=("refGene.filter.gff", "ymy20120321.lst");
8 my (%GFF,%TransID,%Dat);
10 sub unescape {
11 my $v = shift;
12 $v =~ tr/+/ /;
13 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
14 return $v;
16 open IN,'<',$in or die "Error: $!\n";
17 while (<IN>) {
18 next if /^\s+$/ or /^[;#]+/;
19 chomp;
20 s/\r//g;
21 my ($seqname, $source, $primary, $start, $end,
22 $score, $strand, $frame, $groups) = split /\t/;
23 my @groups = split(/\s*;\s*/, $groups);
24 my (%groups,$name);
25 for my $group (@groups) {
26 my ($tag,$value) = split /=/,$group;
27 $tag = unescape($tag);
28 my @values = map {unescape($_)} split /,/,$value;
29 #$groups{$tag}=\@values; # patch for those alter-splices
30 $groups{$tag}=$values[0];
31 die if @values>1;
33 if ($primary eq 'mRNA') {
34 push @{$GFF{$groups{'name'}}},$groups{'ID'};
38 for my $k (keys %GFF) {
39 if (@{$GFF{$k}} != 2) {
40 delete $GFF{$k};
41 } else {
42 for (@{$GFF{$k}}) {
43 ++$TransID{$_};
47 ddx \%GFF;
48 close IN;
50 open IN,'<',$in or die "Error: $!\n"; # It is a small file after all, ...
51 while (<IN>) {
52 next if /^\s+$/ or /^[;#]+/;
53 chomp;
54 s/\r//g;
55 my ($seqname, $source, $primary, $start, $end,
56 $score, $strand, $frame, $groups) = split /\t/;
57 my @groups = split(/\s*;\s*/, $groups);
58 my (%groups,$name);
59 for my $group (@groups) {
60 my ($tag,$value) = split /=/,$group;
61 $tag = unescape($tag);
62 my @values = map {unescape($_)} split /,/,$value;
63 #$groups{$tag}=\@values; # patch for those alter-splices
64 $groups{$tag}=$values[0];
66 if ($primary =~ /CDS|3-UTR/) {
67 next unless exists $TransID{$groups{'Parent'}};
68 push @{$Dat{$groups{'Parent'}}{$primary}},[$start, $end];
69 } elsif ($primary eq 'mRNA') {
70 next unless exists $TransID{$groups{'ID'}};
71 $Dat{$groups{'ID'}}{'Strand'}=$strand;
74 close IN;
76 sub cmpUTR($$) {
77 my ($RefA,$RefB)=@_;
78 my ($StrA,$StrB)=('','');
79 if (defined $RefA) {
80 $StrA .= "$$_[0],$$_[1] " for (@$RefA);
81 return 'Same' if @$RefA != 1; # filter those with more 3-UTR
83 if (defined $RefB) {
84 $StrB .= "$$_[0],$$_[1] " for (@$RefB);
85 return 'Same' if @$RefB != 1;
87 if ($StrA eq $StrB) {
88 return 'Same';
89 } else {
90 return "[$StrA],[$StrB]";
93 open OUT,'>',$out or die "Error: $!\n";
94 for my $k (keys %GFF) {
95 my ($IDA,$IDB)=@{$GFF{$k}};
96 my $RefA=$Dat{$IDA};
97 my $RefB=$Dat{$IDB};
98 my $Strand = $$RefA{"Strand"};
99 die if $Strand ne $$RefB{"Strand"};
100 my $retStr=cmpUTR($$RefA{"3-UTR"},$$RefB{"3-UTR"});
101 next if $retStr eq 'Same';
102 =pod
103 my ($ua,$ub,$uc,$ud);
104 unless (exists $$RefA{"3-UTR"} and exists $$RefB{"3-UTR"}) {
105 #ddx $RefA; ddx $RefB;
106 warn "$k: [$IDA,$IDB]\n";
107 if (exists $$RefA{"3-UTR"}) {
108 ($ua,$ub)=@{$$RefA{"3-UTR"}->[0]};
109 ($uc,$ud)=(0,0);
110 } elsif (exists $$RefB{"3-UTR"}) {
111 ($ua,$ub)=(0,0);
112 ($uc,$ud)=@{$$RefB{"3-UTR"}->[0]};
113 } else {
114 warn "$k: [$IDA,$IDB]\n";
115 next;
117 } else {
118 ($ua,$ub)=@{$$RefA{"3-UTR"}->[0]};
119 ($uc,$ud)=@{$$RefB{"3-UTR"}->[0]};
121 next if $ua==$uc and $ub==$ud;
122 =cut
123 my ($a,$b,$c,$d);
124 if ($Strand eq '+') {
125 ($a,$b)=@{$$RefA{"CDS"}->[-1]}; # sorted
126 ($c,$d)=@{$$RefB{"CDS"}->[-1]};
127 } elsif ($Strand eq '-') {
128 ($a,$b)=@{$$RefA{"CDS"}->[0]};
129 ($c,$d)=@{$$RefB{"CDS"}->[0]};
130 #ddx $RefA; ddx $RefB;
131 #print "$a,$b $c,$d\n"; die;
132 } else {die;}
133 next unless $a==$c and $b==$d;
134 print OUT "$k\t[$IDA, $IDB]\t$Strand\t3-UTR:$retStr\tCDS:<$a,$b>\n";
136 close OUT;