bin/bp_biofetch_genbank_proxy: move to Bio-DB-NCBIHelper
[bioperl-live.git] / bin / bp_process_gadfly
blobab934ed46992f3a785b972e4ff542dfc46fcb148
1 #!/usr/bin/perl
2 if ($ARGV[0]=~/^-?-h/ || @ARGV < 1) {
3 die <<'USAGE';
5 This script massages the RELEASE 3 Flybase/Gadfly GFF files located at
6 http://www.fruitfly.org/sequence/release3download.shtml into the
7 "correct" version of the GFF format.
9 To use this script, download the whole genome FASTA file and save it
10 to disk. (The downloaded file will be called something like
11 "na_whole-genome_genomic_dmel_RELEASE3.FASTA", but the link on the
12 HTML page doesn't give the filename.) Do the same for the whole
13 genome GFF annotation file (the saved file will be called something
14 like "whole-genome_annotation-feature-region_dmel_RELEASE3.GFF".) If
15 you wish you can download the ZIP compressed versions of these files.
17 Next run this script on the two files, indicating the name of the
18 downloaded FASTA file first, followed by the gff file:
20 % process_gadfly.pl na_whole-genome_genomic_dmel_RELEASE3.FASTA whole-genome_annotation-feature-region_dmel_RELEASE3.GFF > fly.gff
22 The gadfly.gff file and the fasta file can now be loaded into a Bio::DB::GFF database
23 using the following command:
25 % bulk_load_gff.pl -d fly -fasta na_whole-genome_genomic_dmel_RELEASE3.FASTA fly.gff
27 (Where "fly" is the name of the database. Change it as appropriate.
28 The database must already exist and be writable by you!)
30 The resulting database will have the following feature types
31 (represented as "method:source"):
33 Component:arm A chromosome arm
34 Component:scaffold A chromosome scaffold (accession #)
35 Component:gap A gap in the assembly
36 clone:clonelocator A BAC clone
37 gene:gadfly A gene accession number
38 transcript:gadfly A transcript accession number
39 translation:gadfly A translation
40 codon:gadfly Significance unknown
41 exon:gadfly An exon
42 symbol:gadfly A classical gene symbol
43 similarity:blastn A BLASTN hit
44 similarity:blastx A BLASTX hit
45 similarity:sim4 EST->genome using SIM4
46 similarity:groupest EST->genome using GROUPEST
47 similarity:repeatmasker A repeat
49 IMPORTANT NOTE: This script will *only* work with the RELEASE3 gadfly
50 files and will not work with earlier releases.
52 USAGE
56 use strict;
57 use warnings;
59 foreach (@ARGV) {
60 $_ = "gunzip -c $_ |" if /\.gz$/;
63 if ($ARGV[0] =~ /fasta/i) {
64 process_fasta();
65 } else {
66 die "call as process_gadfly.pl \"release3_dna.FASTA\" \"release3_features.GFF\"";
69 while (<>) {
70 next if /^\#/;
71 chomp;
72 my ($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) = split "\t";
73 next if $start > $stop; # something wrong. Don't bother fixing it.
75 my $fixed_group = fix_group($csource,$cmethod,$cgroup);
76 print join("\t",$ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$fixed_group),"\n";
77 dump_symbol($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) if $cgroup =~ /symbol/i;
80 sub fix_group {
81 my ($source,$method,$group) = @_;
82 my (@group,$gene);
83 push @group,"Transcript $1" if $group =~ /transgrp=([^; ]+)/;
84 push @group,"Gene $1" if $method eq 'gene' && $group =~ /genegrp=([^; ]+)/;
86 $gene ||= qq(Note "FlyBase $1") if $group =~ /dbxref=FlyBase:(\w+)/;
87 $gene ||= qq(Note "GadFly $1") if $group =~ /genegrp=([^; ]+)/;
88 push @group,qq(Note "Symbol $1") if $group =~ /symbol=([^; ]+)/ && "Gene $1" ne $group[0];
89 push @group,$gene;
90 return join ' ; ',@group;
93 # called when we encounter a gene symbol
94 sub dump_symbol {
95 my ($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) = @_;
96 my ($symbol) = $cgroup=~/symbol=([^;]+)/;
97 my ($gene) = $cgroup=~/genegrp=([^;]+)/;
98 return if $symbol eq $gene;
99 $cmethod = 'symbol';
100 print join("\t",$ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,qq(Symbol "$symbol")),"\n";
103 sub process_fasta {
104 my $file = shift @ARGV;
105 open my $F, '<', $file or die "Could not read file '$file': $!\n";
106 print STDERR "Reading big FASTA file, please be patient...\n";
107 my ($current_id,%lengths);
108 while (my $line = <$F>) {
109 if ($line =~ /^>(\S+)/) {
110 $current_id = $1;
111 next;
113 die "this doesn't look like a fasta file to me" unless $current_id;
114 chomp $line;
115 $lengths{$current_id} += length $line;
117 close $F;
119 foreach my $id (sort keys %lengths) {
120 print join("\t", $id, 'arm', 'Component', 1, $lengths{$id},
121 '.', '+', '.', qq(Sequence "$id")
122 ), "\n";
126 __END__
128 =head1 NAME
130 bp_process_gadfly.pl - Massage Gadfly/FlyBase GFF files into a version suitable for the Generic Genome Browser
132 =head1 SYNOPSIS
134 % bp_process_gadfly.pl ./RELEASE2 > gadfly.gff
136 =head1 DESCRIPTION
138 This script massages the RELEASE 3 Flybase/Gadfly GFF files located at
139 http://www.fruitfly.org/sequence/release3download.shtml into the "correct"
140 version of the GFF format.
142 To use this script, download the whole genome FASTA file and save it
143 to disk. (The downloaded file will be called something like
144 "na_whole-genome_genomic_dmel_RELEASE3.FASTA", but the link on the
145 HTML page doesn't give the filename.) Do the same for the whole
146 genome GFF annotation file (the saved file will be called something
147 like "whole-genome_annotation-feature-region_dmel_RELEASE3.GFF".) If
148 you wish you can download the ZIP compressed versions of these files.
150 Next run this script on the two files, indicating the name of the
151 downloaded FASTA file first, followed by the gff file:
153 % bp_process_gadfly.pl na_whole-genome_genomic_dmel_RELEASE3.FASTA whole-genome_annotation-feature-region_dmel_RELEASE3.GFF > fly.gff
155 The gadfly.gff file and the fasta file can now be loaded into a Bio::DB::GFF database
156 using the following command:
158 % bulk_load_gff.pl -d fly -fasta na_whole-genome_genomic_dmel_RELEASE3.FASTA fly.gff
160 (Where "fly" is the name of the database. Change it as appropriate.
161 The database must already exist and be writable by you!)
163 The resulting database will have the following feature types
164 (represented as "method:source"):
166 Component:arm A chromosome arm
167 Component:scaffold A chromosome scaffold (accession #)
168 Component:gap A gap in the assembly
169 clone:clonelocator A BAC clone
170 gene:gadfly A gene accession number
171 transcript:gadfly A transcript accession number
172 translation:gadfly A translation
173 codon:gadfly Significance unknown
174 exon:gadfly An exon
175 symbol:gadfly A classical gene symbol
176 similarity:blastn A BLASTN hit
177 similarity:blastx A BLASTX hit
178 similarity:sim4 EST->genome using SIM4
179 similarity:groupest EST->genome using GROUPEST
180 similarity:repeatmasker A repeat
182 IMPORTANT NOTE: This script will *only* work with the RELEASE3 gadfly
183 files and will not work with earlier releases.
185 =head1 SEE ALSO
187 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
189 =head1 AUTHOR
191 Lincoln Stein, lstein@cshl.org
193 Copyright (c) 2002 Cold Spring Harbor Laboratory
195 This library is free software; you can redistribute it and/or modify
196 it under the same terms as Perl itself. See DISCLAIMER.txt for
197 disclaimers of warranty.
199 =cut