Add empty line after package statement for $VERSION (for dzils's [PkgVersion])
[bioperl-live.git] / bin / bp_unflatten_seq
blob974301ef2b8f4f9b3357cec2eb1b1aaabbe08eb5
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 # Author Chris Mungall <cjm-at-bioperl.org>
6 =head1 NAME
8 bp_unflatten_seq - unflatten a genbank or genbank-style feature file into
9 a nested SeqFeature hierarchy
11 =head1 SYNOPSIS
13 bp_unflatten_seq.PLS -e 3 -gff ~/cvs/bioperl-live/t/data/AE003644_Adh-genomic.gb
15 bp_unflatten_seq.PLS --detail ~/cvs/bioperl-live/t/data/AE003644_Adh-genomic.gb
17 bp_unflatten_seq.PLS -i foo.embl --from embl --to chadoxml -o out.chado.xml
19 bp_unflatten_seq.PLS --notypemap --detail --to asciitree -ethresh 2 AE003644_Adh-genomic.gb
21 =head1 DESCRIPTION
23 This script will B<unflatten> a genbank or genbank-style file of
24 SeqFeatures into a nested hierarchy.
26 See L<Bio::SeqFeature::Tools::Unflattener>
28 In a GenBank/EMBL representation, features are 'flat' - for example,
29 there is no link between an mRNA and a CDS, other than implicit links
30 (eg via tags or via splice site coordinates) which may be hard to code
31 for.
33 This is most easily illustrated with the default output format,
34 B<asciitree>
36 An unflattened genbank feature set may look like this (AB077698)
38 Seq: AB077698
39 databank_entry 1..2701[+]
40 gene
41 mRNA
42 CDS hCHCR-G 80..1144[+]
43 exon 80..1144[+]
44 five_prime_UTR 1..79[+]
45 located_sequence_feature 137..196[+]
46 located_sequence_feature 239..292[+]
47 located_sequence_feature 617..676[+]
48 located_sequence_feature 725..778[+]
49 three_prime_UTR 1145..2659[+]
50 polyA_site 1606..1606[+]
51 polyA_site 2660..2660[+]
53 Or like this (portion of AE003734)
56 gene
57 mRNA CG3320-RA
58 CDS CG3320-PA 53126..54971[-]
59 exon 52204..53323[-]
60 exon 53404..53631[-]
61 exon 53688..53735[-]
62 exon 53798..53918[-]
63 exon 54949..55287[-]
64 mRNA CG3320-RB
65 CDS CG3320-PB 53383..54971[-]
66 exon 52204..53631[-]
67 exon 53688..53735[-]
68 exon 53798..53918[-]
69 exon 54949..55287[-]
71 The unflattening will also 'normalize' the containment hierarchy (in
72 the sense of standardising it - e.g. making sure there is always a
73 transcript record, even if genbank just specifies CDS and gene)
75 By default, the GenBank types will be mapped to SO types
77 See L<Bio::SeqFeature::Tools::TypeMapper>
79 =head1 COMMAND LINE ARGUMENTS
81 =over
83 =item -i|input FILE
85 input file (can also be specified as last argument)
87 =item -from FORMAT
89 input format (defaults to genbank)
91 probably doesn't make so much sense to use this for non-flat formats;
92 ie other than embl/genbank
94 =item -to FORMAT
96 output format (defaults to asciitree)
98 should really be a format that is nested SeqFeature aware; I think
99 this is only asciitree, chadoxml and gff3
101 =item -gff
103 with export to GFF3 format (pre-3 GFFs make no sense with unflattened
104 sequences, as they have no set way of representing feature graphs)
106 =item -o|output FILE
108 outfile defaults to STDOUT
110 =item -detail
112 show extra detail on features (asciitree mode only)
114 =item -e|ethresh INT
116 sets the error threshold on unflattening
118 by default this script will throw a wobbly if it encounters weird
119 stuff in the genbank file - raise the error threshold to signal these
120 to be ignored (and reported on STDERR)
122 =item -nomagic
124 suppress use_magic in unflattening (see
125 L<Bio::SeqFeature::Tools::Unflattener>
127 =item -notypemap
129 suppress type mapping (see
130 L<Bio::SeqFeature::Tools::TypeMapper>
133 =back
135 =head1 TODO
137 L<Bio::SeqFeature::Tools::Unflattener> allows fine-grained control
138 over the unflattening process - need to add more options to allow this
139 control at the command line
142 =head1 FEEDBACK
144 =head2 Mailing Lists
146 User feedback is an integral part of the evolution of this and other
147 Bioperl modules. Send your comments and suggestions preferably to
148 the Bioperl mailing list. Your participation is much appreciated.
150 bioperl-l@bioperl.org - General discussion
151 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
153 =head2 Reporting Bugs
155 Report bugs to the Bioperl bug tracking system to help us keep track
156 of the bugs and their resolution. Bug reports can be submitted via
157 email or the web:
159 https://github.com/bioperl/bioperl-live/issues
161 =head1 AUTHOR
163 Chris Mungall E<lt>cjm-at-bioperl.orgE<gt>
165 =cut
167 use Bio::SeqIO;
168 use Bio::SeqFeature::Tools::Unflattener;
169 use Bio::SeqFeature::Tools::TypeMapper;
170 use Bio::SeqFeature::Tools::IDHandler;
171 use Bio::Tools::GFF;
173 use Getopt::Long;
175 my ($input,$from,$to,$output,$verbosity,$ethresh,$nomagic,$group_tag,$detail,
176 $notypemap);
177 $from = 'genbank';
178 $to = 'asciitree';
179 $ethresh = 3;
180 my $gff;
181 my @remove_types = ();
183 GetOptions(
184 'i|input:s' => \$input,
185 'from:s' => \$from,
186 'to:s' => \$to,
187 'o|output:s'=> \$output,
188 "verbosity|v=s"=>\$verbosity,
189 "ethresh|e=s"=>\$ethresh,
190 "remove_type=s@"=>\@remove_types,
191 "nomagic"=>\$nomagic,
192 "notypemap"=>\$notypemap,
193 "group_tag"=>\$group_tag,
194 "detail"=>\$detail,
195 "gff"=>\$gff,
196 "h|help"=>sub {
197 system("perldoc $0");
198 exit 0;
203 if ($to =~ /^gff/i) {
204 $gff = 1;
207 $input = $input || shift if @ARGV;
209 my $in = new Bio::SeqIO(-file => $input,
210 -format => $from);
211 my $out;
212 my @out_opt = $output ? (-file => ">$output") : ();
213 unless ($gff) {
214 $out = new Bio::SeqIO(-format=>$to, @out_opt);
215 $out->show_detail($detail) if $out->can("show_detail") && $detail;
218 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new;
219 $unflattener->verbose($verbosity);
220 $unflattener->error_threshold($ethresh);
221 my $tm = Bio::SeqFeature::Tools::TypeMapper->new;
222 my $idhandler = Bio::SeqFeature::Tools::IDHandler->new;
224 while( my $seq = $in->next_seq ) {
225 $unflattener->remove_types(-seq=>$seq,
226 -types=>\@remove_types)
227 if @remove_types;
229 $unflattener->unflatten_seq(-seq=>$seq,
230 -use_magic=>!$nomagic,
231 -group_tag=>$group_tag,
233 $unflattener->report_problems(\*STDERR);
234 $tm->map_types_to_SO(-seq=>$seq) unless $notypemap;
236 my @seq_args = ($seq);
237 if ($to eq 'chadoxml') {
238 @seq_args = (-seq=>$seq, -nounflatten=>1)
240 if ($gff) {
241 my $gffio = Bio::Tools::GFF->new(@out_opt, -noparse=>1, -gff_version => 3);
242 $idhandler->set_ParentIDs_from_hierarchy($seq);
243 foreach my $feature ($seq->get_all_SeqFeatures) {
244 $gffio->write_feature($feature);
246 $gffio->close();
248 else {
249 $out->write_seq(@seq_args);
254 __END__