4 # Author Chris Mungall <cjm-at-bioperl.org>
8 bp_unflatten_seq - unflatten a genbank or genbank-style feature file into
9 a nested SeqFeature hierarchy
13 bp_unflatten_seq.PLS -e 3 -gff ~/cvs/bioperl-live/t/data/AE003644_Adh-genomic.gb
15 bp_unflatten_seq.PLS --detail ~/cvs/bioperl-live/t/data/AE003644_Adh-genomic.gb
17 bp_unflatten_seq.PLS -i foo.embl --from embl --to chadoxml -o out.chado.xml
19 bp_unflatten_seq.PLS --notypemap --detail --to asciitree -ethresh 2 AE003644_Adh-genomic.gb
23 This script will B<unflatten> a genbank or genbank-style file of
24 SeqFeatures into a nested hierarchy.
26 See L<Bio::SeqFeature::Tools::Unflattener>
28 In a GenBank/EMBL representation, features are 'flat' - for example,
29 there is no link between an mRNA and a CDS, other than implicit links
30 (eg via tags or via splice site coordinates) which may be hard to code
33 This is most easily illustrated with the default output format,
36 An unflattened genbank feature set may look like this (AB077698)
39 databank_entry 1..2701[+]
42 CDS hCHCR-G 80..1144[+]
44 five_prime_UTR 1..79[+]
45 located_sequence_feature 137..196[+]
46 located_sequence_feature 239..292[+]
47 located_sequence_feature 617..676[+]
48 located_sequence_feature 725..778[+]
49 three_prime_UTR 1145..2659[+]
50 polyA_site 1606..1606[+]
51 polyA_site 2660..2660[+]
53 Or like this (portion of AE003734)
58 CDS CG3320-PA 53126..54971[-]
65 CDS CG3320-PB 53383..54971[-]
71 The unflattening will also 'normalize' the containment hierarchy (in
72 the sense of standardising it - e.g. making sure there is always a
73 transcript record, even if genbank just specifies CDS and gene)
75 By default, the GenBank types will be mapped to SO types
77 See L<Bio::SeqFeature::Tools::TypeMapper>
79 =head1 COMMAND LINE ARGUMENTS
85 input file (can also be specified as last argument)
89 input format (defaults to genbank)
91 probably doesn't make so much sense to use this for non-flat formats;
92 ie other than embl/genbank
96 output format (defaults to asciitree)
98 should really be a format that is nested SeqFeature aware; I think
99 this is only asciitree, chadoxml and gff3
103 with export to GFF3 format (pre-3 GFFs make no sense with unflattened
104 sequences, as they have no set way of representing feature graphs)
108 outfile defaults to STDOUT
112 show extra detail on features (asciitree mode only)
116 sets the error threshold on unflattening
118 by default this script will throw a wobbly if it encounters weird
119 stuff in the genbank file - raise the error threshold to signal these
120 to be ignored (and reported on STDERR)
124 suppress use_magic in unflattening (see
125 L<Bio::SeqFeature::Tools::Unflattener>
129 suppress type mapping (see
130 L<Bio::SeqFeature::Tools::TypeMapper>
137 L<Bio::SeqFeature::Tools::Unflattener> allows fine-grained control
138 over the unflattening process - need to add more options to allow this
139 control at the command line
146 User feedback is an integral part of the evolution of this and other
147 Bioperl modules. Send your comments and suggestions preferably to
148 the Bioperl mailing list. Your participation is much appreciated.
150 bioperl-l@bioperl.org - General discussion
151 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
153 =head2 Reporting Bugs
155 Report bugs to the Bioperl bug tracking system to help us keep track
156 of the bugs and their resolution. Bug reports can be submitted via
159 https://github.com/bioperl/bioperl-live/issues
163 Chris Mungall E<lt>cjm-at-bioperl.orgE<gt>
168 use Bio
::SeqFeature
::Tools
::Unflattener
;
169 use Bio
::SeqFeature
::Tools
::TypeMapper
;
170 use Bio
::SeqFeature
::Tools
::IDHandler
;
175 my ($input,$from,$to,$output,$verbosity,$ethresh,$nomagic,$group_tag,$detail,
181 my @remove_types = ();
184 'i|input:s' => \
$input,
187 'o|output:s'=> \
$output,
188 "verbosity|v=s"=>\
$verbosity,
189 "ethresh|e=s"=>\
$ethresh,
190 "remove_type=s@"=>\
@remove_types,
191 "nomagic"=>\
$nomagic,
192 "notypemap"=>\
$notypemap,
193 "group_tag"=>\
$group_tag,
197 system("perldoc $0");
203 if ($to =~ /^gff/i) {
207 $input = $input || shift if @ARGV;
209 my $in = new Bio
::SeqIO
(-file
=> $input,
212 my @out_opt = $output ?
(-file
=> ">$output") : ();
214 $out = new Bio
::SeqIO
(-format
=>$to, @out_opt);
215 $out->show_detail($detail) if $out->can("show_detail") && $detail;
218 my $unflattener = Bio
::SeqFeature
::Tools
::Unflattener
->new;
219 $unflattener->verbose($verbosity);
220 $unflattener->error_threshold($ethresh);
221 my $tm = Bio
::SeqFeature
::Tools
::TypeMapper
->new;
222 my $idhandler = Bio
::SeqFeature
::Tools
::IDHandler
->new;
224 while( my $seq = $in->next_seq ) {
225 $unflattener->remove_types(-seq
=>$seq,
226 -types
=>\
@remove_types)
229 $unflattener->unflatten_seq(-seq
=>$seq,
230 -use_magic
=>!$nomagic,
231 -group_tag
=>$group_tag,
233 $unflattener->report_problems(\
*STDERR
);
234 $tm->map_types_to_SO(-seq
=>$seq) unless $notypemap;
236 my @seq_args = ($seq);
237 if ($to eq 'chadoxml') {
238 @seq_args = (-seq
=>$seq, -nounflatten
=>1)
241 my $gffio = Bio
::Tools
::GFF
->new(@out_opt, -noparse
=>1, -gff_version
=> 3);
242 $idhandler->set_ParentIDs_from_hierarchy($seq);
243 foreach my $feature ($seq->get_all_SeqFeatures) {
244 $gffio->write_feature($feature);
249 $out->write_seq(@seq_args);