Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / GFF.pm
blobf8cc0685c5bffbaf2ac1f37bdc3609a33195edb3
2 # BioPerl module for Bio::Tools::GFF
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by the Bioperl core team
8 # Copyright Matthew Pocock
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
18 =head1 SYNOPSIS
20 use Bio::Tools::GFF;
22 # specify input via -fh or -file
23 my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
24 my $feature;
25 # loop over the input stream
26 while($feature = $gffio->next_feature()) {
27 # do something with feature
29 $gffio->close();
31 # you can also obtain a GFF parser as a SeqAnalasisParserI in
32 # HT analysis pipelines (see Bio::SeqAnalysisParserI and
33 # Bio::Factory::SeqAnalysisParserFactory)
34 my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
35 my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
36 while($feature = $parser->next_feature()) {
37 # do something with feature
40 =head1 DESCRIPTION
42 This class provides a simple GFF parser and writer. In the sense of a
43 SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
44 objects, but is not in any way specific to a particular analysis
45 program and the output that program produces.
47 That is, if you can get your analysis program spit out GFF, here is
48 your result parser.
50 =head1 GFF3 AND SEQUENCE DATA
52 GFF3 supports sequence data; see
54 http://www.sequenceontology.org/gff3.shtml
56 There are a number of ways to deal with this -
58 If you call
60 $gffio->ignore_sequence(1)
62 prior to parsing the sequence data is ignored; this is useful if you
63 just want the features. It avoids the memory overhead in building and
64 caching sequences
66 Alternatively, you can call either
68 $gffio->get_seqs()
72 $gffio->seq_id_by_h()
74 At the B<end> of parsing to get either a list or hashref of Bio::Seq
75 objects (see the documentation for each of these methods)
77 Note that these objects will not have the features attached - you have
78 to do this yourself, OR call
80 $gffio->features_attached_to_seqs(1)
82 PRIOR to parsing; this will ensure that the Seqs have the features
83 attached; ie you will then be able to call
85 $seq->get_SeqFeatures();
87 And use Bio::SeqIO methods
89 Note that auto-attaching the features to seqs will incur a higher
90 memory overhead as the features must be cached until the sequence data
91 is found
93 =head1 TODO
95 Make a Bio::SeqIO class specifically for GFF3 with sequence data
97 =head1 FEEDBACK
99 =head2 Mailing Lists
101 User feedback is an integral part of the evolution of this and other
102 Bioperl modules. Send your comments and suggestions preferably to one
103 of the Bioperl mailing lists. Your participation is much appreciated.
105 bioperl-l@bioperl.org - General discussion
106 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108 =head2 Support
110 Please direct usage questions or support issues to the mailing list:
112 I<bioperl-l@bioperl.org>
114 rather than to the module maintainer directly. Many experienced and
115 reponsive experts will be able look at the problem and quickly
116 address it. Please include a thorough description of the problem
117 with code and data examples if at all possible.
119 =head2 Reporting Bugs
121 Report bugs to the Bioperl bug tracking system to help us keep track
122 the bugs and their resolution. Bug reports can be submitted the web:
124 https://github.com/bioperl/bioperl-live/issues
126 =head1 AUTHOR - Matthew Pocock
128 Email mrp-at-sanger.ac.uk
130 =head1 CONTRIBUTORS
132 Jason Stajich, jason-at-biperl-dot-org
133 Chris Mungall, cjm-at-fruitfly-dot-org
134 Steffen Grossmann [SG], grossman at molgen.mpg.de
135 Malcolm Cook, mec-at-stowers-institute.org
137 =head1 APPENDIX
139 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
141 =cut
143 # Let the code begin...
145 package Bio::Tools::GFF;
147 use vars qw($HAS_HTML_ENTITIES);
148 use strict;
150 use Bio::Seq::SeqFactory;
151 use Bio::LocatableSeq;
152 use Bio::SeqFeature::Generic;
154 use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
156 my $i = 0;
157 my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
159 # for skipping data that may be represented elsewhere; currently, this is
160 # only the score
161 my %SKIPPED_TAGS = map { $_ => 1 } qw(score);
164 =head2 new
166 Title : new
167 Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
168 -file => "filename.gff");
170 my $writer = Bio::Tools::GFF->new(-gff_version => 3,
171 -file => ">filename.gff3");
172 Function: Creates a new instance. Recognized named parameters are -file, -fh,
173 and -gff_version.
174 Returns : a new object
175 Args : named parameters
176 -gff_version => [1,2,3]
178 =cut
180 { # make a class variable such that we can generate unique ID's over
181 # a session, no matter how many instances of GFF.pm we make
182 # since these have to be unique within the scope of a GFF file.
184 my $gff3_featureID = 0;
186 sub _incrementGFF3ID {
187 my ($self) = @_;
188 return ++ $gff3_featureID;
193 sub new {
194 my ($class, @args) = @_;
195 my $self = $class->SUPER::new(@args);
197 my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
199 # initialize IO
200 $self->_initialize_io(@args);
201 $self->_parse_header() unless $noparse;
203 $gff_version ||= 2;
204 if( ! $self->gff_version($gff_version) ) {
205 $self->throw("Can't build a GFF object with the unknown version ".
206 $gff_version);
208 $self->{'_first'} = 1;
209 return $self;
213 =head2 _parse_header
215 Title : _parse_header
216 Usage : $gffio->_parse_header()
217 Function: used to turn parse GFF header lines. currently
218 produces Bio::LocatableSeq objects from ##sequence-region
219 lines
220 Returns : 1 on success
221 Args : none
224 =cut
226 sub _parse_header{
227 my ($self) = @_;
229 my @unhandled;
230 local $^W = 0; # hide warnings when we try and parse from a file opened
231 # for writing - there isn't really a better way to do
232 # AFAIK - cannot detech if a FH is read or write.
233 while(my $line = $self->_readline()){
234 my $handled = 0;
235 next if /^\s+$/;
236 if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
237 my($seqid,$start,$end) = ($1,$2,$3);
238 push @{ $self->{'segments'} }, Bio::LocatableSeq->new(
239 -id => unescape($seqid),
240 -start => $start,
241 -end => $end,
242 -length => ($end - $start + 1), ## make the length explicit
244 $handled = 1;
245 } elsif($line =~ /^(\#\#feature-ontology)/) {
246 #to be implemented
247 $self->warn("$1 header tag parsing unimplemented");
248 } elsif($line =~ /^(\#\#attribute-ontology)/) {
249 #to be implemented
250 $self->warn("$1 header tag parsing unimplemented");
251 } elsif($line =~ /^(\#\#source-ontology)/) {
252 #to be implemented
253 $self->warn("$1 header tag parsing unimplemented");
254 } elsif($line =~ /^(\#\#\#)/) {
255 #to be implemented
256 $self->warn("$1 header tag parsing unimplemented");
257 } elsif($line =~ /^(\#\#FASTA)/) {
258 # initial ##FASTA is optional - artemis does not use it
259 $line = $self->_readline();
260 if ($line !~ /^\>(\S+)/) {
261 $self->throw("##FASTA directive must be followed by fasta header, not: $line");
265 if ($line =~ /^\>(.*)/) {
266 # seq data can be at header or footer
267 my $seq = $self->_parse_sequence($line);
268 if ($seq) {
269 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
274 if(!$handled){
275 push @unhandled, $line;
278 #looks like the header is over!
279 last unless $line =~ /^\#/;
282 foreach my $line (@unhandled){
283 $self->_pushback($line);
286 return 1;
289 sub _parse_sequence {
290 my ($self, $line) = @_;
292 if ($line =~ /^\>(.*)/) {
294 my $seqid = $1;
295 $seqid =~ s/\s+$//;
296 my $desc = '';
297 if ($seqid =~ /(\S+)\s+(.*)/) {
298 ($seqid, $desc) = ($1,$2);
300 my $res = '';
301 while (my $line = $self->_readline) {
302 if ($line =~ /^\#/) {
303 last;
305 if ($line =~ /^\>/) {
306 $self->_pushback($line);
307 last;
309 $line =~ s/\s//g;
310 $res .= $line;
312 return if $self->ignore_sequence;
314 my $seqfactory = Bio::Seq::SeqFactory->new('Bio::Seq');
315 my $seq = $seqfactory->create(-seq=>$res,
316 -id=>$seqid,
317 -desc=>$desc);
318 $seq->accession_number($seqid);
319 if ($self->features_attached_to_seqs) {
320 my @feats =
321 @{$self->_feature_idx_by_seq_id->{$seqid}};
322 $seq->add_SeqFeature($_) foreach @feats;
323 @{$self->_feature_idx_by_seq_id->{$seqid}} = ();
325 return $seq;
327 else {
328 $self->throw("expected fasta header, not: $line");
333 =head2 next_segment
335 Title : next_segment
336 Usage : my $seq = $gffio->next_segment;
337 Function: Returns a Bio::LocatableSeq object corresponding to a
338 GFF "##sequence-region" header line.
339 Example :
340 Returns : A Bio::LocatableSeq object, or undef if
341 there are no more sequences.
342 Args : none
345 =cut
347 sub next_segment{
348 my ($self,@args) = @_;
349 return shift @{ $self->{'segments'} } if defined $self->{'segments'};
350 return;
354 =head2 next_feature
356 Title : next_feature
357 Usage : $seqfeature = $gffio->next_feature();
358 Function: Returns the next feature available in the input file or stream, or
359 undef if there are no more features.
360 Example :
361 Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
362 more features.
363 Args : none
365 =cut
367 sub next_feature {
368 my ($self) = @_;
370 my $gff_string;
372 # be graceful about empty lines or comments, and make sure we return undef
373 # if the input's consumed
374 while(($gff_string = $self->_readline()) && defined($gff_string)) {
375 if ($gff_string =~ /^\#\#\#/) {
376 # all forward refs have been seen; TODO
378 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
379 $gff_string =~ m{^//});
381 while ($gff_string =~ /^\>(.+)/) {
382 # fasta can be in header or footer
383 my $seq = $self->_parse_sequence($gff_string);
384 if ($seq) {
385 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
386 $gff_string = $self->_readline;
387 last unless $gff_string;
390 last;
392 return unless $gff_string;
394 my $feat = Bio::SeqFeature::Generic->new();
395 $self->from_gff_string($feat, $gff_string);
397 if ($self->features_attached_to_seqs) {
398 push(@{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
399 $feat);
402 return $feat;
405 sub _feature_idx_by_seq_id {
406 my $self = shift;
407 $self->{__feature_idx_by_seq_id} = shift if @_;
408 $self->{__feature_idx_by_seq_id} = {}
409 unless $self->{__feature_idx_by_seq_id};
410 return $self->{__feature_idx_by_seq_id};
414 =head2 from_gff_string
416 Title : from_gff_string
417 Usage : $gff->from_gff_string($feature, $gff_string);
418 Function: Sets properties of a SeqFeatureI object from a GFF-formatted
419 string. Interpretation of the string depends on the version
420 that has been specified at initialization.
422 This method is used by next_feature(). It actually dispatches to
423 one of the version-specific (private) methods.
424 Example :
425 Returns : void
426 Args : A Bio::SeqFeatureI implementing object to be initialized
427 The GFF-formatted string to initialize it from
429 =cut
431 sub from_gff_string {
432 my ($self, $feat, $gff_string) = @_;
434 if($self->gff_version() == 1) {
435 return $self->_from_gff1_string($feat, $gff_string);
436 } elsif( $self->gff_version() == 3 ) {
437 return $self->_from_gff3_string($feat, $gff_string);
438 } else {
439 return $self->_from_gff2_string($feat, $gff_string);
444 =head2 _from_gff1_string
446 Title : _from_gff1_string
447 Usage :
448 Function:
449 Example :
450 Returns : void
451 Args : A Bio::SeqFeatureI implementing object to be initialized
452 The GFF-formatted string to initialize it from
454 =cut
456 sub _from_gff1_string {
457 my ($gff, $feat, $string) = @_;
458 chomp $string;
459 my ($seqname, $source, $primary, $start, $end, $score,
460 $strand, $frame, @group) = split(/\t/, $string);
462 if ( !defined $frame ) {
463 $feat->throw("[$string] does not look like GFF to me");
465 $frame = 0 unless( $frame =~ /^\d+$/);
466 $feat->seq_id($seqname);
467 $feat->source_tag($source);
468 $feat->primary_tag($primary);
469 $feat->start($start);
470 $feat->end($end);
471 $feat->frame($frame);
472 if ( $score eq '.' ) {
473 #$feat->score(undef);
474 } else {
475 $feat->score($score);
477 if ( $strand eq '-' ) { $feat->strand(-1); }
478 if ( $strand eq '+' ) { $feat->strand(1); }
479 if ( $strand eq '.' ) { $feat->strand(0); }
480 foreach my $g ( @group ) {
481 if ( $g =~ /(\S+)=(\S+)/ ) {
482 my $tag = $1;
483 my $value = $2;
484 $feat->add_tag_value($1, $2);
485 } else {
486 $feat->add_tag_value('group', $g);
492 =head2 _from_gff2_string
494 Title : _from_gff2_string
495 Usage :
496 Function:
497 Example :
498 Returns : void
499 Args : A Bio::SeqFeatureI implementing object to be initialized
500 The GFF2-formatted string to initialize it from
503 =cut
505 sub _from_gff2_string {
506 my ($gff, $feat, $string) = @_;
507 chomp($string);
509 # according to the Sanger website, GFF2 should be single-tab
510 # separated elements, and the free-text at the end should contain
511 # text-translated tab symbols but no "real" tabs, so splitting on
512 # \t is safe, and $attribs gets the entire attributes field to be
513 # parsed later
515 # sendu: but the tag value pair can (should?) be separated by a tab. The
516 # 'no tabs' thing seems to apply only to the free text that is allowed for
517 # the value
519 my ($seqname, $source, $primary, $start,
520 $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
521 my $attribs = join ' ', @attribs;
523 if ( !defined $frame ) {
524 $feat->throw("[$string] does not look like GFF2 to me");
526 $feat->seq_id($seqname);
527 $feat->source_tag($source);
528 $feat->primary_tag($primary);
529 $feat->start($start);
530 $feat->end($end);
531 $feat->frame($frame);
532 if ( $score eq '.' ) {
533 # $feat->score(undef);
534 } else {
535 $feat->score($score);
537 if ( $strand eq '-' ) { $feat->strand(-1); }
538 if ( $strand eq '+' ) { $feat->strand(1); }
539 if ( $strand eq '.' ) { $feat->strand(0); }
542 # <Begin Inefficient Code from Mark Wilkinson>
543 # this routine is necessay to allow the presence of semicolons in
544 # quoted text Semicolons are the delimiting character for new
545 # tag/value attributes. it is more or less a "state" machine, with
546 # the "quoted" flag going up and down as we pass thorugh quotes to
547 # distinguish free-text semicolon and hash symbols from GFF control
548 # characters
550 my $flag = 0; # this could be changed to a bit and just be twiddled
551 my @parsed;
553 # run through each character one at a time and check it
554 # NOTE: changed to foreach loop which is more efficient in perl
555 # --jasons
556 for my $a ( split //, $attribs ) {
557 # flag up on entering quoted text, down on leaving it
558 if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 }
559 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
560 elsif( $a eq '#' && ! $flag ) { last }
561 push @parsed, $a;
563 $attribs = join "", @parsed; # rejoin into a single string
565 # <End Inefficient Code>
566 # Please feel free to fix this and make it more "perlish"
568 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
570 foreach my $pair ( @key_vals ) {
571 # replace semicolons that were removed from free-text above.
572 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
574 # separate the key from the value
575 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
577 if( defined $values ) {
578 my @values;
579 # free text is quoted, so match each free-text block
580 # and remove it from the $values string
581 while ($values =~ s/"(.*?)"//){
582 # and push it on to the list of values (tags may have
583 # more than one value... and the value may be undef)
584 push @values, $1;
587 # and what is left over should be space-separated
588 # non-free-text values
590 my @othervals = split /\s+/, $values;
591 foreach my $othervalue(@othervals){
592 # get rid of any empty strings which might
593 # result from the split
594 if (CORE::length($othervalue) > 0) {push @values, $othervalue}
597 foreach my $value(@values){
598 $feat->add_tag_value($key, $value);
605 sub _from_gff3_string {
606 my ($gff, $feat, $string) = @_;
607 chomp($string);
609 # according to the now nearly final GFF3 spec, columns should
610 # be tab separated, allowing unescaped spaces to occur in
611 # column 9
613 my ($seqname, $source, $primary, $start, $end,
614 $score, $strand, $frame, $groups) = split(/\t/, $string);
616 if ( ! defined $frame ) {
617 $feat->throw("[$string] does not look like GFF3 to me");
619 $feat->seq_id($seqname);
620 $feat->source_tag($source);
621 $feat->primary_tag($primary);
622 $feat->start($start);
623 $feat->end($end);
624 $feat->frame($frame);
625 if ( $score eq '.' ) {
626 #$feat->score(undef);
627 } else {
628 $feat->score($score);
630 if ( $strand eq '-' ) { $feat->strand(-1); }
631 if ( $strand eq '+' ) { $feat->strand(1); }
632 if ( $strand eq '.' ) { $feat->strand(0); }
633 my @groups = split(/\s*;\s*/, $groups);
635 for my $group (@groups) {
636 my ($tag,$value) = split /=/,$group;
637 $tag = unescape($tag);
638 my @values = map {unescape($_)} split /,/,$value;
639 for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
643 # taken from Bio::DB::GFF
644 sub unescape {
645 my $v = shift;
646 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
647 return $v;
651 =head2 write_feature
653 Title : write_feature
654 Usage : $gffio->write_feature($feature);
655 Function: Writes the specified SeqFeatureI object in GFF format to the stream
656 associated with this instance.
657 Returns : none
658 Args : An array of Bio::SeqFeatureI implementing objects to be serialized
660 =cut
662 sub write_feature {
663 my ($self, @features) = @_;
664 return unless @features;
665 if( $self->{'_first'} && $self->gff_version() == 3 ) {
666 $self->_print("##gff-version 3\n");
668 $self->{'_first'} = 0;
669 foreach my $feature ( @features ) {
670 $self->_print($self->gff_string($feature)."\n");
675 =head2 gff_string
677 Title : gff_string
678 Usage : $gffstr = $gffio->gff_string($feature);
679 Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
680 The formatting depends on the version specified at initialization.
682 This method is used by write_feature(). It actually dispatches to
683 one of the version-specific (private) methods.
684 Example :
685 Returns : A GFF-formatted string representation of the SeqFeature
686 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
688 =cut
690 sub gff_string{
691 my ($self, $feature) = @_;
693 if($self->gff_version() == 1) {
694 return $self->_gff1_string($feature);
695 } elsif( $self->gff_version() == 3 ) {
696 return $self->_gff3_string($feature);
697 } elsif( $self->gff_version() == 2.5 ) {
698 return $self->_gff25_string($feature);
699 } else {
700 return $self->_gff2_string($feature);
705 =head2 _gff1_string
707 Title : _gff1_string
708 Usage : $gffstr = $gffio->_gff1_string
709 Function:
710 Example :
711 Returns : A GFF1-formatted string representation of the SeqFeature
712 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
714 =cut
716 sub _gff1_string{
717 my ($gff, $feat) = @_;
718 my ($str,$score,$frame,$name,$strand);
720 if( $feat->can('score') ) {
721 $score = $feat->score();
723 $score = '.' unless defined $score;
725 if( $feat->can('frame') ) {
726 $frame = $feat->frame();
728 $frame = '.' unless defined $frame;
730 $strand = $feat->strand();
731 if(! $strand) {
732 $strand = ".";
733 } elsif( $strand == 1 ) {
734 $strand = '+';
735 } elsif ( $feat->strand == -1 ) {
736 $strand = '-';
739 if( $feat->can('seqname') ) {
740 $name = $feat->seq_id();
741 $name ||= 'SEQ';
742 } else {
743 $name = 'SEQ';
746 $str = join("\t",
747 $name,
748 $feat->source_tag,
749 $feat->primary_tag,
750 $feat->start,
751 $feat->end,
752 $score,
753 $strand,
754 $frame);
756 foreach my $tag ( $feat->get_all_tags ) {
757 next if exists $SKIPPED_TAGS{$tag};
758 foreach my $value ( $feat->get_tag_values($tag) ) {
759 $str .= " $tag=$value" if $value;
763 return $str;
767 =head2 _gff2_string
769 Title : _gff2_string
770 Usage : $gffstr = $gffio->_gff2_string
771 Function:
772 Example :
773 Returns : A GFF2-formatted string representation of the SeqFeature
774 Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
776 =cut
778 sub _gff2_string{
779 my ($gff, $origfeat) = @_;
780 my $feat;
781 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
782 $feat = $origfeat->feature2;
783 } else {
784 $feat = $origfeat;
786 my ($str1, $str2,$score,$frame,$name,$strand);
788 if( $feat->can('score') ) {
789 $score = $feat->score();
791 $score = '.' unless defined $score;
793 if( $feat->can('frame') ) {
794 $frame = $feat->frame();
796 $frame = '.' unless defined $frame;
798 $strand = $feat->strand();
799 if(! $strand) {
800 $strand = ".";
801 } elsif( $strand == 1 ) {
802 $strand = '+';
803 } elsif ( $feat->strand == -1 ) {
804 $strand = '-';
807 if( $feat->can('seqname') ) {
808 $name = $feat->seq_id();
810 $name ||= 'SEQ';
812 $str1 = join("\t",
813 $name,
814 $feat->source_tag(),
815 $feat->primary_tag(),
816 $feat->start(),
817 $feat->end(),
818 $score,
819 $strand,
820 $frame);
821 # the routine below is the only modification I made to the original
822 # ->gff_string routine (above) as on November 17th, 2000, the
823 # Sanger webpage describing GFF2 format reads: "From version 2
824 # onwards, the attribute field must have a tag value structure
825 # following the syntax used within objects in a .ace file,
826 # flattened onto one line by semicolon separators. Tags must be
827 # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
828 # must be quoted with double quotes".
829 # MW
831 my @group;
833 foreach my $tag ( $feat->get_all_tags ) {
834 next if exists $SKIPPED_TAGS{$tag};
835 my @v;
836 foreach my $value ( $feat->get_tag_values($tag) ) {
837 unless( defined $value && length($value) ) {
838 # quote anything other than valid tag/value characters
839 $value = '""';
840 } elsif ($value =~ /[^A-Za-z0-9_]/){
841 # substitute tab and newline chars by their UNIX equivalent
842 $value =~ s/\t/\\t/g;
843 $value =~ s/\n/\\n/g;
844 $value = '"' . $value . '" ';
846 push @v, $value;
847 # for this tag (allowed in GFF2 and .ace format)
849 push @group, "$tag ".join(" ", @v);
852 $str2 .= join(' ; ', @group);
853 # Add Target information for Feature Pairs
854 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
855 ! $feat->has_tag('Group') &&
856 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
857 $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
858 ( $origfeat->feature1->strand < 0 ?
859 ( $origfeat->feature1->end,
860 $origfeat->feature1->start) :
861 ( $origfeat->feature1->start,
862 $origfeat->feature1->end)
863 )) . ($str2?" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
865 return $str1."\t".$str2;
869 =head2 _gff25_string
871 Title : _gff25_string
872 Usage : $gffstr = $gffio->_gff2_string
873 Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
874 Example :
875 Returns : A GFF2.5-formatted string representation of the SeqFeature
876 Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
878 =cut
880 sub _gff25_string {
881 my ($gff, $origfeat) = @_;
882 my $feat;
883 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
884 $feat = $origfeat->feature2;
885 } else {
886 $feat = $origfeat;
888 my ($str1, $str2,$score,$frame,$name,$strand);
890 if( $feat->can('score') ) {
891 $score = $feat->score();
893 $score = '.' unless defined $score;
895 if( $feat->can('frame') ) {
896 $frame = $feat->frame();
898 $frame = '.' unless defined $frame;
900 $strand = $feat->strand();
901 if(! $strand) {
902 $strand = ".";
903 } elsif( $strand == 1 ) {
904 $strand = '+';
905 } elsif ( $feat->strand == -1 ) {
906 $strand = '-';
909 if( $feat->can('seqname') ) {
910 $name = $feat->seq_id();
911 $name ||= 'SEQ';
912 } else {
913 $name = 'SEQ';
915 $str1 = join("\t",
916 $name,
917 $feat->source_tag(),
918 $feat->primary_tag(),
919 $feat->start(),
920 $feat->end(),
921 $score,
922 $strand,
923 $frame);
925 my @all_tags = $feat->all_tags;
926 my @group; my @firstgroup;
927 if (@all_tags) { # only play this game if it is worth playing...
928 foreach my $tag ( @all_tags ) {
929 my @v;
930 foreach my $value ( $feat->get_tag_values($tag) ) {
931 next if exists $SKIPPED_TAGS{$tag};
932 unless( defined $value && length($value) ) {
933 $value = '""';
934 } elsif ($value =~ /[^A-Za-z0-9_]/){
935 $value =~ s/\t/\\t/g; # substitute tab and newline
936 # characters
937 $value =~ s/\n/\\n/g; # to their UNIX equivalents
938 $value = '"' . $value . '" ';
939 } # if the value contains
940 # anything other than valid
941 # tag/value characters, then
942 # quote it
943 push @v, $value;
944 # for this tag (allowed in GFF2 and .ace format)
946 if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we won't get both...
947 push @firstgroup, "$tag ".join(" ", @v);
948 } else {
949 push @group, "$tag ".join(" ", @v);
953 $str2 = join(' ; ', (@firstgroup, @group));
954 # Add Target information for Feature Pairs
955 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
956 ! $feat->has_tag('Group') &&
957 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
958 $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
959 ( $origfeat->feature1->strand < 0 ?
960 ( $origfeat->feature1->end,
961 $origfeat->feature1->start) :
962 ( $origfeat->feature1->start,
963 $origfeat->feature1->end)
964 )) . ($str2?" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
966 return $str1 . "\t". $str2;
970 =head2 _gff3_string
972 Title : _gff3_string
973 Usage : $gffstr = $gffio->_gff3_string
974 Function:
975 Example :
976 Returns : A GFF3-formatted string representation of the SeqFeature
977 Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
979 =cut
981 sub _gff3_string {
982 my ($gff, $origfeat) = @_;
983 my $feat;
984 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
985 $feat = $origfeat->feature2;
986 } else {
987 $feat = $origfeat;
990 my $ID = $gff->_incrementGFF3ID();
992 my ($score,$frame,$name,$strand);
994 if( $feat->can('score') ) {
995 $score = $feat->score();
997 $score = '.' unless defined $score;
999 if( $feat->can('frame') ) {
1000 $frame = $feat->frame();
1002 $frame = '1' unless defined $frame;
1004 $strand = $feat->strand();
1006 if(! $strand) {
1007 $strand = ".";
1008 } elsif( $strand == 1 ) {
1009 $strand = '+';
1010 } elsif ( $feat->strand == -1 ) {
1011 $strand = '-';
1014 if( $feat->can('seqname') ) {
1015 $name = $feat->seq_id();
1016 $name ||= 'SEQ';
1017 } else {
1018 $name = 'SEQ';
1021 my @groups;
1023 # force leading ID and Parent tags
1024 my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
1025 for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
1026 keys %GFF3_ID_Tags ) {
1027 unshift @all_tags, $t if $feat->has_tag($t);
1030 for my $tag ( @all_tags ) {
1031 next if exists $SKIPPED_TAGS{$tag};
1032 # next if $tag eq 'Target';
1033 if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
1034 my @values = $feat->get_tag_values($tag);
1035 if(scalar(@values) > 1){ # How is it possible that Target is has a value list ??
1036 # simple Target,start,stop
1037 my ($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
1038 next unless(defined($e) && defined($b) && $target_id);
1039 ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
1040 #if we have the strand we will print it
1041 if($strand){ push @groups, sprintf("Target=%s %d %d %s", $target_id,$b,$e,$strand); }
1042 else{ push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e); }
1043 next;
1047 my $valuestr;
1048 # a string which will hold one or more values
1049 # for this tag, with quoted free text and
1050 # space-separated individual values.
1051 my @v;
1052 for my $value ( $feat->get_tag_values($tag) ) {
1053 if( defined $value && length($value) ) {
1054 #$value =~ tr/ /+/; #spaces are allowed now
1055 if ( ref $value eq 'Bio::Annotation::Comment') {
1056 $value = $value->text;
1059 if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
1060 $value =~ s/\t/\\t/g; # substitute tab and newline
1061 # characters
1062 $value =~ s/\n/\\n/g; # to their UNIX equivalents
1064 # Unescaped quotes are not allowed in GFF3
1065 # $value = '"' . $value . '"';
1067 $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1068 } else {
1069 # if it is completely empty, then just make empty double quotes
1070 $value = '""';
1072 push @v, $value;
1074 # can we figure out how to improve this?
1075 $tag = lcfirst($tag) unless ( $tag =~
1076 /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
1078 push @groups, "$tag=".join(",",@v);
1080 # Add Target information for Feature Pairs
1081 if( $feat->has_tag('Target') &&
1082 ! $feat->has_tag('Group') &&
1083 $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
1085 my $target_id = $origfeat->feature1->seq_id;
1086 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
1088 push @groups, sprintf("Target=%s %d %d",
1089 $target_id,
1090 ( $origfeat->feature1->strand < 0 ?
1091 ( $origfeat->feature1->end,
1092 $origfeat->feature1->start) :
1093 ( $origfeat->feature1->start,
1094 $origfeat->feature1->end)
1098 # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
1099 if ( $feat->can('name') && defined($feat->name) ) {
1100 # such as might be for Bio::DB::SeqFeature
1101 unshift @groups, 'Name=' . $feat->name;
1104 my $gff_string = "";
1105 if ($feat->location->isa("Bio::Location::SplitLocationI")) {
1106 my @locs = $feat->location->each_Location;
1107 foreach my $loc (@locs) {
1108 $gff_string .= join("\t",
1109 $name,
1110 $feat->source_tag() || '.',
1111 $feat->primary_tag(),
1112 $loc->start(),
1113 $loc->end(),
1114 $score,
1115 $strand,
1116 $frame,
1117 join(';', @groups)) . "\n";
1119 chop $gff_string;
1120 return $gff_string;
1121 } else {
1122 $gff_string = join("\t",
1123 $name,
1124 $feat->source_tag() || '.',
1125 $feat->primary_tag(),
1126 $feat->start(),
1127 $feat->end(),
1128 $score,
1129 $strand,
1130 $frame,
1131 join(';', @groups));
1133 return $gff_string;
1137 =head2 gff_version
1139 Title : _gff_version
1140 Usage : $gffversion = $gffio->gff_version
1141 Function:
1142 Example :
1143 Returns : The GFF version this parser will accept and emit.
1144 Args : none
1146 =cut
1148 sub gff_version {
1149 my ($self, $value) = @_;
1150 if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
1151 $self->{'GFF_VERSION'} = $value;
1153 return $self->{'GFF_VERSION'};
1157 # Make filehandles
1159 =head2 newFh
1161 Title : newFh
1162 Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1163 Function: does a new() followed by an fh()
1164 Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1165 $feature = <$fh>; # read a feature object
1166 print $fh $feature; # write a feature object
1167 Returns : filehandle tied to the Bio::Tools::GFF class
1168 Args :
1170 =cut
1172 sub newFh {
1173 my $class = shift;
1174 return unless my $self = $class->new(@_);
1175 return $self->fh;
1179 =head2 fh
1181 Title : fh
1182 Usage : $obj->fh
1183 Function:
1184 Example : $fh = $obj->fh; # make a tied filehandle
1185 $feature = <$fh>; # read a feature object
1186 print $fh $feature; # write a feature object
1187 Returns : filehandle tied to Bio::Tools::GFF class
1188 Args : none
1190 =cut
1193 sub fh {
1194 my $self = shift;
1195 my $class = ref($self) || $self;
1196 my $s = Symbol::gensym;
1197 tie $$s,$class,$self;
1198 return $s;
1201 # This accessor is used for accessing the Bio::Seq objects from a GFF3
1202 # file; if the file you are using has no sequence data you can ignore
1203 # this accessor
1205 # This accessor returns a hash reference containing Bio::Seq objects,
1206 # indexed by Bio::Seq->primary_id
1208 sub _seq_by_id_h {
1209 my $self = shift;
1211 return $self->{'_seq_by_id_h'} = shift if @_;
1212 $self->{'_seq_by_id_h'} = {}
1213 unless $self->{'_seq_by_id_h'};
1214 return $self->{'_seq_by_id_h'};
1218 =head2 get_seqs
1220 Title : get_seqs
1221 Usage :
1222 Function: Returns all Bio::Seq objects populated by GFF3 file
1223 Example :
1224 Returns :
1225 Args :
1227 =cut
1229 sub get_seqs {
1230 my ($self,@args) = @_;
1231 return values %{$self->_seq_by_id_h};
1235 =head2 features_attached_to_seqs
1237 Title : features_attached_to_seqs
1238 Usage : $obj->features_attached_to_seqs(1);
1239 Function: For use with GFF3 containing sequence only
1241 Setting this B<before> parsing ensures that all Bio::Seq object
1242 created will have the appropriate features added to them
1244 defaults to false (off)
1246 Note that this mode will incur higher memory usage because features
1247 will have to be cached until the relevant feature comes along
1249 Example :
1250 Returns : value of features_attached_to_seqs (a boolean)
1251 Args : on set, new value (a boolean, optional)
1254 =cut
1256 sub features_attached_to_seqs{
1257 my $self = shift;
1259 return $self->{'_features_attached_to_seqs'} = shift if @_;
1260 return $self->{'_features_attached_to_seqs'};
1264 =head2 ignore_sequence
1266 Title : ignore_sequence
1267 Usage : $obj->ignore_sequence(1);
1268 Function: For use with GFF3 containing sequence only
1270 Setting this B<before> parsing means that all sequence data will be
1271 ignored
1273 Example :
1274 Returns : value of ignore_sequence (a boolean)
1275 Args : on set, new value (a boolean, optional)
1277 =cut
1279 sub ignore_sequence{
1280 my $self = shift;
1282 return $self->{'_ignore_sequence'} = shift if @_;
1283 return $self->{'_ignore_sequence'};
1287 sub DESTROY {
1288 my $self = shift;
1289 $self->close();
1292 sub TIEHANDLE {
1293 my ($class,$val) = @_;
1294 return bless {'gffio' => $val}, $class;
1297 sub READLINE {
1298 my $self = shift;
1299 return $self->{'gffio'}->next_feature() || undef unless wantarray;
1300 my (@list, $obj);
1301 push @list, $obj while $obj = $self->{'gffio'}->next_feature();
1302 return @list;
1305 sub PRINT {
1306 my $self = shift;
1307 $self->{'gffio'}->write_feature(@_);