bp_process_wormbase: move program to new Bio-DB-Ace distribution
[bioperl-live.git] / Bio / SeqFeatureI.pm
blob31799309181eeeadc1e488d35bca9f0afff2b1da
2 # BioPerl module for Bio::SeqFeatureI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
18 =head1 SYNOPSIS
20 # get a seqfeature somehow, eg, from a Sequence with Features attached
22 foreach $feat ( $seq->get_SeqFeatures() ) {
23 print "Feature from ", $feat->start, "to ",
24 $feat->end, " Primary tag ", $feat->primary_tag,
25 ", produced by ", $feat->source_tag(), "\n";
27 if ( $feat->strand == 0 ) {
28 print "Feature applicable to either strand\n";
30 else {
31 print "Feature on strand ", $feat->strand,"\n"; # -1,1
34 print "feature location is ",$feat->start, "..",
35 $feat->end, " on strand ", $feat->strand, "\n";
36 print "easy utility to print locations in GenBank/EMBL way ",
37 $feat->location->to_FTstring(), "\n";
39 foreach $tag ( $feat->get_all_tags() ) {
40 print "Feature has tag ", $tag, " with values, ",
41 join(' ',$feat->get_tag_values($tag)), "\n";
43 print "new feature\n" if $feat->has_tag('new');
44 # features can have sub features
45 my @subfeat = $feat->get_SeqFeatures();
48 =head1 DESCRIPTION
50 This interface is the functions one can expect for any Sequence
51 Feature, whatever its implementation or whether it is a more complex
52 type (eg, a Gene). This object does not actually provide any
53 implementation, it just provides the definitions of what methods one can
54 call. See Bio::SeqFeature::Generic for a good standard implementation
55 of this object
57 =head1 FEEDBACK
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to one
61 of the Bioperl mailing lists. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 =head2 Support
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
77 =head2 Reporting Bugs
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
81 web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 APPENDIX
87 The rest of the documentation details each of the object
88 methods. Internal methods are usually preceded with a _
90 =cut
93 # Let the code begin...
96 package Bio::SeqFeatureI;
97 use vars qw($HasInMemory);
98 use strict;
99 BEGIN {
100 eval { require Bio::DB::InMemoryCache };
101 if( $@ ) { $HasInMemory = 0 }
102 else { $HasInMemory = 1 }
105 use Bio::Seq;
107 use Carp;
109 use base qw(Bio::RangeI);
111 =head1 Bio::SeqFeatureI specific methods
113 New method interfaces.
115 =cut
117 =head2 get_SeqFeatures
119 Title : get_SeqFeatures
120 Usage : @feats = $feat->get_SeqFeatures();
121 Function: Returns an array of sub Sequence Features
122 Returns : An array
123 Args : none
125 =cut
127 sub get_SeqFeatures{
128 my ($self,@args) = @_;
130 $self->throw_not_implemented();
133 =head2 display_name
135 Title : display_name
136 Usage : $name = $feat->display_name()
137 Function: Returns the human-readable name of the feature for displays.
138 Returns : a string
139 Args : none
141 =cut
143 sub display_name {
144 shift->throw_not_implemented();
147 =head2 primary_tag
149 Title : primary_tag
150 Usage : $tag = $feat->primary_tag()
151 Function: Returns the primary tag for a feature,
152 eg 'exon'
153 Returns : a string
154 Args : none
157 =cut
159 sub primary_tag{
160 my ($self,@args) = @_;
162 $self->throw_not_implemented();
166 =head2 source_tag
168 Title : source_tag
169 Usage : $tag = $feat->source_tag()
170 Function: Returns the source tag for a feature,
171 eg, 'genscan'
172 Returns : a string
173 Args : none
176 =cut
178 sub source_tag{
179 my ($self,@args) = @_;
181 $self->throw_not_implemented();
184 =head2 has_tag
186 Title : has_tag
187 Usage : $tag_exists = $self->has_tag('some_tag')
188 Function:
189 Returns : TRUE if the specified tag exists, and FALSE otherwise
190 Args :
192 =cut
194 sub has_tag{
195 my ($self,@args) = @_;
197 $self->throw_not_implemented();
201 =head2 get_tag_values
203 Title : get_tag_values
204 Usage : @values = $self->get_tag_values('some_tag')
205 Function:
206 Returns : An array comprising the values of the specified tag.
207 Args : a string
209 throws an exception if there is no such tag
211 =cut
213 sub get_tag_values {
214 shift->throw_not_implemented();
217 =head2 get_tagset_values
219 Title : get_tagset_values
220 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
221 Function:
222 Returns : An array comprising the values of the specified tags, in order of tags
223 Args : An array of strings
225 does NOT throw an exception if none of the tags are not present
227 this method is useful for getting a human-readable label for a
228 SeqFeatureI; not all tags can be assumed to be present, so a list of
229 possible tags in preferential order is provided
231 =cut
233 # interface + abstract method
234 sub get_tagset_values {
235 my ($self, @args) = @_;
236 my @vals = ();
237 foreach my $arg (@args) {
238 if ($self->has_tag($arg)) {
239 push(@vals, $self->get_tag_values($arg));
242 return @vals;
245 =head2 get_all_tags
247 Title : get_all_tags
248 Usage : @tags = $feat->get_all_tags()
249 Function: gives all tags for this feature
250 Returns : an array of strings
251 Args : none
254 =cut
256 sub get_all_tags{
257 shift->throw_not_implemented();
260 =head2 attach_seq
262 Title : attach_seq
263 Usage : $sf->attach_seq($seq)
264 Function: Attaches a Bio::Seq object to this feature. This
265 Bio::Seq object is for the *entire* sequence: ie
266 from 1 to 10000
268 Note that it is not guaranteed that if you obtain a feature from
269 an object in bioperl, it will have a sequence attached. Also,
270 implementors of this interface can choose to provide an empty
271 implementation of this method. I.e., there is also no guarantee
272 that if you do attach a sequence, seq() or entire_seq() will not
273 return undef.
275 The reason that this method is here on the interface is to enable
276 you to call it on every SeqFeatureI compliant object, and
277 that it will be implemented in a useful way and set to a useful
278 value for the great majority of use cases. Implementors who choose
279 to ignore the call are encouraged to specifically state this in
280 their documentation.
282 Example :
283 Returns : TRUE on success
284 Args : a Bio::PrimarySeqI compliant object
287 =cut
289 sub attach_seq {
290 shift->throw_not_implemented();
293 =head2 seq
295 Title : seq
296 Usage : $tseq = $sf->seq()
297 Function: returns the truncated sequence (if there is a sequence attached)
298 for this feature
299 Example :
300 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
301 bounded by start & end, or undef if there is no sequence attached.
302 If the strand is defined and set to -1, the returned sequence is
303 the reverse-complement of the region
304 Args : none
307 =cut
309 sub seq {
310 shift->throw_not_implemented();
313 =head2 entire_seq
315 Title : entire_seq
316 Usage : $whole_seq = $sf->entire_seq()
317 Function: gives the entire sequence that this seqfeature is attached to
318 Example :
319 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
320 sequence attached
321 Args : none
324 =cut
326 sub entire_seq {
327 shift->throw_not_implemented();
331 =head2 seq_id
333 Title : seq_id
334 Usage : $obj->seq_id($newval)
335 Function: There are many cases when you make a feature that you
336 do know the sequence name, but do not know its actual
337 sequence. This is an attribute such that you can store
338 the ID (e.g., display_id) of the sequence.
340 This attribute should *not* be used in GFF dumping, as
341 that should come from the collection in which the seq
342 feature was found.
343 Returns : value of seq_id
344 Args : newvalue (optional)
347 =cut
349 sub seq_id {
350 shift->throw_not_implemented();
353 =head2 gff_string
355 Title : gff_string
356 Usage : $str = $feat->gff_string;
357 $str = $feat->gff_string($gff_formatter);
358 Function: Provides the feature information in GFF format.
360 The implementation provided here returns GFF2 by default. If you
361 want a different version, supply an object implementing a method
362 gff_string() accepting a SeqFeatureI object as argument. E.g., to
363 obtain GFF1 format, do the following:
365 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
366 $gff1str = $feat->gff_string($gff1io);
368 Returns : A string
369 Args : Optionally, an object implementing gff_string().
372 =cut
374 sub gff_string{
375 my ($self,$formatter) = @_;
377 $formatter = $self->_static_gff_formatter unless $formatter;
378 return $formatter->gff_string($self);
381 my $static_gff_formatter = undef;
383 =head2 _static_gff_formatter
385 Title : _static_gff_formatter
386 Usage :
387 Function:
388 Example :
389 Returns :
390 Args :
393 =cut
395 sub _static_gff_formatter{
396 my ($self,@args) = @_;
397 require Bio::Tools::GFF; # on the fly inclusion -- is this better?
398 if( !defined $static_gff_formatter ) {
399 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
401 return $static_gff_formatter;
405 =head1 Decorating methods
407 These methods have an implementation provided by Bio::SeqFeatureI,
408 but can be validly overwritten by subclasses
410 =head2 spliced_seq
412 Title : spliced_seq
414 Usage : $seq = $feature->spliced_seq()
415 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
417 Function: Provides a sequence of the feature which is the most
418 semantically "relevant" feature for this sequence. A default
419 implementation is provided which for simple cases returns just
420 the sequence, but for split cases, loops over the split location
421 to return the sequence. In the case of split locations with
422 remote locations, eg
424 join(AB000123:5567-5589,80..1144)
426 in the case when a database object is passed in, it will attempt
427 to retrieve the sequence from the database object, and "Do the right thing",
428 however if no database object is provided, it will generate the correct
429 number of N's (DNA) or X's (protein, though this is unlikely).
431 This function is deliberately "magical" attempting to second guess
432 what a user wants as "the" sequence for this feature.
434 Implementing classes are free to override this method with their
435 own magic if they have a better idea what the user wants.
437 Args : [optional]
438 -db A L<Bio::DB::RandomAccessI> compliant object if
439 one needs to retrieve remote seqs.
440 -nosort boolean if the locations should not be sorted
441 by start location. This may occur, for instance,
442 in a circular sequence where a gene span starts
443 before the end of the sequence and ends after the
444 sequence start. Example : join(15685..16260,1..207)
445 (default = if sequence is_circular(), 1, otherwise 0)
446 -phase truncates the returned sequence based on the
447 intron phase (0,1,2).
449 Returns : A L<Bio::PrimarySeqI> object
451 =cut
453 sub spliced_seq {
454 my $self = shift;
455 my @args = @_;
456 my ($db, $nosort, $phase) =
457 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
459 # set no_sort based on the parent sequence status
460 if ($self->entire_seq->is_circular) {
461 $nosort = 1;
464 # (added 7/7/06 to allow use old API (with warnings)
465 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ? 1 : 0;
466 if (@args && $old_api) {
467 $self->warn( q(API has changed; please use '-db' or '-nosort' )
468 . qq(for args. See POD for more details.));
469 $db = shift @args if @args;
470 $nosort = shift @args if @args;
471 $phase = shift @args if @args;
474 if (defined($phase) && ($phase < 0 || $phase > 2)) {
475 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
476 $phase = 0;
479 if ( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
480 $self->warn( "Must pass in a valid Bio::DB::RandomAccessI object"
481 . " for access to remote locations for spliced_seq");
482 $db = undef;
484 elsif ( defined $db && $HasInMemory && $db->isa('Bio::DB::InMemoryCache') ) {
485 $db = Bio::DB::InMemoryCache->new(-seqdb => $db);
488 if ( not $self->location->isa("Bio::Location::SplitLocationI") ) {
489 if ($phase) {
490 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
491 my $seqstr = substr($self->seq->seq, $phase);
492 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
493 . "_spliced_feat",
494 -seq => $seqstr);
495 return $out;
497 else {
498 return $self->seq(); # nice and easy!
502 # redundant test, but the above ISA is probably not ideal.
503 if ( not $self->location->isa("Bio::Location::SplitLocationI") ) {
504 $self->throw("not atomic, not split, yikes, in trouble!");
507 my $seqstr = '';
508 my $seqid = $self->entire_seq->display_id;
509 # This is to deal with reverse strand features
510 # so we are really sorting features 5' -> 3' on their strand
511 # i.e. rev strand features will be sorted largest to smallest
512 # as this how revcom CDSes seem to be annotated in genbank.
513 # Might need to eventually allow this to be programable?
514 # (can I mention how much fun this is NOT! --jason)
516 my ($mixed,$mixedloc, $fstrand) = (0);
518 if ( $self->isa('Bio::Das::SegmentI') and not $self->absolute ) {
519 $self->warn( "Calling spliced_seq with a Bio::Das::SegmentI which "
520 . "does have absolute set to 1 -- be warned you may not "
521 . "be getting things on the correct strand");
524 my @locset = $self->location->each_Location;
525 my @locs;
526 if ( not $nosort ) {
527 # @locs = map { $_->[0] }
528 # sort so that most negative is first basically to order
529 # the features on the opposite strand 5'->3' on their strand
530 # rather than they way most are input which is on the fwd strand
532 # sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
533 my @proc_locs =
534 map {
535 $fstrand = $_->strand unless defined $fstrand;
536 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
538 if( defined $_->seq_id ) {
539 $mixedloc = 1 if( $_->seq_id ne $seqid );
541 [ $_, $_->start * ($_->strand || 1) ];
542 } @locset;
544 my @sort_locs;
545 if ( $fstrand == 1 ) {
546 @sort_locs = sort { $a->[1] <=> $b->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
547 }elsif ( $fstrand == -1 ){
548 @sort_locs = sort { $b->[1] <=> $a->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
549 } else {
550 @sort_locs = @proc_locs;
552 @locs = map { $_->[0] } @sort_locs;
554 if ( $mixed ) {
555 $self->warn( "Mixed strand locations, spliced seq using the "
556 . "input order rather than trying to sort");
557 @locs = @locset;
560 else {
561 # use the original order instead of trying to sort
562 @locs = @locset;
563 $fstrand = $locs[0]->strand;
567 my $last_id = undef;
568 my $called_seq = undef;
569 # This will be left as undefined if 1) db is remote or 2)seq_id is undefined.
570 # In that case, old code is used to make exon sequence
571 my $called_seq_seq = undef;
572 my $called_seq_len = undef;
574 foreach my $loc ( @locs ) {
575 if ( not $loc->isa("Bio::Location::Atomic") ) {
576 $self->throw("Can only deal with one level deep locations");
579 if ( $fstrand != $loc->strand ) {
580 $self->warn("feature strand is different from location strand!");
583 my $loc_seq_id;
584 if ( defined $loc->seq_id ) {
585 $loc_seq_id = $loc->seq_id;
587 # deal with remote sequences
588 if ($loc_seq_id ne $seqid ) {
589 # might be too big to download whole sequence
590 $called_seq_seq = undef;
592 if ( defined $db ) {
593 my $sid = $loc_seq_id;
594 $sid =~ s/\.\d+$//g;
595 eval {
596 $called_seq = $db->get_Seq_by_acc($sid);
598 if( $@ ) {
599 $self->warn( "In attempting to join a remote location, sequence $sid "
600 . "was not in database. Will provide padding N's. Full exception \n\n$@");
601 $called_seq = undef;
604 else {
605 $self->warn( "cannot get remote location for ".$loc_seq_id ." without a valid "
606 . "Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
607 $called_seq = undef;
609 if ( !defined $called_seq ) {
610 $seqstr .= 'N' x $loc->length;
611 next;
614 # have local sequence available
615 else {
616 # don't have to pull out source sequence again if it's local unless
617 # it's the first exon or different from previous exon
618 unless (defined(($last_id) && $last_id eq $loc_seq_id )){
619 $called_seq = $self->entire_seq;
620 $called_seq_seq = $called_seq->seq(); # this is slow
624 #undefined $loc->seq->id
625 else {
626 $called_seq = $self->entire_seq;
627 $called_seq_seq = undef;
630 my ($start,$end) = ($loc->start,$loc->end);
632 # does the called sequence make sense? Bug 1780
633 my $called_seq_len;
635 # can avoid a seq() call on called_seq
636 if (defined($called_seq_seq)) {
637 $called_seq_len = length($called_seq_seq);
639 # can't avoid a seq() call on called_seq
640 else {
641 $called_seq_len = $called_seq->length # this is slow
644 if ($called_seq_len < $loc->end) {
645 my $accession = $called_seq->accession;
646 my $orig_id = $self->seq_id; # originating sequence
647 my ($locus) = $self->get_tagset_values("locus_tag");
648 $self->throw( "Location end ($end) exceeds length ($called_seq_len) of "
649 . "called sequence $accession.\nCheck sequence version used in "
650 . "$locus locus-tagged SeqFeature in $orig_id.");
653 if ( $self->isa('Bio::Das::SegmentI') ) {
654 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
655 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
656 # in turn returns a string. Confused?
657 $seqstr .= $called_seq->subseq($start,$end)->seq()->seq(); # this is slow
659 else {
660 my $exon_seq;
661 if (defined ($called_seq_seq)){
662 $exon_seq = substr($called_seq_seq, $start-1, $end-$start+1); # this is quick
664 else {
665 $exon_seq = $called_seq->subseq($loc->start,$loc->end); # this is slow
668 # If guide_strand is defined, assemble the sequence first and revcom later if needed,
669 # if its not defined, apply revcom immediately to proper locations
670 if (defined $self->location->guide_strand) {
671 $seqstr .= $exon_seq;
673 else {
674 my $strand = defined ($loc->strand) ? ($loc->strand) : 0;
676 # revcomp $exon_seq
677 if ($strand == -1) {
678 $exon_seq = reverse($exon_seq);
679 $exon_seq =~ tr/ABCDGHKMNRSTUVWXYabcdghkmnrstuvwxy/TVGHCDMKNYSAABWXRtvghcdmknysaabwxr/;
680 $seqstr .= $exon_seq;
682 else {
683 $seqstr .= $exon_seq;
688 $last_id = $loc_seq_id if (defined($loc_seq_id));
689 } #next $loc
691 # Use revcom only after the whole sequence has been assembled
692 my $guide_strand = defined ($self->location->guide_strand) ? ($self->location->guide_strand) : 0;
693 if ($guide_strand == -1) {
694 my $seqstr_obj = Bio::Seq->new(-seq => $seqstr);
695 $seqstr = $seqstr_obj->revcom->seq;
698 if (defined($phase)) {
699 $seqstr = substr($seqstr, $phase);
702 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
703 . "_spliced_feat",
704 -seq => $seqstr);
706 return $out;
709 =head2 location
711 Title : location
712 Usage : my $location = $seqfeature->location()
713 Function: returns a location object suitable for identifying location
714 of feature on sequence or parent feature
715 Returns : Bio::LocationI object
716 Args : none
719 =cut
721 sub location {
722 my ($self) = @_;
724 $self->throw_not_implemented();
728 =head2 primary_id
730 Title : primary_id
731 Usage : $obj->primary_id($newval)
732 Function:
733 Example :
734 Returns : value of primary_id (a scalar)
735 Args : on set, new value (a scalar or undef, optional)
737 Primary ID is a synonym for the tag 'ID'
739 =cut
741 sub primary_id{
742 my $self = shift;
743 # note from cjm@fruitfly.org:
744 # I have commented out the following 2 lines:
746 #return $self->{'primary_id'} = shift if @_;
747 #return $self->{'primary_id'};
749 #... and replaced it with the following; see
750 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
751 # for the discussion that lead to this change
753 if (@_) {
754 if ($self->has_tag('ID')) {
755 $self->remove_tag('ID');
757 $self->add_tag_value('ID', shift);
759 my ($id) = $self->get_tagset_values('ID');
760 return $id;
763 sub generate_unique_persistent_id {
764 # DEPRECATED - us IDHandler
765 my $self = shift;
766 require Bio::SeqFeature::Tools::IDHandler;
767 Bio::SeqFeature::Tools::IDHandler->new->generate_unique_persistent_id($self);
771 =head2 phase
773 Title : phase
774 Usage : $obj->phase($newval)
775 Function: get/set this feature's phase.
776 Example :
777 Returns : undef if no phase is set,
778 otherwise 0, 1, or 2 (the only valid values for phase)
779 Args : on set, the new value
781 Most features do not have or need a defined phase.
783 For features representing a CDS, the phase indicates where the feature
784 begins with reference to the reading frame. The phase is one of the
785 integers 0, 1, or 2, indicating the number of bases that should be
786 removed from the beginning of this feature to reach the first base of
787 the next codon. In other words, a phase of "0" indicates that the next
788 codon begins at the first base of the region described by the current
789 line, a phase of "1" indicates that the next codon begins at the
790 second base of this region, and a phase of "2" indicates that the
791 codon begins at the third base of this region. This is NOT to be
792 confused with the frame, which is simply start modulo 3.
794 For forward strand features, phase is counted from the start
795 field. For reverse strand features, phase is counted from the end
796 field.
798 =cut
800 sub phase {
801 my $self = shift;
802 if( @_ ) {
803 $self->remove_tag('phase') if $self->has_tag('phase');
804 my $newphase = shift;
805 $self->throw("illegal phase value '$newphase', phase must be either undef, 0, 1, or 2")
806 unless !defined $newphase || $newphase == 0 || $newphase == 1 || $newphase == 2;
807 $self->add_tag_value('phase', $newphase );
808 return $newphase;
811 return $self->has_tag('phase') ? ($self->get_tag_values('phase'))[0] : undef;
815 =head1 Bio::RangeI methods
817 These methods are inherited from RangeI and can be used
818 directly from a SeqFeatureI interface. Remember that a
819 SeqFeature is-a RangeI, and so wherever you see RangeI you
820 can use a feature ($r in the below documentation).
822 =cut
824 =head2 start()
826 See L<Bio::RangeI>
828 =head2 end()
830 See L<Bio::RangeI>
832 =head2 strand()
834 See L<Bio::RangeI>
836 =head2 overlaps()
838 See L<Bio::RangeI>
840 =head2 contains()
842 See L<Bio::RangeI>
844 =head2 equals()
846 See L<Bio::RangeI>
848 =head2 intersection()
850 See L<Bio::RangeI>
852 =head2 union()
854 See L<Bio::RangeI>
856 =cut