Bio::Tools::CodonTable and Bio::Tools::IUPAC: prepare with dzil.
[bioperl-live.git] / lib / Bio / SeqFeatureI.pm
blobad3043aa9f246f8ba1d09fe02f6ed4ef929906e1
2 # BioPerl module for Bio::SeqFeatureI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
18 =head1 SYNOPSIS
20 # get a seqfeature somehow, eg, from a Sequence with Features attached
22 foreach $feat ( $seq->get_SeqFeatures() ) {
23 print "Feature from ", $feat->start, "to ",
24 $feat->end, " Primary tag ", $feat->primary_tag,
25 ", produced by ", $feat->source_tag(), "\n";
27 if ( $feat->strand == 0 ) {
28 print "Feature applicable to either strand\n";
30 else {
31 print "Feature on strand ", $feat->strand,"\n"; # -1,1
34 print "feature location is ",$feat->start, "..",
35 $feat->end, " on strand ", $feat->strand, "\n";
36 print "easy utility to print locations in GenBank/EMBL way ",
37 $feat->location->to_FTstring(), "\n";
39 foreach $tag ( $feat->get_all_tags() ) {
40 print "Feature has tag ", $tag, " with values, ",
41 join(' ',$feat->get_tag_values($tag)), "\n";
43 print "new feature\n" if $feat->has_tag('new');
44 # features can have sub features
45 my @subfeat = $feat->get_SeqFeatures();
48 =head1 DESCRIPTION
50 This interface is the functions one can expect for any Sequence
51 Feature, whatever its implementation or whether it is a more complex
52 type (eg, a Gene). This object does not actually provide any
53 implementation, it just provides the definitions of what methods one can
54 call. See Bio::SeqFeature::Generic for a good standard implementation
55 of this object
57 =head1 FEEDBACK
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to one
61 of the Bioperl mailing lists. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 =head2 Support
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
77 =head2 Reporting Bugs
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
81 web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 APPENDIX
87 The rest of the documentation details each of the object
88 methods. Internal methods are usually preceded with a _
90 =cut
93 # Let the code begin...
96 package Bio::SeqFeatureI;
98 use vars qw($HasInMemory);
99 use strict;
100 BEGIN {
101 eval { require Bio::DB::InMemoryCache };
102 if( $@ ) { $HasInMemory = 0 }
103 else { $HasInMemory = 1 }
106 use Bio::Seq;
108 use Carp;
110 use base qw(Bio::RangeI);
112 =head1 Bio::SeqFeatureI specific methods
114 New method interfaces.
116 =cut
118 =head2 get_SeqFeatures
120 Title : get_SeqFeatures
121 Usage : @feats = $feat->get_SeqFeatures();
122 Function: Returns an array of sub Sequence Features
123 Returns : An array
124 Args : none
126 =cut
128 sub get_SeqFeatures{
129 my ($self,@args) = @_;
131 $self->throw_not_implemented();
134 =head2 display_name
136 Title : display_name
137 Usage : $name = $feat->display_name()
138 Function: Returns the human-readable name of the feature for displays.
139 Returns : a string
140 Args : none
142 =cut
144 sub display_name {
145 shift->throw_not_implemented();
148 =head2 primary_tag
150 Title : primary_tag
151 Usage : $tag = $feat->primary_tag()
152 Function: Returns the primary tag for a feature,
153 eg 'exon'
154 Returns : a string
155 Args : none
158 =cut
160 sub primary_tag{
161 my ($self,@args) = @_;
163 $self->throw_not_implemented();
167 =head2 source_tag
169 Title : source_tag
170 Usage : $tag = $feat->source_tag()
171 Function: Returns the source tag for a feature,
172 eg, 'genscan'
173 Returns : a string
174 Args : none
177 =cut
179 sub source_tag{
180 my ($self,@args) = @_;
182 $self->throw_not_implemented();
185 =head2 has_tag
187 Title : has_tag
188 Usage : $tag_exists = $self->has_tag('some_tag')
189 Function:
190 Returns : TRUE if the specified tag exists, and FALSE otherwise
191 Args :
193 =cut
195 sub has_tag{
196 my ($self,@args) = @_;
198 $self->throw_not_implemented();
202 =head2 get_tag_values
204 Title : get_tag_values
205 Usage : @values = $self->get_tag_values('some_tag')
206 Function:
207 Returns : An array comprising the values of the specified tag.
208 Args : a string
210 throws an exception if there is no such tag
212 =cut
214 sub get_tag_values {
215 shift->throw_not_implemented();
218 =head2 get_tagset_values
220 Title : get_tagset_values
221 Usage : @values = $self->get_tagset_values(qw(label transcript_id product))
222 Function:
223 Returns : An array comprising the values of the specified tags, in order of tags
224 Args : An array of strings
226 does NOT throw an exception if none of the tags are not present
228 this method is useful for getting a human-readable label for a
229 SeqFeatureI; not all tags can be assumed to be present, so a list of
230 possible tags in preferential order is provided
232 =cut
234 # interface + abstract method
235 sub get_tagset_values {
236 my ($self, @args) = @_;
237 my @vals = ();
238 foreach my $arg (@args) {
239 if ($self->has_tag($arg)) {
240 push(@vals, $self->get_tag_values($arg));
243 return @vals;
246 =head2 get_all_tags
248 Title : get_all_tags
249 Usage : @tags = $feat->get_all_tags()
250 Function: gives all tags for this feature
251 Returns : an array of strings
252 Args : none
255 =cut
257 sub get_all_tags{
258 shift->throw_not_implemented();
261 =head2 attach_seq
263 Title : attach_seq
264 Usage : $sf->attach_seq($seq)
265 Function: Attaches a Bio::Seq object to this feature. This
266 Bio::Seq object is for the *entire* sequence: ie
267 from 1 to 10000
269 Note that it is not guaranteed that if you obtain a feature from
270 an object in bioperl, it will have a sequence attached. Also,
271 implementors of this interface can choose to provide an empty
272 implementation of this method. I.e., there is also no guarantee
273 that if you do attach a sequence, seq() or entire_seq() will not
274 return undef.
276 The reason that this method is here on the interface is to enable
277 you to call it on every SeqFeatureI compliant object, and
278 that it will be implemented in a useful way and set to a useful
279 value for the great majority of use cases. Implementors who choose
280 to ignore the call are encouraged to specifically state this in
281 their documentation.
283 Example :
284 Returns : TRUE on success
285 Args : a Bio::PrimarySeqI compliant object
288 =cut
290 sub attach_seq {
291 shift->throw_not_implemented();
294 =head2 seq
296 Title : seq
297 Usage : $tseq = $sf->seq()
298 Function: returns the truncated sequence (if there is a sequence attached)
299 for this feature
300 Example :
301 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
302 bounded by start & end, or undef if there is no sequence attached.
303 If the strand is defined and set to -1, the returned sequence is
304 the reverse-complement of the region
305 Args : none
308 =cut
310 sub seq {
311 shift->throw_not_implemented();
314 =head2 entire_seq
316 Title : entire_seq
317 Usage : $whole_seq = $sf->entire_seq()
318 Function: gives the entire sequence that this seqfeature is attached to
319 Example :
320 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
321 sequence attached
322 Args : none
325 =cut
327 sub entire_seq {
328 shift->throw_not_implemented();
332 =head2 seq_id
334 Title : seq_id
335 Usage : $obj->seq_id($newval)
336 Function: There are many cases when you make a feature that you
337 do know the sequence name, but do not know its actual
338 sequence. This is an attribute such that you can store
339 the ID (e.g., display_id) of the sequence.
341 This attribute should *not* be used in GFF dumping, as
342 that should come from the collection in which the seq
343 feature was found.
344 Returns : value of seq_id
345 Args : newvalue (optional)
348 =cut
350 sub seq_id {
351 shift->throw_not_implemented();
354 =head2 gff_string
356 Title : gff_string
357 Usage : $str = $feat->gff_string;
358 $str = $feat->gff_string($gff_formatter);
359 Function: Provides the feature information in GFF format.
361 The implementation provided here returns GFF2 by default. If you
362 want a different version, supply an object implementing a method
363 gff_string() accepting a SeqFeatureI object as argument. E.g., to
364 obtain GFF1 format, do the following:
366 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
367 $gff1str = $feat->gff_string($gff1io);
369 Returns : A string
370 Args : Optionally, an object implementing gff_string().
373 =cut
375 sub gff_string{
376 my ($self,$formatter) = @_;
378 $formatter = $self->_static_gff_formatter unless $formatter;
379 return $formatter->gff_string($self);
382 my $static_gff_formatter = undef;
384 =head2 _static_gff_formatter
386 Title : _static_gff_formatter
387 Usage :
388 Function:
389 Example :
390 Returns :
391 Args :
394 =cut
396 sub _static_gff_formatter{
397 my ($self,@args) = @_;
398 require Bio::Tools::GFF; # on the fly inclusion -- is this better?
399 if( !defined $static_gff_formatter ) {
400 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
402 return $static_gff_formatter;
406 =head1 Decorating methods
408 These methods have an implementation provided by Bio::SeqFeatureI,
409 but can be validly overwritten by subclasses
411 =head2 spliced_seq
413 Title : spliced_seq
415 Usage : $seq = $feature->spliced_seq()
416 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
418 Function: Provides a sequence of the feature which is the most
419 semantically "relevant" feature for this sequence. A default
420 implementation is provided which for simple cases returns just
421 the sequence, but for split cases, loops over the split location
422 to return the sequence. In the case of split locations with
423 remote locations, eg
425 join(AB000123:5567-5589,80..1144)
427 in the case when a database object is passed in, it will attempt
428 to retrieve the sequence from the database object, and "Do the right thing",
429 however if no database object is provided, it will generate the correct
430 number of N's (DNA) or X's (protein, though this is unlikely).
432 This function is deliberately "magical" attempting to second guess
433 what a user wants as "the" sequence for this feature.
435 Implementing classes are free to override this method with their
436 own magic if they have a better idea what the user wants.
438 Args : [optional]
439 -db A L<Bio::DB::RandomAccessI> compliant object if
440 one needs to retrieve remote seqs.
441 -nosort boolean if the locations should not be sorted
442 by start location. This may occur, for instance,
443 in a circular sequence where a gene span starts
444 before the end of the sequence and ends after the
445 sequence start. Example : join(15685..16260,1..207)
446 (default = if sequence is_circular(), 1, otherwise 0)
447 -phase truncates the returned sequence based on the
448 intron phase (0,1,2).
450 Returns : A L<Bio::PrimarySeqI> object
452 =cut
454 sub spliced_seq {
455 my $self = shift;
456 my @args = @_;
457 my ($db, $nosort, $phase) =
458 $self->_rearrange([qw(DB NOSORT PHASE)], @args);
460 # set no_sort based on the parent sequence status
461 if ($self->entire_seq->is_circular) {
462 $nosort = 1;
465 # (added 7/7/06 to allow use old API (with warnings)
466 my $old_api = (!(grep {$_ =~ /(?:nosort|db|phase)/} @args)) ? 1 : 0;
467 if (@args && $old_api) {
468 $self->warn( q(API has changed; please use '-db' or '-nosort' )
469 . qq(for args. See POD for more details.));
470 $db = shift @args if @args;
471 $nosort = shift @args if @args;
472 $phase = shift @args if @args;
475 if (defined($phase) && ($phase < 0 || $phase > 2)) {
476 $self->warn("Phase must be 0,1, or 2. Setting phase to 0...");
477 $phase = 0;
480 if ( $db && ref($db) && ! $db->isa('Bio::DB::RandomAccessI') ) {
481 $self->warn( "Must pass in a valid Bio::DB::RandomAccessI object"
482 . " for access to remote locations for spliced_seq");
483 $db = undef;
485 elsif ( defined $db && $HasInMemory && $db->isa('Bio::DB::InMemoryCache') ) {
486 $db = Bio::DB::InMemoryCache->new(-seqdb => $db);
489 if ( not $self->location->isa("Bio::Location::SplitLocationI") ) {
490 if ($phase) {
491 $self->debug("Subseq start: ",$phase+1,"\tend: ",$self->end,"\n");
492 my $seqstr = substr($self->seq->seq, $phase);
493 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
494 . "_spliced_feat",
495 -seq => $seqstr);
496 return $out;
498 else {
499 return $self->seq(); # nice and easy!
503 # redundant test, but the above ISA is probably not ideal.
504 if ( not $self->location->isa("Bio::Location::SplitLocationI") ) {
505 $self->throw("not atomic, not split, yikes, in trouble!");
508 my $seqstr = '';
509 my $seqid = $self->entire_seq->display_id;
510 # This is to deal with reverse strand features
511 # so we are really sorting features 5' -> 3' on their strand
512 # i.e. rev strand features will be sorted largest to smallest
513 # as this how revcom CDSes seem to be annotated in genbank.
514 # Might need to eventually allow this to be programable?
515 # (can I mention how much fun this is NOT! --jason)
517 my ($mixed,$mixedloc, $fstrand) = (0);
519 if ( $self->isa('Bio::Das::SegmentI') and not $self->absolute ) {
520 $self->warn( "Calling spliced_seq with a Bio::Das::SegmentI which "
521 . "does have absolute set to 1 -- be warned you may not "
522 . "be getting things on the correct strand");
525 my @locset = $self->location->each_Location;
526 my @locs;
527 if ( not $nosort ) {
528 # @locs = map { $_->[0] }
529 # sort so that most negative is first basically to order
530 # the features on the opposite strand 5'->3' on their strand
531 # rather than they way most are input which is on the fwd strand
533 # sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
534 my @proc_locs =
535 map {
536 $fstrand = $_->strand unless defined $fstrand;
537 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
539 if( defined $_->seq_id ) {
540 $mixedloc = 1 if( $_->seq_id ne $seqid );
542 [ $_, $_->start * ($_->strand || 1) ];
543 } @locset;
545 my @sort_locs;
546 if ( $fstrand == 1 ) {
547 @sort_locs = sort { $a->[1] <=> $b->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
548 }elsif ( $fstrand == -1 ){
549 @sort_locs = sort { $b->[1] <=> $a->[1] } @proc_locs; # Yes Tim, Schwartzian transformation
550 } else {
551 @sort_locs = @proc_locs;
553 @locs = map { $_->[0] } @sort_locs;
555 if ( $mixed ) {
556 $self->warn( "Mixed strand locations, spliced seq using the "
557 . "input order rather than trying to sort");
558 @locs = @locset;
561 else {
562 # use the original order instead of trying to sort
563 @locs = @locset;
564 $fstrand = $locs[0]->strand;
568 my $last_id = undef;
569 my $called_seq = undef;
570 # This will be left as undefined if 1) db is remote or 2)seq_id is undefined.
571 # In that case, old code is used to make exon sequence
572 my $called_seq_seq = undef;
573 my $called_seq_len = undef;
575 foreach my $loc ( @locs ) {
576 if ( not $loc->isa("Bio::Location::Atomic") ) {
577 $self->throw("Can only deal with one level deep locations");
580 if ( $fstrand != $loc->strand ) {
581 $self->warn("feature strand is different from location strand!");
584 my $loc_seq_id;
585 if ( defined $loc->seq_id ) {
586 $loc_seq_id = $loc->seq_id;
588 # deal with remote sequences
589 if ($loc_seq_id ne $seqid ) {
590 # might be too big to download whole sequence
591 $called_seq_seq = undef;
593 if ( defined $db ) {
594 my $sid = $loc_seq_id;
595 $sid =~ s/\.\d+$//g;
596 eval {
597 $called_seq = $db->get_Seq_by_acc($sid);
599 if( $@ ) {
600 $self->warn( "In attempting to join a remote location, sequence $sid "
601 . "was not in database. Will provide padding N's. Full exception \n\n$@");
602 $called_seq = undef;
605 else {
606 $self->warn( "cannot get remote location for ".$loc_seq_id ." without a valid "
607 . "Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
608 $called_seq = undef;
610 if ( !defined $called_seq ) {
611 $seqstr .= 'N' x $loc->length;
612 next;
615 # have local sequence available
616 else {
617 # don't have to pull out source sequence again if it's local unless
618 # it's the first exon or different from previous exon
619 unless (defined(($last_id) && $last_id eq $loc_seq_id )){
620 $called_seq = $self->entire_seq;
621 $called_seq_seq = $called_seq->seq(); # this is slow
625 #undefined $loc->seq->id
626 else {
627 $called_seq = $self->entire_seq;
628 $called_seq_seq = undef;
631 my ($start,$end) = ($loc->start,$loc->end);
633 # does the called sequence make sense? Bug 1780
634 my $called_seq_len;
636 # can avoid a seq() call on called_seq
637 if (defined($called_seq_seq)) {
638 $called_seq_len = length($called_seq_seq);
640 # can't avoid a seq() call on called_seq
641 else {
642 $called_seq_len = $called_seq->length # this is slow
645 if ($called_seq_len < $loc->end) {
646 my $accession = $called_seq->accession;
647 my $orig_id = $self->seq_id; # originating sequence
648 my ($locus) = $self->get_tagset_values("locus_tag");
649 $self->throw( "Location end ($end) exceeds length ($called_seq_len) of "
650 . "called sequence $accession.\nCheck sequence version used in "
651 . "$locus locus-tagged SeqFeature in $orig_id.");
654 if ( $self->isa('Bio::Das::SegmentI') ) {
655 # $called_seq is Bio::DB::GFF::RelSegment, as well as its subseq();
656 # Bio::DB::GFF::RelSegment::seq() returns a Bio::PrimarySeq, and using seq()
657 # in turn returns a string. Confused?
658 $seqstr .= $called_seq->subseq($start,$end)->seq()->seq(); # this is slow
660 else {
661 my $exon_seq;
662 if (defined ($called_seq_seq)){
663 $exon_seq = substr($called_seq_seq, $start-1, $end-$start+1); # this is quick
665 else {
666 $exon_seq = $called_seq->subseq($loc->start,$loc->end); # this is slow
669 # If guide_strand is defined, assemble the sequence first and revcom later if needed,
670 # if its not defined, apply revcom immediately to proper locations
671 if (defined $self->location->guide_strand) {
672 $seqstr .= $exon_seq;
674 else {
675 my $strand = defined ($loc->strand) ? ($loc->strand) : 0;
677 # revcomp $exon_seq
678 if ($strand == -1) {
679 $exon_seq = reverse($exon_seq);
680 $exon_seq =~ tr/ABCDGHKMNRSTUVWXYabcdghkmnrstuvwxy/TVGHCDMKNYSAABWXRtvghcdmknysaabwxr/;
681 $seqstr .= $exon_seq;
683 else {
684 $seqstr .= $exon_seq;
689 $last_id = $loc_seq_id if (defined($loc_seq_id));
690 } #next $loc
692 # Use revcom only after the whole sequence has been assembled
693 my $guide_strand = defined ($self->location->guide_strand) ? ($self->location->guide_strand) : 0;
694 if ($guide_strand == -1) {
695 my $seqstr_obj = Bio::Seq->new(-seq => $seqstr);
696 $seqstr = $seqstr_obj->revcom->seq;
699 if (defined($phase)) {
700 $seqstr = substr($seqstr, $phase);
703 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id
704 . "_spliced_feat",
705 -seq => $seqstr);
707 return $out;
710 =head2 location
712 Title : location
713 Usage : my $location = $seqfeature->location()
714 Function: returns a location object suitable for identifying location
715 of feature on sequence or parent feature
716 Returns : Bio::LocationI object
717 Args : none
720 =cut
722 sub location {
723 my ($self) = @_;
725 $self->throw_not_implemented();
729 =head2 primary_id
731 Title : primary_id
732 Usage : $obj->primary_id($newval)
733 Function:
734 Example :
735 Returns : value of primary_id (a scalar)
736 Args : on set, new value (a scalar or undef, optional)
738 Primary ID is a synonym for the tag 'ID'
740 =cut
742 sub primary_id{
743 my $self = shift;
744 # note from cjm@fruitfly.org:
745 # I have commented out the following 2 lines:
747 #return $self->{'primary_id'} = shift if @_;
748 #return $self->{'primary_id'};
750 #... and replaced it with the following; see
751 # http://bioperl.org/pipermail/bioperl-l/2003-December/014150.html
752 # for the discussion that lead to this change
754 if (@_) {
755 if ($self->has_tag('ID')) {
756 $self->remove_tag('ID');
758 $self->add_tag_value('ID', shift);
760 my ($id) = $self->get_tagset_values('ID');
761 return $id;
764 sub generate_unique_persistent_id {
765 # DEPRECATED - us IDHandler
766 my $self = shift;
767 require Bio::SeqFeature::Tools::IDHandler;
768 Bio::SeqFeature::Tools::IDHandler->new->generate_unique_persistent_id($self);
772 =head2 phase
774 Title : phase
775 Usage : $obj->phase($newval)
776 Function: get/set this feature's phase.
777 Example :
778 Returns : undef if no phase is set,
779 otherwise 0, 1, or 2 (the only valid values for phase)
780 Args : on set, the new value
782 Most features do not have or need a defined phase.
784 For features representing a CDS, the phase indicates where the feature
785 begins with reference to the reading frame. The phase is one of the
786 integers 0, 1, or 2, indicating the number of bases that should be
787 removed from the beginning of this feature to reach the first base of
788 the next codon. In other words, a phase of "0" indicates that the next
789 codon begins at the first base of the region described by the current
790 line, a phase of "1" indicates that the next codon begins at the
791 second base of this region, and a phase of "2" indicates that the
792 codon begins at the third base of this region. This is NOT to be
793 confused with the frame, which is simply start modulo 3.
795 For forward strand features, phase is counted from the start
796 field. For reverse strand features, phase is counted from the end
797 field.
799 =cut
801 sub phase {
802 my $self = shift;
803 if( @_ ) {
804 $self->remove_tag('phase') if $self->has_tag('phase');
805 my $newphase = shift;
806 $self->throw("illegal phase value '$newphase', phase must be either undef, 0, 1, or 2")
807 unless !defined $newphase || $newphase == 0 || $newphase == 1 || $newphase == 2;
808 $self->add_tag_value('phase', $newphase );
809 return $newphase;
812 return $self->has_tag('phase') ? ($self->get_tag_values('phase'))[0] : undef;
816 =head1 Bio::RangeI methods
818 These methods are inherited from RangeI and can be used
819 directly from a SeqFeatureI interface. Remember that a
820 SeqFeature is-a RangeI, and so wherever you see RangeI you
821 can use a feature ($r in the below documentation).
823 =cut
825 =head2 start()
827 See L<Bio::RangeI>
829 =head2 end()
831 See L<Bio::RangeI>
833 =head2 strand()
835 See L<Bio::RangeI>
837 =head2 overlaps()
839 See L<Bio::RangeI>
841 =head2 contains()
843 See L<Bio::RangeI>
845 =head2 equals()
847 See L<Bio::RangeI>
849 =head2 intersection()
851 See L<Bio::RangeI>
853 =head2 union()
855 See L<Bio::RangeI>
857 =cut