Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / PrimarySeqI.pm
blob44be60c04322a95ccbc4040b9f8903f2af98b54c
2 # BioPerl module for Bio::PrimarySeqI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
19 =head1 SYNOPSIS
21 # Bio::PrimarySeqI is the interface class for sequences.
22 # If you are a newcomer to bioperl, you might want to start with
23 # Bio::Seq documentation.
25 # Test if this is a seq object
26 $obj->isa("Bio::PrimarySeqI") ||
27 $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
29 # Accessors
30 $string = $obj->seq();
31 $substring = $obj->subseq(12,50);
32 $display = $obj->display_id(); # for human display
33 $id = $obj->primary_id(); # unique id for this object,
34 # implementation defined
35 $unique_key= $obj->accession_number(); # unique biological id
38 # Object manipulation
39 eval {
40 $rev = $obj->revcom();
42 if( $@ ) {
43 $obj->throw( "Could not reverse complement. ".
44 "Probably not DNA. Actual exception\n$@\n" );
47 $trunc = $obj->trunc(12,50);
48 # $rev and $trunc are Bio::PrimarySeqI compliant objects
51 =head1 DESCRIPTION
53 This object defines an abstract interface to basic sequence
54 information - for most users of the package the documentation (and
55 methods) in this class are not useful - this is a developers-only
56 class which defines what methods have to be implemented by other Perl
57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
59 sequences.
61 PrimarySeq is an object just for the sequence and its name(s), nothing
62 more. Seq is the larger object complete with features. There is a pure
63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
64 use L<Bio::PrimarySeq> objects, then please read that module first. This
65 module defines the interface, and is of more interest to people who
66 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
67 they "are" bioperl sequence objects, even though it is not using Perl
68 to store the sequence etc.
70 This interface defines what bioperl considers necessary to "be" a
71 sequence, without providing an implementation of this, an
72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
73 a Bio::PrimarySeq-compliant object which in fact wraps another
74 object/database/out-of-perl experience, then this is the correct thing
75 to wrap, generally by providing a wrapper class which would inherit
76 from your object and this Bio::PrimarySeqI interface. The wrapper class
77 then would have methods lists in the "Implementation Specific
78 Functions" which would provide these methods for your object.
80 =head1 FEEDBACK
82 =head2 Mailing Lists
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91 =head2 Support
93 Please direct usage questions or support issues to the mailing list:
95 I<bioperl-l@bioperl.org>
97 rather than to the module maintainer directly. Many experienced and
98 reponsive experts will be able look at the problem and quickly
99 address it. Please include a thorough description of the problem
100 with code and data examples if at all possible.
102 =head2 Reporting Bugs
104 Report bugs to the Bioperl bug tracking system to help us keep track
105 the bugs and their resolution. Bug reports can be submitted via the
106 web:
108 https://github.com/bioperl/bioperl-live/issues
110 =head1 AUTHOR - Ewan Birney
112 Email birney@ebi.ac.uk
114 =head1 APPENDIX
116 The rest of the documentation details each of the object
117 methods. Internal methods are usually preceded with a _
119 =cut
122 package Bio::PrimarySeqI;
124 use strict;
125 use Bio::Tools::CodonTable;
127 use base qw(Bio::Root::RootI);
130 =head1 Implementation-specific Functions
132 These functions are the ones that a specific implementation must
133 define.
135 =head2 seq
137 Title : seq
138 Usage : $string = $obj->seq()
139 Function: Returns the sequence as a string of letters. The
140 case of the letters is left up to the implementer.
141 Suggested cases are upper case for proteins and lower case for
142 DNA sequence (IUPAC standard), but implementations are suggested to
143 keep an open mind about case (some users... want mixed case!)
144 Returns : A scalar
145 Status : Virtual
147 =cut
149 sub seq {
150 my ($self) = @_;
151 $self->throw_not_implemented();
155 =head2 subseq
157 Title : subseq
158 Usage : $substring = $obj->subseq(10,40);
159 Function: Returns the subseq from start to end, where the first base
160 is 1 and the number is inclusive, i.e. 1-2 are the first two
161 bases of the sequence.
163 Start cannot be larger than end but can be equal.
165 Returns : A string
166 Args :
167 Status : Virtual
169 =cut
171 sub subseq {
172 my ($self) = @_;
173 $self->throw_not_implemented();
177 =head2 display_id
179 Title : display_id
180 Usage : $id_string = $obj->display_id();
181 Function: Returns the display id, also known as the common name of the Sequence
182 object.
184 The semantics of this is that it is the most likely string
185 to be used as an identifier of the sequence, and likely to
186 have "human" readability. The id is equivalent to the ID
187 field of the GenBank/EMBL databanks and the id field of the
188 Swissprot/sptrembl database. In fasta format, the >(\S+) is
189 presumed to be the id, though some people overload the id
190 to embed other information. Bioperl does not use any
191 embedded information in the ID field, and people are
192 encouraged to use other mechanisms (accession field for
193 example, or extending the sequence object) to solve this.
195 Notice that $seq->id() maps to this function, mainly for
196 legacy/convenience reasons.
197 Returns : A string
198 Args : None
199 Status : Virtual
201 =cut
203 sub display_id {
204 my ($self) = @_;
205 $self->throw_not_implemented();
209 =head2 accession_number
211 Title : accession_number
212 Usage : $unique_biological_key = $obj->accession_number;
213 Function: Returns the unique biological id for a sequence, commonly
214 called the accession_number. For sequences from established
215 databases, the implementors should try to use the correct
216 accession number. Notice that primary_id() provides the
217 unique id for the implementation, allowing multiple objects
218 to have the same accession number in a particular implementation.
220 For sequences with no accession number, this method should return
221 "unknown".
222 Returns : A string
223 Args : None
224 Status : Virtual
226 =cut
228 sub accession_number {
229 my ($self,@args) = @_;
230 $self->throw_not_implemented();
234 =head2 primary_id
236 Title : primary_id
237 Usage : $unique_implementation_key = $obj->primary_id;
238 Function: Returns the unique id for this object in this
239 implementation. This allows implementations to manage their
240 own object ids in a way the implementation can control
241 clients can expect one id to map to one object.
243 For sequences with no accession number, this method should
244 return a stringified memory location.
246 Returns : A string
247 Args : None
248 Status : Virtual
250 =cut
252 sub primary_id {
253 my ($self,@args) = @_;
254 $self->throw_not_implemented();
258 =head2 can_call_new
260 Title : can_call_new
261 Usage : if( $obj->can_call_new ) {
262 $newobj = $obj->new( %param );
264 Function: Can_call_new returns 1 or 0 depending
265 on whether an implementation allows new
266 constructor to be called. If a new constructor
267 is allowed, then it should take the followed hashed
268 constructor list.
270 $myobject->new( -seq => $sequence_as_string,
271 -display_id => $id
272 -accession_number => $accession
273 -alphabet => 'dna',
275 Returns : 1 or 0
276 Args :
279 =cut
281 sub can_call_new {
282 my ($self,@args) = @_;
283 # we default to 0 here
284 return 0;
288 =head2 alphabet
290 Title : alphabet
291 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
292 Function: Returns the type of sequence being one of
293 'dna', 'rna' or 'protein'. This is case sensitive.
295 This is not called "type" because this would cause
296 upgrade problems from the 0.5 and earlier Seq objects.
298 Returns : A string either 'dna','rna','protein'. NB - the object must
299 make a call of the alphabet, if there is no alphabet specified it
300 has to guess.
301 Args : None
302 Status : Virtual
304 =cut
306 sub alphabet {
307 my ( $self ) = @_;
308 $self->throw_not_implemented();
312 =head2 moltype
314 Title : moltype
315 Usage : Deprecated. Use alphabet() instead.
317 =cut
319 sub moltype {
320 my ($self,@args) = @_;
321 $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
322 return $self->alphabet(@args);
326 =head1 Implementation-optional Functions
328 The following functions rely on the above functions. An
329 implementing class does not need to provide these functions, as they
330 will be provided by this class, but is free to override these
331 functions.
333 The revcom(), trunc(), and translate() methods create new sequence
334 objects. They will call new() on the class of the sequence object
335 instance passed as argument, unless can_call_new() returns FALSE. In
336 the latter case a Bio::PrimarySeq object will be created. Implementors
337 which really want to control how objects are created (eg, for object
338 persistence over a database, or objects in a CORBA framework), they
339 are encouraged to override these methods
341 =head2 revcom
343 Title : revcom
344 Usage : $rev = $seq->revcom()
345 Function: Produces a new Bio::PrimarySeqI implementing object which
346 is the reversed complement of the sequence. For protein
347 sequences this throws an exception of "Sequence is a
348 protein. Cannot revcom".
350 The id is the same id as the original sequence, and the
351 accession number is also identical. If someone wants to
352 track that this sequence has be reversed, it needs to
353 define its own extensions.
355 To do an inplace edit of an object you can go:
357 $seq = $seq->revcom();
359 This of course, causes Perl to handle the garbage
360 collection of the old object, but it is roughly speaking as
361 efficient as an inplace edit.
363 Returns : A new (fresh) Bio::PrimarySeqI object
364 Args : None
367 =cut
369 sub revcom {
370 my ($self) = @_;
372 # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
373 # or 'Bio::Seq::LargeSeq', if not take advantage of
374 # Bio::Root::clone to get an object copy
375 my $out;
376 if ( $self->isa('Bio::Seq::LargePrimarySeq')
377 or $self->isa('Bio::Seq::LargeSeq')
379 my ($seqclass, $opts) = $self->_setup_class;
380 $out = $seqclass->new(
381 -seq => $self->_revcom_from_string($self->seq, $self->alphabet),
382 -is_circular => $self->is_circular,
383 -display_id => $self->display_id,
384 -accession_number => $self->accession_number,
385 -alphabet => $self->alphabet,
386 -desc => $self->desc,
387 -verbose => $self->verbose,
388 %$opts,
390 } else {
391 $out = $self->clone;
392 $out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) );
394 return $out;
398 sub _revcom_from_string {
399 my ($self, $string, $alphabet) = @_;
401 # Check that reverse-complementing makes sense
402 if( $alphabet eq 'protein' ) {
403 $self->throw("Sequence is a protein. Cannot revcom.");
405 if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
406 my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
407 "but unsure if this is right.";
408 if( $self->can('warn') ) {
409 $self->warn($msg);
410 } else {
411 warn("[$self] $msg");
415 # If sequence is RNA, map to DNA (then map back later)
416 if( $alphabet eq 'rna' ) {
417 $string =~ tr/uU/tT/;
420 # Reverse-complement now
421 $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
422 $string = CORE::reverse $string;
424 # Map back RNA to DNA
425 if( $alphabet eq 'rna' ) {
426 $string =~ tr/tT/uU/;
429 return $string;
433 =head2 trunc
435 Title : trunc
436 Usage : $subseq = $myseq->trunc(10,100);
437 Function: Provides a truncation of a sequence.
438 Returns : A fresh Bio::PrimarySeqI implementing object.
439 Args : Two integers denoting first and last base of the sub-sequence.
442 =cut
444 sub trunc {
445 my ($self,$start,$end) = @_;
447 my $str;
448 if( defined $start && ref($start) &&
449 $start->isa('Bio::LocationI') ) {
450 $str = $self->subseq($start); # start is a location actually
451 } elsif( !$end ) {
452 $self->throw("trunc start,end -- there was no end for $start");
453 } elsif( $end < $start ) {
454 my $msg = "start [$start] is greater than end [$end]. \n".
455 "If you want to truncated and reverse complement, \n".
456 "you must call trunc followed by revcom. Sorry.";
457 $self->throw($msg);
458 } else {
459 $str = $self->subseq($start,$end);
462 # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
463 # or 'Bio::Seq::LargeSeq', if not take advantage of
464 # Bio::Root::clone to get an object copy
465 my $out;
466 if ( $self->isa('Bio::Seq::LargePrimarySeq')
467 or $self->isa('Bio::Seq::LargeSeq')
468 or $self->isa('Bio::Seq::RichSeq')
470 my ($seqclass, $opts) = $self->_setup_class;
471 $out = $seqclass->new(
472 -seq => $str,
473 -is_circular => $self->is_circular,
474 -display_id => $self->display_id,
475 -accession_number => $self->accession_number,
476 -alphabet => $self->alphabet,
477 -desc => $self->desc,
478 -verbose => $self->verbose,
479 %$opts,
481 } else {
482 $out = $self->clone;
483 $out->seq($str);
485 return $out;
489 =head2 translate
491 Title : translate
492 Usage : $protein_seq_obj = $dna_seq_obj->translate
494 Or if you expect a complete coding sequence (CDS) translation,
495 with initiator at the beginning and terminator at the end:
497 $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
499 Or if you want translate() to find the first initiation
500 codon and return the corresponding protein:
502 $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
504 Function: Provides the translation of the DNA sequence using full
505 IUPAC ambiguities in DNA/RNA and amino acid codes.
507 The complete CDS translation is identical to EMBL/TREMBL
508 database translation. Note that the trailing terminator
509 character is removed before returning the translated protein
510 object.
512 Note: if you set $dna_seq_obj->verbose(1) you will get a
513 warning if the first codon is not a valid initiator.
515 Returns : A Bio::PrimarySeqI implementing object
516 Args : -terminator
517 character for terminator, default '*'
518 -unknown
519 character for unknown, default 'X'
520 -frame
521 positive integer frame shift (in bases), default 0
522 -codontable_id
523 integer codon table id, default 1
524 -complete
525 boolean, if true, complete CDS is expected. default false
526 -complete_codons
527 boolean, if true, codons which are incomplete are translated if a
528 suitable amino acid is found. For instance, if the incomplete
529 codon is 'GG', the completed codon is 'GGN', which is glycine
530 (G). Defaults to 'false'; setting '-complete' also makes this
531 true.
532 -throw
533 boolean, throw exception if ORF not complete, default false
534 -orf
535 if 'longest', find longest ORF. other true value, find
536 first ORF. default 0
537 -codontable
538 optional L<Bio::Tools::CodonTable> object to use for
539 translation
540 -start
541 optional three-character string to force as initiation
542 codon (e.g. 'atg'). If unset, start codons are
543 determined by the CodonTable. Case insensitive.
544 -offset
545 optional positive integer offset for fuzzy locations.
546 if set, must be either 1, 2, or 3
548 =head3 Notes
550 The -start argument only applies when -orf is set to 1. By default all
551 initiation codons found in the given codon table are used but when
552 "start" is set to some codon this codon will be used exclusively as
553 the initiation codon. Note that the default codon table (NCBI
554 "Standard") has 3 initiation codons!
556 By default translate() translates termination codons to the some
557 character (default is *), both internal and trailing codons. Setting
558 "-complete" to 1 tells translate() to remove the trailing character.
560 -offset is used for seqfeatures which contain the the \codon_start tag
561 and can be set to 1, 2, or 3. This is the offset by which the
562 sequence translation starts relative to the first base of the feature
564 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
566 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
567 element in an array:
569 1: character for terminator (optional), defaults to '*'.
570 2: character for unknown amino acid (optional), defaults to 'X'.
571 3: frame (optional), valid values are 0, 1, 2, defaults to 0.
572 4: codon table id (optional), defaults to 1.
573 5: complete coding sequence expected, defaults to 0 (false).
574 6: boolean, throw exception if not complete coding sequence
575 (true), defaults to warning (false)
576 7: codontable, a custom Bio::Tools::CodonTable object (optional).
578 =cut
580 sub translate {
581 my ($self,@args) = @_;
582 my ($terminator, $unknown, $frame, $codonTableId, $complete,
583 $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
585 ## new API with named parameters, post 1.5.1
586 if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
587 ($terminator, $unknown, $frame, $codonTableId, $complete,
588 $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
589 $self->_rearrange([qw(TERMINATOR
590 UNKNOWN
591 FRAME
592 CODONTABLE_ID
593 COMPLETE
594 COMPLETE_CODONS
595 THROW
596 CODONTABLE
598 START
599 OFFSET)], @args);
600 ## old API, 1.5.1 and preceding versions
601 } else {
602 ($terminator, $unknown, $frame, $codonTableId,
603 $complete, $throw, $codonTable, $offset) = @args;
606 ## Initialize termination codon, unknown codon, codon table id, frame
607 $terminator = '*' unless (defined($terminator) and $terminator ne '');
608 $unknown = "X" unless (defined($unknown) and $unknown ne '');
609 $frame = 0 unless (defined($frame) and $frame ne '');
610 $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
611 $complete_codons ||= $complete || 0;
613 ## Get a CodonTable, error if custom CodonTable is invalid
614 if ($codonTable) {
615 $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
616 unless $codonTable->isa('Bio::Tools::CodonTable');
617 } else {
619 # shouldn't this be cached? Seems wasteful to have a new instance
620 # every time...
621 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
624 ## Error if alphabet is "protein"
625 $self->throw("Can't translate an amino acid sequence.") if
626 ($self->alphabet =~ /protein/i);
628 ## Error if -start parameter isn't a valid codon
629 if ($start_codon) {
630 $self->throw("Invalid start codon: $start_codon.") if
631 ( $start_codon !~ /^[A-Z]{3}$/i );
634 my $seq;
635 if ($offset) {
636 $self->throw("Offset must be 1, 2, or 3.") if
637 ( $offset !~ /^[123]$/ );
638 my ($start, $end) = ($offset, $self->length);
639 ($seq) = $self->subseq($start, $end);
640 } else {
641 ($seq) = $self->seq();
644 ## ignore frame if an ORF is supposed to be found
645 if ( $orf ) {
646 my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
647 $seq = $self->_orf_sequence( $seq, $orf_region );
648 } else {
649 ## use frame, error if frame is not 0, 1 or 2
650 $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
651 unless ($frame == 0 or $frame == 1 or $frame == 2);
652 $seq = substr($seq,$frame);
655 ## Translate it
656 my $output = $codonTable->translate($seq, $complete_codons);
657 # Use user-input terminator/unknown
658 $output =~ s/\*/$terminator/g;
659 $output =~ s/X/$unknown/g;
661 ## Only if we are expecting to translate a complete coding region
662 if ($complete) {
663 my $id = $self->display_id;
664 # remove the terminator character
665 if( substr($output,-1,1) eq $terminator ) {
666 chop $output;
667 } else {
668 $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
669 $self->warn("Seq [$id]: Not using a valid terminator codon!");
671 # test if there are terminator characters inside the protein sequence!
672 if ($output =~ /\Q$terminator\E/) {
673 $id ||= '';
674 $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
675 $self->warn("Seq [$id]: Terminator codon inside CDS!");
677 # if the initiator codon is not ATG, the amino acid needs to be changed to M
678 if ( substr($output,0,1) ne 'M' ) {
679 if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
680 $output = 'M'. substr($output,1);
681 } elsif ($throw) {
682 $self->throw("Seq [$id]: Not using a valid initiator codon!");
683 } else {
684 $self->warn("Seq [$id]: Not using a valid initiator codon!");
689 # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
690 # or 'Bio::Seq::LargeSeq', if not take advantage of
691 # Bio::Root::clone to get an object copy
692 my $out;
693 if ( $self->isa('Bio::Seq::LargePrimarySeq')
694 or $self->isa('Bio::Seq::LargeSeq')
696 my ($seqclass, $opts) = $self->_setup_class;
697 $out = $seqclass->new(
698 -seq => $output,
699 -is_circular => $self->is_circular,
700 -display_id => $self->display_id,
701 -accession_number => $self->accession_number,
702 -alphabet => 'protein',
703 -desc => $self->desc,
704 -verbose => $self->verbose,
705 %$opts,
707 } else {
708 $out = $self->clone;
709 $out->seq($output);
710 $out->alphabet('protein');
712 return $out;
716 =head2 transcribe()
718 Title : transcribe
719 Usage : $xseq = $seq->transcribe
720 Function: Convert base T to base U
721 Returns : PrimarySeqI object of alphabet 'rna' or
722 undef if $seq->alphabet ne 'dna'
723 Args :
725 =cut
727 sub transcribe {
728 my $self = shift;
729 return unless $self->alphabet eq 'dna';
730 my $s = $self->seq;
731 $s =~ tr/tT/uU/;
732 my $desc = $self->desc || '';
734 # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
735 # or 'Bio::Seq::LargeSeq', if not take advantage of
736 # Bio::Root::clone to get an object copy
737 my $out;
738 if ( $self->isa('Bio::Seq::LargePrimarySeq')
739 or $self->isa('Bio::Seq::LargeSeq')
741 my ($seqclass, $opts) = $self->_setup_class;
742 $out = $seqclass->new(
743 -seq => $s,
744 -is_circular => $self->is_circular,
745 -display_id => $self->display_id,
746 -accession_number => $self->accession_number,
747 -alphabet => 'rna',
748 -desc => "${desc}[TRANSCRIBED]",
749 -verbose => $self->verbose,
750 %$opts,
752 } else {
753 $out = $self->clone;
754 $out->seq($s);
755 $out->alphabet('rna');
756 $out->desc($desc . "[TRANSCRIBED]");
758 return $out;
762 =head2 rev_transcribe()
764 Title : rev_transcribe
765 Usage : $rtseq = $seq->rev_transcribe
766 Function: Convert base U to base T
767 Returns : PrimarySeqI object of alphabet 'dna' or
768 undef if $seq->alphabet ne 'rna'
769 Args :
771 =cut
773 sub rev_transcribe {
774 my $self = shift;
775 return unless $self->alphabet eq 'rna';
776 my $s = $self->seq;
777 $s =~ tr/uU/tT/;
778 my $desc = $self->desc || '';
780 # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
781 # or 'Bio::Seq::LargeSeq', if not take advantage of
782 # Bio::Root::clone to get an object copy
783 my $out;
784 if ( $self->isa('Bio::Seq::LargePrimarySeq')
785 or $self->isa('Bio::Seq::LargeSeq')
787 my ($seqclass, $opts) = $self->_setup_class;
788 $out = $seqclass->new(
789 -seq => $s,
790 -is_circular => $self->is_circular,
791 -display_id => $self->display_id,
792 -accession_number => $self->accession_number,
793 -alphabet => 'dna',
794 -desc => $self->desc . "[REVERSE TRANSCRIBED]",
795 -verbose => $self->verbose,
796 %$opts,
798 } else {
799 $out = $self->clone;
800 $out->seq($s);
801 $out->alphabet('dna');
802 $out->desc($desc . "[REVERSE TRANSCRIBED]");
804 return $out;
808 =head2 id
810 Title : id
811 Usage : $id = $seq->id()
812 Function: ID of the sequence. This should normally be (and actually is in
813 the implementation provided here) just a synonym for display_id().
814 Returns : A string.
815 Args :
817 =cut
819 sub id {
820 my ($self)= @_;
821 return $self->display_id();
825 =head2 length
827 Title : length
828 Usage : $len = $seq->length()
829 Function:
830 Returns : Integer representing the length of the sequence.
831 Args :
833 =cut
835 sub length {
836 my ($self)= @_;
837 $self->throw_not_implemented();
841 =head2 desc
843 Title : desc
844 Usage : $seq->desc($newval);
845 $description = $seq->desc();
846 Function: Get/set description text for a seq object
847 Returns : Value of desc
848 Args : newvalue (optional)
850 =cut
852 sub desc {
853 shift->throw_not_implemented();
857 =head2 is_circular
859 Title : is_circular
860 Usage : if( $obj->is_circular) { # Do something }
861 Function: Returns true if the molecule is circular
862 Returns : Boolean value
863 Args : none
865 =cut
867 sub is_circular {
868 shift->throw_not_implemented;
872 =head1 Private functions
874 These are some private functions for the PrimarySeqI interface. You do not
875 need to implement these functions
877 =head2 _find_orfs_nucleotide
879 Title : _find_orfs_nucleotide
880 Usage :
881 Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
882 The ORF is not required to have a termination codon.
883 Example :
884 Returns : a list of string coordinates of ORF locations (0-based half-open),
885 sorted descending by length (so that the longest is first)
886 as: [ start, end, frame, length ], [ start, end, frame, length ], ...
887 Args : Nucleotide sequence,
888 CodonTable object,
889 (optional) alternative initiation codon (e.g. 'ATA'),
890 (optional) boolean that, if true, stops after finding the
891 first available ORF
893 =cut
895 sub _find_orfs_nucleotide {
896 my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
897 $sequence = uc $sequence;
898 $start_codon = uc $start_codon if $start_codon;
900 my $is_start = $start_codon
901 ? sub { shift eq $start_codon }
902 : sub { $codon_table->is_start_codon( shift ) };
904 # stores the begin index of the currently-running ORF in each
905 # reading frame
906 my @current_orf_start = (-1,-1,-1);
908 #< stores coordinates of longest observed orf (so far) in each
909 # reading frame
910 my @orfs;
912 # go through each base of the sequence, and each reading frame for each base
913 my $seqlen = CORE::length $sequence;
914 my @start_frame_order;
915 for( my $j = 0; $j <= $seqlen-3; $j++ ) {
916 my $frame = $j % 3;
918 my $this_codon = substr( $sequence, $j, 3 );
920 # if in an orf and this is either a stop codon or the last in-frame codon in the string
921 if ( $current_orf_start[$frame] >= 0 ) {
922 if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
923 # record ORF start, end (half-open), length, and frame
924 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
925 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
927 if ($first_only && $frame == $start_frame_order[0]) {
928 $self->warn( "Translating partial ORF "
929 .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
930 .' from end of nucleotide sequence'
931 ) if $is_last_codon_in_frame;
932 return \@this_orf;
934 push @orfs, \@this_orf;
935 $current_orf_start[$frame] = -1;
938 # if this is a start codon
939 elsif ( $is_start->($this_codon) ) {
940 $current_orf_start[$frame] = $j;
941 push @start_frame_order, $frame;
945 return sort { $b->[2] <=> $a->[2] } @orfs;
949 sub _truncate_seq {
950 my ($self, $seq) = @_;
951 return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
955 sub _orf_sequence {
956 my ($self, $seq, $orf ) = @_;
957 return '' unless $orf;
958 return substr( $seq, $orf->[0], $orf->[2] )
962 =head2 _attempt_to_load_Seq
964 Title : _attempt_to_load_Seq
965 Usage :
966 Function:
967 Example :
968 Returns :
969 Args :
971 =cut
973 sub _attempt_to_load_Seq {
974 my ($self) = @_;
976 if( $main::{'Bio::PrimarySeq'} ) {
977 return 1;
978 } else {
979 eval {
980 require Bio::PrimarySeq;
982 if( $@ ) {
983 my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
984 "This indicates that you are using Bio::PrimarySeqI ".
985 "without Bio::PrimarySeq loaded or without providing a ".
986 "complete implementation.\nThe most likely problem is that there ".
987 "has been a misconfiguration of the bioperl environment\n".
988 "Actual exception:\n\n";
989 $self->throw("$text$@\n");
990 return 0;
992 return 1;
997 sub _setup_class {
998 # Return name of class and setup some default parameters
999 my ($self) = @_;
1000 my $seqclass;
1001 if ($self->can_call_new()) {
1002 $seqclass = ref($self);
1003 } else {
1004 $seqclass = 'Bio::PrimarySeq';
1005 $self->_attempt_to_load_Seq();
1007 my %opts;
1008 if ($seqclass eq 'Bio::PrimarySeq') {
1009 # Since sequence is in a Seq object, it has already been validated.
1010 # We do not need to validate its trunc(), revcom(), etc
1011 $opts{ -direct } = 1;
1013 return $seqclass, \%opts;