lib/Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  18
  19 =head1 SYNOPSIS
  20
  21     # Bio::PrimarySeqI is the interface class for sequences.
  22     # If you are a newcomer to bioperl, you might want to start with
  23     # Bio::Seq documentation.
  24
  25     # Test if this is a seq object
  26     $obj->isa("Bio::PrimarySeqI") ||
  27       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  28
  29     # Accessors
  30     $string    = $obj->seq();
  31     $substring = $obj->subseq(12,50);
  32     $display   = $obj->display_id();       # for human display
  33     $id        = $obj->primary_id();       # unique id for this object,
  34                                            # implementation defined
  35     $unique_key= $obj->accession_number(); # unique biological id
  36
  37
  38     # Object manipulation
  39     eval {
  40        $rev = $obj->revcom();
  41     };
  42     if( $@ ) {
  43        $obj->throw( "Could not reverse complement. ".
  44                     "Probably not DNA. Actual exception\n$@\n" );
  45     }
  46
  47     $trunc = $obj->trunc(12,50);
  48     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  49
  50
  51 =head1 DESCRIPTION
  52
  53 This object defines an abstract interface to basic sequence
  54 information - for most users of the package the documentation (and
  55 methods) in this class are not useful - this is a developers-only
  56 class which defines what methods have to be implemented by other Perl
  57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  59 sequences.
  60
  61 PrimarySeq is an object just for the sequence and its name(s), nothing
  62 more. Seq is the larger object complete with features. There is a pure
  63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  64 use L<Bio::PrimarySeq> objects, then please read that module first. This
  65 module defines the interface, and is of more interest to people who
  66 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
  67 they "are" bioperl sequence objects, even though it is not using Perl
  68 to store the sequence etc.
  69
  70 This interface defines what bioperl considers necessary to "be" a
  71 sequence, without providing an implementation of this, an
  72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  73 a Bio::PrimarySeq-compliant object which in fact wraps another
  74 object/database/out-of-perl experience, then this is the correct thing
  75 to wrap, generally by providing a wrapper class which would inherit
  76 from your object and this Bio::PrimarySeqI interface. The wrapper class
  77 then would have methods lists in the "Implementation Specific
  78 Functions" which would provide these methods for your object.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
  98 reponsive experts will be able look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
 105 the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   https://github.com/bioperl/bioperl-live/issues
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 package Bio::PrimarySeqI;
 123
 124 use strict;
 125 use Bio::Tools::CodonTable;
 126
 127 use base qw(Bio::Root::RootI);
 128
 129
 130 =head1 Implementation-specific Functions
 131
 132 These functions are the ones that a specific implementation must
 133 define.
 134
 135 =head2 seq
 136
 137  Title   : seq
 138  Usage   : $string = $obj->seq()
 139  Function: Returns the sequence as a string of letters. The
 140            case of the letters is left up to the implementer.
 141            Suggested cases are upper case for proteins and lower case for
 142            DNA sequence (IUPAC standard), but implementations are suggested to
 143            keep an open mind about case (some users... want mixed case!)
 144  Returns : A scalar
 145  Status  : Virtual
 146
 147 =cut
 148
 149 sub seq {
 150     my ($self) = @_;
 151     $self->throw_not_implemented();
 152 }
 153
 154
 155 =head2 subseq
 156
 157  Title   : subseq
 158  Usage   : $substring = $obj->subseq(10,40);
 159  Function: Returns the subseq from start to end, where the first base
 160            is 1 and the number is inclusive, i.e. 1-2 are the first two
 161            bases of the sequence.
 162
 163            Start cannot be larger than end but can be equal.
 164
 165  Returns : A string
 166  Args    :
 167  Status  : Virtual
 168
 169 =cut
 170
 171 sub subseq {
 172     my ($self) = @_;
 173     $self->throw_not_implemented();
 174 }
 175
 176
 177 =head2 display_id
 178
 179  Title   : display_id
 180  Usage   : $id_string = $obj->display_id();
 181  Function: Returns the display id, also known as the common name of the Sequence
 182            object.
 183
 184            The semantics of this is that it is the most likely string
 185            to be used as an identifier of the sequence, and likely to
 186            have "human" readability.  The id is equivalent to the ID
 187            field of the GenBank/EMBL databanks and the id field of the
 188            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 189            presumed to be the id, though some people overload the id
 190            to embed other information. Bioperl does not use any
 191            embedded information in the ID field, and people are
 192            encouraged to use other mechanisms (accession field for
 193            example, or extending the sequence object) to solve this.
 194
 195            Notice that $seq->id() maps to this function, mainly for
 196            legacy/convenience reasons.
 197  Returns : A string
 198  Args    : None
 199  Status  : Virtual
 200
 201 =cut
 202
 203 sub display_id {
 204     my ($self) = @_;
 205     $self->throw_not_implemented();
 206 }
 207
 208
 209 =head2 accession_number
 210
 211  Title   : accession_number
 212  Usage   : $unique_biological_key = $obj->accession_number;
 213  Function: Returns the unique biological id for a sequence, commonly
 214            called the accession_number. For sequences from established
 215            databases, the implementors should try to use the correct
 216            accession number. Notice that primary_id() provides the
 217            unique id for the implementation, allowing multiple objects
 218            to have the same accession number in a particular implementation.
 219
 220            For sequences with no accession number, this method should return
 221            "unknown".
 222  Returns : A string
 223  Args    : None
 224  Status  : Virtual
 225
 226 =cut
 227
 228 sub accession_number {
 229     my ($self,@args) = @_;
 230     $self->throw_not_implemented();
 231 }
 232
 233
 234 =head2 primary_id
 235
 236  Title   : primary_id
 237  Usage   : $unique_implementation_key = $obj->primary_id;
 238  Function: Returns the unique id for this object in this
 239            implementation. This allows implementations to manage their
 240            own object ids in a way the implementation can control
 241            clients can expect one id to map to one object.
 242
 243            For sequences with no accession number, this method should
 244            return a stringified memory location.
 245
 246  Returns : A string
 247  Args    : None
 248  Status  : Virtual
 249
 250 =cut
 251
 252 sub primary_id {
 253     my ($self,@args) = @_;
 254     $self->throw_not_implemented();
 255 }
 256
 257
 258 =head2 can_call_new
 259
 260  Title   : can_call_new
 261  Usage   : if( $obj->can_call_new ) {
 262              $newobj = $obj->new( %param );
 263          }
 264  Function: Can_call_new returns 1 or 0 depending
 265            on whether an implementation allows new
 266            constructor to be called. If a new constructor
 267            is allowed, then it should take the followed hashed
 268            constructor list.
 269
 270            $myobject->new( -seq => $sequence_as_string,
 271                            -display_id  => $id
 272                            -accession_number => $accession
 273                            -alphabet => 'dna',
 274                            );
 275  Returns : 1 or 0
 276  Args    :
 277
 278
 279 =cut
 280
 281 sub can_call_new {
 282     my ($self,@args) = @_;
 283     # we default to 0 here
 284     return 0;
 285 }
 286
 287
 288 =head2 alphabet
 289
 290  Title   : alphabet
 291  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 292  Function: Returns the type of sequence being one of
 293            'dna', 'rna' or 'protein'. This is case sensitive.
 294
 295            This is not called "type" because this would cause
 296            upgrade problems from the 0.5 and earlier Seq objects.
 297
 298  Returns : A string either 'dna','rna','protein'. NB - the object must
 299            make a call of the alphabet, if there is no alphabet specified it
 300            has to guess.
 301  Args    : None
 302  Status  : Virtual
 303
 304 =cut
 305
 306 sub alphabet {
 307     my ( $self ) = @_;
 308     $self->throw_not_implemented();
 309 }
 310
 311
 312 =head2 moltype
 313
 314  Title   : moltype
 315  Usage   : Deprecated. Use alphabet() instead.
 316
 317 =cut
 318
 319 sub moltype {
 320     my ($self,@args) = @_;
 321     $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 322     return $self->alphabet(@args);
 323 }
 324
 325
 326 =head1 Implementation-optional Functions
 327
 328 The following functions rely on the above functions. An
 329 implementing class does not need to provide these functions, as they
 330 will be provided by this class, but is free to override these
 331 functions.
 332
 333 The revcom(), trunc(), and translate() methods create new sequence
 334 objects. They will call new() on the class of the sequence object
 335 instance passed as argument, unless can_call_new() returns FALSE. In
 336 the latter case a Bio::PrimarySeq object will be created. Implementors
 337 which really want to control how objects are created (eg, for object
 338 persistence over a database, or objects in a CORBA framework), they
 339 are encouraged to override these methods
 340
 341 =head2 revcom
 342
 343  Title   : revcom
 344  Usage   : $rev = $seq->revcom()
 345  Function: Produces a new Bio::PrimarySeqI implementing object which
 346            is the reversed complement of the sequence. For protein
 347            sequences this throws an exception of "Sequence is a
 348            protein. Cannot revcom".
 349
 350            The id is the same id as the original sequence, and the
 351            accession number is also identical. If someone wants to
 352            track that this sequence has be reversed, it needs to
 353            define its own extensions.
 354
 355            To do an inplace edit of an object you can go:
 356
 357            $seq = $seq->revcom();
 358
 359            This of course, causes Perl to handle the garbage
 360            collection of the old object, but it is roughly speaking as
 361            efficient as an inplace edit.
 362
 363  Returns : A new (fresh) Bio::PrimarySeqI object
 364  Args    : None
 365
 366
 367 =cut
 368
 369 sub revcom {
 370     my ($self) = @_;
 371
 372     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 373     # or 'Bio::Seq::LargeSeq', if not take advantage of
 374     # Bio::Root::clone to get an object copy
 375     my $out;
 376     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 377         or $self->isa('Bio::Seq::LargeSeq')
 378         ) {
 379         my ($seqclass, $opts) = $self->_setup_class;
 380         $out = $seqclass->new(
 381             -seq              => $self->_revcom_from_string($self->seq, $self->alphabet),
 382             -is_circular      => $self->is_circular,
 383             -display_id       => $self->display_id,
 384             -accession_number => $self->accession_number,
 385             -alphabet         => $self->alphabet,
 386             -desc             => $self->desc,
 387             -verbose          => $self->verbose,
 388             %$opts,
 389         );
 390     } else {
 391         $out = $self->clone;
 392         $out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) );
 393     }
 394     return $out;
 395 }
 396
 397
 398 sub _revcom_from_string {
 399     my ($self, $string, $alphabet) = @_;
 400
 401     # Check that reverse-complementing makes sense
 402     if( $alphabet eq 'protein' ) {
 403         $self->throw("Sequence is a protein. Cannot revcom.");
 404     }
 405     if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
 406         my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
 407                   "but unsure if this is right.";
 408         if( $self->can('warn') ) {
 409             $self->warn($msg);
 410         } else {
 411             warn("[$self] $msg");
 412         }
 413     }
 414
 415     # If sequence is RNA, map to DNA (then map back later)
 416     if( $alphabet eq 'rna' ) {
 417         $string =~ tr/uU/tT/;
 418     }
 419
 420     # Reverse-complement now
 421     $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 422     $string = CORE::reverse $string;
 423
 424     # Map back RNA to DNA
 425     if( $alphabet eq 'rna' ) {
 426         $string =~ tr/tT/uU/;
 427     }
 428
 429     return $string;
 430 }
 431
 432
 433 =head2 trunc
 434
 435  Title   : trunc
 436  Usage   : $subseq = $myseq->trunc(10,100);
 437  Function: Provides a truncation of a sequence.
 438  Returns : A fresh Bio::PrimarySeqI implementing object.
 439  Args    : Two integers denoting first and last base of the sub-sequence.
 440
 441
 442 =cut
 443
 444 sub trunc {
 445     my ($self,$start,$end) = @_;
 446
 447     my $str;
 448     if( defined $start && ref($start) &&
 449         $start->isa('Bio::LocationI') ) {
 450         $str = $self->subseq($start); # start is a location actually
 451     } elsif( !$end ) {
 452         $self->throw("trunc start,end -- there was no end for $start");
 453     } elsif( $end < $start ) {
 454         my $msg = "start [$start] is greater than end [$end]. \n".
 455                   "If you want to truncated and reverse complement, \n".
 456                   "you must call trunc followed by revcom. Sorry.";
 457         $self->throw($msg);
 458     } else {
 459         $str = $self->subseq($start,$end);
 460     }
 461
 462     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 463     # or 'Bio::Seq::LargeSeq', if not take advantage of
 464     # Bio::Root::clone to get an object copy
 465     my $out;
 466     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 467         or $self->isa('Bio::Seq::LargeSeq')
 468         or $self->isa('Bio::Seq::RichSeq')
 469         ) {
 470         my ($seqclass, $opts) = $self->_setup_class;
 471         $out = $seqclass->new(
 472             -seq              => $str,
 473             -is_circular      => $self->is_circular,
 474             -display_id       => $self->display_id,
 475             -accession_number => $self->accession_number,
 476             -alphabet         => $self->alphabet,
 477             -desc             => $self->desc,
 478             -verbose          => $self->verbose,
 479             %$opts,
 480         );
 481     } else {
 482         $out = $self->clone;
 483         $out->seq($str);
 484     }
 485     return $out;
 486 }
 487
 488
 489 =head2 translate
 490
 491  Title   : translate
 492  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 493
 494            Or if you expect a complete coding sequence (CDS) translation,
 495            with initiator at the beginning and terminator at the end:
 496
 497            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 498
 499            Or if you want translate() to find the first initiation
 500            codon and return the corresponding protein:
 501
 502            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 503
 504  Function: Provides the translation of the DNA sequence using full
 505            IUPAC ambiguities in DNA/RNA and amino acid codes.
 506
 507            The complete CDS translation is identical to EMBL/TREMBL
 508            database translation. Note that the trailing terminator
 509            character is removed before returning the translated protein
 510            object.
 511
 512            Note: if you set $dna_seq_obj->verbose(1) you will get a
 513            warning if the first codon is not a valid initiator.
 514
 515  Returns : A Bio::PrimarySeqI implementing object
 516  Args    : -terminator
 517                character for terminator, default '*'
 518            -unknown
 519                character for unknown, default 'X'
 520            -frame
 521                positive integer frame shift (in bases), default 0
 522            -codontable_id
 523                integer codon table id, default 1
 524            -complete
 525                boolean, if true, complete CDS is expected. default false
 526            -complete_codons
 527                boolean, if true, codons which are incomplete are translated if a
 528                suitable amino acid is found. For instance, if the incomplete
 529                codon is 'GG', the completed codon is 'GGN', which is glycine
 530                (G). Defaults to 'false'; setting '-complete' also makes this
 531                true.
 532            -throw
 533                boolean, throw exception if ORF not complete, default false
 534            -orf
 535                if 'longest', find longest ORF.  other true value, find
 536                first ORF.  default 0
 537            -codontable
 538                optional L<Bio::Tools::CodonTable> object to use for
 539                translation
 540            -start
 541                optional three-character string to force as initiation
 542                codon (e.g. 'atg'). If unset, start codons are
 543                determined by the CodonTable.  Case insensitive.
 544            -offset
 545                optional positive integer offset for fuzzy locations.
 546                if set, must be either 1, 2, or 3
 547
 548 =head3 Notes
 549
 550 The -start argument only applies when -orf is set to 1. By default all
 551 initiation codons found in the given codon table are used but when
 552 "start" is set to some codon this codon will be used exclusively as
 553 the initiation codon. Note that the default codon table (NCBI
 554 "Standard") has 3 initiation codons!
 555
 556 By default translate() translates termination codons to the some
 557 character (default is *), both internal and trailing codons. Setting
 558 "-complete" to 1 tells translate() to remove the trailing character.
 559
 560 -offset is used for seqfeatures which contain the the \codon_start tag
 561 and can be set to 1, 2, or 3.  This is the offset by which the
 562 sequence translation starts relative to the first base of the feature
 563
 564 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 565
 566 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 567 element in an array:
 568
 569   1: character for terminator (optional), defaults to '*'.
 570   2: character for unknown amino acid (optional), defaults to 'X'.
 571   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 572   4: codon table id (optional), defaults to 1.
 573   5: complete coding sequence expected, defaults to 0 (false).
 574   6: boolean, throw exception if not complete coding sequence
 575      (true), defaults to warning (false)
 576   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 577
 578 =cut
 579
 580 sub translate {
 581     my ($self,@args) = @_;
 582     my ($terminator, $unknown, $frame, $codonTableId, $complete,
 583         $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
 584
 585     ## new API with named parameters, post 1.5.1
 586     if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 587         ($terminator, $unknown, $frame, $codonTableId, $complete,
 588         $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
 589             $self->_rearrange([qw(TERMINATOR
 590                                                UNKNOWN
 591                                                FRAME
 592                                                CODONTABLE_ID
 593                                                COMPLETE
 594                                                COMPLETE_CODONS
 595                                                THROW
 596                                                CODONTABLE
 597                                                ORF
 598                                                START
 599                                                OFFSET)], @args);
 600     ## old API, 1.5.1 and preceding versions
 601     } else {
 602         ($terminator, $unknown, $frame, $codonTableId,
 603          $complete, $throw, $codonTable, $offset) = @args;
 604     }
 605
 606     ## Initialize termination codon, unknown codon, codon table id, frame
 607     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 608     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 609     $frame = 0           unless (defined($frame) and $frame ne '');
 610     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 611     $complete_codons ||= $complete || 0;
 612
 613     ## Get a CodonTable, error if custom CodonTable is invalid
 614     if ($codonTable) {
 615         $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 616             unless $codonTable->isa('Bio::Tools::CodonTable');
 617     } else {
 618
 619         # shouldn't this be cached?  Seems wasteful to have a new instance
 620         # every time...
 621         $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 622     }
 623
 624     ## Error if alphabet is "protein"
 625     $self->throw("Can't translate an amino acid sequence.") if
 626         ($self->alphabet =~ /protein/i);
 627
 628     ## Error if -start parameter isn't a valid codon
 629     if ($start_codon) {
 630         $self->throw("Invalid start codon: $start_codon.") if
 631             ( $start_codon !~ /^[A-Z]{3}$/i );
 632     }
 633
 634     my $seq;
 635     if ($offset) {
 636         $self->throw("Offset must be 1, 2, or 3.") if
 637             ( $offset !~ /^[123]$/ );
 638         my ($start, $end) = ($offset, $self->length);
 639         ($seq) = $self->subseq($start, $end);
 640     } else {
 641         ($seq) = $self->seq();
 642     }
 643
 644          ## ignore frame if an ORF is supposed to be found
 645     if ( $orf ) {
 646         my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 647         $seq = $self->_orf_sequence( $seq, $orf_region );
 648     } else {
 649         ## use frame, error if frame is not 0, 1 or 2
 650         $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 651             unless ($frame == 0 or $frame == 1 or $frame == 2);
 652         $seq = substr($seq,$frame);
 653     }
 654
 655     ## Translate it
 656     my $output = $codonTable->translate($seq, $complete_codons);
 657     # Use user-input terminator/unknown
 658     $output =~ s/\*/$terminator/g;
 659     $output =~ s/X/$unknown/g;
 660
 661     ## Only if we are expecting to translate a complete coding region
 662     if ($complete) {
 663         my $id = $self->display_id;
 664         # remove the terminator character
 665         if( substr($output,-1,1) eq $terminator ) {
 666             chop $output;
 667         } else {
 668             $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 669             $self->warn("Seq [$id]: Not using a valid terminator codon!");
 670         }
 671         # test if there are terminator characters inside the protein sequence!
 672         if ($output =~ /\Q$terminator\E/) {
 673             $id ||= '';
 674             $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 675             $self->warn("Seq [$id]: Terminator codon inside CDS!");
 676         }
 677         # if the initiator codon is not ATG, the amino acid needs to be changed to M
 678         if ( substr($output,0,1) ne 'M' ) {
 679             if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 680                 $output = 'M'. substr($output,1);
 681             } elsif ($throw) {
 682                 $self->throw("Seq [$id]: Not using a valid initiator codon!");
 683             } else {
 684                 $self->warn("Seq [$id]: Not using a valid initiator codon!");
 685             }
 686         }
 687     }
 688
 689     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 690     # or 'Bio::Seq::LargeSeq', if not take advantage of
 691     # Bio::Root::clone to get an object copy
 692     my $out;
 693     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 694         or $self->isa('Bio::Seq::LargeSeq')
 695         ) {
 696         my ($seqclass, $opts) = $self->_setup_class;
 697         $out = $seqclass->new(
 698             -seq              => $output,
 699             -is_circular      => $self->is_circular,
 700             -display_id       => $self->display_id,
 701             -accession_number => $self->accession_number,
 702             -alphabet         => 'protein',
 703             -desc             => $self->desc,
 704             -verbose          => $self->verbose,
 705             %$opts,
 706         );
 707     } else {
 708         $out = $self->clone;
 709         $out->seq($output);
 710         $out->alphabet('protein');
 711     }
 712     return $out;
 713 }
 714
 715
 716 =head2 transcribe()
 717
 718  Title   : transcribe
 719  Usage   : $xseq = $seq->transcribe
 720  Function: Convert base T to base U
 721  Returns : PrimarySeqI object of alphabet 'rna' or
 722            undef if $seq->alphabet ne 'dna'
 723  Args    :
 724
 725 =cut
 726
 727 sub transcribe {
 728     my $self = shift;
 729     return unless $self->alphabet eq 'dna';
 730     my $s = $self->seq;
 731     $s =~ tr/tT/uU/;
 732     my $desc = $self->desc || '';
 733
 734     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 735     # or 'Bio::Seq::LargeSeq', if not take advantage of
 736     # Bio::Root::clone to get an object copy
 737     my $out;
 738     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 739         or $self->isa('Bio::Seq::LargeSeq')
 740         ) {
 741         my ($seqclass, $opts) = $self->_setup_class;
 742         $out = $seqclass->new(
 743             -seq              => $s,
 744             -is_circular      => $self->is_circular,
 745             -display_id       => $self->display_id,
 746             -accession_number => $self->accession_number,
 747             -alphabet         => 'rna',
 748             -desc             => "${desc}[TRANSCRIBED]",
 749             -verbose          => $self->verbose,
 750             %$opts,
 751         );
 752     } else {
 753         $out = $self->clone;
 754         $out->seq($s);
 755         $out->alphabet('rna');
 756         $out->desc($desc . "[TRANSCRIBED]");
 757     }
 758     return $out;
 759 }
 760
 761
 762 =head2 rev_transcribe()
 763
 764  Title   : rev_transcribe
 765  Usage   : $rtseq = $seq->rev_transcribe
 766  Function: Convert base U to base T
 767  Returns : PrimarySeqI object of alphabet 'dna' or
 768            undef if $seq->alphabet ne 'rna'
 769  Args    :
 770
 771 =cut
 772
 773 sub rev_transcribe {
 774     my $self = shift;
 775     return unless $self->alphabet eq 'rna';
 776     my $s = $self->seq;
 777     $s =~ tr/uU/tT/;
 778     my $desc = $self->desc || '';
 779
 780     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 781     # or 'Bio::Seq::LargeSeq', if not take advantage of
 782     # Bio::Root::clone to get an object copy
 783     my $out;
 784     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 785         or $self->isa('Bio::Seq::LargeSeq')
 786         ) {
 787         my ($seqclass, $opts) = $self->_setup_class;
 788         $out = $seqclass->new(
 789             -seq              => $s,
 790             -is_circular      => $self->is_circular,
 791             -display_id       => $self->display_id,
 792             -accession_number => $self->accession_number,
 793             -alphabet         => 'dna',
 794             -desc             => $self->desc . "[REVERSE TRANSCRIBED]",
 795             -verbose          => $self->verbose,
 796             %$opts,
 797         );
 798     } else {
 799         $out = $self->clone;
 800         $out->seq($s);
 801         $out->alphabet('dna');
 802         $out->desc($desc . "[REVERSE TRANSCRIBED]");
 803     }
 804     return $out;
 805 }
 806
 807
 808 =head2 id
 809
 810  Title   : id
 811  Usage   : $id = $seq->id()
 812  Function: ID of the sequence. This should normally be (and actually is in
 813            the implementation provided here) just a synonym for display_id().
 814  Returns : A string.
 815  Args    :
 816
 817 =cut
 818
 819 sub id {
 820    my ($self)= @_;
 821    return $self->display_id();
 822 }
 823
 824
 825 =head2 length
 826
 827  Title   : length
 828  Usage   : $len = $seq->length()
 829  Function:
 830  Returns : Integer representing the length of the sequence.
 831  Args    :
 832
 833 =cut
 834
 835 sub length {
 836     my ($self)= @_;
 837     $self->throw_not_implemented();
 838 }
 839
 840
 841 =head2 desc
 842
 843  Title   : desc
 844  Usage   : $seq->desc($newval);
 845            $description = $seq->desc();
 846  Function: Get/set description text for a seq object
 847  Returns : Value of desc
 848  Args    : newvalue (optional)
 849
 850 =cut
 851
 852 sub desc {
 853    shift->throw_not_implemented();
 854 }
 855
 856
 857 =head2 is_circular
 858
 859  Title   : is_circular
 860  Usage   : if( $obj->is_circular) { # Do something }
 861  Function: Returns true if the molecule is circular
 862  Returns : Boolean value
 863  Args    : none
 864
 865 =cut
 866
 867 sub is_circular {
 868     shift->throw_not_implemented;
 869 }
 870
 871
 872 =head1 Private functions
 873
 874 These are some private functions for the PrimarySeqI interface. You do not
 875 need to implement these functions
 876
 877 =head2 _find_orfs_nucleotide
 878
 879  Title   : _find_orfs_nucleotide
 880  Usage   :
 881  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 882            The ORF is not required to have a termination codon.
 883  Example :
 884  Returns : a list of string coordinates of ORF locations (0-based half-open),
 885            sorted descending by length (so that the longest is first)
 886            as: [ start, end, frame, length ], [ start, end, frame, length ], ...
 887  Args    : Nucleotide sequence,
 888            CodonTable object,
 889            (optional) alternative initiation codon (e.g. 'ATA'),
 890            (optional) boolean that, if true, stops after finding the
 891                       first available ORF
 892
 893 =cut
 894
 895 sub _find_orfs_nucleotide {
 896     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 897     $sequence    = uc $sequence;
 898     $start_codon = uc $start_codon if $start_codon;
 899
 900     my $is_start = $start_codon
 901         ? sub { shift eq $start_codon }
 902         : sub { $codon_table->is_start_codon( shift ) };
 903
 904     # stores the begin index of the currently-running ORF in each
 905     # reading frame
 906     my @current_orf_start = (-1,-1,-1);
 907
 908     #< stores coordinates of longest observed orf (so far) in each
 909     #  reading frame
 910     my @orfs;
 911
 912     # go through each base of the sequence, and each reading frame for each base
 913     my $seqlen = CORE::length $sequence;
 914     my @start_frame_order;
 915     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 916         my $frame = $j % 3;
 917
 918         my $this_codon = substr( $sequence, $j, 3 );
 919
 920         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 921         if ( $current_orf_start[$frame] >= 0 ) {
 922             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 923                 # record ORF start, end (half-open), length, and frame
 924                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 925                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 926
 927                 if ($first_only && $frame == $start_frame_order[0]) {
 928                   $self->warn( "Translating partial ORF "
 929                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 930                                    .' from end of nucleotide sequence'
 931                                   ) if $is_last_codon_in_frame;
 932                   return \@this_orf;
 933                 }
 934                 push @orfs, \@this_orf;
 935                 $current_orf_start[$frame] = -1;
 936             }
 937         }
 938         # if this is a start codon
 939         elsif ( $is_start->($this_codon) ) {
 940             $current_orf_start[$frame] = $j;
 941             push @start_frame_order, $frame;
 942         }
 943     }
 944
 945     return sort { $b->[2] <=> $a->[2] } @orfs;
 946 }
 947
 948
 949 sub _truncate_seq {
 950     my ($self, $seq) = @_;
 951     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 952 }
 953
 954
 955 sub _orf_sequence {
 956     my ($self, $seq, $orf ) = @_;
 957     return '' unless $orf;
 958     return substr( $seq, $orf->[0], $orf->[2] )
 959 }
 960
 961
 962 =head2 _attempt_to_load_Seq
 963
 964  Title   : _attempt_to_load_Seq
 965  Usage   :
 966  Function:
 967  Example :
 968  Returns :
 969  Args    :
 970
 971 =cut
 972
 973 sub _attempt_to_load_Seq {
 974     my ($self) = @_;
 975
 976     if( $main::{'Bio::PrimarySeq'} ) {
 977         return 1;
 978     } else {
 979         eval {
 980             require Bio::PrimarySeq;
 981         };
 982         if( $@ ) {
 983             my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 984                 "This indicates that you are using Bio::PrimarySeqI ".
 985                 "without Bio::PrimarySeq loaded or without providing a ".
 986                 "complete implementation.\nThe most likely problem is that there ".
 987                 "has been a misconfiguration of the bioperl environment\n".
 988                 "Actual exception:\n\n";
 989             $self->throw("$text$@\n");
 990             return 0;
 991         }
 992         return 1;
 993     }
 994 }
 995
 996
 997 sub _setup_class {
 998     # Return name of class and setup some default parameters
 999     my ($self) = @_;
1000     my $seqclass;
1001     if ($self->can_call_new()) {
1002         $seqclass = ref($self);
1003     } else {
1004         $seqclass = 'Bio::PrimarySeq';
1005         $self->_attempt_to_load_Seq();
1006     }
1007     my %opts;
1008     if ($seqclass eq 'Bio::PrimarySeq') {
1009         # Since sequence is in a Seq object, it has already been validated.
1010         # We do not need to validate its trunc(), revcom(), etc
1011         $opts{ -direct } = 1;
1012     }
1013     return $seqclass, \%opts;
1014 }
1015
1016
1017 1;