ide/bioperl-mode: into its own repository to be developed separately
[bioperl-live.git] / Bio / PrimarySeq.pm
blob3ac172cdf17e50ac68932bd9d2e9094b9624c48f
2 # bioperl module for Bio::PrimarySeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PrimarySeq - Bioperl lightweight sequence object
18 =head1 SYNOPSIS
20 # Bio::SeqIO for file reading, Bio::DB::GenBank for
21 # database reading
23 use Bio::Seq;
24 use Bio::SeqIO;
25 use Bio::DB::GenBank;
27 # make from memory
29 $seqobj = Bio::PrimarySeq->new (
30 -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31 -id => 'GeneFragment-12',
32 -accession_number => 'X78121',
33 -alphabet => 'dna',
34 -is_circular => 1,
36 print "Sequence ", $seqobj->id(), " with accession ",
37 $seqobj->accession_number, "\n";
39 # read from file
41 $inputstream = Bio::SeqIO->new(
42 -file => "myseq.fa",
43 -format => 'Fasta',
45 $seqobj = $inputstream->next_seq();
46 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
48 # to get out parts of the sequence.
50 print "Sequence ", $seqobj->id(), " with accession ",
51 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
53 $string = $seqobj->seq();
54 $string2 = $seqobj->subseq(1,40);
56 =head1 DESCRIPTION
58 PrimarySeq is a lightweight sequence object, storing the sequence, its
59 name, a computer-useful unique name, and other fundamental attributes.
60 It does not contain sequence features or other information. To have a
61 sequence with sequence features you should use the Seq object which uses
62 this object.
64 Although new users will use Bio::PrimarySeq a lot, in general you will
65 be using it from the Bio::Seq object. For more information on Bio::Seq
66 see L<Bio::Seq>. For interest you might like to know that
67 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68 to do with sequence to it (the has-a relationship lets us get out of a
69 otherwise nasty cyclical reference in Perl which would leak memory).
71 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72 object is a pure Perl implementation of the interface. If that's
73 gibberish to you, don't worry. The take home message is that this
74 object is the bioperl default sequence object, but other people can
75 use their own objects as sequences if they so wish. If you are
76 interested in wrapping your own objects as compliant Bioperl sequence
77 objects, then you should read the Bio::PrimarySeqI documentation
79 The documentation of this object is a merge of the Bio::PrimarySeq and
80 Bio::PrimarySeqI documentation. This allows all the methods which you can
81 call on sequence objects here.
83 =head1 FEEDBACK
85 =head2 Mailing Lists
87 User feedback is an integral part of the evolution of this and other
88 Bioperl modules. Send your comments and suggestions preferably to one
89 of the Bioperl mailing lists. Your participation is much appreciated.
91 bioperl-l@bioperl.org - General discussion
92 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
94 =head2 Support
96 Please direct usage questions or support issues to the mailing list:
98 I<bioperl-l@bioperl.org>
100 rather than to the module maintainer directly. Many experienced and
101 reponsive experts will be able look at the problem and quickly
102 address it. Please include a thorough description of the problem
103 with code and data examples if at all possible.
105 =head2 Reporting Bugs
107 Report bugs to the Bioperl bug tracking system to help us keep track
108 the bugs and their resolution. Bug reports can be submitted via the
109 web:
111 https://github.com/bioperl/bioperl-live/issues
113 =head1 AUTHOR - Ewan Birney
115 Email birney@ebi.ac.uk
117 =head1 APPENDIX
119 The rest of the documentation details each of the object
120 methods. Internal methods are usually preceded with a _
122 =cut
126 package Bio::PrimarySeq;
128 use strict;
130 our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131 our $GAP_SYMBOLS = '-~';
133 use base qw(Bio::Root::Root Bio::PrimarySeqI
134 Bio::IdentifiableI Bio::DescribableI);
137 # Setup the allowed values for alphabet()
138 my %valid_type = map {$_, 1} qw( dna rna protein );
141 =head2 new
143 Title : new
144 Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145 -id => 'human_id',
146 -accession_number => 'AL000012',
148 Function: Returns a new primary seq object from
149 basic constructors, being a string for the sequence
150 and strings for id and accession_number.
152 Note that you can provide an empty sequence string. However, in
153 this case you MUST specify the type of sequence you wish to
154 initialize by the parameter -alphabet. See alphabet() for possible
155 values.
156 Returns : a new Bio::PrimarySeq object
157 Args : -seq => sequence string
158 -ref_to_seq => ... or reference to a sequence string
159 -display_id => display id of the sequence (locus name)
160 -accession_number => accession number
161 -primary_id => primary id (Genbank id)
162 -version => version number
163 -namespace => the namespace for the accession
164 -authority => the authority for the namespace
165 -description => description text
166 -desc => alias for description
167 -alphabet => skip alphabet guess and set it to dna, rna or protein
168 -id => alias for display id
169 -is_circular => boolean to indicate that sequence is circular
170 -direct => boolean to directly set sequences. The next time -seq,
171 seq() or -ref_to_seq is use, the sequence will not be
172 validated. Be careful with this...
173 -nowarnonempty => boolean to avoid warning when sequence is empty
175 =cut
177 sub new {
178 my ($class, @args) = @_;
179 my $self = $class->SUPER::new(@args);
180 my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181 $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182 $nowarnonempty) =
183 $self->_rearrange([qw(SEQ
184 DISPLAY_ID
185 ACCESSION_NUMBER
186 PRIMARY_ID
187 NAMESPACE
188 AUTHORITY
189 VERSION
190 OBJECT_ID
191 DESC
192 DESCRIPTION
193 ALPHABET
195 IS_CIRCULAR
196 DIRECT
197 REF_TO_SEQ
198 LENGTH
199 NOWARNONEMPTY
201 @args);
203 # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204 $self->{'_nowarnonempty'} = $nowarnonempty;
205 $self->{'_direct'} = $direct;
207 if( defined $id && defined $given_id ) {
208 if( $id ne $given_id ) {
209 $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
212 if( defined $given_id ) { $id = $given_id; }
214 # Bernd's idea: set ids now for more informative invalid sequence messages
215 defined $id && $self->display_id($id);
216 $acc && $self->accession_number($acc);
217 defined $pid && $self->primary_id($pid);
219 # Set alphabet now to avoid guessing it later, when sequence is set
220 $alphabet && $self->alphabet($alphabet);
222 # Set the length before the seq. If there is a seq, length will be updated later
223 $self->{'length'} = $len || 0;
225 # Set the sequence (but also alphabet and length)
226 if ($ref_to_seq) {
227 $self->_set_seq_by_ref($ref_to_seq, $alphabet);
228 } else {
229 if (defined $seq) {
230 # Note: the sequence string may be empty
231 $self->seq($seq);
235 $desc && $self->desc($desc);
236 $description && $self->description($description);
237 $ns && $self->namespace($ns);
238 $auth && $self->authority($auth);
239 # Any variable that can have a value "0" must be tested with defined
240 # or it will fail to be added to the new object
241 defined($v) && $self->version($v);
242 defined($oid) && $self->object_id($oid);
243 defined($is_circular) && $self->is_circular($is_circular);
245 return $self;
249 =head2 seq
251 Title : seq
252 Usage : $string = $seqobj->seq();
253 Function: Get or set the sequence as a string of letters. The case of
254 the letters is left up to the implementer. Suggested cases are
255 upper case for proteins and lower case for DNA sequence (IUPAC
256 standard), but you should not rely on this. An error is thrown if
257 the sequence contains invalid characters: see validate_seq().
258 Returns : A scalar
259 Args : - Optional new sequence value (a string) to set
260 - Optional alphabet (it is guessed by default)
262 =cut
264 sub seq {
265 my ($self, @args) = @_;
267 if( scalar @args == 0 ) {
268 return $self->{'seq'};
271 my ($seq_str, $alphabet) = @args;
272 if (@args) {
273 $self->_set_seq_by_ref(\$seq_str, $alphabet);
276 return $self->{'seq'};
280 sub _set_seq_by_ref {
281 # Set a sequence by reference. A reference is used to avoid the cost of
282 # copying the sequence (which can be very large) between functions.
283 my ($self, $seq_str_ref, $alphabet) = @_;
285 # Validate sequence if sequence is not empty and we are not in direct mode
286 if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287 $self->validate_seq($$seq_str_ref, 1);
289 delete $self->{'_direct'}; # next sequence will have to be validated
291 # Record sequence length
292 my $len = CORE::length($$seq_str_ref || '');
293 my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294 # Note: if the new seq is empty or undef, this is not considered a change
295 delete $self->{'_freeze_length'} if $is_changed_seq;
296 $self->{'length'} = $len if not exists $self->{'_freeze_length'};
298 # Set sequence
299 $self->{'seq'} = $$seq_str_ref;
301 # Set or guess alphabet
302 if ($alphabet) {
303 # Alphabet specified, set it no matter what
304 $self->alphabet($alphabet);
305 } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
306 # If we changed a previous sequence to a new one or if there is no
307 # alphabet yet at all, we need to guess the (possibly new) alphabet
308 $self->_guess_alphabet();
309 } # else (seq not changed and alphabet was defined) do nothing
311 return 1;
315 =head2 validate_seq
317 Title : validate_seq
318 Usage : if(! $seqobj->validate_seq($seq_str) ) {
319 print "sequence $seq_str is not valid for an object of
320 alphabet ",$seqobj->alphabet, "\n";
322 Function: Test that the given sequence is valid, i.e. contains only valid
323 characters. The allowed characters are all letters (A-Z) and '-','.',
324 '*','?','=' and '~'. Spaces are not valid. Note that this
325 implementation does not take alphabet() into account and that empty
326 sequences are considered valid.
327 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328 Args : - Sequence string to be validated
329 - Boolean to optionally throw an error if the sequence is invalid
331 =cut
333 sub validate_seq {
334 my ($self, $seqstr, $throw) = @_;
335 if ( (defined $seqstr ) &&
336 ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
337 if ($throw) {
338 $self->throw("Failed validation of sequence '".(defined($self->id) ||
339 '[unidentified sequence]')."'. Invalid characters were: " .
340 join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
342 return 0;
344 return 1;
348 =head2 subseq
350 Title : subseq
351 Usage : $substring = $seqobj->subseq(10,40);
352 $substring = $seqobj->subseq(10,40,'nogap');
353 $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354 $substring = $seqobj->subseq($location_obj);
355 $substring = $seqobj->subseq($location_obj, -nogap => 1);
356 Function: Return the subseq from start to end, where the first sequence
357 character has coordinate 1 number is inclusive, ie 1-2 are the
358 first two characters of the sequence. The given start coordinate
359 has to be larger than the end, even if the sequence is circular.
360 Returns : a string
361 Args : integer for start position
362 integer for end position
364 Bio::LocationI location for subseq (strand honored)
365 Specify -NOGAP=>1 to return subseq with gap characters removed
366 Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367 with $new_subseq in the sequence object
369 =cut
371 sub subseq {
372 my $self = shift;
373 my @args = @_;
374 my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
376 NOGAP
377 REPLACE_WITH)], @args);
379 # If -replace_with is specified, validate the replacement sequence
380 if (defined $replace) {
381 $self->validate_seq( $replace ) ||
382 $self->throw("Replacement sequence does not look valid");
385 if( ref($start) && $start->isa('Bio::LocationI') ) {
386 my $loc = $start;
387 my $seq = '';
389 # For Split objects if Guide Strand is negative,
390 # pass the sublocations in reverse
391 my $order = 0;
392 if ($loc->isa('Bio::Location::SplitLocationI')) {
393 # guide_strand can return undef, so don't compare directly
394 # to avoid 'uninitialized value' warning
395 my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
396 $order = ($guide_strand == -1) ? -1 : 0;
398 # Reversing order using ->each_Location(-1) does not work well for
399 # cut by origin-splits (like "complement(join(1900..END,START..50))"),
400 # so use "reverse" instead
401 my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
402 foreach my $subloc (@sublocs) {
403 my $piece = $self->subseq(-start => $subloc->start(),
404 -end => $subloc->end(),
405 -replace_with => $replace,
406 -nogap => $nogap);
407 $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
409 # strand can return undef, so don't compare directly
410 # to avoid 'uninitialized value' warning
411 my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
412 if ($strand < 0) {
413 $piece = $self->_revcom_from_string($piece, $self->alphabet);
415 $seq .= $piece;
417 return $seq;
418 } elsif( defined $start && defined $end ) {
419 if( $start > $end ){
420 $self->throw("Bad start,end parameters. Start [$start] has to be ".
421 "less than end [$end]");
423 if( $start <= 0 ) {
424 $self->throw("Bad start parameter ($start). Start must be positive.");
427 # Remove one from start, and then length is end-start
428 $start--;
430 my $seqstr;
431 if (defined $replace) {
432 $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
433 } else {
434 $seqstr = substr $self->{seq}, $start, $end-$start;
438 if ($end > $self->length) {
439 if ($self->is_circular) {
440 my $start = 0;
441 my $end = $end - $self->length;
443 my $appendstr;
444 if (defined $replace) {
445 $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
446 } else {
447 $appendstr = substr $self->{seq}, $start, $end-$start;
450 $seqstr .= $appendstr;
451 } else {
452 $self->throw("Bad end parameter ($end). End must be less than ".
453 "the total length of sequence (total=".$self->length.")")
457 $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
458 return $seqstr;
460 } else {
461 $self->warn("Incorrect parameters to subseq - must be two integers or ".
462 "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
463 return;
468 =head2 length
470 Title : length
471 Usage : $len = $seqobj->length();
472 Function: Get the stored length of the sequence in number of symbols (bases
473 or amino acids). In some circumstances, you can also set this attribute:
475 1. For empty sequences, you can set the length to anything you want:
476 my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477 my $len = $seqobj->len; # 123
478 2. To save memory when using very long sequences, you can set the
479 length of the sequence to the length of the sequence (and nothing
480 else):
481 my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482 # process $seqobj... then after you're done with it
483 $seqobj->length($seqobj->length);
484 $seqobj->seq(undef); # free memory!
485 my $len = $seqobj->len; # 1 Mbp
487 Note that if you set seq() to a value other than undef at any time,
488 the length attribute will be reset.
489 Returns : integer representing the length of the sequence.
490 Args : Optionally, the value on set
492 =cut
494 sub length {
495 my ($self, $val) = @_;
496 if (defined $val) {
497 my $len = $self->{'length'};
498 if ($len && ($len != $val)) {
499 $self->throw("Can not set the length to $val, current length value is $len");
501 $self->{'length'} = $val;
502 $self->{'_freeze_length'} = undef;
504 return $self->{'length'};
508 =head2 display_id
510 Title : display_id or display_name
511 Usage : $id_string = $seqobj->display_id();
512 Function: Get or set the display id, aka the common name of the sequence object.
514 The semantics of this is that it is the most likely string to
515 be used as an identifier of the sequence, and likely to have
516 "human" readability. The id is equivalent to the ID field of
517 the GenBank/EMBL databanks and the id field of the
518 Swissprot/sptrembl database. In fasta format, the >(\S+) is
519 presumed to be the id, though some people overload the id to
520 embed other information. Bioperl does not use any embedded
521 information in the ID field, and people are encouraged to use
522 other mechanisms (accession field for example, or extending
523 the sequence object) to solve this.
525 With the new Bio::DescribeableI interface, display_name aliases
526 to this method.
527 Returns : A string for the display ID
528 Args : Optional string for the display ID to set
530 =cut
532 sub display_id {
533 my ($self, $value) = @_;
534 if( defined $value) {
535 $self->{'display_id'} = $value;
537 return $self->{'display_id'};
541 =head2 accession_number
543 Title : accession_number or object_id
544 Usage : $unique_key = $seqobj->accession_number;
545 Function: Returns the unique biological id for a sequence, commonly
546 called the accession_number. For sequences from established
547 databases, the implementors should try to use the correct
548 accession number. Notice that primary_id() provides the
549 unique id for the implementation, allowing multiple objects
550 to have the same accession number in a particular implementation.
552 For sequences with no accession number, this method should
553 return "unknown".
555 [Note this method name is likely to change in 1.3]
557 With the new Bio::IdentifiableI interface, this is aliased
558 to object_id
559 Returns : A string
560 Args : A string (optional) for setting
562 =cut
564 sub accession_number {
565 my( $self, $acc ) = @_;
566 if (defined $acc) {
567 $self->{'accession_number'} = $acc;
568 } else {
569 $acc = $self->{'accession_number'};
570 $acc = 'unknown' unless defined $acc;
572 return $acc;
576 =head2 primary_id
578 Title : primary_id
579 Usage : $unique_key = $seqobj->primary_id;
580 Function: Returns the unique id for this object in this
581 implementation. This allows implementations to manage their
582 own object ids in a way the implementation can control
583 clients can expect one id to map to one object.
585 For sequences with no natural primary id, this method
586 should return a stringified memory location.
587 Returns : A string
588 Args : A string (optional, for setting)
590 =cut
592 sub primary_id {
593 my $self = shift;
595 if(@_) {
596 $self->{'primary_id'} = shift;
598 if( ! defined($self->{'primary_id'}) ) {
599 return "$self";
601 return $self->{'primary_id'};
605 =head2 alphabet
607 Title : alphabet
608 Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609 Function: Get/set the alphabet of sequence, one of
610 'dna', 'rna' or 'protein'. This is case sensitive.
612 This is not called <type> because this would cause
613 upgrade problems from the 0.5 and earlier Seq objects.
614 Returns : a string either 'dna','rna','protein'. NB - the object must
615 make a call of the type - if there is no alphabet specified it
616 has to guess.
617 Args : optional string to set : 'dna' | 'rna' | 'protein'
620 =cut
622 sub alphabet {
623 my ($self,$value) = @_;
624 if (defined $value) {
625 $value = lc $value;
626 unless ( $valid_type{$value} ) {
627 $self->throw("Alphabet '$value' is not a valid alphabet (".
628 join(',', map "'$_'", sort keys %valid_type) .") lowercase");
630 $self->{'alphabet'} = $value;
632 return $self->{'alphabet'};
636 =head2 desc
638 Title : desc or description
639 Usage : $seqobj->desc($newval);
640 Function: Get/set description of the sequence.
642 'description' is an alias for this for compliance with the
643 Bio::DescribeableI interface.
644 Returns : value of desc (a string)
645 Args : newvalue (a string or undef, optional)
648 =cut
650 sub desc{
651 my $self = shift;
653 return $self->{'desc'} = shift if @_;
654 return $self->{'desc'};
658 =head2 can_call_new
660 Title : can_call_new
661 Usage :
662 Function:
663 Example :
664 Returns : true
665 Args :
667 =cut
669 sub can_call_new {
670 my ($self) = @_;
672 return 1;
676 =head2 id
678 Title : id
679 Usage : $id = $seqobj->id();
680 Function: This is mapped on display_id
681 Example :
682 Returns :
683 Args :
685 =cut
687 sub id {
688 return shift->display_id(@_);
692 =head2 is_circular
694 Title : is_circular
695 Usage : if( $seqobj->is_circular) { # Do something }
696 Function: Returns true if the molecule is circular
697 Returns : Boolean value
698 Args : none
700 =cut
702 sub is_circular{
703 my $self = shift;
704 return $self->{'is_circular'} = shift if @_;
705 return $self->{'is_circular'};
709 =head1 Methods for Bio::IdentifiableI compliance
711 =head2 object_id
713 Title : object_id
714 Usage : $string = $seqobj->object_id();
715 Function: Get or set a string which represents the stable primary identifier
716 in this namespace of this object. For DNA sequences this
717 is its accession_number, similarly for protein sequences.
719 This is aliased to accession_number().
720 Returns : A scalar
721 Args : Optional object ID to set.
723 =cut
725 sub object_id {
726 return shift->accession_number(@_);
730 =head2 version
732 Title : version
733 Usage : $version = $seqobj->version();
734 Function: Get or set a number which differentiates between versions of
735 the same object. Higher numbers are considered to be
736 later and more relevant, but a single object described
737 the same identifier should represent the same concept.
738 Returns : A number
739 Args : Optional version to set.
741 =cut
743 sub version{
744 my ($self,$value) = @_;
745 if( defined $value) {
746 $self->{'_version'} = $value;
748 return $self->{'_version'};
752 =head2 authority
754 Title : authority
755 Usage : $authority = $seqobj->authority();
756 Function: Get or set a string which represents the organisation which
757 granted the namespace, written as the DNS name of the
758 organisation (eg, wormbase.org).
759 Returns : A scalar
760 Args : Optional authority to set.
762 =cut
764 sub authority {
765 my ($self, $value) = @_;
766 if( defined $value) {
767 $self->{'authority'} = $value;
769 return $self->{'authority'};
773 =head2 namespace
775 Title : namespace
776 Usage : $string = $seqobj->namespace();
777 Function: Get or set a string representing the name space this identifier
778 is valid in, often the database name or the name describing the
779 collection.
780 Returns : A scalar
781 Args : Optional namespace to set.
783 =cut
785 sub namespace{
786 my ($self,$value) = @_;
787 if( defined $value) {
788 $self->{'namespace'} = $value;
790 return $self->{'namespace'} || "";
794 =head1 Methods for Bio::DescribableI compliance
796 This comprises of display_name and description.
798 =head2 display_name
800 Title : display_name
801 Usage : $string = $seqobj->display_name();
802 Function: Get or set a string which is what should be displayed to the user.
803 The string should have no spaces (ideally, though a cautious
804 user of this interface would not assume this) and should be
805 less than thirty characters (though again, double checking
806 this is a good idea).
808 This is aliased to display_id().
809 Returns : A string for the display name
810 Args : Optional string for the display name to set.
812 =cut
814 sub display_name {
815 return shift->display_id(@_);
819 =head2 description
821 Title : description
822 Usage : $string = $seqobj->description();
823 Function: Get or set a text string suitable for displaying to the user a
824 description. This string is likely to have spaces, but
825 should not have any newlines or formatting - just plain
826 text. The string should not be greater than 255 characters
827 and clients can feel justified at truncating strings at 255
828 characters for the purposes of display.
830 This is aliased to desc().
831 Returns : A string for the description
832 Args : Optional string for the description to set.
834 =cut
836 sub description {
837 return shift->desc(@_);
841 =head1 Methods Inherited from Bio::PrimarySeqI
843 These methods are available on Bio::PrimarySeq, although they are
844 actually implemented on Bio::PrimarySeqI
846 =head2 revcom
848 Title : revcom
849 Usage : $rev = $seqobj->revcom();
850 Function: Produces a new Bio::SeqI implementing object which
851 is the reversed complement of the sequence. For protein
852 sequences this throws an exception of
853 "Sequence is a protein. Cannot revcom".
855 The id is the same id as the original sequence, and the
856 accession number is also identical. If someone wants to
857 track that this sequence has be reversed, it needs to
858 define its own extensions.
860 To do an inplace edit of an object you can go:
862 $seqobj = $seqobj->revcom();
864 This of course, causes Perl to handle the garbage
865 collection of the old object, but it is roughly speaking as
866 efficient as an inplace edit.
867 Returns : A new (fresh) Bio::SeqI object
868 Args : none
870 =head2 trunc
872 Title : trunc
873 Usage : $subseq = $myseq->trunc(10,100);
874 Function: Provides a truncation of a sequence,
875 Returns : A fresh Bio::SeqI implementing object.
876 Args : Numbers for the start and end positions
878 =head1 Internal methods
880 These are internal methods to PrimarySeq
882 =head2 _guess_alphabet
884 Title : _guess_alphabet
885 Usage :
886 Function: Automatically guess and set the type of sequence: dna, rna, protein
887 or '' if the sequence was empty. This method first removes dots (.),
888 dashes (-) and question marks (?) before guessing the alphabet
889 using the IUPAC conventions for ambiguous residues. Since the DNA and
890 RNA characters are also valid characters for proteins, there is
891 no foolproof way of determining the right alphabet. This is our best
892 guess only!
893 Returns : string 'dna', 'rna', 'protein' or ''.
894 Args : none
896 =cut
898 sub _guess_alphabet {
899 my ($self) = @_;
900 # Guess alphabet
901 my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902 # Set alphabet unless it is unknown
903 $self->alphabet($alphabet) if $alphabet;
904 return $alphabet;
908 sub _guess_alphabet_from_string {
909 # Get the alphabet from a sequence string
910 my ($self, $str, $nowarnonempty) = @_;
912 $nowarnonempty = 0 if not defined $nowarnonempty;
914 # Remove chars that clearly don't denote nucleic or amino acids
915 $str =~ s/[-.?]//gi;
917 # Check for sequences without valid letters
918 my $alphabet;
919 my $total = CORE::length($str);
920 if( $total == 0 ) {
921 if (not $nowarnonempty) {
922 $self->warn("Got a sequence without letters. Could not guess alphabet");
924 $alphabet = '';
927 # Determine alphabet now
928 if (not defined $alphabet) {
929 if ($str =~ m/[EFIJLOPQXZ]/i) {
930 # Start with a safe method to find proteins.
931 # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932 $alphabet = 'protein';
933 } else {
934 # Alphabet is unsure, could still be DNA, RNA or protein
935 # DNA and RNA contain mostly A, T, U, G, C and N, but the other
936 # letters they use are also among the 15 valid letters that a
937 # protein sequence can contain at this stage. Make our best guess
938 # based on sequence composition. If it contains over 70% of ACGTUN,
939 # it is likely nucleic.
940 if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941 if ( $str =~ m/U/i ) {
942 $alphabet = 'rna';
943 } else {
944 $alphabet = 'dna';
946 } else {
947 $alphabet = 'protein';
952 return $alphabet;
956 ############################################################################
957 # aliases due to name changes or to compensate for our lack of consistency #
958 ############################################################################
960 sub accession {
961 my $self = shift;
963 $self->warn(ref($self)."::accession is deprecated, ".
964 "use accession_number() instead");
965 return $self->accession_number(@_);