2 # bioperl module for Bio::PrimarySeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::PrimarySeq - Bioperl lightweight sequence object
20 # Bio::SeqIO for file reading, Bio::DB::GenBank for
29 $seqobj = Bio::PrimarySeq->new (
30 -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31 -id => 'GeneFragment-12',
32 -accession_number => 'X78121',
36 print "Sequence ", $seqobj->id(), " with accession ",
37 $seqobj->accession_number, "\n";
41 $inputstream = Bio::SeqIO->new(
45 $seqobj = $inputstream->next_seq();
46 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
48 # to get out parts of the sequence.
50 print "Sequence ", $seqobj->id(), " with accession ",
51 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
53 $string = $seqobj->seq();
54 $string2 = $seqobj->subseq(1,40);
58 PrimarySeq is a lightweight sequence object, storing the sequence, its
59 name, a computer-useful unique name, and other fundamental attributes.
60 It does not contain sequence features or other information. To have a
61 sequence with sequence features you should use the Seq object which uses
64 Although new users will use Bio::PrimarySeq a lot, in general you will
65 be using it from the Bio::Seq object. For more information on Bio::Seq
66 see L<Bio::Seq>. For interest you might like to know that
67 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68 to do with sequence to it (the has-a relationship lets us get out of a
69 otherwise nasty cyclical reference in Perl which would leak memory).
71 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72 object is a pure Perl implementation of the interface. If that's
73 gibberish to you, don't worry. The take home message is that this
74 object is the bioperl default sequence object, but other people can
75 use their own objects as sequences if they so wish. If you are
76 interested in wrapping your own objects as compliant Bioperl sequence
77 objects, then you should read the Bio::PrimarySeqI documentation
79 The documentation of this object is a merge of the Bio::PrimarySeq and
80 Bio::PrimarySeqI documentation. This allows all the methods which you can
81 call on sequence objects here.
87 User feedback is an integral part of the evolution of this and other
88 Bioperl modules. Send your comments and suggestions preferably to one
89 of the Bioperl mailing lists. Your participation is much appreciated.
91 bioperl-l@bioperl.org - General discussion
92 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
96 Please direct usage questions or support issues to the mailing list:
98 I<bioperl-l@bioperl.org>
100 rather than to the module maintainer directly. Many experienced and
101 reponsive experts will be able look at the problem and quickly
102 address it. Please include a thorough description of the problem
103 with code and data examples if at all possible.
105 =head2 Reporting Bugs
107 Report bugs to the Bioperl bug tracking system to help us keep track
108 the bugs and their resolution. Bug reports can be submitted via the
111 https://github.com/bioperl/bioperl-live/issues
113 =head1 AUTHOR - Ewan Birney
115 Email birney@ebi.ac.uk
119 The rest of the documentation details each of the object
120 methods. Internal methods are usually preceded with a _
126 package Bio
::PrimarySeq
;
130 our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131 our $GAP_SYMBOLS = '-~';
133 use base
qw(Bio::Root::Root Bio::PrimarySeqI
134 Bio::IdentifiableI Bio::DescribableI);
137 # Setup the allowed values for alphabet()
138 my %valid_type = map {$_, 1} qw( dna rna protein );
144 Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
146 -accession_number => 'AL000012',
148 Function: Returns a new primary seq object from
149 basic constructors, being a string for the sequence
150 and strings for id and accession_number.
152 Note that you can provide an empty sequence string. However, in
153 this case you MUST specify the type of sequence you wish to
154 initialize by the parameter -alphabet. See alphabet() for possible
156 Returns : a new Bio::PrimarySeq object
157 Args : -seq => sequence string
158 -ref_to_seq => ... or reference to a sequence string
159 -display_id => display id of the sequence (locus name)
160 -accession_number => accession number
161 -primary_id => primary id (Genbank id)
162 -version => version number
163 -namespace => the namespace for the accession
164 -authority => the authority for the namespace
165 -description => description text
166 -desc => alias for description
167 -alphabet => skip alphabet guess and set it to dna, rna or protein
168 -id => alias for display id
169 -is_circular => boolean to indicate that sequence is circular
170 -direct => boolean to directly set sequences. The next time -seq,
171 seq() or -ref_to_seq is use, the sequence will not be
172 validated. Be careful with this...
173 -nowarnonempty => boolean to avoid warning when sequence is empty
178 my ($class, @args) = @_;
179 my $self = $class->SUPER::new
(@args);
180 my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181 $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
183 $self->_rearrange([qw(SEQ
203 # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204 $self->{'_nowarnonempty'} = $nowarnonempty;
205 $self->{'_direct'} = $direct;
207 if( defined $id && defined $given_id ) {
208 if( $id ne $given_id ) {
209 $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
212 if( defined $given_id ) { $id = $given_id; }
214 # Bernd's idea: set ids now for more informative invalid sequence messages
215 defined $id && $self->display_id($id);
216 $acc && $self->accession_number($acc);
217 defined $pid && $self->primary_id($pid);
219 # Set alphabet now to avoid guessing it later, when sequence is set
220 $alphabet && $self->alphabet($alphabet);
222 # Set the length before the seq. If there is a seq, length will be updated later
223 $self->{'length'} = $len || 0;
225 # Set the sequence (but also alphabet and length)
227 $self->_set_seq_by_ref($ref_to_seq, $alphabet);
230 # Note: the sequence string may be empty
235 $desc && $self->desc($desc);
236 $description && $self->description($description);
237 $ns && $self->namespace($ns);
238 $auth && $self->authority($auth);
239 # Any variable that can have a value "0" must be tested with defined
240 # or it will fail to be added to the new object
241 defined($v) && $self->version($v);
242 defined($oid) && $self->object_id($oid);
243 defined($is_circular) && $self->is_circular($is_circular);
252 Usage : $string = $seqobj->seq();
253 Function: Get or set the sequence as a string of letters. The case of
254 the letters is left up to the implementer. Suggested cases are
255 upper case for proteins and lower case for DNA sequence (IUPAC
256 standard), but you should not rely on this. An error is thrown if
257 the sequence contains invalid characters: see validate_seq().
259 Args : - Optional new sequence value (a string) to set
260 - Optional alphabet (it is guessed by default)
265 my ($self, @args) = @_;
267 if( scalar @args == 0 ) {
268 return $self->{'seq'};
271 my ($seq_str, $alphabet) = @args;
273 $self->_set_seq_by_ref(\
$seq_str, $alphabet);
276 return $self->{'seq'};
280 sub _set_seq_by_ref
{
281 # Set a sequence by reference. A reference is used to avoid the cost of
282 # copying the sequence (which can be very large) between functions.
283 my ($self, $seq_str_ref, $alphabet) = @_;
285 # Validate sequence if sequence is not empty and we are not in direct mode
286 if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287 $self->validate_seq($$seq_str_ref, 1);
289 delete $self->{'_direct'}; # next sequence will have to be validated
291 # Record sequence length
292 my $len = CORE
::length($$seq_str_ref || '');
293 my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294 # Note: if the new seq is empty or undef, this is not considered a change
295 delete $self->{'_freeze_length'} if $is_changed_seq;
296 $self->{'length'} = $len if not exists $self->{'_freeze_length'};
299 $self->{'seq'} = $$seq_str_ref;
301 # Set or guess alphabet
303 # Alphabet specified, set it no matter what
304 $self->alphabet($alphabet);
305 } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
306 # If we changed a previous sequence to a new one or if there is no
307 # alphabet yet at all, we need to guess the (possibly new) alphabet
308 $self->_guess_alphabet();
309 } # else (seq not changed and alphabet was defined) do nothing
318 Usage : if(! $seqobj->validate_seq($seq_str) ) {
319 print "sequence $seq_str is not valid for an object of
320 alphabet ",$seqobj->alphabet, "\n";
322 Function: Test that the given sequence is valid, i.e. contains only valid
323 characters. The allowed characters are all letters (A-Z) and '-','.',
324 '*','?','=' and '~'. Spaces are not valid. Note that this
325 implementation does not take alphabet() into account and that empty
326 sequences are considered valid.
327 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328 Args : - Sequence string to be validated
329 - Boolean to optionally throw an error if the sequence is invalid
334 my ($self, $seqstr, $throw) = @_;
335 if ( (defined $seqstr ) &&
336 ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
338 $self->throw("Failed validation of sequence '".(defined($self->id) ||
339 '[unidentified sequence]')."'. Invalid characters were: " .
340 join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
351 Usage : $substring = $seqobj->subseq(10,40);
352 $substring = $seqobj->subseq(10,40,'nogap');
353 $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354 $substring = $seqobj->subseq($location_obj);
355 $substring = $seqobj->subseq($location_obj, -nogap => 1);
356 Function: Return the subseq from start to end, where the first sequence
357 character has coordinate 1 number is inclusive, ie 1-2 are the
358 first two characters of the sequence. The given start coordinate
359 has to be larger than the end, even if the sequence is circular.
361 Args : integer for start position
362 integer for end position
364 Bio::LocationI location for subseq (strand honored)
365 Specify -NOGAP=>1 to return subseq with gap characters removed
366 Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367 with $new_subseq in the sequence object
374 my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
377 REPLACE_WITH)], @args);
379 # If -replace_with is specified, validate the replacement sequence
380 if (defined $replace) {
381 $self->validate_seq( $replace ) ||
382 $self->throw("Replacement sequence does not look valid");
385 if( ref($start) && $start->isa('Bio::LocationI') ) {
389 # For Split objects if Guide Strand is negative,
390 # pass the sublocations in reverse
392 if ($loc->isa('Bio::Location::SplitLocationI')) {
393 # guide_strand can return undef, so don't compare directly
394 # to avoid 'uninitialized value' warning
395 my $guide_strand = defined ($loc->guide_strand) ?
($loc->guide_strand) : 0;
396 $order = ($guide_strand == -1) ?
-1 : 0;
398 # Reversing order using ->each_Location(-1) does not work well for
399 # cut by origin-splits (like "complement(join(1900..END,START..50))"),
400 # so use "reverse" instead
401 my @sublocs = ($order == -1) ?
reverse $loc->each_Location(): $loc->each_Location;
402 foreach my $subloc (@sublocs) {
403 my $piece = $self->subseq(-start
=> $subloc->start(),
404 -end
=> $subloc->end(),
405 -replace_with
=> $replace,
407 $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
409 # strand can return undef, so don't compare directly
410 # to avoid 'uninitialized value' warning
411 my $strand = defined ($subloc->strand) ?
($subloc->strand) : 0;
413 $piece = $self->_revcom_from_string($piece, $self->alphabet);
418 } elsif( defined $start && defined $end ) {
420 $self->throw("Bad start,end parameters. Start [$start] has to be ".
421 "less than end [$end]");
424 $self->throw("Bad start parameter ($start). Start must be positive.");
427 # Remove one from start, and then length is end-start
431 if (defined $replace) {
432 $seqstr = substr $self->{seq
}, $start, $end-$start, $replace;
434 $seqstr = substr $self->{seq
}, $start, $end-$start;
438 if ($end > $self->length) {
439 if ($self->is_circular) {
441 my $end = $end - $self->length;
444 if (defined $replace) {
445 $appendstr = substr $self->{seq
}, $start, $end-$start, $replace;
447 $appendstr = substr $self->{seq
}, $start, $end-$start;
450 $seqstr .= $appendstr;
452 $self->throw("Bad end parameter ($end). End must be less than ".
453 "the total length of sequence (total=".$self->length.")")
457 $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
461 $self->warn("Incorrect parameters to subseq - must be two integers or ".
462 "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
471 Usage : $len = $seqobj->length();
472 Function: Get the stored length of the sequence in number of symbols (bases
473 or amino acids). In some circumstances, you can also set this attribute:
475 1. For empty sequences, you can set the length to anything you want:
476 my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477 my $len = $seqobj->len; # 123
478 2. To save memory when using very long sequences, you can set the
479 length of the sequence to the length of the sequence (and nothing
481 my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482 # process $seqobj... then after you're done with it
483 $seqobj->length($seqobj->length);
484 $seqobj->seq(undef); # free memory!
485 my $len = $seqobj->len; # 1 Mbp
487 Note that if you set seq() to a value other than undef at any time,
488 the length attribute will be reset.
489 Returns : integer representing the length of the sequence.
490 Args : Optionally, the value on set
495 my ($self, $val) = @_;
497 my $len = $self->{'length'};
498 if ($len && ($len != $val)) {
499 $self->throw("Can not set the length to $val, current length value is $len");
501 $self->{'length'} = $val;
502 $self->{'_freeze_length'} = undef;
504 return $self->{'length'};
510 Title : display_id or display_name
511 Usage : $id_string = $seqobj->display_id();
512 Function: Get or set the display id, aka the common name of the sequence object.
514 The semantics of this is that it is the most likely string to
515 be used as an identifier of the sequence, and likely to have
516 "human" readability. The id is equivalent to the ID field of
517 the GenBank/EMBL databanks and the id field of the
518 Swissprot/sptrembl database. In fasta format, the >(\S+) is
519 presumed to be the id, though some people overload the id to
520 embed other information. Bioperl does not use any embedded
521 information in the ID field, and people are encouraged to use
522 other mechanisms (accession field for example, or extending
523 the sequence object) to solve this.
525 With the new Bio::DescribeableI interface, display_name aliases
527 Returns : A string for the display ID
528 Args : Optional string for the display ID to set
533 my ($self, $value) = @_;
534 if( defined $value) {
535 $self->{'display_id'} = $value;
537 return $self->{'display_id'};
541 =head2 accession_number
543 Title : accession_number or object_id
544 Usage : $unique_key = $seqobj->accession_number;
545 Function: Returns the unique biological id for a sequence, commonly
546 called the accession_number. For sequences from established
547 databases, the implementors should try to use the correct
548 accession number. Notice that primary_id() provides the
549 unique id for the implementation, allowing multiple objects
550 to have the same accession number in a particular implementation.
552 For sequences with no accession number, this method should
555 [Note this method name is likely to change in 1.3]
557 With the new Bio::IdentifiableI interface, this is aliased
560 Args : A string (optional) for setting
564 sub accession_number
{
565 my( $self, $acc ) = @_;
567 $self->{'accession_number'} = $acc;
569 $acc = $self->{'accession_number'};
570 $acc = 'unknown' unless defined $acc;
579 Usage : $unique_key = $seqobj->primary_id;
580 Function: Returns the unique id for this object in this
581 implementation. This allows implementations to manage their
582 own object ids in a way the implementation can control
583 clients can expect one id to map to one object.
585 For sequences with no natural primary id, this method
586 should return a stringified memory location.
588 Args : A string (optional, for setting)
596 $self->{'primary_id'} = shift;
598 if( ! defined($self->{'primary_id'}) ) {
601 return $self->{'primary_id'};
608 Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609 Function: Get/set the alphabet of sequence, one of
610 'dna', 'rna' or 'protein'. This is case sensitive.
612 This is not called <type> because this would cause
613 upgrade problems from the 0.5 and earlier Seq objects.
614 Returns : a string either 'dna','rna','protein'. NB - the object must
615 make a call of the type - if there is no alphabet specified it
617 Args : optional string to set : 'dna' | 'rna' | 'protein'
623 my ($self,$value) = @_;
624 if (defined $value) {
626 unless ( $valid_type{$value} ) {
627 $self->throw("Alphabet '$value' is not a valid alphabet (".
628 join(',', map "'$_'", sort keys %valid_type) .") lowercase");
630 $self->{'alphabet'} = $value;
632 return $self->{'alphabet'};
638 Title : desc or description
639 Usage : $seqobj->desc($newval);
640 Function: Get/set description of the sequence.
642 'description' is an alias for this for compliance with the
643 Bio::DescribeableI interface.
644 Returns : value of desc (a string)
645 Args : newvalue (a string or undef, optional)
653 return $self->{'desc'} = shift if @_;
654 return $self->{'desc'};
679 Usage : $id = $seqobj->id();
680 Function: This is mapped on display_id
688 return shift->display_id(@_);
695 Usage : if( $seqobj->is_circular) { # Do something }
696 Function: Returns true if the molecule is circular
697 Returns : Boolean value
704 return $self->{'is_circular'} = shift if @_;
705 return $self->{'is_circular'};
709 =head1 Methods for Bio::IdentifiableI compliance
714 Usage : $string = $seqobj->object_id();
715 Function: Get or set a string which represents the stable primary identifier
716 in this namespace of this object. For DNA sequences this
717 is its accession_number, similarly for protein sequences.
719 This is aliased to accession_number().
721 Args : Optional object ID to set.
726 return shift->accession_number(@_);
733 Usage : $version = $seqobj->version();
734 Function: Get or set a number which differentiates between versions of
735 the same object. Higher numbers are considered to be
736 later and more relevant, but a single object described
737 the same identifier should represent the same concept.
739 Args : Optional version to set.
744 my ($self,$value) = @_;
745 if( defined $value) {
746 $self->{'_version'} = $value;
748 return $self->{'_version'};
755 Usage : $authority = $seqobj->authority();
756 Function: Get or set a string which represents the organisation which
757 granted the namespace, written as the DNS name of the
758 organisation (eg, wormbase.org).
760 Args : Optional authority to set.
765 my ($self, $value) = @_;
766 if( defined $value) {
767 $self->{'authority'} = $value;
769 return $self->{'authority'};
776 Usage : $string = $seqobj->namespace();
777 Function: Get or set a string representing the name space this identifier
778 is valid in, often the database name or the name describing the
781 Args : Optional namespace to set.
786 my ($self,$value) = @_;
787 if( defined $value) {
788 $self->{'namespace'} = $value;
790 return $self->{'namespace'} || "";
794 =head1 Methods for Bio::DescribableI compliance
796 This comprises of display_name and description.
801 Usage : $string = $seqobj->display_name();
802 Function: Get or set a string which is what should be displayed to the user.
803 The string should have no spaces (ideally, though a cautious
804 user of this interface would not assume this) and should be
805 less than thirty characters (though again, double checking
806 this is a good idea).
808 This is aliased to display_id().
809 Returns : A string for the display name
810 Args : Optional string for the display name to set.
815 return shift->display_id(@_);
822 Usage : $string = $seqobj->description();
823 Function: Get or set a text string suitable for displaying to the user a
824 description. This string is likely to have spaces, but
825 should not have any newlines or formatting - just plain
826 text. The string should not be greater than 255 characters
827 and clients can feel justified at truncating strings at 255
828 characters for the purposes of display.
830 This is aliased to desc().
831 Returns : A string for the description
832 Args : Optional string for the description to set.
837 return shift->desc(@_);
841 =head1 Methods Inherited from Bio::PrimarySeqI
843 These methods are available on Bio::PrimarySeq, although they are
844 actually implemented on Bio::PrimarySeqI
849 Usage : $rev = $seqobj->revcom();
850 Function: Produces a new Bio::SeqI implementing object which
851 is the reversed complement of the sequence. For protein
852 sequences this throws an exception of
853 "Sequence is a protein. Cannot revcom".
855 The id is the same id as the original sequence, and the
856 accession number is also identical. If someone wants to
857 track that this sequence has be reversed, it needs to
858 define its own extensions.
860 To do an inplace edit of an object you can go:
862 $seqobj = $seqobj->revcom();
864 This of course, causes Perl to handle the garbage
865 collection of the old object, but it is roughly speaking as
866 efficient as an inplace edit.
867 Returns : A new (fresh) Bio::SeqI object
873 Usage : $subseq = $myseq->trunc(10,100);
874 Function: Provides a truncation of a sequence,
875 Returns : A fresh Bio::SeqI implementing object.
876 Args : Numbers for the start and end positions
878 =head1 Internal methods
880 These are internal methods to PrimarySeq
882 =head2 _guess_alphabet
884 Title : _guess_alphabet
886 Function: Automatically guess and set the type of sequence: dna, rna, protein
887 or '' if the sequence was empty. This method first removes dots (.),
888 dashes (-) and question marks (?) before guessing the alphabet
889 using the IUPAC conventions for ambiguous residues. Since the DNA and
890 RNA characters are also valid characters for proteins, there is
891 no foolproof way of determining the right alphabet. This is our best
893 Returns : string 'dna', 'rna', 'protein' or ''.
898 sub _guess_alphabet
{
901 my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902 # Set alphabet unless it is unknown
903 $self->alphabet($alphabet) if $alphabet;
908 sub _guess_alphabet_from_string
{
909 # Get the alphabet from a sequence string
910 my ($self, $str, $nowarnonempty) = @_;
912 $nowarnonempty = 0 if not defined $nowarnonempty;
914 # Remove chars that clearly don't denote nucleic or amino acids
917 # Check for sequences without valid letters
919 my $total = CORE
::length($str);
921 if (not $nowarnonempty) {
922 $self->warn("Got a sequence without letters. Could not guess alphabet");
927 # Determine alphabet now
928 if (not defined $alphabet) {
929 if ($str =~ m/[EFIJLOPQXZ]/i) {
930 # Start with a safe method to find proteins.
931 # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932 $alphabet = 'protein';
934 # Alphabet is unsure, could still be DNA, RNA or protein
935 # DNA and RNA contain mostly A, T, U, G, C and N, but the other
936 # letters they use are also among the 15 valid letters that a
937 # protein sequence can contain at this stage. Make our best guess
938 # based on sequence composition. If it contains over 70% of ACGTUN,
939 # it is likely nucleic.
940 if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941 if ( $str =~ m/U/i ) {
947 $alphabet = 'protein';
956 ############################################################################
957 # aliases due to name changes or to compensate for our lack of consistency #
958 ############################################################################
963 $self->warn(ref($self)."::accession is deprecated, ".
964 "use accession_number() instead");
965 return $self->accession_number(@_);