2 # BioPerl module for Bio::LocatableSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
17 that can be projected into a MSA or have coordinates relative to
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
28 # a normal sequence object
32 # has start,end points
36 # inherits off RangeI, so range operations possible
40 The LocatableSeq sequence object was developed mainly because the SimpleAlign
41 object requires this functionality, and in the rewrite of the Sequence object we
42 had to decide what to do with this.
44 It is, to be honest, not well integrated with the rest of bioperl. For example,
45 the trunc() function does not return a LocatableSeq object, as some might have
46 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
47 inherited from Bio::RangeI and is not stored in a Bio::Location.
49 There are all sorts of nasty gotcha's about interactions between coordinate
50 systems when these sort of objects are used. Some mapping now occurs to deal
51 with HSP data, however it can probably be integrated in better and most methods
52 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
53 trunc(), etc.) do not behave as expected and must be used with care. Due to this,
54 LocatableSeq functionality is to be refactored in a future BioPerl release.
55 However, for alignment functionality it works adequately for the time being.
57 If you do not need alignment functionality, L<Bio::SeqfeatureI>-implementing
58 modules may be a suitable alternative to L<Bio::LocatableSeq>. For example,
59 L<Bio::SeqFeature::Generic> and L<Bio::SeqFeature::Lite> provide methods to
60 attach a sequence to a specific region of a parent sequence and to set other
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to one
69 of the Bioperl mailing lists. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 Please direct usage questions or support issues to the mailing list:
78 I<bioperl-l@bioperl.org>
80 rather than to the module maintainer directly. Many experienced and
81 reponsive experts will be able look at the problem and quickly
82 address it. Please include a thorough description of the problem
83 with code and data examples if at all possible.
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 the bugs and their resolution. Bug reports can be submitted via the
91 https://github.com/bioperl/bioperl-live/issues
95 The rest of the documentation details each of the object
96 methods. Internal methods are usually preceded with a _
102 package Bio
::LocatableSeq
;
106 use Bio
::Location
::Simple
;
107 use Bio
::Location
::Fuzzy
;
108 use vars
qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
110 # The following global variables contain symbols used to represent gaps,
111 # frameshifts, residues, and other valid symbols. These are set at compile-time;
112 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
115 $GAP_SYMBOLS = '\-\.=~';
116 $FRAMESHIFT_SYMBOLS = '\\\/';
117 $OTHER_SYMBOLS = '\?';
118 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
119 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
121 use base qw(Bio::PrimarySeq Bio::RangeI);
125 my ($class, @args) = @_;
126 my $self = $class->SUPER::new
(@args);
128 my ($start,$end,$strand, $mapping, $fs, $nse) =
129 $self->_rearrange( [qw(START
139 $self->mapping($mapping);
141 $self->force_nse($nse);
142 defined $fs && $self->frameshifts($fs);
143 defined $start && $self->start($start);
144 defined $end && $self->end($end);
145 defined $strand && $self->strand($strand);
147 return $self; # success - we hope!
154 Usage : $obj->start($newval)
155 Function: Get/set the 1-based start position of this sequence in the original
156 sequence. '0' means before the original sequence starts.
157 Returns : value of start
158 Args : newvalue (optional)
166 $self->{'start'} = $value;
168 return $self->{'start'} if defined $self->{'start'};
169 return 1 if $self->seq;
177 Usage : $obj->end($newval)
178 Function: Get/set the 1-based end position of this sequence in the original
179 sequence. '0' means before the original sequence starts.
180 Returns : value of end
181 Args : newvalue (optional)
182 Note : although this is a get/set, it checks passed values against the
183 calculated end point ( derived from the sequence and based on
184 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
185 it will warn and set the proper value. Probably best used for
186 debugging proper sequence calculations.
194 my $st = $self->start;
195 # start of 0 usually means the sequence is all gaps but maps to
196 # other sequences in an alignment
197 if ($self->seq && $st != 0 ) {
198 my $len = $self->_ungapped_len;
199 my $calend = $st + $len - 1;
200 my $id = $self->id || 'unknown';
201 if ($calend != $value) {
202 $self->warn("In sequence $id residue count gives end value ".
203 "$calend. \nOverriding value [$value] with value $calend for ".
204 "Bio::LocatableSeq::end().\n".$self->seq);
208 $self->{'end'} = $value;
211 if (defined $self->{'end'}) {
212 return $self->{'end'}
213 } elsif ( my $len = $self->_ungapped_len) {
214 return $len + $self->start - 1;
221 # changed 08.10.26 to return ungapped length, not the calculated end
225 return unless my $string = $self->seq;
226 my ($map_res, $map_coord) = $self->mapping;
228 if (my %data = $self->frameshifts) {
229 map {$offset += $_} values %data;
231 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
232 return CORE
::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
237 # return unless my $string = $self->seq;
238 # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
239 # return CORE::length($string);
246 Usage : $obj->strand($newval)
247 Function: return or set the strandedness
248 Returns : the value of the strandedness (-1, 0 or 1)
249 Args : the value of the strandedness (-1, 0 or 1)
257 $self->{'strand'} = $value;
259 return $self->{'strand'};
266 Usage : $obj->mapping($newval)
267 Function: return or set the mapping indices (indicates # symbols/positions in
268 the source string mapping to # of coordinate positions)
269 Returns : two-element array (# symbols => # coordinate pos)
270 Args : two elements (# symbols => # coordinate pos); this can also be
271 passed in as an array reference of the two elements (as might be
272 passed upon Bio::LocatableSeq instantiation, for instance).
279 my @mapping = (ref($_[0]) eq 'ARRAY') ? @
{$_[0]} : @_;
280 $self->throw("Must pass two values (# residues mapped to # positions)")
282 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
283 $self->throw("Mapping values other than 1 or 3 are not currently supported")
285 $self->{'_mapping'} = \
@mapping;
287 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
288 return @
{ $self->{'_mapping'} };
295 Usage : $obj->frameshifts($newval)
296 Function: get/set the frameshift hash, which contains sequence positions as
297 keys and the shift (-2, -1, 1, 2) as the value
299 Args : hash or hash reference
306 if (ref $_[0] eq 'HASH') {
307 $self->{_frameshifts
} = $_[0];
309 # assume this is a full list to be converted to a hash
310 $self->{_frameshifts
} = \
%{@_} # coerce into hash ref
313 (defined $self->{_frameshifts
} && ref $self->{_frameshifts
} eq 'HASH') ?
314 return %{$self->{_frameshifts
}} : return ();
322 Function: read-only name of form id/start-end
330 my ($self,$char1,$char2) = @_;
335 my ($id, $st, $end, $strand) = ($self->id(), $self->start(),
336 $self->end(), $self->strand || 0);
338 if ($self->force_nse) {
344 $self->throw("Attribute id not set") unless defined($id);
345 $self->throw("Attribute start not set") unless defined($st);
346 $self->throw("Attribute end not set") unless defined($end);
348 if ($strand && $strand == -1) {
349 ($st, $end) = ($end, $st);
352 #Stockholm Rfam includes version if present so it is optional
353 my $v = $self->version ?
'.'.$self->version : '';
354 return join('',$id, $v, $char1, $st, $char2, $end);
361 Usage : $ls->force_nse()
362 Function: Boolean which forces get_nse() to build an NSE, regardless
363 of whether id(), start(), or end() is set
364 Returns : Boolean value
365 Args : (optional) Boolean (1 or 0)
366 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
372 my ($self, $flag) = @_;
374 $flag ?
(return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
376 return $self->{'_force_nse'};
383 Usage :$self->num_gaps('.')
384 Function:Gets number of gaps in the sequence. The count excludes
385 leading or trailing gap characters.
387 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
388 these, '.' and '-' are counted as gap characters unless an
389 optional argument specifies one of them.
391 Returns : number of internal gaps in the sequence.
392 Args : a gap character (optional)
394 Note : replaces no_gaps
399 my ($self,$char) = @_;
400 my ($seq, $count) = (undef, 0);
402 # default gap characters
403 $char ||= $GAP_SYMBOLS;
405 $self->warn("I hope you know what you are doing setting gap to [$char]")
406 unless $char =~ /[$GAP_SYMBOLS]/;
409 return 0 unless $seq; # empty sequence does not have gaps
411 $seq =~ s/^([$char]+)//;
412 $seq =~ s/([$char]+)$//;
413 while ( $seq =~ /[$char]+/g ) {
421 =head2 column_from_residue_number
423 Title : column_from_residue_number
424 Usage : $col = $seq->column_from_residue_number($resnumber)
427 This function gives the position in the alignment
428 (i.e. column number) of the given residue number in the
429 sequence. For example, for the sequence
431 Seq1/91-97 AC..DEF.GH
433 column_from_residue_number(94) returns 6.
435 An exception is thrown if the residue number would lie
436 outside the length of the aligment
437 (e.g. column_from_residue_number( "Seq2", 22 )
439 Returns : A column number for the position of the
440 given residue in the given sequence (1 = first column)
441 Args : A residue number in the whole sequence (not just that
442 segment of it in the alignment)
446 sub column_from_residue_number
{
447 my ($self, $resnumber) = @_;
449 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
450 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
452 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
456 my $current_residue = $self->start - 1;
457 my $seq = $self->seq;
458 my $strand = $self->strand || 0;
461 #@chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
462 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
464 $current_column = (CORE
::length $seq) + 1;
467 #@chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
468 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
473 while (my $chunk = shift @chunks) {
474 #if ($chunk =~ m|^[\.\-]|o) {
475 if ($chunk =~ m
|^[$GAP_SYMBOLS]|o
) {
476 $current_column += $column_incr * CORE
::length($chunk);
479 if ($current_residue + CORE
::length($chunk) < $resnumber) {
480 $current_column += $column_incr * CORE
::length($chunk);
481 $current_residue += CORE
::length($chunk);
485 $current_column -= $resnumber - $current_residue;
488 $current_column += $resnumber - $current_residue;
490 return $current_column;
496 $self->throw("Could not find residue number $resnumber");
501 =head2 location_from_column
503 Title : location_from_column
504 Usage : $loc = $ali->location_from_column($column_number)
507 This function gives the residue number for a given position
508 in the alignment (i.e. column number) of the given. Gaps
509 complicate this process and force the output to be a
510 L<Bio::Location::Simple> where values can be undefined.
511 For example, for the sequence:
513 Seq/91-96 .AC..DEF.G.
515 location_from_column( 3 ) position 92
516 location_from_column( 4 ) position 92^93
517 location_from_column( 9 ) position 95^96
518 location_from_column( 1 ) position undef
520 An exact position returns a Bio::Location::Simple object
521 where where location_type() returns 'EXACT', if a position
522 is between bases location_type() returns 'IN-BETWEEN'.
523 Column before the first residue returns undef. Note that if
524 the position is after the last residue in the alignment,
525 that there is no guarantee that the original sequence has
526 residues after that position.
528 An exception is thrown if the column number is not within
531 Returns : Bio::Location::Simple or undef
532 Args : A column number
533 Throws : If column is not within the sequence
535 See L<Bio::Location::Simple> for more.
539 sub location_from_column
{
540 my ($self, $column) = @_;
542 $self->throw("Column number has to be a positive integer, not [$column]")
543 unless $column =~ /^\d+$/ and $column > 0;
544 $self->throw("Column number [$column] is larger than".
545 " sequence length [". $self->length. "]")
546 unless $column <= $self->length;
549 my $s = $self->subseq(1,$column);
550 $s =~ s/[^a-zA-Z\*]//g;
552 my $pos = CORE
::length $s;
554 my $start = $self->start || 0 ;
555 my $strand = $self->strand() || 1;
556 my $relative_pos = ($strand == -1)
557 ?
($self->end - $pos + 1)
558 : ($pos + $start - 1);
559 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
560 $loc = Bio
::Location
::Simple
->new
561 (-start
=> $relative_pos,
562 -end
=> $relative_pos,
565 } elsif ($pos == 0 and $self->start == 1) {
567 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
569 ($start,$end) = ($end,$start);
571 $loc = Bio
::Location
::Simple
->new
575 -location_type
=> 'IN-BETWEEN'
585 Usage : $rev = $seq->revcom()
586 Function: Produces a new Bio::LocatableSeq object which
587 has the reversed complement of the sequence. For protein
588 sequences this throws an exception of "Sequence is a
589 protein. Cannot revcom"
591 Returns : A new Bio::LocatableSeq object
598 # since we don't know whether sequences without 1 => 1 correlation can be
599 # revcom'd, kick back
600 if (grep {$_ != 1} $self->mapping) {
601 $self->warn('revcom() not supported for sequences with mapped values of > 1');
604 my $new = $self->SUPER::revcom
;
605 $new->strand($self->strand * -1) if $self->strand;
606 $new->start($self->start) if $self->start;
607 $new->end($self->end) if $self->end;
615 Usage : $subseq = $myseq->trunc(10,100);
616 Function: Provides a truncation of a sequence,
617 Returns : a fresh Bio::PrimarySeqI implementing object
618 Args : Two integers denoting first and last columns of the
619 sequence to be included into sub-sequence.
624 my ($self, $start, $end) = @_;
625 my $new = $self->SUPER::trunc
($start, $end);
626 $new->strand($self->strand);
628 # end will be automatically calculated
629 $start = $end if $self->strand && $self->strand == -1;
631 $start = $self->location_from_column($start);
632 $start ?
($start = $start->end) : ($start = 1);
633 $new->start($start) if $start;
642 Usage : if(! $seqobj->validate_seq($seq_str) ) {
643 print "sequence $seq_str is not valid for an object of
644 alphabet ",$seqobj->alphabet, "\n";
646 Function: Test that the given sequence is valid, i.e. contains only valid
647 characters. The allowed characters are all letters (A-Z) and '-','.',
648 '*','?','=' and '~'. Spaces are not valid. Note that this
649 implementation does not take alphabet() into account.
650 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
651 Args : - Sequence string to be validated
652 - Boolean to throw an error if the sequence is invalid
657 my ($self, $seqstr, $throw) = @_;
658 $seqstr = '' if not defined $seqstr;
659 $throw = 0 if not defined $throw ; # 0 for backward compatibility
660 if ( (CORE
::length $seqstr > 0 ) &&
661 ($seqstr !~ /^([$MATCHPATTERN]+)$/) ) {
663 $self->throw("Failed validation of sequence '".(defined($self->id) ||
664 '[unidentified sequence]')."'. Invalid characters were: " .
665 join('',($seqstr =~ /([^$MATCHPATTERN]+)/g)));