Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / LocatableSeq.pm
blobfaff9cbee53cb4c30d8dbd1d412e5ff50806bb94
2 # BioPerl module for Bio::LocatableSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
17 that can be projected into a MSA or have coordinates relative to
18 another seq.
20 =head1 SYNOPSIS
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
24 -id => "seq1",
25 -start => 1,
26 -end => 7);
28 # a normal sequence object
29 $locseq->seq();
30 $locseq->id();
32 # has start,end points
33 $locseq->start();
34 $locseq->end();
36 # inherits off RangeI, so range operations possible
38 =head1 DESCRIPTION
40 The LocatableSeq sequence object was developed mainly because the SimpleAlign
41 object requires this functionality, and in the rewrite of the Sequence object we
42 had to decide what to do with this.
44 It is, to be honest, not well integrated with the rest of bioperl. For example,
45 the trunc() function does not return a LocatableSeq object, as some might have
46 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
47 inherited from Bio::RangeI and is not stored in a Bio::Location.
49 There are all sorts of nasty gotcha's about interactions between coordinate
50 systems when these sort of objects are used. Some mapping now occurs to deal
51 with HSP data, however it can probably be integrated in better and most methods
52 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
53 trunc(), etc.) do not behave as expected and must be used with care. Due to this,
54 LocatableSeq functionality is to be refactored in a future BioPerl release.
55 However, for alignment functionality it works adequately for the time being.
57 If you do not need alignment functionality, L<Bio::SeqfeatureI>-implementing
58 modules may be a suitable alternative to L<Bio::LocatableSeq>. For example,
59 L<Bio::SeqFeature::Generic> and L<Bio::SeqFeature::Lite> provide methods to
60 attach a sequence to a specific region of a parent sequence and to set other
61 useful attributes.
63 =head1 FEEDBACK
65 =head2 Mailing Lists
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to one
69 of the Bioperl mailing lists. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 =head2 Support
76 Please direct usage questions or support issues to the mailing list:
78 I<bioperl-l@bioperl.org>
80 rather than to the module maintainer directly. Many experienced and
81 reponsive experts will be able look at the problem and quickly
82 address it. Please include a thorough description of the problem
83 with code and data examples if at all possible.
85 =head2 Reporting Bugs
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 the bugs and their resolution. Bug reports can be submitted via the
89 web:
91 https://github.com/bioperl/bioperl-live/issues
93 =head1 APPENDIX
95 The rest of the documentation details each of the object
96 methods. Internal methods are usually preceded with a _
98 =cut
102 package Bio::LocatableSeq;
104 use strict;
106 use Bio::Location::Simple;
107 use Bio::Location::Fuzzy;
108 use vars qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
110 # The following global variables contain symbols used to represent gaps,
111 # frameshifts, residues, and other valid symbols. These are set at compile-time;
112 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
113 # LocatableSeq.t)
115 $GAP_SYMBOLS = '\-\.=~';
116 $FRAMESHIFT_SYMBOLS = '\\\/';
117 $OTHER_SYMBOLS = '\?';
118 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
119 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
121 use base qw(Bio::PrimarySeq Bio::RangeI);
124 sub new {
125 my ($class, @args) = @_;
126 my $self = $class->SUPER::new(@args);
128 my ($start,$end,$strand, $mapping, $fs, $nse) =
129 $self->_rearrange( [qw(START
131 STRAND
132 MAPPING
133 FRAMESHIFTS
134 FORCE_NSE
136 @args);
138 $mapping ||= [1,1];
139 $self->mapping($mapping);
140 $nse || 0;
141 $self->force_nse($nse);
142 defined $fs && $self->frameshifts($fs);
143 defined $start && $self->start($start);
144 defined $end && $self->end($end);
145 defined $strand && $self->strand($strand);
147 return $self; # success - we hope!
151 =head2 start
153 Title : start
154 Usage : $obj->start($newval)
155 Function: Get/set the 1-based start position of this sequence in the original
156 sequence. '0' means before the original sequence starts.
157 Returns : value of start
158 Args : newvalue (optional)
160 =cut
162 sub start {
163 my $self = shift;
164 if( @_ ) {
165 my $value = shift;
166 $self->{'start'} = $value;
168 return $self->{'start'} if defined $self->{'start'};
169 return 1 if $self->seq;
170 return;
174 =head2 end
176 Title : end
177 Usage : $obj->end($newval)
178 Function: Get/set the 1-based end position of this sequence in the original
179 sequence. '0' means before the original sequence starts.
180 Returns : value of end
181 Args : newvalue (optional)
182 Note : although this is a get/set, it checks passed values against the
183 calculated end point ( derived from the sequence and based on
184 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
185 it will warn and set the proper value. Probably best used for
186 debugging proper sequence calculations.
188 =cut
190 sub end {
191 my $self = shift;
192 if( @_ ) {
193 my $value = shift;
194 my $st = $self->start;
195 # start of 0 usually means the sequence is all gaps but maps to
196 # other sequences in an alignment
197 if ($self->seq && $st != 0 ) {
198 my $len = $self->_ungapped_len;
199 my $calend = $st + $len - 1;
200 my $id = $self->id || 'unknown';
201 if ($calend != $value) {
202 $self->warn("In sequence $id residue count gives end value ".
203 "$calend. \nOverriding value [$value] with value $calend for ".
204 "Bio::LocatableSeq::end().\n".$self->seq);
205 $value = $calend;
208 $self->{'end'} = $value;
211 if (defined $self->{'end'}) {
212 return $self->{'end'}
213 } elsif ( my $len = $self->_ungapped_len) {
214 return $len + $self->start - 1;
215 } else {
216 return;
221 # changed 08.10.26 to return ungapped length, not the calculated end
222 # of the sequence
223 sub _ungapped_len {
224 my $self = shift;
225 return unless my $string = $self->seq;
226 my ($map_res, $map_coord) = $self->mapping;
227 my $offset = 0;
228 if (my %data = $self->frameshifts) {
229 map {$offset += $_} values %data;
231 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
232 return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
235 #sub length {
236 # my $self = shift;
237 # return unless my $string = $self->seq;
238 # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
239 # return CORE::length($string);
243 =head2 strand
245 Title : strand
246 Usage : $obj->strand($newval)
247 Function: return or set the strandedness
248 Returns : the value of the strandedness (-1, 0 or 1)
249 Args : the value of the strandedness (-1, 0 or 1)
251 =cut
253 sub strand {
254 my $self = shift;
255 if( @_ ) {
256 my $value = shift;
257 $self->{'strand'} = $value;
259 return $self->{'strand'};
263 =head2 mapping
265 Title : mapping
266 Usage : $obj->mapping($newval)
267 Function: return or set the mapping indices (indicates # symbols/positions in
268 the source string mapping to # of coordinate positions)
269 Returns : two-element array (# symbols => # coordinate pos)
270 Args : two elements (# symbols => # coordinate pos); this can also be
271 passed in as an array reference of the two elements (as might be
272 passed upon Bio::LocatableSeq instantiation, for instance).
274 =cut
276 sub mapping {
277 my $self = shift;
278 if( @_ ) {
279 my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
280 $self->throw("Must pass two values (# residues mapped to # positions)")
281 if @mapping != 2;
282 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
283 $self->throw("Mapping values other than 1 or 3 are not currently supported")
285 $self->{'_mapping'} = \@mapping;
287 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
288 return @{ $self->{'_mapping'} };
292 =head2 frameshifts
294 Title : frameshifts
295 Usage : $obj->frameshifts($newval)
296 Function: get/set the frameshift hash, which contains sequence positions as
297 keys and the shift (-2, -1, 1, 2) as the value
298 Returns : hash
299 Args : hash or hash reference
301 =cut
303 sub frameshifts {
304 my $self = shift;
305 if( @_ ) {
306 if (ref $_[0] eq 'HASH') {
307 $self->{_frameshifts} = $_[0];
308 } else {
309 # assume this is a full list to be converted to a hash
310 $self->{_frameshifts} = \%{@_} # coerce into hash ref
313 (defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ?
314 return %{$self->{_frameshifts}} : return ();
318 =head2 get_nse
320 Title : get_nse
321 Usage :
322 Function: read-only name of form id/start-end
323 Example :
324 Returns :
325 Args :
327 =cut
329 sub get_nse {
330 my ($self,$char1,$char2) = @_;
332 $char1 ||= "/";
333 $char2 ||= "-";
335 my ($id, $st, $end, $strand) = ($self->id(), $self->start(),
336 $self->end(), $self->strand || 0);
338 if ($self->force_nse) {
339 $id ||= '';
340 $st ||= 0;
341 $end ||= 0;
344 $self->throw("Attribute id not set") unless defined($id);
345 $self->throw("Attribute start not set") unless defined($st);
346 $self->throw("Attribute end not set") unless defined($end);
348 if ($strand && $strand == -1) {
349 ($st, $end) = ($end, $st);
352 #Stockholm Rfam includes version if present so it is optional
353 my $v = $self->version ? '.'.$self->version : '';
354 return join('',$id, $v, $char1, $st, $char2, $end);
358 =head2 force_nse
360 Title : force_nse
361 Usage : $ls->force_nse()
362 Function: Boolean which forces get_nse() to build an NSE, regardless
363 of whether id(), start(), or end() is set
364 Returns : Boolean value
365 Args : (optional) Boolean (1 or 0)
366 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
367 respectively
369 =cut
371 sub force_nse {
372 my ($self, $flag) = @_;
373 if (defined $flag) {
374 $flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
376 return $self->{'_force_nse'};
380 =head2 num_gaps
382 Title : num_gaps
383 Usage :$self->num_gaps('.')
384 Function:Gets number of gaps in the sequence. The count excludes
385 leading or trailing gap characters.
387 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
388 these, '.' and '-' are counted as gap characters unless an
389 optional argument specifies one of them.
391 Returns : number of internal gaps in the sequence.
392 Args : a gap character (optional)
393 Status : Stable
394 Note : replaces no_gaps
396 =cut
398 sub num_gaps {
399 my ($self,$char) = @_;
400 my ($seq, $count) = (undef, 0);
402 # default gap characters
403 $char ||= $GAP_SYMBOLS;
405 $self->warn("I hope you know what you are doing setting gap to [$char]")
406 unless $char =~ /[$GAP_SYMBOLS]/;
408 $seq = $self->seq;
409 return 0 unless $seq; # empty sequence does not have gaps
411 $seq =~ s/^([$char]+)//;
412 $seq =~ s/([$char]+)$//;
413 while ( $seq =~ /[$char]+/g ) {
414 $count++;
417 return $count;
421 =head2 column_from_residue_number
423 Title : column_from_residue_number
424 Usage : $col = $seq->column_from_residue_number($resnumber)
425 Function:
427 This function gives the position in the alignment
428 (i.e. column number) of the given residue number in the
429 sequence. For example, for the sequence
431 Seq1/91-97 AC..DEF.GH
433 column_from_residue_number(94) returns 6.
435 An exception is thrown if the residue number would lie
436 outside the length of the aligment
437 (e.g. column_from_residue_number( "Seq2", 22 )
439 Returns : A column number for the position of the
440 given residue in the given sequence (1 = first column)
441 Args : A residue number in the whole sequence (not just that
442 segment of it in the alignment)
444 =cut
446 sub column_from_residue_number {
447 my ($self, $resnumber) = @_;
449 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
450 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
452 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
453 my @chunks;
454 my $column_incr;
455 my $current_column;
456 my $current_residue = $self->start - 1;
457 my $seq = $self->seq;
458 my $strand = $self->strand || 0;
460 if ($strand == -1) {
461 #@chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
462 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
463 $column_incr = -1;
464 $current_column = (CORE::length $seq) + 1;
466 else {
467 #@chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
468 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
469 $column_incr = 1;
470 $current_column = 0;
473 while (my $chunk = shift @chunks) {
474 #if ($chunk =~ m|^[\.\-]|o) {
475 if ($chunk =~ m|^[$GAP_SYMBOLS]|o) {
476 $current_column += $column_incr * CORE::length($chunk);
478 else {
479 if ($current_residue + CORE::length($chunk) < $resnumber) {
480 $current_column += $column_incr * CORE::length($chunk);
481 $current_residue += CORE::length($chunk);
483 else {
484 if ($strand == -1) {
485 $current_column -= $resnumber - $current_residue;
487 else {
488 $current_column += $resnumber - $current_residue;
490 return $current_column;
496 $self->throw("Could not find residue number $resnumber");
501 =head2 location_from_column
503 Title : location_from_column
504 Usage : $loc = $ali->location_from_column($column_number)
505 Function:
507 This function gives the residue number for a given position
508 in the alignment (i.e. column number) of the given. Gaps
509 complicate this process and force the output to be a
510 L<Bio::Location::Simple> where values can be undefined.
511 For example, for the sequence:
513 Seq/91-96 .AC..DEF.G.
515 location_from_column( 3 ) position 92
516 location_from_column( 4 ) position 92^93
517 location_from_column( 9 ) position 95^96
518 location_from_column( 1 ) position undef
520 An exact position returns a Bio::Location::Simple object
521 where where location_type() returns 'EXACT', if a position
522 is between bases location_type() returns 'IN-BETWEEN'.
523 Column before the first residue returns undef. Note that if
524 the position is after the last residue in the alignment,
525 that there is no guarantee that the original sequence has
526 residues after that position.
528 An exception is thrown if the column number is not within
529 the sequence.
531 Returns : Bio::Location::Simple or undef
532 Args : A column number
533 Throws : If column is not within the sequence
535 See L<Bio::Location::Simple> for more.
537 =cut
539 sub location_from_column {
540 my ($self, $column) = @_;
542 $self->throw("Column number has to be a positive integer, not [$column]")
543 unless $column =~ /^\d+$/ and $column > 0;
544 $self->throw("Column number [$column] is larger than".
545 " sequence length [". $self->length. "]")
546 unless $column <= $self->length;
548 my ($loc);
549 my $s = $self->subseq(1,$column);
550 $s =~ s/[^a-zA-Z\*]//g;
552 my $pos = CORE::length $s;
554 my $start = $self->start || 0 ;
555 my $strand = $self->strand() || 1;
556 my $relative_pos = ($strand == -1)
557 ? ($self->end - $pos + 1)
558 : ($pos + $start - 1);
559 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
560 $loc = Bio::Location::Simple->new
561 (-start => $relative_pos,
562 -end => $relative_pos,
563 -strand => 1,
565 } elsif ($pos == 0 and $self->start == 1) {
566 } else {
567 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
568 if ($strand == -1) {
569 ($start,$end) = ($end,$start);
571 $loc = Bio::Location::Simple->new
572 (-start => $start,
573 -end => $end,
574 -strand => 1,
575 -location_type => 'IN-BETWEEN'
578 return $loc;
582 =head2 revcom
584 Title : revcom
585 Usage : $rev = $seq->revcom()
586 Function: Produces a new Bio::LocatableSeq object which
587 has the reversed complement of the sequence. For protein
588 sequences this throws an exception of "Sequence is a
589 protein. Cannot revcom"
591 Returns : A new Bio::LocatableSeq object
592 Args : none
594 =cut
596 sub revcom {
597 my ($self) = @_;
598 # since we don't know whether sequences without 1 => 1 correlation can be
599 # revcom'd, kick back
600 if (grep {$_ != 1} $self->mapping) {
601 $self->warn('revcom() not supported for sequences with mapped values of > 1');
602 return;
604 my $new = $self->SUPER::revcom;
605 $new->strand($self->strand * -1) if $self->strand;
606 $new->start($self->start) if $self->start;
607 $new->end($self->end) if $self->end;
608 return $new;
612 =head2 trunc
614 Title : trunc
615 Usage : $subseq = $myseq->trunc(10,100);
616 Function: Provides a truncation of a sequence,
617 Returns : a fresh Bio::PrimarySeqI implementing object
618 Args : Two integers denoting first and last columns of the
619 sequence to be included into sub-sequence.
621 =cut
623 sub trunc {
624 my ($self, $start, $end) = @_;
625 my $new = $self->SUPER::trunc($start, $end);
626 $new->strand($self->strand);
628 # end will be automatically calculated
629 $start = $end if $self->strand && $self->strand == -1;
631 $start = $self->location_from_column($start);
632 $start ? ($start = $start->end) : ($start = 1);
633 $new->start($start) if $start;
635 return $new;
639 =head2 validate_seq
641 Title : validate_seq
642 Usage : if(! $seqobj->validate_seq($seq_str) ) {
643 print "sequence $seq_str is not valid for an object of
644 alphabet ",$seqobj->alphabet, "\n";
646 Function: Test that the given sequence is valid, i.e. contains only valid
647 characters. The allowed characters are all letters (A-Z) and '-','.',
648 '*','?','=' and '~'. Spaces are not valid. Note that this
649 implementation does not take alphabet() into account.
650 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
651 Args : - Sequence string to be validated
652 - Boolean to throw an error if the sequence is invalid
654 =cut
656 sub validate_seq {
657 my ($self, $seqstr, $throw) = @_;
658 $seqstr = '' if not defined $seqstr;
659 $throw = 0 if not defined $throw ; # 0 for backward compatibility
660 if ( (CORE::length $seqstr > 0 ) &&
661 ($seqstr !~ /^([$MATCHPATTERN]+)$/) ) {
662 if ($throw) {
663 $self->throw("Failed validation of sequence '".(defined($self->id) ||
664 '[unidentified sequence]')."'. Invalid characters were: " .
665 join('',($seqstr =~ /([^$MATCHPATTERN]+)/g)));
667 return 0;
669 return 1;