Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / LocatableSeq.pm
blob2fff6491137c14036916e24355a92d639b3ab040
2 # BioPerl module for Bio::LocatableSeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::LocatableSeq - A Bio::PrimarySeq object with start/end points on it
17 that can be projected into a MSA or have coordinates relative to
18 another seq.
20 =head1 SYNOPSIS
22 use Bio::LocatableSeq;
23 my $seq = Bio::LocatableSeq->new(-seq => "CAGT-GGT",
24 -id => "seq1",
25 -start => 1,
26 -end => 7);
28 # a normal sequence object
29 $locseq->seq();
30 $locseq->id();
32 # has start,end points
33 $locseq->start();
34 $locseq->end();
36 # inherits off RangeI, so range operations possible
38 =head1 DESCRIPTION
40 The LocatableSeq sequence object was developed mainly because the SimpleAlign
41 object requires this functionality, and in the rewrite of the Sequence object we
42 had to decide what to do with this.
44 It is, to be honest, not well integrated with the rest of bioperl. For example,
45 the trunc() function does not return a LocatableSeq object, as some might have
46 thought. Also, the sequence is not a Bio::SeqI, so the location is simply
47 inherited from Bio::RangeI and is not stored in a Bio::Location.
49 There are all sorts of nasty gotcha's about interactions between coordinate
50 systems when these sort of objects are used. Some mapping now occurs to deal
51 with HSP data, however it can probably be integrated in better and most methods
52 do not implement it correctly yet. Also, several PrimarySeqI methods (subseq(),
53 trunc(), etc.) do not behave as expected and must be used with care. Due to this,
54 LocatableSeq functionality is to be refactored in a future BioPerl release.
55 However, for alignment functionality it works adequately for the time being.
57 If you do not need alignment functionality, L<Bio::SeqfeatureI>-implementing
58 modules may be a suitable alternative to L<Bio::LocatableSeq>. For example,
59 L<Bio::SeqFeature::Generic> and L<Bio::SeqFeature::Lite> provide methods to
60 attach a sequence to a specific region of a parent sequence and to set other
61 useful attributes.
63 =head1 FEEDBACK
65 =head2 Mailing Lists
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to one
69 of the Bioperl mailing lists. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 =head2 Support
76 Please direct usage questions or support issues to the mailing list:
78 I<bioperl-l@bioperl.org>
80 rather than to the module maintainer directly. Many experienced and
81 reponsive experts will be able look at the problem and quickly
82 address it. Please include a thorough description of the problem
83 with code and data examples if at all possible.
85 =head2 Reporting Bugs
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 the bugs and their resolution. Bug reports can be submitted via the
89 web:
91 https://github.com/bioperl/bioperl-live/issues
93 =head1 APPENDIX
95 The rest of the documentation details each of the object
96 methods. Internal methods are usually preceded with a _
98 =cut
102 package Bio::LocatableSeq;
103 use strict;
105 use Bio::Location::Simple;
106 use Bio::Location::Fuzzy;
107 use vars qw($GAP_SYMBOLS $OTHER_SYMBOLS $FRAMESHIFT_SYMBOLS $RESIDUE_SYMBOLS $MATCHPATTERN);
109 # The following global variables contain symbols used to represent gaps,
110 # frameshifts, residues, and other valid symbols. These are set at compile-time;
111 # expect scoping errors when using 'local' and resetting $MATCHPATTERN (see
112 # LocatableSeq.t)
114 $GAP_SYMBOLS = '\-\.=~';
115 $FRAMESHIFT_SYMBOLS = '\\\/';
116 $OTHER_SYMBOLS = '\?';
117 $RESIDUE_SYMBOLS = '0-9A-Za-z\*';
118 $MATCHPATTERN = $RESIDUE_SYMBOLS.$GAP_SYMBOLS.$FRAMESHIFT_SYMBOLS.$OTHER_SYMBOLS;
120 use base qw(Bio::PrimarySeq Bio::RangeI);
123 sub new {
124 my ($class, @args) = @_;
125 my $self = $class->SUPER::new(@args);
127 my ($start,$end,$strand, $mapping, $fs, $nse) =
128 $self->_rearrange( [qw(START
130 STRAND
131 MAPPING
132 FRAMESHIFTS
133 FORCE_NSE
135 @args);
137 $mapping ||= [1,1];
138 $self->mapping($mapping);
139 $nse || 0;
140 $self->force_nse($nse);
141 defined $fs && $self->frameshifts($fs);
142 defined $start && $self->start($start);
143 defined $end && $self->end($end);
144 defined $strand && $self->strand($strand);
146 return $self; # success - we hope!
150 =head2 start
152 Title : start
153 Usage : $obj->start($newval)
154 Function: Get/set the 1-based start position of this sequence in the original
155 sequence. '0' means before the original sequence starts.
156 Returns : value of start
157 Args : newvalue (optional)
159 =cut
161 sub start {
162 my $self = shift;
163 if( @_ ) {
164 my $value = shift;
165 $self->{'start'} = $value;
167 return $self->{'start'} if defined $self->{'start'};
168 return 1 if $self->seq;
169 return;
173 =head2 end
175 Title : end
176 Usage : $obj->end($newval)
177 Function: Get/set the 1-based end position of this sequence in the original
178 sequence. '0' means before the original sequence starts.
179 Returns : value of end
180 Args : newvalue (optional)
181 Note : although this is a get/set, it checks passed values against the
182 calculated end point ( derived from the sequence and based on
183 $GAP_SYMBOLS and possible frameshifts() ). If there is no match,
184 it will warn and set the proper value. Probably best used for
185 debugging proper sequence calculations.
187 =cut
189 sub end {
190 my $self = shift;
191 if( @_ ) {
192 my $value = shift;
193 my $st = $self->start;
194 # start of 0 usually means the sequence is all gaps but maps to
195 # other sequences in an alignment
196 if ($self->seq && $st != 0 ) {
197 my $len = $self->_ungapped_len;
198 my $calend = $st + $len - 1;
199 my $id = $self->id || 'unknown';
200 if ($calend != $value) {
201 $self->warn("In sequence $id residue count gives end value ".
202 "$calend. \nOverriding value [$value] with value $calend for ".
203 "Bio::LocatableSeq::end().\n".$self->seq);
204 $value = $calend;
207 $self->{'end'} = $value;
210 if (defined $self->{'end'}) {
211 return $self->{'end'}
212 } elsif ( my $len = $self->_ungapped_len) {
213 return $len + $self->start - 1;
214 } else {
215 return;
220 # changed 08.10.26 to return ungapped length, not the calculated end
221 # of the sequence
222 sub _ungapped_len {
223 my $self = shift;
224 return unless my $string = $self->seq;
225 my ($map_res, $map_coord) = $self->mapping;
226 my $offset = 0;
227 if (my %data = $self->frameshifts) {
228 map {$offset += $_} values %data;
230 $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
231 return CORE::length($string)/($map_res/$map_coord) + $offset/($map_coord/$map_res);
234 #sub length {
235 # my $self = shift;
236 # return unless my $string = $self->seq;
237 # $string =~ s{[$GAP_SYMBOLS$FRAMESHIFT_SYMBOLS]+}{}g;
238 # return CORE::length($string);
242 =head2 strand
244 Title : strand
245 Usage : $obj->strand($newval)
246 Function: return or set the strandedness
247 Returns : the value of the strandedness (-1, 0 or 1)
248 Args : the value of the strandedness (-1, 0 or 1)
250 =cut
252 sub strand {
253 my $self = shift;
254 if( @_ ) {
255 my $value = shift;
256 $self->{'strand'} = $value;
258 return $self->{'strand'};
262 =head2 mapping
264 Title : mapping
265 Usage : $obj->mapping($newval)
266 Function: return or set the mapping indices (indicates # symbols/positions in
267 the source string mapping to # of coordinate positions)
268 Returns : two-element array (# symbols => # coordinate pos)
269 Args : two elements (# symbols => # coordinate pos); this can also be
270 passed in as an array reference of the two elements (as might be
271 passed upon Bio::LocatableSeq instantiation, for instance).
273 =cut
275 sub mapping {
276 my $self = shift;
277 if( @_ ) {
278 my @mapping = (ref($_[0]) eq 'ARRAY') ? @{$_[0]} : @_;
279 $self->throw("Must pass two values (# residues mapped to # positions)")
280 if @mapping != 2;
281 if ((grep {$_ != 1 && $_ != 3} @mapping) || ($mapping[0] == 3 && $mapping[1] == 3)) {
282 $self->throw("Mapping values other than 1 or 3 are not currently supported")
284 $self->{'_mapping'} = \@mapping;
286 $self->throw('Mapping for LocatableSeq not set') if !exists $self->{'_mapping'};
287 return @{ $self->{'_mapping'} };
291 =head2 frameshifts
293 Title : frameshifts
294 Usage : $obj->frameshifts($newval)
295 Function: get/set the frameshift hash, which contains sequence positions as
296 keys and the shift (-2, -1, 1, 2) as the value
297 Returns : hash
298 Args : hash or hash reference
300 =cut
302 sub frameshifts {
303 my $self = shift;
304 if( @_ ) {
305 if (ref $_[0] eq 'HASH') {
306 $self->{_frameshifts} = $_[0];
307 } else {
308 # assume this is a full list to be converted to a hash
309 $self->{_frameshifts} = \%{@_} # coerce into hash ref
312 (defined $self->{_frameshifts} && ref $self->{_frameshifts} eq 'HASH') ?
313 return %{$self->{_frameshifts}} : return ();
317 =head2 get_nse
319 Title : get_nse
320 Usage :
321 Function: read-only name of form id/start-end
322 Example :
323 Returns :
324 Args :
326 =cut
328 sub get_nse {
329 my ($self,$char1,$char2) = @_;
331 $char1 ||= "/";
332 $char2 ||= "-";
334 my ($id, $st, $end, $strand) = ($self->id(), $self->start(),
335 $self->end(), $self->strand || 0);
337 if ($self->force_nse) {
338 $id ||= '';
339 $st ||= 0;
340 $end ||= 0;
343 $self->throw("Attribute id not set") unless defined($id);
344 $self->throw("Attribute start not set") unless defined($st);
345 $self->throw("Attribute end not set") unless defined($end);
347 if ($strand && $strand == -1) {
348 ($st, $end) = ($end, $st);
351 #Stockholm Rfam includes version if present so it is optional
352 my $v = $self->version ? '.'.$self->version : '';
353 return join('',$id, $v, $char1, $st, $char2, $end);
357 =head2 force_nse
359 Title : force_nse
360 Usage : $ls->force_nse()
361 Function: Boolean which forces get_nse() to build an NSE, regardless
362 of whether id(), start(), or end() is set
363 Returns : Boolean value
364 Args : (optional) Boolean (1 or 0)
365 Note : This will convert any passed value evaluating as TRUE/FALSE to 1/0
366 respectively
368 =cut
370 sub force_nse {
371 my ($self, $flag) = @_;
372 if (defined $flag) {
373 $flag ? (return $self->{'_force_nse'} = 1) : (return $self->{'_force_nse'} = 0);
375 return $self->{'_force_nse'};
379 =head2 num_gaps
381 Title : num_gaps
382 Usage :$self->num_gaps('.')
383 Function:Gets number of gaps in the sequence. The count excludes
384 leading or trailing gap characters.
386 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
387 these, '.' and '-' are counted as gap characters unless an
388 optional argument specifies one of them.
390 Returns : number of internal gaps in the sequence.
391 Args : a gap character (optional)
392 Status : Stable
393 Note : replaces no_gaps
395 =cut
397 sub num_gaps {
398 my ($self,$char) = @_;
399 my ($seq, $count) = (undef, 0);
401 # default gap characters
402 $char ||= $GAP_SYMBOLS;
404 $self->warn("I hope you know what you are doing setting gap to [$char]")
405 unless $char =~ /[$GAP_SYMBOLS]/;
407 $seq = $self->seq;
408 return 0 unless $seq; # empty sequence does not have gaps
410 $seq =~ s/^([$char]+)//;
411 $seq =~ s/([$char]+)$//;
412 while ( $seq =~ /[$char]+/g ) {
413 $count++;
416 return $count;
420 =head2 column_from_residue_number
422 Title : column_from_residue_number
423 Usage : $col = $seq->column_from_residue_number($resnumber)
424 Function:
426 This function gives the position in the alignment
427 (i.e. column number) of the given residue number in the
428 sequence. For example, for the sequence
430 Seq1/91-97 AC..DEF.GH
432 column_from_residue_number(94) returns 6.
434 An exception is thrown if the residue number would lie
435 outside the length of the aligment
436 (e.g. column_from_residue_number( "Seq2", 22 )
438 Returns : A column number for the position of the
439 given residue in the given sequence (1 = first column)
440 Args : A residue number in the whole sequence (not just that
441 segment of it in the alignment)
443 =cut
445 sub column_from_residue_number {
446 my ($self, $resnumber) = @_;
448 $self->throw("Residue number has to be a positive integer, not [$resnumber]")
449 unless $resnumber =~ /^\d+$/ and $resnumber > 0;
451 if ($resnumber >= $self->start() and $resnumber <= $self->end()) {
452 my @chunks;
453 my $column_incr;
454 my $current_column;
455 my $current_residue = $self->start - 1;
456 my $seq = $self->seq;
457 my $strand = $self->strand || 0;
459 if ($strand == -1) {
460 #@chunks = reverse $seq =~ m/[^\.\-]+|[\.\-]+/go;
461 @chunks = reverse $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
462 $column_incr = -1;
463 $current_column = (CORE::length $seq) + 1;
465 else {
466 #@chunks = $seq =~ m/[^\.\-]+|[\.\-]+/go;
467 @chunks = $seq =~ m/[$RESIDUE_SYMBOLS]+|[$GAP_SYMBOLS]+/go;
468 $column_incr = 1;
469 $current_column = 0;
472 while (my $chunk = shift @chunks) {
473 #if ($chunk =~ m|^[\.\-]|o) {
474 if ($chunk =~ m|^[$GAP_SYMBOLS]|o) {
475 $current_column += $column_incr * CORE::length($chunk);
477 else {
478 if ($current_residue + CORE::length($chunk) < $resnumber) {
479 $current_column += $column_incr * CORE::length($chunk);
480 $current_residue += CORE::length($chunk);
482 else {
483 if ($strand == -1) {
484 $current_column -= $resnumber - $current_residue;
486 else {
487 $current_column += $resnumber - $current_residue;
489 return $current_column;
495 $self->throw("Could not find residue number $resnumber");
500 =head2 location_from_column
502 Title : location_from_column
503 Usage : $loc = $ali->location_from_column($column_number)
504 Function:
506 This function gives the residue number for a given position
507 in the alignment (i.e. column number) of the given. Gaps
508 complicate this process and force the output to be a
509 L<Bio::Location::Simple> where values can be undefined.
510 For example, for the sequence:
512 Seq/91-96 .AC..DEF.G.
514 location_from_column( 3 ) position 92
515 location_from_column( 4 ) position 92^93
516 location_from_column( 9 ) position 95^96
517 location_from_column( 1 ) position undef
519 An exact position returns a Bio::Location::Simple object
520 where where location_type() returns 'EXACT', if a position
521 is between bases location_type() returns 'IN-BETWEEN'.
522 Column before the first residue returns undef. Note that if
523 the position is after the last residue in the alignment,
524 that there is no guarantee that the original sequence has
525 residues after that position.
527 An exception is thrown if the column number is not within
528 the sequence.
530 Returns : Bio::Location::Simple or undef
531 Args : A column number
532 Throws : If column is not within the sequence
534 See L<Bio::Location::Simple> for more.
536 =cut
538 sub location_from_column {
539 my ($self, $column) = @_;
541 $self->throw("Column number has to be a positive integer, not [$column]")
542 unless $column =~ /^\d+$/ and $column > 0;
543 $self->throw("Column number [$column] is larger than".
544 " sequence length [". $self->length. "]")
545 unless $column <= $self->length;
547 my ($loc);
548 my $s = $self->subseq(1,$column);
549 $s =~ s/[^a-zA-Z\*]//g;
551 my $pos = CORE::length $s;
553 my $start = $self->start || 0 ;
554 my $strand = $self->strand() || 1;
555 my $relative_pos = ($strand == -1)
556 ? ($self->end - $pos + 1)
557 : ($pos + $start - 1);
558 if ($self->subseq($column, $column) =~ /[a-zA-Z\*]/ ) {
559 $loc = Bio::Location::Simple->new
560 (-start => $relative_pos,
561 -end => $relative_pos,
562 -strand => 1,
564 } elsif ($pos == 0 and $self->start == 1) {
565 } else {
566 my ($start,$end) = ($relative_pos, $relative_pos + $strand);
567 if ($strand == -1) {
568 ($start,$end) = ($end,$start);
570 $loc = Bio::Location::Simple->new
571 (-start => $start,
572 -end => $end,
573 -strand => 1,
574 -location_type => 'IN-BETWEEN'
577 return $loc;
581 =head2 revcom
583 Title : revcom
584 Usage : $rev = $seq->revcom()
585 Function: Produces a new Bio::LocatableSeq object which
586 has the reversed complement of the sequence. For protein
587 sequences this throws an exception of "Sequence is a
588 protein. Cannot revcom"
590 Returns : A new Bio::LocatableSeq object
591 Args : none
593 =cut
595 sub revcom {
596 my ($self) = @_;
597 # since we don't know whether sequences without 1 => 1 correlation can be
598 # revcom'd, kick back
599 if (grep {$_ != 1} $self->mapping) {
600 $self->warn('revcom() not supported for sequences with mapped values of > 1');
601 return;
603 my $new = $self->SUPER::revcom;
604 $new->strand($self->strand * -1) if $self->strand;
605 $new->start($self->start) if $self->start;
606 $new->end($self->end) if $self->end;
607 return $new;
611 =head2 trunc
613 Title : trunc
614 Usage : $subseq = $myseq->trunc(10,100);
615 Function: Provides a truncation of a sequence,
616 Returns : a fresh Bio::PrimarySeqI implementing object
617 Args : Two integers denoting first and last columns of the
618 sequence to be included into sub-sequence.
620 =cut
622 sub trunc {
623 my ($self, $start, $end) = @_;
624 my $new = $self->SUPER::trunc($start, $end);
625 $new->strand($self->strand);
627 # end will be automatically calculated
628 $start = $end if $self->strand && $self->strand == -1;
630 $start = $self->location_from_column($start);
631 $start ? ($start = $start->end) : ($start = 1);
632 $new->start($start) if $start;
634 return $new;
638 =head2 validate_seq
640 Title : validate_seq
641 Usage : if(! $seqobj->validate_seq($seq_str) ) {
642 print "sequence $seq_str is not valid for an object of
643 alphabet ",$seqobj->alphabet, "\n";
645 Function: Test that the given sequence is valid, i.e. contains only valid
646 characters. The allowed characters are all letters (A-Z) and '-','.',
647 '*','?','=' and '~'. Spaces are not valid. Note that this
648 implementation does not take alphabet() into account.
649 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
650 Args : - Sequence string to be validated
651 - Boolean to throw an error if the sequence is invalid
653 =cut
655 sub validate_seq {
656 my ($self, $seqstr, $throw) = @_;
657 $seqstr = '' if not defined $seqstr;
658 $throw = 0 if not defined $throw ; # 0 for backward compatibility
659 if ( (CORE::length $seqstr > 0 ) &&
660 ($seqstr !~ /^([$MATCHPATTERN]+)$/) ) {
661 if ($throw) {
662 $self->throw("Failed validation of sequence '".(defined($self->id) ||
663 '[unidentified sequence]')."'. Invalid characters were: " .
664 join('',($seqstr =~ /([^$MATCHPATTERN]+)/g)));
666 return 0;
668 return 1;
672 ################## DEPRECATED METHODS ##################
675 =head2 no_gap
677 Title : no_gaps
678 Usage : $self->no_gaps('.')
679 Function : Gets number of gaps in the sequence. The count excludes
680 leading or trailing gap characters.
682 Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of
683 these, '.' and '-' are counted as gap characters unless an
684 optional argument specifies one of them.
686 Returns : number of internal gaps in the sequence.
687 Args : a gap character (optional)
688 Status : Deprecated (in favor of num_gaps())
690 =cut
692 sub no_gaps {
693 my $self = shift;
694 $self->deprecated( -warn_version => 1.0069,
695 -throw_version => 1.0075,
696 -message => 'Use of method no_gaps() is deprecated, use num_gaps() instead' );
697 return $self->num_gaps(@_);
701 =head2 no_sequences
703 Title : no_sequences
704 Usage : $gaps = $seq->no_sequences
705 Function : number of sequence in the sequence alignment
706 Returns : integer
707 Argument :
708 Status : Deprecated (in favor of num_sequences())
710 =cut
712 sub no_sequences {
713 my $self = shift;
714 $self->deprecated( -warn_version => 1.0069,
715 -throw_version => 1.0075,
716 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead' );
717 return $self->num_sequences(@_);