Use /usr/bin/perl instead of env even on examples
[bioperl-live.git] / lib / Bio / Seq / Meta / Array.pm
blob9a65f7042b671045ce1c14d2fe4fae0b8d9e66f9
2 # BioPerl module for Bio::Seq::Meta::Array
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Seq::Meta::Array - array-based generic implementation of a
17 sequence class with residue-based meta information
19 =head1 SYNOPSIS
21 use Bio::LocatableSeq;
22 use Bio::Seq::Meta::Array;
24 my $seq = Bio::Seq::Meta::Array->new(-id=>'test',
25 -seq=>'ACTGCTAGCT',
26 -start=>2434,
27 -start=>2443,
28 -strand=>1,
29 -verbose=>1, # to see warnings
32 # to test this is a meta seq object
33 $seq->isa("Bio::Seq::Meta::Array")
34 || $seq->throw("$seq is not a Bio::Seq::Meta::Array");
36 $seq->meta('1 2 3 4 5 6 7 8 9 10');
38 # or you could create the Meta object directly
39 $seq = Bio::Seq::Meta::Array->new(-id=>'test',
40 -seq=>'ACTGCTAGCT',
41 -start=>2434,
42 -start=>2443,
43 -strand=>1,
44 -meta=>'1 2 3 4 5 6 7 8 9 10',
45 -verbose=>1, # to see warnings
49 # accessors
50 $arrayref = $seq->meta();
51 $string = $seq->meta_text();
52 $substring = $seq->submeta_text(2,5);
53 $unique_key = $seq->accession_number();
55 =head1 DESCRIPTION
57 This class implements generic methods for sequences with residue-based
58 meta information. Meta sequences with meta data are Bio::LocatableSeq
59 objects with additional methods to store that meta information. See
60 L<Bio::LocatableSeq> and L<Bio::Seq::MetaI>.
62 The meta information in this class can be a string of variable length
63 and can be a complex structure. Blank values are undef or zero.
65 Application specific implementations should inherit from this class to
66 override and add to these methods.
68 This class can be used for storing sequence quality values but
69 Bio::Seq::Quality has named methods that make it easier.
71 =head1 SEE ALSO
73 L<Bio::LocatableSeq>,
74 L<Bio::Seq::MetaI>,
75 L<Bio::Seq::Meta>,
76 L<Bio::Seq::Quality>
78 =head1 NOTE
80 This Bio::Seq::MetaI implementation inherits from Bio::LocatableSeq, which
81 itself inherits from Bio::PrimarySeq. It is not a Bio::SeqI, so bless-ing
82 objects of this class into a Bio::SeqI or vice versa and will not work as
83 expected (see bug 2262). This may be addressed in a future refactor of
84 Bio::LocatableSeq.
86 =head1 FEEDBACK
88 =head2 Mailing Lists
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
97 =head2 Support
99 Please direct usage questions or support issues to the mailing list:
101 I<bioperl-l@bioperl.org>
103 rather than to the module maintainer directly. Many experienced and
104 reponsive experts will be able look at the problem and quickly
105 address it. Please include a thorough description of the problem
106 with code and data examples if at all possible.
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution. Bug reports can be submitted via the
112 web:
114 https://github.com/bioperl/bioperl-live/issues
116 =head1 AUTHOR - Heikki Lehvaslaiho
118 Email heikki-at-bioperl-dot-org
120 =head1 CONTRIBUTORS
122 Chad Matsalla, bioinformatics@dieselwurks.com
123 Aaron Mackey, amackey@virginia.edu
125 =head1 APPENDIX
127 The rest of the documentation details each of the object methods.
128 Internal methods are usually preceded with a _
130 =cut
133 # Let the code begin...
136 package Bio::Seq::Meta::Array;
137 use strict;
139 use base qw(Bio::LocatableSeq Bio::Seq Bio::Seq::MetaI);
141 our $DEFAULT_NAME = 'DEFAULT';
142 our $GAP = '-';
143 our $META_GAP = 0;
145 =head2 new
147 Title : new
148 Usage : $metaseq = Bio::Seq::Meta::Array->new
149 ( -meta => 'aaaaaaaabbbbbbbb',
150 -seq => 'TKLMILVSHIVILSRM'
151 -id => 'human_id',
152 -accession_number => 'S000012',
154 Function: Constructor for Bio::Seq::Meta::Array class, meta data being in a
155 string. Note that you can provide an empty quality string.
156 Returns : a new Bio::Seq::Meta::Array object
158 =cut
160 sub new {
161 my ($class, %args) = @_;
163 # run-time modification of @ISA is extremely evil (you should't pick your
164 # interface on the fly); this has no obvious effect on any tests so
165 # commenting out - cjfields 2011-4-6
167 #defined inheritance according to stated baseclass,
168 #if undefined then will be PrimarySeq
169 #if (defined($args{'-baseclass'})) {
170 # @ISA = ($args{'-baseclass'},"Bio::Seq::MetaI");
172 #else {
173 # @ISA = qw( Bio::LocatableSeq Bio::Seq Bio::Seq::MetaI );
176 my $self = $class->SUPER::new(%args);
178 my($meta, $forceflush) =
179 $self->_rearrange([qw(META
180 FORCE_FLUSH
182 %args);
184 $self->{'_meta'}->{$DEFAULT_NAME} = [];
186 $meta && $self->meta($meta);
187 $forceflush && $self->force_flush($forceflush);
189 return $self;
193 =head2 meta
195 Title : meta
196 Usage : $meta_values = $obj->meta($values_string);
197 Function:
199 Get and set method for the meta data starting from residue
200 position one. Since it is dependent on the length of the
201 sequence, it needs to be manipulated after the sequence.
203 The length of the returned value always matches the length
204 of the sequence.
206 Returns : reference to an array of meta data
207 Args : new value, string or array ref, optional
209 =cut
211 sub meta {
212 shift->named_meta($DEFAULT_NAME, shift);
215 =head2 meta_text
217 Title : meta_text
218 Usage : $meta_values = $obj->meta_text($values_arrayref);
219 Function: Variant of meta() guarantied to return a string
220 representation of meta data. For details, see L<meta>.
221 Returns : a string
222 Args : new value, string or array ref, optional
224 =cut
226 sub meta_text {
227 return join ' ', map {0 unless $_} @{shift->meta(shift)};
230 =head2 named_meta
232 Title : named_meta()
233 Usage : $meta_values = $obj->named_meta($name, $values_arrayref);
234 Function: A more general version of meta(). Each meta data set needs
235 to be named. See also L<meta_names>.
236 Returns : reference to an array of meta data
237 Args : scalar, name of the meta data set
238 new value, string or array ref, optional
240 =cut
242 sub named_meta {
243 my ($self, $name, $value) = @_;
245 $name ||= $DEFAULT_NAME;
247 if (defined $value) {
248 my ($arrayref);
250 if (ref $value eq 'ARRAY' ) { # array ref
251 $arrayref = $value;
253 elsif (not ref($value)) { # scalar
254 $arrayref = [split /\s+/, $value];
255 } else {
256 $self->throw("I need a scalar or array ref, not [". ref($value). "]");
259 # test for length
260 my $diff = $self->length - @{$arrayref};
261 if ($diff > 0) {
262 foreach (1..$diff) { push @{$arrayref}, 0;}
265 $self->{'_meta'}->{$name} = $arrayref;
267 #$self->_test_gap_positions($name) if $self->verbose > 0;
270 $self->_do_flush if $self->force_flush;
272 return $self->{'_meta'}->{$name} || (" " x $self->length);
276 =head2 _test_gap_positions
278 Title : _test_gap_positions
279 Usage : $meta_values = $obj->_test_gap_positions($name);
280 Function: Internal test for correct position of gap characters.
281 Gap being only '-' this time.
283 This method is called from named_meta() when setting meta
284 data but only if verbose is positive as this can be an
285 expensive process on very long sequences. Set verbose(1) to
286 see warnings when gaps do not align in sequence and meta
287 data and turn them into errors by setting verbose(2).
289 Returns : true on success, prints warnings
290 Args : none
292 =cut
294 sub _test_gap_positions {
295 my $self = shift;
296 my $name = shift;
297 my $success = 1;
299 $self->seq || return $success;
300 my $len = CORE::length($self->seq);
301 for (my $i=0; $i < $len; $i++) {
302 my $s = substr $self->{seq}, $i, 1;
303 my $m = substr $self->{_meta}->{$name}, $i, 1;
304 $self->warn("Gap mismatch in column [". ($i+1). "] of [$name] meta data in seq [". $self->id. "]")
305 and $success = 0
306 if ($m eq '-') && $s ne $m;
308 return $success;
311 =head2 named_meta_text
313 Title : named_meta_text()
314 Usage : $meta_values = $obj->named_meta_text($name, $values_arrayref);
315 Function: Variant of named_meta() guarantied to return a textual
316 representation of the named meta data.
317 For details, see L<meta>.
318 Returns : a string
319 Args : scalar, name of the meta data set
320 new value, string or array ref, optional
322 =cut
324 sub named_meta_text {
325 return join ' ', @{shift->named_meta(@_)};
329 =head2 submeta
331 Title : submeta
332 Usage : $subset_of_meta_values = $obj->submeta(10, 20, $value_string);
333 $subset_of_meta_values = $obj->submeta(10, undef, $value_string);
334 Function:
336 Get and set method for meta data for subsequences.
338 Numbering starts from 1 and the number is inclusive, ie 1-2
339 are the first two residue of the sequence. Start cannot be
340 larger than end but can be equal.
342 If the second argument is missing the returned values
343 should extend to the end of the sequence.
345 The return value may be a string or an array reference,
346 depending on the implementation. If in doubt, use
347 submeta_text() which is a variant guarantied to return a
348 string. See L<submeta_text>.
350 Returns : A reference to an array or a string
351 Args : integer, start position
352 integer, end position, optional when a third argument present
353 new value, string or array ref, optional
355 =cut
357 sub submeta {
358 shift->named_submeta($DEFAULT_NAME, @_);
361 =head2 submeta_text
363 Title : submeta_text
364 Usage : $meta_values = $obj->submeta_text(20, $value_string);
365 Function: Variant of submeta() guarantied to return a textual
366 representation of meta data. For details, see L<meta>.
367 Returns : a string
368 Args : new value, string or array ref, optional
371 =cut
373 sub submeta_text {
374 return join ' ', @{shift->named_submeta($DEFAULT_NAME, @_)};
377 =head2 named_submeta
379 Title : named_submeta
380 Usage : $subset_of_meta_values = $obj->named_submeta($name, 10, 20, $value_string);
381 $subset_of_meta_values = $obj->named_submeta($name, 10);
382 Function: Variant of submeta() guarantied to return a textual
383 representation of meta data. For details, see L<meta>.
384 Returns : A reference to an array or a string
385 Args : scalar, name of the meta data set
386 integer, start position
387 integer, end position, optional when a third argument present (can be undef)
388 new value, string or array ref, optional
390 =cut
393 sub named_submeta {
394 my ($self, $name, $start, $end, $value) = @_;
396 $name ||= $DEFAULT_NAME;
397 $start ||=1;
398 $start =~ /^[+]?\d+$/ and $start > 0 or
399 $self->throw("Need at least a positive integer start value");
400 $start--;
401 my $meta_len = scalar(@{$self->{_meta}->{$name}});
402 if (defined $value) {
403 my $arrayref;
405 if (ref $value eq 'ARRAY' ) { # array ref
406 $arrayref = $value;
408 elsif (not ref($value)) { # scalar
409 $arrayref = [split /\s+/, $value];
410 } else {
411 $self->throw("I need a space separated scalar or array ref, not [". ref($value). "]");
414 $self->warn("You are setting meta values beyond the length of the sequence\n".
415 "[$start > ". length($self->seq)."] in sequence ". $self->id)
416 if $start + scalar @{$arrayref} -1 > $self->length;
419 $end or $end = @{$arrayref} + $start;
420 $end--;
422 # test for length; pad if needed
423 my $diff = $end - $start - scalar @{$arrayref};
424 if ($diff > 0) {
425 foreach (1..$diff) { push @{$arrayref}, $META_GAP}
428 @{$self->{_meta}->{$name}}[$start..$end] = @{$arrayref};
430 $self->_do_flush if $self->force_flush;
432 return $arrayref;
434 } else {
435 # don't set by seq length; use meta array length instead; bug 2478
436 $end ||= $meta_len;
437 if ($end > $meta_len) {
438 $self->warn("End is longer than meta sequence $name length; resetting to $meta_len");
439 $end = $meta_len;
441 # warn but don't reset (push use of trunc() instead)
442 $self->warn("End is longer than sequence length; use trunc() \n".
443 "if you want a fully truncated object") if $end > $self->length;
444 $end--;
445 return [@{$self->{_meta}->{$name}}[$start..$end]];
450 =head2 named_submeta_text
452 Title : named_submeta_text
453 Usage : $meta_values = $obj->named_submeta_text($name, 20, $value_string);
454 Function: Variant of submeta() guarantied to return a textual
455 representation of meta data. For details, see L<meta>.
456 Returns : a string
457 Args : scalar, name of the meta data
458 Args : integer, start position, optional
459 integer, end position, optional
460 new value, string or array ref, optional
462 =cut
464 sub named_submeta_text {
465 return join ' ', @{shift->named_submeta(@_)};
468 =head2 meta_names
470 Title : meta_names
471 Usage : @meta_names = $obj->meta_names()
472 Function: Retrieves an array of meta data set names. The default
473 (unnamed) set name is guarantied to be the first name if it
474 contains any data.
475 Returns : an array of names
476 Args : none
478 =cut
480 sub meta_names {
481 my ($self) = @_;
483 my @r;
484 foreach ( sort keys %{$self->{'_meta'}} ) {
485 push (@r, $_) unless $_ eq $DEFAULT_NAME;
487 unshift @r, $DEFAULT_NAME if $self->{'_meta'}->{$DEFAULT_NAME};
488 return @r;
492 =head2 meta_length
494 Title : meta_length()
495 Usage : $meta_len = $obj->meta_length();
496 Function: return the number of elements in the meta set
497 Returns : integer
498 Args : -
500 =cut
502 sub meta_length {
503 my ($self) = @_;
504 return $self->named_meta_length($DEFAULT_NAME);
508 =head2 named_meta_length
510 Title : named_meta_length()
511 Usage : $meeta_len = $obj->named_meta_length($name);
512 Function: return the number of elements in the named meta set
513 Returns : integer
514 Args : -
516 =cut
518 sub named_meta_length {
519 my ($self, $name) = @_;
520 $name ||= $DEFAULT_NAME;
521 return scalar @{$self->{'_meta'}->{$name}};
526 =head2 force_flush
528 Title : force_flush()
529 Usage : $force_flush = $obj->force_flush(1);
530 Function: Automatically pad with empty values or truncate meta values
531 to sequence length. Not done by default.
532 Returns : boolean 1 or 0
533 Args : optional boolean value
535 Note that if you turn this forced padding off, the previously padded
536 values are not changed.
538 =cut
540 sub force_flush {
541 my ($self, $value) = @_;
543 if (defined $value) {
544 if ($value) {
545 $self->{force_flush} = 1;
546 $self->_do_flush;
547 } else {
548 $self->{force_flush} = 0;
552 return $self->{force_flush};
556 =head2 _do_flush
558 Title : _do_flush
559 Usage :
560 Function: internal method to do the force that meta values are same
561 length as sequence . Called from L<force_flush>
562 Returns :
563 Args :
565 =cut
568 sub _do_flush {
569 my ($self) = @_;
571 foreach my $name ($self->meta_names) {
572 #print "seq: ", $self->length , " ", $name, ": ", $self->named_meta_length($name), "======\n";
574 # elongnation
575 if ($self->length > $self->named_meta_length($name)) {
576 my $diff = $self->length - $self->named_meta_length($name);
577 foreach (1..$diff) { push @{$self->{'_meta'}->{$name}}, $META_GAP}
579 # truncation
580 elsif ( $self->length < $self->named_meta_length($name) ) {
581 $self->{_meta}->{$name} = [@{$self->{_meta}->{$name}}[0..($self->length-1)]]
587 =head2 is_flush
589 Title : is_flush
590 Usage : $is_flush = $obj->is_flush()
591 or $is_flush = $obj->is_flush($my_meta_name)
592 Function: Boolean to tell if all meta values are in
593 flush with the sequence length.
594 Returns true if force_flush() is set
595 Set verbosity to a positive value to see failed meta sets
596 Returns : boolean 1 or 0
597 Args : optional name of the meta set
599 =cut
601 sub is_flush {
603 my ($self, $name) = shift;
605 return 1 if $self->force_flush;
607 my $sticky = '';
610 if ($name) {
611 $sticky .= "$name " if $self->length != $self->named_meta_length($name);
612 } else {
613 foreach my $m ($self->meta_names) {
614 $sticky .= "$m " if ($self->named_meta_length($m) > 0) && ($self->length != $self->named_meta_length($m)) ;
618 if ($sticky) {
619 print "These meta set are not flush: $sticky\n" if $self->verbose;
620 return 0;
623 return 1;
627 =head1 Bio::PrimarySeqI methods
629 =head2 revcom
631 Title : revcom
632 Usage : $newseq = $seq->revcom();
633 Function: Produces a new Bio::Seq::MetaI implementing object where
634 the order of residues and their meta information is reversed.
635 Returns : A new (fresh) Bio::Seq::Meta object
636 Args : none
637 Throws : if the object returns false on is_flush()
639 Note: The method does nothing to meta values, it reorders them, only.
641 =cut
643 sub revcom {
644 my $self = shift;
646 $self->throw("Can not get a reverse complement. The object is not flush.")
647 unless $self->is_flush;
649 my $new = $self->SUPER::revcom;
650 my $end = $self->length - 1;
651 map {
652 $new->{_meta}->{$_} = [ reverse @{$self->{_meta}->{$_}}[0..$end]]
653 } keys %{$self->{_meta}};
655 return $new;
658 =head2 trunc
660 Title : trunc
661 Usage : $subseq = $seq->trunc(10,100);
662 Function: Provides a truncation of a sequence together with meta data
663 Returns : a fresh Bio::Seq::Meta implementing object
664 Args : Two integers denoting first and last residue of the sub-sequence.
666 =cut
668 sub trunc {
669 my ($self, $start, $end) = @_;
671 # test arguments
672 $start =~ /^[+]?\d+$/ and $start > 0 or
673 $self->throw("Need at least a positive integer start value as start; got [$start]");
674 $end =~ /^[+]?\d+$/ and $end > 0 or
675 $self->throw("Need at least a positive integer start value as end; got [$end]");
676 $end >= $start or
677 $self->throw("End position has to be larger or equal to start; got [$start..$end]");
678 $end <= $self->length or
679 $self->throw("End position can not be larger than sequence length; got [$end]");
681 my $new = $self->SUPER::trunc($start, $end);
682 $start--;
683 $end--;
684 map {
685 $new->{_meta}->{$_} = [@{$self->{_meta}->{$_}}[$start..$end]]
686 } keys %{$self->{_meta}};
687 return $new;