2 # BioPerl module for Bio::Seq::Meta
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Seq::Meta - Generic superclass for sequence objects with
17 residue-based meta information
21 use Bio::LocatableSeq;
23 use Bio::Tools::OddCodes;
26 my $seq = Bio::Seq::Meta->new(-id=>'test',
31 -verbose=>1, # to see warnings
34 # the existing sequence object can be a Bio::PrimarySeq, too
36 # to test this is a meta seq object
37 $seq->isa("Bio::Seq::Meta")
38 || $seq->throw("$seq is not a Bio::Seq::Meta");
41 $seq->meta('1234567890');
42 $seq = Bio::Seq::Meta->new(-id=>'test',
48 -verbose=>1, # to see warnings
52 $string = $seq->meta_text();
53 $substring = $seq->submeta_text(2,5);
54 $unique_key = $seq->accession_number();
56 # storing output from Bio::Tools::OddCodes as meta data
57 my $protcodes = Bio::Tools::OddCodes->new(-seq => $seq);
58 my @codes = qw(structural chemical functional charge hydrophobic);
59 map { $seq->named_meta($_, ${$protcodes->$_($seq) } )} @codes;
61 my $out = Bio::SeqIO->new(-format=>'metafasta');
62 $out->write_seq($seq);
66 This class implements generic methods for sequences with residue-based
67 meta information. Meta sequences with meta data are Bio::LocatableSeq
68 objects with additional methods to store that meta information. See
69 L<Bio::LocatableSeq> and L<Bio::Seq::MetaI>.
71 The meta information in this class is always one character per residue
72 long and blank values are space characters (ASCII 32).
74 After the latest rewrite, the meta information no longer covers all
75 the residues automatically. Methods to check the length of meta
76 information (L<meta_length>)and to see if the ends are flushed to the
77 sequence have been added (L<is_flush>). To force the old
78 functionality, set L<force_flush> to true.
80 It is assumed that meta data values do not depend on the nucleotide
81 sequence strand value.
83 Application specific implementations should inherit from this class to
84 override and add to these methods.
86 L<Bio::Seq::Meta::Array> allows for more complex meta values (scalars
87 or objects) to be used.
91 Character based meta data is read and set by method meta() and its
92 variants. These are the suffixes and prefixes used in the variants:
94 [named_] [sub] meta [_text]
100 Suffix B<_text> guaranties that output is a string. Note that it does
103 In this implementation, the output is always text, so these methods
108 Prefix B<sub>, like in subseq(), means that the method applies to sub
109 region of the sequence range and takes start and end as arguments.
110 Unlike subseq(), these methods are able to set values. If the range
111 is not defined, it defaults to the complete sequence.
115 Prefix B<named_> in method names allows the used to attach multiple
116 meta strings to one sequence by explicitly naming them. The name is
117 always the first argument to the method. The "unnamed" methods use the
118 class wide default name for the meta data and are thus special cases
121 Note that internally names are keys in a hash and any misspelling of a
122 name will silently store the data under a wrong name. The used names
123 (keys) can be retrieved using method meta_names(). See L<meta_names>.
129 This Bio::Seq::MetaI implementation inherits from Bio::LocatableSeq, which
130 itself inherits from Bio::PrimarySeq. It is not a Bio::SeqI, so bless-ing
131 objects of this class into a Bio::SeqI or vice versa and will not work as
132 expected (see bug 2262). This may be addressed in a future refactor of
138 L<Bio::LocatableSeq>,
140 L<Bio::Seq::Meta::Array>
146 User feedback is an integral part of the evolution of this and other
147 Bioperl modules. Send your comments and suggestions preferably to one
148 of the Bioperl mailing lists. Your participation is much appreciated.
150 bioperl-l@bioperl.org - General discussion
151 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
155 Please direct usage questions or support issues to the mailing list:
157 I<bioperl-l@bioperl.org>
159 rather than to the module maintainer directly. Many experienced and
160 reponsive experts will be able look at the problem and quickly
161 address it. Please include a thorough description of the problem
162 with code and data examples if at all possible.
164 =head2 Reporting Bugs
166 Report bugs to the Bioperl bug tracking system to help us keep track
167 the bugs and their resolution. Bug reports can be submitted via the
170 https://github.com/bioperl/bioperl-live/issues
172 =head1 AUTHOR - Heikki Lehvaslaiho
174 Email heikki-at-bioperl-dot-org
178 Chad Matsalla, bioinformatics@dieselwurks.com
180 Aaron Mackey, amackey@virginia.edu
184 The rest of the documentation details each of the object methods.
185 Internal methods are usually preceded with a _
190 # Let the code begin...
193 package Bio
::Seq
::Meta
;
195 use vars
qw($DEFAULT_NAME $GAP $META_GAP);
198 #use overload '""' => \&to_string;
200 use base qw(Bio::LocatableSeq Bio::Seq::MetaI);
205 $DEFAULT_NAME = 'DEFAULT';
213 Usage : $metaseq = Bio::Seq::Meta->new
214 ( -meta => 'aaaaaaaabbbbbbbb',
215 -seq => 'TKLMILVSHIVILSRM'
217 -accession_number => 'S000012',
219 Function: Constructor for Bio::Seq::Meta class, meta data being in a
220 string. Note that you can provide an empty quality string.
221 Returns : a new Bio::Seq::Meta object
227 my ($class, @args) = @_;
229 my $self = $class->SUPER::new
(@args);
231 my($meta, $forceflush, $nm) =
232 $self->_rearrange([qw(META
237 #$self->{'_meta'} = {};
238 $self->{'_meta'}->{$DEFAULT_NAME} = "";
240 $meta && $self->meta($meta);
241 if ($nm && ref($nm) eq 'HASH') {
242 while (my ($name, $meta) = each %$nm) {
243 $self->named_meta($name, $meta);
246 $forceflush && $self->force_flush($forceflush);
255 Usage : $meta_values = $obj->meta($values_string);
258 Get and set method for the meta data starting from residue
259 position one. Since it is dependent on the length of the
260 sequence, it needs to be manipulated after the sequence.
262 The length of the returned value always matches the length
263 of the sequence, if force_flush() is set. See L<force_flush>.
265 Returns : meta data in a string
266 Args : new value, string, optional
271 shift->named_meta($DEFAULT_NAME, shift);
277 Usage : $meta_values = $obj->meta_text($values_arrayref);
278 Function: Variant of meta() guarantied to return a textual
279 representation of meta data. For details, see L<meta>.
281 Args : new value, optional
292 Usage : $meta_values = $obj->named_meta($name, $values_arrayref);
293 Function: A more general version of meta(). Each meta data set needs
294 to be named. See also L<meta_names>.
296 Args : scalar, name of the meta data set
302 my ($self, $name, $value) = @_;
304 $name ||= $DEFAULT_NAME;
305 if( defined $value) {
307 $self->throw("I need a scalar value, not [". ref($value). "]")
311 my $diff = $self->length - CORE
::length($value);
313 $value .= (" " x
$diff);
316 $self->{'_meta'}->{$name} = $value;
318 #$self->_test_gap_positions($name) if $self->verbose > 0;
321 return " " x
$self->length
322 if $self->force_flush && not defined $self->{'_meta'}->{$name};
325 $self->_do_flush if $self->force_flush;
327 return $self->{'_meta'}->{$name};
330 =head2 _test_gap_positions
332 Title : _test_gap_positions
333 Usage : $meta_values = $obj->_test_gap_positions($name);
334 Function: Internal test for correct position of gap characters.
335 Gap being only '-' this time.
337 This method is called from named_meta() when setting meta
338 data but only if verbose is positive as this can be an
339 expensive process on very long sequences. Set verbose(1) to
340 see warnings when gaps do not align in sequence and meta
341 data and turn them into errors by setting verbose(2).
343 Returns : true on success, prints warnings
348 sub _test_gap_positions
{
353 $self->seq || return $success;
354 my $len = CORE
::length($self->seq);
355 for (my $i=0; $i < $len; $i++) {
356 my $s = substr $self->{seq
}, $i, 1;
357 my $m = substr $self->{_meta
}->{$name}, $i, 1;
358 $self->warn("Gap mismatch [$m/$s] in column [". ($i+1). "] of [$name] meta data in seq [". $self->id. "]")
360 if ($s eq $META_GAP) && $s ne $m;
365 =head2 named_meta_text
367 Title : named_meta_text()
368 Usage : $meta_values = $obj->named_meta_text($name, $values_arrayref);
369 Function: Variant of named_meta() guarantied to return a textual
370 representation of the named meta data.
371 For details, see L<meta>.
373 Args : scalar, name of the meta data set
378 sub named_meta_text
{
379 shift->named_meta(@_);
385 Usage : $subset_of_meta_values = $obj->submeta(10, 20, $value_string);
386 $subset_of_meta_values = $obj->submeta(10, undef, $value_string);
389 Get and set method for meta data for subsequences.
391 Numbering starts from 1 and the number is inclusive, ie 1-2
392 are the first two residue of the sequence. Start cannot be
393 larger than end but can be equal.
395 If the second argument is missing the returned values
396 should extend to the end of the sequence.
398 The return value may be a string or an array reference,
399 depending on the implementation. If in doubt, use
400 submeta_text() which is a variant guarantied to return a
401 string. See L<submeta_text>.
403 Returns : A reference to an array or a string
404 Args : integer, start position
405 integer, end position, optional when a third argument present
411 shift->named_submeta($DEFAULT_NAME, @_);
417 Usage : $meta_values = $obj->submeta_text(20, $value_string);
418 Function: Variant of submeta() guarantied to return a textual
419 representation of meta data. For details, see L<meta>.
421 Args : new value, optional
432 Title : named_submeta
433 Usage : $subset_of_meta_values = $obj->named_submeta($name, 10, 20, $value_string);
434 $subset_of_meta_values = $obj->named_submeta($name, 10);
435 Function: Variant of submeta() guarantied to return a textual
436 representation of meta data. For details, see L<meta>.
437 Returns : A reference to an array or a string
438 Args : scalar, name of the meta data set
439 integer, start position
440 integer, end position, optional when a third argument present
446 my ($self, $name, $start, $end, $value) = @_;
448 $name ||= $DEFAULT_NAME;
452 $start =~ /^[+]?\d+$/ and $start > 0 or
453 $self->throw("Need at least a positive integer start value");
456 $end ||= $start+length($value)-1;
457 $self->warn("You are setting meta values beyond the length of the sequence\n".
458 "[$start > ". length($self->seq)."] in sequence ". $self->id)
459 if $start > length $self->seq;
461 # pad meta data if needed
462 $self->{_meta
}->{$name} = () unless defined $self->{_meta
}->{$name};
463 if (length($self->{_meta
}->{$name}) < $start) {
464 $self->{'_meta'}->{$name} .= " " x
( $start - length($self->{'_meta'}->{$name}) -1);
468 $tail = substr ($self->{_meta
}->{$name}, $start-1+length($value))
469 if length($self->{_meta
}->{$name}) >= $start-1+length($value);
471 substr ($self->{_meta
}->{$name}, --$start) = $value;
472 $self->{_meta
}->{$name} .= $tail;
474 return substr ($self->{_meta
}->{$name}, $start, $end - $start + 1);
478 $end or $end = length $self->seq;
480 # pad meta data if needed
481 if (length($self->{_meta
}->{$name}) < $end) {
482 $self->{'_meta'}->{$name} .= " " x
( $start - length($self->{'_meta'}->{$name}));
485 return substr ($self->{_meta
}->{$name}, $start-1, $end - $start + 1)
490 =head2 named_submeta_text
492 Title : named_submeta_text
493 Usage : $meta_values = $obj->named_submeta_text($name, 20, $value_string);
494 Function: Variant of submeta() guarantied to return a textual
495 representation of meta data. For details, see L<meta>.
497 Args : scalar, name of the meta data
498 Args : integer, start position, optional
499 integer, end position, optional
504 sub named_submeta_text
{
505 shift->named_submeta(@_);
511 Usage : @meta_names = $obj->meta_names()
512 Function: Retrieves an array of meta data set names. The default
513 (unnamed) set name is guarantied to be the first name.
514 Returns : an array of names
523 foreach ( sort keys %{$self->{'_meta'}} ) {
524 push (@r, $_) unless $_ eq $DEFAULT_NAME;
526 unshift @r, $DEFAULT_NAME if $self->{'_meta'}->{$DEFAULT_NAME};
533 Title : meta_length()
534 Usage : $meeta_len = $obj->meta_length();
535 Function: return the number of elements in the meta set
543 return $self->named_meta_length($DEFAULT_NAME);
547 =head2 named_meta_length
549 Title : named_meta_length()
550 Usage : $meta_len = $obj->named_meta_length($name);
551 Function: return the number of elements in the named meta set
557 sub named_meta_length
{
558 my ($self, $name) = @_;
559 $name ||= $DEFAULT_NAME;
560 return length ($self->{'_meta'}->{$name});
566 Title : force_flush()
567 Usage : $force_flush = $obj->force_flush(1);
568 Function: Automatically pad with empty values or truncate meta values
569 to sequence length. Not done by default.
570 Returns : boolean 1 or 0
571 Args : optional boolean value
573 Note that if you turn this forced padding off, the previously padded
574 values are not changed.
579 my ($self, $value) = @_;
581 if (defined $value) {
583 $self->{force_flush
} = 1;
586 $self->{force_flush
} = 0;
590 return $self->{force_flush
};
598 Function: internal method to do the force that meta values are same
599 length as the sequence . Called from L<force_flush>
609 foreach my $name ( ('DEFAULT', $self->meta_names) ) {
612 if ($self->length > $self->named_meta_length($name)) {
613 $self->{'_meta'}->{$name} .= $META_GAP x
($self->length - $self->named_meta_length($name)) ;
616 elsif ( $self->length < $self->named_meta_length($name) ) {
617 $self->{_meta
}->{$name} = substr($self->{_meta
}->{$name}, 0, $self->length-1);
627 Usage : $is_flush = $obj->is_flush()
628 or $is_flush = $obj->is_flush($my_meta_name)
629 Function: Boolean to tell if all meta values are in
630 flush with the sequence length.
631 Returns true if force_flush() is set
632 Set verbosity to a positive value to see failed meta sets
633 Returns : boolean 1 or 0
634 Args : optional name of the meta set
640 my ($self, $name) = shift;
642 return 1 if $self->force_flush;
648 $sticky .= "$name " if $self->length != $self->named_meta_length($name);
650 foreach my $m ($self->meta_names) {
651 $sticky .= "$m " if ($self->named_meta_length($m) > 0) && ($self->length != $self->named_meta_length($m));
656 print "These meta set are not flush: $sticky\n" if $self->verbose;
664 =head1 Bio::PrimarySeqI methods
669 Usage : $newseq = $seq->revcom();
670 Function: Produces a new Bio::Seq::MetaI implementing object where
671 the order of residues and their meta information is reversed.
672 Returns : A new (fresh) Bio::Seq::Meta object
674 Throws : if the object returns false on is_flush()
676 Note: The method does nothing to meta values, it reorders them, only.
683 $self->throw("Can not get a reverse complement. The object is not flush.")
684 unless $self->is_flush;
686 my $new = $self->SUPER::revcom
;
687 foreach (keys %{$self->{_meta
}}) {
688 $new->named_meta($_, scalar reverse $self->{_meta
}->{$_} );
696 Usage : $subseq = $seq->trunc(10,100);
697 Function: Provides a truncation of a sequence together with meta data
698 Returns : a fresh Bio::Seq::Meta implementing object
699 Args : Two integers denoting first and last residue of the sub-sequence.
704 my ($self, $start, $end) = @_;
707 $start =~ /^[+]?\d+$/ and $start > 0 or
708 $self->throw("Need at least a positive integer start value as start");
709 $end =~ /^[+]?\d+$/ and $end > 0 or
710 $self->throw("Need at least a positive integer start value as end");
712 $self->throw("End position has to be larger or equal to start");
713 $end <= $self->length or
714 $self->throw("End position can not be larger than sequence length");
716 my $new = $self->SUPER::trunc
($start, $end);
718 foreach (keys %{$self->{_meta
}}) {
720 substr($self->{_meta
}->{$_}, $start, $end - $start)
729 my $out = Bio
::SeqIO
->new(-format
=>'metafasta');
730 $out->write_seq($self);