2 # BioPerl module for Bio::SeqIO::table
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Hilmar Lapp
8 # You may distribute this module under the same terms as perl itself.
9 # Refer to the Perl Artistic License (see the license accompanying this
10 # software package, or see http://www.perl.com/language/misc/Artistic.html)
11 # for the terms under which you may use, modify, and redistribute this module.
13 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
14 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
15 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
18 # POD documentation - main docs before the code
22 Bio::SeqIO::table - sequence input/output stream from a delimited table
26 # Do not to use this object directly, use Bio::SeqIO, for example:
28 $in = Bio::SeqIO->new(-file => $filename, -format => 'table');
30 while ( my $seq = $in->next_seq() ) {
31 # do something with $seq
36 This class transforms records in a table-formatted text file into
39 A table-formatted text file of sequence records for the purposes of
40 this module is defined as a text file with each row corresponding to a
41 sequence, and the attributes of the sequence being in different
42 columns. Columns are delimited by a common delimiter, for instance tab
45 The module permits specifying which columns hold which type of
46 annotation. The semantics of certain attributes, if present, are
47 pre-defined, e.g., accession number and sequence. Additional
48 attributes may be added to the annotation bundle.
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to one
56 of the Bioperl mailing lists. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63 Please direct usage questions or support issues to the mailing list:
65 I<bioperl-l@bioperl.org>
67 rather than to the module maintainer directly. Many experienced and
68 reponsive experts will be able look at the problem and quickly
69 address it. Please include a thorough description of the problem
70 with code and data examples if at all possible.
74 Report bugs to the Bioperl bug tracking system to help us keep track
75 the bugs and their resolution.
77 Bug reports can be submitted via email or the web:
79 https://github.com/bioperl/bioperl-live/issues
81 =head1 AUTHOR - Hilmar Lapp
83 Email hlapp at gmx.net
87 The rest of the documentation details each of the object
88 methods. Internal methods are usually preceded with a _
92 # Let the code begin...
94 package Bio
::SeqIO
::table
;
98 use Bio
::Seq
::SeqFactory
;
99 use Bio
::Annotation
::Collection
;
100 use Bio
::Annotation
::SimpleValue
;
102 use base
qw(Bio::SeqIO);
107 Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'table')
108 Function: Returns a new seqstream
109 Returns : A Bio::SeqIO stream for a table format
110 Args : Named parameters:
112 -file Name of file to read
113 -fh Filehandle to attach to
114 -comment Leading character(s) introducing a comment line
115 -header the number of header lines to skip; the first
116 non-comment header line will be used to obtain
117 column names; column names will be used as the
118 default tags for attaching annotation.
119 -delim The delimiter for columns as a regular expression;
120 consecutive occurrences of the delimiter will
122 -display_id The one-based index of the column containing
123 the display ID of the sequence
124 -accession_number The one-based index of the column
125 containing the accession number of the sequence
126 -seq The one-based index of the column containing
127 the sequence string of the sequence
128 -desc The one-based index of the column containing
129 the description of the sequence
130 -species The one-based index of the column containing the
131 species for the sequence record; if not a
132 number, will be used as the static species
133 common to all records
134 -annotation If provided and a scalar (but see below), a
135 flag whether or not all additional columns are
136 to be preserved as annotation, the tags used
137 will either be 'colX' if there is no column
138 header and where X is the one-based column
139 index, and otherwise the column headers will be
142 If a reference to an array, or a square
143 bracket-enclosed string of comma-delimited
144 values, only those columns (one-based index)
145 will be preserved as annotation, tags as before;
147 If a reference to a hash, or a curly
148 braces-enclosed string of comma-delimited key
149 and value pairs in alternating order, the keys
150 are one-based column indexes to be preserved,
151 and the values are the tags under which the
152 annotation is to be attached; if not provided or
153 supplied as undef, no additional annotation will
155 -colnames A reference to an array of column labels, or a
156 string of comma-delimited labels, denoting the
157 columns to be converted into annotation; this is
158 an alternative to -annotation and will be
159 ignored if -annotation is also supplied with a
161 -trim Flag determining whether or not all values should
162 be trimmed of leading and trailing white space
165 Additional arguments may be used to e.g. set factories and
166 builders involved in the sequence object creation (see the
172 my($self,@args) = @_;
174 # chained initialization
175 $self->SUPER::_initialize
(@args);
189 $self->_rearrange([qw(COMMENT
202 # store options and apply defaults
203 $self->comment_char(defined($cmtchars) ?
$cmtchars : "#")
204 if (!defined($self->comment_char)) || defined($cmtchars);
205 $self->delimiter(defined($delim) ?
$delim : "\t")
206 if (!defined($self->delimiter)) || defined($delim);
207 $self->header($header) if defined($header);
208 $self->trim_values($trim) if defined($trim);
212 $attrs->{-display_id
} = $display_id if defined($display_id);
213 $attrs->{-accession_number
} = $accnr if defined($accnr);
214 $attrs->{-seq
} = $seq if defined($seq);
215 $attrs->{-desc
} = $desc if defined($desc);
216 if (defined($taxon)) {
217 if (ref($taxon) || ($taxon =~ /^\d+$/)) {
218 # either a static object, or a column reference
219 $attrs->{-species
} = $taxon;
221 # static species as a string
222 $attrs->{-species
} = Bio
::Species
->new(
223 -classification
=> [reverse(split(' ',$taxon))]);
226 $self->attribute_map($attrs);
228 # annotation columns, if any
229 if ($useann && !ref($useann)) {
230 # it's a scalar; check whether this is in fact an array or
231 # hash as a string rather than just a flag
232 if ($useann =~ /^\[(.*)\]$/) {
233 $useann = [split(/[,;]/,$1)];
234 } elsif ($useann =~ /^{(.*)}$/) {
235 $useann = {split(/[,;]/,$1)};
236 } # else it is probably indeed just a flag
240 if (ref($useann) eq "ARRAY") {
241 my $has_header = ($self->header > 0);
243 foreach my $i (@
$useann) {
244 $ann_map->{$i} = $has_header ?
undef : "col$i";
247 # no special handling necessary
250 $self->annotation_map($ann_map);
252 $self->keep_annotation($useann || $colnames);
253 # annotation columns, if any
254 if ($colnames && !ref($colnames)) {
255 # an array as a string
256 $colnames =~ s/^\[(.*)\]$/$1/;
257 $colnames = [split(/[,;]/,$colnames)];
259 $self->annotation_columns($colnames) if ref($colnames);
262 # make sure we have a factory defined
263 if(!defined($self->sequence_factory)) {
264 $self->sequence_factory(
265 Bio
::Seq
::SeqFactory
->new(-verbose
=> $self->verbose(),
266 -type
=> 'Bio::Seq::RichSeq'));
273 Usage : $seq = $stream->next_seq()
274 Function: returns the next sequence in the stream
275 Returns : Bio::Seq::RichSeq object
283 # skip until not a comment and not an empty line
284 my $line_ok = $self->_next_record();
286 # if there is a header but we haven't read past it yet then do so now
287 if ($line_ok && (! $self->_header_skipped) && $self->header) {
288 $line_ok = $self->_parse_header();
289 $self->_header_skipped(1);
292 # return if we reached end-of-file
293 return unless $line_ok;
295 # otherwise, parse the record
298 my @cols = $self->_get_row_values();
299 # trim leading and trailing whitespace and quotes if desired
300 if ($self->trim_values) {
301 for(my $i = 0; $i < scalar(@cols); $i++) {
303 # trim off whitespace
304 $cols[$i] =~ s/^\s+//;
305 $cols[$i] =~ s/\s+$//;
306 # trim off double quotes
313 # assign values for columns in the attribute map
314 my $attrmap = $self->_attribute_map;
316 foreach my $attr (keys %$attrmap) {
317 if ((!ref($attrmap->{$attr})) && ($attrmap->{$attr} =~ /^\d+$/)) {
318 # this is a column index, add to instantiation parameters
319 $params{$attr} = $cols[$attrmap->{$attr}];
321 # not a column index; we assume it's a static value
322 $params{$attr} = $attrmap->{$attr};
326 # add annotation columns to the annotation bundle
327 my $annmap = $self->_annotation_map;
328 if ($annmap && %$annmap) {
329 my $anncoll = Bio
::Annotation
::Collection
->new();
330 foreach my $col (keys %$annmap) {
331 next unless $cols[$col]; # skip empty columns!
332 $anncoll->add_Annotation(
333 Bio
::Annotation
::SimpleValue
->new(-value
=> $cols[$col],
334 -tagname
=> $annmap->{$col}));
336 $params{'-annotation'} = $anncoll;
339 # ask the object builder to add the slots that we've gathered
340 my $builder = $self->sequence_builder();
341 $builder->add_slot_value(%params);
342 # and instantiate the object
343 my $seq = $builder->make_object();
352 Usage : $obj->comment_char($newval)
353 Function: Get/set the leading character(s) designating a line as
356 Returns : value of comment_char (a scalar)
357 Args : on set, new value (a scalar or undef, optional)
365 return $self->{'comment_char'} = shift if @_;
366 return $self->{'comment_char'};
372 Usage : $obj->header($newval)
373 Function: Get/set the number of header lines to skip before the
374 rows containing actual sequence records.
376 If set to zero or undef, means that there is no header and
377 therefore also no column headers.
380 Returns : value of header (a scalar)
381 Args : on set, new value (a scalar or undef, optional)
389 return $self->{'header'} = shift if @_;
390 return $self->{'header'};
396 Usage : $obj->delimiter($newval)
397 Function: Get/set the column delimiter. This will in fact be
398 treated as a regular expression. Consecutive occurrences
399 will not be collapsed to a single one.
402 Returns : value of delimiter (a scalar)
403 Args : on set, new value (a scalar or undef, optional)
411 return $self->{'delimiter'} = shift if @_;
412 return $self->{'delimiter'};
417 Title : attribute_map
418 Usage : $obj->attribute_map($newval)
419 Function: Get/set the map of sequence object initialization
420 attributes (keys) to one-based column index.
422 Attributes will usually need to be prefixed by a dash, just
423 as if they were passed to the new() method of the sequence
427 Returns : value of attribute_map (a reference to a hash)
428 Args : on set, new value (a reference to a hash or undef, optional)
436 # internally we store zero-based maps - so we need to convert back
440 # allow for and protect against undef
441 return delete $self->{'_attribute_map'} unless defined($arg);
442 # copy to avoid side-effects
443 my $attr_map = {%$arg};
444 foreach my $key (keys %$attr_map) {
445 if ((!ref($attr_map->{$key})) && ($attr_map->{$key} =~ /^\d+$/)) {
449 $self->{'_attribute_map'} = $attr_map;
451 # there may not be a map
452 return unless exists($self->{'_attribute_map'});
453 # we need to copy in order not to override the stored map!
454 my %attr_map = %{$self->{'_attribute_map'}};
455 foreach my $key (keys %attr_map) {
456 if ((!ref($attr_map{$key})) && ($attr_map{$key} =~ /^\d+$/)) {
463 =head2 annotation_map
465 Title : annotation_map
466 Usage : $obj->annotation_map($newval)
467 Function: Get/set the mapping between one-based column indexes
468 (keys) and annotation tags (values).
470 Note that the map returned by this method may change after
471 the first next_seq() call if the file contains a column
472 header and no annotation keys have been predefined in the
473 map, because upon reading the column header line the tag
474 names will be set automatically.
476 Note also that the map may reference columns that are used
477 as well in the sequence attribute map.
480 Returns : value of annotation_map (a reference to a hash)
481 Args : on set, new value (a reference to a hash or undef, optional)
489 # internally we store zero-based maps - so we need to convert back
493 # allow for and protect against undef
494 return delete $self->{'_annotation_map'} unless defined($arg);
495 # copy to avoid side-effects
496 my $ann_map = {%$arg};
497 # make sure we sort the keys numerically or otherwise we may
498 # clobber a key with a higher index
499 foreach my $key (sort { $a <=> $b } keys(%$ann_map)) {
500 $ann_map->{$key-1} = $ann_map->{$key};
501 delete $ann_map->{$key};
503 $self->{'_annotation_map'} = $ann_map;
504 # also make a note that we want to keep annotation
505 $self->keep_annotation(1);
507 # there may not be a map
508 return unless exists($self->{'_annotation_map'});
509 # we need to copy in order not to override the stored map!
510 my %ann_map = %{$self->{'_annotation_map'}};
511 # here we need to sort numerically in reverse order ...
512 foreach my $key (sort { $b <=> $a } keys(%ann_map)) {
513 $ann_map{$key+1} = $ann_map{$key};
514 delete $ann_map{$key};
519 =head2 keep_annotation
521 Title : keep_annotation
522 Usage : $obj->keep_annotation($newval)
523 Function: Get/set flag whether or not to keep values from
524 additional columns as annotation.
526 Additional columns are all those columns in the input file
527 that aren't referenced in the attribute map.
530 Returns : value of keep_annotation (a scalar)
531 Args : on set, new value (a scalar or undef, optional)
539 return $self->{'keep_annotation'} = shift if @_;
540 return $self->{'keep_annotation'};
543 =head2 annotation_columns
545 Title : annotation_columns
546 Usage : $obj->annotation_columns($newval)
547 Function: Get/set the names (labels) of the columns to be used for
550 This is an alternative to using annotation_map. In order to
551 have any effect, it must be set before the first call of
552 next_seq(), and obviously there must be a header line (or
553 row) too giving the column labels.
556 Returns : value of annotation_columns (a reference to an array)
557 Args : on set, new value (a reference to an array of undef, optional)
562 sub annotation_columns
{
565 return $self->{'annotation_columns'} = shift if @_;
566 return $self->{'annotation_columns'};
572 Usage : $obj->trim_values($newval)
573 Function: Get/set whether or not to trim leading and trailing
574 whitespace off all column values.
576 Returns : value of trim_values (a scalar)
577 Args : on set, new value (a scalar or undef, optional)
585 return $self->{'trim_values'} = shift if @_;
586 return $self->{'trim_values'};
592 Usage: write_seq() is not implemented for table format output.
597 shift->throw("write_seq() not implemented for 'table' format");
600 =head1 Internal methods
602 All methods with a leading underscore are not meant to be part of the
603 'official' API. They are for use by this module only, consider them
604 private unless you are a developer trying to modify this module.
608 =head2 _attribute_map
610 Title : _attribute_map
611 Usage : $obj->_attribute_map($newval)
612 Function: Get only. Same as attribute_map, but zero-based indexes.
614 Note that any changes made to the returned map will change
615 the map used by this instance. You should know what you are
616 doing if you modify the returned value (or if you call this
617 method in the first place).
620 Returns : value of _attribute_map (a reference to a hash)
629 return $self->{'_attribute_map'};
632 =head2 _annotation_map
634 Title : _annotation_map
635 Usage : $obj->_annotation_map($newval)
636 Function: Get only. Same as annotation_map, but with zero-based indexes.
638 Note that any changes made to the returned map will change
639 the map used by this instance. You should know what you are
640 doing if you modify the returned value (or if you call this
641 method in the first place).
644 Returns : value of _annotation_map (a reference to a hash)
653 return $self->{'_annotation_map'};
656 =head2 _header_skipped
658 Title : _header_skipped
659 Usage : $obj->_header_skipped($newval)
660 Function: Get/set the flag whether the header was already
661 read (and skipped) or not.
663 Returns : value of _header_skipped (a scalar)
664 Args : on set, new value (a scalar or undef, optional)
672 return $self->{'_header_skipped'} = shift if @_;
673 return $self->{'_header_skipped'};
680 Function: Navigates the underlying file to the next record.
682 For row-based records in delimited text files, this will
683 skip all empty lines and lines with a leading comment
686 This method is here is to serve as a hook for other formats
687 that conceptually also represent tables but aren't
688 formatted as row-based text files.
691 Returns : TRUE if the navigation was successful and FALSE
692 otherwise. Unsuccessful navigation will usually be treated
693 as an end-of-file condition.
702 my $cmtcc = $self->comment_char;
703 my $line = $self->_readline();
705 # skip until not a comment and not an empty line
706 while (defined($line)
707 && (($cmtcc && ($line =~ /^\s*$cmtcc/))
708 || ($line =~ /^\s*$/))) {
709 $line = $self->_readline();
712 return $self->{'_line'} = $line;
717 Title : _parse_header
719 Function: Parse the table header and navigate past it.
721 This method is called if the number of header rows has been
722 specified equal to or greater than one, and positioned at
723 the first header line (row). By default the first header
724 line (row) is used for setting column names, but additional
725 lines (rows) may be skipped too. Empty lines and comment
726 lines do not count as header lines (rows).
728 This method will call _next_record() to navigate to the
729 next header line (row), if there is more than one header
730 line (row). Upon return, the file is presumed to be
731 positioned at the first record after the header.
733 This method is here is to serve as a hook for other formats
734 that conceptually also represent tables but aren't
735 formatted as row-based text files.
737 Note however that the only methods used to access file
738 content or navigate the position are _get_row_values() and
739 _next_record(), so it should usually suffice to override
743 Returns : TRUE if navigation past the header was successful and FALSE
744 otherwise. Unsuccessful navigation will usually be treated
745 as an end-of-file condition.
754 # the first header line contains the column headers, see whether
756 if ($self->keep_annotation) {
757 my @colnames = $self->_get_row_values();
758 # trim leading and trailing whitespace if desired
759 if ($self->trim_values) {
760 # trim off whitespace
761 @colnames = map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_; } @colnames;
762 # trim off double quotes
763 @colnames = map { $_ =~ s/^"//; $_ =~ s/"$//; $_; } @colnames;
765 # build or complete annotation column map
766 my $annmap = $self->annotation_map || {};
768 # check whether columns have been defined by name rather than index
769 if (my $anncols = $self->annotation_columns) {
770 # first sanity check: all column names must map
771 my %colmap = map { ($_,1); } @colnames;
772 foreach my $col (@
$anncols) {
773 if (!exists($colmap{$col})) {
774 $self->throw("no such column labeled '$col'");
777 # now map to the column indexes
778 %colmap = map { ($_,1); } @
$anncols;
779 for (my $i = 0; $i < scalar(@colnames); $i++) {
780 if (exists($colmap{$colnames[$i]})) {
781 $annmap->{$i+1} = $colnames[$i];
785 # no columns specified, default to all non-attribute columns
786 for (my $i = 0; $i < scalar(@colnames); $i++) {
787 $annmap->{$i+1} = $colnames[$i];
789 # subtract all attribute-referenced columns
790 foreach my $attrcol (values %{$self->attribute_map}) {
791 if ((!ref($attrcol)) && ($attrcol =~ /^\d+$/)) {
792 delete $annmap->{$attrcol};
797 # fill in where the tag names weren't pre-defined
798 for (my $i = 0; $i < scalar(@colnames); $i++) {
799 if (exists($annmap->{$i+1}) && ! defined($annmap->{$i+1})) {
800 $annmap->{$i+1} = $colnames[$i];
804 $self->annotation_map($annmap);
807 # now read past the header
808 my $header_lines = $self->header;
810 while (defined($line_ok) && ($header_lines > 0)) {
811 $line_ok = $self->_next_record();
818 =head2 _get_row_values
820 Title : _get_row_values
822 Function: Get the values for the current line (or row) as an array in
823 the order of columns.
825 This method is here is to serve as a hook for other formats
826 that conceptually also represent tables but aren't
827 formatted as row-based text files.
830 Returns : An array of column values for the current row.
838 my $delim = $self->delimiter;
839 my $line = $self->{'_line'};
841 my @cols = split(/$delim/,$line);