lib/Bio/AlignIO/arp.pm

   1 #
   2 # BioPerl module for Bio::AlignIO::arp
   3 #
   4 # Copyright Chris Fields
   5 #
   6 # You may distribute this module under the same terms as perl itself
   7 # POD documentation - main docs before the code
   8
   9 =head1 NAME
  10
  11 Bio::AlignIO::arp - ARP MSA Sequence input/output stream
  12
  13 =head1 SYNOPSIS
  14
  15 Do not use this module directly.  Use it via the L<Bio::AlignIO>
  16 class.
  17
  18 =head1 DESCRIPTION
  19
  20 This object can create L<Bio::SimpleAlign> objects from
  21 ARP flat files.  These are typically configuration-like data files
  22 for the program Arlequin.  For more information, see:
  23
  24   http://lgb.unige.ch/arlequin/
  25
  26 For the moment, this retains the allele sequence data in the DATA section and
  27 inserts them into SimpleAlign objects. ARP files that contain other data (RFLP,
  28 etc.) are not expected to parse properly.  Also, if the DNA data is actually SNP
  29 data, then the LocatableSeq object instantiation will throw an error.
  30
  31 This is now set up as a generic parser (i.e. it parses everything) and
  32 collects as much data as possible into the SimpleAlign object.  The following
  33 in a general mapping of where data can be found:
  34
  35     Tag        SimpleAlign
  36                Method
  37     ----------------------------------------------------------------------
  38     Title      description
  39     SampleName id
  40     ----------------------------------------------------------------------
  41
  42     Tag        Bio::Annotation   TagName                    Bio::Annotation
  43                Class                                        Parameters
  44     ----------------------------------------------------------------------
  45      NE        SimpleValue       pfam_family_accession      value
  46      NL        SimpleValue       sequence_start_stop        value
  47      SS        SimpleValue       sec_structure_source       value
  48      BM        SimpleValue       build_model                value
  49      RN        Reference         reference                  *
  50     ----------------------------------------------------------------------
  51   * RN is generated based on the number of Bio::Annotation::Reference objects
  52
  53 In addition, the number of samples found in the alignment is retained in a
  54 Bio::Annotation::TagTree object in the annotation collection and is accessible
  55 via:
  56
  57   ($samples) = $aln->annotation->get_Annotations('Samples');
  58   say $samples->display_text;
  59   # or use other relevant TagTree methods to retrieve data
  60
  61 =head1 FEEDBACK
  62
  63 =head2 Support
  64
  65 Please direct usage questions or support issues to the mailing list:
  66
  67 I<bioperl-l@bioperl.org>
  68
  69 rather than to the module maintainer directly. Many experienced and
  70 reponsive experts will be able look at the problem and quickly
  71 address it. Please include a thorough description of the problem
  72 with code and data examples if at all possible.
  73
  74 =head2 Reporting Bugs
  75
  76 Report bugs to the Bioperl bug tracking system to help us keep track
  77 the bugs and their resolution.  Bug reports can be submitted via the
  78 web:
  79
  80   https://github.com/bioperl/bioperl-live/issues
  81
  82 =head1 AUTHORS
  83
  84 Chris Fields (cjfields)
  85
  86 =head1 APPENDIX
  87
  88 The rest of the documentation details each of the object
  89 methods. Internal methods are usually preceded with a _
  90
  91 =cut
  92
  93 # Let the code begin...
  94
  95 package Bio::AlignIO::arp;
  96
  97 use strict;
  98 use base qw(Bio::AlignIO);
  99
 100 use Data::Dumper;
 101 use Bio::Annotation::AnnotationFactory;
 102
 103 =head2 next_aln
 104
 105  Title   : next_aln
 106  Usage   : $aln = $stream->next_aln
 107  Function: returns the next alignment in the stream.
 108  Returns : Bio::Align::AlignI object - returns 0 on end of file
 109             or on error
 110  Args    : -width => optional argument to specify the width sequence
 111            will be written (60 chars by default)
 112
 113 See L<Bio::Align::AlignI>
 114
 115 =cut
 116
 117 sub next_aln {
 118     my $self = shift;
 119     my $aln = Bio::SimpleAlign->new(-source => 'arp');
 120     my ($data, $cur_block, $cur_type, $cur_data);
 121     SCAN:
 122     while (defined ($data = $self->_readline) ) {
 123         next if $data =~ m{^\s*$}xms;
 124         if ($data =~ m{\[{1,2}(\w+)\]{1,2}}xms) {
 125             $self->{state}->{current_block} = $1;
 126             next SCAN;
 127         }
 128         elsif ($data =~ m{^\s*(\w+)=\s?(\S[^\n]*$)}xms) {
 129             ($cur_type, $cur_data) = ($1, $2);
 130             if ($cur_data =~ m{^\s*\{\s*$}) {
 131                 $self->throw("Curly block must be embedded in a named Block")
 132                     if !exists($self->{state}->{current_block});
 133                 $self->{state}->{in_curly_block} = 1;
 134                 next SCAN;
 135             }
 136             $cur_data =~ s{[\"\']}{}g;
 137             $cur_data =~ s{\s*$}{};
 138             # per alignment annotation data (i.e. Sample Blocks) or
 139             # annotation data retained for each alignment?
 140             $self->{state}->{current_block} eq 'Samples' ?
 141                 push @{$self->{state}->{SampleAnnotation}->{$cur_type}}, $cur_data :
 142                 push @{$self->{state}->{Annotation}->{$cur_type}}, $cur_data;
 143         }
 144         elsif ($data =~ m{^\s*\}\s*$}xms) {
 145             $self->throw("Unmatched bracket in ARP file:\n$data") if
 146                 !exists($self->{state}->{in_curly_block});
 147             if ($self->{state}->{current_block} eq 'Samples') {;
 148                 my $ac = $self->_process_annotation($aln);
 149                 delete $self->{state}->{SampleAnnotation};
 150             } else {
 151                 # process other data at a later point
 152             }
 153             delete $self->{state}->{blockdata};
 154             $self->{state}->{in_curly_block} = 0;
 155             last SCAN;
 156         }
 157         else {
 158             # all other data should be in a curly block and have a block title
 159             $self->throw("Data found outside of proper block:\n$data") if
 160                 !exists($self->{state}->{current_block}) && !$self->{state}->{in_curly_block};
 161             # bypass commented stuff (but we may want to process it at a later
 162             # point, so turn back here)
 163             next if $data =~ m{^\s*\#}xms;
 164             if ($self->{state}->{current_block} eq 'Samples') {
 165                 chomp $data;
 166                 # we have two possible ways to deal with sample number, either
 167                 # clone the LocatableSeq (in which case we need to deal with ID
 168                 # duplication), or store as annotation data. I chose the latter
 169                 # route using a Bio::Annotation::TagTree. YMMV - cjfields 10-15-08
 170                 my ($ls, $samples) = $self->_process_sequence($data);
 171                 my $id = $ls->id;
 172                 push @{ $self->{state}->{SampleAnnotation}->{Samples} }, [$id => $samples];
 173                 $aln->add_seq($ls);
 174             } else {
 175                 # add elsif's for further processing
 176                 #$self->debug('Unmatched data in block '.
 177                 #             $self->{state}->{current_block}.
 178                 #             ":\n$data\n");
 179                 $self->{state}->{blockdata} .= $data;
 180             }
 181         }
 182     }
 183     # alignments only returned if they contain sequences
 184     return $aln if $aln->num_sequences;
 185     return;
 186 }
 187
 188 =head2 write_aln
 189
 190  Title   : write_aln
 191  Usage   : $stream->write_aln(@aln)
 192  Function: writes the $aln object into the stream in xmfa format
 193  Returns : 1 for success and 0 for error
 194  Args    : L<Bio::Align::AlignI> object
 195
 196 See L<Bio::Align::AlignI>
 197
 198 =cut
 199
 200 sub write_aln {
 201     my ($self,@aln) = @_;
 202     $self->throw_not_implemented;
 203 }
 204
 205 ################ PRIVATE SUBS ################
 206
 207 sub _process_sequence {
 208     my ($self, $raw) = @_;
 209     return unless defined $raw;
 210     $raw =~ s{(?:^\s+|\s+$)}{}g;
 211     my ($id, $samples, $seq) = split(' ', $raw);
 212     my $ls = Bio::LocatableSeq->new('-seq'        => $seq,
 213                                     '-start'      => 1,
 214                                     '-display_id' => $id,
 215                                     '-alphabet'   => $self->alphabet);
 216     return($ls, $samples);
 217 }
 218
 219 sub _process_annotation {
 220     my ($self, $aln) = @_;
 221     my $coll = Bio::Annotation::Collection->new();
 222     my $factory = Bio::Annotation::AnnotationFactory->new(-type => 'Bio::Annotation::SimpleValue');
 223     for my $anntype (qw(SampleAnnotation Annotation)) {
 224         for my $key (keys %{ $self->{state}->{$anntype} }) {
 225             if ($key eq 'Title') {
 226                 $aln->description($self->{state}->{$anntype}->{$key}[0]);
 227             } elsif ($key eq 'Samples') {
 228                 $factory->type('Bio::Annotation::TagTree');
 229                 $coll->add_Annotation($key, $factory->create_object(
 230                     -value => [$key => $self->{state}->{$anntype}->{$key}]));
 231                 $factory->type('Bio::Annotation::SimpleValue');
 232             } elsif ($key eq 'SampleName') {
 233                 $aln->id($self->{state}->{$anntype}->{$key}[0]);
 234             } else {
 235                 $self->throw('Expecting an array reference') unless
 236                     ref $self->{state}->{$anntype}->{$key} eq 'ARRAY';
 237                 for my $a (@{ $self->{state}->{$anntype}->{$key} }) {
 238                     $coll->add_Annotation($key, $factory->create_object(
 239                         -value => $a) );
 240                 }
 241             }
 242         }
 243     }
 244     #$self->debug("Collection:".Dumper($coll)."\n");
 245     $aln->annotation($coll);
 246 }
 247
 248 1;