lib/Bio/SeqIO/tinyseq.pm

   1 # BioPerl module for Bio::SeqIO::tinyseq
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Donald Jackson, donald.jackson@bms.com
   6 #
   7 # Copyright Bristol-Myers Squibb
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::SeqIO::tinyseq - reading/writing sequences in NCBI TinySeq format
  16
  17 =head1 SYNOPSIS
  18
  19 Do not use this module directly; use the SeqIO handler system:
  20
  21   $stream = Bio::SeqIO->new( -file => $filename, -format => 'tinyseq' );
  22
  23   while ( my $seq = $stream->next_seq ) {
  24     ....
  25   }
  26
  27 =head1 DESCRIPTION
  28
  29 This object reads and writes Bio::Seq objects to and from TinySeq XML
  30 format.  A TinySeq is a lightweight XML file of sequence information,
  31 analgous to FASTA format.
  32
  33 See L<https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.mod.dtd> for the DTD.
  34
  35 =head1 FEEDBACK
  36
  37 =head2 Mailing Lists
  38
  39 User feedback is an integral part of the evolution of this and other
  40 Bioperl modules. Send your comments and suggestions preferably to
  41 the Bioperl mailing list.  Your participation is much appreciated.
  42
  43   bioperl-l@bioperl.org                  - General discussion
  44   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  45
  46 =head2 Support
  47
  48 Please direct usage questions or support issues to the mailing list:
  49
  50 I<bioperl-l@bioperl.org>
  51
  52 rather than to the module maintainer directly. Many experienced and
  53 reponsive experts will be able look at the problem and quickly
  54 address it. Please include a thorough description of the problem
  55 with code and data examples if at all possible.
  56
  57 =head2 Reporting Bugs
  58
  59 Report bugs to the Bioperl bug tracking system to help us keep track
  60 of the bugs and their resolution. Bug reports can be submitted via
  61 the web:
  62
  63   https://github.com/bioperl/bioperl-live/issues
  64
  65 =head1 SEE ALSO
  66
  67 L<Bio::SeqIO>, L<Bio::Seq>.
  68
  69 =head1 AUTHOR
  70
  71 Donald Jackson, E<lt>donald.jackson@bms.comE<gt>
  72
  73 Parts of this module and the test script were patterned after Sheldon
  74 McKay's L<Bio::SeqIO::game>.  If it breaks, however, it's my fault not
  75 his ;).
  76
  77 =head1 APPENDIX
  78
  79 The rest of the documentation details each of the object methods.
  80 Internal methods are usually preceded with a _
  81
  82 =cut
  83
  84 package Bio::SeqIO::tinyseq;
  85
  86 use strict;
  87 use Bio::Seq::SeqFastaSpeedFactory;
  88 use Bio::Species;
  89 use Bio::SeqIO::tinyseq::tinyseqHandler;
  90 use XML::Parser::PerlSAX;
  91 use XML::Writer;
  92
  93 use base qw(Bio::SeqIO);
  94
  95 sub _initialize {
  96     my ($self, @args) = @_;
  97
  98     $self->SUPER::_initialize(@args);
  99
 100     unless (defined $self->sequence_factory) {
 101         $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new);
 102     }
 103
 104     $self->{'_species_objects'} = {};
 105     $self->{_parsed} = 0;
 106 }
 107
 108 =head2 next_seq
 109
 110   Title   : next_seq
 111   Usage   : $seq = $stream->next_seq()
 112   Function: returns the next sequence in the stream
 113   Returns : Bio::Seq object
 114   Args    : NONE
 115
 116 =cut
 117
 118 sub next_seq {
 119     my ($self) = @_;
 120
 121     $self->_get_seqs() unless ($self->{_parsed});
 122
 123     return shift @{$self->{_seqlist}};
 124 }
 125
 126 =head2 write_seq
 127
 128   Title   : write_seq
 129   Usage   : $seq = $stream->write_seq(@sequence_objects); undef $stream
 130   Function: outputs one or more sequence objects as TinySeq XML
 131   Returns : 1 on success
 132   Args    : one or more sequence objects as TinySeq XML
 133
 134 Because the TSeq dtd includes closing tags after all sets are written,
 135 the output will not be complete until the program terminates or the
 136 object is forced out of scope (see close_writer()).  May not perfectly
 137 reproduce TSeq_sid element for all sequences
 138
 139 =cut
 140
 141 sub write_seq {
 142     my ($self, @seqobjs) = @_;
 143
 144     $self->throw('write_seq must be called with at least one  Bio::SeqI or Bio::PrimarySeqI compliant object')
 145         unless (@seqobjs and ( $seqobjs[0]->isa('Bio::SeqI') || $seqobjs[0]->isa('Bio::PrimarySeqI')));
 146
 147     my $writer = $self->_get_writer;
 148
 149     foreach my $seqobj (@seqobjs) {
 150         my ($id_element, $id_value) = $self->_get_idstring($seqobj);
 151         $writer->startTag('TSeq');
 152         $writer->emptyTag('TSeq_seqtype', value => $self->_convert_seqtype($seqobj));
 153
 154         $writer->dataElement('TSeq_gi',       $seqobj->primary_id || '');
 155         $writer->dataElement($id_element,     $id_value);
 156         #$writer->dataElement('TSeq_orgname',  $seqobj->taxid) if ($seqobj->can('taxid')); # just a placeholder
 157         $writer->dataElement('TSeq_defline',  $seqobj->desc);
 158         $writer->dataElement('TSeq_length',   $seqobj->length);
 159         $writer->dataElement('TSeq_sequence', $seqobj->seq);
 160
 161         if ($seqobj->can('species') && $seqobj->species) {
 162             $self->_write_species($writer, $seqobj->species);
 163         }
 164
 165         $writer->endTag('TSeq');
 166     }
 167     1;
 168 }
 169
 170 =head2 _get_seqs
 171
 172   Title   : _get_seqs
 173   Usage   : Internal function - use next_seq() instead
 174   Function: parses the XML and creates Bio::Seq objects
 175   Returns : 1 on success
 176   Args    : NONE
 177
 178 Currently stores all sequence objects into memory.  I will work on do
 179 more of a stream-based approach
 180
 181 =cut
 182
 183 sub _get_seqs {
 184     my ($self) = @_;
 185     my $fh = $self->_fh;
 186
 187     my $handler = Bio::SeqIO::tinyseq::tinyseqHandler->new();
 188     my $parser  = XML::Parser::PerlSAX->new( Handler => $handler );
 189     my @seqatts = $parser->parse( Source => { ByteStream => $fh });
 190     my $factory = $self->sequence_factory;
 191
 192     $self->{_seqlist} ||= [];
 193     foreach my $seqatt(@seqatts) {
 194         foreach my $subatt(@$seqatt) { # why are there two hashes?
 195             my $seqobj = $factory->create(%$subatt);
 196             $self->_assign_identifier($seqobj, $subatt);
 197
 198             if ($seqobj->can('species')) {
 199 #               my $class = [reverse(split(/ /, $subatt->{'-organism'}))];
 200 #               my $species = Bio::Species->new( -classification => $class,
 201 #                                                -ncbi_taxid     => $subatt->{'-taxid'} );
 202                 my $species = $self->_get_species($subatt->{'-organism'}, $subatt->{'-taxid'});
 203                 $seqobj->species($species) if ($species);
 204             }
 205
 206             push(@{$self->{_seqlist}}, $seqobj);
 207         }
 208     }
 209     $self->{_parsed} = 1;
 210 }
 211
 212 =head2 _get_species
 213
 214   Title   : _get_species
 215   Usage   : Internal function
 216   Function: gets a Bio::Species object from cache or creates as needed
 217   Returns : a Bio::Species object on success, undef on failure
 218   Args    : a classification string (eg 'Homo sapiens') and
 219             a NCBI taxon id (optional)
 220
 221 Objects are cached for parsing multiple sequence files.
 222
 223 =cut
 224
 225 sub _get_species {
 226      my ($self, $orgname, $taxid) = @_;
 227
 228      unless ($self->{'_species_objects'}->{$orgname}) {
 229          my $species = $self->_create_species($orgname, $taxid);
 230          $self->{'_species_objects'}->{$orgname} = $species;
 231      }
 232      return $self->{'_species_objects'}->{$orgname};
 233 }
 234
 235 =head2 _create_species
 236
 237   Title   : _create_species
 238   Usage   : Internal function
 239   Function: creates a Bio::Species object
 240   Returns : a Bio::Species object on success, undef on failure
 241   Args    : a classification string (eg 'Homo sapiens') and
 242                   a NCBI taxon id (optional)
 243
 244 =cut
 245
 246 sub _create_species {
 247     my ($self, $orgname, $taxid) = @_;
 248     return unless ($orgname); # not required in TinySeq dtd so don't throw an error
 249
 250     my %params;
 251     $params{'-classification'} = [reverse(split(/ /, $orgname))];
 252     $params{'-ncbi_taxid'} = $taxid if ($taxid);
 253
 254     my $species = Bio::Species->new(%params)
 255         or return;
 256
 257     return $species;
 258 }
 259
 260
 261 =head2 _assign_identifier
 262
 263   Title   : _assign_identifier
 264   Usage   : Internal function
 265   Function: looks for sequence accession
 266   Returns : 1 on success
 267   Args    : NONE
 268
 269 NCBI puts refseq accessions in TSeq_sid, others in TSeq_accver.
 270
 271 =cut
 272
 273 sub _assign_identifier {
 274     my ($self, $seqobj, $atts) = @_;
 275     my ($accession, $version);
 276
 277    if ($atts->{'-accver'}) {
 278         ($accession, $version) = split(/\./, $atts->{'-accver'});;
 279     }
 280     elsif ($atts->{'-sid'}) {
 281         my $sidstring =$atts->{'-sid'};
 282         $sidstring =~ s/^.+?\|//;
 283         $sidstring =~ s/\|[^\|]*//;
 284         ($accession, $version) = split(/\./, $sidstring);;
 285     }
 286     else {
 287         $self->throw('NO accession information found for this sequence');
 288     }
 289     $seqobj->accession_number($accession) if ($seqobj->can('accession_number'));
 290     $seqobj->version($version) if ($seqobj->can('version'));
 291 }
 292
 293 =head2 _convert_seqtype
 294
 295   Title   : _convert_seqtype
 296   Usage   : Internal function
 297   Function: maps Bio::Seq::alphabet() values [dna/rna/protein] onto
 298             TSeq_seqtype values [protein/nucleotide]
 299
 300 =cut
 301
 302 sub _convert_seqtype {
 303     my ($self, $seqobj) = @_;
 304
 305     return 'protein'    if ($seqobj->alphabet eq 'protein');
 306     return 'nucleotide' if ($seqobj->alphabet eq 'dna');
 307     return 'nucleotide' if ($seqobj->alphabet eq 'rna');
 308
 309     # if we get here there's a problem!
 310     $self->throw("Alphabet not defined, can't assign type for $seqobj");
 311 }
 312
 313 =head2 _get_idstring
 314
 315   Title   : _get_idstring
 316   Usage   : Internal function
 317   Function: parse accession and version info from TSeq_accver
 318             or TSeq_sid
 319
 320 =cut
 321
 322 sub _get_idstring {
 323     # NCBI puts refseq ids in TSeq_sid, others in TSeq_accver.  No idea why.
 324     my ($self, $seqobj) = @_;
 325     my $accver = $seqobj->accession_number;
 326     $accver .= '.' . $seqobj->version if ($seqobj->can('version') and $seqobj->version);
 327     if ($accver =~ /^(NM_|NP_|XM_|XP_|NT_|NC_|NG_)/) {
 328         return ('TSeq_sid', join('|', 'ref', $accver, ''));
 329     }
 330     else {
 331         return ('TSeq_accver', $accver);
 332     }
 333 }
 334
 335 =head2 _get_writer
 336
 337   Title   : _get_writer
 338   Usage   : Internal function
 339   Function: instantiate XML::Writer object if needed,
 340                   output initial XML
 341
 342 =cut
 343
 344 sub _get_writer {
 345     # initialize writer, start doc so write_seq can work one at a time
 346     my ($self) = @_;
 347
 348     unless ($self->{_writer}) {
 349         my $fh = $self->_fh;
 350         my $writer = XML::Writer->new(OUTPUT      => $fh,
 351                                       DATA_MODE   => 1,
 352                                       DATA_INDENT => 2,
 353                                       NEWLINE     => 1,
 354                                       );
 355         $writer->doctype('TSeqSet', '-//NCBI//NCBI TSeq/EN', 'https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd');
 356         $writer->comment("Generated by Bio::SeqIO::tinyseq VERSION " . $Bio::SeqIO::tinyseq::VERSION);
 357         $writer->startTag('TSeqSet');
 358
 359         $self->{_writer} = $writer;
 360     }
 361     return $self->{_writer};
 362 }
 363
 364 =head2 close_writer
 365
 366   Title   : close_writer
 367   Usage   : $self->close_writer()
 368   Function: terminate XML output
 369   Args    : NONE
 370   Returns : 1 on success
 371
 372 Called automatically by DESTROY when object goes out of scope
 373
 374 =cut
 375
 376 sub close_writer {
 377     # close out any dangling writer
 378     my ($self) = @_;
 379     if ($self->{_writer}) {
 380         my $writer = $self->{_writer};
 381         $writer->endTag('TSeqSet');
 382         $writer->end;
 383         undef $writer;
 384     }
 385     close($self->_fh) if ($self->_fh);
 386     1;
 387 }
 388
 389 sub _write_species {
 390     my ($self, $writer, $species) = @_;
 391     $writer->dataElement('TSeq_orgname', $species->binomial);
 392     $writer->dataElement('TSeq_taxid',   $species->ncbi_taxid)
 393         if($species->ncbi_taxid);
 394 }
 395
 396 sub DESTROY {
 397     # primarily to close out a writer!
 398     my ($self) = @_;
 399     $self->close_writer;
 400     undef $self;
 401 }
 402
 403 1;
 404 __END__