Bio/SeqIO/tinyseq.pm

   1 # BioPerl module for Bio::SeqIO::tinyseq
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Donald Jackson, donald.jackson@bms.com
   6 #
   7 # Copyright Bristol-Myers Squibb
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::SeqIO::tinyseq - reading/writing sequences in NCBI TinySeq format
  16
  17 =head1 SYNOPSIS
  18
  19 Do not use this module directly; use the SeqIO handler system:
  20
  21   $stream = Bio::SeqIO->new( -file => $filename, -format => 'tinyseq' );
  22
  23   while ( my $seq = $stream->next_seq ) {
  24     ....
  25   }
  26
  27 =head1 DESCRIPTION
  28
  29 This object reads and writes Bio::Seq objects to and from TinySeq XML
  30 format.  A TinySeq is a lightweight XML file of sequence information,
  31 analgous to FASTA format.
  32
  33 See L<https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.mod.dtd> for the DTD.
  34
  35 =head1 FEEDBACK
  36
  37 =head2 Mailing Lists
  38
  39 User feedback is an integral part of the evolution of this and other
  40 Bioperl modules. Send your comments and suggestions preferably to
  41 the Bioperl mailing list.  Your participation is much appreciated.
  42
  43   bioperl-l@bioperl.org                  - General discussion
  44   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  45
  46 =head2 Support
  47
  48 Please direct usage questions or support issues to the mailing list:
  49
  50 I<bioperl-l@bioperl.org>
  51
  52 rather than to the module maintainer directly. Many experienced and
  53 reponsive experts will be able look at the problem and quickly
  54 address it. Please include a thorough description of the problem
  55 with code and data examples if at all possible.
  56
  57 =head2 Reporting Bugs
  58
  59 Report bugs to the Bioperl bug tracking system to help us keep track
  60 of the bugs and their resolution. Bug reports can be submitted via
  61 the web:
  62
  63   https://github.com/bioperl/bioperl-live/issues
  64
  65 =head1 SEE ALSO
  66
  67 L<Bio::SeqIO>, L<Bio::Seq>.
  68
  69 =head1 AUTHOR
  70
  71 Donald Jackson, E<lt>donald.jackson@bms.comE<gt>
  72
  73 Parts of this module and the test script were patterned after Sheldon
  74 McKay's L<Bio::SeqIO::game>.  If it breaks, however, it's my fault not
  75 his ;).
  76
  77 =head1 APPENDIX
  78
  79 The rest of the documentation details each of the object methods.
  80 Internal methods are usually preceded with a _
  81
  82 =cut
  83
  84 package Bio::SeqIO::tinyseq;
  85
  86 use strict;
  87 use Bio::Seq::SeqFastaSpeedFactory;
  88 use Bio::Species;
  89 use Bio::SeqIO::tinyseq::tinyseqHandler;
  90 use XML::Parser::PerlSAX;
  91 use XML::Writer;
  92 use Bio::Root::Version;
  93
  94 use base qw(Bio::SeqIO);
  95
  96 sub _initialize {
  97     my ($self, @args) = @_;
  98
  99     $self->SUPER::_initialize(@args);
 100
 101     unless (defined $self->sequence_factory) {
 102         $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new);
 103     }
 104
 105     $self->{'_species_objects'} = {};
 106     $self->{_parsed} = 0;
 107 }
 108
 109 =head2 next_seq
 110
 111   Title   : next_seq
 112   Usage   : $seq = $stream->next_seq()
 113   Function: returns the next sequence in the stream
 114   Returns : Bio::Seq object
 115   Args    : NONE
 116
 117 =cut
 118
 119 sub next_seq {
 120     my ($self) = @_;
 121
 122     $self->_get_seqs() unless ($self->{_parsed});
 123
 124     return shift @{$self->{_seqlist}};
 125 }
 126
 127 =head2 write_seq
 128
 129   Title   : write_seq
 130   Usage   : $seq = $stream->write_seq(@sequence_objects); undef $stream
 131   Function: outputs one or more sequence objects as TinySeq XML
 132   Returns : 1 on success
 133   Args    : one or more sequence objects as TinySeq XML
 134
 135 Because the TSeq dtd includes closing tags after all sets are written,
 136 the output will not be complete until the program terminates or the
 137 object is forced out of scope (see close_writer()).  May not perfectly
 138 reproduce TSeq_sid element for all sequences
 139
 140 =cut
 141
 142 sub write_seq {
 143     my ($self, @seqobjs) = @_;
 144
 145     $self->throw('write_seq must be called with at least one  Bio::SeqI or Bio::PrimarySeqI compliant object')
 146         unless (@seqobjs and ( $seqobjs[0]->isa('Bio::SeqI') || $seqobjs[0]->isa('Bio::PrimarySeqI')));
 147
 148     my $writer = $self->_get_writer;
 149
 150     foreach my $seqobj (@seqobjs) {
 151         my ($id_element, $id_value) = $self->_get_idstring($seqobj);
 152         $writer->startTag('TSeq');
 153         $writer->emptyTag('TSeq_seqtype', value => $self->_convert_seqtype($seqobj));
 154
 155         $writer->dataElement('TSeq_gi',       $seqobj->primary_id || '');
 156         $writer->dataElement($id_element,     $id_value);
 157         #$writer->dataElement('TSeq_orgname',  $seqobj->taxid) if ($seqobj->can('taxid')); # just a placeholder
 158         $writer->dataElement('TSeq_defline',  $seqobj->desc);
 159         $writer->dataElement('TSeq_length',   $seqobj->length);
 160         $writer->dataElement('TSeq_sequence', $seqobj->seq);
 161
 162         if ($seqobj->can('species') && $seqobj->species) {
 163             $self->_write_species($writer, $seqobj->species);
 164         }
 165
 166         $writer->endTag('TSeq');
 167     }
 168     1;
 169 }
 170
 171 =head2 _get_seqs
 172
 173   Title   : _get_seqs
 174   Usage   : Internal function - use next_seq() instead
 175   Function: parses the XML and creates Bio::Seq objects
 176   Returns : 1 on success
 177   Args    : NONE
 178
 179 Currently stores all sequence objects into memory.  I will work on do
 180 more of a stream-based approach
 181
 182 =cut
 183
 184 sub _get_seqs {
 185     my ($self) = @_;
 186     my $fh = $self->_fh;
 187
 188     my $handler = Bio::SeqIO::tinyseq::tinyseqHandler->new();
 189     my $parser  = XML::Parser::PerlSAX->new( Handler => $handler );
 190     my @seqatts = $parser->parse( Source => { ByteStream => $fh });
 191     my $factory = $self->sequence_factory;
 192
 193     $self->{_seqlist} ||= [];
 194     foreach my $seqatt(@seqatts) {
 195         foreach my $subatt(@$seqatt) { # why are there two hashes?
 196             my $seqobj = $factory->create(%$subatt);
 197             $self->_assign_identifier($seqobj, $subatt);
 198
 199             if ($seqobj->can('species')) {
 200 #               my $class = [reverse(split(/ /, $subatt->{'-organism'}))];
 201 #               my $species = Bio::Species->new( -classification => $class,
 202 #                                                -ncbi_taxid     => $subatt->{'-taxid'} );
 203                 my $species = $self->_get_species($subatt->{'-organism'}, $subatt->{'-taxid'});
 204                 $seqobj->species($species) if ($species);
 205             }
 206
 207             push(@{$self->{_seqlist}}, $seqobj);
 208         }
 209     }
 210     $self->{_parsed} = 1;
 211 }
 212
 213 =head2 _get_species
 214
 215   Title   : _get_species
 216   Usage   : Internal function
 217   Function: gets a Bio::Species object from cache or creates as needed
 218   Returns : a Bio::Species object on success, undef on failure
 219   Args    : a classification string (eg 'Homo sapiens') and
 220             a NCBI taxon id (optional)
 221
 222 Objects are cached for parsing multiple sequence files.
 223
 224 =cut
 225
 226 sub _get_species {
 227      my ($self, $orgname, $taxid) = @_;
 228
 229      unless ($self->{'_species_objects'}->{$orgname}) {
 230          my $species = $self->_create_species($orgname, $taxid);
 231          $self->{'_species_objects'}->{$orgname} = $species;
 232      }
 233      return $self->{'_species_objects'}->{$orgname};
 234 }
 235
 236 =head2 _create_species
 237
 238   Title   : _create_species
 239   Usage   : Internal function
 240   Function: creates a Bio::Species object
 241   Returns : a Bio::Species object on success, undef on failure
 242   Args    : a classification string (eg 'Homo sapiens') and
 243                   a NCBI taxon id (optional)
 244
 245 =cut
 246
 247 sub _create_species {
 248     my ($self, $orgname, $taxid) = @_;
 249     return unless ($orgname); # not required in TinySeq dtd so don't throw an error
 250
 251     my %params;
 252     $params{'-classification'} = [reverse(split(/ /, $orgname))];
 253     $params{'-ncbi_taxid'} = $taxid if ($taxid);
 254
 255     my $species = Bio::Species->new(%params)
 256         or return;
 257
 258     return $species;
 259 }
 260
 261
 262 =head2 _assign_identifier
 263
 264   Title   : _assign_identifier
 265   Usage   : Internal function
 266   Function: looks for sequence accession
 267   Returns : 1 on success
 268   Args    : NONE
 269
 270 NCBI puts refseq accessions in TSeq_sid, others in TSeq_accver.
 271
 272 =cut
 273
 274 sub _assign_identifier {
 275     my ($self, $seqobj, $atts) = @_;
 276     my ($accession, $version);
 277
 278    if ($atts->{'-accver'}) {
 279         ($accession, $version) = split(/\./, $atts->{'-accver'});;
 280     }
 281     elsif ($atts->{'-sid'}) {
 282         my $sidstring =$atts->{'-sid'};
 283         $sidstring =~ s/^.+?\|//;
 284         $sidstring =~ s/\|[^\|]*//;
 285         ($accession, $version) = split(/\./, $sidstring);;
 286     }
 287     else {
 288         $self->throw('NO accession information found for this sequence');
 289     }
 290     $seqobj->accession_number($accession) if ($seqobj->can('accession_number'));
 291     $seqobj->version($version) if ($seqobj->can('version'));
 292 }
 293
 294 =head2 _convert_seqtype
 295
 296   Title   : _convert_seqtype
 297   Usage   : Internal function
 298   Function: maps Bio::Seq::alphabet() values [dna/rna/protein] onto
 299             TSeq_seqtype values [protein/nucleotide]
 300
 301 =cut
 302
 303 sub _convert_seqtype {
 304     my ($self, $seqobj) = @_;
 305
 306     return 'protein'    if ($seqobj->alphabet eq 'protein');
 307     return 'nucleotide' if ($seqobj->alphabet eq 'dna');
 308     return 'nucleotide' if ($seqobj->alphabet eq 'rna');
 309
 310     # if we get here there's a problem!
 311     $self->throw("Alphabet not defined, can't assign type for $seqobj");
 312 }
 313
 314 =head2 _get_idstring
 315
 316   Title   : _get_idstring
 317   Usage   : Internal function
 318   Function: parse accession and version info from TSeq_accver
 319             or TSeq_sid
 320
 321 =cut
 322
 323 sub _get_idstring {
 324     # NCBI puts refseq ids in TSeq_sid, others in TSeq_accver.  No idea why.
 325     my ($self, $seqobj) = @_;
 326     my $accver = $seqobj->accession_number;
 327     $accver .= '.' . $seqobj->version if ($seqobj->can('version') and $seqobj->version);
 328     if ($accver =~ /^(NM_|NP_|XM_|XP_|NT_|NC_|NG_)/) {
 329         return ('TSeq_sid', join('|', 'ref', $accver, ''));
 330     }
 331     else {
 332         return ('TSeq_accver', $accver);
 333     }
 334 }
 335
 336 =head2 _get_writer
 337
 338   Title   : _get_writer
 339   Usage   : Internal function
 340   Function: instantiate XML::Writer object if needed,
 341                   output initial XML
 342
 343 =cut
 344
 345 sub _get_writer {
 346     # initialize writer, start doc so write_seq can work one at a time
 347     my ($self) = @_;
 348
 349     unless ($self->{_writer}) {
 350         my $fh = $self->_fh;
 351         my $writer = XML::Writer->new(OUTPUT      => $fh,
 352                                       DATA_MODE   => 1,
 353                                       DATA_INDENT => 2,
 354                                       NEWLINE     => 1,
 355                                       );
 356         $writer->doctype('TSeqSet', '-//NCBI//NCBI TSeq/EN', 'https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd');
 357         $writer->comment("Generated by Bio::SeqIO::tinyseq VERSION " . $Bio::Root::Version::VERSION);
 358         $writer->startTag('TSeqSet');
 359
 360         $self->{_writer} = $writer;
 361     }
 362     return $self->{_writer};
 363 }
 364
 365 =head2 close_writer
 366
 367   Title   : close_writer
 368   Usage   : $self->close_writer()
 369   Function: terminate XML output
 370   Args    : NONE
 371   Returns : 1 on success
 372
 373 Called automatically by DESTROY when object goes out of scope
 374
 375 =cut
 376
 377 sub close_writer {
 378     # close out any dangling writer
 379     my ($self) = @_;
 380     if ($self->{_writer}) {
 381         my $writer = $self->{_writer};
 382         $writer->endTag('TSeqSet');
 383         $writer->end;
 384         undef $writer;
 385     }
 386     close($self->_fh) if ($self->_fh);
 387     1;
 388 }
 389
 390 sub _write_species {
 391     my ($self, $writer, $species) = @_;
 392     $writer->dataElement('TSeq_orgname', $species->binomial);
 393     $writer->dataElement('TSeq_taxid',   $species->ncbi_taxid)
 394         if($species->ncbi_taxid);
 395 }
 396
 397 sub DESTROY {
 398     # primarily to close out a writer!
 399     my ($self) = @_;
 400     $self->close_writer;
 401     undef $self;
 402 }
 403
 404 1;
 405 __END__