1 # BioPerl module for Bio::SeqIO::tinyseq
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Donald Jackson, donald.jackson@bms.com
7 # Copyright Bristol-Myers Squibb
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SeqIO::tinyseq - reading/writing sequences in NCBI TinySeq format
19 Do not use this module directly; use the SeqIO handler system:
21 $stream = Bio::SeqIO->new( -file => $filename, -format => 'tinyseq' );
23 while ( my $seq = $stream->next_seq ) {
29 This object reads and writes Bio::Seq objects to and from TinySeq XML
30 format. A TinySeq is a lightweight XML file of sequence information,
31 analgous to FASTA format.
33 See L<https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.mod.dtd> for the DTD.
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
63 https://github.com/bioperl/bioperl-live/issues
67 L<Bio::SeqIO>, L<Bio::Seq>.
71 Donald Jackson, E<lt>donald.jackson@bms.comE<gt>
73 Parts of this module and the test script were patterned after Sheldon
74 McKay's L<Bio::SeqIO::game>. If it breaks, however, it's my fault not
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
84 package Bio
::SeqIO
::tinyseq
;
87 use Bio
::Seq
::SeqFastaSpeedFactory
;
89 use Bio
::SeqIO
::tinyseq
::tinyseqHandler
;
90 use XML
::Parser
::PerlSAX
;
93 use base
qw(Bio::SeqIO);
96 my ($self, @args) = @_;
98 $self->SUPER::_initialize
(@args);
100 unless (defined $self->sequence_factory) {
101 $self->sequence_factory(Bio
::Seq
::SeqFastaSpeedFactory
->new);
104 $self->{'_species_objects'} = {};
105 $self->{_parsed
} = 0;
111 Usage : $seq = $stream->next_seq()
112 Function: returns the next sequence in the stream
113 Returns : Bio::Seq object
121 $self->_get_seqs() unless ($self->{_parsed
});
123 return shift @
{$self->{_seqlist
}};
129 Usage : $seq = $stream->write_seq(@sequence_objects); undef $stream
130 Function: outputs one or more sequence objects as TinySeq XML
131 Returns : 1 on success
132 Args : one or more sequence objects as TinySeq XML
134 Because the TSeq dtd includes closing tags after all sets are written,
135 the output will not be complete until the program terminates or the
136 object is forced out of scope (see close_writer()). May not perfectly
137 reproduce TSeq_sid element for all sequences
142 my ($self, @seqobjs) = @_;
144 $self->throw('write_seq must be called with at least one Bio::SeqI or Bio::PrimarySeqI compliant object')
145 unless (@seqobjs and ( $seqobjs[0]->isa('Bio::SeqI') || $seqobjs[0]->isa('Bio::PrimarySeqI')));
147 my $writer = $self->_get_writer;
149 foreach my $seqobj (@seqobjs) {
150 my ($id_element, $id_value) = $self->_get_idstring($seqobj);
151 $writer->startTag('TSeq');
152 $writer->emptyTag('TSeq_seqtype', value
=> $self->_convert_seqtype($seqobj));
154 $writer->dataElement('TSeq_gi', $seqobj->primary_id || '');
155 $writer->dataElement($id_element, $id_value);
156 #$writer->dataElement('TSeq_orgname', $seqobj->taxid) if ($seqobj->can('taxid')); # just a placeholder
157 $writer->dataElement('TSeq_defline', $seqobj->desc);
158 $writer->dataElement('TSeq_length', $seqobj->length);
159 $writer->dataElement('TSeq_sequence', $seqobj->seq);
161 if ($seqobj->can('species') && $seqobj->species) {
162 $self->_write_species($writer, $seqobj->species);
165 $writer->endTag('TSeq');
173 Usage : Internal function - use next_seq() instead
174 Function: parses the XML and creates Bio::Seq objects
175 Returns : 1 on success
178 Currently stores all sequence objects into memory. I will work on do
179 more of a stream-based approach
187 my $handler = Bio
::SeqIO
::tinyseq
::tinyseqHandler
->new();
188 my $parser = XML
::Parser
::PerlSAX
->new( Handler
=> $handler );
189 my @seqatts = $parser->parse( Source
=> { ByteStream
=> $fh });
190 my $factory = $self->sequence_factory;
192 $self->{_seqlist
} ||= [];
193 foreach my $seqatt(@seqatts) {
194 foreach my $subatt(@
$seqatt) { # why are there two hashes?
195 my $seqobj = $factory->create(%$subatt);
196 $self->_assign_identifier($seqobj, $subatt);
198 if ($seqobj->can('species')) {
199 # my $class = [reverse(split(/ /, $subatt->{'-organism'}))];
200 # my $species = Bio::Species->new( -classification => $class,
201 # -ncbi_taxid => $subatt->{'-taxid'} );
202 my $species = $self->_get_species($subatt->{'-organism'}, $subatt->{'-taxid'});
203 $seqobj->species($species) if ($species);
206 push(@
{$self->{_seqlist
}}, $seqobj);
209 $self->{_parsed
} = 1;
215 Usage : Internal function
216 Function: gets a Bio::Species object from cache or creates as needed
217 Returns : a Bio::Species object on success, undef on failure
218 Args : a classification string (eg 'Homo sapiens') and
219 a NCBI taxon id (optional)
221 Objects are cached for parsing multiple sequence files.
226 my ($self, $orgname, $taxid) = @_;
228 unless ($self->{'_species_objects'}->{$orgname}) {
229 my $species = $self->_create_species($orgname, $taxid);
230 $self->{'_species_objects'}->{$orgname} = $species;
232 return $self->{'_species_objects'}->{$orgname};
235 =head2 _create_species
237 Title : _create_species
238 Usage : Internal function
239 Function: creates a Bio::Species object
240 Returns : a Bio::Species object on success, undef on failure
241 Args : a classification string (eg 'Homo sapiens') and
242 a NCBI taxon id (optional)
246 sub _create_species
{
247 my ($self, $orgname, $taxid) = @_;
248 return unless ($orgname); # not required in TinySeq dtd so don't throw an error
251 $params{'-classification'} = [reverse(split(/ /, $orgname))];
252 $params{'-ncbi_taxid'} = $taxid if ($taxid);
254 my $species = Bio
::Species
->new(%params)
261 =head2 _assign_identifier
263 Title : _assign_identifier
264 Usage : Internal function
265 Function: looks for sequence accession
266 Returns : 1 on success
269 NCBI puts refseq accessions in TSeq_sid, others in TSeq_accver.
273 sub _assign_identifier
{
274 my ($self, $seqobj, $atts) = @_;
275 my ($accession, $version);
277 if ($atts->{'-accver'}) {
278 ($accession, $version) = split(/\./, $atts->{'-accver'});;
280 elsif ($atts->{'-sid'}) {
281 my $sidstring =$atts->{'-sid'};
282 $sidstring =~ s/^.+?\|//;
283 $sidstring =~ s/\|[^\|]*//;
284 ($accession, $version) = split(/\./, $sidstring);;
287 $self->throw('NO accession information found for this sequence');
289 $seqobj->accession_number($accession) if ($seqobj->can('accession_number'));
290 $seqobj->version($version) if ($seqobj->can('version'));
293 =head2 _convert_seqtype
295 Title : _convert_seqtype
296 Usage : Internal function
297 Function: maps Bio::Seq::alphabet() values [dna/rna/protein] onto
298 TSeq_seqtype values [protein/nucleotide]
302 sub _convert_seqtype
{
303 my ($self, $seqobj) = @_;
305 return 'protein' if ($seqobj->alphabet eq 'protein');
306 return 'nucleotide' if ($seqobj->alphabet eq 'dna');
307 return 'nucleotide' if ($seqobj->alphabet eq 'rna');
309 # if we get here there's a problem!
310 $self->throw("Alphabet not defined, can't assign type for $seqobj");
315 Title : _get_idstring
316 Usage : Internal function
317 Function: parse accession and version info from TSeq_accver
323 # NCBI puts refseq ids in TSeq_sid, others in TSeq_accver. No idea why.
324 my ($self, $seqobj) = @_;
325 my $accver = $seqobj->accession_number;
326 $accver .= '.' . $seqobj->version if ($seqobj->can('version') and $seqobj->version);
327 if ($accver =~ /^(NM_|NP_|XM_|XP_|NT_|NC_|NG_)/) {
328 return ('TSeq_sid', join('|', 'ref', $accver, ''));
331 return ('TSeq_accver', $accver);
338 Usage : Internal function
339 Function: instantiate XML::Writer object if needed,
345 # initialize writer, start doc so write_seq can work one at a time
348 unless ($self->{_writer
}) {
350 my $writer = XML
::Writer
->new(OUTPUT
=> $fh,
355 $writer->doctype('TSeqSet', '-//NCBI//NCBI TSeq/EN', 'https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd');
356 $writer->comment("Generated by Bio::SeqIO::tinyseq VERSION " . $Bio::SeqIO
::tinyseq
::VERSION
);
357 $writer->startTag('TSeqSet');
359 $self->{_writer
} = $writer;
361 return $self->{_writer
};
367 Usage : $self->close_writer()
368 Function: terminate XML output
370 Returns : 1 on success
372 Called automatically by DESTROY when object goes out of scope
377 # close out any dangling writer
379 if ($self->{_writer
}) {
380 my $writer = $self->{_writer
};
381 $writer->endTag('TSeqSet');
385 close($self->_fh) if ($self->_fh);
390 my ($self, $writer, $species) = @_;
391 $writer->dataElement('TSeq_orgname', $species->binomial);
392 $writer->dataElement('TSeq_taxid', $species->ncbi_taxid)
393 if($species->ncbi_taxid);
397 # primarily to close out a writer!