1 # BioPerl module for Bio::SeqIO::tinyseq
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Donald Jackson, donald.jackson@bms.com
7 # Copyright Bristol-Myers Squibb
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SeqIO::tinyseq - reading/writing sequences in NCBI TinySeq format
19 Do not use this module directly; use the SeqIO handler system:
21 $stream = Bio::SeqIO->new( -file => $filename, -format => 'tinyseq' );
23 while ( my $seq = $stream->next_seq ) {
29 This object reads and writes Bio::Seq objects to and from TinySeq XML
30 format. A TinySeq is a lightweight XML file of sequence information,
31 analgous to FASTA format.
33 See L<https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.mod.dtd> for the DTD.
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
63 https://github.com/bioperl/bioperl-live/issues
67 L<Bio::SeqIO>, L<Bio::Seq>.
71 Donald Jackson, E<lt>donald.jackson@bms.comE<gt>
73 Parts of this module and the test script were patterned after Sheldon
74 McKay's L<Bio::SeqIO::game>. If it breaks, however, it's my fault not
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
84 package Bio
::SeqIO
::tinyseq
;
87 use Bio
::Seq
::SeqFastaSpeedFactory
;
89 use Bio
::SeqIO
::tinyseq
::tinyseqHandler
;
90 use XML
::Parser
::PerlSAX
;
92 use Bio
::Root
::Version
;
94 use base
qw(Bio::SeqIO);
97 my ($self, @args) = @_;
99 $self->SUPER::_initialize
(@args);
101 unless (defined $self->sequence_factory) {
102 $self->sequence_factory(Bio
::Seq
::SeqFastaSpeedFactory
->new);
105 $self->{'_species_objects'} = {};
106 $self->{_parsed
} = 0;
112 Usage : $seq = $stream->next_seq()
113 Function: returns the next sequence in the stream
114 Returns : Bio::Seq object
122 $self->_get_seqs() unless ($self->{_parsed
});
124 return shift @
{$self->{_seqlist
}};
130 Usage : $seq = $stream->write_seq(@sequence_objects); undef $stream
131 Function: outputs one or more sequence objects as TinySeq XML
132 Returns : 1 on success
133 Args : one or more sequence objects as TinySeq XML
135 Because the TSeq dtd includes closing tags after all sets are written,
136 the output will not be complete until the program terminates or the
137 object is forced out of scope (see close_writer()). May not perfectly
138 reproduce TSeq_sid element for all sequences
143 my ($self, @seqobjs) = @_;
145 $self->throw('write_seq must be called with at least one Bio::SeqI or Bio::PrimarySeqI compliant object')
146 unless (@seqobjs and ( $seqobjs[0]->isa('Bio::SeqI') || $seqobjs[0]->isa('Bio::PrimarySeqI')));
148 my $writer = $self->_get_writer;
150 foreach my $seqobj (@seqobjs) {
151 my ($id_element, $id_value) = $self->_get_idstring($seqobj);
152 $writer->startTag('TSeq');
153 $writer->emptyTag('TSeq_seqtype', value
=> $self->_convert_seqtype($seqobj));
155 $writer->dataElement('TSeq_gi', $seqobj->primary_id || '');
156 $writer->dataElement($id_element, $id_value);
157 #$writer->dataElement('TSeq_orgname', $seqobj->taxid) if ($seqobj->can('taxid')); # just a placeholder
158 $writer->dataElement('TSeq_defline', $seqobj->desc);
159 $writer->dataElement('TSeq_length', $seqobj->length);
160 $writer->dataElement('TSeq_sequence', $seqobj->seq);
162 if ($seqobj->can('species') && $seqobj->species) {
163 $self->_write_species($writer, $seqobj->species);
166 $writer->endTag('TSeq');
174 Usage : Internal function - use next_seq() instead
175 Function: parses the XML and creates Bio::Seq objects
176 Returns : 1 on success
179 Currently stores all sequence objects into memory. I will work on do
180 more of a stream-based approach
188 my $handler = Bio
::SeqIO
::tinyseq
::tinyseqHandler
->new();
189 my $parser = XML
::Parser
::PerlSAX
->new( Handler
=> $handler );
190 my @seqatts = $parser->parse( Source
=> { ByteStream
=> $fh });
191 my $factory = $self->sequence_factory;
193 $self->{_seqlist
} ||= [];
194 foreach my $seqatt(@seqatts) {
195 foreach my $subatt(@
$seqatt) { # why are there two hashes?
196 my $seqobj = $factory->create(%$subatt);
197 $self->_assign_identifier($seqobj, $subatt);
199 if ($seqobj->can('species')) {
200 # my $class = [reverse(split(/ /, $subatt->{'-organism'}))];
201 # my $species = Bio::Species->new( -classification => $class,
202 # -ncbi_taxid => $subatt->{'-taxid'} );
203 my $species = $self->_get_species($subatt->{'-organism'}, $subatt->{'-taxid'});
204 $seqobj->species($species) if ($species);
207 push(@
{$self->{_seqlist
}}, $seqobj);
210 $self->{_parsed
} = 1;
216 Usage : Internal function
217 Function: gets a Bio::Species object from cache or creates as needed
218 Returns : a Bio::Species object on success, undef on failure
219 Args : a classification string (eg 'Homo sapiens') and
220 a NCBI taxon id (optional)
222 Objects are cached for parsing multiple sequence files.
227 my ($self, $orgname, $taxid) = @_;
229 unless ($self->{'_species_objects'}->{$orgname}) {
230 my $species = $self->_create_species($orgname, $taxid);
231 $self->{'_species_objects'}->{$orgname} = $species;
233 return $self->{'_species_objects'}->{$orgname};
236 =head2 _create_species
238 Title : _create_species
239 Usage : Internal function
240 Function: creates a Bio::Species object
241 Returns : a Bio::Species object on success, undef on failure
242 Args : a classification string (eg 'Homo sapiens') and
243 a NCBI taxon id (optional)
247 sub _create_species
{
248 my ($self, $orgname, $taxid) = @_;
249 return unless ($orgname); # not required in TinySeq dtd so don't throw an error
252 $params{'-classification'} = [reverse(split(/ /, $orgname))];
253 $params{'-ncbi_taxid'} = $taxid if ($taxid);
255 my $species = Bio
::Species
->new(%params)
262 =head2 _assign_identifier
264 Title : _assign_identifier
265 Usage : Internal function
266 Function: looks for sequence accession
267 Returns : 1 on success
270 NCBI puts refseq accessions in TSeq_sid, others in TSeq_accver.
274 sub _assign_identifier
{
275 my ($self, $seqobj, $atts) = @_;
276 my ($accession, $version);
278 if ($atts->{'-accver'}) {
279 ($accession, $version) = split(/\./, $atts->{'-accver'});;
281 elsif ($atts->{'-sid'}) {
282 my $sidstring =$atts->{'-sid'};
283 $sidstring =~ s/^.+?\|//;
284 $sidstring =~ s/\|[^\|]*//;
285 ($accession, $version) = split(/\./, $sidstring);;
288 $self->throw('NO accession information found for this sequence');
290 $seqobj->accession_number($accession) if ($seqobj->can('accession_number'));
291 $seqobj->version($version) if ($seqobj->can('version'));
294 =head2 _convert_seqtype
296 Title : _convert_seqtype
297 Usage : Internal function
298 Function: maps Bio::Seq::alphabet() values [dna/rna/protein] onto
299 TSeq_seqtype values [protein/nucleotide]
303 sub _convert_seqtype
{
304 my ($self, $seqobj) = @_;
306 return 'protein' if ($seqobj->alphabet eq 'protein');
307 return 'nucleotide' if ($seqobj->alphabet eq 'dna');
308 return 'nucleotide' if ($seqobj->alphabet eq 'rna');
310 # if we get here there's a problem!
311 $self->throw("Alphabet not defined, can't assign type for $seqobj");
316 Title : _get_idstring
317 Usage : Internal function
318 Function: parse accession and version info from TSeq_accver
324 # NCBI puts refseq ids in TSeq_sid, others in TSeq_accver. No idea why.
325 my ($self, $seqobj) = @_;
326 my $accver = $seqobj->accession_number;
327 $accver .= '.' . $seqobj->version if ($seqobj->can('version') and $seqobj->version);
328 if ($accver =~ /^(NM_|NP_|XM_|XP_|NT_|NC_|NG_)/) {
329 return ('TSeq_sid', join('|', 'ref', $accver, ''));
332 return ('TSeq_accver', $accver);
339 Usage : Internal function
340 Function: instantiate XML::Writer object if needed,
346 # initialize writer, start doc so write_seq can work one at a time
349 unless ($self->{_writer
}) {
351 my $writer = XML
::Writer
->new(OUTPUT
=> $fh,
356 $writer->doctype('TSeqSet', '-//NCBI//NCBI TSeq/EN', 'https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd');
357 $writer->comment("Generated by Bio::SeqIO::tinyseq VERSION " . $Bio::Root
::Version
::VERSION
);
358 $writer->startTag('TSeqSet');
360 $self->{_writer
} = $writer;
362 return $self->{_writer
};
368 Usage : $self->close_writer()
369 Function: terminate XML output
371 Returns : 1 on success
373 Called automatically by DESTROY when object goes out of scope
378 # close out any dangling writer
380 if ($self->{_writer
}) {
381 my $writer = $self->{_writer
};
382 $writer->endTag('TSeqSet');
386 close($self->_fh) if ($self->_fh);
391 my ($self, $writer, $species) = @_;
392 $writer->dataElement('TSeq_orgname', $species->binomial);
393 $writer->dataElement('TSeq_taxid', $species->ncbi_taxid)
394 if($species->ncbi_taxid);
398 # primarily to close out a writer!