Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SeqIO / tinyseq.pm
blob537180379e9eb4782e414e24f8615f376921c68a
1 # BioPerl module for Bio::SeqIO::tinyseq
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Donald Jackson, donald.jackson@bms.com
7 # Copyright Bristol-Myers Squibb
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqIO::tinyseq - reading/writing sequences in NCBI TinySeq format
17 =head1 SYNOPSIS
19 Do not use this module directly; use the SeqIO handler system:
21 $stream = Bio::SeqIO->new( -file => $filename, -format => 'tinyseq' );
23 while ( my $seq = $stream->next_seq ) {
24 ....
27 =head1 DESCRIPTION
29 This object reads and writes Bio::Seq objects to and from TinySeq XML
30 format. A TinySeq is a lightweight XML file of sequence information,
31 analgous to FASTA format.
33 See L<https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.mod.dtd> for the DTD.
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
61 the web:
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 SEE ALSO
67 L<Bio::SeqIO>, L<Bio::Seq>.
69 =head1 AUTHOR
71 Donald Jackson, E<lt>donald.jackson@bms.comE<gt>
73 Parts of this module and the test script were patterned after Sheldon
74 McKay's L<Bio::SeqIO::game>. If it breaks, however, it's my fault not
75 his ;).
77 =head1 APPENDIX
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
82 =cut
84 package Bio::SeqIO::tinyseq;
86 use strict;
87 use Bio::Seq::SeqFastaSpeedFactory;
88 use Bio::Species;
89 use Bio::SeqIO::tinyseq::tinyseqHandler;
90 use XML::Parser::PerlSAX;
91 use XML::Writer;
93 use base qw(Bio::SeqIO);
95 sub _initialize {
96 my ($self, @args) = @_;
98 $self->SUPER::_initialize(@args);
100 unless (defined $self->sequence_factory) {
101 $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new);
104 $self->{'_species_objects'} = {};
105 $self->{_parsed} = 0;
108 =head2 next_seq
110 Title : next_seq
111 Usage : $seq = $stream->next_seq()
112 Function: returns the next sequence in the stream
113 Returns : Bio::Seq object
114 Args : NONE
116 =cut
118 sub next_seq {
119 my ($self) = @_;
121 $self->_get_seqs() unless ($self->{_parsed});
123 return shift @{$self->{_seqlist}};
126 =head2 write_seq
128 Title : write_seq
129 Usage : $seq = $stream->write_seq(@sequence_objects); undef $stream
130 Function: outputs one or more sequence objects as TinySeq XML
131 Returns : 1 on success
132 Args : one or more sequence objects as TinySeq XML
134 Because the TSeq dtd includes closing tags after all sets are written,
135 the output will not be complete until the program terminates or the
136 object is forced out of scope (see close_writer()). May not perfectly
137 reproduce TSeq_sid element for all sequences
139 =cut
141 sub write_seq {
142 my ($self, @seqobjs) = @_;
144 $self->throw('write_seq must be called with at least one Bio::SeqI or Bio::PrimarySeqI compliant object')
145 unless (@seqobjs and ( $seqobjs[0]->isa('Bio::SeqI') || $seqobjs[0]->isa('Bio::PrimarySeqI')));
147 my $writer = $self->_get_writer;
149 foreach my $seqobj (@seqobjs) {
150 my ($id_element, $id_value) = $self->_get_idstring($seqobj);
151 $writer->startTag('TSeq');
152 $writer->emptyTag('TSeq_seqtype', value => $self->_convert_seqtype($seqobj));
154 $writer->dataElement('TSeq_gi', $seqobj->primary_id || '');
155 $writer->dataElement($id_element, $id_value);
156 #$writer->dataElement('TSeq_orgname', $seqobj->taxid) if ($seqobj->can('taxid')); # just a placeholder
157 $writer->dataElement('TSeq_defline', $seqobj->desc);
158 $writer->dataElement('TSeq_length', $seqobj->length);
159 $writer->dataElement('TSeq_sequence', $seqobj->seq);
161 if ($seqobj->can('species') && $seqobj->species) {
162 $self->_write_species($writer, $seqobj->species);
165 $writer->endTag('TSeq');
170 =head2 _get_seqs
172 Title : _get_seqs
173 Usage : Internal function - use next_seq() instead
174 Function: parses the XML and creates Bio::Seq objects
175 Returns : 1 on success
176 Args : NONE
178 Currently stores all sequence objects into memory. I will work on do
179 more of a stream-based approach
181 =cut
183 sub _get_seqs {
184 my ($self) = @_;
185 my $fh = $self->_fh;
187 my $handler = Bio::SeqIO::tinyseq::tinyseqHandler->new();
188 my $parser = XML::Parser::PerlSAX->new( Handler => $handler );
189 my @seqatts = $parser->parse( Source => { ByteStream => $fh });
190 my $factory = $self->sequence_factory;
192 $self->{_seqlist} ||= [];
193 foreach my $seqatt(@seqatts) {
194 foreach my $subatt(@$seqatt) { # why are there two hashes?
195 my $seqobj = $factory->create(%$subatt);
196 $self->_assign_identifier($seqobj, $subatt);
198 if ($seqobj->can('species')) {
199 # my $class = [reverse(split(/ /, $subatt->{'-organism'}))];
200 # my $species = Bio::Species->new( -classification => $class,
201 # -ncbi_taxid => $subatt->{'-taxid'} );
202 my $species = $self->_get_species($subatt->{'-organism'}, $subatt->{'-taxid'});
203 $seqobj->species($species) if ($species);
206 push(@{$self->{_seqlist}}, $seqobj);
209 $self->{_parsed} = 1;
212 =head2 _get_species
214 Title : _get_species
215 Usage : Internal function
216 Function: gets a Bio::Species object from cache or creates as needed
217 Returns : a Bio::Species object on success, undef on failure
218 Args : a classification string (eg 'Homo sapiens') and
219 a NCBI taxon id (optional)
221 Objects are cached for parsing multiple sequence files.
223 =cut
225 sub _get_species {
226 my ($self, $orgname, $taxid) = @_;
228 unless ($self->{'_species_objects'}->{$orgname}) {
229 my $species = $self->_create_species($orgname, $taxid);
230 $self->{'_species_objects'}->{$orgname} = $species;
232 return $self->{'_species_objects'}->{$orgname};
235 =head2 _create_species
237 Title : _create_species
238 Usage : Internal function
239 Function: creates a Bio::Species object
240 Returns : a Bio::Species object on success, undef on failure
241 Args : a classification string (eg 'Homo sapiens') and
242 a NCBI taxon id (optional)
244 =cut
246 sub _create_species {
247 my ($self, $orgname, $taxid) = @_;
248 return unless ($orgname); # not required in TinySeq dtd so don't throw an error
250 my %params;
251 $params{'-classification'} = [reverse(split(/ /, $orgname))];
252 $params{'-ncbi_taxid'} = $taxid if ($taxid);
254 my $species = Bio::Species->new(%params)
255 or return;
257 return $species;
261 =head2 _assign_identifier
263 Title : _assign_identifier
264 Usage : Internal function
265 Function: looks for sequence accession
266 Returns : 1 on success
267 Args : NONE
269 NCBI puts refseq accessions in TSeq_sid, others in TSeq_accver.
271 =cut
273 sub _assign_identifier {
274 my ($self, $seqobj, $atts) = @_;
275 my ($accession, $version);
277 if ($atts->{'-accver'}) {
278 ($accession, $version) = split(/\./, $atts->{'-accver'});;
280 elsif ($atts->{'-sid'}) {
281 my $sidstring =$atts->{'-sid'};
282 $sidstring =~ s/^.+?\|//;
283 $sidstring =~ s/\|[^\|]*//;
284 ($accession, $version) = split(/\./, $sidstring);;
286 else {
287 $self->throw('NO accession information found for this sequence');
289 $seqobj->accession_number($accession) if ($seqobj->can('accession_number'));
290 $seqobj->version($version) if ($seqobj->can('version'));
293 =head2 _convert_seqtype
295 Title : _convert_seqtype
296 Usage : Internal function
297 Function: maps Bio::Seq::alphabet() values [dna/rna/protein] onto
298 TSeq_seqtype values [protein/nucleotide]
300 =cut
302 sub _convert_seqtype {
303 my ($self, $seqobj) = @_;
305 return 'protein' if ($seqobj->alphabet eq 'protein');
306 return 'nucleotide' if ($seqobj->alphabet eq 'dna');
307 return 'nucleotide' if ($seqobj->alphabet eq 'rna');
309 # if we get here there's a problem!
310 $self->throw("Alphabet not defined, can't assign type for $seqobj");
313 =head2 _get_idstring
315 Title : _get_idstring
316 Usage : Internal function
317 Function: parse accession and version info from TSeq_accver
318 or TSeq_sid
320 =cut
322 sub _get_idstring {
323 # NCBI puts refseq ids in TSeq_sid, others in TSeq_accver. No idea why.
324 my ($self, $seqobj) = @_;
325 my $accver = $seqobj->accession_number;
326 $accver .= '.' . $seqobj->version if ($seqobj->can('version') and $seqobj->version);
327 if ($accver =~ /^(NM_|NP_|XM_|XP_|NT_|NC_|NG_)/) {
328 return ('TSeq_sid', join('|', 'ref', $accver, ''));
330 else {
331 return ('TSeq_accver', $accver);
335 =head2 _get_writer
337 Title : _get_writer
338 Usage : Internal function
339 Function: instantiate XML::Writer object if needed,
340 output initial XML
342 =cut
344 sub _get_writer {
345 # initialize writer, start doc so write_seq can work one at a time
346 my ($self) = @_;
348 unless ($self->{_writer}) {
349 my $fh = $self->_fh;
350 my $writer = XML::Writer->new(OUTPUT => $fh,
351 DATA_MODE => 1,
352 DATA_INDENT => 2,
353 NEWLINE => 1,
355 $writer->doctype('TSeqSet', '-//NCBI//NCBI TSeq/EN', 'https://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd');
356 $writer->comment("Generated by Bio::SeqIO::tinyseq VERSION " . $Bio::SeqIO::tinyseq::VERSION);
357 $writer->startTag('TSeqSet');
359 $self->{_writer} = $writer;
361 return $self->{_writer};
364 =head2 close_writer
366 Title : close_writer
367 Usage : $self->close_writer()
368 Function: terminate XML output
369 Args : NONE
370 Returns : 1 on success
372 Called automatically by DESTROY when object goes out of scope
374 =cut
376 sub close_writer {
377 # close out any dangling writer
378 my ($self) = @_;
379 if ($self->{_writer}) {
380 my $writer = $self->{_writer};
381 $writer->endTag('TSeqSet');
382 $writer->end;
383 undef $writer;
385 close($self->_fh) if ($self->_fh);
389 sub _write_species {
390 my ($self, $writer, $species) = @_;
391 $writer->dataElement('TSeq_orgname', $species->binomial);
392 $writer->dataElement('TSeq_taxid', $species->ncbi_taxid)
393 if($species->ncbi_taxid);
396 sub DESTROY {
397 # primarily to close out a writer!
398 my ($self) = @_;
399 $self->close_writer;
400 undef $self;
404 __END__