1 # BioPerl module for Bio::SeqIO::seqxml
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Dave Messina <dmessina@cpan.org>
7 # Copyright Dave Messina
9 # You may distribute this module under the same terms as perl itself
11 # December 2009 - initial version
12 # July 2 2010 - updated for SeqXML v0.2
13 # November 11 2010 - added schemaLocation
14 # December 9 2010 - SeqXML v0.3
17 # POD documentation - main docs before the code
21 Bio::SeqIO::seqxml - SeqXML sequence input/output stream
25 # Do not use this module directly. Use it via the Bio::SeqIO class.
30 my $seqio = Bio::SeqIO->new(-format => 'seqxml',
31 -file => 'my_seqs.xml');
33 while (my $seq_object = $seqio->next_seq) {
35 $seq_object->display_id,
36 $seq_object->description,
43 # Note that you can (optionally) specify the source
44 # (usually a database) and source version.
45 my $seqwriter = Bio::SeqIO->new(-format => 'seqxml',
46 -file => ">outfile.xml",
48 -sourceVersion => '56');
49 $seqwriter->write_seq($seq_object);
51 # once you've written all of your seqs, you may want to do
52 # an explicit close to get the closing </seqXML> tag
57 This object can transform Bio::Seq objects to and from SeqXML format.
58 For more information on the SeqXML standard, visit L<http://www.seqxml.org>.
60 In short, SeqXML is a lightweight sequence format that takes advantage
61 of the validation capabilities of XML while not overburdening you
62 with a strict and complicated schema.
64 This module is based in part (particularly the XML-parsing part) on
65 Bio::TreeIO::phyloxml by Mira Han.
71 User feedback is an integral part of the evolution of this and other
72 Bioperl modules. Send your comments and suggestions preferably to one
73 of the Bioperl mailing lists. Your participation is much appreciated.
75 bioperl-l@bioperl.org - General discussion
76 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
80 Please direct usage questions or support issues to the mailing list:
82 I<bioperl-l@bioperl.org>
84 rather than to the module maintainer directly. Many experienced and
85 responsive experts will be able to look at the problem and quickly
86 address it. Please include a thorough description of the problem
87 with code and data examples if at all possible.
91 Report bugs to the Bioperl bug tracking system to help us keep track
92 of the bugs and their resolution. Bug reports can be submitted via the
95 https://github.com/bioperl/bioperl-live/issues
97 =head1 AUTHORS - Dave Messina
99 Email: I<dmessina@cpan.org>
106 The rest of the documentation details each of the object
107 methods. Internal methods are usually preceded with a _
111 # Let the code begin...
113 package Bio
::SeqIO
::seqxml
;
118 use Bio
::Seq
::SeqFactory
;
120 use Bio
::Annotation
::DBLink
;
121 use Bio
::Annotation
::SimpleValue
;
123 use XML
::LibXML
::Reader
;
126 use base
qw(Bio::SeqIO);
# Default values for the root <seqXML> element written by this module.
# XMLNS_XSI has no accessor in this module, so it can only be changed here.
use constant {
    SEQXML_VERSION  => 0.3,
    SCHEMA_LOCATION => 'http://www.seqxml.org/0.3/seqxml.xsd',
    XMLNS_XSI       => 'http://www.w3.org/2001/XMLSchema-instance',
};
137 Usage : $self->_initialize(@args)
138 Function: constructor (for internal use only).
140 Besides the usual SeqIO arguments (-file, -fh, etc.),
141 Bio::SeqIO::seqxml accepts three arguments which are used
142 when writing out a seqxml file. They are all optional.
144 Args : -source => source string (usually a database name)
145 -sourceVersion => source version. The version number of the source
146 -seqXMLversion => the version of seqXML that will be used
147 Throws : Exception if XML::LibXML::Reader or XML::Writer
153 my ( $self, @args ) = @_;
155 $self->SUPER::_initialize
(@args);
156 if ( !defined $self->sequence_factory ) {
157 $self->sequence_factory(
158 Bio
::Seq
::SeqFactory
->new(
159 -verbose
=> $self->verbose(),
165 # holds version and source data
166 $self->{'_seqxml_metadata'} = {};
168 # load any passed parameters
170 if ($params{'-sourceVersion'}) {
171 $self->sourceVersion($params{'-sourceVersion'});
173 if ($params{'-source'}) {
174 $self->source($params{'-source'});
176 if ($params{'-seqXMLversion'}) {
177 $self->seqXMLversion($params{'-seqXMLversion'});
180 if ( $self->mode eq 'r' ) {
182 $self->{'_reader'} = XML
::LibXML
::Reader
->new(
187 if ( !$self->{'_reader'} ) {
188 $self->throw("XML::LibXML::Reader not initialized");
191 # holds data temporarily during parsing
192 $self->{'_current_entry_data'} = {};
194 $self->_initialize_seqxml_node_methods();
197 $self->parseHeader();
201 elsif ( $self->mode eq 'w' ) {
203 $self->{'_writer'} = XML
::Writer
->new(
204 OUTPUT
=> $self->_fh,
208 if ( !$self->{'_writer'} ) {
209 $self->throw("XML::Writer not initialized");
212 # write SeqXML header
213 $self->{'_writer'}->xmlDecl("UTF-8");
214 if ($self->source || $self->sourceVersion) {
215 $self->{'_writer'}->startTag(
217 'seqXMLversion' => $self->seqXMLversion(SEQXML_VERSION
),
218 'xmlns:xsi' => XMLNS_XSI
,
219 'xsi:noNamespaceSchemaLocation' => $self->schemaLocation(SCHEMA_LOCATION
),
220 'source' => $self->source,
221 'sourceVersion' => $self->sourceVersion,
225 $self->{'_writer'}->startTag(
227 'seqXMLversion' => $self->seqXMLversion(SEQXML_VERSION
),
228 'xmlns:xsi' => XMLNS_XSI
,
229 'xsi:noNamespaceSchemaLocation' => $self->schemaLocation(SCHEMA_LOCATION
),
240 Usage : $seq = $stream->next_seq()
241 Function: returns the next sequence in the stream
242 Returns : L<Bio::Seq> object, or nothing if no more available
249 my $reader = $self->{'_reader'};
252 while ( $reader->read ) {
254 # we're done if we hit </entry>
255 if ( $reader->nodeType == XML_READER_TYPE_END_ELEMENT
) {
256 if ( $reader->name eq 'entry' ) {
257 $entry = $self->end_element_entry();
261 $self->processXMLnode;
270 Usage : $stream->write_seq(@seq)
271 Function: Writes the $seq object into the stream
272 Returns : 1 for success and 0 for error
273 Args : Array of 1 or more L<Bio::PrimarySeqI> objects
278 my ( $self, @seqs ) = @_;
279 my $writer = $self->{'_writer'};
281 foreach my $seqobj (@seqs) {
282 $self->throw("Trying to write with no seq!") unless defined $seqobj;
284 if ( !ref $seqobj || !$seqobj->isa('Bio::SeqI') ) {
286 " $seqobj is not a SeqI compliant module. Attempting to dump, but may fail!"
290 # opening tag, ID, and source (if present -- it's optional)
291 my $id = $seqobj->display_id;
292 my ($source_obj) = $seqobj->get_Annotations('source');
293 if (defined $source_obj && defined $id) {
294 $writer->startTag( 'entry', 'id' => $id, 'source' => $source_obj->value );
296 elsif (defined $id) {
297 $writer->startTag( 'entry', 'id' => $id );
300 $self->throw(" $seqobj has no ID!");
303 # species and NCBI taxID
304 if ( $seqobj->species ) {
305 my $name = $seqobj->species->node_name;
306 my $taxid = $seqobj->species->ncbi_taxid;
307 if ( $name && ( $taxid =~ /[0-9]+/ ) ) {
311 'ncbiTaxID' => $taxid
315 $self->throw("$seqobj has malformed species data");
320 if ( $seqobj->desc ) {
321 $writer->dataElement( 'description', $seqobj->desc );
325 # - throws if seq is empty or missing because having a sequence
326 # is a SeqXML requirement
327 if ( $seqobj->seq ) {
328 # check that there's actually sequence in there
329 unless ( length($seqobj->seq) > 0 ) {
330 $self->throw("sequence entry $id lacks a sequence!");
332 my $alphabet = $seqobj->alphabet;
338 unless ( exists( $seqtype{$alphabet} ) ) {
339 $self->throw("invalid sequence alphabet $alphabet!");
341 $writer->dataElement( $seqtype{$alphabet}, $seqobj->seq );
344 $self->throw("sequence entry $id lacks a sequence!");
347 # Database crossreferences
348 my @dblinks = $seqobj->get_Annotations('dblink');
349 foreach my $dblink (@dblinks) {
350 unless ( $dblink->database && $dblink->primary_id ) {
351 $self->throw("dblink $dblink is malformed");
353 if (defined($dblink->type)) {
356 'type' => $dblink->type,
357 'source' => $dblink->database,
358 'id' => $dblink->primary_id,
364 'source' => $dblink->database,
365 'id' => $dblink->primary_id,
371 my @annotations = $seqobj->get_Annotations();
372 foreach my $annot_obj (@annotations) {
373 next if ( $annot_obj->tagname eq 'dblink' );
374 next if ( $annot_obj->tagname eq 'source' ); # handled above
376 # SeqXML doesn't support references
377 next if ( $annot_obj->tagname eq 'reference' );
379 unless ( $annot_obj->tagname ) {
380 $self->throw("property $annot_obj is missing a tagname");
382 if ( $annot_obj->value ) {
385 'name' => $annot_obj->tagname,
386 'value' => $annot_obj->value,
392 'name' => $annot_obj->tagname,
399 $writer->endTag('entry');
401 # make sure it gets written to the file
402 $self->flush if $self->_flush_on_write && defined $self->_fh;
407 =head2 _initialize_seqxml_node_methods
409 Title : _initialize_seqxml_node_methods
410 Usage : $self->_initialize_seqxml_node_methods
411 Function: sets up code ref mapping of each seqXML node type
412 to a method for processing that node type
# Builds the two dispatch tables used while parsing: one keyed by element
# name for opening tags, one for closing tags. Each value is a code reference
# to the handler for that element. The tables are stored on the object under
# '_start_elements' and '_end_elements'.
#
# Args    : none
# Returns : the reference stored under '_end_elements' (value of the last
#           assignment); the call site in this file ignores it
sub _initialize_seqxml_node_methods {
    my ($self) = @_;

    # handlers looked up by name when an opening tag is read
    my %start_elements = (
        'seqXML'      => \&element_seqXML,
        'entry'       => \&element_entry,
        'species'     => \&element_species,
        'description' => \&element_description,
        'RNAseq'      => \&element_RNAseq,
        'DNAseq'      => \&element_DNAseq,
        'AAseq'       => \&element_AAseq,
        'DBRef'       => \&element_DBRef,
        'property'    => \&element_property,
    );
    $self->{'_start_elements'} = \%start_elements;

    # handlers looked up by name when a closing tag is read; elements that
    # need no work on close share end_element_default
    my %end_elements = (
        'seqXML'      => \&end_element_default,
        'entry'       => \&end_element_entry,
        'species'     => \&end_element_default,
        'description' => \&end_element_default,
        'RNAseq'      => \&end_element_RNAseq,
        'DNAseq'      => \&end_element_DNAseq,
        'AAseq'       => \&end_element_AAseq,
        'DBRef'       => \&end_element_default,
        'property'    => \&end_element_default,
    );
    $self->{'_end_elements'} = \%end_elements;
}
449 =head2 schemaLocation
451 Title : schemaLocation
452 Usage : $self->schemaLocation
453 Function: gets/sets the schema location in the <seqXML> header
454 Returns : the schema location string
455 Args : To set the schemaLocation, call with a schemaLocation as the argument.
460 my ( $self, $value ) = @_;
461 my $metadata = $self->{'_seqxml_metadata'};
463 # set if a value is supplied
465 $metadata->{'schemaLocation'} = $value;
468 return $metadata->{'schemaLocation'};
474 Usage : $self->source
475 Function: gets/sets the data source in the <seqXML> header
476 Returns : the data source string
477 Args : To set the source, call with a source string as the argument.
482 my ( $self, $value ) = @_;
483 my $metadata = $self->{'_seqxml_metadata'};
485 # set if a value is supplied
487 $metadata->{'source'} = $value;
490 return $metadata->{'source'};
495 Title : sourceVersion
496 Usage : $self->sourceVersion
497 Function: gets/sets the data source version in the <seqXML> header
498 Returns : the data source version string
499 Args : To set the source version, call with a source version string
505 my ( $self, $value ) = @_;
506 my $metadata = $self->{'_seqxml_metadata'};
508 # set if a value is supplied
510 $metadata->{'sourceVersion'} = $value;
513 return $metadata->{'sourceVersion'};
518 Title : seqXMLversion
519 Usage : $self->seqXMLversion
520 Function: gets/sets the seqXML version in the <seqXML> header
521 Returns : the seqXML version string.
522 Args : To set the seqXML version, call with a seqXML version string
528 my ( $self, $value ) = @_;
529 my $metadata = $self->{'_seqxml_metadata'};
531 # set if a value is supplied
533 $metadata->{'seqXMLversion'} = $value;
536 return $metadata->{'seqXMLversion'};
539 =head1 Methods for parsing the XML document
543 =head2 processXMLnode
545 Title : processXMLnode
546 Usage : $seqio->processXMLnode
547 Function: reads the XML node and processes according to the node type
550 Throws : Exception on unexpected XML node type, warnings on unexpected
557 my $reader = $self->{'_reader'};
558 my $nodetype = $reader->nodeType;
560 if ( $nodetype == XML_READER_TYPE_ELEMENT
) {
561 $self->{'_current_element_name'} = $reader->name;
563 if ( exists $self->{'_start_elements'}->{ $reader->name } ) {
564 my $method = $self->{'_start_elements'}->{ $reader->name };
568 my $name = $reader->name;
569 $self->warn("unexpected start element encountered: $name");
572 elsif ( $nodetype == XML_READER_TYPE_TEXT
) {
574 # store key-value pair of element name and the corresponding text
575 my $name = $self->{'_current_element_name'};
576 $self->{'_current_entry_data'}->{$name} = $reader->value;
579 elsif ( $nodetype == XML_READER_TYPE_END_ELEMENT
) {
580 if ( exists $self->{'_end_elements'}->{ $reader->name } ) {
581 my $method = $self->{'_end_elements'}->{ $reader->name };
585 my $name = $reader->name;
586 $self->warn("unexpected end element encountered: $name");
588 $self->{'_current_element_name'} = {}; # empty current element name
592 "unexpected node type " . $nodetype,
593 " encountered (name: ",
598 if ( $self->debug ) {
599 printf "%d %d %s %d\n",
601 $reader->depth, $reader->nodeType,
602 $reader->name, $reader->isEmptyElement
607 =head2 processAttribute
609 Title : processAttribute
610 Usage : $seqio->processAttribute(\%hash_for_attribute);
611 Function: reads the attributes of the current element into a hash
613 Args : hash reference where the attributes will be stored.
# Copies every attribute of the reader's current element into the supplied
# hash, keyed by attribute name, then moves the reader back onto the element.
#
# Args    : $data - hash reference that receives name => value pairs
# Returns : nothing (every call in this file is made in void context)
sub processAttribute {
    my ( $self, $data ) = @_;
    my $reader = $self->{'_reader'};

    # The moveTo*Attribute calls return 1 on success, 0 when there is no
    # (further) attribute and -1 on error. Only 1 means the cursor is on an
    # attribute, so a plain truth test would treat an error as success.
    my $status = $reader->moveToFirstAttribute;
    return unless defined $status && $status == 1;

    while ( defined $status && $status == 1 ) {
        $data->{ $reader->name() } = $reader->value;
        $status = $reader->moveToNextAttribute;
    }

    # leave the cursor on the element that owns the attributes
    $reader->moveToElement;
    return;
}
634 Usage : $self->parseHeader();
635 Function: reads the opening <seqXML> block and grabs the metadata from it,
636 namely the source, sourceVersion, and seqXMLversion.
639 Throws : Exception if it hits an <entry> tag, because that means it's
640 missed the <seqXML> tag and read too far into the file.
646 my $reader = $self->{'_reader'};
648 while($reader->read) {
650 # just read the header
651 if ( $reader->nodeType == XML_READER_TYPE_ELEMENT
) {
652 if ( $reader->name eq 'seqXML' ) {
653 $self->element_seqXML();
656 elsif ( $reader->name eq 'entry' ) {
657 my $name = $reader->name;
658 $self->throw("Missed the opening <seqXML> tag. Got $name instead.");
664 =head2 element_seqXML
666 Title : element_seqXML
667 Usage : $self->element_seqXML
668 Function: processes the opening <seqXML> node
676 my $reader = $self->{'_reader'};
678 # reset for every new <seqXML> block
679 $self->{'_seqxml_metadata'} = {};
681 if ( $reader->hasAttributes() ) {
682 $self->processAttribute( $self->{'_seqxml_metadata'} );
685 $self->throw("no SeqXML metadata!");
691 Title : element_entry
692 Usage : $self->element_entry
693 Function: processes a sequence <entry> node
696 Throws : Exception if sequence ID is not present in <entry> element
702 my $reader = $self->{'_reader'};
704 if ( $reader->hasAttributes() ) {
705 $self->processAttribute( $self->{'_current_entry_data'} );
708 $self->throw("no sequence ID!");
712 =head2 element_species
714 Title : element_species
715 Usage : $self->element_species
716 Function: processes a <species> node, creating a Bio::Species object
719 Throws : Exception if <species> tag exists but is empty,
720 or if the attributes 'name' or 'ncbiTaxID' are undefined
# Handles an opening <species> tag: reads its attributes, builds a
# Bio::Species object from 'ncbiTaxID' and 'name', and stores it in the
# current entry's data under 'species'.
#
# Args    : none
# Throws  : if the element has no attributes, or if either the 'name' or the
#           'ncbiTaxID' attribute is undefined
sub element_species {
    my ($self) = @_;
    my $reader = $self->{'_reader'};
    my $data   = $self->{'_current_entry_data'};

    my $species_data = {};
    my $species_obj;

    if ( $reader->hasAttributes() ) {
        $self->processAttribute($species_data);
    }
    else {
        $self->throw("no species information!");
    }

    if (   defined $species_data->{'name'}
        && defined $species_data->{'ncbiTaxID'} )
    {
        $species_obj =
          Bio::Species->new( -ncbi_taxid => $species_data->{'ncbiTaxID'}, );
        $species_obj->node_name( $species_data->{'name'} );
        $data->{'species'} = $species_obj;
    }
    else {
        $self->throw("<species> attributes name and ncbiTaxID are undefined");
    }
}
753 =head2 element_description
755 Title : element_description
756 Usage : $self->element_description
757 Function: processes a sequence <description> node;
758 a no-op -- description text is read by
# Handles an opening <description> tag. Deliberately does nothing: the
# description text arrives as a separate text node and is stored under the
# element's name by the text-node handling in the node processor.
#
# Args    : none
# Returns : nothing
sub element_description {
    my ($self) = @_;
    return;
}
769 =head2 element_RNAseq
771 Title : element_RNAseq
772 Usage : $self->element_RNAseq
773 Function: processes a sequence <RNAseq> node
781 my $reader = $self->{'_reader'};
783 my $data = $self->{'_current_entry_data'};
784 $data->{'alphabet'} = 'rna';
785 $data->{'sequence'} = $data->{'RNAseq'};
789 =head2 element_DNAseq
791 Title : element_DNAseq
792 Usage : $self->element_DNAseq
793 Function: processes a sequence <DNAseq> node
801 my $reader = $self->{'_reader'};
803 my $data = $self->{'_current_entry_data'};
804 $data->{'alphabet'} = 'dna';
805 $data->{'sequence'} = $data->{'DNAseq'};
811 Title : element_AAseq
812 Usage : $self->element_AAseq
813 Function: processes a sequence <AAseq> node
821 my $reader = $self->{'_reader'};
823 my $data = $self->{'_current_entry_data'};
824 $data->{'alphabet'} = 'protein';
825 $data->{'sequence'} = $data->{'AAseq'};
831 Title : element_DBRef
832 Usage : $self->element_DBRef
833 Function: processes a sequence <DBRef> node,
834 creating a Bio::Annotation::DBLink object
842 my $reader = $self->{'_reader'};
843 my $data = $self->{'_current_entry_data'};
848 if ( $reader->hasAttributes() ) {
849 $self->processAttribute($DBRef);
852 $self->throw("no DBRef data!");
855 if ( defined $DBRef->{'source'}
856 && defined $DBRef->{'id'}
857 && defined $DBRef->{'type'})
859 $annotation_obj = Bio
::Annotation
::DBLink
->new(
860 -primary_id
=> $DBRef->{'id'},
861 -database
=> $DBRef->{'source'},
862 -type
=> $DBRef->{'type'},
863 -tagname
=> 'dblink',
865 push @
{ $data->{'DBRefs'} }, $annotation_obj;
868 $self->throw("malformed DBRef data!");
872 =head2 element_property
874 Title : element_property
875 Usage : $self->element_property
876 Function: processes a sequence <property> node, creating a
877 Bio::Annotation::SimpleValue object
# Handles an opening <property> tag: reads its attributes and turns them into
# a Bio::Annotation::SimpleValue whose tagname is the 'name' attribute and
# whose value, when present, is the 'value' attribute. The annotation is
# appended to the current entry's 'properties' list.
#
# Args    : none
# Throws  : if the element has no attributes, or has no 'name' attribute
sub element_property {
    my ($self) = @_;
    my $reader = $self->{'_reader'};
    my $data   = $self->{'_current_entry_data'};

    my $property = {};
    my $annotation_obj;

    if ( $reader->hasAttributes() ) {
        $self->processAttribute($property);
    }
    else {
        $self->throw("no property data!");
    }

    if ( defined $property->{'name'} ) {
        $annotation_obj =
          Bio::Annotation::SimpleValue->new( -tagname => $property->{'name'} );

        # the value is optional; a property may be a bare flag
        if ( defined $property->{'value'} ) {
            $annotation_obj->value( $property->{'value'} );
        }

        push @{ $data->{'properties'} }, $annotation_obj;
    }
    else {
        $self->throw("malformatted property!");
    }
}
913 =head2 end_element_RNAseq
915 Title : end_element_RNAseq
916 Usage : $self->end_element_RNAseq
917 Function: processes the closing </RNAseq> node
# Handles a closing </RNAseq> tag. By now the element's text has been stored
# in the current entry data under the key 'RNAseq'; this copies it to
# 'sequence' and records the alphabet as 'rna'.
#
# Args    : none
sub end_element_RNAseq {
    my ($self) = @_;

    my $data = $self->{'_current_entry_data'};
    $data->{'alphabet'} = 'rna';
    $data->{'sequence'} = $data->{'RNAseq'};
}
932 =head2 end_element_DNAseq
934 Title : end_element_DNAseq
935 Usage : $self->end_element_DNAseq
936 Function: processes the closing </DNAseq> node
# Handles a closing </DNAseq> tag. By now the element's text has been stored
# in the current entry data under the key 'DNAseq'; this copies it to
# 'sequence' and records the alphabet as 'dna'.
#
# Args    : none
sub end_element_DNAseq {
    my ($self) = @_;

    my $data = $self->{'_current_entry_data'};
    $data->{'alphabet'} = 'dna';
    $data->{'sequence'} = $data->{'DNAseq'};
}
952 =head2 end_element_AAseq
954 Title : end_element_AAseq
955 Usage : $self->end_element_AAseq
956 Function: processes the closing </AAseq> node
# Handles a closing </AAseq> tag. By now the element's text has been stored
# in the current entry data under the key 'AAseq'; this copies it to
# 'sequence' and records the alphabet as 'protein'.
#
# Args    : none
sub end_element_AAseq {
    my ($self) = @_;

    my $data = $self->{'_current_entry_data'};
    $data->{'alphabet'} = 'protein';
    $data->{'sequence'} = $data->{'AAseq'};
}
972 =head2 end_element_entry
974 Title : end_element_entry
975 Usage : $self->end_element_entry
976 Function: processes the closing </entry> node, creating the Seq object
977 Returns : a Bio::Seq object
979 Throws : Exception if sequence, sequence ID, or alphabet are missing
# Handles a closing </entry> tag: turns the data gathered for the current
# entry into a sequence object via the sequence factory, attaches the
# optional description, species and annotations, and clears the temporary
# store ready for the next entry.
#
# Args    : none
# Returns : the sequence object created by the sequence factory
# Throws  : if the entry has no sequence, no ID, or no alphabet
sub end_element_entry {
    my ($self) = @_;

    my $data = $self->{'_current_entry_data'};

    # make sure we've got at least a seq, an ID, and an alphabet
    unless ( $data->{'sequence'} && length( $data->{'sequence'} ) > 0 ) {
        $self->throw("this entry lacks a sequence");
    }

    # Test for presence rather than truth: the writer accepts any defined ID,
    # so an ID of "0" must survive a round trip.
    unless ( defined $data->{'id'} && length $data->{'id'} ) {
        $self->throw("this entry lacks an id");
    }
    unless ( $data->{'alphabet'} ) {
        $self->throw("this entry lacks an alphabet");
    }

    # create new sequence object with minimum necessary parameters
    my $seq_obj = $self->sequence_factory->create(
        -seq        => $data->{'sequence'},
        -alphabet   => $data->{'alphabet'},
        -id         => $data->{'id'},
        -primary_id => $data->{'id'},
    );

    # add additional parameters if available
    if ( $data->{'description'} ) {
        $seq_obj->desc( $data->{'description'} );
    }
    if ( $data->{'species'} ) {
        $seq_obj->species( $data->{'species'} );
    }

    # database cross-references first, then properties
    foreach my $list_name ( 'DBRefs', 'properties' ) {
        next unless $data->{$list_name};
        foreach my $annotation_obj ( @{ $data->{$list_name} } ) {
            $seq_obj->add_Annotation($annotation_obj);
        }
    }

    # the per-entry 'source' attribute travels as a SimpleValue annotation
    if ( $data->{'source'} ) {
        my $annotation_obj = Bio::Annotation::SimpleValue->new(
            '-tagname' => 'source',
            '-value'   => $data->{'source'},
        );
        $seq_obj->add_Annotation($annotation_obj);
    }

    # empty the temporary data store
    $self->{'_current_entry_data'} = {};

    return $seq_obj;
}
1039 =head2 end_element_default
1041 Title : end_element_default
1042 Usage : $self->end_element_default
1043 Function: processes all other closing tags;
# Shared handler for closing tags that need no work on close (registered for
# seqXML, species, description, DBRef and property). Deliberately a no-op.
#
# Args    : none
# Returns : nothing
sub end_element_default {
    my ($self) = @_;
    return;
}
1057 Usage : called automatically by Perl just before object
1059 Function: performs a write flush
1067 $self->flush if $self->_flush_on_write && defined $self->_fh;
1068 $self->SUPER::DESTROY
;
1074 Usage : $seqio_obj->close().
1075 Function: writes closing </seqXML> tag.
1077 close() will be called automatically by Perl when your
1078 program exits, but if you want to use the seqXML file
1079 you've written before then, you'll need to do an explicit
1080 close first to get the final </seqXML> tag.
1088 if ( $self->mode eq 'w' && $self->{'_writer'}->within_element('seqXML') ) {
1089 $self->{'_writer'}->endTag("seqXML");
1090 $self->{'_writer'}->end();
1092 $self->SUPER::close();