2 # BioPerl module for Bio::SeqIO::chaos
4 # Chris Mungall <cjm@fruitfly.org>
6 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
12 Bio::SeqIO::chaos - chaos sequence input/output stream
16 #In general you will not want to use this module directly;
17 #use the chaosxml format via SeqIO
19 $outstream = Bio::SeqIO->new(-file => $filename,
20 -format => 'chaosxml');
22 while ( my $seq = $instream->next_seq() ) {
23 $outstream->write_seq($seq);
28 This is the guts of L<Bio::SeqIO::chaosxml> - please refer to the
29 documentation for this module
31 B<CURRENTLY WRITE ONLY>
33 ChaosXML is an XML mapping of the chado relational database; for more
34 information, see http://www.fruitfly.org/chaos-xml
36 chaos can be represented in various syntaxes - XML, S-Expressions or
37 indented text. You should see the relevant SeqIO file. You will
38 probably want to use L<Bio::SeqIO::chaosxml>, which is a wrapper to
41 =head2 USING STAG OBJECTS
43 B<non-standard bioperl stuff you don't necessarily need to know follows>
45 This module (in write mode) is an B<event producer> - it generates XML
46 events via the L<Data::Stag> module. If you only care about the final
47 end-product xml, use L<Bio::SeqIO::chaosxml>
49 You can treat the resulting chaos-xml stream as stag XML objects;
51 $outstream = Bio::SeqIO->new(-file => $filename, -format => 'chaos');
53 while ( my $seq = $instream->next_seq() ) {
54 $outstream->write_seq($seq);
56 my $chaos = $outstream->handler->stag;
57 # stag provides get/set methods for xml elements
58 # (these are chaos objects, not bioperl objects)
59 my @features = $chaos->get_feature;
60 my @feature_relationships = $chaos->get_feature_relationships;
61 # stag objects can be queried with functional-programming
63 my @features_in_range =
64 $chaos->where('feature',
66 my $featureloc = shift->get_featureloc;
67 $featureloc->strand == 1 &&
68 $featureloc->nbeg > 10000 &&
69 $featureloc->nend < 20000;
71 foreach my $feature (@features_in_range) {
72 my $featureloc = $feature->get_featureloc;
73 printf "%s [%d->%d on %s]\n",
75 $featureloc->sget_nbeg,
76 $featureloc->sget_end,
77 $featureloc->sget_srcfeature_id;
80 =head1 MODULES REQUIRED
84 Downloadable from CPAN; see also http://stag.sourceforge.net
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 Please direct usage questions or support issues to the mailing list:
101 I<bioperl-l@bioperl.org>
103 rather than to the module maintainer directly. Many experienced and
104 reponsive experts will be able look at the problem and quickly
105 address it. Please include a thorough description of the problem
106 with code and data examples if at all possible.
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution.
112 Bug reports can be submitted via the web:
114 https://github.com/bioperl/bioperl-live/issues
116 =head1 AUTHOR - Chris Mungall
118 Email cjm@fruitfly.org
122 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
126 # Let the code begin...
128 package Bio
::SeqIO
::chaos
;
131 use Bio
::SeqFeature
::Generic
;
133 use Bio
::Seq
::SeqFactory
;
134 use Bio
::Annotation
::Collection
;
135 use Bio
::Annotation
::Comment
;
136 use Bio
::Annotation
::Reference
;
137 use Bio
::Annotation
::DBLink
;
138 use Bio
::SeqFeature
::Tools
::TypeMapper
;
139 use Bio
::SeqFeature
::Tools
::FeatureNamer
;
140 use Bio
::SeqFeature
::Tools
::IDHandler
;
141 use Data
::Stag
qw(:all);
143 use base
qw(Bio::SeqIO);
145 our $TM = 'Bio::SeqFeature::Tools::TypeMapper';
146 our $FNAMER = 'Bio::SeqFeature::Tools::FeatureNamer';
147 our $IDH = 'Bio::SeqFeature::Tools::IDHandler';
150 my($self,@args) = @_;
152 $self->SUPER::_initialize
(@args);
153 if( ! defined $self->sequence_factory ) {
154 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
155 (-verbose
=> $self->verbose(),
156 -type
=> 'Bio::Seq::RichSeq'));
158 my $wclass = $self->default_handler_class;
159 $self->handler($wclass);
161 $self->handler->fh($self->_fh);
163 $self->{_end_of_data
} = 0;
164 $self->_type_by_id_h({});
166 my $ppt = localtime $t;
167 $self->handler->S("chaos");
168 $self->handler->ev(chaos_metadata
=>[
170 [chaos_flavour
=>'bioperl'],
171 [feature_unique_key
=>'feature_id'],
172 [equiv_chado_release
=>'chado_1_01'],
173 [export_unixtime
=>$t],
174 [export_localtime
=>$ppt],
175 [export_host
=>$ENV{HOST
}],
176 [export_user
=>$ENV{USER
}],
177 [export_perl5lib
=>$ENV{PERL5LIB
}],
178 [export_program
=>$0],
179 [export_module
=>'Bio::SeqIO::chaos'],
180 [export_module_cvs_id
=>'$Id$'],
188 $self->end_of_data();
189 $self->SUPER::DESTROY
();
194 return if $self->{_end_of_data
};
195 $self->{_end_of_data
} = 1;
196 $self->handler->E("chaos");
199 sub default_handler_class
{
200 return Data
::Stag
->makehandler;
203 =head2 context_namespace
205 Title : context_namespace
206 Usage : $obj->context_namespace($newval)
209 Returns : value of context_namespace (a scalar)
210 Args : on set, new value (a scalar or undef, optional)
212 IDs will be preceded with the context namespace
216 sub context_namespace
{
219 return $self->{'context_namespace'} = shift if @_;
220 return $self->{'context_namespace'};
227 Usage : $seq = $stream->next_seq()
228 Function: returns the next sequence in the stream
229 Returns : Bio::Seq object
235 my ($self,@args) = @_;
236 my $seq = $self->sequence_factory->create
238 # '-verbose' =>$self->verbose(),
241 # -annotation => $annotation,
242 # -features => \@features
249 $self->{_handler
} = shift if @_;
250 return $self->{_handler
};
257 Usage : $stream->write_seq($seq)
258 Function: writes the $seq object (must be seq) to the stream
259 Returns : 1 for success and 0 for error
266 my ($self,$seq) = @_;
268 if( !defined $seq ) {
269 $self->throw("Attempting to write with no seq!");
272 if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
273 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
276 # get a handler - must inherit from Data::Stag::BaseHandler;
277 my $w = $self->handler;
280 ### $w->S("chaos_block");
282 my $seq_chaos_feature_id;
284 # different seq objects have different version accessors -
286 my $version = $seq->can('seq_version') ?
$seq->seq_version : $seq->version;
288 my $accversion = $seq->accession_number;
290 $accversion .= ".$version";
294 $seq_chaos_feature_id = $accversion;
297 $seq_chaos_feature_id = $self->get_chaos_feature_id($seq);
298 $accversion = $seq_chaos_feature_id;
301 # All ids must have a namespace prefix
302 if ($seq_chaos_feature_id !~ /:/) {
303 $seq_chaos_feature_id = "GenericSeqDB:$seq_chaos_feature_id";
306 # if ($seq->accession_number eq 'unknown') {
307 # $seq_chaos_feature_id = $self->get_chaos_feature_id('contig', $seq);
311 if ($seq->desc =~ /haplotype(.*)/i) {
312 # yikes, no consistent way to specify haplotype in gb
314 $haplotype =~ s/\s+/_/g;
315 $haplotype =~ s/\W+//g;
320 if (my $spec = $seq->species) {
321 my ($species, $genus, @class) = $spec->classification();
322 $OS = "$genus $species";
323 if (my $ssp = $spec->sub_species) {
326 $self->genus_species($OS);
327 if( $spec->common_name ) {
328 my $common = $spec->common_name;
329 # genbank parser sets species->common_name to
330 # be "Genus Species (common name)" which is wrong;
331 # we will correct for this; if common_name is set
332 # correctly then carry on
333 if ($common =~ /\((.*)\)/) {
336 $OS .= " (".$common.")";
340 $self->organismstr($OS);
343 # genus_species is part of uniquename - add haplotype
344 # to make it genuinely unique
345 $self->genus_species($self->genus_species .= " $haplotype");
348 my $uname = $self->make_uniquename($self->genus_species, $accversion);
350 # data structure representing the core sequence for this record
352 Data
::Stag
->new(feature
=>[
353 [feature_id
=>$seq_chaos_feature_id],
354 [dbxrefstr
=>'SEQDB:'.$accversion],
355 [name
=>$seq->display_name],
356 [uniquename
=>$uname],
357 [residues
=>$seq->seq],
363 $seqnode->set_type('databank_entry');
366 $prop{$_} = $seq->$_() if $seq->can($_);
367 } qw(desc keywords division molecule is_circular);
368 $prop{dates
} = join("; ", $seq->get_dates) if $seq->can("get_dates");
370 local($^W
) = 0; # suppressing warnings about uninitialized fields.
374 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
379 $seqnode->add_featureprop([[type
=>'haplotype'],[value
=>$haplotype]])
381 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
382 $seqnode->add_featureprop([[type
=>'comment'],[value
=>$comment->text]]);
385 $seqnode->set_organismstr($OS);
388 my @sfs = $seq->get_SeqFeatures;
390 # genbank usually includes a 'source' feature - we just
391 # migrate the data from this to the actual source feature
392 my @sources = grep {$_->primary_tag eq 'source'} @sfs;
393 @sfs = grep {$_->primary_tag ne 'source'} @sfs;
394 $self->throw(">1 source types") if @sources > 1;
395 my $source = shift @sources;
398 my $tempw = Data
::Stag
->makehandler;
399 $self->write_sf($source, $seq_chaos_feature_id, $tempw);
400 my $snode = $tempw->stag;
401 $seqnode->add($_->name, $_->data)
402 foreach ($snode->get_featureprop,
403 $snode->get_feature_dbxref);
408 # throw the writer an event
411 $seqnode = undef; # free memory
413 # make events for all the features within the record
414 foreach my $sf ( @sfs ) {
415 $FNAMER->name_feature($sf);
416 $FNAMER->name_contained_features($sf);
417 $self->write_sf($sf, $seq_chaos_feature_id);
421 ### $w->E("chaos_block");
429 return $self->{'organismstr'} = shift if @_;
430 return $self->{'organismstr'};
437 return $self->{'genus_species'} = shift if @_;
438 return $self->{'genus_species'};
445 $self->{_type_by_id_h
} = shift if @_;
446 return $self->{_type_by_id_h
};
452 # writes a seq feature
458 my $seq_chaos_feature_id = shift;
459 my $w = shift || $self->handler;
463 lc($_)=>[$sf->each_tag_value($_)]
466 my $loc = $sf->location;
467 my $name = $FNAMER->generate_feature_name($sf);
468 my $type = $sf->primary_tag;
470 # The CDS (eg in a genbank feature) implicitly represents
472 $type =~ s/CDS/polypeptide/;
474 my @subsfs = $sf->sub_SeqFeature;
476 my $sid = $loc->is_remote ?
$loc->seq_id : $seq_chaos_feature_id;
478 my $CREATE_SPLIT_SFS = 0;
480 if($CREATE_SPLIT_SFS &&
481 $loc->isa("Bio::Location::SplitLocationI") ) {
482 # turn splitlocs into subfeatures
487 Bio
::SeqFeature
::Generic
->new(
492 -primary
=>$self->subpartof($type),
495 $ssf->location->is_remote(1);
496 $ssf->location->seq_id($_->seq_id);
499 } $loc->each_Location);
501 elsif( $loc->isa("Bio::Location::RemoteLocationI") ) {
502 # turn splitlocs into subfeatures
506 Bio
::SeqFeature
::Generic
->new(
507 # -name=>$name.'.'.$n++,
511 -primary
=>$self->subpartof($type),
513 } $loc->each_Location);
516 my ($beg, $end, $strand) = $self->bp2ib($loc);
519 print Dumper
$sf, $loc;
520 $self->throw("($beg, $end, $strand) - no strand\n");
527 [srcfeature_id
=>$sid],
534 my $feature_id = $self->get_chaos_feature_id($sf);
536 delete $props{id
} if $props{id
};
537 # do something with genbank stuff
538 my $pid = $props{'protein_id'};
539 my $tn = $props{'translation'};
540 my @xrefs = @
{$props{'db_xref'} || []};
542 push(@xrefs, "protein:$pid->[0]");
545 my $org = $props{organism
} ?
$props{organism
}->[0] : undef;
546 if (!$org && $self->organismstr) {
547 $org = $self->organismstr;
549 my $uname = $name ?
$name.'/'.$feature_id : $feature_id;
550 if ($self->genus_species && $name) {
551 $uname = $self->make_uniquename($self->genus_species, $name);
554 $self->throw("cannot make uniquename for $feature_id $name");
556 $self->_type_by_id_h->{$feature_id} = $type;
559 [feature_id
=>$feature_id],
560 $name ?
([name
=>$name]) : (),
561 [uniquename
=>$uname],
563 $tn ?
([residues
=>$tn->[0]],
564 [seqlen
=>length($tn->[0])],
565 #####[md5checksum=>md5checksum($tn->[0])],
567 $org ?
([organismstr
=>$org]) : (),
578 map { [featureprop
=>[[type
=>$k],[value
=>$_],[rank
=>$rank++]]] } @
{$props{$k}}
585 # strand is always determined by FIRST feature listed
586 # (see genbank entry for trans-spliced mod(mdg4) AE003734)
587 my $strand = $subsfs[0];
589 # almost all the time, all features are on same strand
590 my @sfs_on_main_strand = grep {$_->strand == $strand} @subsfs;
591 my @sfs_on_other_strand = grep {$_->strand != $strand} @subsfs;
593 sort_by_strand
($strand, \
@sfs_on_main_strand);
594 sort_by_strand
(0-$strand, \
@sfs_on_other_strand);
595 @subsfs = (@sfs_on_main_strand, @sfs_on_other_strand);
597 foreach my $ssf (@subsfs) {
598 my $ssfid = $self->write_sf($ssf, $sid);
599 #my $rtype = 'part_of';
601 $TM->get_relationship_type_by_parent_child($sf,$ssf);
602 if ($ssf->primary_tag eq 'CDS') {
603 $rtype = 'derives_from';
605 $w->ev(feature_relationship
=>[
606 [subject_id
=>$ssfid],
607 [object_id
=>$feature_id],
615 # parents not stored as bioperl containment hierarchy
616 my @parent_ids = @
{$props{parent
} || []};
617 foreach my $parent_id (@parent_ids) {
619 $self->_type_by_id_h->{$parent_id} || 'unknown';
621 $TM->get_relationship_type_by_parent_child($ptype,$type);
622 $w->ev(feature_relationship
=>[
623 [subject_id
=>$feature_id],
624 [object_id
=>$parent_id],
635 my $strand = shift || 1;
637 @
$sfs = sort { ($a->start <=> $b->start) * $strand } @
$sfs;
641 sub make_uniquename
{
657 sub get_chaos_feature_id
{
662 if ($ob->isa("Bio::SeqI")) {
663 $id = $ob->accession_number . '.' . ($ob->can('seq_version') ?
$ob->seq_version : $ob->version);
666 $ob->isa("Bio::SeqFeatureI") || $self->throw("$ob must be either SeqI or SeqFeatureI");
668 if ($ob->primary_id) {
669 $id = $ob->primary_id;
673 $id = $IDH->generate_unique_persistent_id($ob);
677 $id = "$ob"; # last resort - use memory pointer ref
678 # will not be persistent, but will be unique
683 if ($ob->isa("Bio::SeqFeatureI")) {
684 $id = $IDH->generate_unique_persistent_id($ob);
687 $self->throw("Cannot generate a unique persistent ID for a Seq without either primary_id or accession");
691 $id = $self->context_namespace ?
$self->context_namespace . ":" . $id : $id;
697 # interbase and directional semantics
702 ref($loc) eq "ARRAY" ?
(@
$loc) : ($loc->start, $loc->end, $loc->strand);
707 return ($s, $e, $str || 1);
712 my $type = 'partof_'.shift;
713 $type =~ s/partof_CDS/CDS_exon/;
714 $type =~ s/partof_protein/CDS_exon/;
715 $type =~ s/partof_polypeptide/CDS_exon/;
716 $type =~ s/partof_\w*RNA/exon/;