1 # Let the code begin...
3 package Bio
::AlignIO
::Handler
::GenericAlignHandler
;
8 use Bio
::Annotation
::Collection
;
9 use Bio
::Annotation
::Comment
;
10 use Bio
::Annotation
::SimpleValue
;
11 use Bio
::Annotation
::Target
;
12 use Bio
::Annotation
::DBLink
;
13 use Bio
::Annotation
::Reference
;
17 use base
qw(Bio::Root::Root Bio::HandlerBaseI);
19 # only stockholm is defined for now...
21 # stockholm has sequence and alignment specific annotation; this
23 'CONSENSUS_META' => \
&_generic_consensus_meta
,
24 'SEQUENCE' => \
&_generic_metaseq
,
25 'NAMED_META' => \
&_generic_metaseq
,
26 'ACCESSION' => \
&_generic_store
,
27 'ALPHABET' => \
&_generic_store
,
28 'ID' => \
&_generic_store
,
29 'DESCRIPTION' => \
&_generic_store
,
30 'REFERENCE' => \
&_generic_reference
,
31 'DBLINK' => \
&_stockholm_target
,
32 'DATABASE_COMMENT' => \
&_generic_comment
,
33 'ALIGNMENT_COMMENT' => \
&_generic_comment
,
34 '_DEFAULT_' => \
&_generic_simplevalue
39 my ($class, @args) = @_;
40 my $self = $class->SUPER::new
(@args);
41 my ($format, $verbose) = $self->_rearrange([qw(FORMAT VERBOSE)], @args);
42 $self->throw("Must define alignment record format") if !$format;
43 $verbose && $self->verbose($verbose);
44 $self->format($format);
45 $self->handler_methods();
46 # if we intend at a later point we can add a Builder
47 #$builder && $self->alignbuilder($builder);
53 if (!($self->{'handlers'})) {
54 $self->throw("No handlers defined for alignment format ",$self->format)
55 unless exists $HANDLERS{$self->format};
56 $self->{'handlers'} = $HANDLERS{$self->format};
58 return ($self->{'handlers'});
62 my ($self, $data) = @_;
63 my $nm = $data->{NAME
} || $self->throw("No name tag defined!");
64 # this should handle data on the fly w/o caching; any caching should be
66 my $method = (exists $self->{'handlers'}->{$nm}) ?
($self->{'handlers'}->{$nm}) :
67 (exists $self->{'handlers'}->{'_DEFAULT_'}) ?
($self->{'handlers'}->{'_DEFAULT_'}) :
70 $self->debug("No handler defined for $nm\n");
73 $self->$method($data);
76 sub reset_parameters
{
78 $self->{'_params'} = undef;
79 $self->{'_nse_cache'} = undef;
80 $self->{'_features'} = undef;
86 my $format = lc shift;
87 $self->throw("Format $format not supported") unless exists $HANDLERS{$format};
88 $self->{'_alignformat'} = $format;
90 return $self->{'_alignformat'};
94 my ($self, @ids) = @_;
98 if (!index($id, '-')==0) {
101 $data->{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
105 $data = $self->{'_params'};
111 shift->throw('Not implemented yet!');
114 sub build_alignment
{
118 my $param = $self->get_params;
119 if (defined $param->{-seqs
}) {
120 return Bio
::SimpleAlign
->new(%$param, -source
=> $self->format);
125 sub annotation_collection
{
126 my ($self, $coll) = @_;
128 $self->throw("Must have Bio::AnnotationCollectionI ".
129 "when explicitly setting annotation_collection()")
130 unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
131 $self->{'_params'}->{'-annotation'} = $coll;
132 } elsif (!exists($self->{'_params'}->{'-annotation'})) {
133 $self->{'_params'}->{'-annotation'} = Bio
::Annotation
::Collection
->new()
135 return $self->{'_params'}->{'-annotation'};
138 sub seq_annotation_collection
{
139 my ($self, $coll) = @_;
141 $self->throw("Must have Bio::AnnotationCollectionI ".
142 "when explicitly setting seq_annotation_collection()")
143 unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
144 $self->{'_params'}->{'-seq_annotation'} = $coll;
145 } elsif (!exists($self->{'_params'}->{'-seq_annotation'})) {
146 $self->{'_params'}->{'-seq_annotation'} = Bio
::Annotation
::Collection
->new()
148 return $self->{'_params'}->{'-seq_annotation'};
154 my $data = $self->get_params(qw(-seqs -seq_class -consensus_meta));
155 my $class = $data->{-seq_class
} || 'Bio::LocatableSeq';
156 # cache classes loaded already
157 if (!exists($self->{'_loaded_modules'}->{$class})) {
158 $self->_load_module($class);
159 $self->{'_loaded_modules'}->{$class}++;
161 # process any meta sequence data
162 if ( $data->{-consensus_meta
} && !UNIVERSAL
::isa
($data->{-consensus_meta
},'Bio::Seq::Meta')) {
163 my $ref = $data->{-consensus_meta
};
164 if (!exists($self->{'_loaded_modules'}->{'Bio::Seq::Meta'})) {
165 $self->_load_module('Bio::Seq::Meta');
166 $self->{'_loaded_modules'}->{'Bio::Seq::Meta'}++;
168 my $ms = Bio
::Seq
::Meta
->new();
169 for my $tag (sort keys %{$ref}) {
170 $ms->named_meta($tag, $ref->{$tag});
172 $self->{'_params'}->{'-consensus_meta'} = $ms;
174 # this should always be an array ref!
175 for my $seq (@
{$data->{-seqs
}}) {
176 next if (UNIVERSAL
::isa
($seq,'Bio::LocatableI'));
177 # process anything else
178 $self->_from_nse($seq) if $seq->{NSE
};
179 if (UNIVERSAL
::isa
($seq,'HASH')) {
181 for my $p (keys %$seq) {
182 $param{'-'.lc $p} = $seq->{$p} if exists $seq->{$p};
184 my $ls = $class->new(%param);
185 # a little switcheroo to attach the sequence
186 # (though using it to get seq() doesn't work correctly yet!)
187 if (defined $seq->{NSE
} &&
188 exists $self->{'_features'} &&
189 exists $self->{'_features'}->{ $seq->{NSE
} }) {
190 for my $feat (@
{ $self->{'_features'}->{ $seq->{NSE
} } }) {
191 push @
{ $self->{'_params'}->{'-features'} }, $feat;
192 $feat->attach_seq($ls);
200 ####################### SEQUENCE HANDLERS #######################
202 # any sequence data for a Bio::Seq::Meta
203 sub _generic_metaseq
{
204 my ($self, $data) = @_;
206 $self->throw("No alignment position passed") if !exists($data->{BLOCK_LINE
});
207 $self->throw("Alignment position must be an index greater than 0") if $data->{BLOCK_LINE
} < 1;
208 $self->{'_params'}->{'-seq_class'} = 'Bio::Seq::Meta';
209 my $index = $data->{BLOCK_LINE
} - 1;
210 if (my $nse = $self->{'_params'}->{'-seqs'}->[$index]->{NSE
}) {
211 $self->throw("NSE in passed data doesn't match stored data in same position: $nse") unless $nse eq $data->{NSE
};
213 $self->{'_params'}->{'-seqs'}->[$index]->{NSE
} = $data->{NSE
};
215 if ($data->{NAME
} eq 'SEQUENCE') {
216 $self->{'_params'}->{'-seqs'}->[$index]->{SEQ
} .= $data->{DATA
};
217 } elsif ($data->{NAME
} eq 'NAMED_META') {
218 $self->{'_params'}->{'-seqs'}->[$index]->{NAMED_META
}->{$data->{META_TAG
}} .= $data->{DATA
};
222 sub _generic_consensus_meta
{
223 my ($self, $data) = @_;
225 if ($data->{NAME
} eq 'CONSENSUS_META') {
226 $self->{'_params'}->{'-consensus_meta'}->{$data->{META_TAG
}} .= $data->{DATA
};
230 # any sequence data for a Bio::LocatableSeq
231 sub _generic_locatableseq
{
232 my ($self, $data) = @_;
234 $self->throw("No alignment position passed") if !exists($data->{BLOCK_LINE
});
235 $self->throw("Alignment position must be an index greater than 0") if $data->{BLOCK_LINE
} < 1;
236 my $index = $data->{BLOCK_LINE
} - 1;
237 if (my $nse = $self->{'_params'}->{'-seqs'}->[$index]->{NSE
}) {
238 $self->throw("NSE in passed data doesn't match stored data in same position: $nse") if $nse ne $data->{NSE
};
240 $self->{'_params'}->{'-seqs'}->[$index]->{NSE
} = $data->{NSE
};
242 if ($data->{NAME
} eq 'SEQUENCE') {
243 $self->{'_params'}->{'-seqs'}->[$index]->{SEQ
} .= $data->{DATA
};
247 ####################### RAW DATA HANDLERS #######################
249 # store by data name (ACCESSION, ID, etc), which can be mapped to the
250 # appropriate alignment or sequence parameter
252 my ($self, $data) = @_;
254 if ($data->{ALIGNMENT
}) {
255 $self->{'_params'}->{'-'.lc $data->{NAME
}} = $data->{DATA
};
257 $self->{'_params'}->{'-seq_'.lc $data->{NAME
}}->{$data->{NSE
}} = $data->{DATA
}
261 sub _generic_reference
{
262 my ($self, $data) = @_;
263 my $ref = Bio
::Annotation
::Reference
->new(-title
=> $data->{TITLE
},
264 -authors
=> $data->{AUTHORS
},
265 -pubmed
=> $data->{PUBMED
},
266 -location
=> $data->{JOURNAL
},
267 -tagname
=> lc $data->{NAME
});
268 $self->annotation_collection->add_Annotation($ref);
271 sub _generic_simplevalue
{
272 my ($self, $data) = @_;
273 my $sv = Bio
::Annotation
::SimpleValue
->new(-value
=> $data->{DATA
},
274 -tagname
=> lc $data->{NAME
});
275 $self->annotation_collection->add_Annotation($sv);
278 sub _generic_comment
{
279 my ($self, $data) = @_;
280 my $comment = Bio
::Annotation
::Comment
->new(-type
=> lc $data->{NAME
},
281 -text
=> $data->{DATA
},
282 -tagname
=> lc $data->{NAME
});
283 $self->annotation_collection->add_Annotation($comment);
286 # Some DBLinks in Stockholm format are unique, so a unique handler for them
287 sub _stockholm_target
{
288 my ($self, $data) = @_;
289 # process database info
290 $self->_from_stk_dblink($data);
292 # Bio::Annotation::Target is now a DBLink, but has additional (RangeI)
293 # capabilities (for PDB data)
294 my $dblink = Bio
::Annotation
::Target
->new(
295 -database
=> $data->{DBLINK_DB
},
296 -primary_id
=> $data->{DBLINK_ACC
},
297 -optional_id
=> $data->{DBLINK_OPT
},
298 -start
=> $data->{DBLINK_START
},
299 -end
=> $data->{DBLINK_END
},
300 -strand
=> $data->{DBLINK_STRAND
},
301 -comment
=> $comment,
302 -tagname
=> 'dblink',
304 if ($data->{ALIGNMENT
}) {
305 # Alignment-specific DBLinks
306 $self->annotation_collection->add_Annotation($dblink);
308 # Sequence-specific DBLinks
309 # These should come with identifying information of some sort
310 # (ID/START/END/STRAND). Make into a SeqFeature (SimpleAlign is
311 # FeatureHolderI) spanning the length acc. to the NSE. Add the DBLink as
312 # Annotation specific to that SeqFeature, store in an internal hash by
313 # NSE so we can tie the LocatableSeq to the proper Features
314 $self->_from_nse($data) if $data->{NSE
};
315 $self->throw("Must supply an sequence DISPLAY_ID or NSE for sequence-related
316 DBLinks") unless $data->{ACCESSION_NUMBER
} || $data->{DISPLAY_ID
};
317 my $sf = Bio
::SeqFeature
::Generic
->new(-seq_id
=> $data->{DISPLAY_ID
},
318 -accession_number
=> $data->{ACCESSION_NUMBER
},
319 -start
=> $data->{START
},
320 -end
=> $data->{END},
321 -strand
=> $data->{STRAND
}
323 $sf->annotation->add_Annotation($dblink);
325 push @
{ $self->{'_features'}->{ $data->{NSE
} } }, $sf;
326 #$self->seq_annotation_collection->add_Annotation($dblink);
330 ####################### HELPER METHODS #######################
332 # returns ACCESSION VERSION START END STRAND ALPHABET
333 # cached for multiple lookups, should reset in between uses
335 my ($self, $data) = @_;
336 return unless my $nse = $data->{NSE
};
337 $data->{ALPHABET
} = $self->get_params('-alphabet')->{'-alphabet'} || 'protein';
338 # grab any accessions if present, switch out with ACCESSION from NSE
339 # (move that to primary_id)
341 if (exists $self->{'_params'}->{'-seq_accession'}) {
342 $new_acc = $self->{'_params'}->{'-seq_accession'}->{$data->{NSE
}};
344 if ($nse =~ m{(\S+?)(?:\.(\d+))?/(\d+)-(\d+)}xmso) {
345 my $strand = $data->{ALPHABET
} eq 'dna' || $data->{ALPHABET
} eq 'rna' ?
1 : undef;
346 my ($start, $end) = ($3, $4);
348 ($start, $end, $strand) = ($end, $start, -1);
350 $data->{ACCESSION_NUMBER
} = $new_acc || $1;
351 $data->{DISPLAY_ID
} = $1;
352 $data->{VERSION
} = $2;
353 $data->{START
} = $start;
355 $data->{STRAND
} = $strand;
357 # we can parse for version here if needed
358 $data->{DISPLAY_ID
} = $data->{NSE
};
362 # this will probably be split up into subhandlers based on Record/DB
363 sub _from_stk_dblink
{
364 my ($self, $data) = @_;
365 return unless my $raw = $data->{DATA
};
366 my @rawdata = split(m{\s*;\s*}, $raw);
368 if ($rawdata[0] eq 'PDB') {
369 # fix for older Stockholm PDB range format
370 if (scalar(@rawdata) == 3 && $rawdata[2] =~ m{-}) {
371 @rawdata[2,3] = split('-',$rawdata[2],2);
373 $self->throw("Not standard PDB form: ".$data->{DATA
}) if scalar(@rawdata) != 4;
374 my ($main, $chain) = split(m{\s+}, $rawdata[1]);
376 DBLINK_DB
=> $rawdata[0],
378 DBLINK_OPT
=> $chain || '',
379 DBLINK_START
=> $rawdata[2],
380 DBLINK_END
=> $rawdata[3]
382 } elsif ($rawdata[0] eq 'SCOP') {
383 $self->throw("Not standard SCOP form: ".$data->{DATA
}) if scalar(@rawdata) != 3;
385 DBLINK_DB
=> $rawdata[0],
386 DBLINK_ACC
=> $rawdata[1],
387 DBLINK_OPT
=> $rawdata[2],
390 $self->warn("Some data missed: ".$data->{DATA
}) if scalar(@rawdata) > 2;
392 DBLINK_DB
=> $rawdata[0],
393 DBLINK_ACC
=> $rawdata[1],
396 while (my ($k, $v) = each %dblink_data) {
397 $data->{$k} = $v if $v;
405 # $Id: GenericAlignHandler.pm 14816 2008-08-21 16:00:12Z cjfields $
407 # BioPerl module for Bio::AlignIO::Handler::GenericAlignHandler
409 # Please direct questions and support issues to <bioperl-l@bioperl.org>
411 # Cared for by Chris Fields
413 # Copyright Chris Fields
415 # You may distribute this module under the same terms as perl itself
417 # POD documentation - main docs before the code
419 # Documentation after the __END__ marker
423 Bio::AlignIO::Handler::GenericAlignHandler - Bio::HandlerI-based
424 generic data handler class for alignment-based data
428 # MyHandler is a GenericAlignHandler object.
429 # inside a parser (driver) constructor....
431 $self->alignhandler($handler || MyHandler->new(-format => 'stockholm'));
433 # in next_aln() in driver...
435 $hobj = $self->alignhandler();
437 # roll data up into hashref chunks, pass off into Handler for processing...
439 $hobj->data_handler($data);
441 # or retrieve Handler methods and pass data directly to Handler methods...
443 my $hmeth = $hobj->handler_methods;
445 if ($hmeth->{ $data->{NAME} }) {
446 my $mth = $hmeth->{ $data->{NAME} };
452 This is an experimental implementation of a alignment-based HandlerBaseI parser
453 and may change over time. It is possible that the way handler methods are set up
454 will change over development to allow more flexibility.
456 Standard Developer caveats:
458 Here thar be dragoons...
460 Consider yourself warned!
464 As in the SeqIO Handler object (still in development), data is passed in as
465 chunks. The Annotation and SeqFeatures are essentially the same as the SeqIO
466 parser; the significant difference is that data hash being passed could pertain
467 to either the alignment or to a specific sequence, so an extra tag may be needed
468 to disambiguate between the two in some cases. Here I use the ALIGNMENT tag as a
469 boolean flag: it must be present and set to 0 for the data to be tagged for
470 Bio::LocatableSeq or similar (in all other cases it is assumed to be for the
471 alignment). In some cases this will not matter (the actual sequence data, for
472 instance) but it is highly recommended adding this tag in to prevent possible
475 This is the current Annotation data chunk (via Data::Dumper):
478 'NAME' => 'REFERENCE',
479 'DATA' => '1 (bases 1 to 10001)'
480 'AUTHORS' => 'International Human Genome Sequencing Consortium.'
481 'TITLE' => 'The DNA sequence of Homo sapiens'
482 'JOURNAL' => 'Unpublished (2003)'
486 In the case of LocatableSeqs, one can pass them in as follows for simplicity
487 (note the block line):
490 'NAME' => 'SEQUENCE',
492 'NSE' => 'Q7WNI7_BORBR/113-292',
493 'ALPHABET' => 'protein',
494 'DATA' => 'VALILGVYRRL...CYVNREM..RAG....QW',
498 This can be done as the parser parses each block instead of parsing all the
499 blocks and then passing them in one at a time; the handler will store the
500 sequence data by the block line in an internal hash, concatenating them along
501 the way. This behaviour is b/c the alignment building step requires that
502 the sequence be checked for start/end/strand, possible meta sequence, optional
505 Similarly, a Meta sequence line can be passed in as follows:
508 'NAME' => 'NAMED_META',
510 'NSE' => 'Q7WNI7_BORBR/113-292',
512 'DATA' => '................................',
516 The meta sequence will be checked against the NSE for the block position and
517 stored based on the meta tag. A meta sequence does not have to correspond to a
518 real sequence. At this time, unique meta sequence tags must be used for each
519 sequence or they will be overwritten (this may change).
521 An alignment consensus string:
524 'NAME' => 'CONSENSUS',
525 'DATA' => 'VALILGVYRRL...CYVNREM..RAG....QW',
529 A consensus meta sequence:
532 'NAME' => 'CONSENSUS_META',
534 'DATA' => '................................',
542 User feedback is an integral part of the evolution of this and other
543 Bioperl modules. Send your comments and suggestions preferably to one
544 of the Bioperl mailing lists. Your participation is much appreciated.
546 bioperl-l@bioperl.org - General discussion
547 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
551 Please direct usage questions or support issues to the mailing list:
553 I<bioperl-l@bioperl.org>
555 rather than to the module maintainer directly. Many experienced and
556 reponsive experts will be able look at the problem and quickly
557 address it. Please include a thorough description of the problem
558 with code and data examples if at all possible.
560 =head2 Reporting Bugs
562 Report bugs to the Bioperl bug tracking system to help us keep track
563 the bugs and their resolution. Bug reports can be submitted via the
566 https://github.com/bioperl/bioperl-live/issues
568 =head1 AUTHOR - Chris Fields
570 Email cjfields at bioperl dot org
574 The rest of the documentation details each of the object methods. Internal
575 methods are usually preceded with a _
585 Args : -format Sequence format to be mapped for handler methods
586 -builder Bio::Seq::SeqBuilder object (normally defined in
587 SequenceStreamI object implementation constructor)
588 Throws : On undefined '-format' sequence format parameter
589 Note : Still under heavy development
593 =head1 L<Bio::HandlerBaseI> implementing methods
595 =head2 handler_methods
597 Title : handler_methods
598 Usage : $handler->handler_methods('GenBank')
599 %handlers = $handler->handler_methods();
600 Function: Retrieve the handler methods used for the current format() in
601 the handler. This assumes the handler methods are already
602 described in the HandlerI-implementing class.
603 Returns : a hash reference with the data type handled and the code ref
605 Args : [optional] String representing the sequence format. If set here
606 this will also set sequence_format()
607 Throws : On unimplemented sequence format in %HANDLERS
614 Usage : $handler->data_handler($data)
615 Function: Centralized method which accepts all data chunks, then distributes
616 to the appropriate methods for processing based on the chunk name
617 from within the HandlerBaseI object.
621 Args : an hash ref containing a data chunk.
625 =head2 reset_parameters
627 Title : reset_parameters
628 Usage : $handler->reset_parameters()
629 Function: Resets the internal cache of data (normally object parameters for
630 a builder or factory)
639 Usage : $handler->format('GenBank')
640 Function: Get/Set the format for the report/record being parsed. This can be
641 used to set handlers in classes which are capable of processing
642 similar data chunks from multiple driver modules.
643 Returns : String with the sequence format
644 Args : [optional] String with the sequence format
645 Note : The format may be used to set the handlers (as in the
646 current GenericRichSeqHandler implementation)
653 Usage : $handler->get_params('-species')
654 Function: Convenience method used to retrieve the specified
655 parameters from the internal parameter cache
656 Returns : Hash ref containing parameters requested and data as
657 key-value pairs. Note that some parameter values may be
658 objects, arrays, etc.
659 Args : List (array) representing the parameters requested
666 Usage : $handler->set_param({'-seqs' => $seqs})
667 Function: Convenience method used to set specific parameters
669 Args : Hash ref containing the data to be passed as key-value pairs
673 =head1 Methods unique to this implementation
675 =head2 build_alignment
677 Title : build_alignment
680 Returns : a Bio::SimpleAlign
683 Note : This may be replaced by a Builder object at some point
687 =head2 annotation_collection
689 Title : annotation_collection
699 =head2 seq_annotation_collection
701 Title : seq_annotation_collection
714 Usage : $handler->process_seqs;
715 Function: checks internal sequences to ensure they are converted over
716 to the proper Bio::AlignI-compatible sequence class
717 Returns : 1 if successful