2 # BioPerl module for Bio::SeqIO::locuslink
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Keith Ching <kching at gnf.org>
8 # Copyright Keith Ching
10 # You may distribute this module under the same terms as perl itself
13 # (c) Keith Ching, kching at gnf.org, 2002.
14 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
16 # You may distribute this module under the same terms as perl itself.
17 # Refer to the Perl Artistic License (see the license accompanying this
18 # software package, or see http://www.perl.com/language/misc/Artistic.html)
19 # for the terms under which you may use, modify, and redistribute this module.
21 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
22 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
23 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
26 # POD documentation - main docs before the code
30 Bio::SeqIO::locuslink - LocusLink input/output stream
34 # don't instantiate directly - instead do
35 my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN);
39 This module parses LocusLink into Bio::SeqI objects with rich
40 annotation, but no sequence.
42 The input file has to be in the LL_tmpl format - the tabular format
45 The way the current implementation populates the object is rather a
46 draft work than a finished work of art. Note that at this stage the
47 LocusLink entries cannot be round-tripped, because the parser loses
48 certain information. For instance, most of the alternative transcript
49 descriptions are not retained. The parser also misses any element
50 that deals with visual representation (e.g., 'button') except for the
51 URLs. Almost all of the pieces of the annotation are kept in a
52 Bio::Annotation::Collection object, see L<Bio::Annotation::Collection>
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to
61 the Bioperl mailing list. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 of the bugs and their resolution. Bug reports can be submitted via
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 AUTHOR - Keith Ching
87 Email kching at gnf.org
91 Hilmar Lapp, hlapp at gmx.net
95 The rest of the documentation details each of the object methods.
96 Internal methods are usually preceded with a _
100 package Bio
::SeqIO
::locuslink
;
104 use Bio
::Seq
::SeqFactory
;
106 use Bio
::Annotation
::DBLink
;
107 #use Bio::Annotation::Reference;
108 use Bio
::Annotation
::Comment
;
109 use Bio
::Annotation
::SimpleValue
;
110 use Bio
::Annotation
::OntologyTerm
;
111 use Bio
::Annotation
::Collection
;
113 use base
qw(Bio::SeqIO);
115 # list of all the field names in locuslink
116 my @locuslink_keys = qw(
171 # list of fields to make simple annotations from
172 # fields not listed here or as a key in feature hash are ignored (lost).
203 # certain fields are not named the same as the symgene database list
217 # certain fields are not named the same as the symgene
218 # database list: rename the fields the symgene database name
219 # key = field name in locuslink
220 # value = database name in sym
225 UNIGENE
=> 'UniGene',
228 # certain CDD entries use the wrong prefix for the accession number
229 # cddprefix will replace the key w/ the value for these entries
235 # alternate mappings if one field does not exist
236 my %alternate_map = (
237 OFFICIAL_GENE_NAME
=> 'PREFERRED_GENE_NAME',
238 OFFICIAL_SYMBOL
=> 'PREFERRED_SYMBOL',
241 # for these field names, we only care about the first value X in value X|Y|Z
242 my @ll_firstelements = qw(
257 # these fields need to be flattened into a single string, using the given
261 ORGANISM
=> '', # this should occur only once
262 OFFICIAL_SYMBOL
=> '', # this should occur only once
263 OFFICIAL_GENE_NAME
=> '', # this should occur only once
264 LOCUSID
=> '', # this should occur only once
266 PREFERRED_SYMBOL
=> ', ',
267 PREFERRED_GENE_NAME
=> ', '
270 # set the default search pattern for all the field names
271 my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys;
274 my($self,@args) = @_;
276 $self->SUPER::_initialize
(@args);
278 # overwrite the search pattern w/ the first value pattern
279 foreach my $key(@ll_firstelements){
280 $feature_pat_map{$key}="^$key: ([^|]+)";
283 # special search pattern for cdd entries
284 foreach my $key(keys %cddprefix) {
285 $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)';
288 # special patterns for specific fields
289 $feature_pat_map{MAP
} = '^MAP: (.+?)\|';
290 $feature_pat_map{MAPHTML
} = '^MAP: .+\|(<.+>)\|';
291 $feature_pat_map{GO
} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|';
292 $feature_pat_map{GO_DESC
} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|';
293 $feature_pat_map{GO_CAT
} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|';
294 $feature_pat_map{EXTANNOT
} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+';
296 # set the sequence factory of none has been set already
297 if(! $self->sequence_factory()) {
298 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new(
299 -type
=> 'Bio::Seq::RichSeq'));
304 #########################
308 #########################
310 $entry, #text to search
311 $searchconfirm, #to make sure you got the right thing
314 my @query = $entry=~/$searchpattern/gm;
315 if ($searchconfirm ne "FALSE"){
316 $self->warn("No $searchtype found\n$entry\n") unless @query;
318 if (!($_=~/$searchconfirm/)){
319 $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n");
333 ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/;
334 my $make = Bio
::Species
->new();
335 $make->classification( ($species,$genus) );
343 my ($ann,$db,$ref)=@_;
344 my @results=$ref ? @
$ref : ();
345 foreach my $id(@results){
347 $ann->add_Annotation('dblink',
348 Bio
::Annotation
::DBLink
->new(
361 my ($ann,$db,$results)=@_;
365 my @ids=split(/,/,$results);
366 $ann = read_dblink
($ann,$db,\
@ids) if @ids;
376 my ($ac,$type,$text,$anntype)=@_;
379 $anntype = 'SimpleValue' unless $anntype;
381 $anntype eq 'SimpleValue' && do {
382 push(@args, -value
=> $text, -tagname
=> $type);
385 $anntype eq 'Comment' && do {
386 push(@args, -text
=> $text, -tagname
=> 'comment');
390 $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args));
396 sub add_annotation_ref
{
399 my ($ann,$type,$textref)=@_;
400 my @text=$textref ? @
$textref : ();
402 foreach my $text(@text){
403 $ann->add_Annotation($type,Bio
::Annotation
::SimpleValue
->new(-value
=> $text));
416 foreach my $dbl ($ann->remove_Annotations($key)) {
417 if(!exists($seen{$dbl->as_text()})) {
418 $seen{$dbl->as_text()} = 1;
419 $ann->add_Annotation($dbl);
430 my ($self, @args)=@_;
431 my (@results,$search,$ref,$cddref);
433 # LOCUSLINK entries begin w/ >>
436 # slurp in a whole entry and return if no more entries
437 return unless my $entry = $self->_readline;
439 # strip the leading '>>' if it's the first entry
440 if (index($entry,'>>') == 0) { #first entry
441 $entry = substr($entry,2);
444 # we aren't interested in obsoleted entries, so we need to loop
445 # and skip those until we've found the next not obsoleted
447 while($entry && ($entry =~ /\w/)) {
448 if (!($entry=~/LOCUSID/)){
449 $self->throw("No LOCUSID in first line of record. ".
450 "Not LocusLink in my book.");
452 # see whether it's an obsoleted entry, and if so jump to the next
453 # one entry right away
454 if($entry =~ /^CURRENT_LOCUSID:/m) {
455 # read next entry and continue
456 $entry = $self->_readline;
460 # loop through list of features and get field values
461 # place into record hash as array refs
462 foreach my $key (keys %feature_pat_map){
463 $search=$feature_pat_map{$key};
464 @results=$self->search_pattern($entry,'FALSE',$search,$search);
465 $record{$key} = @results ?
[@results] : undef;
467 # terminate loop as this one hasn't been obsoleted
471 # we have reached the end-of-file ...
472 return unless %record;
474 # special processing for CDD entries like pfam and smart
476 foreach my $key(keys %cddprefix){
477 #print "check CDD $key\n";
480 foreach my $list (@
{$record{$key}}) {
481 # replace AC with correct AC number
482 push(@keep,$cddprefix{$key}.$list);
484 # replace CDD ref with correctly prefixed AC number
485 $record{$key} = [@keep];
488 # modify CDD references @=();
491 foreach my $cdd (@
{$record{CDD
}}) {
493 foreach my $key (keys %cddprefix) {
499 push(@keep,$cdd) if(! $PRESENT);
501 $record{CDD
} = [@keep];
504 # create annotation collection - we'll need it now
505 my $ann = Bio
::Annotation
::Collection
->new();
507 foreach my $field(keys %dbname_map){
508 $ann=read_dblink
($ann,$dbname_map{$field},$record{$field});
511 # add GO link as an OntologyTerm annotation
513 for(my $j = 0; $j < @
{$record{GO
}}; $j++) {
514 my $goann = Bio
::Annotation
::OntologyTerm
->new(
515 -identifier
=> $record{GO
}->[$j],
516 -name
=> $record{GO_DESC
}->[$j],
517 -ontology
=> $record{GO_CAT
}->[$j]);
518 $ann->add_Annotation($goann);
522 $ann=add_annotation_ref
($ann,'URL',$record{LINK
});
523 $ann=add_annotation_ref
($ann,'URL',$record{DB_LINK
});
525 # everything else gets a simple tag or comment value annotation
526 foreach my $anntype (keys %anntype_map) {
527 foreach my $key (@
{$anntype_map{$anntype}}){
529 foreach (@
{$record{$key}}){
530 #print "$key\t\t$_\n";
531 $ann=add_annotation
($ann,$key,$_,$anntype);
537 # flatten designated attributes into a scalar value
538 foreach my $field (keys %flatten_tags) {
539 if($record{$field}) {
540 $record{$field} = join($flatten_tags{$field},
545 # annotation that expects the array flattened out
546 $ann=read_reference
($ann,'PUBMED',$record{PMID
});
547 if($record{ASSEMBLY
}) {
548 my @assembly=split(/,/,$record{ASSEMBLY
});
549 $ann=read_dblink
($ann,'GenBank',\
@assembly);
552 # replace fields w/ alternate if original does not exist
553 foreach my $fieldval (keys %alternate_map){
554 if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){
555 $record{$fieldval}=$record{$alternate_map{$fieldval}};
559 # presently we can't store types or context of dblinks - therefore
560 # we need to remove duplicates that only differ in context
561 make_unique
($ann,'dblink');
563 # create sequence object (i.e., let seq.factory create one)
564 my $seq = $self->sequence_factory->create(
565 -verbose
=> $self->verbose(),
566 -accession_number
=> $record{LOCUSID
},
567 -desc
=> $record{OFFICIAL_GENE_NAME
},
568 -display_id
=> $record{OFFICIAL_SYMBOL
},
569 -species
=> read_species
($record{ORGANISM
}),
570 -annotation
=> $ann);
572 # dump out object contents
584 my @list=@
$seqlistref;
585 my $out = Bio
::SeqIO
->new('-fh' => \
*STDOUT
, -format
=> 'genbank' );
586 my ($ann,@values,$val);
588 foreach my $seq(@list){
589 $out->write_seq($seq);
590 $ann=$seq->annotation;
591 foreach my $key ( $ann->get_all_annotation_keys() ) {
592 @values = $ann->get_Annotations($key);
593 foreach my $value ( @values ) {
594 # value is an Bio::AnnotationI, and defines a "as_text" method
595 $val=$value->as_text;
596 print "Annotation ",$key,"\t\t",$val,"\n";