Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / SeqIO / locuslink.pm
blob2de8a814b9b9c2364558158438fc7af359f2d549
2 # BioPerl module for Bio::SeqIO::locuslink
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Keith Ching <kching at gnf.org>
8 # Copyright Keith Ching
10 # You may distribute this module under the same terms as perl itself
13 # (c) Keith Ching, kching at gnf.org, 2002.
14 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
16 # You may distribute this module under the same terms as perl itself.
17 # Refer to the Perl Artistic License (see the license accompanying this
18 # software package, or see http://www.perl.com/language/misc/Artistic.html)
19 # for the terms under which you may use, modify, and redistribute this module.
21 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
22 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
23 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
26 # POD documentation - main docs before the code
28 =head1 NAME
30 Bio::SeqIO::locuslink - LocusLink input/output stream
32 =head1 SYNOPSIS
34 # don't instantiate directly - instead do
35 my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN);
37 =head1 DESCRIPTION
39 This module parses LocusLink into Bio::SeqI objects with rich
40 annotation, but no sequence.
42 The input file has to be in the LL_tmpl format - the tabular format
43 will not work.
45 The way the current implementation populates the object is rather a
46 draft work than a finished work of art. Note that at this stage the
47 LocusLink entries cannot be round-tripped, because the parser loses
48 certain information. For instance, most of the alternative transcript
49 descriptions are not retained. The parser also misses any element
50 that deals with visual representation (e.g., 'button') except for the
51 URLs. Almost all of the pieces of the annotation are kept in a
52 Bio::Annotation::Collection object, see L<Bio::Annotation::Collection>
53 for more information.
55 =head1 FEEDBACK
57 =head2 Mailing Lists
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to
61 the Bioperl mailing list. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 =head2 Support
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
77 =head2 Reporting Bugs
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 of the bugs and their resolution. Bug reports can be submitted via
81 the web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 AUTHOR - Keith Ching
87 Email kching at gnf.org
89 =head1 CONTRIBUTORS
91 Hilmar Lapp, hlapp at gmx.net
93 =head1 APPENDIX
95 The rest of the documentation details each of the object methods.
96 Internal methods are usually preceded with a _
98 =cut
100 package Bio::SeqIO::locuslink;
102 use strict;
104 use Bio::Seq::SeqFactory;
105 use Bio::Species;
106 use Bio::Annotation::DBLink;
107 #use Bio::Annotation::Reference;
108 use Bio::Annotation::Comment;
109 use Bio::Annotation::SimpleValue;
110 use Bio::Annotation::OntologyTerm;
111 use Bio::Annotation::Collection;
113 use base qw(Bio::SeqIO);
115 # list of all the field names in locuslink
116 my @locuslink_keys = qw(
117 ACCNUM
118 ALIAS_PROT
119 ALIAS_SYMBOL
120 ASSEMBLY
121 BUTTON
124 COMP
125 CONTIG
126 CURRENT_LOCUSID
127 DB_DESCR
128 DB_LINK
129 ECNUM
130 EVID
131 EXTANNOT
133 GRIF
134 LINK
135 LOCUSID
136 LOCUS_CONFIRMED
137 LOCUS_TYPE
139 MAPLINK
145 OFFICIAL_GENE_NAME
146 OFFICIAL_SYMBOL
147 OMIM
148 ORGANISM
149 PHENOTYPE
150 PHENOTYPE_ID
151 PMID
152 PREFERRED_GENE_NAME
153 PREFERRED_PRODUCT
154 PREFERRED_SYMBOL
155 PRODUCT
156 PROT
157 RELL
158 STATUS
160 SUMFUNC
161 SUMMARY
162 TRANSVAR
163 TYPE
164 UNIGENE
171 # list of fields to make simple annotations from
172 # fields not listed here or as a key in feature hash are ignored (lost).
173 my %anntype_map = (
174 SimpleValue => [qw(
175 ALIAS_PROT
176 ALIAS_SYMBOL
179 CURRENT_LOCUSID
180 ECNUM
181 EXTANNOT
185 OFFICIAL_GENE_NAME
186 OFFICIAL_SYMBOL
187 PHENOTYPE
188 PREFERRED_GENE_NAME
189 PREFERRED_PRODUCT
190 PREFERRED_SYMBOL
191 PRODUCT
192 RELL
193 SUMFUNC
196 Comment => [qw(
197 SUMMARY
203 # certain fields are not named the same as the symgene database list
204 my %dbname_map = (
205 pfam => 'Pfam',
206 smart => 'SMART',
207 NM => 'RefSeq',
208 NP => 'RefSeq',
209 XP => 'RefSeq',
210 XM => 'RefSeq',
211 NG => 'RefSeq',
212 XG => 'RefSeq',
213 XR => 'RefSeq',
214 PROT => 'GenBank',
215 ACCNUM => 'GenBank',
216 CONTIG => 'GenBank',
217 # certain fields are not named the same as the symgene
218 # database list: rename the fields the symgene database name
219 # key = field name in locuslink
220 # value = database name in sym
221 #GO => 'GO',
222 OMIM => 'MIM',
223 GRIF => 'GRIF',
224 STS => 'STS',
225 UNIGENE => 'UniGene',
228 # certain CDD entries use the wrong prefix for the accession number
229 # cddprefix will replace the key w/ the value for these entries
230 my %cddprefix = (
231 pfam => 'PF',
232 smart => 'SM',
235 # alternate mappings if one field does not exist
236 my %alternate_map = (
237 OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME',
238 OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL',
241 # for these field names, we only care about the first value X in value X|Y|Z
242 my @ll_firstelements = qw(
250 PROT
252 ACCNUM
253 CONTIG
254 GRIF
257 # these fields need to be flattened into a single string, using the given
258 # join string
259 my %flatten_tags = (
260 ASSEMBLY => ',',
261 ORGANISM => '', # this should occur only once
262 OFFICIAL_SYMBOL => '', # this should occur only once
263 OFFICIAL_GENE_NAME => '', # this should occur only once
264 LOCUSID => '', # this should occur only once
265 PMID => ',',
266 PREFERRED_SYMBOL => ', ',
267 PREFERRED_GENE_NAME => ', '
270 # set the default search pattern for all the field names
271 my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys;
273 sub _initialize {
274 my($self,@args) = @_;
276 $self->SUPER::_initialize(@args);
278 # overwrite the search pattern w/ the first value pattern
279 foreach my $key(@ll_firstelements){
280 $feature_pat_map{$key}="^$key: ([^|]+)";
283 # special search pattern for cdd entries
284 foreach my $key(keys %cddprefix) {
285 $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)';
288 # special patterns for specific fields
289 $feature_pat_map{MAP} = '^MAP: (.+?)\|';
290 $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|';
291 $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|';
292 $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|';
293 $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|';
294 $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+';
296 # set the sequence factory of none has been set already
297 if(! $self->sequence_factory()) {
298 $self->sequence_factory(Bio::Seq::SeqFactory->new(
299 -type => 'Bio::Seq::RichSeq'));
304 #########################
306 sub search_pattern{
308 #########################
309 my ($self,
310 $entry, #text to search
311 $searchconfirm, #to make sure you got the right thing
312 $searchpattern,
313 $searchtype) = @_;
314 my @query = $entry=~/$searchpattern/gm;
315 if ($searchconfirm ne "FALSE"){
316 $self->warn("No $searchtype found\n$entry\n") unless @query;
317 foreach (@query){
318 if (!($_=~/$searchconfirm/)){
319 $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n");
321 }#endforeach
322 }#endsearchconfirm
323 return(@query);
324 }#endsub
325 ############
327 sub read_species{
329 ############
330 my ($spline)=@_;
331 my $species;
332 my $genus;
333 ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/;
334 my $make = Bio::Species->new();
335 $make->classification( ($species,$genus) );
336 return $make;
338 ################
340 sub read_dblink{
342 ################
343 my ($ann,$db,$ref)=@_;
344 my @results=$ref ? @$ref : ();
345 foreach my $id(@results){
346 if($id){
347 $ann->add_Annotation('dblink',
348 Bio::Annotation::DBLink->new(
349 -database =>$db ,
350 -primary_id =>$id));
353 return($ann);
356 ################
358 sub read_reference{
360 ################
361 my ($ann,$db,$results)=@_;
363 if($results){
364 chomp($results);
365 my @ids=split(/,/,$results);
366 $ann = read_dblink($ann,$db,\@ids) if @ids;
368 return $ann;
369 }#endsub
371 ################
373 sub add_annotation{
375 ################
376 my ($ac,$type,$text,$anntype)=@_;
377 my @args;
379 $anntype = 'SimpleValue' unless $anntype;
380 SWITCH : {
381 $anntype eq 'SimpleValue' && do {
382 push(@args, -value => $text, -tagname => $type);
383 last SWITCH;
385 $anntype eq 'Comment' && do {
386 push(@args, -text => $text, -tagname => 'comment');
387 last SWITCH;
390 $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args));
391 return($ac);
392 }#endsub
394 ################
396 sub add_annotation_ref{
398 ################
399 my ($ann,$type,$textref)=@_;
400 my @text=$textref ? @$textref : ();
402 foreach my $text(@text){
403 $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text));
405 return($ann);
406 }#endsub
408 ################
410 sub make_unique{
412 ##############
413 my ($ann,$key) = @_;
415 my %seen = ();
416 foreach my $dbl ($ann->remove_Annotations($key)) {
417 if(!exists($seen{$dbl->as_text()})) {
418 $seen{$dbl->as_text()} = 1;
419 $ann->add_Annotation($dbl);
422 return $ann;
425 ################
427 sub next_seq{
429 ##############
430 my ($self, @args)=@_;
431 my (@results,$search,$ref,$cddref);
433 # LOCUSLINK entries begin w/ >>
434 local $/="\n>>";
436 # slurp in a whole entry and return if no more entries
437 return unless my $entry = $self->_readline;
439 # strip the leading '>>' if it's the first entry
440 if (index($entry,'>>') == 0) { #first entry
441 $entry = substr($entry,2);
444 # we aren't interested in obsoleted entries, so we need to loop
445 # and skip those until we've found the next not obsoleted
446 my %record = ();
447 while($entry && ($entry =~ /\w/)) {
448 if (!($entry=~/LOCUSID/)){
449 $self->throw("No LOCUSID in first line of record. ".
450 "Not LocusLink in my book.");
452 # see whether it's an obsoleted entry, and if so jump to the next
453 # one entry right away
454 if($entry =~ /^CURRENT_LOCUSID:/m) {
455 # read next entry and continue
456 $entry = $self->_readline;
457 %record = ();
458 next;
460 # loop through list of features and get field values
461 # place into record hash as array refs
462 foreach my $key (keys %feature_pat_map){
463 $search=$feature_pat_map{$key};
464 @results=$self->search_pattern($entry,'FALSE',$search,$search);
465 $record{$key} = @results ? [@results] : undef;
466 }#endfor
467 # terminate loop as this one hasn't been obsoleted
468 last;
471 # we have reached the end-of-file ...
472 return unless %record;
474 # special processing for CDD entries like pfam and smart
475 my ($PRESENT,@keep);
476 foreach my $key(keys %cddprefix){
477 #print "check CDD $key\n";
478 if($record{$key}) {
479 @keep=();
480 foreach my $list (@{$record{$key}}) {
481 # replace AC with correct AC number
482 push(@keep,$cddprefix{$key}.$list);
484 # replace CDD ref with correctly prefixed AC number
485 $record{$key} = [@keep];
488 # modify CDD references @=();
489 if($record{CDD}) {
490 @keep=();
491 foreach my $cdd (@{$record{CDD}}) {
492 $PRESENT = undef;
493 foreach my $key (keys %cddprefix) {
494 if ($cdd=~/$key/){
495 $PRESENT = 1;
496 last;
499 push(@keep,$cdd) if(! $PRESENT);
501 $record{CDD} = [@keep];
504 # create annotation collection - we'll need it now
505 my $ann = Bio::Annotation::Collection->new();
507 foreach my $field(keys %dbname_map){
508 $ann=read_dblink($ann,$dbname_map{$field},$record{$field});
511 # add GO link as an OntologyTerm annotation
512 if($record{GO}) {
513 for(my $j = 0; $j < @{$record{GO}}; $j++) {
514 my $goann = Bio::Annotation::OntologyTerm->new(
515 -identifier => $record{GO}->[$j],
516 -name => $record{GO_DESC}->[$j],
517 -ontology => $record{GO_CAT}->[$j]);
518 $ann->add_Annotation($goann);
522 $ann=add_annotation_ref($ann,'URL',$record{LINK});
523 $ann=add_annotation_ref($ann,'URL',$record{DB_LINK});
525 # everything else gets a simple tag or comment value annotation
526 foreach my $anntype (keys %anntype_map) {
527 foreach my $key (@{$anntype_map{$anntype}}){
528 if($record{$key}){
529 foreach (@{$record{$key}}){
530 #print "$key\t\t$_\n";
531 $ann=add_annotation($ann,$key,$_,$anntype);
537 # flatten designated attributes into a scalar value
538 foreach my $field (keys %flatten_tags) {
539 if($record{$field}) {
540 $record{$field} = join($flatten_tags{$field},
541 @{$record{$field}});
545 # annotation that expects the array flattened out
546 $ann=read_reference($ann,'PUBMED',$record{PMID});
547 if($record{ASSEMBLY}) {
548 my @assembly=split(/,/,$record{ASSEMBLY});
549 $ann=read_dblink($ann,'GenBank',\@assembly);
552 # replace fields w/ alternate if original does not exist
553 foreach my $fieldval (keys %alternate_map){
554 if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){
555 $record{$fieldval}=$record{$alternate_map{$fieldval}};
559 # presently we can't store types or context of dblinks - therefore
560 # we need to remove duplicates that only differ in context
561 make_unique($ann,'dblink');
563 # create sequence object (i.e., let seq.factory create one)
564 my $seq = $self->sequence_factory->create(
565 -verbose => $self->verbose(),
566 -accession_number => $record{LOCUSID},
567 -desc => $record{OFFICIAL_GENE_NAME},
568 -display_id => $record{OFFICIAL_SYMBOL},
569 -species => read_species($record{ORGANISM}),
570 -annotation => $ann);
572 # dump out object contents
573 # show_obj([$seq]);
575 return($seq);
578 ################
580 sub show_obj{
582 ################
583 my ($seqlistref)=@_;
584 my @list=@$seqlistref;
585 my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' );
586 my ($ann,@values,$val);
588 foreach my $seq(@list){
589 $out->write_seq($seq);
590 $ann=$seq->annotation;
591 foreach my $key ( $ann->get_all_annotation_keys() ) {
592 @values = $ann->get_Annotations($key);
593 foreach my $value ( @values ) {
594 # value is an Bio::AnnotationI, and defines a "as_text" method
595 $val=$value->as_text;
596 print "Annotation ",$key,"\t\t",$val,"\n";
600 }#endsub