lib/Bio/SeqFeature/Tools/TypeMapper.pm

   1 #
   2 # bioperl module for Bio::SeqFeature::Tools::TypeMapper
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Chris Mungall <cjm@fruitfly.org>
   7 #
   8 # Copyright Chris Mungall
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqFeature::Tools::TypeMapper - maps $seq_feature-E<gt>primary_tag
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::SeqIO;
  21   use Bio::SeqFeature::Tools::TypeMapper;
  22
  23   # first fetch a genbank SeqI object
  24   $seqio =
  25     Bio::SeqIO->new(-file=>'AE003644.gbk',
  26                     -format=>'GenBank');
  27   $seq = $seqio->next_seq();
  28
  29   $tm = Bio::SeqFeature::Tools::TypeMapper->new;
  30
  31   # map all the types in the sequence
  $tm->map_types(-seq=>$seq,
                 -type_map=>
                   {CDS=>'ORF',
                    variation=>sub {
                        my $f = shift;
                        $f->length > 1 ?
                          'variation' : 'SNP'
                    },
                   });
  40
  41    # alternatively, use the hardcoded SO mapping
  42    $tm->map_types_to_SO(-seq=>$seq);
  43
  44 =head1 DESCRIPTION
  45
  46 This class implements an object for mapping between types; for
  47 example, the types in a genbank feature table, and the types specified
  48 in the Sequence Ontology.
  49
  50 You can specify your own mapping, either as a simple hash index, or by
  51 providing your own subroutines.
  52
  53 =head1 FEEDBACK
  54
  55 =head2 Mailing Lists
  56
User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to the
Bioperl mailing lists. Your participation is much appreciated.
  60
  61   bioperl-l@bioperl.org                         - General discussion
  62   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  63
  64 =head2 Support
  65
  66 Please direct usage questions or support issues to the mailing list:
  67
  68 I<bioperl-l@bioperl.org>
  69
  70 rather than to the module maintainer directly. Many experienced and
responsive experts will be able to look at the problem and quickly
  72 address it. Please include a thorough description of the problem
  73 with code and data examples if at all possible.
  74
  75 =head2 Reporting Bugs
  76
Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution.  Bug reports can be submitted via the
web:
  80
  81   https://github.com/bioperl/bioperl-live/issues
  82
  83 =head1 AUTHOR - Chris Mungall
  84
  85 Email:  cjm@fruitfly.org
  86
  87 =head1 APPENDIX
  88
  89 The rest of the documentation details each of the object
  90 methods. Internal methods are usually preceded with a _
  91
  92 =cut
  93
  94
  95 # Let the code begin...
  96
  97 package Bio::SeqFeature::Tools::TypeMapper;
  98 use strict;
  99
 100 # Object preamble - inherits from Bio::Root::Root
 101
 102 use base qw(Bio::Root::Root);
 103
 104 =head2 new
 105
 106  Title   : new
 107  Usage   : $unflattener = Bio::SeqFeature::Tools::TypeMapper->new();
 108  Function: constructor
 109  Example :
 110  Returns : a new Bio::SeqFeature::Tools::TypeMapper
 111  Args    : see below
 112
 113
 114 =cut
 115
 116 sub new {
 117     my($class,@args) = @_;
 118     my $self = $class->SUPER::new(@args);
 119
 120     my($typemap) =
 121         $self->_rearrange([qw(TYPEMAP
 122                              )],
 123                           @args);
 124
 125     $typemap  && $self->typemap($typemap);
 126     return $self; # success - we hope!
 127 }
 128
 129 =head2 typemap
 130
 131  Title   : typemap
 132  Usage   : $obj->typemap($newval)
 133  Function:
 134  Example :
 135  Returns : value of typemap (a scalar)
 136  Args    : on set, new value (a scalar or undef, optional)
 137
 138
 139 =cut
 140
 141 sub typemap{
 142     my $self = shift;
 143
 144     return $self->{'typemap'} = shift if @_;
 145     return $self->{'typemap'};
 146 }
 147
 148 =head2 map_types
 149
 150  Title   : map_types
 151  Usage   :
 152  Function:
 153  Example :
 154  Returns :
 155  Args    :
 156
 157  dgg: added -undefined => "region" option to produce all valid SO mappings.
 158
 159 =cut
 160
 161 sub map_types{
 162    my ($self,@args) = @_;
 163
 164    my($sf, $seq, $type_map, $undefmap) =
 165      $self->_rearrange([qw(FEATURE
 166                            SEQ
 167                            TYPE_MAP
 168                            UNDEFINED
 169                           )],
 170                           @args);
 171    if (!$sf && !$seq) {
 172        $self->throw("you need to pass in either -feature or -seq");
 173    }
 174
 175    my @sfs = ($sf);
 176    if ($seq) {
 177        $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
 178        @sfs = $seq->get_all_SeqFeatures;
 179    }
 180    $type_map = $type_map || $self->typemap; # dgg: was type_map;
 181    foreach my $sf (@sfs) {
 182
 183        $sf->isa("Bio::SeqFeatureI") || $self->throw("$sf NOT A SeqFeatureI");
 184        $sf->isa("Bio::FeatureHolderI") || $self->throw("$sf NOT A FeatureHolderI");
 185
 186        my $type = $sf->primary_tag;
 187        my $mtype = $type_map->{$type};
 188        if ($mtype) {
 189            if (ref($mtype)) {
 190                if (ref($mtype) eq 'CODE') {
 191                    $mtype = $mtype->($sf);
 192                }
 193                else {
 194                    $self->throw('type_map values must be scalar or CODE ref. You said: '.$mtype.' for type: '.$type);
 195                }
 196            }
 197            elsif ($undefmap && $mtype eq 'undefined') { # dgg
 198               $mtype= $undefmap;
 199            }
 200            $sf->primary_tag($mtype);
 201        }
 202    }
 203    return;
 204 }
 205
 206 =head2 map_types_to_SO
 207
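 Title   : map_types_to_SO
 Usage   : $tm->map_types_to_SO(-seq => $seq);
           $tm->map_types_to_SO(-feature => $sf);
 Function: calls map_types with the hash returned by FT_SO_map
           supplied as -type_map
 Example :
 Returns : the return value of map_types
 Args    : the same named arguments as map_types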
 214
 215 hardcodes the genbank to SO mapping
 216
 217 Based on revision 1.22 of SO
 218
 219 Please see the actual code for the mappings
 220
 221 Taken from
 222
 223 L<http://sequenceontology.org/resources/mapping/FT_SO.txt>
 224
 225 dgg: separated out FT_SO_map for caller changes. Update with:
 226
 227   open(FTSO,"curl -s http://sequenceontology.org/resources/mapping/FT_SO.txt|");
 228   while(<FTSO>){
 229     chomp; ($ft,$so,$sid,$ftdef,$sodef)= split"\t";
 230     print "     '$ft' => '$so',\n" if($ft && $so && $ftdef);
 231   }
 232
 233 =cut
 234
 235 sub FT_SO_map  {
 236   # $self= shift;
 237   # note : some of the ft_so mappings are commented out and overriden...
 238     return {
 239         "-" => ["located_sequence_feature", "so:0000110"],
 240         "-10_signal" => ["minus_10_signal", "so:0000175"],
 241         "-35_signal" => ["minus_35_signal", "so:0000176"],
 242         "3'utr" => ["three_prime_utr", "so:0000205"],
 243         "3'clip" => ["three_prime_clip", "so:0000557"],
 244         "5'utr" => ["five_prime_utr", "so:0000204"],
 245         "5'clip" => ["five_prime_clip", "so:0000555"],
 246         "caat_signal" => ["caat_signal", "so:0000172"],
 247         "cds" => ["cds", "so:0000316"],
 248         "c_region" => ["undefined", ""],
 249         "d-loop" => ["d_loop", "so:0000297"],
 250         "d_segment" => ["d_gene", "so:0000458"],
 251         "gc_signal" => ["gc_rich_region", "so:0000173"],
 252         "j_segment" => ["undefined", ""],
 253         "ltr" => ["long_terminal_repeat", "so:0000286"],
 254         "n_region" => ["undefined", ""],
 255         "rbs" => ["ribosome_entry_site", "so:0000139"],
 256         "sts" => ["sts", "so:0000331"],
 257         "s_region" => ["undefined", ""],
 258         "tata_signal" => ["tata_box", "so:0000174"],
 259         "v_region" => ["undefined", ""],
 260         "v_segment" => ["undefined", ""],
 261         "attenuator" => ["attenuator", "so:0000140"],
 262         "conflict" => ["undefined", ""],
 263         "enhancer" => ["enhancer", "so:0000165"],
 264         "exon" => ["exon", "so:0000147"],
 265         "gap" => ["gap", "so:0000730"],
 266         "gene" => ["gene", "so:0000704"],
 267         "idna" => ["idna", "so:0000723"],
 268         "intron" => ["intron", "so:0000188"],
 269         "mRNA" => ["mRNA", "so:0000234"],
 270         "mat_peptide" => ["mature_protein_region", "so:0000419"],
 271         "mature_peptide" => ["mature_protein_region", "so:0000419"],
 272         #"misc_RNA" => ["transcript", "so:0000673"],
 273         "misc_binding" => ["binding_site", "so:0000409"],
 274         "misc_difference" => ["sequence_difference", "so:0000413"],
 275         "misc_feature" => ["region", undef],
 276         "misc_recomb" => ["recombination_feature", "so:0000298"],
 277         "misc_signal" => ["regulatory_region", "so:0005836"],
 278         "misc_structure" => ["sequence_secondary_structure", "so:0000002"],
 279         "modified_base" => ["modified_base_site", "so:0000305"],
 280         "old_sequence" => ["undefined", ""],
 281         "operon" => ["operon", "so:0000178"],
 282         "oriT" => ["origin_of_transfer", "so:0000724"],
 283         "polya_signal" => ["polyA_signal_sequence", "so:0000551"],
 284         "polya_site" => ["polyA_site", "so:0000553"],
 285         "precursor_RNA" => ["primary_transcript", "so:0000185"],
 286         "prim_transcript" => ["primary_transcript", "so:0000185"],
 287         "primer_bind" => ["primer_binding_site", "so:0005850"],
 288         "promoter" => ["promoter", "so:0000167"],
 289         "protein_bind" => ["protein_binding_site", "so:0000410"],
 290         "rRNA" => ["rRNA", "so:0000252"],
 291         "repeat_region" => ["repeat_region", "so:0000657"],
 292         "repeat_unit" => ["repeat_unit", "so:0000726"],
 293         "satellite" => ["satellite_dna", "so:0000005"],
 294         "scRNA" => ["scRNA", "so:0000013"],
 295         "sig_peptide" => ["signal_peptide", "so:0000418"],
 296         "snRNA" => ["snRNA", "so:0000274"],
 297         "snoRNA" => ["snoRNA", "so:0000275"],
 298         #"source" => ["databank_entry", "so:2000061"],
 299         "stem_loop" => ["stem_loop", "so:0000313"],
 300         "tRNA" => ["tRNA", "so:0000253"],
 301         "terminator" => ["terminator", "so:0000141"],
 302         "transit_peptide" => ["transit_peptide", "so:0000725"],
 303         "unsure" => "undefined",
 304         "variation" => ["sequence_variant", "so:0000109"],
 305
 306         # manually added
 307         ## has parent = pseudogene ; dgg
 308         "pseudomRNA" => ["pseudogenic_transcript", "so:0000516"],
 309         ## from unflattener misc_rna ; dgg
 310         "pseudotranscript" => ["pseudogenic_transcript", "so:0000516"],
 311         "pseudoexon" => ["pseudogenic_exon", "so:0000507"],
 312         "pseudoCDS" => ["pseudogenic_exon", "so:0000507"],
 313         "pseudomisc_feature" => ["pseudogenic_region", "so:0000462"],
 314         "pseudointron" => ["pseudogenic_region", "so:0000462"],
 315
 316
 317         ## "undefined" => "region",
 318
 319         # this is the most generic form for rnas;
 320         # we always represent the processed form of
 321         # the transcript
 322         misc_RNA => ['mature_transcript',"so:0000233"],
 323
 324         # not sure about this one...
 325         source=>['contig', "SO:0000149"],
 326
 327         rep_origin=>['origin_of_replication',"SO:0000296"],
 328
 329         Protein=>['polypeptide',"SO:0000104"],
 330     };
 331 #  return {
 332      #"FT term" => "SO term",
 333      #"-" => "located_sequence_feature",
 334      #"-10_signal" => "minus_10_signal",
 335      #"-35_signal" => "minus_35_signal",
 336      #"3'UTR" => "three_prime_UTR",
 337      #"3'clip" => "three_prime_clip",
 338      #"5'UTR" => "five_prime_UTR",
 339      #"5'clip" => "five_prime_clip",
 340      #"CAAT_signal" => "CAAT_signal",
 341      #"CDS" => "CDS",
 342      #"C_region" => "undefined",
 343      #"D-loop" => "D_loop",
 344      #"D_segment" => "D_gene",
 345      #"GC_signal" => "GC_rich_region",
 346      #"J_segment" => "undefined",
 347      #"LTR" => "long_terminal_repeat",
 348      #"N_region" => "undefined",
 349      #"RBS" => "ribosome_entry_site",
 350      #"STS" => "STS",
 351      #"S_region" => "undefined",
 352      #"TATA_signal" => "TATA_box",
 353      #"V_region" => "undefined",
 354      #"V_segment" => "undefined",
 355      #"attenuator" => "attenuator",
 356      #"conflict" => "undefined",
 357      #"enhancer" => "enhancer",
 358      #"exon" => "exon",
 359      #"gap" => "gap",
 360      #"gene" => "gene",
 361      #"iDNA" => "iDNA",
 362      #"intron" => "intron",
 363      #"mRNA" => "mRNA",
 364      #"mat_peptide" => "mature_protein_region",
 365      #"mature_peptide" => "mature_protein_region",
 366 ##                     "misc_RNA" => "transcript",
 367      #"misc_binding" => "binding_site",
 368      #"misc_difference" => "sequence_difference",
 369      #"misc_feature" => "region",
 370      #"misc_recomb" => "recombination_feature",
 371      #"misc_signal" => "regulatory_region",
 372      #"misc_structure" => "sequence_secondary_structure",
 373      #"modified_base" => "modified_base_site",
 374      #"old_sequence" => "undefined",
 375      #"operon" => "operon",
 376      #"oriT" => "origin_of_transfer",
 377      #"polyA_signal" => "polyA_signal_sequence",
 378      #"polyA_site" => "polyA_site",
 379      #"precursor_RNA" => "primary_transcript",
 380      #"prim_transcript" => "primary_transcript",
 381      #"primer_bind" => "primer_binding_site",
 382      #"promoter" => "promoter",
 383      #"protein_bind" => "protein_binding_site",
 384      #"rRNA" => "rRNA",
 385      #"repeat_region" => "repeat_region",
 386      #"repeat_unit" => "repeat_unit",
 387      #"satellite" => "satellite_DNA",
 388      #"scRNA" => "scRNA",
 389      #"sig_peptide" => "signal_peptide",
 390      #"snRNA" => "snRNA",
 391      #"snoRNA" => "snoRNA",
 392 ##                     "source" => "databank_entry",
 393      #"stem_loop" => "stem_loop",
 394      #"tRNA" => "tRNA",
 395      #"terminator" => "terminator",
 396      #"transit_peptide" => "transit_peptide",
 397      #"unsure" => "undefined",
 398      #"variation" => "sequence_variant",
 399
 400       #"pseudomRNA" => "pseudogenic_transcript", ## has parent = pseudogene ; dgg
 401       #"pseudotranscript" => "pseudogenic_transcript", ## from Unflattener misc_RNA ; dgg
 402       #"pseudoexon" => "pseudogenic_exon",
 403       #"pseudoCDS"  => "pseudogenic_exon",
 404       #"pseudomisc_feature" => "pseudogenic_region",
 405       #"pseudointron" => "pseudogenic_region",
 406
 407       ### "undefined" => "region",
 408
 409       ## this is the most generic form for RNAs;
 410       ## we always represent the processed form of
 411       ## the transcript
 412       #misc_RNA=>'processed_transcript',
 413
 414       ## not sure about this one...
 415       #source=>'contig',
 416
 417       #rep_origin=>'origin_of_replication',
 418
 419       #Protein=>'protein',
 420       #};
 421 }
 422
 423 sub map_types_to_SO{
 424    my ($self,@args) = @_;
 425
 426    push(@args, (-type_map=> $self->FT_SO_map() ) );
 427    return $self->map_types(@args);
 428 }
 429
 430 =head2 get_relationship_type_by_parent_child
 431
 432  Title   : get_relationship_type_by_parent_child
 433  Usage   : $type = $tm->get_relationship_type_by_parent_child($parent_sf, $child_sf);
 434  Usage   : $type = $tm->get_relationship_type_by_parent_child('mRNA', 'protein');
 435  Function: given two features where the parent contains the child,
 436            will determine what the relationship between them in
 437  Example :
 438  Returns :
 439  Args    : parent SeqFeature, child SeqFeature OR
 440            parent type string, child type string OR
 441
 442 bioperl Seq::FeatureHolderI hierarchies are equivalent to unlabeled
 443 graphs (where parent nodes are the containers, and child nodes are the
 444 features being contained). For example, a feature of type mRNA can
 445 contain features of type exon.
 446
 447 Some external representations (eg chadoxml or chaosxml) require that
 448 the edges in the feature relationship graph are labeled. For example,
 449 the type between mRNA and exon would be B<part_of>. Although it
 450 stretches the bioperl notion of containment, we could have a CDS
 451 contained by an mRNA (for example, the
 452 L<Bio::SeqFeature::Tools::Unflattener> module takes genbank records
 453 and makes these kind of links. The relationship here would be
 454 B<produced_by>
 455
 456 In chado speak, the child is the B<subject> feature and the parent is
 457 the B<object> feature
 458
 459 =cut
 460
 461 sub get_relationship_type_by_parent_child {
 462    my ($self,$parent,$child) = @_;
 463    $parent = ref($parent) ? $parent->primary_tag : $parent;
 464    $child = ref($child) ? $child->primary_tag : $child;
 465
 466    my $type = 'part_of'; # default
 467
 468    # TODO - do this with metadata, or infer via SO itself
 469
 470    if (lc($child) eq 'protein') {
 471        $type = 'derives_from';
 472    }
 473    if (lc($child) eq 'polypeptide') {
 474        $type = 'derives_from';
 475    }
 476    return $type;
 477 }
 478
 479
 480 1;