2 # bioperl module for Bio::SeqFeature::Tools::TypeMapper
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Mungall <cjm@fruitfly.org>
8 # Copyright Chris Mungall
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqFeature::Tools::TypeMapper - maps $seq_feature-E<gt>primary_tag
21 use Bio::SeqFeature::Tools::TypeMapper;
23 # first fetch a genbank SeqI object
25 Bio::SeqIO->new(-file=>'AE003644.gbk',
27 $seq = $seqio->next_seq();
29 $tm = Bio::SeqFeature::Tools::TypeMapper->new;
31 # map all the types in the sequence
32 $tm->map_types(-seq=>$seq,
41 # alternatively, use the hardcoded SO mapping
42 $tm->map_types_to_SO(-seq=>$seq);
46 This class implements an object for mapping between types; for
47 example, the types in a genbank feature table, and the types specified
48 in the Sequence Ontology.
50 You can specify your own mapping, either as a simple hash index, or by
51 providing your own subroutines.
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to the
59 Bioperl mailing lists. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 Please direct usage questions or support issues to the mailing list:
68 I<bioperl-l@bioperl.org>
70 rather than to the module maintainer directly. Many experienced and
71 responsive experts will be able to look at the problem and quickly
72 address it. Please include a thorough description of the problem
73 with code and data examples if at all possible.
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 of the bugs and their resolution. Bug reports can be submitted via the
81 https://github.com/bioperl/bioperl-live/issues
83 =head1 AUTHOR - Chris Mungall
85 Email: cjm@fruitfly.org
89 The rest of the documentation details each of the object
90 methods. Internal methods are usually preceded with a _
95 # Let the code begin...
97 package Bio
::SeqFeature
::Tools
::TypeMapper
;
100 # Object preamble - inherits from Bio::Root::Root
102 use base
qw(Bio::Root::Root);
107 Usage : $tm = Bio::SeqFeature::Tools::TypeMapper->new();
108 Function: constructor
110 Returns : a new Bio::SeqFeature::Tools::TypeMapper
117 my($class,@args) = @_;
118 my $self = $class->SUPER::new
(@args);
121 $self->_rearrange([qw(TYPEMAP
125 $typemap && $self->typemap($typemap);
126 return $self; # success - we hope!
132 Usage : $obj->typemap($newval)
135 Returns : value of typemap (a scalar)
136 Args : on set, new value (a scalar or undef, optional)
144 return $self->{'typemap'} = shift if @_;
145 return $self->{'typemap'};
157 dgg: added -undefined => "region" option to produce all valid SO mappings.
162 my ($self,@args) = @_;
164 my($sf, $seq, $type_map, $undefmap) =
165 $self->_rearrange([qw(FEATURE
172 $self->throw("you need to pass in either -feature or -seq");
177 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
178 @sfs = $seq->get_all_SeqFeatures;
180 $type_map = $type_map || $self->typemap; # dgg: was type_map;
181 foreach my $sf (@sfs) {
183 $sf->isa("Bio::SeqFeatureI") || $self->throw("$sf NOT A SeqFeatureI");
184 $sf->isa("Bio::FeatureHolderI") || $self->throw("$sf NOT A FeatureHolderI");
186 my $type = $sf->primary_tag;
187 my $mtype = $type_map->{$type};
190 if (ref($mtype) eq 'CODE') {
191 $mtype = $mtype->($sf);
194 $self->throw('type_map values must be scalar or CODE ref. You said: '.$mtype.' for type: '.$type);
197 elsif ($undefmap && $mtype eq 'undefined') { # dgg
200 $sf->primary_tag($mtype);
206 =head2 map_types_to_SO
208 Title : map_types_to_SO
215 hardcodes the genbank to SO mapping
217 Based on revision 1.22 of SO
219 Please see the actual code for the mappings
223 L<http://sequenceontology.org/resources/mapping/FT_SO.txt>
225 dgg: separated out FT_SO_map for caller changes. Update with:
227 open(FTSO,"curl -s http://sequenceontology.org/resources/mapping/FT_SO.txt|");
229 chomp; ($ft,$so,$sid,$ftdef,$sodef)= split"\t";
230 print " '$ft' => '$so',\n" if($ft && $so && $ftdef);
237 # note : some of the ft_so mappings are commented out and overridden...
239 "-" => ["located_sequence_feature", "so:0000110"],
240 "-10_signal" => ["minus_10_signal", "so:0000175"],
241 "-35_signal" => ["minus_35_signal", "so:0000176"],
242 "3'utr" => ["three_prime_utr", "so:0000205"],
243 "3'clip" => ["three_prime_clip", "so:0000557"],
244 "5'utr" => ["five_prime_utr", "so:0000204"],
245 "5'clip" => ["five_prime_clip", "so:0000555"],
246 "caat_signal" => ["caat_signal", "so:0000172"],
247 "cds" => ["cds", "so:0000316"],
248 "c_region" => ["undefined", ""],
249 "d-loop" => ["d_loop", "so:0000297"],
250 "d_segment" => ["d_gene", "so:0000458"],
251 "gc_signal" => ["gc_rich_region", "so:0000173"],
252 "j_segment" => ["undefined", ""],
253 "ltr" => ["long_terminal_repeat", "so:0000286"],
254 "n_region" => ["undefined", ""],
255 "rbs" => ["ribosome_entry_site", "so:0000139"],
256 "sts" => ["sts", "so:0000331"],
257 "s_region" => ["undefined", ""],
258 "tata_signal" => ["tata_box", "so:0000174"],
259 "v_region" => ["undefined", ""],
260 "v_segment" => ["undefined", ""],
261 "attenuator" => ["attenuator", "so:0000140"],
262 "conflict" => ["undefined", ""],
263 "enhancer" => ["enhancer", "so:0000165"],
264 "exon" => ["exon", "so:0000147"],
265 "gap" => ["gap", "so:0000730"],
266 "gene" => ["gene", "so:0000704"],
267 "idna" => ["idna", "so:0000723"],
268 "intron" => ["intron", "so:0000188"],
269 "mRNA" => ["mRNA", "so:0000234"],
270 "mat_peptide" => ["mature_protein_region", "so:0000419"],
271 "mature_peptide" => ["mature_protein_region", "so:0000419"],
272 #"misc_RNA" => ["transcript", "so:0000673"],
273 "misc_binding" => ["binding_site", "so:0000409"],
274 "misc_difference" => ["sequence_difference", "so:0000413"],
275 "misc_feature" => ["region", undef],
276 "misc_recomb" => ["recombination_feature", "so:0000298"],
277 "misc_signal" => ["regulatory_region", "so:0005836"],
278 "misc_structure" => ["sequence_secondary_structure", "so:0000002"],
279 "modified_base" => ["modified_base_site", "so:0000305"],
280 "old_sequence" => ["undefined", ""],
281 "operon" => ["operon", "so:0000178"],
282 "oriT" => ["origin_of_transfer", "so:0000724"],
283 "polya_signal" => ["polyA_signal_sequence", "so:0000551"],
284 "polya_site" => ["polyA_site", "so:0000553"],
285 "precursor_RNA" => ["primary_transcript", "so:0000185"],
286 "prim_transcript" => ["primary_transcript", "so:0000185"],
287 "primer_bind" => ["primer_binding_site", "so:0005850"],
288 "promoter" => ["promoter", "so:0000167"],
289 "protein_bind" => ["protein_binding_site", "so:0000410"],
290 "rRNA" => ["rRNA", "so:0000252"],
291 "repeat_region" => ["repeat_region", "so:0000657"],
292 "repeat_unit" => ["repeat_unit", "so:0000726"],
293 "satellite" => ["satellite_dna", "so:0000005"],
294 "scRNA" => ["scRNA", "so:0000013"],
295 "sig_peptide" => ["signal_peptide", "so:0000418"],
296 "snRNA" => ["snRNA", "so:0000274"],
297 "snoRNA" => ["snoRNA", "so:0000275"],
298 #"source" => ["databank_entry", "so:2000061"],
299 "stem_loop" => ["stem_loop", "so:0000313"],
300 "tRNA" => ["tRNA", "so:0000253"],
301 "terminator" => ["terminator", "so:0000141"],
302 "transit_peptide" => ["transit_peptide", "so:0000725"],
303 "unsure" => "undefined",
304 "variation" => ["sequence_variant", "so:0000109"],
307 ## has parent = pseudogene ; dgg
308 "pseudomRNA" => ["pseudogenic_transcript", "so:0000516"],
309 ## from unflattener misc_rna ; dgg
310 "pseudotranscript" => ["pseudogenic_transcript", "so:0000516"],
311 "pseudoexon" => ["pseudogenic_exon", "so:0000507"],
312 "pseudoCDS" => ["pseudogenic_exon", "so:0000507"],
313 "pseudomisc_feature" => ["pseudogenic_region", "so:0000462"],
314 "pseudointron" => ["pseudogenic_region", "so:0000462"],
317 ## "undefined" => "region",
319 # this is the most generic form for rnas;
320 # we always represent the processed form of
322 misc_RNA
=> ['mature_transcript',"so:0000233"],
324 # not sure about this one...
325 source
=>['contig', "SO:0000149"],
327 rep_origin
=>['origin_of_replication',"SO:0000296"],
329 Protein
=>['polypeptide',"SO:0000104"],
332 #"FT term" => "SO term",
333 #"-" => "located_sequence_feature",
334 #"-10_signal" => "minus_10_signal",
335 #"-35_signal" => "minus_35_signal",
336 #"3'UTR" => "three_prime_UTR",
337 #"3'clip" => "three_prime_clip",
338 #"5'UTR" => "five_prime_UTR",
339 #"5'clip" => "five_prime_clip",
340 #"CAAT_signal" => "CAAT_signal",
342 #"C_region" => "undefined",
343 #"D-loop" => "D_loop",
344 #"D_segment" => "D_gene",
345 #"GC_signal" => "GC_rich_region",
346 #"J_segment" => "undefined",
347 #"LTR" => "long_terminal_repeat",
348 #"N_region" => "undefined",
349 #"RBS" => "ribosome_entry_site",
351 #"S_region" => "undefined",
352 #"TATA_signal" => "TATA_box",
353 #"V_region" => "undefined",
354 #"V_segment" => "undefined",
355 #"attenuator" => "attenuator",
356 #"conflict" => "undefined",
357 #"enhancer" => "enhancer",
362 #"intron" => "intron",
364 #"mat_peptide" => "mature_protein_region",
365 #"mature_peptide" => "mature_protein_region",
366 ## "misc_RNA" => "transcript",
367 #"misc_binding" => "binding_site",
368 #"misc_difference" => "sequence_difference",
369 #"misc_feature" => "region",
370 #"misc_recomb" => "recombination_feature",
371 #"misc_signal" => "regulatory_region",
372 #"misc_structure" => "sequence_secondary_structure",
373 #"modified_base" => "modified_base_site",
374 #"old_sequence" => "undefined",
375 #"operon" => "operon",
376 #"oriT" => "origin_of_transfer",
377 #"polyA_signal" => "polyA_signal_sequence",
378 #"polyA_site" => "polyA_site",
379 #"precursor_RNA" => "primary_transcript",
380 #"prim_transcript" => "primary_transcript",
381 #"primer_bind" => "primer_binding_site",
382 #"promoter" => "promoter",
383 #"protein_bind" => "protein_binding_site",
385 #"repeat_region" => "repeat_region",
386 #"repeat_unit" => "repeat_unit",
387 #"satellite" => "satellite_DNA",
389 #"sig_peptide" => "signal_peptide",
391 #"snoRNA" => "snoRNA",
392 ## "source" => "databank_entry",
393 #"stem_loop" => "stem_loop",
395 #"terminator" => "terminator",
396 #"transit_peptide" => "transit_peptide",
397 #"unsure" => "undefined",
398 #"variation" => "sequence_variant",
400 #"pseudomRNA" => "pseudogenic_transcript", ## has parent = pseudogene ; dgg
401 #"pseudotranscript" => "pseudogenic_transcript", ## from Unflattener misc_RNA ; dgg
402 #"pseudoexon" => "pseudogenic_exon",
403 #"pseudoCDS" => "pseudogenic_exon",
404 #"pseudomisc_feature" => "pseudogenic_region",
405 #"pseudointron" => "pseudogenic_region",
407 ### "undefined" => "region",
409 ## this is the most generic form for RNAs;
410 ## we always represent the processed form of
412 #misc_RNA=>'processed_transcript',
414 ## not sure about this one...
417 #rep_origin=>'origin_of_replication',
424 my ($self,@args) = @_;
426 push(@args, (-type_map
=> $self->FT_SO_map() ) );
427 return $self->map_types(@args);
430 =head2 get_relationship_type_by_parent_child
432 Title : get_relationship_type_by_parent_child
433 Usage : $type = $tm->get_relationship_type_by_parent_child($parent_sf, $child_sf);
434 Usage : $type = $tm->get_relationship_type_by_parent_child('mRNA', 'protein');
435 Function: given two features where the parent contains the child,
436 will determine what the relationship between them in
439 Args : parent SeqFeature, child SeqFeature OR
440 parent type string, child type string OR
442 bioperl Seq::FeatureHolderI hierarchies are equivalent to unlabeled
443 graphs (where parent nodes are the containers, and child nodes are the
444 features being contained). For example, a feature of type mRNA can
445 contain features of type exon.
447 Some external representations (eg chadoxml or chaosxml) require that
448 the edges in the feature relationship graph are labeled. For example,
449 the type between mRNA and exon would be B<part_of>. Although it
450 stretches the bioperl notion of containment, we could have a CDS
451 contained by an mRNA (for example, the
452 L<Bio::SeqFeature::Tools::Unflattener> module takes genbank records
453 and makes these kinds of links). The relationship here would be
456 In chado speak, the child is the B<subject> feature and the parent is
457 the B<object> feature
461 sub get_relationship_type_by_parent_child
{
462 my ($self,$parent,$child) = @_;
463 $parent = ref($parent) ?
$parent->primary_tag : $parent;
464 $child = ref($child) ?
$child->primary_tag : $child;
466 my $type = 'part_of'; # default
468 # TODO - do this with metadata, or infer via SO itself
470 if (lc($child) eq 'protein') {
471 $type = 'derives_from';
473 if (lc($child) eq 'polypeptide') {
474 $type = 'derives_from';