maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / SeqFeature / Tools / TypeMapper.pm
blobe718fc2106f7231bb2e03ca02a86116b831a8319
2 # bioperl module for Bio::SeqFeature::Tools::TypeMapper
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Mungall <cjm@fruitfly.org>
8 # Copyright Chris Mungall
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqFeature::Tools::TypeMapper - maps $seq_feature-E<gt>primary_tag
18 =head1 SYNOPSIS
20 use Bio::SeqIO;
21 use Bio::SeqFeature::Tools::TypeMapper;
23 # first fetch a genbank SeqI object
24 $seqio =
25 Bio::SeqIO->new(-file=>'AE003644.gbk',
26 -format=>'GenBank');
27 $seq = $seqio->next_seq();
29 $tm = Bio::SeqFeature::Tools::TypeMapper->new;
31 # map all the types in the sequence
32 $tm->map_types(-seq=>$seq,
33 -type_map=>{CDS=>'ORF',
34 variation=>sub {
35 my $f = shift;
36 $f->length > 1 ?
37 'variation' : 'SNP'
38 },
39 });
41 # alternatively, use the hardcoded SO mapping
42 $tm->map_types_to_SO(-seq=>$seq);
44 =head1 DESCRIPTION
46 This class implements an object for mapping between types; for
47 example, the types in a genbank feature table, and the types specified
48 in the Sequence Ontology.
50 You can specify your own mapping, either as a simple hash index, or by
51 providing your own subroutines.
53 =head1 FEEDBACK
55 =head2 Mailing Lists
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to the
59 Bioperl mailing lists. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
64 =head2 Support
66 Please direct usage questions or support issues to the mailing list:
68 I<bioperl-l@bioperl.org>
70 rather than to the module maintainer directly. Many experienced and
71 responsive experts will be able to look at the problem and quickly
72 address it. Please include a thorough description of the problem
73 with code and data examples if at all possible.
75 =head2 Reporting Bugs
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 of the bugs and their resolution. Bug reports can be submitted via the
79 web:
81 https://github.com/bioperl/bioperl-live/issues
83 =head1 AUTHOR - Chris Mungall
85 Email: cjm@fruitfly.org
87 =head1 APPENDIX
89 The rest of the documentation details each of the object
90 methods. Internal methods are usually preceded with a _
92 =cut
95 # Let the code begin...
97 package Bio::SeqFeature::Tools::TypeMapper;
98 use strict;
100 # Object preamble - inherits from Bio::Root::Root
102 use base qw(Bio::Root::Root);
104 =head2 new
106 Title : new
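107 Usage : $tm = Bio::SeqFeature::Tools::TypeMapper->new();
108 Function: constructor
109 Example : $tm = Bio::SeqFeature::Tools::TypeMapper->new(-typemap=>{CDS=>'ORF'});
110 Returns : a new Bio::SeqFeature::Tools::TypeMapper
111 Args : -typemap => hash ref used as the default type map (optional)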
114 =cut
# Constructor.
#
# Delegates object creation to the parent class, then uses _rearrange to pull
# the TYPEMAP argument out of the argument list. When a true value was given
# it is stored through the typemap() accessor.
#
# Args    : the constructor argument list; only TYPEMAP is read here
# Returns : the newly created object
sub new {
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);

    my ($initial_map) = $self->_rearrange([qw(TYPEMAP)], @args);
    if ($initial_map) {
        $self->typemap($initial_map);
    }

    return $self;    # success - we hope!
}
129 =head2 typemap
131 Title : typemap
132 Usage : $obj->typemap($newval)
133 Function:
134 Example :
135 Returns : value of typemap (a scalar)
136 Args : on set, new value (a scalar or undef, optional)
139 =cut
# Combined getter/setter for the object's 'typemap' slot.
#
# Args    : optional new value; any supplied argument (even undef) is stored
# Returns : the value held in the 'typemap' slot after the optional update
sub typemap {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'typemap'} = $new_value[0];
    }

    return $self->{'typemap'};
}
148 =head2 map_types
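150 Title : map_types
151 Usage : $tm->map_types(-seq=>$seq, -type_map=>{CDS=>'ORF'});
152 Function: sets the primary_tag of each feature to its mapped type
153 Example :
154 Returns : nothing
155 Args : -feature or -seq (one is required); -type_map, -undefined (optional)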
157 dgg: added -undefined => "region" option to produce all valid SO mappings.
159 =cut
# map_types - rewrite the primary_tag of one feature, or of every feature of
# a sequence, according to a type map.
#
# Arguments (extracted with _rearrange as FEATURE, SEQ, TYPE_MAP, UNDEFINED):
#   -feature   : a single feature to remap
#   -seq       : a Bio::SeqI; everything returned by get_all_SeqFeatures is
#                remapped. Wins over -feature when both are given.
#   -type_map  : hash ref keyed by primary_tag; falls back to $self->typemap.
#                Each value may be
#                  * a plain string - the new primary_tag
#                  * a CODE ref     - called with the feature; its return
#                                     value becomes the new primary_tag
#                  * an ARRAY ref   - [ new_tag, ontology_id ], the layout
#                                     FT_SO_map returns; only the first
#                                     element is used here
#   -undefined : replacement applied when a string or ARRAY entry resolves
#                to the literal tag 'undefined'
#
# Features whose primary_tag has no (true) entry in the map are left alone.
#
# Returns : nothing
# Throws  : if neither -feature nor -seq is given, if an object is not of the
#           expected class, or if a map value is any other kind of reference
sub map_types {
    my ($self, @args) = @_;

    my ($sf, $seq, $type_map, $undefmap) =
        $self->_rearrange([qw(FEATURE SEQ TYPE_MAP UNDEFINED)], @args);

    if (!$sf && !$seq) {
        $self->throw("you need to pass in either -feature or -seq");
    }

    my @sfs = ($sf);
    if ($seq) {
        $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
        @sfs = $seq->get_all_SeqFeatures;
    }

    $type_map = $type_map || $self->typemap;    # dgg: was type_map;

    foreach my $feat (@sfs) {
        $feat->isa("Bio::SeqFeatureI")
            || $self->throw("$feat NOT A SeqFeatureI");
        $feat->isa("Bio::FeatureHolderI")
            || $self->throw("$feat NOT A FeatureHolderI");

        my $type  = $feat->primary_tag;
        my $mtype = $type_map->{$type};
        next unless $mtype;

        my $kind = ref($mtype);
        if ($kind eq 'CODE') {
            $mtype = $mtype->($feat);
        }
        else {
            if ($kind eq 'ARRAY') {
                # [ name, id ] pairs as produced by FT_SO_map: the name is
                # the new primary_tag. Previously these entries hit the
                # "must be scalar or CODE ref" throw below.
                ($mtype) = @{$mtype};
                next unless defined $mtype && length $mtype;
            }
            elsif ($kind) {
                $self->throw('type_map values must be scalar, ARRAY ref or CODE ref. You said: '.$mtype.' for type: '.$type);
            }

            if ($undefmap && $mtype eq 'undefined') {    # dgg
                $mtype = $undefmap;
            }
        }

        $feat->primary_tag($mtype);
    }

    return;
}
206 =head2 map_types_to_SO
208 Title : map_types_to_SO
209 Usage :
210 Function:
211 Example :
212 Returns :
213 Args :
215 hardcodes the genbank to SO mapping
217 Based on revision 1.22 of SO
219 Please see the actual code for the mappings
221 Taken from
223 L<http://sequenceontology.org/resources/mapping/FT_SO.txt>
225 dgg: separated out FT_SO_map for caller changes. Update with:
227 open(FTSO,"curl -s http://sequenceontology.org/resources/mapping/FT_SO.txt|");
228 while(<FTSO>){
229 chomp; ($ft,$so,$sid,$ftdef,$sodef)= split"\t";
230 print " '$ft' => '$so',\n" if($ft && $so && $ftdef);
233 =cut
# FT_SO_map - hard-coded mapping from GenBank/INSDC feature-table types to
# Sequence Ontology terms.
#
# Returns a newly built hash ref on every call. Keys are feature-table type
# names in their original case (e.g. 'CDS', "3'UTR", 'polyA_signal'), so they
# can be looked up directly with a feature's primary_tag. Values are
# [ SO_term_name, SO_id ] pairs. Feature types with no SO equivalent map to
# the placeholder name 'undefined' with an empty ID. The 'unsure' entry is a
# bare 'undefined' string rather than a pair and is kept that way.
#
# Earlier revisions of this table had a number of keys and term names folded
# to lower case ('cds', "3'utr", 'three_prime_utr', 'so:...'). Those never
# matched real feature-table tags. The case used here follows the older
# name-only table this sub used to carry as a commented-out block. That
# legacy table differed in only two values: misc_RNA => 'processed_transcript'
# and Protein => 'protein'.
#
# The invocant is not used, so this works as a method or a plain function.
sub FT_SO_map {
    # note : some of the ft_so mappings are commented out and overridden...
    return {
        "-"                  => ["located_sequence_feature", "SO:0000110"],
        "-10_signal"         => ["minus_10_signal", "SO:0000175"],
        "-35_signal"         => ["minus_35_signal", "SO:0000176"],
        "3'UTR"              => ["three_prime_UTR", "SO:0000205"],
        "3'clip"             => ["three_prime_clip", "SO:0000557"],
        "5'UTR"              => ["five_prime_UTR", "SO:0000204"],
        "5'clip"             => ["five_prime_clip", "SO:0000555"],
        "CAAT_signal"        => ["CAAT_signal", "SO:0000172"],
        "CDS"                => ["CDS", "SO:0000316"],
        "C_region"           => ["undefined", ""],
        "D-loop"             => ["D_loop", "SO:0000297"],
        "D_segment"          => ["D_gene", "SO:0000458"],
        "GC_signal"          => ["GC_rich_region", "SO:0000173"],
        "J_segment"          => ["undefined", ""],
        "LTR"                => ["long_terminal_repeat", "SO:0000286"],
        "N_region"           => ["undefined", ""],
        "RBS"                => ["ribosome_entry_site", "SO:0000139"],
        "STS"                => ["STS", "SO:0000331"],
        "S_region"           => ["undefined", ""],
        "TATA_signal"        => ["TATA_box", "SO:0000174"],
        "V_region"           => ["undefined", ""],
        "V_segment"          => ["undefined", ""],
        "attenuator"         => ["attenuator", "SO:0000140"],
        "conflict"           => ["undefined", ""],
        "enhancer"           => ["enhancer", "SO:0000165"],
        "exon"               => ["exon", "SO:0000147"],
        "gap"                => ["gap", "SO:0000730"],
        "gene"               => ["gene", "SO:0000704"],
        "iDNA"               => ["iDNA", "SO:0000723"],
        "intron"             => ["intron", "SO:0000188"],
        "mRNA"               => ["mRNA", "SO:0000234"],
        "mat_peptide"        => ["mature_protein_region", "SO:0000419"],
        "mature_peptide"     => ["mature_protein_region", "SO:0000419"],
        #"misc_RNA"          => ["transcript", "SO:0000673"],
        "misc_binding"       => ["binding_site", "SO:0000409"],
        "misc_difference"    => ["sequence_difference", "SO:0000413"],
        "misc_feature"       => ["region", undef],
        "misc_recomb"        => ["recombination_feature", "SO:0000298"],
        "misc_signal"        => ["regulatory_region", "SO:0005836"],
        "misc_structure"     => ["sequence_secondary_structure", "SO:0000002"],
        "modified_base"      => ["modified_base_site", "SO:0000305"],
        "old_sequence"       => ["undefined", ""],
        "operon"             => ["operon", "SO:0000178"],
        "oriT"               => ["origin_of_transfer", "SO:0000724"],
        "polyA_signal"       => ["polyA_signal_sequence", "SO:0000551"],
        "polyA_site"         => ["polyA_site", "SO:0000553"],
        "precursor_RNA"      => ["primary_transcript", "SO:0000185"],
        "prim_transcript"    => ["primary_transcript", "SO:0000185"],
        "primer_bind"        => ["primer_binding_site", "SO:0005850"],
        "promoter"           => ["promoter", "SO:0000167"],
        "protein_bind"       => ["protein_binding_site", "SO:0000410"],
        "rRNA"               => ["rRNA", "SO:0000252"],
        "repeat_region"      => ["repeat_region", "SO:0000657"],
        "repeat_unit"        => ["repeat_unit", "SO:0000726"],
        "satellite"          => ["satellite_DNA", "SO:0000005"],
        "scRNA"              => ["scRNA", "SO:0000013"],
        "sig_peptide"        => ["signal_peptide", "SO:0000418"],
        "snRNA"              => ["snRNA", "SO:0000274"],
        "snoRNA"             => ["snoRNA", "SO:0000275"],
        #"source"            => ["databank_entry", "SO:2000061"],
        "stem_loop"          => ["stem_loop", "SO:0000313"],
        "tRNA"               => ["tRNA", "SO:0000253"],
        "terminator"         => ["terminator", "SO:0000141"],
        "transit_peptide"    => ["transit_peptide", "SO:0000725"],
        # Kept as a bare string (not a [name, id] pair), as before.
        "unsure"             => "undefined",
        "variation"          => ["sequence_variant", "SO:0000109"],

        # manually added
        ## has parent = pseudogene ; dgg
        "pseudomRNA"         => ["pseudogenic_transcript", "SO:0000516"],
        ## from unflattener misc_rna ; dgg
        "pseudotranscript"   => ["pseudogenic_transcript", "SO:0000516"],
        "pseudoexon"         => ["pseudogenic_exon", "SO:0000507"],
        "pseudoCDS"          => ["pseudogenic_exon", "SO:0000507"],
        "pseudomisc_feature" => ["pseudogenic_region", "SO:0000462"],
        "pseudointron"       => ["pseudogenic_region", "SO:0000462"],

        ## "undefined" => "region",

        # this is the most generic form for rnas;
        # we always represent the processed form of
        # the transcript
        misc_RNA             => ['mature_transcript', "SO:0000233"],

        # not sure about this one...
        source               => ['contig', "SO:0000149"],

        rep_origin           => ['origin_of_replication', "SO:0000296"],

        Protein              => ['polypeptide', "SO:0000104"],
    };
}
# map_types_to_SO - run map_types with the built-in feature-table to
# Sequence Ontology mapping.
#
# The caller's arguments are forwarded unchanged, followed by a -type_map
# entry holding the hash returned by FT_SO_map.
#
# Args    : same as map_types
# Returns : whatever map_types returns
sub map_types_to_SO {
    my ($self, @caller_args) = @_;

    my $so_map = $self->FT_SO_map();

    return $self->map_types(@caller_args, -type_map => $so_map);
}
430 =head2 get_relationship_type_by_parent_child
432 Title : get_relationship_type_by_parent_child
433 Usage : $type = $tm->get_relationship_type_by_parent_child($parent_sf, $child_sf);
434 Usage : $type = $tm->get_relationship_type_by_parent_child('mRNA', 'protein');
435 Function: given two features where the parent contains the child,
436 will determine the type of the relationship between them
437 Example :
438 Returns :
439 Args : parent SeqFeature, child SeqFeature OR
440 parent type string, child type string
442 bioperl Seq::FeatureHolderI hierarchies are equivalent to unlabeled
443 graphs (where parent nodes are the containers, and child nodes are the
444 features being contained). For example, a feature of type mRNA can
445 contain features of type exon.
447 Some external representations (eg chadoxml or chaosxml) require that
448 the edges in the feature relationship graph are labeled. For example,
449 the type between mRNA and exon would be B<part_of>. Although it
450 stretches the bioperl notion of containment, we could have a CDS
451 contained by an mRNA (for example, the
452 L<Bio::SeqFeature::Tools::Unflattener> module takes genbank records
453 and makes these kind of links. The relationship here would be
454 B<produced_by>
456 In chado speak, the child is the B<subject> feature and the parent is
457 the B<object> feature
459 =cut
# get_relationship_type_by_parent_child - name the relationship between a
# containing (parent) feature and a contained (child) feature.
#
# Args    : parent, child - each either an object (its primary_tag is used)
#           or a plain type string
# Returns : 'derives_from' when the child type is 'protein' or 'polypeptide'
#           (compared case-insensitively), otherwise 'part_of'
sub get_relationship_type_by_parent_child {
    my ($self, $parent, $child) = @_;

    # Objects are reduced to their type string; strings pass through.
    # The parent type does not influence the result at present, but the
    # primary_tag call is kept so behaviour on object arguments is unchanged.
    my $parent_type = ref($parent) ? $parent->primary_tag : $parent;
    my $child_type  = ref($child)  ? $child->primary_tag  : $child;

    # TODO - do this with metadata, or infer via SO itself
    my %derived_child = (
        'protein'     => 1,
        'polypeptide' => 1,
    );

    return $derived_child{ lc($child_type) }
        ? 'derives_from'
        : 'part_of';    # default
}