2 # BioPerl module for Bio::Variation::VariantI
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Variation::VariantI - Sequence Change SeqFeature abstract class
20 #get Bio::Variant::VariantI somehow
21 print $var->restriction_changes, "\n";
22 foreach $allele ($var->each_Allele) {
23 #work on Bio::Variation::Allele objects
28 This superclass defines common methods to basic sequence changes. The
29 instantiable classes Bio::Variation::DNAMutation,
30 Bio::Variation::RNAChange and Bio::Variation::AAChange use them.
31 See L<Bio::Variation::DNAMutation>, L<Bio::Variation::RNAChange>,
32 and L<Bio::Variation::AAChange> for more information.
34 These classes store information, heavy computation to determine allele
35 sequences is done elsewhere.
37 The database cross-references are implemented as
38 Bio::Annotation::DBLink objects. The methods to access them are
39 defined in Bio::DBLinkContainerI. See L<Bio::Annotation::DBLink>
40 and L<Bio::DBLinkContainerI> for details.
42 Bio::Variation::VariantI redifines and extends
43 Bio::SeqFeature::Generic for sequence variations. This class
44 describes specific sequence change events. These events are always
45 from a specific reference sequence to something different. See
46 L<Bio::SeqFeature::Generic> for more information.
48 IMPORTANT: The notion of reference sequence permeates all
49 Bio::Variation classes. This is especially important to remember when
50 dealing with Alleles. In a polymorphic site, there can be a large
51 number of alleles. One of then has to be selected to be the reference
52 allele (allele_ori). ALL the rest has to be passed to the Variant
53 using the method add_Allele, including the mutated allele in a
54 canonical mutation. The IO modules and generated attributes depend on
55 it. They ignore the allele linked to using allele_mut and circulate
56 each Allele returned by each_Allele into allele_mut and calculate
57 the changes between that and allele_ori.
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to the
66 Bioperl mailing lists Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 the bugs and their resolution. Bug reports can be submitted via the
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Heikki Lehvaslaiho
92 Email: heikki-at-bioperl-dot-org
96 The rest of the documentation details each of the object
97 methods. Internal methods are usually preceded with a _
102 # Let the code begin...
105 package Bio
::Variation
::VariantI
;
107 # Object preamble - inheritance
109 use base
qw(Bio::Root::Root Bio::SeqFeature::Generic Bio::DBLinkContainerI);
117 Read only method. Returns the id of the variation object.
118 The id is the id of the first DBLink object attached to this object.
128 my @ids = $self->each_DBLink;
129 my $id = $ids[0] if scalar @ids > 0;
130 return $id->database. "::". $id->primary_id if $id;
137 Usage : $self->add_Allele($allele)
140 Adds one Bio::Variation::Allele into the list of alleles.
141 Note that the method forces the convention that nucleotide
142 sequence is in lower case and amino acds are in upper
146 Returns : 1 when succeeds, 0 for failure.
153 my ($self,$value) = @_;
154 if (defined $value) {
155 if( ! $value->isa('Bio::Variation::Allele') ) {
156 my $com = ref $value;
157 $self->throw("Is not a Allele object but a [$com]");
160 if ( $self->isa('Bio::Variation::AAChange') ) {
161 $value->seq( uc $value->seq) if $value->seq;
163 $value->seq( lc $value->seq) if $value->seq;
165 push(@
{$self->{'alleles'}},$value);
166 $self->allele_mut($value); #????
178 Usage : $obj->each_Allele();
181 Returns a list of Bio::Variation::Allele objects
184 Returns : list of Alleles
190 my ($self,@args) = @_;
191 return @
{$self->{'alleles'}};
198 Usage : print join('/', $obj->each_Allele) if not $obj->isMutation;
201 Returns or sets the boolean value indicating that the
202 variant described is a canonical mutation with two alleles
203 assinged to be the original (wild type) allele and mutated
204 allele, respectively. If this value is not set, it is
205 assumed that the Variant describes polymorphisms.
212 my ($self,$value) = @_;
213 if (defined $value) {
215 $self->{'isMutation'} = 1;
217 $self->{'isMutation'} = 0;
220 return $self->{'isMutation'};
227 Usage : $obj->allele_ori();
230 Links to and returns the Bio::Variation::Allele object.
231 If value is not set, returns false. All other Alleles are
234 Amino acid sequences are stored in upper case characters,
235 others in lower case.
241 See L<Bio::Variation::Allele> for more.
246 my ($self,$value) = @_;
247 if( defined $value) {
248 if ( ! ref $value || ! $value->isa('Bio::Variation::Allele')) {
249 $self->throw("Value is not Bio::Variation::Allele but [$value]");
251 if ( $self->isa('Bio::Variation::AAChange') ) {
252 $value->seq( uc $value->seq) if $value->seq;
254 $value->seq( lc $value->seq) if $value->seq;
256 $self->{'allele_ori'} = $value;
259 return $self->{'allele_ori'};
266 Usage : $obj->allele_mut();
269 Links to and returns the Bio::Variation::Allele
270 object. Sets and returns the mutated allele sequence.
271 If value is not set, returns false.
273 Amino acid sequences are stored in upper case characters,
274 others in lower case.
280 See L<Bio::Variation::Allele> for more.
286 my ($self,$value) = @_;
287 if( defined $value) {
288 if ( ! ref $value || ! $value->isa('Bio::Variation::Allele')) {
289 $self->throw("Value is not Bio::Variation::Allele but [$value]");
291 if ( $self->isa('Bio::Variation::AAChange') ) {
292 $value->seq( uc $value->seq) if $value->seq;
294 $value->seq( lc $value->seq) if $value->seq;
296 $self->{'allele_mut'} = $value;
299 return $self->{'allele_mut'};
305 Usage : $obj->length();
308 Sets and returns the length of the affected original
309 allele sequence. If value is not set, returns false == 0.
311 Value 0 means that the variant position is before the
312 start=end sequence position. (Value 1 would denote a point
313 mutation). This follows the convension to report an
314 insertion (2insT) in equivalent way to a corresponding
315 deletion (2delT) (Think about indel polymorpism ATC <=> AC
316 where the origianal state is not known ).
326 my ($self,$value) = @_;
327 if ( defined $value) {
328 $self->{'length'} = $value;
330 if ( ! exists $self->{'length'} ) {
333 return $self->{'length'};
339 Usage : $obj->upStreamSeq();
342 Sets and returns upstream flanking sequence string. If
343 value is not set, returns false. The sequence should be
344 >=25 characters long, if possible.
347 Returns : string or false
354 my ($self,$value) = @_;
355 if( defined $value) {
356 $self->{'upstreamseq'} = $value;
358 return $self->{'upstreamseq'};
365 Usage : $obj->dnStreamSeq();
368 Sets and returns dnstream flanking sequence string. If
369 value is not set, returns false. The sequence should be
370 >=25 characters long, if possible.
373 Returns : string or false
380 my ($self,$value) = @_;
381 if( defined $value) {
382 $self->{'dnstreamseq'} = $value;
384 return $self->{'dnstreamseq'};
392 Usage : $obj->label();
395 Sets and returns mutation event label(s). If value is not
396 set, or no argument is given returns false. Each
397 instantiable class needs to implement this method. Valid
398 values are listed in 'Mutation event controlled vocabulary' in
399 http://www.ebi.ac.uk/mutations/recommendations/mutevent.html.
409 my ($self,$value) = @_;
410 $self->throw_not_implemented();
418 Usage : $obj->status()
421 Returns the status of the sequence change object.
422 Valid values are: 'suspected' and 'proven'
424 Example : $obj->status('proven');
426 Args : valid string (optional, for setting)
433 my ($self,$value) = @_;
434 my %status = (suspected
=> 1,
438 if( defined $value) {
440 if ($status{$value}) {
441 $self->{'status'} = $value;
444 $self->throw("$value is not valid status value!");
447 if( ! exists $self->{'status'} ) {
450 return $self->{'status'};
457 Usage : $obj->proof()
460 Returns the proof of the sequence change object.
461 Valid values are: 'computed' and 'experimental'.
463 Example : $obj->proof('computed');
465 Args : valid string (optional, for setting)
472 my ($self,$value) = @_;
473 my %proof = (computed
=> 1,
477 if( defined $value) {
479 if ($proof{$value}) {
480 $self->{'proof'} = $value;
482 $self->throw("$value is not valid proof value!");
485 return $self->{'proof'};
492 Usage : $obj->region();
495 Sets and returns the name of the sequence region type or
496 protein domain at this location. If value is not set,
507 my ($self,$value) = @_;
508 if( defined $value) {
509 $self->{'region'} = $value;
511 return $self->{'region'};
518 Usage : $obj->region_value();
521 Sets and returns the name of the sequence region_value or
522 protein domain at this location. If value is not set,
533 my ($self,$value) = @_;
534 if( defined $value) {
535 $self->{'region_value'} = $value;
537 return $self->{'region_value'};
543 Usage : $obj->region_dist();
546 Sets and returns the distance tot the closest region
547 (i.e. intro/exon or domain) boundary. If distance is not
558 my ($self,$value) = @_;
559 if( defined $value) {
560 if ( not $value =~ /^[+-]?\d+$/ ) {
561 $self->throw("[$value] for region_dist has to be an integer\n");
563 $self->{'region_dist'} = $value;
566 return $self->{'region_dist'};
573 Usage : $obj->numbering()
576 Returns the numbering chema used locating sequnce features.
577 Valid values are: 'entry' and 'coding'
579 Example : $obj->numbering('coding');
581 Args : valid string (optional, for setting)
588 my ($self,$value) = @_;
589 my %numbering = (entry
=> 1,
593 if( defined $value) {
595 if ($numbering{$value}) {
596 $self->{'numbering'} = $value;
599 $self->throw("'$value' is not a valid for numbering!");
602 if( ! exists $self->{'numbering'} ) {
605 return $self->{'numbering'};
611 Usage : $num = $obj->mut_number;
612 : $num = $obj->mut_number($number);
615 Returns or sets the number identifying the order in which the
616 mutation has been issued. Numbers shouldstart from 1.
617 If the number has never been set, the method will return ''
619 If you want the output from IO modules look nice and, for
620 multivariant/allele variations, make sense you better set
629 my ($self,$value) = @_;
630 if (defined $value) {
631 $self->{'mut_number'} = $value;
633 unless (exists $self->{'mut_number'}) {
636 return $self->{'mut_number'};
644 Usage : $mutobj = $obj->SeqDiff;
645 : $mutobj = $obj->SeqDiff($objref);
648 Returns or sets the link-reference to the umbrella
649 Bio::Variation::SeqDiff object. If there is no link,
652 Note: Adding a variant into a SeqDiff object will
653 automatically set this value.
655 Returns : an obj_ref or undef
657 See L<Bio::Variation::SeqDiff> for more information.
662 my ($self,$value) = @_;
663 if (defined $value) {
664 if( ! $value->isa('Bio::Variation::SeqDiff') ) {
665 $self->throw("Is not a Bio::Variation::SeqDiff object but a [$value]");
669 $self->{'seqDiff'} = $value;
672 unless (exists $self->{'seqDiff'}) {
675 return $self->{'seqDiff'};
682 Usage : $self->add_DBLink($ref)
683 Function: adds a link object
693 my ($self,$com) = @_;
694 if( $com && ! $com->isa('Bio::Annotation::DBLink') ) {
695 $self->throw("Is not a link object but a [$com]");
697 $com && push(@
{$self->{'link'}},$com);
703 Usage : foreach $ref ( $self->each_DBlink() )
704 Function: gets an array of DBlink of objects
715 return @
{$self->{'link'}};
718 =head2 restriction_changes
720 Title : restriction_changes
721 Usage : $obj->restriction_changes();
724 Returns a string containing a list of restriction
725 enzyme changes of form +EcoRI, separated by
726 commas. Strings need to be valid restriction enzyme names
727 as stored in REBASE. allele_ori and allele_mut need to be assigned.
735 sub restriction_changes
{
738 if (not $self->{'re_changes'}) {
741 # complain if used on AA data
742 if ($self->isa('Bio::Variation::AAChange')) {
743 $self->throw('Restriction enzymes do not bite polypeptides!');
747 $self->warn('Upstream sequence is empty!')
748 if $self->upStreamSeq eq '';
749 $self->warn('Downstream sequence is empty!')
750 if $self->dnStreamSeq eq '';
751 # $self->warn('Original allele sequence is empty!')
752 # if $self->allele_ori eq '';
753 # $self->warn('Mutated allele sequence is empty!')
754 # if $self->allele_mut eq '';
756 #reuse the non empty DNA level list at RNA level if the flanks are identical
757 #Hint: Check DNAMutation object first
758 if ($self->isa('Bio::Variation::RNAChange') and $self->DNAMutation and
759 $self->upStreamSeq eq $self->DNAMutation->upStreamSeq and
760 $self->dnStreamSeq eq $self->DNAMutation->dnStreamSeq and
761 $self->DNAMutation->restriction_changes ne '' ) {
762 $self->{'re_changes'} = $self->DNAMutation->restriction_changes;
765 #maximum length of a type II restriction site in the current REBASE
767 my ($le_up) = $le_dn;
769 #reduce the flank lengths if the desired length is not available
770 $le_dn = CORE
::length ($self->dnStreamSeq) if $le_dn > CORE
::length ($self->dnStreamSeq);
771 $le_up = CORE
::length ($self->upStreamSeq) if $le_up > CORE
::length ($self->upStreamSeq);
773 #Build sequence strings to compare
774 my ($oriseq, $mutseq);
775 $oriseq = $mutseq = substr($self->upStreamSeq, -$le_up, $le_up);
776 $oriseq .= $self->allele_ori->seq if $self->allele_ori->seq;
777 $mutseq .= $self->allele_mut->seq if $self->allele_mut->seq;
778 $oriseq .= substr($self->dnStreamSeq, 0, $le_dn);
779 $mutseq .= substr($self->dnStreamSeq, 0, $le_dn);
781 # ... and their reverse complements
782 my $oriseq_rev = _revcompl
($oriseq);
783 my $mutseq_rev = _revcompl
($mutseq);
785 # collect results into a string
787 foreach my $enz (sort keys (%re)) {
788 my $site = $re{$enz};
789 my @ori = ($oriseq=~ /$site/g);
790 my @mut = ($mutseq=~ /$site/g);
791 my @ori_r = ($oriseq_rev =~ /$site/g);
792 my @mut_r = ($mutseq_rev =~ /$site/g);
794 $rec .= '+'. $enz. ", "
795 if (scalar @ori < scalar @mut) or (scalar @ori_r < scalar @mut_r);
796 $rec .= '-'. $enz. ", "
797 if (scalar @ori > scalar @mut) or (scalar @ori_r > scalar @mut_r);
800 $rec = substr($rec, 0, CORE
::length($rec) - 2) if $rec ne '';
801 $self->{'re_changes'} = $rec;
804 return $self->{'re_changes'}
809 # side effect: lower case letters
813 $seq =~ tr/acgtrymkswhbvdnx/tgcayrkmswdvbhnx/;
814 return CORE
::reverse $seq;
819 #REBASE version 005 type2.005
823 'AccI' => 'gt[ac][gt]ac',
824 'AceIII' => 'cagctc',
827 'AcyI' => 'g[ag]cg[ct]c',
829 'AflIII' => 'ac[ag][ct]gt',
831 'AhaIII' => 'tttaaa',
832 'AloI' => 'gaac[acgt][acgt][acgt][acgt][acgt][acgt]tcc',
834 'AlwNI' => 'cag[acgt][acgt][acgt]ctg',
835 'ApaBI' => 'gca[acgt][acgt][acgt][acgt][acgt]tgc',
838 'ApoI' => '[ag]aatt[ct]',
839 'AscI' => 'ggcgcgcc',
840 'AsuI' => 'gg[acgt]cc',
842 'AvaI' => 'c[ct]cg[ag]g',
843 'AvaII' => 'gg[at]cc',
844 'AvaIII' => 'atgcat',
846 'BaeI' => 'ac[acgt][acgt][acgt][acgt]gta[ct]c',
849 'BbvCI' => 'cctcagc',
853 'Bce83I' => 'cttgag',
855 'BcgI' => 'cga[acgt][acgt][acgt][acgt][acgt][acgt]tgc',
858 'BetI' => '[at]ccgg[at]',
860 'BglI' => 'gcc[acgt][acgt][acgt][acgt][acgt]ggc',
863 'BmgI' => 'g[gt]gccc',
864 'BplI' => 'gag[acgt][acgt][acgt][acgt][acgt]ctc',
865 'Bpu10I' => 'cct[acgt]agc',
866 'BsaAI' => '[ct]acgt[ag]',
867 'BsaBI' => 'gat[acgt][acgt][acgt][acgt]atc',
868 'BsaXI' => 'ac[acgt][acgt][acgt][acgt][acgt]ctcc',
874 'BseSI' => 'g[gt]gc[ac]c',
877 'BsiYI' => 'cc[acgt][acgt][acgt][acgt][acgt][acgt][acgt]gg',
880 'Bsp1407I' => 'tgtaca',
881 'Bsp24I' => 'gac[acgt][acgt][acgt][acgt][acgt][acgt]tgg',
884 'BspLU11I' => 'acatgt',
886 'BspMII' => 'tccgga',
890 'BstEII' => 'ggt[acgt]acc',
891 'BstXI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt]tgg',
894 'Cac8I' => 'gc[acgt][acgt]gc',
895 'CauII' => 'cc[cg]gg',
896 'Cfr10I' => '[ag]ccgg[ct]',
897 'CfrI' => '[ct]ggcc[ag]',
898 'CjeI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt]gt',
899 'CjePI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt][acgt]tc',
901 'CviJI' => '[ag]gc[ct]',
903 'DdeI' => 'ct[acgt]ag',
905 'DraII' => '[ag]gg[acgt]cc[ct]',
906 'DraIII' => 'cac[acgt][acgt][acgt]gtg',
907 'DrdI' => 'gac[acgt][acgt][acgt][acgt][acgt][acgt]gtc',
909 'DsaI' => 'cc[ag][ct]gg',
910 'Eam1105I' => 'gac[acgt][acgt][acgt][acgt][acgt]gtc',
912 'Eco31I' => 'ggtctc',
913 'Eco47III' => 'agcgct',
914 'Eco57I' => 'ctgaag',
915 'EcoNI' => 'cct[acgt][acgt][acgt][acgt][acgt]agg',
917 'EcoRII' => 'cc[at]gg',
920 'EspI' => 'gct[acgt]agc',
923 'Fnu4HI' => 'gc[acgt]gc',
926 'FseI' => 'ggccggcc',
927 'GdiII' => 'cggcc[ag]',
929 'HaeI' => '[at]ggcc[at]',
930 'HaeII' => '[ag]gcgc[ct]',
932 'HaeIV' => 'ga[ct][acgt][acgt][acgt][acgt][acgt][ag]tc',
934 'HgiAI' => 'g[at]gc[at]c',
935 'HgiCI' => 'gg[ct][ag]cc',
936 'HgiEII' => 'acc[acgt][acgt][acgt][acgt][acgt][acgt]ggt',
937 'HgiJII' => 'g[ag]gc[ct]c',
939 'Hin4I' => 'ga[cgt][acgt][acgt][acgt][acgt][acgt][acg]tc',
940 'HindII' => 'gt[ct][ag]ac',
941 'HindIII' => 'aagctt',
942 'HinfI' => 'ga[acgt]tc',
946 'Hpy178III' => 'tc[acgt][acgt]ga',
947 'Hpy188I' => 'tc[acgt]ga',
948 'Hpy99I' => 'cg[at]cg',
950 'Ksp632I' => 'ctcttc',
953 'MaeIII' => 'gt[acgt]ac',
956 'McrI' => 'cg[ag][ct]cg',
958 'MjaIV' => 'gt[acgt][acgt]ac',
960 'MmeI' => 'tcc[ag]ac',
963 'MslI' => 'ca[ct][acgt][acgt][acgt][acgt][ag]tg',
965 'MwoI' => 'gc[acgt][acgt][acgt][acgt][acgt][acgt][acgt]gc',
972 'NlaIV' => 'gg[acgt][acgt]cc',
973 'NotI' => 'gcggccgc',
975 'NspBII' => 'c[ac]gc[gt]g',
976 'NspI' => '[ag]catg[ct]',
977 'PacI' => 'ttaattaa',
978 'Pfl1108I' => 'tcgtag',
979 'PflMI' => 'cca[acgt][acgt][acgt][acgt][acgt]tgg',
982 'PmeI' => 'gtttaaac',
983 'PpiI' => 'gaac[acgt][acgt][acgt][acgt][acgt]ctc',
984 'PpuMI' => '[ag]gg[at]cc[ct]',
985 'PshAI' => 'gac[acgt][acgt][acgt][acgt]gtc',
992 'RsrII' => 'cgg[at]ccg',
996 'SanDI' => 'ggg[at]ccc',
998 'SauI' => 'cct[acgt]agg',
1000 'ScrFI' => 'cc[acgt]gg',
1001 'SduI' => 'g[agt]gc[act]c',
1002 'SecI' => 'cc[acgt][acgt]gg',
1003 'SexAI' => 'acc[at]ggt',
1005 'SfeI' => 'ct[ag][ct]ag',
1006 'SfiI' => 'ggcc[acgt][acgt][acgt][acgt][acgt]ggcc',
1007 'SgfI' => 'gcgatcgc',
1008 'SgrAI' => 'c[ag]ccgg[ct]g',
1011 'SmlI' => 'ct[ct][ag]ag',
1012 'SnaBI' => 'tacgta',
1017 'SrfI' => 'gcccgggc',
1018 'Sse232I' => 'cgccggcg',
1019 'Sse8387I' => 'cctgcagg',
1020 'Sse8647I' => 'agg[at]cct',
1022 'Sth132I' => 'cccg',
1024 'StyI' => 'cc[at][at]gg',
1025 'SwaI' => 'atttaaat',
1027 'TaqII' => 'gaccga',
1028 'TatI' => '[at]gtac[at]',
1029 'TauI' => 'gc[cg]gc',
1030 'TfiI' => 'ga[at]tc',
1031 'TseI' => 'gc[at]gc',
1032 'Tsp45I' => 'gt[cg]ac',
1033 'Tsp4CI' => 'ac[acgt]gt',
1035 'TspRI' => 'ca[cg]tg[acgt][acgt]',
1036 'Tth111I' => 'gac[acgt][acgt][acgt]gtc',
1037 'Tth111II' => 'caa[ag]ca',
1038 'UbaGI' => 'cac[acgt][acgt][acgt][acgt]gtg',
1039 'UbaPI' => 'cgaacg',
1042 'XcmI' => 'cca[acgt][acgt][acgt][acgt][acgt][acgt][acgt][acgt][acgt]tgg',
1044 'XhoII' => '[ag]gatc[ct]',
1045 'XmaIII' => 'cggccg',
1046 'XmnI' => 'gaa[acgt][acgt][acgt][acgt]ttc'