2 # bioperl module for Bio::LiveSeq::IO::Loader
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
8 # Copyright Joseph Insana
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
24 This package holds common methods used by BioPerl and file loaders.
25 It contains methods to create LiveSeq objects out of entire entries or from a
26 localized sequence region surrounding a particular gene.
28 =head1 AUTHOR - Joseph A.L. Insana
30 Email: Insana@ebi.ac.uk, jinsana@gmx.net
34 The rest of the documentation details each of the object
35 methods. Internal methods are usually preceded with a _
39 # Let the code begin...
41 package Bio
::LiveSeq
::IO
::Loader
;
44 use Carp
qw(cluck croak carp);
45 use Bio
::LiveSeq
::DNA
;
46 use Bio
::LiveSeq
::Exon
;
47 use Bio
::LiveSeq
::Transcript
;
48 use Bio
::LiveSeq
::Translation
;
49 use Bio
::LiveSeq
::Gene
;
50 use Bio
::LiveSeq
::Intron
;
51 use Bio
::LiveSeq
::Prim_Transcript
;
52 use Bio
::LiveSeq
::Repeat_Region
;
53 use Bio
::LiveSeq
::Repeat_Unit
;
54 use Bio
::LiveSeq
::AARange
;
55 use Bio
::Tools
::CodonTable
;
60 Usage : @translationobjects=$loader->entry2liveseq();
61 : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
62 Function: creates LiveSeq objects from an entry previously loaded
63 Returns : array of references to objects of class Translation
65 Args : optional boolean flag to avoid the retrieval of SwissProt
66 information for all Transcripts containing SwissProt x-reference
67 default is 1 (to retrieve those information and create AARange
69 Note : this method can get really slow for big entries. The lightweight
70 gene2liveseq method is recommended
75 my ($self, %args) = @_;
76 my ($getswissprotinfo)=($args{-getswissprotinfo
});
77 if (defined($getswissprotinfo)) {
78 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
79 carp
"-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
85 my $hashref=$self->{'hash'};
86 unless ($hashref) { return (0); }
87 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
89 if ($test_transl) { $self->test_transl($hashref,\
@translationobjects);}
90 return @translationobjects;
93 =head2 novelaasequence2gene
95 Title : novelaasequence2gene
96 Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
97 : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
99 -gene_name => "tyr-kinase");
101 Function: creates LiveSeq objects from a novel amino acid sequence,
102 using codon usage database to choose codons according to
103 relative frequencies.
104 If a taxon ID is not specified, the default is to use the human
105 one (taxonomy ID 9606).
106 Returns : reference to a Gene object containing references to LiveSeq objects
108 Args : string containing an amino acid sequence
109 integer (optional) with a taxonomy ID
110 string specifying a gene name
117 Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name");
118 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
120 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
121 -getswissprotinfo => 0);
122 : $gene=$loader->gene2liveseq(-position => 4);
124 Function: creates LiveSeq objects from an entry previously loaded
125 It is a "light weight" creation because it creates
126 a LiveSequence just for the interesting region in an entry
127 (instead than for the total entry, like in entry2liveseq) and for
128 the flanking regions up to 500 nucleotides (default) or up to
129 the specified amount of nucleotides (given as argument) around the
131 Returns : reference to a Gene object containing possibly alternative
134 Args : string containing the gene name as in the EMBL feature qualifier
135 integer (optional) "flanking": amount of flanking bases to be kept
136 boolean (optional) "getswissprotinfo": if set to "0" it will avoid
137 trying to fetch information from a crossreference to a SwissProt
138 entry, avoiding the process of creation of AARange objects
139 It is "1" (on) by default
141 Alternative to a gene_name, a position can be given: an
142 integer (1-) containing the position of the desired CDS in the
148 my ($self, %args) = @_;
149 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name
},$args{-flanking
},$args{-getswissprotinfo
},$args{-position
});
151 unless (($gene_name)||($cds_position)) {
152 carp
"Gene_Name or Position not specified for gene2liveseq loading function";
155 if (($gene_name)&&($cds_position)) {
156 carp
"Gene_Name and Position cannot be given together";
158 } elsif ($gene_name) {
161 $input="cds-position:".$cds_position;
164 if (defined($getswissprotinfo)) {
165 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
166 carp
"-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
173 if (defined($flanking)) {
174 unless ($flanking >= 0) {
175 carp
"No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
179 $flanking=500; # the default flanking length
181 my $hashref=$self->{'hash'};
182 unless ($hashref) { return (0); }
183 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
184 unless ($gene) { # if $gene == 0 it means problems in hash2gene
185 carp
"gene2liveseq produced error";
191 # TODO: update so that it will work even if CDS is not only accepted FEATURE!!
192 # this method is for now deprecated and not supported
194 my ($self,$entry)=@_;
195 my @features=@
{$entry->{'Features'}};
196 my @translationobjects=@
{$_[1]};
197 my ($i,$translation);
198 my ($obj_transl,$hash_transl);
199 my @cds=@
{$entry->{'CDS'}};
200 foreach $translation (@translationobjects) {
201 $obj_transl=$translation->seq;
202 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
203 #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
204 unless ($obj_transl eq $hash_transl) {
205 cluck
"Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
206 carp
"\nEntry's transl: ",$hash_transl,"\n";
207 carp
"\nObject's transl: ",$obj_transl,"\n";
214 # argument: hashref containing the EMBL entry datas,
215 # getswissprotinfo boolean flag
216 # creates the liveseq objects
217 # returns: an array of Translation object references
219 my ($self,$entry,$getswissprotinfo)=@_;
222 my $dna=Bio
::LiveSeq
::DNA
->new(-seq
=> $entry->{'Sequence'} );
223 $dna->alphabet(lc($entry->{'Molecule'}));
224 $dna->display_id($entry->{'ID'});
225 $dna->accession_number($entry->{'AccNumber'});
226 $dna->desc($entry->{'Description'});
227 my @cds=@
{$entry->{'CDS'}};
228 my ($swissacc,$swisshash); my @swisshashes;
230 #my @transcript=@{$cds[$i]->{'range'}};
231 #$transcript=\@transcript;
232 #push (@transcripts,$transcript);
233 push (@transcripts,$cds[$i]->{'range'});
234 if ($getswissprotinfo) {
235 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
236 $swisshash=$self->get_swisshash($swissacc);
237 #$self->printswissprot($swisshash); # DEBUG
238 push (@swisshashes,$swisshash);
241 my @translations=($self->transexonscreation($dna,\
@transcripts));
242 my $translation; my $j=0;
243 foreach $translation (@translations) {
244 if ($swisshashes[$j]) { # if not 0
245 $self->swisshash2liveseq($swisshashes[$j],$translation);
249 return (@translations);
252 # only features pertaining to a specified gene are created
253 # only the sequence of the gene and appropriate context flanking regions
254 # are created as chain
255 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
256 # returns: reference to Gene object
258 # Note: if entry contains just one CDS, all the features get added
259 # this is useful because often the features in these entries do not
260 # carry the /gene qualifier
264 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
266 my $genefeatureshash;
268 my @cds=@
{$entry->{'CDS'}};
270 # checking if a position has been given instead than a gene_name
271 if (index($input,"cds-position:") == 0 ) {
272 my $cds_position=substr($input,13); # extracting the cds position
273 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
274 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
277 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
280 unless (($genefeatureshash)&&(scalar(@
{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
281 my @genes=$self->genes($entry);
282 my $cds_number=scalar(@cds);
283 warn "Warning! Not even one genefeature found for /$input/....
284 The genes present in this entry are:\n\t@genes\n
285 The number of CDS in this entry is:\n\t$cds_number\n";
289 # get max and min, check flankings
290 my ($min,$max)=$self->rangeofarray(@
{$genefeatureshash->{'labels'}}); # gene "boundaries"
291 my $seqlength=$entry->{'SeqLength'};
292 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
293 if ($min-$flanking < 1) {
296 $mindna=$min-$flanking;
298 if ($max+$flanking > $seqlength) {
301 $maxdna=$max+$flanking;
303 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
305 # create LiveSeq objects
308 my $dna=Bio
::LiveSeq
::DNA
->new(-seq
=> $subseq, -offset
=> $mindna);
309 $dna->alphabet(lc($entry->{'Molecule'}));
310 $dna->source($entry->{'Organism'});
311 $dna->display_id($entry->{'ID'});
312 $dna->accession_number($entry->{'AccNumber'});
313 $dna->desc($entry->{'Description'});
315 my @transcripts=@
{$genefeatureshash->{'transcripts'}};
316 # create Translations, Transcripts, Exons out of the CDS
317 unless (@transcripts) {
318 cluck
"no CDS feature found for /$input/....";
321 my @translationobjs=$self->transexonscreation($dna,\
@transcripts);
324 # get the Transcript obj_refs
327 my @ttables=@
{$genefeatureshash->{'ttables'}};
328 my @swisshashes=@
{$genefeatureshash->{'swisshashes'}};
329 foreach $translation (@translationobjs) {
330 push(@transcriptobjs,$translation->get_Transcript);
331 if ($ttables[$j]) { # if not undef
332 $translation->get_Transcript->translation_table($ttables[$j]);
334 # print "\n\t\tno translation table information....\n";
336 if ($swisshashes[$j]) { # if not 0
337 $self->swisshash2liveseq($swisshashes[$j],$translation);
342 my %gene; # this is the hash to store created object references
344 $gene{Transcripts
}=\
@transcriptobjs;
345 $gene{Translations
}=\
@translationobjs;
347 my @exonobjs; my @intronobjs;
348 my @repeatunitobjs; my @repeatregionobjs;
349 my @primtranscriptobjs;
351 my ($object,$range,$start,$end,$strand);
353 my @exons=@
{$genefeatureshash->{'exons'}};
354 my @exondescs=@
{$genefeatureshash->{'exondescs'}};
357 foreach $range (@exons) {
358 ($start,$end,$strand)=@
{$range};
359 $object = Bio
::LiveSeq
::Exon
->new(-seq
=>$dna,-start
=>$start,-end
=>$end,-strand
=>$strand);
361 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
363 push (@exonobjs,$object);
368 $gene{Exons
}=\
@exonobjs;
370 my @introns=@
{$genefeatureshash->{'introns'}};
371 my @introndescs=@
{$genefeatureshash->{'introndescs'}};
374 foreach $range (@introns) {
375 ($start,$end,$strand)=@
{$range};
376 $object=Bio
::LiveSeq
::Intron
->new(-seq
=>$dna,-start
=>$start,-end
=>$end,-strand
=>$strand);
378 $object->desc($introndescs[$introncount]);
380 push (@intronobjs,$object);
385 $gene{Introns
}=\
@intronobjs;
387 my @prim_transcripts=@
{$genefeatureshash->{'prim_transcripts'}};
388 if (@prim_transcripts) {
389 foreach $range (@prim_transcripts) {
390 ($start,$end,$strand)=@
{$range};
391 $object=Bio
::LiveSeq
::Prim_Transcript
->new(-seq
=>$dna,-start
=>$start,-end
=>$end,-strand
=>$strand);
392 if ($object != -1) { push (@primtranscriptobjs,$object); }
394 $gene{Prim_Transcripts
}=\
@primtranscriptobjs;
396 my @repeat_regions=@
{$genefeatureshash->{'repeat_regions'}};
397 my @repeat_regions_family=@
{$genefeatureshash->{'repeat_regions_family'}};
398 if (@repeat_regions) {
400 foreach $range (@repeat_regions) {
401 ($start,$end,$strand)=@
{$range};
402 $object=Bio
::LiveSeq
::Repeat_Region
->new(-seq
=>$dna,-start
=>$start,-end
=>$end,-strand
=>$strand);
404 $object->desc($repeat_regions_family[$k]);
406 push (@repeatregionobjs,$object);
411 $gene{Repeat_Regions
}=\
@repeatregionobjs;
413 my @repeat_units=@
{$genefeatureshash->{'repeat_units'}};
414 my @repeat_units_family=@
{$genefeatureshash->{'repeat_units_family'}};
417 foreach $range (@repeat_units) {
418 ($start,$end,$strand)=@
{$range};
419 $object=Bio
::LiveSeq
::Repeat_Unit
->new(-seq
=>$dna,-start
=>$start,-end
=>$end,-strand
=>$strand);
421 $object->desc($repeat_units_family[$k]);
423 push (@repeatunitobjs,$object);
428 $gene{Repeat_Units
}=\
@repeatunitobjs;
432 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
433 return (Bio
::LiveSeq
::Gene
->new(-name
=>$gene_name,-features
=>\
%gene,
434 -upbound
=>$min,-downbound
=>$max));
437 # maybe this function will be moved to general utility package
438 # argument: array of numbers
439 # returns: (min,max) numbers in the array
443 #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
444 my ($max,$min,$element);
445 $min=$max=shift(@array);
446 foreach $element (@array) {
447 $element = 0 unless defined $element;
448 if ($element < $min) {
451 if ($element > $max) {
455 #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
460 # argument: reference to DNA object, reference to array of transcripts
461 # returns: an array of Translation object references
462 sub transexonscreation
{
465 my @transcripts=@
{$_[1]};
467 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
469 my @translationobjects;
470 foreach $transcript (@transcripts) {
471 foreach $exonref (@
{$transcript}) {
472 ($start,$end,$strand)=@
{$exonref};
473 #print "Creating Exon: start $start end $end strand $strand\n";
474 $exonobj=Bio
::LiveSeq
::Exon
->new(-seq
=>$dna,-start
=>$start,-end
=>$end,-strand
=>$strand);
475 #push (@exonobjects,$exonobj);
476 push (@transexons,$exonobj);
478 $transcriptobj=Bio
::LiveSeq
::Transcript
->new(-exons
=> \
@transexons );
479 if ($transcriptobj != -1) {
480 $translationobj=Bio
::LiveSeq
::Translation
->new(-transcript
=>$transcriptobj);
481 @transexons=(); # cleans it
482 #push (@transcriptobjects,$transcriptobj);
483 push (@translationobjects,$translationobj);
486 return (@translationobjects);
490 # deleted. Some functionality placed in Gene->printfeaturesnum
492 =head2 printswissprot
494 Title : printswissprot
495 Usage : $loader->printswissprot($hashref);
496 Function: prints out all information loaded from a database entry into the
497 loader. Mainly used for testing purposes.
498 Args : a hashref containing the SWISSPROT entry datas
499 Note : the hashref can be obtained with a call to the method
500 $loader->get_swisshash() (BioPerl via Bio::DB::EMBL.pm)
501 that takes as argument a string like "SWISS-PROT:P10275"
505 # argument: hashref containing the SWISSPROT entry datas
506 # prints out that hash, showing the information loaded
508 my ($self,$entry)=@_;
515 $entry->{'AccNumber'};
519 $entry->{'Description'};
521 $entry->{'Organism'};
522 printf "SEQLN: %s\n",
523 $entry->{'SeqLength'};
525 substr($entry->{'Sequence'},0,64);
526 if ($entry->{'Features'}) {
527 my @features=@
{$entry->{'Features'}};
529 for $i (0..$#features) {
530 print "|",$features[$i]->{'name'},"|";
531 print " at ",$features[$i]->{'location'},": ";
532 print "",$features[$i]->{'desc'},"\n";
540 Usage : $loader->printembl();
541 Function: prints out all information loaded from a database entry into the
542 loader. Mainly used for testing purposes.
547 # argument: hashref containing the EMBL entry datas
548 # prints out that hash, showing the information loaded
550 my ($self,$entry)=@_;
552 $entry=$self->{'hash'};
554 my ($i,$featurename);
558 $entry->{'AccNumber'};
560 $entry->{'Molecule'};
562 $entry->{'Division'};
564 $entry->{'Description'};
566 $entry->{'Organism'};
567 printf "SEQLN: %s\n",
568 $entry->{'SeqLength'};
570 substr($entry->{'Sequence'},0,64);
571 my @features=@
{$entry->{'Features'}};
572 my @cds=@
{$entry->{'CDS'}};
573 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
574 my ($exonref,$transcript);
575 my ($qualifiernumber,$qualifiers,$key);
576 my ($start,$end,$strand);
578 for $i (0..$#features) {
579 $featurename=$features[$i]->{'name'};
580 if ($featurename eq "CDS") {
581 print "|CDS| number $j at feature position: $i\n";
582 #print $features[$i]->{'location'},"\n";
583 $transcript=$features[$i]->{'range'};
584 foreach $exonref (@
{$transcript}) {
585 ($start,$end,$strand)=@
{$exonref};
586 print "\tExon: start $start end $end strand $strand\n";
590 print "|$featurename| at feature position: $i\n";
592 print join("\t",@
{$features[$i]->{'range'}}),"\n";
593 #print $features[$i]->{'location'},"\n";
595 $qualifiernumber=$features[$i]->{'qual_number'};
596 $qualifiers=$features[$i]->{'qualifiers'}; # hash
597 foreach $key (keys (%{$qualifiers})) {
598 print "\t\t",$key,": ";
599 print $qualifiers->{$key},"\n";
607 Usage : $loader->genes();
608 Function: Returns an array of gene_names (strings) contained in the loaded
614 # argument: entryhashref
615 # returns: array of genenames found in the entry
617 my ($self,$entry)=@_;
619 $entry=$self->{'hash'};
621 my @entryfeatures=@
{$entry->{'Features'}};
622 my ($genename,$genenames,$entryfeature);
623 for $entryfeature (@entryfeatures) {
624 $genename=$entryfeature->{'qualifiers'}->{'gene'};
626 if (index($genenames,$genename) == -1) { # if name is new
627 $genenames .= $genename . " "; # add the name
631 return (split(/ /,$genenames)); # assumes no space inbetween each genename
634 # arguments: swisshash, translation objref
635 # adds information to the Translation, creates AARange objects, sets the
636 # aa_range attribute on the Translation, pointing to those objects
637 sub swisshash2liveseq
{
638 my ($self,$entry,$translation)=@_;
639 my $translength=$translation->length;
640 $translation->desc($translation->desc . $entry->{'Description'});
641 $translation->display_id("SWISSPROT:" . $entry->{'ID'});
642 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
643 $translation->name($entry->{'Gene'});
644 $translation->source($entry->{'Organism'});
646 my ($start,$end,$aarangeobj,$feature);
647 my @features; my @newfeatures;
648 if ($entry->{'Features'}) {
649 @features=@
{$entry->{'Features'}};
652 # check for cleaved Met
653 foreach $feature (@features) {
654 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
656 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
658 push(@newfeatures,$feature);
662 my $swissseq=$entry->{'Sequence'};
663 my $liveseqtransl=$translation->seq;
664 chop $liveseqtransl; # to take away the trailing STOP "*"
665 my $translated=substr($liveseqtransl,$cleavedmet);
667 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
669 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
670 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
671 carp
"Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
676 @features=@newfeatures;
677 foreach $feature (@features) {
678 #print "Processing SwissProtFeature: $i\n"; # debug
679 ($start,$end)=split(/ /,$feature->{'location'});
680 # Note: cleavedmet is taken in account for updating numbering
681 $aarangeobj=Bio
::LiveSeq
::AARange
->new(-start
=> $start+$cleavedmet, -end
=> $end+$cleavedmet, -name
=> $feature->{'name'}, -description
=> $feature->{'description'}, -translation
=> $translation, -translength
=> $translength);
682 if ($aarangeobj != -1) {
683 push(@aarangeobjects,$aarangeobj);
687 $translation->{'aa_ranges'}=\
@aarangeobjects;
690 # if there is no SRS support, the default will be to return 0
691 # i.e. this function is overridden in SRS package
696 # Args: $entry hashref, gene_name OR cds_position (undef is used to
697 # choose between the two), getswissprotinfo boolean flag
698 # Returns: an hash holding various arrayref used in the hash2gene method
699 # Function: examines the nucleotide entry, identifying features belonging
700 # to the gene (defined either by its name or by the position of its CDS in
703 sub _findgenefeatures
{
704 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
706 my @entryfeatures=@
{$entry->{'Features'}};
707 my @exons; my @introns; my @prim_transcripts; my @transcripts;
708 my @repeat_units; my @repeat_regions;
709 my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
710 my $entryfeature; my @genefeatures;
711 my $desc; my @exondescs; my @introndescs;
713 # for swissprot xreference
714 my ($swissacc,$swisshash); my @swisshashes;
716 # for translation_tables
721 my @range; my @cdsexons; my @labels;
723 # maybe here also could be added special case when there is no CDS feature
724 # in the entry (e.g. tRNA entry -> TOCHECK).
725 # let's deal with the special case in which there is just one gene per entry
726 # usually without /gene qualifier
727 my @cds=@
{$entry->{'CDS'}};
730 if (scalar(@cds) == 1) {
731 #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
735 my ($cds_begin,$cds_end,$proximity);
736 if ($cds_position) { # if a position has been requested
737 my @cds_exons=@
{$cds[$cds_position-1]->{'range'}};
738 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
739 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
741 unless ($skipgenematch) {
742 carp
"--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
744 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
747 for $entryfeature (@entryfeatures) { # get only features for the desired gene
748 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
749 push(@genefeatures,$entryfeature);
751 my @range=@
{$entryfeature->{'range'}};
752 $name=$entryfeature->{'name'};
753 my %qualifierhash=%{$entryfeature->{'qualifiers'}};
754 if ($name eq "CDS") { # that has range containing array of exons
756 # swissprot crossindexing (if without SRS support it will fill array
757 # with zeros and do nothing
758 if ($getswissprotinfo) {
759 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
760 $swisshash=$self->get_swisshash($swissacc);
761 #$self->printswissprot($swisshash); # DEBUG
762 push (@swisshashes,$swisshash);
765 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
767 # create labels array
769 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
771 push (@transcripts,$entryfeature->{'range'});
773 # "simplifying" the joinedlocation features. I.e. changing them from
774 # multijoined ones to simple plain start-end features, taking only
775 # the start of the first "exon" and the end of the last "exon" as
776 # start and end of the entire feature
777 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
778 @range=($range[0]->[0],$range[-1]->[1]);
780 push(@labels,$range[0],$range[1]); # start and end of every feature
781 if ($name eq "exon") {
782 $desc=$entryfeature->{'qualifiers'}->{'number'};
783 if ($entryfeature->{'qualifiers'}->{'note'}) {
785 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
787 $desc = $entryfeature->{'qualifiers'}->{'note'};
790 push (@exondescs,$desc || "unknown");
791 push(@exons,\
@range);
793 if ($name eq "intron") {
794 $desc=$entryfeature->{'qualifiers'}->{'number'};
796 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
798 $desc = $entryfeature->{'qualifiers'}->{'note'};
800 push (@introndescs,$desc || "unknown");
801 push(@introns,\
@range);
803 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\
@range); }
804 if ($name eq "repeat_unit") { push(@repeat_units,\
@range);
805 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
806 push (@repeat_units_family,$rpt_family || "unknown");
808 if ($name eq "repeat_region") { push(@repeat_regions,\
@range);
809 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
810 push (@repeat_regions_family,$rpt_family || "unknown");
815 unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
816 my %genefeatureshash;
817 $genefeatureshash{gene_name
}=$gene_name;
818 $genefeatureshash{genefeatures
}=\
@genefeatures;
819 $genefeatureshash{labels
}=\
@labels;
820 $genefeatureshash{ttables
}=\
@ttables;
821 $genefeatureshash{swisshashes
}=\
@swisshashes;
822 $genefeatureshash{transcripts
}=\
@transcripts;
823 $genefeatureshash{exons
}=\
@exons;
824 $genefeatureshash{exondescs
}=\
@exondescs;
825 $genefeatureshash{introns
}=\
@introns;
826 $genefeatureshash{introndescs
}=\
@introndescs;
827 $genefeatureshash{prim_transcripts
}=\
@prim_transcripts;
828 $genefeatureshash{repeat_units
}=\
@repeat_units;
829 $genefeatureshash{repeat_regions
}=\
@repeat_regions;
830 $genefeatureshash{repeat_units_family
}=\
@repeat_units_family;
831 $genefeatureshash{repeat_regions_family
}=\
@repeat_regions_family;
832 return (\
%genefeatureshash);
836 # used by _findgenefeatures, when a CDS at a certain position is requested,
837 # to retrieve only features quite close to the wanted CDS.
838 # Args: range hashref, begin and end positions of the CDS, $proximity
839 # $proximity holds the maximum distance between the extremes of the CDS
840 # and of the feature under exam.
842 sub _checkfeatureproximity
{
843 my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
845 my ($begin,$end,$strand);
846 if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
847 ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
849 ($begin,$end,$strand)=@range;
851 if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
852 ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
854 if ($strand == -1) { # reverse strand
855 ($begin,$end)=($end,$begin); # swap boundaries
857 if (($cds_begin-$end)>$proximity) {
858 carp
"--DEBUG-- feature rejected: begin $begin end $end -------";
861 if (($begin-$cds_end)>$proximity) {
862 carp
"--DEBUG-- feature rejected: begin $begin end $end -------";
865 carp
"--DEBUG-- feature accepted: begin $begin end $end -------";
866 return (1); # otherwise ok, feature considered next to CDS
870 # function that calls the external program "align" (on the fasta2 package)
871 # to create an alignment between two sequences, returning the aligned
872 # strings and the codes for the identity (:: ::::)
875 my ($self,$seq1,$seq2)=@_;
877 my $null = ($^O
=~ m/mswin/i) ?
'NUL' : '/dev/null';
878 my $fastafile1="/tmp/tmpfastafile1";
879 my $fastafile2="/tmp/tmpfastafile2";
880 my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut
881 my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null | $grepcut"; # ALIGN
882 open my $TMPFASTAFILE1, '>', $fastafile1 or croak
"Could not write file '$fastafile1' for aa alignment: $!";
883 open my $TMPFASTAFILE2, '>', $fastafile2 or croak
"Could not write file '$fastafile2' for aa alignment: $!";
884 print $TMPFASTAFILE1 ">firstseq\n$seq1\n";
885 print $TMPFASTAFILE2 ">secondseq\n$seq2\n";
886 close $TMPFASTAFILE1;
887 close $TMPFASTAFILE2;
888 my $alignment=`$alignprogram`;
889 my @alignlines=split(/\n/,$alignment);
890 my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
891 for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
892 $seq1_aligned .= $alignlines[$linecount];
893 $codes .= $alignlines[$linecount+1];
894 $seq2_aligned .= $alignlines[$linecount+2];
896 return ($seq1_aligned,$seq2_aligned,$codes);
899 # common part of the function to create a novel liveseq gene structure
900 # from an amino acid sequence, using codon usage frequencies
901 # args: codon_usage_array transltableid aasequence gene_name
902 sub _common_novelaasequence2gene
{
903 my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
904 my @species_codon_usage=@
{$species_codon_usage};
905 my @codon_usage_label=
906 qw
(cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
907 tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
908 ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
909 gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
912 my %codon_usage_value;
914 for ($i=0;$i<64;$i++) {
915 $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
918 my $CodonTable = Bio
::Tools
::CodonTable
->new ( -id
=> $ttabid );
919 my @aminoacids = split(//,uc($aasequence));
920 my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
921 for $i (@aminoacids) {
922 @alt_codons = $CodonTable->revtranslate($i);
923 unless (@alt_codons) {
924 carp
"No reverse translation possible for aminoacid \'$i\'";
925 $dnasequence .= "???";
928 for $j (@alt_codons) {
929 $aa_codon_total+=$codon_usage_value{$j};
931 # print "aminoacid $i, codonchoice: "; # verbose
933 #for $j (@alt_codons) {
934 #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
935 #$partial+=($codon_usage_value{$j}/$aa_codon_total);
939 #print "roulette: $dice\n"; # verbose
943 for $j (0..@alt_codons) { # last one not accounted
944 $thiscodon=$alt_codons[$j];
945 $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
946 if ($dice < $relativeusage+$partial) {
947 $chosen_codon=$thiscodon;
950 $partial += $relativeusage;
953 unless ($chosen_codon) {
954 $chosen_codon = $alt_codons[-1]; # the last one
956 # print ".....adding $chosen_codon\n"; # verbose
957 $dnasequence .= $chosen_codon;
961 my $dna = Bio
::LiveSeq
::DNA
->new(-seq
=> $dnasequence);
963 my $max=length($dnasequence);
964 my $exon = Bio
::LiveSeq
::Exon
->new(-seq
=> $dna, -start
=> $min, -end
=> $max, -strand
=> 1);
966 my $transcript = Bio
::LiveSeq
::Transcript
->new(-exons
=> \
@exons);
967 $transcript->translation_table($ttabid);
968 my @transcripts=($transcript);
969 my $translation = Bio
::LiveSeq
::Translation
->new(-transcript
=> $transcript);
970 my @translations=($translation);
971 my %features=(DNA
=> $dna, Transcripts
=> \
@transcripts, Translations
=> \
@translations);
972 my $gene = Bio
::LiveSeq
::Gene
->new(-name
=> $gene_name, -features
=> \
%features, -upbound
=> $min, -downbound
=> $max);
975 unless ($gene) { # if $gene == 0 it means problems in hash2gene
976 carp
"Error in Gene creation phase";