maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / LiveSeq / IO / Loader.pm
blob417b733ad342651dab887e3ddbbd555d6934d611
2 # bioperl module for Bio::LiveSeq::IO::Loader
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
8 # Copyright Joseph Insana
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
18 =head1 SYNOPSIS
20 #documentation needed
22 =head1 DESCRIPTION
24 This package holds common methods used by BioPerl and file loaders.
25 It contains methods to create LiveSeq objects out of entire entries or from a
26 localized sequence region surrounding a particular gene.
28 =head1 AUTHOR - Joseph A.L. Insana
30 Email: Insana@ebi.ac.uk, jinsana@gmx.net
32 =head1 APPENDIX
34 The rest of the documentation details each of the object
35 methods. Internal methods are usually preceded with a _
37 =cut
39 # Let the code begin...
41 package Bio::LiveSeq::IO::Loader;
43 use strict;
44 use Carp qw(cluck croak carp);
45 use Bio::LiveSeq::DNA;
46 use Bio::LiveSeq::Exon;
47 use Bio::LiveSeq::Transcript ;
48 use Bio::LiveSeq::Translation;
49 use Bio::LiveSeq::Gene;
50 use Bio::LiveSeq::Intron;
51 use Bio::LiveSeq::Prim_Transcript;
52 use Bio::LiveSeq::Repeat_Region;
53 use Bio::LiveSeq::Repeat_Unit;
54 use Bio::LiveSeq::AARange;
55 use Bio::Tools::CodonTable;
57 =head2 entry2liveseq
59 Title : entry2liveseq
60 Usage : @translationobjects=$loader->entry2liveseq();
61 : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
62 Function: creates LiveSeq objects from an entry previously loaded
63 Returns : array of references to objects of class Translation
64 Errorcode 0
65 Args : optional boolean flag to avoid the retrieval of SwissProt
66 information for all Transcripts containing SwissProt x-reference
67 default is 1 (to retrieve those information and create AARange
68 LiveSeq objects)
69 Note : this method can get really slow for big entries. The lightweight
70 gene2liveseq method is recommended
72 =cut
74 sub entry2liveseq {
75 my ($self, %args) = @_;
76 my ($getswissprotinfo)=($args{-getswissprotinfo});
77 if (defined($getswissprotinfo)) {
78 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
79 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
80 $getswissprotinfo=0;
82 } else {
83 $getswissprotinfo=1;
85 my $hashref=$self->{'hash'};
86 unless ($hashref) { return (0); }
87 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
88 my $test_transl=0;
89 if ($test_transl) { $self->test_transl($hashref,\@translationobjects);}
90 return @translationobjects;
93 =head2 novelaasequence2gene
95 Title : novelaasequence2gene
96 Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
97 : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
98 -taxon => 9606,
99 -gene_name => "tyr-kinase");
101 Function: creates LiveSeq objects from a novel amino acid sequence,
102 using codon usage database to choose codons according to
103 relative frequencies.
104 If a taxon ID is not specified, the default is to use the human
105 one (taxonomy ID 9606).
106 Returns : reference to a Gene object containing references to LiveSeq objects
107 Errorcode 0
108 Args : string containing an amino acid sequence
109 integer (optional) with a taxonomy ID
110 string specifying a gene name
112 =cut
114 =head2 gene2liveseq
116 Title : gene2liveseq
117 Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name");
118 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
119 -flanking => 64);
120 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
121 -getswissprotinfo => 0);
122 : $gene=$loader->gene2liveseq(-position => 4);
124 Function: creates LiveSeq objects from an entry previously loaded
125 It is a "light weight" creation because it creates
126 a LiveSequence just for the interesting region in an entry
127 (instead than for the total entry, like in entry2liveseq) and for
128 the flanking regions up to 500 nucleotides (default) or up to
129 the specified amount of nucleotides (given as argument) around the
130 Gene.
131 Returns : reference to a Gene object containing possibly alternative
132 Transcripts.
133 Errorcode 0
134 Args : string containing the gene name as in the EMBL feature qualifier
135 integer (optional) "flanking": amount of flanking bases to be kept
136 boolean (optional) "getswissprotinfo": if set to "0" it will avoid
137 trying to fetch information from a crossreference to a SwissProt
138 entry, avoiding the process of creation of AARange objects
139 It is "1" (on) by default
141 Alternative to a gene_name, a position can be given: an
142 integer (1-) containing the position of the desired CDS in the
143 loaded entry
145 =cut
147 sub gene2liveseq {
148 my ($self, %args) = @_;
149 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position});
150 my $input;
151 unless (($gene_name)||($cds_position)) {
152 carp "Gene_Name or Position not specified for gene2liveseq loading function";
153 return (0);
155 if (($gene_name)&&($cds_position)) {
156 carp "Gene_Name and Position cannot be given together";
157 return (0);
158 } elsif ($gene_name) {
159 $input=$gene_name;
160 } else {
161 $input="cds-position:".$cds_position;
164 if (defined($getswissprotinfo)) {
165 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
166 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
167 $getswissprotinfo=0;
169 } else {
170 $getswissprotinfo=1;
173 if (defined($flanking)) {
174 unless ($flanking >= 0) {
175 carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
176 return (0);
178 } else {
179 $flanking=500; # the default flanking length
181 my $hashref=$self->{'hash'};
182 unless ($hashref) { return (0); }
183 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
184 unless ($gene) { # if $gene == 0 it means problems in hash2gene
185 carp "gene2liveseq produced error";
186 return (0);
188 return $gene;
191 # TODO: update so that it will work even if CDS is not only accepted FEATURE!!
192 # this method is for now deprecated and not supported
193 sub test_transl {
194 my ($self,$entry)=@_;
195 my @features=@{$entry->{'Features'}};
196 my @translationobjects=@{$_[1]};
197 my ($i,$translation);
198 my ($obj_transl,$hash_transl);
199 my @cds=@{$entry->{'CDS'}};
200 foreach $translation (@translationobjects) {
201 $obj_transl=$translation->seq;
202 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
203 #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
204 unless ($obj_transl eq $hash_transl) {
205 cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
206 carp "\nEntry's transl: ",$hash_transl,"\n";
207 carp "\nObject's transl: ",$obj_transl,"\n";
208 exit;
210 $i++;
214 # argument: hashref containing the EMBL entry datas,
215 # getswissprotinfo boolean flag
216 # creates the liveseq objects
217 # returns: an array of Translation object references
218 sub hash2liveseq {
219 my ($self,$entry,$getswissprotinfo)=@_;
220 my $i;
221 my @transcripts;
222 my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} );
223 $dna->alphabet(lc($entry->{'Molecule'}));
224 $dna->display_id($entry->{'ID'});
225 $dna->accession_number($entry->{'AccNumber'});
226 $dna->desc($entry->{'Description'});
227 my @cds=@{$entry->{'CDS'}};
228 my ($swissacc,$swisshash); my @swisshashes;
229 for $i (0..$#cds) {
230 #my @transcript=@{$cds[$i]->{'range'}};
231 #$transcript=\@transcript;
232 #push (@transcripts,$transcript);
233 push (@transcripts,$cds[$i]->{'range'});
234 if ($getswissprotinfo) {
235 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
236 $swisshash=$self->get_swisshash($swissacc);
237 #$self->printswissprot($swisshash); # DEBUG
238 push (@swisshashes,$swisshash);
241 my @translations=($self->transexonscreation($dna,\@transcripts));
242 my $translation; my $j=0;
243 foreach $translation (@translations) {
244 if ($swisshashes[$j]) { # if not 0
245 $self->swisshash2liveseq($swisshashes[$j],$translation);
247 $j++;
249 return (@translations);
252 # only features pertaining to a specified gene are created
253 # only the sequence of the gene and appropriate context flanking regions
254 # are created as chain
255 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
256 # returns: reference to Gene object
258 # Note: if entry contains just one CDS, all the features get added
259 # this is useful because often the features in these entries do not
260 # carry the /gene qualifier
262 # errorcode: 0
263 sub hash2gene {
264 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
265 my $entryfeature;
266 my $genefeatureshash;
268 my @cds=@{$entry->{'CDS'}};
270 # checking if a position has been given instead than a gene_name
271 if (index($input,"cds-position:") == 0 ) {
272 my $cds_position=substr($input,13); # extracting the cds position
273 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
274 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
276 } else {
277 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
280 unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
281 my @genes=$self->genes($entry);
282 my $cds_number=scalar(@cds);
283 warn "Warning! Not even one genefeature found for /$input/....
284 The genes present in this entry are:\n\t@genes\n
285 The number of CDS in this entry is:\n\t$cds_number\n";
286 return(0);
289 # get max and min, check flankings
290 my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries"
291 my $seqlength=$entry->{'SeqLength'};
292 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
293 if ($min-$flanking < 1) {
294 $mindna=1;
295 } else {
296 $mindna=$min-$flanking;
298 if ($max+$flanking > $seqlength) {
299 $maxdna=$seqlength;
300 } else {
301 $maxdna=$max+$flanking;
303 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
305 # create LiveSeq objects
307 # create DNA
308 my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna);
309 $dna->alphabet(lc($entry->{'Molecule'}));
310 $dna->source($entry->{'Organism'});
311 $dna->display_id($entry->{'ID'});
312 $dna->accession_number($entry->{'AccNumber'});
313 $dna->desc($entry->{'Description'});
315 my @transcripts=@{$genefeatureshash->{'transcripts'}};
316 # create Translations, Transcripts, Exons out of the CDS
317 unless (@transcripts) {
318 cluck "no CDS feature found for /$input/....";
319 return(0);
321 my @translationobjs=$self->transexonscreation($dna,\@transcripts);
322 my @transcriptobjs;
324 # get the Transcript obj_refs
325 my $translation;
326 my $j=0;
327 my @ttables=@{$genefeatureshash->{'ttables'}};
328 my @swisshashes=@{$genefeatureshash->{'swisshashes'}};
329 foreach $translation (@translationobjs) {
330 push(@transcriptobjs,$translation->get_Transcript);
331 if ($ttables[$j]) { # if not undef
332 $translation->get_Transcript->translation_table($ttables[$j]);
333 #} else { # DEBUG
334 # print "\n\t\tno translation table information....\n";
336 if ($swisshashes[$j]) { # if not 0
337 $self->swisshash2liveseq($swisshashes[$j],$translation);
339 $j++;
342 my %gene; # this is the hash to store created object references
343 $gene{DNA}=$dna;
344 $gene{Transcripts}=\@transcriptobjs;
345 $gene{Translations}=\@translationobjs;
347 my @exonobjs; my @intronobjs;
348 my @repeatunitobjs; my @repeatregionobjs;
349 my @primtranscriptobjs;
351 my ($object,$range,$start,$end,$strand);
353 my @exons=@{$genefeatureshash->{'exons'}};
354 my @exondescs=@{$genefeatureshash->{'exondescs'}};
355 if (@exons) {
356 my $exoncount = 0;
357 foreach $range (@exons) {
358 ($start,$end,$strand)=@{$range};
359 $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
360 if ($object != -1) {
361 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
362 $exoncount++;
363 push (@exonobjs,$object);
364 } else {
365 $exoncount++;
368 $gene{Exons}=\@exonobjs;
370 my @introns=@{$genefeatureshash->{'introns'}};
371 my @introndescs=@{$genefeatureshash->{'introndescs'}};
372 if (@introns) {
373 my $introncount = 0;
374 foreach $range (@introns) {
375 ($start,$end,$strand)=@{$range};
376 $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
377 if ($object != -1) {
378 $object->desc($introndescs[$introncount]);
379 $introncount++;
380 push (@intronobjs,$object);
381 } else {
382 $introncount++;
385 $gene{Introns}=\@intronobjs;
387 my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}};
388 if (@prim_transcripts) {
389 foreach $range (@prim_transcripts) {
390 ($start,$end,$strand)=@{$range};
391 $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
392 if ($object != -1) { push (@primtranscriptobjs,$object); }
394 $gene{Prim_Transcripts}=\@primtranscriptobjs;
396 my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}};
397 my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}};
398 if (@repeat_regions) {
399 my $k=0;
400 foreach $range (@repeat_regions) {
401 ($start,$end,$strand)=@{$range};
402 $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
403 if ($object != -1) {
404 $object->desc($repeat_regions_family[$k]);
405 $k++;
406 push (@repeatregionobjs,$object);
407 } else {
408 $k++;
411 $gene{Repeat_Regions}=\@repeatregionobjs;
413 my @repeat_units=@{$genefeatureshash->{'repeat_units'}};
414 my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}};
415 if (@repeat_units) {
416 my $k=0;
417 foreach $range (@repeat_units) {
418 ($start,$end,$strand)=@{$range};
419 $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
420 if ($object != -1) {
421 $object->desc($repeat_units_family[$k]);
422 $k++;
423 push (@repeatunitobjs,$object);
424 } else {
425 $k++;
428 $gene{Repeat_Units}=\@repeatunitobjs;
431 # create the Gene
432 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
433 return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene,
434 -upbound=>$min,-downbound=>$max));
437 # maybe this function will be moved to general utility package
438 # argument: array of numbers
439 # returns: (min,max) numbers in the array
440 sub rangeofarray {
441 my $self=shift;
442 my @array=@_;
443 #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
444 my ($max,$min,$element);
445 $min=$max=shift(@array);
446 foreach $element (@array) {
447 $element = 0 unless defined $element;
448 if ($element < $min) {
449 $min=$element;
451 if ($element > $max) {
452 $max=$element;
455 #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
456 return ($min,$max);
460 # argument: reference to DNA object, reference to array of transcripts
461 # returns: an array of Translation object references
462 sub transexonscreation {
463 my $self=shift;
464 my $dna=$_[0];
465 my @transcripts=@{$_[1]};
467 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
468 my $translationobj;
469 my @translationobjects;
470 foreach $transcript (@transcripts) {
471 foreach $exonref (@{$transcript}) {
472 ($start,$end,$strand)=@{$exonref};
473 #print "Creating Exon: start $start end $end strand $strand\n";
474 $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
475 #push (@exonobjects,$exonobj);
476 push (@transexons,$exonobj);
478 $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons );
479 if ($transcriptobj != -1) {
480 $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj);
481 @transexons=(); # cleans it
482 #push (@transcriptobjects,$transcriptobj);
483 push (@translationobjects,$translationobj);
486 return (@translationobjects);
489 #sub printgene {
490 # deleted. Some functionality placed in Gene->printfeaturesnum
492 =head2 printswissprot
494 Title : printswissprot
495 Usage : $loader->printswissprot($hashref);
496 Function: prints out all information loaded from a database entry into the
497 loader. Mainly used for testing purposes.
498 Args : a hashref containing the SWISSPROT entry datas
499 Note : the hashref can be obtained with a call to the method
500 $loader->get_swisshash() (BioPerl via Bio::DB::EMBL.pm)
501 that takes as argument a string like "SWISS-PROT:P10275"
503 =cut
505 # argument: hashref containing the SWISSPROT entry datas
506 # prints out that hash, showing the information loaded
507 sub printswissprot {
508 my ($self,$entry)=@_;
509 unless ($entry) {
510 return;
512 printf "ID: %s\n",
513 $entry->{'ID'};
514 printf "ACC: %s\n",
515 $entry->{'AccNumber'};
516 printf "GENE: %s\n",
517 $entry->{'Gene'};
518 printf "DES: %s\n",
519 $entry->{'Description'};
520 printf "ORG: %s\n",
521 $entry->{'Organism'};
522 printf "SEQLN: %s\n",
523 $entry->{'SeqLength'};
524 printf "SEQ: %s\n",
525 substr($entry->{'Sequence'},0,64);
526 if ($entry->{'Features'}) {
527 my @features=@{$entry->{'Features'}};
528 my $i;
529 for $i (0..$#features) {
530 print "|",$features[$i]->{'name'},"|";
531 print " at ",$features[$i]->{'location'},": ";
532 print "",$features[$i]->{'desc'},"\n";
537 =head2 printembl
539 Title : printembl
540 Usage : $loader->printembl();
541 Function: prints out all information loaded from a database entry into the
542 loader. Mainly used for testing purposes.
543 Args : none
545 =cut
547 # argument: hashref containing the EMBL entry datas
548 # prints out that hash, showing the information loaded
549 sub printembl {
550 my ($self,$entry)=@_;
551 unless ($entry) {
552 $entry=$self->{'hash'};
554 my ($i,$featurename);
555 printf "ID: %s\n",
556 $entry->{'ID'};
557 printf "ACC: %s\n",
558 $entry->{'AccNumber'};
559 printf "MOL: %s\n",
560 $entry->{'Molecule'};
561 printf "DIV: %s\n",
562 $entry->{'Division'};
563 printf "DES: %s\n",
564 $entry->{'Description'};
565 printf "ORG: %s\n",
566 $entry->{'Organism'};
567 printf "SEQLN: %s\n",
568 $entry->{'SeqLength'};
569 printf "SEQ: %s\n",
570 substr($entry->{'Sequence'},0,64);
571 my @features=@{$entry->{'Features'}};
572 my @cds=@{$entry->{'CDS'}};
573 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
574 my ($exonref,$transcript);
575 my ($qualifiernumber,$qualifiers,$key);
576 my ($start,$end,$strand);
577 my $j=0;
578 for $i (0..$#features) {
579 $featurename=$features[$i]->{'name'};
580 if ($featurename eq "CDS") {
581 print "|CDS| number $j at feature position: $i\n";
582 #print $features[$i]->{'location'},"\n";
583 $transcript=$features[$i]->{'range'};
584 foreach $exonref (@{$transcript}) {
585 ($start,$end,$strand)=@{$exonref};
586 print "\tExon: start $start end $end strand $strand\n";
588 $j++;
589 } else {
590 print "|$featurename| at feature position: $i\n";
591 print "\trange: ";
592 print join("\t",@{$features[$i]->{'range'}}),"\n";
593 #print $features[$i]->{'location'},"\n";
595 $qualifiernumber=$features[$i]->{'qual_number'};
596 $qualifiers=$features[$i]->{'qualifiers'}; # hash
597 foreach $key (keys (%{$qualifiers})) {
598 print "\t\t",$key,": ";
599 print $qualifiers->{$key},"\n";
604 =head2 genes
606 Title : genes
607 Usage : $loader->genes();
608 Function: Returns an array of gene_names (strings) contained in the loaded
609 entry.
610 Args : none
612 =cut
614 # argument: entryhashref
615 # returns: array of genenames found in the entry
616 sub genes {
617 my ($self,$entry)=@_;
618 unless ($entry) {
619 $entry=$self->{'hash'};
621 my @entryfeatures=@{$entry->{'Features'}};
622 my ($genename,$genenames,$entryfeature);
623 for $entryfeature (@entryfeatures) {
624 $genename=$entryfeature->{'qualifiers'}->{'gene'};
625 if ($genename) {
626 if (index($genenames,$genename) == -1) { # if name is new
627 $genenames .= $genename . " "; # add the name
631 return (split(/ /,$genenames)); # assumes no space inbetween each genename
634 # arguments: swisshash, translation objref
635 # adds information to the Translation, creates AARange objects, sets the
636 # aa_range attribute on the Translation, pointing to those objects
637 sub swisshash2liveseq {
638 my ($self,$entry,$translation)=@_;
639 my $translength=$translation->length;
640 $translation->desc($translation->desc . $entry->{'Description'});
641 $translation->display_id("SWISSPROT:" . $entry->{'ID'});
642 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
643 $translation->name($entry->{'Gene'});
644 $translation->source($entry->{'Organism'});
645 my @aarangeobjects;
646 my ($start,$end,$aarangeobj,$feature);
647 my @features; my @newfeatures;
648 if ($entry->{'Features'}) {
649 @features=@{$entry->{'Features'}};
651 my $cleavedmet=0;
652 # check for cleaved Met
653 foreach $feature (@features) {
654 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
655 $cleavedmet=1;
656 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
657 } else {
658 push(@newfeatures,$feature);
662 my $swissseq=$entry->{'Sequence'};
663 my $liveseqtransl=$translation->seq;
664 chop $liveseqtransl; # to take away the trailing STOP "*"
665 my $translated=substr($liveseqtransl,$cleavedmet);
667 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
669 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
670 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
671 carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
672 return;
675 #my $i=0; # debug
676 @features=@newfeatures;
677 foreach $feature (@features) {
678 #print "Processing SwissProtFeature: $i\n"; # debug
679 ($start,$end)=split(/ /,$feature->{'location'});
680 # Note: cleavedmet is taken in account for updating numbering
681 $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -description => $feature->{'description'}, -translation => $translation, -translength => $translength);
682 if ($aarangeobj != -1) {
683 push(@aarangeobjects,$aarangeobj);
685 # $i++; # debug
687 $translation->{'aa_ranges'}=\@aarangeobjects;
690 # if there is no SRS support, the default will be to return 0
691 # i.e. this function is overridden in SRS package
692 sub get_swisshash {
693 return (0);
696 # Args: $entry hashref, gene_name OR cds_position (undef is used to
697 # choose between the two), getswissprotinfo boolean flag
698 # Returns: an hash holding various arrayref used in the hash2gene method
699 # Function: examines the nucleotide entry, identifying features belonging
700 # to the gene (defined either by its name or by the position of its CDS in
701 # the entry)
703 sub _findgenefeatures {
704 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
706 my @entryfeatures=@{$entry->{'Features'}};
707 my @exons; my @introns; my @prim_transcripts; my @transcripts;
708 my @repeat_units; my @repeat_regions;
709 my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
710 my $entryfeature; my @genefeatures;
711 my $desc; my @exondescs; my @introndescs;
713 # for swissprot xreference
714 my ($swissacc,$swisshash); my @swisshashes;
716 # for translation_tables
717 my @ttables;
719 # to create labels
720 my ($name,$exon);
721 my @range; my @cdsexons; my @labels;
723 # maybe here also could be added special case when there is no CDS feature
724 # in the entry (e.g. tRNA entry -> TOCHECK).
725 # let's deal with the special case in which there is just one gene per entry
726 # usually without /gene qualifier
727 my @cds=@{$entry->{'CDS'}};
729 my $skipgenematch=0;
730 if (scalar(@cds) == 1) {
731 #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
732 $skipgenematch=1;
735 my ($cds_begin,$cds_end,$proximity);
736 if ($cds_position) { # if a position has been requested
737 my @cds_exons=@{$cds[$cds_position-1]->{'range'}};
738 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
739 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
740 # DEBUG
741 unless ($skipgenematch) {
742 carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
744 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
747 for $entryfeature (@entryfeatures) { # get only features for the desired gene
748 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
749 push(@genefeatures,$entryfeature);
751 my @range=@{$entryfeature->{'range'}};
752 $name=$entryfeature->{'name'};
753 my %qualifierhash=%{$entryfeature->{'qualifiers'}};
754 if ($name eq "CDS") { # that has range containing array of exons
756 # swissprot crossindexing (if without SRS support it will fill array
757 # with zeros and do nothing
758 if ($getswissprotinfo) {
759 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
760 $swisshash=$self->get_swisshash($swissacc);
761 #$self->printswissprot($swisshash); # DEBUG
762 push (@swisshashes,$swisshash);
765 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
767 # create labels array
768 for $exon (@range) {
769 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
771 push (@transcripts,$entryfeature->{'range'});
772 } else {
773 # "simplifying" the joinedlocation features. I.e. changing them from
774 # multijoined ones to simple plain start-end features, taking only
775 # the start of the first "exon" and the end of the last "exon" as
776 # start and end of the entire feature
777 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
778 @range=($range[0]->[0],$range[-1]->[1]);
780 push(@labels,$range[0],$range[1]); # start and end of every feature
781 if ($name eq "exon") {
782 $desc=$entryfeature->{'qualifiers'}->{'number'};
783 if ($entryfeature->{'qualifiers'}->{'note'}) {
784 if ($desc) {
785 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
786 } else {
787 $desc = $entryfeature->{'qualifiers'}->{'note'};
790 push (@exondescs,$desc || "unknown");
791 push(@exons,\@range);
793 if ($name eq "intron") {
794 $desc=$entryfeature->{'qualifiers'}->{'number'};
795 if ($desc) {
796 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
797 } else {
798 $desc = $entryfeature->{'qualifiers'}->{'note'};
800 push (@introndescs,$desc || "unknown");
801 push(@introns,\@range);
803 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); }
804 if ($name eq "repeat_unit") { push(@repeat_units,\@range);
805 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
806 push (@repeat_units_family,$rpt_family || "unknown");
808 if ($name eq "repeat_region") { push(@repeat_regions,\@range);
809 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
810 push (@repeat_regions_family,$rpt_family || "unknown");
815 unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
816 my %genefeatureshash;
817 $genefeatureshash{gene_name}=$gene_name;
818 $genefeatureshash{genefeatures}=\@genefeatures;
819 $genefeatureshash{labels}=\@labels;
820 $genefeatureshash{ttables}=\@ttables;
821 $genefeatureshash{swisshashes}=\@swisshashes;
822 $genefeatureshash{transcripts}=\@transcripts;
823 $genefeatureshash{exons}=\@exons;
824 $genefeatureshash{exondescs}=\@exondescs;
825 $genefeatureshash{introns}=\@introns;
826 $genefeatureshash{introndescs}=\@introndescs;
827 $genefeatureshash{prim_transcripts}=\@prim_transcripts;
828 $genefeatureshash{repeat_units}=\@repeat_units;
829 $genefeatureshash{repeat_regions}=\@repeat_regions;
830 $genefeatureshash{repeat_units_family}=\@repeat_units_family;
831 $genefeatureshash{repeat_regions_family}=\@repeat_regions_family;
832 return (\%genefeatureshash);
836 # used by _findgenefeatures, when a CDS at a certain position is requested,
837 # to retrieve only features quite close to the wanted CDS.
838 # Args: range hashref, begin and end positions of the CDS, $proximity
839 # $proximity holds the maximum distance between the extremes of the CDS
840 # and of the feature under exam.
841 # Returns: boolean
842 sub _checkfeatureproximity {
843 my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
844 my @range=@{$range};
845 my ($begin,$end,$strand);
846 if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
847 ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
848 } else {
849 ($begin,$end,$strand)=@range;
851 if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
852 ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
854 if ($strand == -1) { # reverse strand
855 ($begin,$end)=($end,$begin); # swap boundaries
857 if (($cds_begin-$end)>$proximity) {
858 carp "--DEBUG-- feature rejected: begin $begin end $end -------";
859 return (0);
861 if (($begin-$cds_end)>$proximity) {
862 carp "--DEBUG-- feature rejected: begin $begin end $end -------";
863 return (0);
865 carp "--DEBUG-- feature accepted: begin $begin end $end -------";
866 return (1); # otherwise ok, feature considered next to CDS
870 # function that calls the external program "align" (on the fasta2 package)
871 # to create an alignment between two sequences, returning the aligned
872 # strings and the codes for the identity (:: ::::)
874 sub _get_alignment {
875 my ($self,$seq1,$seq2)=@_;
877 my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
878 my $fastafile1="/tmp/tmpfastafile1";
879 my $fastafile2="/tmp/tmpfastafile2";
880 my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut
881 my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null | $grepcut"; # ALIGN
882 open my $TMPFASTAFILE1, '>', $fastafile1 or croak "Could not write file '$fastafile1' for aa alignment: $!";
883 open my $TMPFASTAFILE2, '>', $fastafile2 or croak "Could not write file '$fastafile2' for aa alignment: $!";
884 print $TMPFASTAFILE1 ">firstseq\n$seq1\n";
885 print $TMPFASTAFILE2 ">secondseq\n$seq2\n";
886 close $TMPFASTAFILE1;
887 close $TMPFASTAFILE2;
888 my $alignment=`$alignprogram`;
889 my @alignlines=split(/\n/,$alignment);
890 my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
891 for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
892 $seq1_aligned .= $alignlines[$linecount];
893 $codes .= $alignlines[$linecount+1];
894 $seq2_aligned .= $alignlines[$linecount+2];
896 return ($seq1_aligned,$seq2_aligned,$codes);
899 # common part of the function to create a novel liveseq gene structure
900 # from an amino acid sequence, using codon usage frequencies
901 # args: codon_usage_array transltableid aasequence gene_name
902 sub _common_novelaasequence2gene {
903 my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
904 my @species_codon_usage=@{$species_codon_usage};
905 my @codon_usage_label=
906 qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
907 tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
908 ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
909 gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
910 tga);
911 my ($i,$j);
912 my %codon_usage_value;
913 my $aa_codon_total;
914 for ($i=0;$i<64;$i++) {
915 $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
918 my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid );
919 my @aminoacids = split(//,uc($aasequence));
920 my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
921 for $i (@aminoacids) {
922 @alt_codons = $CodonTable->revtranslate($i);
923 unless (@alt_codons) {
924 carp "No reverse translation possible for aminoacid \'$i\'";
925 $dnasequence .= "???";
926 } else {
927 $aa_codon_total=0;
928 for $j (@alt_codons) {
929 $aa_codon_total+=$codon_usage_value{$j};
931 # print "aminoacid $i, codonchoice: "; # verbose
932 #$partial=0;
933 #for $j (@alt_codons) {
934 #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
935 #$partial+=($codon_usage_value{$j}/$aa_codon_total);
937 #print "\n";
938 $dice=rand(1);
939 #print "roulette: $dice\n"; # verbose
940 $partial=0;
941 $chosen_codon="";
942 CODONCHOICE:
943 for $j (0..@alt_codons) { # last one not accounted
944 $thiscodon=$alt_codons[$j];
945 $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
946 if ($dice < $relativeusage+$partial) {
947 $chosen_codon=$thiscodon;
948 last CODONCHOICE;
949 } else {
950 $partial += $relativeusage;
953 unless ($chosen_codon) {
954 $chosen_codon = $alt_codons[-1]; # the last one
956 # print ".....adding $chosen_codon\n"; # verbose
957 $dnasequence .= $chosen_codon;
961 my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence);
962 my $min=1;
963 my $max=length($dnasequence);
964 my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1);
965 my @exons=($exon);
966 my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons);
967 $transcript->translation_table($ttabid);
968 my @transcripts=($transcript);
969 my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript);
970 my @translations=($translation);
971 my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations);
972 my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max);
974 # creation of gene
975 unless ($gene) { # if $gene == 0 it means problems in hash2gene
976 carp "Error in Gene creation phase";
977 return (0);
979 return $gene;