2 # bioperl module for Bio::LiveSeq::Transcript
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
8 # Copyright Joseph Insana
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::LiveSeq::Transcript - Transcript class for LiveSeq
20 # documentation needed
24 This stores information about coding sequences (CDS).
25 The implementation is that a Transcript object accesses a collection of
26 Exon objects, inferring from them the nucleotide structure and sequence.
28 =head1 AUTHOR - Joseph A.L. Insana
30 Email: Insana@ebi.ac.uk, jinsana@gmx.net
34 The rest of the documentation details each of the object
35 methods. Internal methods are usually preceded with a _
39 # Let the code begin...
41 package Bio
::LiveSeq
::Transcript
;
44 # use Carp qw(carp cluck);
45 use Bio
::LiveSeq
::Exon
; # uses Exon to create new exon in case of deletion
46 use base
qw(Bio::LiveSeq::SeqI);
51 Usage : $transcript = Bio::LiveSeq::Transcript->new(-exons => \@obj_refs);
53 Function: generates a new Bio::LiveSeq::Transcript
54 Returns : reference to a new object of class Transcript
56 Args : reference to an array of Exon object references
61 my ($thing, %args) = @_;
62 my $class = ref($thing) || $thing;
63 my ($obj,%transcript);
65 my @exons=@
{$args{-exons
}};
68 $obj = bless $obj, $class;
71 $obj->warn("$class not initialised because exons array empty");
75 # now useless, after start and end methods have been overridden here
76 my $firstexon = $exons[0];
77 #my $lastexon = $exons[-1];
78 #my $start = $firstexon->start;
79 #my $end = $lastexon->end;
80 my $strand = $firstexon->strand;
81 my $seq = $firstexon->{'seq'};
82 $obj->alphabet('rna');
84 unless (_checkexons
(\
@exons)) {
85 $obj->warn("$class not initialised because of problems in the exon structure");
88 $obj->{'strand'}=$strand;
89 $obj->{'exons'}=\
@exons;
92 # set Transcript into each Exon
94 foreach $exon (@exons) {
95 $exon->{'transcript'}=$obj;
104 Usage : $transcript_obj->all_Exons()
105 Function: returns references to all Exon objects the Transcript is composed of
106 Example : foreach $exon ($transcript->all_Exons()) { do_something }
107 Returns : array of object references
114 my $exonsref=$self->{'exons'};
115 my @exons=@
{$exonsref};
118 foreach $exon (@exons) {
119 unless ($exon->obj_valid) {
120 $self->warn("$exon no more valid, start or end label lost, skipping....",1); # ignorable
122 push(@newexons,$exon);
125 if ($#exons != $#newexons) {
127 $self->{'exons'}=\
@newexons;
132 =head2 downstream_seq
134 Title : downstream_seq
135 Usage : $transcript_obj->downstream_seq()
136 : $transcript_obj->downstream_seq(64)
137 Function: returns a string of nucleotides downstream of the end of the
138 CDS. If there is some information of the real mRNA, from features in
139 an attached Gene object, it will return up to those boundaries.
140 Otherwise it will return 1000 nucleotides.
141 If an argument is given it will override the default 1000 number
142 and return instead /that/ requested number of nucleotides.
143 But if a Gene object is attached, this argument will be ignored.
145 Args : an optional integer number of nucleotides to be returned instead of
146 the default if no gene attached
151 my ($self,$howmany)=@_;
153 if (defined ($howmany)) {
154 unless ($howmany > 0) {
155 $self->throw("No sense in asking less than 1 downstream nucleotides!");
158 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve until the end
159 #$str=$DNAobj->labelsubseq($self->end,undef,undef,"unsecuremoderequested");
160 #return(substr($str,1)); # delete first nucleotide that is the last of Transcript
161 if ($self->gene) { # if there is Gene object attached fetch relevant info
162 $str=$self->{'seq'}->labelsubseq($self->end,undef,$self->gene->maxtranscript->end); # retrieve from end of this Transcript to end of the maxtranscript
163 $str=substr($str,1); # delete first nucleotide that is the last of Transcript
164 if (CORE
::length($str) > 0) {
166 } else { # if there was no downstream through the gene's maxtranscript, go the usual way
174 my @exons=$self->all_Exons;
175 my $strand=$self->strand();
176 my $lastexon=$exons[-1];
177 my $lastexonlength=$lastexon->length;
178 # $howmany nucs after end of last exon
179 #my $downstream_seq=$lastexon->subseq($lastexonlength+1,undef,$howmany);
183 $downstream_seq=substr($lastexon->labelsubseq($self->end,$howmany,undef,"unsecuremoderequested"),1);
186 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->end,"unsecuremoderequested"),1);
188 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->start,"unsecuremoderequested"),1);
191 return $downstream_seq;
197 Usage : $transcript_obj->upstream_seq()
198 : $transcript_obj->upstream_seq(64)
199 Function: just like downstream_seq but returns nucleotides before the ATG
200 Note : the default, if no Gene information present and no nucleotides
201 number given, is to return up to 400 nucleotides.
206 my ($self,$howmany)=@_;
207 if (defined ($howmany)) {
208 unless ($howmany > 0) {
209 $self->throw("No sense in asking less than 1 upstream nucleotides!");
212 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve from the start
213 if ($self->gene) { # if there is Gene object attached fetch relevant info
214 my $str=$self->{'seq'}->labelsubseq($self->gene->maxtranscript->start,undef,$self->start); # retrieve from start of maxtranscript to start of this Transcript
215 chop $str; # delete last nucleotide that is the A of starting ATG
216 if (length($str) > 0) {
218 } else { # if there was no upstream through the gene's maxtranscript, go the usual way
226 my @exons=$self->all_Exons;
227 my $firstexon=$exons[0];
230 my $strand=$self->strand();
232 if ($howmany) {# $howmany nucs before begin of first exon
233 my $labelbefore=$firstexon->label(-$howmany,$firstexon->start);
234 if ($labelbefore < 1) {
236 $labelbefore=$self->{'seq'}->start;
238 $labelbefore=$self->{'seq'}->end;
241 $upstream_seq=$firstexon->labelsubseq($labelbefore,undef,$firstexon->start,"unsecuremoderequested");
245 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->start,undef,$self->start,"unsecuremoderequested");
246 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG
248 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->end,undef,$self->start,"unsecuremoderequested");
249 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG
252 return $upstream_seq;
255 # These get redefined here, overriding the SeqI one because they draw their
256 # information from the Exons a Transcript is built of
257 # optional argument: firstlabel. If not given, it checks coordinate_start
258 # This is useful when called by Translation
259 # also used by _delete
261 my ($self,$position,$firstlabel)=@_;
262 unless ($position) { # if position = 0 complain ?
263 $self->warn("Position not given or position 0");
266 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
267 my ($label,@labels,$length,$arraypos);
268 unless (defined ($firstlabel)) {
269 $firstlabel=$self->coordinate_start; # this is inside Transcript obj
271 my $coord_pos=$self->_inside_position($firstlabel);
272 $length=$self->length;
275 $position++; # to account for missing of 0 position
277 $arraypos=$position+$coord_pos-2;
278 #print "\n=-=-=-=-DEBUG: arraypos $arraypos, pos $position, coordpos: $coord_pos";
280 $label=$self->{'seq'}->label($arraypos,$start,$strand); #?
281 } elsif ($arraypos >= $length) {
282 $label=$self->{'seq'}->label($arraypos-$length+2,$end,$strand); #?
283 } else { # inside the Transcript
284 @labels=$self->all_labels;
285 $label=$labels[$arraypos];
291 # returns: position of label according to coord_start
292 # errorcode: 0 label not found
293 # optional argument: firstlabel. If not given, it checks coordinate_start
294 # This is useful when called by Translation
296 my ($self,$label,$firstlabel)=@_;
297 unless ($self->{'seq'}->valid($label)) {
298 $self->warn("label is not valid");
301 unless (defined ($firstlabel)) {
302 $firstlabel=$self->coordinate_start; # this is inside Transcript obj
304 if ($label == $firstlabel) {
307 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
308 my ($position,$in_pos,$out_pos,$coord_pos);
309 my $length=$self->length;
310 $coord_pos=$self->_inside_position($firstlabel);
311 if ($self->valid($label)) { # if label is inside the Transcript
312 $in_pos=$self->_inside_position($label);
313 $position=$in_pos-$coord_pos+1;
314 if ($position <= 0) {
315 return ($position-1); # accounts for the missing of the 0 position
318 if ($self->follows($end,$label)) { # label after end of transcript
319 $out_pos=$self->{'seq'}->position($label,$end,$strand);
320 #print "\n+++++++++DEBUG label $label FOLLOWS end $end outpos $out_pos coordpos $coord_pos";
321 $position=$out_pos+$length-$coord_pos;
322 } elsif ($self->follows($label,$start)) { # label before begin of transcript
323 #print "\n+++++++++DEBUG label $label BEFORE start $start outpos $out_pos coordpos $coord_pos";
324 $out_pos=$self->{'seq'}->position($label,$start,$strand);
325 $position=$out_pos-$coord_pos+1;
326 } else { # label is in intron (not valid, not after, not before)!
327 $self->warn("Cannot give position of label pointing to intron according to CDS numbering!",1);
337 my @exons=$self->all_Exons();
338 foreach $exon (@exons) {
339 $str .= $exon->seq();
347 my @exons=$self->all_Exons();
348 foreach $exon (@exons) {
349 $length += $exon->length();
357 my @exons=$self->all_Exons();
358 foreach $exon (@exons) {
359 push (@labels,$exon->all_labels());
364 # redefined here so that it will retrieve effective subseq without introns
365 # otherwise it would have retrieved an underlying DNA (possibly with introns)
367 # Drawback: this is really bulky, label->position and then a call to
368 # subseq that will do the opposite position-> label
370 # one day this can be rewritten as the main one so that the normal subseq
371 # will rely on this one and hence avoid this double (useless and lengthy)
372 # conversion between labels and positions
373 sub old_labelsubseq
{
374 my ($self,$start,$length,$end)=@_;
377 unless ($self->valid($start)) {
378 $self->warn("Start label not valid"); return (-1);
380 $pos1=$self->position($start);
383 if ($end == $start) {
386 unless ($self->valid($end)) {
387 $self->warn("End label not valid"); return (-1);
389 unless ($self->follows($start,$end) == 1) {
390 $self->warn("End label does not follow Start label!"); return (-1);
392 $pos2=$self->position($end);
396 return ($self->subseq($pos1,$pos2,$length));
399 # rewritten, eventually
402 my ($self,$start,$length,$end,$unsecuremode)=@_;
403 unless (defined $unsecuremode &&
404 $unsecuremode eq "unsecuremoderequested")
405 { # to skip security checks (faster)
407 unless ($self->valid($start)) {
408 $self->warn("Start label not valid"); return (-1);
414 if ($end == $start) {
418 undef $length; # end argument overrides length argument
419 unless ($self->valid($end)) {
420 $self->warn("End label not valid"); return (-1);
422 unless ($self->follows($start,$end) == 1) {
423 $self->warn("End label does not follow Start label!"); return (-1);
430 my ($seq,$exon,$startexon,$endexon); my @exonlabels;
431 my @exons=$self->all_Exons;
433 foreach $exon (@exons) {
434 if ((!(defined($startexon)))&&($exon->valid($start))) { # checks only if not yet found
437 if ($exon->valid($end)) {
440 if ((!(defined($seq)) && (defined($startexon)))) { # initializes only once
441 if ((defined($endexon)) && ($endexon eq $startexon)) { # then perfect, we are finished
443 $seq = $startexon->labelsubseq($start,$length,undef,"unsecuremoderequested");
448 $seq = $startexon->labelsubseq($start,undef,$end,"unsecuremoderequested");
451 } else { # get up to the end of the exon
452 $seq = $startexon->labelsubseq($start,undef,undef,"unsecuremoderequested");
455 if (($startexon)&&($exon ne $startexon)) {
456 if (defined($endexon)) { # we arrived to the last exon
457 $seq .= $endexon->labelsubseq(undef,undef,$end,"unsecuremoderequested"); # get from the start of the exon
460 } elsif (defined($startexon)) { # we are in a whole-exon-in-the-middle case
461 $seq .= $exon->seq; # we add it completely to the seq
462 } # else, we still have to reach the start point, exon useless, we move on
463 if ($length) { # if length argument specified
464 if (($seq && (CORE
::length($seq) >= $length))) {
471 return (substr($seq,0,$length));
479 # returns: the objref and progressive number of the Exon containing that label
482 my ($self,$label)=@_;
484 my @exons=$self->all_Exons;
485 foreach $exon (@exons) {
486 $count++; # 1st exon is numbered "1"
487 if ($exon->valid($label)) {
488 return ($exon,$count)
491 return (-1); # if nothing found
494 # recoded to exploit the new fast labelsubseq()
495 # valid only inside Transcript
497 my ($self,$pos1,$pos2,$length) = @_;
498 my ($str,$startlabel,$endlabel);
499 if (defined ($pos1)) {
500 if ($pos1 == 0) { # if position = 0 complain
501 $self->warn("Position cannot be 0!"); return (-1);
503 if ((defined ($pos2))&&($pos1>$pos2)) {
504 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1);
506 $startlabel=$self->label($pos1);
507 unless ($self->valid($startlabel)) {
508 $self->warn("Start label not valid"); return (-1);
510 if ($startlabel < 1) {
511 $self->warn("position $pos1 not valid as start of subseq!"); return (-1);
514 $startlabel=$self->start;
516 if (defined ($pos2)) {
517 if ($pos2 == 0) { # if position = 0 complain
518 $self->warn("Position cannot be 0!"); return (-1);
521 if ((defined ($pos1))&&($pos1>$pos2)) {
522 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1);
524 $endlabel=$self->label($pos2);
525 unless ($self->valid($endlabel)) {
526 $self->warn("End label not valid"); return (-1);
529 $self->warn("position $pos2 not valid as end of subseq!"); return (-1);
532 unless (defined ($length)) {
533 $endlabel=$self->end;
536 return ($self->labelsubseq($startlabel,$length,$endlabel,"unsecuremoderequested"));
539 # works only inside the transcript, complains if asked outside
541 my ($self,$pos1,$pos2,$length) = @_;
542 my ($str,$startcount,$endcount,$seq,$seqlength);
543 if (defined ($length)) {
545 $self->warn("No sense asking for a subseq of length < 1");
549 my $firstlabel=$self->coordinate_start; # this is inside Transcript obj
550 my $coord_pos=$self->_inside_position($firstlabel); # TESTME old
552 $seqlength=CORE
::length($seq);
553 unless (defined ($pos1)) {
554 $startcount=1+$coord_pos-1; # i.e. coord_pos
556 if ($pos1 == 0) { # if position = 0 complain
557 $self->warn("Position cannot be 0!"); return (-1);
558 } elsif ($pos1 < 0) {
561 if ((defined ($pos2))&&($pos1>$pos2)) {
562 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!");
565 $startcount=$pos1+$coord_pos-1;
567 unless (defined ($pos2)) {
570 if ($pos2 == 0) { # if position = 0 complain
571 $self->warn("Position cannot be 0!"); return (-1);
572 } elsif ($pos2 < 0) {
575 if ((defined ($pos1))&&($pos1>$pos2)) {
576 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!");
579 $endcount=$pos2+$coord_pos-1;
580 if ($endcount > $seqlength) {
581 #print "\n###DEBUG###: pos1 $pos1 pos2 $pos2 coordpos $coord_pos endcount $endcount seqln $seqlength\n";
582 $self->warn("Cannot access end position after the end of Transcript");
585 $length=$endcount-$startcount+1;
587 #print "\n###DEBUG pos1 $pos1 pos2 $pos2 start $startcount end $endcount length $length coordpos $coord_pos\n";
588 my $offset=$startcount-1;
590 $self->warn("Cannot access startposition before the beginning of Transcript, returning from start",1); # ignorable
591 return (substr($seq,0,$length));
592 } elsif ($offset >= $seqlength) {
593 $self->warn("Cannot access startposition after the end of Transcript");
596 $str=substr($seq,$offset,$length);
597 if (CORE
::length($str) < $length) {
598 $self->warn("Attention, cannot return the length requested ".
599 "for subseq",1) if $self->verbose > 0; # ignorable
605 # redefined so that it doesn't require other methods (after deletions) to
609 my $exonsref=$self->{'exons'};
610 my @exons=@
{$exonsref};
611 return ($exons[0]->start);
616 my $exonsref=$self->{'exons'};
617 my @exons=@
{$exonsref};
618 return ($exons[-1]->end);
622 # internal methods begin here
624 # returns: position of label in transcript's all_labels
625 # with STARTlabel == 1
626 # errorcode 0 -> label not found
628 sub _inside_position
{
629 my ($self,$label)=@_;
630 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
631 my ($position,$checkme);
632 my @labels=$self->all_labels;
633 foreach $checkme (@labels) {
635 if ($label == $checkme) {
642 # returns 1 OK or 0 ERROR
643 # arguments: reference to array of Exon object references
645 my ($exon,$thisstart);
648 my @exons=@
{$exonsref};
650 my $firstexon = $exons[0];
652 unless (ref($firstexon) eq "Bio::LiveSeq::Exon") {
653 $self->warn("Object not of class Exon");
656 my $strand = $firstexon->strand;
658 my $prevend = $firstexon->end;
659 shift @exons; # skip first one
660 foreach $exon (@exons) {
661 unless (ref($exon) eq "Bio::LiveSeq::Exon") { # object class check
662 $self->warn("Object not of class Exon");
665 if ($exon->strand != $strand) { # strand consistency check
666 $self->warn("Exons' strands not consistent when trying to create Transcript");
669 $thisstart = $exon->start;
670 unless ($exon->{'seq'}->follows($prevend,$thisstart,$strand)) {
671 $self->warn("Exons not in correct order when trying to create Transcript");
674 $prevend = $exon->end;
679 =head2 get_Translation
682 Usage : $translation = $obj->get_Translation()
683 Function: retrieves the reference to the object of class Translation (if any)
684 attached to a LiveSeq object
685 Returns : object reference
690 sub get_Translation
{
692 return ($self->{'translation'}); # this is set when Translation->new is called
695 # this checks so that deletion spanning multiple exons is
696 # handled accordingly and correctly
697 # arguments: begin and end label of a deletion
698 # this is called BEFORE any deletion in the chain
700 my ($self,$startlabel,$endlabel)=@_;
701 my $exonsref=$self->{'exons'};
702 my @exons=@
{$exonsref};
703 my ($startexon,$endexon,$exon);
704 $startexon=$endexon=0;
705 foreach $exon (@exons) {
706 if (($startexon == 0)&&($exon->valid($startlabel))) {
707 $startexon=$exon; # exon containing start of deletion
709 if (($endexon == 0)&&($exon->valid($endlabel))) {
710 $endexon=$exon; # exon containing end of deletion
712 if (($startexon)&&($endexon)) {
713 last; # don't check further
716 my $nextend=$self->label(2,$endlabel); # retrieve the next label
717 my $prevstart=$self->label(-1,$startlabel); # retrieve the prev label
719 if ($startexon eq $endexon) { # intra-exon deletion
720 if (($startexon->start eq $startlabel) && ($startexon->end eq $endlabel)) {
721 # let's delete the entire exon
723 foreach $exon (@exons) {
724 unless ($exon eq $startexon) {
725 push(@newexons,$exon);
728 $self->{'exons'}=\
@newexons;
729 } elsif ($startexon->start eq $startlabel) { # special cases
730 $startexon->{'start'}=$nextend; # set a new start of exon
731 } elsif ($startexon->end eq $endlabel) {
732 $startexon->{'end'}=$prevstart; # set a new end of exon
736 } else { # two new exons to be created, inter-exons deletion
739 my $dna=$self->{'seq'};
740 my $strand=$self->strand;
741 my $notmiddle=1; # flag for skipping exons in the middle of deletion
742 foreach $exon (@exons) {
743 if ($exon eq $startexon) {
744 $exonobj=Bio
::LiveSeq
::Exon
->new('-seq'=>$dna,'-start'=>$exon->start,'-end'=>$prevstart,'-strand'=>$strand); # new partial exon
745 push(@newexons,$exonobj);
746 $notmiddle=0; # now we enter totally deleted exons
747 } elsif ($exon eq $endexon) {
748 $exonobj=Bio
::LiveSeq
::Exon
->new('-seq'=>$dna,'-start'=>$nextend,'-end'=>$exon->end,'-strand'=>$strand); # new partial exon
749 push(@newexons,$exonobj);
750 $notmiddle=1; # exiting totally deleted exons
752 if ($notmiddle) { # if before or after exons with deletion
753 push(@newexons,$exon);
757 $self->{'exons'}=\
@newexons;
761 =head2 translation_table
763 Title : translation_table
764 Usage : $name = $obj->translation_table;
765 : $name = $obj->translation_table(11);
766 Function: Returns or sets the translation_table used for translating the
768 If it has never been set, it will return undef.
773 sub translation_table
{
774 my ($self,$value) = @_;
775 if (defined $value) {
776 $self->{'translation_table'} = $value;
778 unless (exists $self->{'translation_table'}) {
781 return $self->{'translation_table'};
788 Usage : $frame = $transcript->frame($label);
789 Function: Returns the frame of a particular nucleotide.
790 Frame can be 0 1 or 2 and means the position in the codon triplet
791 of the particulat nucleotide. 0 is the first codon_position.
792 Codon_position (1 2 3) is simply frame+1.
793 If the label asked for is not inside the Transcript, -1 will be
802 # returns: frame of nucleotide (0 1 2)
805 my ($self,$inputlabel)=@_;
806 my @labels=$self->all_labels;
807 my ($label,$frame,$count);
808 foreach $label (@labels) {
809 if ($inputlabel == $label) {
812 $count++; # 0 1 2 3 4....
814 return (-1); # label not found amid Transcript labels