Bio::Tools::CodonTable and Bio::Tools::IUPAC: prepare with dzil.
[bioperl-live.git] / lib / Bio / SeqUtils.pm
blob309eac21633807d63d382ea482902f5064f65535
1 # BioPerl module for Bio::SeqUtils
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqUtils - Additional methods for PrimarySeq objects
17 =head1 SYNOPSIS
19 use Bio::SeqUtils;
20 # get a Bio::PrimarySeqI compliant object, $seq, somehow
21 $util = Bio::SeqUtils->new();
22 $polypeptide_3char = $util->seq3($seq);
23 # or
24 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
26 # set the sequence string (stored in one char code in the object)
27 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
29 # translate a sequence in all six frames
30 @seqs = Bio::SeqUtils->translate_6frames($seq);
32 # inplace editing of the sequence
33 Bio::SeqUtils->mutate($seq,
34 Bio::LiveSeq::Mutation->new(-seq => 'c',
35 -pos => 3
36 ));
37 # mutate a sequence to desired similarity%
38 $newseq = Bio::SeqUtils-> evolve
39 ($seq, $similarity, $transition_transversion_rate);
41 # concatenate two or more sequences with annotations and features,
42 # the first sequence will be modified
43 Bio::SeqUtils->cat(@seqs);
44 my $catseq=$seqs[0];
46 # truncate a sequence, retaining features and adjusting their
47 # coordinates if necessary
48 my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
50 # reverse complement a sequence and its features
51 my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
53 # simulate cloning of a fragment into a vector. Cut the vector at
54 # positions 1000 and 1100 (deleting positions 1001 to 1099) and
55 # "ligate" a fragment into the sites. The fragment is
56 # reverse-complemented in this example (option "flip").
57 # All features of the vector and fragment are preserved and
58 # features that are affected by the deletion/insertion are
59 # modified accordingly.
60 # $vector and $fragment must be Bio::SeqI compliant objects
61 my $new_molecule = Bio::Sequtils->ligate(
62 -vector => $vector,
63 -fragment => $fragment,
64 -left => 1000,
65 -right => 1100,
66 -flip => 1
69 # delete a segment of a sequence (from pos 1000 to 1100, inclusive),
70 # again preserving features and annotations
71 my $new_molecule = Bio::SeqUtils->cut( $seq, 1000, 1100 );
73 # insert a fragment into a recipient between positions 1000 and
74 # 1001. $recipient is a Bio::SeqI compliant object
75 my $new_molecule = Bio::SeqUtils::PbrTools->insert(
76 $recipient_seq,
77 $fragment_seq,
78 1000
81 =head1 DESCRIPTION
83 This class is a holder of methods that work on Bio::PrimarySeqI-
84 compliant sequence objects, e.g. Bio::PrimarySeq and
85 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
86 interface and should in general not be essential to the primary function
87 of sequence objects. If you are thinking of adding essential
88 functions, it might be better to create your own sequence class.
89 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
91 The methods take as their first argument a sequence object. It is
92 possible to use methods without first creating a SeqUtils object,
93 i.e. use it as an anonymous hash.
95 The first two methods, seq3() and seq3in(), give out or read in protein
96 sequences coded in three letter IUPAC amino acid codes.
98 The next two methods, translate_3frames() and translate_6frames(), wrap
99 around the standard translate method to give back an array of three
100 forward or all six frame translations.
102 The mutate() method mutates the sequence string with a mutation
103 description object.
105 The cat() method concatenates two or more sequences. The first sequence
106 is modified by addition of the remaining sequences. All annotations and
107 sequence features will be transferred.
109 The revcom_with_features() and trunc_with_features() methods are similar
110 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
111 features associated with the sequence as appropriate.
113 There are also methods that simulate molecular cloning with rich
114 sequence objects.
115 The delete() method cuts a segment out of a sequence and re-joins the
116 left and right fragments (like splicing or digesting and re-ligating a
117 molecule). Positions (and types) of sequence features are adjusted
118 accordingly:
119 Features that span the deleted segment are converted to split featuress
120 to indicate the disruption. (Sub)Features that extend into the deleted
121 segment are truncated.
122 A new molecule is created and returned.
124 The insert() method inserts a fragment (which can be a rich Bio::Seq
125 object) into another sequence object adding all annotations and
126 features to the final product.
127 Features that span the insertion site are converted to split features
128 to indicate the disruption.
129 A new feature is added to indicate the inserted fragment itself.
130 A new molecule is created and returned.
132 The ligate() method simulates digesting a recipient (vector) and
133 ligating a fragment into it, which can also be flipped if needed. It
134 is simply a combination of a deletion and an insertion step and
135 returns a new molecule. The rules for modifying feature locations
136 outlined above are also used here, e.g. features that span the cut
137 sites are converted to split features with truncated sub-locations.
140 =head1 FEEDBACK
142 =head2 Mailing Lists
144 User feedback is an integral part of the evolution of this and other
145 Bioperl modules. Send your comments and suggestions preferably to one
146 of the Bioperl mailing lists. Your participation is much appreciated.
148 bioperl-l@bioperl.org - General discussion
149 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151 =head2 Support
153 Please direct usage questions or support issues to the mailing list:
155 I<bioperl-l@bioperl.org>
157 rather than to the module maintainer directly. Many experienced and
158 reponsive experts will be able look at the problem and quickly
159 address it. Please include a thorough description of the problem
160 with code and data examples if at all possible.
162 =head2 Reporting Bugs
164 Report bugs to the Bioperl bug tracking system to help us keep track
165 the bugs and their resolution. Bug reports can be submitted via the
166 web:
168 https://github.com/bioperl/bioperl-live/issues
170 =head1 AUTHOR - Heikki Lehvaslaiho
172 Email: heikki-at-bioperl-dot-org
174 =head1 CONTRIBUTORS
176 Roy R. Chaudhuri - roy.chaudhuri at gmail.com
177 Frank Schwach - frank.schwach@sanger.ac.uk
179 =head1 APPENDIX
181 The rest of the documentation details each of the object
182 methods. Internal methods are usually preceded with a _
184 =cut
186 # Let the code begin...
188 package Bio::SeqUtils;
190 use strict;
191 use warnings;
192 use Scalar::Util qw(blessed);
193 use parent qw(Bio::Root::Root);
195 # new inherited from RootI
197 our %ONECODE = (
198 'Ala' => 'A',
199 'Asx' => 'B',
200 'Cys' => 'C',
201 'Asp' => 'D',
202 'Glu' => 'E',
203 'Phe' => 'F',
204 'Gly' => 'G',
205 'His' => 'H',
206 'Ile' => 'I',
207 'Lys' => 'K',
208 'Leu' => 'L',
209 'Met' => 'M',
210 'Asn' => 'N',
211 'Pro' => 'P',
212 'Gln' => 'Q',
213 'Arg' => 'R',
214 'Ser' => 'S',
215 'Thr' => 'T',
216 'Val' => 'V',
217 'Trp' => 'W',
218 'Xaa' => 'X',
219 'Tyr' => 'Y',
220 'Glx' => 'Z',
221 'Ter' => '*',
222 'Sec' => 'U',
223 'Pyl' => 'O',
224 'Xle' => 'J'
227 our %THREECODE = (
228 'A' => 'Ala',
229 'B' => 'Asx',
230 'C' => 'Cys',
231 'D' => 'Asp',
232 'E' => 'Glu',
233 'F' => 'Phe',
234 'G' => 'Gly',
235 'H' => 'His',
236 'I' => 'Ile',
237 'K' => 'Lys',
238 'L' => 'Leu',
239 'M' => 'Met',
240 'N' => 'Asn',
241 'P' => 'Pro',
242 'Q' => 'Gln',
243 'R' => 'Arg',
244 'S' => 'Ser',
245 'T' => 'Thr',
246 'V' => 'Val',
247 'W' => 'Trp',
248 'Y' => 'Tyr',
249 'Z' => 'Glx',
250 'X' => 'Xaa',
251 '*' => 'Ter',
252 'U' => 'Sec',
253 'O' => 'Pyl',
254 'J' => 'Xle'
257 =head2 seq3
259 Title : seq3
260 Usage : $string = Bio::SeqUtils->seq3($seq)
261 Function: Read only method that returns the amino acid sequence as a
262 string of three letter codes. alphabet has to be
263 'protein'. Output follows the IUPAC standard plus 'Ter' for
264 terminator. Any unknown character, including the default
265 unknown character 'X', is changed into 'Xaa'. A noncoded
266 aminoacid selenocystein is recognized (Sec, U).
268 Returns : A scalar
269 Args : character used for stop in the protein sequence optional,
270 defaults to '*' string used to separate the output amino
271 acid codes, optional, defaults to ''
273 =cut
275 sub seq3 {
276 my ( $self, $seq, $stop, $sep ) = @_;
278 $seq->isa('Bio::PrimarySeqI')
279 || $self->throw('Not a Bio::PrimarySeqI object but [$self]');
280 $seq->alphabet eq 'protein'
281 || $self->throw('Not a protein sequence');
283 if ( defined $stop ) {
284 length $stop != 1
285 and $self->throw('One character stop needed, not [$stop]');
286 $THREECODE{$stop} = "Ter";
288 $sep ||= '';
290 my $aa3s;
291 foreach my $aa ( split //, uc $seq->seq ) {
292 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa} . $sep, next;
293 $aa3s .= 'Xaa' . $sep;
295 $sep and substr( $aa3s, -( length $sep ), length $sep ) = '';
296 return $aa3s;
299 =head2 seq3in
301 Title : seq3in
302 Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
303 Function: Method for changing of the sequence of a
304 Bio::PrimarySeqI sequence object. The three letter amino
305 acid input string is converted into one letter code. Any
306 unknown character triplet, including the default 'Xaa', is
307 converted into 'X'.
309 Returns : Bio::PrimarySeq object
310 Args : sequence string
311 optional character to be used for stop in the protein sequence,
312 defaults to '*'
313 optional character to be used for unknown in the protein sequence,
314 defaults to 'X'
316 =cut
318 sub seq3in {
319 my ( $self, $seq, $string, $stop, $unknown ) = @_;
321 $seq->isa('Bio::PrimarySeqI')
322 || $self->throw("Not a Bio::PrimarySeqI object but [$self]");
323 $seq->alphabet eq 'protein'
324 || $self->throw('Not a protein sequence');
326 if ( defined $stop ) {
327 length $stop != 1
328 and $self->throw("One character stop needed, not [$stop]");
329 $ONECODE{'Ter'} = $stop;
331 if ( defined $unknown ) {
332 length $unknown != 1
333 and $self->throw("One character stop needed, not [$unknown]");
334 $ONECODE{'Xaa'} = $unknown;
337 my ( $aas, $aa3 );
338 my $length = ( length $string ) - 2;
339 for ( my $i = 0 ; $i < $length ; $i += 3 ) {
340 $aa3 = substr( $string, $i, 3 );
341 $aa3 = ucfirst( lc($aa3) );
342 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
343 $aas .= $ONECODE{'Xaa'};
345 $seq->seq($aas);
346 return $seq;
349 =head2 translate_3frames
351 Title : translate_3frames
352 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
353 Function: Translate a nucleotide sequence in three forward frames.
354 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
355 Returns : An array of seq objects
356 Args : sequence object
357 same arguments as to Bio::PrimarySeqI::translate
359 =cut
361 sub translate_3frames {
362 my ( $self, $seq, @args ) = @_;
364 $self->throw( 'Object [$seq] '
365 . 'of class ['
366 . ref($seq)
367 . '] can not be translated.' )
368 unless $seq->can('translate');
370 my ( $stop, $unknown, $frame, $tableid, $fullCDS, $throw ) = @args;
371 my @seqs;
372 my $f = 0;
373 while ( $f != 3 ) {
374 my $translation =
375 $seq->translate( $stop, $unknown, $f, $tableid, $fullCDS, $throw );
376 $translation->id( $seq->id . "-" . $f . "F" );
377 push @seqs, $translation;
378 $f++;
381 return @seqs;
384 =head2 translate_6frames
386 Title : translate_6frames
387 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
388 Function: translate a nucleotide sequence in all six frames
389 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
390 '-0R', '-1R', '-2R'.
391 Returns : An array of seq objects
392 Args : sequence object
393 same arguments as to Bio::PrimarySeqI::translate
395 =cut
397 sub translate_6frames {
398 my ( $self, $seq, @args ) = @_;
400 my @seqs = $self->translate_3frames( $seq, @args );
401 my @seqs2 = $self->translate_3frames( $seq->revcom, @args );
402 foreach my $seq2 (@seqs2) {
403 my ($tmp) = $seq2->id;
404 $tmp =~ s/F$/R/g;
405 $seq2->id($tmp);
407 return @seqs, @seqs2;
410 =head2 valid_aa
412 Title : valid_aa
413 Usage : my @aa = $table->valid_aa
414 Function: Retrieves a list of the valid amino acid codes.
415 The list is ordered so that first 21 codes are for unique
416 amino acids. The rest are ['B', 'Z', 'X', '*'].
417 Returns : array of all the valid amino acid codes
418 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
419 1 -> return list of 3 letter aa codes,
420 2 -> return associative array of both ]
422 =cut
424 sub valid_aa {
425 my ( $self, $code ) = @_;
427 if ( !$code ) {
428 my @codes;
429 foreach my $c ( sort values %ONECODE ) {
430 push @codes, $c unless ( $c =~ /[BZX\*]/ );
432 push @codes, qw(B Z X *); # so they are in correct order ?
433 return @codes;
435 elsif ( $code == 1 ) {
436 my @codes;
437 foreach my $c ( sort keys %ONECODE ) {
438 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
440 push @codes, ( 'Asx', 'Glx', 'Xaa', 'Ter' );
441 return @codes;
443 elsif ( $code == 2 ) {
444 my %codes = %ONECODE;
445 foreach my $c ( keys %ONECODE ) {
446 my $aa = $ONECODE{$c};
447 $codes{$aa} = $c;
449 return %codes;
451 else {
452 $self->warn(
453 "unrecognized code in " . ref($self) . " method valid_aa()" );
454 return ();
458 =head2 mutate
460 Title : mutate
461 Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
462 Function: Inplace editing of the sequence.
464 The second argument can be a Bio::LiveSeq::Mutation object
465 or an array of them. The mutations are applied sequentially
466 checking only that their position is within the current
467 sequence. Insertions are inserted before the given
468 position.
470 Returns : boolean
471 Args : sequence object
472 mutation, a Bio::LiveSeq::Mutation object, or an array of them
474 See L<Bio::LiveSeq::Mutation>.
476 =cut
478 sub mutate {
479 my ( $self, $seq, @mutations ) = @_;
481 $self->throw( 'Object [$seq] '
482 . 'of class ['
483 . ref($seq)
484 . '] should be a Bio::PrimarySeqI ' )
485 unless $seq->isa('Bio::PrimarySeqI');
486 $self->throw( 'Object [$mutations[0]] '
487 . 'of class ['
488 . ref( $mutations[0] )
489 . '] should be a Bio::LiveSeq::Mutation' )
490 unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
492 foreach my $mutation (@mutations) {
493 $self->throw('Attempting to mutate sequence beyond its length')
494 unless $mutation->pos - 1 <= $seq->length;
496 my $string = $seq->seq;
497 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
498 $seq->seq($string);
503 =head2 cat
505 Title : cat
506 Usage : Bio::SeqUtils->cat(@seqs);
507 my $catseq=$seqs[0];
508 Function: Concatenates a list of Bio::Seq objects, adding them all on to the
509 end of the first sequence. Annotations and sequence features are
510 copied over from any additional objects, and the coordinates of any
511 copied features are adjusted appropriately.
512 Returns : a boolean
513 Args : array of sequence objects
515 Note that annotations have no sequence locations. If you concatenate
516 sequences with the same annotations they will all be added.
518 =cut
520 sub cat {
521 my ( $self, $seq, @seqs ) = @_;
522 $self->throw( 'Object [$seq] '
523 . 'of class ['
524 . ref($seq)
525 . '] should be a Bio::PrimarySeqI ' )
526 unless $seq->isa('Bio::PrimarySeqI');
528 for my $catseq (@seqs) {
529 $self->throw( 'Object [$catseq] '
530 . 'of class ['
531 . ref($catseq)
532 . '] should be a Bio::PrimarySeqI ' )
533 unless $catseq->isa('Bio::PrimarySeqI');
535 $self->throw(
536 'Trying to concatenate sequences with different alphabets: '
537 . $seq->display_id . '('
538 . $seq->alphabet
539 . ') and '
540 . $catseq->display_id . '('
541 . $catseq->alphabet
542 . ')' )
543 unless $catseq->alphabet eq $seq->alphabet;
545 my $length = $seq->length;
546 $seq->seq( $seq->seq . $catseq->seq );
548 # move annotations
549 if ( $seq->isa("Bio::AnnotatableI")
550 and $catseq->isa("Bio::AnnotatableI") )
552 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
554 foreach my $value ( $catseq->annotation->get_Annotations($key) )
556 $seq->annotation->add_Annotation( $key, $value );
561 # move SeqFeatures
562 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI') ) {
563 for my $feat ( $catseq->get_SeqFeatures ) {
564 $seq->add_SeqFeature( $self->_coord_adjust( $feat, $length ) );
572 =head2 trunc_with_features
574 Title : trunc_with_features
575 Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
576 Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
577 where necessary. Features that partially overlap the region have
578 their location changed to a Bio::Location::Fuzzy.
579 Returns : A new sequence object
580 Args : A sequence object, start coordinate, end coordinate (inclusive)
583 =cut
585 sub trunc_with_features {
586 use Bio::Range;
587 my ( $self, $seq, $start, $end ) = @_;
588 $self->throw( 'Object [$seq] '
589 . 'of class ['
590 . ref($seq)
591 . '] should be a Bio::SeqI ' )
592 unless $seq->isa('Bio::SeqI');
593 my $trunc = $seq->trunc( $start, $end );
594 my $truncrange =
595 Bio::Range->new( -start => $start, -end => $end, -strand => 0 );
597 # make sure that there is no annotation or features in $trunc
598 # (->trunc() now clone objects except for Bio::Seq::LargePrimarySeq)
599 $trunc->annotation->remove_Annotations;
600 $trunc->remove_SeqFeatures;
602 # move annotations
603 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
604 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
605 $trunc->annotation->add_Annotation( $key, $value );
609 # move features
610 foreach (
611 grep {
612 $_ = $self->_coord_adjust( $_, 1 - $start, $end + 1 - $start )
613 if $_->overlaps($truncrange)
614 } $seq->get_SeqFeatures
617 $trunc->add_SeqFeature($_);
619 return $trunc;
622 =head2 delete
624 Title : delete
625 Function: cuts a segment out of a sequence and re-joins the left and right fragments
626 (like splicing or digesting and re-ligating a molecule).
627 Positions (and types) of sequence features are adjusted accordingly:
628 Features that span the cut site are converted to split featuress to
629 indicate the disruption.
630 Features that extend into the cut-out fragment are truncated.
631 A new molecule is created and returned.
632 Usage : my $cutseq = Bio::SeqUtils::PbrTools->cut( $seq, 1000, 1100 );
633 Args : a Bio::PrimarySeqI compliant object to cut,
634 first nt of the segment to be deleted
635 last nt of the segment to be deleted
636 optional:
637 hash-ref of options:
638 clone_obj: if true, clone the input sequence object rather
639 than calling "new" on the object's class
641 Returns : a new Bio::Seq object
643 =cut
645 sub delete {
646 my $self = shift;
647 my ( $seq, $left, $right, $opts_ref ) = @_;
648 $self->throw( 'was expecting 3-4 paramters but got ' . @_ )
649 unless @_ == 3 || @_ == 4;
651 $self->throw(
652 'Object of class [' . ref($seq) . '] should be a Bio::PrimarySeqI ' )
653 unless blessed($seq) && $seq->isa('Bio::PrimarySeqI');
655 $self->throw("Left coordinate ($left) must be >= 1") if $left < 1;
656 if ( $right > $seq->length ) {
657 $self->throw( "Right coordinate ($right) must be less than "
658 . 'sequence length ('
659 . $seq->length
660 . ')' );
663 # piece together the sequence string of the remaining fragments
664 my $left_seq = $seq->subseq( 1, $left - 1 );
665 my $right_seq = $seq->subseq( $right + 1, $seq->length );
666 if ( !$left_seq || !$right_seq ) {
667 $self->throw(
668 'could not assemble sequences. At least one of the fragments is empty'
671 my $seq_str = $left_seq . $right_seq;
673 # create the new seq object with the same class as the recipient
674 # or (if requested), make a clone of the existing object. In the
675 # latter case we need to remove sequence features from the cloned
676 # object instead of copying them
677 my $product;
678 if ( $opts_ref->{clone_obj} ) {
679 $product = $self->_new_seq_via_clone( $seq, $seq_str );
681 else {
682 $product = $self->_new_seq_from_old( $seq, { seq => $seq_str } );
685 # move sequence features
686 if ( $product->isa('Bio::SeqI') && $seq->isa('Bio::SeqI') ) {
687 for my $feat ( $seq->get_SeqFeatures ) {
688 my $adjfeat = $self->_coord_adjust_deletion( $feat, $left, $right );
689 $product->add_SeqFeature($adjfeat) if $adjfeat;
693 # add a feature to annotatde the deletion
694 my $deletion_feature = Bio::SeqFeature::Generic->new(
695 -primary_tag => 'misc_feature',
696 -tag => { note => 'deletion of ' . ( $right - $left + 1 ) . 'bp' },
697 -location => Bio::Location::Simple->new(
698 -start => $left - 1,
699 -end => $left,
700 -location_type => 'IN-BETWEEN'
703 $product->add_SeqFeature($deletion_feature);
704 return $product;
707 =head2 insert
709 Title : insert
710 Function: inserts a fragment (a Bio::Seq object) into a nother sequence object
711 adding all annotations and features to the final product.
712 Features that span the insertion site are converted to split
713 features to indicate the disruption.
714 A new feature is added to indicate the inserted fragment itself.
715 A new molecule is created and returned.
716 Usage : # insert a fragment after pos 1000
717 my $insert_seq = Bio::SeqUtils::PbrTools->insert(
718 $recipient_seq,
719 $fragment_seq,
720 1000
722 Args : recipient sequence (a Bio::PrimarySeqI compliant object),
723 a fragmetn to insert (Bio::PrimarySeqI compliant object),
724 insertion position (fragment is inserted to the right of this pos)
725 pos=0 will prepend the fragment to the recipient
726 optional:
727 hash-ref of options:
728 clone_obj: if true, clone the input sequence object rather
729 than calling "new" on the object's class
730 Returns : a new Bio::Seq object
732 =cut
734 sub insert {
735 my $self = shift;
736 my ( $recipient, $fragment, $insert_pos, $opts_ref ) = @_;
737 $self->throw( 'was expecting 3-4 paramters but got ' . @_ )
738 unless @_ == 3 || @_ == 4;
740 $self->throw( 'Recipient object of class ['
741 . ref($recipient)
742 . '] should be a Bio::PrimarySeqI ' )
743 unless blessed($recipient) && $recipient->isa('Bio::PrimarySeqI');
745 $self->throw( 'Fragment object of class ['
746 . ref($fragment)
747 . '] should be a Bio::PrimarySeqI ' )
748 unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
750 $self->throw( 'Can\'t concatenate sequences with different alphabets: '
751 . 'recipient is '
752 . $recipient->alphabet
753 . ' and fragment is '
754 . $fragment->alphabet )
755 unless $recipient->alphabet eq $fragment->alphabet;
757 if ( $insert_pos < 0 or $insert_pos > $recipient->length ) {
758 $self->throw( "insertion position ($insert_pos) must be between 0 and "
759 . 'recipient sequence length ('
760 . $recipient->length
761 . ')' );
764 if ( $fragment->can('is_circular') && $fragment->is_circular ) {
765 $self->throw('Can\'t insert circular fragments');
768 if ( !$recipient->seq ) {
769 $self->throw(
770 'Recipient has no sequence, can not insert into this object');
773 # construct raw sequence of the new molecule
774 my $left_seq =
775 $insert_pos > 0
776 ? $recipient->subseq( 1, $insert_pos )
777 : '';
778 my $mid_seq = $fragment->seq;
779 my $right_seq =
780 $insert_pos < $recipient->length
781 ? $recipient->subseq( $insert_pos + 1, $recipient->length )
782 : '';
783 my $seq_str = $left_seq . $mid_seq . $right_seq;
785 # create the new seq object with the same class as the recipient
786 # or (if requested), make a clone of the existing object. In the
787 # latter case we need to remove sequence features from the cloned
788 # object instead of copying them
789 my $product;
790 if ( $opts_ref->{clone_obj} ) {
791 $product = $self->_new_seq_via_clone( $recipient, $seq_str );
793 else {
794 my @desc;
795 push @desc, 'Inserted fragment: ' . $fragment->desc
796 if defined $fragment->desc;
797 push @desc, 'Recipient: ' . $recipient->desc
798 if defined $recipient->desc;
799 $product = $self->_new_seq_from_old(
800 $recipient,
802 seq => $seq_str,
803 display_id => $recipient->display_id,
804 accession_number => $recipient->accession_number || '',
805 alphabet => $recipient->alphabet,
806 desc => join( '; ', @desc ),
807 verbose => $recipient->verbose || $fragment->verbose,
808 is_circular => $recipient->is_circular || 0,
812 } # if clone_obj
814 # move annotations from fragment to product
815 if ( $product->isa("Bio::AnnotatableI")
816 && $fragment->isa("Bio::AnnotatableI") )
818 foreach my $key ( $fragment->annotation->get_all_annotation_keys ) {
819 foreach my $value ( $fragment->annotation->get_Annotations($key) ) {
820 $product->annotation->add_Annotation( $key, $value );
825 # move sequence features to product with adjusted coordinates
826 if ( $product->isa('Bio::SeqI') ) {
828 # for the fragment, just shift the features to new position
829 if ( $fragment->isa('Bio::SeqI') ) {
830 for my $feat ( $fragment->get_SeqFeatures ) {
831 my $adjfeat = $self->_coord_adjust( $feat, $insert_pos );
832 $product->add_SeqFeature($adjfeat) if $adjfeat;
836 # for recipient, shift and modify features according to insertion.
837 if ( $recipient->isa('Bio::SeqI') ) {
838 for my $feat ( $recipient->get_SeqFeatures ) {
839 my $adjfeat =
840 $self->_coord_adjust_insertion( $feat, $insert_pos,
841 $fragment->length );
842 $product->add_SeqFeature($adjfeat) if $adjfeat;
847 # add a feature to annotate the insertion
848 my $insertion_feature = Bio::SeqFeature::Generic->new(
849 -start => $insert_pos + 1,
850 -end => $insert_pos + $fragment->length,
851 -primary_tag => 'misc_feature',
852 -tag => { note => 'inserted fragment' },
854 $product->add_SeqFeature($insertion_feature);
856 return $product;
859 =head2 ligate
861 title : ligate
862 function: pastes a fragment (which can also have features) into a recipient
863 sequence between two "cut" sites, preserving features and adjusting
864 their locations.
865 This is a shortcut for deleting a segment from a sequence object followed
866 by an insertion of a fragmnet and is supposed to be used to simulate
867 in-vitro cloning where a recipient (a vector) is digested and a fragment
868 is then ligated into the recipient molecule. The fragment can be flipped
869 (reverse-complemented with all its features).
870 A new sequence object is returned to represent the product of the reaction.
871 Features and annotations are transferred from the insert to the product
872 and features on the recipient are adjusted according to the methods
873 L</"delete"> amd L</"insert">:
874 Features spanning the insertion site will be split up into two sub-locations.
875 (Sub-)features in the deleted region are themselves deleted.
876 (Sub-)features that extend into the deleted region are truncated.
877 The class of the product object depends on the class of the recipient (vector)
878 sequence object. if it is not possible to instantiate a new
879 object of that class, a Bio::Primaryseq object is created instead.
880 usage : # insert the flipped fragment between positions 1000 and 1100 of the
881 # vector, i.e. everything between these two positions is deleted and
882 # replaced by the fragment
883 my $new_molecule = Bio::Sequtils::Pbrtools->ligate(
884 -recipient => $vector,
885 -fragment => $fragment,
886 -left => 1000,
887 -right => 1100,
888 -flip => 1,
889 -clone_obj => 1
891 args : recipient: the recipient/vector molecule
892 fragment: molecule that is to be ligated into the vector
893 left: left cut site (fragment will be inserted to the right of
894 this position)
895 optional:
896 right: right cut site (fragment will be inseterted to the
897 left of this position). defaults to left+1
898 flip: boolean, if true, the fragment is reverse-complemented
899 (including features) before inserting
900 clone_obj: if true, clone the recipient object to create the product
901 instead of calling "new" on its class
902 returns : a new Bio::Seq object of the ligated fragments
904 =cut
906 sub ligate {
907 my $self = shift;
908 my ( $recipient, $fragment, $left, $right, $flip, $clone_obj ) =
909 $self->_rearrange( [qw(RECIPIENT FRAGMENT LEFT RIGHT FLIP CLONE_OBJ )],
910 @_ );
911 $self->throw("missing required parameter 'recipient'") unless $recipient;
912 $self->throw("missing required parameter 'fragment'") unless $fragment;
913 $self->throw("missing required parameter 'left'") unless defined $left;
915 $right ||= $left + 1;
917 $self->throw(
918 "Fragment must be a Bio::PrimarySeqI compliant object but it is a "
919 . ref($fragment) )
920 unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
922 $fragment = $self->revcom_with_features($fragment) if $flip;
924 my $opts_ref = {};
925 $opts_ref->{clone_obj} = 1 if $clone_obj;
927 # clone in two steps: first delete between the insertion sites,
928 # then insert the fragment. Step 1 is skipped if insert positions
929 # are adjacent (no deletion)
930 my ( $product1, $product2 );
931 eval {
932 if ( $right == $left + 1 ) {
933 $product1 = $recipient;
935 else {
936 $product1 =
937 $self->delete( $recipient, $left + 1, $right - 1, $opts_ref );
940 $self->throw( "Failed in step 1 (cut recipient): " . $@ ) if $@;
941 eval { $product2 = $self->insert( $product1, $fragment, $left, $opts_ref ) };
942 $self->throw( "Failed in step 2 (insert fragment): " . $@ ) if $@;
944 return $product2;
948 =head2 _coord_adjust_deletion
950 title : _coord_adjust_deletion
951 function: recursively adjusts coordinates of seqfeatures on a molecule
952 where a segment has been deleted.
953 (sub)features that span the deletion site become split features.
954 (sub)features that extend into the deletion site are truncated.
955 A note is added to the feature to inform about the size and
956 position of the deletion.
957 usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_deletion(
958 $feature,
959 $start,
960 $end
962 args : a Bio::SeqFeatureI compliant object,
963 start (inclusive) position of the deletion site,
964 end (inclusive) position of the deletion site
965 returns : a Bio::SeqFeatureI compliant object
967 =cut
969 sub _coord_adjust_deletion {
970 my ( $self, $feat, $left, $right ) = @_;
972 $self->throw( 'object [$feat] '
973 . 'of class ['
974 . ref($feat)
975 . '] should be a Bio::SeqFeatureI ' )
976 unless $feat->isa('Bio::SeqFeatureI');
977 $self->throw('missing coordinates: need a left and a right position')
978 unless defined $left && defined $right;
980 if ( $left > $right ) {
981 if ( $feat->can('is_circular') && $feat->is_circular ) {
983 # todo handle circular molecules
984 $self->throw(
985 'can not yet handle deletions in circular molecules if deletion spans origin'
988 else {
989 $self->throw(
990 "left coordinate ($left) must be less than right ($right)"
991 . " but it was greater" );
994 my $deletion = Bio::Location::Simple->new(
995 -start => $left,
996 -end => $right,
998 my $del_length = $right - $left + 1;
1000 my @adjsubfeat;
1001 for my $subfeat ( $feat->get_SeqFeatures ) {
1002 my $adjsubfeat =
1003 $self->_coord_adjust_deletion( $subfeat, $left, $right );
1004 push @adjsubfeat, $adjsubfeat if $adjsubfeat;
1007 my @loc;
1008 my $note;
1009 for ( $feat->location->each_Location ) {
1010 next if $deletion->contains($_); # this location will be deleted;
1011 my $strand = $_->strand;
1012 my $type = $_->location_type;
1013 my $start = $_->start;
1014 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1015 my $end = $_->end;
1016 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1017 my @newcoords = ();
1018 if ( $start < $deletion->start && $end > $deletion->end )
1019 { # split the feature
1020 @newcoords = (
1021 [ $start, ( $deletion->start - 1 ), $start_type, $end_type ],
1023 ( $deletion->start ), $end - $del_length,
1024 $start_type, $end_type
1027 $note =
1028 $del_length
1029 . 'bp internal deletion between pos '
1030 . ( $deletion->start - 1 ) . ' and '
1031 . $deletion->start;
1033 elsif ( $_->start < $deletion->start && $_->end >= $deletion->start )
1034 { # truncate feature end
1035 @newcoords =
1036 ( [ $start, ( $deletion->start - 1 ), $start_type, $end_type ] );
1037 $note =
1038 ( $end - $deletion->start + 1 ) . 'bp deleted from feature ';
1039 if ( $feat->strand ) {
1040 $note .= $feat->strand == 1 ? "3' " : "5' ";
1042 $note .= 'end';
1044 elsif ( $_->start <= $deletion->end && $_->end > $deletion->end )
1045 { # truncate feature start and shift end
1046 @newcoords = (
1048 ( $deletion->start ), $end - $del_length,
1049 $start_type, $end_type
1052 $note =
1053 ( $deletion->end - $start + 1 ) . 'bp deleted from feature ';
1054 if ( $feat->strand ) {
1055 $note .= $feat->strand == 1 ? "5' end" : "3' end";
1057 else {
1058 $note .= 'start';
1061 elsif ( $start >= $deletion->end ) { # just shift entire location
1062 @newcoords = (
1064 $start - $del_length, $end - $del_length,
1065 $start_type, $end_type
1069 else { # not affected by deletion
1070 @newcoords = ( [ $start, $end, $start_type, $end_type ] );
1073 # if we have no coordinates, we return nothing
1074 # the feature is deleted
1075 return unless @newcoords;
1077 my @subloc =
1078 $self->_location_objects_from_coordinate_list( \@newcoords, $strand,
1079 $type );
1080 push @loc, $self->_single_loc_object_from_collection(@subloc);
1081 } # each location
1083 # create new feature based on original one and move annotation across
1084 my $newfeat =
1085 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1086 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1087 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1088 $newfeat->annotation->add_Annotation( $key, $value );
1091 foreach my $key ( $feat->get_all_tags() ) {
1092 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1095 # If we have a note about the deleted bases, add it
1096 if ($note) {
1097 $newfeat->add_tag_value( 'note', $note );
1100 # set modified location(s) for the new feature and
1101 # add its subfeatures if any
1102 my $loc = $self->_single_loc_object_from_collection(@loc);
1103 $loc ? $newfeat->location($loc) : return;
1104 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1106 return $newfeat;
1110 =head2 _coord_adjust_insertion
1112 title : _coord_adjust_insertion
1113 function: recursively adjusts coordinates of seqfeatures on a molecule
1114 where another sequence has been inserted.
1115 (sub)features that span the insertion site become split features
1116 and a note is added about the size and positin of the insertion.
1117 Features with an IN-BETWEEN location at the insertion site
1118 are lost (such features can only exist between adjacent bases)
1119 usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_insertion(
1120 $feature,
1121 $insert_pos,
1122 $insert_length
1124 args : a Bio::SeqFeatureI compliant object,
1125 insertion position (insert to the right of this position)
1126 length of inserted fragment
1127 returns : a Bio::SeqFeatureI compliant object
1129 =cut
1131 sub _coord_adjust_insertion {
1132 my ( $self, $feat, $insert_pos, $insert_len ) = @_;
1134 $self->throw( 'object [$feat] '
1135 . 'of class ['
1136 . ref($feat)
1137 . '] should be a Bio::SeqFeatureI ' )
1138 unless $feat->isa('Bio::SeqFeatureI');
1139 $self->throw('missing insert position') unless defined $insert_pos;
1140 $self->throw('missing insert length') unless defined $insert_len;
1142 my @adjsubfeat;
1143 for my $subfeat ( $feat->get_SeqFeatures ) {
1144 push @adjsubfeat,
1145 $self->_coord_adjust_insertion( $subfeat, $insert_pos, $insert_len );
1148 my @loc;
1149 my $note;
1150 for ( $feat->location->each_Location ) {
1152 # loose IN-BETWEEN features at the insertion site
1153 next
1154 if ( $_->location_type eq 'IN-BETWEEN' && $_->start == $insert_pos );
1155 my $strand = $_->strand;
1156 my $type = $_->location_type;
1157 my $start = $_->start;
1158 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1159 my $end = $_->end;
1160 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1161 my @newcoords = ();
1162 if ( $start <= $insert_pos && $end > $insert_pos ) { # split the feature
1163 @newcoords = (
1164 [ $start, $insert_pos, $start_type, $end_type ],
1166 ( $insert_pos + 1 + $insert_len ), $end + $insert_len,
1167 $start_type, $end_type
1170 $note =
1171 $insert_len
1172 . 'bp internal insertion between pos '
1173 . $insert_pos . ' and '
1174 . ( $insert_pos + $insert_len + 1 );
1177 elsif ( $start > $insert_pos ) { # just shift entire location
1178 @newcoords = (
1180 $start + $insert_len, $end + $insert_len,
1181 $start_type, $end_type
1185 else { # not affected
1186 @newcoords = ( [ $start, $end, $start_type, $end_type ] );
1189 # if we have deleted all coordinates, return nothing
1190 # (possible if all locations are IN-BETWEEN)
1191 return unless @newcoords;
1193 my @subloc =
1194 $self->_location_objects_from_coordinate_list( \@newcoords, $strand,
1195 $type );
1197 # put together final location which could be a split now
1198 push @loc, $self->_single_loc_object_from_collection(@subloc);
1199 } # each location
1201 # create new feature based on original one and move annotation across
1202 my $newfeat =
1203 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1204 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1205 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1206 $newfeat->annotation->add_Annotation( $key, $value );
1209 foreach my $key ( $feat->get_all_tags() ) {
1210 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1213 # If we have a note about the inserted bases, add it
1214 if ($note) {
1215 $newfeat->add_tag_value( 'note', $note );
1218 # set modified location(s) for the new feature and
1219 # add its subfeatures if any
1220 my $loc = $self->_single_loc_object_from_collection(@loc);
1221 $loc ? $newfeat->location($loc) : return;
1222 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1224 return $newfeat;
1228 =head2 _single_loc_object_from_collection
1230 Title : _single_loc_object_from_collection
1231 Function: takes an array of location objects. Returns either a split
1232 location object if there are more than one locations in the
1233 array or returns the single location if there is only one
1234 Usage : my $loc = _single_loc_object_from_collection( @sublocs );
1235 Args : array of Bio::Location objects
1236 Returns : a single Bio:;Location object containing all locations
1238 =cut
1240 sub _single_loc_object_from_collection {
1241 my ( $self, @locs ) = @_;
1242 my $loc;
1243 if ( @locs > 1 ) {
1244 $loc = Bio::Location::Split->new;
1245 $loc->add_sub_Location(@locs);
1247 elsif ( @locs == 1 ) {
1248 $loc = shift @locs;
1250 return $loc;
1251 } # _single_loc_object_from_collection
1253 =head2 _location_objects_from_coordinate_list
1255 Title : _location_objects_from_coordinate_list
1256 Function: takes an array-ref of start/end coordinates, a strand and a
1257 type and returns a list of Bio::Location objects (Fuzzy by
1258 default, Simple in case of in-between coordinates).
1259 If location type is not "IN-BETWEEN", individual types may be
1260 passed in for start and end location as per Bio::Location::Fuzzy
1261 documentation.
1262 Usage : my @loc_objs = $self->_location_objects_from_coordinate_list(
1263 \@coords,
1264 $strand,
1265 $type
1267 Args : array-ref of array-refs each containing:
1268 start, end [, start-type, end-type]
1269 where types are optional. If given, must be
1270 a one of ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN')
1271 strand (all locations must be on same strand)
1272 location-type (EXACT, IN-BETWEEN etc)
1273 Returns : list of Bio::Location objects
1275 =cut
1277 sub _location_objects_from_coordinate_list {
1278 my $self = shift;
1279 my ( $coords_ref, $strand, $type ) = @_;
1280 $self->throw( 'expected 3 parameters but got ' . @_ ) unless @_ == 3;
1281 $self->throw('first argument must be an ARRAY reference#')
1282 unless ref($coords_ref) eq 'ARRAY';
1284 my @loc;
1285 foreach my $coords_set (@$coords_ref) {
1286 my ( $start, $end, $start_type, $end_type ) = @$coords_set;
1288 # taken from Bio::SeqUtils::_coord_adjust
1289 if ( $type ne 'IN-BETWEEN' ) {
1290 my $loc = Bio::Location::Fuzzy->new(
1291 -start => $start,
1292 -end => $end,
1293 -strand => $strand,
1294 -location_type => $type
1296 $loc->start_pos_type($start_type) if $start_type;
1297 $loc->end_pos_type($end_type) if $end_type;
1298 push @loc, $loc;
1300 else {
1301 push @loc,
1302 Bio::Location::Simple->new(
1303 -start => $start,
1304 -end => $end,
1305 -strand => $strand,
1306 -location_type => $type
1309 } # each coords_set
1310 return @loc;
1311 } # _location_objects_from_coordinate_list
1313 =head2 _new_seq_via_clone
1315 Title : _new_seq_via_clone
1316 Function: clone a sequence object using Bio::Root::Root::clone and set the new sequence string
1317 sequence features are removed.
1318 Usage : my $new_seq = $self->_new_seq_via_clone( $seq_obj, $seq_str );
1319 Args : original seq object [, new sequence string]
1320 Returns : a clone of the original sequence object, optionally with new sequence string
1322 =cut
1324 sub _new_seq_via_clone {
1325 my ( $self, $in_seq_obj, $seq_str ) = @_;
1326 my $out_seq_obj = $in_seq_obj->clone;
1327 $out_seq_obj->remove_SeqFeatures if $out_seq_obj->can('remove_SeqFeatures');
1328 if ( blessed $out_seq_obj->seq
1329 && $out_seq_obj->seq->isa('Bio::PrimarySeq') )
1331 $out_seq_obj->seq->seq($seq_str);
1333 else {
1334 $out_seq_obj->seq($seq_str);
1336 return $out_seq_obj;
1338 } # _new_seq_via_clone
1340 =head2 _new_seq_from_old
1342 Title : _new_seq_from_old
1343 Function: creates a new sequence obejct, if possible of the same class as the old and adds
1344 attributes to it. Also copies annotation across to the new object.
1345 Usage : my $new_seq = $self->_new_seq_from_old( $seq_obj, { seq => $seq_str, display_id => 'some_ID'});
1346 Args : old sequence object
1347 hashref of attributes for the new sequence (sequence string etc.)
1348 Returns : a new Bio::Seq object
1350 =cut
1352 sub _new_seq_from_old {
1353 my ( $self, $in_seq_obj, $attr ) = @_;
1354 $self->throw('attributes must be a hashref')
1355 if $attr && ref($attr) ne 'HASH';
1357 my $seqclass;
1358 if ( $in_seq_obj->can_call_new ) {
1359 $seqclass = ref($in_seq_obj);
1361 else {
1362 $seqclass = 'Bio::Primaryseq';
1363 $self->_attempt_to_load_seq;
1366 my $out_seq_obj = $seqclass->new(
1367 -seq => $attr->{seq} || $in_seq_obj->seq,
1368 -display_id => $attr->{display_id} || $in_seq_obj->display_id,
1369 -accession_number => $attr->{accession_number}
1370 || $in_seq_obj->accession_number
1371 || '',
1372 -alphabet => $in_seq_obj->alphabet,
1373 -desc => $attr->{desc} || $in_seq_obj->desc,
1374 -verbose => $attr->{verbose} || $in_seq_obj->verbose,
1375 -is_circular => $attr->{is_circular} || $in_seq_obj->is_circular || 0,
1378 # move the annotation across to the product
1379 if ( $out_seq_obj->isa("Bio::AnnotatableI")
1380 && $in_seq_obj->isa("Bio::AnnotatableI") )
1382 foreach my $key ( $in_seq_obj->annotation->get_all_annotation_keys ) {
1383 foreach my $value ( $in_seq_obj->annotation->get_Annotations($key) )
1385 $out_seq_obj->annotation->add_Annotation( $key, $value );
1389 return $out_seq_obj;
1390 } # _new_seq_from_old
1392 =head2 _coord_adjust
1394 Title : _coord_adjust
1395 Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
1396 Function: Recursive subroutine to adjust the coordinates of a feature
1397 and all its subfeatures. If a sequence length is specified, then
1398 any adjusted features that have locations beyond the boundaries
1399 of the sequence are converted to Bio::Location::Fuzzy objects.
1401 Returns : A Bio::SeqFeatureI compliant object.
1402 Args : A Bio::SeqFeatureI compliant object,
1403 the number of bases to add to the coordinates
1404 (optional) the length of the parent sequence
1407 =cut
1409 sub _coord_adjust {
1410 my ( $self, $feat, $add, $length ) = @_;
1411 $self->throw( 'Object [$feat] '
1412 . 'of class ['
1413 . ref($feat)
1414 . '] should be a Bio::SeqFeatureI ' )
1415 unless $feat->isa('Bio::SeqFeatureI');
1416 my @adjsubfeat;
1417 for my $subfeat ( $feat->get_SeqFeatures ) {
1418 push @adjsubfeat, $self->_coord_adjust( $subfeat, $add, $length );
1420 my @loc;
1421 for ( $feat->location->each_Location ) {
1422 my @coords = ( $_->start, $_->end );
1423 my $strand = $_->strand;
1424 my $type = $_->location_type;
1425 foreach (@coords) {
1426 $self->throw("can not handle negative feature positions (got: $_)")
1427 if $_ < 0;
1428 if ( $add + $_ < 1 ) {
1429 $_ = '<1';
1431 elsif ( defined $length and $add + $_ > $length ) {
1432 $_ = ">$length";
1434 else {
1435 $_ = $add + $_;
1438 push @loc,
1439 $self->_location_objects_from_coordinate_list( [ \@coords ],
1440 $strand, $type );
1442 my $newfeat =
1443 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1444 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1445 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1446 $newfeat->annotation->add_Annotation( $key, $value );
1449 foreach my $key ( $feat->get_all_tags() ) {
1450 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1452 my $loc = $self->_single_loc_object_from_collection(@loc);
1453 $loc ? $newfeat->location($loc) : return;
1454 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1455 return $newfeat;
1458 =head2 revcom_with_features
1460 Title : revcom_with_features
1461 Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
1462 Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
1463 as appropriate.
1464 Returns : A new sequence object
1465 Args : A sequence object
1468 =cut
1470 sub revcom_with_features {
1471 my ( $self, $seq ) = @_;
1472 $self->throw( 'Object [$seq] '
1473 . 'of class ['
1474 . ref($seq)
1475 . '] should be a Bio::SeqI ' )
1476 unless $seq->isa('Bio::SeqI');
1477 my $revcom = $seq->revcom;
1479 # make sure that there is no annotation or features in $trunc
1480 # (->revcom() now clone objects except for Bio::Seq::LargePrimarySeq)
1481 $revcom->annotation->remove_Annotations;
1482 $revcom->remove_SeqFeatures;
1484 #move annotations
1485 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
1486 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
1487 $revcom->annotation->add_Annotation( $key, $value );
1491 #move features
1492 for ( map { $self->_feature_revcom( $_, $seq->length ) }
1493 reverse $seq->get_SeqFeatures )
1495 $revcom->add_SeqFeature($_);
1497 return $revcom;
1500 =head2 _feature_revcom
1502 Title : _feature_revcom
1503 Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
1504 Function: Recursive subroutine to reverse complement a feature and
1505 all its subfeatures. The length of the parent sequence must be
1506 specified.
1508 Returns : A Bio::SeqFeatureI compliant object.
1509 Args : A Bio::SeqFeatureI compliant object,
1510 the length of the parent sequence
1513 =cut
1515 sub _feature_revcom {
1516 my ( $self, $feat, $length ) = @_;
1517 $self->throw( 'Object [$feat] '
1518 . 'of class ['
1519 . ref($feat)
1520 . '] should be a Bio::SeqFeatureI ' )
1521 unless $feat->isa('Bio::SeqFeatureI');
1522 my @adjsubfeat;
1523 for my $subfeat ( $feat->get_SeqFeatures ) {
1524 push @adjsubfeat, $self->_feature_revcom( $subfeat, $length );
1526 my @loc;
1527 for ( $feat->location->each_Location ) {
1528 my $type = $_->location_type;
1529 my $strand;
1530 if ( $_->strand == -1 ) { $strand = 1 }
1531 elsif ( $_->strand == 1 ) { $strand = -1 }
1532 else { $strand = $_->strand }
1533 my $newend =
1534 $self->_coord_revcom( $_->start, $_->start_pos_type, $length );
1535 my $newstart =
1536 $self->_coord_revcom( $_->end, $_->end_pos_type, $length );
1537 my $newstart_type = $_->end_pos_type;
1538 $newstart_type = 'BEFORE' if $_->end_pos_type eq 'AFTER';
1539 $newstart_type = 'AFTER' if $_->end_pos_type eq 'BEFORE';
1540 my $newend_type = $_->start_pos_type;
1541 $newend_type = 'BEFORE' if $_->start_pos_type eq 'AFTER';
1542 $newend_type = 'AFTER' if $_->start_pos_type eq 'BEFORE';
1543 push @loc,
1544 $self->_location_objects_from_coordinate_list(
1545 [ [ $newstart, $newend, $newstart_type, $newend_type ] ],
1546 $strand, $type );
1548 my $newfeat =
1549 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1550 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1551 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1552 $newfeat->annotation->add_Annotation( $key, $value );
1555 foreach my $key ( $feat->get_all_tags() ) {
1556 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1559 my $loc = $self->_single_loc_object_from_collection(@loc);
1560 $loc ? $newfeat->location($loc) : return;
1562 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1563 return $newfeat;
1566 sub _coord_revcom {
1567 my ( $self, $coord, $type, $length ) = @_;
1568 if ( $type eq 'BETWEEN' or $type eq 'WITHIN' ) {
1569 $coord =~ s/(\d+)(\D*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
1571 else {
1572 $coord =~ s/(\d+)/$length+1-$1/ge;
1573 $coord =~ tr/<>/></;
1574 $coord = '>' . $coord
1575 if $type eq 'BEFORE' and substr( $coord, 0, 1 ) ne '>';
1576 $coord = '<' . $coord
1577 if $type eq 'AFTER' and substr( $coord, 0, 1 ) ne '<';
1579 return $coord;
1582 =head2 evolve
1584 Title : evolve
1585 Usage : my $newseq = Bio::SeqUtils->
1586 evolve($seq, $similarity, $transition_transversion_rate);
1587 Function: Mutates the sequence by point mutations until the similarity of
1588 the new sequence has decreased to the required level.
1589 Transition/transversion rate is adjustable.
1590 Returns : A new Bio::PrimarySeq object
1591 Args : sequence object
1592 percentage similarity (e.g. 80)
1593 tr/tv rate, optional, defaults to 1 (= 1:1)
1595 Set the verbosity of the Bio::SeqUtils object to positive integer to
1596 see the mutations as they happen.
1598 This method works only on nucleotide sequences. It prints a warning if
1599 you set the target similarity to be less than 25%.
1601 Transition/transversion ratio is an observed attribute of an sequence
1602 comparison. We are dealing here with the transition/transversion rate
1603 that we set for our model of sequence evolution.
1605 =cut
1607 sub evolve {
1608 my ( $self, $seq, $sim, $rate ) = @_;
1609 $rate ||= 1;
1611 $self->throw( 'Object [$seq] '
1612 . 'of class ['
1613 . ref($seq)
1614 . '] should be a Bio::PrimarySeqI ' )
1615 unless $seq->isa('Bio::PrimarySeqI');
1617 $self->throw(
1618 "[$sim] " . ' should be a positive integer or float under 100' )
1619 unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
1621 $self->warn(
1622 "Nucleotide sequences are 25% similar by chance.
1623 Do you really want to set similarity to [$sim]%?\n"
1624 ) unless $sim > 25;
1626 $self->throw('Only nucleotide sequences are supported')
1627 if $seq->alphabet eq 'protein';
1629 # arrays of possible changes have transitions as first items
1630 my %changes;
1631 $changes{'a'} = [ 't', 'c', 'g' ];
1632 $changes{'t'} = [ 'a', 'c', 'g' ];
1633 $changes{'c'} = [ 'g', 'a', 't' ];
1634 $changes{'g'} = [ 'c', 'a', 't' ];
1636 # given the desired rate, find out where cut off points need to be
1637 # when random numbers are generated from 0 to 100
1638 # we are ignoring identical mutations (e.g. A->A) to speed things up
1639 my $bin_size = 100 / ( $rate + 2 );
1640 my $transition = 100 - ( 2 * $bin_size );
1641 my $first_transversion = $transition + $bin_size;
1643 # unify the look of sequence strings
1644 my $string = lc $seq->seq; # lower case
1645 $string =~
1646 s/u/t/; # simplyfy our life; modules should deal with the change anyway
1647 # store the original sequence string
1648 my $oristring = $string;
1649 my $length = $seq->length;
1651 # stop evolving if the limit has been reached
1652 until ( $self->_get_similarity( $oristring, $string ) <= $sim ) {
1654 # find the location in the string to change
1655 my $loc = int( rand $length ) + 1;
1657 # nucleotide to change
1658 my $oldnuc = substr $string, $loc - 1, 1;
1659 my $newnuc;
1661 # nucleotide it is changed to
1662 my $choose = rand(100);
1663 if ( $choose < $transition ) {
1664 $newnuc = $changes{$oldnuc}[0];
1666 elsif ( $choose < $first_transversion ) {
1667 $newnuc = $changes{$oldnuc}[1];
1669 else {
1670 $newnuc = $changes{$oldnuc}[2];
1673 # do the change
1674 substr $string, $loc - 1, 1, $newnuc;
1676 $self->debug("$loc$oldnuc>$newnuc\n");
1679 return new Bio::PrimarySeq(
1680 -id => $seq->id . "-$sim",
1681 -description => $seq->description,
1682 -seq => $string
1686 sub _get_similarity {
1687 my ( $self, $oriseq, $seq ) = @_;
1689 my $len = length($oriseq);
1690 my $c;
1692 for ( my $i = 0 ; $i < $len ; $i++ ) {
1693 $c++ if substr( $oriseq, $i, 1 ) eq substr( $seq, $i, 1 );
1695 return 100 * $c / $len;