Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / SiRNA.pm
blob55df1b98a386e82879f09581c38139fea8a28e7d
2 # BioPerl module for Bio::Tools::SiRNA
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Donald Jackson, donald.jackson@bms.com
8 # Copyright Bristol-Myers Squibb
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 SiRNA - Perl object for designing small inhibitory RNAs.
18 =head1 SYNOPSIS
20 use Bio::Tools::SiRNA;
22 my $sirna_designer = Bio::Tools::SiRNA->new( -target => $bio_seq,
23 -rules => 'saigo'
25 my @pairs = $sirna_designer->design;
27 foreach $pair (@pairs) {
28 my $sense_oligo_sequence = $pair->sense->seq;
29 my $antisense_oligo_sequence = $pair->antisense->seq;
31 # print out results
32 print join ("\t", $pair->start, $pair->end, $pair->rank,
33 $sense_oligo_sequence, $antisense_oligo_sequence), "\n";
36 =head1 DESCRIPTION
38 Package for designing siRNA reagents.
40 Input is a L<Bio::SeqI>-compliant object (the target).
42 Output is a list of Bio::SeqFeature::SiRNA::Pair objects, which are
43 added to the feature table of the target sequence. Each
44 Bio::SeqFeature::SiRNA::Pair contains two subfeatures
45 (Bio::SeqFeature::Oligo objects) which correspond to the individual
46 oligos. These objects provide accessors for the information on the
47 individual reagent pairs.
49 This version of Bio::Tools::SiRNA represents a major change in architecture.
50 Specific 'rulesets' for siRNA selection as developed by various groups are
51 implemented as Bio::Tools::SiRNA::Ruleset objects, which inherit from
52 Bio::Tools::SiRNA. This will make it easier to add new rule sets or modify
53 existing approaches. Currently the Tuschl and Ui-Tei (2004) rules are
54 implemented. For consistency, the Tuschl rules are implemented by default.
56 In addition, this module provides three 'extra' rules which can be added
57 above and beyond any ruleset.
59 =over 3
61 =item 1.
63 SiRNAs that overlap known SNPs (identified as SeqFeatures with
64 primary tag = variation) can be avoided.
66 =item 2.
68 Other regions (with primary tag = 'Excluded') can also be skipped. I
69 use this with Bio::Tools::Run::Mdust to avoid low-complexity regions
70 (must be run separately), but other programs could also be used.
72 =item 3.
74 SiRNAs may also be selected in the 3 prime UTR of a gene by setting
75 $sirna_designer-E<gt>include_3pr() to true.
77 =back
79 =head2 EXPORT
81 None.
83 =head1 SEE ALSO
85 L<Bio::Tools::Run::Mdust>, L<Bio::SeqFeature::SiRNA::Pair>,
86 L<Bio::SeqFeature::SiRNA::Oligo>..
88 =head1 FEEDBACK
90 =head2 Mailing Lists
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to
94 the Bioperl mailing list. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 =head2 Support
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 of the bugs and their resolution. Bug reports can be submitted via
114 the web:
116 https://github.com/bioperl/bioperl-live/issues
118 =head1 AUTHOR
120 Donald Jackson (donald.jackson@bms.com)
122 =head1 APPENDIX
124 The rest of the documentation details each of the object methods.
125 Internal methods are usually preceded with a _
127 =cut
129 package Bio::Tools::SiRNA;
131 use strict;
132 use warnings;
134 use vars qw($AUTOLOAD);
136 use Bio::Seq::RichSeq;
137 use Bio::SeqFeature::Generic;
138 use Bio::SeqFeature::SiRNA::Oligo;
139 use Bio::SeqFeature::SiRNA::Pair;
142 use base qw(Bio::Root::Root);
145 our %COMP = ( A => 'T',
146 T => 'A',
147 C => 'G',
148 G => 'C',
149 N => 'N',
152 our @ARGNAMES = qw(RULES START_PAD END_PAD MIN_GC CUTOFF OLIGOS AVOID_SNPS
153 GSTRING TMPDIR TARGET DEBUG);
156 =head2 new
158 Title : new
159 Usage : my $sirna_designer = Bio::Tools::SiRNA->new();
160 Function : Constructor for designer object
161 Returns : Bio::Tools::SiRNA object
162 Args : target - the target sequence for the SiRNAs as a Bio::Seq::RichSeq
163 start_pad - distance from the CDS start to skip (default 75)
164 end_pad - distance from the CDS end to skip (default 50)
165 include_3pr - set to true to include SiRNAs in the 3prime UTR (default false)
166 rules - rules for selecting siRNAs, currently supporting saigo and tuschl
167 min_gc - minimum GC fraction (NOT percent) (default 0.4)
168 max_gc - maximum GC fraction (NOT percent) (default 0.6)
169 cutoff - worst 'rank' accepted(default 3)
170 avoid_snps - boolean - reject oligos that overlap a variation
171 SeqFeature in the target (default true)
172 gstring - maximum allowed consecutive Gs.
173 Too many can cause problems in synthesis (default 4)
174 Note : All arguments can also be changed/accessed using autoloaded
175 methods such as:
177 my $start_pad = $sirna_designer->start_pad().
179 =cut
181 sub new {
182 my ($proto, @args) = @_;
183 my $pkg = ref($proto) || $proto;
185 my $self = {};
186 bless ($self, $pkg);
188 my %args;
190 @args{@ARGNAMES} = $self->_rearrange(\@ARGNAMES, @args);
192 if ($args{'RULES'}) {
193 $self->rules($args{'RULES'});
196 $self->{'start_pad'} = $args{'START_PAD'} || 75; # nt from start to mask
197 $self->{'end_pad'} = $args{'END_PAD'} || 50; # nt from end to mask
198 $self->{'include_3pr'} = $args{'INCLUDE_3PR'} || 0; # look for oligos in 3prime UTR
199 $self->{'min_gc'} = $args{'MIN_GC'} || 0.40;
200 $self->{'max_gc'} = $args{'MAX_GC'} || 0.60;
201 $self->{'cutoff'} = $args{'CUTOFF'} || 3; # highest (worst) rank wanted
202 $self->{'oligos'} = [];
203 defined($args{'AVOID_SNPS'}) ? $self->{'avoid_snps'} = $args{'AVOID_SNPS'} :
204 $self->{'avoid_snps'} = 1; # (t/f to avoid or include reagents that cover SNPs)
205 $self->{'gstring'} = $args{'GSTRING'} || 4; # maximum allowed consecutive Gs - too many can cause problems in oligo synthesis
206 $self->{'tmpdir'} = $args{'TMPDIR'} || $ENV{'TMPDIR'} || $ENV{'TMP'} || '';
207 $self->{'debug'} = $args{'DEBUG'} || 0;
209 $self->target($args{'TARGET'}) if ($args{'TARGET'});
211 return $self;
215 =head2 target
217 Title : target
218 Usage : my $target_seq = $sirna_designer->target(); # get the current target
220 $sirna_designer->target($new_target_seq); # set a new target
221 Function : Set/get the target as a Bio::SeqI-compliant object
222 Returns : a Bio::SeqI-compliant object
223 Args : a Bio::SeqI-compliant object (optional)
225 =cut
227 sub target {
228 my ($self, $target) = @_;
230 if ($target) {
231 unless ($target->isa('Bio::SeqI')) {
232 $self->throw( -class => 'Bio::Root::BadParameter',
233 -text => "Target must be passed as a Bio::Seq object" );
235 if ($target->can('molecule')) {
236 ( grep { uc($target->molecule) eq $_ } qw(DNA MRNA CDNA)) or
237 $self->throw( -class => 'Bio::Root::BadParameter',
238 -text => "Sequences of type ". $target->molecule. " are not supported"
241 else {
242 ($target->alphabet eq 'dna') or
243 $self->throw( -class => 'Bio::Root::BadParameter',
244 -text => "Sequences of alphabet ". $target->alphabet. " are not supported"
248 $self->{'target'} = $target;
249 return 1;
252 elsif ($self->{'target'}) {
253 return $self->{'target'};
255 else {
256 $self->throw("Target sequence not defined");
260 =head2 rules
262 Title : rules
263 Usage : $sirna->rules('ruleset')
264 Purpose : set/get ruleset to use for selecting SiRNA oligo pairs.
265 Returns : not sure yet
266 Args : a ruleset name (currently supported: Tuschl, Saigo)
267 or a Bio::Tools::SiRNA::RulesetI compliant object
269 =cut
271 sub rules {
272 my ($self, $rules) = @_;
274 if ($rules) {
275 $self->_load_ruleset($rules);
277 # default: use tuschl rules
278 unless ($self->{_rules}) {
279 $self->_load_ruleset('tuschl');
281 return $self->{_rules};
284 sub _load_ruleset {
285 my ($self, $ruleset) = @_;
287 my $rule_module = join('::', ref($self), 'Ruleset', lc($ruleset));
289 eval "require $rule_module";
291 if ($@) {
292 #warn join("\n", '@INC contains:', @INC, undef);
293 $self->throw("Unable to load $rule_module: $@");
294 return;
297 else {
298 $self->{_rules} = $rule_module;
299 bless($self, $rule_module); # recast as subclass
302 return 1;
305 =head2 design
307 Title : design
308 Usage : my @pairs = $sirna_designer->design();
309 Purpose : Design SiRNA oligo pairs.
310 Returns : A list of SiRNA pairs as Bio::SeqFeature::SiRNA::Pair objects
311 Args : none
313 =cut
315 sub design {
316 my ($self) = @_;
318 ($self->rules) or $self->throw('Unable to design siRNAs: no rule set specified');
320 # unless ( grep { $_->primary_tag eq 'Target' } $self->target->top_SeqFeatures ) {
321 # $self->_define_target();
324 my @oligos = $self->_get_oligos();
326 return ( grep { $_->isa('Bio::SeqFeature::SiRNA::Pair') } $self->target->top_SeqFeatures );
329 sub _define_target {
330 my ($self) = @_;
331 my ($feat, $cds, $left, $right);
333 my $target = $self->target or
334 $self->throw("Unable to design oligos - no target provided");
336 ($cds) = grep { $_->primary_tag eq 'CDS' } $target->top_SeqFeatures if ($target->can('top_SeqFeatures'));
338 if ($cds) {
339 $left = $cds->start + $self->start_pad;
340 if (!$self->include_3pr) {
341 $right = $cds->end - $self->end_pad;
343 else {
344 $right = $target->length - $self->end_pad;
347 else {
348 $left = 0 + $self->start_pad;
349 $right = $target->length - $self->end_pad;
353 # is there anything left?
354 if (($right - $left) < 20) {
355 $self->throw("There isn't enough sequence to design oligos. Please reduce start_pad and end_pad or supply more sequence");
357 # define target region
358 my $targregion = Bio::SeqFeature::Generic->new( -start => $left,
359 -end => $right,
360 -primary => 'Target' );
361 $self->target->add_SeqFeature($targregion);
363 # locate excluded regions
364 my @excluded = grep { $_->primary_tag eq 'Excluded' } $self->target->top_SeqFeatures;
366 if ($self->avoid_snps) {
367 my @snps = grep { $_->primary_tag eq 'variation' } $self->target->top_SeqFeatures;
368 push(@excluded, @snps);
371 $self->excluded(\@excluded);
373 return $targregion;
376 sub _get_targetregion {
377 my ($self) = @_;
379 my ($targregion) = grep { $_->primary_tag eq 'Target' } $self->target->top_SeqFeatures;
380 $targregion ||= $self->_define_target;
382 $self->throw("Target region for SiRNA design not defined") unless ($targregion);
384 my $seq = $targregion->seq->seq;
385 # but this way I loose start info
386 my $targstart = $targregion->start;
388 return ($seq, $targstart);
391 # MOVE to SiRNA::Ruleset::tuschl
392 # sub _regex {
393 # my ($self, $rank) = @_;
394 # return $PATTERNS{$rank};
397 # sub _get_oligos {
398 # # use regular expressions to pull out oligos
400 # my ($self, $rank) = @_;
401 # my $regex = $self->_regex($rank);
402 # my @exclude;
405 # my ($targregion) = grep { $_->primary_tag eq 'Target' } $self->target->top_SeqFeatures;
406 # my $seq = $targregion->seq->seq;
407 # # but this way I loose start info
408 # my $targstart = $targregion->start;
410 # # exclude masked region
411 # push(@exclude, grep { $_->primary_tag eq 'Excluded' } $self->target->top_SeqFeatures);
413 # # add SNP checking
414 # if ($self->avoid_snps) {
415 # my @snps = grep { $_->primary_tag eq 'variation' } $self->target->top_SeqFeatures;
416 # push(@exclude, @snps);
419 # while ( $seq =~ /$regex/gi ) {
420 # my $target = $1;
422 # # check for too many Gs (or Cs on the other strand)
423 # next if ( $target =~ /G{ $self->gstring,}/io );
424 # next if ( $target =~ /C{ $self->gstring,}/io );
425 # # skip Ns (for filtering)
426 # next if ( $target =~ /N/i);
428 # my $start = length($`) + $targstart;
429 # my $stop = $start + length($target) -1;
431 # my @gc = ( $target =~ /G|C/gi);
432 # my $fxGC = sprintf("%2.2f", (scalar(@gc) / length($target)));
433 # next if ($fxGC < $self->min_gc);
434 # next if ($fxGC > $self->max_gc);
436 # my $sense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $start,
437 # -end => $stop,
438 # -strand => 1,
439 # -seq => _get_sense($target),
440 # -source_tag => ref($self),
441 # );
443 # my $asense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $start,
444 # -end => $stop,
445 # -strand => -1,
446 # -seq => _get_anti($target),
447 # -source_tag => ref($self),
448 # );
450 # my $sirna = Bio::SeqFeature::SiRNA::Pair->new( -rank => $rank,
451 # -fxGC => $fxGC,
452 # -sense => $sense,
453 # -antisense => $asense,
454 # -source_tag => ref($self),
455 # );
457 # unless ($self->_has_overlap($sirna, \@exclude)) {
458 # $self->target->add_SeqFeature($sirna);
461 # }
463 =head2 add_oligos
465 Title : add_oligos
466 Usage : $sirna_designer->add_oligos($sequence, $start, $rank);
467 Purpose : Add SiRNA olgos to target Bio::Seq as Bio::SeqFeature::SiRNA::Pair objects
468 Args : Oligo sequence and start position (required), rank/score (optional)
470 =cut
472 sub add_oligos {
473 my ($self, $seq, $start, $rank) = @_;
475 ($seq) or throw ('No sequence supplied for add_oligos');
476 (defined $start) or throw ('No start position specified for add_oligos');
478 my ($end) = $start + length($seq);
480 my ($sseq) = $self->_get_sense($seq);
481 my $sense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $start,
482 -end => ($start + length($sseq)),
483 -strand => 1,
484 -seq => $sseq,
485 -source_tag => ref($self),
488 my $aseq = $self->_get_anti($seq);
489 my $asense = Bio::SeqFeature::SiRNA::Oligo->new( -start => $end,
490 -end => ($end - length($aseq)),
491 -strand => -1,
492 -seq => $aseq,
493 -source_tag => ref($self),
496 my $sirna = Bio::SeqFeature::SiRNA::Pair->new( -rank => $rank,
497 # -fxGC => $fxGC,
498 -sense => $sense,
499 -antisense => $asense,
500 -source_tag => ref($self),
503 unless ($self->_has_overlap($sirna, $self->excluded)) {
504 $self->target->add_SeqFeature($sirna);
508 sub _has_overlap {
509 # flag any pairs that overlap an UNDESIRED feature (eg SNP)
510 # return true if there is overlap, false if not
512 my ($self, $test, $flist) = @_;
513 print STDERR "Checking oligo at ", $test->start, " to ",$test->end, "\n"
514 if ($self->debug);
516 foreach my $feat (@$flist) {
517 if (($test->start <= $feat->end) and ($test->end >= $feat->start)) {
518 print STDERR "Overlaps ", $feat->primary_tag, " at ",
519 $feat->start, " to ", $feat->end, "\n" if ($self->debug);
520 return 1;
523 return 0; # default - no overlap
526 # MOVE to SiRNA::Ruleset::tuschl
528 # sub _get_sense {
529 # my ($target) = @_;
530 # # trim off 1st 2 nt to get overhang
531 # $target =~ s/^..//;
532 # # convert T's to U's (transcribe)
533 # $target =~ s/T/U/gi;
534 # # force last 2 nt to be T's
535 # $target =~ s/..$/TT/;
537 # return $target;
540 # sub _get_anti {
541 # my ($target) = @_;
542 # my @target = split(//, $target);
543 # my ($nt,@antitarget);
545 # while ($nt = pop @target) {
546 # push(@antitarget, $COMP{$nt});
548 # my $anti = join('', @antitarget);
549 # # trim off 1st 2 nt to get overhang
550 # $anti =~ s/^..//;
551 # # convert T's to U's
552 # $anti =~ s/T/U/gi;
553 # # convert last 2 NT's to T
554 # $anti =~ s/..$/TT/;
556 # return $anti;
560 sub AUTOLOAD {
561 my ($self, $value) = @_;
562 my $name = $AUTOLOAD;
563 $name =~ s/.+:://;
565 return if ($name eq 'DESTROY');
568 if (defined $value) {
569 $self->{$name} = $value;
572 unless (exists $self->{$name}) {
573 $self->throw("Attribute $name not defined for ". ref($self));
576 return $self->{$name};
579 sub _comp {
580 my ($self, $char) = @_;
582 return unless ($char);
583 $char = uc($char);
584 return $COMP{ $char };