1 package Bio
::Tools
::IUPAC
;
7 use base
qw(Bio::Root::Root);
9 # ABSTRACT: Generates unique sequence objects or regular expressions from an ambiguous IUPAC sequence
10 # AUTHOR: Aaron Mackey <amackey@virginia.edu>
11 # OWNER: Aaron Mackey <amackey@virginia.edu>
17 use Bio::Tools::IUPAC;
19 # Get the IUPAC code for proteins
20 my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
22 # Create a sequence with degenerate residues
23 my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
25 # Create all possible non-degenerate sequences
26 my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
27 while ($uniqueseq = $iupac->next_seq()) {
28 # process the unique Bio::Seq object.
31 # Get a regular expression that matches all possible sequences
32 my $regexp = $iupac->regexp();
36 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
37 following the IUPAC conventions. Non-standard characters have the meaning
40 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
41 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
43 ---------------------------------------------------------------
44 Symbol Meaning Nucleic Acid
45 ---------------------------------------------------------------
57 V A or C or G not T (closest unused char after T)
58 H A or C or T not G (closest unused char after G)
59 D A or G or T not C (closest unused char after C)
60 B C or G or T not A (closest unused char after A)
61 X G or A or T or C Unknown (very rarely used)
62 N G or A or T or C Unknown (commonly used)
65 IUPAC-IUP AMINO ACID SYMBOLS:
66 Biochem J. 1984 Apr 15; 219(2): 345-373
67 Eur J Biochem. 1993 Apr 1; 213(1): 2
69 ------------------------------------------
71 ------------------------------------------
73 B Aspartic Acid, Asparagine
97 Z Glutamic Acid, Glutamine
100 There are a few things Bio::Tools::IUPAC can do for you:
106 report the IUPAC mapping between ambiguous and non-ambiguous residues
110 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
111 an ambiguous sequence object
115 convert an ambiguous sequence object to a corresponding regular expression
122 # Ambiguous nucleic residues are matched to unambiguous residues
143 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
155 N
=> [qw(B D H K M N R S V W Y)],
158 # The inverse of %IUB
179 # Same thing with proteins now
220 Usage : Bio::Tools::IUPAC->new($seq);
221 Function: Create a new IUPAC object, which acts as a sequence stream (akin to
223 Args : an ambiguously coded sequence object that has a specified 'alphabet'
224 Returns : a Bio::Tools::IUPAC object.
229 my ($class,@args) = @_;
230 my $self = $class->SUPER::new
(@args);
231 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
233 if ( (not defined $seq) && @args && ref($args[0]) ) {
234 # parameter not passed as named parameter?
239 if (not $seq->isa('Bio::PrimarySeqI')) {
240 $self->throw('Must supply a sequence object');
242 if (length $seq->seq == 0) {
243 $self->throw('Sequence had zero-length');
245 $self->{'_seq'} = $seq;
254 my %iupac = $self->iupac;
255 $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
256 $self->{'_string'} = [(0) x
length($self->{'_seq'}->seq())];
257 $self->{'_string'}->[0] = -1;
264 Usage : $iupac->next_seq();
265 Function: returns the next unique sequence object
267 Returns : a Bio::Seq object
274 if (not exists $self->{'_string'}) {
275 $self->_initialize();
278 for my $i ( 0 .. $#{$self->{'_string'}} ) {
279 next unless $self->{'_string'}->[$i] || @
{$self->{'_alpha'}->[$i]} > 1;
280 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
281 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
284 $self->{'_string'}->[$i] = 0;
288 $self->{'_string'}->[$i]++;
290 my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @
{$self->{'_string'}});
291 my $desc = $self->{'_seq'}->desc() || '';
293 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
294 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
295 $self->{'_num'} =~ s/,//g;
297 # Return a fresh sequence object
298 return Bio
::PrimarySeq
->new(-seq
=> $seqstr, -desc
=> $desc);
307 Usage : my %symbols = $iupac->iupac;
308 Function: Returns a hash of symbols -> symbol components of the right type
309 for the given sequence, i.e. it is the same as iupac_iup() if
310 Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
311 sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
319 my $alphabet = lc( $self->{'_seq'}->alphabet() );
320 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
321 return %IUB; # nucleic
322 } elsif ( $alphabet eq 'protein' ) {
323 return %IUP; # proteic
325 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
334 Usage : my %symbols = $iupac->iupac_amb;
335 Function: Same as iupac() but only contains a mapping between ambiguous residues
336 and the ambiguous residues they map to. For example, the key 'N' has
337 the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
338 i.e. it matches all other ambiguous residues.
346 my $alphabet = lc( $self->{'_seq'}->alphabet() );
347 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
348 return %IUB_AMB; # nucleic
349 } elsif ( $alphabet eq 'protein' ) {
350 return %IUP_AMB; # proteic
352 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
360 Usage : my %aasymbols = $iupac->iupac_iup;
361 Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
374 Title : iupac_iup_amb
375 Usage : my %aasymbols = $iupac->iupac_iup_amb;
376 Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
390 Usage : my %dnasymbols = $iupac->iupac_iub;
391 Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
404 Title : iupac_iub_amb
405 Usage : my %dnasymbols = $iupac->iupac_iub;
406 Function: Returns a hash of DNA symbols -> ambiguous symbol components
419 Title : iupac_rev_iub
420 Usage : my %dnasymbols = $iupac->iupac_rev_iub;
421 Function: Returns a hash of nucleotide combinations -> IUPAC code
422 (a reverse of the iupac_iub hash).
436 Usage : my $total = $iupac->count();
437 Function: Calculates the number of unique, unambiguous sequences that
438 this ambiguous sequence could generate
446 if (not exists $self->{'_string'}) {
447 $self->_initialize();
450 $count *= scalar(@
$_) for (@
{$self->{'_alpha'}});
458 Usage : my $re = $iupac->regexp();
459 Function: Converts the ambiguous sequence into a regular expression that
460 matches all of the corresponding ambiguous and non-ambiguous sequences.
461 You can further manipulate the resulting regular expression with the
462 Bio::Tools::SeqPattern module. After you are done building your
463 regular expression, you might want to compile it and make it case-
466 Args : 1 to match RNA: T and U characters will match interchangeably
467 Return : regular expression
472 my ($self, $match_rna) = @_;
474 my $seq = $self->{'_seq'}->seq;
475 my %iupac = $self->iupac;
476 my %iupac_amb = $self->iupac_amb;
477 for my $pos (0 .. length($seq)-1) {
478 my $res = substr $seq, $pos, 1;
479 my $iupacs = $iupac{$res};
480 my $iupacs_amb = $iupac_amb{$res} || [];
481 if (not defined $iupacs) {
482 $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
483 " Offending character was '$res'.\n");
485 my $part = join '', (@
$iupacs, @
$iupacs_amb);
487 $part =~ s/T/TU/i || $part =~ s/U/TU/i;
489 if (length $part > 1) {
490 $part = '['.$part.']';
501 my $method = $AUTOLOAD;
503 return $self->{'_seq'}->$method(@_)
504 unless $method eq 'DESTROY';