lib/Bio/Tools/IUPAC.pm

   1 package Bio::Tools::IUPAC;
   2
   3 use utf8;
   4 use strict;
   5 use warnings;
   6
   7 use base qw(Bio::Root::Root);
   8
   9 # ABSTRACT:  Generates unique sequence objects or regular expressions from an ambiguous IUPAC sequence
  10 # AUTHOR: Aaron Mackey <amackey@virginia.edu>
  11 # OWNER: Aaron Mackey <amackey@virginia.edu>
  12 # LICENSE: Perl_5
  13
  14 =head1 SYNOPSIS
  15
  16  use Bio::PrimarySeq;
  17  use Bio::Tools::IUPAC;
  18
  19  # Get the IUPAC code for proteins
  20  my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
  21
  22  # Create a sequence with degenerate residues
  23  my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
  24
  25  # Create all possible non-degenerate sequences
  26  my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
  27  while ($uniqueseq = $iupac->next_seq()) {
  28      # process the unique Bio::Seq object.
  29  }
  30
  31  # Get a regular expression that matches all possible sequences
  32  my $regexp = $iupac->regexp();
  33
  34 =head1 DESCRIPTION
  35
  36 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
  37 following the IUPAC conventions. Non-standard characters have the meaning
  38 described below:
  39
  40     IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
  41       Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
  42
  43     ---------------------------------------------------------------
  44     Symbol       Meaning      Nucleic Acid
  45     ---------------------------------------------------------------
  46      A            A           Adenine
  47      C            C           Cytosine
  48      G            G           Guanine
  49      T            T           Thymine
  50      U            U           Uracil
  51      M          A or C        aMino
  52      R          A or G        puRine
  53      W          A or T        Weak
  54      S          C or G        Strong
  55      Y          C or T        pYrimidine
  56      K          G or T        Keto
  57      V        A or C or G     not T (closest unused char after T)
  58      H        A or C or T     not G (closest unused char after G)
  59      D        A or G or T     not C (closest unused char after C)
  60      B        C or G or T     not A (closest unused char after A)
  61      X      G or A or T or C  Unknown (very rarely used)
  62      N      G or A or T or C  Unknown (commonly used)
  63
  64
  65     IUPAC-IUP AMINO ACID SYMBOLS:
  66       Biochem J. 1984 Apr 15; 219(2): 345-373
  67       Eur J Biochem. 1993 Apr 1; 213(1): 2
  68
  69     ------------------------------------------
  70     Symbol           Meaning
  71     ------------------------------------------
  72     A        Alanine
  73     B        Aspartic Acid, Asparagine
  74     C        Cysteine
  75     D        Aspartic Acid
  76     E        Glutamic Acid
  77     F        Phenylalanine
  78     G        Glycine
  79     H        Histidine
  80     I        Isoleucine
  81     J        Isoleucine/Leucine
  82     K        Lysine
  83     L        Leucine
  84     M        Methionine
  85     N        Asparagine
  86     O        Pyrrolysine
  87     P        Proline
  88     Q        Glutamine
  89     R        Arginine
  90     S        Serine
  91     T        Threonine
  92     U        Selenocysteine
  93     V        Valine
  94     W        Tryptophan
  95     X        Unknown
  96     Y        Tyrosine
  97     Z        Glutamic Acid, Glutamine
  98     *        Terminator
  99
 100 There are a few things Bio::Tools::IUPAC can do for you:
 101
 102 =over
 103
 104 =item *
 105
 106 report the IUPAC mapping between ambiguous and non-ambiguous residues
 107
 108 =item *
 109
 110 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
 111 an ambiguous sequence object
 112
 113 =item *
 114
 115 convert an ambiguous sequence object to a corresponding regular expression
 116
 117 =back
 118
 119 =cut
 120
 121
 122 # Ambiguous nucleic residues are matched to unambiguous residues
 123 our %IUB = (
 124     A => [qw(A)],
 125     C => [qw(C)],
 126     G => [qw(G)],
 127     T => [qw(T)],
 128     U => [qw(U)],
 129     M => [qw(A C)],
 130     R => [qw(A G)],
 131     S => [qw(C G)],
 132     W => [qw(A T)],
 133     Y => [qw(C T)],
 134     K => [qw(G T)],
 135     V => [qw(A C G)],
 136     H => [qw(A C T)],
 137     D => [qw(A G T)],
 138     B => [qw(C G T)],
 139     N => [qw(A C G T)],
 140     X => [qw(A C G T)],
 141 );
 142
 143 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
 144 our %IUB_AMB = (
 145     M => [qw(M)],
 146     R => [qw(R)],
 147     W => [qw(W)],
 148     S => [qw(S)],
 149     Y => [qw(Y)],
 150     K => [qw(K)],
 151     V => [qw(M R S V)],
 152     H => [qw(H M W Y)],
 153     D => [qw(D K R W)],
 154     B => [qw(B K S Y)],
 155     N => [qw(B D H K M N R S V W Y)],
 156     );
 157
 158 # The inverse of %IUB
 159 our %REV_IUB = (
 160     A    => 'A',
 161     T    => 'T',
 162     U    => 'U',
 163     C    => 'C',
 164     G    => 'G',
 165     AC   => 'M',
 166     AG   => 'R',
 167     AT   => 'W',
 168     CG   => 'S',
 169     CT   => 'Y',
 170     GT   => 'K',
 171     ACG  => 'V',
 172     ACT  => 'H',
 173     AGT  => 'D',
 174     CGT  => 'B',
 175     ACGT => 'N',
 176     N    => 'N'
 177 );
 178
 179 # Same thing with proteins now
 180 our %IUP = (
 181     A => [qw(A)],
 182     B => [qw(D N)],
 183     C => [qw(C)],
 184     D => [qw(D)],
 185     E => [qw(E)],
 186     F => [qw(F)],
 187     G => [qw(G)],
 188     H => [qw(H)],
 189     I => [qw(I)],
 190     J => [qw(I L)],
 191     K => [qw(K)],
 192     L => [qw(L)],
 193     M => [qw(M)],
 194     N => [qw(N)],
 195     O => [qw(O)],
 196     P => [qw(P)],
 197     Q => [qw(Q)],
 198     R => [qw(R)],
 199     S => [qw(S)],
 200     T => [qw(T)],
 201     U => [qw(U)],
 202     V => [qw(V)],
 203     W => [qw(W)],
 204     X => [qw(X)],
 205     Y => [qw(Y)],
 206     Z => [qw(E Q)],
 207     '*' => [qw(*)],
 208 );
 209
 210 our %IUP_AMB = (
 211     B => [qw(B)],
 212     J => [qw(J)],
 213     Z => [qw(Z)],
 214 );
 215
 216
 217 =head2 new
 218
 219  Title   : new
 220  Usage   : Bio::Tools::IUPAC->new($seq);
 221  Function: Create a new IUPAC object, which acts as a sequence stream (akin to
 222            SeqIO)
 223  Args    : an ambiguously coded sequence object that has a specified 'alphabet'
 224  Returns : a Bio::Tools::IUPAC object.
 225
 226 =cut
 227
 228 sub new {
 229     my ($class,@args) = @_;
 230     my $self = $class->SUPER::new(@args);
 231     my ($seq) = $self->_rearrange([qw(SEQ)],@args);
 232
 233     if ( (not defined $seq) && @args && ref($args[0]) ) {
 234         # parameter not passed as named parameter?
 235         $seq = $args[0];
 236     }
 237
 238     if (defined $seq) {
 239         if (not $seq->isa('Bio::PrimarySeqI')) {
 240             $self->throw('Must supply a sequence object');
 241         }
 242         if (length $seq->seq == 0) {
 243             $self->throw('Sequence had zero-length');
 244         }
 245         $self->{'_seq'} = $seq;
 246     }
 247
 248     return $self;
 249 }
 250
 251
 252 sub _initialize {
 253     my ($self) = @_;
 254     my %iupac = $self->iupac;
 255     $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
 256     $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
 257     $self->{'_string'}->[0] = -1;
 258 }
 259
 260
 261 =head2 next_seq
 262
 263  Title   : next_seq
 264  Usage   : $iupac->next_seq();
 265  Function: returns the next unique sequence object
 266  Args    : none.
 267  Returns : a Bio::Seq object
 268
 269 =cut
 270
 271 sub next_seq {
 272     my ($self) = @_;
 273
 274     if (not exists $self->{'_string'}) {
 275         $self->_initialize();
 276     }
 277
 278     for my $i ( 0 .. $#{$self->{'_string'}} ) {
 279         next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
 280         if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
 281             if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
 282                 return;
 283             } else {
 284                 $self->{'_string'}->[$i] = 0;
 285                 next;
 286             }
 287         } else {
 288             $self->{'_string'}->[$i]++;
 289             my $j = -1;
 290             my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
 291             my $desc   = $self->{'_seq'}->desc() || '';
 292             $self->{'_num'}++;
 293             1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
 294             $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
 295             $self->{'_num'} =~ s/,//g;
 296
 297             # Return a fresh sequence object
 298             return Bio::PrimarySeq->new(-seq  => $seqstr, -desc => $desc);
 299         }
 300     }
 301 }
 302
 303
 304 =head2 iupac
 305
 306  Title   : iupac
 307  Usage   : my %symbols = $iupac->iupac;
 308  Function: Returns a hash of symbols -> symbol components of the right type
 309            for the given sequence, i.e. it is the same as iupac_iup() if
 310            Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
 311            sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
 312  Args    : none
 313  Returns : Hash
 314
 315 =cut
 316
 317 sub iupac {
 318     my ($self) = @_;
 319     my $alphabet = lc( $self->{'_seq'}->alphabet() );
 320     if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
 321         return %IUB; # nucleic
 322     } elsif ( $alphabet eq 'protein' ) {
 323         return %IUP; # proteic
 324     } else {
 325         $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
 326     }
 327 }
 328
 329
 330
 331 =head2 iupac_amb
 332
 333  Title   : iupac_amb
 334  Usage   : my %symbols = $iupac->iupac_amb;
 335  Function: Same as iupac() but only contains a mapping between ambiguous residues
 336            and the ambiguous residues they map to. For example, the key 'N' has
 337            the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
 338            i.e. it matches all other ambiguous residues.
 339  Args    : none
 340  Returns : Hash
 341
 342 =cut
 343
 344 sub iupac_amb {
 345     my ($self) = @_;
 346     my $alphabet = lc( $self->{'_seq'}->alphabet() );
 347     if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
 348         return %IUB_AMB; # nucleic
 349     } elsif ( $alphabet eq 'protein' ) {
 350         return %IUP_AMB; # proteic
 351     } else {
 352         $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
 353     }
 354 }
 355
 356
 357 =head2 iupac_iup
 358
 359  Title   : iupac_iup
 360  Usage   : my %aasymbols = $iupac->iupac_iup;
 361  Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
 362  Args    : none
 363  Returns : Hash
 364
 365 =cut
 366
 367 sub iupac_iup {
 368    return %IUP;
 369 }
 370
 371
 372 =head2 iupac_iup_amb
 373
 374  Title   : iupac_iup_amb
 375  Usage   : my %aasymbols = $iupac->iupac_iup_amb;
 376  Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
 377  Args    : none
 378  Returns : Hash
 379
 380 =cut
 381
 382 sub iupac_iup_amb {
 383    return %IUP_AMB;
 384 }
 385
 386
 387 =head2 iupac_iub
 388
 389  Title   : iupac_iub
 390  Usage   : my %dnasymbols = $iupac->iupac_iub;
 391  Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
 392  Args    : none
 393  Returns : Hash
 394
 395 =cut
 396
 397 sub iupac_iub {
 398    return %IUB;
 399 }
 400
 401
 402 =head2 iupac_iub_amb
 403
 404  Title   : iupac_iub_amb
 405  Usage   : my %dnasymbols = $iupac->iupac_iub;
 406  Function: Returns a hash of DNA symbols -> ambiguous symbol components
 407  Args    : none
 408  Returns : Hash
 409
 410 =cut
 411
 412 sub iupac_iub_amb {
 413    return %IUB_AMB;
 414 }
 415
 416
 417 =head2 iupac_rev_iub
 418
 419  Title   : iupac_rev_iub
 420  Usage   : my %dnasymbols = $iupac->iupac_rev_iub;
 421  Function: Returns a hash of nucleotide combinations -> IUPAC code
 422            (a reverse of the iupac_iub hash).
 423  Args    : none
 424  Returns : Hash
 425
 426 =cut
 427
 428 sub iupac_rev_iub {
 429    return %REV_IUB;
 430 }
 431
 432
 433 =head2 count
 434
 435  Title   : count
 436  Usage   : my $total = $iupac->count();
 437  Function: Calculates the number of unique, unambiguous sequences that
 438            this ambiguous sequence could generate
 439  Args    : none
 440  Return  : int
 441
 442 =cut
 443
 444 sub count {
 445     my ($self) = @_;
 446     if (not exists $self->{'_string'}) {
 447         $self->_initialize();
 448     }
 449     my $count = 1;
 450     $count *= scalar(@$_) for (@{$self->{'_alpha'}});
 451     return $count;
 452 }
 453
 454
 455 =head2 regexp
 456
 457  Title   : regexp
 458  Usage   : my $re = $iupac->regexp();
 459  Function: Converts the ambiguous sequence into a regular expression that
 460            matches all of the corresponding ambiguous and non-ambiguous sequences.
 461            You can further manipulate the resulting regular expression with the
 462            Bio::Tools::SeqPattern module. After you are done building your
 463            regular expression, you might want to compile it and make it case-
 464            insensitive:
 465               $re = qr/$re/i;
 466  Args    : 1 to match RNA: T and U characters will match interchangeably
 467  Return  : regular expression
 468
 469 =cut
 470
 471 sub regexp {
 472     my ($self, $match_rna) = @_;
 473     my $re;
 474     my $seq = $self->{'_seq'}->seq;
 475     my %iupac = $self->iupac;
 476     my %iupac_amb = $self->iupac_amb;
 477     for my $pos (0 .. length($seq)-1) {
 478         my $res = substr $seq, $pos, 1;
 479         my $iupacs = $iupac{$res};
 480         my $iupacs_amb = $iupac_amb{$res} || [];
 481         if (not defined $iupacs) {
 482             $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
 483                          " Offending character was '$res'.\n");
 484         }
 485         my $part = join '', (@$iupacs, @$iupacs_amb);
 486         if ($match_rna) {
 487             $part =~ s/T/TU/i || $part =~ s/U/TU/i;
 488         }
 489         if (length $part > 1) {
 490            $part = '['.$part.']';
 491         }
 492         $re .= $part;
 493     }
 494     return $re;
 495 }
 496
 497
 498 sub AUTOLOAD {
 499     my $self = shift @_;
 500     our $AUTOLOAD;
 501     my $method = $AUTOLOAD;
 502     $method =~ s/.*:://;
 503     return $self->{'_seq'}->$method(@_)
 504         unless $method eq 'DESTROY';
 505 }
 506
 507 1;