2 # BioPerl module for IUPAC
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Aaron Mackey <amackey@virginia.edu>
8 # Copyright Aaron Mackey
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
17 an ambiguous IUPAC sequence
22 use Bio::Tools::IUPAC;
24 # Get the IUPAC code for proteins
25 my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
27 # Create a sequence with degenerate residues
28 my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
30 # Create all possible non-degenerate sequences
31 my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
32 while ($uniqueseq = $iupac->next_seq()) {
33 # process the unique Bio::Seq object.
36 # Get a regular expression that matches all possible sequences
37 my $regexp = $iupac->regexp();
41 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
42 following the IUPAC conventions. Non-standard characters have the meaning
45 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
46 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
48 ---------------------------------------------------------------
49 Symbol Meaning Nucleic Acid
50 ---------------------------------------------------------------
62 V A or C or G not T (closest unused char after T)
63 H A or C or T not G (closest unused char after G)
64 D A or G or T not C (closest unused char after C)
65 B C or G or T not A (closest unused char after A)
66 X G or A or T or C Unknown (very rarely used)
67 N G or A or T or C Unknown (commonly used)
70 IUPAC-IUP AMINO ACID SYMBOLS:
71 Biochem J. 1984 Apr 15; 219(2): 345-373
72 Eur J Biochem. 1993 Apr 1; 213(1): 2
74 ------------------------------------------
76 ------------------------------------------
78 B Aspartic Acid, Asparagine
102 Z Glutamic Acid, Glutamine
105 There are a few things Bio::Tools::IUPAC can do for you:
111 report the IUPAC mapping between ambiguous and non-ambiguous residues
115 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
116 an ambiguous sequence object
120 convert an ambiguous sequence object to a corresponding regular expression
128 User feedback is an integral part of the evolution of this and other
129 Bioperl modules. Send your comments and suggestions preferably to one
130 of the Bioperl mailing lists. Your participation is much appreciated.
132 bioperl-l@bioperl.org - General discussion
133 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
137 Please direct usage questions or support issues to the mailing list:
139 I<bioperl-l@bioperl.org>
141 rather than to the module maintainer directly. Many experienced and
142 reponsive experts will be able look at the problem and quickly
143 address it. Please include a thorough description of the problem
144 with code and data examples if at all possible.
146 =head2 Reporting Bugs
148 Report bugs to the Bioperl bug tracking system to help us keep track
149 the bugs and their resolution. Bug reports can be submitted via the
152 https://github.com/bioperl/bioperl-live/issues
154 =head1 AUTHOR - Aaron Mackey
156 Email amackey-at-virginia.edu
160 The rest of the documentation details each of the object
161 methods. Internal methods are usually preceded with a _
166 package Bio
::Tools
::IUPAC
;
169 use base
qw(Bio::Root::Root);
170 use vars
qw(%IUB %IUB_AMB %REV_IUB %IUP %IUP_AMB $AUTOLOAD);
173 # Ambiguous nucleic residues are matched to unambiguous residues
194 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
206 N
=> [qw(B D H K M N R S V W Y)],
209 # The inverse of %IUB
230 # Same thing with proteins now
273 Usage : Bio::Tools::IUPAC->new($seq);
274 Function: Create a new IUPAC object, which acts as a sequence stream (akin to
276 Args : an ambiguously coded sequence object that has a specified 'alphabet'
277 Returns : a Bio::Tools::IUPAC object.
282 my ($class,@args) = @_;
283 my $self = $class->SUPER::new
(@args);
284 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
286 if ( (not defined $seq) && @args && ref($args[0]) ) {
287 # parameter not passed as named parameter?
292 if (not $seq->isa('Bio::PrimarySeqI')) {
293 $self->throw('Must supply a sequence object');
295 if (length $seq->seq == 0) {
296 $self->throw('Sequence had zero-length');
298 $self->{'_seq'} = $seq;
307 my %iupac = $self->iupac;
308 $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
309 $self->{'_string'} = [(0) x
length($self->{'_seq'}->seq())];
310 $self->{'_string'}->[0] = -1;
317 Usage : $iupac->next_seq();
318 Function: returns the next unique sequence object
320 Returns : a Bio::Seq object
327 if (not exists $self->{'_string'}) {
328 $self->_initialize();
331 for my $i ( 0 .. $#{$self->{'_string'}} ) {
332 next unless $self->{'_string'}->[$i] || @
{$self->{'_alpha'}->[$i]} > 1;
333 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
334 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
337 $self->{'_string'}->[$i] = 0;
341 $self->{'_string'}->[$i]++;
343 my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @
{$self->{'_string'}});
344 my $desc = $self->{'_seq'}->desc() || '';
346 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
347 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
348 $self->{'_num'} =~ s/,//g;
350 # Return a fresh sequence object
351 return Bio
::PrimarySeq
->new(-seq
=> $seqstr, -desc
=> $desc);
360 Usage : my %symbols = $iupac->iupac;
361 Function: Returns a hash of symbols -> symbol components of the right type
362 for the given sequence, i.e. it is the same as iupac_iup() if
363 Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
364 sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
372 my $alphabet = lc( $self->{'_seq'}->alphabet() );
373 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
374 return %IUB; # nucleic
375 } elsif ( $alphabet eq 'protein' ) {
376 return %IUP; # proteic
378 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
387 Usage : my %symbols = $iupac->iupac_amb;
388 Function: Same as iupac() but only contains a mapping between ambiguous residues
389 and the ambiguous residues they map to. For example, the key 'N' has
390 the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
391 i.e. it matches all other ambiguous residues.
399 my $alphabet = lc( $self->{'_seq'}->alphabet() );
400 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
401 return %IUB_AMB; # nucleic
402 } elsif ( $alphabet eq 'protein' ) {
403 return %IUP_AMB; # proteic
405 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
413 Usage : my %aasymbols = $iupac->iupac_iup;
414 Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
427 Title : iupac_iup_amb
428 Usage : my %aasymbols = $iupac->iupac_iup_amb;
429 Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
443 Usage : my %dnasymbols = $iupac->iupac_iub;
444 Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
457 Title : iupac_iub_amb
458 Usage : my %dnasymbols = $iupac->iupac_iub;
459 Function: Returns a hash of DNA symbols -> ambiguous symbol components
472 Title : iupac_rev_iub
473 Usage : my %dnasymbols = $iupac->iupac_rev_iub;
474 Function: Returns a hash of nucleotide combinations -> IUPAC code
475 (a reverse of the iupac_iub hash).
489 Usage : my $total = $iupac->count();
490 Function: Calculates the number of unique, unambiguous sequences that
491 this ambiguous sequence could generate
499 if (not exists $self->{'_string'}) {
500 $self->_initialize();
503 $count *= scalar(@
$_) for (@
{$self->{'_alpha'}});
511 Usage : my $re = $iupac->regexp();
512 Function: Converts the ambiguous sequence into a regular expression that
513 matches all of the corresponding ambiguous and non-ambiguous sequences.
514 You can further manipulate the resulting regular expression with the
515 Bio::Tools::SeqPattern module. After you are done building your
516 regular expression, you might want to compile it and make it case-
519 Args : 1 to match RNA: T and U characters will match interchangeably
520 Return : regular expression
525 my ($self, $match_rna) = @_;
527 my $seq = $self->{'_seq'}->seq;
528 my %iupac = $self->iupac;
529 my %iupac_amb = $self->iupac_amb;
530 for my $pos (0 .. length($seq)-1) {
531 my $res = substr $seq, $pos, 1;
532 my $iupacs = $iupac{$res};
533 my $iupacs_amb = $iupac_amb{$res} || [];
534 if (not defined $iupacs) {
535 $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
536 " Offending character was '$res'.\n");
538 my $part = join '', (@
$iupacs, @
$iupacs_amb);
540 $part =~ s/T/TU/i || $part =~ s/U/TU/i;
542 if (length $part > 1) {
543 $part = '['.$part.']';
553 my $method = $AUTOLOAD;
555 return $self->{'_seq'}->$method(@_)
556 unless $method eq 'DESTROY';