Bio::Tools::CodonTable and Bio::Tools::IUPAC: prepare with dzil.
[bioperl-live.git] / lib / Bio / Tools / IUPAC.pm
blob7116bbf12e28ee208b505f6abe07e7feb3ffd22e
1 package Bio::Tools::IUPAC;
3 use utf8;
4 use strict;
5 use warnings;
7 use base qw(Bio::Root::Root);
9 # ABSTRACT: Generates unique sequence objects or regular expressions from an ambiguous IUPAC sequence
10 # AUTHOR: Aaron Mackey <amackey@virginia.edu>
11 # OWNER: Aaron Mackey <amackey@virginia.edu>
12 # LICENSE: Perl_5
14 =head1 SYNOPSIS
16 use Bio::PrimarySeq;
17 use Bio::Tools::IUPAC;
19 # Get the IUPAC code for proteins
20 my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
22 # Create a sequence with degenerate residues
23 my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
25 # Create all possible non-degenerate sequences
26 my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
27 while ($uniqueseq = $iupac->next_seq()) {
28 # process the unique Bio::Seq object.
31 # Get a regular expression that matches all possible sequences
32 my $regexp = $iupac->regexp();
34 =head1 DESCRIPTION
36 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
37 following the IUPAC conventions. Non-standard characters have the meaning
38 described below:
40 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
41 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
43 ---------------------------------------------------------------
44 Symbol Meaning Nucleic Acid
45 ---------------------------------------------------------------
46 A A Adenine
47 C C Cytosine
48 G G Guanine
49 T T Thymine
50 U U Uracil
51 M A or C aMino
52 R A or G puRine
53 W A or T Weak
54 S C or G Strong
55 Y C or T pYrimidine
56 K G or T Keto
57 V A or C or G not T (closest unused char after T)
58 H A or C or T not G (closest unused char after G)
59 D A or G or T not C (closest unused char after C)
60 B C or G or T not A (closest unused char after A)
61 X G or A or T or C Unknown (very rarely used)
62 N G or A or T or C Unknown (commonly used)
65 IUPAC-IUP AMINO ACID SYMBOLS:
66 Biochem J. 1984 Apr 15; 219(2): 345-373
67 Eur J Biochem. 1993 Apr 1; 213(1): 2
69 ------------------------------------------
70 Symbol Meaning
71 ------------------------------------------
72 A Alanine
73 B Aspartic Acid, Asparagine
74 C Cysteine
75 D Aspartic Acid
76 E Glutamic Acid
77 F Phenylalanine
78 G Glycine
79 H Histidine
80 I Isoleucine
81 J Isoleucine/Leucine
82 K Lysine
83 L Leucine
84 M Methionine
85 N Asparagine
86 O Pyrrolysine
87 P Proline
88 Q Glutamine
89 R Arginine
90 S Serine
91 T Threonine
92 U Selenocysteine
93 V Valine
94 W Tryptophan
95 X Unknown
96 Y Tyrosine
97 Z Glutamic Acid, Glutamine
98 * Terminator
100 There are a few things Bio::Tools::IUPAC can do for you:
102 =over
104 =item *
106 report the IUPAC mapping between ambiguous and non-ambiguous residues
108 =item *
110 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
111 an ambiguous sequence object
113 =item *
115 convert an ambiguous sequence object to a corresponding regular expression
117 =back
119 =cut
122 # Ambiguous nucleic residues are matched to unambiguous residues
123 our %IUB = (
124 A => [qw(A)],
125 C => [qw(C)],
126 G => [qw(G)],
127 T => [qw(T)],
128 U => [qw(U)],
129 M => [qw(A C)],
130 R => [qw(A G)],
131 S => [qw(C G)],
132 W => [qw(A T)],
133 Y => [qw(C T)],
134 K => [qw(G T)],
135 V => [qw(A C G)],
136 H => [qw(A C T)],
137 D => [qw(A G T)],
138 B => [qw(C G T)],
139 N => [qw(A C G T)],
140 X => [qw(A C G T)],
143 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
144 our %IUB_AMB = (
145 M => [qw(M)],
146 R => [qw(R)],
147 W => [qw(W)],
148 S => [qw(S)],
149 Y => [qw(Y)],
150 K => [qw(K)],
151 V => [qw(M R S V)],
152 H => [qw(H M W Y)],
153 D => [qw(D K R W)],
154 B => [qw(B K S Y)],
155 N => [qw(B D H K M N R S V W Y)],
158 # The inverse of %IUB
159 our %REV_IUB = (
160 A => 'A',
161 T => 'T',
162 U => 'U',
163 C => 'C',
164 G => 'G',
165 AC => 'M',
166 AG => 'R',
167 AT => 'W',
168 CG => 'S',
169 CT => 'Y',
170 GT => 'K',
171 ACG => 'V',
172 ACT => 'H',
173 AGT => 'D',
174 CGT => 'B',
175 ACGT => 'N',
176 N => 'N'
179 # Same thing with proteins now
180 our %IUP = (
181 A => [qw(A)],
182 B => [qw(D N)],
183 C => [qw(C)],
184 D => [qw(D)],
185 E => [qw(E)],
186 F => [qw(F)],
187 G => [qw(G)],
188 H => [qw(H)],
189 I => [qw(I)],
190 J => [qw(I L)],
191 K => [qw(K)],
192 L => [qw(L)],
193 M => [qw(M)],
194 N => [qw(N)],
195 O => [qw(O)],
196 P => [qw(P)],
197 Q => [qw(Q)],
198 R => [qw(R)],
199 S => [qw(S)],
200 T => [qw(T)],
201 U => [qw(U)],
202 V => [qw(V)],
203 W => [qw(W)],
204 X => [qw(X)],
205 Y => [qw(Y)],
206 Z => [qw(E Q)],
207 '*' => [qw(*)],
210 our %IUP_AMB = (
211 B => [qw(B)],
212 J => [qw(J)],
213 Z => [qw(Z)],
217 =head2 new
219 Title : new
220 Usage : Bio::Tools::IUPAC->new($seq);
221 Function: Create a new IUPAC object, which acts as a sequence stream (akin to
222 SeqIO)
223 Args : an ambiguously coded sequence object that has a specified 'alphabet'
224 Returns : a Bio::Tools::IUPAC object.
226 =cut
228 sub new {
229 my ($class,@args) = @_;
230 my $self = $class->SUPER::new(@args);
231 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
233 if ( (not defined $seq) && @args && ref($args[0]) ) {
234 # parameter not passed as named parameter?
235 $seq = $args[0];
238 if (defined $seq) {
239 if (not $seq->isa('Bio::PrimarySeqI')) {
240 $self->throw('Must supply a sequence object');
242 if (length $seq->seq == 0) {
243 $self->throw('Sequence had zero-length');
245 $self->{'_seq'} = $seq;
248 return $self;
252 sub _initialize {
253 my ($self) = @_;
254 my %iupac = $self->iupac;
255 $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
256 $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
257 $self->{'_string'}->[0] = -1;
261 =head2 next_seq
263 Title : next_seq
264 Usage : $iupac->next_seq();
265 Function: returns the next unique sequence object
266 Args : none.
267 Returns : a Bio::Seq object
269 =cut
271 sub next_seq {
272 my ($self) = @_;
274 if (not exists $self->{'_string'}) {
275 $self->_initialize();
278 for my $i ( 0 .. $#{$self->{'_string'}} ) {
279 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
280 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
281 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
282 return;
283 } else {
284 $self->{'_string'}->[$i] = 0;
285 next;
287 } else {
288 $self->{'_string'}->[$i]++;
289 my $j = -1;
290 my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
291 my $desc = $self->{'_seq'}->desc() || '';
292 $self->{'_num'}++;
293 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
294 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
295 $self->{'_num'} =~ s/,//g;
297 # Return a fresh sequence object
298 return Bio::PrimarySeq->new(-seq => $seqstr, -desc => $desc);
304 =head2 iupac
306 Title : iupac
307 Usage : my %symbols = $iupac->iupac;
308 Function: Returns a hash of symbols -> symbol components of the right type
309 for the given sequence, i.e. it is the same as iupac_iup() if
310 Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
311 sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
312 Args : none
313 Returns : Hash
315 =cut
317 sub iupac {
318 my ($self) = @_;
319 my $alphabet = lc( $self->{'_seq'}->alphabet() );
320 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
321 return %IUB; # nucleic
322 } elsif ( $alphabet eq 'protein' ) {
323 return %IUP; # proteic
324 } else {
325 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
331 =head2 iupac_amb
333 Title : iupac_amb
334 Usage : my %symbols = $iupac->iupac_amb;
335 Function: Same as iupac() but only contains a mapping between ambiguous residues
336 and the ambiguous residues they map to. For example, the key 'N' has
337 the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
338 i.e. it matches all other ambiguous residues.
339 Args : none
340 Returns : Hash
342 =cut
344 sub iupac_amb {
345 my ($self) = @_;
346 my $alphabet = lc( $self->{'_seq'}->alphabet() );
347 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
348 return %IUB_AMB; # nucleic
349 } elsif ( $alphabet eq 'protein' ) {
350 return %IUP_AMB; # proteic
351 } else {
352 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
357 =head2 iupac_iup
359 Title : iupac_iup
360 Usage : my %aasymbols = $iupac->iupac_iup;
361 Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
362 Args : none
363 Returns : Hash
365 =cut
367 sub iupac_iup {
368 return %IUP;
372 =head2 iupac_iup_amb
374 Title : iupac_iup_amb
375 Usage : my %aasymbols = $iupac->iupac_iup_amb;
376 Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
377 Args : none
378 Returns : Hash
380 =cut
382 sub iupac_iup_amb {
383 return %IUP_AMB;
387 =head2 iupac_iub
389 Title : iupac_iub
390 Usage : my %dnasymbols = $iupac->iupac_iub;
391 Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
392 Args : none
393 Returns : Hash
395 =cut
397 sub iupac_iub {
398 return %IUB;
402 =head2 iupac_iub_amb
404 Title : iupac_iub_amb
405 Usage : my %dnasymbols = $iupac->iupac_iub;
406 Function: Returns a hash of DNA symbols -> ambiguous symbol components
407 Args : none
408 Returns : Hash
410 =cut
412 sub iupac_iub_amb {
413 return %IUB_AMB;
417 =head2 iupac_rev_iub
419 Title : iupac_rev_iub
420 Usage : my %dnasymbols = $iupac->iupac_rev_iub;
421 Function: Returns a hash of nucleotide combinations -> IUPAC code
422 (a reverse of the iupac_iub hash).
423 Args : none
424 Returns : Hash
426 =cut
428 sub iupac_rev_iub {
429 return %REV_IUB;
433 =head2 count
435 Title : count
436 Usage : my $total = $iupac->count();
437 Function: Calculates the number of unique, unambiguous sequences that
438 this ambiguous sequence could generate
439 Args : none
440 Return : int
442 =cut
444 sub count {
445 my ($self) = @_;
446 if (not exists $self->{'_string'}) {
447 $self->_initialize();
449 my $count = 1;
450 $count *= scalar(@$_) for (@{$self->{'_alpha'}});
451 return $count;
455 =head2 regexp
457 Title : regexp
458 Usage : my $re = $iupac->regexp();
459 Function: Converts the ambiguous sequence into a regular expression that
460 matches all of the corresponding ambiguous and non-ambiguous sequences.
461 You can further manipulate the resulting regular expression with the
462 Bio::Tools::SeqPattern module. After you are done building your
463 regular expression, you might want to compile it and make it case-
464 insensitive:
465 $re = qr/$re/i;
466 Args : 1 to match RNA: T and U characters will match interchangeably
467 Return : regular expression
469 =cut
471 sub regexp {
472 my ($self, $match_rna) = @_;
473 my $re;
474 my $seq = $self->{'_seq'}->seq;
475 my %iupac = $self->iupac;
476 my %iupac_amb = $self->iupac_amb;
477 for my $pos (0 .. length($seq)-1) {
478 my $res = substr $seq, $pos, 1;
479 my $iupacs = $iupac{$res};
480 my $iupacs_amb = $iupac_amb{$res} || [];
481 if (not defined $iupacs) {
482 $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
483 " Offending character was '$res'.\n");
485 my $part = join '', (@$iupacs, @$iupacs_amb);
486 if ($match_rna) {
487 $part =~ s/T/TU/i || $part =~ s/U/TU/i;
489 if (length $part > 1) {
490 $part = '['.$part.']';
492 $re .= $part;
494 return $re;
498 sub AUTOLOAD {
499 my $self = shift @_;
500 our $AUTOLOAD;
501 my $method = $AUTOLOAD;
502 $method =~ s/.*:://;
503 return $self->{'_seq'}->$method(@_)
504 unless $method eq 'DESTROY';