Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / Seq / SimulatedRead.pm
blob948749b366bc3b704b58536a2447489f2f56fd51
1 package Bio::Seq::SimulatedRead;
4 =head1 NAME
6 Bio::Seq::SimulatedRead - Read with sequencing errors taken from a reference sequence
8 =head1 SYNOPSIS
10 use Bio::Seq::SimulatedRead;
11 use Bio::PrimarySeq;
13 # Create a reference sequence
14 my $genome = Bio::PrimarySeq->new( -id => 'human_chr2',
15 -seq => 'TAAAAAAACCCCTG',
16 -desc => 'The human genome' );
18 # A 10-bp error-free read taken from a genome
19 my $read = Bio::Seq::SimulatedRead->new(
20 -reference => $genome , # sequence to generate the read from
21 -id => 'read001', # read ID
22 -start => 3 , # start of the read on the genome forward strand
23 -end => 12 , # end of the read on the genome forward strand
24 -strand => 1 , # genome strand that the read is on
27 # Display the sequence of the read
28 print $read->seq."\n";
30 # Add a tag or MID to the beginning of the read
31 $read->mid('ACGT');
33 # Add sequencing errors (error positions are 1-based and relative to the
34 # error-free MID-containing read)
35 my $errors = {};
36 $errors->{'8'}->{'+'} = 'AAA'; # insertion of AAA after residue 8
37 $errors->{'1'}->{'%'} = 'G'; # substitution of residue 1 by a G
38 $errors->{'4'}->{'-'} = undef; # deletion of residue 4
39 $read->errors($errors);
41 # Display the sequence of the read with errors
42 print $read->seq."\n";
44 # String representation of where the read came from and its errors
45 print $read->desc."\n";
47 =head1 DESCRIPTION
49 This object is a simulated read with sequencing errors. The user can provide a
50 reference sequence to take a read from, the position and orientation of the
51 read on the reference sequence, and the sequencing errors to generate.
53 The sequence of the read is automatically calculated based on this information.
54 By default, the description of the reads contain tracking information and will
55 look like this (Bioperl-style):
57 reference=human_chr2 start=3 end=12 strand=-1 mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
59 or Genbank-style:
61 reference=human_chr2 position=complement(3..12) mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
63 Creating a simulated read follows these steps:
64 1/ Define the read start(), end(), strand() and qual_levels() if you want
65 quality scores to be generated. Do not change these values once set because
66 the read will not be updated.
67 2/ Specify the reference sequence that the read should be taken from. Once
68 this is done, you have a fully functional read. Do not use the reference()
69 method again after you have gone to the next step.
70 3/ Use mid() to input a MID (or tag or barcode) to add to the beginning of the
71 read. You can change the MID until you go to next step.
72 4/ Give sequencing error specifications using errors() as the last step. You
73 can do that as many times as you like, and the read will be updated.
75 =head1 AUTHOR
77 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
79 Copyright (c) 2011 Florent E Angly.
81 This library is free software; you can redistribute it under the GNU General
82 Public License version 3.
84 =cut
87 use strict;
88 use warnings;
89 use Bio::LocatableSeq;
90 use base qw( Bio::Seq::Quality Bio::LocatableSeq );
93 =head2 new
95 Title : new
96 Function : Create a new simulated read object
97 Usage : my $read = Bio::Seq::SimulatedRead->new(
98 -id => 'read001',
99 -reference => $seq_obj ,
100 -errors => $errors ,
101 -start => 10 ,
102 -end => 135 ,
103 -strand => 1 ,
105 Arguments: -reference => Bio::SeqI, Bio::PrimarySeqI object representing the
106 reference sequence to take the read from. See
107 reference().
108 -errors => Hashref representing the position of errors in the read
109 See errors().
110 -mid => String of a MID to prepend to the read. See mid().
111 -track => Track where the read came from in the read description?
112 See track().
113 -coord_style => Define what coordinate system to use. See coord_style().
114 All other methods from Bio::LocatableSeq are available.
115 Returns : new Bio::Seq::SimulatedRead object
117 =cut
119 sub new {
120 my ($class, @args) = @_;
121 my $self = $class->SUPER::new(@args);
122 my ($qual_levels, $reference, $mid, $errors, $track, $coord_style) =
123 $self->_rearrange([qw(QUAL_LEVELS REFERENCE MID ERRORS TRACK COORD_STYLE)], @args);
124 $coord_style = defined $coord_style ? $coord_style : 'bioperl';
125 $self->coord_style($coord_style);
126 $track = defined $track ? $track : 1;
127 $self->track($track);
128 $qual_levels = defined $qual_levels ? $qual_levels : [];
129 $self->qual_levels($qual_levels) if defined $qual_levels;
130 $self->reference($reference) if defined $reference;
131 $self->mid($mid) if defined $mid;
132 $self->{_mutated} = 0;
133 $self->errors($errors) if defined $errors;
134 return $self;
138 =head2 qual_levels
140 Title : qual_levels
141 Function : Get or set the quality scores to give to the read. By default, if your
142 reference sequence does not have quality scores, no quality scores
143 are generated for the simulated read. The generated quality scores
144 are very basic. If a residue is error-free, it gets the quality score
145 defined for good residues. If the residue has an error (is an
146 addition or a mutation), the residue gets the quality score specified
147 for bad residues. Call the qual_levels() method before using the
148 reference() method.
149 Usage : my $qual_levels = $read->qual_levels( );
150 Arguments: Array reference containing the quality scores to use for:
151 1/ good residues (e.g. 30)
152 2/ bad residues (e.g. 10)
153 Returns : Array reference containing the quality scores to use.
155 =cut
157 sub qual_levels {
158 my ($self, $qual_levels) = @_;
159 if (defined $qual_levels) {
160 if ( (scalar @$qual_levels != 0) && (scalar @$qual_levels != 2) ) {
161 $self->throw("The quality score specification must define the score".
162 " to use for good and for bad residues\n");
164 $self->{qual_levels} = $qual_levels;
166 return $self->{qual_levels};
170 =head2 reference
172 Title : reference
173 Function : Get or set the reference sequence that the read comes from. Once the
174 reference has been set, you have a functional simulated read which
175 supports all the Bio::LocatableSeq methods. This method must be
176 called after qual_levels() but before mid() or errors().
177 Usage : my $seq_obj = $read->reference();
178 Arguments: Bio::SeqI or Bio::PrimarySeqI object
179 Returns : Bio::SeqI or Bio::PrimarySeqI object
181 =cut
183 sub reference {
184 my ($self, $reference) = @_;
185 if (defined $reference) {
186 # Sanity check 1
187 if ( (not $reference->isa('Bio::SeqI')) && (not $reference->isa('Bio::PrimarySeqI')) ) {
188 $self->throw("Expected a Bio::SeqI object as reference, but got: $reference\n");
190 # Sanity check 2
191 if ($self->{mid} || $self->{errors}) {
192 $self->throw("Cannot change the reference sequence after an MID or ".
193 "sequencing errors have been added to the read\n");
195 # Use beginning of reference sequence as start default
196 if (not defined $self->start) {
197 $self->start(1);
199 # Use end of reference sequence as end default
200 if (not defined $self->end) {
201 $self->end($reference->length);
203 # Use strand 1 as strand default
204 if (not defined $self->strand) {
205 $self->strand(1);
207 # Set the reference sequence object
208 $self->{reference} = $reference;
209 # Create a sequence, quality scores and description from the reference
210 $self->_create_seq;
211 $self->_create_qual if scalar @{$self->qual_levels};
212 $self->_create_desc if $self->track;
214 return $self->{reference};
218 sub _create_seq {
219 my $self = shift;
220 # Get a truncation of the reference sequence
221 my $reference = $self->reference;
222 my $read_obj = $reference->trunc( $self->start, $self->end );
223 # Reverse complement the read if needed
224 if ($self->strand == -1) {
225 $read_obj = $read_obj->revcom();
227 $self->seq($read_obj->seq);
228 return 1;
232 sub _create_qual {
233 my $self = shift;
234 $self->qual([ ($self->qual_levels->[0]) x ($self->end - $self->start + 1) ]);
235 return 1;
239 sub _create_desc {
240 # Create the read description of the error-free read
241 my $self = shift;
242 # Reference sequence ID
243 my $desc_str = '';
244 my $ref_id = $self->reference->id;
245 if (defined $ref_id) {
246 $desc_str .= 'reference='.$ref_id.' ';
248 # Position of read on reference sequence: start, end and strand
249 my $strand = $self->strand;
250 if ($self->coord_style eq 'bioperl') {
251 $desc_str .= 'start='.$self->start.' end='.$self->end.' ';
252 if (defined $strand) {
253 # Strand of the reference sequence that the read is on
254 $strand = '+1' if $strand == 1;
255 $desc_str .= 'strand='.$strand.' ';
257 } else {
258 if ( (defined $strand) && ($strand == -1) ) {
259 # Reverse complemented
260 $desc_str .= 'position=complement('.$self->start.'..'.$self->end.') ';
261 } else {
262 # Regular (forward) orientation
263 $desc_str .= 'position='.$self->start.'..'.$self->end.' ';
266 # Description of the original sequence
267 my $ref_desc = $self->reference->desc;
268 if ( (defined $self->reference->desc) && ($self->reference->desc !~ m/^\s*$/) ) {
269 $ref_desc =~ s/"/\\"/g; # escape double-quotes to \"
270 $desc_str .= 'description="'.$ref_desc.'" ';
272 $desc_str =~ s/\s$//g;
273 # Record new description
274 $self->desc($desc_str);
275 return 1;
279 =head2 mid
281 Title : mid
282 Function : Get or set a multiplex identifier (or MID, or tag, or barcode) to
283 add to the read. By default, no MID is used. This method must be
284 called after reference() but before errors().
285 Usage : my $mid = read->mid();
286 Arguments: MID sequence string (e.g. 'ACGT')
287 Returns : MID sequence string
289 =cut
291 sub mid {
292 my ($self, $mid) = @_;
293 if (defined $mid) {
294 # Sanity check 1
295 if (not defined $self->reference) {
296 $self->throw("Cannot add MID because the reference sequence was not ".
297 "set\n");
299 # Sanity check 2
300 if ($self->{errors}) {
301 $self->throw("Cannot add an MID after sequencing errors have been ".
302 "introduced in the read\n");
304 # Sanity check 3
305 if (not $self->validate_seq($mid)) {
306 $self->throw("MID is not a valid DNA sequence\n");
308 # Update sequence, quality scores and description with the MID
309 $self->_update_seq_mid($mid);
310 $self->_update_qual_mid($mid) if scalar @{$self->qual_levels};
311 $self->_update_desc_mid($mid) if $self->track;
312 # Set the MID value
313 $self->{mid} = $mid;
315 return $self->{mid}
319 sub _update_seq_mid {
320 # Update the MID of a sequence
321 my ($self, $mid) = @_;
322 # Remove old MID
323 my $seq = $self->seq;
324 my $old_mid = $self->{mid};
325 if (defined $old_mid) {
326 $seq =~ s/^$old_mid//;
328 # Add new MID
329 $seq = $mid . $seq;
330 $self->seq( $seq );
331 return 1;
335 sub _update_qual_mid {
336 # Update the MID of a quality scores
337 my ($self, $mid) = @_;
338 # Remove old MID
339 my $qual = $self->qual;
340 my $old_mid = $self->{mid};
341 if (defined $old_mid) {
342 splice @$qual, 0, length($old_mid);
344 $qual = [($self->qual_levels->[0]) x length($mid), @$qual];
345 $self->qual( $qual );
346 return 1;
350 sub _update_desc_mid {
351 # Update MID specifications in the read description
352 my ($self, $mid) = @_;
353 if ($mid) {
354 # Sequencing errors introduced in the read
355 my $mid_str = "mid=".$mid;
356 my $desc_str = $self->desc;
357 $desc_str =~ s/((position|strand)=\S+)( mid=\S+)?/$1 $mid_str/g;
358 $self->desc( $desc_str );
360 return 1;
364 =head2 errors
366 Title : errors
367 Function : Get or set the sequencing errors and update the read. By default, no
368 errors are made. This method must be called after the mid() method.
369 Usage : my $errors = $read->errors();
370 Arguments: Reference to a hash of the position and nature of sequencing errors.
371 The positions are 1-based relative to the error-free MID-containing
372 read (not relative to the reference sequence). For example:
373 $errors->{34}->{'%'} = 'T' ; # substitution of residue 34 by a T
374 $errors->{23}->{'+'} = 'GG' ; # insertion of GG after residue 23
375 $errors->{45}->{'-'} = undef; # deletion of residue 45
376 Substitutions and deletions are for a single residue, but additions
377 can be additions of several residues.
378 An alternative way to specify errors is by using array references
379 instead of scalar for the hash values. This allows one to specify
380 redundant mutations. For example, the case presented above would
381 result in the same read sequence as the example below:
382 $errors->{34}->{'%'} = ['C', 'T'] ; # substitution by a C and then a T
383 $errors->{23}->{'+'} = ['G', 'G'] ; # insertion of G and then a G
384 $errors->{45}->{'-'} = [undef, undef]; # deletion of residue, and again
385 Returns : Reference to a hash of the position and nature of sequencing errors.
387 =cut
389 sub errors {
390 my ($self, $errors) = @_;
391 if (defined $errors) {
392 # Verify that we have a hashref
393 if ( (not defined ref $errors) || (not ref $errors eq 'HASH') ) {
394 $self->throw("Error specification has to be a hashref. Got: $errors\n");
396 # Verify that we have a reference sequence
397 if (not defined $self->reference) {
398 $self->throw("Cannot add errors because the reference sequence was not set\n");
400 # Convert scalar error specs to arrayref specs
401 $errors = $self->_scalar_to_arrayref($errors);
402 # Check validity of error specifications
403 $errors = $self->_validate_error_specs($errors);
404 # Set the error specifications
405 $self->{errors} = $errors;
406 # Need to recalculate the read from the reference if previously mutated
407 if ($self->{_mutated}) {
408 $self->_create_seq;
409 $self->_create_qual if scalar @{$self->qual_levels};
410 $self->_create_desc if $self->track;
412 # Now mutate the read, quality score and description
413 $self->_update_seq_errors;
414 $self->_update_qual_errors if scalar @{$self->qual_levels};
415 $self->_update_desc_errors if $self->track;
418 return $self->{errors};
422 sub _scalar_to_arrayref {
423 # Replace the scalar values in the error specs by more versatile arrayrefs
424 my ($self, $errors) = @_;
425 while ( my ($pos, $ops) = each %$errors ) {
426 while ( my ($op, $res) = each %$ops ) {
427 if (ref $res eq '') {
428 my $arr = [ split //, ($res || '') ];
429 $arr = [undef] if scalar @$arr == 0;
430 $$errors{$pos}{$op} = $arr;
434 return $errors;
438 sub _validate_error_specs {
439 # Clean error specifications and warn of any issues encountered
440 my ($self, $errors) = @_;
441 my %valid_ops = ('%' => undef, '-' => undef, '+' => undef); # valid operations
443 # Calculate read length
444 my $read_length = $self->length;
445 while ( my ($pos, $ops) = each %$errors ) {
447 # Position cannot be no longer than the read length
448 if ( (defined $read_length) && ($pos > $read_length) ) {
449 $self->warn("Position $pos is beyond end of read ($read_length ".
450 "residues). Skipping errors specified at this position.\n");
451 delete $errors->{$pos};
454 # Position has to be 0+ for addition, 1+ for substitution and deletion
455 if ( $pos < 1 && (exists $ops->{'%'} || exists $ops->{'-'}) ) {
456 $self->warn("Positions of substitutions and deletions have to be ".
457 "strictly positive but got $pos. Skipping substitution or deletion".
458 " at this position\n");
459 delete $ops->{'%'};
460 delete $ops->{'-'};
462 if ( $pos < 0 && exists $ops->{'+'}) {
463 $self->warn("Positions of additions have to be zero or more. ".
464 "Skipping addition at position $pos.\n");
465 delete $ops->{'+'};
468 # Valid operations are '%', '+' and '-'
469 while ( my ($op, $res) = each %$ops ) {
470 if (not exists $valid_ops{$op}) {
471 $self->warn("Skipping unknown error operation '$op' at position".
472 " $pos\n");
473 delete $ops->{$op};
474 } else {
475 # Substitutions: have to have at least one residue to substitute
476 if ( ($op eq '%') && (scalar @$res < 1) ) {
477 $self->warn("At least one residue must be provided for substitutions,".
478 "but got ".scalar(@$res)." at position $pos.\n");
480 # Additions: have to have at least one residue to add
481 if ( ($op eq '+') && (scalar @$res < 1) ) {
482 $self->warn("At least one residue must be provided for additions,".
483 "but got ".scalar(@$res)." at position $pos.\n");
485 # Deletions
486 if ( ($op eq '-') && (scalar @$res < 1) ) {
487 $self->warn("At least one 'undef' must be provided for deletions,".
488 "but got ".scalar(@$res)." at position $pos.\n");
493 delete $errors->{$pos} unless scalar keys %$ops;
496 return $errors;
500 sub _update_seq_errors {
501 my $self = shift;
502 my $seq_str = $self->seq;
503 my $errors = $self->errors;
504 if (scalar keys %$errors > 0) {
505 my $off = 0;
506 for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
507 # Process sequencing errors at that position
508 for my $type ( '%', '-', '+' ) {
509 next if not exists $$errors{$pos}{$type};
510 my $arr = $$errors{$pos}{$type};
511 if ($type eq '%') {
512 # Substitution at residue position. If there are multiple
513 # substitutions to do, directly skip to the last one.
514 substr $seq_str, $pos - 1 + $off, 1, $$arr[-1];
515 } elsif ($type eq '-') {
516 # Deletion at residue position
517 substr $seq_str, $pos - 1 + $off, 1, '';
518 $off--;
519 } elsif ($type eq '+') {
520 # Insertion after residue position
521 substr $seq_str, $pos + $off, 0, join('', @$arr);
522 $off += scalar @$arr;
526 $self->{_mutated} = 1;
527 } else {
528 $self->{_mutated} = 0;
530 $self->seq($seq_str);
531 return 1;
535 sub _update_qual_errors {
536 my $self = shift;
537 my $qual = $self->qual;
538 my $errors = $self->errors;
539 my $bad_qual = $self->qual_levels->[1];
540 if (scalar keys %$errors > 0) {
541 my $off = 0;
542 for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
543 # Process sequencing errors at that position
544 for my $type ( '%', '-', '+' ) {
545 next if not exists $$errors{$pos}{$type};
546 my $arr = $$errors{$pos}{$type};
547 if ($type eq '%') {
548 # Substitution at residue position
549 splice @$qual, $pos - 1 + $off, 1, $bad_qual;
550 } elsif ($type eq '-') {
551 # Deletion at residue position
552 splice @$qual, $pos - 1 + $off, 1;
553 $off--;
554 } elsif ($type eq '+') {
555 # Insertion after residue position
556 splice @$qual, $pos + $off, 0, ($bad_qual) x scalar(@$arr);
557 $off += scalar @$arr;
562 $self->qual($qual);
563 return 1;
567 sub _update_desc_errors {
568 # Add or update error specifications in the read description
569 my $self = shift;
570 my $errors = $self->errors;
571 if (defined $errors and scalar keys %$errors > 0) {
572 # Sequencing errors introduced in the read
573 my $err_str = 'errors=';
574 for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
575 # Process sequencing errors at that position
576 for my $type ( '%', '-', '+' ) {
577 next if not exists $$errors{$pos}{$type};
578 for my $val ( @{$$errors{$pos}{$type}} ) {
579 $val = '' if not defined $val;
580 $err_str .= $pos . $type . $val . ',';
584 $err_str =~ s/,$//;
585 my $desc_str = $self->desc;
586 $desc_str =~ s/((position|strand)=\S+( mid=\S+)?)( errors=\S+)?/$1 $err_str/g;
587 $self->desc( $desc_str );
589 return 1;
593 =head2 track
595 Title : track
596 Function : Get or set the tracking status in the read description. By default,
597 tracking is on. This method can be called at any time.
598 Usage : my $track = $read->track();
599 Arguments: 1 for tracking, 0 otherwise
600 Returns : 1 for tracking, 0 otherwise
602 =cut
604 sub track {
605 my ($self, $track) = @_;
606 if (defined $track) {
607 if (defined $self->reference) {
608 if ($track == 1) {
609 $self->_create_desc;
610 $self->_update_desc_mid($self->mid);
611 $self->_update_desc_errors;
612 } else {
613 $self->desc(undef);
616 $self->{track} = $track;
618 return $self->{track};
622 =head2 coord_style
624 Title : coord_style
625 Function : When tracking is on, define which 1-based coordinate system to use
626 in the read description:
627 * 'bioperl' uses the start, end and strand keywords (default),
628 similarly to the GFF3 format. Example:
629 start=1 end=10 strand=+1
630 start=1 end=10 strand=-1
631 * 'genbank' does only provide the position keyword. Example:
632 position=1..10
633 position=complement(1..10)
634 Usage : my $coord_style = $read->track();
635 Arguments: 'bioperl' or 'genbank'
636 Returns : 'bioperl' or 'genbank'
638 =cut
640 sub coord_style {
641 my ($self, $coord_style) = @_;
642 my %styles = ( 'bioperl' => undef, 'genbank' => undef );
643 if (defined $coord_style) {
644 if (not exists $styles{$coord_style}) {
645 die "Error: Invalid coordinate style '$coord_style'\n";
647 $self->{coord_style} = $coord_style;
649 return $self->{coord_style};