1 package Bio
::Seq
::SimulatedRead
;
6 Bio::Seq::SimulatedRead - Read with sequencing errors taken from a reference sequence
10 use Bio::Seq::SimulatedRead;
13 # Create a reference sequence
14 my $genome = Bio::PrimarySeq->new( -id => 'human_chr2',
15 -seq => 'TAAAAAAACCCCTG',
16 -desc => 'The human genome' );
18 # A 10-bp error-free read taken from a genome
19 my $read = Bio::Seq::SimulatedRead->new(
20 -reference => $genome , # sequence to generate the read from
21 -id => 'read001', # read ID
22 -start => 3 , # start of the read on the genome forward strand
23 -end => 12 , # end of the read on the genome forward strand
24 -strand => 1 , # genome strand that the read is on
27 # Display the sequence of the read
28 print $read->seq."\n";
30 # Add a tag or MID to the beginning of the read
33 # Add sequencing errors (error positions are 1-based and relative to the
34 # error-free MID-containing read)
36 $errors->{'8'}->{'+'} = 'AAA'; # insertion of AAA after residue 8
37 $errors->{'1'}->{'%'} = 'G'; # substitution of residue 1 by a G
38 $errors->{'4'}->{'-'} = undef; # deletion of residue 4
39 $read->errors($errors);
41 # Display the sequence of the read with errors
42 print $read->seq."\n";
44 # String representation of where the read came from and its errors
45 print $read->desc."\n";
49 This object is a simulated read with sequencing errors. The user can provide a
50 reference sequence to take a read from, the position and orientation of the
51 read on the reference sequence, and the sequencing errors to generate.
53 The sequence of the read is automatically calculated based on this information.
54 By default, the description of the reads contain tracking information and will
55 look like this (Bioperl-style):
57 reference=human_chr2 start=3 end=12 strand=-1 mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
61 reference=human_chr2 position=complement(3..12) mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
63 Creating a simulated read follows these steps:
64 1/ Define the read start(), end(), strand() and qual_levels() if you want
65 quality scores to be generated. Do not change these values once set because
66 the read will not be updated.
67 2/ Specify the reference sequence that the read should be taken from. Once
68 this is done, you have a fully functional read. Do not use the reference()
69 method again after you have gone to the next step.
70 3/ Use mid() to input a MID (or tag or barcode) to add to the beginning of the
71 read. You can change the MID until you go to next step.
72 4/ Give sequencing error specifications using errors() as the last step. You
73 can do that as many times as you like, and the read will be updated.
77 Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
79 Copyright (c) 2011 Florent E Angly.
81 This library is free software; you can redistribute it under the GNU General
82 Public License version 3.
89 use Bio
::LocatableSeq
;
90 use base
qw( Bio::Seq::Quality Bio::LocatableSeq );
96 Function : Create a new simulated read object
97 Usage : my $read = Bio::Seq::SimulatedRead->new(
99 -reference => $seq_obj ,
105 Arguments: -reference => Bio::SeqI, Bio::PrimarySeqI object representing the
106 reference sequence to take the read from. See
108 -errors => Hashref representing the position of errors in the read
110 -mid => String of a MID to prepend to the read. See mid().
111 -track => Track where the read came from in the read description?
113 -coord_style => Define what coordinate system to use. See coord_style().
114 All other methods from Bio::LocatableSeq are available.
115 Returns : new Bio::Seq::SimulatedRead object
120 my ($class, @args) = @_;
121 my $self = $class->SUPER::new
(@args);
122 my ($qual_levels, $reference, $mid, $errors, $track, $coord_style) =
123 $self->_rearrange([qw(QUAL_LEVELS REFERENCE MID ERRORS TRACK COORD_STYLE)], @args);
124 $coord_style = defined $coord_style ?
$coord_style : 'bioperl';
125 $self->coord_style($coord_style);
126 $track = defined $track ?
$track : 1;
127 $self->track($track);
128 $qual_levels = defined $qual_levels ?
$qual_levels : [];
129 $self->qual_levels($qual_levels) if defined $qual_levels;
130 $self->reference($reference) if defined $reference;
131 $self->mid($mid) if defined $mid;
132 $self->{_mutated
} = 0;
133 $self->errors($errors) if defined $errors;
141 Function : Get or set the quality scores to give to the read. By default, if your
142 reference sequence does not have quality scores, no quality scores
143 are generated for the simulated read. The generated quality scores
144 are very basic. If a residue is error-free, it gets the quality score
145 defined for good residues. If the residue has an error (is an
146 addition or a mutation), the residue gets the quality score specified
147 for bad residues. Call the qual_levels() method before using the
149 Usage : my $qual_levels = $read->qual_levels( );
150 Arguments: Array reference containing the quality scores to use for:
151 1/ good residues (e.g. 30)
152 2/ bad residues (e.g. 10)
153 Returns : Array reference containing the quality scores to use.
158 my ($self, $qual_levels) = @_;
159 if (defined $qual_levels) {
160 if ( (scalar @
$qual_levels != 0) && (scalar @
$qual_levels != 2) ) {
161 $self->throw("The quality score specification must define the score".
162 " to use for good and for bad residues\n");
164 $self->{qual_levels
} = $qual_levels;
166 return $self->{qual_levels
};
173 Function : Get or set the reference sequence that the read comes from. Once the
174 reference has been set, you have a functional simulated read which
175 supports all the Bio::LocatableSeq methods. This method must be
176 called after qual_levels() but before mid() or errors().
177 Usage : my $seq_obj = $read->reference();
178 Arguments: Bio::SeqI or Bio::PrimarySeqI object
179 Returns : Bio::SeqI or Bio::PrimarySeqI object
184 my ($self, $reference) = @_;
185 if (defined $reference) {
187 if ( (not $reference->isa('Bio::SeqI')) && (not $reference->isa('Bio::PrimarySeqI')) ) {
188 $self->throw("Expected a Bio::SeqI object as reference, but got: $reference\n");
191 if ($self->{mid
} || $self->{errors
}) {
192 $self->throw("Cannot change the reference sequence after an MID or ".
193 "sequencing errors have been added to the read\n");
195 # Use beginning of reference sequence as start default
196 if (not defined $self->start) {
199 # Use end of reference sequence as end default
200 if (not defined $self->end) {
201 $self->end($reference->length);
203 # Use strand 1 as strand default
204 if (not defined $self->strand) {
207 # Set the reference sequence object
208 $self->{reference
} = $reference;
209 # Create a sequence, quality scores and description from the reference
211 $self->_create_qual if scalar @
{$self->qual_levels};
212 $self->_create_desc if $self->track;
214 return $self->{reference
};
220 # Get a truncation of the reference sequence
221 my $reference = $self->reference;
222 my $read_obj = $reference->trunc( $self->start, $self->end );
223 # Reverse complement the read if needed
224 if ($self->strand == -1) {
225 $read_obj = $read_obj->revcom();
227 $self->seq($read_obj->seq);
234 $self->qual([ ($self->qual_levels->[0]) x
($self->end - $self->start + 1) ]);
240 # Create the read description of the error-free read
242 # Reference sequence ID
244 my $ref_id = $self->reference->id;
245 if (defined $ref_id) {
246 $desc_str .= 'reference='.$ref_id.' ';
248 # Position of read on reference sequence: start, end and strand
249 my $strand = $self->strand;
250 if ($self->coord_style eq 'bioperl') {
251 $desc_str .= 'start='.$self->start.' end='.$self->end.' ';
252 if (defined $strand) {
253 # Strand of the reference sequence that the read is on
254 $strand = '+1' if $strand == 1;
255 $desc_str .= 'strand='.$strand.' ';
258 if ( (defined $strand) && ($strand == -1) ) {
259 # Reverse complemented
260 $desc_str .= 'position=complement('.$self->start.'..'.$self->end.') ';
262 # Regular (forward) orientation
263 $desc_str .= 'position='.$self->start.'..'.$self->end.' ';
266 # Description of the original sequence
267 my $ref_desc = $self->reference->desc;
268 if ( (defined $self->reference->desc) && ($self->reference->desc !~ m/^\s*$/) ) {
269 $ref_desc =~ s/"/\\"/g; # escape double-quotes to \"
270 $desc_str .= 'description="'.$ref_desc.'" ';
272 $desc_str =~ s/\s$//g;
273 # Record new description
274 $self->desc($desc_str);
282 Function : Get or set a multiplex identifier (or MID, or tag, or barcode) to
283 add to the read. By default, no MID is used. This method must be
284 called after reference() but before errors().
285 Usage : my $mid = read->mid();
286 Arguments: MID sequence string (e.g. 'ACGT')
287 Returns : MID sequence string
292 my ($self, $mid) = @_;
295 if (not defined $self->reference) {
296 $self->throw("Cannot add MID because the reference sequence was not ".
300 if ($self->{errors
}) {
301 $self->throw("Cannot add an MID after sequencing errors have been ".
302 "introduced in the read\n");
305 if (not $self->validate_seq($mid)) {
306 $self->throw("MID is not a valid DNA sequence\n");
308 # Update sequence, quality scores and description with the MID
309 $self->_update_seq_mid($mid);
310 $self->_update_qual_mid($mid) if scalar @
{$self->qual_levels};
311 $self->_update_desc_mid($mid) if $self->track;
319 sub _update_seq_mid
{
320 # Update the MID of a sequence
321 my ($self, $mid) = @_;
323 my $seq = $self->seq;
324 my $old_mid = $self->{mid
};
325 if (defined $old_mid) {
326 $seq =~ s/^$old_mid//;
335 sub _update_qual_mid
{
336 # Update the MID of a quality scores
337 my ($self, $mid) = @_;
339 my $qual = $self->qual;
340 my $old_mid = $self->{mid
};
341 if (defined $old_mid) {
342 splice @
$qual, 0, length($old_mid);
344 $qual = [($self->qual_levels->[0]) x
length($mid), @
$qual];
345 $self->qual( $qual );
350 sub _update_desc_mid
{
351 # Update MID specifications in the read description
352 my ($self, $mid) = @_;
354 # Sequencing errors introduced in the read
355 my $mid_str = "mid=".$mid;
356 my $desc_str = $self->desc;
357 $desc_str =~ s/((position|strand)=\S+)( mid=\S+)?/$1 $mid_str/g;
358 $self->desc( $desc_str );
367 Function : Get or set the sequencing errors and update the read. By default, no
368 errors are made. This method must be called after the mid() method.
369 Usage : my $errors = $read->errors();
370 Arguments: Reference to a hash of the position and nature of sequencing errors.
371 The positions are 1-based relative to the error-free MID-containing
372 read (not relative to the reference sequence). For example:
373 $errors->{34}->{'%'} = 'T' ; # substitution of residue 34 by a T
374 $errors->{23}->{'+'} = 'GG' ; # insertion of GG after residue 23
375 $errors->{45}->{'-'} = undef; # deletion of residue 45
376 Substitutions and deletions are for a single residue, but additions
377 can be additions of several residues.
378 An alternative way to specify errors is by using array references
379 instead of scalar for the hash values. This allows one to specify
380 redundant mutations. For example, the case presented above would
381 result in the same read sequence as the example below:
382 $errors->{34}->{'%'} = ['C', 'T'] ; # substitution by a C and then a T
383 $errors->{23}->{'+'} = ['G', 'G'] ; # insertion of G and then a G
384 $errors->{45}->{'-'} = [undef, undef]; # deletion of residue, and again
385 Returns : Reference to a hash of the position and nature of sequencing errors.
390 my ($self, $errors) = @_;
391 if (defined $errors) {
392 # Verify that we have a hashref
393 if ( (not defined ref $errors) || (not ref $errors eq 'HASH') ) {
394 $self->throw("Error specification has to be a hashref. Got: $errors\n");
396 # Verify that we have a reference sequence
397 if (not defined $self->reference) {
398 $self->throw("Cannot add errors because the reference sequence was not set\n");
400 # Convert scalar error specs to arrayref specs
401 $errors = $self->_scalar_to_arrayref($errors);
402 # Check validity of error specifications
403 $errors = $self->_validate_error_specs($errors);
404 # Set the error specifications
405 $self->{errors
} = $errors;
406 # Need to recalculate the read from the reference if previously mutated
407 if ($self->{_mutated
}) {
409 $self->_create_qual if scalar @
{$self->qual_levels};
410 $self->_create_desc if $self->track;
412 # Now mutate the read, quality score and description
413 $self->_update_seq_errors;
414 $self->_update_qual_errors if scalar @
{$self->qual_levels};
415 $self->_update_desc_errors if $self->track;
418 return $self->{errors
};
422 sub _scalar_to_arrayref
{
423 # Replace the scalar values in the error specs by more versatile arrayrefs
424 my ($self, $errors) = @_;
425 while ( my ($pos, $ops) = each %$errors ) {
426 while ( my ($op, $res) = each %$ops ) {
427 if (ref $res eq '') {
428 my $arr = [ split //, ($res || '') ];
429 $arr = [undef] if scalar @
$arr == 0;
430 $$errors{$pos}{$op} = $arr;
438 sub _validate_error_specs
{
439 # Clean error specifications and warn of any issues encountered
440 my ($self, $errors) = @_;
441 my %valid_ops = ('%' => undef, '-' => undef, '+' => undef); # valid operations
443 # Calculate read length
444 my $read_length = $self->length;
445 while ( my ($pos, $ops) = each %$errors ) {
447 # Position cannot be no longer than the read length
448 if ( (defined $read_length) && ($pos > $read_length) ) {
449 $self->warn("Position $pos is beyond end of read ($read_length ".
450 "residues). Skipping errors specified at this position.\n");
451 delete $errors->{$pos};
454 # Position has to be 0+ for addition, 1+ for substitution and deletion
455 if ( $pos < 1 && (exists $ops->{'%'} || exists $ops->{'-'}) ) {
456 $self->warn("Positions of substitutions and deletions have to be ".
457 "strictly positive but got $pos. Skipping substitution or deletion".
458 " at this position\n");
462 if ( $pos < 0 && exists $ops->{'+'}) {
463 $self->warn("Positions of additions have to be zero or more. ".
464 "Skipping addition at position $pos.\n");
468 # Valid operations are '%', '+' and '-'
469 while ( my ($op, $res) = each %$ops ) {
470 if (not exists $valid_ops{$op}) {
471 $self->warn("Skipping unknown error operation '$op' at position".
475 # Substitutions: have to have at least one residue to substitute
476 if ( ($op eq '%') && (scalar @
$res < 1) ) {
477 $self->warn("At least one residue must be provided for substitutions,".
478 "but got ".scalar(@
$res)." at position $pos.\n");
480 # Additions: have to have at least one residue to add
481 if ( ($op eq '+') && (scalar @
$res < 1) ) {
482 $self->warn("At least one residue must be provided for additions,".
483 "but got ".scalar(@
$res)." at position $pos.\n");
486 if ( ($op eq '-') && (scalar @
$res < 1) ) {
487 $self->warn("At least one 'undef' must be provided for deletions,".
488 "but got ".scalar(@
$res)." at position $pos.\n");
493 delete $errors->{$pos} unless scalar keys %$ops;
500 sub _update_seq_errors
{
502 my $seq_str = $self->seq;
503 my $errors = $self->errors;
504 if (scalar keys %$errors > 0) {
506 for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
507 # Process sequencing errors at that position
508 for my $type ( '%', '-', '+' ) {
509 next if not exists $$errors{$pos}{$type};
510 my $arr = $$errors{$pos}{$type};
512 # Substitution at residue position. If there are multiple
513 # substitutions to do, directly skip to the last one.
514 substr $seq_str, $pos - 1 + $off, 1, $$arr[-1];
515 } elsif ($type eq '-') {
516 # Deletion at residue position
517 substr $seq_str, $pos - 1 + $off, 1, '';
519 } elsif ($type eq '+') {
520 # Insertion after residue position
521 substr $seq_str, $pos + $off, 0, join('', @
$arr);
522 $off += scalar @
$arr;
526 $self->{_mutated
} = 1;
528 $self->{_mutated
} = 0;
530 $self->seq($seq_str);
535 sub _update_qual_errors
{
537 my $qual = $self->qual;
538 my $errors = $self->errors;
539 my $bad_qual = $self->qual_levels->[1];
540 if (scalar keys %$errors > 0) {
542 for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
543 # Process sequencing errors at that position
544 for my $type ( '%', '-', '+' ) {
545 next if not exists $$errors{$pos}{$type};
546 my $arr = $$errors{$pos}{$type};
548 # Substitution at residue position
549 splice @
$qual, $pos - 1 + $off, 1, $bad_qual;
550 } elsif ($type eq '-') {
551 # Deletion at residue position
552 splice @
$qual, $pos - 1 + $off, 1;
554 } elsif ($type eq '+') {
555 # Insertion after residue position
556 splice @
$qual, $pos + $off, 0, ($bad_qual) x
scalar(@
$arr);
557 $off += scalar @
$arr;
567 sub _update_desc_errors
{
568 # Add or update error specifications in the read description
570 my $errors = $self->errors;
571 if (defined $errors and scalar keys %$errors > 0) {
572 # Sequencing errors introduced in the read
573 my $err_str = 'errors=';
574 for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
575 # Process sequencing errors at that position
576 for my $type ( '%', '-', '+' ) {
577 next if not exists $$errors{$pos}{$type};
578 for my $val ( @
{$$errors{$pos}{$type}} ) {
579 $val = '' if not defined $val;
580 $err_str .= $pos . $type . $val . ',';
585 my $desc_str = $self->desc;
586 $desc_str =~ s/((position|strand)=\S+( mid=\S+)?)( errors=\S+)?/$1 $err_str/g;
587 $self->desc( $desc_str );
596 Function : Get or set the tracking status in the read description. By default,
597 tracking is on. This method can be called at any time.
598 Usage : my $track = $read->track();
599 Arguments: 1 for tracking, 0 otherwise
600 Returns : 1 for tracking, 0 otherwise
605 my ($self, $track) = @_;
606 if (defined $track) {
607 if (defined $self->reference) {
610 $self->_update_desc_mid($self->mid);
611 $self->_update_desc_errors;
616 $self->{track
} = $track;
618 return $self->{track
};
625 Function : When tracking is on, define which 1-based coordinate system to use
626 in the read description:
627 * 'bioperl' uses the start, end and strand keywords (default),
628 similarly to the GFF3 format. Example:
629 start=1 end=10 strand=+1
630 start=1 end=10 strand=-1
631 * 'genbank' does only provide the position keyword. Example:
633 position=complement(1..10)
634 Usage : my $coord_style = $read->track();
635 Arguments: 'bioperl' or 'genbank'
636 Returns : 'bioperl' or 'genbank'
641 my ($self, $coord_style) = @_;
642 my %styles = ( 'bioperl' => undef, 'genbank' => undef );
643 if (defined $coord_style) {
644 if (not exists $styles{$coord_style}) {
645 die "Error: Invalid coordinate style '$coord_style'\n";
647 $self->{coord_style
} = $coord_style;
649 return $self->{coord_style
};