2 # Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved.
3 # This module is free software; you can redistribute it and/or
4 # modify it under the same terms as Perl itself.
6 # Copyright Chad Matsalla
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
14 Bio::SeqIO::scf - .scf file input/output stream
18 Do not use this module directly. Use it via the Bio::SeqIO class, see
19 L<Bio::SeqIO> for more information.
23 This object can transform .scf files to and from Bio::Seq::SequenceTrace
24 objects. Mechanisms are present to retrieve trace data from scf
31 User feedback is an integral part of the evolution of this and other
32 Bioperl modules. Send your comments and suggestions preferably to one
33 of the Bioperl mailing lists. Your participation is much appreciated.
35 bioperl-l@bioperl.org - General discussion
36 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40 Please direct usage questions or support issues to the mailing list:
42 I<bioperl-l@bioperl.org>
44 rather than to the module maintainer directly. Many experienced and
45 reponsive experts will be able look at the problem and quickly
46 address it. Please include a thorough description of the problem
47 with code and data examples if at all possible.
51 Report bugs to the Bioperl bug tracking system to help us keep track
52 the bugs and their resolution. Bug reports can be submitted via
55 https://github.com/bioperl/bioperl-live/issues
57 =head1 AUTHOR Chad Matsalla
60 bioinformatics@dieselwurks.com
64 Jason Stajich, jason@bioperl.org
65 Tony Cox, avc@sanger.ac.uk
66 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
67 Nancy Hansen, nhansen at mail.nih.gov
71 The rest of the documentation details each of the object
72 methods. Internal methods are usually preceded with a _
76 # Let the code begin...
78 package Bio
::SeqIO
::scf
;
79 use vars
qw($DEFAULT_QUALITY);
81 use Bio::Seq::SeqFactory;
82 use Bio::Seq::SequenceTrace;
83 use Bio::Annotation::Comment;
86 my $dumper = Dumpvalue->new();
87 $dumper->veryCompact(1);
93 use base qw(Bio::SeqIO);
97 $self->SUPER::_initialize
(@args);
98 if( ! defined $self->sequence_factory ) {
99 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
100 (-verbose
=> $self->verbose(),
101 -type
=> 'Bio::Seq::Quality'));
103 binmode $self->_fh; # for the Win32/Mac crowds
109 Usage : $scf = $stream->next_seq()
110 Function: returns the next scf sequence in the stream
111 Returns : a Bio::Seq::SequenceTrace object
113 Notes : Fills the interface specification for SeqIO.
114 The SCF specification does not provide for having more then
115 one sequence in a given scf. So once the filehandle has been open
116 and passed to SeqIO do not expect to run this function more then
117 once on a given scf unless you embraced and extended the SCF
118 standard. SCF comments are accessible through the Bio::SeqI
119 interface method annotation().
126 my ($seq, $seqc, $fh, $buffer, $offset, $length, $read_bytes, @read,
128 # set up a filehandle to read in the scf
129 return if $self->{_readfile
};
131 unless ($fh) { # simulate the <> function
132 if ( !fileno(ARGV
) or eof(ARGV
) ) {
133 return unless my $ARGV = shift;
135 $self->throw("Could not open $ARGV for SCF stream reading $!");
139 return unless read $fh, $buffer, 128; # no exception; probably end of file
140 # now, the master data structure will be the creator
142 # he first thing to do is parse the header. This is common
143 # among all versions of scf.
144 # the rest of the the information is different between the
145 # the different versions of scf.
147 $creator->{header
} = $self->_get_header($buffer);
148 if ($creator->{header
}->{'version'} lt "3.00") {
149 $self->debug("scf.pm is working with a version 2 scf.\n");
150 # first gather the trace information
151 $length = $creator->{header
}->{'samples'} *
152 $creator->{header
}->{sample_size
}*4;
153 $buffer = $self->read_from_buffer($fh, $buffer, $length,
154 $creator->{header
}->{samples_offset
});
155 # @read = unpack "n$length",$buffer;
156 # these traces need to be split
157 # returns a reference to a hash
158 $creator->{traces
} = $self->_parse_v2_traces(
159 $buffer,$creator->{header
}->{sample_size
});
160 # now go and get the base information
161 $offset = $creator->{header
}->{bases_offset
};
162 $length = ($creator->{header
}->{bases
} * 12);
164 $buffer = $self->read_from_buffer($fh,$buffer,$length,$creator->{header
}->{bases_offset
});
165 # now distill the information into its fractions.
166 # the old way : $self->_set_v2_bases($buffer);
167 # ref to an array, ref to a hash, string
168 ($creator->{peak_indices
},
169 $creator->{qualities
},
170 $creator->{sequence
},
171 $creator->{accuracies
}) = $self->_parse_v2_bases($buffer);
174 $self->debug("scf.pm is working with a version 3+ scf.\n");
175 my $transformed_read;
176 my $current_read_position = $creator->{header
}->{sample_offset
};
177 $length = $creator->{header
}->{'samples'}*
178 $creator->{header
}->{sample_size
};
179 # $dumper->dumpValue($creator->{header});
180 foreach (qw(a c g t)) {
181 $buffer = $self->read_from_buffer($fh,$buffer,$length,$current_read_position);
183 if ($creator->{header
}->{sample_size
} == 1) {
186 @read = unpack "${byte}${length}",$buffer;
187 # this little spurt of nonsense is because
188 # the trace values are given in the binary
189 # file as unsigned shorts but they really
190 # are signed deltas. 30000 is an arbitrary number
191 # (will there be any traces with a given
192 # point greater then 30000? I hope not.
193 # once the read is read, it must be changed
200 $transformed_read = $self->_delta(\
@read,"backward");
201 # For 8-bit data we need to emulate a signed/unsigned
202 # cast that is implicit in the C implementations.....
203 if ($creator->{header
}->{sample_size
} == 1) {
204 foreach (@
{$transformed_read}) {
205 $_ += 256 if ($_ < 0);
208 $current_read_position += $length;
209 $creator->{'traces'}->{$_} = join(' ',@
{$transformed_read});
212 # now go and get the peak index information
213 $offset = $creator->{header
}->{bases_offset
};
214 $length = ($creator->{header
}->{bases
} * 4);
215 $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
216 $creator->{peak_indices
} = $self->_get_v3_peak_indices($buffer);
218 # now go and get the accuracy information
219 $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
220 $creator->{accuracies
} = $self->_get_v3_base_accuracies($buffer);
221 # OK, now go and get the base information.
223 $length = $creator->{header
}->{bases
};
224 $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
225 $creator->{'sequence'} = unpack("a$length",$buffer);
226 # now, finally, extract the calls from the accuracy information.
227 $creator->{qualities
} = $self->_get_v3_quality(
228 $creator->{'sequence'},$creator->{accuracies
});
230 # now go and get the comment information
231 $offset = $creator->{header
}->{comments_offset
};
233 $length = $creator->{header
}->{comment_size
};
234 $buffer = $self->read_from_buffer($fh,$buffer,$length);
235 $creator->{comments
} = $self->_get_comments($buffer);
236 my @name_comments = grep {$_->tagname() eq 'NAME'}
237 $creator->{comments
}->get_Annotations('comment');
240 $name_comment = $name_comments[0]->as_text();
241 $name_comment =~ s/^Comment:\s+//;
244 my $swq = Bio
::Seq
::Quality
->new(
245 -seq
=> $creator->{'sequence'},
246 -qual
=> $creator->{'qualities'},
249 my $returner = Bio
::Seq
::SequenceTrace
->new(
251 -trace_a
=> $creator->{'traces'}->{'a'},
252 -trace_t
=> $creator->{'traces'}->{'t'},
253 -trace_g
=> $creator->{'traces'}->{'g'},
254 -trace_c
=> $creator->{'traces'}->{'c'},
255 -accuracy_a
=> $creator->{'accuracies'}->{'a'},
256 -accuracy_t
=> $creator->{'accuracies'}->{'t'},
257 -accuracy_g
=> $creator->{'accuracies'}->{'g'},
258 -accuracy_c
=> $creator->{'accuracies'}->{'c'},
259 -peak_indices
=> $creator->{'peak_indices'}
262 $returner->annotation($creator->{'comments'}); # add SCF comments
263 $self->{'_readfile'} = 1;
268 =head2 _get_v3_quality()
270 Title : _get_v3_quality()
271 Usage : $self->_get_v3_quality()
272 Function: Set the base qualities from version3 scf
273 Returns : Nothing. Alters $self.
280 sub _get_v3_quality
{
281 my ($self,$sequence,$accuracies) = @_;
282 my @bases = split//,$sequence;
283 my (@qualities,$currbase,$currqual,$counter);
284 for ($counter=0; $counter <= $#bases ; $counter++) {
285 $currbase = lc($bases[$counter]);
286 if ($currbase eq "a") { $currqual = $accuracies->{'a'}->[$counter]; }
287 elsif ($currbase eq "c") { $currqual = $accuracies->{'c'}->[$counter]; }
288 elsif ($currbase eq "g") { $currqual = $accuracies->{'g'}->[$counter]; }
289 elsif ($currbase eq "t") { $currqual = $accuracies->{'t'}->[$counter]; }
290 else { $currqual = "unknown"; }
291 push @qualities,$currqual;
296 =head2 _get_v3_peak_indices($buffer)
298 Title : _get_v3_peak_indices($buffer)
299 Usage : $self->_get_v3_peak_indices($buffer);
300 Function: Unpacks the base accuracies for version3 scf
301 Returns : Nothing. Alters $self
302 Args : A scalar containing binary data.
307 sub _get_v3_peak_indices
{
308 my ($self,$buffer) = @_;
309 my $length = length($buffer);
310 my @read = unpack "N$length",$buffer;
311 return join(' ',@read);
314 =head2 _get_v3_base_accuracies($buffer)
316 Title : _get_v3_base_accuracies($buffer)
317 Usage : $self->_get_v3_base_accuracies($buffer)
318 Function: Set the base accuracies for version 3 scf's
319 Returns : Nothing. Alters $self.
320 Args : A scalar containing binary data.
326 sub _get_v3_base_accuracies
{
327 my ($self,$buffer) = @_;
328 my $length = length($buffer);
329 my $qlength = $length/4;
331 my (@qualities,@sorter,$counter,$round,$last_base,$accuracies,$currbase);
332 foreach $currbase (qw(a c g t)) {
334 $last_base = $offset + $qlength;
335 for (;$offset < $last_base; $offset += $qlength) {
336 # a bioperler (perhaps me?) changed the unpack string to include 'n' rather than 'C'
337 # on 040322 I think that 'C' is correct. please email chad if you would like to accuse me of being incorrect
338 @read = unpack "C$qlength", substr($buffer,$offset,$qlength);
339 $accuracies->{$currbase} = \
@read;
346 =head2 _get_comments($buffer)
348 Title : _get_comments($buffer)
349 Usage : $self->_get_comments($buffer);
350 Function: Gather the comments section from the scf and parse it into its
352 Returns : a Bio::Annotation::Collection object
353 Args : The buffer. It is expected that the buffer contains a binary
354 string for the comments section of an scf file according to
355 the scf file specifications.
361 my ($self,$buffer) = @_;
362 my $comments = Bio
::Annotation
::Collection
->new();
363 my $size = length($buffer);
364 my $comments_retrieved = unpack "a$size",$buffer;
365 $comments_retrieved =~ s/\0//;
366 my @comments_split = split/\n/,$comments_retrieved;
367 if (@comments_split) {
368 foreach (@comments_split) {
371 my ($tagname, $text) = ($1, $2);
372 my $comment_obj = Bio
::Annotation
::Comment
->new(
374 -tagname
=> $tagname);
376 $comments->add_Annotation('comment', $comment_obj);
380 $self->{'comments'} = $comments;
386 Title : _get_header($buffer)
387 Usage : $self->_get_header($buffer);
388 Function: Gather the header section from the scf and parse it into its
390 Returns : Reference to a hash containing the header components.
391 Args : The buffer. It is expected that the buffer contains a binary
392 string for the header section of an scf file according to the
393 scf file specifications.
399 my ($self,$buffer) = @_;
402 $header->{'samples'},
403 $header->{'sample_offset'},
405 $header->{'bases_left_clip'},
406 $header->{'bases_right_clip'},
407 $header->{'bases_offset'},
408 $header->{'comment_size'},
409 $header->{'comments_offset'},
410 $header->{'version'},
411 $header->{'sample_size'},
412 $header->{'code_set'},
413 @
{$header->{'header_spare'}} ) = unpack "a4 NNNNNNNN a4 NN N20", $buffer;
415 $self->{'header'} = $header;
419 =head2 _parse_v2_bases($buffer)
421 Title : _parse_v2_bases($buffer)
422 Usage : $self->_parse_v2_bases($buffer);
423 Function: Gather the bases section from the scf and parse it into its
426 Args : The buffer. It is expected that the buffer contains a binary
427 string for the bases section of an scf file according to the
428 scf file specifications.
433 sub _parse_v2_bases
{
434 my ($self,$buffer) = @_;
435 my $length = length($buffer);
436 my ($offset2,$currbuff,$currbase,$currqual,$sequence,@qualities,@indices);
437 my (@read,$harvester,$accuracies);
438 for ($offset2=0;$offset2<$length;$offset2+=12) {
439 @read = unpack "N C C C C a C3", substr($buffer,$offset2,$length);
440 push @indices,$read[0];
441 $currbase = lc($read[5]);
442 if ($currbase eq "a") { $currqual = $read[1]; }
443 elsif ($currbase eq "c") { $currqual = $read[2]; }
444 elsif ($currbase eq "g") { $currqual = $read[3]; }
445 elsif ($currbase eq "t") { $currqual = $read[4]; }
446 else { $currqual = "UNKNOWN"; }
447 push @
{$accuracies->{"a"}},$read[1];
448 push @
{$accuracies->{"c"}},$read[2];
449 push @
{$accuracies->{"g"}},$read[3];
450 push @
{$accuracies->{"t"}},$read[4];
452 $sequence .= $currbase;
453 push @qualities,$currqual;
455 return (\
@indices,\
@qualities,$sequence,$accuracies)
458 =head2 _parse_v2_traces(\@traces_array)
460 Title : _pares_v2_traces(\@traces_array)
461 Usage : $self->_parse_v2_traces(\@traces_array);
462 Function: Parses an scf Version2 trace array into its base components.
463 Returns : Nothing. Modifies $self.
464 Args : A reference to an array of the unpacked traces section of an
469 sub _parse_v2_traces
{
470 my ($self,$buffer,$sample_size) = @_;
472 if ($sample_size == 1) { $byte = "c"; }
473 else { $byte = "n"; }
474 my $length = CORE
::length($buffer);
475 my @read = unpack "${byte}${length}",$buffer;
476 # this will be an array to the reference holding the array
479 for (my $offset2 = 0; $offset2< scalar(@read); $offset2+=4) {
480 push @
{$traces->{'a'}},$read[$offset2];
481 push @
{$traces->{'c'}},$read[$offset2+1];
482 push @
{$traces->{'g'}},$read[$offset2+3];
483 push @
{$traces->{'t'}},$read[$offset2+2];
489 sub get_trace_deprecated_use_the_sequencetrace_object_instead
{
490 # my ($self,$base_channel,$traces) = @_;
491 # $base_channel =~ tr/a-z/A-Z/;
492 # if ($base_channel !~ /A|T|G|C/) {
493 # $self->throw("You tried to ask for a base channel that wasn't A,T,G, or C. Ask for one of those next time.");
494 ##} elsif ($base_channel) {
495 # my @temp = split(' ',$self->{'traces'}->{$base_channel});
500 sub _deprecated_get_peak_indices_deprecated_use_the_sequencetrace_object_instead
{
502 my @temp = split(' ',$self->{'parsed'}->{'peak_indices'});
510 Usage : %header = %{$obj->get_header()};
511 Function: Return the header for this scf.
512 Returns : A reference to a hash containing the header for this scf.
520 return $self->{'header'};
523 =head2 get_comments()
525 Title : get_comments()
526 Usage : %comments = %{$obj->get_comments()};
527 Function: Return the comments for this scf.
528 Returns : A Bio::Annotation::Collection object
536 return $self->{'comments'};
539 sub _dump_traces_outgoing_deprecated_use_the_sequencetrace_object
{
540 my ($self,$transformed) = @_;
541 my (@sA,@sT,@sG,@sC);
543 @sA = @
{$self->{'text'}->{'t_samples_a'}};
544 @sC = @
{$self->{'text'}->{'t_samples_c'}};
545 @sG = @
{$self->{'text'}->{'t_samples_g'}};
546 @sT = @
{$self->{'text'}->{'t_samples_t'}};
549 @sA = @
{$self->{'text'}->{'samples_a'}};
550 @sC = @
{$self->{'text'}->{'samples_c'}};
551 @sG = @
{$self->{'text'}->{'samples_g'}};
552 @sT = @
{$self->{'text'}->{'samples_t'}};
554 print ("Count\ta\tc\tg\tt\n");
555 for (my $curr=0; $curr < scalar(@sG); $curr++) {
556 print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n");
561 sub _dump_traces_incoming_deprecated_use_the_sequencetrace_object
{
563 # my (@sA,@sT,@sG,@sC);
564 # @sA = @{$self->{'traces'}->{'A'}};
565 # @sC = @{$self->{'traces'}->{'C'}};
566 # @sG = @{$self->{'traces'}->{'G'}};
567 # @sT = @{$self->{'traces'}->{'T'}};
568 # @sA = @{$self->get_trace('A')};
569 # @sC = @{$self->get_trace('C')};
570 # @sG = @{$self->get_trace('G')};
571 # @sT = @{$self->get_trace('t')};
572 # print ("Count\ta\tc\tg\tt\n");
573 # for (my $curr=0; $curr < scalar(@sG); $curr++) {
574 # print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n");
581 Title : write_seq(-target => $swq, <comments>)
582 Usage : $obj->write_seq(
585 -CONV => "Bioperl-Chads Mighty SCF writer.");
586 Function: Write out an scf.
588 Args : Requires: a reference to a Bio::Seq::Quality object to form the
590 if -version is provided, it should be "2" or "3". A SCF of that
591 version will be written.
592 Any other arguments are assumed to be comments and are put into
593 the comments section of the scf. Read the specifications for scf
594 to decide what might be good to put in here.
597 For best results, use a SequenceTrace object.
598 The things that you need to write an scf:
603 - You _can_ write an scf with just a and b by passing in a
604 Bio::Seq::Quality object- false traces will be synthesized
610 my ($self,%args) = @_;
613 my ($swq) = $self->_rearrange([qw(TARGET)], %args);
615 if (ref($swq) =~ /Bio::Seq::SequenceTrace|Bio::Seq::Quality/) {
616 if (ref($swq) eq "Bio::Seq::Quality") {
617 # this means that the object *has no trace data*
618 # we might as well synthesize some now, ok?
619 $swq = Bio
::Seq
::SequenceTrace
->new(
625 $self->throw("You must pass a Bio::Seq::Quality or a Bio::Seq::SequenceTrace object to write_seq as a parameter named \"target\"");
627 # all of the rest of the arguments are comments for the scf
628 foreach $arg (sort keys %args) {
629 next if ($arg =~ /target/i);
630 ($label = $arg) =~ s/^\-//;
631 $writer_fodder->{comments
}->{$label} = $args{$arg};
633 if (!$comments{'NAME'}) { $comments{'NAME'} = $swq->id(); }
635 $writer_fodder->{comments
}->{'CONV'} = "Bioperl-Chads Mighty SCF writer." unless defined $comments{'CONV'};
636 # now deal with the version of scf they want to write
637 if ($writer_fodder->{comments
}->{version
}) {
638 if ($writer_fodder->{comments
}->{version
} != 2 && $writer_fodder->{comments
}->{version
} != 3) {
639 $self->warn("This module can only write version 2.0 or 3.0 scf's. Writing a version 2.0 scf by default.");
640 $writer_fodder->{header
}->{version
} = "2.00";
642 elsif ($writer_fodder->{comments
}->{'version'} > 2) {
643 $writer_fodder->{header
}->{'version'} = "3.00";
646 $writer_fodder->{header
}->{version
} = "2";
650 $writer_fodder->{header
}->{'version'} = "3.00";
652 # set a few things in the header
653 $writer_fodder->{'header'}->{'magic'} = ".scf";
654 $writer_fodder->{'header'}->{'sample_size'} = "2";
655 $writer_fodder->{'header'}->{'bases'} = length($swq->seq());
656 $writer_fodder->{'header'}->{'bases_left_clip'} = "0";
657 $writer_fodder->{'header'}->{'bases_right_clip'} = "0";
658 $writer_fodder->{'header'}->{'sample_size'} = "2";
659 $writer_fodder->{'header'}->{'code_set'} = "9";
660 @
{$writer_fodder->{'header'}->{'spare'}} = qw(0 0 0 0 0 0 0 0 0 0
661 0 0 0 0 0 0 0 0 0 0);
662 $writer_fodder->{'header'}->{'samples_offset'} = "128";
663 $writer_fodder->{'header'}->{'samples'} = $swq->trace_length();
664 # create the binary for the comments and file it in writer_fodder
665 $writer_fodder->{comments
} = $self->_get_binary_comments(
666 $writer_fodder->{comments
});
667 # create the binary and the strings for the traces, bases,
668 # offsets (if necessary), and accuracies (if necessary)
669 $writer_fodder->{traces
} = $self->_get_binary_traces(
670 $writer_fodder->{'header'}->{'version'},
671 $swq,$writer_fodder->{'header'}->{'sample_size'});
672 my ($b_base_offsets,$b_base_accuracies,$samples_size,$bases_size);
676 if ($writer_fodder->{'header'}->{'version'} == 2) {
677 $writer_fodder->{bases
} = $self->_get_binary_bases(
680 $writer_fodder->{'header'}->{'sample_size'});
681 $samples_size = CORE
::length($writer_fodder->{traces
}->{'binary'});
682 $bases_size = CORE
::length($writer_fodder->{bases
}->{binary
});
683 $writer_fodder->{'header'}->{'bases_offset'} = 128 + $samples_size;
684 $writer_fodder->{'header'}->{'comments_offset'} = 128 +
685 $samples_size + $bases_size;
686 $writer_fodder->{'header'}->{'comments_size'} =
687 length($writer_fodder->{'comments'}->{binary
});
688 $writer_fodder->{'header'}->{'private_size'} = "0";
689 $writer_fodder->{'header'}->{'private_offset'} = 128 +
690 $samples_size + $bases_size +
691 $writer_fodder->{'header'}->{'comments_size'};
692 $writer_fodder->{'header'}->{'binary'} =
693 $self->_get_binary_header($writer_fodder->{header
});
694 $dumper->dumpValue($writer_fodder) if $self->verbose > 0;
695 $self->_print ($writer_fodder->{'header'}->{'binary'})
696 or print("Could not write binary header...\n");
697 $self->_print ($writer_fodder->{'traces'}->{'binary'})
698 or print("Could not write binary traces...\n");
699 $self->_print ($writer_fodder->{'bases'}->{'binary'})
700 or print("Could not write binary base structures...\n");
701 $self->_print ($writer_fodder->{'comments'}->{'binary'})
702 or print("Could not write binary comments...\n");
705 ($writer_fodder->{peak_indices
},
706 $writer_fodder->{accuracies
},
707 $writer_fodder->{bases
},
708 $writer_fodder->{reserved
} ) =
709 $self->_get_binary_bases(
712 $writer_fodder->{'header'}->{'sample_size'}
714 $writer_fodder->{'header'}->{'bases_offset'} = 128 +
715 length($writer_fodder->{'traces'}->{'binary'});
716 $writer_fodder->{'header'}->{'comments_size'} =
717 length($writer_fodder->{'comments'}->{'binary'});
719 # bases_offset + base_offsets + accuracies + called_bases +
721 $writer_fodder->{'header'}->{'private_size'} = "0";
723 $writer_fodder->{'header'}->{'comments_offset'} =
724 128+length($writer_fodder->{'traces'}->{'binary'})+
725 length($writer_fodder->{'peak_indices'}->{'binary'})+
726 length($writer_fodder->{'accuracies'}->{'binary'})+
727 length($writer_fodder->{'bases'}->{'binary'})+
728 length($writer_fodder->{'reserved'}->{'binary'});
729 $writer_fodder->{'header'}->{'private_offset'} =
730 $writer_fodder->{'header'}->{'comments_offset'} +
731 $writer_fodder->{'header'}->{'comments_size'};
732 $writer_fodder->{'header'}->{'spare'}->[1] =
733 $writer_fodder->{'header'}->{'comments_offset'} +
734 length($writer_fodder->{'comments'}->{'binary'});
735 $writer_fodder->{header
}->{binary
} =
736 $self->_get_binary_header($writer_fodder->{header
});
737 $self->_print ($writer_fodder->{'header'}->{'binary'})
738 or print("Couldn't write header\n");
739 $self->_print ($writer_fodder->{'traces'}->{'binary'})
740 or print("Couldn't write samples\n");
741 $self->_print ($writer_fodder->{'peak_indices'}->{'binary'})
742 or print("Couldn't write peak offsets\n");
743 $self->_print ($writer_fodder->{'accuracies'}->{'binary'})
744 or print("Couldn't write accuracies\n");
745 $self->_print ($writer_fodder->{'bases'}->{'binary'})
746 or print("Couldn't write called_bases\n");
747 $self->_print ($writer_fodder->{'reserved'}->{'binary'})
748 or print("Couldn't write reserved\n");
749 $self->_print ($writer_fodder->{'comments'}->{'binary'})
750 or print ("Couldn't write comments\n");
753 # kinda unnecessary, given the close() below, but maybe that'll go
755 $self->flush if $self->_flush_on_write && defined $self->_fh;
765 =head2 _get_binary_header()
767 Title : _get_binary_header();
768 Usage : $self->_get_binary_header();
769 Function: Provide the binary string that will be used as the header for
771 Returns : A binary string.
772 Args : None. Uses the entries in the $self->{'header'} hash. These
773 are set on construction of the object (hopefully correctly!).
778 sub _get_binary_header
{
779 my ($self,$header) = @_;
780 my $binary = pack "a4 NNNNNNNN a4 NN N20",
783 $header->{'samples'},
784 $header->{'samples_offset'},
786 $header->{'bases_left_clip'},
787 $header->{'bases_right_clip'},
788 $header->{'bases_offset'},
789 $header->{'comments_size'},
790 $header->{'comments_offset'},
791 $header->{'version'},
792 $header->{'sample_size'},
793 $header->{'code_set'},
794 @
{$header->{'spare'}}
799 =head2 _get_binary_traces($version,$ref)
801 Title : _set_binary_tracesbases($version,$ref)
802 Usage : $self->_set_binary_tracesbases($version,$ref);
803 Function: Constructs the trace and base strings for all scfs
804 Returns : Nothing. Alters self.
805 Args : $version - "2" or "3"
806 $sequence - a scalar containing arbitrary sequence data
807 $ref - a reference to either a SequenceTraces or a
808 SequenceWithQuality object.
809 Notes : This is a really complicated thing.
813 sub _get_binary_traces
{
814 my ($self,$version,$ref,$sample_size) = @_;
815 # ref _should_ be a Bio::Seq::SequenceTrace, but might be a
818 my $sequence = $ref->seq();
819 my $sequence_length = length($sequence);
820 # first of all, do we need to synthesize the trace?
821 # if so, call synthesize_base
822 my ($traceobj,@traces,$current);
823 if ( ref($ref) eq "Bio::Seq::Quality" ) {
824 $traceobj = Bio
::Seq
::Quality
->new(
827 $traceobj->_synthesize_traces();
831 if ($version eq "2") {
832 my $trace_length = $traceobj->trace_length();
833 for ($current = 1; $current <= $trace_length; $current++) {
834 foreach (qw(a c g t)) {
835 push @traces,$traceobj->trace_value_at($_,$current);
839 elsif ($version == 3) {
840 foreach my $current_trace (qw(a c g t)) {
841 my @trace = @
{$traceobj->trace($current_trace)};
847 my $transformed = $self->_delta(\
@trace,"forward");
848 if($sample_size == 1){
849 foreach (@
{$transformed}) {
850 $_ += 256 if ($_ < 0);
853 push @traces,@
{$transformed};
857 $returner->{version
} = $version;
858 $returner->{string
} = \
@traces;
859 my $length_of_traces = scalar(@traces);
861 if ($sample_size == 1) { $byte = "c"; } else { $byte = "n"; }
862 # an unsigned integer should be I, but this is too long
864 $returner->{binary
} = pack "n${length_of_traces}",@traces;
865 $returner->{length} = CORE
::length($returner->{binary
});
870 sub _get_binary_bases
{
871 my ($self,$version,$trace,$sample_size) = @_;
873 if ($sample_size == 1) { $byte = "c"; } else { $byte = "n"; }
874 my ($returner,@current_row,$current_base,$string,$binary);
875 my $length = $trace->length();
877 $returner->{'version'} = "2";
878 for (my $current_base =1; $current_base <= $length; $current_base++) {
880 push @current_row,$trace->peak_index_at($current_base);
881 push @current_row,$trace->accuracy_at("a",$current_base);
882 push @current_row,$trace->accuracy_at("c",$current_base);
883 push @current_row,$trace->accuracy_at("g",$current_base);
884 push @current_row,$trace->accuracy_at("t",$current_base);
885 push @current_row,$trace->baseat($current_base);
886 push @current_row,0,0,0;
887 push @
{$returner->{string
}},@current_row;
888 $returner->{binary
} .= pack "N C C C C a C3",@current_row;
893 $returner->{'version'} = "3.00";
894 $returner->{peak_indices
}->{string
} = $trace->peak_indices();
895 my $length = scalar(@
{$returner->{peak_indices
}->{string
}});
896 $returner->{peak_indices
}->{binary
} =
897 pack "N$length",@
{$returner->{peak_indices
}->{string
}};
898 $returner->{peak_indices
}->{length} =
899 CORE
::length($returner->{peak_indices
}->{binary
});
901 foreach my $base (qw(a c g t)) {
902 $returner->{accuracies
}->{$base} = $trace->accuracies($base);
903 push @accuracies,@
{$trace->accuracies($base)};
905 $returner->{sequence
} = $trace->seq();
906 $length = scalar(@accuracies);
907 # this really is "c" for samplesize == 2
908 $returner->{accuracies
}->{binary
} = pack "C${length}",@accuracies;
909 $returner->{accuracies
}->{length} =
910 CORE
::length($returner->{accuracies
}->{binary
});
911 $length = $trace->seq_obj()->length();
912 for (my $count=0; $count< $length; $count++) {
913 push @
{$returner->{reserved
}->{string
}},0,0,0;
916 $length = scalar(@
{$returner->{reserved
}->{string
}});
918 $returner->{'reserved'}->{'binary'} =
919 pack "c$length",@
{$returner->{reserved
}->{string
}};
920 $returner->{'reserved'}->{'length'} =
921 CORE
::length($returner->{'reserved'}->{'binary'});
922 # $returner->{'bases'}->{'string'} = $trace->seq();
923 my @bases = split('',$trace->seq());
924 $length = $trace->length();
925 $returner->{'bases'}->{'binary'} = $trace->seq();
926 # print("Returning this:\n");
927 # $dumper->dumpValue($returner);
928 return ($returner->{peak_indices
},
929 $returner->{accuracies
},
931 $returner->{reserved
});
936 =head2 _make_trace_string($version)
938 Title : _make_trace_string($version)
939 Usage : $self->_make_trace_string($version)
940 Function: Merges trace data for the four bases to produce an scf
941 trace string. _requires_ $version
942 Returns : Nothing. Alters $self.
943 Args : $version - a version number. "2" or "3"
948 sub _make_trace_string
{
949 my ($self,$version) = @_;
952 my @as = @
{$self->{'text'}->{'samples_a'}};
953 my @cs = @
{$self->{'text'}->{'samples_c'}};
954 my @gs = @
{$self->{'text'}->{'samples_g'}};
955 my @ts = @
{$self->{'text'}->{'samples_t'}};
957 for (my $curr=0; $curr < scalar(@as); $curr++) {
958 $as[$curr] = $DEFAULT_QUALITY unless defined $as[$curr];
959 $cs[$curr] = $DEFAULT_QUALITY unless defined $cs[$curr];
960 $gs[$curr] = $DEFAULT_QUALITY unless defined $gs[$curr];
961 $ts[$curr] = $DEFAULT_QUALITY unless defined $ts[$curr];
962 push @traces,($as[$curr],$cs[$curr],$gs[$curr],$ts[$curr]);
965 elsif ($version == 3) {
966 @traces = (@as,@cs,@gs,@ts);
969 $self->throw("No idea what version required to make traces here. You gave #$version# Bailing.");
971 my $length = scalar(@traces);
972 $self->{'text'}->{'samples_all'} = \
@traces;
976 =head2 _get_binary_comments(\@comments)
978 Title : _get_binary_comments(\@comments)
979 Usage : $self->_get_binary_comments(\@comments);
980 Function: Provide a binary string that will be the comments section of
981 the scf file. See the scf specifications for detailed
982 specifications for the comments section of an scf file. Hint:
983 CODE=something\nBODE=something\n\0
985 Args : A reference to an array containing comments.
990 sub _get_binary_comments
{
991 my ($self,$rcomments) = @_;
993 my $comments_string = '';
994 my %comments = %$rcomments;
995 foreach my $key (sort keys %comments) {
996 $comments{$key} ||= '';
997 $comments_string .= "$key=$comments{$key}\n";
999 $comments_string .= "\n\0";
1000 my $length = CORE
::length($comments_string);
1001 $returner->{length} = $length;
1002 $returner->{string
} = $comments_string;
1003 $returner->{binary
} = pack "A$length",$comments_string;
1007 #=head2 _fill_missing_data($swq)
1009 # Title : _fill_missing_data($swq)
1010 # Usage : $self->_fill_missing_data($swq);
1011 # Function: If the $swq with quality has no qualities, set all qualities
1013 # If the $swq has no sequence, set the sequence to N's.
1014 # Returns : Nothing. Modifies the Bio::Seq::Quality that was passed as an
1016 # Args : A reference to a Bio::Seq::Quality
1022 #sub _fill_missing_data {
1023 # my ($self,$swq) = @_;
1024 # my $qual_obj = $swq->qual_obj();
1025 # my $seq_obj = $swq->seq_obj();
1026 # if ($qual_obj->length() == 0 && $seq_obj->length() != 0) {
1027 # my $fake_qualities = ("$DEFAULT_QUALITY ")x$seq_obj->length();
1028 # $swq->qual($fake_qualities);
1030 # if ($seq_obj->length() == 0 && $qual_obj->length != 0) {
1031 # my $sequence = ("N")x$qual_obj->length();
1032 # $swq->seq($sequence);
1036 =head2 _delta(\@trace_data,$direction)
1038 Title : _delta(\@trace_data,$direction)
1039 Usage : $self->_delta(\@trace_data,$direction);
1041 Returns : A reference to an array containing modified trace values.
1042 Args : A reference to an array containing trace data and a string
1043 indicating the direction of conversion. ("forward" or
1045 Notes : This code is taken from the specification for SCF3.2.
1046 http://www.mrc-lmb.cam.ac.uk/pubseq/manual/formats_unix_4.html
1052 my ($self,$rsamples,$direction) = @_;
1053 my @samples = @
$rsamples;
1054 # /* If job == DELTA_IT:
1055 # * change a series of sample points to a series of delta delta values:
1056 # * ie change them in two steps:
1057 # * first: delta = current_value - previous_value
1058 # * then: delta_delta = delta - previous_delta
1063 # uint_2 p_delta, p_sample;
1065 my ($i,$num_samples,$p_delta,$p_sample,@samples_converted,$p_sample1,$p_sample2);
1066 my $SLOW_BUT_CLEAR = 0;
1067 $num_samples = scalar(@samples);
1068 # c-programmers are funny people with their single-letter variables
1070 if ( $direction eq "forward" ) {
1071 if($SLOW_BUT_CLEAR){
1073 for ($i=0; $i < $num_samples; $i++) {
1074 $p_sample = $samples[$i];
1075 $samples[$i] = $samples[$i] - $p_delta;
1076 $p_delta = $p_sample;
1079 for ($i=0; $i < $num_samples; $i++) {
1080 $p_sample = $samples[$i];
1081 $samples[$i] = $samples[$i] - $p_delta;
1082 $p_delta = $p_sample;
1085 for ($i = $num_samples-1; $i > 1; $i--){
1086 $samples[$i] = $samples[$i] - 2*$samples[$i-1] + $samples[$i-2];
1088 $samples[1] = $samples[1] - 2*$samples[0];
1091 elsif ($direction eq "backward") {
1092 if($SLOW_BUT_CLEAR){
1094 for ($i=0; $i < $num_samples; $i++) {
1095 $samples[$i] = $samples[$i] + $p_sample;
1096 $p_sample = $samples[$i];
1099 for ($i=0; $i < $num_samples; $i++) {
1100 $samples[$i] = $samples[$i] + $p_sample;
1101 $p_sample = $samples[$i];
1104 $p_sample1 = $p_sample2 = 0;
1105 for ($i = 0; $i < $num_samples; $i++){
1106 $p_sample1 = $p_sample1 + $samples[$i];
1107 $samples[$i] = $p_sample1 + $p_sample2;
1108 $p_sample2 = $samples[$i];
1114 $self->warn("Bad direction. Use \"forward\" or \"backward\".");
1119 =head2 _unpack_magik($buffer)
1121 Title : _unpack_magik($buffer)
1122 Usage : $self->_unpack_magik($buffer)
1123 Function: What unpack specification should be used? Try them all.
1125 Args : A buffer containing arbitrary binary data.
1126 Notes : Eliminate the ambiguity and the guesswork. Used in the
1127 adaptation of _delta(), mostly.
1132 my ($self,$buffer) = @_;
1133 my $length = length($buffer);
1134 my (@read,$counter);
1135 foreach (qw(c C s S i I l L n N v V)) {
1136 @read = unpack "$_$length", $buffer;
1137 for ($counter=0; $counter < 20; $counter++) {
1138 print("$read[$counter]\n");
1143 =head2 read_from_buffer($filehandle,$buffer,$length)
1145 Title : read_from_buffer($filehandle,$buffer,$length)
1146 Usage : $self->read_from_buffer($filehandle,$buffer,$length);
1147 Function: Read from the buffer.
1148 Returns : $buffer, containing a read of $length
1149 Args : a filehandle, a buffer, and a read length
1150 Notes : I just got tired of typing
1151 "unless (length($buffer) == $length)" so I put it here.
1155 sub read_from_buffer
{
1156 my ($self,$fh,$buffer,$length,$start_position) = @_;
1157 # print("Reading from a buffer!!! length($length) ");
1158 if ($start_position) {
1159 # print(" startposition($start_position)(".sprintf("%X", $start_position).")\n");
1162 if ($start_position) {
1163 # print("seeking to this position in the file: (".$start_position.")\n");
1164 seek ($fh,$start_position,0);
1165 # print("done. here is where I am now: (".tell($fh).")\n");
1168 # print("You did not specify a start position. Going from this position (the current position) (".tell($fh).")\n");
1170 read $fh, $buffer, $length;
1171 unless (length($buffer) == $length) {
1172 $self->warn("The read was incomplete! Trying harder.");
1173 my $missing_length = $length - length($buffer);
1175 read $fh,$buffer2,$missing_length;
1176 $buffer .= $buffer2;
1177 if (length($buffer) != $length) {
1178 $self->throw("Unexpected end of file while reading from SCF file. I should have read $length but instead got ".length($buffer)."! Current file position is ".tell($fh).".");
1187 Title : _dump_keys()
1188 Usage : &_dump_keys($a_reference_to_some_hash)
1189 Function: Dump out the keys in a hash.
1191 Args : A reference to a hash.
1192 Notes : A debugging method.
1198 if ($rhash !~ /HASH/) {
1199 print("_dump_keys: that was not a hash.\nIt was #$rhash# which was this reference:".ref($rhash)."\n");
1202 print("_dump_keys: The keys for $rhash are:\n");
1203 foreach (sort keys %$rhash) {
1208 =head2 _dump_base_accuracies()
1210 Title : _dump_base_accuracies()
1211 Usage : $self->_dump_base_accuracies();
1212 Function: Dump out the v3 base accuracies in an easy to read format.
1215 Notes : A debugging method.
1219 sub _dump_base_accuracies
{
1221 print("Dumping base accuracies! for v3\n");
1222 print("There are this many elements in a,c,g,t:\n");
1223 print(scalar(@
{$self->{'text'}->{'v3_base_accuracy_a'}}).",".scalar(@
{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@
{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@
{$self->{'text'}->{'v3_base_accuracy_t'}})."\n");
1224 my $number_traces = scalar(@
{$self->{'text'}->{'v3_base_accuracy_a'}});
1225 for (my $counter=0; $counter < $number_traces; $counter++ ) {
1226 print("$counter\t");
1227 print $self->{'text'}->{'v3_base_accuracy_a'}->[$counter]."\t";
1228 print $self->{'text'}->{'v3_base_accuracy_c'}->[$counter]."\t";
1229 print $self->{'text'}->{'v3_base_accuracy_g'}->[$counter]."\t";
1230 print $self->{'text'}->{'v3_base_accuracy_t'}->[$counter]."\t";
1235 =head2 _dump_peak_indices_incoming()
1237 Title : _dump_peak_indices_incoming()
1238 Usage : $self->_dump_peak_indices_incoming();
1239 Function: Dump out the v3 peak indices in an easy to read format.
1242 Notes : A debugging method.
1246 sub _dump_peak_indices_incoming
{
1248 print("Dump peak indices incoming!\n");
1249 my $length = $self->{'bases'};
1250 print("The length is $length\n");
1251 for (my $count=0; $count < $length; $count++) {
1252 print("$count\t$self->{parsed}->{peak_indices}->[$count]\n");
1256 =head2 _dump_base_accuracies_incoming()
1258 Title : _dump_base_accuracies_incoming()
1259 Usage : $self->_dump_base_accuracies_incoming();
1260 Function: Dump out the v3 base accuracies in an easy to read format.
1263 Notes : A debugging method.
1267 sub _dump_base_accuracies_incoming
{
1269 print("Dumping base accuracies! for v3\n");
1270 # print("There are this many elements in a,c,g,t:\n");
1271 # print(scalar(@{$self->{'parsed'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n");
1272 my $number_traces = $self->{'bases'};
1273 for (my $counter=0; $counter < $number_traces; $counter++ ) {
1274 print("$counter\t");
1275 foreach (qw(A T G C)) {
1276 print $self->{'parsed'}->{'base_accuracies'}->{$_}->[$counter]."\t";
1283 =head2 _dump_comments()
1285 Title : _dump_comments()
1286 Usage : $self->_dump_comments();
1287 Function: Debug dump the comments section from the scf.
1294 sub _dump_comments
{
1296 warn ("SCF comments:\n");
1297 foreach my $k (keys %{$self->{'comments'}}) {
1298 warn ("\t {$k} ==> ", $self->{'comments'}->{$k}, "\n");