1 # Bio::Tools::Alignment::Consed
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Chad Matsalla
7 # Copyright Chad Matsalla
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::Alignment::Consed - A module to work with objects from consed .ace files
19 # a report for sequencing stuff
20 my $o_consed = Bio::Tools::Alignment::Consed->new(
21 -acefile => "/path/to/an/acefile.ace.1",
23 my $foo = $o_consed->set_reverse_designator("r");
24 my $bar = $o_consed->set_forward_designator("f");
26 # get the contig numbers
27 my @keys = $o_consed->get_contigs();
29 # construct the doublets
30 my $setter_doublets = $o_consed->choose_doublets();
33 my @doublets = $o_consed->get_doublets();
37 L<Bio::Tools::Alignment::Consed> provides methods and objects to deal
38 with the output from the Consed software suite. Specifically,
39 takes an C<.ace> file and provides objects for the results.
41 A word about doublets: This module was written to accommodate a large
42 EST sequencing operation. In this case, EST's were sequenced from the
43 3' and from the 5' end of the EST. The objective was to find a
44 consensus sequence for these two reads. Thus, a contig of two is what
45 we wanted, and this contig should consist of the forward and reverse
46 reads of a getn clone. For example, for a forward designator of "F"
47 and a reverse designator of "R", if the two reads chad1F and chad1R
48 were in a single contig (for example Contig 5) it will be determined
49 that the consensus sequence for Contig 5 will be the sequence for
54 This module parses C<.ace> and related files. A detailed list of methods
55 can be found at the end of this document.
57 I wrote a detailed rationale for design that may explain the reasons
58 why some things were done the way they were done. That document is
59 beyond the scope of this pod and can probably be found in the
60 directory from which this module was 'made' or at
61 L<http://www.dieselwurks.com/bioinformatics/consedpm_documentation.pdf>.
63 Note that the POD in that document might be old but the original
64 rationale still stands.
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to one
72 of the Bioperl mailing lists. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 Please direct usage questions or support issues to the mailing list:
81 I<bioperl-l@bioperl.org>
83 rather than to the module maintainer directly. Many experienced and
84 reponsive experts will be able look at the problem and quickly
85 address it. Please include a thorough description of the problem
86 with code and data examples if at all possible.
90 Report bugs to the Bioperl bug tracking system to help us keep track
91 the bugs and their resolution. Bug reports can be submitted via the
94 https://github.com/bioperl/bioperl-live/issues
96 =head1 AUTHOR - Chad Matsalla
98 Email chad-at-dieselwurks.com
102 The rest of the documentation details each of the object
103 methods. Internal methods are usually preceded with a _
109 package Bio
::Tools
::Alignment
::Consed
;
114 use Dumpvalue
qw(dumpValue);
115 use Bio
::Tools
::Alignment
::Trim
;
118 use base
qw(Bio::Root::Root Bio::Root::IO);
120 our %DEFAULTS = ( 'f_designator' => 'f',
121 'r_designator' => 'r');
125 Title : new(-acefile => $path_to_some_acefile, -verbose => "1")
126 Usage : $o_consed = Bio::Tools::Alignment::Consed->
127 new(-acefile => $path_to_some_acefile, -verbose => "1");
128 Function: Construct the Bio::Tools::Alignment::Consed object. Sets
129 verbosity for the following procedures, if necessary:
130 1. Construct a new Bio::Tools::Alignment::Trim object, to
131 handle quality trimming 2. Read in the acefile and parse it
133 Returns : A reference to a Bio::Tools::Alignment::Consed object.
134 Args : A hash. (-acefile) is the filename of an acefile. If a full path
135 is not specified "./" is prepended to the filename and used from
136 instantiation until destruction. If you want
137 Bio::Tools::Alignment::Consed to be noisy during parsing of
138 the acefile, specify some value for (-verbose).
143 my ($class,%args) = @_;
144 my $self = $class->SUPER::new
(%args);
146 $self->{'filename'} = $args{'-acefile'};
148 # this is special to UNIX and should probably use catfile : DONE!
149 # if (!($self->{'filename'} =~ m{/})) {
150 # $self->{'filename'} = "./".$self->{'filename'};
152 # $self->{'filename'} =~ s#\\#\/#g if $^O =~ m/mswin/i;
153 # $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/;
154 # $self->{'path'} = $1;
156 # this is more generic and should work on most systems
157 (undef, $self->{'path'}, undef) = File
::Spec
->splitpath($self->{'filename'});
159 $self->_initialize_io('-file'=>$self->{'filename'});
160 $self->{'o_trim'} = Bio
::Tools
::Alignment
::Trim
->new(-verbose
=> $self->verbose());
161 $self->set_forward_designator($DEFAULTS{'f_designator'});
162 $self->set_reverse_designator($DEFAULTS{'r_designator'});
170 Title : set_verbose()
171 Usage : $o_consed->set_verbose(1);
172 Function: Set the verbosity level for debugging messages. On instantiation
173 of the Bio::Tools::Alignment::Consed object the verbosity level
176 Args : The verbosity levels are:
182 This method for setting verbosity has largely been superseded by a
183 sub-by-sub way, where for every sub you can provide a (-verbose)
184 switch. I am doing converting this bit-by-bit so do not be surprised
185 if some subs do not honour this.
192 sub set_verbose
{ (shift)->verbose(@_) }
194 =head2 get_filename()
196 Title : get_filename()
197 Usage : $o_consed->get_filename();
198 Function: Returns the name of the acefile being used by the
199 Bio::Tools::Alignment::Consed object.
200 Returns : A scalar containing the name of a file.
208 return $self->{'filename'};
211 =head2 count_sequences_with_grep()
213 Title : count_sequences_with_grep()
214 Usage : $o_consed->count_sequences_with_grep();
215 Function: Use /bin/grep to scan through the files in the ace project dir
216 and count sequences in those files. I used this method in the
217 development of this module to verify that I was getting all of the
218 sequences. It works, but it is (I think) unix-like platform
220 Returns : A scalar containing the number of sequences in the ace project
224 If you are on a non-UNIX platform, you really do not have to use
225 this. It is more of a debugging routine designed to address very
228 This method was reimplemented to be platform independent with a pure
229 perl implementation. The above note can be ignored.
233 sub count_sequences_with_grep
{
235 my ($working_dir,$grep_cli,@total_grep_sequences);
236 # this should be migrated to a pure perl implementation ala
237 # Tom Christiansen's 'tcgrep'
238 # http://www.cpan.org/modules/by-authors/id/TOMC/scripts/tcgrep.gz
240 open my $FILE, '<', $self->{'filename'} or do {
241 $self->warn("Could not read file '$self->{'filename'}' for grepping: $!");
245 while(<$FILE>) { $counter++ if(/^AF/); }
248 opendir my $SINGLETS, $self->{'path'};
249 foreach my $f ( readdir($SINGLETS) ) {
250 next unless ($f =~ /\.singlets$/);
252 my $singlet_file = File
::Spec
->catfile($self->{'path'}, $f);
253 open my $S_FILE, '<', $singlet_file or do {
254 $self->warn("Could not read file '$singlet_file': $!");
257 while(<$S_FILE>) { $counter++ if(/^>/) }
267 Usage : $o_consed->get_path();
268 Function: Returns the path to the acefile this object is working with.
269 Returns : Scalar. The path to the working acefile.
276 return $self->{'path'};
281 Title : get_contigs()
282 Usage : $o_consed->get_contigs();
283 Function: Return the keys to the Bio::Tools::Alignment::Consed object.
284 Returns : An array containing the keynames in the
285 Bio::Tools::Alignment::Consed object.
288 This would normally be used to get the keynames for some sort of
289 iterator. These keys are worthless in general day-to-day use because
290 in the Consed acefile they are simply Contig1, Contig2, ...
295 my ($self,$contig) = @_;
296 my @contigs = sort keys %{$self->{'contigs'}};
300 =head2 get_class($contig_keyname)
302 Title : get_class($contig_keyname)
303 Usage : $o_consed->get_class($contig_keyname);
304 Function: Return the class name for this contig
305 Returns : A scalar representing the class of this contig.
312 my ($self,$contig) = @_;
313 return $self->{contigs
}->{$contig}->{class};
316 =head2 get_quality_array($contig_keyname)
318 Title : get_quality_array($contig_keyname)
319 Usage : $o_consed->get_quality_array($contig_keyname);
320 Function: Returns the quality for the consensus sequence for the given
321 contig as an array. See get_quality_scalar to get this as a scalar.
322 Returns : An array containing the quality for the consensus sequence with
324 Args : The keyname of a contig. Note: This is a keyname. The key would
325 normally come from get_contigs.
327 Returns an array, not a reference. Is this a bug? I<thinking> No.
328 Well, maybe. Why was this developed like this? I was using FreezeThaw
329 for object persistence, and when it froze out these arrays it took a
330 long time to thaw it. Much better as a scalar.
332 See L<get_quality_scalar()|get_quality_scalar>
336 sub get_quality_array
{
337 my ($self,$contig) = @_;
338 return split ' ', $self->{contigs
}->{$contig}->{quality
};
341 =head2 get_quality_scalar($contig_keyname)
343 Title : get_quality_scalar($contig_keyname)
344 Usage : $o_consed->get_quality_scalar($contig_keyname);
345 Function: Returns the quality for the consensus sequence for the given
346 contig as a scalar. See get_quality_array to get this as an array.
347 Returns : An scalar containing the quality for the consensus sequence with
349 Args : The keyname of a contig. Note this is a _keyname_. The key would
350 normally come from get_contigs.
352 Why was this developed like this? I was using FreezeThaw for object
353 persistence, and when it froze out these arrays it took a coon's age
354 to thaw it. Much better as a scalar.
356 See L<get_quality_array()|get_quality_array>
361 sub get_quality_scalar
{
362 my ($self,$contig) = @_;
363 return $self->{'contigs'}->{$contig}->{'quality'};
368 Title : freeze_hash()
369 Usage : $o_consed->freeze_hash();
371 Function: Use Ilya's FreezeThaw module to create a persistent data
372 object for this Bio::Tools::Alignment::Consed data
373 structure. In the case of AAFC, we use
374 Bio::Tools::Alignment::Consed to pre-process bunches of
375 sequences, freeze the structures, and send in a harvesting
376 robot later to do database stuff.
380 This procedure was removed so Consed.pm won't require FreezeThaw.
387 $self->warn("This method (freeze_hash) was removed ".
388 "from the bioperl consed.pm. Sorry.\n");
390 $self->debug("Bio::Tools::Alignment::Consed::freeze_hash:".
391 " \$self->{path} is $self->{path}\n");
392 my $filename = $self->{'path'}."frozen";
393 my %contigs = %{$self->{'contigs'}};
394 my $frozen = freeze
(%contigs);
396 open my $FREEZE, '>', $filename or do {
397 $self->warn( "Bio::Tools::Alignment::Consed could not ".
398 "freeze the contig hash because the file ".
399 "($filename) could not be opened: $!");
402 print $FREEZE $frozen;
407 =head2 get_members($contig_keyname)
409 Title : get_members($contig_keyname)
410 Usage : $o_consed->get_members($contig_keyname);
411 Function: Return the _names_ of the reads in this contig.
412 Returns : An array containing the names of the reads in this contig.
413 Args : The keyname of a contig. Note this is a keyname. The keyname
414 would normally come from get_contigs.
416 See L<get_contigs()|get_contigs>
421 my ($self,$contig) = @_;
423 $self->warn("You need to provide the name of a contig to ".
424 "use Bio::Tools::Alignment::Consed::get_members!\n");
427 return @
{$self->{'contigs'}->{$contig}->{'member_array'}};
430 =head2 get_members_by_name($some_arbitrary_name)
432 Title : get_members_by_name($some_arbitrary_name)
433 Usage : $o_consed->get_members_by_name($some_arbitrary_name);
434 Function: Return the names of the reads in a contig. This is the name given
435 to $contig{key} based on what is in the contig. This is different
436 from the keys retrieved through get_contigs().
437 Returns : An array containing the names of the reads in the contig with this
439 Args : The name of a contig. Not a key, but a name.
441 Highly inefficient. use some other method if possible.
442 See L<get_contigs()|get_contigs>
446 sub get_members_by_name
{
447 my ($self,$name) = @_;
448 # build a list to try to screen for redundancy
449 my @contigs_with_that_name;
450 foreach my $currkey ( sort keys %{$self->{'contigs'}} ) {
451 next if (!$self->{'contigs'}->{$currkey}->{'name'});
452 if ($self->{'contigs'}->{$currkey}->{'name'} eq "$name") {
453 push @contigs_with_that_name,$currkey;
456 my $count = @contigs_with_that_name;
458 my $contig_num = $contigs_with_that_name[0];
459 return @
{$self->{'contigs'}->{$contig_num}->{'member_array'}};
463 =head2 get_contig_number_by_name($some_arbitrary_name)
465 Title : get_contig_number_by_name($some_arbitrary_name)
466 Usage : $o_consed->get_contig_number_by_name($some_arbitrary_name);
467 Function: Return the names of the reads in a contig. This is the name given
468 to $contig{key} based on what is in the contig. This is different
469 from the keys retrieved through get_contigs().
470 Returns : An array containing the names of the reads in the contig with this
472 Args : The name of a contig. Not a key, but a name.
474 See L<get_contigs()|get_contigs>
478 sub get_contig_number_by_name
{
479 my ($self,$name) = @_;
480 foreach my $currkey (sort keys %{$self->{'contigs'}}) {
481 if ($self->{'contigs'}->{$currkey}->{'name'} &&
482 $self->{'contigs'}->{$currkey}->{'name'} eq "$name") {
488 =head2 get_sequence($contig_keyname)
490 Title : get_sequence($contig_keyname)
491 Usage : $o_consed->get_sequence($contig_keyname);
492 Function: Returns the consensus sequence for a given contig.
493 Returns : A scalar containing a sequence.
494 Args : The keyname of a contig. Note this is a key. The key would
495 normally come from get_contigs.
497 See L<get_contigs()|get_contigs>
502 my ($self,$contig) = @_;
503 return $self->{'contigs'}->{$contig}->{'consensus'};
506 =head2 set_final_sequence($some_sequence)
508 Title : set_final_sequence($name,$some_sequence)
509 Usage : $o_consed->set_final_sequence($name,$some_sequence);
510 Function: Provides a manual way to set the sequence for a given key in the
511 contig hash. Rarely used.
513 Args : The name (not the keyname) of a contig and an arbitrary string.
515 A method with a questionable and somewhat mysterious origin. May raise
516 the dead or something like that.
520 sub set_final_sequence
{
521 my ($self,$name,$sequence) = @_;
522 if (!$self->{'contigs'}->{$name}) {
523 $self->warn("You cannot set the final sequence for ".
524 "$name because it doesn't exist!\n");
528 $self->{'contigs'}->{$name}->{'final_sequence'} = $sequence;
535 Title : _read_file();
536 Usage : _read_file();
537 Function: An internal subroutine used to read in an acefile and parse it
538 into a Bio::Tools::Alignment::Consed object.
542 This routine creates and saves the filhandle for reading the files in
549 my ($line,$in_contig,$in_quality,$contig_number,$top);
550 # make it easier to type $fhl
551 while (defined($line=$self->_readline()) ) {
553 # check if there is anything on this line
554 # if not, you can stop gathering consensus sequence
556 # if the line is blank you are no longer to gather consensus
557 # sequence or quality values
561 # you are currently gathering consensus sequence
563 if ($in_contig == 1) {
564 $self->debug("Adding $line to consensus of contig number $contig_number.\n");
565 $self->{'contigs'}->{$contig_number}->{'consensus'} .= $line;
568 elsif ($in_quality) {
574 # I wrote this in here because acefiles produced by
575 # cap3 do not have a leading space like the acefiles
576 # produced by phrap and there is the potential to have
577 # concatenated quality values like this: 2020 rather
578 # then 20 20 whre lines collide. Thanks Andrew for
581 if ($self->{'contigs'}->{$contig_number}->{'quality'} &&
582 !($self->{'contigs'}->{$contig_number}->{'quality'} =~ m/\ $/)) {
583 $self->{'contigs'}->{$contig_number}->{'quality'} .= " ";
585 $self->{'contigs'}->{$contig_number}->{'quality'} .= $line;
588 elsif ($line =~ /^BQ/) {
592 # the line /^CO/ like this:
593 # CO Contig1 796 1 1 U
594 # can be broken down as follows:
596 # Contig1 - the name of this contig
597 # 796 - Number of bases in this contig
598 # 1 - Number of reads in this contig
599 # 1 - number of base segments in this contig
602 elsif ($line =~ /^CO/) {
603 $line =~ m/^CO\ Contig(\d+)\ \d+\ \d+\ \d+\ (\w)/;
606 $self->debug("Contig $contig_number is complemented!\n");
608 $self->{'contigs'}->{$contig_number}->{'member_array'} = [];
609 $self->{'contigs'}->{$contig_number}->{'contig_direction'} = "$2";
614 # this BS is deprecated, I think.
615 # haha, I am really witty. <ew>
617 elsif ($line =~ /^BSDEPRECATED/) {
618 $line =~ m/^BS\s+\d+\s+\d+\s+(.+)/;
620 $self->{'contigs'}->{$contig_number}->{$member}++;
622 # the members of the contigs are determined by the AF line in the ace file
623 elsif ($line =~ /^AF/) {
624 $self->debug("I see an AF line here.\n");
625 $line =~ /^AF\ (\S+)\ (\w)\ (\S+)/;
627 # push the name of the current read onto the member array for this contig
628 push @
{$self->{'contigs'}->{$contig_number}->{'member_array'}},$1;
630 # the first read in the contig will be named the "top" read
632 $self->debug("\$top is not set.\n");
633 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") {
634 $self->debug("Reversing the order of the reads. The bottom will be $1\n");
636 # if the contig sequence is marked as the
637 # complement, the top becomes the bottom and$
638 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1;
639 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2;
640 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3;
643 $self->debug("NOT reversing the order of the reads. ".
644 "The top_name will be $1\n");
645 # if the contig sequence is marked as the
646 # complement, the top becomes the bottom and$
647 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1;
648 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2;
649 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3;
655 # if the contig sequence is marked as the complement,
656 # the top becomes the bottom and the bottom becomes
658 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") {
659 $self->debug("Reversing the order of the reads. The top will be $1\n");
660 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1;
661 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2;
662 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3;
665 $self->debug("NOT reversing the order of the reads. The bottom will be $1\n");
666 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1;
667 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2;
668 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3;
677 =head2 set_reverse_designator($some_string)
679 Title : set_reverse_designator($some_string)
680 Usage : $o_consed->set_reverse_designator($some_string);
681 Function: Set the designator for the reverse read of contigs in this
682 Bio::Tools::Alignment::Consed object. Used to determine if
683 contigs containing two reads can be named.
684 Returns : The value of $o_consed->{reverse_designator} so you can check
685 to see that it was set properly.
686 Args : An arbitrary string.
688 May be useful only to me. I<shrug>
692 sub set_reverse_designator
{
693 my ($self,$reverse_designator) = @_;
694 $self->{'reverse_designator'} = $reverse_designator;
695 $self->{'o_trim'}->set_reverse_designator($reverse_designator);
696 return $self->{'reverse_designator'};
697 } # end set_reverse_designator
699 =head2 set_forward_designator($some_string)
701 Title : set_forward_designator($some_string)
702 Usage : $o_consed->set_forward_designator($some_string);
703 Function: Set the designator for the forward read of contigs in this
704 Bio::Tools::Alignment::Consed object. Used to determine if
705 contigs containing two reads can be named.
706 Returns : The value of $o_consed->{forward_designator} so you can check
707 to see that it was set properly.
708 Args : An arbitrary string.
710 May be useful only to me. I<shrug>
714 sub set_forward_designator
{
715 my ($self,$forward_designator) = @_;
716 $self->{'forward_designator'} = $forward_designator;
717 $self->{'o_trim'}->set_forward_designator($forward_designator);
718 return $self->{'forward_designator'};
719 } # end set_forward_designator
721 =head2 set_designator_ignore_case("yes")
723 Title : set_designator_ignore_case("yes")
724 Usage : $o_consed->set_designator_ignore_case("yes");
725 Function: Deprecated.
726 Returns : Deprecated.
729 Deprecated. Really. Trust me.
733 sub set_designator_ignore_case
{
734 my ($self,$ignore_case) = @_;
735 if ($ignore_case eq "yes") {
736 $self->{'designator_ignore_case'} = 1;
738 return $self->{'designator_ignore_case'};
739 } # end set_designator_ignore_case
741 =head2 set_trim_points_singlets_and_singletons()
743 Title : set_trim_points_singlets_and_singletons()
744 Usage : $o_consed->set_trim_points_singlets_and_singletons();
745 Function: Set the trim points for singlets and singletons based on
746 quality. Uses the Bio::Tools::Alignment::Trim object. Use
747 at your own risk because the Bio::Tools::Alignment::Trim
748 object was designed specifically for me and is mysterious
749 in its ways. Every time somebody other then me uses it a
750 swarm of locusts decends on a small Central American
751 village so do not say you weren't warned.
755 Working on exceptions and warnings here.
757 See L<Bio::Tools::Alignment::Trim> for more information
761 #' to make my emacs happy
763 sub set_trim_points_singlets_and_singletons
{
765 $self->debug("Consed.pm : \$self is $self\n");
766 my (@points,$trimmed_sequence);
767 if (!$self->{'doublets_set'}) {
768 $self->debug("You need to set the doublets before you use ".
769 "set_trim_points_singlets_and_doublets. Doing that now.");
770 $self->set_doublets();
772 foreach (sort keys %{$self->{'contigs'}}) {
773 if ($self->{'contigs'}->{$_}->{'class'} eq "singlet") {
774 $self->debug("Singlet $_\n");
775 # this is what Warehouse wants
776 # my ($self,$sequence,$quality,$name) = @_;
777 # this is what Bio::Tools::Alignment::Trim::trim_singlet wants:
778 # my ($self,$sequence,$quality,$name,$class) = @_;
779 # the following several lines are to make the parameter passing legible.
780 my ($sequence,$quality,$name,$class);
781 $sequence = $self->{'contigs'}->{$_}->{'consensus'};
782 if (!$self->{'contigs'}->{$_}->{'quality'}) { $quality = "unset"; }
783 else { $quality = $self->{'contigs'}->{$_}->{'quality'}; }
784 $name = $self->{'contigs'}->{$_}->{'name'};
785 $class = $self->{'contigs'}->{$_}->{'class'};
786 @points = @
{$self->{'o_trim'}->trim_singlet($sequence,$quality,$name,$class)};
787 $self->{'contigs'}->{$_}->{'start_point'} = $points[0];
788 $self->{'contigs'}->{$_}->{'end_point'} = $points[1];
789 $self->{'contigs'}->{$_}->{'sequence_trimmed'} =
790 substr($self->{contigs
}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]);
793 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_singlets".
794 "_and_singletons: Done setting the quality trimpoints.\n");
796 } # end set_trim_points_singlet
798 =head2 set_trim_points_doublets()
800 Title : set_trim_points_doublets()
801 Usage : $o_consed->set_trim_points_doublets();
802 Function: Set the trim points for doublets based on quality. Uses the
803 Bio::Tools::Alignment::Trim object. Use at your own risk because
804 the Bio::Tools::Alignment::Trim object was designed specifically
805 for me and is mysterious in its ways. Every time somebody other
806 then me uses it you risk a biblical plague being loosed on your
810 Notes : Working on exceptions here.
812 See L<Bio::Tools::Alignment::Trim> for more information
816 sub set_trim_points_doublets
{
819 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets:".
820 " Restoring zeros for doublets.\n");
821 # &show_missing_sequence($self);
822 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets:".
823 " Setting doublet trim points.\n");
824 foreach (sort keys %{$self->{'contigs'}}) {
825 if ($self->{'contigs'}->{$_}->{'class'} eq "doublet") {
826 # my ($self,$sequence,$quality,$name,$class) = @_;
827 my @quals = split(' ',$self->{'contigs'}->{$_}->{'quality'});
829 @points = $self->{o_trim
}->trim_doublet
830 ($self->{'contigs'}->{$_}->{'consensus'},
831 $self->{'contigs'}->{$_}->{'quality'},
832 $self->{'contigs'}->{$_}->{name
},
833 $self->{'contigs'}->{$_}->{'class'});
834 $self->{'contigs'}->{$_}->{'start_point'} = $points[0];
835 $self->{'contigs'}->{$_}->{'end_point'} = $points[1];
837 $self->{'contigs'}->{$_}->{'sequence_trimmed'} =
838 substr($self->{contigs
}->{$_}->{'consensus'},
839 $points[0],$points[1]-$points[0]);
840 # 010102 the deprecated way to do things:
843 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets:".
844 " Done setting doublet trim points.\n");
846 } # end set_trim_points_doublets
848 =head2 get_trimmed_sequence_by_name($name)
850 Title : get_trimmed_sequence_by_name($name)
851 Usage : $o_consed->get_trimmed_sequence_by_name($name);
852 Function: Returns the trimmed_sequence of a contig with {name} eq $name.
853 Returns : A scalar- the trimmed sequence.
854 Args : The {name} of a contig.
859 sub get_trimmed_sequence_by_name
{
860 my ($self,$name) = @_;
861 my $trimmed_sequence;
862 my $contigname = &get_contig_number_by_name
($self,$name);
863 my $class = $self->{'contigs'}->{$contigname}->{'class'};
864 # what is this business and who was smoking crack while writing this?
865 # if ($class eq "singlet") {
866 # send the sequence, the quality, and the name
867 # $trimmed_sequence = $self->{o_trim}->trim_singlet
868 # ($self->{'contigs'}->{$contigname}->{consensus},
869 # $self->{'contigs'}->{$contigname}->{'quality'},$name);
871 return $self->{'contigs'}->{$contigname}->{'sequence_trimmed'};
874 =head2 set_dash_present_in_sequence_name("yes")
876 Title : set_dash_present_in_sequence_name("yes")
877 Usage : $o_consed->set_dash_present_in_sequence_name("yes");
878 Function: Deprecated. Part of an uncompleted thought. ("Oooh! Shiny!")
880 Args : "yes" to set {dash_present_in_sequence_name} to 1
885 sub set_dash_present_in_sequence_name
{
886 my ($self,$dash_present) = @_;
887 if ($dash_present eq "yes") {
888 $self->{'dash_present_in_sequence_name'} = 1;
891 $self->{'dash_present_in_sequence_name'} = 0;
893 return $self->{'dash_present_in_sequence_name'};
894 } # end set_dash_present_in_sequence_name
896 =head2 set_doublets()
898 Title : set_doublets()
899 Usage : $o_consed->set_doublets();
900 Function: Find pairs that have similar names and mark them as doublets
905 A complicated subroutine that iterates over the
906 Bio::Tools::Alignment::Consed looking for contigs of 2. If the forward
907 and reverse designator are removed from each of the reads in
908 {'member_array'} and the remaining reads are the same, {name} is set
909 to that name and the contig's class is set as "doublet". If any of
910 those cases fail the contig is marked as a "pair".
914 #' make my emacs happy
918 # set the designators in the Bio::Tools::Alignment::Trim object
920 $self->{'o_trim'}->set_designators($self->{'reverse_designator'},
921 $self->{'forward_designator'});
922 foreach my $key_contig (sort keys %{$self->{'contigs'}}) {
924 # if there is a member array (why would there not be? This should be a die()able offence
925 # but for now I will leave it
926 if ($self->{'contigs'}->{$key_contig}->{'member_array'}) {
927 # if there are two reads in this contig
928 # i am pretty sure that this is wrong but i am keeping it for reference
929 # if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2 || !$self->{'contigs'}->{$key_contig}->{'class'}) {
931 # <nod> WRONG. Was I on crack?
932 if (@
{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2) {
933 $self->{'contigs'}->{$key_contig}->{'num_members'} = 2;
934 $self->debug("\tThere are 2 members! Looking for the contig name...\n");
935 my $name = _get_contig_name
($self,$self->{'contigs'}->{$key_contig}->{'member_array'});
936 $self->debug("The name is $name\n") if defined $name;
938 $self->{'contigs'}->{$key_contig}->{'name'} = $name;
939 $self->{'contigs'}->{$key_contig}->{'class'} = "doublet";
941 $self->debug("$key_contig is a pair.\n");
942 $self->{'contigs'}->{$key_contig}->{'class'} = "pair";
945 # this is all fair and good but what about singlets?
946 # they have one reads in the member_array but certainly are not singletons
947 elsif (@
{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 1) {
948 # set the name to be the name of the read
949 $self->{'contigs'}->{$key_contig}->{name
} = @
{$self->{'contigs'}->{$key_contig}->{'member_array'}}[0];
950 # set the number of members to be one
951 $self->{'contigs'}->{$key_contig}->{num_members
} = 1;
952 # if this was a singlet, it would already belong to the class "singlet"
954 # if it is not a singlet, it is a singleton! lablel it appropriately
955 unless ($self->{'contigs'}->{$key_contig}->{'class'}) {
956 $self->{'contigs'}->{$key_contig}->{'class'} = "singleton";
959 # set the multiplet characteristics
960 elsif (@
{$self->{'contigs'}->{$key_contig}->{'member_array'}} >= 3) {
961 $self->{'contigs'}->{$key_contig}->{'num_members'} = @
{$self->{'contigs'}->{$key_contig}->{'member_array'}};
962 $self->{'contigs'}->{$key_contig}->{'class'} = "multiplet";
964 $self->{'contigs'}->{$key_contig}->{'num_members'} = @
{$self->{'contigs'}->{$key_contig}->{'member_array'}};
968 $self->{'doublets_set'} = "done";
975 Usage : $o_consed->set_singlets();
976 Function: Read in a singlets file and place them into the
977 Bio::Tools::Alignment::Consed object.
979 Args : A scalar to turn on verbose parsing of the singlets file.
985 # parse out the contents of the singlets file
987 $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n");
988 my $full_filename = $self->{'filename'};
989 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n");
990 $full_filename =~ s
#\\#\/#g if $^O =~ m/mswin/i;
991 $full_filename =~ m/(.*\/)(.*ace
.*)$/;
992 my ($base_path,$filename) = ($1,$2);
993 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n");
994 $filename =~ m/(.*)ace.*$/;
995 my $singletsfile = $base_path.$1."singlets";
996 $self->debug("\$singletsfile is $singletsfile\n");
997 if (!-f
$singletsfile) {
998 # there is no singlets file.
999 $self->{'singlets_set'} = "done";
1002 $self->debug("$singletsfile is indeed a file. Trying to open it...\n");
1003 my $singlets_fh = Bio
::Root
::IO
->new(-file
=> $singletsfile);
1004 my ($sequence,$name,$count);
1005 while ($_ = $singlets_fh->_readline()) {
1008 if ($name && $sequence) {
1009 $self->debug("Adding $name with sequence $sequence to hash...\n");
1010 push @
{$self->{'contigs'}->{$name}->{'member_array'}},$name;
1011 $self->{'contigs'}->{$name}->{'consensus'} = $sequence;
1012 $self->{'contigs'}->{$name}->{'name'} = $name;
1013 $self->{'contigs'}->{$name}->{"singlet"} = 1;
1014 $self->{'contigs'}->{$name}->{'class'} = "singlet";
1016 $sequence = $name = undef;
1018 m/^\>(.*)\s\sCHROMAT/;
1025 else { $sequence .= $_; }
1027 if ($name && $sequence) {
1028 $self->debug("Pushing the last of the singlets ($name)\n");
1029 @
{$self->{'contigs'}->{$name}->{'member_array'}} = $name;
1030 $self->{'contigs'}->{$name}->{'consensus'} = $sequence;
1031 $self->{'contigs'}->{$name}->{'name'} = $name;
1032 $self->{'contigs'}->{$name}->{"singlet"} = 1;
1033 $self->{'contigs'}->{$name}->{'class'} = "singlet";
1035 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: Done adding singlets to the singlets hash.\n");
1036 $self->{'singlets_set'} = "done";
1038 } # end sub set_singlets
1040 =head2 get_singlets()
1042 Title : get_singlets()
1043 Usage : $o_consed->get_singlets();
1044 Function: Return the keynames of the singlets.
1045 Returns : An array containing the keynames of all
1046 Bio::Tools::Alignment::Consed sequences in the class "singlet".
1053 # returns an array of singlet names
1054 # singlets have "singlet"=1 in the hash
1056 if (!$self->{singlets_set
}) {
1057 $self->debug("You need to set the singlets before you get them. Doing that now.");
1058 $self->set_singlets();
1061 my (@singlets,@array);
1062 foreach my $key (sort keys %{$self->{'contigs'}}) {
1063 # @array = @{$Consed::contigs{$key}->{'member_array'}};
1064 # somethimes a user will try to get a list of singlets before the classes for the rest of the
1065 # contigs has been set (see t/test.t for how I figured this out. <bah>
1066 # so either way, just return class=singlets
1067 if (!$self->{'contigs'}->{$key}->{'class'}) {
1068 # print("$key has no class. why?\n");
1070 elsif ($self->{'contigs'}->{$key}->{'class'} eq "singlet") {
1071 push @singlets,$key;
1077 =head2 set_quality_by_name($name,$quality)
1079 Title : set_quality_by_name($name,$quality)
1080 Usage : $o_consed->set_quality_by_name($name,$quality);
1081 Function: Deprecated. Make the contig with {name} have {'quality'} $quality.
1082 Probably used for testing.
1084 Args : The name of a contig and a scalar for its quality.
1089 sub set_quality_by_name
{
1090 # this is likely deprecated
1091 my ($self,$name,$quality) = shift;
1093 foreach (sort keys %{$self->{'contigs'}}) {
1094 if ($self->{'contigs'} eq "$name" || $self->{'contigs'}->{'name'} eq "$name") {
1095 $self->{'contigs'}->{'quality'} = $quality;
1099 if ($return) { return "0"; } else { return "1"; }
1100 } # end set quality by name
1102 =head2 set_singlet_quality()
1104 Title : set_singlet_quality()
1105 Usage : $o_consed->set_singlet_quality();
1106 Function: For each singlet, go to the appropriate file in phd_dir and read
1107 in the phred quality for that read and place it into {'quality'}
1110 Notes : This is the next subroutine that will receive substantial revision
1111 in the next little while. It really should eval the creation of
1112 Bio::Tools::Alignment::Phred objects and go from there.
1116 sub set_singlet_quality
{
1118 my $full_filename = $self->{'filename'};
1119 $full_filename =~ s
#\\#\/#g if $^O =~ m/mswin/i;
1120 $full_filename =~ m/(.*\/)(.*)ace
.*$/;
1121 my ($base_path,$filename) = ($1,"$2"."qual");
1122 my $singletsfile = $base_path.$filename;
1123 if (-f
$singletsfile) {
1124 # print("$singletsfile is indeed a file. Trying to open it...\n");
1127 $self->warn("$singletsfile is not a file. Sorry.\n");
1130 my $singlets_fh = Bio
::Root
::IO
->new(-file
=> $singletsfile);
1131 my ($sequence,$name,$count);
1132 my ($identity,$line,$quality,@qline);
1133 while ($line = $singlets_fh->_readline()) {
1135 if ($line =~ /^\>/) {
1137 $line =~ m/\>(\S*)\s/;
1141 if ($self->{'contigs'}->{$identity}) {
1142 $self->{'contigs'}->{$identity}->{'quality'} .= "$line ";
1150 =head2 set_contig_quality()
1152 Title : set_contig_quality()
1153 Usage : $o_consed->set_contig_quality();
1154 Function: Deprecated.
1155 Returns : Deprecated.
1157 Notes : Deprecated. Really. Trust me.
1161 sub set_contig_quality
{
1162 # note: contigs _include_ singletons but _not_ singlets
1164 # the unexpected results I am referring to here are a doubling of quality values.
1165 # the profanity I uttered on discovering this reminded me of the simpsons:
1166 # Ned Flanders: "That is the loudest profanity I have ever heard!"
1167 $self->warn("set_contig_quality is deprecated and will likely produce unexpected results");
1168 my $full_filename = $self->{'filename'};
1169 # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual
1171 $full_filename =~ s
#\\#\/#g if $^O =~ m/mswin/i;
1172 $full_filename =~ m/(.*\/)(.*)ace
.*$/;
1173 my ($base_path,$filename) = ($1,"$2"."contigs.qual");
1174 my $singletsfile = $base_path.$filename;
1175 if (-f
$singletsfile) {
1176 # print("$singletsfile is indeed a file. Trying to open it...\n");
1179 $self->warn("Bio::Tools::Alignment::Consed::set_contig_quality $singletsfile is not a file. Sorry.\n");
1182 my $contig_quality_fh = Bio
::Root
::IO
->new(-file
=> $singletsfile);
1184 my ($sequence,$name,$count,$identity,$line,$quality);
1185 while ($line = $contig_quality_fh->_readline()) {
1187 if ($line =~ /^\>/) {
1189 $line =~ m/\>.*Contig(\d+)\s/;
1193 if ($self->{'contigs'}->{$identity} ) {
1194 $self->{'contigs'}->{$identity}->{'quality'} .= " $line";
1198 } # end set_contig_quality
1200 =head2 get_multiplets()
1202 Title : get_multiplets()
1203 Usage : $o_consed->get_multiplets();
1204 Function: Return the keynames of the multiplets.
1205 Returns : Returns an array containing the keynames of all
1206 Bio::Tools::Alignment::Consed sequences in the class "multiplet".
1212 sub get_multiplets
{
1213 # returns an array of multiplet names
1214 # multiplets have # members > 2
1216 my (@multiplets,@array);
1217 foreach my $key (sort keys %{$self->{'contigs'}}) {
1218 if ($self->{'contigs'}->{$key}->{'class'}) {
1219 if ($self->{'contigs'}->{$key}->{'class'} eq "multiplet") {
1220 push @multiplets,$key;
1227 =head2 get_all_members()
1229 Title : get_all_members()
1230 Usage : @all_members = $o_consed->get_all_members();
1231 Function: Return a list of all of the read names in the
1232 Bio::Tools::Alignment::Consed object.
1233 Returns : An array containing all of the elements in all of the
1240 sub get_all_members
{
1243 foreach my $key (sort keys %{$self->{'contigs'}}) {
1244 if ($key =~ /^singlet/) {
1245 push @members,$self->{'contigs'}->{$key}->{'member_array'}[0];
1247 elsif ($self->{'contigs'}->{$key}->{'member_array'}) {
1248 push @members,@
{$self->{'contigs'}->{$key}->{'member_array'}};
1251 # print("Bio::Tools::Alignment::Consed: $key is _not_ an array. Pushing $self->{'contigs'}->{$key}->{'member_array'} onto \@members\n");
1252 # push @members,$self->{'contigs'}->{$key}->{'member_array'};
1258 =head2 sum_lets($total_only)
1260 Title : sum_lets($total_only)
1261 Usage : $statistics = $o_consed->sum_lets($total_only);
1262 Function: Provide numbers for how many sequences were accounted for in the
1263 Bio::Tools::Alignment::Consed object.
1264 Returns : If a scalar is present, returns the total number of
1265 sequences accounted for in all classes. If no scalar passed
1266 then returns a string that looks like this:
1267 Singt/singn/doub/pair/mult/total : 2,0,1(2),0(0),0(0),4
1268 This example means the following: There were 1 singlets.
1269 There were 0 singletons. There were 1 doublets for a total
1270 of 2 sequences in this class. There were 0 pairs for a
1271 total of 0 sequences in this class. There were 0
1272 multiplets for a total of 0 sequences in this class. There
1273 were a total of 4 sequences accounted for in the
1274 Bio::Tools::Alignment::Consed object.
1275 Args : A scalar is optional to change the way the numbers are returned.
1281 my ($self,$total_only) = @_;
1282 my ($count,$count_multiplets,$multiplet_count);
1283 my $singlets = &get_singlets
($self); $count += $singlets;
1284 my $doublets = &get_doublets
($self); $count += ($doublets * 2);
1285 my $pairs = &get_pairs
($self); $count += ($pairs * 2);
1286 my $singletons = &get_singletons
($self); $count += $singletons;
1287 my @multiplets = &get_multiplets
($self);
1288 $count_multiplets = @multiplets;
1290 foreach (@multiplets) {
1291 my $number_members = $self->{'contigs'}->{$_}->{num_members
};
1292 $multiplet_count += $number_members;
1294 if ($multiplet_count) {
1295 $count += $multiplet_count;
1297 foreach (qw(multiplet_count singlets doublets pairs singletons
1298 multiplets count_multiplets)) {
1299 no strict
'refs'; # renege for the block
1304 if (!$multiplet_count) { $multiplet_count = 0; }
1308 $return_string = "Singt/singn/doub/pair/mult/total : ".
1309 "$singlets,$singletons,$doublets(".
1310 ($doublets*2)."),$pairs(".($pairs*2).
1311 "),$count_multiplets($multiplet_count),$count";
1312 return $return_string;
1315 =head2 write_stats()
1317 Title : write_stats()
1318 Usage : $o_consed->write_stats();
1319 Function: Write a file called "statistics" containing numbers similar to
1320 those provided in sum_lets().
1321 Returns : Nothing. Write a file in $o_consed->{path} containing something
1324 0,0,50(100),0(0),0(0),100
1326 Where the numbers provided are in the format described in the
1327 documentation for sum_lets().
1329 Notes : This might break platform independence, I do not know.
1331 See L<sum_lets()|sum_lets>
1336 # worry about platform dependence here?
1339 my $stats_filename = $self->{'path'}."statistics";
1340 my $statistics_raw = $self->sum_lets;
1341 my ($statsfilecontents) = $statistics_raw =~ s/.*\ \:\ //g;
1343 my $fh = Bio
::Root
::IO
->new(-file
=>"$stats_filename");
1344 # open my $STATSFILE, '>', $stats_filename or print "Could not write the statsfile: $!\n");
1345 $fh->_print("$statsfilecontents");
1350 =head2 get_singletons()
1352 Title : get_singletons()
1353 Usage : @singletons = $o_consed->get_singletons();
1354 Function: Return the keynames of the singletons.
1355 Returns : Returns an array containing the keynames of all
1356 Bio::Tools::Alignment::Consed sequences in the class "singleton".
1362 sub get_singletons
{
1363 # returns an array of singleton names
1364 # singletons are contigs with one member (see consed documentation)
1366 my (@singletons,@array);
1367 foreach my $key (sort keys %{$self->{'contigs'}}) {
1368 if ($self->{'contigs'}->{$key}->{'class'}) {
1369 # print ("$key class: $self->{'contigs'}->{$key}->{'class'}\n");
1372 # print("$key belongs to no class. why?\n");
1374 if ($self->{'contigs'}->{$key}->{'member_array'}) {
1375 @array = @
{$self->{'contigs'}->{$key}->{'member_array'}};
1377 my $num_array_elem = @array;
1378 if ($num_array_elem == 1 && $self->{'contigs'}->{$key}->{'class'} && $self->{'contigs'}->{$key}->{'class'} eq "singleton") { push @singletons,$key; }
1386 Usage : @pairs = $o_consed->get_pairs();
1387 Function: Return the keynames of the pairs.
1388 Returns : Returns an array containing the keynames of all
1389 Bio::Tools::Alignment::Consed sequences in the class "pair".
1396 # returns an array of pair contig names
1397 # a pair is a contig of two where the names do not match
1400 foreach my $key (sort keys %{$self->{'contigs'}}) {
1401 if ($self->{'contigs'}->{$key}->{'member_array'}) {
1402 if (@
{$self->{'contigs'}->{$key}->{'member_array'}} == 2 &&
1403 $self->{'contigs'}->{$key}->{'class'} eq "pair") {
1411 =head2 get_name($contig_keyname)
1413 Title : get_name($contig_keyname)
1414 Usage : $name = $o_consed->get_name($contig_keyname);
1415 Function: Return the {name} for $contig_keyname.
1416 Returns : A string. ({name})
1417 Args : A contig keyname.
1423 my ($self,$contig) = @_;
1424 return $self->{'contigs'}->{$contig}->{'name'};
1427 =head2 _get_contig_name(\@array_containing_reads)
1429 Title : _get_contig_name(\@array_containing_reads)
1430 Usage : $o_consed->_get_contig_name(\@array_containing_reads);
1431 Function: The logic for the set_doublets subroutine.
1432 Returns : The name for this contig.
1433 Args : A reference to an array containing read names.
1434 Notes : Depends on reverse_designator. Be sure this is set the way you
1439 sub _get_contig_name
{
1440 my ($self,$r_array) = @_;
1441 my @contig_members = @
$r_array;
1443 foreach (@contig_members) {
1444 # how can I distinguish the clone name from the direction label?
1445 # look for $Consed::reverse_designator and $Consed::forward_designator
1446 # what if you do not find _any_ of those?
1447 my $forward_designator = $self->{'forward_designator'} || "f";
1448 my $reverse_designator = $self->{'reverse_designator'} || "r";
1449 my $any_hits = /(.+)($forward_designator.*)/ || /(.+)($reverse_designator.*)/||/(.+)(_.+)/;
1453 # print("\t\$name is $name ");
1456 # print("and \$suffix is $suffix.\n");
1458 # Jee, I hope we get a naming convention soon
1460 if ($suffix =~ /^$forward_designator/ || $suffix =~ /^$reverse_designator/) {
1461 push @name_nodir,$name;
1463 # bugwatch here! should this be unnested?
1465 push @name_nodir,"$name$suffix";
1469 # print("\@name_nodir: @name_nodir\n");
1471 for (my $counter=0; $counter<@name_nodir;$counter++) {
1472 next if ($name_nodir[0] eq $name_nodir[$counter]);
1475 if ($mismatch == 0) {
1476 # print("\tYou have a cohesive contig named $name_nodir[0].\n\n");
1477 return $name_nodir[0];
1479 # print("\tYou have mixed names in this contig.\n\n");
1481 } # end _get_contig_name
1483 =head2 get_doublets()
1485 Title : get_doublets()
1486 Usage : @doublets = $o_consed->get_doublets();
1487 Function: Return the keynames of the doublets.
1488 Returns : Returns an array containing the keynames of all
1489 Bio::Tools::Alignment::Consed sequences in the class "doublet".
1497 if (!$self->{doublets_set
}) {
1498 $self->warn("You need to set the doublets before you can get them. Doing that now.");
1499 $self->set_doublets();
1502 foreach (sort keys %{$self->{'contigs'}}) {
1503 if ($self->{'contigs'}->{$_}->{name
} && $self->{'contigs'}->{$_}->{'class'} eq "doublet") {
1508 } # end get_doublets
1513 Usage : $o_consed->dump_hash();
1514 Function: Use dumpvar.pl to dump out the Bio::Tools::Alignment::Consed
1518 Notes : I used this a lot in debugging.
1524 my $dumper = Dumpvalue
->new();
1525 $self->debug( "Bio::Tools::Alignment::Consed::dump_hash - ".
1526 "The following is the contents of the contig hash...\n");
1527 $dumper->dumpValue($self->{'contigs'});
1530 =head2 dump_hash_compact()
1532 Title : dump_hash_compact()
1533 Usage : $o_consed->dump_hash_compact();
1534 Function: Dump out the Bio::Tools::Alignment::Consed object in a compact way.
1537 Notes : Cleaner then dumpValue(), dumpHash(). I used this a lot in
1542 sub dump_hash_compact
{
1543 no strict
'refs'; # renege for the block
1544 my ($self,$sequence) = @_;
1546 my @singlets = $self->get_singlets();
1547 my @singletons = $self->get_singletons();
1548 my @doublets = $self->get_doublets();
1549 my @pairs = $self->get_pairs();
1550 my @multiplets = $self->get_multiplets();
1551 print("Name\tClass\tMembers\tQuality?\n");
1552 foreach (@singlets) {
1553 my @members = $self->get_members($_);
1554 print($self->get_name($_)."\tsinglets\t".(join',',@members)."\t");
1555 if ($self->{'contigs'}->{$_}->{'quality'}) {
1556 print("qualities found here\n");
1558 print("no qualities found here\n");
1562 foreach (@singletons) {
1563 my @members = $self->get_members($_);
1564 print($self->get_name($_)."\tsingletons\t".(join',',@members)."\t");
1565 if ($self->{'contigs'}->{$_}->{'quality'}) {
1566 print("qualities found here\n");
1568 print("no qualities found here\n");
1571 foreach my $pair (@pairs) {
1572 my @members = $self->get_members($pair);
1574 if (!$self->get_name($pair)) {
1577 $name = $self->get_name($pair);
1579 print("$name\tpairs\t".(join',',@members)."\n");
1581 foreach (@doublets) {
1582 my @members = $self->get_members_by_name($_);
1583 print("$_\tdoublets\t".(join',',@members)."\t");
1584 my $contig_number = &get_contig_number_by_name
($self,$_);
1585 if ($self->{'contigs'}->{$contig_number}->{'quality'}) {
1586 print("qualities found here\n");
1588 print("no qualities found here\n");
1590 # print($_."\tdoublets\t".(join',',@members)."\n");
1592 foreach (@multiplets) {
1593 my @members = $self->get_members($_);
1594 print("Contig $_"."\tmultiplets\t".(join',',@members)."\n");
1596 } # end dump_hash_compact
1600 Title : get_phreds()
1601 Usage : @phreds = $o_consed->get_phreds();
1602 Function: For each doublet in the Bio::Tools::Alignment::Consed hash, go
1603 and get the phreds for the top and bottom reads. Place them into
1604 {top_phreds} and {bottom_phreds}.
1608 Requires parse_phd() and reverse_and_complement(). I realize that it
1609 would be much more elegant to pull qualities as required but there
1610 were certain "features" in the acefile that required a bit more
1611 detailed work be done to get the qualities for certain parts of the
1612 consensus sequence. In order to make _sure_ that this was done
1613 properly I wrote things to do all steps and then I used dump_hash()
1614 and checked each one to ensure expected behavior. I have never changed
1615 this, so there you are.
1620 # this subroutine is the target of a rewrite to use the Bio::Tools::Alignment::Phred object.
1623 foreach $current_contig (sort keys %{$self->{'contigs'}}) {
1624 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") {
1625 $self->debug("$current_contig is a doublet. Going to parse_phd for top($self->{'contigs'}->{$current_contig}->{'top_name'}) and bottom($self->{'contigs'}->{$current_contig}->{'bottom_name'})\n");
1626 my $r_phreds_top = &parse_phd
($self,$self->{'contigs'}->{$current_contig}->{'top_name'});
1627 my $r_phreds_bottom = &parse_phd
($self,$self->{'contigs'}->{$current_contig}->{'bottom_name'});
1628 if ($self->{'contigs'}->{$current_contig}->{'top_complement'} eq "C") {
1629 # print("Reversing and complementing...\n");
1630 $r_phreds_top = &reverse_and_complement
($r_phreds_top);
1632 if ($self->{'contigs'}->{$current_contig}->{'bottom_complement'} eq "C") {
1633 $r_phreds_bottom = &reverse_and_complement
($r_phreds_bottom);
1635 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = $r_phreds_top;
1636 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = $r_phreds_bottom;
1641 =head2 parse_phd($read_name)
1643 Title : parse_phd($read_name)
1644 Usage : $o_consed->parse_phd($read_name);
1645 Function: Suck in the contents of a .phd file.
1646 Returns : A reference to an array containing the quality values for the read.
1647 Args : The name of a read.
1648 Notes : This is a significantly weak subroutine because it was always
1649 intended that these functions, along with the functions provided by
1650 get_phreds() be put into the Bio::SeqIO:phd module. This is done
1651 now but the Bio::Tools::Alignment::Consed module has not be
1652 rewritten to reflect this change.
1654 See L<Bio::SeqIO::phd> for more information.
1659 my ($self,$sequence_name) = @_;
1660 $self->debug("Parsing phd for $sequence_name\n");
1662 my $base_number = 0;
1663 my (@bases,@current_line);
1664 # print("parse_phd: $sequence_name\n");
1665 my $fh = Bio
::Root
::IO
->new
1666 (-file
=>"$self->{path}/../phd_dir/$sequence_name.phd.1");
1667 while ($fh->_readline()) {
1668 # print("Reading a line from a phredfile!\n");
1670 if (/^BEGIN_DNA/) { $in_dna = 1; next}
1671 if (/^END_DNA/) { last; }
1672 if (!$in_dna) { next; }
1678 =head2 reverse_and_complement(\@source)
1680 Title : reverse_and_complement(\@source)
1681 Usage : $reference_to_array = $o_consed->reverse_and_complement(\@source);
1682 Function: A stub for the recursive routine reverse_recurse().
1683 Returns : A reference to a reversed and complemented array of phred data.
1684 Args : A reference to an array of phred data.
1689 sub reverse_and_complement
{
1690 my $r_source = shift;
1692 $r_destination = &reverse_recurse
($r_source,$r_destination);
1693 return $r_destination;
1696 =head2 reverse_recurse($r_source,$r_destination)
1698 Title : reverse_recurse(\@source,\@destination)
1699 Usage : $o_consed->reverse_recurse(\@source,\@destination);
1700 Function: A recursive routine to reverse and complement an array of
1702 Returns : A reference to an array containing reversed phred data.
1703 Args : A reference to a source array and a reverence to a destination
1706 Recursion is kewl, but this sub should likely be _reverse_recurse.
1711 sub reverse_recurse
($$) {
1712 my ($r_source,$r_destination) = @_;
1714 return $r_destination;
1717 s/c/g/ || s/g/c/ || s/a/t/ || s/t/a/;
1718 push(@
$r_destination,$_);
1719 &reverse_recurse
($r_source,$r_destination);
1722 =head2 show_missing_sequence()
1724 Title : show_missing_sequence();
1725 Usage : $o_consed->show_missing_sequence();
1726 Function: Used by set_trim_points_doublets() to fill in quality values where
1727 consed (phrap?) set them to 0 at the beginning and/or end of the
1728 consensus sequences.
1732 Acts on doublets only. Really very somewhat quite ugly. A disgusting
1733 kludge. I<insert pride here> It was written stepwise with no real plan
1734 because it was not really evident why consed (phrap?) was doing this.
1738 sub show_missing_sequence
() {
1740 # decide which sequence should not have been clipped at consensus
1745 my ($current_contig,@qualities);
1746 foreach $current_contig (sort keys %{$self->{'contigs'}}) {
1747 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") {
1748 my $number_leading_xs = 0;
1749 my $number_trailing_xs = 0;
1750 my $measurer = $self->{'contigs'}->{$current_contig}->{'quality'};
1751 while ($measurer =~ s/^\ 0\ /\ /) {
1752 $number_leading_xs++;
1754 while ($measurer =~ s/\ 0(\s*)$/$1/) {
1755 $number_trailing_xs++;
1757 @qualities = split(' ',$self->{'contigs'}->{$current_contig}->{'quality'});
1758 my $in_initial_zeros = 0;
1759 for (my $count=0;$count<scalar(@qualities); $count++) {
1760 if ($qualities[$count] == 0) {
1761 my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data);
1762 # print("The quality of the consensus at ".($count+1)." is zero. Retrieving the real quality value.\n");
1763 # how do I know which strand to get these quality values from????
1765 my $top_quality_here = $self->{'contigs'}->{$current_contig}->{'top_phreds'}->[0-$self->{'contigs'}->{$current_contig}->{'top_start'}+$count+1];
1766 my $bottom_quality_here = $self->{'contigs'}->{$current_contig}->{'bottom_phreds'}->[1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count];
1767 if (!$bottom_quality_here || (1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count)<0) {
1768 $bottom_quality_here = "not found";
1770 if (!$top_quality_here) {
1771 $top_quality_here = "not found";
1773 # print("Looking for quals at position $count of $current_contig: top position ".(0-$self->{'contigs'}->{$current_contig}->{top_start}+$count)." ($self->{'contigs'}->{$current_contig}->{top_name}) $top_quality_here , bottom position ".(1-$self->{'contigs'}->{$current_contig}->{bottom_start}+$count)." ($self->{'contigs'}->{$current_contig}->{bottom_name}) $bottom_quality_here\n");
1774 if ($count<$number_leading_xs) {
1775 # print("$count is less then $number_leading_xs so I will get the quality from the top strand\n");
1776 # print("retrieved quality is ".$self->{'contigs'}->{$current_contig}->{top_phreds}[0-$self->{'contigs'}->{$current_contig}->{top_start}+$count+1]."\n");
1777 my $quality = $top_quality_here;
1778 $quality =~ /\S+\s(\d+)\s+/;
1780 # print("retrieved quality for leading zero $count is $quality\n");
1782 $qualities[$count] = $quality;
1784 # this part is tricky
1785 # if the contig is like this
1787 # ffffffffffffffffff
1789 # then take the quality value for the trailing zeros in the cons. seq from the r
1791 # but if the contig is like this
1792 # cccccccccccccccccc
1793 # ffffffffffffffffffffffffffffffff
1794 # rrrrrrrrrrrrrrrrrrrrrrrxxxxxxxxr
1796 # then any zeros that fall in the positions (^) must be decided whether the quality
1797 # is the qual from the f or r strand. I will use the greater number
1798 # does a similar situation exist for the leading zeros? i dunno
1800 # print("$count is greater then $number_leading_xs so I will get the quality from the bottom strand\n");
1801 # print("retrieved quality is ".$contigs->{$current_contig}->{top_phreds}[0-$contigs->{$current_contig}->{top_start}+$count+1]."\n");
1802 # my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data);
1803 if ($bottom_quality_here eq "not found") {
1804 # $top_phred_position = 1-$contigs->{$current_contig}->{bottom_start}+$count;
1805 # print("Going to get quality from here: $top_phred_position of the top.\n");
1806 # my $temp_quality - $contigs->{$current_contig}->{top_phreds}
1807 # $quality = $contigs->{$current_contig}->{top_phreds}[$top_phred_position];
1808 $top_quality_here =~ /\w+\s(\d+)\s/;
1810 } elsif ($top_quality_here eq "not found") {
1811 # $bottom_phred_position = 1+$contigs->{$current_contig}->{bottom_start}+$count;
1812 # print("Going to get quality from here: $bottom_phred_position of the bottom.\n");
1813 # $quality = $contigs->{$current_contig}->{bottom_phreds}[$bottom_phred_position];
1814 # print("Additional: no top quality but bottom is $quality\n");
1815 $bottom_quality_here =~ /\w+\s(\d+)\s/;
1818 # print("Oh jeepers, there are 2 qualities to choose from at this position.\n");
1819 # print("Going to compare these phred qualities: top: #$top_quality_here# bottom: #$bottom_quality_here#\n");
1820 # now you have to compare them
1821 # my $top_quality_phred = $contigs->{$current_contig}->{top_phreds}[$top_phred_position];
1823 # print("regexing #$top_quality_here#... ");
1824 $top_quality_here =~ /\w\ (\d+)\s/;
1825 my $top_quality = $1;
1826 # print("$top_quality\nregexing #$bottom_quality_here#... ");
1827 $bottom_quality_here =~ /\w\ (\d+)\s/;
1828 my $bottom_quality = $1;
1829 # print("$bottom_quality\n");
1830 # print("top_quality: $top_quality bottom quality: $bottom_quality\n");
1831 if ($bottom_quality > $top_quality) {
1832 # print("Chose to take the bottom quality: $bottom_quality\n");
1833 $quality = $bottom_quality;
1835 # print("Chose to take the top quality: $top_quality\n");
1836 $quality = $top_quality;
1840 # print("Warning: no quality value for $current_contig, position $count!\n");
1841 # print("Additional data: top quality phred: $top_quality_here\n");
1842 # print("Additional data: bottom quality phred: $bottom_quality_here\n");
1844 $qualities[$count] = $quality;
1850 unless (!@qualities) {
1851 $self->{'contigs'}->{$current_contig}->{'quality'} = join(" ",@qualities);
1853 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = undef;
1854 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = undef;