Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / Alignment / Consed.pm
blob7aa46a87172dbaaa845a3828ac349bd22bcf4d25
1 # Bio::Tools::Alignment::Consed
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Chad Matsalla
7 # Copyright Chad Matsalla
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::Alignment::Consed - A module to work with objects from consed .ace files
17 =head1 SYNOPSIS
19 # a report for sequencing stuff
20 my $o_consed = Bio::Tools::Alignment::Consed->new(
21 -acefile => "/path/to/an/acefile.ace.1",
22 -verbose => 1);
23 my $foo = $o_consed->set_reverse_designator("r");
24 my $bar = $o_consed->set_forward_designator("f");
26 # get the contig numbers
27 my @keys = $o_consed->get_contigs();
29 # construct the doublets
30 my $setter_doublets = $o_consed->choose_doublets();
32 # get the doublets
33 my @doublets = $o_consed->get_doublets();
35 =head1 DESCRIPTION
37 L<Bio::Tools::Alignment::Consed> provides methods and objects to deal
38 with the output from the Consed software suite. Specifically,
39 takes an C<.ace> file and provides objects for the results.
41 A word about doublets: This module was written to accommodate a large
42 EST sequencing operation. In this case, EST's were sequenced from the
43 3' and from the 5' end of the EST. The objective was to find a
44 consensus sequence for these two reads. Thus, a contig of two is what
45 we wanted, and this contig should consist of the forward and reverse
46 reads of a getn clone. For example, for a forward designator of "F"
47 and a reverse designator of "R", if the two reads chad1F and chad1R
48 were in a single contig (for example Contig 5) it will be determined
49 that the consensus sequence for Contig 5 will be the sequence for
50 clone chad1.
52 Doublets are good!
54 This module parses C<.ace> and related files. A detailed list of methods
55 can be found at the end of this document.
57 I wrote a detailed rationale for design that may explain the reasons
58 why some things were done the way they were done. That document is
59 beyond the scope of this pod and can probably be found in the
60 directory from which this module was 'made' or at
61 L<http://www.dieselwurks.com/bioinformatics/consedpm_documentation.pdf>.
63 Note that the POD in that document might be old but the original
64 rationale still stands.
66 =head1 FEEDBACK
68 =head2 Mailing Lists
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to one
72 of the Bioperl mailing lists. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 =head2 Support
79 Please direct usage questions or support issues to the mailing list:
81 I<bioperl-l@bioperl.org>
83 rather than to the module maintainer directly. Many experienced and
84 reponsive experts will be able look at the problem and quickly
85 address it. Please include a thorough description of the problem
86 with code and data examples if at all possible.
88 =head2 Reporting Bugs
90 Report bugs to the Bioperl bug tracking system to help us keep track
91 the bugs and their resolution. Bug reports can be submitted via the
92 web:
94 https://github.com/bioperl/bioperl-live/issues
96 =head1 AUTHOR - Chad Matsalla
98 Email chad-at-dieselwurks.com
100 =head1 APPENDIX
102 The rest of the documentation details each of the object
103 methods. Internal methods are usually preceded with a _
105 =cut
109 package Bio::Tools::Alignment::Consed;
111 use strict;
113 use FileHandle;
114 use Dumpvalue qw(dumpValue);
115 use Bio::Tools::Alignment::Trim;
116 use File::Spec;
118 use base qw(Bio::Root::Root Bio::Root::IO);
120 our %DEFAULTS = ( 'f_designator' => 'f',
121 'r_designator' => 'r');
123 =head2 new()
125 Title : new(-acefile => $path_to_some_acefile, -verbose => "1")
126 Usage : $o_consed = Bio::Tools::Alignment::Consed->
127 new(-acefile => $path_to_some_acefile, -verbose => "1");
128 Function: Construct the Bio::Tools::Alignment::Consed object. Sets
129 verbosity for the following procedures, if necessary:
130 1. Construct a new Bio::Tools::Alignment::Trim object, to
131 handle quality trimming 2. Read in the acefile and parse it
133 Returns : A reference to a Bio::Tools::Alignment::Consed object.
134 Args : A hash. (-acefile) is the filename of an acefile. If a full path
135 is not specified "./" is prepended to the filename and used from
136 instantiation until destruction. If you want
137 Bio::Tools::Alignment::Consed to be noisy during parsing of
138 the acefile, specify some value for (-verbose).
140 =cut
142 sub new {
143 my ($class,%args) = @_;
144 my $self = $class->SUPER::new(%args);
146 $self->{'filename'} = $args{'-acefile'};
148 # this is special to UNIX and should probably use catfile : DONE!
149 # if (!($self->{'filename'} =~ m{/})) {
150 # $self->{'filename'} = "./".$self->{'filename'};
152 # $self->{'filename'} =~ s#\\#\/#g if $^O =~ m/mswin/i;
153 # $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/;
154 # $self->{'path'} = $1;
156 # this is more generic and should work on most systems
157 (undef, $self->{'path'}, undef) = File::Spec->splitpath($self->{'filename'});
159 $self->_initialize_io('-file'=>$self->{'filename'});
160 $self->{'o_trim'} = Bio::Tools::Alignment::Trim->new(-verbose => $self->verbose());
161 $self->set_forward_designator($DEFAULTS{'f_designator'});
162 $self->set_reverse_designator($DEFAULTS{'r_designator'});
164 $self->_read_file();
165 return $self;
168 =head2 set_verbose()
170 Title : set_verbose()
171 Usage : $o_consed->set_verbose(1);
172 Function: Set the verbosity level for debugging messages. On instantiation
173 of the Bio::Tools::Alignment::Consed object the verbosity level
174 is set to 0 (quiet).
175 Returns : 1 or 0.
176 Args : The verbosity levels are:
177 0 - quiet
178 1 - noisy
179 2 - noisier
180 3 - annoyingly noisy
182 This method for setting verbosity has largely been superseded by a
183 sub-by-sub way, where for every sub you can provide a (-verbose)
184 switch. I am doing converting this bit-by-bit so do not be surprised
185 if some subs do not honour this.
187 =cut
189 # from RootI
191 # backwards compat
192 sub set_verbose { (shift)->verbose(@_) }
194 =head2 get_filename()
196 Title : get_filename()
197 Usage : $o_consed->get_filename();
198 Function: Returns the name of the acefile being used by the
199 Bio::Tools::Alignment::Consed object.
200 Returns : A scalar containing the name of a file.
201 Args : None.
203 =cut
206 sub get_filename {
207 my $self = shift;
208 return $self->{'filename'};
211 =head2 count_sequences_with_grep()
213 Title : count_sequences_with_grep()
214 Usage : $o_consed->count_sequences_with_grep();
215 Function: Use /bin/grep to scan through the files in the ace project dir
216 and count sequences in those files. I used this method in the
217 development of this module to verify that I was getting all of the
218 sequences. It works, but it is (I think) unix-like platform
219 dependent.
220 Returns : A scalar containing the number of sequences in the ace project
221 directory.
222 Args : None.
224 If you are on a non-UNIX platform, you really do not have to use
225 this. It is more of a debugging routine designed to address very
226 specific problems.
228 This method was reimplemented to be platform independent with a pure
229 perl implementation. The above note can be ignored.
231 =cut
233 sub count_sequences_with_grep {
234 my $self = shift;
235 my ($working_dir,$grep_cli,@total_grep_sequences);
236 # this should be migrated to a pure perl implementation ala
237 # Tom Christiansen's 'tcgrep'
238 # http://www.cpan.org/modules/by-authors/id/TOMC/scripts/tcgrep.gz
240 open my $FILE, '<', $self->{'filename'} or do {
241 $self->warn("Could not read file '$self->{'filename'}' for grepping: $!");
242 return
244 my $counter =0;
245 while(<$FILE>) { $counter++ if(/^AF/); }
246 close $FILE;
248 opendir my $SINGLETS, $self->{'path'};
249 foreach my $f ( readdir($SINGLETS) ) {
250 next unless ($f =~ /\.singlets$/);
252 my $singlet_file = File::Spec->catfile($self->{'path'}, $f);
253 open my $S_FILE, '<', $singlet_file or do {
254 $self->warn("Could not read file '$singlet_file': $!");
255 next
257 while(<$S_FILE>) { $counter++ if(/^>/) }
258 close $S_FILE;
260 closedir $SINGLETS;
261 return $counter;
264 =head2 get_path()
266 Title : get_path()
267 Usage : $o_consed->get_path();
268 Function: Returns the path to the acefile this object is working with.
269 Returns : Scalar. The path to the working acefile.
270 Args : None.
272 =cut
274 sub get_path {
275 my $self = shift;
276 return $self->{'path'};
279 =head2 get_contigs()
281 Title : get_contigs()
282 Usage : $o_consed->get_contigs();
283 Function: Return the keys to the Bio::Tools::Alignment::Consed object.
284 Returns : An array containing the keynames in the
285 Bio::Tools::Alignment::Consed object.
286 Args : None.
288 This would normally be used to get the keynames for some sort of
289 iterator. These keys are worthless in general day-to-day use because
290 in the Consed acefile they are simply Contig1, Contig2, ...
292 =cut
294 sub get_contigs {
295 my ($self,$contig) = @_;
296 my @contigs = sort keys %{$self->{'contigs'}};
297 return @contigs;
300 =head2 get_class($contig_keyname)
302 Title : get_class($contig_keyname)
303 Usage : $o_consed->get_class($contig_keyname);
304 Function: Return the class name for this contig
305 Returns : A scalar representing the class of this contig.
306 Args : None.
307 Notes :
309 =cut
311 sub get_class {
312 my ($self,$contig) = @_;
313 return $self->{contigs}->{$contig}->{class};
316 =head2 get_quality_array($contig_keyname)
318 Title : get_quality_array($contig_keyname)
319 Usage : $o_consed->get_quality_array($contig_keyname);
320 Function: Returns the quality for the consensus sequence for the given
321 contig as an array. See get_quality_scalar to get this as a scalar.
322 Returns : An array containing the quality for the consensus sequence with
323 the given keyname.
324 Args : The keyname of a contig. Note: This is a keyname. The key would
325 normally come from get_contigs.
327 Returns an array, not a reference. Is this a bug? I<thinking> No.
328 Well, maybe. Why was this developed like this? I was using FreezeThaw
329 for object persistence, and when it froze out these arrays it took a
330 long time to thaw it. Much better as a scalar.
332 See L<get_quality_scalar()|get_quality_scalar>
334 =cut
336 sub get_quality_array {
337 my ($self,$contig) = @_;
338 return split ' ', $self->{contigs}->{$contig}->{quality};
341 =head2 get_quality_scalar($contig_keyname)
343 Title : get_quality_scalar($contig_keyname)
344 Usage : $o_consed->get_quality_scalar($contig_keyname);
345 Function: Returns the quality for the consensus sequence for the given
346 contig as a scalar. See get_quality_array to get this as an array.
347 Returns : An scalar containing the quality for the consensus sequence with
348 the given keyname.
349 Args : The keyname of a contig. Note this is a _keyname_. The key would
350 normally come from get_contigs.
352 Why was this developed like this? I was using FreezeThaw for object
353 persistence, and when it froze out these arrays it took a coon's age
354 to thaw it. Much better as a scalar.
356 See L<get_quality_array()|get_quality_array>
358 =cut
361 sub get_quality_scalar {
362 my ($self,$contig) = @_;
363 return $self->{'contigs'}->{$contig}->{'quality'};
366 =head2 freeze_hash()
368 Title : freeze_hash()
369 Usage : $o_consed->freeze_hash();
371 Function: Use Ilya's FreezeThaw module to create a persistent data
372 object for this Bio::Tools::Alignment::Consed data
373 structure. In the case of AAFC, we use
374 Bio::Tools::Alignment::Consed to pre-process bunches of
375 sequences, freeze the structures, and send in a harvesting
376 robot later to do database stuff.
377 Returns : 0 or 1;
378 Args : None.
380 This procedure was removed so Consed.pm won't require FreezeThaw.
382 =cut
385 sub freeze_hash {
386 my $self = shift;
387 $self->warn("This method (freeze_hash) was removed ".
388 "from the bioperl consed.pm. Sorry.\n");
389 if (1==2) {
390 $self->debug("Bio::Tools::Alignment::Consed::freeze_hash:".
391 " \$self->{path} is $self->{path}\n");
392 my $filename = $self->{'path'}."frozen";
393 my %contigs = %{$self->{'contigs'}};
394 my $frozen = freeze(%contigs);
395 umask 0001;
396 open my $FREEZE, '>', $filename or do {
397 $self->warn( "Bio::Tools::Alignment::Consed could not ".
398 "freeze the contig hash because the file ".
399 "($filename) could not be opened: $!");
400 return 1;
402 print $FREEZE $frozen;
403 return 0;
407 =head2 get_members($contig_keyname)
409 Title : get_members($contig_keyname)
410 Usage : $o_consed->get_members($contig_keyname);
411 Function: Return the _names_ of the reads in this contig.
412 Returns : An array containing the names of the reads in this contig.
413 Args : The keyname of a contig. Note this is a keyname. The keyname
414 would normally come from get_contigs.
416 See L<get_contigs()|get_contigs>
418 =cut
420 sub get_members {
421 my ($self,$contig) = @_;
422 if (!$contig) {
423 $self->warn("You need to provide the name of a contig to ".
424 "use Bio::Tools::Alignment::Consed::get_members!\n");
425 return;
427 return @{$self->{'contigs'}->{$contig}->{'member_array'}};
430 =head2 get_members_by_name($some_arbitrary_name)
432 Title : get_members_by_name($some_arbitrary_name)
433 Usage : $o_consed->get_members_by_name($some_arbitrary_name);
434 Function: Return the names of the reads in a contig. This is the name given
435 to $contig{key} based on what is in the contig. This is different
436 from the keys retrieved through get_contigs().
437 Returns : An array containing the names of the reads in the contig with this
438 name.
439 Args : The name of a contig. Not a key, but a name.
441 Highly inefficient. use some other method if possible.
442 See L<get_contigs()|get_contigs>
444 =cut
446 sub get_members_by_name {
447 my ($self,$name) = @_;
448 # build a list to try to screen for redundancy
449 my @contigs_with_that_name;
450 foreach my $currkey ( sort keys %{$self->{'contigs'}} ) {
451 next if (!$self->{'contigs'}->{$currkey}->{'name'});
452 if ($self->{'contigs'}->{$currkey}->{'name'} eq "$name") {
453 push @contigs_with_that_name,$currkey;
456 my $count = @contigs_with_that_name;
457 if ($count == 1) {
458 my $contig_num = $contigs_with_that_name[0];
459 return @{$self->{'contigs'}->{$contig_num}->{'member_array'}};
463 =head2 get_contig_number_by_name($some_arbitrary_name)
465 Title : get_contig_number_by_name($some_arbitrary_name)
466 Usage : $o_consed->get_contig_number_by_name($some_arbitrary_name);
467 Function: Return the names of the reads in a contig. This is the name given
468 to $contig{key} based on what is in the contig. This is different
469 from the keys retrieved through get_contigs().
470 Returns : An array containing the names of the reads in the contig with this
471 name.
472 Args : The name of a contig. Not a key, but a name.
474 See L<get_contigs()|get_contigs>
476 =cut
478 sub get_contig_number_by_name {
479 my ($self,$name) = @_;
480 foreach my $currkey (sort keys %{$self->{'contigs'}}) {
481 if ($self->{'contigs'}->{$currkey}->{'name'} &&
482 $self->{'contigs'}->{$currkey}->{'name'} eq "$name") {
483 return $currkey;
488 =head2 get_sequence($contig_keyname)
490 Title : get_sequence($contig_keyname)
491 Usage : $o_consed->get_sequence($contig_keyname);
492 Function: Returns the consensus sequence for a given contig.
493 Returns : A scalar containing a sequence.
494 Args : The keyname of a contig. Note this is a key. The key would
495 normally come from get_contigs.
497 See L<get_contigs()|get_contigs>
499 =cut
501 sub get_sequence {
502 my ($self,$contig) = @_;
503 return $self->{'contigs'}->{$contig}->{'consensus'};
506 =head2 set_final_sequence($some_sequence)
508 Title : set_final_sequence($name,$some_sequence)
509 Usage : $o_consed->set_final_sequence($name,$some_sequence);
510 Function: Provides a manual way to set the sequence for a given key in the
511 contig hash. Rarely used.
512 Returns : 0 or 1;
513 Args : The name (not the keyname) of a contig and an arbitrary string.
515 A method with a questionable and somewhat mysterious origin. May raise
516 the dead or something like that.
518 =cut
520 sub set_final_sequence {
521 my ($self,$name,$sequence) = @_;
522 if (!$self->{'contigs'}->{$name}) {
523 $self->warn("You cannot set the final sequence for ".
524 "$name because it doesn't exist!\n");
525 return 1;
527 else {
528 $self->{'contigs'}->{$name}->{'final_sequence'} = $sequence;
530 return 0;
533 =head2 _read_file()
535 Title : _read_file();
536 Usage : _read_file();
537 Function: An internal subroutine used to read in an acefile and parse it
538 into a Bio::Tools::Alignment::Consed object.
539 Returns : 0 or 1.
540 Args : Nothing.
542 This routine creates and saves the filhandle for reading the files in
543 {fh}
545 =cut
547 sub _read_file {
548 my ($self) = @_;
549 my ($line,$in_contig,$in_quality,$contig_number,$top);
550 # make it easier to type $fhl
551 while (defined($line=$self->_readline()) ) {
552 chomp $line;
553 # check if there is anything on this line
554 # if not, you can stop gathering consensus sequence
555 if (!$line) {
556 # if the line is blank you are no longer to gather consensus
557 # sequence or quality values
558 $in_contig = 0;
559 $in_quality = 0;
561 # you are currently gathering consensus sequence
562 elsif ($in_contig) {
563 if ($in_contig == 1) {
564 $self->debug("Adding $line to consensus of contig number $contig_number.\n");
565 $self->{'contigs'}->{$contig_number}->{'consensus'} .= $line;
568 elsif ($in_quality) {
569 if (!$line) {
570 $in_quality = undef;
572 else {
574 # I wrote this in here because acefiles produced by
575 # cap3 do not have a leading space like the acefiles
576 # produced by phrap and there is the potential to have
577 # concatenated quality values like this: 2020 rather
578 # then 20 20 whre lines collide. Thanks Andrew for
579 # noticing.
581 if ($self->{'contigs'}->{$contig_number}->{'quality'} &&
582 !($self->{'contigs'}->{$contig_number}->{'quality'} =~ m/\ $/)) {
583 $self->{'contigs'}->{$contig_number}->{'quality'} .= " ";
585 $self->{'contigs'}->{$contig_number}->{'quality'} .= $line;
588 elsif ($line =~ /^BQ/) {
589 $in_quality = 1;
592 # the line /^CO/ like this:
593 # CO Contig1 796 1 1 U
594 # can be broken down as follows:
595 # CO - Contig!
596 # Contig1 - the name of this contig
597 # 796 - Number of bases in this contig
598 # 1 - Number of reads in this contig
599 # 1 - number of base segments in this contig
600 # U - Uncomplemented
602 elsif ($line =~ /^CO/) {
603 $line =~ m/^CO\ Contig(\d+)\ \d+\ \d+\ \d+\ (\w)/;
604 $contig_number = $1;
605 if ($2 eq "C") {
606 $self->debug("Contig $contig_number is complemented!\n");
608 $self->{'contigs'}->{$contig_number}->{'member_array'} = [];
609 $self->{'contigs'}->{$contig_number}->{'contig_direction'} = "$2";
610 $in_contig = 1;
613 # 000713
614 # this BS is deprecated, I think.
615 # haha, I am really witty. <ew>
617 elsif ($line =~ /^BSDEPRECATED/) {
618 $line =~ m/^BS\s+\d+\s+\d+\s+(.+)/;
619 my $member = $1;
620 $self->{'contigs'}->{$contig_number}->{$member}++;
622 # the members of the contigs are determined by the AF line in the ace file
623 elsif ($line =~ /^AF/) {
624 $self->debug("I see an AF line here.\n");
625 $line =~ /^AF\ (\S+)\ (\w)\ (\S+)/;
627 # push the name of the current read onto the member array for this contig
628 push @{$self->{'contigs'}->{$contig_number}->{'member_array'}},$1;
630 # the first read in the contig will be named the "top" read
631 if (!$top) {
632 $self->debug("\$top is not set.\n");
633 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") {
634 $self->debug("Reversing the order of the reads. The bottom will be $1\n");
636 # if the contig sequence is marked as the
637 # complement, the top becomes the bottom and$
638 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1;
639 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2;
640 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3;
642 else {
643 $self->debug("NOT reversing the order of the reads. ".
644 "The top_name will be $1\n");
645 # if the contig sequence is marked as the
646 # complement, the top becomes the bottom and$
647 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1;
648 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2;
649 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3;
651 $top = 1;
653 else {
655 # if the contig sequence is marked as the complement,
656 # the top becomes the bottom and the bottom becomes
657 # the top
658 if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") {
659 $self->debug("Reversing the order of the reads. The top will be $1\n");
660 $self->{'contigs'}->{$contig_number}->{'top_name'} = $1;
661 $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2;
662 $self->{'contigs'}->{$contig_number}->{'top_start'} = $3;
664 else {
665 $self->debug("NOT reversing the order of the reads. The bottom will be $1\n");
666 $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1;
667 $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2;
668 $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3;
670 $top = undef;
674 return 0;
677 =head2 set_reverse_designator($some_string)
679 Title : set_reverse_designator($some_string)
680 Usage : $o_consed->set_reverse_designator($some_string);
681 Function: Set the designator for the reverse read of contigs in this
682 Bio::Tools::Alignment::Consed object. Used to determine if
683 contigs containing two reads can be named.
684 Returns : The value of $o_consed->{reverse_designator} so you can check
685 to see that it was set properly.
686 Args : An arbitrary string.
688 May be useful only to me. I<shrug>
690 =cut
692 sub set_reverse_designator {
693 my ($self,$reverse_designator) = @_;
694 $self->{'reverse_designator'} = $reverse_designator;
695 $self->{'o_trim'}->set_reverse_designator($reverse_designator);
696 return $self->{'reverse_designator'};
697 } # end set_reverse_designator
699 =head2 set_forward_designator($some_string)
701 Title : set_forward_designator($some_string)
702 Usage : $o_consed->set_forward_designator($some_string);
703 Function: Set the designator for the forward read of contigs in this
704 Bio::Tools::Alignment::Consed object. Used to determine if
705 contigs containing two reads can be named.
706 Returns : The value of $o_consed->{forward_designator} so you can check
707 to see that it was set properly.
708 Args : An arbitrary string.
710 May be useful only to me. I<shrug>
712 =cut
714 sub set_forward_designator {
715 my ($self,$forward_designator) = @_;
716 $self->{'forward_designator'} = $forward_designator;
717 $self->{'o_trim'}->set_forward_designator($forward_designator);
718 return $self->{'forward_designator'};
719 } # end set_forward_designator
721 =head2 set_designator_ignore_case("yes")
723 Title : set_designator_ignore_case("yes")
724 Usage : $o_consed->set_designator_ignore_case("yes");
725 Function: Deprecated.
726 Returns : Deprecated.
727 Args : Deprecated.
729 Deprecated. Really. Trust me.
731 =cut
733 sub set_designator_ignore_case {
734 my ($self,$ignore_case) = @_;
735 if ($ignore_case eq "yes") {
736 $self->{'designator_ignore_case'} = 1;
738 return $self->{'designator_ignore_case'};
739 } # end set_designator_ignore_case
741 =head2 set_trim_points_singlets_and_singletons()
743 Title : set_trim_points_singlets_and_singletons()
744 Usage : $o_consed->set_trim_points_singlets_and_singletons();
745 Function: Set the trim points for singlets and singletons based on
746 quality. Uses the Bio::Tools::Alignment::Trim object. Use
747 at your own risk because the Bio::Tools::Alignment::Trim
748 object was designed specifically for me and is mysterious
749 in its ways. Every time somebody other then me uses it a
750 swarm of locusts decends on a small Central American
751 village so do not say you weren't warned.
752 Returns : Nothing.
753 Args : None.
755 Working on exceptions and warnings here.
757 See L<Bio::Tools::Alignment::Trim> for more information
759 =cut
761 #' to make my emacs happy
763 sub set_trim_points_singlets_and_singletons {
764 my ($self) = @_;
765 $self->debug("Consed.pm : \$self is $self\n");
766 my (@points,$trimmed_sequence);
767 if (!$self->{'doublets_set'}) {
768 $self->debug("You need to set the doublets before you use ".
769 "set_trim_points_singlets_and_doublets. Doing that now.");
770 $self->set_doublets();
772 foreach (sort keys %{$self->{'contigs'}}) {
773 if ($self->{'contigs'}->{$_}->{'class'} eq "singlet") {
774 $self->debug("Singlet $_\n");
775 # this is what Warehouse wants
776 # my ($self,$sequence,$quality,$name) = @_;
777 # this is what Bio::Tools::Alignment::Trim::trim_singlet wants:
778 # my ($self,$sequence,$quality,$name,$class) = @_;
779 # the following several lines are to make the parameter passing legible.
780 my ($sequence,$quality,$name,$class);
781 $sequence = $self->{'contigs'}->{$_}->{'consensus'};
782 if (!$self->{'contigs'}->{$_}->{'quality'}) { $quality = "unset"; }
783 else { $quality = $self->{'contigs'}->{$_}->{'quality'}; }
784 $name = $self->{'contigs'}->{$_}->{'name'};
785 $class = $self->{'contigs'}->{$_}->{'class'};
786 @points = @{$self->{'o_trim'}->trim_singlet($sequence,$quality,$name,$class)};
787 $self->{'contigs'}->{$_}->{'start_point'} = $points[0];
788 $self->{'contigs'}->{$_}->{'end_point'} = $points[1];
789 $self->{'contigs'}->{$_}->{'sequence_trimmed'} =
790 substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]);
793 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_singlets".
794 "_and_singletons: Done setting the quality trimpoints.\n");
795 return;
796 } # end set_trim_points_singlet
798 =head2 set_trim_points_doublets()
800 Title : set_trim_points_doublets()
801 Usage : $o_consed->set_trim_points_doublets();
802 Function: Set the trim points for doublets based on quality. Uses the
803 Bio::Tools::Alignment::Trim object. Use at your own risk because
804 the Bio::Tools::Alignment::Trim object was designed specifically
805 for me and is mysterious in its ways. Every time somebody other
806 then me uses it you risk a biblical plague being loosed on your
807 city.
808 Returns : Nothing.
809 Args : None.
810 Notes : Working on exceptions here.
812 See L<Bio::Tools::Alignment::Trim> for more information
814 =cut
816 sub set_trim_points_doublets {
817 my $self = shift;
818 my @points;
819 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets:".
820 " Restoring zeros for doublets.\n");
821 # &show_missing_sequence($self);
822 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets:".
823 " Setting doublet trim points.\n");
824 foreach (sort keys %{$self->{'contigs'}}) {
825 if ($self->{'contigs'}->{$_}->{'class'} eq "doublet") {
826 # my ($self,$sequence,$quality,$name,$class) = @_;
827 my @quals = split(' ',$self->{'contigs'}->{$_}->{'quality'});
829 @points = $self->{o_trim}->trim_doublet
830 ($self->{'contigs'}->{$_}->{'consensus'},
831 $self->{'contigs'}->{$_}->{'quality'},
832 $self->{'contigs'}->{$_}->{name},
833 $self->{'contigs'}->{$_}->{'class'});
834 $self->{'contigs'}->{$_}->{'start_point'} = $points[0];
835 $self->{'contigs'}->{$_}->{'end_point'} = $points[1];
836 # now set this
837 $self->{'contigs'}->{$_}->{'sequence_trimmed'} =
838 substr($self->{contigs}->{$_}->{'consensus'},
839 $points[0],$points[1]-$points[0]);
840 # 010102 the deprecated way to do things:
843 $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets:".
844 " Done setting doublet trim points.\n");
845 return;
846 } # end set_trim_points_doublets
848 =head2 get_trimmed_sequence_by_name($name)
850 Title : get_trimmed_sequence_by_name($name)
851 Usage : $o_consed->get_trimmed_sequence_by_name($name);
852 Function: Returns the trimmed_sequence of a contig with {name} eq $name.
853 Returns : A scalar- the trimmed sequence.
854 Args : The {name} of a contig.
855 Notes :
857 =cut
859 sub get_trimmed_sequence_by_name {
860 my ($self,$name) = @_;
861 my $trimmed_sequence;
862 my $contigname = &get_contig_number_by_name($self,$name);
863 my $class = $self->{'contigs'}->{$contigname}->{'class'};
864 # what is this business and who was smoking crack while writing this?
865 # if ($class eq "singlet") {
866 # send the sequence, the quality, and the name
867 # $trimmed_sequence = $self->{o_trim}->trim_singlet
868 # ($self->{'contigs'}->{$contigname}->{consensus},
869 # $self->{'contigs'}->{$contigname}->{'quality'},$name);
871 return $self->{'contigs'}->{$contigname}->{'sequence_trimmed'};
874 =head2 set_dash_present_in_sequence_name("yes")
876 Title : set_dash_present_in_sequence_name("yes")
877 Usage : $o_consed->set_dash_present_in_sequence_name("yes");
878 Function: Deprecated. Part of an uncompleted thought. ("Oooh! Shiny!")
879 Returns : Nothing.
880 Args : "yes" to set {dash_present_in_sequence_name} to 1
881 Notes :
883 =cut
885 sub set_dash_present_in_sequence_name {
886 my ($self,$dash_present) = @_;
887 if ($dash_present eq "yes") {
888 $self->{'dash_present_in_sequence_name'} = 1;
890 else {
891 $self->{'dash_present_in_sequence_name'} = 0;
893 return $self->{'dash_present_in_sequence_name'};
894 } # end set_dash_present_in_sequence_name
896 =head2 set_doublets()
898 Title : set_doublets()
899 Usage : $o_consed->set_doublets();
900 Function: Find pairs that have similar names and mark them as doublets
901 and set the {name}.
902 Returns : 0 or 1.
903 Args : None.
905 A complicated subroutine that iterates over the
906 Bio::Tools::Alignment::Consed looking for contigs of 2. If the forward
907 and reverse designator are removed from each of the reads in
908 {'member_array'} and the remaining reads are the same, {name} is set
909 to that name and the contig's class is set as "doublet". If any of
910 those cases fail the contig is marked as a "pair".
912 =cut
914 #' make my emacs happy
916 sub set_doublets {
917 my ($self) = @_;
918 # set the designators in the Bio::Tools::Alignment::Trim object
920 $self->{'o_trim'}->set_designators($self->{'reverse_designator'},
921 $self->{'forward_designator'});
922 foreach my $key_contig (sort keys %{$self->{'contigs'}}) {
924 # if there is a member array (why would there not be? This should be a die()able offence
925 # but for now I will leave it
926 if ($self->{'contigs'}->{$key_contig}->{'member_array'}) {
927 # if there are two reads in this contig
928 # i am pretty sure that this is wrong but i am keeping it for reference
929 # if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2 || !$self->{'contigs'}->{$key_contig}->{'class'}) {
930 # <seconds later>
931 # <nod> WRONG. Was I on crack?
932 if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2) {
933 $self->{'contigs'}->{$key_contig}->{'num_members'} = 2;
934 $self->debug("\tThere are 2 members! Looking for the contig name...\n");
935 my $name = _get_contig_name($self,$self->{'contigs'}->{$key_contig}->{'member_array'});
936 $self->debug("The name is $name\n") if defined $name;
937 if ($name) {
938 $self->{'contigs'}->{$key_contig}->{'name'} = $name;
939 $self->{'contigs'}->{$key_contig}->{'class'} = "doublet";
940 } else {
941 $self->debug("$key_contig is a pair.\n");
942 $self->{'contigs'}->{$key_contig}->{'class'} = "pair";
945 # this is all fair and good but what about singlets?
946 # they have one reads in the member_array but certainly are not singletons
947 elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 1) {
948 # set the name to be the name of the read
949 $self->{'contigs'}->{$key_contig}->{name} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}[0];
950 # set the number of members to be one
951 $self->{'contigs'}->{$key_contig}->{num_members} = 1;
952 # if this was a singlet, it would already belong to the class "singlet"
953 # so leave it alone
954 # if it is not a singlet, it is a singleton! lablel it appropriately
955 unless ($self->{'contigs'}->{$key_contig}->{'class'}) {
956 $self->{'contigs'}->{$key_contig}->{'class'} = "singleton";
959 # set the multiplet characteristics
960 elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} >= 3) {
961 $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}};
962 $self->{'contigs'}->{$key_contig}->{'class'} = "multiplet";
964 $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}};
968 $self->{'doublets_set'} = "done";
969 return 0;
970 } # end set_doublets
972 =head2 set_singlets
974 Title : set_singlets
975 Usage : $o_consed->set_singlets();
976 Function: Read in a singlets file and place them into the
977 Bio::Tools::Alignment::Consed object.
978 Returns : Nothing.
979 Args : A scalar to turn on verbose parsing of the singlets file.
980 Notes :
982 =cut
984 sub set_singlets {
985 # parse out the contents of the singlets file
986 my ($self) = @_;
987 $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n");
988 my $full_filename = $self->{'filename'};
989 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n");
990 $full_filename =~ s#\\#\/#g if $^O =~ m/mswin/i;
991 $full_filename =~ m/(.*\/)(.*ace.*)$/;
992 my ($base_path,$filename) = ($1,$2);
993 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n");
994 $filename =~ m/(.*)ace.*$/;
995 my $singletsfile = $base_path.$1."singlets";
996 $self->debug("\$singletsfile is $singletsfile\n");
997 if (!-f $singletsfile) {
998 # there is no singlets file.
999 $self->{'singlets_set'} = "done";
1000 return;
1002 $self->debug("$singletsfile is indeed a file. Trying to open it...\n");
1003 my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile);
1004 my ($sequence,$name,$count);
1005 while ($_ = $singlets_fh->_readline()) {
1006 chomp $_;
1007 if (/\>/) {
1008 if ($name && $sequence) {
1009 $self->debug("Adding $name with sequence $sequence to hash...\n");
1010 push @{$self->{'contigs'}->{$name}->{'member_array'}},$name;
1011 $self->{'contigs'}->{$name}->{'consensus'} = $sequence;
1012 $self->{'contigs'}->{$name}->{'name'} = $name;
1013 $self->{'contigs'}->{$name}->{"singlet"} = 1;
1014 $self->{'contigs'}->{$name}->{'class'} = "singlet";
1016 $sequence = $name = undef;
1017 $count++;
1018 m/^\>(.*)\s\sCHROMAT/;
1019 $name = $1;
1020 if (!$name) {
1021 m/\>(\S+)\s/;
1022 $name = $1;
1025 else { $sequence .= $_; }
1027 if ($name && $sequence) {
1028 $self->debug("Pushing the last of the singlets ($name)\n");
1029 @{$self->{'contigs'}->{$name}->{'member_array'}} = $name;
1030 $self->{'contigs'}->{$name}->{'consensus'} = $sequence;
1031 $self->{'contigs'}->{$name}->{'name'} = $name;
1032 $self->{'contigs'}->{$name}->{"singlet"} = 1;
1033 $self->{'contigs'}->{$name}->{'class'} = "singlet";
1035 $self->debug("Bio::Tools::Alignment::Consed::set_singlets: Done adding singlets to the singlets hash.\n");
1036 $self->{'singlets_set'} = "done";
1037 return 0;
1038 } # end sub set_singlets
1040 =head2 get_singlets()
1042 Title : get_singlets()
1043 Usage : $o_consed->get_singlets();
1044 Function: Return the keynames of the singlets.
1045 Returns : An array containing the keynames of all
1046 Bio::Tools::Alignment::Consed sequences in the class "singlet".
1047 Args : None.
1048 Notes :
1050 =cut
1052 sub get_singlets {
1053 # returns an array of singlet names
1054 # singlets have "singlet"=1 in the hash
1055 my $self = shift;
1056 if (!$self->{singlets_set}) {
1057 $self->debug("You need to set the singlets before you get them. Doing that now.");
1058 $self->set_singlets();
1061 my (@singlets,@array);
1062 foreach my $key (sort keys %{$self->{'contigs'}}) {
1063 # @array = @{$Consed::contigs{$key}->{'member_array'}};
1064 # somethimes a user will try to get a list of singlets before the classes for the rest of the
1065 # contigs has been set (see t/test.t for how I figured this out. <bah>
1066 # so either way, just return class=singlets
1067 if (!$self->{'contigs'}->{$key}->{'class'}) {
1068 # print("$key has no class. why?\n");
1070 elsif ($self->{'contigs'}->{$key}->{'class'} eq "singlet") {
1071 push @singlets,$key;
1074 return @singlets;
1077 =head2 set_quality_by_name($name,$quality)
1079 Title : set_quality_by_name($name,$quality)
1080 Usage : $o_consed->set_quality_by_name($name,$quality);
1081 Function: Deprecated. Make the contig with {name} have {'quality'} $quality.
1082 Probably used for testing.
1083 Returns : Nothing.
1084 Args : The name of a contig and a scalar for its quality.
1085 Notes : Deprecated.
1087 =cut
1089 sub set_quality_by_name {
1090 # this is likely deprecated
1091 my ($self,$name,$quality) = shift;
1092 my $return;
1093 foreach (sort keys %{$self->{'contigs'}}) {
1094 if ($self->{'contigs'} eq "$name" || $self->{'contigs'}->{'name'} eq "$name") {
1095 $self->{'contigs'}->{'quality'} = $quality;
1096 $return=1;
1099 if ($return) { return "0"; } else { return "1"; }
1100 } # end set quality by name
1102 =head2 set_singlet_quality()
1104 Title : set_singlet_quality()
1105 Usage : $o_consed->set_singlet_quality();
1106 Function: For each singlet, go to the appropriate file in phd_dir and read
1107 in the phred quality for that read and place it into {'quality'}
1108 Returns : 0 or 1.
1109 Args : None.
1110 Notes : This is the next subroutine that will receive substantial revision
1111 in the next little while. It really should eval the creation of
1112 Bio::Tools::Alignment::Phred objects and go from there.
1114 =cut
1116 sub set_singlet_quality {
1117 my $self = shift;
1118 my $full_filename = $self->{'filename'};
1119 $full_filename =~ s#\\#\/#g if $^O =~ m/mswin/i;
1120 $full_filename =~ m/(.*\/)(.*)ace.*$/;
1121 my ($base_path,$filename) = ($1,"$2"."qual");
1122 my $singletsfile = $base_path.$filename;
1123 if (-f $singletsfile) {
1124 # print("$singletsfile is indeed a file. Trying to open it...\n");
1126 else {
1127 $self->warn("$singletsfile is not a file. Sorry.\n");
1128 return;
1130 my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile);
1131 my ($sequence,$name,$count);
1132 my ($identity,$line,$quality,@qline);
1133 while ($line = $singlets_fh->_readline()) {
1134 chomp $line;
1135 if ($line =~ /^\>/) {
1136 $quality = undef;
1137 $line =~ m/\>(\S*)\s/;
1138 $identity = $1;
1140 else {
1141 if ($self->{'contigs'}->{$identity}) {
1142 $self->{'contigs'}->{$identity}->{'quality'} .= "$line ";
1147 return 0;
1150 =head2 set_contig_quality()
1152 Title : set_contig_quality()
1153 Usage : $o_consed->set_contig_quality();
1154 Function: Deprecated.
1155 Returns : Deprecated.
1156 Args : Deprecated.
1157 Notes : Deprecated. Really. Trust me.
1159 =cut
1161 sub set_contig_quality {
1162 # note: contigs _include_ singletons but _not_ singlets
1163 my ($self) = shift;
1164 # the unexpected results I am referring to here are a doubling of quality values.
1165 # the profanity I uttered on discovering this reminded me of the simpsons:
1166 # Ned Flanders: "That is the loudest profanity I have ever heard!"
1167 $self->warn("set_contig_quality is deprecated and will likely produce unexpected results");
1168 my $full_filename = $self->{'filename'};
1169 # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual
1170 # from Consed.pm
1171 $full_filename =~ s#\\#\/#g if $^O =~ m/mswin/i;
1172 $full_filename =~ m/(.*\/)(.*)ace.*$/;
1173 my ($base_path,$filename) = ($1,"$2"."contigs.qual");
1174 my $singletsfile = $base_path.$filename;
1175 if (-f $singletsfile) {
1176 # print("$singletsfile is indeed a file. Trying to open it...\n");
1178 else {
1179 $self->warn("Bio::Tools::Alignment::Consed::set_contig_quality $singletsfile is not a file. Sorry.\n");
1180 return;
1182 my $contig_quality_fh = Bio::Root::IO->new(-file => $singletsfile);
1184 my ($sequence,$name,$count,$identity,$line,$quality);
1185 while ($line = $contig_quality_fh->_readline()) {
1186 chomp $line;
1187 if ($line =~ /^\>/) {
1188 $quality = undef;
1189 $line =~ m/\>.*Contig(\d+)\s/;
1190 $identity = $1;
1192 else {
1193 if ($self->{'contigs'}->{$identity} ) {
1194 $self->{'contigs'}->{$identity}->{'quality'} .= " $line";
1198 } # end set_contig_quality
1200 =head2 get_multiplets()
1202 Title : get_multiplets()
1203 Usage : $o_consed->get_multiplets();
1204 Function: Return the keynames of the multiplets.
1205 Returns : Returns an array containing the keynames of all
1206 Bio::Tools::Alignment::Consed sequences in the class "multiplet".
1207 Args : None.
1208 Notes :
1210 =cut
1212 sub get_multiplets {
1213 # returns an array of multiplet names
1214 # multiplets have # members > 2
1215 my $self = shift;
1216 my (@multiplets,@array);
1217 foreach my $key (sort keys %{$self->{'contigs'}}) {
1218 if ($self->{'contigs'}->{$key}->{'class'}) {
1219 if ($self->{'contigs'}->{$key}->{'class'} eq "multiplet") {
1220 push @multiplets,$key;
1224 return @multiplets;
1227 =head2 get_all_members()
1229 Title : get_all_members()
1230 Usage : @all_members = $o_consed->get_all_members();
1231 Function: Return a list of all of the read names in the
1232 Bio::Tools::Alignment::Consed object.
1233 Returns : An array containing all of the elements in all of the
1234 {'member_array'}s.
1235 Args : None.
1236 Notes :
1238 =cut
1240 sub get_all_members {
1241 my $self = shift;
1242 my @members;
1243 foreach my $key (sort keys %{$self->{'contigs'}}) {
1244 if ($key =~ /^singlet/) {
1245 push @members,$self->{'contigs'}->{$key}->{'member_array'}[0];
1247 elsif ($self->{'contigs'}->{$key}->{'member_array'}) {
1248 push @members,@{$self->{'contigs'}->{$key}->{'member_array'}};
1250 # else {
1251 # print("Bio::Tools::Alignment::Consed: $key is _not_ an array. Pushing $self->{'contigs'}->{$key}->{'member_array'} onto \@members\n");
1252 # push @members,$self->{'contigs'}->{$key}->{'member_array'};
1255 return @members;
1258 =head2 sum_lets($total_only)
1260 Title : sum_lets($total_only)
1261 Usage : $statistics = $o_consed->sum_lets($total_only);
1262 Function: Provide numbers for how many sequences were accounted for in the
1263 Bio::Tools::Alignment::Consed object.
1264 Returns : If a scalar is present, returns the total number of
1265 sequences accounted for in all classes. If no scalar passed
1266 then returns a string that looks like this:
1267 Singt/singn/doub/pair/mult/total : 2,0,1(2),0(0),0(0),4
1268 This example means the following: There were 1 singlets.
1269 There were 0 singletons. There were 1 doublets for a total
1270 of 2 sequences in this class. There were 0 pairs for a
1271 total of 0 sequences in this class. There were 0
1272 multiplets for a total of 0 sequences in this class. There
1273 were a total of 4 sequences accounted for in the
1274 Bio::Tools::Alignment::Consed object.
1275 Args : A scalar is optional to change the way the numbers are returned.
1276 Notes:
1278 =cut
1280 sub sum_lets {
1281 my ($self,$total_only) = @_;
1282 my ($count,$count_multiplets,$multiplet_count);
1283 my $singlets = &get_singlets($self); $count += $singlets;
1284 my $doublets = &get_doublets($self); $count += ($doublets * 2);
1285 my $pairs = &get_pairs($self); $count += ($pairs * 2);
1286 my $singletons = &get_singletons($self); $count += $singletons;
1287 my @multiplets = &get_multiplets($self);
1288 $count_multiplets = @multiplets;
1289 my $return_string;
1290 foreach (@multiplets) {
1291 my $number_members = $self->{'contigs'}->{$_}->{num_members};
1292 $multiplet_count += $number_members;
1294 if ($multiplet_count) {
1295 $count += $multiplet_count;
1297 foreach (qw(multiplet_count singlets doublets pairs singletons
1298 multiplets count_multiplets)) {
1299 no strict 'refs'; # renege for the block
1300 if (!${$_}) {
1301 ${$_} = 0;
1304 if (!$multiplet_count) { $multiplet_count = 0; }
1305 if ($total_only) {
1306 return $count;
1308 $return_string = "Singt/singn/doub/pair/mult/total : ".
1309 "$singlets,$singletons,$doublets(".
1310 ($doublets*2)."),$pairs(".($pairs*2).
1311 "),$count_multiplets($multiplet_count),$count";
1312 return $return_string;
1315 =head2 write_stats()
1317 Title : write_stats()
1318 Usage : $o_consed->write_stats();
1319 Function: Write a file called "statistics" containing numbers similar to
1320 those provided in sum_lets().
1321 Returns : Nothing. Write a file in $o_consed->{path} containing something
1322 like this:
1324 0,0,50(100),0(0),0(0),100
1326 Where the numbers provided are in the format described in the
1327 documentation for sum_lets().
1328 Args : None.
1329 Notes : This might break platform independence, I do not know.
1331 See L<sum_lets()|sum_lets>
1333 =cut
1335 sub write_stats {
1336 # worry about platform dependence here?
1337 # oh shucksdarn.
1338 my $self = shift;
1339 my $stats_filename = $self->{'path'}."statistics";
1340 my $statistics_raw = $self->sum_lets;
1341 my ($statsfilecontents) = $statistics_raw =~ s/.*\ \:\ //g;
1342 umask 0001;
1343 my $fh = Bio::Root::IO->new(-file=>"$stats_filename");
1344 # open my $STATSFILE, '>', $stats_filename or print "Could not write the statsfile: $!\n");
1345 $fh->_print("$statsfilecontents");
1346 # close $STATSFILE;
1347 $fh->close();
1350 =head2 get_singletons()
1352 Title : get_singletons()
1353 Usage : @singletons = $o_consed->get_singletons();
1354 Function: Return the keynames of the singletons.
1355 Returns : Returns an array containing the keynames of all
1356 Bio::Tools::Alignment::Consed sequences in the class "singleton".
1357 Args : None.
1358 Notes :
1360 =cut
1362 sub get_singletons {
1363 # returns an array of singleton names
1364 # singletons are contigs with one member (see consed documentation)
1365 my $self = shift;
1366 my (@singletons,@array);
1367 foreach my $key (sort keys %{$self->{'contigs'}}) {
1368 if ($self->{'contigs'}->{$key}->{'class'}) {
1369 # print ("$key class: $self->{'contigs'}->{$key}->{'class'}\n");
1371 else {
1372 # print("$key belongs to no class. why?\n");
1374 if ($self->{'contigs'}->{$key}->{'member_array'}) {
1375 @array = @{$self->{'contigs'}->{$key}->{'member_array'}};
1377 my $num_array_elem = @array;
1378 if ($num_array_elem == 1 && $self->{'contigs'}->{$key}->{'class'} && $self->{'contigs'}->{$key}->{'class'} eq "singleton") { push @singletons,$key; }
1380 return @singletons;
1383 =head2 get_pairs()
1385 Title : get_pairs()
1386 Usage : @pairs = $o_consed->get_pairs();
1387 Function: Return the keynames of the pairs.
1388 Returns : Returns an array containing the keynames of all
1389 Bio::Tools::Alignment::Consed sequences in the class "pair".
1390 Args : None.
1391 Notes :
1393 =cut
1395 sub get_pairs {
1396 # returns an array of pair contig names
1397 # a pair is a contig of two where the names do not match
1398 my $self = shift;
1399 my (@pairs,@array);
1400 foreach my $key (sort keys %{$self->{'contigs'}}) {
1401 if ($self->{'contigs'}->{$key}->{'member_array'}) {
1402 if (@{$self->{'contigs'}->{$key}->{'member_array'}} == 2 &&
1403 $self->{'contigs'}->{$key}->{'class'} eq "pair") {
1404 push @pairs,$key;
1408 return @pairs;
1411 =head2 get_name($contig_keyname)
1413 Title : get_name($contig_keyname)
1414 Usage : $name = $o_consed->get_name($contig_keyname);
1415 Function: Return the {name} for $contig_keyname.
1416 Returns : A string. ({name})
1417 Args : A contig keyname.
1418 Notes :
1420 =cut
1422 sub get_name {
1423 my ($self,$contig) = @_;
1424 return $self->{'contigs'}->{$contig}->{'name'};
1427 =head2 _get_contig_name(\@array_containing_reads)
1429 Title : _get_contig_name(\@array_containing_reads)
1430 Usage : $o_consed->_get_contig_name(\@array_containing_reads);
1431 Function: The logic for the set_doublets subroutine.
1432 Returns : The name for this contig.
1433 Args : A reference to an array containing read names.
1434 Notes : Depends on reverse_designator. Be sure this is set the way you
1435 intend.
1437 =cut
1439 sub _get_contig_name {
1440 my ($self,$r_array) = @_;
1441 my @contig_members = @$r_array;
1442 my @name_nodir;
1443 foreach (@contig_members) {
1444 # how can I distinguish the clone name from the direction label?
1445 # look for $Consed::reverse_designator and $Consed::forward_designator
1446 # what if you do not find _any_ of those?
1447 my $forward_designator = $self->{'forward_designator'} || "f";
1448 my $reverse_designator = $self->{'reverse_designator'} || "r";
1449 my $any_hits = /(.+)($forward_designator.*)/ || /(.+)($reverse_designator.*)/||/(.+)(_.+)/;
1450 my $name = $1;
1451 my $suffix = $2;
1452 if ($name) {
1453 # print("\t\$name is $name ");
1455 if ($suffix) {
1456 # print("and \$suffix is $suffix.\n");
1458 # Jee, I hope we get a naming convention soon
1459 if ($suffix) {
1460 if ($suffix =~ /^$forward_designator/ || $suffix =~ /^$reverse_designator/) {
1461 push @name_nodir,$name;
1463 # bugwatch here! should this be unnested?
1464 else {
1465 push @name_nodir,"$name$suffix";
1469 # print("\@name_nodir: @name_nodir\n");
1470 my $mismatch = 0;
1471 for (my $counter=0; $counter<@name_nodir;$counter++) {
1472 next if ($name_nodir[0] eq $name_nodir[$counter]);
1473 $mismatch = 1;
1475 if ($mismatch == 0) {
1476 # print("\tYou have a cohesive contig named $name_nodir[0].\n\n");
1477 return $name_nodir[0];
1478 } else {
1479 # print("\tYou have mixed names in this contig.\n\n");
1481 } # end _get_contig_name
1483 =head2 get_doublets()
1485 Title : get_doublets()
1486 Usage : @doublets = $o_consed->get_doublets();
1487 Function: Return the keynames of the doublets.
1488 Returns : Returns an array containing the keynames of all
1489 Bio::Tools::Alignment::Consed sequences in the class "doublet".
1490 Args : None.
1491 Notes :
1493 =cut
1495 sub get_doublets {
1496 my $self = shift;
1497 if (!$self->{doublets_set}) {
1498 $self->warn("You need to set the doublets before you can get them. Doing that now.");
1499 $self->set_doublets();
1501 my @doublets;
1502 foreach (sort keys %{$self->{'contigs'}}) {
1503 if ($self->{'contigs'}->{$_}->{name} && $self->{'contigs'}->{$_}->{'class'} eq "doublet") {
1504 push @doublets,$_;
1507 return @doublets;
1508 } # end get_doublets
1510 =head2 dump_hash()
1512 Title : dump_hash()
1513 Usage : $o_consed->dump_hash();
1514 Function: Use dumpvar.pl to dump out the Bio::Tools::Alignment::Consed
1515 object to STDOUT.
1516 Returns : Nothing.
1517 Args : None.
1518 Notes : I used this a lot in debugging.
1520 =cut
1522 sub dump_hash {
1523 my $self = shift;
1524 my $dumper = Dumpvalue->new();
1525 $self->debug( "Bio::Tools::Alignment::Consed::dump_hash - ".
1526 "The following is the contents of the contig hash...\n");
1527 $dumper->dumpValue($self->{'contigs'});
1530 =head2 dump_hash_compact()
1532 Title : dump_hash_compact()
1533 Usage : $o_consed->dump_hash_compact();
1534 Function: Dump out the Bio::Tools::Alignment::Consed object in a compact way.
1535 Returns : Nothing.
1536 Args : Nothing.
1537 Notes : Cleaner then dumpValue(), dumpHash(). I used this a lot in
1538 debugging.
1540 =cut
1542 sub dump_hash_compact {
1543 no strict 'refs'; # renege for the block
1544 my ($self,$sequence) = @_;
1545 # get the classes
1546 my @singlets = $self->get_singlets();
1547 my @singletons = $self->get_singletons();
1548 my @doublets = $self->get_doublets();
1549 my @pairs = $self->get_pairs();
1550 my @multiplets = $self->get_multiplets();
1551 print("Name\tClass\tMembers\tQuality?\n");
1552 foreach (@singlets) {
1553 my @members = $self->get_members($_);
1554 print($self->get_name($_)."\tsinglets\t".(join',',@members)."\t");
1555 if ($self->{'contigs'}->{$_}->{'quality'}) {
1556 print("qualities found here\n");
1557 } else {
1558 print("no qualities found here\n");
1562 foreach (@singletons) {
1563 my @members = $self->get_members($_);
1564 print($self->get_name($_)."\tsingletons\t".(join',',@members)."\t");
1565 if ($self->{'contigs'}->{$_}->{'quality'}) {
1566 print("qualities found here\n");
1567 } else {
1568 print("no qualities found here\n");
1571 foreach my $pair (@pairs) {
1572 my @members = $self->get_members($pair);
1573 my $name;
1574 if (!$self->get_name($pair)) {
1575 $name = "BLANK";
1576 } else {
1577 $name = $self->get_name($pair);
1579 print("$name\tpairs\t".(join',',@members)."\n");
1581 foreach (@doublets) {
1582 my @members = $self->get_members_by_name($_);
1583 print("$_\tdoublets\t".(join',',@members)."\t");
1584 my $contig_number = &get_contig_number_by_name($self,$_);
1585 if ($self->{'contigs'}->{$contig_number}->{'quality'}) {
1586 print("qualities found here\n");
1587 } else {
1588 print("no qualities found here\n");
1590 # print($_."\tdoublets\t".(join',',@members)."\n");
1592 foreach (@multiplets) {
1593 my @members = $self->get_members($_);
1594 print("Contig $_"."\tmultiplets\t".(join',',@members)."\n");
1596 } # end dump_hash_compact
1598 =head2 get_phreds()
1600 Title : get_phreds()
1601 Usage : @phreds = $o_consed->get_phreds();
1602 Function: For each doublet in the Bio::Tools::Alignment::Consed hash, go
1603 and get the phreds for the top and bottom reads. Place them into
1604 {top_phreds} and {bottom_phreds}.
1605 Returns : Nothing.
1606 Args : Nothing.
1608 Requires parse_phd() and reverse_and_complement(). I realize that it
1609 would be much more elegant to pull qualities as required but there
1610 were certain "features" in the acefile that required a bit more
1611 detailed work be done to get the qualities for certain parts of the
1612 consensus sequence. In order to make _sure_ that this was done
1613 properly I wrote things to do all steps and then I used dump_hash()
1614 and checked each one to ensure expected behavior. I have never changed
1615 this, so there you are.
1617 =cut
1619 sub get_phreds {
1620 # this subroutine is the target of a rewrite to use the Bio::Tools::Alignment::Phred object.
1621 my $self = shift;
1622 my $current_contig;
1623 foreach $current_contig (sort keys %{$self->{'contigs'}}) {
1624 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") {
1625 $self->debug("$current_contig is a doublet. Going to parse_phd for top($self->{'contigs'}->{$current_contig}->{'top_name'}) and bottom($self->{'contigs'}->{$current_contig}->{'bottom_name'})\n");
1626 my $r_phreds_top = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'top_name'});
1627 my $r_phreds_bottom = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'bottom_name'});
1628 if ($self->{'contigs'}->{$current_contig}->{'top_complement'} eq "C") {
1629 # print("Reversing and complementing...\n");
1630 $r_phreds_top = &reverse_and_complement($r_phreds_top);
1632 if ($self->{'contigs'}->{$current_contig}->{'bottom_complement'} eq "C") {
1633 $r_phreds_bottom = &reverse_and_complement($r_phreds_bottom);
1635 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = $r_phreds_top;
1636 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = $r_phreds_bottom;
1641 =head2 parse_phd($read_name)
1643 Title : parse_phd($read_name)
1644 Usage : $o_consed->parse_phd($read_name);
1645 Function: Suck in the contents of a .phd file.
1646 Returns : A reference to an array containing the quality values for the read.
1647 Args : The name of a read.
1648 Notes : This is a significantly weak subroutine because it was always
1649 intended that these functions, along with the functions provided by
1650 get_phreds() be put into the Bio::SeqIO:phd module. This is done
1651 now but the Bio::Tools::Alignment::Consed module has not be
1652 rewritten to reflect this change.
1654 See L<Bio::SeqIO::phd> for more information.
1656 =cut
1658 sub parse_phd {
1659 my ($self,$sequence_name) = @_;
1660 $self->debug("Parsing phd for $sequence_name\n");
1661 my $in_dna = 0;
1662 my $base_number = 0;
1663 my (@bases,@current_line);
1664 # print("parse_phd: $sequence_name\n");
1665 my $fh = Bio::Root::IO->new
1666 (-file=>"$self->{path}/../phd_dir/$sequence_name.phd.1");
1667 while ($fh->_readline()) {
1668 # print("Reading a line from a phredfile!\n");
1669 chomp;
1670 if (/^BEGIN_DNA/) { $in_dna = 1; next}
1671 if (/^END_DNA/) { last; }
1672 if (!$in_dna) { next; }
1673 push(@bases,$_);
1675 return \@bases;
1678 =head2 reverse_and_complement(\@source)
1680 Title : reverse_and_complement(\@source)
1681 Usage : $reference_to_array = $o_consed->reverse_and_complement(\@source);
1682 Function: A stub for the recursive routine reverse_recurse().
1683 Returns : A reference to a reversed and complemented array of phred data.
1684 Args : A reference to an array of phred data.
1685 Notes :
1687 =cut
1689 sub reverse_and_complement {
1690 my $r_source = shift;
1691 my $r_destination;
1692 $r_destination = &reverse_recurse($r_source,$r_destination);
1693 return $r_destination;
1696 =head2 reverse_recurse($r_source,$r_destination)
1698 Title : reverse_recurse(\@source,\@destination)
1699 Usage : $o_consed->reverse_recurse(\@source,\@destination);
1700 Function: A recursive routine to reverse and complement an array of
1701 phred data.
1702 Returns : A reference to an array containing reversed phred data.
1703 Args : A reference to a source array and a reverence to a destination
1704 array.
1706 Recursion is kewl, but this sub should likely be _reverse_recurse.
1708 =cut
1711 sub reverse_recurse($$) {
1712 my ($r_source,$r_destination) = @_;
1713 if (!@$r_source) {
1714 return $r_destination;
1716 $_=pop(@$r_source);
1717 s/c/g/ || s/g/c/ || s/a/t/ || s/t/a/;
1718 push(@$r_destination,$_);
1719 &reverse_recurse($r_source,$r_destination);
1722 =head2 show_missing_sequence()
1724 Title : show_missing_sequence();
1725 Usage : $o_consed->show_missing_sequence();
1726 Function: Used by set_trim_points_doublets() to fill in quality values where
1727 consed (phrap?) set them to 0 at the beginning and/or end of the
1728 consensus sequences.
1729 Returns : Nothing.
1730 Args : None.
1732 Acts on doublets only. Really very somewhat quite ugly. A disgusting
1733 kludge. I<insert pride here> It was written stepwise with no real plan
1734 because it was not really evident why consed (phrap?) was doing this.
1736 =cut
1738 sub show_missing_sequence() {
1740 # decide which sequence should not have been clipped at consensus
1741 # position = 0
1743 my $self = shift;
1744 &get_phreds($self);
1745 my ($current_contig,@qualities);
1746 foreach $current_contig (sort keys %{$self->{'contigs'}}) {
1747 if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") {
1748 my $number_leading_xs = 0;
1749 my $number_trailing_xs = 0;
1750 my $measurer = $self->{'contigs'}->{$current_contig}->{'quality'};
1751 while ($measurer =~ s/^\ 0\ /\ /) {
1752 $number_leading_xs++;
1754 while ($measurer =~ s/\ 0(\s*)$/$1/) {
1755 $number_trailing_xs++;
1757 @qualities = split(' ',$self->{'contigs'}->{$current_contig}->{'quality'});
1758 my $in_initial_zeros = 0;
1759 for (my $count=0;$count<scalar(@qualities); $count++) {
1760 if ($qualities[$count] == 0) {
1761 my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data);
1762 # print("The quality of the consensus at ".($count+1)." is zero. Retrieving the real quality value.\n");
1763 # how do I know which strand to get these quality values from????
1764 # boggle
1765 my $top_quality_here = $self->{'contigs'}->{$current_contig}->{'top_phreds'}->[0-$self->{'contigs'}->{$current_contig}->{'top_start'}+$count+1];
1766 my $bottom_quality_here = $self->{'contigs'}->{$current_contig}->{'bottom_phreds'}->[1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count];
1767 if (!$bottom_quality_here || (1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count)<0) {
1768 $bottom_quality_here = "not found";
1770 if (!$top_quality_here) {
1771 $top_quality_here = "not found";
1773 # print("Looking for quals at position $count of $current_contig: top position ".(0-$self->{'contigs'}->{$current_contig}->{top_start}+$count)." ($self->{'contigs'}->{$current_contig}->{top_name}) $top_quality_here , bottom position ".(1-$self->{'contigs'}->{$current_contig}->{bottom_start}+$count)." ($self->{'contigs'}->{$current_contig}->{bottom_name}) $bottom_quality_here\n");
1774 if ($count<$number_leading_xs) {
1775 # print("$count is less then $number_leading_xs so I will get the quality from the top strand\n");
1776 # print("retrieved quality is ".$self->{'contigs'}->{$current_contig}->{top_phreds}[0-$self->{'contigs'}->{$current_contig}->{top_start}+$count+1]."\n");
1777 my $quality = $top_quality_here;
1778 $quality =~ /\S+\s(\d+)\s+/;
1779 $quality = $1;
1780 # print("retrieved quality for leading zero $count is $quality\n");
1781 # t 9 9226
1782 $qualities[$count] = $quality;
1783 } else {
1784 # this part is tricky
1785 # if the contig is like this
1786 # cccccccccccccccc
1787 # ffffffffffffffffff
1788 # rrrrrrrrrrrrrrrrr
1789 # then take the quality value for the trailing zeros in the cons. seq from the r
1791 # but if the contig is like this
1792 # cccccccccccccccccc
1793 # ffffffffffffffffffffffffffffffff
1794 # rrrrrrrrrrrrrrrrrrrrrrrxxxxxxxxr
1795 # ^^^
1796 # then any zeros that fall in the positions (^) must be decided whether the quality
1797 # is the qual from the f or r strand. I will use the greater number
1798 # does a similar situation exist for the leading zeros? i dunno
1800 # print("$count is greater then $number_leading_xs so I will get the quality from the bottom strand\n");
1801 # print("retrieved quality is ".$contigs->{$current_contig}->{top_phreds}[0-$contigs->{$current_contig}->{top_start}+$count+1]."\n");
1802 # my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data);
1803 if ($bottom_quality_here eq "not found") {
1804 # $top_phred_position = 1-$contigs->{$current_contig}->{bottom_start}+$count;
1805 # print("Going to get quality from here: $top_phred_position of the top.\n");
1806 # my $temp_quality - $contigs->{$current_contig}->{top_phreds}
1807 # $quality = $contigs->{$current_contig}->{top_phreds}[$top_phred_position];
1808 $top_quality_here =~ /\w+\s(\d+)\s/;
1809 $quality = $1;
1810 } elsif ($top_quality_here eq "not found") {
1811 # $bottom_phred_position = 1+$contigs->{$current_contig}->{bottom_start}+$count;
1812 # print("Going to get quality from here: $bottom_phred_position of the bottom.\n");
1813 # $quality = $contigs->{$current_contig}->{bottom_phreds}[$bottom_phred_position];
1814 # print("Additional: no top quality but bottom is $quality\n");
1815 $bottom_quality_here =~ /\w+\s(\d+)\s/;
1816 $quality = $1;
1817 } else {
1818 # print("Oh jeepers, there are 2 qualities to choose from at this position.\n");
1819 # print("Going to compare these phred qualities: top: #$top_quality_here# bottom: #$bottom_quality_here#\n");
1820 # now you have to compare them
1821 # my $top_quality_phred = $contigs->{$current_contig}->{top_phreds}[$top_phred_position];
1822 # #t 40 875#
1823 # print("regexing #$top_quality_here#... ");
1824 $top_quality_here =~ /\w\ (\d+)\s/;
1825 my $top_quality = $1;
1826 # print("$top_quality\nregexing #$bottom_quality_here#... ");
1827 $bottom_quality_here =~ /\w\ (\d+)\s/;
1828 my $bottom_quality = $1;
1829 # print("$bottom_quality\n");
1830 # print("top_quality: $top_quality bottom quality: $bottom_quality\n");
1831 if ($bottom_quality > $top_quality) {
1832 # print("Chose to take the bottom quality: $bottom_quality\n");
1833 $quality = $bottom_quality;
1834 } else {
1835 # print("Chose to take the top quality: $top_quality\n");
1836 $quality = $top_quality;
1839 if (!$quality) {
1840 # print("Warning: no quality value for $current_contig, position $count!\n");
1841 # print("Additional data: top quality phred: $top_quality_here\n");
1842 # print("Additional data: bottom quality phred: $bottom_quality_here\n");
1843 } else {
1844 $qualities[$count] = $quality;
1850 unless (!@qualities) {
1851 $self->{'contigs'}->{$current_contig}->{'quality'} = join(" ",@qualities);
1853 $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = undef;
1854 $self->{'contigs'}->{$current_contig}->{'top_phreds'} = undef;
1855 my $count = 1;
1856 } # end foreach key