1 # Bio::Tools::Alignment::Trim.pm
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Chad Matsalla
7 # Copyright Chad Matsalla
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::Alignment::Trim - A kludge to do specialized trimming of
16 sequence based on quality.
20 use Bio::Tools::Alignment::Trim;
21 $o_trim = Bio::Tools::Alignment::Trim->new();
22 $o_trim->set_reverse_designator("R");
23 $o_trim->set_forward_designator("F");
28 This is a specialized module designed by Chad for Chad to trim sequences
29 based on a highly specialized list of requirements. In other words, write
30 something that will trim sequences 'just like the people in the lab would
33 I settled on a sliding-window-average style of search which is ugly and
34 slow but does _exactly_ what I want it to do.
36 Mental note: rewrite this.
38 It is very important to keep in mind the context in which this module was
39 written: strictly to support the projects for which Consed.pm was
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 the bugs and their resolution. Bug reports can be submitted via the
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Chad Matsalla
75 Email bioinformatics-at-dieselwurks.com
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
84 package Bio
::Tools
::Alignment
::Trim
;
89 use vars
qw(%DEFAULTS);
91 use base qw(Bio::Root::Root);
94 %DEFAULTS = ( 'f_designator' => 'f',
95 'r_designator' => 'r',
103 Usage : $o_trim = Bio::Tools::Alignment::Trim->new();
104 Function: Construct the Bio::Tools::Alignment::Trim object. No parameters
105 are required to create this object. It is strictly a bundle of
106 functions, as far as I am concerned.
107 Returns : A reference to a Bio::Tools::Alignment::Trim object.
109 -windowsize (default 10)
116 my ($class,@args) = @_;
117 my $self = $class->SUPER::new
(@args);
118 my($windowsize,$phreds) =
119 $self->_rearrange([qw(
124 $self->{windowsize
} = $windowsize || $DEFAULTS{'windowsize'};
125 $self->{phreds
} = $phreds || $DEFAULTS{'phreds'};
126 # print("Constructor set phreds to ".$self->{phreds}."\n") if $self->verbose > 0;
127 $self->set_designators($DEFAULTS{'f_designator'},
128 $DEFAULTS{'r_designator'});
132 =head2 set_designators($forward_designator,$reverse_designator)
134 Title : set_designators(<forward>,<reverse>)
135 Usage : $o_trim->set_designators("F","R")
136 Function: Set the string by which the system determines whether a given
137 sequence represents a forward or a reverse read.
139 Args : two scalars: one representing the forward designator and one
140 representing the reverse designator
144 sub set_designators
{
146 ($self->{'f_designator'},$self->{'r_designator'}) = @_;
149 =head2 set_forward_designator($designator)
151 Title : set_forward_designator($designator)
152 Usage : $o_trim->set_forward_designator("F")
153 Function: Set the string by which the system determines if a given
154 sequence is a forward read.
156 Args : A string representing the forward designator of this project.
160 sub set_forward_designator
{
161 my ($self,$desig) = @_;
162 $self->{'f_designator'} = $desig;
165 =head2 set_reverse_designator($reverse_designator)
167 Title : set_reverse_designator($reverse_designator)
168 Function: Set the string by which the system determines if a given
169 sequence is a reverse read.
170 Usage : $o_trim->set_reverse_designator("R")
172 Args : A string representing the forward designator of this project.
176 sub set_reverse_designator
{
177 my ($self,$desig) = @_;
178 $self->{'r_designator'} = $desig;
181 =head2 get_designators()
183 Title : get_designators()
184 Usage : $o_trim->get_designators()
185 Returns : A string describing the current designators.
187 Notes : Really for informational purposes only. Duh.
191 sub get_designators
{
193 return("forward: ".$self->{'f_designator'}." reverse: ".$self->{'r_designator'});
196 =head2 trim_leading_polys()
198 Title : trim_leading_polys()
199 Usage : $o_trim->trim_leading_polys()
200 Function: Not implemented. Does nothing.
203 Notes : This function is not implemented. Part of something I wanted to
204 do but never got around to doing.
208 sub trim_leading_polys
{
209 my ($self, $sequence) = @_;
215 Usage : $o_trim->dump_hash()
216 Function: Unimplemented.
219 Notes : Does nothing.
225 my %hash = %{$self->{'qualities'}};
228 =head2 trim_singlet($sequence,$quality,$name,$class)
230 Title : trim_singlet($sequence,$quality,$name,$class)
231 Usage : ($r_trim_points,$trimmed_sequence) =
232 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
233 Function: Trim a singlet based on its quality.
234 Returns : a reference to an array containing the forward and reverse
235 trim points and the trimmed sequence.
236 Args : $sequence : A sequence (SCALAR, please)
237 $quality : A _scalar_ of space-delimited quality values.
238 $name : the name of the sequence
239 $class : The class of the sequence. One of qw(singlet
240 singleton doublet pair multiplet)
241 Notes : At the time this was written the bioperl objects SeqWithQuality
242 and PrimaryQual did not exist. This is what is with the clumsy
243 passing of references and so on. I will rewrite this next time I
244 have to work with it. I also wasn't sure whether this function
245 should return just the trim points or the points and the sequence.
246 I decided that I always wanted both so that's how I implemented
248 - Note that the size of the sliding windows is set during construction of
249 the Bio::Tools::Alignment::Trim object.
254 my ($self,$sequence,$quality,$name,$class) = @_;
255 # this split is done because I normally store quality values in a
256 # space-delimited scalar rather then in an array.
257 # I do this because serialization of the arrays is tough.
258 my @qual = split(' ',$quality);
260 my $sequence_length = length($sequence);
261 my ($returnstring,$processed_sequence);
262 # smooth out the qualities
263 my $r_windows = &_sliding_window
(\
@qual,$self->{windowsize
});
264 # find out the leading and trailing trimpoints
265 my $start_base = $self->_get_start($r_windows,$self->{windowsize
},$self->{phreds
});
266 my (@new_points,$trimmed_sequence);
267 # do you think that any sequence shorter then 100 should be
268 # discarded? I don't think that this should be the decision of this
271 $points[0] = $start_base;
272 # whew! now for the end base
273 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
274 my $end_base = &_get_end
($r_windows,$self->{windowsize
},
275 $self->{phreds
},$start_base);
276 $points[1] = $end_base;
277 # now do the actual trimming
278 # CHAD : I don't think that it is a good idea to call chop_sequence here
279 # because chop_sequence also removes X's and N's and things
280 # and that is not always what is wanted
284 =head2 trim_doublet($sequence,$quality,$name,$class)
286 Title : trim_doublet($sequence,$quality,$name,$class)
287 Usage : ($r_trim_points,$trimmed_sequence) =
288 @{$o_trim->trim_singlet($sequence,$quality,$name,$class)};
289 Function: Trim a singlet based on its quality.
290 Returns : a reference to an array containing the forward and reverse
291 Args : $sequence : A sequence
292 $quality : A _scalar_ of space-delimited quality values.
293 $name : the name of the sequence
294 $class : The class of the sequence. One of qw(singlet
295 singleton doublet pair multiplet)
296 Notes : At the time this was written the bioperl objects SeqWithQuality
297 and PrimaryQual did not exist. This is what is with the clumsy
298 passing of references and so on. I will rewrite this next time I
299 have to work with it. I also wasn't sure whether this function
300 should return just the trim points or the points and the sequence.
301 I decided that I always wanted both so that's how I implemented
308 my ($self,$sequence,$quality,$name,$class) = @_;
309 my @qual = split(' ',$quality);
311 my $sequence_length = length($sequence);
312 my ($returnstring,$processed_sequence);
313 # smooth out the qualities
314 my $r_windows = &_sliding_window
(\
@qual,$self->{windowsize
});
315 # determine where the consensus sequence starts
317 for (my $current = 0; $current<$sequence_length;$current++) {
318 if ($qual[$current] != 0) {
323 # start_base required: r_quality,$windowsize,$phredvalue
324 my $start_base = $self->_get_start($r_windows,$self->{windowsize
},$self->{phreds
},$offset);
325 if ($start_base > ($sequence_length - 100)) {
326 $points[0] = ("FAILED");
327 $points[1] = ("FAILED");
330 $points[0] = $start_base;
332 # whew! now for the end base
334 # required parameters: reference_to_windows,windowsize,$phredvalue,start_base
336 # 010420 NOTE: We will no longer get the end base to avoid the Q/--\___/-- syndrome
337 my $end_base = $sequence_length;
338 my $start_of_trailing_zeros = &count_doublet_trailing_zeros
(\
@qual);
339 $points[1] = $end_base;
340 # CHAD : I don't think that it is a good idea to call chop_sequence here
341 # because chop_sequence also removes X's and N's and things
342 # and that is not always what is wanted
346 =head2 chop_sequence($name,$class,$sequence,@points)
348 Title : chop_sequence($name,$class,$sequence,@points)
349 Usage : ($start_point,$end_point,$chopped_sequence) =
350 $o_trim->chop_sequence($name,$class,$sequence,@points);
351 Function: Chop a sequence based on its name, class, and sequence.
352 Returns : an array containing three scalars:
353 1- the start trim point
354 2- the end trim point
355 3- the chopped sequence
357 $name : the name of the sequence
358 $class : The class of the sequence. One of qw(singlet
359 singleton doublet pair multiplet)
360 $sequence : A sequence
361 @points : An array containing two elements- the first contains
362 the start trim point and the second conatines the end trim
368 my ($self,$name,$class,$sequence,@points) = @_;
369 print("Coming into chop_sequence, \@points are @points\n");
370 my $fdesig = $self->{'f_designator'};
371 my $rdesig = $self->{'r_designator'};
372 if (!$points[0] && !$points[1]) {
376 if ($class eq "singlet" && $name =~ /$fdesig$/) {
377 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
379 elsif ($class eq "singlet" && $name =~ /$rdesig$/) {
380 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
382 elsif ($class eq "singleton" && $name =~ /$fdesig$/) {
383 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
385 elsif ($class eq "singleton" && $name =~ /$rdesig$/) {
386 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
388 elsif ($class eq "doublet") {
389 $sequence = substr($sequence,$points[0],$points[1]-$points[0]);
391 # this is a _terrible_ to do this! i couldn't seem to find a better way
392 # i thought something like s/(^.*[Xx]{5,})//g; might work, but no go
393 # no time to find a fix!
394 my $length_before_trimming = length($sequence);
395 my $subs_Xs = $sequence =~ s/^.*[Xx]{5,}//g;
397 my $length_after_trimming = length($sequence);
398 my $number_Xs_trimmed = $length_before_trimming - $length_after_trimming;
399 $points[0] += $number_Xs_trimmed;
401 $length_before_trimming = length($sequence);
402 my $subs_Ns = $sequence =~ s/[Nn]{1,}$//g;
404 my $length_after_trimming = length($sequence);
405 my $number_Ns_trimmed = $length_before_trimming - $length_after_trimming;
406 $points[1] -= $number_Ns_trimmed;
409 push @points,$sequence;
410 print("chop_sequence \@points are @points\n");
414 =head2 _get_start($r_quals,$windowsize,$phreds,$offset)
416 Title : _get_start($r_quals,$windowsize,$phreds,$offset)
417 Usage : $start_base = $self->_get_start($r_windows,5,20);
418 Function: Provide the start trim point for this sequence.
419 Returns : a scalar representing the start of the sequence
421 $r_quals : A reference to an array containing quality values. In
422 context, this array of values has been smoothed by then
423 sliding window-look ahead algorithm.
424 $windowsize : The size of the window used when the sliding window
425 look-ahead average was calculated.
426 $phreds : <fill in what this does here>
427 $offset : <fill in what this does here>
432 my ($self,$r_quals,$windowsize,$phreds,$offset) = @_;
433 print("Using $phreds phreds\n") if $self->verbose > 0;
434 # this is to help determine whether the sequence is good at all
435 my @quals = @
$r_quals;
436 my ($count,$count2,$qualsum);
437 if ($offset) { $count = $offset; } else { $count = 0; }
438 # search along the length of the sequence
439 for (; ($count+$windowsize) <= scalar(@quals); $count++) {
440 # sum all of the quality values in this window.
442 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
443 if (!$quals[$count2]) {
444 # print("Quals don't exist here!\n");
447 $qualsum += $quals[$count2];
448 # print("Incremented qualsum to ($qualsum)\n");
452 # print("The sum of this window (starting at $count) is $qualsum. I counted $cumulative bases.\n");
453 # if the total of windowsize * phreds is
454 if ($qualsum && $qualsum >= $windowsize*$phreds) { return $count; }
457 # if ($count > scalar(@quals)-$windowsize) { return; }
461 =head2 _get_end($r_qual,$windowsize,$phreds,$count)
463 Title : _get_end($r_qual,$windowsize,$phreds,$count)
464 Usage : my $end_base = &_get_end($r_windows,20,20,$start_base);
465 Function: Get the end trim point for this sequence.
466 Returns : A scalar representing the end trim point for this sequence.
468 $r_qual : A reference to an array containing quality values. In
469 context, this array of values has been smoothed by then
470 sliding window-look ahead algorithm.
471 $windowsize : The size of the window used when the sliding window
472 look-ahead average was calculated.
473 $phreds : <fill in what this does here>
474 $count : Start looking for the end of the sequence here.
479 my ($r_qual,$windowsize,$phreds,$count) = @_;
480 my @quals = @
$r_qual;
481 my $total_bases = scalar(@quals);
482 my ($count2,$qualsum,$end_of_quals,$bases_counted);
483 if (!$count) { $count=0; }
484 BASE
: for (; $count < $total_bases; $count++) {
487 POSITION
: for($count2 = $count; $count2 < $total_bases; $count2++) {
490 if ($count2 == $total_bases-1) {
491 $qualsum += $quals[$count2];
495 elsif ($bases_counted == $windowsize) {
496 $qualsum += $quals[$count2];
497 if ($qualsum < $bases_counted*$phreds) {
498 return $count+$bases_counted+$windowsize;
503 $qualsum += $quals[$count2];
506 if ($qualsum < $bases_counted*$phreds) {
507 return $count+$bases_counted+$windowsize;
513 my $bases_for_average = $total_bases-$count2;
517 if ($qualsum) { } # print ("$qualsum\n");
521 =head2 count_doublet_trailing_zeros($r_qual)
523 Title : count_doublet_trailing_zeros($r_qual)
524 Usage : my $start_of_trailing_zeros = &count_doublet_trailing_zeros(\@qual);
525 Function: Find out when the trailing zero qualities start.
526 Returns : A scalar representing where the zeros start.
527 Args : A reference to an array of quality values.
528 Notes : Again, this should be rewritten to use PrimaryQual objects.
529 A more detailed explanation of why phrap puts these zeros here should
530 be written and placed here. Please email and hassle the author.
535 sub count_doublet_trailing_zeros
{
536 my ($r_qual) = shift;
537 my $number_of_trailing_zeros = 0;
538 my @qualities = @
$r_qual;
539 for (my $current=scalar(@qualities);$current>0;$current--) {
540 if ($qualities[$current] && $qualities[$current] != 0) {
541 $number_of_trailing_zeros = scalar(@qualities)-$current;
545 return scalar(@qualities);
546 } # end count_doublet_trailing_zeros
548 =head2 _sliding_window($r_quals,$windowsize)
550 Title : _sliding_window($r_quals,$windowsize)
551 Usage : my $r_windows = &_sliding_window(\@qual,$windowsize);
552 Function: Create a sliding window, look-forward-average on an array
553 of quality values. Used to smooth out differences in qualities.
554 Returns : A reference to an array containing the smoothed values.
555 Args : $r_quals: A reference to an array containing quality values.
556 $windowsize : The size of the sliding window.
557 Notes : This was written before PrimaryQual objects existed. They
558 should use that object but I haven't rewritten this yet.
563 sub _sliding_window
{
564 my ($r_quals,$windowsize) = @_;
565 my (@window,@quals,$qualsum,$count,$count2,$average,@averages,$bases_counted);
567 my $size_of_quality = scalar(@quals);
568 # do this loop for all of the qualities
569 for ($count=0; $count <= $size_of_quality; $count++) {
571 BASE
: for($count2 = $count; $count2 < $size_of_quality; $count2++) {
573 # if the search hits the end of the averages, stop
574 # this is for the case near the end where bases remaining < windowsize
575 if ($count2 == $size_of_quality) {
576 $qualsum += $quals[$count2];
579 # if the search hits the size of the window
580 elsif ($bases_counted == $windowsize) {
581 $qualsum += $quals[$count2];
584 # otherwise add the quality value
585 unless (!$quals[$count2]) {
586 $qualsum += $quals[$count2];
589 unless (!$qualsum || !$windowsize) {
590 $average = $qualsum / $bases_counted;
591 if (!$average) { $average = "0"; }
592 push @averages,$average;
596 # 02101 Yes, I repaired the mismatching numbers between averages and windows.
597 # print("There are ".scalar(@$r_quals)." quality values. They are @$r_quals\n");
598 # print("There are ".scalar(@averages)." average values. They are @averages\n");
603 =head2 _print_formatted_qualities
605 Title : _print_formatted_qualities(\@quals)
606 Usage : &_print_formatted_qualities(\@quals);
607 Returns : Nothing. Prints.
608 Args : A reference to an array containing quality values.
609 Notes : An internal procedure used in debugging. Prints out an array nicely.
613 sub _print_formatted_qualities
{
616 for (my $count=0; $count<scalar(@qual) ; $count++) {
617 if (($count%10)==0) { print("\n$count\t"); }
618 if ($qual[$count]) { print ("$qual[$count]\t");}
619 else { print("0\t"); }
624 =head2 _get_end_old($r_qual,$windowsize,$phreds,$count)
626 Title : _get_end_old($r_qual,$windowsize,$phreds,$count)
627 Usage : Deprecated. Don't use this!
628 Returns : Deprecated. Don't use this!
629 Args : Deprecated. Don't use this!
635 my ($r_qual,$windowsize,$phreds,$count) = @_;
636 warn("Do Not Use this function (_get_end_old)");
637 my $target = $windowsize*$phreds;
638 my @quals = @
$r_qual;
639 my $total_bases = scalar(@quals);
640 my ($count2,$qualsum,$end_of_quals);
641 if (!$count) { $count=0; }
642 BASE
: for (; $count < $total_bases; $count++) {
643 for($count2 = $count; $count2 < $count+$windowsize; $count2++) {
644 if ($count2 == scalar(@quals)-1) {
645 $qualsum += $quals[$count2];
650 $qualsum += $quals[$count2];
652 if ($qualsum < $windowsize*$phreds) {
653 return $count+$windowsize;
660 # Autoload methods go after =cut, and are processed by the autosplit program.