Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Tools / Lucy.pm
bloba13fe680da2ebb86c733c076120eb94dd7e114bc
2 # BioPerl module for Bio::Tools::Lucy
4 # Copyright Her Majesty the Queen of England
5 # written by Andrew Walsh (paeruginosa@hotmail.com) during employment with
6 # Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB
8 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Tools::Lucy - Object for analyzing the output from Lucy,
14 a vector and quality trimming program from TIGR
16 =head1 SYNOPSIS
18 # Create the Lucy object from an existing Lucy output file
19 @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1);
20 $lucyObj = Bio::Tools::Lucy->new(@params);
22 # Get names of all sequences
23 $names = $lucyObj->get_sequence_names();
25 # Print seq and qual values for sequences >400 bp in order to run CAP3
26 foreach $name (@$names) {
27 next unless $lucyObj->length_clear($name) > 400;
28 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
29 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
32 # Get an array of Bio::PrimarySeq objects
33 @seqObjs = $lucyObj->get_Seq_Objs();
36 =head1 DESCRIPTION
38 Bio::Tools::Lucy.pm provides methods for analyzing the sequence and
39 quality values generated by Lucy program from TIGR.
41 Lucy will identify vector, poly-A/T tails, and poor quality regions in
42 a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf)
44 The input to Lucy can be the Phred sequence and quality files
45 generated from running Phred on a set of chromatograms.
47 Lucy can be obtained (free of charge to academic users) from
48 www.tigr.org/softlab
50 There are a few methods that will only be available if you make some
51 minor changes to the source for Lucy and then recompile. The changes
52 are in the 'lucy.c' file and there is a diff between the original and
53 the modified file in the Appendix
55 Please contact the author of this module if you have any problems
56 making these modifications.
58 You do not have to make these modifications to use this module.
60 =head2 Creating a Lucy object
62 @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1,
63 'fwd_desig' => '_F', 'rev_desig' => '_R');
64 $lucyObj = Bio::Tools::Lucy->new(@params);
66 =head2 Using a Lucy object
68 You should get an array with the sequence names in order to use
69 accessor methods. Note: The Lucy binary program will fail unless
70 the sequence names provided as input are unique.
72 $names_ref = $lucyObj->get_sequence_names();
74 This code snippet will produce a Fasta format file with sequence
75 lengths and %GC in the description line.
77 foreach $name (@$names) {
78 print FILE ">$name\t",
79 $lucyObj->length_clear($name), "\t",
80 $lucyObj->per_GC($name), "\n",
81 $lucyObj->sequence($name), "\n";
85 Print seq and qual values for sequences >400 bp in order to assemble
86 them with CAP3 (or other assembler).
88 foreach $name (@$names) {
89 next unless $lucyObj->length_clear($name) > 400;
90 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
91 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
94 Get all the sequences as Bio::PrimarySeq objects (eg., for use with
95 Bio::Tools::Run::StandaloneBlast to perform BLAST).
97 @seqObjs = $lucyObj->get_Seq_Objs();
99 Or use only those sequences that are full length and have a Poly-A
100 tail.
102 foreach $name (@$names) {
103 next unless ($lucyObj->full_length($name) and $lucy->polyA($name));
104 push @seqObjs, $lucyObj->get_Seq_Obj($name);
108 Get the names of those sequences that were rejected by Lucy.
110 $rejects_ref = $lucyObj->get_rejects();
112 Print the names of the rejects and 1 letter code for reason they
113 were rejected.
115 foreach $key (sort keys %$rejects_ref) {
116 print "$key: ", $rejects_ref->{$key};
119 There is a lot of other information available about the sequences
120 analyzed by Lucy (see APPENDIX). This module can be used with the
121 DBI module to store this sequence information in a database.
123 =head1 FEEDBACK
125 =head2 Mailing Lists
127 User feedback is an integral part of the evolution of this and other
128 Bioperl modules. Send your comments and suggestions preferably to one
129 of the Bioperl mailing lists. Your participation is much appreciated.
131 bioperl-l@bioperl.org - General discussion
132 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
134 =head2 Support
136 Please direct usage questions or support issues to the mailing list:
138 I<bioperl-l@bioperl.org>
140 rather than to the module maintainer directly. Many experienced and
141 reponsive experts will be able look at the problem and quickly
142 address it. Please include a thorough description of the problem
143 with code and data examples if at all possible.
145 =head2 Reporting Bugs
147 Report bugs to the Bioperl bug tracking system to help us keep track
148 the bugs and their resolution. Bug reports can be submitted via the web:
150 https://github.com/bioperl/bioperl-live/issues
152 =head1 AUTHOR
154 Andrew G. Walsh paeruginosa@hotmail.com
156 =head1 APPENDIX
158 Methods available to Lucy objects are described below. Please note
159 that any method beginning with an underscore is considered internal
160 and should not be called directly.
162 =cut
165 package Bio::Tools::Lucy;
167 use vars qw($AUTOLOAD @ATTR %OK_FIELD);
168 use strict;
169 use Bio::PrimarySeq;
171 use base qw(Bio::Root::Root Bio::Root::IO);
172 @ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr);
173 foreach my $attr (@ATTR) {
174 $OK_FIELD{$attr}++
177 sub AUTOLOAD {
178 my $self = shift;
179 my $attr = $AUTOLOAD;
180 $attr =~ s/.*:://;
181 $attr = lc $attr;
182 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
183 $self->{$attr} = shift if @_;
184 return $self->{$attr};
187 =head2 new
189 Title : new
190 Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R',
191 fwd_desig => '_F')
192 Function: creates a Lucy object from Lucy analysis files
193 Returns : reference to Bio::Tools::Lucy object
194 Args : seqfile Fasta sequence file generated by Lucy
195 qualfile Quality values file generated by Lucy
196 infofile Info file created when Lucy is run with -debug
197 'infofile' option
198 stderrfile Standard error captured from Lucy when Lucy is run
199 with -info option and STDERR is directed to stderrfile
200 (ie. lucy ... 2> stderrfile).
201 Info in this file will include sequences dropped for low
202 quality. If you've modified Lucy source (see adv_stderr below),
203 it will also include info on which sequences were dropped because
204 they were vector, too short, had no insert, and whether a poly-A
205 tail was found (if Lucy was run with -cdna option).
206 lucy_verbose verbosity level (0-1).
207 fwd_desig The string used to determine whether sequence is a
208 forward read.
209 The parser will assume that this match will occus at the
210 end of the sequence name string.
211 rev_desig As above, for reverse reads.
212 adv_stderr Can be set to a true value (1). Will only work if
213 you have modified
214 the Lucy source code as outlined in DESCRIPTION and capture
215 the standard error from Lucy.
217 If you don't provide filenames for qualfile, infofile or stderrfile,
218 the module will assume that .qual, .info, and .stderr are the file
219 extensions and search in the same directory as the .seq file for these
220 files.
222 For example, if you create a Lucy object with $lucyObj =
223 Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will
224 find lucy.qual, lucy.info and lucy.stderr.
226 You can omit any or all of the quality, info or stderr files, but you
227 will not be able to use all of the object methods (see method
228 documentation below).
230 =cut
232 sub new {
233 my ($class,@args) = @_;
234 my $self = $class->SUPER::new(@args);
235 my ($attr, $value);
236 while (@args) {
237 $attr = shift @args;
238 $attr = lc $attr;
239 $value = shift @args;
240 $self->{$attr} = $value;
242 &_parse($self);
243 return $self;
246 =head2 _parse
248 Title : _parse
249 Usage : n/a (internal function)
250 Function: called by new() to parse Lucy output files
251 Returns : nothing
252 Args : none
254 =cut
256 sub _parse {
257 my $self = shift;
258 $self->{seqfile} =~ /^(\S+)\.\S+$/;
259 my $file = $1;
261 $self->warn("Opening $self->{seqfile} for parsing...\n") if $self->{lucy_verbose};
262 open my $SEQ, '<', $self->{seqfile} or $self->throw("Could not read sequence file '$self->{seqfile}': $!");
263 my ($name, $line);
264 my $seq = "";
265 my @lines = <$SEQ>;
266 while ($line = pop @lines) {
267 chomp $line;
268 if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) {
269 $name = $1;
270 if ($self->{fwd_desig}) {
271 $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/;
273 if ($self->{rev_desig}) {
274 $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/;
276 $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4
277 $self->{sequences}{$name}{max_clone_len} = $3;
278 $self->{sequences}{$name}{med_clone_len} = $4;
279 $self->{sequences}{$name}{beg_clear} = $5;
280 $self->{sequences}{$name}{end_clear} = $6;
281 $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong.
282 my $beg = $5-1; # substr function begins with index 0
283 $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg);
284 my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//;
285 my $countGC = $seq =~ tr/[GC]//;
286 $self->{sequences}{$name}{per_GC} = $countGC/$count * 100;
287 $seq = "";
289 else {
290 $seq = $line.$seq;
294 # now parse quality values (check for presence of quality file first)
295 if ($self->{qualfile}) {
296 open my $QUAL, '<', $self->{qualfile} or $self->throw("Could not read quality file '$self->{qualfile}': $!");
297 @lines = <$QUAL>;
299 elsif (-e "$file.qual") {
300 $self->warn("You did not set qualfile, but I'm opening $file.qual\n") if $self->{lucy_verbose};
301 $self->qualfile("$file.qual");
302 open my $QUAL, '<', "$file.qual" or $self->throw("Could not read quality file '$file.qual': $!");
303 @lines = <$QUAL>;
305 else {
306 $self->warn("I did not find a quality file. You will not be able to use all of the accessor methods.\n") if $self->{lucy_verbose};
307 @lines = ();
310 my (@vals, @slice, $num, $tot, $vals);
311 my $qual = "";
312 while ($line = pop @lines) {
313 chomp $line;
314 if ($line =~ /^>(\S+)/) {
315 $name = $1;
316 @vals = split /\s/ , $qual;
317 @slice = @vals[$self->{sequences}{$name}{beg_clear} - 1 .. $self->{sequences}{$name}{end_clear} - 1];
318 $vals = join "\t", @slice;
319 $self->{sequences}{$name}{quality} = $vals;
320 $qual = "";
321 foreach $num (@slice) {
322 $tot += $num;
324 $num = @slice;
325 $self->{sequences}{$name}{avg_quality} = $tot/$num;
326 $tot = 0;
328 else {
329 $qual = $line.$qual;
333 # determine whether reads are full length
334 if ($self->{infofile}) {
335 open my $INFO, '<', $self->{infofile} or $self->throw("Could not read info file '$self->{infofile}': $!");
336 @lines = <$INFO>;
338 elsif (-e "$file.info") {
339 $self->warn("You did not set infofile, but I'm opening $file.info\n") if $self->{lucy_verbose};
340 $self->infofile("$file.info");
341 open my $INFO, '<', "$file.info" or $self->throw("Could not read info file '$file.info': $!");
342 @lines = <$INFO>;
344 else {
345 $self->warn("I did not find an info file. You will not be able to use all of the accessor methods.\n") if $self->{lucy_verbose};
346 @lines = ();
349 foreach (@lines) {
350 /^(\S+).+CLV\s+(\d+)\s+(\d+)$/;
351 if ($2>0 && $3>0) {
352 $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too
357 # parse rejects (and presence of poly-A if Lucy has been modified)
358 if ($self->{stderrfile}) {
359 open my $STDERR_LUCY, '<', $self->{stderrfile} or $self->throw("Could not read quality file '$self->{stderrfile}': $!");
360 @lines = <$STDERR_LUCY>;
362 elsif (-e "$file.stderr") {
363 $self->warn("You did not set stderrfile, but I'm opening $file.stderr\n") if $self->{lucy_verbose};
364 $self->stderrfile("$file.stderr");
365 open my $STDERR_LUCY, '<', "$file.stderr" or $self->throw("Could not read quality file '$file.stderr': $!");
366 @lines = <$STDERR_LUCY>;
368 else {
369 $self->warn("I did not find a standard error file. You will not be able to use all of the accessor methods.\n") if $self->{lucy_verbose};
370 @lines = ();
373 if ($self->{adv_stderr}) {
374 foreach (@lines) {
375 $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/;
376 $self->{reject}{$1} = "V" if /Vector: (\S+)/;
377 $self->{reject}{$1} = "E" if /Empty: (\S+)/;
378 $self->{reject}{$1} = "S" if m{Short/ no insert: (\S+)};
379 $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/;
380 if (/Dropped PolyA: (\S+)/) {
381 $self->{reject}{$1} = "P";
382 delete $self->{sequences}{$1};
386 else {
387 foreach (@lines) {
388 $self->{reject}{$1} = "R" if /dropping\s+(\S+)/;
393 =head2 get_Seq_Objs
395 Title : get_Seq_Objs
396 Usage : $lucyObj->get_Seq_Objs()
397 Function: returns an array of references to Bio::PrimarySeq objects
398 where -id = 'sequence name' and -seq = 'sequence'
400 Returns : array of Bio::PrimarySeq objects
401 Args : none
403 =cut
405 sub get_Seq_Objs {
406 my $self = shift;
407 my($seqobj, @seqobjs);
408 foreach my $key (sort keys %{$self->{sequences}}) {
409 $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}",
410 -id => "$key");
411 push @seqobjs, $seqobj;
413 return \@seqobjs;
416 =head2 get_Seq_Obj
418 Title : get_Seq_Obj
419 Usage : $lucyObj->get_Seq_Obj($seqname)
420 Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name'
421 and -seq = 'sequence'
422 Returns : reference to Bio::PrimarySeq object
423 Args : name of a sequence
425 =cut
427 sub get_Seq_Obj {
428 my ($self, $key) = @_;
429 my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}",
430 -id => "$key");
431 return $seqobj;
434 =head2 get_sequence_names
436 Title : get_sequence_names
437 Usage : $lucyObj->get_sequence_names
438 Function: returns reference to an array of names of the sequences analyzed by Lucy.
439 These names are required for most of the accessor methods.
440 Note: The Lucy binary will fail unless sequence names are unique.
441 Returns : array reference
442 Args : none
444 =cut
446 sub get_sequence_names {
447 my $self = shift;
448 my @keys = sort keys %{$self->{sequences}};
449 return \@keys;
452 =head2 sequence
454 Title : sequence
455 Usage : $lucyObj->sequence($seqname)
456 Function: returns the DNA sequence of one of the sequences analyzed by Lucy.
457 Returns : string
458 Args : name of a sequence
460 =cut
462 sub sequence {
463 my ($self, $key) = @_;
464 return $self->{sequences}{$key}{sequence};
467 =head2 quality
469 Title : quality
470 Usage : $lucyObj->quality($seqname)
471 Function: returns the quality values of one of the sequences analyzed by Lucy.
472 This method depends on the user having provided a quality file.
473 Returns : string
474 Args : name of a sequence
476 =cut
478 sub quality {
479 my($self, $key) = @_;
480 return $self->{sequences}{$key}{quality};
483 =head2 avg_quality
485 Title : avg_quality
486 Usage : $lucyObj->avg_quality($seqname)
487 Function: returns the average quality value for one of the sequences analyzed by Lucy.
488 Returns : float
489 Args : name of a sequence
491 =cut
493 sub avg_quality {
494 my($self, $key) = @_;
495 return $self->{sequences}{$key}{avg_quality};
498 =head2 direction
500 Title : direction
501 Usage : $lucyObj->direction($seqname)
502 Function: returns the direction for one of the sequences analyzed by Lucy
503 providing that 'fwd_desig' or 'rev_desig' were set when the
504 Lucy object was created.
505 Strings returned are: 'F' for forward, 'R' for reverse.
506 Returns : string
507 Args : name of a sequence
509 =cut
511 sub direction {
512 my($self, $key) = @_;
513 return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction};
514 return "";
517 =head2 length_raw
519 Title : length_raw
520 Usage : $lucyObj->length_raw($seqname)
521 Function: returns the length of a DNA sequence prior to quality/ vector
522 trimming by Lucy.
523 Returns : integer
524 Args : name of a sequence
526 =cut
528 sub length_raw {
529 my($self, $key) = @_;
530 return $self->{sequences}{$key}{length_raw};
533 =head2 length_clear
535 Title : length_clear
536 Usage : $lucyObj->length_clear($seqname)
537 Function: returns the length of a DNA sequence following quality/ vector
538 trimming by Lucy.
539 Returns : integer
540 Args : name of a sequence
542 =cut
544 sub length_clear {
545 my($self, $key) = @_;
546 return $self->{sequences}{$key}{length_clear};
549 =head2 start_clear
551 Title : start_clear
552 Usage : $lucyObj->start_clear($seqname)
553 Function: returns the beginning position of good quality, vector free DNA sequence
554 determined by Lucy.
555 Returns : integer
556 Args : name of a sequence
558 =cut
560 sub start_clear {
561 my($self, $key) = @_;
562 return $self->{sequences}{$key}{beg_clear};
566 =head2 end_clear
568 Title : end_clear
569 Usage : $lucyObj->end_clear($seqname)
570 Function: returns the ending position of good quality, vector free DNA sequence
571 determined by Lucy.
572 Returns : integer
573 Args : name of a sequence
575 =cut
577 sub end_clear {
578 my($self, $key) = @_;
579 return $self->{sequences}{$key}{end_clear};
582 =head2 per_GC
584 Title : per_GC
585 Usage : $lucyObj->per_GC($seqname)
586 Function: returns the percente of the good quality, vector free DNA sequence
587 determined by Lucy.
588 Returns : float
589 Args : name of a sequence
591 =cut
593 sub per_GC {
594 my($self, $key) = @_;
595 return $self->{sequences}{$key}{per_GC};
598 =head2 full_length
600 Title : full_length
601 Usage : $lucyObj->full_length($seqname)
602 Function: returns the truth value for whether or not the sequence read was
603 full length (ie. vector present on both ends of read). This method
604 depends on the user having provided the 'info' file (Lucy must be
605 run with the -debug 'info_filename' option to get this file).
606 Returns : boolean
607 Args : name of a sequence
609 =cut
611 sub full_length {
612 my($self, $key) = @_;
613 return 1 if $self->{sequences}{$key}{full_length};
614 return 0;
617 =head2 polyA
619 Title : polyA
620 Usage : $lucyObj->polyA($seqname)
621 Function: returns the truth value for whether or not a poly-A tail was detected
622 and clipped by Lucy. This method depends on the user having modified
623 the source for Lucy as outlined in DESCRIPTION and invoking Lucy with
624 the -cdna option and saving the standard error.
625 Note, the final sequence will not show the poly-A/T region.
626 Returns : boolean
627 Args : name of a sequence
629 =cut
631 sub polyA {
632 my($self, $key) = @_;
633 return 1 if $self->{sequences}{$key}{polyA};
634 return 0;
637 =head2 get_rejects
639 Title : get_rejects
640 Usage : $lucyObj->get_rejects()
641 Function: returns a hash containing names of rejects and a 1 letter code for the
642 reason Lucy rejected the sequence.
643 Q- rejected because of low quality values
644 S- sequence was short
645 V- sequence was vector
646 E- sequence was empty
647 P- poly-A/T trimming caused sequence to be too short
648 In order to get the rejects, you must provide a file with the standard
649 error from Lucy. You will only get the quality category rejects unless
650 you have modified the source and recompiled Lucy as outlined in DESCRIPTION.
651 Returns : hash reference
652 Args : none
654 =cut
656 sub get_rejects {
657 my $self = shift;
658 return $self->{reject};
661 =head2 Diff for Lucy source code
663 352a353,354
664 > /* AGW added next line */
665 > fprintf(stderr, "Empty: %s\n", seqs[i].name);
666 639a642,643
667 > /* AGW added next line */
668 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name);
669 678c682,686
670 < if (left) seqs[i].left+=left;
672 > if (left) {
673 > seqs[i].left+=left;
674 > /* AGW added next line */
675 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name);
677 681c689,693
678 < if (right) seqs[i].right-=right;
680 > if (right) {
681 > seqs[i].right-=right;
682 > /* AGW added next line */
683 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name);
685 682a695,696
686 > /* AGW added next line */
687 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name);
688 734a749,750
689 > /* AGW added next line */
690 > fprintf(stderr, "Vector: %s\n", seqs[i].name);
692 =cut
694 =head2 This patch is to be applied to lucy.c from the lucy-1.19p release
696 277a278,279
697 > /* AGW added next line */
698 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name);
699 588c590,592
700 < if ((seqs[i].len=bases)<=0)
702 > if ((seqs[i].len=bases)<=0) {
703 > /* AGW added next line */
704 > fprintf(stderr, "Empty: %s\n", seqs[i].name);
705 589a594
707 893c898,902
708 < if (left) seqs[i].left+=left;
710 > if (left) {
711 > seqs[i].left+=left;
712 > /* AGW added next line */
713 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name);
715 896c905,909
716 < if (right) seqs[i].right-=right;
718 > if (right) {
719 > seqs[i].right-=right;
720 > /* AGW added next line */
721 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name);
723 898a912,913
724 > /* AGW added next line */
725 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name);
726 949a965,966
727 > /* AGW added next line */
728 > fprintf(stderr, "Vector: %s\n", seqs[i].name);
730 =cut