Bring base classes from BioPerl-run back to BioPerl.
[bioperl-live.git] / Bio / Root / Utilities.pm
blob86bd72002b25040fd6ffba01315e4f06632aa439
1 package Bio::Root::Utilities;
2 use strict;
3 use Bio::Root::IO;
4 use Bio::Root::Exception;
5 use base qw(Bio::Root::Root Exporter);
7 =head1 SYNOPSIS
9 =head2 Object Creation
11 # Using the supplied singleton object:
12 use Bio::Root::Utilities qw(:obj);
13 $Util->some_method();
15 # Create an object manually:
16 use Bio::Root::Utilities;
17 my $util = Bio::Root::Utilities->new();
18 $util->some_method();
20 $date_stamp = $Util->date_format('yyy-mm-dd');
22 $clean = $Util->untaint($dirty);
24 $compressed = $Util->compress('/home/me/myfile.txt')
26 my ($mean, $stdev) = $Util->mean_stdev( @data );
28 $Util->authority("me@example.com");
29 $Util->mail_authority("Something you should know about...");
31 ...and a host of other methods. See below.
33 =head1 DESCRIPTION
35 Provides general-purpose utilities of potential interest to any Perl script.
37 The C<:obj> tag is a convenience that imports a $Util symbol into your
38 namespace representing a Bio::Root::Utilities object. This saves you
39 from creating your own Bio::Root::Utilities object via
40 C<Bio::Root::Utilities-E<gt>new()> or from prefixing all method calls with
41 C<Bio::Root::Utilities>, though feel free to do these things if desired.
42 Since there should normally not be a need for a script to have more
43 than one Bio::Root::Utilities object, this module thus comes with it's
44 own singleton.
46 =head1 INSTALLATION
48 This module is included with the central Bioperl distribution:
50 http://www.bioperl.org/wiki/Getting_BioPerl
51 ftp://bio.perl.org/pub/DIST
53 Follow the installation instructions included in the README file.
55 =head1 DEPENDENCIES
57 Inherits from L<Bio::Root::Root>, and uses L<Bio::Root::IO>
58 and L<Bio::Root::Exception>.
60 Relies on external executables for file compression/uncompression
61 and sending mail. No paths to these are hard coded but are located
62 as needed.
64 =head1 SEE ALSO
66 http://bioperl.org - Bioperl Project Homepage
68 =head1 ACKNOWLEDGEMENTS
70 This module was originally developed under the auspices of the
71 Saccharomyces Genome Database: http://www.yeastgenome.org/
73 =head1 AUTHOR Steve Chervitz
75 =cut
77 use vars qw(@EXPORT_OK %EXPORT_TAGS);
78 @EXPORT_OK = qw($Util);
79 %EXPORT_TAGS = ( obj => [qw($Util)],
80 std => [qw($Util)],);
82 use vars qw($ID $Util $GNU_PATH $TIMEOUT_SECS
83 @COMPRESSION_UTILS @UNCOMPRESSION_UTILS
84 $DEFAULT_NEWLINE $NEWLINE $AUTHORITY
85 @MONTHS @DAYS $BASE_YEAR $DEFAULT_CENTURY
88 $ID = 'Bio::Root::Utilities';
89 # Number of seconds to wait before timing out when reading input (taste_file())
90 $TIMEOUT_SECS = 30;
91 $NEWLINE = $ENV{'NEWLINE'} || undef;
92 $BASE_YEAR = 1900; # perl's localtime() assumes this for it's year data.
93 # TODO: update this every hundred years. Y2K-sensitive code.
94 $DEFAULT_CENTURY = $BASE_YEAR + 100;
95 @MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
96 @DAYS = qw(Sun Mon Tue Wed Thu Fri Sat);
97 # Sets the preference for compression utilities to be used by compress().
98 # The first executable in this list to be found in the current PATH will be used,
99 # unless overridden in the call to that function. See docs for details.
100 @COMPRESSION_UTILS = qw(gzip bzip2 zip compress);
101 @UNCOMPRESSION_UTILS = qw(gunzip gzip bunzip2 unzip uncompress);
103 # Default person to receive feedback from users and possibly automatic error messages.
104 $AUTHORITY = '';
106 # Note: $GNU_PATH is now deprecated, shouldn't be needed since now this module
107 # will automatically locate the compression utility in the current PATH.
108 # Retaining $GNU_PATH for backward compatibility.
110 # $GNU_PATH points to the directory containing the gzip and gunzip
111 # executables. It may be required for executing gzip/gunzip
112 # in some situations (e.g., when $ENV{PATH} doesn't contain this dir.
113 # Customize $GNU_PATH for your site if the compress() or
114 # uncompress() functions are generating exceptions.
115 $GNU_PATH = '';
116 #$GNU_PATH = '/tools/gnu/bin/';
118 $DEFAULT_NEWLINE = "\012"; # \n (used if get_newline() fails for some reason)
120 ## Static UTIL object.
121 $Util = Bio::Root::Root->new();
124 =head2 date_format
126 Title : date_format
127 Usage : $Util->date_format( [FMT], [DATE])
128 Purpose : -- Get a string containing the formatted date or time
129 : taken when this routine is invoked.
130 : -- Provides a way to avoid using `date`.
131 : -- Provides an interface to localtime().
132 : -- Interconverts some date formats.
134 : (For additional functionality, use Date::Manip or
135 : Date::DateCalc available from CPAN).
136 Example : $Util->date_format();
137 : $date = $Util->date_format('yyyy-mmm-dd', '11/22/92');
138 Returns : String (unless 'list' is provided as argument, see below)
140 : 'yyyy-mm-dd' = 1996-05-03 # default format.
141 : 'yyyy-dd-mm' = 1996-03-05
142 : 'yyyy-mmm-dd' = 1996-May-03
143 : 'd-m-y' = 3-May-1996
144 : 'd m y' = 3 May 1996
145 : 'dmy' = 3may96
146 : 'mdy' = May 3, 1996
147 : 'ymd' = 96may3
148 : 'md' = may3
149 : 'year' = 1996
150 : 'hms' = 23:01:59 # when not converting a format, 'hms' can be
151 : # tacked on to any of the above options
152 : # to add the time stamp: eg 'dmyhms'
153 : 'full' | 'unix' = UNIX-style date: Tue May 5 22:00:00 1998
154 : 'list' = the contents of localtime(time) in an array.
155 Argument : (all are optional)
156 : FMT = yyyy-mm-dd | yyyy-dd-mm | yyyy-mmm-dd |
157 : mdy | ymd | md | d-m-y | hms | hm
158 : ('hms' may be appended to any of these to
159 : add a time stamp)
161 : DATE = String containing date to be converted.
162 : Acceptable input formats:
163 : 12/1/97 (for 1 December 1997)
164 : 1997-12-01
165 : 1997-Dec-01
166 Throws :
167 Comments : If you don't care about formatting or using backticks, you can
168 : always use: $date = `date`;
170 : For more features, use Date::Manip.pm, (which I should
171 : probably switch to...)
173 See Also : L<file_date()|file_date>, L<month2num()|month2num>
175 =cut
177 #---------------'
178 sub date_format {
179 #---------------
180 my $self = shift;
181 my $option = shift;
182 my $date = shift; # optional date to be converted.
184 my($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
186 $option ||= 'yyyy-mm-dd';
188 my ($month_txt, $day_txt, $month_num, $fullYear);
189 my ($converting, @date);
191 # Load a supplied date for conversion:
192 if(defined($date) && ($date =~ /[\D-]+/)) {
193 $converting = 1;
194 if( $date =~ m{/}) {
195 ($mon,$mday,$year) = split(m{/}, $date);
196 } elsif($date =~ /(\d{4})-(\d{1,2})-(\d{1,2})/) {
197 ($year,$mon,$mday) = ($1, $2, $3);
198 } elsif($date =~ /(\d{4})-(\w{3,})-(\d{1,2})/) {
199 ($year,$mon,$mday) = ($1, $2, $3);
200 $mon = $self->month2num($2);
201 } else {
202 print STDERR "\n*** Unsupported input date format: $date\n";
204 if(length($year) == 4) {
205 $fullYear = $year;
206 $year = substr $year, 2;
207 } else {
208 # Heuristics to guess what century was intended when a 2-digit year is given
209 # If number is over 50, assume it's for prev century; under 50 = default century.
210 # TODO: keep an eye on this Y2K-sensitive code
211 if ($year > 50) {
212 $fullYear = $DEFAULT_CENTURY + $year - 100;
213 } else {
214 $fullYear = $DEFAULT_CENTURY + $year;
217 $mon -= 1;
218 } else {
219 ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = @date =
220 localtime(($date ? $date : time()));
221 return @date if $option =~ /list/i;
222 $fullYear = $BASE_YEAR+$year;
224 $month_txt = $MONTHS[$mon];
225 $day_txt = $DAYS[$wday] if defined $wday;
226 $month_num = $mon+1;
228 # print "sec: $sec, min: $min, hour: $hour, month: $mon, m-day: $mday, year: $year\nwday: $wday, yday: $yday, dst: $isdst";<STDIN>;
230 if( $option =~ /yyyy-mm-dd/i ) {
231 $date = sprintf "%4d-%02d-%02d",$fullYear,$month_num,$mday;
232 } elsif( $option =~ /yyyy-dd-mm/i ) {
233 $date = sprintf "%4d-%02d-%02d",$fullYear,$mday,$month_num;
234 } elsif( $option =~ /yyyy-mmm-dd/i ) {
235 $date = sprintf "%4d-%3s-%02d",$fullYear,$month_txt,$mday;
236 } elsif( $option =~ /full|unix/i ) {
237 $date = sprintf "%3s %3s %2d %02d:%02d:%02d %d",$day_txt, $month_txt, $mday, $hour, $min, $sec, $fullYear;
238 } elsif( $option =~ /mdy/i ) {
239 $date = "$month_txt $mday, $fullYear";
240 } elsif( $option =~ /ymd/i ) {
241 $date = $year."\l$month_txt$mday";
242 } elsif( $option =~ /dmy/i ) {
243 $date = $mday."\l$month_txt$year";
244 } elsif( $option =~ /md/i ) {
245 $date = "\l$month_txt$mday";
246 } elsif( $option =~ /d-m-y/i ) {
247 $date = "$mday-$month_txt-$fullYear";
248 } elsif( $option =~ /d m y/i ) {
249 $date = "$mday $month_txt $fullYear";
250 } elsif( $option =~ /year/i ) {
251 $date = $fullYear;
252 } elsif( $option =~ /dmy/i ) {
253 $date = $mday.'-'.$month_txt.'-'.$fullYear;
254 } elsif($option and $option !~ /hms/i) {
255 print STDERR "\n*** Unrecognized date format request: $option\n";
258 if( $option =~ /hms/i and not $converting) {
259 $date .= " $hour:$min:$sec" if $date;
260 $date ||= "$hour:$min:$sec";
263 return $date || join(" ", @date);
267 =head2 month2num
269 Title : month2num
270 Purpose : Converts a string containing a name of a month to integer
271 : representing the number of the month in the year.
272 Example : $Util->month2num("march"); # returns 3
273 Argument : The string argument must contain at least the first
274 : three characters of the month's name. Case insensitive.
275 Throws : Exception if the conversion fails.
277 =cut
279 #--------------'
280 sub month2num {
281 #--------------
282 my ($self, $str) = @_;
284 # Get string in proper format for conversion.
285 $str = substr($str, 0, 3);
286 for my $month (0..$#MONTHS) {
287 return $month+1 if $str =~ /$MONTHS[$month]/i;
289 $self->throw("Invalid month name: $str");
292 =head2 num2month
294 Title : num2month
295 Purpose : Does the opposite of month2num.
296 : Converts a number into a string containing a name of a month.
297 Example : $Util->num2month(3); # returns 'Mar'
298 Throws : Exception if supplied number is out of range.
300 =cut
302 #-------------
303 sub num2month {
304 #-------------
305 my ($self, $num) = @_;
307 $self->throw("Month out of range: $num") if $num < 1 or $num > 12;
308 return $MONTHS[$num-1];
311 =head2 compress
313 Title : compress
314 Usage : $Util->compress(full-path-filename);
315 : $Util->compress(<named parameters>);
316 Purpose : Compress a file.
317 Example : $Util->compress("/usr/people/me/data.txt");
318 : $Util->compress(-file=>"/usr/people/me/data.txt",
319 : -tmp=>1,
320 : -outfile=>"/usr/people/share/data.txt.gz",
321 : -exe=>"/usr/local/bin/fancyzip");
322 Returns : String containing full, absolute path to compressed file
323 Argument : Named parameters (case-insensitive):
324 : -FILE => String (name of file to be compressed, full path).
325 : If the supplied filename ends with '.gz' or '.Z',
326 : that extension will be removed before attempting to compress.
327 : Optional:
328 : -TMP => boolean. If true, (or if user is not the owner of the file)
329 : the file is compressed to a temp file. If false, file may be
330 : clobbered with the compressed version (if using a utility like
331 : gzip, which is the default)
332 : -OUTFILE => String (name of the output compressed file, full path).
333 : -EXE => Name of executable for compression utility to use.
334 : Will supersede those in @COMPRESSION_UTILS defined by
335 : this module. If the absolute path to the executable is not provided,
336 : it will be searched in the PATH env variable.
337 Throws : Exception if file cannot be compressed.
338 : If user is not owner of the file, generates a warning and compresses to
339 : a tmp file. To avoid this warning, use the -o file test operator
340 : and call this function with -TMP=>1.
341 Comments : Attempts to compress using utilities defined in the @COMPRESSION_UTILS
342 : defined by this module, in the order defined. The first utility that is
343 : found to be executable will be used. Any utility defined in optional -EXE param
344 : will be tested for executability first.
345 : To minimize security risks, the -EXE parameter value is untained using
346 : the untaint() method of this module (in 'relaxed' mode to permit path separators).
348 See Also : L<uncompress()|uncompress>
350 =cut
352 #------------'
353 sub compress {
354 #------------
355 my ($self, @args) = @_;
356 # This method formerly didn't use named params and expected fileName, tmp
357 # in that order. This should be backward compatibile.
358 my ($fileName, $tmp, $outfile, $exe) = $self->_rearrange([qw(FILE TMP OUTFILE EXE)], @args);
359 my ($file, $get, $fmt);
361 # in case the supplied name already has a compressed extension
362 if($fileName =~ /(\.gz|\.Z|\.bz2|\.zip)$/) { $fileName =~ s/$1$//; };
363 $self->debug("compressing file $fileName");
365 my @util_to_use = @COMPRESSION_UTILS;
367 if (defined $exe){
368 $exe = $self->untaint($exe, 1);
369 unshift @util_to_use, $exe;
372 my @checked = @util_to_use;
373 $exe ||= '';
374 while (not -x $exe and scalar(@util_to_use)) {
375 $exe = $self->find_exe(shift @util_to_use);
378 unless (-x $exe) {
379 $self->throw("Can't find compression utility. Looked for @checked");
382 my ($compressed, @cmd, $handle);
384 if(defined($outfile) or $tmp or not -o $fileName) {
385 if (defined $outfile) {
386 $compressed = $outfile;
387 } else {
388 # obtain a temporary file name (not using the handle)
389 # and insert some special text to flag it as a bioperl-based temp file
390 my $io = Bio::Root::IO->new();
391 ($handle, $compressed) = $io->tempfile();
392 $compressed .= '.tmp.bioperl.gz';
395 # Use double quotes if executable path have empty spaces
396 if ($exe =~ m/ /) {
397 $exe = "\"$exe\"";
400 if ($exe =~ /gzip|bzip2|compress/) {
401 @cmd = ("$exe -f < \"$fileName\" > \"$compressed\"");
402 } elsif ($exe eq 'zip') {
403 @cmd = ("$exe -r \"$fileName.zip\" \"$fileName\"");
405 not $tmp and
406 $self->warn("Not owner of file $fileName. Compressing to temp file $compressed.");
407 $tmp = 1;
408 } else {
409 # Need to compute the compressed name based on exe since we're returning it.
410 $compressed = $fileName;
411 if ($exe =~ /gzip/) {
412 $compressed .= '.gz';
413 } elsif ($exe =~ /bzip2/) {
414 $compressed .= '.bz2';
415 } elsif ($exe =~ /zip/) {
416 $compressed .= '.zip';
417 } elsif ($exe =~ /compress/) {
418 $compressed .= '.Z';
420 if ($exe =~ /gzip|bzip2|compress/) {
421 @cmd = ($exe, '-f', $fileName);
422 } elsif ($exe eq 'zip') {
423 @cmd = ($exe, '-r', "$compressed", $fileName);
427 if(system(@cmd) != 0) {
428 $self->throw( -class => 'Bio::Root::SystemException',
429 -text => "Failed to compress file $fileName using $exe: $!");
432 return $compressed;
435 =head2 uncompress
437 Title : uncompress
438 Usage : $Util->uncompress(full-path-filename);
439 : $Util->uncompress(<named parameters>);
440 Purpose : Uncompress a file.
441 Example : $Util->uncompress("/usr/people/me/data.txt");
442 : $Util->uncompress(-file=>"/usr/people/me/data.txt.gz",
443 : -tmp=>1,
444 : -outfile=>"/usr/people/share/data.txt",
445 : -exe=>"/usr/local/bin/fancyzip");
446 Returns : String containing full, absolute path to uncompressed file
447 Argument : Named parameters (case-insensitive):
448 : -FILE => String (name of file to be uncompressed, full path).
449 : If the supplied filename ends with '.gz' or '.Z',
450 : that extension will be removed before attempting to uncompress.
451 : Optional:
452 : -TMP => boolean. If true, (or if user is not the owner of the file)
453 : the file is uncompressed to a temp file. If false, file may be
454 : clobbered with the uncompressed version (if using a utility like
455 : gzip, which is the default)
456 : -OUTFILE => String (name of the output uncompressed file, full path).
457 : -EXE => Name of executable for uncompression utility to use.
458 : Will supersede those in @UNCOMPRESSION_UTILS defined by
459 : this module. If the absolute path to the executable is not provided,
460 : it will be searched in the PATH env variable.
461 Throws : Exception if file cannot be uncompressed.
462 : If user is not owner of the file, generates a warning and uncompresses to
463 : a tmp file. To avoid this warning, use the -o file test operator
464 : and call this function with -TMP=>1.
465 Comments : Attempts to uncompress using utilities defined in the @UNCOMPRESSION_UTILS
466 : defined by this module, in the order defined. The first utility that is
467 : found to be executable will be used. Any utility defined in optional -EXE param
468 : will be tested for executability first.
469 : To minimize security risks, the -EXE parameter value is untained using
470 : the untaint() method of this module (in 'relaxed' mode to permit path separators).
472 See Also : L<compress()|compress>
474 =cut
476 #------------'
477 sub uncompress {
478 #------------
479 my ($self, @args) = @_;
480 # This method formerly didn't use named params and expected fileName, tmp
481 # in that order. This should be backward compatibile.
482 my ($fileName, $tmp, $outfile, $exe) = $self->_rearrange([qw(FILE TMP OUTFILE EXE)], @args);
483 my ($file, $get, $fmt);
485 # in case the supplied name lacks a compressed extension
486 if(not $fileName =~ /(\.gz|\.Z|\.bz2|\.zip)$/) { $fileName .= $1; };
487 $self->debug("uncompressing file $fileName");
489 my @util_to_use = @UNCOMPRESSION_UTILS;
491 if (defined $exe){
492 $exe = $self->untaint($exe, 1);
493 unshift @util_to_use, $exe;
496 $exe ||= '';
497 while (not -x $exe and scalar(@util_to_use)) {
498 $exe = $self->find_exe(shift @util_to_use);
501 unless (-x $exe) {
502 $self->throw("Can't find compression utility. Looked for @util_to_use");
505 my ($uncompressed, @cmd, $handle);
507 $uncompressed = $fileName;
508 $uncompressed =~ s/\.\w+$//;
510 if(defined($outfile) or $tmp or not -o $fileName) {
511 if (defined $outfile) {
512 $uncompressed = $outfile;
513 } else {
514 # obtain a temporary file name (not using the handle)
515 my $io = Bio::Root::IO->new();
516 ($handle, $uncompressed) = $io->tempfile();
517 # insert some special text to flag it as a bioperl-based temp file
518 $uncompressed .= '.tmp.bioperl';
521 # Use double quotes if executable path have empty spaces
522 if ($exe =~ m/ /) {
523 $exe = "\"$exe\"";
526 if ($exe =~ /gunzip|bunzip2|uncompress/) {
527 @cmd = ("$exe -f < \"$fileName\" > \"$uncompressed\"");
528 } elsif ($exe =~ /gzip/) {
529 @cmd = ("$exe -df < \"$fileName\" > \"$uncompressed\"");
530 } elsif ($exe eq 'unzip') {
531 @cmd = ("$exe -p \"$fileName\" > \"$uncompressed\"");
533 not $tmp and
534 $self->warn("Not owner of file $fileName. Uncompressing to temp file $uncompressed.");
535 $tmp = 1;
536 } else {
537 if ($exe =~ /gunzip|bunzip2|uncompress/) {
538 @cmd = ($exe, '-f', $fileName);
539 } elsif ($exe =~ /gzip/) {
540 @cmd = ($exe, '-df', $fileName);
541 } elsif ($exe eq 'zip') {
542 @cmd = ($exe, $fileName);
546 if(system(@cmd) != 0) {
547 $self->throw( -class => 'Bio::Root::SystemException',
548 -text => "Failed to uncompress file $fileName using $exe: $!");
551 return $uncompressed;
555 =head2 file_date
557 Title : file_date
558 Usage : $Util->file_date( filename [,date_format])
559 Purpose : Obtains the date of a given file.
560 : Provides flexible formatting via date_format().
561 Returns : String = date of the file as: yyyy-mm-dd (e.g., 1997-10-15)
562 Argument : filename = string, full path name for file
563 : date_format = string, desired format for date (see date_format()).
564 : Default = yyyy-mm-dd
565 Thows : Exception if no file is provided or does not exist.
566 Comments : Uses the mtime field as obtained by stat().
568 =cut
570 #--------------
571 sub file_date {
572 #--------------
573 my ($self, $file, $fmt) = @_;
575 $self->throw("No such file: $file") if not $file or not -e $file;
577 $fmt ||= 'yyyy-mm-dd';
579 my @file_data = stat($file);
580 return $self->date_format($fmt, $file_data[9]); # mtime field
584 =head2 untaint
586 Title : untaint
587 Purpose : To remove nasty shell characters from untrusted data
588 : and allow a script to run with the -T switch.
589 : Potentially dangerous shell meta characters: &;`'\"|*?!~<>^()[]{}$\n\r
590 : Accept only the first block of contiguous characters:
591 : Default allowed chars = "-\w.', ()"
592 : If $relax is true = "-\w.', ()\/=%:^<>*"
593 Usage : $Util->untaint($value, $relax)
594 Returns : String containing the untained data.
595 Argument: $value = string
596 : $relax = boolean
597 Comments:
598 This general untaint() function may not be appropriate for every situation.
599 To allow only a more restricted subset of special characters
600 (for example, untainting a regular expression), then using a custom
601 untainting mechanism would permit more control.
603 Note that special trusted vars (like $0) require untainting.
605 =cut
607 #------------`
608 sub untaint {
609 #------------
610 my($self,$value,$relax) = @_;
611 $relax ||= 0;
612 my $untainted;
614 $self->debug("\nUNTAINT: $value\n");
616 unless (defined $value and $value ne '') {
617 return $value;
620 if( $relax ) {
621 $value =~ /([-\w.\', ()\/=%:^<>*]+)/;
622 $untainted = $1
623 # } elsif( $relax == 2 ) { # Could have several degrees of relax.
624 # $value =~ /([-\w.\', ()\/=%:^<>*]+)/;
625 # $untainted = $1
626 } else {
627 $value =~ /([-\w.\', ()]+)/;
628 $untainted = $1
631 $self->debug("UNTAINTED: $untainted\n");
633 $untainted;
637 =head2 mean_stdev
639 Title : mean_stdev
640 Usage : ($mean, $stdev) = $Util->mean_stdev( @data )
641 Purpose : Calculates the mean and standard deviation given a list of numbers.
642 Returns : 2-element list (mean, stdev)
643 Argument : list of numbers (ints or floats)
644 Thows : n/a
646 =cut
648 #---------------
649 sub mean_stdev {
650 #---------------
651 my ($self, @data) = @_;
652 return (undef, undef) if not @data; # case of empty @data list
653 my $mean = 0;
654 my $N = 0;
655 foreach my $num (@data) {
656 $mean += $num;
657 $N++
659 $mean /= $N;
660 my $sum_diff_sqd = 0;
661 foreach my $num (@data) {
662 $sum_diff_sqd += ($mean - $num) * ($mean - $num);
664 # if only one element in @data list, unbiased stdev is undefined
665 my $stdev = $N <= 1 ? undef : sqrt( $sum_diff_sqd / ($N-1) );
666 return ($mean, $stdev);
670 =head2 count_files
672 Title : count_files
673 Purpose : Counts the number of files/directories within a given directory.
674 : Also reports the number of text and binary files in the dir
675 : as well as names of these files and directories.
676 Usage : count_files(\%data)
677 : $data{-DIR} is the directory to be analyzed. Default is ./
678 : $data{-PRINT} = 0|1; if 1, prints results to STDOUT, (default=0).
679 Argument : Hash reference (empty)
680 Returns : n/a;
681 : Modifies the hash ref passed in as the sole argument.
682 : $$href{-TOTAL} scalar
683 : $$href{-NUM_TEXT_FILES} scalar
684 : $$href{-NUM_BINARY_FILES} scalar
685 : $$href{-NUM_DIRS} scalar
686 : $$href{-T_FILE_NAMES} array ref
687 : $$href{-B_FILE_NAMES} array ref
688 : $$href{-DIRNAMES} array ref
690 =cut
692 #----------------
693 sub count_files {
694 #----------------
695 my $self = shift;
696 my $href = shift; # Reference to an empty hash.
697 my( $name, @fileLine);
698 my $dir = $$href{-DIR} || './'; # THIS IS UNIX SPECIFIC? FIXME/TODO
699 my $print = $$href{-PRINT} || 0;
701 ### Make sure $dir ends with /
702 $dir !~ m{/$} and do{ $dir .= '/'; $$href{-DIR} = $dir; };
704 open ( my $PIPE, "ls -1 $dir |" ) || $self->throw("Can't open input pipe: $!");
706 ### Initialize the hash data.
707 $$href{-TOTAL} = 0;
708 $$href{-NUM_TEXT_FILES} = $$href{-NUM_BINARY_FILES} = $$href{-NUM_DIRS} = 0;
709 $$href{-T_FILE_NAMES} = [];
710 $$href{-B_FILE_NAMES} = [];
711 $$href{-DIR_NAMES} = [];
712 while( my $line = <$PIPE> ) {
713 chomp();
714 $$href{-TOTAL}++;
715 if( -T $dir.$line ) {
716 $$href{-NUM_TEXT_FILES}++;
717 push @{$$href{-T_FILE_NAMES}}, $line; }
718 if( -B $dir.$line and not -d $dir.$line) {
719 $$href{-NUM_BINARY_FILES}++;
720 push @{$$href{-B_FILE_NAMES}}, $line; }
721 if( -d $dir.$line ) {
722 $$href{-NUM_DIRS}++;
723 push @{$$href{-DIR_NAMES}}, $line; }
725 close $PIPE;
727 if( $print) {
728 printf( "\n%4d %s\n", $$href{-TOTAL}, "total files+dirs in $dir");
729 printf( "%4d %s\n", $$href{-NUM_TEXT_FILES}, "text files");
730 printf( "%4d %s\n", $$href{-NUM_BINARY_FILES}, "binary files");
731 printf( "%4d %s\n", $$href{-NUM_DIRS}, "directories");
736 =head2 file_info
738 Title : file_info
739 Purpose : Obtains a variety of date for a given file.
740 : Provides an interface to Perl's stat().
741 Status : Under development. Not ready. Don't use!
743 =cut
745 #--------------
746 sub file_info {
747 #--------------
748 my ($self, %param) = @_;
749 my ($file, $get, $fmt) = $self->_rearrange([qw(FILE GET FMT)], %param);
750 $get ||= 'all';
751 $fmt ||= 'yyyy-mm-dd';
753 my($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size,
754 $atime, $mtime, $ctime, $blksize, $blocks) = stat $file;
756 if($get =~ /date/i) {
757 ## I can get the elapsed time since the file was modified but
758 ## it's not so straightforward to get the date in a nice format...
759 ## Think about using a standard CPAN module for this, like
760 ## Date::Manip or Date::DateCalc.
762 my $date = $mtime;
763 my $elsec = time - $mtime;
764 printf "\nFile age: %.0f sec %.0f hrs %.0f days", $elsec, $elsec/3600, $elsec/(3600*24);<STDIN>;
765 my $days = sprintf "%.0f", $elsec/(3600*24);
766 } elsif($get eq 'all') {
767 return stat $file;
771 =head2 delete
773 Title : delete
774 Purpose :
776 =cut
778 #------------
779 sub delete {
780 #------------
781 my $self = shift;
782 my $fileName = shift;
783 if(not -e $fileName) {
784 $self->throw("Could not delete file '$fileName': Does not exist.");
785 } elsif(not -o $fileName) {
786 $self->throw("Could not delete file '$fileName': Not owner.");
788 my $ulval = unlink($fileName) > 0
789 or $self->throw("Failed to delete file '$fileName': $!");
793 =head2 create_filehandle
795 Usage : $object->create_filehandle(<named parameters>);
796 Purpose : Create a FileHandle object from a file or STDIN.
797 : Mainly used as a helper method by read() and get_newline().
798 Example : $data = $object->create_filehandle(-FILE =>'usr/people/me/data.txt')
799 Argument : Named parameters (case-insensitive):
800 : (all optional)
801 : -CLIENT => object reference for the object submitting
802 : the request. Default = $Util.
803 : -FILE => string (full path to file) or a reference
804 : to a FileHandle object or typeglob. This is an
805 : optional parameter (if not defined, STDIN is used).
806 Returns : Reference to a FileHandle object.
807 Throws : Exception if cannot open a supplied file or if supplied with a
808 : reference that is not a FileHandle ref.
809 Comments : If given a FileHandle reference, this method simply returns it.
810 : This method assumes the user wants to read ascii data. So, if
811 : the file is binary, it will be treated as a compressed (gzipped)
812 : file and access it using gzip -ce. The problem here is that not
813 : all binary files are necessarily compressed. Therefore,
814 : this method should probably have a -mode parameter to
815 : specify ascii or binary.
817 See Also : L<get_newline()|get_newline>
819 =cut
821 #---------------------
822 sub create_filehandle {
823 #---------------------
824 my($self, @param) = @_;
825 my($client, $file, $handle) =
826 $self->_rearrange([qw( CLIENT FILE HANDLE )], @param);
828 if(not ref $client) { $client = $self; }
829 $file ||= $handle;
830 if( $client->can('file')) {
831 $file = $client->file($file);
834 my $FH;
835 my ($handle_ref);
837 if($handle_ref = ref($file)) {
838 if($handle_ref eq 'FileHandle') {
839 $FH = $file;
840 $client->{'_input_type'} = "FileHandle";
841 } elsif($handle_ref eq 'GLOB') {
842 $FH = $file;
843 $client->{'_input_type'} = "Glob";
844 } else {
845 $self->throw(-class => 'Bio::Root::IOException',
846 -text => "Could not read file '$file': Not a FileHandle or GLOB ref.");
848 $self->verbose > 0 and printf STDERR "$ID: reading data from FileHandle\n";
850 } elsif($file) {
851 $client->{'_input_type'} = "FileHandle for $file";
853 # Use gzip -cd to access compressed data.
854 if( -B $file ) {
855 $client->{'_input_type'} .= " (compressed)";
856 my $gzip = $self->find_exe('gzip');
857 $file = "$gzip -cd $file |"
860 require FileHandle;
861 $FH = FileHandle->new();
862 open ($FH, $file) || $self->throw(-class=>'Bio::Root::FileOpenException',
863 -text =>"Could not access data file '$file': $!");
864 $self->verbose > 0 and printf STDERR "$ID: reading data from file '$file'\n";
866 } else {
867 # Read from STDIN.
868 $FH = \*STDIN;
869 $self->verbose > 0 and printf STDERR "$ID: reading data from STDIN\n";
870 $client->{'_input_type'} = "STDIN";
873 return $FH;
876 =head2 get_newline
878 Usage : $object->get_newline(<named parameters>);
879 Purpose : Determine the character(s) used for newlines in a given file or
880 : input stream. Delegates to Bio::Root::Utilities::get_newline()
881 Example : $data = $object->get_newline(-CLIENT => $anObj,
882 : -FILE =>'usr/people/me/data.txt')
883 Argument : Same arguemnts as for create_filehandle().
884 Returns : Reference to a FileHandle object.
885 Throws : Propagates any exceptions thrown by Bio::Root::Utilities::get_newline().
887 See Also : L<taste_file()|taste_file>, L<create_filehandle()|create_filehandle>
889 =cut
891 #-----------------
892 sub get_newline {
893 #-----------------
894 my($self, @param) = @_;
896 return $NEWLINE if defined $NEWLINE;
898 my($client ) =
899 $self->_rearrange([qw( CLIENT )], @param);
901 my $FH = $self->create_filehandle(@param);
903 if(not ref $client) { $client = $self; }
905 if($client->{'_input_type'} =~ /STDIN|Glob|compressed/) {
906 # Can't taste from STDIN since we can't seek 0 on it.
907 # Are other non special Glob refs seek-able?
908 # Attempt to guess newline based on platform.
909 # Not robust since we could be reading Unix files on a Mac, e.g.
910 if(defined $ENV{'MACPERL'}) {
911 $NEWLINE = "\015"; # \r
912 } else {
913 $NEWLINE = "\012"; # \n
915 } else {
916 $NEWLINE = $self->taste_file($FH);
919 close ($FH) unless ($client->{'_input_type'} eq 'STDIN' ||
920 $client->{'_input_type'} eq 'FileHandle' ||
921 $client->{'_input_type'} eq 'Glob' );
923 delete $client->{'_input_type'};
925 return $NEWLINE || $DEFAULT_NEWLINE;
929 =head2 taste_file
931 Usage : $object->taste_file( <FileHandle> );
932 : Mainly a utility method for get_newline().
933 Purpose : Sample a filehandle to determine the character(s) used for a newline.
934 Example : $char = $Util->taste_file($FH)
935 Argument : Reference to a FileHandle object.
936 Returns : String containing an octal represenation of the newline character string.
937 : Unix = "\012" ("\n")
938 : Win32 = "\012\015" ("\r\n")
939 : Mac = "\015" ("\r")
940 Throws : Exception if no input is read within $TIMEOUT_SECS seconds.
941 : Exception if argument is not FileHandle object reference.
942 : Warning if cannot determine neewline char(s).
943 Comments : Based on code submitted by Vicki Brown (vlb@deltagen.com).
945 See Also : L<get_newline()|get_newline>
947 =cut
949 #---------------
950 sub taste_file {
951 #---------------
952 my ($self, $FH) = @_;
953 my $BUFSIZ = 256; # Number of bytes read from the file handle.
954 my ($buffer, $octal, $str, $irs, $i);
956 ref($FH) eq 'FileHandle' or $self->throw("Can't taste file: not a FileHandle ref");
958 $buffer = '';
960 # this is a quick hack to check for availability of alarm(); just copied
961 # from Bio/Root/IOManager.pm HL 02/19/01
962 my $alarm_available = 1;
963 eval {
964 alarm(0);
966 if($@) {
967 # alarm() not available (ActiveState perl for win32 doesn't have it.
968 # See jitterbug PR#98)
969 $alarm_available = 0;
971 $SIG{ALRM} = sub { die "Timed out!"; };
972 my $result;
973 eval {
974 $alarm_available && alarm( $TIMEOUT_SECS );
975 $result = read($FH, $buffer, $BUFSIZ); # read the $BUFSIZ characters of file
976 $alarm_available && alarm(0);
978 if($@ =~ /Timed out!/) {
979 $self->throw( "Timed out while waiting for input.",
980 "Timeout period = $TIMEOUT_SECS seconds.\n"
981 ."For longer time before timing out, edit \$TIMEOUT_SECS in Bio::Root::Utilities.pm.");
983 } elsif(not $result) {
984 my $err = $@;
985 $self->throw("read taste failed to read from FileHandle.", $err);
987 } elsif($@ =~ /\S/) {
988 my $err = $@;
989 $self->throw("Unexpected error during read: $err");
992 seek($FH, 0, 0) or $self->throw("seek failed to seek 0 on FileHandle.");
994 my @chars = split(//, $buffer);
995 my $flavor;
997 for ($i = 0; $i <$BUFSIZ; $i++) {
998 if (($chars[$i] eq "\012")) {
999 unless ($chars[$i-1] eq "\015") {
1000 $flavor='Unix';
1001 $octal = "\012";
1002 $str = '\n';
1003 $irs = "^J";
1004 last;
1006 } elsif (($chars[$i] eq "\015") && ($chars[$i+1] eq "\012")) {
1007 $flavor='DOS';
1008 $octal = "\015\012";
1009 $str = '\r\n';
1010 $irs = "^M^J";
1011 last;
1012 } elsif (($chars[$i] eq "\015")) {
1013 $flavor='Mac';
1014 $octal = "\015";
1015 $str = '\r';
1016 $irs = "^M";
1017 last;
1020 if (not $octal) {
1021 $self->warn("Could not determine newline char. Using '\012'");
1022 $octal = "\012";
1023 } else {
1024 #print STDERR "FLAVOR=$flavor, NEWLINE CHAR = $irs\n";
1026 return($octal);
1029 =head2 file_flavor
1031 Usage : $object->file_flavor( <filename> );
1032 Purpose : Returns the 'flavor' of a given file (unix, dos, mac)
1033 Example : print "$file has flavor: ", $Util->file_flavor($file);
1034 Argument : filename = string, full path name for file
1035 Returns : String describing flavor of file and handy info about line endings.
1036 : One of these is returned:
1037 : unix (\n or 012 or ^J)
1038 : dos (\r\n or 015,012 or ^M^J)
1039 : mac (\r or 015 or ^M)
1040 : unknown
1041 Throws : Exception if argument is not a file
1042 : Propagates any exceptions thrown by Bio::Root::Utilities::get_newline().
1044 See Also : L<get_newline()|get_newline>, L<taste_file()|taste_file>
1046 =cut
1048 #---------------
1049 sub file_flavor {
1050 #---------------
1051 my ($self, $file) = @_;
1052 my %flavors=("\012" =>'unix (\n or 012 or ^J)',
1053 "\015\012" =>'dos (\r\n or 015,012 or ^M^J)',
1054 "\015" =>'mac (\r or 015 or ^M)'
1057 -f $file or $self->throw("Could not determine flavor: arg '$file' is either non existant or is not a file.\n");
1058 my $octal = $self->get_newline($file);
1059 my $flavor = $flavors{$octal} || "unknown";
1060 return $flavor;
1063 ######################################
1064 ##### Mail Functions ########
1065 ######################################
1067 =head2 mail_authority
1069 Title : mail_authority
1070 Usage : $Util->mail_authority( $message )
1071 Purpose : Syntactic sugar to send email to $Bio::Root::Global::AUTHORITY
1073 See Also : L<send_mail()|send_mail>
1075 =cut
1077 #---------------
1078 sub mail_authority {
1079 #---------------
1080 my( $self, $message ) = @_;
1081 my $script = $self->untaint($0,1);
1083 my $email = $self->{'_auth_email'} || $AUTHORITY;
1084 if (defined $email) {
1085 $self->send_mail( -TO=>$AUTHORITY, -SUBJ=>$script, -MSG=>$message);
1086 } else {
1087 $self->throw("Can't email authority. No email defined.");
1091 =head2 authority
1093 Title : authority
1094 Usage : $Util->authority('admin@example.com');
1095 Purpose : Set/get the email address that should be notified by mail_authority()
1097 See Also : L<mail_authority()|mail_authority>
1099 =cut
1101 #-------------
1102 sub authority {
1103 #-------------
1104 my( $self, $email ) = @_;
1105 $self->{'_auth_email'} = $email if defined $email;
1106 return $self->{'_auth_email'};
1110 =head2 send_mail
1112 Title : send_mail
1113 Usage : $Util->send_mail( named_parameters )
1114 Purpose : Provides an interface to mail or sendmail, if available
1115 Returns : n/a
1116 Argument : Named parameters: (case-insensitive)
1117 : -TO => e-mail address to send to
1118 : -SUBJ => subject for message (optional)
1119 : -MSG => message to be sent (optional)
1120 : -CC => cc: e-mail address (optional)
1121 Thows : Exception if TO: address appears bad or is missing.
1122 : Exception if mail cannot be sent.
1123 Comments : Based on TomC's tip at:
1124 : http://www.perl.com/CPAN/doc/FMTEYEWTK/safe_shellings
1126 : Using default 'From:' information.
1127 : sendmail options used:
1128 : -t: ignore the address given on the command line and
1129 : get To:address from the e-mail header.
1130 : -oi: prevents send_mail from ending the message if it
1131 : finds a period at the start of a line.
1133 See Also : L<mail_authority()|mail_authority>
1135 =cut
1138 #-------------
1139 sub send_mail {
1140 #-------------
1141 my( $self, @param) = @_;
1142 my($recipient,$subj,$message,$cc) = $self->_rearrange([qw(TO SUBJ MSG CC)],@param);
1144 $self->throw("Invalid or missing e-mail address: $recipient")
1145 if not $recipient =~ /\S+\@\S+/;
1147 $subj ||= 'empty subject'; $message ||= '';
1149 # Best to use mail rather than sendmail. Permissions on sendmail in
1150 # linux distros have been significantly locked down in recent years,
1151 # due to the perception that it is insecure.
1152 my ($exe, $ccinfo);
1153 if ($exe = $self->find_exe('mail')) {
1154 if (defined $cc) {
1155 $ccinfo = "-c $cc";
1157 $self->debug("send_mail: $exe -s '$subj' $ccinfo $recipient\n");
1158 open (MAIL, "| $exe -s '$subj' $ccinfo $recipient") ||
1159 $self->throw("Can't send email: mail cannot fork: $!");
1160 print MAIL <<QQ_EOFM_QQ;
1161 $message
1162 QQ_EOFM_QQ
1163 $? and $self->warn("mail didn't exit nicely: $?");
1164 close(MAIL);
1165 } elsif ($exe = $self->find_exe('sendmail')) {
1166 open (SENDMAIL, "| $exe -oi -t") ||
1167 $self->throw("Can't send email: sendmail cannot fork: $!");
1168 print SENDMAIL <<QQ_EOFSM_QQ;
1169 To: $recipient
1170 Subject: $subj
1171 Cc: $cc
1173 $message
1175 QQ_EOFSM_QQ
1176 $? and $self->warn("sendmail didn't exit nicely: $?");
1178 close(SENDMAIL);
1179 } else {
1180 $self->throw("Can't find executable for mail or sendmail.");
1185 =head2 find_exe
1187 Title : find_exe
1188 Usage : $Util->find_exe(name);
1189 Purpose : Locate an executable (for use in a system() call, e.g.))
1190 Example : $Util->find_exe("gzip");
1191 Returns : String containing executable that passes the -x test.
1192 Returns undef if an executable of the supplied name cannot be found.
1193 Argument : Name of executable to be found.
1194 : Can be a full path. If supplied name is not executable, an executable
1195 : of that name will be searched in all directories in the currently
1196 : defined PATH environment variable.
1197 Throws : No exceptions, but issues a warning if multiple paths are found
1198 : for a given name. The first one is used.
1199 Comments : TODO: Confirm functionality on all bioperl-supported platforms.
1200 May get tripped up by variation in path separator character used
1201 for splitting ENV{PATH}.
1202 See Also :
1204 =cut
1206 #------------
1207 sub find_exe {
1208 #------------
1209 my ($self, $name) = @_;
1210 my @bindirs;
1211 if ($^O =~ m/mswin/i) {
1212 @bindirs = split ';', $ENV{'PATH'};
1213 # Add usual executable extension if missing or -x won't work
1214 $name.= '.exe' if ($name !~ m/\.exe$/i);
1216 else {
1217 @bindirs = split ':', $ENV{'PATH'};
1219 my $exe = $name;
1220 unless (-x $exe) {
1221 undef $exe;
1222 my @exes;
1223 foreach my $d (@bindirs) {
1224 # Note: Windows also understand '/' as folder separator,
1225 # so there is no need to use a conditional with '\'
1226 push(@exes, "$d/$name") if -x "$d/$name";
1228 if (scalar @exes) {
1229 $exe = $exes[0];
1230 if (defined $exes[1]) {
1231 $self->warn("find_exe: Multiple paths to '$name' found. Using $exe.");
1235 return $exe;
1239 ######################################
1240 ### Interactive Functions #####
1241 ######################################
1244 =head2 yes_reply
1246 Title : yes_reply()
1247 Usage : $Util->yes_reply( [query_string]);
1248 Purpose : To test an STDIN input value for affirmation.
1249 Example : print +( $Util->yes_reply('Are you ok') ? "great!\n" : "sorry.\n" );
1250 : $Util->yes_reply('Continue') || die;
1251 Returns : Boolean, true (1) if input string begins with 'y' or 'Y'
1252 Argument: query_string = string to be used to prompt user (optional)
1253 : If not provided, 'Yes or no' will be used.
1254 : Question mark is automatically appended.
1256 =cut
1258 #-------------
1259 sub yes_reply {
1260 #-------------
1261 my $self = shift;
1262 my $query = shift;
1263 my $reply;
1264 $query ||= 'Yes or no';
1265 print "\n$query? (y/n) [n] ";
1266 chomp( $reply = <STDIN> );
1267 $reply =~ /^y/i;
1272 =head2 request_data
1274 Title : request_data()
1275 Usage : $Util->request_data( [value_name]);
1276 Purpose : To request data from a user to be entered via keyboard (STDIN).
1277 Example : $name = $Util->request_data('Name');
1278 : # User will see: % Enter Name:
1279 Returns : String, (data entered from keyboard, sans terminal newline.)
1280 Argument: value_name = string to be used to prompt user.
1281 : If not provided, 'data' will be used, (not very helpful).
1282 : Question mark is automatically appended.
1284 =cut
1286 #----------------
1287 sub request_data {
1288 #----------------
1289 my $self = shift;
1290 my $data = shift || 'data';
1291 print "Enter $data: ";
1292 # Remove the terminal newline char.
1293 chomp($data = <STDIN>);
1294 $data;
1297 =head2 quit_reply
1299 Title : quit_reply
1300 Usage :
1301 Purpose :
1303 =cut
1305 sub quit_reply {
1306 # Not much used since you can use request_data()
1307 # and test for an empty string.
1308 my $self = shift;
1309 my $reply;
1310 chop( $reply = <STDIN> );
1311 $reply =~ /^q.*/i;
1315 =head2 verify_version
1317 Purpose : Checks the version of Perl used to invoke the script.
1318 : Aborts program if version is less than the given argument.
1319 Usage : verify_version('5.000')
1321 =cut
1323 #------------------
1324 sub verify_version {
1325 #------------------
1326 my $self = shift;
1327 my $reqVersion = shift;
1329 $] < $reqVersion and do {
1330 printf STDERR ( "\a\n%s %0.3f.\n", "** Sorry. This Perl script requires at least version", $reqVersion);
1331 printf STDERR ( "%s %0.3f %s\n\n", "You are running Perl version", $], "Please update your Perl!\n\n" );
1332 exit(1);
1338 __END__