bin/bp_oligo_count

   1 #!/usr/bin/perl
   2 #
   3 # oligomer_freq.pl
   4 # We use this to determine what primers are useful for frequent priming of
   5 # nucleic acid for random labeling
   6 # Input: Sequence file, oligomer length
   7 # Output: Tab-delimited text file of oligomer frequencies
   8 # Written July 2, 2001
   9 # Charles C. Kim
  10
  11 ###########
  12 # MODULES #
  13 ###########
  14 use Bio::Seq;
  15 use Bio::SeqIO;
  16 use Getopt::Long;
  17
  18 #########################
  19 # VARIABLES & FILENAMES #
  20 #########################
  21
  22 use strict;
  23 use warnings;
  24
  25 my ($format, $infile, $help, $outfile, $oligomerlength) = ('fasta');
  26 GetOptions(
  27            'f|format:s'            => \$format,
  28            'i|in|s|sequence:s'     => \$infile,
  29            'h|help|?'              => \$help,
  30            'o|out:s'               => \$outfile,
  31            'length:i'              => \$oligomerlength
  32           );
  33
  34 my $USAGE = "Usage:\toligo_count [-h/--help] [-l/--length OLIGOLENGTH]\n".
  35     "\t[-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]\n".
  36     "\t[-o/--out OUTFILE]\n".
  37     "\tDefault SEQFORMAT is fasta\n";
  38
  39 print $USAGE and exit if $help;
  40
  41 unless ($infile ) {
  42     print 'Enter your concatenated FASTA sequence filename: ';
  43     chomp ($infile=<STDIN>);
  44 }
  45 unless (-e $infile) { die "$infile not found\n"; }
  46
  47 if ($outfile) {
  48     if (-e $outfile) {
  49         print "$outfile already exists!  Overwrite (Y/N)? ";
  50         chomp ($_ = <STDIN>);
  51         while (/[^yn]/i) {
  52             print 'Y or N, please: ';
  53             chomp ($_ = <STDIN>);
  54         }
  55         if (/n/i) { die "$outfile not overwritten.\n"; }
  56     }
  57 #} else {
  58 #    print 'Enter an output filename: ';
  59 #    chomp ($outfile=<STDIN>);
  60 #    if (-e $outfile) {
  61 #       print "$outfile already exists!  Overwrite (Y/N)? ";
  62 #       chomp ($_ = <STDIN>);
  63 #       while (/[^yn]/i) {
  64 #           print 'Y or N, please: ';
  65 #           chomp ($_ = <STDIN>);
  66 #       }
  67 #       if (/n/i) { die "$outfile not overwritten.\n"; }
  68 #    }
  69 }
  70
  71 unless ($oligomerlength) {
  72     while () {
  73         print 'Enter an oligomer length to count: ';
  74         chomp($oligomerlength=<STDIN>);
  75         if ($oligomerlength !~ /\d/) {
  76             print "Value is non-numeric!\n";
  77         }
  78         else {last;}
  79     }
  80 }
  81
  82
  83 ########
  84 # MAIN #
  85 ########
  86
  87 if ($oligomerlength >= 9) {
  88     print "An oligomer length of $oligomerlength will generate ";
  89     print 4 ** $oligomerlength, " combinations,\nwhich could cause ";
  90     print "an out of memory error.  Proceed? (y/n) ";
  91     chomp($_=<STDIN>);
  92     if (/y/i) { ; }
  93     else { die "Program terminated\n"; }
  94 }
  95 my @oligoseqs = &generate_all_oligos($oligomerlength);
  96 my %oligos = ();
  97 foreach  (@oligoseqs) {
  98     $oligos{$_} = 0;
  99 }
 100
 101 my $in = Bio::SeqIO->new( -file => $infile,
 102                        -format => $format);
 103 my $seqnumber = 0;
 104 my $oligocounts = 0;
 105 my $exception;
 106 while (my $seq = $in->next_seq() ) {
 107     my $len = $seq->length();
 108     my $position = 1;
 109     if ($position+$oligomerlength > $len) {
 110         $exception = 2;
 111         next;
 112     }
 113     $seq = uc $seq->seq; #string
 114     $exception = 1 if $seq =~ /[^GATC]/;
 115
 116     while ($position + $oligomerlength-1 <= $len) {
 117         $oligos{substr $seq, $position-1, $oligomerlength}++;
 118         $position++;
 119         if ($position%250000 == 0) {print "$position\n";}
 120     }
 121     $oligocounts += $position-1;
 122     $seqnumber++;
 123 }
 124
 125 my $OUTFILE;
 126 if ($outfile) {
 127     open $OUTFILE, '>', $outfile or die "Could not open file '$outfile': $!\n";
 128 } else {
 129     open $OUTFILE, '>-'; # STDOUT
 130 }
 131 print $OUTFILE "$seqnumber sequences analyzed\n";
 132 print $OUTFILE "$oligocounts total $oligomerlength-mers counted\n";
 133 print $OUTFILE "$oligomerlength-mer\tNumber\tFrequency\n";
 134 foreach my $key (sort keys %oligos) {
 135     print $OUTFILE "$key\t$oligos{$key}\t", $oligos{$key}/$oligocounts, "\n";
 136 }
 137 close $OUTFILE;
 138
 139 if ($exception) {
 140     if ($exception == 1) {
 141         print "Non-standard (non-GATC) bases were found in sequence\n";
 142     }
 143     if ($exception == 2) {
 144         print "Oligomer length greater than sequence length\n";
 145     }
 146 }
 147
 148 #&notify();
 149
 150 ###############
 151 # SUBROUTINES #
 152 ###############
 153
 154 sub generate_all_oligos {
 155     my $oligolength = $_[0];
 156     my $iter = 1;
 157     my @newarray = qw{A C G T};
 158     my @bases = qw{A C G T};
 159
 160     while ($iter < $oligolength) {
 161         my @oldarray = @newarray;
 162         @newarray = ();
 163         foreach my $oligoseq (@oldarray) {
 164             foreach my $newbase (@bases) {
 165                 push @newarray, $oligoseq . $newbase;
 166             }
 167         }
 168         $iter++;
 169     }
 170     return @newarray;
 171 }
 172
 173 # if you wanted to be notified about status of running
 174 #my $EMAILADDRESS = undef;
 175 #die("Must change script to a valid email addres for notification")
 176 #    unless( defined $EMAILADDRESS );
 177
 178 #sub notify {
 179 #    $address = $EMAILADDRESS;
 180 #    $address = $_[0] if $_[0];
 181 #    open(SENDMAIL, "|/usr/lib/sendmail -oi -t") or die "Can't fork for sendmail: $!\n";
 182 #    print SENDMAIL <<"EOF";
 183 #From: Computer
 184 #To: $address
 185 #Subject: Program Finished
 186 #
 187 #EOF
 188 #    close(SENDMAIL) or warn "sendmail didn't close nicely";
 189 #}
 190
 191 __END__
 192
 193 =head1 NAME
 194
 195 bp_oligo_count - oligo count and frequency
 196
 197 =head1 SYNOPSIS
 198
 199   Usage:  bp_oligo_count [-h/--help] [-l/--length OLIGOLENGTH]
 200           [-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]
 201           [-o/--out OUTFILE]
 202
 203 =head1 DESCRIPTION
 204
 205 This scripts counts occurrence and frequency for all oligonucleotides
 206 of given length.
 207
 208 It can be used to determine what primers are useful for
 209 frequent priming of nucleic acid for random labeling.
 210
 211 Note that this script could be run by utilizing the compseq
 212 program which is part of EMBOSS.
 213
 214 =head1 OPTIONS
 215
 216 The default sequence format is fasta. If no outfile is given, the
 217 results will be printed to standard out. All other options can entered
 218 interactively.
 219
 220 =head1 FEEDBACK
 221
 222 =head2 Mailing Lists
 223
 224 User feedback is an integral part of the evolution of this and other
 225 Bioperl modules. Send your comments and suggestions preferably to
 226 the Bioperl mailing list.  Your participation is much appreciated.
 227
 228   bioperl-l@bioperl.org                  - General discussion
 229   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 230
 231 =head2 Reporting Bugs
 232
 233 Report bugs to the Bioperl bug tracking system to help us keep track
 234 of the bugs and their resolution. Bug reports can be submitted via the
 235 web:
 236
 237   https://github.com/bioperl/bioperl-live/issues
 238
 239 =head1 AUTHOR - Charles C. Kim
 240
 241 Email cckim@stanford.edu
 242
 243 =head1 HISTORY
 244
 245 Written July 2, 2001
 246
 247 Submitted to bioperl scripts project 2001/08/06
 248
 249 E<gt>E<gt> 100 x speed optimization by Heikki Lehvaslaiho
 250
 251 =cut