4 # We use this to determine what primers are useful for frequent priming of
5 # nucleic acid for random labeling
6 # Input: Sequence file, oligomer length
7 # Output: Tab-delimited text file of oligomer frequencies
18 #########################
19 # VARIABLES & FILENAMES #
20 #########################
25 my ($format, $infile, $help, $outfile, $oligomerlength) = ('fasta');
27 'f|format:s' => \
$format,
28 'i|in|s|sequence:s' => \
$infile,
30 'o|out:s' => \
$outfile,
31 'length:i' => \
$oligomerlength
34 my $USAGE = "Usage:\toligo_count [-h/--help] [-l/--length OLIGOLENGTH]\n".
35 "\t[-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]\n".
36 "\t[-o/--out OUTFILE]\n".
37 "\tDefault SEQFORMAT is fasta\n";
39 print $USAGE and exit if $help;
42 print 'Enter your concatenated FASTA sequence filename: ';
43 chomp ($infile=<STDIN
>);
45 unless (-e
$infile) { die "$infile not found\n"; }
49 print "$outfile already exists! Overwrite (Y/N)? ";
52 print 'Y or N, please: ';
55 if (/n/i) { die "$outfile not overwritten.\n"; }
58 # print 'Enter an output filename: ';
59 # chomp ($outfile=<STDIN>);
61 # print "$outfile already exists! Overwrite (Y/N)? ";
62 # chomp ($_ = <STDIN>);
64 # print 'Y or N, please: ';
65 # chomp ($_ = <STDIN>);
67 # if (/n/i) { die "$outfile not overwritten.\n"; }
71 unless ($oligomerlength) {
73 print 'Enter an oligomer length to count: ';
74 chomp($oligomerlength=<STDIN
>);
75 if ($oligomerlength !~ /\d/) {
76 print "Value is non-numeric!\n";
87 if ($oligomerlength >= 9) {
88 print "An oligomer length of $oligomerlength will generate ";
89 print 4 ** $oligomerlength, " combinations,\nwhich could cause ";
90 print "an out of memory error. Proceed? (y/n) ";
93 else { die "Program terminated\n"; }
95 my @oligoseqs = &generate_all_oligos
($oligomerlength);
97 foreach (@oligoseqs) {
101 my $in = Bio
::SeqIO
->new( -file
=> $infile,
106 while (my $seq = $in->next_seq() ) {
107 my $len = $seq->length();
109 if ($position+$oligomerlength > $len) {
113 $seq = uc $seq->seq; #string
114 $exception = 1 if $seq =~ /[^GATC]/;
116 while ($position + $oligomerlength-1 <= $len) {
117 $oligos{substr $seq, $position-1, $oligomerlength}++;
119 if ($position%250000 == 0) {print "$position\n";}
121 $oligocounts += $position-1;
127 open $OUTFILE, '>', $outfile or die "Could not open file '$outfile': $!\n";
129 open $OUTFILE, '>-'; # STDOUT
131 print $OUTFILE "$seqnumber sequences analyzed\n";
132 print $OUTFILE "$oligocounts total $oligomerlength-mers counted\n";
133 print $OUTFILE "$oligomerlength-mer\tNumber\tFrequency\n";
134 foreach my $key (sort keys %oligos) {
135 print $OUTFILE "$key\t$oligos{$key}\t", $oligos{$key}/$oligocounts, "\n";
140 if ($exception == 1) {
141 print "Non-standard (non-GATC) bases were found in sequence\n";
143 if ($exception == 2) {
144 print "Oligomer length greater than sequence length\n";
154 sub generate_all_oligos
{
155 my $oligolength = $_[0];
157 my @newarray = qw{A C G T
};
158 my @bases = qw{A C G T
};
160 while ($iter < $oligolength) {
161 my @oldarray = @newarray;
163 foreach my $oligoseq (@oldarray) {
164 foreach my $newbase (@bases) {
165 push @newarray, $oligoseq . $newbase;
173 # if you wanted to be notified about status of running
174 #my $EMAILADDRESS = undef;
175 #die("Must change script to a valid email addres for notification")
176 # unless( defined $EMAILADDRESS );
179 # $address = $EMAILADDRESS;
180 # $address = $_[0] if $_[0];
181 # open(SENDMAIL, "|/usr/lib/sendmail -oi -t") or die "Can't fork for sendmail: $!\n";
182 # print SENDMAIL <<"EOF";
185 #Subject: Program Finished
188 # close(SENDMAIL) or warn "sendmail didn't close nicely";
195 bp_oligo_count - oligo count and frequency
199 Usage: bp_oligo_count [-h/--help] [-l/--length OLIGOLENGTH]
200 [-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]
205 This scripts counts occurrence and frequency for all oligonucleotides
208 It can be used to determine what primers are useful for
209 frequent priming of nucleic acid for random labeling.
211 Note that this script could be run by utilizing the compseq
212 program which is part of EMBOSS.
216 The default sequence format is fasta. If no outfile is given, the
217 results will be printed to standard out. All other options can entered
224 User feedback is an integral part of the evolution of this and other
225 Bioperl modules. Send your comments and suggestions preferably to
226 the Bioperl mailing list. Your participation is much appreciated.
228 bioperl-l@bioperl.org - General discussion
229 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
231 =head2 Reporting Bugs
233 Report bugs to the Bioperl bug tracking system to help us keep track
234 of the bugs and their resolution. Bug reports can be submitted via the
237 https://github.com/bioperl/bioperl-live/issues
239 =head1 AUTHOR - Charles C. Kim
241 Email cckim@stanford.edu
247 Submitted to bioperl scripts project 2001/08/06
249 E<gt>E<gt> 100 x speed optimization by Heikki Lehvaslaiho