Bio::DB::CUTG: move into its own distribution
[bioperl-live.git] / bin / bp_oligo_count
blobe14baaef9186ba8219c82c8844623f8064b36402
1 #!/usr/bin/perl
3 # oligomer_freq.pl
4 # We use this to determine what primers are useful for frequent priming of
5 # nucleic acid for random labeling
6 # Input: Sequence file, oligomer length
7 # Output: Tab-delimited text file of oligomer frequencies
8 # Written July 2, 2001
9 # Charles C. Kim
11 ###########
12 # MODULES #
13 ###########
14 use Bio::Seq;
15 use Bio::SeqIO;
16 use Getopt::Long;
18 #########################
19 # VARIABLES & FILENAMES #
20 #########################
22 use strict;
23 use warnings;
25 my ($format, $infile, $help, $outfile, $oligomerlength) = ('fasta');
26 GetOptions(
27 'f|format:s' => \$format,
28 'i|in|s|sequence:s' => \$infile,
29 'h|help|?' => \$help,
30 'o|out:s' => \$outfile,
31 'length:i' => \$oligomerlength
34 my $USAGE = "Usage:\toligo_count [-h/--help] [-l/--length OLIGOLENGTH]\n".
35 "\t[-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]\n".
36 "\t[-o/--out OUTFILE]\n".
37 "\tDefault SEQFORMAT is fasta\n";
39 print $USAGE and exit if $help;
41 unless ($infile ) {
42 print 'Enter your concatenated FASTA sequence filename: ';
43 chomp ($infile=<STDIN>);
45 unless (-e $infile) { die "$infile not found\n"; }
47 if ($outfile) {
48 if (-e $outfile) {
49 print "$outfile already exists! Overwrite (Y/N)? ";
50 chomp ($_ = <STDIN>);
51 while (/[^yn]/i) {
52 print 'Y or N, please: ';
53 chomp ($_ = <STDIN>);
55 if (/n/i) { die "$outfile not overwritten.\n"; }
57 #} else {
58 # print 'Enter an output filename: ';
59 # chomp ($outfile=<STDIN>);
60 # if (-e $outfile) {
61 # print "$outfile already exists! Overwrite (Y/N)? ";
62 # chomp ($_ = <STDIN>);
63 # while (/[^yn]/i) {
64 # print 'Y or N, please: ';
65 # chomp ($_ = <STDIN>);
66 # }
67 # if (/n/i) { die "$outfile not overwritten.\n"; }
68 # }
71 unless ($oligomerlength) {
72 while () {
73 print 'Enter an oligomer length to count: ';
74 chomp($oligomerlength=<STDIN>);
75 if ($oligomerlength !~ /\d/) {
76 print "Value is non-numeric!\n";
78 else {last;}
83 ########
84 # MAIN #
85 ########
87 if ($oligomerlength >= 9) {
88 print "An oligomer length of $oligomerlength will generate ";
89 print 4 ** $oligomerlength, " combinations,\nwhich could cause ";
90 print "an out of memory error. Proceed? (y/n) ";
91 chomp($_=<STDIN>);
92 if (/y/i) { ; }
93 else { die "Program terminated\n"; }
95 my @oligoseqs = &generate_all_oligos($oligomerlength);
96 my %oligos = ();
97 foreach (@oligoseqs) {
98 $oligos{$_} = 0;
101 my $in = Bio::SeqIO->new( -file => $infile,
102 -format => $format);
103 my $seqnumber = 0;
104 my $oligocounts = 0;
105 my $exception;
106 while (my $seq = $in->next_seq() ) {
107 my $len = $seq->length();
108 my $position = 1;
109 if ($position+$oligomerlength > $len) {
110 $exception = 2;
111 next;
113 $seq = uc $seq->seq; #string
114 $exception = 1 if $seq =~ /[^GATC]/;
116 while ($position + $oligomerlength-1 <= $len) {
117 $oligos{substr $seq, $position-1, $oligomerlength}++;
118 $position++;
119 if ($position%250000 == 0) {print "$position\n";}
121 $oligocounts += $position-1;
122 $seqnumber++;
125 my $OUTFILE;
126 if ($outfile) {
127 open $OUTFILE, '>', $outfile or die "Could not open file '$outfile': $!\n";
128 } else {
129 open $OUTFILE, '>-'; # STDOUT
131 print $OUTFILE "$seqnumber sequences analyzed\n";
132 print $OUTFILE "$oligocounts total $oligomerlength-mers counted\n";
133 print $OUTFILE "$oligomerlength-mer\tNumber\tFrequency\n";
134 foreach my $key (sort keys %oligos) {
135 print $OUTFILE "$key\t$oligos{$key}\t", $oligos{$key}/$oligocounts, "\n";
137 close $OUTFILE;
139 if ($exception) {
140 if ($exception == 1) {
141 print "Non-standard (non-GATC) bases were found in sequence\n";
143 if ($exception == 2) {
144 print "Oligomer length greater than sequence length\n";
148 #&notify();
150 ###############
151 # SUBROUTINES #
152 ###############
154 sub generate_all_oligos {
155 my $oligolength = $_[0];
156 my $iter = 1;
157 my @newarray = qw{A C G T};
158 my @bases = qw{A C G T};
160 while ($iter < $oligolength) {
161 my @oldarray = @newarray;
162 @newarray = ();
163 foreach my $oligoseq (@oldarray) {
164 foreach my $newbase (@bases) {
165 push @newarray, $oligoseq . $newbase;
168 $iter++;
170 return @newarray;
173 # if you wanted to be notified about status of running
174 #my $EMAILADDRESS = undef;
175 #die("Must change script to a valid email addres for notification")
176 # unless( defined $EMAILADDRESS );
178 #sub notify {
179 # $address = $EMAILADDRESS;
180 # $address = $_[0] if $_[0];
181 # open(SENDMAIL, "|/usr/lib/sendmail -oi -t") or die "Can't fork for sendmail: $!\n";
182 # print SENDMAIL <<"EOF";
183 #From: Computer
184 #To: $address
185 #Subject: Program Finished
187 #EOF
188 # close(SENDMAIL) or warn "sendmail didn't close nicely";
191 __END__
193 =head1 NAME
195 bp_oligo_count - oligo count and frequency
197 =head1 SYNOPSIS
199 Usage: bp_oligo_count [-h/--help] [-l/--length OLIGOLENGTH]
200 [-f/--format SEQFORMAT] [-i/--in/-s/--sequence SEQFILE]
201 [-o/--out OUTFILE]
203 =head1 DESCRIPTION
205 This scripts counts occurrence and frequency for all oligonucleotides
206 of given length.
208 It can be used to determine what primers are useful for
209 frequent priming of nucleic acid for random labeling.
211 Note that this script could be run by utilizing the compseq
212 program which is part of EMBOSS.
214 =head1 OPTIONS
216 The default sequence format is fasta. If no outfile is given, the
217 results will be printed to standard out. All other options can entered
218 interactively.
220 =head1 FEEDBACK
222 =head2 Mailing Lists
224 User feedback is an integral part of the evolution of this and other
225 Bioperl modules. Send your comments and suggestions preferably to
226 the Bioperl mailing list. Your participation is much appreciated.
228 bioperl-l@bioperl.org - General discussion
229 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
231 =head2 Reporting Bugs
233 Report bugs to the Bioperl bug tracking system to help us keep track
234 of the bugs and their resolution. Bug reports can be submitted via the
235 web:
237 https://github.com/bioperl/bioperl-live/issues
239 =head1 AUTHOR - Charles C. Kim
241 Email cckim@stanford.edu
243 =head1 HISTORY
245 Written July 2, 2001
247 Submitted to bioperl scripts project 2001/08/06
249 E<gt>E<gt> 100 x speed optimization by Heikki Lehvaslaiho
251 =cut