2 # longorf.pl v0208020920
3 # (c) Dan Kortschak 2002
11 $USAGE = "longorf [--help] [--notstrict] [--verbose] [--graph] [--width printwidth] [--format seqformat] --input seqfile\n";
13 my ($sequencefile,$sequenceformat,$notstrict,$graph,$verb,$printwidth,$help) =
14 (undef, 'fasta', undef, undef,undef,50, undef);
16 &GetOptions('input|i=s' => \$sequencefile,
17 'format|f=s' => \$sequenceformat,
18 'notstrict|n' => \$notstrict,
19 'width|w=s' => \$printwidth,
21 'verbose|v' => \$verb,
30 if (!defined $sequencefile) {
31 die($USAGE . "\nPlease specify an input filename.\n");
36 my ($bests,$beste,$beststrand)=(-1,-1,0);
40 my $dna=Bio::Seq->new(-seq => $_[0]);
41 my %strand=('+'=>$dna->seq,
42 '-'=>$dna->revcom->seq);
44 foreach my $direction (keys %strand) {
48 for (my $frame=0;$frame<3;$frame++) {
49 unless ($strand{$direction}=~m/^.{$frame}(taa|tga|tag)/i) {
50 push @starts,$frame+1;
54 while ($strand{$direction}=~m/(atg)/gi) {
55 push @starts,pos($strand{$direction})-2;
58 while ($strand{$direction}=~m/(taa|tga|tag)/gi) {
59 push @ends,pos($strand{$direction})-2;
61 push @ends,($dna->length-2,$dna->length-1,$dna->length);
65 if ($e%3==$s%3 and $e>$s) {
68 ($bests,$beste,$beststrand)=($s,$e,$direction);
69 $bestorf=Bio::Seq->new(-seq=>$strand{$direction})->subseq($s,$e);
78 return ($best,$bests,$beste,$beststrand,$bestorf);
82 my $seqio = new Bio::SeqIO('-format' => $sequenceformat,
83 '-file' => $sequencefile );
85 my ($length,$start,$end,$direction,$sequence);
90 while (my $dna = $seqio->next_seq) {
92 ($length,$start,$end,$direction,$sequence)=longestORF($dna->seq,$notstrict);
94 print $dna->display_id," ",$dna->desc,": ";
95 print "$length, $start, $end ($direction)\n$sequence\n\n",Bio::Seq->new(-seq=>$sequence)->translate->seq,"\n\n--\n\n";
97 $totallength+=$length;
98 $lengths[$length/3]++;
101 print "Average ORF length: ", $totallength/$count,"\n\n";
103 print "Length distribution is:\n";
108 for ($length=0;$length<@lengths;$length++) {
109 $lengths[$length]=0 unless $lengths[$length];
110 $maxlength=$lengths[$length] if ($lengths[$length]>$maxlength);
112 for ($length=0;$length<@lengths;$length++) {
113 print $length*3,"\t",$lengths[$length],"\t|";
114 print "#"x(($lengths[$length])*$printwidth/$maxlength);
118 for ($length=0;$length<@lengths;$length++) {
119 print $length*3,"\t",($lengths[$length]or"0"),"\n";
127 longorf.pl - perl script to find the longest ORF of a sequence
131 % longorf.pl [-h] [-n] [-v] [-g] [-w printwidth] [-f seqformat] -i seqfile
135 This script will examine a set of nucleotide sequences and determine
136 the longest ORF in each sequence. ORFs may start at the canonical ATG
137 or at the beginning of the sequence if the notstrict option is chosen.
138 The script will output a list of the longest ORF lengths, starts, ends
139 and strands with the ORF and amino acid sequence if the verbose option
140 is chosen. A histogram of the longest ORFs in the input set may be
141 printed by choosing the graph option.
145 This script is not supported by anyone.
147 =head1 AUTHOR - Dan Kortschak <kortschak@rsbs.anu.spanner.edu.au>