Bio::Tools::Analysis::Protein: move modules into their own distribution
[bioperl-live.git] / examples / longorf.pl
blob70f33dc10c6c1e9ea82de0169e18073f41092601
1 #!/usr/bin/perl
2 # longorf.pl v0208020920
3 # (c) Dan Kortschak 2002
5 use vars qw($USAGE);
7 use strict;
8 use Getopt::Long;
9 use Bio::SeqIO;
11 $USAGE = "longorf [--help] [--notstrict] [--verbose] [--graph] [--width printwidth] [--format seqformat] --input seqfile\n";
13 my ($sequencefile,$sequenceformat,$notstrict,$graph,$verb,$printwidth,$help) =
14 (undef, 'fasta', undef, undef,undef,50, undef);
16 &GetOptions('input|i=s' => \$sequencefile,
17 'format|f=s' => \$sequenceformat,
18 'notstrict|n' => \$notstrict,
19 'width|w=s' => \$printwidth,
20 'graph|g' => \$graph,
21 'verbose|v' => \$verb,
22 'help|h' => \$help,
25 if ($help) {
26 exec('perldoc', $0);
27 die;
30 if (!defined $sequencefile) {
31 die($USAGE . "\nPlease specify an input filename.\n");
34 sub longestORF {
35 my $best=0;
36 my ($bests,$beste,$beststrand)=(-1,-1,0);
37 my $bestorf="";
39 my $relaxed=$_[1];
40 my $dna=Bio::Seq->new(-seq => $_[0]);
41 my %strand=('+'=>$dna->seq,
42 '-'=>$dna->revcom->seq);
44 foreach my $direction (keys %strand) {
45 my @starts=();
46 my @ends=();
47 if ($relaxed) {
48 for (my $frame=0;$frame<3;$frame++) {
49 unless ($strand{$direction}=~m/^.{$frame}(taa|tga|tag)/i) {
50 push @starts,$frame+1;
54 while ($strand{$direction}=~m/(atg)/gi) {
55 push @starts,pos($strand{$direction})-2;
58 while ($strand{$direction}=~m/(taa|tga|tag)/gi) {
59 push @ends,pos($strand{$direction})-2;
61 push @ends,($dna->length-2,$dna->length-1,$dna->length);
63 for my $s (@starts) {
64 for my $e (@ends) {
65 if ($e%3==$s%3 and $e>$s) {
66 if ($e-$s>$best) {
67 $best=$e-$s;
68 ($bests,$beste,$beststrand)=($s,$e,$direction);
69 $bestorf=Bio::Seq->new(-seq=>$strand{$direction})->subseq($s,$e);
71 last
72 } else {
73 next
78 return ($best,$bests,$beste,$beststrand,$bestorf);
82 my $seqio = new Bio::SeqIO('-format' => $sequenceformat,
83 '-file' => $sequencefile );
85 my ($length,$start,$end,$direction,$sequence);
86 my $count=0;
87 my @lengths;
88 my $totallength=0;
90 while (my $dna = $seqio->next_seq) {
91 $count++;
92 ($length,$start,$end,$direction,$sequence)=longestORF($dna->seq,$notstrict);
93 if ($verb) {
94 print $dna->display_id," ",$dna->desc,": ";
95 print "$length, $start, $end ($direction)\n$sequence\n\n",Bio::Seq->new(-seq=>$sequence)->translate->seq,"\n\n--\n\n";
97 $totallength+=$length;
98 $lengths[$length/3]++;
101 print "Average ORF length: ", $totallength/$count,"\n\n";
103 print "Length distribution is:\n";
105 if ($graph) {
106 my $length;
107 my $maxlength=0;
108 for ($length=0;$length<@lengths;$length++) {
109 $lengths[$length]=0 unless $lengths[$length];
110 $maxlength=$lengths[$length] if ($lengths[$length]>$maxlength);
112 for ($length=0;$length<@lengths;$length++) {
113 print $length*3,"\t",$lengths[$length],"\t|";
114 print "#"x(($lengths[$length])*$printwidth/$maxlength);
115 print "\n";
117 } else {
118 for ($length=0;$length<@lengths;$length++) {
119 print $length*3,"\t",($lengths[$length]or"0"),"\n";
123 __END__
125 =head1 NAME
127 longorf.pl - perl script to find the longest ORF of a sequence
129 =head1 SYNOPSIS
131 % longorf.pl [-h] [-n] [-v] [-g] [-w printwidth] [-f seqformat] -i seqfile
133 =head1 DESCRIPTION
135 This script will examine a set of nucleotide sequences and determine
136 the longest ORF in each sequence. ORFs may start at the canonical ATG
137 or at the beginning of the sequence if the notstrict option is chosen.
138 The script will output a list of the longest ORF lengths, starts, ends
139 and strands with the ORF and amino acid sequence if the verbose option
140 is chosen. A histogram of the longest ORFs in the input set may be
141 printed by choosing the graph option.
143 =head1 FEEDBACK
145 This script is not supported by anyone, but requests can be made to the
146 author.
148 =head1 AUTHOR - Dan Kortschak <kortschak@rsbs.anu.spanner.edu.au>
150 =cut