4 use vars
qw($USAGE %VALIDALIGN $CODONSIZE);
10 use Bio::Tools::CodonTable;
14 $CODONSIZE = 3; # parametrize everything like a good little programmer
15 if( ! defined $ENV{'CLUSTALDIR'} ) {
16 $ENV{'CLUSTALDIR'} = '/usr/local/bin';
18 if( ! defined $ENV{'TCOFFEEDIR'} ) {
19 $ENV{'TCOFFEEDIR'} = '/usr/local/bin';
22 qq{align_on_codons.pl < file.fa
23 -h/--help See this information
24 -f/--frame Translation Frame (0,1,2) are valid (defaults to '0')
25 -ct/--codontable Codon table to use (defaults to '1')
26 see perldoc Bio::PrimarySeq for more information
27 -i/--input Input Filename (defaults to STDIN)
28 -o/--output Output Filename (defaults to STDOUT)
29 -sf/--seqformat Input format (defaults to FASTA/Pearson)
30 -af/--alignformat Alignment output format (clustal,fasta,nexus,phylip,
31 msf,pfam,mase,meme,prodom,selex,stockholm)
32 -ap/--alignprog ClustalW, TCoffee (currently only support
34 -v/--verbose Run in verbose mode
37 %VALIDALIGN = ('clustalw' => 'Bio::Tools::Run::Alignment::Clustalw',
38 'tcoffee' => 'Bio::Tools::Run::Alignment::TCoffee');
41 my ($help, $input, $output);
43 my ($alignprog, $sformat, $aformat, $frame, $codontable, $verbose)
44 = ('clustalw', 'fasta', 'clustalw', 0, 1, 0);
46 GetOptions( 'h|help' => \$help,
47 'i|input:s' => \$input,
48 'o|output:s' => \$output,
49 'sf|seqformat:s' => \$sformat,
50 'af|alignformat:s' => \$aformat,
51 'ap|alignprog:s' => \$alignprog,
53 'f|frame:s' => \$frame,
54 'ct|codontable:s' => \$codontable,
55 'v|verbose' => \$verbose,
61 if( ! $alignprog || !defined $VALIDALIGN{$alignprog} ) {
62 die("Cannot use $alignprog as 'alignprog' parameter");
64 my $modname = $VALIDALIGN{$alignprog} .".pm";
65 $modname =~ s/::/\//g;
71 $alignout = new Bio::AlignIO('-format' => $aformat,
72 '-file' => ">$output");
74 $alignout = new Bio::AlignIO('-format' => $aformat);
77 my (@nucseqs,@protseqs);
81 $seqio = new Bio::SeqIO('-format' => $sformat,
84 $seqio = new Bio::SeqIO('-format' => $sformat,
88 my $table = new Bio::Tools::CodonTable();
89 while( my $seq = $seqio->next_seq ) {
91 # if( $frame == 0 && $alignprog eq 'tcoffee' ) {
92 # print "last codon is ",$seq->subseq($seq->length() -2,
93 # $seq->length()), "\n";
94 # if( $table->is_ter_codon($seq->subseq($seq->length() -2,
95 # $seq->length())) ) {
96 # $seq->seq($seq->subseq(1,$seq->length() - 3));
101 push @protseqs, $seq->translate(-frame => $frame,
102 -codontable_id => $codontable );
105 if( @nucseqs <= 1 ) {
106 die("Must specify > 1 sequence for alignment on codons");
109 # allow these to be tweaked by cmdline parameters at some point
110 my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
112 my $alignengine = $VALIDALIGN{$alignprog}->new('-verbose' => $verbose,
115 my $aln = $alignengine->align(\@protseqs);
117 my $dnaalign = new Bio::SimpleAlign;
119 my $alnlen = $aln->length;
120 foreach my $seq ( $aln->each_seq ) {
123 foreach my $pos ( 1..$alnlen ) {
124 my $loc = $seq->location_from_column($pos);
126 if( !defined $loc || $loc->location_type ne 'EXACT' ) {
129 # to readjust to codon boundaries
130 # end needs to be +1 so we can just multiply by CODONSIZE
132 my ($start,$end) = ((($loc->start - 1)*$CODONSIZE) +1,
133 ($loc->end)*$CODONSIZE);
134 if( $start <=0 || $end > $nucseqs[$seqorder]->length() ) {
135 print "start is ", $loc->start, " end is ", $loc->end, "\n";
136 warn("codons don't seem to be matching up for $start,$end");
139 $dna = $nucseqs[$seqorder]->subseq($start,$end);
145 # funky looking math is to readjust to codon boundaries and deal
146 # with fact that sequence start with 1
147 my $newdna = new Bio::LocatableSeq(-display_id => $seq->id(),
148 -start => (($seq->start - 1) *
150 -end => ($seq->end * $CODONSIZE),
151 -strand => $seq->strand,
153 $dnaalign->add_seq($newdna);
156 $alignout->write_aln($dnaalign);