4 # Author Jason Stajich <jason-at-bioperl-dot-org>
8 bp_pairwise_kaks - script to calculate pairwise Ka,Ks for a set of sequences
12 bp_pairwise_kaks.PLS -i t/data/worm_fam_2785.cdna [-f fasta/genbank/embl...] [-msa tcoffee/clustal] [-kaks yn00/codeml]
16 This script will take as input a dataset of cDNA sequences verify
17 that they contain no stop codons, align them in protein space,
18 project the alignment back into cDNA and estimate the Ka
19 (non-synonymous) and Ks (synonymous) substitutions based on the ML
20 method of Yang with the PAML package.
24 * PAML program codeml or yn00
25 * Multiple sequence alignment programs Clustalw OR T-Coffee
27 Often there are specific specific parameters you want to run when you
28 a computing Ka/Ks ratios so consider this script a starting point and
29 do not rely it on for every situation.
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to
37 the Bioperl mailing list. Your participation is much appreciated.
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 of the bugs and their resolution. Bug reports can be submitted via the
48 https://github.com/bioperl/bioperl-live/issues
52 Jason Stajich jason-at-bioperl-dot-org
58 require Bio
::Tools
::Run
::Phylo
::PAML
::Codeml
;
59 require Bio
::Tools
::Run
::Phylo
::PAML
::Yn00
;
61 # Multiple Sequence Alignment programs
62 require Bio
::Tools
::Run
::Alignment
::Clustalw
;
63 require Bio
::Tools
::Run
::Alignment
::TCoffee
;
66 die("Must have bioperl-run pkg installed to run this script");
68 # for projecting alignments from protein to R/DNA space
69 use Bio
::Align
::Utilities
qw(aa_to_dna_aln);
71 # for input of the sequence data
75 # for the command line argument parsing
78 my ($aln_prog, $kaks_prog,$format, $output,
79 $cdna,$verbose,$help) = qw(clustalw codeml fasta);
82 'i|input:s' => \
$cdna,
83 'o|output:s' => \
$output,
84 'f|format:s' => \
$format,
85 'msa:s' => \
$aln_prog,
86 'kaks:s' => \
$kaks_prog,
87 'v|verbose' => \
$verbose,
95 $verbose = -1 unless $verbose;
96 my ($aln_factory,$kaks_factory);
97 if( $aln_prog =~ /clus/i ) {
98 $aln_factory = Bio
::Tools
::Run
::Alignment
::Clustalw
->new(-verbose
=> $verbose);
99 } elsif( $aln_prog =~ /t\_?cof/i ) {
100 $aln_factory = Bio
::Tools
::Run
::Alignment
::TCoffee
->new(-verbose
=> $verbose);
102 warn("Did not provide either 'clustalw' or 'tcoffee' as alignment program names");
105 unless( $aln_factory->executable ) {
106 warn("Could not find the executable for $aln_prog, make sure you have installed it and have either set ".uc($aln_prog)."DIR or it is in your PATH");
111 if( $kaks_prog =~ /yn00/i ) {
112 $kaks_factory = Bio
::Tools
::Run
::Phylo
::PAML
::Yn00
->new(-verbose
=> $verbose);
113 } elsif( $kaks_prog =~ /codeml/i ) {
114 # change the parameters here if you want to tweak your Codeml running!
115 $kaks_factory = Bio
::Tools
::Run
::Phylo
::PAML
::Codeml
->new
116 (-verbose
=> $verbose,
117 -params
=> { 'runmode' => -2,
122 unless ( $kaks_factory->executable ) {
123 warn("Could not find the executable for $kaks_prog, make sure you have installed it and you have defined PAMLDIR or it is in your PATH");
127 unless ( $cdna && -f
$cdna && -r
$cdna && ! -z
$cdna ) {
128 warn("Did not specify a valid cDNA sequence file as input");
132 my $seqin = new Bio
::SeqIO
(-file
=> $cdna,
137 while( my $seq = $seqin->next_seq ) {
138 $seqs{$seq->display_id} = $seq;
139 my $protein = $seq->translate();
140 my $pseq = $protein->seq();
143 if( $pseq =~ /\*/ ) {
144 warn("provided a cDNA (".$seq->display_id.") sequence with a stop codon, PAML will choke!");
147 # Tcoffee can't handle '*'
149 $protein->seq($pseq);
150 push @prots, $protein;
153 warn("Need at least 2 cDNA sequences to proceed");
159 open(OUT
, ">$output") || die("cannot open output $output for writing");
164 my $aa_aln = $aln_factory->align(\
@prots);
165 my $dna_aln = &aa_to_dna_aln
($aa_aln, \
%seqs);
167 my @each = $dna_aln->each_seq();
170 $kaks_factory->alignment($dna_aln);
172 my ($rc,$parser) = $kaks_factory->run();
174 warn($kaks_factory->error_string,"\n");
177 my $result = $parser->next_result;
179 if ($result->version =~ m/3\.15/) {
180 warn("This script does not work with v3.15 of PAML! Please use 3.14 instead.");
184 my $MLmatrix = $result->get_MLmatrix();
186 my @otus = $result->get_seqs();
190 foreach my $s ( @each ) {
191 last if( $s->display_id eq $_->display_id );
197 print OUT
join("\t", qw(SEQ1 SEQ2 Ka Ks Ka/Ks PROT_PERCENTID CDNA_PERCENTID)), "\n";
198 for( my $i = 0; $i < (scalar @otus -1) ; $i++) {
199 for( my $j = $i+1; $j < (scalar @otus); $j++ ) {
200 my $sub_aa_aln = $aa_aln->select_noncont($pos[$i],$pos[$j]);
201 my $sub_dna_aln = $dna_aln->select_noncont($pos[$i],$pos[$j]);
203 $otus[$i]->display_id,
204 $otus[$j]->display_id,$MLmatrix->[$i]->[$j]->{'dN'},
205 $MLmatrix->[$i]->[$j]->{'dS'},
206 $MLmatrix->[$i]->[$j]->{'omega'},
207 sprintf("%.2f",$sub_aa_aln->percentage_identity),
208 sprintf("%.2f",$sub_dna_aln->percentage_identity),