modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / neosv / yass2blast.pl
blob651edd60b096c6268c3496b88d54b69f7c820304
1 #!/usr/bin/perl -w
3 # a quick&dirty script to convert "yass -d 1 " output files into
4 # - axt format
5 # - fasta format
6 # - blast format
7 # usage: yass2blast.pl [-blast|-fasta|-axt] {yassoutputfiles}\n";
9 use strict;
12 # Print an alignment according to $format = {"fasta",blast","blast header","axt"}
15 sub PrintAlign() {
16 # 1) read parameters
17 my ($nbAlignments,$nbSSequences,$pos_q_str,$pos_q_end,$pos_s_str,$pos_s_end,$name_q,$name_s,$size_q,$size_s,$reverse,$evalue,$score,$bitscore,$bias,$ts,$tv,$proba,$entropy,$fasta1,$middle,$fasta2,$format,$newchunk_q,$newchunk_s) = @_ ;
19 # 2) reverse and complement sequences to keep the "query order"
20 if ($reverse eq "-") {
21 # change first file positions (increasing order)
22 my $tmp = $pos_q_str;
23 $pos_q_str = $pos_q_end;
24 $pos_q_end = $tmp;
25 # sequence complement
26 $fasta1 =~ tr /ATUGCRYKMSWVBDHatugcrykmswvbdh/TAACGYRMKWSBVHDtaacgyrmkwsbvhd/;
27 $fasta1 = reverse($fasta1);
28 $fasta2 =~ tr /ATUGCRYKMSWVBDHatugcrykmswvbdh/TAACGYRMKWSBVHDtaacgyrmkwsbvhd/;
29 $fasta2 = reverse($fasta2);
30 $middle = reverse($middle);
35 # 3) format selection
36 if ($format eq "fasta") {
37 # 3.1) fasta but with "-" to keep alignments
38 print ">".$nbAlignments."a||".$name_q.": [".$pos_q_str."-".$pos_q_end ."]".$reverse."\n";
39 print $fasta1."\n";
40 print ">".$nbAlignments."b||".$name_s.": [".$pos_s_str."-".$pos_s_end ."]".$reverse."\n";
41 print $fasta2."\n";
43 } elsif ($format eq "axt") {
44 # 3.2) axt (blastz) format
45 print $nbAlignments."\t".$name_q."\t".$pos_q_str."\t".$pos_q_end ."\t".$name_s."\t".$pos_s_str."\t".$pos_s_end ."\t".$reverse."\t".$score."\n";
46 print $fasta1."\n";
47 print $fasta2."\n";
49 } elsif ($format eq "blast") {
50 # 3.3) blast like format
51 $middle =~ tr /:\./ /;
52 my $nbMatches = ($middle =~ tr/\|//);
53 # a) print Sequence Header
54 if ($newchunk_s){
55 print ">".$nbSSequences."_0 ".$name_s."\n";
56 print " Length = ".$size_s."\n";
57 print "\n";
59 # b) print Alignment Header
60 print " Score = ".$bitscore." bits (".$score."), Expect = ".$evalue."\n";
61 print " Identities = ".$nbMatches."/".(length($middle))." (".(int(100*$nbMatches/length($middle)))."%)\n";
62 print " Strand = Plus / ";
63 if ($reverse eq "-"){
64 print "Minus"."\n";
65 }else{
66 print "Plus"."\n";
68 print "\n\n";
71 # c) print Alignment
72 my $c = 0;
73 my $i_fasta1 = $pos_q_str;
74 my $i_fasta2 = 0;
75 if ($reverse eq "-") {
76 $i_fasta2 = $pos_s_end;
77 } else {
78 $i_fasta2 = $pos_s_str;
80 while ($c < length($middle)){
81 my $subfasta1 = substr($fasta1,$c,60);
82 my $submiddle = substr($middle,$c,60);
83 my $subfasta2 = substr($fasta2,$c,60);
84 my $letters_subfasta1 = $subfasta1; $letters_subfasta1 =~ s/-//g;
85 my $letters_subfasta2 = $subfasta2; $letters_subfasta2 =~ s/-//g;
86 my $nbletters_subfasta1 = length($letters_subfasta1);
87 my $nbletters_subfasta2 = length($letters_subfasta2);
88 printf ("Query: %-9d ",$i_fasta1); $i_fasta1 += $nbletters_subfasta1-1;
89 print($subfasta1);
90 printf (" %d\n", $i_fasta1); $i_fasta1 += 1;
92 printf (" ");
93 print($submiddle);
94 printf ("\n");
96 if ($reverse eq "-") {
97 printf ("Sbjct: %-9d ",$i_fasta2); $i_fasta2 -= $nbletters_subfasta2-1;
98 print($subfasta2);
99 printf (" %d\n", $i_fasta2); $i_fasta2 -= 1;
100 } else {
101 printf ("Sbjct: %-9d ",$i_fasta2); $i_fasta2 += $nbletters_subfasta2-1;
102 print($subfasta2);
103 printf (" %d\n", $i_fasta2); $i_fasta2 += 1;
105 $c += 60;
106 print "\n\n";
108 } elsif ($format eq "blastheader") {
109 # 3.4) just the header of blast
110 if ($newchunk_s){
111 if (length ($name_s) > 60) {
112 print "".$nbSSequences."_0 ".(substr($name_s,0,60))."... ".$bitscore." ".$evalue."\n";
113 } else {
114 print "".$nbSSequences."_0 ".$name_s;
115 for(my $e = length ($name_s); $e <= 63 ;$e ++) {
116 print " ";
118 print " ".$bitscore." ".$evalue."\n";
120 }#1_0 embl|AF417609|AF417609 Colpidium campylum telomerase RNA gen... 32 0.063
125 sub ScanFile($$) {
127 my ($yassOutputFile,$format) = @_ ;
129 open(FIC,$yassOutputFile) or die print "cant open ".$yassOutputFile."\n";
131 my $nbAlignments = 0;
132 my $nbSSequences = 0;
133 my $selector = 0;
134 my $previous_name_s = "";
135 my $previous_name_q = "";
137 #A) alignments
138 my ($pos_q_str,
139 $pos_q_end,
140 $pos_s_str,
141 $pos_s_end,
142 $name_q,
143 $name_s,
144 $size_q,
145 $size_s,
146 $reverse,
147 $evalue,
148 $score,
149 $bitscore,
150 $bias,
151 $ts,
152 $tv,
153 $proba,
154 $entropy,
155 $fasta1,
156 $middle,
157 $fasta2,)
179 "");
181 while (my $line = <FIC>){
183 # potential header (for alignemnts)
184 if ($format eq "blastheader" && !($name_q eq $previous_name_q)) {
185 print "Query= ".$name_q."\n";
186 print " (".$size_q." letters)\n\n\n";
187 print " Score E\n";
188 print "Sequences producing significant alignments: (bits) Value\n\n";
189 $previous_name_q = $name_q;
192 # lines begining with "*"
193 if($line =~ /^\*/ ){
195 # lines begining with "*("
196 if($line =~ /^\*\(/ ) {
198 # print previous alignments
199 if (!($middle eq "")){
200 &PrintAlign($nbAlignments,$nbSSequences,
201 $pos_q_str,$pos_q_end,$pos_s_str,$pos_s_end,
202 $name_q,$name_s,$size_q,$size_s,
203 $reverse,$evalue,$score,$bitscore,
204 $bias,$ts,$tv,$proba,$entropy,
205 $fasta1,$middle,$fasta2,$format,
206 ($previous_name_q eq "") || !($previous_name_q eq $name_q),
207 ($previous_name_s eq "") || !($previous_name_s eq $name_s)
209 $fasta1 = "";
210 $middle = "";
211 $fasta2 = "";
215 # count number of new alignments (and new sequences)
216 $nbAlignments++;
217 if (($previous_name_s eq "") ||
218 !($previous_name_s eq $name_s)){
219 $nbSSequences++;
223 #first line :
224 #*(546486-566813)(515659-536310) Ev: 0 s: 20328/20652 f
225 $line =~ m/\(([0-9]+)-([0-9]+)\)\(([0-9]+)-([0-9]+)\) Ev: (.+) s: ([0-9]+)\/([0-9]+) ([rf])$/;
226 # get positions
227 $pos_q_str = $1;
228 $pos_q_end = $2;
229 $pos_s_str = $3;
230 $pos_s_end = $4;
231 $evalue = $5;
232 $size_q = $6;
233 $size_s = $7;
234 $reverse = $8;
235 if ($reverse =~ /^f/) {
236 $reverse = "+";
237 } else {
238 $reverse = "-";
240 } elsif ($line =~ /^\* score/ ){
241 # third line :
242 $line =~ m/([0-9]+).*bitscore = (.*)/;
243 $score = $1 + 0;
244 $bitscore = $2 + 0;
245 } elsif ($line =~ /^\* mutations/){
246 # fourth line :
247 #* mutations per triplet 129, 103, 124 (4.54e-04) | ts : 190 tv : 166 | entropy : 5.88743
248 $line =~ m/^\* mutations per triplet ([0-9]+, [0-9]+, [0-9]+) \((.*)\) \| ts : ([0-9]+) tv : ([0-9]+) \| entropy : (.*)/;
249 $bias = $1;
250 $proba = $2;
251 $ts = $3;
252 $tv = $4;
253 $entropy = $5;
254 }else{
255 # second line :
256 $line =~ m/\*[ ]*"(.+)" \(([0-9]+) bp\)[ ]*\/[ ]*"(.+)" \(([0-9]+) bp\)[ ]*$/;
257 $previous_name_q = $name_q;
258 $name_q = $1;
259 $size_q = $2;
260 $previous_name_s = $name_s;
261 $name_s = $3;
262 $size_s = $4;
264 } else {
265 # alignment :
266 if($line =~ /^[A-Za-z-]+$/){
267 $line =~ s/\n//;
268 $selector++;
269 if ($selector % 2){
270 $fasta1 .= "".$line;
271 } else {
272 $fasta2 .= "".$line;
274 } elsif ($line =~ /^[| :.]+$/ && (length($fasta1) > length($fasta2)) ){
275 $line =~ s/\n//;
276 $middle .= $line;
279 } # while <FIC>
281 if (!($middle eq "")){
282 &PrintAlign($nbAlignments,$nbSSequences,
283 $pos_q_str,$pos_q_end,$pos_s_str,$pos_s_end,
284 $name_q,$name_s,$size_q,$size_s,
285 $reverse,$evalue,$score,$bitscore,
286 $bias,$ts,$tv,$proba,$entropy,
287 $fasta1,$middle,$fasta2,$format,
288 ($previous_name_q eq "") &&!($previous_name_q eq $name_q),
289 ($previous_name_s eq "") &&!($previous_name_s eq $name_s)
291 $fasta1 = "";
292 $middle = "";
293 $fasta2 = "";
295 close FIC;
296 return $nbAlignments;
303 #------#
304 # Main #
305 #------#
307 my $outputformat = "blast";
309 ($#ARGV >= 0) or die "yass2blast:\n".
310 " a quick&dirty script to convert \"yass -d 1 \" output files into\n".
311 " - axt format\n".
312 " - fasta format\n".
313 " - blast format\n".
314 "\n".
315 " usage: yass2blast.pl [-blast|-fasta|-axt] {yassoutputfiles}\n".
316 "\n";
319 for (my $i = 0 ; $i <= $#ARGV ; $i++) {
320 #a) select on the fly parameters ...
321 if (($ARGV[$i]) eq "-blast") {
322 $outputformat = "blast";
323 } elsif (($ARGV[$i]) eq "-fasta") {
324 $outputformat = "fasta";
325 } elsif (($ARGV[$i]) eq "-axt") {
326 $outputformat = "axt";
327 } else {
328 #b) or scan files
329 if ($outputformat eq "blast") {
330 &ScanFile($ARGV[$i],"blastheader");
331 print "\n";
333 &ScanFile($ARGV[$i],$outputformat);