modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / lib / htslib / test / test-bcf-sr.pl
blob8f78acb864a3ce31c60db1958e4c2395d79cbe4e
1 #!/usr/bin/env perl
3 # Author: petr.danecek@sanger
5 # Test bcf synced reader's allele pairing
8 use strict;
9 use warnings;
10 use Carp;
11 use Data::Dumper;
12 use List::Util 'shuffle';
13 use File::Temp qw/ tempfile tempdir /;
14 use FindBin;
15 use lib "$FindBin::Bin";
17 my $opts = parse_params();
18 run_test($opts);
20 exit;
22 #--------------------------------
24 sub error
26 my (@msg) = @_;
27 if ( scalar @msg ) { confess @msg; }
28 print
29 "Usage: test-bcf-sr.pl [OPTIONS]\n",
30 "Options:\n",
31 " -s, --seed <int> Random seed\n",
32 " -t, --temp-dir <dir> When given, temporary files will not be removed\n",
33 " -v, --verbose \n",
34 " -h, -?, --help This help message\n",
35 "\n";
36 exit -1;
38 sub parse_params
40 my $opts = {};
41 while (defined(my $arg=shift(@ARGV)))
43 if ( $arg eq '-t' || $arg eq '--temp-dir' ) { $$opts{keep_files}=shift(@ARGV); next }
44 if ( $arg eq '-v' || $arg eq '--verbose' ) { $$opts{verbose}=1; next }
45 if ( $arg eq '-s' || $arg eq '--seed' ) { $$opts{seed}=shift(@ARGV); next }
46 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
47 error("Unknown parameter \"$arg\". Run -h for help.\n");
49 $$opts{tmp} = exists($$opts{keep_files}) ? $$opts{keep_files} : tempdir(CLEANUP=>1);
50 if ( $$opts{keep_files} ) { cmd("mkdir -p $$opts{keep_files}"); }
51 if ( !exists($$opts{seed}) )
53 $$opts{seed} = time();
54 print STDERR "Random seed is $$opts{seed}\n";
56 srand($$opts{seed});
57 return $opts;
60 sub _cmd
62 my ($cmd) = @_;
63 my $kid_io;
64 my @out;
65 my $pid = open($kid_io, "-|");
66 if ( !defined $pid ) { error("Cannot fork: $!"); }
67 if ($pid)
69 # parent
70 @out = <$kid_io>;
71 close($kid_io);
73 else
75 # child
76 exec('bash', '-o','pipefail','-c', $cmd) or error("Cannot execute the command [/bin/sh -o pipefail -c $cmd]: $!");
78 return ($? >> 8, join('',@out));
80 sub cmd
82 my ($cmd) = @_;
83 my ($ret,$out) = _cmd($cmd);
84 if ( $ret ) { error("The command failed [$ret]: $cmd\n", $out); }
85 return $out;
88 sub save_vcf
90 my ($opts,$vars,$fname) = @_;
91 open(my $fh,"| $FindBin::Bin/../bgzip -c > $fname") or error("$FindBin::Bin/../bgzip -c > $fname: !");
92 print $fh qq[##fileformat=VCFv4.3\n];
93 print $fh qq[##FILTER=<ID=PASS,Description="All filters passed">\n];
94 print $fh qq[##contig=<ID=1>\n];
95 print $fh qq[##contig=<ID=2>\n];
96 print $fh '#'. join("\t", qw(CHROM POS ID REF ALT QUAL FILTER INFO))."\n";
97 for my $var (@$vars)
99 my @als = split(/,/,$var);
100 my @alts = ();
101 my $ref;
102 for my $al (@als)
104 my ($xref,$alt) = split(/>/,$al);
105 $ref = $xref;
106 push @alts,$alt;
108 print $fh join("\t", (1,100,'.',$ref,join(',',@alts),'.','.','.'))."\n";
110 for my $var (@$vars)
112 my @als = split(/,/,$var);
113 my @alts = ();
114 my $ref;
115 for my $al (@als)
117 my ($xref,$alt) = split(/>/,$al);
118 $ref = $xref;
119 push @alts,$alt;
121 print $fh join("\t", (1,300,'.',$ref,join(',',@alts),'.','.','.'))."\n";
123 for my $var (@$vars)
125 my @als = split(/,/,$var);
126 my @alts = ();
127 my $ref;
128 for my $al (@als)
130 my ($xref,$alt) = split(/>/,$al);
131 $ref = $xref;
132 push @alts,$alt;
134 print $fh join("\t", (2,100,'.',$ref,join(',',@alts),'.','.','.'))."\n";
136 close($fh) or error("close failed: bgzip -c > $fname");
137 cmd("$FindBin::Bin/../tabix -f $fname");
140 sub random_alt
142 my ($ref,$is_snp) = @_;
143 my @acgt = qw(A C G T);
144 my $alt = $acgt[rand @acgt];
145 if ( $ref eq $alt ) { return '.'; } # ref
146 if ( !$is_snp ) { $alt = $ref.$alt; }
147 return $alt;
150 sub check_outputs
152 my ($fname_bin,$fname_perl) = @_;
153 my %out = ();
154 open(my $fh,'<',$fname_bin) or error("$fname_bin: $!");
155 while (my $line=<$fh>)
157 my ($pos,@vals) = split(/\t/,$line);
158 chomp($vals[-1]);
159 push @{$out{$pos}},join("\t",@vals);
161 close($fh) or error("close failed: $fname_bin");
162 if ( keys %out != 3 ) { error("Expected 3 positions, found ",scalar keys %out,": $fname_bin\n"); }
163 my $n;
164 for my $pos (keys %out)
166 if ( !defined $n ) { $n = scalar @{$out{$pos}}; }
167 if ( @{$out{$pos}} != $n ) { error("Expected $n positions, found ",scalar keys %{$out{$pos}},"\n"); }
169 my @blines = @{$out{(keys %out)[0]}};
171 my @plines = ();
172 open($fh,'<',$fname_perl) or error("$fname_perl: $!");
173 while (my $line=<$fh>)
175 chomp($line);
176 push @plines,$line;
178 close($fh) or error("close failed: $fname_perl");
179 if ( @blines != @plines ) { error("Different number of lines: ",scalar @blines," vs ",scalar @plines," in $fname_bin vs $fname_perl\n"); }
180 @blines = sort @blines;
181 @plines = sort @plines;
182 for (my $i=0; $i<@plines; $i++)
184 if ( $blines[$i] ne $plines[$i] )
186 #error("Different lines in $fname_bin vs $fname_perl:\n\t$blines[$i].\nvs\n\t$plines[$i].\n");
187 error("Different lines in $fname_bin vs $fname_perl:\n\t".join("\n\t",@blines)."\nvs\n\t".join("\n\t",@plines)."\n");
192 sub run_test
194 my ($opts) = @_;
195 my @acgt = qw(A C G T);
196 my $ref = $acgt[rand @acgt];
197 my @vcfs = ();
198 my $nvcf = 1 + int(rand(10));
199 for (my $i=0; $i<$nvcf; $i++)
201 my %vars = ();
202 my $nvars = 1 + int(rand(6));
203 for (my $j=0; $j<$nvars; $j++)
205 my $snp = int(rand(2));
206 my $alt = random_alt($ref,$snp);
207 my $var = "$ref>$alt";
208 if ( $alt ne '.' && !int(rand(5)) ) # create multiallelic site
210 my $alt2 = random_alt($ref,$snp);
211 if ( $alt2 ne '.' && $alt ne $alt2 )
213 $var .= ",$ref>$alt2";
216 $vars{$var} = 1;
218 my $ndup = 1 + int(rand(4));
219 for (my $j=0; $j<$ndup; $j++)
221 my @keys = shuffle keys %vars;
222 push @vcfs, \@keys;
225 @vcfs = shuffle @vcfs;
226 open(my $fh,'>',"$$opts{tmp}/list.txt") or error("$$opts{tmp}/list.txt: $!");
227 my %groups = ();
228 my @group_list = ();
229 for (my $i=0; $i<@vcfs; $i++)
231 my $vcf = $vcfs[$i];
232 my $key = join(';',sort @$vcf);
233 if ( !exists($groups{$key}) )
235 push @group_list,$key;
236 $groups{$key}{vars} = [@$vcf];
237 $groups{$key}{key} = $key;
239 push @{$groups{$key}{vcfs}},$i;
240 save_vcf($opts,$vcf,"$$opts{tmp}/$i.vcf.gz");
241 print $fh "$$opts{tmp}/$i.vcf.gz\n";
243 close($fh);
245 my @groups = ();
246 for my $group (@group_list) { push @groups, $groups{$group}; }
247 for my $logic (qw(snps indels both snps+ref indels+ref both+ref exact some all))
248 #for my $logic (qw(snps))
250 print STDERR "$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out\n" unless !$$opts{verbose};
251 cmd("$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out");
253 open(my $fh,'>',"$$opts{tmp}/rmme.perl.out") or error("$$opts{tmp}/rmme.perl.out: $!");
254 $$opts{fh} = $fh;
255 $$opts{logic} = $logic;
256 pair_lines($opts,\@groups);
257 close($fh) or error("close failed: $$opts{tmp}/rmme.perl.out");
259 check_outputs("$$opts{tmp}/rmme.bin.out","$$opts{tmp}/rmme.perl.out");
263 sub pair_lines
265 my ($opts,$groups) = @_;
267 #print 'groups: '.Dumper($groups);
269 # get a list of all unique variants and their groups
270 my %vars = ();
271 my @var_list = ();
272 for (my $igrp=0; $igrp<@$groups; $igrp++)
274 my $grp = $$groups[$igrp];
275 for (my $ivar=0; $ivar<@{$$grp{vars}}; $ivar++)
277 my $var = $$grp{vars}[$ivar];
278 if ( !exists($vars{$var}) ) { push @var_list,$var; } # just to keep the order
279 push @{$vars{$var}}, { igrp=>$igrp, ivar=>$ivar, cnt=>scalar @{$$grp{vcfs}} };
283 # each variant has a list of groups that it is present in
284 my @vars = ();
285 for my $var (@var_list) { push @vars, $vars{$var}; }
287 #print STDERR 'unique variants: '.Dumper(\@var_list);
288 # for (my $i=0; $i<@vars; $i++)
290 # my $igrp = $vars[$i][0]{igrp};
291 # my $jvar = $vars[$i][0]{ivar};
292 # my $var = $$groups[$igrp]{vars}[$jvar];
293 # print STDERR "$i: $var\n";
296 # initialize variant sets - combinations of compatible variants across multiple reader groups
297 my @var_sets = ();
298 for (my $i=0; $i<@vars; $i++) { push @var_sets,[$i]; }
300 my @bitmask = ();
301 my @pmatrix = ();
302 for (my $iset=0; $iset<@var_sets; $iset++)
304 $pmatrix[$iset] = [(0) x (scalar @$groups)];
305 $bitmask[$iset] = 0;
307 my @max;
308 for (my $iset=0; $iset<@var_sets; $iset++)
310 my $tmp_max = 0;
311 for my $ivar (@{$var_sets[$iset]})
313 my $var = $vars[$ivar];
314 for my $grp (@$var)
316 my $igrp = $$grp{igrp};
317 $pmatrix[$iset][$igrp] += $$grp{cnt};
318 if ( $bitmask[$iset] & (1<<$igrp) ) { error("Uh!"); }
319 $bitmask[$iset] |= 1<<$igrp;
320 $tmp_max += $$grp{cnt};
323 push @max, $tmp_max;
326 # pair the lines
327 while ( @var_sets )
329 my $imax = 0;
330 for (my $iset=1; $iset<@var_sets; $iset++)
332 if ( $max[$iset] > $max[$imax] ) { $imax = $iset; }
334 # if ( @var_sets == @vars ) { dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask); }
336 my $ipair = undef;
337 my $max_score = 0;
338 for (my $iset=0; $iset<@var_sets; $iset++)
340 if ( $bitmask[$imax] & $bitmask[$iset] ) { next; } # cannot merge
341 my $score = pairing_score($opts,$groups,\@vars,$var_sets[$imax],$var_sets[$iset]);
342 if ( $max_score < $score ) { $max_score = $score; $ipair = $iset; }
345 # merge rows thus creating a new variant set
346 if ( defined $ipair && $ipair != $imax )
348 $imax = merge_rows($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask,\@max,$imax,$ipair);
349 next;
352 output_row($opts,$groups,\@vars,\@var_sets,\@pmatrix,\@bitmask,\@max,$imax);
353 # dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask);
357 sub merge_rows
359 my ($grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset,$jvset) = @_;
360 if ( $ivset > $jvset ) { my $tmp = $ivset; $ivset = $jvset; $jvset = $tmp; }
361 push @{$$var_sets[$ivset]}, @{$$var_sets[$jvset]};
362 for (my $igrp=0; $igrp<@{$$pmat[$ivset]}; $igrp++)
364 $$pmat[$ivset][$igrp] += $$pmat[$jvset][$igrp];
366 $$max[$ivset] += $$max[$jvset];
367 $$bitmask[$ivset] |= $$bitmask[$jvset];
368 splice(@$var_sets,$jvset,1);
369 splice(@$pmat,$jvset,1);
370 splice(@$bitmask,$jvset,1);
371 splice(@$max,$jvset,1);
372 return $ivset;
375 sub output_row
377 my ($opts,$grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset) = @_;
378 my $varset = $$var_sets[$ivset];
379 my @tmp = ();
380 for my $grp (@$grps)
382 for my $vcf (@{$$grp{vcfs}}) { push @tmp, '-'; }
384 for my $idx (@$varset)
386 for my $var (@{$$vars[$idx]})
388 my $igrp = $$var{igrp};
389 my $jvar = $$var{ivar};
390 my $str = $$grps[$igrp]{vars}[$jvar];
391 $str =~ s/[^>]>//g;
392 for my $ivcf (@{$$grps[$igrp]{vcfs}}) { $tmp[$ivcf] = $str; }
395 print {$$opts{fh}} join("\t",@tmp)."\n";
396 splice(@$var_sets,$ivset,1);
397 splice(@$pmat,$ivset,1);
398 splice(@$bitmask,$ivset,1);
399 splice(@$max,$ivset,1);
402 sub dump_pmatrix
404 my ($grps,$vars,$var_sets,$pmat,$bitmask) = @_;
405 for (my $ivset=0; $ivset<@$var_sets; $ivset++)
407 my $varset = $$var_sets[$ivset];
408 my @tmp = ();
409 for my $ivar (@$varset)
411 my $igrp = $$vars[$ivar][0]{igrp};
412 my $jvar = $$vars[$ivar][0]{ivar};
413 push @tmp, $$grps[$igrp]{vars}[$jvar];
415 printf STDERR "%-10s",join(',',@tmp);
416 for (my $igrp=0; $igrp<@{$$pmat[0]}; $igrp++)
418 print STDERR "\t$$pmat[$ivset][$igrp]";
420 print STDERR "\n";
422 print STDERR "\n";
425 sub var_type
427 my ($vars) = @_;
428 my %type = ();
429 for my $var (split(/,/,$vars))
431 my ($ref,$alt) = split(/>/,$var);
432 if ( $ref eq $alt or $alt eq '.' ) { $type{ref} = 1; }
433 elsif ( length($ref)==length($alt) && length($ref)==1 ) { $type{snp} = 1; }
434 else { $type{indel} = 1; }
436 return keys %type;
438 sub multi_is_subset
440 my ($avar,$bvar) = @_;
441 my %avars = ();
442 my %bvars = ();
443 for my $var (split(/,/,$avar)) { $avars{$var} = 1; }
444 for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; }
445 for my $var (keys %avars)
447 if ( exists($bvars{$var}) ) { return 1; }
449 for my $var (keys %bvars)
451 if ( exists($avars{$var}) ) { return 1; }
453 return 0;
455 sub multi_is_exact
457 my ($avar,$bvar) = @_;
458 my %avars = ();
459 my %bvars = ();
460 for my $var (split(/,/,$avar)) { $avars{$var} = 1; }
461 for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; }
462 for my $var (keys %avars)
464 if ( !exists($bvars{$var}) ) { return 0; }
466 for my $var (keys %bvars)
468 if ( !exists($avars{$var}) ) { return 0; }
470 return 1;
472 sub pairing_score
474 my ($opts,$grps,$vars,$avset,$bvset) = @_;
476 my $score = {};
477 if ( $$opts{logic}=~/both/ or $$opts{logic}=~/snps/ or $$opts{logic}=~/all/ )
479 $$score{snp}{snp} = 3;
480 if ( $$opts{logic}=~/ref/ or $$opts{logic}=~/all/ ) { $$score{snp}{ref} = 2; }
482 if ( $$opts{logic}=~/both/ or $$opts{logic}=~/indels/ or $$opts{logic}=~/all/ )
484 $$score{indel}{indel} = 3;
485 if ( $$opts{logic}=~/ref/ or $$opts{logic}=~/all/ ) { $$score{indel}{ref} = 2; }
487 if ( $$opts{logic}=~/all/ )
489 $$score{snp}{indel} = 1;
490 $$score{indel}{snp} = 1;
492 for my $a (keys %$score)
494 for my $b (keys %{$$score{$a}})
496 $$score{$b}{$a} = $$score{$a}{$b};
500 my $max_int = 0xFFFFFFFF;
501 my $min = $max_int;
502 for my $ia (@$avset)
504 for my $ib (@$bvset)
506 my $avar = $$grps[ $$vars[$ia][0]{igrp} ]{vars}[ $$vars[$ia][0]{ivar} ];
507 my $bvar = $$grps[ $$vars[$ib][0]{igrp} ]{vars}[ $$vars[$ib][0]{ivar} ];
509 if ( $avar eq $bvar ) { return $max_int; }
510 if ( $$opts{logic} eq 'exact' )
512 if ( multi_is_exact($avar,$bvar) ) { return $max_int; }
513 next;
515 elsif ( multi_is_subset($avar,$bvar) ) { return $max_int; }
517 my @atype = var_type($avar);
518 my @btype = var_type($bvar);
519 my $max = 0;
520 for my $a (@atype)
522 for my $b (@btype)
524 if ( !exists($$score{$a}{$b}) ) { next; }
525 if ( $max < $$score{$a}{$b} ) { $max = $$score{$a}{$b}; }
528 if ( !$max ) { return 0; } # some of the variants in the two groups are not compatible
529 if ( $min > $max ) { $min = $max; }
532 if ( $$opts{logic} eq 'exact' ) { return 0; }
534 my $cnt = 0;
535 for my $ivar (@$avset,@$bvset)
537 my $var = $$vars[$ivar];
538 for my $grp (@$var)
540 $cnt += $$grp{cnt};
543 return (1<<(28+$min)) + $cnt;