3 # Author: petr.danecek@sanger
5 # Test bcf synced reader's allele pairing
12 use List
::Util
'shuffle';
13 use File
::Temp qw
/ tempfile tempdir /;
15 use lib
"$FindBin::Bin";
17 my $opts = parse_params
();
22 #--------------------------------
27 if ( scalar @msg ) { confess
@msg; }
29 "Usage: test-bcf-sr.pl [OPTIONS]\n",
31 " -s, --seed <int> Random seed\n",
32 " -t, --temp-dir <dir> When given, temporary files will not be removed\n",
34 " -h, -?, --help This help message\n",
41 while (defined(my $arg=shift(@ARGV)))
43 if ( $arg eq '-t' || $arg eq '--temp-dir' ) { $$opts{keep_files
}=shift(@ARGV); next }
44 if ( $arg eq '-v' || $arg eq '--verbose' ) { $$opts{verbose
}=1; next }
45 if ( $arg eq '-s' || $arg eq '--seed' ) { $$opts{seed
}=shift(@ARGV); next }
46 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error
(); }
47 error
("Unknown parameter \"$arg\". Run -h for help.\n");
49 $$opts{tmp
} = exists($$opts{keep_files
}) ?
$$opts{keep_files
} : tempdir
(CLEANUP
=>1);
50 if ( $$opts{keep_files
} ) { cmd
("mkdir -p $$opts{keep_files}"); }
51 if ( !exists($$opts{seed
}) )
53 $$opts{seed
} = time();
54 print STDERR
"Random seed is $$opts{seed}\n";
65 my $pid = open($kid_io, "-|");
66 if ( !defined $pid ) { error
("Cannot fork: $!"); }
76 exec('bash', '-o','pipefail','-c', $cmd) or error
("Cannot execute the command [/bin/sh -o pipefail -c $cmd]: $!");
78 return ($?
>> 8, join('',@out));
83 my ($ret,$out) = _cmd
($cmd);
84 if ( $ret ) { error
("The command failed [$ret]: $cmd\n", $out); }
90 my ($opts,$vars,$fname) = @_;
91 open(my $fh,"| $FindBin::Bin/../bgzip -c > $fname") or error
("$FindBin::Bin/../bgzip -c > $fname: !");
92 print $fh qq[##fileformat=VCFv4.3\n];
93 print $fh qq[##FILTER=<ID=PASS,Description="All filters passed">\n];
94 print $fh qq[##contig=<ID=1>\n];
95 print $fh qq[##contig=<ID=2>\n];
96 print $fh '#'. join("\t", qw(CHROM POS ID REF ALT QUAL FILTER INFO))."\n";
99 my @als = split(/,/,$var);
104 my ($xref,$alt) = split(/>/,$al);
108 print $fh join("\t", (1,100,'.',$ref,join(',',@alts),'.','.','.'))."\n";
112 my @als = split(/,/,$var);
117 my ($xref,$alt) = split(/>/,$al);
121 print $fh join("\t", (1,300,'.',$ref,join(',',@alts),'.','.','.'))."\n";
125 my @als = split(/,/,$var);
130 my ($xref,$alt) = split(/>/,$al);
134 print $fh join("\t", (2,100,'.',$ref,join(',',@alts),'.','.','.'))."\n";
136 close($fh) or error
("close failed: bgzip -c > $fname");
137 cmd
("$FindBin::Bin/../tabix -f $fname");
142 my ($ref,$is_snp) = @_;
143 my @acgt = qw(A C G T);
144 my $alt = $acgt[rand @acgt];
145 if ( $ref eq $alt ) { return '.'; } # ref
146 if ( !$is_snp ) { $alt = $ref.$alt; }
152 my ($fname_bin,$fname_perl) = @_;
154 open(my $fh,'<',$fname_bin) or error
("$fname_bin: $!");
155 while (my $line=<$fh>)
157 my ($pos,@vals) = split(/\t/,$line);
159 push @
{$out{$pos}},join("\t",@vals);
161 close($fh) or error
("close failed: $fname_bin");
162 if ( keys %out != 3 ) { error
("Expected 3 positions, found ",scalar keys %out,": $fname_bin\n"); }
164 for my $pos (keys %out)
166 if ( !defined $n ) { $n = scalar @
{$out{$pos}}; }
167 if ( @
{$out{$pos}} != $n ) { error
("Expected $n positions, found ",scalar keys %{$out{$pos}},"\n"); }
169 my @blines = @
{$out{(keys %out)[0]}};
172 open($fh,'<',$fname_perl) or error
("$fname_perl: $!");
173 while (my $line=<$fh>)
178 close($fh) or error
("close failed: $fname_perl");
179 if ( @blines != @plines ) { error
("Different number of lines: ",scalar @blines," vs ",scalar @plines," in $fname_bin vs $fname_perl\n"); }
180 @blines = sort @blines;
181 @plines = sort @plines;
182 for (my $i=0; $i<@plines; $i++)
184 if ( $blines[$i] ne $plines[$i] )
186 #error("Different lines in $fname_bin vs $fname_perl:\n\t$blines[$i].\nvs\n\t$plines[$i].\n");
187 error
("Different lines in $fname_bin vs $fname_perl:\n\t".join("\n\t",@blines)."\nvs\n\t".join("\n\t",@plines)."\n");
195 my @acgt = qw(A C G T);
196 my $ref = $acgt[rand @acgt];
198 my $nvcf = 1 + int(rand(10));
199 for (my $i=0; $i<$nvcf; $i++)
202 my $nvars = 1 + int(rand(6));
203 for (my $j=0; $j<$nvars; $j++)
205 my $snp = int(rand(2));
206 my $alt = random_alt
($ref,$snp);
207 my $var = "$ref>$alt";
208 if ( $alt ne '.' && !int(rand(5)) ) # create multiallelic site
210 my $alt2 = random_alt
($ref,$snp);
211 if ( $alt2 ne '.' && $alt ne $alt2 )
213 $var .= ",$ref>$alt2";
218 my $ndup = 1 + int(rand(4));
219 for (my $j=0; $j<$ndup; $j++)
221 my @keys = shuffle
keys %vars;
225 @vcfs = shuffle
@vcfs;
226 open(my $fh,'>',"$$opts{tmp}/list.txt") or error
("$$opts{tmp}/list.txt: $!");
229 for (my $i=0; $i<@vcfs; $i++)
232 my $key = join(';',sort @
$vcf);
233 if ( !exists($groups{$key}) )
235 push @group_list,$key;
236 $groups{$key}{vars
} = [@
$vcf];
237 $groups{$key}{key
} = $key;
239 push @
{$groups{$key}{vcfs
}},$i;
240 save_vcf
($opts,$vcf,"$$opts{tmp}/$i.vcf.gz");
241 print $fh "$$opts{tmp}/$i.vcf.gz\n";
246 for my $group (@group_list) { push @groups, $groups{$group}; }
247 for my $logic (qw(snps indels both snps+ref indels+ref both+ref exact some all))
248 #for my $logic (qw(snps))
250 print STDERR
"$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out\n" unless !$$opts{verbose
};
251 cmd
("$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out");
253 open(my $fh,'>',"$$opts{tmp}/rmme.perl.out") or error
("$$opts{tmp}/rmme.perl.out: $!");
255 $$opts{logic
} = $logic;
256 pair_lines
($opts,\
@groups);
257 close($fh) or error
("close failed: $$opts{tmp}/rmme.perl.out");
259 check_outputs
("$$opts{tmp}/rmme.bin.out","$$opts{tmp}/rmme.perl.out");
265 my ($opts,$groups) = @_;
267 #print 'groups: '.Dumper($groups);
269 # get a list of all unique variants and their groups
272 for (my $igrp=0; $igrp<@
$groups; $igrp++)
274 my $grp = $$groups[$igrp];
275 for (my $ivar=0; $ivar<@
{$$grp{vars
}}; $ivar++)
277 my $var = $$grp{vars
}[$ivar];
278 if ( !exists($vars{$var}) ) { push @var_list,$var; } # just to keep the order
279 push @
{$vars{$var}}, { igrp
=>$igrp, ivar
=>$ivar, cnt
=>scalar @
{$$grp{vcfs
}} };
283 # each variant has a list of groups that it is present in
285 for my $var (@var_list) { push @vars, $vars{$var}; }
287 #print STDERR 'unique variants: '.Dumper(\@var_list);
288 # for (my $i=0; $i<@vars; $i++)
290 # my $igrp = $vars[$i][0]{igrp};
291 # my $jvar = $vars[$i][0]{ivar};
292 # my $var = $$groups[$igrp]{vars}[$jvar];
293 # print STDERR "$i: $var\n";
296 # initialize variant sets - combinations of compatible variants across multiple reader groups
298 for (my $i=0; $i<@vars; $i++) { push @var_sets,[$i]; }
302 for (my $iset=0; $iset<@var_sets; $iset++)
304 $pmatrix[$iset] = [(0) x
(scalar @
$groups)];
308 for (my $iset=0; $iset<@var_sets; $iset++)
311 for my $ivar (@
{$var_sets[$iset]})
313 my $var = $vars[$ivar];
316 my $igrp = $$grp{igrp
};
317 $pmatrix[$iset][$igrp] += $$grp{cnt
};
318 if ( $bitmask[$iset] & (1<<$igrp) ) { error
("Uh!"); }
319 $bitmask[$iset] |= 1<<$igrp;
320 $tmp_max += $$grp{cnt
};
330 for (my $iset=1; $iset<@var_sets; $iset++)
332 if ( $max[$iset] > $max[$imax] ) { $imax = $iset; }
334 # if ( @var_sets == @vars ) { dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask); }
338 for (my $iset=0; $iset<@var_sets; $iset++)
340 if ( $bitmask[$imax] & $bitmask[$iset] ) { next; } # cannot merge
341 my $score = pairing_score
($opts,$groups,\
@vars,$var_sets[$imax],$var_sets[$iset]);
342 if ( $max_score < $score ) { $max_score = $score; $ipair = $iset; }
345 # merge rows thus creating a new variant set
346 if ( defined $ipair && $ipair != $imax )
348 $imax = merge_rows
($groups,\
@vars,\
@var_sets,\
@pmatrix,\
@bitmask,\
@max,$imax,$ipair);
352 output_row
($opts,$groups,\
@vars,\
@var_sets,\
@pmatrix,\
@bitmask,\
@max,$imax);
353 # dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask);
359 my ($grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset,$jvset) = @_;
360 if ( $ivset > $jvset ) { my $tmp = $ivset; $ivset = $jvset; $jvset = $tmp; }
361 push @
{$$var_sets[$ivset]}, @
{$$var_sets[$jvset]};
362 for (my $igrp=0; $igrp<@
{$$pmat[$ivset]}; $igrp++)
364 $$pmat[$ivset][$igrp] += $$pmat[$jvset][$igrp];
366 $$max[$ivset] += $$max[$jvset];
367 $$bitmask[$ivset] |= $$bitmask[$jvset];
368 splice(@
$var_sets,$jvset,1);
369 splice(@
$pmat,$jvset,1);
370 splice(@
$bitmask,$jvset,1);
371 splice(@
$max,$jvset,1);
377 my ($opts,$grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset) = @_;
378 my $varset = $$var_sets[$ivset];
382 for my $vcf (@
{$$grp{vcfs
}}) { push @tmp, '-'; }
384 for my $idx (@
$varset)
386 for my $var (@
{$$vars[$idx]})
388 my $igrp = $$var{igrp
};
389 my $jvar = $$var{ivar
};
390 my $str = $$grps[$igrp]{vars
}[$jvar];
392 for my $ivcf (@
{$$grps[$igrp]{vcfs
}}) { $tmp[$ivcf] = $str; }
395 print {$$opts{fh
}} join("\t",@tmp)."\n";
396 splice(@
$var_sets,$ivset,1);
397 splice(@
$pmat,$ivset,1);
398 splice(@
$bitmask,$ivset,1);
399 splice(@
$max,$ivset,1);
404 my ($grps,$vars,$var_sets,$pmat,$bitmask) = @_;
405 for (my $ivset=0; $ivset<@
$var_sets; $ivset++)
407 my $varset = $$var_sets[$ivset];
409 for my $ivar (@
$varset)
411 my $igrp = $$vars[$ivar][0]{igrp
};
412 my $jvar = $$vars[$ivar][0]{ivar
};
413 push @tmp, $$grps[$igrp]{vars
}[$jvar];
415 printf STDERR
"%-10s",join(',',@tmp);
416 for (my $igrp=0; $igrp<@
{$$pmat[0]}; $igrp++)
418 print STDERR
"\t$$pmat[$ivset][$igrp]";
429 for my $var (split(/,/,$vars))
431 my ($ref,$alt) = split(/>/,$var);
432 if ( $ref eq $alt or $alt eq '.' ) { $type{ref} = 1; }
433 elsif ( length($ref)==length($alt) && length($ref)==1 ) { $type{snp
} = 1; }
434 else { $type{indel
} = 1; }
440 my ($avar,$bvar) = @_;
443 for my $var (split(/,/,$avar)) { $avars{$var} = 1; }
444 for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; }
445 for my $var (keys %avars)
447 if ( exists($bvars{$var}) ) { return 1; }
449 for my $var (keys %bvars)
451 if ( exists($avars{$var}) ) { return 1; }
457 my ($avar,$bvar) = @_;
460 for my $var (split(/,/,$avar)) { $avars{$var} = 1; }
461 for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; }
462 for my $var (keys %avars)
464 if ( !exists($bvars{$var}) ) { return 0; }
466 for my $var (keys %bvars)
468 if ( !exists($avars{$var}) ) { return 0; }
474 my ($opts,$grps,$vars,$avset,$bvset) = @_;
477 if ( $$opts{logic
}=~/both/ or $$opts{logic
}=~/snps/ or $$opts{logic
}=~/all/ )
479 $$score{snp
}{snp
} = 3;
480 if ( $$opts{logic
}=~/ref/ or $$opts{logic
}=~/all
/ ) { $$score{snp
}{ref} = 2; }
482 if ( $$opts{logic
}=~/both/ or $$opts{logic
}=~/indels/ or $$opts{logic
}=~/all/ )
484 $$score{indel
}{indel
} = 3;
485 if ( $$opts{logic
}=~/ref/ or $$opts{logic
}=~/all
/ ) { $$score{indel
}{ref} = 2; }
487 if ( $$opts{logic
}=~/all/ )
489 $$score{snp
}{indel
} = 1;
490 $$score{indel
}{snp
} = 1;
492 for my $a (keys %$score)
494 for my $b (keys %{$$score{$a}})
496 $$score{$b}{$a} = $$score{$a}{$b};
500 my $max_int = 0xFFFFFFFF;
506 my $avar = $$grps[ $$vars[$ia][0]{igrp
} ]{vars
}[ $$vars[$ia][0]{ivar
} ];
507 my $bvar = $$grps[ $$vars[$ib][0]{igrp
} ]{vars
}[ $$vars[$ib][0]{ivar
} ];
509 if ( $avar eq $bvar ) { return $max_int; }
510 if ( $$opts{logic
} eq 'exact' )
512 if ( multi_is_exact
($avar,$bvar) ) { return $max_int; }
515 elsif ( multi_is_subset
($avar,$bvar) ) { return $max_int; }
517 my @atype = var_type
($avar);
518 my @btype = var_type
($bvar);
524 if ( !exists($$score{$a}{$b}) ) { next; }
525 if ( $max < $$score{$a}{$b} ) { $max = $$score{$a}{$b}; }
528 if ( !$max ) { return 0; } # some of the variants in the two groups are not compatible
529 if ( $min > $max ) { $min = $max; }
532 if ( $$opts{logic
} eq 'exact' ) { return 0; }
535 for my $ivar (@
$avset,@
$bvset)
537 my $var = $$vars[$ivar];
543 return (1<<(28+$min)) + $cnt;