new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / radseq / deprecated / bcf4bb.pl
blobf40d086e599ed91be1f07e9afc101fb6c6599e85
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Data::Dump qw(dump ddx);
9 use Galaxy::IO;
10 use Galaxy::SeqTools;
12 die "Usage: $0 <bcgv bcf> <out>\n" if @ARGV<2;
13 my $bcfs=shift;
14 my $outfs=shift;
16 my (%Stat,$t);
17 open O,'>',$outfs.'.out' or die "Error opening $outfs.out : $!\n";
18 $t = "# In: [$bcfs], Out: [$outfs]\n";
19 print O $t;
20 print $t;
22 my $th = openpipe('bcftools view -I',$bcfs); # -I skip indels
23 my (@Samples,@Parents);
24 while (<$th>) {
25 next if /^##/;
26 chomp;
27 my @data = split /\t/;
28 if ($data[0] eq '#CHROM') {
29 @Samples = map {
30 if (/^(\w+)_all\.bam$/) {
31 $_ = $1;
32 } else {
33 my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);
35 } splice @data,9;
36 # ../5.bam_0000210210_merged/d1_4_merged.D4.JHH001.XTU.sort.rmdup.bam
37 @Parents = grep(!/^GZXJ/,@Samples);
38 last;
41 print O "# Samples: [",join('],[',@Samples),"]\n# Parents: [",join('],[',@Parents),"]\n";
42 warn "Samples:\n[",join("]\n[",@Samples),"]\nParents: [",join('],[',@Parents),"]\n";
44 while (<$th>) {
45 next if /^#/;
46 chomp;
47 my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, @data) = split /\t/;
48 ++$Stat{'VCF_In'}; # also as rs#
49 my @Bases = split /\s*,\s*/,"$REF, $ALT";
50 my @groups = split(/\s*;\s*/, $INFO);
51 my (%INFO,$name);
52 # if ($groups[0] eq 'INDEL') {
53 # $INFO{'Type'} = 'INDEL';
54 # shift @groups;
55 # } else {
56 # $INFO{'Type'} = 'SNP';
57 # }
58 for my $group (@groups) {
59 my ($tag,$value) = split /=/,$group;
60 #warn "- $group -> $tag,$value\n";
61 my @values = split /,/,$value;
62 if (@values == 1) {
63 $INFO{$tag}=$values[0];
64 } else {
65 $INFO{$tag}=\@values;
68 my (%GT,%GTcnt);
69 my @FMT = split /:/,$FORMAT;
70 for my $s (@Samples) {
71 my $dat = shift @data or die "bcf file error.";
72 my @dat = split /:/,$dat;
73 for my $i (@FMT) {
74 $GT{$s}{$i} = shift @dat;
77 my $SPcnt = 0;
78 my (%GTitemCnt,$Mut,@plinkGT);
79 for (@Samples) {
80 if ($GT{$_}{'DP'} > 0 and $GT{$_}{'GQ'} >= 20) {
81 my $gt = $GT{$_}{'GT'};
82 ++$GTcnt{$gt};
83 my @GT = split /[\/|]/,$gt;
84 ++$SPcnt;
85 ++$GTitemCnt{$_} for @GT;
86 $GT{$_}{'GTp'} = join ' ',map($Bases[$_], @GT);
87 #$GT{$_}{'O_K'} = 1;
88 } else {
89 $GT{$_}{'GTp'} = '0 0';
90 #$GT{$_}{'O_K'} = 0;
92 push @plinkGT,$GT{$_}{'GTp'};
94 ($Mut) = sort { $GTitemCnt{$b} <=> $GTitemCnt{$a} } keys %GTitemCnt;
95 next if (keys %GTitemCnt) > 1;
96 unless (defined $Mut) {
97 ++$Stat{'VCF_noAffInd_Skipped'};
98 next;
100 unless ($Mut) {
101 ++$Stat{'VCF_noMUT_Skipped'};
102 next;
104 #$Mut = $Bases[$Mut];
105 =pod
106 ++$Stat{'GTcnt'}{$INFO{'FQ'} <=> 0}{scalar(keys %GTcnt)};
107 ddx $Stat{'GTcnt'};
108 ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%INFO,\%GT if scalar(keys %GTcnt) > 1 and $INFO{'FQ'} < 0 and $SPcnt>2;
109 # rad2marker.pl:135: {
110 # -1 => { 1 => 2850, 2 => 526, 3 => 37 },
111 # 1 => { 1 => 8, 2 => 2507, 3 => 1792 },
113 =cut
114 #warn "$CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO\n";
115 #ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%GTcnt,\%INFO,\%GT,\%GTitemCnt,$Mut;
116 if ($QUAL<20 or $SPcnt<18) {
117 ++$Stat{'VCF_Skipped'};
118 next;
120 $Mut = $Bases[$Mut];
121 my $SNPid = "r".$Stat{'VCF_In'};
122 $CHROM =~ /(\d+)/;
123 print STDERR "[$SPcnt] $CHROM, $POS, $REF, $ALT, $QUAL, $INFO ",dump(\%GT),"\n";
125 close $th;
127 ddx \%Stat;
129 close O;
130 __END__
132 ./bcf4bb.pl tigers.bcgv.bcf t 2>&1|tee t.log
134 grep scaffold b17.log |awk '{print $1}'|uniq|perl -lane 's/\,//;print $_'|while read a;do cat bychr/$a.fa >> bgene.fa;done
136 cat bgene.chr|while read a;do cat P_tigris.gene.gtf |grep -P "$a\t" >> bgene.vcf ;done