new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / radseq / deprecated / rad2marker.pl
blob4961842f124aac9b115c91aa5fa36cc2ff64b7e8
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Data::Dump qw(ddx);
9 use Galaxy::IO;
10 use Galaxy::IO::FASTAQ qw(readfq getQvaluesFQ);
11 use Galaxy::SeqTools;
13 die "Usage: $0 <ec list> <bcgv bcf> <out>\n" if @ARGV<2;
14 my $eclst=shift;
15 my $bcfs=shift;
16 my $outfs=shift;
18 my $ReadLen = 101-5;
19 my $minOKalnLen = 30;
20 my $minAlignLen = int($ReadLen * 0.6);
21 $minAlignLen = $minOKalnLen if $minAlignLen < $minOKalnLen;
22 die if $ReadLen < $minOKalnLen;
24 my $Eseq="CTGCAG";
25 my $EseqLen = length $Eseq;
26 my $EcutAt=5;
27 my $EseqL="CTGCA";#substr $Eseq,0,$EcutAt;
28 my $EseqR="TGCAG";
30 my $EfwdTerm5=1-$EcutAt+1; # -3
31 my $EfwdTerm3=1-$EcutAt+$ReadLen; # 92
32 my $ErevTerm3=0; # 0
33 my $ErevTerm5=$ErevTerm3-$ReadLen+1; # -95
34 # 12008 -> [12005,12099],[11914,12008]
35 my $Rfwd2Ec = -$EfwdTerm5; # 3
36 my $Rrev2Ec = -$ErevTerm3; # 0
38 # C|TGCAG
39 my $PosLeft = -$EfwdTerm3; # -92
40 my $PosRight = -$ErevTerm5; # 95;
41 my $PosECsft = $Rfwd2Ec; # 3;
42 # - [x-$PosLeft,x+$PosECsft], Len=96
43 # + [x,x+$PosRight], Len=96
44 # A [x-$PosLeft,x+$PosRight], Len=95+92+1=186
46 my %Stat;
47 my $t;
48 open O,'>',$outfs or die "Error opening $outfs : $!\n";
49 $t = "# EClst: [$eclst], Enzyme: [$Eseq], Cut after $EcutAt\n# Bams: [$bcfs]\n";
50 print O $t;
51 print $t;
53 my %Markers;
54 open L,'<',$eclst or die;
55 while (<L>) {
56 next if /^(#|$)/;
57 my ($chr,$pos,$strand,$mark,$count,$samples) = split /\t/;
58 #warn "$chr,$pos,$strand,$samples,$mark\n";
59 next if $samples < 2;
60 push @{$Markers{$chr}},[$pos,$strand];
62 close L;
64 my $th = openpipe('bcftools view',$bcfs);
65 my (@Samples,@Parents);
66 while (<$th>) {
67 next if /^##/;
68 chomp;
69 my @data = split /\t/;
70 if ($data[0] eq '#CHROM') {
71 @Samples = map {my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);} splice @data,9;
72 # ../5.bam_0000210210_merged/d1_4_merged.D4.JHH001.XTU.sort.rmdup.bam
73 @Parents = grep(!/^GZXJ/,@Samples);
74 last;
77 print O "# Samples: [",join('],[',@Samples),"]\n# Parents: [",join('],[',@Parents),"]\n";
78 warn "Samples:\n[",join("]\n[",@Samples),"]\nParents: [",join('],[',@Parents),"]\n";
79 =pod
80 Samples:
81 [JHH001_D4]
82 [GZXJ03_A1]
83 [GZXJ05_A3]
84 [GZXJ26_B1]
85 [GZXJ27_B2]
86 [GZXJ28_B3]
87 [GZXJ29_B4]
88 [GZXJ30_C1]
89 [GZXJ33_C4]
90 [BHX011_LSJ]
91 [BHX019_LSJ]
92 [GZXJ04_A2]
93 [GZXJ06_A4]
94 [GZXJ31_C2]
95 [GZXJ32_C3]
96 [GZXJ36_D1]
97 [GZXJ37_D2]
98 [GZXJ38_D3]
99 =cut
100 my ($lastChr) = ('');
101 my @items;
102 while (<$th>) {
103 next if /^#/;
104 chomp;
105 my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, @data) = split /\t/;
106 ++$Stat{'VCF_In'};
107 my @groups = split(/\s*;\s*/, $INFO);
108 my (%INFO,$name);
109 if ($groups[0] eq 'INDEL') {
110 $INFO{'Type'} = 'INDEL';
111 shift @groups;
112 } else {
113 $INFO{'Type'} = 'SNP';
115 for my $group (@groups) {
116 my ($tag,$value) = split /=/,$group;
117 #warn "- $group -> $tag,$value\n";
118 my @values = split /,/,$value;
119 if (@values == 1) {
120 $INFO{$tag}=$values[0];
121 } else {
122 $INFO{$tag}=\@values;
125 my (%GT,%GTcnt);
126 my @FMT = split /:/,$FORMAT;
127 for my $s (@Samples) {
128 my $dat = shift @data or die "Bam file error.";
129 my @dat = split /:/,$dat;
130 for my $i (@FMT) {
131 $GT{$s}{$i} = shift @dat;
134 my $SPcnt = 0;
135 for (@Samples) {
136 if ($GT{$_}{'DP'} > 0 and $GT{$_}{'GQ'} > 17) {
137 ++$GTcnt{$GT{$_}{'GT'}};
138 ++$SPcnt;
139 $GT{$_}{'O_K'} = 1;
140 } else {
141 $GT{$_}{'O_K'} = 0;
144 =pod
145 ++$Stat{'GTcnt'}{$INFO{'FQ'} <=> 0}{scalar(keys %GTcnt)};
146 ddx $Stat{'GTcnt'};
147 ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%INFO,\%GT if scalar(keys %GTcnt) > 1 and $INFO{'FQ'} < 0 and $SPcnt>2;
148 # rad2marker.pl:135: {
149 # -1 => { 1 => 2850, 2 => 526, 3 => 37 },
150 # 1 => { 1 => 8, 2 => 2507, 3 => 1792 },
152 =cut
153 #warn "$CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO\n";
154 if ($QUAL<20 or $INFO{'FQ'}<0 or scalar(keys %GTcnt)<2 or $SPcnt<3 or $INFO{'DP'}<6) {
155 ++$Stat{'VCF_Skipped'};
156 next;
158 #ddx $CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO,\%GTcnt,\%INFO,\%GT;
159 unless ($CHROM eq $lastChr) {
160 my ($mark,$flag,$sampleA) = @{&deal_cluster($CHROM,\@items)};
161 @items = ();
162 $lastChr = $CHROM;
164 push @items,[$POS, $REF, $ALT, $SPcnt, \%INFO,\%GT];
167 ddx \%Stat;
168 close O;
170 sub getGT() {
171 my ($REF, $ALT, $GT) = @_;
173 sub deal_cluster() {
174 my ($Chr,$itemsA) = @_;
175 my $mark = 0;
176 if ( (not exists $Markers{$Chr}) ) {
177 ++$Stat{'Cluster_err'} if $Chr ne '';
178 return [$mark,'',[]];
180 for (@{$Markers{$Chr}}) {
181 my ($pos,$strand) = @$_;
182 my ($left,$right);
183 if ($strand eq '+') {
184 ($left,$right)=($pos,$pos+$PosRight);
185 } elsif ($strand eq '-') {
186 ($left,$right)=($pos-$PosLeft,$pos+$PosECsft);
187 } else {
188 ($left,$right)=($pos-$PosLeft,$pos+$PosRight);
190 print "$pos,$strand [$left,$right] ";
191 my @thisDat;
192 for (@$itemsA) {
193 my ($POS, $REF, $ALT, $SPcnt, $pINFO,$pGT) = @$_;
194 next if $POS < $left;
195 last if $POS > $right;
196 push @thisDat,$_
198 next if @thisDat == 0;
199 print "\n$pos,$strand [$left,$right]\n";
200 ddx \@thisDat;
202 ++$Stat{'Cluster_cnt'};
203 #ddx $itemsA;
205 return [$mark,1,[]];
208 __END__
209 zcat radseq.bcgv.vcf.gz|perl -ne 'if (/^#CHROM/) {my @data = split /\t/;@Samples = map {my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);} splice @data,9;splice @data,9,$#data,@Samples;print join("\t",@data),"\n"} else {print $_;}' > radseqA.vcf &
210 zcat radseq.bcgv.vcf.gz|perl -lane 'BEGIN {my $i;} next if /^#/;++$i;$F[0]=~s/^\D+//;print "$F[0]\trs$i\t0\t$F[1]"' > radseqA.map &
212 zcat radseq.bcgv.vcf.gz | perl -ne 'my @data = split /\t/;if (/^#CHROM/) {@Samples = map {my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);} splice @data,9;splice @data,9,$#data,@Samples;print join("\t",@data),"\n"} elsif (/^#/) {print $_;} else {print $_;}' > radseqA.vcf
214 p-link --tfile test --reference-allele test.refallele --pheno test.pheno --all-pheno --model --cell 0 --fisher
216 $ head test.tfam
217 B19 JHH001_D4 0 0 2 1
218 B19 GZXJ03_A1 BHX019_LSJ JHH001_D4 2 1
219 $ head test.tped
220 1001 rs1 0 5188 A T A T A T A T A T A T A T A T A T T T T T T T T T T T T T T T T T T T
221 1001 rs2 0 5193 A A A A A A A A A A A A A A A A A A A C A C A C A C 0 0 A C A C A C 0 0
222 $ head -2 test.refallele
223 rs1 T
224 rs2 T
225 $ head -2 test.pheno
226 FID IID snow sex
227 B19 JHH001_D4 1 2
229 p-link --tfile test --reference-allele test.refallele --pheno test.pheno --all-pheno --model --cell 0 --fisher --out testO
230 $ ll testO*
231 -rw-r--r-- 1 huxs users 2526 Aug 1 18:41 testO.log
232 -rw-r--r-- 1 huxs users 3672 Aug 1 18:41 testO.sex.model
233 -rw-r--r-- 1 huxs users 3672 Aug 1 18:41 testO.snow.model
235 #--freq --missing --tdt