new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / radseq / annoplink.pl
blob8ca4ea27471fb3d2a1a94277e8a77667c7cdc36f
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Data::Dump qw(ddx);
9 use Galaxy::IO;
10 use Galaxy::SeqTools;
11 use List::MoreUtils 'first_index';
13 die "Usage: $0 <plink model file> <bcgv bcf> <out>\n" if @ARGV<3;
14 my $plinkfs=shift;
15 my $bcfs=shift;
16 my $outfs=shift;
18 my (%Stat,$t);
19 open O,'>',$outfs or die "Error opening $outfs: $!\n";
20 $t = "# In: [$plinkfs],[$bcfs], Out: [$outfs]\n";
21 print O $t;
22 print $t;
24 my (%Plink,@PlinkS,%Vcf,$SortI,$TypeI,%Types);
25 open L,'<',$plinkfs or die;
26 while (<L>) {
27 if (/^ CHR /) {
28 s/^\s+//;
29 $SortI = first_index { $_ eq 'P' } (split /\s+/);
30 $TypeI = first_index { $_ eq 'TEST' } (split /\s+/);
31 #warn $SortI,(split /\s+/)[$SortI];
32 next;
34 chomp;
35 s/^\s+//;
36 my @Dat = split /\s+/; # $chr,$markID,$min,$maj,$model,@other
37 $Dat[1] =~ /(\d+)/;
38 $t = $1;
39 #die '[',join('|',@Dat),"]\n";
40 $Plink{$t}{$Dat[$TypeI]} = \@Dat;
41 ++$Types{$TypeI};
43 close L;
44 #ddx \%Plink;
46 my $th = openpipe('bcftools view -I',$bcfs); # -I skip indels
47 my (@Samples,@Parents);
48 while (<$th>) {
49 next if /^##/;
50 chomp;
51 my @data = split /\t/;
52 if ($data[0] eq '#CHROM') {
53 @Samples = map {my $t=(split /\//)[-1];$t=~s/_cut//g;$t=~s/-/./g; $_=join('_',(split /\./,$t)[-5,-6]);} splice @data,9;
54 # ../5.bam_0000210210_merged/d1_4_merged.D4.JHH001.XTU.sort.rmdup.bam
55 @Parents = grep(!/^GZXJ/,@Samples);
56 last;
59 print O "# Samples: [",join('],[',@Samples),"]\n# Parents: [",join('],[',@Parents),"]\n";
60 warn "Samples:\n[",join("]\n[",@Samples),"]\nParents: [",join('],[',@Parents),"]\n";
62 my ($VCF_In,%ChrPn,%ChrPs,%TypePsum);
63 #for (keys %Types) {
64 # $ChrPs{$_} = ();
66 while (<$th>) {
67 next if /^#/;
68 chomp;
69 #my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, $FORMAT, @data) = split /\t/;
70 my @data = split /\t/;
71 ++$VCF_In; # also as rs#
72 if (exists $Plink{$VCF_In}) {
73 $Vcf{$VCF_In} = \@data;
74 for (keys %{$Plink{$VCF_In}}) {
75 $t = $Plink{$VCF_In}{$_}->[$SortI];
76 $ChrPs{$_}{$data[0]} += $t;
77 $TypePsum{$_} += $t;
79 ++$ChrPn{$data[0]};
80 #$ChrPs{$data[0]} += $Plink{$VCF_In}->[$SortI];
83 close $th;
84 warn "bcf done.\n";
86 print O '# ChrID Count: ',scalar(keys %ChrPn),"\n",'# SNP Count: ',scalar(keys %Plink),"\n";
88 ddx \%TypePsum;
89 ddx \%Plink;
90 ddx \%ChrPs;
92 for my $type (sort { $TypePsum{$a} <=> $TypePsum{$b} } keys %Types) {
93 for my $chr (keys %ChrPn) {
94 $ChrPs{$type}{$chr} /= $ChrPn{$chr};
96 @PlinkS = sort { $ChrPs{$type}{$Vcf{$a}->[0]} cmp $ChrPs{$type}{$Vcf{$b}->[0]} || $Plink{$a}{$type}->[$SortI] <=> $Plink{$b}{$type}->[$SortI] } keys %Plink;
97 for (@PlinkS) {
98 print O join("\t",@{$Plink{$_}{$type}},@{$Vcf{$_}}),"\n";
99 print "{$_}{$type},@{$Plink{$_}{$type}}\n";
102 close O;