new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / dmr / cdmr.pl
blob645af6983c8197ade273ea57bea4d752977e0c9e
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 1.0.0 @ 20180614
5 =cut
6 use strict;
7 use warnings;
8 use Cwd qw(abs_path cwd);
9 use Parse::CSV;
10 use List::Util qw[min max];
11 use Statistics::TTest;
12 #use lib '.';
13 use Data::Dump qw(ddx);
15 my ($bedfn,$cgLfn,$cgRfn,$outfn) = @ARGV;
16 if (@ARGV < 4) {
17 die "Usage: $0 <bed> <Left.cg> <Right.cg> <out.tsv>\n";;
20 my $cgLin = Parse::CSV->new(file => $cgLfn, names => 1, sep_char => "\t");
21 my $cgRin = Parse::CSV->new(file => $cgRfn, names => 1, sep_char => "\t");
23 open B,'<',$bedfn or die $?;
24 my (%Dbed,%Rbed);
25 while (<B>) {
26 my @L = split /\t/;
27 next unless $L[2];
28 push @{$Dbed{$L[0]}},[$L[1],$L[2]];
29 my @rng = $L[1] .. $L[2];
30 for (@rng) {
31 ++$Rbed{$L[0]}{$_};
34 #ddx %Dbed;
35 close B;
37 sub loadCG($$) {
38 my ($rngD,$csvh) = @_;
39 my %Dcg;
40 while ( my $value = $csvh->fetch ) {
41 #ddx $value;
42 if (exists $rngD->{ $value->{'#CHROM'} }{ $value->{'POS'} }) {
43 my ($cgCnt,$cgC,$cgW,$flag)=(0,0,0,0);
44 if ($value->{'Crick-COVERAGE'} ne '.') {
45 if ($value->{'Crick-COVERAGE'} >= 5) {
46 $cgC = $value->{'Crick-METH'}/$value->{'Crick-COVERAGE'};
47 $cgCnt += 1;
48 $flag |= 1;
51 if ($value->{'Watson-COVERAGE'} ne '.') {
52 if ($value->{'Watson-COVERAGE'} >= 5) {
53 $cgW = $value->{'Watson-METH'}/$value->{'Watson-COVERAGE'};
54 $cgCnt += 1;
55 $flag |= 2;
58 if ($cgCnt) {
59 $Dcg{$value->{'#CHROM'}}{$value->{'POS'}} = [$cgCnt/2, ($cgC+$cgW)/$cgCnt,$flag];
61 } else {
62 next;
65 return \%Dcg;
68 my %DcgL = %{ loadCG(\%Rbed,$cgLin) };
69 #ddx \%DcgL;
70 my %DcgR = %{ loadCG(\%Rbed,$cgRin) };
71 #ddx \%DcgR;
73 sub getCG($$$) {
74 my ($dat,$st,$ed)=@_;
75 my %ret;
76 for my $p ($st .. $ed) {
77 if (exists $dat->{$p}) {
78 $ret{$p} = $dat->{$p};
81 return %ret;
84 open O,'>',$outfn or die $?;
85 print O "# L:[$cgLfn], R:[$cgRfn].\n";
86 open B,'<',$bedfn or die $?;
87 while (<B>) {
88 my @L = split /\t/;
89 next unless $L[2];
90 print O join("\t",@L[0..2]),"\t";
91 my %cgLr = getCG($DcgL{$L[0]},$L[1],$L[2]);
92 my %cgRr = getCG($DcgR{$L[0]},$L[1],$L[2]);
93 #ddx \%cgLr;
94 my (%rPoses,%cgR);
95 for (keys %cgLr,keys %cgRr) {
96 ++$rPoses{$_};
98 for (keys %rPoses) {
99 my $dL = [0,0,0];
100 my $dR = [0,0,0];
101 $dL = $cgLr{$_} if exists $cgLr{$_};
102 $dR = $cgRr{$_} if exists $cgRr{$_};
103 $cgR{$_} = [@$dL,@$dR];
105 #ddx \%cgR;
106 (%cgLr,%cgRr)=();
107 my ($shared,$left,$right,$unioned)=(0,0,0,0);
108 my ($Lsum,$Lcnt,$Rsum,$Rcnt)=(0,0,0,0);
109 my (@tL,@tR);
110 for (keys %cgR) {
111 my @d = @{ $cgR{$_} };
112 $shared += min($d[0],$d[3]);
113 $left += $d[0];
114 $right += $d[3];
115 my $flag = $d[2] | $d[5];
116 if ($flag==3) {
117 $unioned += 1;
118 push @tL,$d[1];
119 push @tR,$d[4];
120 } elsif ($flag==1 or $flag==2) {
121 $unioned += 0.5;
123 #ddx $flag;
124 $Lsum += $d[1]*$d[0]; $Lcnt += $d[0];
125 $Rsum += $d[4]*$d[3]; $Rcnt += $d[3];
127 my $p='NA';
128 if (@tL>1 and @tR>1) {
129 my $ttest = new Statistics::TTest;
130 $ttest->load_data(\@tL,\@tR);
131 $p = $ttest->{t_prob};
133 my ($avgL,$avgR)=qw(NA NA);
134 $avgL = $Lsum/$Lcnt if $Lcnt;
135 $avgR = $Rsum/$Rcnt if $Rcnt;
136 print O join(',',$shared,$left,$right,$unioned),"\t",join("\t",$avgL,$avgR,$p),"\n";
138 close B;
139 close O;
141 __END__
142 ./cdmr.pl gencode.v30.annotation.bed.h B7B.cg.h Normal.cg.h o.tsv