modified: myjupyterlab.sh
[GalaxyCodeBases.git] / perl / bsvf / bsvir / mergebam.pl
blob98ba9d3be50856b0a6d57af36174a5e10f8e54c0
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use DBI qw(:sql_types);
5 use File::Basename;
6 use Data::Dump qw(ddx);
8 die "Usage: $0 <hum_sorted_bam> <vir_sorted_bam> <out_prefix>\n" if @ARGV < 3;
9 my ($inhum,$invir,$out)=@ARGV;
11 unlink "./${out}.sqlite";
12 my $dbh = DBI->connect("dbi:SQLite:dbname=./${out}.sqlite","","",{
13 RaiseError => 0,
14 PrintError => 1,
15 AutoCommit => 0
16 }) or die $DBI::errstr;
17 $dbh->do("PRAGMA cache_size = -2000000"); # 2G http://www.sqlite.org/pragma.html#pragma_cache_size
18 print '[!]SQLite version: ',$dbh->{sqlite_version}," (3.7.10 or above is better).\n";
20 my $sql=q/
21 CREATE TABLE MergedSam
22 ( FQID12 TEXT,
23 FQSeq TEXT,
24 FQQual TEXT,
25 HumFlag INTEGER,
26 HumChr TEXT,
27 HumPos INTEGER,
28 HumCIGAR TEXT,
29 HumMapQ INTEGER,
30 VirFlag INTEGER,
31 VirChr TEXT,
32 VirPos INTEGER,
33 VirCIGAR TEXT,
34 VirMapQ INTEGER,
35 YDYCHum TEXT,
36 YDYCVir TEXT
38 CREATE INDEX nFQID12 ON MergedSam(FQID12);
39 CREATE TABLE SamInfo (
40 Type TEXT,
41 Ref TEXT,
42 FQ1 TEXT,
43 FQ2 TEXT,
44 ReadLen1 INTEGER,
45 ReadLen2 INTEGER,
46 Sam TEXT
49 for (split /;\s*/,$sql) {
50 next if /^\s*$/;
51 $dbh->do($_) or die $dbh->errstr,"[$_]";
53 $dbh->commit;
55 my $inStr0 = "INSERT INTO MergedSam (FQID12,FQSeq,FQQual,{-}Flag,{-}Chr,{-}Pos,{-}CIGAR,{-}MapQ,YDYC{-}) VALUES (?,?,?,?,?,?,?,?,?)";
56 my ($inStr1,$inStr2) = ($inStr0,$inStr0); $inStr1 =~ s/{-}/Hum/g; $inStr2 =~ s/{-}/Vir/g;
57 my $sthins1 = $dbh->prepare($inStr1); my $sthins2 = $dbh->prepare($inStr2);
59 my $upStr0 = "UPDATE MergedSam SET {-}Flag=?,{-}Chr=?,{-}Pos=?,{-}CIGAR=?,{-}MapQ=?,YDYC{-}=? WHERE FQID12=?";
60 my ($upStr1,$upStr2) = ($upStr0,$upStr0); $upStr1 =~ s/{-}/Hum/g; $upStr2 =~ s/{-}/Vir/g;
61 my $sthupd1 = $dbh->prepare($upStr1); my $sthupd2 = $dbh->prepare($upStr2);
63 my $sthid = $dbh->prepare( "SELECT * FROM MergedSam WHERE FQID12=?" );
65 sub openfile($) {
66 my ($filename)=@_;
67 my $infile;
68 if ($filename=~/.xz$/) {
69 open( $infile,"-|","xz -dc $filename") or die "Error opening $filename: $!\n";
70 } elsif ($filename=~/.gz$/) {
71 open( $infile,"-|","gzip -dc $filename") or die "Error opening $filename: $!\n";
72 } elsif ($filename=~/.bz2$/) {
73 open( $infile,"-|","bzip2 -dc $filename") or die "Error opening $filename: $!\n";
74 } else {open( $infile,"<",$filename) or die "Error opening $filename: $!\n";}
75 return $infile;
77 sub getsamChrLen($$) {
78 my ($Type,$in) = @_;
79 my $inpath = dirname($in);
80 my ($Ref,$fq1,$fq2,$readlen1,$readlen2);
81 open( IN,"-|","samtools view -H $in") or die "Error(m1) opening $in: $!\n";
82 while (<IN>) {
83 chomp;
84 my ($t,$id,$ln)=split /\t/;
85 if ($t eq '@SQ') {
86 $id = (split /\:/,$id)[1];
87 $ln = (split /\:/,$ln)[1];
88 #$ChrLen{$id} = $ln;
89 } elsif ($t eq '@PG') {
90 # @PG ID:BSMAP VN:2.87 CL:"bsmap -u -d HomoGRCh38.fa -a F12HPCCCSZ0010_Upload/s00_C.bs_1.fq.gz -b F12HPCCCSZ0010_Upload/s00_C.bs_2.fq.gz -z 64 -p 12 -v 10 -q 2 -o s00_C.bs.bam"
91 # @PG ID:bwa-meth PN:bwa-meth VN:0.10 CL:"./bwameth.py -t 24 -p s00_C.bshum --read-group s00_C --reference HomoGRCh38.fa s00_C.bs_1.fq.gz s00_C.bs_2.fq.gz"
92 if ($id eq 'ID:BSMAP') {
93 $Ref = $1 if /\-d\s+([.\w]+)\b/;
94 $fq1 = $1 if /\-a\s+([^\s"]+)(\s|"?$)/;
95 $fq2 = $1 if /\-b\s+([^\s"]+)(\s|"?$)/;
96 } elsif ($id eq 'ID:bwa-meth') {
97 $Ref = $1 if /\--reference\s+([.\w]+)\b/;
98 if (/CL:.+\s([^-\s"]+)\s+([^-\s"]+)(\s|"?$)/) {
99 $fq1 = $1;
100 $fq2 = $2;
102 } else {die;}
106 if ($inpath ne '.') {
107 ($Ref,$fq1,$fq2) = map { "$inpath/$_" } ($Ref,$fq1,$fq2);
109 #warn "$Ref,$fq1,$fq2,$inpath";
110 my $FQ1 = openfile($fq1);
111 <$FQ1>;chomp($_=<$FQ1>);
112 $readlen1 = length $_;
113 close $FQ1;
114 my $FQ2 = openfile($fq2);
115 <$FQ2>;chomp($_=<$FQ2>);
116 $readlen2 = length $_;
117 close $FQ2;
118 my $sthinsnfo = $dbh->prepare("INSERT INTO SamInfo (Type,Ref,FQ1,FQ2,ReadLen1,ReadLen2,Sam) VALUES (?,?,?,?,?,?,?)");
119 $sthinsnfo->execute($Type,$Ref,$fq1,$fq2,$readlen1,$readlen2,$in);
120 warn "[!]Reading [$in]: ($Type,$Ref,$fq1,$fq2,$readlen1,$readlen2)\n";
123 sub Sam2FQ($) {
124 my $rd = $_[0];
125 my ($rd12);
126 if ($$rd[1] & 0x40) {
127 $rd12 = 1;
128 } elsif ($$rd[1] & 0x80) {
129 $rd12 = 2;
131 die "CIGAR:$$rd[5]" if $$rd[5] =~ /H/;
132 my $seq = $$rd[9];
133 my $qual = $$rd[10];
134 if ($$rd[1] & 0x10) {
135 $seq =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
136 $seq = reverse $seq;
137 $qual = reverse $qual;
139 my $id = join('',$$rd[0],'/',$rd12);
140 return [$rd12,$id,$seq,$qual];
142 sub InsertSam($$) {
143 my ($Type,$in)=@_;
144 getsamChrLen($Type,$in);
145 open( IN,"-|","samtools view -F768 $in") or die "Error opening $in: $!\n";
146 while (my $line = <IN>) {
147 #my ($id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize, $seq, $qual, @OPT) = split /\t/,$line;
148 #print "$id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize\n";
149 my @Dat1 = split /\t/,$line;
150 my ($id, $flag, $ref, $pos, $mapq, $CIGAR, $mref, $mpos, $isize, $seq, $qual, @OPT) = @Dat1;
151 my $ret1 = Sam2FQ(\@Dat1);
152 (undef,$id,$seq,$qual) = @$ret1;
153 my $YDYC;
155 my ($YD,$YC)=('','');
156 for (@OPT) {
157 if (/YD:Z:(\w+)/) {
158 $YD = $1;
160 $YC=$1 if /YC:Z:(\w+)/
162 $YDYC = "$YD,$YC";
164 $sthid->execute($id);
165 my $qres = $sthid->fetchall_arrayref;
166 if ($#$qres == -1) {
167 $sthins1->execute($id,$seq,$qual,$flag,$ref,$pos,$CIGAR,$mapq,$YDYC) if $Type eq 'Hum';
168 $sthins2->execute($id,$seq,$qual,$flag,$ref,$pos,$CIGAR,$mapq,$YDYC) if $Type eq 'Vir';
169 } elsif ($#$qres == 0) {
170 $sthupd1->execute($flag,$ref,$pos,$CIGAR,$mapq,$YDYC,$id) if $Type eq 'Hum';
171 $sthupd2->execute($flag,$ref,$pos,$CIGAR,$mapq,$YDYC,$id) if $Type eq 'Vir';
172 } else {die;}
176 InsertSam('Hum',$inhum);
177 $dbh->commit;
178 InsertSam('Vir',$invir);
179 $dbh->commit;
183 $sql=q/
184 CREATE INDEX nSort ON MergedSam(HumChr,HumPos,VirChr,VirPos);
186 for (split /;\s*/,$sql) {
187 next if /^\s*$/;
188 $dbh->do($_) or die $dbh->errstr,"[$_]";
190 $dbh->commit;
191 $dbh->disconnect;
193 __END__
194 ../mergebam.pl n3_grep.vircandi.sam.gz n3_grep.vircandi.bshbv.bam n3_merged
195 sqlite3 n3_merged.sqlite .dump >n3_merged.sqlite.dump
198 sqlite> EXPLAIN QUERY PLAN SELECT * FROM MergedSam WHERE HumChr IS NOT NULL AND VirChr IS NOT NULL AND HumCIGAR <> '*' AND VirCIGAR <> '*' ORDER BY HumChr,HumPos,VirChr,VirPos ASC;
199 0|0|0|SCAN TABLE MergedSam USING INDEX nSort (~62500 rows)