modified: makefile
[GalaxyCodeBases.git] / tools / lh3misc / biodb / batchUCSC.pl
blob5671e1c93511c93985aabc63be3ed2f9a473c9e2
1 #!/usr/bin/perl -w
3 # Author: lh3
4 # Version: 0.2.0
6 use strict;
7 use warnings;
8 use DBI;
9 use Getopt::Std;
11 &main;
12 exit;
14 sub main {
15 my %preset =
16 (snpAnc=>['snp129OrthoPt2Pa2Rm2', '', '',
17 'chrom,chromStart,chromEnd-chromStart,humanAllele,chimpAllele,orangAllele,macaqueAllele'],
18 snp=>['snp129', '', '', '*'],
19 refGeneBrief=>['refGene', 'txStart', 'txEnd',
20 'name,strand,chrom,txStart,txEnd,cdsStart,cdsEnd,exonCount,name2']);
22 my %opts = (d=>'hg18', s=>'genomep:password@genome-mysql.cse.ucsc.edu:3306', p=>'refGeneBrief', S=>'');
23 getopts('d:s:p:Che', \%opts);
24 &help_message if (defined $opts{h});
25 &usage(\%opts, \%preset) if (@ARGV == 0 && -t STDIN && !$opts{S});
26 $opts{C} = (defined $opts{C})? 1 : 0;
28 # parse the server string
29 $_ = $opts{s};
30 s/\s//g;
31 die("(EE) fail to parse the server string.\n") unless (/^(\S+):(\S*)\@(\S+)$/);
32 my ($user, $passwd, $server) = ($1, $2, $3);
34 # retrieving data
35 my $dbh = DBI->connect_cached("dbi:mysql:$opts{d}:$server", $user, $passwd);
36 &config($dbh, \%opts, \%preset);
37 &retrieve($dbh, \%opts);
38 $dbh->disconnect;
41 sub config {
42 my ($dbh, $opts, $preset) = @_;
43 if ($opts->{p} =~ /^\S+(:\S*){3}$/) {
44 ($opts->{t}, $opts->{beg}, $opts->{end}, $opts->{c}) = split(':', $opts->{p});
45 } else {
46 die("(EE) undefined preset.\n") unless (defined $preset->{$opts->{p}});
47 ($opts->{t}, $opts->{beg}, $opts->{end}, $opts->{c}) = @{$preset->{$opts->{p}}};
49 $opts->{b} = 0;
50 $opts->{c} = '*' unless ($opts->{c});
51 my $sth = $dbh->prepare("DESCRIBE $opts->{t}");
52 $sth->execute;
53 while ((@_ = $sth->fetchrow_array)) {
54 if ($_[0] eq 'bin' && $_[1] =~ /int/) {
55 $opts->{b} = 1;
56 } elsif ($_[0] =~ /Start$/) {
57 $opts->{beg} = $_[0] unless ($opts->{beg});
58 warn("(WW) multiple start columns: $opts->{beg} and $_[0]; use $opts->{beg}\n") if ($opts->{beg} ne $_[0]);
59 } elsif ($_[0] =~ /End$/) {
60 $opts->{end} = $_[0] unless ($opts->{end});
61 warn("(WW) multiple end columns: $opts->{end} and $_[0]; use $opts->{end}\n") if ($opts->{end} ne $_[0]);
64 die("(EE) fail to detect the column name for the start or the end of a region.\n") unless ($opts->{beg} && $opts->{end});
65 $sth->finish;
68 sub retrieve {
69 my ($dbh, $opts) = @_;
70 my (@sth_cache, $sth, @bin);
71 my $sql_comm = qq/SELECT $opts->{c} FROM $opts->{t} WHERE chrom=? AND /;
72 $sql_comm .= $opts->{C}? qq/$opts->{beg}>=? AND $opts->{end}<=?/ : qq/$opts->{end}>=? AND $opts->{beg}<=?/;
73 $sql_comm .= ' AND (bin=?' if ($opts->{b});
74 while (<>) {
75 my @t = split;
76 $t[0] = "chr$t[0]" if ($opts->{d} =~ /^hg/ && $t[0] !~ /^chr/);
77 if ($t[1] > $t[2]) { my $tt = $t[1]; $t[1] = $t[2]; $t[2] = $tt; }
78 @bin = ();
79 if ($opts->{b}) {
80 @bin = &region2bin($t[1], $t[2]);
81 my $sql = $sql_comm . (' OR bin=?' x (@bin-1)) . ')';
82 $sth_cache[@bin] ||= $dbh->prepare($sql);
83 my ($k, $sql_out) = (0, $sql);
84 my @a = (@t[0..2], @bin);
85 $a[0] = qq/"$a[0]"/;
86 $sql_out =~ s/\?/$a[$k++]/eg;
87 warn("(MM) $sql_out\n") if ($opts->{e});
88 } else {
89 $sth_cache[0] ||= $dbh->prepare($sql_comm);
91 $sth = $sth_cache[@bin];
92 $sth->execute(@t[0..2], @bin);
93 while ((@_ = $sth->fetchrow_array)) {
94 print join("\t", @_), "\n";
97 for (@sth_cache) {
98 $_->finish if (defined $_);
102 sub region2bin {
103 my ($beg, $end) = @_;
104 my @bin = (1);
105 push(@bin, ( 1 + ($beg>>26) .. 1 + (($end-1)>>26)));
106 push(@bin, ( 9 + ($beg>>23) .. 9 + (($end-1)>>23)));
107 push(@bin, ( 73 + ($beg>>20) .. 73 + (($end-1)>>20)));
108 push(@bin, (585 + ($beg>>17) .. 585 + (($end-1)>>17)));
109 return @bin;
112 sub usage {
113 my ($opts, $preset) = @_;
114 my $tmp = join(", ", keys(%$preset));
115 print qq{
116 Program: batchUCSC.pl (Batch data downloader from UCSC)
117 Contact: Heng Li <lh3\@sanger.ac.uk>
118 Usage: batchUCSC.pl [options] <in.region_file>
120 Options: -s STR server [$opts->{s}]
121 -d STR database [$opts->{d}]
122 -p STR query: ${tmp} [$opts->{p}]
123 -C contained (by default overlap)
124 -e print the SQL
125 -h longer help
127 Examples:
129 batchUCSC.pl -p snpAnc inp.txt
130 batchUCSC.pl -p snp129::: inp.txt
131 batchUCSC.pl -p 'refGene:cdsStart:cdsEnd:*' inp.txt
132 batchUCSC.pl -p simpleRepeat::: inp.txt
134 IMPORTANT NOTES:
136 DO NOT retrieve the data from a whole chromosome, not to speak from
137 the whole genome. Doing in this way is extremely slow and will put a
138 lot of burden on the UCSC MySQL server. If you want the whole genome
139 data, please download from UCSC's FTP site. You can also import UCSC
140 data to your local MySQL database, but you need to change '-s' and
141 '-d' according to your local configuration of MySQL.
144 exit;
147 sub help_message {
148 print qq{
149 Composing queries:
151 a) batchUCSC.pl -p snp inp.txt
153 You can use presetting. The above command is equivalent to (for
154 explanation, see below):
156 batchUCSC.pl -p 'snp129:::' inp.txt
158 b) batchUCSC.pl -p 'knownGene:txStart:txEnd:*' inp.txt
160 For a customized query, the query consists of four fields,
161 separated by colons. The four fields are: the table name
162 (required), the column name for the start of a region, name for the
163 end of a region and the list of fields to be displayed. When the
164 second the third fields are missing, this script will infer from
165 the table schema. The default value for the last column is '*',
166 which shows all the columns. You can even do calculation between
167 cloumns at the last field, as long as that fields follows the SQL
168 syntax.
170 The list of available UCSC tables can be found on its FTP site or
171 here:
173 http://genome.ucsc.edu/cgi-bin/hgTables?command=start
175 c) batchUCSC.pl -p 'simpleRepeat:::' inp.txt
177 The above command is equivalent to:
179 batchUCSC.pl -p 'simpleRepeat:chromStart:chromEnd:*' inp.txt
181 If the script finds multiple start or end columns (for example in
182 table knownGene), it will use the first such column in the
183 table and give a warning.
186 exit;