16 (snpAnc
=>['snp129OrthoPt2Pa2Rm2', '', '',
17 'chrom,chromStart,chromEnd-chromStart,humanAllele,chimpAllele,orangAllele,macaqueAllele'],
18 snp
=>['snp129', '', '', '*'],
19 refGeneBrief
=>['refGene', 'txStart', 'txEnd',
20 'name,strand,chrom,txStart,txEnd,cdsStart,cdsEnd,exonCount,name2']);
22 my %opts = (d
=>'hg18', s
=>'genomep:password@genome-mysql.cse.ucsc.edu:3306', p
=>'refGeneBrief', S
=>'');
23 getopts
('d:s:p:Che', \
%opts);
24 &help_message
if (defined $opts{h
});
25 &usage
(\
%opts, \
%preset) if (@ARGV == 0 && -t STDIN
&& !$opts{S
});
26 $opts{C
} = (defined $opts{C
})?
1 : 0;
28 # parse the server string
31 die("(EE) fail to parse the server string.\n") unless (/^(\S+):(\S*)\@(\S+)$/);
32 my ($user, $passwd, $server) = ($1, $2, $3);
35 my $dbh = DBI
->connect_cached("dbi:mysql:$opts{d}:$server", $user, $passwd);
36 &config
($dbh, \
%opts, \
%preset);
37 &retrieve
($dbh, \
%opts);
42 my ($dbh, $opts, $preset) = @_;
43 if ($opts->{p
} =~ /^\S+(:\S*){3}$/) {
44 ($opts->{t
}, $opts->{beg
}, $opts->{end
}, $opts->{c
}) = split(':', $opts->{p
});
46 die("(EE) undefined preset.\n") unless (defined $preset->{$opts->{p
}});
47 ($opts->{t
}, $opts->{beg
}, $opts->{end
}, $opts->{c
}) = @
{$preset->{$opts->{p
}}};
50 $opts->{c
} = '*' unless ($opts->{c
});
51 my $sth = $dbh->prepare("DESCRIBE $opts->{t}");
53 while ((@_ = $sth->fetchrow_array)) {
54 if ($_[0] eq 'bin' && $_[1] =~ /int/) {
56 } elsif ($_[0] =~ /Start$/) {
57 $opts->{beg
} = $_[0] unless ($opts->{beg
});
58 warn("(WW) multiple start columns: $opts->{beg} and $_[0]; use $opts->{beg}\n") if ($opts->{beg
} ne $_[0]);
59 } elsif ($_[0] =~ /End$/) {
60 $opts->{end
} = $_[0] unless ($opts->{end
});
61 warn("(WW) multiple end columns: $opts->{end} and $_[0]; use $opts->{end}\n") if ($opts->{end
} ne $_[0]);
64 die("(EE) fail to detect the column name for the start or the end of a region.\n") unless ($opts->{beg
} && $opts->{end
});
69 my ($dbh, $opts) = @_;
70 my (@sth_cache, $sth, @bin);
71 my $sql_comm = qq/SELECT $opts->{c} FROM $opts->{t} WHERE chrom=? AND /;
72 $sql_comm .= $opts->{C
}?
qq/$opts->{beg}>=? AND $opts->{end}<=?/ : qq/$opts->{end}>=? AND $opts->{beg}<=?/;
73 $sql_comm .= ' AND (bin=?' if ($opts->{b
});
76 $t[0] = "chr$t[0]" if ($opts->{d
} =~ /^hg/ && $t[0] !~ /^chr/);
77 if ($t[1] > $t[2]) { my $tt = $t[1]; $t[1] = $t[2]; $t[2] = $tt; }
80 @bin = ®ion2bin
($t[1], $t[2]);
81 my $sql = $sql_comm . (' OR bin=?' x
(@bin-1)) . ')';
82 $sth_cache[@bin] ||= $dbh->prepare($sql);
83 my ($k, $sql_out) = (0, $sql);
84 my @a = (@t[0..2], @bin);
86 $sql_out =~ s/\?/$a[$k++]/eg;
87 warn("(MM) $sql_out\n") if ($opts->{e
});
89 $sth_cache[0] ||= $dbh->prepare($sql_comm);
91 $sth = $sth_cache[@bin];
92 $sth->execute(@t[0..2], @bin);
93 while ((@_ = $sth->fetchrow_array)) {
94 print join("\t", @_), "\n";
98 $_->finish if (defined $_);
103 my ($beg, $end) = @_;
105 push(@bin, ( 1 + ($beg>>26) .. 1 + (($end-1)>>26)));
106 push(@bin, ( 9 + ($beg>>23) .. 9 + (($end-1)>>23)));
107 push(@bin, ( 73 + ($beg>>20) .. 73 + (($end-1)>>20)));
108 push(@bin, (585 + ($beg>>17) .. 585 + (($end-1)>>17)));
113 my ($opts, $preset) = @_;
114 my $tmp = join(", ", keys(%$preset));
116 Program
: batchUCSC
.pl
(Batch data downloader from UCSC
)
117 Contact
: Heng Li
<lh3\
@sanger.ac
.uk
>
118 Usage
: batchUCSC
.pl
[options
] <in.region_file
>
120 Options
: -s STR server
[$opts->{s
}]
121 -d STR database
[$opts->{d
}]
122 -p STR query
: ${tmp
} [$opts->{p
}]
123 -C contained
(by
default overlap
)
129 batchUCSC
.pl
-p snpAnc inp
.txt
130 batchUCSC
.pl
-p snp129
::: inp
.txt
131 batchUCSC
.pl
-p
'refGene:cdsStart:cdsEnd:*' inp
.txt
132 batchUCSC
.pl
-p simpleRepeat
::: inp
.txt
136 DO NOT retrieve the data from a whole chromosome
, not to speak from
137 the whole genome
. Doing
in this way is extremely slow
and will put a
138 lot of burden on the UCSC MySQL server
. If you want the whole genome
139 data
, please download from UCSC
's FTP site. You can also import UCSC
140 data to your local MySQL database, but you need to change '-s
' and
141 '-d
' according to your local configuration of MySQL.
151 a) batchUCSC.pl -p snp inp.txt
153 You can use presetting. The above command is equivalent to (for
154 explanation, see below):
156 batchUCSC.pl -p 'snp129
:::' inp.txt
158 b) batchUCSC.pl -p 'knownGene
:txStart
:txEnd
:*' inp.txt
160 For a customized query, the query consists of four fields,
161 separated by colons. The four fields are: the table name
162 (required), the column name for the start of a region, name for the
163 end of a region and the list of fields to be displayed. When the
164 second the third fields are missing, this script will infer from
165 the table schema. The default value for the last column is '*',
166 which shows all the columns. You can even do calculation between
167 cloumns at the last field, as long as that fields follows the SQL
170 The list of available UCSC tables can be found on its FTP site or
173 http://genome.ucsc.edu/cgi-bin/hgTables?command=start
175 c) batchUCSC.pl -p 'simpleRepeat
:::' inp.txt
177 The above command is equivalent to:
179 batchUCSC.pl -p 'simpleRepeat
:chromStart
:chromEnd
:*' inp.txt
181 If the script finds multiple start or end columns (for example in
182 table knownGene), it will use the first such column in the
183 table and give a warning.