5 # use lib './blib/lib';
11 use Bio
::DB
::GFF
::Util
::Binning
'bin';
13 use constant MYSQL
=> 'mysql';
14 use constant FDATA
=> 'fdata';
15 use constant FTYPE
=> 'ftype';
16 use constant FGROUP
=> 'fgroup';
17 use constant FDNA
=> 'fdna';
18 use constant FATTRIBUTE
=> 'fattribute';
19 use constant FATTRIBUTE_TO_FEATURE
=> 'fattribute_to_feature';
23 bp_bulk_load_gff.pl - Bulk-load a Bio::DB::GFF database from GFF files.
27 % bp_bulk_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
31 This script loads a Bio::DB::GFF database with the features contained
32 in a list of GFF files and/or FASTA sequence files. You must use the
33 exact variant of GFF described in L<Bio::DB::GFF>. Various
34 command-line options allow you to control which database to load and
35 whether to allow an existing database to be overwritten.
37 This script differs from bp_load_gff.pl in that it is hard-coded to use
38 MySQL and cannot perform incremental loads. See L<bp_load_gff.pl> for an
39 incremental loader that works with all databases supported by
40 Bio::DB::GFF, and L<bp_fast_load_gff.pl> for a MySQL loader that supports
41 fast incremental loads.
45 If the filename is given as "-" then the input is taken from standard
46 input. Compressed files (.gz, .Z, .bz2) are automatically
49 FASTA format files are distinguished from GFF files by their filename
50 extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
51 uppercase variants are treated as FASTA files. Everything else is
52 treated as a GFF file. If you wish to load -fasta files from STDIN,
53 then use the -f command-line swith with an argument of '-', as in
55 gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
57 The nature of the bulk load requires that the database be on the local
58 machine and that the indicated user have the "file" privilege to load
59 the tables and have enough room in /usr/tmp (or whatever is specified
60 by the \$TMPDIR environment variable), to hold the tables transiently.
62 Local data may now be uploaded to a remote server via the --local option
63 with the database host specified in the dsn, e.g. dbi:mysql:test:db_host
65 The adaptor used is dbi::mysqlopt. There is currently no way to
68 About maxfeature: the default value is 100,000,000 bases. If you have
69 features that are close to or greater that 100Mb in length, then the
70 value of maxfeature should be increased to 1,000,000,000. This value
71 must be a power of 10.
73 Note that Windows users must use the --create option.
75 If the list of GFF or fasta files exceeds the kernel limit for the
76 maximum number of command-line arguments, use the
77 --long_list /path/to/files option.
80 =head1 COMMAND-LINE OPTIONS
82 Command-line options can be abbreviated to single-letter options.
83 e.g. -d instead of --database.
85 --database <dsn> Database name (default dbi:mysql:test)
86 --adaptor Adaptor name (default mysql)
87 --create Reinitialize/create data tables without asking
88 --user Username to log in as
89 --fasta File or directory containing fasta files to load
90 --long_list Directory containing a very large number of
91 GFF and/or FASTA files
92 --password Password to use for authentication
93 (Does not work with Postgres, password must be
94 supplied interactively or be left empty for
96 --maxbin Set the value of the maximum bin size
97 --local Flag to indicate that the data source is local
98 --maxfeature Set the value of the maximum feature size (power of 10)
99 --group A list of one or more tag names (comma or space separated)
100 to be used for grouping in the 9th column.
101 --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
102 --summary Generate summary statistics for drawing coverage histograms.
103 This can be run on a previously loaded database or during
105 --Temporary Location of a writable scratch directory
109 L<Bio::DB::GFF>, L<fast_load_gff.pl>, L<load_gff.pl>
113 Lincoln Stein, lstein@cshl.org
115 Copyright (c) 2002 Cold Spring Harbor Laboratory
117 This library is free software; you can redistribute it and/or modify
118 it under the same terms as Perl itself. See DISCLAIMER.txt for
119 disclaimers of warranty.
123 package Bio
::DB
::GFF
::Adaptor
::fauxmysql
;
125 use Bio
::DB
::GFF
::Adaptor
::dbi
::mysqlopt
;
127 @ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
129 sub insert_sequence
{
131 my ($id,$offset,$seq) = @_;
132 print join("\t",$id,$offset,$seq),"\n";
135 package Bio
::DB
::GFF
::Adaptor
::fauxmysqlcmap
;
137 use Bio
::DB
::GFF
::Adaptor
::dbi
::mysqlcmap
;
139 @ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlcmap';
141 sub insert_sequence
{
143 my ($id,$offset,$seq) = @_;
144 print join("\t",$id,$offset,$seq),"\n";
147 package Bio
::DB
::GFF
::Adaptor
::fauxpg
;
149 use Bio
::DB
::GFF
::Adaptor
::dbi
::pg
;
151 @ISA = 'Bio::DB::GFF::Adaptor::dbi::pg';
153 #these two subs are to separate the table creation from the
158 $self->drop_all if $erase;
160 my $dbh = $self->features_db;
161 my $schema = $self->schema;
162 foreach my $table_name ($self->tables) {
163 my $create_table_stmt = $schema->{$table_name}{table
} ;
164 $dbh->do($create_table_stmt) || warn $dbh->errstr;
165 # $self->create_other_schema_objects(\%{$schema->{$table_name}});
170 sub _create_indexes_etc
{
173 my $dbh = $self->features_db;
174 my $schema = $self->schema;
175 foreach my $table_name ($self->tables) {
176 $self->create_other_schema_objects(\
%{$schema->{$table_name}});
180 sub insert_sequence
{
182 my ($id,$offset,$seq) = @_;
183 print "$id\t$offset\t$seq\n";
188 eval "use Time::HiRes"; undef $@
;
189 my $timer = defined &Time
::HiRes
::time;
191 my $bWINDOWS = 0; # Boolean: is this a MSWindows operating system?
192 if ($^O
=~ /MSWin32/i) {
196 my ($DSN,$ADAPTOR,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR);
198 GetOptions
('database:s' => \
$DSN,
199 'adaptor:s' => \
$ADAPTOR,
202 'password:s' => \
$PASSWORD,
203 'fasta:s' => \
$FASTA,
205 'maxbin|maxfeature:s' => \
$MAX_BIN,
206 'group:s' => \
$GROUP_TAG,
207 'long_list:s' => \
$LONG_LIST,
208 'gff3_munge' => \
$MUNGE,
209 'Temporary:s' => \
$TMPDIR,
210 ) or (system('pod2text', $0), exit -1);
212 # If called as pg_bulk_load_gff.pl behave as that did.
213 if ($0 =~/pg_bulk_load_gff.pl/){
217 $DSN ||= 'dbi:mysql:test';
218 $MAX_BIN ||= 1_000_000_000
; # to accomodate human-sized chromosomes
221 if ($bWINDOWS && not $FORCE) {
222 die "Note that Windows users must use the --create option.\n";
226 die "This will delete all existing data in database $DSN. If you want to do this, rerun with the --create option.\n"
228 open (TTY
,"/dev/tty") or die "/dev/tty: $!\n"; #TTY use removed for win compatability
229 print STDERR
"This operation will delete all existing data in database $DSN. Continue? ";
231 die "Aborted\n" unless $f =~ /^[yY]/;
235 # postgres DBD::Pg allows 'database', but also 'dbname', and 'db':
236 # and it must be Pg (not pg)
237 $DSN=~s/pg:database=/Pg:/i;
238 $DSN=~s/pg:dbname=/Pg:/i;
239 $DSN=~s/pg:db=/Pg:/i;
241 # leave these lines for mysql
242 $DSN=~s/database=//i;
243 $DSN=~s/;host=/:/i; #cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host"
246 my($DBI,$DBD,$DBNAME,$HOST)=split /:/,$DSN;
247 $DBNAME=$DSN unless $DSN=~/:/;
249 $ADAPTOR ||= 'mysql';
252 # rebuild DSN, DBD::Pg requires full dbname=<name> format
253 $DSN = "dbi:Pg:dbname=$DBNAME";
254 if ($HOST) { $DSN .= ";host=$HOST"; }
257 my ($use_mysql,$use_mysqlcmap,$use_pg) = (0,0,0);
258 if ( $ADAPTOR eq 'mysqlcmap' ) {
261 elsif ( $ADAPTOR =~ /^mysql/ ) {
264 elsif ( $ADAPTOR eq "Pg" ) {
268 die "$ADAPTOR is not an acceptable database adaptor.";
274 push @auth,(-user
=>$USER);
275 if ( $use_mysql or $use_mysqlcmap ) {
279 $AUTH .= " -U $USER ";
282 if (defined $PASSWORD) {
283 push @auth,(-pass
=>$PASSWORD);
284 if ( $use_mysql or $use_mysqlcmap ) {
285 $AUTH .= " -p$PASSWORD";
287 # elsif ( $use_pg ) {
288 # $AUTH .= " -W $PASSWORD ";
295 if (defined $DBNAME) {
296 if ( $use_mysql or $use_mysqlcmap ) {
297 $AUTH .= " -D$DBNAME ";
300 if (defined $LOCAL) {
302 $AUTH.=' --local-infile=1';
308 if ( $use_mysqlcmap ) {
309 $faux_adaptor = "fauxmysqlcmap";
311 elsif ( $use_mysql ) {
312 $faux_adaptor = "fauxmysql";
315 $faux_adaptor = "fauxpg";
318 my $db = Bio
::DB
::GFF
->new(-adaptor
=>$faux_adaptor,-dsn
=> $DSN,@auth)
319 or die "Can't open database: ",Bio
::DB
::GFF
->error,"\n";
321 $db->gff3_name_munging(1) if $MUNGE;
323 $MAX_BIN ?
$db->initialize(-erase
=>1,-MAX_BIN
=>$MAX_BIN) : $db->initialize(1);
324 $MAX_BIN ||= $db->meta('max_bin') || 100_000_000
;
326 # deal with really long lists of files
328 -d
$LONG_LIST or die "The --long_list argument must be a directory\n";
329 opendir GFFDIR
,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
330 @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR
;
333 if (defined $FASTA && -d
$FASTA) {
334 opendir FASTA
,$FASTA or die "Could not open $FASTA for reading: $!";
335 push @ARGV, map { "$FASTA\/$_" } readdir FASTA
;
338 elsif (defined $FASTA && -f
$FASTA) {
344 $_ = "gunzip -c $_ |" if /\.gz$/;
345 $_ = "uncompress -c $_ |" if /\.Z$/;
346 $_ = "bunzip2 -c $_ |" if /\.bz2$/;
351 if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
358 push @fasta,$FASTA if defined $FASTA;
360 # drop everything that was there before
362 my $tmpdir = File
::Spec
->tmpdir() || '/tmp';
363 $tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation
364 -d
$tmpdir or die <<END;
365 I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
366 Please select a directory and indicate its location by setting the TMP environment variable, or
367 by using the --Temporary switch.
369 my @fasta_files_to_be_unlinked;
370 my @files = (FDATA
,FTYPE
,FGROUP
,FDNA
,FATTRIBUTE
,FATTRIBUTE_TO_FEATURE
);
372 $FH{$_} = IO
::File
->new(">$tmpdir/$_.$$") or die $_,": $!";
377 $FH{FDATA
() }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
378 $FH{FTYPE
() }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
379 $FH{FGROUP
() }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
380 $FH{FATTRIBUTE
() }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
381 $FH{FATTRIBUTE_TO_FEATURE
()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
389 my %ATTRIBUTEID = ();
393 my %tmpfiles; # keep track of temporary fasta files
395 my $fasta_sequence_id;
397 my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files
399 $db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;
401 my $last = Time
::HiRes
::time() if $timer;
404 # avoid hanging on standalone --fasta load
406 $FH{NULL
} = IO
::File
->new(">$tmpdir/null");
407 push @ARGV, "$tmpdir/null";
414 FetchHashKeyName
=> 'NAME_lc',
420 $cmap_db = DBI
->connect( $DSN, $USER, $PASSWORD, $options );
422 # Only load CMap::Utils if using cmap
423 unless (!$use_mysqlcmap or
425 require Bio
::GMOD
::CMap
::Utils
;
426 Bio
::GMOD
::CMap
::Utils
->import('next_number');
430 print STDERR
"Error loading Bio::GMOD::CMap::Utils\n";
436 $current_file ||= $ARGV;
438 # reset GFF3 flag if new filehandle
439 unless($current_file eq $ARGV){
441 $current_file = $ARGV;
445 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
447 # close sequence filehandle if required
448 if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA
}) {
453 # print to fasta file if the handle is open
454 if ( defined $FH{FASTA
} ) {
455 $FH{FASTA
}->print("$_\n");
459 elsif (/^>(\S+)/) { # uh oh, sequence coming
460 $FH{FASTA
} = IO
::File
->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
461 $FH{FASTA
}->print("$_\n");
462 print STDERR
"Preparing embedded sequence $1\n";
463 push @fasta, "$tmpdir/$1\.fa";
464 push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
465 $tmpfiles{"$tmpdir/$1\.fa"}++;
469 elsif (/^\#\#\s*gff-version\s+(\d+)/) {
471 $db->print_gff3_warning() if $gff3;
475 elsif (/^\#\#\s*group-tags\s+(.+)/) {
476 $db->preferred_groups(split(/\s+/,$1));
480 elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
481 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
482 ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ?
"ID=Sequence:$1": qq(Sequence
"$1"));
490 ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
492 if ( not defined( $ref ) or length ($ref) == 0) {
493 warn "\$ref is null. source = $source, method = $method, group = $group\n";
497 my $size = $stop-$start+1;
498 warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;
500 $source = '\N' unless defined $source;
501 $score = '\N' if $score eq '.';
502 $strand = '\N' if $strand eq '.';
503 $phase = '\N' if $phase eq '.';
505 my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
508 $group_class = [$group_class] unless ref $group_class;
509 $group_name = [$group_name] unless ref $group_name;
511 for (my $i=0; $i < @
$group_name; $i++) {
512 $group_class->[$i] ||= '\N';
513 $group_name->[$i] ||= '\N';
514 $target_start ||= '\N';
515 $target_stop ||= '\N';
520 my $gid = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])} ||= $GID++;
521 my $ftypeid = $FTYPEID{lc join('',$source,$method)} ||= $FTYPEID++;
523 my $bin = bin
($start,$stop,$db->min_bin);
524 $FH{ FDATA
() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
526 my $feature_id = next_number
(
528 table_name
=> 'cmap_feature',
529 id_field
=> 'feature_id',
531 or die 'No feature id';
532 my $direction = $strand eq '-' ?
-1:1;
533 $FH{ FGROUP
() }->print(
534 join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],)
536 ) unless $DONE{"G$gid"}++;
539 $FH{ FGROUP
() }->print( join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++;
541 $FH{ FTYPE
() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++;
543 foreach (@
$attributes) {
544 my ($key,$value) = @
$_;
545 my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++;
546 $FH{ FATTRIBUTE
() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++;
547 $FH{ FATTRIBUTE_TO_FEATURE
() }->print( join("\t",$fid,$attributeid,$value),"\n");
550 if ( $fid % 1000 == 0) {
551 my $now = Time
::HiRes
::time() if $timer;
552 my $elapsed = $timer ?
sprintf(" in %5.2fs",$now - $last) : '';
554 print STDERR
"$fid features parsed$elapsed...";
555 print STDERR
-t STDOUT
&& !$ENV{EMACS
} ?
"\r" : "\n";
560 $FH{FASTA
}->close if exists $FH{FASTA
};
562 for my $file (@fasta) {
563 warn "Preparing DNA file $file....\n";
565 $FH{FDNA
() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
567 my $old = select($FH{FDNA
()});
568 $db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
570 $FH{FDNA
() }->print("\\.\n\n");
574 unlink $file if $tmpfiles{$file};
578 $FH{FDATA
() }->print("\\.\n\n");
579 $FH{FTYPE
() }->print("\\.\n\n");
580 $FH{FGROUP
() }->print("\\.\n\n");
581 $FH{FATTRIBUTE
() }->print("\\.\n\n");
582 $FH{FATTRIBUTE_TO_FEATURE
()}->print("\\.\n\n");
586 $_->close foreach values %FH;
587 printf STDERR
"Total parse time %5.2fs\n",(Time
::HiRes
::time() - $start) if $timer;
588 warn "Loading feature data and analyzing tables. You may see RDBMS messages here...\n";
591 warn "Loading feature data. You may see Postgres comments...\n";
594 my $file = "$tmpdir/$_.$$";
596 $AUTH ?
system("psql $AUTH -f $file $DBNAME")
597 : system('psql','-f', $file, $DBNAME);
602 warn "Updating sequences ...\n";
603 $db->update_sequences();
605 warn "Creating indexes ...\n";
606 $db->_create_indexes_etc();
612 elsif( $use_mysql or $use_mysqlcmap ) {
616 my $TERMINATEDBY = $bWINDOWS ? q
( LINES TERMINATED BY
'\r\n') : '';
618 my $table = function_to_table
($f,$ADAPTOR);
619 my $sql = join ('; ',
620 "lock tables $table write",
621 "delete from $table",
622 "load data $LOCAL infile '$tmpdir/$f.$$' replace into table $table $TERMINATEDBY",
624 my $command = MYSQL
. qq[$AUTH -s
-e
"$sql"];
625 $command =~ s/\n/ /g;
626 $success &&= system($command) == 0;
627 unlink "$tmpdir/$f.$$";
629 printf STDERR
"Total load time %5.2fs\n",(time() - $start) if $timer;
630 print STDERR
"done...\n";
632 print STDERR
"Analyzing/optimizing tables. You will see database messages...\n";
636 my $table = function_to_table
($f,$ADAPTOR);
637 $sql .= "analyze table $table;";
639 my $command = MYSQL
. qq[$AUTH -N
-s
-e
"$sql"];
640 $success &&= system($command) == 0;
641 printf STDERR
"Optimization time time %5.2fs\n",(time() - $start);
644 print "$FEATURES features successfully loaded\n";
646 print "FAILURE: Please see standard error for details\n";
651 foreach (@fasta_files_to_be_unlinked) {
652 unlink "$tmpdir/$_.$$";
655 warn "Building summary statistics for coverage histograms...\n";
658 push @args,(-user
=>$USER);
661 if (defined $PASSWORD) {
662 push @args,(-pass
=>$PASSWORD);
663 $AUTH .= " -p$PASSWORD";
665 push @args,(-preferred_groups
=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
666 my $db = Bio
::DB
::GFF
->new(-adaptor
=>"dbi::$ADAPTOR",-dsn
=> $DSN,@args)
667 or die "Can't open database: ",Bio
::DB
::GFF
->error,"\n";
668 $db->build_summary_statistics;
672 sub function_to_table
{
673 my $function = shift;
676 if ($function eq 'fdata'){
679 elsif ($function eq 'ftype'){
682 elsif ($function eq 'fgroup'){
683 return 'cmap_feature' if ($adaptor eq 'mysqlcmap');
686 elsif ($function eq 'fdna'){
689 elsif ($function eq 'fattribute'){
692 elsif ($function eq 'fattribute_to_feature'){
693 return 'fattribute_to_feature';