Bio::Phenotype::* move what's left of the namespace to its own distribution.
[bioperl-live.git] / bin / bp_seqfeature_load
blobb7898424ec2254e1dffb2040416d1704ab09cc2f
1 #!/usr/bin/perl
3 use strict;
4 use warnings;
6 ## Used to output the 'usage' message
7 use Pod::Usage;
9 ## Used to parse command line options
10 use Getopt::Long;
12 ## Used to create temporary files, if necessary
13 use File::Spec;
15 ## BioPerl!
16 use Bio::DB::SeqFeature::Store;
17 use Bio::DB::SeqFeature::Store::GFF3Loader;
## Default values for every command-line option.
##
## NOTE: these defaults are also quoted verbatim in the OPTIONS section
## of the POD further down. If you change one here (you shouldn't), update
## the POD so that --help / --man keep telling the truth.

## Database connection.
my $DSN       = 'dbi:mysql:test';        # DBI data source
my $SFCLASS   = 'Bio::DB::SeqFeature';   # class used for created features
my $ADAPTOR   = 'DBI::mysql';            # storage adaptor (class)
my $NAMESPACE;                           # table prefix; undef means none
my $USER      = '';
my $PASS      = '';

## Loader behaviour.
my $VERBOSE          = 1;                     # negatable: --noverbose
my $FAST             = 0;
my $TMP              = File::Spec->tmpdir();  # scratch directory for --fast
my $IGNORE_SEQREGION = 0;
my $CREATE           = 0;                     # erases existing contents!
my $COMPRESS         = 0;
my $INDEX_SUB        = 1;                     # negatable: --nosubfeatures
my $NOALIAS_TARGET   = 0;

## Summary statistics: --summary switches them on, --nosummary overrides
## that; the two flags are reconciled when the loader is constructed.
my $SUMMARY_STATS   = 0;
my $NOSUMMARY_STATS = 0;

## Full-text search index (only meaningful together with --create).
my $FTS = 0;

## Two flags based on http://stackoverflow.com/questions/1232116
## how-to-create-pod-and-use-pod2usage-in-perl
my $opt_help;
my $opt_man;
## Parse the command line into the option variables declared above.
## GetOptions() returns false when it meets an unknown or malformed
## option; in that case print a short hint and exit with status 2.
GetOptions(
    'd|dsn=s'                 => \$DSN,
    's|seqfeature=s'          => \$SFCLASS,
    'n|namespace=s'           => \$NAMESPACE,
    'a|adaptor=s'             => \$ADAPTOR,
    'v|verbose!'              => \$VERBOSE,
    'f|fast'                  => \$FAST,
    'T|temporary-directory=s' => \$TMP,
    'i|ignore-seqregion'      => \$IGNORE_SEQREGION,
    'c|create'                => \$CREATE,
    'u|user=s'                => \$USER,
    'p|password=s'            => \$PASS,
    'z|zip'                   => \$COMPRESS,
    'S|subfeatures!'          => \$INDEX_SUB,

    ## Any good single letter choices here?
    'noalias-target'          => \$NOALIAS_TARGET,
    'summary'                 => \$SUMMARY_STATS,
    'N|nosummary'             => \$NOSUMMARY_STATS,
    'fts'                     => \$FTS,

    ## I miss '--help' when it isn't there!
    'h|help!'                 => \$opt_help,
    'm|man!'                  => \$opt_man,
)
    or pod2usage(
        -message =>
            "\nTry 'bp_seqfeature_load.pl --help' for more information\n",
        -verbose => 0,
        -exitval => 2,
    );

## Should we output usage information? (pod2usage() exits the program.)
pod2usage( -verbose => 1 ) if $opt_help;
pod2usage( -verbose => 2 ) if $opt_man;

## Did we get any files to process? Whatever GetOptions() left in @ARGV
## is the list of GFF / fasta files to load.
@ARGV
    or pod2usage(
        -message =>
            "\nYou need to pass some GFF or fasta files to load\n",
        -verbose => 0,
        -exitval => 2,
    );

## --fts is rejected unless --create was also given.
pod2usage(
    -message => "\n--fts requires --create\n",
    -verbose => 0,
    -exitval => 2,
) if ($FTS and not $CREATE);
## POD

=head1 NAME

bp_seqfeature_load.pl - Load GFF into a SeqFeature database

=head1 DESCRIPTION

Pass any number of GFF or fasta format files (or GFF with embedded
fasta) to load the features and sequences into a SeqFeature
database. The database (and adaptor) to use is specified on the
command line. Use the --create flag to create a new SeqFeature
database.

=head1 SYNOPSIS

  bp_seqfeature_load.pl [options] gff_or_fasta_file1 [gff_or_fasta_file2 [...]]

Try 'bp_seqfeature_load.pl --help' or '--man' for more information.

=head1 OPTIONS

=over 4

=item -d, --dsn

DBI data source (default dbi:mysql:test).

=item -n, --namespace

The table prefix to use (default undef). Allows several independent
sequence feature databases to be stored in a single database.

=item -s, --seqfeature

The class of SeqFeature object to create for each loaded feature
(default Bio::DB::SeqFeature). See the source code of the available
classes for the alternatives.

=item -a, --adaptor

The storage adaptor (class) to use (default DBI::mysql).

=item -v, --verbose

Turn on verbose progress reporting (default true). Use --noverbose to
switch this off.

=item -f, --fast

Activate fast loading (default 0). Only available for some adaptors.

=item -T, --temporary-directory

Specify temporary directory for fast loading (default
File::Spec->tmpdir()).

=item -i, --ignore-seqregion

If true, then ignore ##sequence-region directives in the GFF3 file
(default: create a feature for each region).

=item -c, --create

Create the database and reinitialize it (default false). Note, this
will erase previous database contents, if any.

=item -u, --user

User to connect to database as.

=item -p, --password

Password to use to connect to database.

=item -z, --zip

Compress database tables to save space (default false).

=item -S, --subfeatures

Turn on indexing of subfeatures (default true). Use --nosubfeatures to
switch this off.

=item --fts

Index the attribute table for full-text search (default false). Applicable
only when --create is specified. Currently applicable to the DBI::SQLite
storage adaptor only (using the most recent supported FTS indexing method,
which may not be portable to older DBI::SQLite versions).

=item --summary

Generate summary statistics for coverage graphs (default false). This
can be run on a previously loaded database or during the load. It will
default to true if --create is used.

=item -N, --nosummary

Do not generate summary statistics, to save some space and load time.
This is the default if --create is not specified; use this option to
explicitly turn off summary statistics when --create is specified.

=item --noalias-target

Don't create an Alias attribute whose value is the target_id in a
Target attribute. (If the feature contains a Target attribute, the
default is to create an Alias attribute whose value is the target_id
in the Target attribute.)

=back

Please see http://www.sequenceontology.org/gff3.shtml for information
about the GFF3 format. BioPerl extends the format slightly by adding a
##index-subfeatures directive. Set this to a true value if you wish
the database to be able to retrieve a feature's individual parts (such
as the exons of a transcript) independently of the top level feature:

  ##index-subfeatures 1

It is also possible to control the indexing of subfeatures on a
case-by-case basis by adding "index=1" or "index=0" to the feature's
attribute list. This should only be used for subfeatures.

Subfeature indexing is true by default. Set it to false (0) to save
lots of database space and improve performance. You may use
--nosubfeatures to force this.

=cut

## Fast loading needs a temporary directory we can actually write into,
## so check that up front instead of failing part-way through a load.
if ($FAST) {
    -d $TMP && -w $TMP
        or die "Fast loading is requested, but I cannot write into the directory $TMP";

    ## For MySQL adaptors, add the mysql_local_infile flag to the DSN
    ## unless the caller already specified it one way or the other.
    $DSN .= ";mysql_local_infile=1"
        if $ADAPTOR =~ /mysql/i && $DSN !~ /mysql_local_infile/;
}

## Open (and, with --create, set up) the feature store. The credentials
## are passed through as given; both default to the empty string.
my $store = Bio::DB::SeqFeature::Store->new(
    -dsn       => $DSN,
    -namespace => $NAMESPACE,
    -adaptor   => $ADAPTOR,
    -tmpdir    => $TMP,
    -user      => $USER,
    -pass      => $PASS,
    -write     => 1,
    -create    => $CREATE,
    -compress  => $COMPRESS,
    -fts       => $FTS,
)
    or die "Couldn't create connection to the database";

## --create wipes any existing contents and turns summary statistics on
## by default (--nosummary, handled below, can still switch them off).
$store->init_database('erase') if $CREATE;
$SUMMARY_STATS++ if $CREATE;    # this is a good thing

my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(
    -store             => $store,
    -sf_class          => $SFCLASS,
    -verbose           => $VERBOSE,
    -tmpdir            => $TMP,
    -fast              => $FAST,
    -ignore_seqregion  => $IGNORE_SEQREGION,
    -index_subfeatures => $INDEX_SUB,
    -noalias_target    => $NOALIAS_TARGET,
    ## An explicit --nosummary always wins over --summary / --create.
    -summary_stats     => $NOSUMMARY_STATS ? 0 : $SUMMARY_STATS,
)
    or die "Couldn't create GFF3 loader";

# on signals, give objects a chance to call their DESTROY methods
$SIG{TERM} = $SIG{INT} = sub { undef $loader; undef $store; die "Aborted..."; };

## Everything left in @ARGV after option parsing is a file to load.
$loader->load(@ARGV);

exit 0;