4 load_sequence_metadata.pl - verify and store sequence metadata from a gff3 file
7 This script uses the CXGN::Genotype::SequenceMetadata package to verify and store sequence metadata
10 The verification step checks to make sure the seqid column (#1) in the gff3 file matches existing features
11 stored in the database. If a list of attributes are provided, the verification step will check if those
12 attributes exist in the gff3 file and if there are attributes in the file missing from the provided list.
14 If the verification passes, the script will store the annotations from the gff3 file as sequence metadata
15 in the featureprop_json table.
17 IMPORTANT: A sequence metadata protocol must already exist in the nd_protocol table and must have an nd_protocolprop
18 of type 'sequence_metadata_protocol_properties' that include the sequence metadata type and attributes described by
19 the protocol. If the protocol does not yet exist, you can use the load_sequence_metadata_protocol.pl script to create one.
21 =head1 COMMAND-LINE OPTIONS
23 -H host name (required) e.g. "localhost"
24 -D database name (required) e.g. "cxgn_cassava"
25 -U database username (required)
26 -P database password (optional, default=prompt user for password)
27 -n nd_protocol_id of the sequence metadata protocol to associated the data with (required)
28 -i input gff file (required)
29 -o output/processed gff file (optional, default=$input.processed)
30 -s species name (required)
31 -w flag to skip verification warnings about missing/undefined attributes (optional, default=prompt user on warnings)
34 David Waring <djw64@cornell.edu>
44 use Bio
::Chado
::Schema
;
45 use CXGN
::Genotype
::SequenceMetadata
;
50 our ($opt_H, $opt_D, $opt_U, $opt_P, $opt_n, $opt_i, $opt_o, $opt_s, $opt_w);
51 getopts
('H:D:U:P:n:i:o:s:w');
54 # Check for required arguments
55 if ( !$opt_H || !$opt_U || !$opt_D ) {
56 die "ERROR: Database options -H, -D, and -U are required!\n";
59 die "ERROR: Sequence Metadata Protocol ID is required!\n";
62 die "ERROR: Input gff file path required!\n";
65 die "ERROR: Species name is required!\n";
69 # Set parameters from options
70 my $nd_protocol_id = $opt_n;
72 my $output = $opt_o ?
$opt_o : $opt_i . ".processed";
74 my $ignore_warnings = $opt_w;
77 # Check if input file exists
78 if( !(-e
-f
-r
$input) ){
79 die "ERROR: Input file ($input) does not exist or is not readable!\n";
86 print "Password for $opt_H / $opt_D: \n";
91 print STDERR
"Connecting to database...\n";
92 my $dsn = 'dbi:Pg:database='.$opt_D.";host=".$opt_H.";port=5432";
93 my $schema = Bio
::Chado
::Schema
->connect($dsn, $opt_U, $pass);
94 my $dbh = $schema->storage->dbh;
97 # Get the protocol information (type_id and attributes)
98 my $q = "SELECT value FROM public.nd_protocolprop WHERE nd_protocol_id = ?;";
99 my $h = $dbh->prepare($q);
100 $h->execute($nd_protocol_id);
101 my ($value_json) = $h->fetchrow_array();
103 die "ERROR: Sequence Metadata Protocol ID ($nd_protocol_id) is not valid!\n";
105 my $value = decode_json
$value_json;
106 my $type_id = $value->{'sequence_metadata_type_id'};
107 my $attribute_descriptions = $value->{'attribute_descriptions'};
108 if ( !$type_id || !$attribute_descriptions ) {
109 die "ERROR: Sequence Metadata Protocol does not have the correct attributes (sequence_metadata_type_id, attribute_descriptions) set!\n";
111 my @attributes = keys %{$attribute_descriptions};
115 print STDERR
"====> VERIFY GFF FILE <====\n";
116 print STDERR
"Input File: $input\n";
117 print STDERR
"Output File: $output\n";
118 print STDERR
"Species: $species\n";
119 print STDERR
"Type ID: $type_id\n";
120 print STDERR
"Protocol ID: $nd_protocol_id\n";
121 print STDERR
"Attributes: " . join(', ', @attributes) . "\n";
123 my $smd = CXGN
::Genotype
::SequenceMetadata
->new(bcs_schema
=> $schema, type_id
=> $type_id, nd_protocol_id
=> $nd_protocol_id);
124 my $verification_results = $smd->verify($input, $output, $species, \
@attributes);
127 if ( $verification_results->{'processed'} ne 1 || $verification_results->{'verified'} ne 1 ) {
128 print STDERR
"Missing features:\n";
129 print STDERR Dumper
$verification_results->{'missing_features'};
130 die "ERROR: Could not verify the input gff file!\n";
132 elsif ( !$ignore_warnings && ($verification_results->{'missing_attributes'} || $verification_results->{'undefined_attributes'}) ) {
133 print STDERR
"WARNING: Missing and/or undefined attributes:\n";
134 print STDERR
"Missing Attributes (Provided in the list of attributes but not found in the file)\n";
135 print STDERR Dumper
$verification_results->{'missing_attributes'};
136 print STDERR
"Undefined Attributes (Found in the file but not provided in the list of attributes)\n";
137 print STDERR Dumper
$verification_results->{'undefined_attributes'};
139 print "Do you want to continue storing the file? (Y/N): ";
140 chomp(my $answer = <STDIN
>);
141 if ( !(lc($answer) eq 'y') ) {
142 die "ERROR: Verification failed due to missing/undefined attributes\n";
146 print "--> VERIFICATION COMPLETE!\n";
150 print STDERR
"====> STORE GFF FILE <====\n";
152 my $store_results = $smd->store($output, $species);
155 if ( $store_results->{'stored'} ne 1 ) {
156 die $store_results->{'error'};
159 print STDERR
"--> STORAGE COMPLETE (" . $store_results->{'chunks'} . " chunks written to the database)!\n";