Merge pull request #3948 from solgenomics/lukasmueller-patch-1
[sgn.git] / bin / load_sequence_metadata.pl
blob30c50645e5f1ca3d5431f39cc9bfb02e164732b7
1 #! /usr/bin/perl
3 =head1
4 load_sequence_metadata.pl - verify and store sequence metadata from a gff3 file
6 =head1 SYNOPSIS
7 This script uses the CXGN::Genotype::SequenceMetadata package to verify and store sequence metadata
8 from a gff3 file.
10 The verification step checks to make sure the seqid column (#1) in the gff3 file matches existing features
11 stored in the database. If a list of attributes are provided, the verification step will check if those
12 attributes exist in the gff3 file and if there are attributes in the file missing from the provided list.
14 If the verification passes, the script will store the annotations from the gff3 file as sequence metadata
15 in the featureprop_json table.
17 IMPORTANT: A sequence metadata protocol must already exist in the nd_protocol table and must have an nd_protocolprop
18 of type 'sequence_metadata_protocol_properties' that include the sequence metadata type and attributes described by
19 the protocol. If the protocol does not yet exist, you can use the load_sequence_metadata_protocol.pl script to create one.
21 =head1 COMMAND-LINE OPTIONS
22 ARGUMENTS
23 -H host name (required) e.g. "localhost"
24 -D database name (required) e.g. "cxgn_cassava"
25 -U database username (required)
26 -P database password (optional, default=prompt user for password)
27 -n nd_protocol_id of the sequence metadata protocol to associated the data with (required)
28 -i input gff file (required)
29 -o output/processed gff file (optional, default=$input.processed)
30 -s species name (required)
31 -w flag to skip verification warnings about missing/undefined attributes (optional, default=prompt user on warnings)
33 =head1 AUTHOR
34 David Waring <djw64@cornell.edu>
36 =cut
38 use strict;
40 use Getopt::Std;
41 use Data::Dumper;
42 use JSON;
44 use Bio::Chado::Schema;
45 use CXGN::Genotype::SequenceMetadata;
49 # Read CLI Options
50 our ($opt_H, $opt_D, $opt_U, $opt_P, $opt_n, $opt_i, $opt_o, $opt_s, $opt_w);
51 getopts('H:D:U:P:n:i:o:s:w');
54 # Check for required arguments
55 if ( !$opt_H || !$opt_U || !$opt_D ) {
56 die "ERROR: Database options -H, -D, and -U are required!\n";
58 if ( !$opt_n ) {
59 die "ERROR: Sequence Metadata Protocol ID is required!\n";
61 if ( !$opt_i ) {
62 die "ERROR: Input gff file path required!\n";
64 if ( !$opt_s ) {
65 die "ERROR: Species name is required!\n";
69 # Set parameters from options
70 my $nd_protocol_id = $opt_n;
71 my $input = $opt_i;
72 my $output = $opt_o ? $opt_o : $opt_i . ".processed";
73 my $species = $opt_s;
74 my $ignore_warnings = $opt_w;
77 # Check if input file exists
78 if( !(-e -f -r $input) ){
79 die "ERROR: Input file ($input) does not exist or is not readable!\n";
83 # Connect to DB
84 my $pass = $opt_P;
85 if ( !$opt_P ) {
86 print "Password for $opt_H / $opt_D: \n";
87 my $pw = <>;
88 chomp($pw);
89 $pass = $pw;
91 print STDERR "Connecting to database...\n";
92 my $dsn = 'dbi:Pg:database='.$opt_D.";host=".$opt_H.";port=5432";
93 my $schema = Bio::Chado::Schema->connect($dsn, $opt_U, $pass);
94 my $dbh = $schema->storage->dbh;
97 # Get the protocol information (type_id and attributes)
98 my $q = "SELECT value FROM public.nd_protocolprop WHERE nd_protocol_id = ?;";
99 my $h = $dbh->prepare($q);
100 $h->execute($nd_protocol_id);
101 my ($value_json) = $h->fetchrow_array();
102 if (!$value_json){
103 die "ERROR: Sequence Metadata Protocol ID ($nd_protocol_id) is not valid!\n";
105 my $value = decode_json $value_json;
106 my $type_id = $value->{'sequence_metadata_type_id'};
107 my $attribute_descriptions = $value->{'attribute_descriptions'};
108 if ( !$type_id || !$attribute_descriptions ) {
109 die "ERROR: Sequence Metadata Protocol does not have the correct attributes (sequence_metadata_type_id, attribute_descriptions) set!\n";
111 my @attributes = keys %{$attribute_descriptions};
114 # VERIFY
115 print STDERR "====> VERIFY GFF FILE <====\n";
116 print STDERR "Input File: $input\n";
117 print STDERR "Output File: $output\n";
118 print STDERR "Species: $species\n";
119 print STDERR "Type ID: $type_id\n";
120 print STDERR "Protocol ID: $nd_protocol_id\n";
121 print STDERR "Attributes: " . join(', ', @attributes) . "\n";
123 my $smd = CXGN::Genotype::SequenceMetadata->new(bcs_schema => $schema, type_id => $type_id, nd_protocol_id => $nd_protocol_id);
124 my $verification_results = $smd->verify($input, $output, $species, \@attributes);
126 # Verification Error
127 if ( $verification_results->{'processed'} ne 1 || $verification_results->{'verified'} ne 1 ) {
128 print STDERR "Missing features:\n";
129 print STDERR Dumper $verification_results->{'missing_features'};
130 die "ERROR: Could not verify the input gff file!\n";
132 elsif ( !$ignore_warnings && ($verification_results->{'missing_attributes'} || $verification_results->{'undefined_attributes'}) ) {
133 print STDERR "WARNING: Missing and/or undefined attributes:\n";
134 print STDERR "Missing Attributes (Provided in the list of attributes but not found in the file)\n";
135 print STDERR Dumper $verification_results->{'missing_attributes'};
136 print STDERR "Undefined Attributes (Found in the file but not provided in the list of attributes)\n";
137 print STDERR Dumper $verification_results->{'undefined_attributes'};
139 print "Do you want to continue storing the file? (Y/N): ";
140 chomp(my $answer = <STDIN>);
141 if ( !(lc($answer) eq 'y') ) {
142 die "ERROR: Verification failed due to missing/undefined attributes\n";
146 print "--> VERIFICATION COMPLETE!\n";
149 # STORE
150 print STDERR "====> STORE GFF FILE <====\n";
152 my $store_results = $smd->store($output, $species);
154 # Store Error
155 if ( $store_results->{'stored'} ne 1 ) {
156 die $store_results->{'error'};
159 print STDERR "--> STORAGE COMPLETE (" . $store_results->{'chunks'} . " chunks written to the database)!\n";