Merge pull request #4606 from solgenomics/topic/fix-germplasm-log
[sgn.git] / bin / load_phenotypes_spreadsheet_csv.pl
blobc69320946ada5e1f144aef77bd58b5c6e4075ca6
1 #!/usr/bin/perl
3 =head1
5 load_phenotypes_spreadsheet_csv.pl - backend script for loading phenotypes into cxgn databases from a spreadsheet csv file. uses same process as online interface.
7 =head1 SYNOPSIS
9 load_phenotypes_spreadsheet_csv.pl -H [dbhost] -D [dbname] -U [dbuser] -P [dbpass] -b [basepath] -i [infile] -a [archive path on server] -d [datalevel] -u [username] -t [timestamps included] -o [overwrite previous values] -r [temp_file_nd_experiment_id]
11 =head1 COMMAND-LINE OPTIONS
12 ARGUMENTS
13 -H host name (required) e.g. "localhost"
14 -D database name (required) e.g. "cxgn_cassava"
15 -U database username (required)
16 -P database userpass (required)
17 -b basepath (required) e.g. "/home/me/cxgn/sgn"
18 -r temp_file_nd_experiment_id (required) e.g. "/tmp/delete_nd_experiment_ids.txt"
19 -i path to infile (required)
20 -a archive path (required) e.g. /export/prod/archive/
21 -d datalevel (required) must be plots or plants
22 -u username (required) username in database of peron uploading phenotypes
23 -t timestamps included (optional) 1 or 0
24 -o overwrite previous values (optional) 1 or 0
26 =head1 DESCRIPTION
28 perl bin/load_phenotypes_spreadsheet_csv.pl -D cass -H localhost -U postgres -P postgres -b /home/me/cxgn/sgn -u nmorales -i ~/Downloads/combined_counts.csv -a /export/prod/archive/ -d plants -r /tmp/delete_nd_experiment_ids.txt
30 This script will parse and validate the input file. If there are any warnings or errors during validation it will die.
31 If there are no warnings or errors during validation it will then store the data.
33 input file should be a spreadsheet csv file. All fields should be quoted.
34 header line should be: "studyYear","studyDbId","studyName","studyDesign","locationDbId","locationName","germplasmDbId","germplasmName","germplasmSynonyms","observationLevel","observationUnitDbId","observationUnitName","replicate","blockNumber","plotNumber" followed by all measured traits
36 To include timestamps, values should be "value,YYYY-MM-DD 13:45:01+0004".
38 =head1 AUTHOR
40 Nicolas Morales (nm529@cornell.edu)
42 =cut
44 use strict;
46 use Getopt::Std;
47 use Data::Dumper;
48 use Carp qw /croak/ ;
49 use Pod::Usage;
50 use DateTime;
51 use Bio::Chado::Schema;
52 use CXGN::Metadata::Schema;
53 use CXGN::Phenome::Schema;
54 use CXGN::DB::InsertDBH;
55 use CXGN::Phenotypes::StorePhenotypes;
56 use CXGN::Phenotypes::ParseUpload;
57 use CXGN::UploadFile;
58 use File::Basename;
60 our ($opt_H, $opt_D, $opt_U, $opt_P, $opt_b, $opt_i, $opt_a, $opt_d, $opt_u, $opt_t, $opt_o, $opt_r);
62 getopts('H:D:U:P:b:i:a:d:u:t:o:r:');
64 if (!$opt_H || !$opt_D || !$opt_U ||!$opt_P || !$opt_b || !$opt_i || !$opt_a || !$opt_d || !$opt_u || !$opt_r) {
65 die "Must provide options -H (hostname), -D (database name), -U (database user), -P (database password), -b (basepath), -i (input file), -a (archive path), -d (datalevel), -u (username in db), -r (temp_file_nd_experiment_id)\n";
68 my $schema = Bio::Chado::Schema->connect(
69 "dbi:Pg:database=$opt_D;host=$opt_H", # DSN Line
70 $opt_U, # Username
71 $opt_P # Password
73 my $metadata_schema = CXGN::Metadata::Schema->connect(
74 "dbi:Pg:database=$opt_D;host=$opt_H", # DSN Line
75 $opt_U, # Username
76 $opt_P # Password
78 $metadata_schema->storage->dbh->do('SET search_path TO metadata');
80 my $phenome_schema = CXGN::Phenome::Schema->connect(
81 "dbi:Pg:database=$opt_D;host=$opt_H", # DSN Line
82 $opt_U, # Username
83 $opt_P # Password
85 $phenome_schema->storage->dbh->do('SET search_path TO phenome');
87 my $dbh = CXGN::DB::InsertDBH->new({
88 dbhost=>$opt_H,
89 dbname=>$opt_D,
90 dbargs => {AutoCommit => 1, RaiseError => 1}
91 });
92 $dbh->do('SET search_path TO public,sgn');
94 my $q = "SELECT sp_person_id from sgn_people.sp_person where username = '$opt_u';";
95 my $h = $dbh->prepare($q);
96 $h->execute();
97 my ($sp_person_id) = $h->fetchrow_array();
98 if (!$sp_person_id){
99 die "Not a valid -u\n";
102 my $timestamp_included;
103 if ($opt_t == 1){
104 $timestamp_included = 1;
107 my $parser = CXGN::Phenotypes::ParseUpload->new();
108 my $subdirectory = "spreadsheet_csv_phenotype_upload";
109 my $validate_type = "phenotype spreadsheet csv";
110 my $metadata_file_type = "spreadsheet csv phenotype file";
111 my $upload = $opt_i;
112 my $data_level = $opt_d;
114 my $time = DateTime->now();
115 my $timestamp = $time->ymd()."_".$time->hms();
117 my $uploader = CXGN::UploadFile->new({
118 tempfile => $upload,
119 subdirectory => $subdirectory,
120 archive_path => $opt_a,
121 archive_filename => basename($upload),
122 timestamp => $timestamp,
123 user_id => $sp_person_id,
124 user_role => 'curator'
126 my $archived_filename_with_path = $uploader->archive();
127 my $md5 = $uploader->get_md5($archived_filename_with_path);
128 if (!$archived_filename_with_path) {
129 die "Could not archive file!\n";
130 } else {
131 print STDERR "File saved in archive.\n";
134 my %phenotype_metadata;
135 $phenotype_metadata{'archived_file'} = $archived_filename_with_path;
136 $phenotype_metadata{'archived_file_type'} = $metadata_file_type;
137 $phenotype_metadata{'operator'} = $opt_u;
138 $phenotype_metadata{'date'} = $timestamp;
140 my $validate_file = $parser->validate($validate_type, $archived_filename_with_path, $timestamp_included, $data_level, $schema, undef, undef);
141 if (!$validate_file) {
142 die "Input file itself not valid.\n";
144 if ($validate_file == 1){
145 print STDERR "File itself valid. Will now parse.\n";
146 } else {
147 if ($validate_file->{'error'}) {
148 die $validate_file->{'error'}."\n";
152 my $parsed_file = $parser->parse($validate_type, $archived_filename_with_path, $timestamp_included, $data_level, $schema, undef, $sp_person_id, undef, undef);
153 if (!$parsed_file) {
154 die "Error parsing file.\n";
156 if ($parsed_file->{'error'}) {
157 die $parsed_file->{'error'},"\n";
160 print STDERR "File parsed. Will now validate contents.\n";
162 my %parsed_data;
163 my @plots;
164 my @traits;
165 if ($parsed_file && !$parsed_file->{'error'}) {
166 %parsed_data = %{$parsed_file->{'data'}};
167 @plots = @{$parsed_file->{'units'}};
168 @traits = @{$parsed_file->{'variables'}};
171 my $store_phenotypes = CXGN::Phenotypes::StorePhenotypes->new(
172 basepath=>$opt_b,
173 dbhost=>$opt_H,
174 dbname=>$opt_D,
175 dbuser=>$opt_U,
176 dbpass=>$opt_P,
177 temp_file_nd_experiment_id=>$opt_r,
178 bcs_schema=>$schema,
179 metadata_schema=>$metadata_schema,
180 phenome_schema=>$phenome_schema,
181 user_id=>$sp_person_id,
182 stock_list=>\@plots,
183 trait_list=>\@traits,
184 values_hash=>\%parsed_data,
185 has_timestamps=>$timestamp_included,
186 metadata_hash=>\%phenotype_metadata,
189 my ($verified_warning, $verified_error) = $store_phenotypes->verify();
190 if ($verified_error) {
191 die $verified_error."\n";
193 if ($verified_warning && !$opt_o) {
194 die $verified_warning."\n";
197 print STDERR "Done validating. Now storing\n";
199 my ($stored_phenotype_error, $stored_Phenotype_success) = $store_phenotypes->store();
200 if ($stored_phenotype_error) {
201 die $stored_phenotype_error."\n";
203 print STDERR $stored_Phenotype_success."\n";
204 print STDERR "Script Complete.\n";