5 load_phenotypes_spreadsheet_csv.pl - backend script for loading phenotypes into cxgn databases from a spreadsheet csv file. uses same process as online interface.
9 load_phenotypes_spreadsheet_csv.pl -H [dbhost] -D [dbname] -U [dbuser] -P [dbpass] -i [infile] -a [archive path on server] -d [datalevel] -u [username] -t [timestamps included] -o [overwrite previous values]
11 =head1 COMMAND-LINE OPTIONS
13 -H host name (required) e.g. "localhost"
14 -D database name (required) e.g. "cxgn_cassava"
15 -U database username (required)
16 -P database userpass (required)
17 -i path to infile (required)
18 -a archive path (required) e.g. /export/prod/archive/
19 -d datalevel (required) must be plots or plants
20 -u username (required) username in database of peron uploading phenotypes
21 -t timestamps included (optional) 1 or 0
22 -o overwrite previous values (optional) 1 or 0
26 perl bin/load_phenotypes_spreadsheet_csv.pl -D cass -H localhost -U postgres -P postgres -u nmorales -i ~/Downloads/combined_counts.csv -a /export/prod/archive/ -d plants
28 This script will parse and validate the input file. If there are any warnings or errors during validation it will die.
29 If there are no warnings or errors during validation it will then store the data.
31 input file should be a spreadsheet csv file. All fields should be quoted.
32 header line should be: "studyYear","studyDbId","studyName","studyDesign","locationDbId","locationName","germplasmDbId","germplasmName","germplasmSynonyms","observationLevel","observationUnitDbId","observationUnitName","replicate","blockNumber","plotNumber" followed by all measured traits
34 To include timestamps, values should be "value,YYYY-MM-DD 13:45:01+0004".
38 Nicolas Morales (nm529@cornell.edu)
49 use Bio
::Chado
::Schema
;
50 use CXGN
::Metadata
::Schema
;
51 use CXGN
::Phenome
::Schema
;
52 use CXGN
::DB
::InsertDBH
;
53 use CXGN
::Phenotypes
::StorePhenotypes
;
54 use CXGN
::Phenotypes
::ParseUpload
;
58 our ($opt_H, $opt_D, $opt_U, $opt_P, $opt_i, $opt_a, $opt_d, $opt_u, $opt_t, $opt_o);
60 getopts
('H:D:U:P:i:a:d:u:t:o:');
62 if (!$opt_H || !$opt_D || !$opt_U ||!$opt_P || !$opt_i || !$opt_a || !$opt_d || !$opt_u ) {
63 die "Must provide options -H (hostname), -D (database name), -U (database user), -P (database password), -i (input file), -a (archive path), -d (datalevel), -u (username in db)\n";
66 my $schema = Bio
::Chado
::Schema
->connect(
67 "dbi:Pg:database=$opt_D;host=$opt_H", # DSN Line
71 my $metadata_schema = CXGN
::Metadata
::Schema
->connect(
72 "dbi:Pg:database=$opt_D;host=$opt_H", # DSN Line
76 $metadata_schema->storage->dbh->do('SET search_path TO metadata');
78 my $phenome_schema = CXGN
::Phenome
::Schema
->connect(
79 "dbi:Pg:database=$opt_D;host=$opt_H", # DSN Line
83 $phenome_schema->storage->dbh->do('SET search_path TO phenome');
85 my $dbh = CXGN
::DB
::InsertDBH
->new({
88 dbargs
=> {AutoCommit
=> 1, RaiseError
=> 1}
90 $dbh->do('SET search_path TO public,sgn');
92 my $q = "SELECT sp_person_id from sgn_people.sp_person where username = '$opt_u';";
93 my $h = $dbh->prepare($q);
95 my ($sp_person_id) = $h->fetchrow_array();
97 die "Not a valid -u\n";
100 my $timestamp_included;
102 $timestamp_included = 1;
105 my $parser = CXGN
::Phenotypes
::ParseUpload
->new();
106 my $subdirectory = "spreadsheet_csv_phenotype_upload";
107 my $validate_type = "phenotype spreadsheet csv";
108 my $metadata_file_type = "spreadsheet csv phenotype file";
110 my $data_level = $opt_d;
112 my $time = DateTime
->now();
113 my $timestamp = $time->ymd()."_".$time->hms();
115 my $uploader = CXGN
::UploadFile
->new({
117 subdirectory
=> $subdirectory,
118 archive_path
=> $opt_a,
119 archive_filename
=> basename
($upload),
120 timestamp
=> $timestamp,
121 user_id
=> $sp_person_id,
122 user_role
=> 'curator'
124 my $archived_filename_with_path = $uploader->archive();
125 my $md5 = $uploader->get_md5($archived_filename_with_path);
126 if (!$archived_filename_with_path) {
127 die "Could not archive file!\n";
129 print STDERR
"File saved in archive.\n";
132 my %phenotype_metadata;
133 $phenotype_metadata{'archived_file'} = $archived_filename_with_path;
134 $phenotype_metadata{'archived_file_type'} = $metadata_file_type;
135 $phenotype_metadata{'operator'} = $opt_u;
136 $phenotype_metadata{'date'} = $timestamp;
138 my $validate_file = $parser->validate($validate_type, $archived_filename_with_path, $timestamp_included, $data_level, $schema);
139 if (!$validate_file) {
140 die "Input file itself not valid.\n";
142 if ($validate_file == 1){
143 print STDERR
"File itself valid. Will now parse.\n";
145 if ($validate_file->{'error'}) {
146 die $validate_file->{'error'}."\n";
150 my $parsed_file = $parser->parse($validate_type, $archived_filename_with_path, $timestamp_included, $data_level, $schema);
152 die "Error parsing file.\n";
154 if ($parsed_file->{'error'}) {
155 die $parsed_file->{'error'},"\n";
158 print STDERR
"File parsed. Will now validate contents.\n";
163 if ($parsed_file && !$parsed_file->{'error'}) {
164 %parsed_data = %{$parsed_file->{'data'}};
165 @plots = @
{$parsed_file->{'units'}};
166 @traits = @
{$parsed_file->{'variables'}};
169 my $store_phenotypes = CXGN
::Phenotypes
::StorePhenotypes
->new(
171 metadata_schema
=>$metadata_schema,
172 phenome_schema
=>$phenome_schema,
173 user_id
=>$sp_person_id,
175 trait_list
=>\
@traits,
176 values_hash
=>\
%parsed_data,
177 has_timestamps
=>$timestamp_included,
178 metadata_hash
=>\
%phenotype_metadata,
181 my ($verified_warning, $verified_error) = $store_phenotypes->verify();
182 if ($verified_error) {
183 die $verified_error."\n";
185 if ($verified_warning && !$opt_o) {
186 die $verified_warning."\n";
189 print STDERR
"Done validating. Now storing\n";
191 my ($stored_phenotype_error, $stored_Phenotype_success) = $store_phenotypes->store();
192 if ($stored_phenotype_error) {
193 die $stored_phenotype_error."\n";
195 print STDERR
$stored_Phenotype_success."\n";
196 print STDERR
"Script Complete.\n";