make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / load_locations_extended.pl
blob6f3c5391640edb6c3c468971b77f60f3347e15ca
1 #!/usr/bin/perl
3 =head1
5 load_locations_extended.pl - loading locations into cxgn databases.
7 =head1 SYNOPSIS
9 load_locations_extended.pl -H [dbhost] -D [dbname] -i [infile]
11 =head1 COMMAND-LINE OPTIONS
12 ARGUMENTS
13 -H host name (required) e.g. "localhost"
14 -D database name (required) e.g. "cxgn_cassava"
15 -i path to infile (required)
17 =head1 DESCRIPTION
19 This script loads locations data into Chado, by adding data to nd_geolocation table, and properties to nd_geolocationprop. Infile is Excel .xls and .xlsx format.
20 Header must have in the first 4 columns: 'Name', 'Longitude', 'Latitude', 'Altitude'.
21 Header columns from colum 5 and onwards are stored as cvterms that are part of the cv 'geolocation_property'.
24 =head1 AUTHOR
26 Nicolas Morales (nm529@cornell.edu)
28 =cut
30 use strict;
32 use Getopt::Std;
33 use Data::Dumper;
34 use Carp qw /croak/ ;
35 use Pod::Usage;
36 use Spreadsheet::ParseExcel;
37 use Spreadsheet::ParseXLSX;
38 use Bio::Chado::Schema;
39 use CXGN::DB::InsertDBH;
41 our ($opt_H, $opt_D, $opt_i);
43 getopts('H:D:i:');
45 if (!$opt_H || !$opt_D || !$opt_i) {
46 pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file) \n");
49 my $dbhost = $opt_H;
50 my $dbname = $opt_D;
52 # Match a dot, extension .xls / .xlsx
53 my ($extension) = $opt_i =~ /(\.[^.]+)$/;
54 my $parser;
56 if ($extension eq '.xlsx') {
57 $parser = Spreadsheet::ParseXLSX->new();
59 else {
60 $parser = Spreadsheet::ParseExcel->new();
63 my $excel_obj = $parser->parse($opt_i);
65 my $dbh = CXGN::DB::InsertDBH->new({
66 dbhost=>$dbhost,
67 dbname=>$dbname,
68 dbargs => {AutoCommit => 1, RaiseError => 1}
69 });
71 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
72 $dbh->do('SET search_path TO public,sgn');
75 my $worksheet = ( $excel_obj->worksheets() )[0]; #support only one worksheet
76 my ( $row_min, $row_max ) = $worksheet->row_range();
77 my ( $col_min, $col_max ) = $worksheet->col_range();
79 if ($col_max ne '11' || $worksheet->get_cell(0,0)->value() ne 'Full Name' || $worksheet->get_cell(0,1)->value() ne 'Longitude' || $worksheet->get_cell(0,2)->value() ne 'Latitude' || $worksheet->get_cell(0,3)->value() ne 'Altitude' || $worksheet->get_cell(0,4)->value() ne 'Uniquename' || $worksheet->get_cell(0,5)->value() ne 'Agricultural Ecological Zone'|| $worksheet->get_cell(0,6)->value() ne 'Continent' || $worksheet->get_cell(0,7)->value() ne 'Country' || $worksheet->get_cell(0,8)->value() ne 'Country Code' || $worksheet->get_cell(0,9)->value() ne 'adm1' || $worksheet->get_cell(0,10)->value() ne 'adm2' || $worksheet->get_cell(0,11)->value() ne 'adm3') {
80 pod2usage(-verbose => 2, -message => "Headers must be only in this order: Full Name, Longitude, Latitude, Altitude, Uniquename, Agricultural Ecological Zone, Continent, Country, Country Code, adm1, adm2, adm3\n");
84 for my $row ( 1 .. $row_max ) {
86 my $name = $worksheet->get_cell($row,0)->value();
87 my $longitude = $worksheet->get_cell($row,1)->value();
88 my $latitude = $worksheet->get_cell($row,2)->value();
89 my $altitude = $worksheet->get_cell($row,3)->value();
91 my $new_row;
92 $new_row = $schema->resultset('NaturalDiversity::NdGeolocation')->new({ description => $name });
93 if ($longitude) {
94 $new_row->longitude($longitude);
96 if ($latitude) {
97 $new_row->latitude($latitude);
99 if ($altitude) {
100 $new_row->altitude($altitude);
102 $new_row->insert();
104 print STDERR "Stored: ".$name." in nd_geolocation.\n";
106 for my $col ( 4 .. $col_max ) {
108 my $prop_term = $worksheet->get_cell(0, $col)->value();
109 my $prop_value = $worksheet->get_cell($row,$col)->value();
111 my $prop_cvterm = $schema->resultset("Cv::Cvterm")->create_with({
112 name => $prop_term,
113 cv => 'geolocation_property',
116 my $new_prop = $schema->resultset('NaturalDiversity::NdGeolocationprop')->create({
117 nd_geolocation_id => $new_row->nd_geolocation_id(),
118 type_id => $prop_cvterm->cvterm_id(),
119 value => $prop_value,
122 print STDERR "Stored: ".$prop_value." in nd_geolocationprop.\n";
127 print STDERR "Script Complete.\n";