From 2710ed1587bfc04e2344c9c6e6fa1f43e6a3f947 Mon Sep 17 00:00:00 2001 From: Lukas Mueller Date: Tue, 29 Oct 2024 17:06:12 -0400 Subject: [PATCH] add rename location script. --- bin/rename_locations.pl | 100 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 bin/rename_locations.pl diff --git a/bin/rename_locations.pl b/bin/rename_locations.pl new file mode 100644 index 0000000000..4de3024ea6 --- /dev/null +++ b/bin/rename_locations.pl @@ -0,0 +1,100 @@ +#!/usr/bin/perl + +=head1 + +rename_locations.pl - renaming locations in a cxgn database + +=head1 SYNOPSIS + + rename_locations.pl -H [dbhost] -D [dbname] -i [infile] + +=head1 COMMAND-LINE OPTIONS + ARGUMENTS + -H host name (required) e.g. "localhost" + -D database name (required) e.g. "cxgn_cassava" + -i path to infile (required) + +=head1 DESCRIPTION + +This script loads locations data into Chado, by adding data to nd_geolocation table. Infile is Excel .xls and .xlsx format. +Header is in this order: 'old name', 'new name' + +=head1 AUTHOR + + Lukas Mueller (lam87@cornell.edu) + + Based on a script by Nicolas Morales (nm529@cornell.edu) + +=cut + +use strict; + +use Getopt::Std; +use Data::Dumper; +use Carp qw /croak/ ; +use Pod::Usage; +use Spreadsheet::ParseExcel; +use Spreadsheet::ParseXLSX; +use Bio::Chado::Schema; +use CXGN::DB::InsertDBH; + +our ($opt_H, $opt_D, $opt_i); + +getopts('H:D:i:'); + +if (!$opt_H || !$opt_D || !$opt_i) { + pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file) \n"); +} + +my $dbhost = $opt_H; +my $dbname = $opt_D; + +# Match a dot, extension .xls / .xlsx +my ($extension) = $opt_i =~ /(\.[^.]+)$/; +my $parser; + +if ($extension eq '.xlsx') { + $parser = Spreadsheet::ParseXLSX->new(); +} +else { + $parser = Spreadsheet::ParseExcel->new(); +} + +my $excel_obj = $parser->parse($opt_i); + +my $dbh = CXGN::DB::InsertDBH->new({ + dbhost=>$dbhost, + dbname=>$dbname, + dbargs => {AutoCommit => 1, RaiseError => 1} +}); + +my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } ); +$dbh->do('SET search_path TO public,sgn'); + + +my $worksheet = ( $excel_obj->worksheets() )[0]; #support only one worksheet +my ( $row_min, $row_max ) = $worksheet->row_range(); +my ( $col_min, $col_max ) = $worksheet->col_range(); + +if ($col_max ne '2' || $worksheet->get_cell(0,0)->value() ne 'old_location_name' || $worksheet->get_cell(0,1)->value() ne 'new_location_name') { + pod2usage(-verbose => 2, -message => "Headers must be only in this order: old_location_name, new_location_name.\n"); +} + + +for my $row ( 1 .. $row_max ) { + + my $old_name = $worksheet->get_cell($row,0)->value(); + my $new_name = $worksheet->get_cell($row,1)->value(); + + my $row = $schema->resultset('NaturalDiversity::NdGeolocation')->find({ description => $old_name }); + if ($row) { + $row->description($new_name); + $row->update(); + print STDERR "Updated $old_name to $new_name\n"; + } + else { + print STDERR "Location $old_name was not found. Skipping.\n"; + } +} + +print STDERR "Script Complete.\n"; -- 2.11.4.GIT