make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / find_accessions_for_seedlots.pl
blob4fed4d03e176cd102c6a03fe7189c661a5f5168a
1 #!/usr/bin/perl
3 =head1
5 find_accessions_for_seedlots.pl - give the accession for each seedlot name
7 =head1 SYNOPSIS
9 find_accessions_for_seedlots.pl -H [dbhost] -D [dbname] -i [infile]
11 =head1 COMMAND-LINE OPTIONS
12 ARGUMENTS
13 -H host name (required) e.g. "localhost"
14 -D database name (required) e.g. "cxgn_cassava"
15 -i path to infile (required)
17 =head1 DESCRIPTION
21 =head1 AUTHOR
23 Lukas Mueller <lam87@cornell.edu>, based on a script by Nick Morales
25 =cut
27 use strict;
29 use Getopt::Std;
30 use Data::Dumper;
31 use Carp qw /croak/ ;
32 use Pod::Usage;
33 use Spreadsheet::ParseExcel;
34 use Spreadsheet::ParseXLSX;
35 use Bio::Chado::Schema;
36 use CXGN::DB::InsertDBH;
37 use SGN::Model::Cvterm;
38 use CXGN::Stock;
39 use CXGN::Stock::Seedlot;
41 our ($opt_H, $opt_D, $opt_i);
43 getopts('H:D:i:');
45 if (!$opt_H || !$opt_D || !$opt_i) {
46 pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file) \n");
49 my $dbhost = $opt_H;
50 my $dbname = $opt_D;
52 # Match a dot, extension .xls / .xlsx
53 my ($extension) = $opt_i =~ /(\.[^.]+)$/;
54 my $parser;
56 print STDERR "Figuring out file format... ($extension)...\n";
57 if ($extension eq '.xlsx') {
58 $parser = Spreadsheet::ParseXLSX->new();
60 else {
61 $parser = Spreadsheet::ParseExcel->new();
64 #print STDERR "Parsing file... (please wait...)\n";
66 #my $excel_obj = $parser->parse($opt_i);
68 print STDERR "Connecting to database $dbname on $dbhost ...\n";
70 my $dbh = CXGN::DB::InsertDBH->new({
71 dbhost=>$dbhost,
72 dbname=>$dbname,
73 dbargs => {AutoCommit => 1, RaiseError => 1}
74 });
76 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
77 $dbh->do('SET search_path TO public,sgn');
80 print STDERR "Parsing file...\n";
82 #my $worksheet = ( $excel_obj->worksheets() )[0]; #support only one worksheet
83 #my ( $row_min, $row_max ) = $worksheet->row_range();
84 #my ( $col_min, $col_max ) = $worksheet->col_range();
86 my $seedlot_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'seedlot', 'stock_type')->cvterm_id();
87 my $stock_type_id = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type')->cvterm_id();
89 print STDERR "Seedlot type id = $seedlot_type_id\n";
91 open(my $F, "<", $opt_i) || die "Can't open file $opt_i";
94 while (<$F>) {
96 chomp($_);
98 my ($empty, $seedlot, @row) = split /\t/, $_;
100 print STDERR "Reading row '$empty', $seedlot, and the rest is: ". join(",", @row)."\n";
102 my $seedlot_row = $schema->resultset("Stock::Stock")->find( { uniquename => $seedlot, type_id=> $seedlot_type_id });
104 my $name = "";
105 my $type = "";
107 if (! defined($seedlot_row)) {
108 $name = "[SEEDLOT NOT IN DB]";
109 print STDERR "ROW === ".Dumper($seedlot_row);
111 else {
113 print STDERR "FOUND SEEDLOT!\n";
115 my $seedlot = CXGN::Stock::Seedlot->new( schema => $schema, seedlot_id => $seedlot_row->stock_id() );
117 if (my $accession = $seedlot->accession()) {
118 $name = $accession->[1];
119 $type = "accession";
121 elsif (my $cross = $seedlot->cross()) {
122 $name = $cross->[1];
123 $type = "cross";
127 print join("\t", $name, $seedlot, @row)."\n";
130 close($F);
131 print STDERR "Script Complete.\n";