bin/load_fish_images_and_data.pl

   1 #!/usr/bin/env perl
   2
   3 # Standard stuff.
   4 use strict;
   5 use Data::Dumper;
   6 use File::Basename;
   7 use File::Glob;
   8 use Getopt::Std;
   9 use SGN::Image;
  10
  11 # Beth's stuff.
  12 use CXGN::DB::Connection;
  13
  14
  15 # Rob's stuff.
  16 use CXGN::Genomic::Clone;
  17 #use CXGN::Genomic::CloneNameParser;
  18 \f
  19 # Just a little CSV parser.  (In defense of this package's style and existence,
  20 # I wrote it on my third or fourth week here as an exercise in learning Perl,
  21 # and it has required no maintenance since it was written.  You're welcome to
  22 # curse me for not having found some crap in CPAN that does what this does.)
  23 package CSV;
  24
  25 use Exporter;
  26 our @ISA = qw(Exporter);
  27 our @EXPORT = qw(csv_line_to_list read_csv_record);
  28
  29 ### CSV parser.
  30 # There are three states in this parser, code where the state
  31 # changes occur can be visited by searching for $state assignments.
  32 use constant ex_quote => 0;
  33 use constant in_quote => 1;
  34 use constant quote_maybe_escape => 2;
  35
  36 # The following three functions are only used in csv_to_list.
  37 sub parse_error {
  38   my ($position, $errmsg, $line) = @_;
  39   my $errstr = "CSV parse error at position $position: $errmsg\n$line";
  40   my $fmtstr = "%" . ($position+1) . "s";
  41   $errstr = $errstr . sprintf ($fmtstr, "^");
  42   die ($errstr);
  43 }
  44
  45 # Note: this is factored out to make it easier to find all places in
  46 # the code where state is changed, but it doesn't actually do much.
  47 # In this and in the next sub, we use prototypes to make the call
  48 # sites cleaner, not to omit parentheses.
  49 sub change_state (\$$) {
  50   my ($statevarref, $state) = @_;
  51   $$statevarref = $state;
  52 }
  53
  54 sub collect_field (\@\$\$) {
  55   my ($accumulator_array_ref, $accumulator_scalar_ref, $stateref) = @_;
  56   push @$accumulator_array_ref, $$accumulator_scalar_ref;
  57   $$accumulator_scalar_ref = "";
  58   $$stateref = ex_quote;
  59 }
  60
  61 # Return an array of scalars consisting of strings scraped out of a
  62 # line that's been CSV encoded.
  63 sub csv_line_to_list {
  64   my ($csv_line, $separator_char, $quote_char) = @_;
  65   # A string for accumulating tokens.
  66   my $accstr = "";
  67   # An array for accumulating strings.
  68   my @accarr = ();
  69   # For useful error messages, the position in the line.
  70   my $pos = -1;
  71   # Parser state.  There are only 3 possible states: in_quote,  ex_quote,
  72   # and quote_maybe_escape.  The quote character is used both to terminate
  73   # quoted strings and to escape itself inside quoted strings.
  74   my $state = ex_quote;
  75   while ($csv_line =~ m/(.)/g) {
  76     my $char = $1;
  77     $pos++;
  78     # Note: parser states are numbers, and so == is the optimal
  79     # comparison operator.  If you change parser states to strings, you'll have
  80     # to change these to eq comparisons.
  81     if ($state == ex_quote) {
  82       if ($char eq $quote_char) {
  83         if ($accstr) { # we've accumulated some datum, and see a quote: bogus.
  84           parse_error ($pos, "quote character in unquoted datum", $csv_line);
  85         } else { # we're seeing a quote right after a separator.
  86           change_state ($state, in_quote);
  87         }
  88       } elsif ($char eq $separator_char) { # end of field
  89         collect_field (@accarr, $accstr, $state);
  90       } else {
  91         $accstr .= $char;
  92       }
  93     }
  94     elsif ($state == in_quote) {
  95       if ($char eq $quote_char) {
  96         change_state ($state, quote_maybe_escape);
  97       } else {
  98         $accstr .= $char;
  99       }
 100     }
 101     elsif ($state == quote_maybe_escape) {
 102       if ($char eq $quote_char) {
 103         $accstr .= $quote_char;
 104         change_state ($state, in_quote);
 105       } elsif ($char eq $separator_char) {
 106         collect_field (@accarr, $accstr, $state);
 107       } else { # anything other than a quote or separator after a quote is bogus
 108         parse_error ($pos, "garbage character after close quote", $csv_line);
 109       }
 110     }
 111     else {
 112       parse_error ($pos, "bug in csv parser, unknown state $state", $csv_line);
 113     }
 114   }
 115   # If in datum at end of line
 116   # FIXME: ",XXX\r\n"
 117   if (($accstr ne "") && ($accstr ne "\r")) {
 118     if ($state == in_quote) {
 119       parse_error ($pos, "end of line reached inside quoted datum", $csv_line);
 120     }
 121     push (@accarr, $accstr);
 122   }
 123   return @accarr;
 124 }
 125
 126 sub read_csv_record {
 127   my ($filehandle, $separator_char, $quote_char) = @_;
 128   my $line = <$filehandle>;
 129   chomp $line;
 130   return csv_line_to_list ($line, $separator_char, $quote_char);
 131 }
 132 \f
 133 use Exporter;
 134 package Schmenomic;
 135 our @ISA = qw(Exporter);
 136 our @EXPORT = qw(decompose_bac_name canonicalize_bac_name id_query_for_bac_name id_for_bac_name);
 137 # The existing genomic API and its implementation are needlessly
 138 # complex.
 139 # As of 2006/4/12, there are 3 libraries of clones, with the
 140 # following maxima for each of platenum, wellrow, and wellcol:
 141 #
 142 #  336 | P   |  24
 143 #  148 | P   |  24
 144 #  132 | P   |  24
 145 sub decompose_bac_name {
 146   my ($bac_name) = @_;
 147   unless ($bac_name) {
 148     print STDERR "no BAC name supplied";
 149     return (undef);
 150   }
 151   if ($bac_name =~ m/^([[:alpha:]_]+)?(\d{1,3})([[:alpha:]]{1})(\d{1,2})/) {
 152     my ($shortname, $platenum, $wellrow, $wellcol) = ($1, $2, $3, $4);
 153     return ([$shortname, $platenum, $wellrow, $wellcol]);
 154   } else {
 155 #    warn ()"Unparseable BAC name $bac_name.  If the BAC name is valid fix decompose_bac_name().\n");
 156     return (undef);
 157   }
 158 }
 159
 160 sub canonicalize_bac_name {
 161   my ($bac_name) = @_;
 162   my $decomposed_bac_name = decompose_bac_name ($bac_name);
 163   unless ($decomposed_bac_name) {
 164     return (undef);
 165   }
 166   my ($shortname, $platenum, $wellrow, $wellcol) = @$decomposed_bac_name;
 167   my $ret = sprintf "%s%0.3d%s%0.2d", $shortname, $platenum, uc($wellrow), $wellcol;
 168   return ($ret);
 169 }
 170
 171 sub id_query_for_bac_name {
 172   my ($bac_name, $optional_schema_name) = @_;
 173   my $canonical = canonicalize_bac_name ($bac_name);
 174   unless ($canonical) {
 175     return (undef);
 176   }
 177   my ($shortname, $platenum, $wellrow, $wellcol) = @{decompose_bac_name ($canonical)};
 178   unless ($shortname) {
 179     print STDERR "can't lookup BAC name $bac_name: no library shortname.\n";
 180     return (undef);
 181   }
 182   unless ($platenum) {
 183     print STDERR "can't lookup BAC name $bac_name: no plate number.\n";
 184     return (undef);
 185   }
 186   unless ($wellrow) {
 187     print STDERR "can't lookup BAC name $bac_name: no well row.\n";
 188     return (undef);
 189   }
 190   unless ($wellcol) {
 191     print STDERR "can't lookup BAC name $bac_name: no well column.\n";
 192     return (undef);
 193   }
 194   my $genomic;
 195   if ($optional_schema_name) {
 196     $genomic = $optional_schema_name;
 197   } else {
 198     $genomic = "genomic";
 199   }
 200   my $query = "SELECT clone_id
 201                  FROM $genomic.clone
 202                  JOIN $genomic.library USING (library_id)
 203                 WHERE shortname ILIKE '$shortname'
 204                   AND platenum = $platenum
 205                   AND wellrow ILIKE '$wellrow%'
 206                   AND wellcol = $wellcol";
 207   return ($query);
 208 }
 209
 210 sub id_for_bac_name {
 211   my ($dbh, $bac_name) = @_;;
 212   my $schema = $dbh->qualify_schema ("genomic");
 213   my $query = id_query_for_bac_name ($bac_name, $schema);
 214   unless ($query) {
 215     return (undef);
 216   }
 217   my $result = $dbh->selectall_arrayref ($query);
 218   return ($result->[0][0]);
 219 }
 220 \f
 221 package main;
 222 ## Globals.
 223 #
 224 # The body of the program below turns rows of a spreadsheet into hash tables
 225 # whose keys are the chromosome number, the chromosome arm, the BAC ID,
 226 # the experimenter's name for the experiment, and the distance from the
 227 # centromere as a percentage of the arm length.  Unsurprisingly, these 5
 228 # fields plus a couple of constants external to the spreadsheet are the
 229 # significant columns in the fish_result table in the database.
 230 #
 231 # So here is the main query to be performed.  We'll run this for each
 232 # row of the spreadsheet. Note that in order to hike this string up
 233 # here before any argument processing, we've had to escape the two
 234 # constants; we'll eval the string before using it.
 235 our $result_insert_query =
 236   "INSERT INTO fish_result
 237           (chromo_num, chromo_arm,
 238            experiment_name, percent_from_centromere,
 239            clone_id, fish_experimenter_id, map_id)
 240           SELECT ?, ?, ?, ?, ?,
 241                  (SELECT fish_experimenter_id
 242                     FROM fish_experimenter
 243                    WHERE fish_experimenter_name = '%s'),
 244                  (SELECT map_id
 245                     FROM map
 246                    WHERE short_name = '%s')";
 247 # The names of the fields we need to supply as bind parameters to the
 248 # above query.  Make sure the order of field names match up.
 249 our @result_insert_fields = ("chromo_num", "chromo_arm",
 250                                "experiment_name", "percent_from_centromere",
 251                                "clone_id");
 252
 253
 254
 255
 256
 257 # TODO
 258 # A query for inserting a filename into the fish_file table.  FISH results
 259 # are uniquely identified by the experimenter, their experiment name, and
 260 # the clone_id so the linkage is pretty simple.
 261 our $file_insert_query =
 262   "INSERT INTO fish_file (filename, fish_result_id)
 263           SELECT ?, (SELECT fish_result_id
 264                        FROM fish_result
 265                NATURAL JOIN fish_experimenter
 266                       WHERE fish_experimenter_name = '%s'
 267                         AND experiment_name = ?
 268                         AND clone_id = ?)";
 269 #/TODO
 270
 271
 272 # Next, because we don't really trust the submitters to maintain the same
 273 # formatting of their spreadsheet (preserving column ordering, mostly), or
 274 # to use the same file layouts from submission to submission, this program
 275 # doesn't expect any specific spreadsheet structure or file layout, except
 276 # that the spreadsheet must be tabular and the files associated with a row
 277 # in the spreadsheet must be describable by a Unix glob.
 278 #
 279 # We use a format-stringy notation for both describing spreadsheet
 280 # structure and constructing filenames for each spreadsheet row.
 281 # The mapping of format codes to programmer-friendly keys is as follows:
 282 my %formats = (
 283                "a" => "chromo_arm",
 284                "b" => "bac",
 285                "c" => "chromo_num",
 286                "e" => "experiment_name",
 287                "p" => "percent_from_centromere",
 288               );
 289
 290 # The default ordering of columns in the spreadsheets we read.  Overridable
 291 # with -f.
 292 our $default_read_format = "%b%e%c%-%-%a%p";
 293
 294 # The default file name glob whose expansion names all files associated
 295 # with a given row in the spreadsheet.  Overridable with -d.
 296 our $default_file_glob = "Tomato_%c%a/BAC_%b/Photo_ID_%e/%e*";
 297
 298 # The name in table fish_experimenter of the FISH experimenter.  Overridable
 299 # with -e.
 300 our $default_experimenter_name = "fish_stack";
 301 # The name in table maps of the FISH map.  Overridable with -m.
 302 our $default_map_name = "Tomato FISH map";
 303 # The number of files expected to be found for each row of spreadsheet
 304 # data.  Overridable with -E
 305 our $default_extfiles_per_experiment = 4;
 306 \f
 307 ## Process command line arguments.
 308 our %opts;
 309 getopts ("d:e:E:hlm:f:q:s:t", \%opts);
 310
 311 # Help message.
 312 if ($opts{h}) {
 313     print_usage_and_quit(0);
 314 }
 315 # Make a DB connection.  We do this before processing further
 316 # arguments so that any options that require querying the
 317 # database can assume $dbh is set.
 318 our $dbh = CXGN::DB::Connection->new;
 319 $dbh->ping or die ("bogus database handle.");
 320 unless ($dbh) {
 321   die ("Can't connect to database.");
 322 }
 323 $dbh->dbh_param(PrintError=>0);
 324
 325 # Display the names of known FISH experimenters
 326 if ($opts{l}) {
 327     print_fish_experimenters_and_quit(0);
 328 }
 329 # File glob format
 330 our $file_glob = $default_file_glob;
 331 if ($opts{d}) {
 332     $file_glob = $opts{d};
 333 }
 334 # Experimenter name
 335 our $experimenter_name = $default_experimenter_name;
 336 if ($opts{e}) {
 337     $experimenter_name = $opts{e};
 338 }
 339 # External files per experiment
 340 our $extfiles_per_experiment = $default_extfiles_per_experiment;
 341 if ($opts{E}) {
 342   $extfiles_per_experiment = $opts{E};
 343 }
 344 # Spreadsheet column "format" (it's actually parsed by a CSV routine;
 345 # these are just the ordering of the columns)
 346 our $read_format = $default_read_format;
 347 if ($opts{f}) {
 348   $read_format = $opts{f};
 349 }
 350 # Map name.
 351 our $map_name = $default_map_name;
 352 if ($opts{m}) {
 353   $map_name = $opts{m};
 354 }
 355 # Parameters to the CSV parser.
 356 our $quote = "\"";
 357 if (defined($opts{q})) {
 358   $quote = $opts{q};
 359 }
 360 our $separator = ",";
 361 if (defined($opts{s})) {
 362   $separator = $opts{s};
 363 }
 364 # Required arguments: a directory and some filenames
 365 if (@ARGV < 2) {
 366   print "$0: too few arguments.\n";
 367   print_usage_and_quit (1);
 368 }
 369 our $directory = shift;
 370 our @files = @ARGV;
 371 \f
 372 # We're done processing arguments.  Now we construct a few structures
 373 # that will be constant through the rest of the program.
 374
 375 # Construct a list whose elements are the names of the columns
 376 # in the spreadsheet, or undef if we don't care about those columns.
 377 my @fields = @{reckon_fields ($read_format)};
 378
 379 # Create a function that takes a hash representing a row in a spreadsheet
 380 # and returns all files found for that row.
 381 my $find_files = make_file_finder ($directory."/".$file_glob);
 382
 383 # Create a function that inserts the FISH data hash into the fish_result table.
 384 my $result_inserter = make_inserter ($dbh, "fish_result", sprintf ($result_insert_query, $experimenter_name, $map_name));
 385 # Create a function that inserts filenames into the fish_file table for
 386
 387
 388
 389
 390
 391 # TODO
 392 # a given FISH data hash.
 393 my $file_inserter = make_inserter ($dbh, "fish_file", sprintf ($file_insert_query, $experimenter_name));
 394 #/TODO
 395
 396
 397
 398 \f
 399 # The main event.
 400 eval {
 401   foreach my $file (@files) {
 402     printf "Processing data file $file...\n";
 403     open (my $fh, "<$file") || die ("$0: failed to open spreadsheet $file.");
 404     count("spreadsheet");
 405   RECORD:  while (my @record = CSV::read_csv_record ($fh, $separator, $quote)) {
 406       count("line");
 407       unless (@record >= @fields) { # Too few records
 408         #print STDERR @record+0, join("\t", @record);
 409         skip("Record has " . (@record) . " fields, not " . (@fields) . " fields.");
 410         next RECORD;
 411       }
 412
 413       my %fish_params;
 414       for (my $i = 0; $i < @fields; $i++) {
 415         my $value = $record[$i];
 416         my $fieldname = $fields[$i];
 417         if ($fieldname) { # we don't care about undef fieldnames
 418           # Trim the value, and stash it in %fish_params.
 419           $value =~ s/(^\s|\s$)//g;
 420           $fish_params{$fieldname} = $value;
 421         }
 422       }
 423       # Do some cleanup/error checking on the data.
 424       # cleanup_fish_data is expected to return a string
 425       # only if there's something wrong with the data.
 426       my $invalidity = cleanup_fish_data (\%fish_params);
 427       if ($invalidity) {
 428         skip ($invalidity);
 429         next RECORD;
 430       }
 431
 432       # If we've got this far, we have all we need to start inserting.
 433       my @params = @fish_params{@result_insert_fields};
 434       # We'll make a savepoint before each spreadsheet row,
 435       # so that (1) if the row has already been inserted, then
 436       # we let the database generate the error, rollback that row
 437       # and proceed; (2) if the row inserts but has no
 438       # corresponding external files, then we rollback that row
 439       # and proceed.
 440       my $saveptnm = name_savepoint();
 441       $dbh->pg_savepoint($saveptnm);
 442       eval {
 443         $result_inserter->(@params);
 444       };
 445       if ($@) { # The row didn't insert.
 446         # The only acceptable reason why this could occur is a violated UNIQUE
 447         # constraint (which we expect many of).
 448         # I started writing a module for mapping error codes to readable strings,
 449         # but keeping that sort of thing in sync with future database releases
 450         # is not worthwhile, at least given how little use is made of the
 451         # error code.  The error codes themselves are reputedly standardized
 452         # and therefore in principle stable.
 453         if ($dbh->state eq "23505") {
 454           skip ("Failed to insert row for $experimenter_name, $fish_params{bac}, $fish_params{experiment_name} (already in database).");
 455         } else {
 456           die ("Unexpected database insert error $@");
 457         }
 458         $dbh->pg_rollback_to($saveptnm);
 459         next RECORD;
 460       } else { # Row inserted, now do the external files.
 461         # Find any files in this upload associated with this row.
 462         my @extfiles = @{$find_files->(\%fish_params)};
 463         # FIXME: provide some way of allowing the number
 464         # of extfiles to vary.  But only bother to do this
 465         # in case some submitter really needs this to be the case.
 466         unless (@extfiles == $extfiles_per_experiment) {
 467           warn "Found ".@extfiles." files for $fish_params{bac} / $fish_params{experiment_name}.  Skipping.\n";
 468           $dbh->pg_rollback_to($saveptnm);
 469           next RECORD;
 470         }
 471         if (@extfiles) {
 472             count ("row");
 473             foreach my $filename (@extfiles) {
 474                 # XXX: fixme: make this pattern settable by command-line argument.
 475                 unless ($filename =~ m/(Thumbs.db|xls|xlsx)$/i) {
 476 #                   $file_inserter->(File::Basename::basename($filename), File::Basename::basename($fish_params{experiment_name}), $fish_params{clone_id});
 477                     my $image = SGN::Image->new($dbh);
 478                     my ($fish_result_id) = $dbh->selectrow_array(
 479                         <<'',
 480                      SELECT fish_result_id
 481                        FROM fish_result
 482                NATURAL JOIN fish_experimenter
 483                       WHERE fish_experimenter_name = ?
 484                         AND experiment_name = ?
 485                         AND clone_id = ?
 486
 487                         undef,
 488                         $experimenter_name,
 489                         File::Basename::basename($fish_params{experiment_name}),
 490                         $fish_params{clone_id}
 491                        );
 492
 493                     #print STDERR "$fish_result_id\n";
 494                     my $return_value = $image->process_image("$filename", "fish",$fish_result_id,0);
 495                     unless ($return_value > 0) { die "failed to process image: $!\n"; }
 496                     $image->set_description("$experimenter_name");
 497                     $image->set_sp_person_id(233);
 498                     $image->set_obsolete("f");
 499                     $image->store();
 500                     count ("extfile");
 501                 }
 502             }
 503         } else { # No external files found.
 504           skip ("No files found for row $fish_params{experiment_name}.", 1);
 505           $dbh->pg_rollback_to ($saveptnm);
 506           next RECORD;
 507         }
 508       }
 509       # If we got here, the row and its files loaded.
 510 #      $dbh->pg_release ($saveptnm);
 511     }
 512     close ($fh);
 513   }
 514   # Number of lines, minus the first line of each spreadsheet.
 515   my $total_lines = check("line");
 516   my $possible_files = $total_lines * 4;
 517   print "
 518 LOAD REPORT FOR RUN:
 519 ========================================================
 520 Processed ".check("spreadsheet")." spreadsheets.
 521
 522 \tRows\tFiles
 523 Seen\t$total_lines\t$possible_files (expected)
 524 Loaded\t".check("row")."\t".check("extfile")."
 525 Skipped\t".check("skip")."
 526
 527 Expected to skip ".check("spreadsheet")." lines.
 528
 529 ";
 530
 531 $dbh->commit;
 532 };
 533 if ($@) {
 534   print "Some sort of unhandled error in transaction.\n";
 535   print $@;
 536   $dbh->rollback;
 537   exit(1);
 538 }
 539 exit (0);
 540 \f
 541 # Helper functions, etc.
 542 sub print_fish_experimenters_and_quit {
 543     my ($exitcode) = @_;
 544     print "FISH Experimenters:\n";
 545     print "-------------------\n";
 546     my $schema = $dbh->qualify_schema('sgn');
 547     my $q = "SELECT fish_experimenter_name FROM $schema.fish_experimenter";
 548     my $result = $dbh->selectcol_arrayref($q);
 549     foreach my $experimenter (@$result) {
 550         print $experimenter."\n";
 551     }
 552     exit ($exitcode);
 553 }
 554
 555 sub print_usage_and_quit {
 556     my ($exitcode) = @_;
 557     print "Usage: $0 [OPTIONS] DIR FILES
 558
 559 Load FISH data from FILES, which must be .csv files.  All files
 560 associated with the experiment must be found under DIR.
 561 Options:
 562
 563 -d FORMAT       When looking for files associated with a given experiment,
 564                 look in a directory designated by DIR/<format glob>, with
 565                 these format specifiers
 566
 567                 %a  --  Chromosome arm
 568                 %b  --  BAC ID (DDDADD notation)
 569                 %c  --  Chromosome number
 570                 %e  --  Experimenter's experiment ID
 571                 %p  --  Percentage distance from centromere
 572                 %%  --  Literal percent sign
 573
 574                 The default format is '$default_file_glob'.
 575 -e EXPERIMENTER Experimenter name (default '$default_experimenter_name').
 576 -E COUNT        Expect COUNT external files per experiment (default $default_extfiles_per_experiment).
 577 -f FORMAT       Parse the CSV file with each record's fields in order
 578                 specified by FORMAT.  Valid format specifiers are:
 579
 580                 %a  --  Chromosome arm
 581                 %b  --  BAC ID (DDDADD notation)
 582                 %c  --  Chromosome number
 583                 %e  --  Experimenter's experiment ID
 584                 %p  --  Percentage distance from centromere
 585                 %-  --  Some field we don't care about
 586
 587                 The default format is '$default_read_format'.
 588 -h              Print this message.
 589 -l              List known FISH experimenters.
 590 -m MAP_NAME     Map name (default '$default_map_name').
 591 -q QUOTE        Use QUOTE as the field quote character (default \")
 592 -s SEPARATOR    Use SEPARATOR as the field separator (default ,)
 593 ";
 594   exit ($exitcode);
 595 }
 596
 597 # Turn the read format into an ordered list of field names.
 598 sub reckon_fields {
 599   my ($format) = @_;
 600   my @fields = ();
 601   my $counter = 0;
 602   foreach my $format_char (split "%(?!%)", $format) {
 603     if ($format_char eq "") { # empty string at beginning of format
 604       next;
 605     }
 606     if ($format_char eq "-") { # "ignore this field" char
 607       $fields[$counter++] = undef;
 608     } else {
 609       if (grep { $formats{$format_char} eq $_ } @fields) {
 610         die ("$0: $format_char appears more than once in $read_format");
 611       }
 612       $fields[$counter++] = $formats{$format_char};
 613     }
 614   }
 615   return (\@fields);
 616 }
 617
 618 # I get the feeling that you won't like this part of the program.  I'm
 619 # sorry about that.  Here's the idea: given a row in the input spreadsheet,
 620 # we need to find those files that are related to the row.  We only have
 621 # a vague idea about what they'll be sending us (a few images, and maybe a
 622 # spreadsheet per row), and don't really trust submitters to use the same
 623 # directory layout consistently, so it seemed reasonable to use globs
 624 # to describe the set of files associated with a row in the spreadsheet.
 625 # So, e.g., the default glob for the Stack group's uploads is this:
 626 #
 627 # Tomato_<chromo_num><chromo_arm>/BAC_<bac_id>/Photo_ID_<experiment_name>/<experiment_name>*";
 628 #
 629 # But since this is cumbersome to type, we offer a format-string notation
 630 # for the operator, by which we can write the glob above as follows:
 631 #
 632 # Tomato_%c%a/BAC_%b/Photo_ID_%e/%e*
 633 #
 634 # Here is a routine that takes a format string and returns a
 635 # function that takes a hash whose keys are the fields in the
 636 # format structure and returns a reference to an array of the
 637 # file names.  So the usage will be:
 638 #
 639 # my $find_files = make_file_finder ($globfmt);
 640 # my %fish_hash = { chromo_num => 2, chromo_arm = 'P', ... }
 641 # my $files = $find_files->(\%fish_hash);
 642 #
 643 # Now @$files will be the list of files associated with the experiment.
 644 sub make_file_finder {
 645   my ($format) = @_;
 646   my $globfmt = "";
 647   my @keys = ();
 648   # Here we turn our format string into an sprintf format string,
 649   # while also collecting the order of the format codes, so that
 650   # we can turn a filled-in hash of FISH parameters into a list
 651   # of arguments to be formatted.
 652   while ($format =~ m/\G(.)/g) {
 653     my $char = $1;
 654     if ($char eq "%") {
 655       $format =~ m/\G(.)/gc;
 656       my $nextchar = $1;
 657       if ($nextchar eq "%" ) {
 658         $globfmt .= "%";
 659       } else {
 660         $globfmt .= "%s";
 661         push @keys, $formats{$nextchar};
 662       }
 663     } else {
 664       $globfmt .= $char;
 665     }
 666   }
 667   return sub {
 668     my ($hashref) = @_;
 669     my $glob = sprintf $globfmt, map { $$hashref{$_} || ""; } @keys;
 670     #print STDERR "find glob: $glob\n";
 671     my @files = File::Glob::bsd_glob($glob);
 672     return (\@files);
 673   }
 674 }
 675 \f
 676 # Given a dbh, a table name, and a query, prepare the query in the db that
 677 # the dbh connects to, and return a function that executes the prepared query
 678 # with whatever arguments are passed to it.  The point here is to provide
 679 # a lightweight way to wrap statement handle execute() calls, e.g., to print
 680 # out dbh properties at the time the statement handle is executed, etc.
 681 # At present, we don't use the table name.
 682 sub make_inserter {
 683   my ($dbh, $table, $query) = @_;
 684   my $st = $dbh->prepare($query);
 685   sub {
 686     eval {
 687       $st->execute(@_);
 688     };
 689     if ($@) {
 690       die ("$@ with arguments: " . (join ", ", @_));
 691     }
 692   }
 693 }
 694
 695 # This is constructor that produces a new string every time it's
 696 # called.  This ensures that we never reuse the same savepoint name
 697 # twice.
 698 {
 699   my $savepointnum = 1;
 700   sub name_savepoint {
 701     return ("savept".$savepointnum++);
 702   }
 703 }
 704
 705 # Some dinky counters for doing checksums.
 706 {
 707   my $spreadsheet_count = 0;
 708   my $line_count = 0;
 709   my $loaded_row_count = 0;
 710   my $skipped_rows = 0;
 711   my $extfile_count = 0;
 712   sub count {
 713     ($_) = @_;
 714     /^spreadsheet$/ && do { $spreadsheet_count++; };
 715     /^line$/        && do { $line_count++; };
 716     /^row$/         && do { $loaded_row_count++; };
 717     /^skip$/        && do { $skipped_rows++; };
 718     /^extfile$/     && do { $extfile_count++; };
 719   }
 720   sub check {
 721     ($_) = @_;
 722     /^spreadsheet$/ && do { return($spreadsheet_count); };
 723     /^line$/        && do { return($line_count); };
 724     /^row$/         && do { return($loaded_row_count); };
 725     /^skip$/        && do { return($skipped_rows); };
 726     /^extfile$/     && do { return($extfile_count); };
 727   }
 728 }
 729 sub skip {
 730   my ($msg, $serious) = @_;
 731   if ($serious) {
 732     print STDERR $msg." Skipping record.\n";
 733   }
 734   count ("skip");
 735 }
 736
 737 # Tidy up the data in a row.  Return something only if the data is bogus.
 738 sub cleanup_fish_data {
 739   my ($fish_row) = @_;
 740
 741   foreach my $fieldname (keys (%$fish_row)) {
 742     # None of the ersatz case statement equivalents in the Camel book looked
 743     # less opaque to me than the straightforward if/elsif*/else construct.
 744     # Note that the fieldnames are set up by this program, and so can't
 745     # fall off this statement.
 746     if ($fieldname eq "bac") {
 747       my $bac = $fish_row->{bac};
 748       # We need to turn BAC names into clone_ids from the genomic db.
 749       my $clone;
 750       find_clone: for my $lib ("", "LE_HBA", "SL_MboI") {
 751           $clone = CXGN::Genomic::Clone->retrieve_from_clone_name("$lib$bac");
 752           if ($clone) {
 753               last find_clone;
 754           }
 755       }
 756 #id_for_bac_name($dbh, "LE_HBA$bac");
 757       if ($clone) {
 758 #         print STDERR $clone->clone_name."\n";
 759         $fish_row->{clone_id} = $clone->clone_id();
 760       } else {
 761         return ("Ostensible BAC name '$bac' is either unparseable or not found in database.");
 762       }
 763     } elsif ($fieldname eq "experiment_name") {
 764       ; # There's nothing to validate for experiment_names at present.
 765     } elsif ($fieldname eq "chromo_num") {
 766       ; # We can't do much with chromo nums (we don't know what
 767         # species we're looking at).
 768     } elsif ($fieldname eq "chromo_arm") {
 769       # We canonicalize the chromo arm:
 770       if ($fish_row->{chromo_arm} =~ m/[ps]/i ) {
 771         $fish_row->{chromo_arm} = "P";
 772       } elsif ($fish_row->{chromo_arm} =~ m/[ql]/i ) {
 773         $fish_row->{chromo_arm} = "Q";
 774       } else {
 775         return ("$fish_row->{chromo_arm} doesn't look like a chromosome arm identifier.");
 776       }
 777     } elsif ($fieldname eq "percent_from_centromere") {
 778       # Percentage distance from the centromere.  If this is given
 779       # as an integer, normalize it. otherwise.
 780       my $percent_dist = $fish_row->{percent_from_centromere};
 781       if (($percent_dist >= 0.0) && ($percent_dist <= 100.0)) {
 782         $percent_dist = $percent_dist/100;
 783       }
 784       if (($percent_dist > 1.0) || ($percent_dist < 0.0)) {
 785         return ("$percent_dist doesn't look like a percentage.");
 786       }
 787       $fish_row->{percent_from_centromere} = $percent_dist;
 788     }
 789   }
 790   # If we got here, then we don't return anything for
 791   # the caller to report.
 792   return (undef);
 793 }