minor tweaks.
[sgn.git] / bin / load_fish_images_and_data.pl
blobbe08a45c7b269e87aa5a470a7e9244cb82d7526d
1 #!/usr/bin/env perl
3 # Standard stuff.
4 use strict;
5 use Data::Dumper;
6 use File::Basename;
7 use File::Glob;
8 use Getopt::Std;
9 use SGN::Image;
11 # Beth's stuff.
12 use CXGN::DB::Connection;
15 # Rob's stuff.
16 use CXGN::Genomic::Clone;
17 #use CXGN::Genomic::CloneNameParser;
19 # Just a little CSV parser. (In defense of this package's style and existence,
20 # I wrote it on my third or fourth week here as an exercise in learning Perl,
21 # and it has required no maintenance since it was written. You're welcome to
22 # curse me for not having found some crap in CPAN that does what this does.)
23 package CSV;
25 use Exporter;
26 our @ISA = qw(Exporter);
27 our @EXPORT = qw(csv_line_to_list read_csv_record);
29 ### CSV parser.
30 # There are three states in this parser, code where the state
31 # changes occur can be visited by searching for $state assignments.
32 use constant ex_quote => 0;
33 use constant in_quote => 1;
34 use constant quote_maybe_escape => 2;
36 # The following three functions are only used in csv_to_list.
37 sub parse_error {
38 my ($position, $errmsg, $line) = @_;
39 my $errstr = "CSV parse error at position $position: $errmsg\n$line";
40 my $fmtstr = "%" . ($position+1) . "s";
41 $errstr = $errstr . sprintf ($fmtstr, "^");
42 die ($errstr);
45 # Note: this is factored out to make it easier to find all places in
46 # the code where state is changed, but it doesn't actually do much.
47 # In this and in the next sub, we use prototypes to make the call
48 # sites cleaner, not to omit parentheses.
49 sub change_state (\$$) {
50 my ($statevarref, $state) = @_;
51 $$statevarref = $state;
54 sub collect_field (\@\$\$) {
55 my ($accumulator_array_ref, $accumulator_scalar_ref, $stateref) = @_;
56 push @$accumulator_array_ref, $$accumulator_scalar_ref;
57 $$accumulator_scalar_ref = "";
58 $$stateref = ex_quote;
61 # Return an array of scalars consisting of strings scraped out of a
62 # line that's been CSV encoded.
63 sub csv_line_to_list {
64 my ($csv_line, $separator_char, $quote_char) = @_;
65 # A string for accumulating tokens.
66 my $accstr = "";
67 # An array for accumulating strings.
68 my @accarr = ();
69 # For useful error messages, the position in the line.
70 my $pos = -1;
71 # Parser state. There are only 3 possible states: in_quote, ex_quote,
72 # and quote_maybe_escape. The quote character is used both to terminate
73 # quoted strings and to escape itself inside quoted strings.
74 my $state = ex_quote;
75 while ($csv_line =~ m/(.)/g) {
76 my $char = $1;
77 $pos++;
78 # Note: parser states are numbers, and so == is the optimal
79 # comparison operator. If you change parser states to strings, you'll have
80 # to change these to eq comparisons.
81 if ($state == ex_quote) {
82 if ($char eq $quote_char) {
83 if ($accstr) { # we've accumulated some datum, and see a quote: bogus.
84 parse_error ($pos, "quote character in unquoted datum", $csv_line);
85 } else { # we're seeing a quote right after a separator.
86 change_state ($state, in_quote);
88 } elsif ($char eq $separator_char) { # end of field
89 collect_field (@accarr, $accstr, $state);
90 } else {
91 $accstr .= $char;
94 elsif ($state == in_quote) {
95 if ($char eq $quote_char) {
96 change_state ($state, quote_maybe_escape);
97 } else {
98 $accstr .= $char;
101 elsif ($state == quote_maybe_escape) {
102 if ($char eq $quote_char) {
103 $accstr .= $quote_char;
104 change_state ($state, in_quote);
105 } elsif ($char eq $separator_char) {
106 collect_field (@accarr, $accstr, $state);
107 } else { # anything other than a quote or separator after a quote is bogus
108 parse_error ($pos, "garbage character after close quote", $csv_line);
111 else {
112 parse_error ($pos, "bug in csv parser, unknown state $state", $csv_line);
115 # If in datum at end of line
116 # FIXME: ",XXX\r\n"
117 if (($accstr ne "") && ($accstr ne "\r")) {
118 if ($state == in_quote) {
119 parse_error ($pos, "end of line reached inside quoted datum", $csv_line);
121 push (@accarr, $accstr);
123 return @accarr;
126 sub read_csv_record {
127 my ($filehandle, $separator_char, $quote_char) = @_;
128 my $line = <$filehandle>;
129 chomp $line;
130 return csv_line_to_list ($line, $separator_char, $quote_char);
133 use Exporter;
134 package Schmenomic;
135 our @ISA = qw(Exporter);
136 our @EXPORT = qw(decompose_bac_name canonicalize_bac_name id_query_for_bac_name id_for_bac_name);
137 # The existing genomic API and its implementation are needlessly
138 # complex.
139 # As of 2006/4/12, there are 3 libraries of clones, with the
140 # following maxima for each of platenum, wellrow, and wellcol:
142 # 336 | P | 24
143 # 148 | P | 24
144 # 132 | P | 24
145 sub decompose_bac_name {
146 my ($bac_name) = @_;
147 unless ($bac_name) {
148 print STDERR "no BAC name supplied";
149 return (undef);
151 if ($bac_name =~ m/^([[:alpha:]_]+)?(\d{1,3})([[:alpha:]]{1})(\d{1,2})/) {
152 my ($shortname, $platenum, $wellrow, $wellcol) = ($1, $2, $3, $4);
153 return ([$shortname, $platenum, $wellrow, $wellcol]);
154 } else {
155 # warn ()"Unparseable BAC name $bac_name. If the BAC name is valid fix decompose_bac_name().\n");
156 return (undef);
160 sub canonicalize_bac_name {
161 my ($bac_name) = @_;
162 my $decomposed_bac_name = decompose_bac_name ($bac_name);
163 unless ($decomposed_bac_name) {
164 return (undef);
166 my ($shortname, $platenum, $wellrow, $wellcol) = @$decomposed_bac_name;
167 my $ret = sprintf "%s%0.3d%s%0.2d", $shortname, $platenum, uc($wellrow), $wellcol;
168 return ($ret);
171 sub id_query_for_bac_name {
172 my ($bac_name, $optional_schema_name) = @_;
173 my $canonical = canonicalize_bac_name ($bac_name);
174 unless ($canonical) {
175 return (undef);
177 my ($shortname, $platenum, $wellrow, $wellcol) = @{decompose_bac_name ($canonical)};
178 unless ($shortname) {
179 print STDERR "can't lookup BAC name $bac_name: no library shortname.\n";
180 return (undef);
182 unless ($platenum) {
183 print STDERR "can't lookup BAC name $bac_name: no plate number.\n";
184 return (undef);
186 unless ($wellrow) {
187 print STDERR "can't lookup BAC name $bac_name: no well row.\n";
188 return (undef);
190 unless ($wellcol) {
191 print STDERR "can't lookup BAC name $bac_name: no well column.\n";
192 return (undef);
194 my $genomic;
195 if ($optional_schema_name) {
196 $genomic = $optional_schema_name;
197 } else {
198 $genomic = "genomic";
200 my $query = "SELECT clone_id
201 FROM $genomic.clone
202 JOIN $genomic.library USING (library_id)
203 WHERE shortname ILIKE '$shortname'
204 AND platenum = $platenum
205 AND wellrow ILIKE '$wellrow%'
206 AND wellcol = $wellcol";
207 return ($query);
210 sub id_for_bac_name {
211 my ($dbh, $bac_name) = @_;;
212 my $schema = $dbh->qualify_schema ("genomic");
213 my $query = id_query_for_bac_name ($bac_name, $schema);
214 unless ($query) {
215 return (undef);
217 my $result = $dbh->selectall_arrayref ($query);
218 return ($result->[0][0]);
221 package main;
222 ## Globals.
224 # The body of the program below turns rows of a spreadsheet into hash tables
225 # whose keys are the chromosome number, the chromosome arm, the BAC ID,
226 # the experimenter's name for the experiment, and the distance from the
227 # centromere as a percentage of the arm length. Unsurprisingly, these 5
228 # fields plus a couple of constants external to the spreadsheet are the
229 # significant columns in the fish_result table in the database.
231 # So here is the main query to be performed. We'll run this for each
232 # row of the spreadsheet. Note that in order to hike this string up
233 # here before any argument processing, we've had to escape the two
234 # constants; we'll eval the string before using it.
235 our $result_insert_query =
236 "INSERT INTO fish_result
237 (chromo_num, chromo_arm,
238 experiment_name, percent_from_centromere,
239 clone_id, fish_experimenter_id, map_id)
240 SELECT ?, ?, ?, ?, ?,
241 (SELECT fish_experimenter_id
242 FROM fish_experimenter
243 WHERE fish_experimenter_name = '%s'),
244 (SELECT map_id
245 FROM map
246 WHERE short_name = '%s')";
247 # The names of the fields we need to supply as bind parameters to the
248 # above query. Make sure the order of field names match up.
249 our @result_insert_fields = ("chromo_num", "chromo_arm",
250 "experiment_name", "percent_from_centromere",
251 "clone_id");
257 # TODO
258 # A query for inserting a filename into the fish_file table. FISH results
259 # are uniquely identified by the experimenter, their experiment name, and
260 # the clone_id so the linkage is pretty simple.
261 our $file_insert_query =
262 "INSERT INTO fish_file (filename, fish_result_id)
263 SELECT ?, (SELECT fish_result_id
264 FROM fish_result
265 NATURAL JOIN fish_experimenter
266 WHERE fish_experimenter_name = '%s'
267 AND experiment_name = ?
268 AND clone_id = ?)";
269 #/TODO
272 # Next, because we don't really trust the submitters to maintain the same
273 # formatting of their spreadsheet (preserving column ordering, mostly), or
274 # to use the same file layouts from submission to submission, this program
275 # doesn't expect any specific spreadsheet structure or file layout, except
276 # that the spreadsheet must be tabular and the files associated with a row
277 # in the spreadsheet must be describable by a Unix glob.
279 # We use a format-stringy notation for both describing spreadsheet
280 # structure and constructing filenames for each spreadsheet row.
281 # The mapping of format codes to programmer-friendly keys is as follows:
282 my %formats = (
283 "a" => "chromo_arm",
284 "b" => "bac",
285 "c" => "chromo_num",
286 "e" => "experiment_name",
287 "p" => "percent_from_centromere",
290 # The default ordering of columns in the spreadsheets we read. Overridable
291 # with -f.
292 our $default_read_format = "%b%e%c%-%-%a%p";
294 # The default file name glob whose expansion names all files associated
295 # with a given row in the spreadsheet. Overridable with -d.
296 our $default_file_glob = "Tomato_%c%a/BAC_%b/Photo_ID_%e/%e*";
298 # The name in table fish_experimenter of the FISH experimenter. Overridable
299 # with -e.
300 our $default_experimenter_name = "fish_stack";
301 # The name in table maps of the FISH map. Overridable with -m.
302 our $default_map_name = "Tomato FISH map";
303 # The number of files expected to be found for each row of spreadsheet
304 # data. Overridable with -E
305 our $default_extfiles_per_experiment = 4;
307 ## Process command line arguments.
308 our %opts;
309 getopts ("d:e:E:hlm:f:q:s:t", \%opts);
311 # Help message.
312 if ($opts{h}) {
313 print_usage_and_quit(0);
315 # Make a DB connection. We do this before processing further
316 # arguments so that any options that require querying the
317 # database can assume $dbh is set.
318 our $dbh = CXGN::DB::Connection->new;
319 $dbh->ping or die ("bogus database handle.");
320 unless ($dbh) {
321 die ("Can't connect to database.");
323 $dbh->dbh_param(PrintError=>0);
325 # Display the names of known FISH experimenters
326 if ($opts{l}) {
327 print_fish_experimenters_and_quit(0);
329 # File glob format
330 our $file_glob = $default_file_glob;
331 if ($opts{d}) {
332 $file_glob = $opts{d};
334 # Experimenter name
335 our $experimenter_name = $default_experimenter_name;
336 if ($opts{e}) {
337 $experimenter_name = $opts{e};
339 # External files per experiment
340 our $extfiles_per_experiment = $default_extfiles_per_experiment;
341 if ($opts{E}) {
342 $extfiles_per_experiment = $opts{E};
344 # Spreadsheet column "format" (it's actually parsed by a CSV routine;
345 # these are just the ordering of the columns)
346 our $read_format = $default_read_format;
347 if ($opts{f}) {
348 $read_format = $opts{f};
350 # Map name.
351 our $map_name = $default_map_name;
352 if ($opts{m}) {
353 $map_name = $opts{m};
355 # Parameters to the CSV parser.
356 our $quote = "\"";
357 if (defined($opts{q})) {
358 $quote = $opts{q};
360 our $separator = ",";
361 if (defined($opts{s})) {
362 $separator = $opts{s};
364 # Required arguments: a directory and some filenames
365 if (@ARGV < 2) {
366 print "$0: too few arguments.\n";
367 print_usage_and_quit (1);
369 our $directory = shift;
370 our @files = @ARGV;
372 # We're done processing arguments. Now we construct a few structures
373 # that will be constant through the rest of the program.
375 # Construct a list whose elements are the names of the columns
376 # in the spreadsheet, or undef if we don't care about those columns.
377 my @fields = @{reckon_fields ($read_format)};
379 # Create a function that takes a hash representing a row in a spreadsheet
380 # and returns all files found for that row.
381 my $find_files = make_file_finder ($directory."/".$file_glob);
383 # Create a function that inserts the FISH data hash into the fish_result table.
384 my $result_inserter = make_inserter ($dbh, "fish_result", sprintf ($result_insert_query, $experimenter_name, $map_name));
385 # Create a function that inserts filenames into the fish_file table for
391 # TODO
392 # a given FISH data hash.
393 my $file_inserter = make_inserter ($dbh, "fish_file", sprintf ($file_insert_query, $experimenter_name));
394 #/TODO
399 # The main event.
400 eval {
401 foreach my $file (@files) {
402 printf "Processing data file $file...\n";
403 open (my $fh, "<$file") || die ("$0: failed to open spreadsheet $file.");
404 count("spreadsheet");
405 RECORD: while (my @record = CSV::read_csv_record ($fh, $separator, $quote)) {
406 count("line");
407 unless (@record >= @fields) { # Too few records
408 #print STDERR @record+0, join("\t", @record);
409 skip("Record has " . (@record) . " fields, not " . (@fields) . " fields.");
410 next RECORD;
413 my %fish_params;
414 for (my $i = 0; $i < @fields; $i++) {
415 my $value = $record[$i];
416 my $fieldname = $fields[$i];
417 if ($fieldname) { # we don't care about undef fieldnames
418 # Trim the value, and stash it in %fish_params.
419 $value =~ s/(^\s|\s$)//g;
420 $fish_params{$fieldname} = $value;
423 # Do some cleanup/error checking on the data.
424 # cleanup_fish_data is expected to return a string
425 # only if there's something wrong with the data.
426 my $invalidity = cleanup_fish_data (\%fish_params);
427 if ($invalidity) {
428 skip ($invalidity);
429 next RECORD;
432 # If we've got this far, we have all we need to start inserting.
433 my @params = @fish_params{@result_insert_fields};
434 # We'll make a savepoint before each spreadsheet row,
435 # so that (1) if the row has already been inserted, then
436 # we let the database generate the error, rollback that row
437 # and proceed; (2) if the row inserts but has no
438 # corresponding external files, then we rollback that row
439 # and proceed.
440 my $saveptnm = name_savepoint();
441 $dbh->pg_savepoint($saveptnm);
442 eval {
443 $result_inserter->(@params);
445 if ($@) { # The row didn't insert.
446 # The only acceptable reason why this could occur is a violated UNIQUE
447 # constraint (which we expect many of).
448 # I started writing a module for mapping error codes to readable strings,
449 # but keeping that sort of thing in sync with future database releases
450 # is not worthwhile, at least given how little use is made of the
451 # error code. The error codes themselves are reputedly standardized
452 # and therefore in principle stable.
453 if ($dbh->state eq "23505") {
454 skip ("Failed to insert row for $experimenter_name, $fish_params{bac}, $fish_params{experiment_name} (already in database).");
455 } else {
456 die ("Unexpected database insert error $@");
458 $dbh->pg_rollback_to($saveptnm);
459 next RECORD;
460 } else { # Row inserted, now do the external files.
461 # Find any files in this upload associated with this row.
462 my @extfiles = @{$find_files->(\%fish_params)};
463 # FIXME: provide some way of allowing the number
464 # of extfiles to vary. But only bother to do this
465 # in case some submitter really needs this to be the case.
466 unless (@extfiles == $extfiles_per_experiment) {
467 warn "Found ".@extfiles." files for $fish_params{bac} / $fish_params{experiment_name}. Skipping.\n";
468 $dbh->pg_rollback_to($saveptnm);
469 next RECORD;
471 if (@extfiles) {
472 count ("row");
473 foreach my $filename (@extfiles) {
474 # XXX: fixme: make this pattern settable by command-line argument.
475 unless ($filename =~ m/(Thumbs.db|xls|xlsx)$/i) {
476 # $file_inserter->(File::Basename::basename($filename), File::Basename::basename($fish_params{experiment_name}), $fish_params{clone_id});
477 my $image = SGN::Image->new($dbh);
478 my ($fish_result_id) = $dbh->selectrow_array(
479 <<'',
480 SELECT fish_result_id
481 FROM fish_result
482 NATURAL JOIN fish_experimenter
483 WHERE fish_experimenter_name = ?
484 AND experiment_name = ?
485 AND clone_id = ?
487 undef,
488 $experimenter_name,
489 File::Basename::basename($fish_params{experiment_name}),
490 $fish_params{clone_id}
493 #print STDERR "$fish_result_id\n";
494 my $return_value = $image->process_image("$filename", "fish",$fish_result_id,0);
495 unless ($return_value > 0) { die "failed to process image: $!\n"; }
496 $image->set_description("$experimenter_name");
497 $image->set_sp_person_id(233);
498 $image->set_obsolete("f");
499 $image->store();
500 count ("extfile");
503 } else { # No external files found.
504 skip ("No files found for row $fish_params{experiment_name}.", 1);
505 $dbh->pg_rollback_to ($saveptnm);
506 next RECORD;
509 # If we got here, the row and its files loaded.
510 # $dbh->pg_release ($saveptnm);
512 close ($fh);
514 # Number of lines, minus the first line of each spreadsheet.
515 my $total_lines = check("line");
516 my $possible_files = $total_lines * 4;
517 print "
518 LOAD REPORT FOR RUN:
519 ========================================================
520 Processed ".check("spreadsheet")." spreadsheets.
522 \tRows\tFiles
523 Seen\t$total_lines\t$possible_files (expected)
524 Loaded\t".check("row")."\t".check("extfile")."
525 Skipped\t".check("skip")."
527 Expected to skip ".check("spreadsheet")." lines.
531 $dbh->commit;
533 if ($@) {
534 print "Some sort of unhandled error in transaction.\n";
535 print $@;
536 $dbh->rollback;
537 exit(1);
539 exit (0);
541 # Helper functions, etc.
542 sub print_fish_experimenters_and_quit {
543 my ($exitcode) = @_;
544 print "FISH Experimenters:\n";
545 print "-------------------\n";
546 my $schema = $dbh->qualify_schema('sgn');
547 my $q = "SELECT fish_experimenter_name FROM $schema.fish_experimenter";
548 my $result = $dbh->selectcol_arrayref($q);
549 foreach my $experimenter (@$result) {
550 print $experimenter."\n";
552 exit ($exitcode);
555 sub print_usage_and_quit {
556 my ($exitcode) = @_;
557 print "Usage: $0 [OPTIONS] DIR FILES
559 Load FISH data from FILES, which must be .csv files. All files
560 associated with the experiment must be found under DIR.
561 Options:
563 -d FORMAT When looking for files associated with a given experiment,
564 look in a directory designated by DIR/<format glob>, with
565 these format specifiers
567 %a -- Chromosome arm
568 %b -- BAC ID (DDDADD notation)
569 %c -- Chromosome number
570 %e -- Experimenter's experiment ID
571 %p -- Percentage distance from centromere
572 %% -- Literal percent sign
574 The default format is '$default_file_glob'.
575 -e EXPERIMENTER Experimenter name (default '$default_experimenter_name').
576 -E COUNT Expect COUNT external files per experiment (default $default_extfiles_per_experiment).
577 -f FORMAT Parse the CSV file with each record's fields in order
578 specified by FORMAT. Valid format specifiers are:
580 %a -- Chromosome arm
581 %b -- BAC ID (DDDADD notation)
582 %c -- Chromosome number
583 %e -- Experimenter's experiment ID
584 %p -- Percentage distance from centromere
585 %- -- Some field we don't care about
587 The default format is '$default_read_format'.
588 -h Print this message.
589 -l List known FISH experimenters.
590 -m MAP_NAME Map name (default '$default_map_name').
591 -q QUOTE Use QUOTE as the field quote character (default \")
592 -s SEPARATOR Use SEPARATOR as the field separator (default ,)
594 exit ($exitcode);
597 # Turn the read format into an ordered list of field names.
598 sub reckon_fields {
599 my ($format) = @_;
600 my @fields = ();
601 my $counter = 0;
602 foreach my $format_char (split "%(?!%)", $format) {
603 if ($format_char eq "") { # empty string at beginning of format
604 next;
606 if ($format_char eq "-") { # "ignore this field" char
607 $fields[$counter++] = undef;
608 } else {
609 if (grep { $formats{$format_char} eq $_ } @fields) {
610 die ("$0: $format_char appears more than once in $read_format");
612 $fields[$counter++] = $formats{$format_char};
615 return (\@fields);
618 # I get the feeling that you won't like this part of the program. I'm
619 # sorry about that. Here's the idea: given a row in the input spreadsheet,
620 # we need to find those files that are related to the row. We only have
621 # a vague idea about what they'll be sending us (a few images, and maybe a
622 # spreadsheet per row), and don't really trust submitters to use the same
623 # directory layout consistently, so it seemed reasonable to use globs
624 # to describe the set of files associated with a row in the spreadsheet.
625 # So, e.g., the default glob for the Stack group's uploads is this:
627 # Tomato_<chromo_num><chromo_arm>/BAC_<bac_id>/Photo_ID_<experiment_name>/<experiment_name>*";
629 # But since this is cumbersome to type, we offer a format-string notation
630 # for the operator, by which we can write the glob above as follows:
632 # Tomato_%c%a/BAC_%b/Photo_ID_%e/%e*
634 # Here is a routine that takes a format string and returns a
635 # function that takes a hash whose keys are the fields in the
636 # format structure and returns a reference to an array of the
637 # file names. So the usage will be:
639 # my $find_files = make_file_finder ($globfmt);
640 # my %fish_hash = { chromo_num => 2, chromo_arm = 'P', ... }
641 # my $files = $find_files->(\%fish_hash);
643 # Now @$files will be the list of files associated with the experiment.
644 sub make_file_finder {
645 my ($format) = @_;
646 my $globfmt = "";
647 my @keys = ();
648 # Here we turn our format string into an sprintf format string,
649 # while also collecting the order of the format codes, so that
650 # we can turn a filled-in hash of FISH parameters into a list
651 # of arguments to be formatted.
652 while ($format =~ m/\G(.)/g) {
653 my $char = $1;
654 if ($char eq "%") {
655 $format =~ m/\G(.)/gc;
656 my $nextchar = $1;
657 if ($nextchar eq "%" ) {
658 $globfmt .= "%";
659 } else {
660 $globfmt .= "%s";
661 push @keys, $formats{$nextchar};
663 } else {
664 $globfmt .= $char;
667 return sub {
668 my ($hashref) = @_;
669 my $glob = sprintf $globfmt, map { $$hashref{$_} || ""; } @keys;
670 #print STDERR "find glob: $glob\n";
671 my @files = File::Glob::bsd_glob($glob);
672 return (\@files);
676 # Given a dbh, a table name, and a query, prepare the query in the db that
677 # the dbh connects to, and return a function that executes the prepared query
678 # with whatever arguments are passed to it. The point here is to provide
679 # a lightweight way to wrap statement handle execute() calls, e.g., to print
680 # out dbh properties at the time the statement handle is executed, etc.
681 # At present, we don't use the table name.
682 sub make_inserter {
683 my ($dbh, $table, $query) = @_;
684 my $st = $dbh->prepare($query);
685 sub {
686 eval {
687 $st->execute(@_);
689 if ($@) {
690 die ("$@ with arguments: " . (join ", ", @_));
695 # This is constructor that produces a new string every time it's
696 # called. This ensures that we never reuse the same savepoint name
697 # twice.
699 my $savepointnum = 1;
700 sub name_savepoint {
701 return ("savept".$savepointnum++);
705 # Some dinky counters for doing checksums.
707 my $spreadsheet_count = 0;
708 my $line_count = 0;
709 my $loaded_row_count = 0;
710 my $skipped_rows = 0;
711 my $extfile_count = 0;
712 sub count {
713 ($_) = @_;
714 /^spreadsheet$/ && do { $spreadsheet_count++; };
715 /^line$/ && do { $line_count++; };
716 /^row$/ && do { $loaded_row_count++; };
717 /^skip$/ && do { $skipped_rows++; };
718 /^extfile$/ && do { $extfile_count++; };
720 sub check {
721 ($_) = @_;
722 /^spreadsheet$/ && do { return($spreadsheet_count); };
723 /^line$/ && do { return($line_count); };
724 /^row$/ && do { return($loaded_row_count); };
725 /^skip$/ && do { return($skipped_rows); };
726 /^extfile$/ && do { return($extfile_count); };
729 sub skip {
730 my ($msg, $serious) = @_;
731 if ($serious) {
732 print STDERR $msg." Skipping record.\n";
734 count ("skip");
737 # Tidy up the data in a row. Return something only if the data is bogus.
738 sub cleanup_fish_data {
739 my ($fish_row) = @_;
741 foreach my $fieldname (keys (%$fish_row)) {
742 # None of the ersatz case statement equivalents in the Camel book looked
743 # less opaque to me than the straightforward if/elsif*/else construct.
744 # Note that the fieldnames are set up by this program, and so can't
745 # fall off this statement.
746 if ($fieldname eq "bac") {
747 my $bac = $fish_row->{bac};
748 # We need to turn BAC names into clone_ids from the genomic db.
749 my $clone;
750 find_clone: for my $lib ("", "LE_HBA", "SL_MboI") {
751 $clone = CXGN::Genomic::Clone->retrieve_from_clone_name("$lib$bac");
752 if ($clone) {
753 last find_clone;
756 #id_for_bac_name($dbh, "LE_HBA$bac");
757 if ($clone) {
758 # print STDERR $clone->clone_name."\n";
759 $fish_row->{clone_id} = $clone->clone_id();
760 } else {
761 return ("Ostensible BAC name '$bac' is either unparseable or not found in database.");
763 } elsif ($fieldname eq "experiment_name") {
764 ; # There's nothing to validate for experiment_names at present.
765 } elsif ($fieldname eq "chromo_num") {
766 ; # We can't do much with chromo nums (we don't know what
767 # species we're looking at).
768 } elsif ($fieldname eq "chromo_arm") {
769 # We canonicalize the chromo arm:
770 if ($fish_row->{chromo_arm} =~ m/[ps]/i ) {
771 $fish_row->{chromo_arm} = "P";
772 } elsif ($fish_row->{chromo_arm} =~ m/[ql]/i ) {
773 $fish_row->{chromo_arm} = "Q";
774 } else {
775 return ("$fish_row->{chromo_arm} doesn't look like a chromosome arm identifier.");
777 } elsif ($fieldname eq "percent_from_centromere") {
778 # Percentage distance from the centromere. If this is given
779 # as an integer, normalize it. otherwise.
780 my $percent_dist = $fish_row->{percent_from_centromere};
781 if (($percent_dist >= 0.0) && ($percent_dist <= 100.0)) {
782 $percent_dist = $percent_dist/100;
784 if (($percent_dist > 1.0) || ($percent_dist < 0.0)) {
785 return ("$percent_dist doesn't look like a percentage.");
787 $fish_row->{percent_from_centromere} = $percent_dist;
790 # If we got here, then we don't return anything for
791 # the caller to report.
792 return (undef);