make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / load_marker_data.pl
blob8c56a0891bc30f8798036927068394fb37151ac2
1 #!/usr/bin/perl
3 # basic script to load pcr marker information
6 # usage: load_marker_data.pl -H hostname D dbname -i infile -m map_id
8 # copy and edit this file as necessary
9 # common changes include the following:
10 # experiment_type_id
11 # accession_id
12 # different column headings
14 =head1
16 NAME
18 load_solcap_markers.pl - a script to load markers into the SGN database.
20 =head1 DESCRIPTION
22 usage: load_solcap_markers.pl
24 Options:
26 =over 6
28 =item -H
30 The hostname of the server hosting the database.
32 =item -D
34 the name of the database
36 =item -t
38 (optional) test mode. Rollback after the script terminates. Database should not be affected. Good for test runs.
40 =item p
42 protocol (e.g. SSR, SNP, Indel, dCAPs, GG, etc)
44 =item -i
46 infile with the marker info
48 =item -m
50 sgn map_id
53 =back
55 The tab-delimited map file has the following columns:
57 Marker
58 Temp (annealing temperature)
59 fwd (forward primer)
60 rev (reverse primer)
62 (optional columns, depending on the protocol)
63 pd (an additional primer seq, if protocol is dCAPS)
64 Indel (additional primer seq, if protocol is Indel)
65 ASPE1 (additional primer seq, if protocol is SNP)
66 ASPE2 (additional primer seq, if protocol is SNP)
67 seq3 (3' flanking sequence, if protocol is SNP or Indel)
68 seq5 (5' flanking sequence, if protocol is SNP or Indel)
70 <accession name a> provide band sizes for accession name a
71 (provided using -a)
72 <accession name b> provide band sizes for accession name b
73 (provided using -b)
75 =head1 AUTHORS
77 Naama Menda <nm249@cornell.edu>
80 =cut
82 use strict;
83 use warnings;
85 use CXGN::Tools::File::Spreadsheet;
86 use CXGN::Tools::Text;
87 use CXGN::Marker::Modifiable;
88 use CXGN::Marker::Tools;
89 use CXGN::Marker::Location;
90 use CXGN::Marker::PCR::Experiment;
91 use CXGN::Map::Version;
92 use CXGN::DB::Connection;
93 use CXGN::DB::InsertDBH;
94 use Data::Dumper;
95 use CXGN::DB::SQLWrappers;
96 use CXGN::Cview::Map::Tools;
98 use Getopt::Std;
101 our ($opt_H, $opt_D, $opt_i, $opt_t, $opt_p, $opt_m, $opt_a, $opt_b);
103 getopts('H:D:i:tp:m:a:b:');
106 my $map_id = $opt_m || die "Must pass a -m option with a valid sgn map_id!\n";
108 my $protocol = $opt_p || die "ERROR: No -p option passed for protocol name (SolCap markers are loaded with one file per protocol: SNP, Indel, SSR, CAPS) \n";
110 my $dbh = CXGN::DB::InsertDBH->new({
111 dbname => $opt_D,
112 dbhost => $opt_H,
113 dbargs => {AutoCommit => 0,
114 RaiseError => 1}
117 my $sql=CXGN::DB::SQLWrappers->new($dbh);
119 my $map_version_id = CXGN::Cview::Map::Tools::find_current_version($dbh, $map_id);
121 eval {
123 # parameters for this specific instance
124 my $experiment_type_id = 1; # 'mapping'
126 my $stock_id ;#= '88x87'; # parent1 x parent2
128 # make an object to give us the values from the spreadsheet
129 my $ss = CXGN::Tools::File::Spreadsheet->new($opt_i);
130 my @markers = $ss->row_labels(); # row labels are the marker names
131 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
133 # make sure the spreadsheet is how we expect it to be
134 @columns = qw | marker protocol temp | #fwd rev |
135 or die"Column headings are not as expected";
137 for my $marker_name (@markers) {
139 print "\n\nMARKER: $marker_name\n";
141 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
142 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
143 # just get the first ID in the list (if the list is longer than 1, we've already died)
144 my $marker_id = $marker_ids[0];
146 if(!$marker_id) {
147 print STDERR "Marker $marker_name does not exist in database. SKIPPING!\n";
148 next();
149 #$marker_id = CXGN::Marker::Tools::insert_marker($dbh,$marker_name);
150 #print "marker added: $marker_id\n";
152 else { print "marker_id found: $marker_id\n" }
154 # clean this up?
155 my $annealing_temp =
156 ((grep {/temp/} @columns) && ($ss->value_at($marker_name,'temp')))
157 ? $ss->value_at($marker_name,'temp') : 55;
158 print "temp: $annealing_temp\n";
160 my ($primer_id_fwd, $primer_id_rev, $fwd, $rev);
161 if ($protocol ne 'SNP') { # SNP only has seq5 and seq3
162 $fwd = $ss->value_at($marker_name,'fwd')
163 or die "No foward primer found for $marker_name";
164 print "fwd: $fwd\n";
166 $primer_id_fwd = CXGN::Marker::Tools::get_sequence_id($dbh,$fwd);
167 $primer_id_fwd = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($fwd)) if !$primer_id_fwd;
168 print "primer_id_fwd: $primer_id_fwd\n";
170 $rev=$ss->value_at($marker_name,'rev')
171 or warn "No reverse primer found for $marker_name";
172 print "rev: $rev\n";
173 $primer_id_rev = CXGN::Marker::Tools::get_sequence_id($dbh,$rev);
174 $primer_id_rev = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($rev)) if !$primer_id_rev;
175 print "primer_id_rev: $primer_id_rev\n";
177 #my $protocol = $ss->value_at($marker_name,'protocol') ;
179 my ($pd, $primer_id_pd, $indel, $indel_id, $snp, $snp_id, $aspe1, $aspe2, $aspe1_id, $aspe2_id, $seq5, $seq5_id, $seq3, $seq3_id);
181 if (($protocol eq 'dCAPS') && ($pd = $ss->value_at($marker_name,'pd'))) {
182 print "pd: $pd\n";
183 $primer_id_pd = CXGN::Marker::Tools::get_sequence_id($dbh,$pd);
184 $primer_id_pd = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($pd)) if !$primer_id_pd;
185 print "primer_id_pd: $primer_id_pd\n";
186 }############## add here sequences for different protocols
187 if (($protocol eq 'Indel') && ($indel = $ss->value_at($marker_name,'Indel'))) {
188 print "indel: $indel\n";
189 $indel_id = CXGN::Marker::Tools::get_sequence_id($dbh,$indel);
190 $indel_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($indel)) if !$indel_id;
191 print "indel: $indel_id\n";
193 if (($protocol eq 'ASPE') && ($snp = $ss->value_at($marker_name,'ASPE'))) {
194 print "snp: $snp\n";
195 $snp_id = CXGN::Marker::Tools::get_sequence_id($dbh,$snp);
196 $snp_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($snp)) if !$snp_id;
197 print "snp_id: $snp_id\n";
199 $aspe1 = $ss->value_at($marker_name,'ASPE1');
200 print "aspe1: $aspe1\n";
201 if ($aspe1) {
202 $aspe1_id = CXGN::Marker::Tools::get_sequence_id($dbh,$aspe1);
203 $aspe1_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($aspe1)) if !$aspe1_id;
204 print "aspe1_id: $aspe1_id\n";
206 $aspe2 = $ss->value_at($marker_name,'ASPE2');
207 if ($aspe2) {
208 print "aspe2: $aspe2\n";
209 $aspe2_id = CXGN::Marker::Tools::get_sequence_id($dbh,$aspe2);
210 $aspe2_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($aspe2)) if !$aspe2_id;
211 print "aspe2_id: $aspe2_id\n";
214 if ($protocol eq 'ASPE' || $protocol eq 'Indel') {
215 $seq3 = $ss->value_at($marker_name,'seq3');
216 print "seq3: $seq3\n";
217 $seq3_id = CXGN::Marker::Tools::get_sequence_id($dbh,$seq3);
218 $seq3_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($seq3)) if !$seq3_id;
219 print "seq3_id: $seq3_id\n";
221 $seq5 = $ss->value_at($marker_name,'seq5');
222 print "seq5: $seq5\n";
223 $seq5_id = CXGN::Marker::Tools::get_sequence_id($dbh,$seq5);
224 $seq5_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($seq5)) if !$seq5_id;
225 print "seq5_id: $seq5_id\n";
228 if (($protocol eq 'SNP') && ($snp = $ss->value_at($marker_name,'SNP'))) {
229 print "snp: $snp\n";
230 $snp_id = CXGN::Marker::Tools::get_sequence_id($dbh,$snp);
231 $snp_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($snp)) if !$snp_id;
232 print "snp_id: $snp_id\n";
234 # $aspe1 = $ss->value_at($marker_name,'ASPE1');
235 # print "aspe1: $aspe1\n";
236 # if ($aspe1) {
237 # $aspe1_id = CXGN::Marker::Tools::get_sequence_id($dbh,$aspe1);
238 # $aspe1_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($aspe1)) if !$aspe1_id;
239 # print "aspe1_id: $aspe1_id\n";
241 # $aspe2 = $ss->value_at($marker_name,'ASPE2');
242 # if ($aspe2) {
243 # print "aspe2: $aspe2\n";
244 # $aspe2_id = CXGN::Marker::Tools::get_sequence_id($dbh,$aspe2);
245 # $aspe2_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($aspe2)) if !$aspe2_id;
246 # print "aspe2_id: $aspe2_id\n";
249 if ($protocol eq 'SNP' || $protocol eq 'Indel') {
250 $seq3 = $ss->value_at($marker_name,'seq3');
251 print "seq3: $seq3\n";
252 $seq3_id = CXGN::Marker::Tools::get_sequence_id($dbh,$seq3);
253 $seq3_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($seq3)) if !$seq3_id;
254 print "seq3_id: $seq3_id\n";
256 $seq5 = $ss->value_at($marker_name,'seq5');
257 print "seq5: $seq5\n";
258 $seq5_id = CXGN::Marker::Tools::get_sequence_id($dbh,$seq5);
259 $seq5_id = CXGN::Marker::Tools::insert($dbh,"sequence","sequence_id",['sequence'], ($seq5)) if !$seq5_id;
260 print "seq5_id: $seq5_id\n";
264 my $band_size_a;
265 my $band_size_b;
266 if ($opt_a) {
267 $band_size_a = $ss->value_at($marker_name, $opt_a);
269 if ($opt_b) {
270 $band_size_b = $ss->value_at($marker_name, $opt_b);
273 my $enzyme = $ss->value_at($marker_name, "enzyme");
275 # check if data already in pcr_experiment and marker_experiment, and if not, add it
276 # there's a lot of stuff to check here.. I know these aren't in the database so will come back later
278 my $names = ["marker_id", "annealing_temp", "primer_id_fwd",
279 "primer_id_rev", "experiment_type_id", "map_id","primer_id_pd","stock_id"];
280 my @fields = ($marker_id,$annealing_temp,$primer_id_fwd,
281 $primer_id_rev,$experiment_type_id,$map_id,$primer_id_pd,$stock_id);
283 # does this check if pcr_experiment already exists?
284 #NOW it does!! Should not have 2 rows with the same pcr_experiment data!
286 my $q = "SELECT marker_experiment.marker_experiment_id, pcr_experiment_id FROM sgn.marker_experiment JOIN sgn.marker_location using(location_id) join sgn.map_version using(map_version_id) WHERE map_id=? and marker_id=?";
287 my $s = $dbh->prepare($q);
288 $s->execute($opt_m, $marker_id);
289 my ($marker_experiment_id, $pcr_experiment_id) = $s->fetchrow_array();
291 print STDERR "This marker ($marker_id) has marker_experiment_id $marker_experiment_id,pcr_experiment_id $pcr_experiment_id on $map_id\n";
293 print STDERR "marker_id: $marker_id, map_id=$map_id, stock_id=$stock_id\n";
294 my $pcr_exp_info=$sql->insert_unless_exists('pcr_experiment',{marker_id=>$marker_id,annealing_temp=>$annealing_temp,primer_id_fwd=>$primer_id_fwd, primer_id_rev=>$primer_id_rev, experiment_type_id=>$experiment_type_id,map_id=>$map_id,primer_id_pd=>$primer_id_pd, additional_enzymes=> $enzyme }); #,stock_id => $stock_id } );
298 if($pcr_exp_info->{inserted}) { print "INSERTED NEW pcr_experiment\n" ; }
299 if($pcr_exp_info->{exists}) { print "EXISTING pcr_experiment\n" ; }
303 $pcr_experiment_id = $pcr_exp_info->{id};
305 if ($marker_experiment_id) {
306 $q = "UPDATE sgn.marker_experiment set pcr_experiment_id=? WHERE marker_experiment_id=?";
307 my $s = $dbh->prepare($q);
308 $s->execute($pcr_experiment_id,$marker_experiment_id);
313 #THIS DOES NOT CHECK FOR EXISTING ID
314 #CXGN::Marker::Tools::insert($dbh,"pcr_experiment","pcr_experiment_id",$names,@fields);
315 print STDERR "pcr experiment added: $pcr_experiment_id\n";
317 print STDERR "Instantiating the PCR::Experiment object...\n";
319 my $pcr_ex = CXGN::Marker::PCR::Experiment->new($dbh, $pcr_experiment_id);
320 # set the sequence types
322 print STDERR "MARKER_ID = $marker_id\n";
324 # if (!$stock_id) {
325 # print STDERR "The marker $marker_name exists, but is not on this map...\n"; next();
328 if (!$marker_id) {
329 warn "marker $marker_name is not in the database!!!!\n";
331 else {
332 $pcr_ex->store_sequence('forward_primer', $fwd) if ($protocol ne 'SNP');
333 $pcr_ex->store_sequence('reverse_primer', $rev) if ($protocol ne 'SNP');
334 $pcr_ex->store_sequence('aspe_primer', $aspe1) if $aspe1;
335 $pcr_ex->store_sequence('aspe_primer', $aspe2) if $aspe2;
336 $pcr_ex->store_sequence('indel', $indel) if $indel;
337 $pcr_ex->store_sequence('SNP', $snp) if $snp;
338 $pcr_ex->store_sequence('five_prime_flanking_region', $seq5) if $seq5;
339 $pcr_ex->store_sequence('three_prime_flanking_region', $seq3) if $seq3;
342 $pcr_ex->add_pcr_bands_for_stock($band_size_a, $opt_a) if $band_size_a;
343 $pcr_ex->add_pcr_bands_for_stock($band_size_b, $opt_b) if $band_size_b;
344 print STDERR "Storing pcr_experiment_id $pcr_experiment_id for marker $marker_id ($pcr_ex->{marker_id})\n";
346 $pcr_ex->store_unless_exists();
349 print STDERR "Checking if map_version_id=$map_version_id, map_id=$map_id , marker $marker_id and protocol $protocol exist in marker_experiment\n";
350 # check for existing marker_experiment and update if found
351 my $q = "SELECT marker_experiment_id FROM marker_experiment "
352 . "JOIN marker_location USING (location_id) JOIN map_version "
353 . "USING (map_version_id) WHERE rflp_experiment_id is null "
354 . "AND map_version_id = ? AND marker_id = ? AND protocol ilike ?";
356 my $sth = $dbh->prepare($q);
357 $sth->execute($map_version_id,$marker_id,$protocol);
358 my @exp_id;
359 while (my ($id) = $sth->fetchrow_array()) {
360 print "Found experiment id $id\n";
361 push (@exp_id,$id);
364 # load the first experiment (several occurences here means multiple placed markers).
365 if (my $exp_id = shift(@exp_id)) {
366 #if (@exp_id > 1) { print STDERR join(', ', @exp_id)."\n\n"; }
367 # this really should not be the case
368 # update
369 my $marker_experiment_id = $exp_id;
370 print STDERR "Updating marker_experiment $marker_experiment_id\n";
371 my $u = "UPDATE marker_experiment set pcr_experiment_id = ? where marker_experiment_id = ?";
372 $sth = $dbh->prepare($u);
373 $sth->execute($pcr_experiment_id,$marker_experiment_id);
374 print STDERR "UPDATED pcr_experiment_id = $pcr_experiment_id for marker_experiment $marker_experiment_id\n";
377 # if not loading map and experiments together, may want to match other protocols
379 # if not, insert new marker_experiment
380 else {
381 print "No experiment_id found for marker $marker_name. SKIPPING!!!\n";
382 next();
383 $names = ["marker_id", "pcr_experiment_id", "protocol"];
384 my @fields = ($marker_id, $pcr_experiment_id, $protocol);
385 # 'SSR' or 'unknown'?
387 #my $marker_experiment_id = CXGN::Marker::Tools::insert
388 # ($dbh,"marker_experiment","marker_experiment_id",$names,@fields);
389 # print "marker experiment added: $marker_experiment_id\n";
390 #print "ADD $marker_experiment_id\n";
396 if ($@) {
397 print $@;
398 print "Failed; rolling back.\n";
399 $dbh->rollback();
401 else {
402 print"Succeeded.\n";
403 if ($opt_t) {
404 print"Rolling back.\n";
405 $dbh->rollback();
407 else {
408 print"Committing.\n";
409 $dbh->commit();